Big DataLoader refactor: BOS-aligned dataloaders with epoch tracking for pre/mid-training

The new DataLoader ensures that every token sequence in train/val batches has a BOS token at the beginning. Therefore, no token streams start abruptly in the middle of a document, which could be confusing for the model. Note that this changes the loss scale because there are fewer confusing tokens in the train/val batches. The main downside is that we now waste about 35% of tokens due to cropping. This is ok because we have a lot of data. See dev/LOG.md entry for this change for a lot more information.
2026-01-30 04:22:02 +00:00 · 2026-01-13 20:05:47 +00:00
parent 23985413aa
commit 43c29dd9d5
7 changed files with 330 additions and 106 deletions
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -21,7 +21,7 @@ import wandb
 import torch

 from nanochat.gpt import GPT, GPTConfig
-from nanochat.dataloader import tokenizing_distributed_data_loader, tokenizing_distributed_data_loader_with_state
+from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit, tokenizing_distributed_data_loader_with_state_bos_bestfit
 from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type
 from nanochat.tokenizer import get_tokenizer, get_token_bytes
 from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint
@@ -210,8 +210,8 @@ if resuming:
 # Initialize the DataLoaders for train/val
 tokens_dir = os.path.join(base_dir, "tokenized_data")
 dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"]
-train_loader = tokenizing_distributed_data_loader_with_state(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
-build_val_loader = lambda: tokenizing_distributed_data_loader(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device)
+train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
+build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device)
 x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data

 # -----------------------------------------------------------------------------
@@ -395,7 +395,8 @@ while True:
        eta_str = f" | eta: {eta_seconds/60:.1f}m"
    else:
        eta_str = ""
-    print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m{eta_str}")
+    epoch = dataloader_state_dict["epoch"]
+    print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}")
    if step % 100 == 0:
        log_data = {
            "step": step,
@@ -406,6 +407,7 @@ while True:
            "train/dt": dt,
            "train/tok_per_sec": tok_per_sec,
            "train/mfu": mfu,
+            "train/epoch": epoch,
        }
        wandb_run.log(log_data)

--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -10,7 +10,6 @@ torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_
 """

 import argparse
-from collections import deque
 import os
 os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 import time
@@ -125,49 +124,95 @@ val_dataset = TaskMixture([
 # these two global variables and update them from within the data generator.
 last_step = False # we will toggle this to True when we reach the end of the training dataset
 approx_progress = 0.0 # will go from 0 to 1 over the course of the epoch
-def mid_data_generator(split):
-    global last_step, approx_progress
+current_epoch = 1 # track epoch for logging
+def mid_data_generator_bos_bestfit(split, buffer_size=100):
+    """
+    BOS-aligned dataloader for midtraining with bestfit-crop packing.
+
+    Each row in the batch starts with BOS (beginning of a conversation).
+    Conversations are packed using best-fit algorithm to minimize cropping.
+    This matches the BOS-aligned approach used in pretraining.
+    """
+    global last_step, approx_progress, current_epoch
    assert split in {"train", "val"}, "split must be 'train' or 'val'"
    dataset = train_dataset if split == "train" else val_dataset
    dataset_size = len(dataset)
    assert dataset_size > 0
-    needed_tokens = args.device_batch_size * args.max_seq_len + 1 # to form one training batch of inputs,targets
-    token_buffer = deque()
-    # CUDA supports memory pinning for faster transfers between CPU and GPU:
-    scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=(device_type == "cuda"))
-    cursor = ddp_rank # increments by ddp_world_size each time, so each rank processes unique documents
-    it = 0 # iteration counter
-    while True:
-        # Accumulate enough tokens for one iteration before yielding
-        while len(token_buffer) < needed_tokens:
+    row_capacity = args.max_seq_len + 1  # +1 for target at last position
+
+    # Conversation buffer: list of token lists
+    conv_buffer = []
+    cursor = ddp_rank  # Each rank processes different conversations
+    epoch = 1
+    it = 0  # iteration counter
+
+    def refill_buffer():
+        nonlocal cursor, epoch
+        while len(conv_buffer) < buffer_size:
            conversation = dataset[cursor]
            ids, _ = tokenizer.render_conversation(conversation)
-            token_buffer.extend(ids)
+            conv_buffer.append(ids)
            cursor += ddp_world_size
            if cursor >= dataset_size:
-                cursor -= dataset_size # wrap around for another epoch
+                cursor = cursor % dataset_size
+                epoch += 1
                if split == "train":
-                    last_step = True # toggle last_step to True, which will terminate the training loop
+                    last_step = True  # toggle last_step to True, which will terminate the training loop
+
+    while True:
+        rows = []
+        for _ in range(args.device_batch_size):
+            row = []
+            while len(row) < row_capacity:
+                # Ensure buffer has conversations
+                while len(conv_buffer) < buffer_size:
+                    refill_buffer()
+
+                remaining = row_capacity - len(row)
+
+                # Find largest conversation that fits entirely
+                best_idx = -1
+                best_len = 0
+                for i, conv in enumerate(conv_buffer):
+                    conv_len = len(conv)
+                    if conv_len <= remaining and conv_len > best_len:
+                        best_idx = i
+                        best_len = conv_len
+
+                if best_idx >= 0:
+                    # Found a conversation that fits - use it entirely
+                    conv = conv_buffer.pop(best_idx)
+                    row.extend(conv)
+                else:
+                    # No conversation fits - crop first conversation to fill remaining
+                    conv = conv_buffer.pop(0)
+                    row.extend(conv[:remaining])
+
+            rows.append(row[:row_capacity])
+
        # Stopping condition to respect num_iterations, if given
        it += 1
        if 0 < args.num_iterations <= it and split == "train":
-            last_step = True # toggle last_step to True, which will terminate the training loop
-        # Build up inputs/targets and yield
-        for i in range(needed_tokens):
-            scratch[i] = token_buffer.popleft()
-        inputs_cpu = scratch[:-1].to(dtype=torch.int32)
-        targets_cpu = scratch[1:]
-        inputs = inputs_cpu.view(args.device_batch_size, args.max_seq_len).to(device=device, dtype=torch.int32, non_blocking=True)
-        targets = targets_cpu.view(args.device_batch_size, args.max_seq_len).to(device=device, dtype=torch.int64, non_blocking=True)
+            last_step = True
+
+        # Update progress tracking
        if split == "train":
+            current_epoch = epoch
            if args.num_iterations > 0:
-                approx_progress = it / args.num_iterations # calculate progress from the max number of iterations
+                approx_progress = it / args.num_iterations
            else:
-                approx_progress = cursor / dataset_size # approximate progress as a fraction of the dataset
+                approx_progress = cursor / dataset_size
+
+        # Build tensors
+        use_cuda = device_type == "cuda"
+        batch_tensor = torch.tensor(rows, dtype=torch.long, pin_memory=use_cuda)
+        inputs = batch_tensor[:, :-1].to(device=device, dtype=torch.int32, non_blocking=use_cuda)
+        targets = batch_tensor[:, 1:].to(device=device, dtype=torch.int64, non_blocking=use_cuda)
+
        yield inputs, targets

-train_loader = mid_data_generator("train")
-build_val_loader = lambda: mid_data_generator("val")
+train_loader = mid_data_generator_bos_bestfit("train")
+build_val_loader = lambda: mid_data_generator_bos_bestfit("val")
 progress = 0 # will go from 0 to 1 over the course of the epoch

 # Learning rate scheduler
@@ -285,7 +330,7 @@ while True:
    mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
    if step > 10:
        total_training_time += dt # only count the time after the first 10 steps
-    print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m")
+    print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {current_epoch} | total time: {total_training_time/60:.2f}m")
    if step % 10 == 0:
        wandb_run.log({
            "step": step,
@@ -296,6 +341,7 @@ while True:
            "train/dt": dt,
            "train/tok_per_sec": tok_per_sec,
            "train/mfu": mfu,
+            "train/epoch": current_epoch,
        })

 # print a few more stats