Fix Torch crash caused by pinning memory on non-CUDA devices

2026-01-30 04:22:02 +00:00 · 2025-10-21 19:43:38 +00:00
parent 796f84527f
commit 32571664b1
2 changed files with 4 additions and 2 deletions
--- a/nanochat/dataloader.py
+++ b/nanochat/dataloader.py
@@ -38,7 +38,8 @@ def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokeniz
            batch_index += 1
        # Move tokens from the deque into the scratch buffer
        tokens = [token_buffer.popleft() for _ in range(needed_tokens)]
-        scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=True)
+        # Pin host memory only on CUDA (faster CPU->GPU transfers); pinning without CUDA crashes Torch:
+        scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=(device == "cuda"))
        # Create the inputs/targets as 1D tensors
        inputs_cpu = scratch[:-1].to(dtype=torch.int32)
        targets_cpu = scratch[1:]
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -119,7 +119,8 @@ def mid_data_generator(split):
    assert dataset_size > 0
    needed_tokens = device_batch_size * max_seq_len + 1 # to form one training batch of inputs,targets
    token_buffer = deque()
-    scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=True)
+    # Pin host memory only on CUDA (faster CPU->GPU transfers); pinning without CUDA crashes Torch:
+    scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=(device_type == "cuda"))
    cursor = ddp_rank # increments by ddp_world_size each time, so each rank processes unique documents
    it = 0 # iteration counter
    while True: