From 8630d32be43912c1f8670c03fe6c0bdc843c1215 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 26 Jan 2026 22:31:42 +0000
Subject: [PATCH] quick fix to not OOM main speedrun script

---
 runs/speedrun.sh     | 4 ++--
 scripts/tok_train.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/runs/speedrun.sh b/runs/speedrun.sh
index 8fff564..ef4fa00 100644
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -58,8 +58,8 @@ python -m nanochat.dataset -n 8
 # See comment below for why 370 is the right number here
 python -m nanochat.dataset -n 370 &
 DATASET_DOWNLOAD_PID=$!
 
-# train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data
-python -m scripts.tok_train --max-chars=2000000000 --vocab-size=65536
+# train the tokenizer with vocab size 2**15 = 32768 on ~2B characters of data
+python -m scripts.tok_train
 # evaluate the tokenizer (report compression ratio etc.)
 python -m scripts.tok_eval
diff --git a/scripts/tok_train.py b/scripts/tok_train.py
index 9c7979d..480e0e1 100644
--- a/scripts/tok_train.py
+++ b/scripts/tok_train.py
@@ -14,7 +14,7 @@ from nanochat.dataset import parquets_iter_batched
 
 # Parse command line arguments
 parser = argparse.ArgumentParser(description='Train a BPE tokenizer')
-parser.add_argument('--max-chars', type=int, default=10_000_000_000, help='Maximum characters to train on (default: 10B)')
+parser.add_argument('--max-chars', type=int, default=2_000_000_000, help='Maximum characters to train on (default: 2B)')
 parser.add_argument('--doc-cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)')
 parser.add_argument('--vocab-size', type=int, default=32768, help='Vocabulary size (default: 32768 = 2^15)')
 args = parser.parse_args()
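
With this change, speedrun.sh no longer passes any flags and relies entirely on the argparse defaults in scripts/tok_train.py. As a sketch (flag names taken from the argparse definitions in the diff; passing them explicitly is optional and should be equivalent to the new default invocation):

    # equivalent explicit invocation under the new defaults: ~2B training characters, 2^15 = 32768 vocab
    python -m scripts.tok_train --max-chars=2000000000 --vocab-size=32768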