From 05a051dbe92c3617529a607d4fb652355bada804 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Fri, 24 Oct 2025 15:06:06 +0000
Subject: [PATCH] Fix tokenization bug: there should be no space between the
 colon and the first letter. Sigh.

---
 tasks/spellingbee.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tasks/spellingbee.py b/tasks/spellingbee.py
index b394571..c051fe7 100644
--- a/tasks/spellingbee.py
+++ b/tasks/spellingbee.py
@@ -260,7 +260,7 @@ class SimpleSpelling(Task):
         # return the full conversation
         messages = [
             {"role": "user", "content": f"Spell the word: {word}"},
-            {"role": "assistant", "content": f"{word}: {word_letters}"}
+            {"role": "assistant", "content": f"{word}:{word_letters}"}
         ]
         conversation = {
             "messages": messages,
@@ -289,7 +289,16 @@ if __name__ == "__main__":
         print()
         print("-" * 100)
 
-    # also scrutinize the tokenization (last example only)
+    # # preview the SimpleSpelling task, first 10 examples
+    # task = SimpleSpelling()
+    # for i in range(10):
+    #     ex = task.get_example(i)
+    #     print("=" * 100)
+    #     print(ex['messages'][0]['content'])
+    #     print("-" * 100)
+    #     print(ex['messages'][1]['content'])
+
+    # # also scrutinize the tokenization (last example only)
     # from nanochat.tokenizer import get_tokenizer
     # tokenizer = get_tokenizer()
     # ids, mask = tokenizer.render_conversation(ex)