Fix tokenization bug: there should be no space before the first letter. Sigh.

2025-10-24 15:06:06 +00:00
parent 8892470f29
commit 05a051dbe9
1 changed file with 11 additions and 2 deletions
--- a/tasks/spellingbee.py
+++ b/tasks/spellingbee.py
@@ -260,7 +260,7 @@ class SimpleSpelling(Task):
        # return the full conversation
        messages = [
            {"role": "user", "content": f"Spell the word: {word}"},
-            {"role": "assistant", "content": f"{word}: {word_letters}"}
+            {"role": "assistant", "content": f"{word}:{word_letters}"}
        ]
        conversation = {
            "messages": messages,
@@ -289,7 +289,16 @@ if __name__ == "__main__":
        print()
        print("-" * 100)
-    # also scrutinize the tokenization (last example only)
+    # # preview the SimpleSpelling task, first 10 examples
    # task = SimpleSpelling()
    # for i in range(10):
    #     ex = task.get_example(i)
    #     print("=" * 100)
    #     print(ex['messages'][0]['content'])
    #     print("-" * 100)
    #     print(ex['messages'][1]['content'])
    # # also scrutinize the tokenization (last example only)
    # from nanochat.tokenizer import get_tokenizer
    # tokenizer = get_tokenizer()
    # ids, mask = tokenizer.render_conversation(ex)