Fix tokenization bug: there should be no space before the first letter. Sigh.

This commit is contained in:
Andrej Karpathy
2025-10-24 15:06:06 +00:00
parent 8892470f29
commit 05a051dbe9

View File

@@ -260,7 +260,7 @@ class SimpleSpelling(Task):
# return the full conversation # return the full conversation
messages = [ messages = [
{"role": "user", "content": f"Spell the word: {word}"}, {"role": "user", "content": f"Spell the word: {word}"},
{"role": "assistant", "content": f"{word}: {word_letters}"} {"role": "assistant", "content": f"{word}:{word_letters}"}
] ]
conversation = { conversation = {
"messages": messages, "messages": messages,
@@ -289,7 +289,16 @@ if __name__ == "__main__":
print() print()
print("-" * 100) print("-" * 100)
# also scrutinize the tokenization (last example only) # # preview the SimpleSpelling task, first 10 examples
# task = SimpleSpelling()
# for i in range(10):
# ex = task.get_example(i)
# print("=" * 100)
# print(ex['messages'][0]['content'])
# print("-" * 100)
# print(ex['messages'][1]['content'])
# # also scrutinize the tokenization (last example only)
# from nanochat.tokenizer import get_tokenizer # from nanochat.tokenizer import get_tokenizer
# tokenizer = get_tokenizer() # tokenizer = get_tokenizer()
# ids, mask = tokenizer.render_conversation(ex) # ids, mask = tokenizer.render_conversation(ex)