Add personality to nanochat. This breaks previous code on git pull and requires downloading a new file from S3, but there is a helpful error message, so hopefully it's OK.

2025-10-21 15:04:58 +00:00
parent 0f007889dd
commit fe5aed940b
7 changed files with 468 additions and 2 deletions
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -27,6 +27,7 @@ from tasks.common import TaskMixture
 from tasks.gsm8k import GSM8K
 from tasks.mmlu import MMLU
 from tasks.smoltalk import SmolTalk
+from tasks.customjson import CustomJSON

 # -----------------------------------------------------------------------------
 run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
@@ -88,10 +89,13 @@ for opt in optimizers:

 # Midtraining data mixture and DataLoader
 base_dir = get_base_dir()
+identity_conversations_filepath = os.path.join(base_dir, "identity_conversations.jsonl")
 train_dataset = TaskMixture([
    SmolTalk(split="train"), # 460K rows of general conversations
    MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple choice problems drawn from ARC, MC_TEST, OBQA, RACE
    GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use
+    CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations
+    CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these
 ]) # total: 460K + 100K + 8K = 568K rows
 val_dataset = TaskMixture([
    SmolTalk(split="test"), # 24K rows in test set