merge and resolve conflict

2026-01-30 04:22:02 +00:00 · 2025-10-21 17:19:10 +00:00
parent dfcb1c16f1 bb786c5560
commit 5bdc99abfb
17 changed files with 246 additions and 81 deletions
--- a/dev/runcpu.sh
+++ b/dev/runcpu.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+# Showing an example run for exercising some of the code paths on the CPU (or MPS on Macbooks)
+# Run as:
+# bash dev/cpu_demo_run.sh
+
+# NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your Macbook.
+# Think of this run as educational/fun demo, not something you should expect to work well.
+# This is also why I hide this script away in dev/
+
+# all the setup stuff
+export OMP_NUM_THREADS=1
+NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
+mkdir -p $NANOCHAT_BASE_DIR
+command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+[ -d ".venv" ] || uv venv
+uv sync
+source .venv/bin/activate
+if [ -z "$WANDB_RUN" ]; then
+    WANDB_RUN=dummy
+fi
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+source "$HOME/.cargo/env"
+uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
+EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
+if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
+    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
+    unzip -q eval_bundle.zip
+    rm eval_bundle.zip
+    mv eval_bundle $NANOCHAT_BASE_DIR
+fi
+
+# wipe the report
+python -m nanochat.report reset
+
+# train tokenizer on ~1B characters
+python -m nanochat.dataset -n 4
+python -m scripts.tok_train --max_chars=1000000000
+python -m scripts.tok_eval
+
+# train a very small 4 layer model on the CPU
+# each optimization step processes a single sequence of 1024 tokens
+# we only run 50 steps of optimization (bump this to get better results)
+python -m scripts.base_train \
+    --depth=4 \
+    --max_seq_len=1024 \
+    --device_batch_size=1 \
+    --total_batch_size=1024 \
+    --eval_every=50 \
+    --eval_tokens=4096 \
+    --core_metric_every=50 \
+    --core_metric_max_per_task=12 \
+    --sample_every=50 \
+    --num_iterations=50
+python -m scripts.base_loss --device_batch_size=1 --split_tokens=4096
+python -m scripts.base_eval --max-per-task=5
+
+# midtraining
+python -m scripts.mid_train \
+    --max_seq_len=1024 \
+    --device_batch_size=1 \
+    --eval_every=50 \
+    --eval_tokens=4096 \
+    --total_batch_size=1024 \
+    --num_iterations=100
+# eval results will be terrible, this is just to execute the code paths.
+# note that we lower the execution memory limit to 1MB to avoid warnings on smaller systems
+python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20
+
+# SFT
+python -m scripts.chat_sft \
+    --device_batch_size=1 \
+    --target_examples_per_step=4 \
+    --num_iterations=100 \
+    --eval_steps=4 \
+    --eval_metrics_max_problems=16
+
+# Chat CLI
+# python -m scripts.chat_cli -p "Why is the sky blue?"
+
+# Chat Web
+# python -m scripts.chat_web
+
+python -m nanochat.report generate