many small tweaks. base, eval, core work now i think

2026-01-30 04:22:02 +00:00 · 2025-10-16 15:46:18 -07:00
parent 786119d593
commit df600b6ed5
5 changed files with 30 additions and 19 deletions
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -93,8 +93,9 @@ def autodetect_device_type():
    # prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU
    if torch.cuda.is_available():
        device_type = "cuda"
-    if torch.backends.mps.is_available():
+    elif torch.backends.mps.is_available():
        device_type = "mps"
+    else:
        device_type = "cpu"
    print0(f"Autodetected device type: {device_type}")
    return device_type
--- a/nanochat/report.py
+++ b/nanochat/report.py
@@ -283,6 +283,10 @@ class Report:
                    # capture bloat data for summary later (the stuff after Bloat header and until \n\n)
                    bloat_data = re.search(r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL)
                    bloat_data = bloat_data.group(1) if bloat_data else ""
+            else:
+                start_time = None # will cause us to not write the total wall clock time
+                bloat_data = "[bloat data missing]"
+                print(f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?")
            # process all the individual sections
            for file_name in EXPECTED_FILES:
                section_file = os.path.join(report_dir, file_name)
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -15,6 +15,7 @@ import time
 import json
 import random
 import yaml
+from contextlib import nullcontext

 import pandas as pd
 import torch
@@ -118,18 +119,21 @@ def load_hf_model(hf_path: str, device):

 # -----------------------------------------------------------------------------
 def main():
-    assert len(sys.argv) in [1, 2], "Usage: python base_eval.py [hf_path]"
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
+    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
+    args = parser.parse_args()

    # distributed / precision setup
    device_type = autodetect_device_type()
    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-    dtype = torch.bfloat16 if device_type == "cuda" else torch.float32 # use fp32 on CPU|MPS
-    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=dtype)
+    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()

    # Load model and tokenizer from command line or from file system
-    if len(sys.argv) >= 2:
+    if args.hf_path is not None:
        # atm assume that if a path is given, it's a huggingface model path
-        hf_path = sys.argv[1]
+        hf_path = args.hf_path
        print0(f"Loading huggingface model from: {hf_path}")
        model, tokenizer = load_hf_model(hf_path, device)
        model_name = hf_path # just for logging
@@ -142,7 +146,7 @@ def main():

    # Evaluate the model
    with autocast_ctx:
-        out = evaluate_model(model, tokenizer, device)
+        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)

    # Write out the results to a csv file
    core_metric = None
--- a/scripts/base_loss.py
+++ b/scripts/base_loss.py
@@ -7,6 +7,7 @@ Example run as:
 torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
 """
 import os
+from contextlib import nullcontext
 import torch
 from nanochat.checkpoint_manager import load_model
 from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type
@@ -26,10 +27,9 @@ exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from
 # Load the base model and the tokenizer
 device_type = autodetect_device_type() if device_type == "" else device_type
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-dtype = torch.bfloat16 if device_type == "cuda" else torch.float32 # use fp32 on CPU|MPS
 model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=model_tag, step=model_step)
 sequence_len = meta["model_config"]["sequence_len"] # could be arbitrary really
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=dtype)
+autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()

 # Evaluate the loss on each split
 tokens_per_step = device_batch_size * sequence_len * ddp_world_size
@@ -38,7 +38,7 @@ steps = split_tokens // tokens_per_step
 token_bytes = get_token_bytes(device=device)
 bpb_results = {}
 for split_name in ["train", "val"]:
-    loader = tokenizing_distributed_data_loader(device_batch_size, sequence_len, split_name)
+    loader = tokenizing_distributed_data_loader(device_batch_size, sequence_len, split_name, device=device)
    with autocast_ctx:
        bpb = evaluate_bpb(model, loader, steps, token_bytes)
    print0(f"{split_name} bpb: {bpb:.4f}")
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -7,13 +7,15 @@ or distributed as:

 torchrun --nproc_per_node=8 base_train.py

-python -m scripts.base_train --device_type=cpu --depth=4 --max_seq_len=512 --device_batch_size=1 --eval_tokens=512 --core_metric_max_per_task=8 --total_batch_size=512 --num_iterations=500
-If you have a Macbook, you're better off using device_type=mps instead of cpu
+If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. Example:
+python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 --eval_tokens=512 --core_metric_every=-1 --total_batch_size=512 --num_iterations=20
 """

 import os
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 import time
+from contextlib import nullcontext
+
 import wandb
 import torch

@@ -50,7 +52,7 @@ grad_clip = 1.0 # gradient clipping value (0.0 = disabled)
 # Evaluation
 eval_every = 250 # every how many steps to evaluate the model for val bpb
 eval_tokens = 20*524288 # number of tokens to evaluate val loss on
-core_metric_every = 2000 # every how many steps to evaluate the core metric
+core_metric_every = 2000 # every how many steps to evaluate the core metric (-1 = disable)
 core_metric_max_per_task = 500 # examples per task in estimating the core metric
 sample_every = 2000 # every how many steps to sample from the model
 # Output
@@ -65,8 +67,7 @@ user_config = {k: globals()[k] for k in config_keys} # will be useful for loggin
 device_type = autodetect_device_type() if device_type == "" else device_type
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
 master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
-dtype = torch.bfloat16 if device_type == "cuda" else torch.float32 # use fp32 on CPU|MPS
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=dtype)
+autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
 synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
 get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0

@@ -202,7 +203,8 @@ for step in range(num_iterations + 1):

    # once in a while: estimate the CORE metric (all ranks participate)
    # use the original uncompiled model because the inputs keep changing shape
-    if last_step or (step > 0 and step % core_metric_every == 0):
+    results = {}
+    if core_metric_every > 0 and (last_step or (step > 0 and step % core_metric_every == 0)):
        model.eval()
        with autocast_ctx:
            results = evaluate_model(orig_model, tokenizer, device, max_per_task=core_metric_max_per_task)
@@ -228,7 +230,7 @@ for step in range(num_iterations + 1):
            "My favorite color is",
            "If 5*x + 3 = 13, then x is",
        ]
-        engine = Engine(model, tokenizer)
+        engine = Engine(orig_model, tokenizer)
        for prompt in prompts:
            tokens = tokenizer(prompt, prepend="<|bos|>")
            with autocast_ctx:
@@ -335,7 +337,7 @@ get_report().log(section="Base model training", data=[
    { # stats about training outcomes
        "Minimum validation bpb": min_val_bpb,
        "Final validation bpb": val_bpb,
-        "CORE metric estimate": results["core_metric"],
+        "CORE metric estimate": results.get("core_metric", None),
        "MFU %": f"{mfu:.2f}%",
        "Total training flops": f"{flops_so_far:e}",
        "Total training time": f"{total_training_time/60:.2f}m",