diff --git a/miniseries.sh b/miniseries.sh
index 9287def..077418a 100644
--- a/miniseries.sh
+++ b/miniseries.sh
@@ -27,8 +27,11 @@ RESULTS_DIR="$NANOCHAT_BASE_DIR/jan7_miniseries_results"
 mkdir -p "$RESULTS_DIR"
 RESULTS_FILE="$RESULTS_DIR/results.csv"
 
-# Write CSV header
-echo "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE"
+# Write CSV header only if file doesn't exist
+if [ ! -f "$RESULTS_FILE" ]; then
+    echo "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE"
+fi
+
 log() {
     echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
 }
diff --git a/scaling_laws.sh b/scaling_laws.sh
new file mode 100644
index 0000000..102ba11
--- /dev/null
+++ b/scaling_laws.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+
+FLOPS_BUDGETS=(
+    1e18
+    3e18
+    6e18
+)
+DEPTHS=(8 10 12 14 16 18 20)
+NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
+WANDB_RUN="${WANDB_RUN:-scaling}"
+EVAL_TOKENS=$((100 * 524288)) # ~52M tokens for final eval (default is ~10M)
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}"
+source .venv/bin/activate
+
+RESULTS_DIR="$NANOCHAT_BASE_DIR/scaling_laws_results"
+mkdir -p "$RESULTS_DIR"
+RESULTS_FILE="$RESULTS_DIR/results.csv"
+
+# Write CSV header only if file doesn't exist
+if [ ! -f "$RESULTS_FILE" ]; then
+    echo "flops_budget,depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE"
+fi
+
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+}
+
+# Check if a run already exists in results
+run_exists() {
+    local flops=$1
+    local depth=$2
+    grep -q "^${flops},${depth}," "$RESULTS_FILE" 2>/dev/null
+}
+
+# =============================================================================
+# Main Loop
+# =============================================================================
+
+for flops in "${FLOPS_BUDGETS[@]}"; do
+    log "=============================================="
+    log "Compute budget: $flops FLOPs"
+    log "=============================================="
+
+    for d in "${DEPTHS[@]}"; do
+
+        # Skip if already completed
+        if run_exists "$flops" "$d"; then
+            log "Skipping d=$d at $flops FLOPs (already in results)"
+            continue
+        fi
+
+        log "Training d=$d at $flops FLOPs..."
+
+        # Unique tag for this run
+        TAG="scaling_${flops}_d${d}"
+
+        # Record start time
+        START_TIME=$(date +%s)
+
+        # Train the model with fixed flops budget
+        # The script will auto-calculate num_iterations to hit target_flops
+        # CORE eval happens once at the end (999999 ensures only final step)
+        torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \
+            --depth=$d \
+            --target_flops=$flops \
+            --target_param_data_ratio=-1 \
+            --run="${WANDB_RUN}_${TAG}" \
+            --model_tag="${TAG}" \
+            --eval_tokens=$EVAL_TOKENS \
+            --core_metric_every=999999 \
+            --core_metric_max_per_task=-1 \
+            --sample_every=-1 \
+            --save_every=-1 \
+            2>&1 | tee "$RESULTS_DIR/${TAG}_train.log"
+
+        END_TIME=$(date +%s)
+        TRAIN_TIME=$((END_TIME - START_TIME))
+
+        # Extract training stats from the log
+        LOG_FILE="$RESULTS_DIR/${TAG}_train.log"
+        NUM_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | head -1 | tr -d ',')
+        NUM_SCALING_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ',')
+        NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',')
+        # Calculate tokens trained (iterations * batch_size, default 524288)
+        TOKENS_TRAINED=$((NUM_ITERS * 524288))
+        # Param:data ratio (using scaling params per Kaplan et al.)
+        PARAM_DATA_RATIO=$(python -c "print(f'{$TOKENS_TRAINED / $NUM_SCALING_PARAMS:.2f}')")
+        # Model dim
+        MODEL_DIM=$((d * 64))
+        # Val BPB from final eval
+        VAL_BPB=$(grep "Validation bpb:" "$LOG_FILE" | tail -1 | grep -oP '[\d.]+$')
+
+        # Extract CORE score from training log (evaluated on final step)
+        CORE_SCORE=$(grep "CORE metric:" "$LOG_FILE" | tail -1 | awk '{print $NF}')
+        if [ -z "$CORE_SCORE" ]; then
+            log "WARNING: Could not extract CORE score for d=$d"
+            CORE_SCORE="0.0"
+        fi
+
+        log "  Params: $NUM_PARAMS, Iters: $NUM_ITERS, Ratio: $PARAM_DATA_RATIO, Val BPB: $VAL_BPB, CORE: $CORE_SCORE"
+
+        # Append to CSV
+        echo "$flops,$d,$MODEL_DIM,$NUM_PARAMS,$NUM_SCALING_PARAMS,$NUM_ITERS,$TOKENS_TRAINED,$PARAM_DATA_RATIO,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE"
    done
done

+log "=============================================="
+log "Scaling Laws Sweep Complete"
+log "=============================================="
+log "Results saved to: $RESULTS_FILE"
+echo ""
+echo "Results:"
+column -t -s',' "$RESULTS_FILE"
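
Usage note (not part of the patch): a minimal sketch of launching and resuming the sweep. NPROC_PER_NODE, WANDB_RUN, and NANOCHAT_BASE_DIR are the environment variables the script already reads; the GPU count and wandb run name below are illustrative.

    # launch on a node with 4 GPUs, under a custom wandb run name
    NPROC_PER_NODE=4 WANDB_RUN=scaling_jan bash scaling_laws.sh

    # re-running the identical command resumes the sweep: run_exists() skips
    # every (flops_budget, depth) pair already recorded in results.csv
    NPROC_PER_NODE=4 WANDB_RUN=scaling_jan bash scaling_laws.sh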
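
The stat extraction is coupled to the exact shape of the base_train log lines. As a sanity check, here is how the two parameter-count greps behave on an assumed log line (the wording and numbers are hypothetical, not copied from a real run); note that grep -oP requires GNU grep built with PCRE support:

    LINE="Number of parameters: 185,532,416 (scaling: 161,480,704)"  # assumed format
    echo "$LINE" | grep -oP '[\d,]+' | head -1 | tr -d ','
    # -> 185532416  (total params: first comma-grouped number on the line)
    echo "$LINE" | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ','
    # -> 161480704  (scaling params: the number after the "scaling:" marker)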
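
Once a few runs have landed in the CSV, a rough compute-optimal frontier can be read straight off it. A sketch, assuming the default NANOCHAT_BASE_DIR and the column order this script writes (val_bpb is column 9):

    # per FLOPs budget, keep the row with the lowest val_bpb
    RESULTS_FILE="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}/scaling_laws_results/results.csv"
    tail -n +2 "$RESULTS_FILE" | sort -t',' -k1,1 -k9,9g | awk -F',' '!seen[$1]++'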