diff --git a/scripts/base_train.py b/scripts/base_train.py
index 4ca8cdc..3725805 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -49,6 +49,9 @@ unembedding_lr = 0.004 # learning rate for the unembedding parameters (Adam)
 weight_decay = 0.0 # weight decay for the embedding/unembedding parameters (Adam)
 matrix_lr = 0.02 # learning rate for the matrix parameters (Muon)
 grad_clip = 1.0 # gradient clipping value (0.0 = disabled)
+warmup_ratio = 0.0 # ratio of iterations for LR warmup
+warmdown_ratio = 0.2 # ratio of iterations for LR warmdown
+final_lr_frac = 0.0 # final LR is this fraction of the initial LR
 # Evaluation
 eval_every = 250 # every how many steps to evaluate the model for val bpb
 eval_tokens = 20*524288 # number of tokens to evaluate val loss on
@@ -151,10 +154,6 @@ x, y = next(train_loader) # kick off load of the very first batch of data
 
 # Set up hyperparameter schedulers
 # Learning rate scheduler
-# TODO: experiment with a short warmup for the AdamW params (expecting slight improvement)
-warmup_ratio = 0.0 # ratio of iterations for LR warmup
-warmdown_ratio = 0.2 # ratio of iterations for LR warmdown
-final_lr_frac = 0.0 # final LR is this fraction of the initial LR
 def get_lr_multiplier(it):
     warmup_iters = round(warmup_ratio * num_iterations)
     warmdown_iters = round(warmdown_ratio * num_iterations)
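For context on the knobs this diff promotes to the config section: the second hunk cuts off inside `get_lr_multiplier`, so below is a minimal sketch of a trapezoidal schedule consistent with the visible config, with a linear warmup over `warmup_ratio` of the run, a constant plateau, and a linear warmdown to `final_lr_frac` of the peak LR. Everything past the two `round(...)` lines is an assumption rather than the file's actual body, and `num_iterations` is stubbed with a placeholder value since it is defined elsewhere in `base_train.py`.

```python
# Sketch only: config values copied from the hunk above; num_iterations is a
# hypothetical stand-in for the value set elsewhere in base_train.py.
num_iterations = 10_000
warmup_ratio = 0.0    # ratio of iterations for LR warmup
warmdown_ratio = 0.2  # ratio of iterations for LR warmdown
final_lr_frac = 0.0   # final LR is this fraction of the initial LR

def get_lr_multiplier(it):
    warmup_iters = round(warmup_ratio * num_iterations)
    warmdown_iters = round(warmdown_ratio * num_iterations)
    if it < warmup_iters:
        # linear warmup from near zero up to the peak LR
        return (it + 1) / warmup_iters
    elif it <= num_iterations - warmdown_iters:
        # constant plateau at the peak LR
        return 1.0
    else:
        # linear warmdown from the peak LR to final_lr_frac * peak
        progress = (num_iterations - it) / warmdown_iters
        return progress + (1 - progress) * final_lr_frac
```

A typical consumer scales each optimizer group's base LR by this multiplier once per step, e.g. `group["lr"] = group["initial_lr"] * get_lr_multiplier(step)`. Note the default `warmup_ratio = 0.0` keeps behavior identical to the pre-diff code (warmup disabled); the move simply makes the knob tunable from the config section alongside the other hyperparameters.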