From 4ddc8037975f0c11e11038a27eaf81f070971dc8 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Thu, 8 Jan 2026 18:18:22 +0000
Subject: [PATCH] fix adamw slight bug. this chunk was copy pasted originally
 from modded-nanogpt, which still seems to have the bug

---
 nanochat/adamw.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nanochat/adamw.py b/nanochat/adamw.py
index 8816057..0b97ae2 100644
--- a/nanochat/adamw.py
+++ b/nanochat/adamw.py
@@ -68,8 +68,8 @@ class DistAdamW(torch.optim.Optimizer):
                 bias1 = 1 - beta1 ** t
                 bias2 = 1 - beta2 ** t
                 # compute step
-                denom = exp_avg_sq.sqrt().add_(eps)
-                step_size = lr * (torch.sqrt(bias2) / bias1)
+                denom = (exp_avg_sq / bias2).sqrt().add_(eps)
+                step_size = lr / bias1
                 update = exp_avg.div(denom).mul_(step_size)
                 p_slice.add_(other=update, alpha=-1.0)
                 idx += 1
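For context, the change reorders the bias corrections so that eps is added to the bias-corrected second moment, which is the standard Adam update lr * m_hat / (sqrt(v_hat) + eps). With the old ordering, sqrt(bias2) is folded into step_size after eps has already been added, so the effective epsilon becomes eps / sqrt(bias2), which is inflated at small step counts t. The sketch below is a minimal stand-alone comparison of the two orderings; the helper names and the scalar t argument are illustrative and not part of nanochat.

import torch

def adamw_step_old(exp_avg, exp_avg_sq, lr, beta1, beta2, eps, t):
    # pre-fix ordering: eps is added to sqrt(exp_avg_sq) before the sqrt(bias2)
    # correction is folded into step_size, so the effective epsilon is
    # eps / sqrt(bias2) and depends on the step count t
    bias1 = 1 - beta1 ** t
    bias2 = 1 - beta2 ** t
    denom = exp_avg_sq.sqrt().add(eps)
    step_size = lr * (bias2 ** 0.5 / bias1)
    return exp_avg.div(denom).mul(step_size)

def adamw_step_new(exp_avg, exp_avg_sq, lr, beta1, beta2, eps, t):
    # post-fix ordering: bias-correct the second moment first, then add eps,
    # i.e. the standard lr * m_hat / (sqrt(v_hat) + eps) Adam update
    bias1 = 1 - beta1 ** t
    bias2 = 1 - beta2 ** t
    denom = (exp_avg_sq / bias2).sqrt().add(eps)
    step_size = lr / bias1
    return exp_avg.div(denom).mul(step_size)

At a typical eps of 1e-8 the numerical difference between the two is small, which is presumably why the commit calls it a slight bug, but the fixed ordering matches the reference Adam formulation and torch.optim.AdamW.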