fix slight adamw bug. this chunk was originally copy-pasted from modded-nanogpt, which still seems to have the bug

Andrej Karpathy
2026-01-08 18:18:22 +00:00
parent a1ccb3dc0b
commit 4ddc803797


@@ -68,8 +68,8 @@ class DistAdamW(torch.optim.Optimizer):
 bias1 = 1 - beta1 ** t
 bias2 = 1 - beta2 ** t
 # compute step
-denom = exp_avg_sq.sqrt().add_(eps)
-step_size = lr * (torch.sqrt(bias2) / bias1)
+denom = (exp_avg_sq / bias2).sqrt().add_(eps)
+step_size = lr / bias1
 update = exp_avg.div(denom).mul_(step_size)
 p_slice.add_(other=update, alpha=-1.0)
 idx += 1
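
The difference between the two forms is where eps enters relative to the bias correction. The old code adds eps to sqrt(exp_avg_sq) before correcting, then rescales the whole step by sqrt(bias2)/bias1; algebraically that is the standard AdamW update with eps replaced by eps / sqrt(bias2), which inflates eps early in training while bias2 is small. The new code bias-corrects the second moment first and only then adds eps, matching the reference AdamW formulation. A minimal standalone sketch of the two update forms (illustrative scalar values and hypothetical hyperparameters, not the repo's DistAdamW state):

# sketch: compare the removed and added update forms on a single scalar
import torch

lr, beta1, beta2, eps, t = 0.01, 0.9, 0.95, 1e-8, 5
exp_avg = torch.tensor(0.03)     # first moment (m)
exp_avg_sq = torch.tensor(4e-4)  # second moment (v)
bias1 = 1 - beta1 ** t
bias2 = 1 - beta2 ** t

# old form: eps added to the uncorrected sqrt(v), then the step is rescaled
# by sqrt(bias2)/bias1 -- equivalent to using an effective eps of eps / sqrt(bias2)
old = exp_avg / (exp_avg_sq.sqrt() + eps) * (lr * (bias2 ** 0.5) / bias1)

# new form: bias-correct v first, then add eps (standard AdamW)
new = exp_avg / ((exp_avg_sq / bias2).sqrt() + eps) * (lr / bias1)

print(old.item(), new.item())  # the two differ only through the effective eps scaling

With eps this small the numerical difference is tiny, which is why the commit message calls it a slight bug, but the fixed form is the one that matches the usual AdamW definition.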