mirror of
https://github.com/karpathy/nanochat.git
synced 2026-01-30 04:22:02 +00:00
Lower the default --target-param-data-ratio from 8 to 4: the optimal ratio is now around 4
This commit is contained in:
@@ -47,7 +47,7 @@ parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding
|
|||||||
# Training horizon (only one used, in order of precedence)
|
# Training horizon (only one used, in order of precedence)
|
||||||
parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)")
|
parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)")
|
||||||
parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)")
|
parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)")
|
||||||
parser.add_argument("--target-param-data-ratio", type=int, default=8, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)")
|
parser.add_argument("--target-param-data-ratio", type=int, default=4, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)")
|
||||||
# Optimization
|
# Optimization
|
||||||
parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
|
parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
|
||||||
parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens")
|
parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens")
|
||||||
|
|||||||
Reference in New Issue
Block a user