update the default GPTConfig kwargs otherwise they are confusing

Author: Andrej Karpathy
Date:   2026-01-17 21:16:46 +00:00
parent  3b95d4fd39
commit  e7ed2082b8


@@ -28,8 +28,8 @@ from nanochat.flash_attention import flash_attn
 @dataclass
 class GPTConfig:
-    sequence_len: int = 1024
-    vocab_size: int = 50304
+    sequence_len: int = 2048
+    vocab_size: int = 32768
     n_layer: int = 12
     n_head: int = 6 # number of query heads
     n_kv_head: int = 6 # number of key/value heads (GQA)
@@ -37,7 +37,7 @@ class GPTConfig:
     # Sliding window attention pattern string, tiled across layers. Final layer always L.
     # Characters: L=long (full context), S=short (half context)
     # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long
-    window_pattern: str = "L"
+    window_pattern: str = "SSSL"

 def norm(x):
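
For readers unfamiliar with the pattern string, here is a minimal sketch of how a value like "SSSL" might be expanded into per-layer window sizes. This is not code from this commit; the helper name `layer_windows` and its exact signature are assumptions for illustration, based only on the comment above (L = long/full context, S = short/half context, final layer always L, pattern tiled across layers).

```python
# Minimal sketch (assumed helper, not from this commit): expand a window
# pattern string like "SSSL" into one window size per layer.
def layer_windows(window_pattern: str, n_layer: int, sequence_len: int) -> list[int]:
    # Tile the pattern across layers: layer i uses pattern[i % len(pattern)].
    chars = [window_pattern[i % len(window_pattern)] for i in range(n_layer)]
    chars[-1] = "L"  # final layer always gets full context
    # L = long (full context), S = short (half context)
    return [sequence_len if c == "L" else sequence_len // 2 for c in chars]

# Example with the new defaults: n_layer=12, sequence_len=2048, pattern="SSSL"
print(layer_windows("SSSL", 12, 2048))
# -> [1024, 1024, 1024, 2048, 1024, 1024, 1024, 2048, 1024, 1024, 1024, 2048]
```

With the old default of "L", every layer attended over the full context; the new "SSSL" default gives three half-context layers for every full-context layer, which matches the tiling rule described in the comment.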