mirror of
https://github.com/karpathy/nanochat.git
synced 2026-01-30 04:22:02 +00:00
Update the CPU/MPS script to give reasonable results. After about 40 minutes of training on my MacBook, the model can at least answer that Paris is the capital of France and knows that the sky is blue. Also fix a bug caused by KVCache assuming a bfloat16 dtype.
This commit is contained in:
@@ -96,6 +96,7 @@ def test_kv_cache_basic():
|
||||
head_dim=head_dim,
|
||||
num_layers=num_layers,
|
||||
device="cpu",
|
||||
+ dtype=torch.float32,
|
||||
)
|
||||
|
||||
# Check initial state
|
||||
@@ -130,7 +131,7 @@ def test_kv_cache_prefill():
|
||||
# Create source cache and advance it
|
||||
src_cache = KVCache(
|
||||
batch_size=batch_size, num_heads=num_heads, seq_len=32,
|
||||
- head_dim=head_dim, num_layers=num_layers, device="cpu",
|
||||
+ head_dim=head_dim, num_layers=num_layers, device="cpu", dtype=torch.float32,
|
||||
)
|
||||
# Write some data to source cache
|
||||
src_cache.k_cache[0, 0, :16, :, :] = 1.0
|
||||
@@ -140,7 +141,7 @@ def test_kv_cache_prefill():
|
||||
# Create destination cache with larger seq_len
|
||||
dst_cache = KVCache(
|
||||
batch_size=batch_size, num_heads=num_heads, seq_len=64,
|
||||
- head_dim=head_dim, num_layers=num_layers, device="cpu",
|
||||
+ head_dim=head_dim, num_layers=num_layers, device="cpu", dtype=torch.float32,
|
||||
)
|
||||
|
||||
# Prefill
|
||||
|
||||
Reference in New Issue
Block a user