initial commit
dev/generate_logo.html (Normal file, 29 lines added)
@@ -0,0 +1,29 @@
<!DOCTYPE html>
<html>
<body style="margin:0; display:flex; justify-content:center; align-items:center; height:100vh; background:#fff">
<svg width="400" height="400" xmlns="http://www.w3.org/2000/svg">
  <defs>
    <radialGradient id="g" cx="50%" cy="50%">
      <stop offset="0%" style="stop-color:#667eea;stop-opacity:1"/>
      <stop offset="100%" style="stop-color:#764ba2;stop-opacity:0.3"/>
    </radialGradient>
  </defs>
</svg>
<script>
const svg = document.querySelector('svg');
const r = 120; // outer radius of the radial dashes
let path = '';
for(let i = 0; i < 24; i += 2) { // every other 15-degree step => 12 dashes around the center (200,200)
  let a1 = i * Math.PI / 12; // start angle of the step (not used below)
  let a2 = (i + 1) * Math.PI / 12; // angle at which this dash is drawn
  let x2 = 200 + Math.cos(a2) * r; // outer endpoint of the dash
  let y2 = 200 + Math.sin(a2) * r;
  let x3 = 200 + Math.cos(a2) * (r - 90); // inner endpoint, 90px closer to the center
  let y3 = 200 + Math.sin(a2) * (r - 90);
  path += `M${x2},${y2} L${x3},${y3} `;
}
svg.innerHTML += `<path d="${path}" stroke="url(#g)" stroke-width="6" stroke-linecap="round" fill="none"/>`; // gradient-stroked dashes
svg.innerHTML += `<path d="M200,-12 L212,0 L200,12 L188,0 Z" transform="translate(0,200)" fill="#000"/>`; // small black diamond at the center
</script>
</body>
</html>
dev/repackage_data_reference.py (Normal file, 92 lines added)
@@ -0,0 +1,92 @@
"""
|
||||
Repackage the FinewebEdu-100B dataset into shards:
|
||||
|
||||
- each shard is ~100MB in size (after zstd compression)
|
||||
- parquets are written with row group size of 1000
|
||||
- shuffle the dataset
|
||||
|
||||
This will be uploaded to HuggingFace for hosting.
|
||||
The big deal is that our DataLoader will be able to stream
|
||||
the data and cache it along the way on disk, decreasing the
|
||||
training latency.
|
||||
|
||||
NOTE: This file is meant only as reference/documentation of the
|
||||
dataset preparation and it is not used during the project runtime.
|
||||
"""
|
||||
import os
import time

from datasets import load_dataset
import pyarrow.parquet as pq
import pyarrow as pa

# Source dataset
dataset_kwargs = {
    "path": "HuggingFaceFW/fineweb-edu",
    "split": "train",
    "name": "sample-100BT", # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
}
ds = load_dataset(**dataset_kwargs)

# Shuffle to scramble the order
ds = ds.shuffle(seed=42)
ndocs = len(ds) # total number of documents to process
print(f"Total number of documents: {ndocs}")

# Repackage into parquet files
output_dir = "/home/ubuntu/.cache/nanochat/base_data"
os.makedirs(output_dir, exist_ok=True)

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000 but we use a power of 2, nicer for the distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0
total_time_spent = 0
t0 = time.time()
for doc in ds:
    text = doc['text']
    shard_docs.append(text)
    shard_characters += len(text)
    collected_enough_chars = shard_characters >= chars_per_shard
    docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
    if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
        shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
        shard_table = pa.Table.from_pydict({"text": shard_docs})
        pq.write_table(
            shard_table,
            shard_path,
            row_group_size=row_group_size,
            use_dictionary=False, # dictionary encoding is usually used for categorical data
            compression="zstd", # valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}
            compression_level=3,
            write_statistics=False, # not needed for text
        )
        t1 = time.time()
        dt = t1 - t0 # for this shard alone
        t0 = t1
        total_docs_processed += len(shard_docs)
        total_time_spent += dt
        remaining_docs = ndocs - total_docs_processed
        avg_time_per_doc = total_time_spent / total_docs_processed
        remaining_time = remaining_docs * avg_time_per_doc
        remaining_time_hours = remaining_time / 3600
        print(f"Wrote {shard_path}. #documents: {len(shard_docs)} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
        shard_docs = []
        shard_characters = 0
        shard_index += 1
# note: any documents left over after the last full shard are never written out

# Demonstration of how the data was later uploaded to HuggingFace
def upload():
    from huggingface_hub import HfApi
    token = os.getenv("HF_TOKEN")
    api = HfApi(token=token)
    api.upload_large_folder(
        folder_path=output_dir,
        repo_id="karpathy/fineweb-edu-100b-shuffle",
        repo_type="dataset",
    )
# upload()
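
The docstring above notes that the point of this shard layout (zstd-compressed ~100MB parquet files with 1024-document row groups) is that a DataLoader can stream the data from HuggingFace and cache it on disk along the way. The script only covers the write/upload side, so here is a minimal sketch of the read side under those assumptions. It is an illustration, not the project's actual DataLoader: iter_documents, num_shards and cache_dir are hypothetical names introduced for this example; only the repo id and the shard filename pattern come from the script above.

# Sketch only (not nanochat's DataLoader): stream shards from the HF dataset repo,
# let huggingface_hub cache each file on disk, and yield documents one row group
# (1024 documents) at a time to keep memory bounded.
from huggingface_hub import hf_hub_download
import pyarrow.parquet as pq

def iter_documents(num_shards=1, cache_dir=None): # num_shards is a placeholder; the full dataset has many more shards
    for shard_index in range(num_shards):
        # hf_hub_download returns a local path and caches the file, so repeated
        # passes over the data hit the disk cache instead of the network
        shard_path = hf_hub_download(
            repo_id="karpathy/fineweb-edu-100b-shuffle",
            filename=f"shard_{shard_index:05d}.parquet",
            repo_type="dataset",
            cache_dir=cache_dir,
        )
        pf = pq.ParquetFile(shard_path)
        for rg in range(pf.num_row_groups):
            table = pf.read_row_group(rg, columns=["text"])
            for text in table.column("text").to_pylist():
                yield text

# usage: peek at the first streamed document
# for text in iter_documents(num_shards=1):
#     print(text[:200])
#     break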