initial commit
dev/repackage_data_reference.py (new file, 92 lines)
"""
Repackage the FinewebEdu-100B dataset into shards:

- each shard is ~100MB in size (after zstd compression)
- parquets are written with a row group size of 1024
- shuffle the dataset

This will be uploaded to HuggingFace for hosting.
The big deal is that our DataLoader will be able to stream
the data and cache it along the way on disk, decreasing the
training latency.

NOTE: This file is meant only as reference/documentation of the
dataset preparation and it is not used during the project runtime.
"""
import os
import time

from datasets import load_dataset
import pyarrow.parquet as pq
import pyarrow as pa

# Source dataset
dataset_kwargs = {
    "path": "HuggingFaceFW/fineweb-edu",
    "split": "train",
    "name": "sample-100BT", # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
}
ds = load_dataset(**dataset_kwargs)

# Shuffle to scramble the order
ds = ds.shuffle(seed=42)
ndocs = len(ds) # total number of documents to process
print(f"Total number of documents: {ndocs}")

# Repackage into parquet files
output_dir = "/home/ubuntu/.cache/nanochat/base_data"
os.makedirs(output_dir, exist_ok=True)

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000, but we use a power of 2, which is nicer for the distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0
total_time_spent = 0
t0 = time.time()
for doc in ds:
    text = doc['text']
    shard_docs.append(text)
    shard_characters += len(text)
    collected_enough_chars = shard_characters >= chars_per_shard
    docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
    if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
        shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
        shard_table = pa.Table.from_pydict({"text": shard_docs})
        pq.write_table(
            shard_table,
            shard_path,
            row_group_size=row_group_size,
            use_dictionary=False, # this is usually used for categorical data
            compression="zstd", # valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}
            compression_level=3,
            write_statistics=False, # not needed for text
        )
        t1 = time.time()
        dt = t1 - t0 # for this shard alone
        t0 = t1
        total_docs_processed += len(shard_docs)
        total_time_spent += dt
        remaining_docs = ndocs - total_docs_processed
        avg_time_per_doc = total_time_spent / total_docs_processed
        remaining_time = remaining_docs * avg_time_per_doc
        remaining_time_hours = remaining_time / 3600
        print(f"Wrote {shard_path}. #documents: {len(shard_docs)} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
        shard_docs = []
        shard_characters = 0
        shard_index += 1
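
# NOTE: any documents still buffered in shard_docs when the loop ends (the final partial
# shard) are never written; only full, row-group-aligned shards are emitted.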

# Demonstration of how the data was later uploaded to HuggingFace
def upload():
    import os
    from huggingface_hub import HfApi
    token = os.getenv("HF_TOKEN")
    api = HfApi(token=token)
    api.upload_large_folder(
        folder_path=output_dir,
        repo_id="karpathy/fineweb-edu-100b-shuffle",
        repo_type="dataset",
    )
# upload()
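
# Hypothetical sketch (not part of the original preparation or upload flow above): how a
# downstream DataLoader could stream one of these shards row group by row group, which is
# the access pattern the power-of-2 row_group_size is meant to make convenient, e.g. a
# distributed loader striding over row groups by rank. Names and paths here are illustrative.
def stream_shard_example(shard_path, rank=0, world_size=1):
    pf = pq.ParquetFile(shard_path)
    for rg in range(rank, pf.num_row_groups, world_size): # stride row groups across ranks
        table = pf.read_row_group(rg, columns=["text"]) # one row group = up to 1024 documents
        yield table.column("text").to_pylist()
# for texts in stream_shard_example(os.path.join(output_dir, "shard_00000.parquet")): ...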