initial commit
dev/generate_logo.html (Normal file, 29 lines added)
@@ -0,0 +1,29 @@
<!DOCTYPE html>
<html>
<body style="margin:0; display:flex; justify-content:center; align-items:center; height:100vh; background:#fff">
<svg width="400" height="400" xmlns="http://www.w3.org/2000/svg">
  <defs>
    <radialGradient id="g" cx="50%" cy="50%">
      <stop offset="0%" style="stop-color:#667eea;stop-opacity:1"/>
      <stop offset="100%" style="stop-color:#764ba2;stop-opacity:0.3"/>
    </radialGradient>
  </defs>
</svg>
<script>
const svg = document.querySelector('svg');
const r = 120; // outer radius of the radial dashes
let path = '';
for(let i = 0; i < 24; i += 2) { // every other 15-degree step => 12 dashes around the center (200,200)
  let a1 = i * Math.PI / 12; // start angle of the step (not used below)
  let a2 = (i + 1) * Math.PI / 12; // angle at which this dash is drawn
  let x2 = 200 + Math.cos(a2) * r; // outer endpoint of the dash
  let y2 = 200 + Math.sin(a2) * r;
  let x3 = 200 + Math.cos(a2) * (r - 90); // inner endpoint, 90px closer to the center
  let y3 = 200 + Math.sin(a2) * (r - 90);
  path += `M${x2},${y2} L${x3},${y3} `;
}
svg.innerHTML += `<path d="${path}" stroke="url(#g)" stroke-width="6" stroke-linecap="round" fill="none"/>`; // gradient-stroked dashes
svg.innerHTML += `<path d="M200,-12 L212,0 L200,12 L188,0 Z" transform="translate(0,200)" fill="#000"/>`; // small black diamond at the center
</script>
</body>
</html>
dev/repackage_data_reference.py (Normal file, 92 lines added)
@@ -0,0 +1,92 @@
"""
|
||||
Repackage the FinewebEdu-100B dataset into shards:
|
||||
|
||||
- each shard is ~100MB in size (after zstd compression)
|
||||
- parquets are written with row group size of 1000
|
||||
- shuffle the dataset
|
||||
|
||||
This will be uploaded to HuggingFace for hosting.
|
||||
The big deal is that our DataLoader will be able to stream
|
||||
the data and cache it along the way on disk, decreasing the
|
||||
training latency.
|
||||
|
||||
NOTE: This file is meant only as reference/documentation of the
|
||||
dataset preparation and it is not used during the project runtime.
|
||||
"""
|
||||
import os
import time

from datasets import load_dataset
import pyarrow.parquet as pq
import pyarrow as pa

# Source dataset
dataset_kwargs = {
    "path": "HuggingFaceFW/fineweb-edu",
    "split": "train",
    "name": "sample-100BT", # ~100B GPT-2 tokens at ~3 chars/token => ~300B chars total
}
ds = load_dataset(**dataset_kwargs)

# Shuffle to scramble the order
ds = ds.shuffle(seed=42)
ndocs = len(ds) # total number of documents to process
print(f"Total number of documents: {ndocs}")

# Repackage into parquet files
output_dir = "/home/ubuntu/.cache/nanochat/base_data"
os.makedirs(output_dir, exist_ok=True)

# Write to parquet files
chars_per_shard = 250_000_000
row_group_size = 1024 # HF uses 1000 but we use a power of 2, nicer for the distributed data loader later
shard_docs = []
shard_index = 0
shard_characters = 0
total_docs_processed = 0
total_time_spent = 0
t0 = time.time()
for doc in ds:
    text = doc['text']
    shard_docs.append(text)
    shard_characters += len(text)
    collected_enough_chars = shard_characters >= chars_per_shard
    docs_multiple_of_row_group_size = len(shard_docs) % row_group_size == 0
    if collected_enough_chars and docs_multiple_of_row_group_size: # leads to ~100MB of text (compressed)
        shard_path = os.path.join(output_dir, f"shard_{shard_index:05d}.parquet")
        shard_table = pa.Table.from_pydict({"text": shard_docs})
        pq.write_table(
            shard_table,
            shard_path,
            row_group_size=row_group_size,
            use_dictionary=False, # dictionary encoding is usually used for categorical data
            compression="zstd", # valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}
            compression_level=3,
            write_statistics=False, # not needed for text
        )
        t1 = time.time()
        dt = t1 - t0 # for this shard alone
        t0 = t1
        total_docs_processed += len(shard_docs)
        total_time_spent += dt
        remaining_docs = ndocs - total_docs_processed
        avg_time_per_doc = total_time_spent / total_docs_processed
        remaining_time = remaining_docs * avg_time_per_doc
        remaining_time_hours = remaining_time / 3600
        print(f"Wrote {shard_path}. #documents: {len(shard_docs)} | #characters: {shard_characters} | time: {dt:.2f}s | remaining time: {remaining_time_hours:.2f}h")
        shard_docs = []
        shard_characters = 0
        shard_index += 1
# note: any documents left over after the last full shard are never written out

# Demonstration of how the data was later uploaded to HuggingFace
def upload():
    from huggingface_hub import HfApi
    token = os.getenv("HF_TOKEN")
    api = HfApi(token=token)
    api.upload_large_folder(
        folder_path=output_dir,
        repo_id="karpathy/fineweb-edu-100b-shuffle",
        repo_type="dataset",
    )
# upload()
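
The docstring above notes that the point of this shard layout (zstd-compressed ~100MB parquet files with 1024-document row groups) is that a DataLoader can stream the data from HuggingFace and cache it on disk along the way. The script only covers the write/upload side, so here is a minimal sketch of the read side under those assumptions. It is an illustration, not the project's actual DataLoader: iter_documents, num_shards and cache_dir are hypothetical names introduced for this example; only the repo id and the shard filename pattern come from the script above.

# Sketch only (not nanochat's DataLoader): stream shards from the HF dataset repo,
# let huggingface_hub cache each file on disk, and yield documents one row group
# (1024 documents) at a time to keep memory bounded.
from huggingface_hub import hf_hub_download
import pyarrow.parquet as pq

def iter_documents(num_shards=1, cache_dir=None): # num_shards is a placeholder; the full dataset has many more shards
    for shard_index in range(num_shards):
        # hf_hub_download returns a local path and caches the file, so repeated
        # passes over the data hit the disk cache instead of the network
        shard_path = hf_hub_download(
            repo_id="karpathy/fineweb-edu-100b-shuffle",
            filename=f"shard_{shard_index:05d}.parquet",
            repo_type="dataset",
            cache_dir=cache_dir,
        )
        pf = pq.ParquetFile(shard_path)
        for rg in range(pf.num_row_groups):
            table = pf.read_row_group(rg, columns=["text"])
            for text in table.column("text").to_pylist():
                yield text

# usage: peek at the first streamed document
# for text in iter_documents(num_shards=1):
#     print(text[:200])
#     break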