diff --git a/.gitignore b/.gitignore index 6518953..0944657 100644 --- a/.gitignore +++ b/.gitignore @@ -13,5 +13,7 @@ # Large scaling-run corpora + tokenized id caches live on dash5 only, never in # git (the small data/tinystories-valid-3mb.txt is committed as a fixture). /data/tinystories-train.txt +/data/fineweb-edu.txt +/data/*.parquet *.u16.bin *.ckpt diff --git a/scripts/fineweb_to_txt.py b/scripts/fineweb_to_txt.py new file mode 100644 index 0000000..ef36fb6 --- /dev/null +++ b/scripts/fineweb_to_txt.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +"""FineWeb-edu parquet -> plain UTF-8 .txt for xtrain's Corpus loader (Scaling v6). + +v0-v5 trained on TinyStories; v6 broadens the data to FineWeb-edu +(HuggingFaceFW/fineweb-edu, the `sample/10BT` subset) to isolate the +data-source variable while keeping the v4/v5 architecture fixed. + +xtrain's `Corpus::load` (crates/xtrain-train/src/data.rs) reads a UTF-8 text +file and tokenizes it with the gpt2 BPE; it treats `<|endoftext|>` as the +document boundary (the gpt2 tokenizer emits id 50256 for it). This script +extracts the `text` column from one or more FineWeb-edu parquet shards and +writes each document followed by `<|endoftext|>`, producing exactly that +format. Reads row-group by row-group so memory stays bounded regardless of +shard size. + +Usage (on dash5, in data/): + python3 scripts/fineweb_to_txt.py OUT.txt SHARD1.parquet [SHARD2.parquet ...] + +The corpus / .txt / .u16.bin caches are multi-GB and live on dash5 only +(gitignored); only this script is committed. 
+"""
+
+import sys
+import pyarrow.parquet as pq
+
+EOS = "<|endoftext|>"
+
+
+def main() -> None:
+    """Concatenate the `text` column of parquet shards into one UTF-8 file.
+
+    Command-line arguments (read from `sys.argv`): the first is the output
+    path, the remaining ones are parquet shards, processed in the order
+    given. Every non-null `text` value is written verbatim and followed by
+    `EOS`; null values are skipped and not counted. Exits with a usage
+    message when fewer than two arguments are supplied.
+    """
+    if len(sys.argv) < 3:
+        sys.exit(f"usage: {sys.argv[0]} OUT.txt SHARD.parquet [SHARD.parquet ...]")
+    out_path = sys.argv[1]
+    shards = sys.argv[2:]
+
+    docs = 0
+    chars = 0
+    # newline="" disables newline translation, so each document's line
+    # endings reach the file exactly as stored in the shard on every platform
+    # (the default would rewrite "\n" as os.linesep, e.g. "\r\n" on Windows).
+    with open(out_path, "w", encoding="utf-8", newline="") as out:
+        for shard in shards:
+            pf = pq.ParquetFile(shard)
+            n_rg = pf.num_row_groups
+            print(f"{shard}: {pf.metadata.num_rows} rows, {n_rg} row groups", flush=True)
+            # One row group at a time: only that group's `text` column is
+            # held in memory, whatever the size of the shard.
+            for rg in range(n_rg):
+                col = pf.read_row_group(rg, columns=["text"]).column("text")
+                for s in col:
+                    text = s.as_py()
+                    if text is None:
+                        continue
+                    out.write(text)
+                    out.write(EOS)
+                    docs += 1
+                    chars += len(text)
+            print(f" done {shard}: cumulative {docs} docs", flush=True)
+
+    # `chars` counts document text only; the separators are reported apart.
+    print(f"wrote {out_path}: {docs} docs, {chars} text chars "
+          f"(+ {docs} x '{EOS}' separators)", flush=True)
+
+
+if __name__ == "__main__":
+    main()