#!/usr/bin/env python3
"""FineWeb-edu parquet -> plain UTF-8 .txt for xtrain's Corpus loader (Scaling v6).

v0-v5 trained on TinyStories; v6 broadens the data to FineWeb-edu
(HuggingFaceFW/fineweb-edu, the `sample/10BT` subset) to isolate the
data-source variable while keeping the v4/v5 architecture fixed.

xtrain's `Corpus::load` (crates/xtrain-train/src/data.rs) reads a UTF-8 text
file and tokenizes it with the gpt2 BPE; it treats `<|endoftext|>` as the
document boundary (the gpt2 tokenizer emits id 50256 for it). This script
extracts the `text` column from one or more FineWeb-edu parquet shards and
writes each document followed by `<|endoftext|>`, producing exactly that
format. Reads row-group by row-group so memory stays bounded regardless of
shard size.

Usage (on dash5, in data/):
    python3 scripts/fineweb_to_txt.py OUT.txt SHARD1.parquet [SHARD2.parquet ...]

The corpus / .txt / .u16.bin caches are multi-GB and live on dash5 only
(gitignored); only this script is committed.
"""

import os
import sys
from typing import Iterator, Sequence

import pyarrow.parquet as pq

# Document separator written after every document.
# NOTE(review): a document whose own text contains this literal string would
# presumably be seen as several documents by the loader; that case is not
# detected or escaped here -- confirm whether it matters for the corpus.
EOS = "<|endoftext|>"


def _refuse_to_clobber_input(out_path: str, shards: Sequence[str]) -> None:
    """Exit with an error if opening *out_path* for writing would destroy input data.

    The output file is opened in "w" mode, which truncates it immediately.
    Two mistakes would otherwise silently wipe a shard:

    * the output path is also listed as (or resolves to) an input shard;
    * OUT.txt was forgotten, so the first shard is taken as the output path
      (this still satisfies the "at least two arguments" usage check).
    """
    out_real = os.path.realpath(out_path)
    for shard in shards:
        if os.path.realpath(shard) == out_real:
            sys.exit(
                f"error: output path {out_path!r} is also an input shard; "
                "refusing to overwrite it"
            )
    if out_path.lower().endswith(".parquet"):
        sys.exit(
            f"error: output path {out_path!r} looks like a parquet shard "
            "(was OUT.txt omitted?); refusing to overwrite it"
        )


def _iter_texts(parquet_file: pq.ParquetFile) -> Iterator[str]:
    """Yield every non-null `text` value of *parquet_file*, one row group at a time.

    Only a single row group's `text` column is read per step, so the amount
    of data held at once does not grow with the size of the shard.
    """
    for rg in range(parquet_file.num_row_groups):
        column = parquet_file.read_row_group(rg, columns=["text"]).column("text")
        for scalar in column:
            text = scalar.as_py()
            if text is not None:  # null rows carry no document
                yield text


def main() -> None:
    """Convert the parquet shards named on the command line into one text file.

    Reads `sys.argv` as `OUT.txt SHARD.parquet [SHARD.parquet ...]`, writes
    each document's text followed by `EOS` to the output file, and prints
    per-shard progress plus a final summary to stdout.

    Exits via `sys.exit` with a message when fewer than two arguments are
    given or when the output path would overwrite an input shard.
    """
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} OUT.txt SHARD.parquet [SHARD.parquet ...]")
    out_path = sys.argv[1]
    shards = sys.argv[2:]
    _refuse_to_clobber_input(out_path, shards)

    docs = 0
    chars = 0
    # newline="\n" disables newline translation so document text is written
    # unchanged on every platform (the default would emit os.linesep).
    with open(out_path, "w", encoding="utf-8", newline="\n") as out:
        for shard in shards:
            pf = pq.ParquetFile(shard)
            n_rg = pf.num_row_groups
            print(f"{shard}: {pf.metadata.num_rows} rows, {n_rg} row groups", flush=True)
            for text in _iter_texts(pf):
                out.write(text)
                out.write(EOS)
                docs += 1
                chars += len(text)
            print(f" done {shard}: cumulative {docs} docs", flush=True)
    print(
        f"wrote {out_path}: {docs} docs, {chars} text chars "
        f"(+ {docs} x '{EOS}' separators)",
        flush=True,
    )


if __name__ == "__main__":
    main()