data: gpt2 bpe via xserv-tokenizer + TinyStories corpus + lr schedule + grad clip

New xtrain-train crate scaffold. Data pipeline reuses xserv's from-scratch
GPT-2/Qwen BPE via a path-dep (../../../xserv/crates/xserv-tokenizer, resolves
on both ~/projects and dash5 /opt/wjh/projects): Corpus::load tokenizes the
corpus into one id stream and samples fixed-length (input, target) next-token
windows (LCG-seeded, reproducible). Trims a range-downloaded file to whole
stories (<|endoftext|> boundaries).

Also the host-only training math: LrSchedule (linear warmup + cosine decay)
and global L2 grad-norm + clip scale, each with a local unit test.

Corpus: data/tinystories-valid-3mb.txt — first ~3MB of TinyStories-valid
(fetched on dash5 via hf-mirror.com; HF direct unreachable). Substitution
noted: a real TinyStories subset, not the full set.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 16:29:32 +08:00
parent f22429f5b8
commit 7d84a64f5c
9 changed files with 23007 additions and 0 deletions

120
Cargo.lock generated
View File

@@ -2,6 +2,15 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
dependencies = [
"memchr",
]
[[package]]
name = "cc"
version = "1.2.64"
@@ -41,6 +50,18 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "itoa"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
[[package]]
name = "memchr"
version = "2.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
[[package]]
name = "proc-macro2"
version = "1.0.106"
@@ -59,6 +80,78 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.150"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
dependencies = [
"itoa",
"memchr",
"serde",
"serde_core",
"zmij",
]
[[package]]
name = "shlex"
version = "2.0.1"
@@ -88,6 +181,15 @@ version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "xserv-tokenizer"
version = "0.1.0"
dependencies = [
"regex",
"serde",
"serde_json",
]
[[package]]
name = "xtrain-autodiff"
version = "0.1.0"
@@ -131,6 +233,18 @@ dependencies = [
"xtrain-cuda",
]
[[package]]
name = "xtrain-train"
version = "0.1.0"
dependencies = [
"xserv-tokenizer",
"xtrain-autodiff",
"xtrain-cuda",
"xtrain-model",
"xtrain-optim",
"xtrain-tensor",
]
[[package]]
name = "zerocopy"
version = "0.8.52"
@@ -150,3 +264,9 @@ dependencies = [
"quote",
"syn",
]
[[package]]
name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

View File

@@ -6,6 +6,7 @@ members = [
"crates/xtrain-autodiff",
"crates/xtrain-model",
"crates/xtrain-optim",
"crates/xtrain-train",
]
[workspace.package]

View File

@@ -0,0 +1,16 @@
[package]
name = "xtrain-train"
version.workspace = true
edition.workspace = true
[dependencies]
xtrain-tensor = { path = "../xtrain-tensor" }
xtrain-autodiff = { path = "../xtrain-autodiff" }
xtrain-model = { path = "../xtrain-model" }
xtrain-optim = { path = "../xtrain-optim" }
xtrain-cuda = { path = "../xtrain-cuda" }
# Reuse xserv's from-scratch GPT-2/Qwen BPE (project decision). This relative
# path resolves on both ~/projects (local) and /opt/wjh/projects (dash5). The
# crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads
# the target package's workspace, not ours.
xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" }

View File

@@ -0,0 +1,26 @@
use std::env;
use std::path::Path;
use std::process::Command;
// Per-crate convention: the training loop / sampler / checkpoint all drive GPU
// ops through the model + tensor layers, so the bulk of this crate is gated
// behind `not(no_cuda)`. The LR schedule and the grad-clip *math* are host-only
// and always compile. cfg does not propagate across crates, so re-detect nvcc.
/// Build-script entry point: declares the custom `no_cuda` cfg and turns it on
/// when no `nvcc` can be located.
///
/// The CUDA root is taken from `CUDA_HOME`, then `CUDA_PATH`, and finally
/// defaults to `/usr/local/cuda`.
fn main() {
    // Declare the cfg name up front so `cfg(no_cuda)` is a known cfg.
    println!("cargo:rustc-check-cfg=cfg(no_cuda)");

    let cuda_root = match env::var("CUDA_HOME") {
        Ok(home) => home,
        Err(_) => match env::var("CUDA_PATH") {
            Ok(path) => path,
            Err(_) => "/usr/local/cuda".to_string(),
        },
    };

    let has_nvcc = nvcc_available(&cuda_root);
    if !has_nvcc {
        println!("cargo:rustc-cfg=no_cuda");
    }
}
/// Report whether an `nvcc` compiler can be found.
///
/// Returns `true` if spawning `nvcc --version` succeeds (the exit status is
/// not inspected), or otherwise if `{cuda_path}/bin/nvcc` exists on disk.
fn nvcc_available(cuda_path: &str) -> bool {
    // `||` short-circuits, so the filesystem is only probed when the spawn fails.
    let spawned_from_path = Command::new("nvcc").arg("--version").output().is_ok();
    spawned_from_path || Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
}

View File

@@ -0,0 +1,93 @@
//! Global-norm gradient clipping. The norm is computed across *all* parameter
//! gradients jointly (the same as `torch.nn.utils.clip_grad_norm_`): if the total
//! L2 norm exceeds `max_norm`, every gradient is scaled by `max_norm / total`.
//!
//! The norm math is host-only and testable; [`clip_grad_norm`] wraps it over the
//! parameter `Var`s (GPU round-trip), gated behind `not(no_cuda)`.
/// Joint L2 norm of every value in every buffer of `grads`.
///
/// Squares are accumulated in `f64` (in buffer order) and only the final
/// square root is narrowed back to `f32`. An empty input yields `0.0`.
pub fn global_l2_norm(grads: &[Vec<f32>]) -> f32 {
    let sum_of_squares = grads.iter().flatten().fold(0.0f64, |acc, &value| {
        let wide = f64::from(value);
        acc + wide * wide
    });
    sum_of_squares.sqrt() as f32
}
/// Multiplier that brings a gradient set with norm `total_norm` down to
/// `max_norm`.
///
/// Returns `max_norm / total_norm` when `total_norm` is strictly positive and
/// strictly greater than `max_norm`; returns `1.0` in every other case
/// (including when either comparison is false because of a NaN).
pub fn clip_scale(total_norm: f32, max_norm: f32) -> f32 {
    let over_limit = total_norm > max_norm;
    let is_positive = total_norm > 0.0;
    match (over_limit, is_positive) {
        (true, true) => max_norm / total_norm,
        _ => 1.0,
    }
}
#[cfg(not(no_cuda))]
mod gpu {
    use super::{clip_scale, global_l2_norm};
    use xtrain_autodiff::tape::Var;
    use xtrain_tensor::{Device, Tensor};

    /// Scale every parameter gradient by `pre_scale`, then clip the result to a
    /// joint global L2 norm of `max_norm`.
    ///
    /// - `params`: the parameter `Var`s whose `.grad()` is read and rewritten.
    ///   Parameters whose `.grad()` is `None` are skipped and contribute 0.
    /// - `max_norm`: threshold for the joint norm of the pre-scaled gradients.
    /// - `pre_scale`: multiplier applied before clipping (e.g. `1/batch` to turn
    ///   summed grads into a mean; `1.0` for no-op).
    ///
    /// Returns the joint L2 norm measured *after* `pre_scale` and *before*
    /// clipping (the value compared against `max_norm`, handy for logging).
    /// Returns `0.0` without touching anything when `params` is empty.
    ///
    /// Each gradient is copied to the CPU, scaled there, and written back as a
    /// fresh tensor on the device of the first parameter's value.
    pub fn clip_grad_norm(params: &[Var], max_norm: f32, pre_scale: f32) -> f32 {
        // An empty parameter list has nothing to scale; indexing `params[0]`
        // would panic, so report a zero norm instead.
        let Some(first) = params.first() else {
            return 0.0;
        };
        let device = first.value().device();

        // Host copies of every gradient; `None` marks a parameter without one.
        let grads: Vec<Option<Vec<f32>>> = params
            .iter()
            .map(|p| {
                p.grad()
                    .map(|g| g.to_device(Device::Cpu).as_slice::<f32>().to_vec())
            })
            .collect();

        // Norm is measured on the (pre_scale-applied) grads — what the optimizer
        // will actually see if no clipping is needed.
        let scaled_present: Vec<Vec<f32>> = grads
            .iter()
            .flatten()
            .map(|g| g.iter().map(|x| x * pre_scale).collect())
            .collect();
        let total = global_l2_norm(&scaled_present);

        // Fold the pre-scale and the clip into one multiplier so each gradient
        // is rewritten at most once.
        let factor = pre_scale * clip_scale(total, max_norm);
        if (factor - 1.0).abs() < f32::EPSILON {
            return total; // pre_scale==1 and under threshold → grads untouched
        }

        for (p, g) in params.iter().zip(&grads) {
            if let Some(g) = g {
                let shape = p.grad().unwrap().shape().to_vec();
                let scaled: Vec<f32> = g.iter().map(|x| x * factor).collect();
                let t = Tensor::from_slice(&scaled, &shape).to_device(device);
                // Replace the stored gradient: clear it, then push the scaled
                // tensor (presumably push accumulates into the cleared slot —
                // confirm against the tape implementation).
                p.zero_grad();
                Var::push_grad(p, t);
            }
        }
        total
    }
}
#[cfg(not(no_cuda))]
pub use gpu::clip_grad_norm;
#[cfg(test)]
mod tests {
    use super::*;

    /// Host-only check of the norm math and the clip multiplier.
    #[test]
    fn norm_and_scale() {
        const TOL: f32 = 1e-6;

        // Two single-element buffers forming a 3-4-5 triangle: joint norm is 5.
        let buffers = vec![vec![3.0f32], vec![4.0]];
        let norm = global_l2_norm(&buffers);
        assert!((norm - 5.0).abs() < TOL);

        // Norm 5 clipped to 2.5 halves every gradient.
        let halved = clip_scale(5.0, 2.5);
        assert!((halved - 0.5).abs() < TOL);

        // Norm 5 with a limit of 10 is left alone.
        let untouched = clip_scale(5.0, 10.0);
        assert!((untouched - 1.0).abs() < TOL);
    }
}

View File

@@ -0,0 +1,75 @@
//! Data pipeline: load the GPT-2 BPE (reusing xserv's from-scratch tokenizer),
//! tokenize a text corpus into one flat token stream, and sample fixed-length
//! `(input, target)` windows for next-token prediction. Host-only (no GPU).
use std::path::Path;
use xserv_tokenizer::Tokenizer;
/// A tokenized corpus: one flat stream of token ids, plus the vocab size.
///
/// Derives `Debug` and `Clone` so the type can be logged and duplicated like
/// the other public types in this crate.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Token ids of the whole corpus, in text order.
    pub tokens: Vec<i32>,
    /// Vocabulary size reported by the tokenizer that produced `tokens`.
    pub vocab_size: usize,
}
impl Corpus {
    /// Load the tokenizer at `tokenizer_path` and tokenize the UTF-8 text at
    /// `corpus_path` into a single id stream.
    ///
    /// The text is first trimmed with `trim_to_whole_stories`. TinyStories
    /// separates stories with `<|endoftext|>`; the tokenizer is expected to
    /// emit that marker as a single special token, so document boundaries
    /// survive in the stream.
    ///
    /// # Panics
    ///
    /// Panics if the corpus file cannot be read as UTF-8 text.
    pub fn load(tokenizer_path: &Path, corpus_path: &Path) -> Self {
        let tok = Tokenizer::from_file(tokenizer_path);
        let text = std::fs::read_to_string(corpus_path)
            .unwrap_or_else(|e| panic!("failed to read corpus {}: {e}", corpus_path.display()));
        // The range-fetched corpus may start/end mid-story; drop a leading partial
        // line and a trailing partial story so we only train on whole stories.
        let text = trim_to_whole_stories(&text);
        let ids: Vec<i32> = tok.encode(text).into_iter().map(|t| t as i32).collect();
        Self {
            tokens: ids,
            vocab_size: tok.vocab_size(),
        }
    }

    /// Total number of tokens.
    pub fn len(&self) -> usize {
        self.tokens.len()
    }

    /// `true` when the corpus holds no tokens at all.
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }

    /// Sample one `(input, target)` pair of length `seq` for next-token
    /// prediction: a window `[s, s+seq+1)` → input `[s, s+seq)`, target shifted
    /// by one. `rng_state` is advanced (a tiny LCG, so sampling is reproducible
    /// from a seed without pulling in an RNG crate).
    ///
    /// # Panics
    ///
    /// Panics if the corpus holds fewer than `seq + 1` tokens, i.e. it cannot
    /// fit even one window. A corpus of exactly `seq + 1` tokens is accepted
    /// and always yields the window starting at 0.
    pub fn sample(&self, seq: usize, rng_state: &mut u64) -> (Vec<i32>, Vec<i32>) {
        // One window needs `seq + 1` tokens, so `len > seq` is the exact bound.
        assert!(self.tokens.len() > seq, "corpus shorter than a window");
        let max_start = self.tokens.len() - seq - 1;
        // Start offsets `0..=max_start` are all valid, hence the `+ 1`.
        let start = (next_rand(rng_state) % (max_start as u64 + 1)) as usize;
        let input = self.tokens[start..start + seq].to_vec();
        let target = self.tokens[start + 1..start + seq + 1].to_vec();
        (input, target)
    }
}
/// Cut `text` down to the span between the first newline and the last
/// `<|endoftext|>` marker.
///
/// Everything up to and including the first `\n` is discarded (nothing is
/// discarded if there is no newline). In what remains, everything after the
/// last `<|endoftext|>` is discarded, keeping the marker itself; if no marker
/// is found the remainder is returned unchanged.
fn trim_to_whole_stories(text: &str) -> &str {
    const MARKER: &str = "<|endoftext|>";

    let body = match text.split_once('\n') {
        Some((_partial_line, rest)) => rest,
        None => text,
    };
    body.rfind(MARKER)
        .map_or(body, |marker_at| &body[..marker_at + MARKER.len()])
}
/// Advance a 64-bit linear congruential generator and return a draw from it.
///
/// `state` is replaced by `state * MULTIPLIER + INCREMENT` (wrapping), and the
/// new state shifted right by 16 bits is returned. Sampling is therefore fully
/// determined by the initial `u64` seed.
fn next_rand(state: &mut u64) -> u64 {
    const MULTIPLIER: u64 = 6364136223846793005;
    const INCREMENT: u64 = 1442695040888963407;

    let advanced = state.wrapping_mul(MULTIPLIER).wrapping_add(INCREMENT);
    *state = advanced;
    advanced >> 16
}

View File

@@ -0,0 +1,12 @@
//! Training stack (Phase T6): LR schedule, global-norm grad clipping, checkpoint
//! save/load, the GPT-2 BPE data pipeline (reusing xserv's tokenizer), an
//! autoregressive sampler, and the training loop that wires them onto the T5
//! `TinyTransformer` + the hand-written AdamW (`xtrain-optim`).
//!
//! Host-only pieces (LR schedule, grad-norm math) always compile so the crate
//! `cargo check`s on a GPU-less host; everything that touches GPU tensors is
//! gated behind `not(no_cuda)`.
pub mod clip;
pub mod data;
pub mod schedule;

View File

@@ -0,0 +1,58 @@
//! Learning-rate schedule: linear warmup → cosine decay to a floor. Host-only
//! and pure (just arithmetic over the step index), so it unit-tests locally.
/// Learning-rate schedule with a linear warmup followed by a cosine decay.
///
/// - `step < warmup`: `max_lr * (step + 1) / warmup`, so the last warmup step
///   reaches `max_lr` and step 0 is already nonzero
/// - `warmup <= step < total`: half-cosine from `max_lr` down towards `min_lr`
/// - `step >= total`: exactly `min_lr`
#[derive(Clone, Copy, Debug)]
pub struct LrSchedule {
    /// Peak learning rate, reached at the end of warmup.
    pub max_lr: f32,
    /// Floor learning rate the cosine decays to.
    pub min_lr: f32,
    /// Number of warmup steps.
    pub warmup: usize,
    /// Step index at which the decay ends; from here on the LR is `min_lr`.
    pub total: usize,
}

impl LrSchedule {
    /// Learning rate for the 0-indexed `step`.
    pub fn lr(&self, step: usize) -> f32 {
        let Self {
            max_lr,
            min_lr,
            warmup,
            total,
        } = *self;

        if step < warmup {
            // Ramp position is 1-based so the very first step is not zero.
            let ramp_pos = (step + 1) as f32;
            let ramp_len = warmup.max(1) as f32;
            max_lr * ramp_pos / ramp_len
        } else if step >= total {
            min_lr
        } else {
            let elapsed = (step - warmup) as f32;
            let decay_len = (total - warmup).max(1) as f32;
            let progress = elapsed / decay_len;
            // Goes from 1 at progress 0 to 0 at progress 1.
            let cosine = 0.5 * (1.0 + (std::f32::consts::PI * progress).cos());
            min_lr + (max_lr - min_lr) * cosine
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the overall shape: rising warmup, peak, cosine midpoint, floor.
    #[test]
    fn warmup_then_cosine_shape() {
        let sched = LrSchedule {
            max_lr: 1.0,
            min_lr: 0.1,
            warmup: 10,
            total: 100,
        };
        let at = |step: usize| sched.lr(step);

        // The ramp is strictly increasing and hits the peak on its last step.
        assert!(at(0) < at(5));
        assert!(at(5) < at(9));
        assert!((at(9) - 1.0).abs() < 1e-6);

        // First decay step is still close to the peak.
        assert!(at(10) > 0.95);

        // Halfway through the decay the cosine term is about 0.5.
        let halfway = at(55);
        let expected_halfway = 0.1 + 0.9 * 0.5;
        assert!((halfway - expected_halfway).abs() < 0.02);

        // The tail keeps falling and is pinned to the floor from `total` on.
        assert!(at(99) < at(55));
        assert!(at(100) <= sched.min_lr + 1e-6);
        assert!(at(1_000) <= sched.min_lr + 1e-6);
    }
}

22606
data/tinystories-valid-3mb.txt Normal file

File diff suppressed because it is too large Load Diff