data: gpt2 bpe via xserv-tokenizer + TinyStories corpus + lr schedule + grad clip

New xtrain-train crate scaffold. Data pipeline reuses xserv's from-scratch GPT-2/Qwen BPE via a path-dep (../../../xserv/crates/xserv-tokenizer, resolves on both ~/projects and dash5 /opt/wjh/projects): Corpus::load tokenizes the corpus into one id stream and samples fixed-length (input, target) next-token windows (LCG-seeded, reproducible). Trims a range-downloaded file to whole stories (<|endoftext|> boundaries). Also the host-only training math: LrSchedule (linear warmup + cosine decay) and global L2 grad-norm + clip scale, each with a local unit test. Corpus: data/tinystories-valid-3mb.txt — first ~3MB of TinyStories-valid (fetched on dash5 via hf-mirror.com; HF direct unreachable). Substitution noted: a real TinyStories subset, not the full set. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 16:29:32 +08:00
parent f22429f5b8
commit 7d84a64f5c
9 changed files with 23007 additions and 0 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,15 @@
 # It is not intended for manual editing.
 version = 4

+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.64"
@@ -41,6 +50,18 @@ dependencies = [
 "zerocopy",
 ]

+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "memchr"
+version = "2.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.106"
@@ -59,6 +80,78 @@ dependencies = [
 "proc-macro2",
 ]

+[[package]]
+name = "regex"
+version = "1.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.150"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
 [[package]]
 name = "shlex"
 version = "2.0.1"
@@ -88,6 +181,15 @@ version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"

+[[package]]
+name = "xserv-tokenizer"
+version = "0.1.0"
+dependencies = [
+ "regex",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "xtrain-autodiff"
 version = "0.1.0"
@@ -131,6 +233,18 @@ dependencies = [
 "xtrain-cuda",
 ]

+[[package]]
+name = "xtrain-train"
+version = "0.1.0"
+dependencies = [
+ "xserv-tokenizer",
+ "xtrain-autodiff",
+ "xtrain-cuda",
+ "xtrain-model",
+ "xtrain-optim",
+ "xtrain-tensor",
+]
+
 [[package]]
 name = "zerocopy"
 version = "0.8.52"
@@ -150,3 +264,9 @@ dependencies = [
 "quote",
 "syn",
 ]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ members = [
    "crates/xtrain-autodiff",
    "crates/xtrain-model",
    "crates/xtrain-optim",
+    "crates/xtrain-train",
 ]

 [workspace.package]
--- a/crates/xtrain-train/Cargo.toml
+++ b/crates/xtrain-train/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "xtrain-train"
+version.workspace = true
+edition.workspace = true
+
+[dependencies]
+xtrain-tensor = { path = "../xtrain-tensor" }
+xtrain-autodiff = { path = "../xtrain-autodiff" }
+xtrain-model = { path = "../xtrain-model" }
+xtrain-optim = { path = "../xtrain-optim" }
+xtrain-cuda = { path = "../xtrain-cuda" }
+# Reuse xserv's from-scratch GPT-2/Qwen BPE (project decision). This relative
+# path resolves on both ~/projects (local) and /opt/wjh/projects (dash5). The
+# crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads
+# the target package's workspace, not ours.
+xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" }
--- a/crates/xtrain-train/build.rs
+++ b/crates/xtrain-train/build.rs
@@ -0,0 +1,26 @@
+use std::env;
+use std::path::Path;
+use std::process::Command;
+
+// Per-crate convention: the training loop / sampler / checkpoint all drive GPU
+// ops through the model + tensor layers, so the bulk of this crate is gated
+// behind `not(no_cuda)`. The LR schedule and the grad-clip *math* are host-only
+// and always compile. cfg does not propagate across crates, so re-detect nvcc.
+fn main() {
+    // Register the custom cfg so rustc's check-cfg lint accepts `no_cuda`.
+    println!("cargo:rustc-check-cfg=cfg(no_cuda)");
+
+    // Toolkit root: CUDA_HOME, then CUDA_PATH, then the conventional default.
+    let cuda_home = env::var("CUDA_HOME").or_else(|_| env::var("CUDA_PATH"));
+    let cuda_path = cuda_home.unwrap_or_else(|_| "/usr/local/cuda".to_string());
+    if !nvcc_available(&cuda_path) {
+        println!("cargo:rustc-cfg=no_cuda");
+    }
+}
+
+/// True if an `nvcc` binary can be launched by name (exit status is ignored),
+/// or else if `{cuda_path}/bin/nvcc` exists.
+fn nvcc_available(cuda_path: &str) -> bool {
+    let launched = Command::new("nvcc").arg("--version").output().is_ok();
+    launched || Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
+}
--- a/crates/xtrain-train/src/clip.rs
+++ b/crates/xtrain-train/src/clip.rs
@@ -0,0 +1,93 @@
+//! Global-norm gradient clipping. The norm is computed across *all* parameter
+//! gradients jointly (the same as `torch.nn.utils.clip_grad_norm_`): if the total
+//! L2 norm exceeds `max_norm`, every gradient is scaled by `max_norm / total`.
+//!
+//! The norm math is host-only and testable; [`clip_grad_norm`] wraps it over the
+//! parameter `Var`s (GPU round-trip), gated behind `not(no_cuda)`.
+
+/// Joint L2 norm of every value in `grads`, as if all buffers were one flat
+/// vector. Squares are summed in f64; only the final root is narrowed to f32.
+pub fn global_l2_norm(grads: &[Vec<f32>]) -> f32 {
+    // Left-to-right fold starting from 0.0, one f64 accumulator for all buffers.
+    let sum_of_squares = grads
+        .iter()
+        .flatten()
+        .fold(0.0f64, |acc, &v| acc + f64::from(v) * f64::from(v));
+    sum_of_squares.sqrt() as f32
+}
+
+/// Multiplier that rescales gradients so their joint norm becomes `max_norm`.
+/// Returns exactly 1.0 unless `total_norm` is both above `max_norm` and
+/// strictly positive.
+pub fn clip_scale(total_norm: f32, max_norm: f32) -> f32 {
+    // Both comparisons are false for a NaN norm, which therefore yields 1.0.
+    let exceeds = total_norm > max_norm && total_norm > 0.0;
+    if exceeds { max_norm / total_norm } else { 1.0 }
+}
+
+#[cfg(not(no_cuda))]
+mod gpu {
+    use super::{clip_scale, global_l2_norm};
+    use xtrain_autodiff::tape::Var;
+    use xtrain_tensor::{Device, Tensor};
+
+    /// Multiply every parameter's `.grad()` by `pre_scale` (use `1/batch` to
+    /// turn accumulated summed grads into a batch mean; `1.0` for no-op), then
+    /// clip to a joint global L2 norm of `max_norm`, writing the final grads
+    /// back via the tape's grad slot. Returns the post-pre_scale total norm (the
+    /// value compared against `max_norm`, handy for logging). Parameters without
+    /// a grad contribute 0; an empty `params` slice returns 0.0.
+    pub fn clip_grad_norm(params: &[Var], max_norm: f32, pre_scale: f32) -> f32 {
+        let Some(first) = params.first() else {
+            return 0.0; // no parameters → nothing to measure or rescale
+        };
+        let device = first.value().device();
+        let grads: Vec<Option<Vec<f32>>> = params
+            .iter()
+            .map(|p| {
+                p.grad()
+                    .map(|g| g.to_device(Device::Cpu).as_slice::<f32>().to_vec())
+            })
+            .collect();
+        // Norm is measured on the pre_scale-applied grads (what the optimizer sees).
+        let scaled_present: Vec<Vec<f32>> = grads
+            .iter()
+            .flatten()
+            .map(|g| g.iter().map(|x| x * pre_scale).collect())
+            .collect();
+        let total = global_l2_norm(&scaled_present);
+        let factor = pre_scale * clip_scale(total, max_norm);
+        if (factor - 1.0).abs() < f32::EPSILON {
+            return total; // pre_scale==1 and under threshold → grads untouched
+        }
+        for (p, g) in params.iter().zip(&grads) {
+            if let Some(g) = g {
+                let shape = p.grad().unwrap().shape().to_vec();
+                let scaled: Vec<f32> = g.iter().map(|x| x * factor).collect();
+                let t = Tensor::from_slice(&scaled, &shape).to_device(device);
+                p.zero_grad();
+                Var::push_grad(p, t);
+            }
+        }
+        total
+    }
+}
+
+#[cfg(not(no_cuda))]
+pub use gpu::clip_grad_norm;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn norm_and_scale() {
+        let close = |got: f32, want: f32| (got - want).abs() < 1e-6;
+        // A 3-4-5 triangle split across two buffers → joint norm 5.
+        let split = vec![vec![3.0f32], vec![4.0]];
+        assert!(close(global_l2_norm(&split), 5.0));
+        // Norm 5 against a limit of 2.5 → scale 0.5; limit 10 → untouched.
+        assert!(close(clip_scale(5.0, 2.5), 0.5));
+        assert!(close(clip_scale(5.0, 10.0), 1.0));
+    }
+}
--- a/crates/xtrain-train/src/data.rs
+++ b/crates/xtrain-train/src/data.rs
@@ -0,0 +1,75 @@
+//! Data pipeline: load the GPT-2 BPE (reusing xserv's from-scratch tokenizer),
+//! tokenize a text corpus into one flat token stream, and sample fixed-length
+//! `(input, target)` windows for next-token prediction. Host-only (no GPU).
+
+use std::path::Path;
+use xserv_tokenizer::Tokenizer;
+
+/// A tokenized corpus: one flat stream of token ids, plus the vocab size.
+pub struct Corpus {
+    pub tokens: Vec<i32>,
+    pub vocab_size: usize,
+}
+
+impl Corpus {
+    /// Load `tokenizer.json` (GPT-2 BPE) and tokenize the UTF-8 text at
+    /// `corpus_path` into a single id stream. TinyStories separates stories with
+    /// `<|endoftext|>`; the GPT-2 tokenizer emits that as a single special token,
+    /// so document boundaries are preserved in the stream.
+    pub fn load(tokenizer_path: &Path, corpus_path: &Path) -> Self {
+        let tok = Tokenizer::from_file(tokenizer_path);
+        let text = std::fs::read_to_string(corpus_path)
+            .unwrap_or_else(|e| panic!("failed to read corpus {}: {e}", corpus_path.display()));
+        // The range-fetched corpus may start/end mid-story; drop the leading
+        // partial line and everything after the last complete story.
+        let text = trim_to_whole_stories(&text);
+        let ids: Vec<i32> = tok.encode(text).into_iter().map(|t| t as i32).collect();
+        Self {
+            tokens: ids,
+            vocab_size: tok.vocab_size(),
+        }
+    }
+
+    /// Total number of tokens.
+    pub fn len(&self) -> usize {
+        self.tokens.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.tokens.is_empty()
+    }
+
+    /// Sample one `(input, target)` pair of length `seq` for next-token
+    /// prediction: a window `[s, s+seq+1)` → input `[s, s+seq)`, target shifted
+    /// by one. `rng_state` is advanced (a tiny LCG, so sampling is reproducible
+    /// from a seed without pulling in an RNG crate). Panics if the corpus holds
+    /// fewer than `seq + 1` tokens, i.e. not even one full window.
+    pub fn sample(&self, seq: usize, rng_state: &mut u64) -> (Vec<i32>, Vec<i32>) {
+        assert!(self.tokens.len() > seq, "corpus shorter than a window");
+        let max_start = self.tokens.len() - seq - 1;
+        let start = (next_rand(rng_state) % (max_start as u64 + 1)) as usize;
+        let window = &self.tokens[start..start + seq + 1];
+        (window[..seq].to_vec(), window[1..].to_vec())
+    }
+}
+
+/// Trim a byte-range download: skip through the first newline (if any), then
+/// cut right after the last `<|endoftext|>` marker. With no marker left, the
+/// text after that first newline is returned as-is.
+fn trim_to_whole_stories(text: &str) -> &str {
+    const MARKER: &str = "<|endoftext|>";
+    let body = text.split_once('\n').map_or(text, |(_, rest)| rest);
+    let Some(marker_at) = body.rfind(MARKER) else {
+        return body;
+    };
+    &body[..marker_at + MARKER.len()]
+}
+
+/// One step of a 64-bit LCG (wrapping arithmetic): updates `state` in place and
+/// returns the new state shifted right by 16 bits; reproducible from one seed.
+fn next_rand(state: &mut u64) -> u64 {
+    const MULTIPLIER: u64 = 6364136223846793005;
+    const INCREMENT: u64 = 1442695040888963407;
+    *state = state.wrapping_mul(MULTIPLIER).wrapping_add(INCREMENT);
+    *state >> 16
+}
--- a/crates/xtrain-train/src/lib.rs
+++ b/crates/xtrain-train/src/lib.rs
@@ -0,0 +1,12 @@
+//! Training stack (Phase T6): LR schedule, global-norm grad clipping, checkpoint
+//! save/load, the GPT-2 BPE data pipeline (reusing xserv's tokenizer), an
+//! autoregressive sampler, and the training loop that wires them onto the T5
+//! `TinyTransformer` + the hand-written AdamW (`xtrain-optim`).
+//!
+//! Host-only pieces (LR schedule, grad-norm math, data pipeline) always compile so the crate
+//! `cargo check`s on a GPU-less host; everything that touches GPU tensors is
+//! gated behind `not(no_cuda)`.
+
+pub mod clip;
+pub mod data;
+pub mod schedule;
--- a/crates/xtrain-train/src/schedule.rs
+++ b/crates/xtrain-train/src/schedule.rs
@@ -0,0 +1,58 @@
+//! Learning-rate schedule: linear warmup → cosine decay to a floor. Host-only
+//! and pure (just arithmetic over the step index), so it unit-tests locally.
+
+/// Learning-rate schedule: linear warmup followed by cosine decay to a floor.
+///
+/// - steps `0..warmup`: linear ramp `max_lr / warmup → max_lr`
+/// - steps `warmup..total`: cosine decay `max_lr → min_lr`
+/// - steps `>= total`: clamped at `min_lr`
+#[derive(Clone, Copy, Debug)]
+pub struct LrSchedule {
+    pub max_lr: f32,
+    pub min_lr: f32,
+    pub warmup: usize,
+    pub total: usize,
+}
+
+impl LrSchedule {
+    /// Learning rate at the 0-indexed `step`.
+    pub fn lr(&self, step: usize) -> f32 {
+        if step < self.warmup {
+            // `step + 1`: step 0 is already nonzero, step `warmup - 1` is `max_lr`.
+            self.max_lr * (step + 1) as f32 / self.warmup.max(1) as f32
+        } else if step >= self.total {
+            self.min_lr
+        } else {
+            let t = (step - self.warmup) as f32 / (self.total - self.warmup).max(1) as f32;
+            let decay = 0.5 * (1.0 + (std::f32::consts::PI * t).cos()); // 1 → 0
+            self.min_lr + (self.max_lr - self.min_lr) * decay
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn warmup_then_cosine_shape() {
+        let sched = LrSchedule {
+            max_lr: 1.0,
+            min_lr: 0.1,
+            warmup: 10,
+            total: 100,
+        };
+        // Warmup: strictly increasing, hitting the peak on its last step.
+        assert!(sched.lr(0) < sched.lr(5));
+        assert!(sched.lr(5) < sched.lr(9));
+        assert!((sched.lr(9) - 1.0).abs() < 1e-6);
+        // The first decay step is still near the peak.
+        assert!(sched.lr(10) > 0.95);
+        // Step 55 is halfway through the decay → about halfway peak-to-floor.
+        assert!((sched.lr(55) - (0.1 + 0.9 * 0.5)).abs() < 0.02);
+        // Late steps sit below the midpoint and end clamped at the floor.
+        assert!(sched.lr(99) < sched.lr(55));
+        assert!(sched.lr(100) <= sched.min_lr + 1e-6);
+        assert!(sched.lr(1_000) <= sched.min_lr + 1e-6);
+    }
+}
--- a/data/tinystories-valid-3mb.txt
+++ b/data/tinystories-valid-3mb.txt