run: v4 archive + export (dim768, 8-GPU DDP, val 1.17)

v4 scaling run finished: dim768/18L, core 127.43M (total 204.63M), trained
720.9M tokens (~1.54 epoch) on 8x RTX 5090 DDP fp32, ~145K tok/s, ~84 min,
best val 1.1690. Checkpoint archived to registry
(~/projects/tiny-models/v4-tinystories-dim768/) and exported to xserv HF Qwen3
safetensors (201 tensors, BF16); xserv serves it and matches xtrain greedy
token-for-token on all 3 fixed prompts (40 tok).

Add `greedy_sample` bin: load a trained ckpt with its arch flags and print
xtrain's own greedy continuations for the fixed run prompts, so they can be
diffed against xserv's greedy on the exported weights (the per-run token-match
check). Same model/config/init scheme as bin/train.rs + bin/export_safetensors.rs.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 13:14:28 +08:00
parent f85bd4d276
commit 734e119db3

View File

@@ -0,0 +1,113 @@
//! Greedy-generation helper for run verification — load a trained checkpoint with
//! its arch flags and print xtrain's OWN greedy continuations for the fixed run
//! prompts, so they can be diffed against xserv's greedy on the exported weights
//! (the token-match check each scaling run reports). f32 forward, same
//! model/config/ckpt + init scheme as bin/train.rs and bin/export_safetensors.rs.
//!
//! export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
//! cargo run -p xtrain-train --release --bin greedy_sample -- \
//! /tmp/xtrain_v4.ckpt /opt/wjh/models/gpt2/tokenizer.json \
//! --heads 24 --head-dim 32 --layers 18 --ffn 2048
#[cfg(no_cuda)]
fn main() {
eprintln!("greedy_sample: built without CUDA (no_cuda); run on a GPU host (dash5).");
}
#[cfg(not(no_cuda))]
use std::path::PathBuf;
#[cfg(not(no_cuda))]
use xtrain_cuda::device;
#[cfg(not(no_cuda))]
use xtrain_model::{Config, TinyTransformer};
#[cfg(not(no_cuda))]
use xtrain_tensor::Device;
#[cfg(not(no_cuda))]
use xtrain_train::sample::generate;
// Same deterministic LCG init scheme as bin/train.rs / bin/export_safetensors.rs.
#[cfg(not(no_cuda))]
fn fill(n: usize, seed: u64, scale: f32) -> Vec<f32> {
let mut state = seed
.wrapping_mul(2862933555777941757)
.wrapping_add(3037000493);
(0..n)
.map(|_| {
state = state
.wrapping_mul(6364136223846793005)
.wrapping_add(1442695040888963407);
(((state >> 33) as f32 / (1u64 << 31) as f32) - 0.5) * 2.0 * scale
})
.collect()
}
// A flag like `--layers 18`: scan argv for `name`, parse the following token.
#[cfg(not(no_cuda))]
fn flag<T: std::str::FromStr>(args: &[String], name: &str, default: T) -> T {
args.iter()
.position(|a| a == name)
.and_then(|i| args.get(i + 1))
.and_then(|s| s.parse().ok())
.unwrap_or(default)
}
#[cfg(not(no_cuda))]
fn main() {
use xserv_tokenizer::Tokenizer;
let args: Vec<String> = std::env::args().collect();
let positionals: Vec<&String> = args[1..].iter().filter(|a| !a.starts_with("--")).collect();
let ckpt = positionals
.first()
.map(|s| PathBuf::from(s.as_str()))
.unwrap_or_else(|| PathBuf::from("/tmp/xtrain_tinystories.ckpt"));
let tok_path = positionals
.get(1)
.map(|s| PathBuf::from(s.as_str()))
.unwrap_or_else(|| PathBuf::from("/opt/wjh/models/gpt2/tokenizer.json"));
// Architecture must match the checkpoint. Defaults = v0-baseline tiny config.
let n_heads = flag(&args, "--heads", 2usize);
let head_dim = flag(&args, "--head-dim", 16usize);
let n_layers = flag(&args, "--layers", 4usize);
let ffn = flag(&args, "--ffn", 64usize);
let max_new = flag(&args, "--max-tokens", 40usize);
assert!(device::device_count().unwrap() > 0, "no CUDA device");
device::set_device(0).unwrap();
let device = Device::Cuda(0);
let tok = Tokenizer::from_file(&tok_path);
let cfg = Config::from_arch(tok.vocab_size(), n_heads, head_dim, n_layers, ffn);
println!(
"greedy_sample: ckpt {} (vocab {}, dim {}, layers {}, heads {}, head_dim {})",
ckpt.display(),
cfg.vocab,
cfg.dim,
cfg.n_layers,
cfg.n_heads,
cfg.head_dim,
);
let mut seed = 1u64;
let model = TinyTransformer::new(cfg, device, |shape| {
seed = seed.wrapping_add(1);
let n: usize = shape.iter().product();
if shape.len() == 1 {
fill(n, seed, 0.02).iter().map(|v| v + 1.0).collect()
} else {
fill(n, seed, 0.04)
}
});
xtrain_train::checkpoint::load_into(&ckpt, &model.params()).expect("load checkpoint");
let prompts = ["Once upon a time", "One day", "The little"];
for p in prompts {
let ids: Vec<i32> = tok.encode(p).into_iter().map(|t| t as i32).collect();
let mut rng = 7u64;
let out = generate(&model, device, &ids, max_new, 0.0, &mut rng);
let text = tok.decode(&out.iter().map(|&t| t as u32).collect::<Vec<_>>());
println!("[{p}] → {text}");
}
}