diff --git a/Cargo.lock b/Cargo.lock index e3512af..594dfba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,6 +109,16 @@ version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" +[[package]] +name = "safetensors" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc0cdb7198d738a111f6df8fef42cb175412c311d0c4ac9126ff4e550ad1a0e8" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "serde" version = "1.0.228" @@ -249,6 +259,8 @@ dependencies = [ name = "xtrain-train" version = "0.1.0" dependencies = [ + "half", + "safetensors", "xserv-tokenizer", "xtrain-autodiff", "xtrain-cuda", diff --git a/crates/xtrain-train/Cargo.toml b/crates/xtrain-train/Cargo.toml index 07bcd42..c983f18 100644 --- a/crates/xtrain-train/Cargo.toml +++ b/crates/xtrain-train/Cargo.toml @@ -14,7 +14,14 @@ xtrain-cuda = { path = "../xtrain-cuda" } # crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads # the target package's workspace, not ours. xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" } +# T9 export to xserv: HF Qwen3 safetensors + BF16 weight cast. +half.workspace = true +safetensors = "0.5" [[bin]] name = "train" path = "src/bin/train.rs" + +[[bin]] +name = "export_safetensors" +path = "src/bin/export_safetensors.rs" diff --git a/crates/xtrain-train/src/bin/export_safetensors.rs b/crates/xtrain-train/src/bin/export_safetensors.rs new file mode 100644 index 0000000..37f3a71 --- /dev/null +++ b/crates/xtrain-train/src/bin/export_safetensors.rs @@ -0,0 +1,258 @@ +//! Phase T9 — export a trained xtrain checkpoint into the format xserv loads: +//! an HF Qwen3-style `config.json` + `model.safetensors` (+ a copy of the GPT-2 +//! `tokenizer.json`), so xserv's `Qwen3` loader can serve the same weights. +//! +//! xtrain's `TinyTransformer` is (after T9) architecturally a tiny Qwen3: +//! 
RoPE (rotate_half, pos=row) + RMSNorm + per-head QK-norm + SwiGLU + separate +//! lm_head, MHA (n_kv_heads = n_heads). The only deltas to xserv are mechanical: +//! - tensor NAMES → HF Qwen3 names (`model.layers.{i}.self_attn.q_proj.weight` …) +//! - 2D proj LAYOUT → xtrain stores `[in,out]` (computes `x@W`); xserv/HF want +//! `[out,in]` (computes `x@Wᵀ`) → transpose every 2D projection weight. +//! 1D norms and the `[vocab,dim]` embedding/lm_head rows are unchanged. +//! - DTYPE → xserv's Qwen3 forward is BF16-only, so weights are written as BF16. +//! +//! See `docs/08-export-xserv.md` for the full architecture diff + mapping table. +//! +//! Run on dash5 (needs a GPU to materialise the checkpoint params): +//! export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH +//! cargo run -p xtrain-train --release --bin export_safetensors -- \ +//! /tmp/xtrain_tinystories.ckpt \ +//! /opt/wjh/models/gpt2/tokenizer.json \ +//! /tmp/xtrain_export + +#[cfg(no_cuda)] +fn main() { + eprintln!("export_safetensors: built without CUDA (no_cuda); run on a GPU host (dash5)."); +} + +#[cfg(not(no_cuda))] +use std::path::{Path, PathBuf}; + +#[cfg(not(no_cuda))] +use half::bf16; +#[cfg(not(no_cuda))] +use xtrain_autodiff::tape::Var; +#[cfg(not(no_cuda))] +use xtrain_cuda::device; +#[cfg(not(no_cuda))] +use xtrain_model::{Config, TinyTransformer, param_to_host}; +#[cfg(not(no_cuda))] +use xtrain_tensor::Device; + +// Same deterministic init scheme as bin/train.rs, so a freshly-built model has +// the right shapes before `load_into` overwrites the values from the checkpoint. 
+#[cfg(not(no_cuda))] +fn fill(n: usize, seed: u64, scale: f32) -> Vec { + let mut state = seed + .wrapping_mul(2862933555777941757) + .wrapping_add(3037000493); + (0..n) + .map(|_| { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + (((state >> 33) as f32 / (1u64 << 31) as f32) - 0.5) * 2.0 * scale + }) + .collect() +} + +/// A param ready to serialize: HF name + the (possibly transposed) row-major +/// data + its shape. Stored as BF16 (xserv's Qwen3 forward is BF16-only). +#[cfg(not(no_cuda))] +struct Export { + name: String, + data: Vec, + shape: Vec, +} + +/// 1D norm / embedding row-table: keep layout, just cast to BF16. +#[cfg(not(no_cuda))] +fn keep(name: &str, v: &Var) -> Export { + let host = param_to_host(v); + let shape = v.value().shape().to_vec(); + Export { + name: name.to_string(), + data: host.iter().map(|&x| bf16::from_f32(x)).collect(), + shape, + } +} + +/// 2D projection weight: xtrain `[in,out]` (x@W) → HF `[out,in]` (x@Wᵀ). Transpose +/// the row-major matrix and cast to BF16. 
+#[cfg(not(no_cuda))] +fn transpose(name: &str, v: &Var) -> Export { + let host = param_to_host(v); + let shape = v.value().shape().to_vec(); + assert_eq!(shape.len(), 2, "transpose expects a 2D weight: {name}"); + let (rows, cols) = (shape[0], shape[1]); // [in, out] + let mut out = vec![bf16::ZERO; rows * cols]; + for r in 0..rows { + for c in 0..cols { + // out[c, r] = in[r, c] + out[c * rows + r] = bf16::from_f32(host[r * cols + c]); + } + } + Export { + name: name.to_string(), + data: out, + shape: vec![cols, rows], // [out, in] + } +} + +/// Assemble every export tensor in HF Qwen3 naming, reading the xtrain params in +/// their stable `params()` order: +/// embed → per block [attn_norm, wq, wk, wv, q_norm, k_norm, wo, ffn_norm, +/// w_gate, w_up, w_down] → final_norm → lm_head +#[cfg(not(no_cuda))] +fn build_exports(model: &TinyTransformer) -> Vec { + let cfg = model.config(); + let p = model.params(); + let mut it = p.iter(); + let mut next = || it.next().expect("params() ran short"); + + let mut ex = Vec::new(); + ex.push(keep("model.embed_tokens.weight", next())); // [vocab, dim] + for l in 0..cfg.n_layers { + let b = format!("model.layers.{l}"); + ex.push(keep(&format!("{b}.input_layernorm.weight"), next())); + ex.push(transpose(&format!("{b}.self_attn.q_proj.weight"), next())); + ex.push(transpose(&format!("{b}.self_attn.k_proj.weight"), next())); + ex.push(transpose(&format!("{b}.self_attn.v_proj.weight"), next())); + ex.push(keep(&format!("{b}.self_attn.q_norm.weight"), next())); + ex.push(keep(&format!("{b}.self_attn.k_norm.weight"), next())); + ex.push(transpose(&format!("{b}.self_attn.o_proj.weight"), next())); + ex.push(keep( + &format!("{b}.post_attention_layernorm.weight"), + next(), + )); + ex.push(transpose(&format!("{b}.mlp.gate_proj.weight"), next())); + ex.push(transpose(&format!("{b}.mlp.up_proj.weight"), next())); + ex.push(transpose(&format!("{b}.mlp.down_proj.weight"), next())); + } + ex.push(keep("model.norm.weight", next())); // [dim] 
+ ex.push(transpose("lm_head.weight", next())); // [dim,vocab] → [vocab,dim] + assert!(it.next().is_none(), "params() had extra tensors"); + ex +} + +/// config.json matching xserv's `ModelConfig` for a Qwen3 with xtrain's dims and +/// reconciled fields (eps, rope theta, head_dim, n_kv_heads = n_heads, untied). +#[cfg(not(no_cuda))] +fn config_json(cfg: &Config) -> String { + format!( + r#"{{ + "architectures": ["Qwen3ForCausalLM"], + "model_type": "qwen3", + "vocab_size": {vocab}, + "hidden_size": {dim}, + "intermediate_size": {ffn}, + "num_hidden_layers": {layers}, + "num_attention_heads": {heads}, + "num_key_value_heads": {kv_heads}, + "head_dim": {head_dim}, + "max_position_embeddings": 2048, + "rms_norm_eps": {eps}, + "rope_theta": {theta}, + "tie_word_embeddings": false, + "attention_bias": false, + "hidden_act": "silu" +}} +"#, + vocab = cfg.vocab, + dim = cfg.dim, + ffn = cfg.ffn_hidden, + layers = cfg.n_layers, + heads = cfg.n_heads, + kv_heads = cfg.n_heads, // xtrain is MHA → kv heads == query heads + head_dim = cfg.head_dim, + eps = cfg.eps, + theta = cfg.rope_theta, + ) +} + +#[cfg(not(no_cuda))] +fn main() { + use safetensors::tensor::{Dtype, TensorView}; + use xserv_tokenizer::Tokenizer; + + let args: Vec = std::env::args().collect(); + let ckpt = args + .get(1) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("/tmp/xtrain_tinystories.ckpt")); + let tok_path = args + .get(2) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("/opt/wjh/models/gpt2/tokenizer.json")); + let out_dir = args + .get(3) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("/tmp/xtrain_export")); + + assert!(device::device_count().unwrap() > 0, "no CUDA device"); + device::set_device(0).unwrap(); + let dev = Device::Cuda(0); + + // Size the model exactly like bin/train.rs: gpt2 vocab + n_layers = 4. 
+ let tok = Tokenizer::from_file(&tok_path); + let vocab = tok.vocab_size(); + let mut cfg = Config::tiny(); + cfg.vocab = vocab; + cfg.n_layers = 4; + println!( + "export: ckpt {} → {} (vocab {}, dim {}, layers {}, heads {}, head_dim {})", + ckpt.display(), + out_dir.display(), + cfg.vocab, + cfg.dim, + cfg.n_layers, + cfg.n_heads, + cfg.head_dim, + ); + + let mut seed = 1u64; + let model = TinyTransformer::new(cfg, dev, |shape| { + seed = seed.wrapping_add(1); + let n: usize = shape.iter().product(); + if shape.len() == 1 { + fill(n, seed, 0.02).iter().map(|v| v + 1.0).collect() + } else { + fill(n, seed, 0.04) + } + }); + xtrain_train::checkpoint::load_into(&ckpt, &model.params()).expect("load checkpoint"); + + let exports = build_exports(&model); + println!("export: {} tensors", exports.len()); + + // Serialize to safetensors. Each TensorView borrows the raw BF16 bytes. + let views: Vec<(String, TensorView)> = exports + .iter() + .map(|e| { + let bytes = unsafe { + std::slice::from_raw_parts(e.data.as_ptr() as *const u8, e.data.len() * 2) + }; + let view = TensorView::new(Dtype::BF16, e.shape.clone(), bytes) + .unwrap_or_else(|err| panic!("bad tensor view {}: {err}", e.name)); + (e.name.clone(), view) + }) + .collect(); + + std::fs::create_dir_all(&out_dir).expect("mkdir out_dir"); + let st = safetensors::tensor::serialize(views.iter().map(|(n, v)| (n.as_str(), v)), &None) + .expect("serialize safetensors"); + std::fs::write(out_dir.join("model.safetensors"), st).expect("write model.safetensors"); + std::fs::write(out_dir.join("config.json"), config_json(&cfg)).expect("write config.json"); + copy_tokenizer(&tok_path, &out_dir); + + println!( + "export: wrote config.json + model.safetensors + tokenizer.json to {}", + out_dir.display() + ); +} + +/// Place the tokenizer beside the weights so xserv loads it from the model dir. 
///
/// Copies `tok_path` to `<out_dir>/tokenizer.json`. When both paths resolve to
/// the same file (e.g. re-exporting into a directory that already holds the
/// tokenizer being used) nothing is copied, because `std::fs::copy` truncates
/// the destination before reading and would leave an empty tokenizer behind.
///
/// # Panics
/// Panics with "copy tokenizer.json" if the copy itself fails (missing source,
/// missing/unwritable `out_dir`, …).
#[cfg(not(no_cuda))]
fn copy_tokenizer(tok_path: &Path, out_dir: &Path) {
    let dest = out_dir.join("tokenizer.json");

    // Canonicalisation fails when a path does not exist yet; in that case the
    // two cannot be the same file, and a missing source is reported by the copy.
    let already_in_place = match (tok_path.canonicalize(), dest.canonicalize()) {
        (Ok(src), Ok(dst)) => src == dst,
        _ => false,
    };
    if already_in_place {
        return;
    }

    std::fs::copy(tok_path, &dest).expect("copy tokenizer.json");
}