export: safetensors + config.json for xserv qwen3

New bin export_safetensors: load an xtrain checkpoint, map every param to its HF Qwen3 tensor name, transpose 2D projection weights [in,out]->[out,in] (1D norms + [vocab,dim] embed kept; lm_head is stored [dim,vocab] and is transposed like the other projections), cast to BF16 (xserv's qwen3 forward is BF16-only), and write config.json + model.safetensors + a copy of the gpt2 tokenizer.json. Sized exactly like bin/train.rs. safetensors 0.5 to match xserv. GPU body gated behind not(no_cuda). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 17:33:26 +08:00
parent 7a4f69e430
commit 1c76573cb4
3 changed files with 277 additions and 0 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -109,6 +109,16 @@ version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
 [[package]]
 name = "safetensors"
 version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cc0cdb7198d738a111f6df8fef42cb175412c311d0c4ac9126ff4e550ad1a0e8"
 dependencies = [
 "serde",
 "serde_json",
 ]
 [[package]]
 name = "serde"
 version = "1.0.228"
@@ -249,6 +259,8 @@ dependencies = [
 name = "xtrain-train"
 version = "0.1.0"
 dependencies = [
 "half",
 "safetensors",
 "xserv-tokenizer",
 "xtrain-autodiff",
 "xtrain-cuda",
--- a/crates/xtrain-train/Cargo.toml
+++ b/crates/xtrain-train/Cargo.toml
@@ -14,7 +14,14 @@ xtrain-cuda = { path = "../xtrain-cuda" }
 # crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads
 # the target package's workspace, not ours.
 xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" }
 # T9 export to xserv: HF Qwen3 safetensors + BF16 weight cast.
 half.workspace = true
 safetensors = "0.5"
 [[bin]]
 name = "train"
 path = "src/bin/train.rs"
 [[bin]]
 name = "export_safetensors"
 path = "src/bin/export_safetensors.rs"
--- a/crates/xtrain-train/src/bin/export_safetensors.rs
+++ b/crates/xtrain-train/src/bin/export_safetensors.rs
@@ -0,0 +1,258 @@
 //! Phase T9 — export a trained xtrain checkpoint into the format xserv loads:
 //! an HF Qwen3-style `config.json` + `model.safetensors` (+ a copy of the GPT-2
 //! `tokenizer.json`), so xserv's `Qwen3` loader can serve the same weights.
 //!
 //! xtrain's `TinyTransformer` is (after T9) architecturally a tiny Qwen3:
 //! RoPE (rotate_half, pos=row) + RMSNorm + per-head QK-norm + SwiGLU + separate
 //! lm_head, MHA (n_kv_heads = n_heads). The only deltas to xserv are mechanical:
 //!   - tensor NAMES  → HF Qwen3 names (`model.layers.{i}.self_attn.q_proj.weight` …)
 //!   - 2D proj LAYOUT → xtrain stores `[in,out]` (computes `x@W`); xserv/HF want
 //!     `[out,in]` (computes `x@Wᵀ`) → transpose every 2D projection weight.
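 //!     1D norms and the `[vocab,dim]` embedding rows are unchanged; `lm_head` is
 //!     exported through the same transpose as the other projections
 //!     (`[dim,vocab]` → `[vocab,dim]`, see `build_exports`).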
 //!   - DTYPE → xserv's Qwen3 forward is BF16-only, so weights are written as BF16.
 //!
 //! See `docs/08-export-xserv.md` for the full architecture diff + mapping table.
 //!
 //! Run on dash5 (needs a GPU to materialise the checkpoint params):
 //!   export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
 //!   cargo run -p xtrain-train --release --bin export_safetensors -- \
 //!       /tmp/xtrain_tinystories.ckpt \
 //!       /opt/wjh/models/gpt2/tokenizer.json \
 //!       /tmp/xtrain_export
 /// Stub entry point for builds configured with `no_cuda`: the exporter cannot
 /// run there, so it only prints a hint on stderr and returns.
 #[cfg(no_cuda)]
 fn main() {
    let hint = "export_safetensors: built without CUDA (no_cuda); run on a GPU host (dash5).";
    eprintln!("{hint}");
 }
 #[cfg(not(no_cuda))]
 use std::path::{Path, PathBuf};
 #[cfg(not(no_cuda))]
 use half::bf16;
 #[cfg(not(no_cuda))]
 use xtrain_autodiff::tape::Var;
 #[cfg(not(no_cuda))]
 use xtrain_cuda::device;
 #[cfg(not(no_cuda))]
 use xtrain_model::{Config, TinyTransformer, param_to_host};
 #[cfg(not(no_cuda))]
 use xtrain_tensor::Device;
 /// Deterministic pseudo-random initialiser, kept identical to the scheme in
 /// bin/train.rs so a freshly-built model has the right shapes before
 /// `load_into` overwrites the values from the checkpoint.
 ///
 /// A 64-bit LCG is seeded from `seed`; each step's top 31 bits are mapped to
 /// the unit interval, re-centred around zero and scaled, so every returned
 /// value lies within `[-scale, scale]`. Returns exactly `n` values.
 #[cfg(not(no_cuda))]
 fn fill(n: usize, seed: u64, scale: f32) -> Vec<f32> {
    // Seed scrambling constants, then the per-step LCG multiplier/increment.
    const SEED_MUL: u64 = 2862933555777941757;
    const SEED_ADD: u64 = 3037000493;
    const STEP_MUL: u64 = 6364136223846793005;
    const STEP_ADD: u64 = 1442695040888963407;

    let mut lcg = seed.wrapping_mul(SEED_MUL).wrapping_add(SEED_ADD);
    let mut values = Vec::with_capacity(n);
    for _ in 0..n {
        lcg = lcg.wrapping_mul(STEP_MUL).wrapping_add(STEP_ADD);
        // Top 31 bits of the state divided by 2^31.
        let unit = (lcg >> 33) as f32 / (1u64 << 31) as f32;
        values.push((unit - 0.5) * 2.0 * scale);
    }
    values
 }
 /// A param ready to serialize: HF name + the (possibly transposed) row-major
 /// data + its shape. Stored as BF16 (xserv's Qwen3 forward is BF16-only).
 ///
 /// Built by `keep` / `transpose`; `main` wraps each one in a safetensors
 /// `TensorView` when writing `model.safetensors`.
 #[cfg(not(no_cuda))]
 struct Export {
    /// Tensor name as written into the safetensors file (HF Qwen3 naming).
    name: String,
    /// Elements in row-major order, already cast to BF16.
    data: Vec<bf16>,
    /// Dimensions of `data` as exported (i.e. after any transpose).
    shape: Vec<usize>,
 }
 /// 1D norm / embedding row-table: keep layout, just cast to BF16.
 #[cfg(not(no_cuda))]
 fn keep(name: &str, v: &Var) -> Export {
    let host = param_to_host(v);
    let shape = v.value().shape().to_vec();
    Export {
        name: name.to_string(),
        data: host.iter().map(|&x| bf16::from_f32(x)).collect(),
        shape,
    }
 }
 /// 2D projection weight: xtrain `[in,out]` (x@W) → HF `[out,in]` (x@Wᵀ). Transpose
 /// the row-major matrix and cast to BF16.
 #[cfg(not(no_cuda))]
 fn transpose(name: &str, v: &Var) -> Export {
    let host = param_to_host(v);
    let shape = v.value().shape().to_vec();
    assert_eq!(shape.len(), 2, "transpose expects a 2D weight: {name}");
    let (rows, cols) = (shape[0], shape[1]); // [in, out]
    let mut out = vec![bf16::ZERO; rows * cols];
    for r in 0..rows {
        for c in 0..cols {
            // out[c, r] = in[r, c]
            out[c * rows + r] = bf16::from_f32(host[r * cols + c]);
        }
    }
    Export {
        name: name.to_string(),
        data: out,
        shape: vec![cols, rows], // [out, in]
    }
 }
 /// Build the full list of export tensors under HF Qwen3 names by consuming the
 /// model's `params()` strictly in order:
 ///   embed → per block [attn_norm, wq, wk, wv, q_norm, k_norm, wo, ffn_norm,
 ///                      w_gate, w_up, w_down] → final_norm → lm_head
 ///
 /// Norms and the embedding go through `keep`; every projection — including
 /// `lm_head` — goes through `transpose`. Panics if `params()` yields fewer or
 /// more tensors than this layout expects.
 #[cfg(not(no_cuda))]
 fn build_exports(model: &TinyTransformer) -> Vec<Export> {
    // Per-layer tensors in xtrain's param order: (HF suffix, needs transpose).
    const BLOCK_TENSORS: [(&str, bool); 11] = [
        ("input_layernorm.weight", false),
        ("self_attn.q_proj.weight", true),
        ("self_attn.k_proj.weight", true),
        ("self_attn.v_proj.weight", true),
        ("self_attn.q_norm.weight", false),
        ("self_attn.k_norm.weight", false),
        ("self_attn.o_proj.weight", true),
        ("post_attention_layernorm.weight", false),
        ("mlp.gate_proj.weight", true),
        ("mlp.up_proj.weight", true),
        ("mlp.down_proj.weight", true),
    ];

    let cfg = model.config();
    let params = model.params();
    let mut remaining = params.iter();
    let mut take = || remaining.next().expect("params() ran short");

    let mut exports = vec![keep("model.embed_tokens.weight", take())]; // [vocab, dim]
    for layer in 0..cfg.n_layers {
        for &(suffix, is_projection) in BLOCK_TENSORS.iter() {
            let name = format!("model.layers.{layer}.{suffix}");
            let param = take();
            exports.push(if is_projection {
                transpose(&name, param)
            } else {
                keep(&name, param)
            });
        }
    }
    exports.push(keep("model.norm.weight", take())); // [dim]
    exports.push(transpose("lm_head.weight", take())); // [dim,vocab] → [vocab,dim]
    assert!(remaining.next().is_none(), "params() had extra tensors");
    exports
 }
 /// config.json matching xserv's `ModelConfig` for a Qwen3 with xtrain's dims and
 /// reconciled fields (eps, rope theta, head_dim, n_kv_heads = n_heads, untied).
 #[cfg(not(no_cuda))]
 fn config_json(cfg: &Config) -> String {
    format!(
        r#"{{
  "architectures": ["Qwen3ForCausalLM"],
  "model_type": "qwen3",
  "vocab_size": {vocab},
  "hidden_size": {dim},
  "intermediate_size": {ffn},
  "num_hidden_layers": {layers},
  "num_attention_heads": {heads},
  "num_key_value_heads": {kv_heads},
  "head_dim": {head_dim},
  "max_position_embeddings": 2048,
  "rms_norm_eps": {eps},
  "rope_theta": {theta},
  "tie_word_embeddings": false,
  "attention_bias": false,
  "hidden_act": "silu"
 }}
 "#,
        vocab = cfg.vocab,
        dim = cfg.dim,
        ffn = cfg.ffn_hidden,
        layers = cfg.n_layers,
        heads = cfg.n_heads,
        kv_heads = cfg.n_heads, // xtrain is MHA → kv heads == query heads
        head_dim = cfg.head_dim,
        eps = cfg.eps,
        theta = cfg.rope_theta,
    )
 }
 /// Entry point (CUDA builds): load an xtrain checkpoint and write
 /// `config.json`, `model.safetensors` and `tokenizer.json` into an output
 /// directory.
 ///
 /// Positional arguments, each optional with a dash5 default:
 ///   1. checkpoint path
 ///   2. GPT-2 `tokenizer.json` path
 ///   3. output directory (created if missing)
 ///
 /// Panics with a descriptive message if no CUDA device is available, the
 /// checkpoint cannot be loaded, or any output file cannot be written.
 #[cfg(not(no_cuda))]
 fn main() {
    use safetensors::tensor::{Dtype, TensorView};
    use xserv_tokenizer::Tokenizer;

    let args: Vec<String> = std::env::args().collect();
    let arg_or = |idx: usize, default: &str| {
        args.get(idx)
            .map(PathBuf::from)
            .unwrap_or_else(|| PathBuf::from(default))
    };
    let ckpt = arg_or(1, "/tmp/xtrain_tinystories.ckpt");
    let tok_path = arg_or(2, "/opt/wjh/models/gpt2/tokenizer.json");
    let out_dir = arg_or(3, "/tmp/xtrain_export");

    let n_devices = device::device_count().expect("query CUDA device count");
    assert!(n_devices > 0, "no CUDA device");
    device::set_device(0).expect("select CUDA device 0");
    let dev = Device::Cuda(0);

    // Size the model exactly like bin/train.rs: gpt2 vocab + n_layers = 4.
    // The shapes must match the checkpoint for `load_into` to succeed.
    let tok = Tokenizer::from_file(&tok_path);
    let vocab = tok.vocab_size();
    let mut cfg = Config::tiny();
    cfg.vocab = vocab;
    cfg.n_layers = 4;
    println!(
        "export: ckpt {} → {} (vocab {}, dim {}, layers {}, heads {}, head_dim {})",
        ckpt.display(),
        out_dir.display(),
        cfg.vocab,
        cfg.dim,
        cfg.n_layers,
        cfg.n_heads,
        cfg.head_dim,
    );

    // Placeholder init (same scheme as bin/train.rs); every value is replaced
    // by the checkpoint below, only the shapes matter here.
    let mut seed = 1u64;
    let model = TinyTransformer::new(cfg, dev, |shape| {
        seed = seed.wrapping_add(1);
        let n: usize = shape.iter().product();
        if shape.len() == 1 {
            fill(n, seed, 0.02).iter().map(|v| v + 1.0).collect()
        } else {
            fill(n, seed, 0.04)
        }
    });
    xtrain_train::checkpoint::load_into(&ckpt, &model.params()).expect("load checkpoint");

    let exports = build_exports(&model);
    println!("export: {} tensors", exports.len());

    // safetensors stores tensor data little-endian. Encode each BF16 value
    // explicitly instead of reinterpreting the Vec<bf16> memory: no `unsafe`,
    // no hard-coded element size, and correct regardless of host byte order.
    let payloads: Vec<Vec<u8>> = exports
        .iter()
        .map(|e| e.data.iter().flat_map(|x| x.to_le_bytes()).collect())
        .collect();

    // Each TensorView borrows the encoded bytes of its export.
    let views: Vec<(&str, TensorView)> = exports
        .iter()
        .zip(&payloads)
        .map(|(e, bytes)| {
            let view = TensorView::new(Dtype::BF16, e.shape.clone(), bytes.as_slice())
                .unwrap_or_else(|err| panic!("bad tensor view {}: {err}", e.name));
            (e.name.as_str(), view)
        })
        .collect();

    std::fs::create_dir_all(&out_dir).expect("mkdir out_dir");
    let st = safetensors::tensor::serialize(views.iter().map(|(n, v)| (*n, v)), &None)
        .expect("serialize safetensors");
    std::fs::write(out_dir.join("model.safetensors"), st).expect("write model.safetensors");
    std::fs::write(out_dir.join("config.json"), config_json(&cfg)).expect("write config.json");
    copy_tokenizer(&tok_path, &out_dir);
    println!(
        "export: wrote config.json + model.safetensors + tokenizer.json to {}",
        out_dir.display()
    );
 }
 /// Place the tokenizer beside the weights so xserv loads it from the model dir.
 ///
 /// Copies `tok_path` to `<out_dir>/tokenizer.json`. If `tok_path` already
 /// resolves to that very file the copy is skipped: copying a file onto itself
 /// opens the destination for truncation and can leave it empty.
 ///
 /// Panics if the copy fails (e.g. missing source or unwritable `out_dir`).
 #[cfg(not(no_cuda))]
 fn copy_tokenizer(tok_path: &Path, out_dir: &Path) {
    let dest = out_dir.join("tokenizer.json");
    // Canonicalize both sides so different spellings of the same path compare
    // equal; a destination that does not exist yet cannot be the source.
    let already_in_place = match (std::fs::canonicalize(tok_path), std::fs::canonicalize(&dest)) {
        (Ok(src), Ok(dst)) => src == dst,
        _ => false,
    };
    if already_in_place {
        return;
    }
    std::fs::copy(tok_path, &dest).expect("copy tokenizer.json");
 }