export: safetensors + config.json for xserv qwen3

New bin export_safetensors: load an xtrain checkpoint, map every param to its
HF Qwen3 tensor name, transpose 2D projection weights [in,out]->[out,in]
(1D norms + [vocab,dim] embed/lm_head kept), cast to BF16 (xserv's qwen3
forward is BF16-only), and write config.json + model.safetensors + a copy of
the gpt2 tokenizer.json. Sized exactly like bin/train.rs. safetensors 0.5 to
match xserv. GPU body gated behind not(no_cuda).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 17:33:26 +08:00
parent 7a4f69e430
commit 1c76573cb4
3 changed files with 277 additions and 0 deletions

12
Cargo.lock generated
View File

@@ -109,6 +109,16 @@ version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
[[package]]
name = "safetensors"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc0cdb7198d738a111f6df8fef42cb175412c311d0c4ac9126ff4e550ad1a0e8"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "serde"
version = "1.0.228"
@@ -249,6 +259,8 @@ dependencies = [
name = "xtrain-train"
version = "0.1.0"
dependencies = [
"half",
"safetensors",
"xserv-tokenizer",
"xtrain-autodiff",
"xtrain-cuda",

View File

@@ -14,7 +14,14 @@ xtrain-cuda = { path = "../xtrain-cuda" }
# crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads
# the target package's workspace, not ours.
xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" }
# T9 export to xserv: HF Qwen3 safetensors + BF16 weight cast.
half.workspace = true
safetensors = "0.5"
[[bin]]
name = "train"
path = "src/bin/train.rs"
[[bin]]
name = "export_safetensors"
path = "src/bin/export_safetensors.rs"

View File

@@ -0,0 +1,258 @@
//! Phase T9 — export a trained xtrain checkpoint into the format xserv loads:
//! an HF Qwen3-style `config.json` + `model.safetensors` (+ a copy of the GPT-2
//! `tokenizer.json`), so xserv's `Qwen3` loader can serve the same weights.
//!
//! xtrain's `TinyTransformer` is (after T9) architecturally a tiny Qwen3:
//! RoPE (rotate_half, pos=row) + RMSNorm + per-head QK-norm + SwiGLU + separate
//! lm_head, MHA (n_kv_heads = n_heads). The only deltas to xserv are mechanical:
//! - tensor NAMES → HF Qwen3 names (`model.layers.{i}.self_attn.q_proj.weight` …)
//! - 2D proj LAYOUT → xtrain stores `[in,out]` (computes `x@W`); xserv/HF want
//! `[out,in]` (computes `x@Wᵀ`) → transpose every 2D projection weight.
//! 1D norms and the `[vocab,dim]` embedding rows are unchanged; `lm_head` is
//! stored `[dim,vocab]` and is transposed like the other projections.
//! - DTYPE → xserv's Qwen3 forward is BF16-only, so weights are written as BF16.
//!
//! See `docs/08-export-xserv.md` for the full architecture diff + mapping table.
//!
//! Run on dash5 (needs a GPU to materialise the checkpoint params):
//! export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
//! cargo run -p xtrain-train --release --bin export_safetensors -- \
//! /tmp/xtrain_tinystories.ckpt \
//! /opt/wjh/models/gpt2/tokenizer.json \
//! /tmp/xtrain_export
/// Entry point for builds compiled with the `no_cuda` cfg: the exporter body is
/// compiled out, so this only reports on stderr that a GPU host is required.
#[cfg(no_cuda)]
fn main() {
    let notice = "export_safetensors: built without CUDA (no_cuda); run on a GPU host (dash5).";
    eprintln!("{notice}");
}
#[cfg(not(no_cuda))]
use std::path::{Path, PathBuf};
#[cfg(not(no_cuda))]
use half::bf16;
#[cfg(not(no_cuda))]
use xtrain_autodiff::tape::Var;
#[cfg(not(no_cuda))]
use xtrain_cuda::device;
#[cfg(not(no_cuda))]
use xtrain_model::{Config, TinyTransformer, param_to_host};
#[cfg(not(no_cuda))]
use xtrain_tensor::Device;
/// Deterministic pseudo-random fill: `n` values in `[-scale, scale]` derived
/// from `seed` by a 64-bit linear congruential generator.
///
/// Same deterministic init scheme as bin/train.rs, so a freshly-built model has
/// the right shapes before `load_into` overwrites the values from the checkpoint.
#[cfg(not(no_cuda))]
fn fill(n: usize, seed: u64, scale: f32) -> Vec<f32> {
    // Seed scrambling constants, then the per-step LCG constants.
    const SEED_MUL: u64 = 2862933555777941757;
    const SEED_ADD: u64 = 3037000493;
    const STEP_MUL: u64 = 6364136223846793005;
    const STEP_ADD: u64 = 1442695040888963407;

    let mut lcg = seed.wrapping_mul(SEED_MUL).wrapping_add(SEED_ADD);
    let mut values = Vec::with_capacity(n);
    for _ in 0..n {
        lcg = lcg.wrapping_mul(STEP_MUL).wrapping_add(STEP_ADD);
        // Top 31 bits of the state, scaled into [0, 1].
        let unit = (lcg >> 33) as f32 / (1u64 << 31) as f32;
        // Re-centre on zero and stretch to [-scale, scale].
        values.push((unit - 0.5) * 2.0 * scale);
    }
    values
}
/// A param ready to serialize: HF name + the (possibly transposed) row-major
/// data + its shape. Stored as BF16 (xserv's Qwen3 forward is BF16-only).
#[cfg(not(no_cuda))]
struct Export {
    /// Tensor name used as the key in the safetensors file (HF Qwen3 naming).
    name: String,
    /// Tensor elements, already converted from f32 to BF16, in row-major order.
    data: Vec<bf16>,
    /// Dimensions describing `data` as it is written out (i.e. after any transpose).
    shape: Vec<usize>,
}
/// 1D norm / embedding row-table: keep layout, just cast to BF16.
#[cfg(not(no_cuda))]
fn keep(name: &str, v: &Var) -> Export {
let host = param_to_host(v);
let shape = v.value().shape().to_vec();
Export {
name: name.to_string(),
data: host.iter().map(|&x| bf16::from_f32(x)).collect(),
shape,
}
}
/// 2D projection weight: xtrain `[in,out]` (x@W) → HF `[out,in]` (x@Wᵀ). Transpose
/// the row-major matrix and cast to BF16.
#[cfg(not(no_cuda))]
fn transpose(name: &str, v: &Var) -> Export {
let host = param_to_host(v);
let shape = v.value().shape().to_vec();
assert_eq!(shape.len(), 2, "transpose expects a 2D weight: {name}");
let (rows, cols) = (shape[0], shape[1]); // [in, out]
let mut out = vec![bf16::ZERO; rows * cols];
for r in 0..rows {
for c in 0..cols {
// out[c, r] = in[r, c]
out[c * rows + r] = bf16::from_f32(host[r * cols + c]);
}
}
Export {
name: name.to_string(),
data: out,
shape: vec![cols, rows], // [out, in]
}
}
/// Build the full list of export tensors under their HF Qwen3 names by walking
/// the model's `params()` in order:
/// embed → per block [attn_norm, wq, wk, wv, q_norm, k_norm, wo, ffn_norm,
/// w_gate, w_up, w_down] → final_norm → lm_head
///
/// Panics if `params()` yields fewer or more tensors than this layout expects.
#[cfg(not(no_cuda))]
fn build_exports(model: &TinyTransformer) -> Vec<Export> {
    // Per-block tensors in `params()` order: (HF name suffix, is a 2D projection
    // that must be transposed).
    const BLOCK_TENSORS: [(&str, bool); 11] = [
        ("input_layernorm.weight", false),
        ("self_attn.q_proj.weight", true),
        ("self_attn.k_proj.weight", true),
        ("self_attn.v_proj.weight", true),
        ("self_attn.q_norm.weight", false),
        ("self_attn.k_norm.weight", false),
        ("self_attn.o_proj.weight", true),
        ("post_attention_layernorm.weight", false),
        ("mlp.gate_proj.weight", true),
        ("mlp.up_proj.weight", true),
        ("mlp.down_proj.weight", true),
    ];

    let cfg = model.config();
    let params = model.params();
    let mut remaining = params.iter();
    let mut take = || remaining.next().expect("params() ran short");

    let mut exports = vec![keep("model.embed_tokens.weight", take())]; // [vocab, dim]
    for layer in 0..cfg.n_layers {
        for &(suffix, is_projection) in BLOCK_TENSORS.iter() {
            let name = format!("model.layers.{layer}.{suffix}");
            let var = take();
            exports.push(if is_projection {
                transpose(&name, var)
            } else {
                keep(&name, var)
            });
        }
    }
    exports.push(keep("model.norm.weight", take())); // [dim]
    exports.push(transpose("lm_head.weight", take())); // [dim,vocab] → [vocab,dim]

    assert!(remaining.next().is_none(), "params() had extra tensors");
    exports
}
/// Render the `config.json` text describing the exported model as a Qwen3:
/// sizes, eps and rope theta come from `cfg`; the KV head count mirrors the
/// query head count; the remaining fields (`max_position_embeddings`, untied
/// embeddings, no attention bias, `silu`) are fixed literals.
#[cfg(not(no_cuda))]
fn config_json(cfg: &Config) -> String {
    let vocab = &cfg.vocab;
    let dim = &cfg.dim;
    let ffn = &cfg.ffn_hidden;
    let layers = &cfg.n_layers;
    let heads = &cfg.n_heads;
    // xtrain is MHA → kv heads == query heads
    let kv_heads = &cfg.n_heads;
    let head_dim = &cfg.head_dim;
    let eps = &cfg.eps;
    let theta = &cfg.rope_theta;

    format!(
        r#"{{
"architectures": ["Qwen3ForCausalLM"],
"model_type": "qwen3",
"vocab_size": {vocab},
"hidden_size": {dim},
"intermediate_size": {ffn},
"num_hidden_layers": {layers},
"num_attention_heads": {heads},
"num_key_value_heads": {kv_heads},
"head_dim": {head_dim},
"max_position_embeddings": 2048,
"rms_norm_eps": {eps},
"rope_theta": {theta},
"tie_word_embeddings": false,
"attention_bias": false,
"hidden_act": "silu"
}}
"#
    )
}
#[cfg(not(no_cuda))]
fn main() {
use safetensors::tensor::{Dtype, TensorView};
use xserv_tokenizer::Tokenizer;
let args: Vec<String> = std::env::args().collect();
let ckpt = args
.get(1)
.map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("/tmp/xtrain_tinystories.ckpt"));
let tok_path = args
.get(2)
.map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("/opt/wjh/models/gpt2/tokenizer.json"));
let out_dir = args
.get(3)
.map(PathBuf::from)
.unwrap_or_else(|| PathBuf::from("/tmp/xtrain_export"));
assert!(device::device_count().unwrap() > 0, "no CUDA device");
device::set_device(0).unwrap();
let dev = Device::Cuda(0);
// Size the model exactly like bin/train.rs: gpt2 vocab + n_layers = 4.
let tok = Tokenizer::from_file(&tok_path);
let vocab = tok.vocab_size();
let mut cfg = Config::tiny();
cfg.vocab = vocab;
cfg.n_layers = 4;
println!(
"export: ckpt {}{} (vocab {}, dim {}, layers {}, heads {}, head_dim {})",
ckpt.display(),
out_dir.display(),
cfg.vocab,
cfg.dim,
cfg.n_layers,
cfg.n_heads,
cfg.head_dim,
);
let mut seed = 1u64;
let model = TinyTransformer::new(cfg, dev, |shape| {
seed = seed.wrapping_add(1);
let n: usize = shape.iter().product();
if shape.len() == 1 {
fill(n, seed, 0.02).iter().map(|v| v + 1.0).collect()
} else {
fill(n, seed, 0.04)
}
});
xtrain_train::checkpoint::load_into(&ckpt, &model.params()).expect("load checkpoint");
let exports = build_exports(&model);
println!("export: {} tensors", exports.len());
// Serialize to safetensors. Each TensorView borrows the raw BF16 bytes.
let views: Vec<(String, TensorView)> = exports
.iter()
.map(|e| {
let bytes = unsafe {
std::slice::from_raw_parts(e.data.as_ptr() as *const u8, e.data.len() * 2)
};
let view = TensorView::new(Dtype::BF16, e.shape.clone(), bytes)
.unwrap_or_else(|err| panic!("bad tensor view {}: {err}", e.name));
(e.name.clone(), view)
})
.collect();
std::fs::create_dir_all(&out_dir).expect("mkdir out_dir");
let st = safetensors::tensor::serialize(views.iter().map(|(n, v)| (n.as_str(), v)), &None)
.expect("serialize safetensors");
std::fs::write(out_dir.join("model.safetensors"), st).expect("write model.safetensors");
std::fs::write(out_dir.join("config.json"), config_json(&cfg)).expect("write config.json");
copy_tokenizer(&tok_path, &out_dir);
println!(
"export: wrote config.json + model.safetensors + tokenizer.json to {}",
out_dir.display()
);
}
/// Place the tokenizer beside the weights so xserv loads it from the model dir.
///
/// Copies `tok_path` to `<out_dir>/tokenizer.json`. If `tok_path` already *is*
/// that file (e.g. re-exporting with the tokenizer taken from a previous export
/// directory), nothing is copied: `std::fs::copy` overwrites its destination,
/// and copying a file onto itself can truncate it.
///
/// Panics if the copy fails.
#[cfg(not(no_cuda))]
fn copy_tokenizer(tok_path: &Path, out_dir: &Path) {
    let dest = out_dir.join("tokenizer.json");

    // `canonicalize` fails for a path that does not exist yet; in that case the
    // two paths cannot name the same file, so fall through to the copy.
    let already_in_place = match (tok_path.canonicalize(), dest.canonicalize()) {
        (Ok(src), Ok(dst)) => src == dst,
        _ => false,
    };
    if already_in_place {
        return;
    }

    std::fs::copy(tok_path, &dest).expect("copy tokenizer.json");
}