export: safetensors + config.json for xserv qwen3
New bin export_safetensors: load an xtrain checkpoint, map every param to its HF Qwen3 tensor name, transpose 2D projection weights [in,out]->[out,in] (1D norms + the [vocab,dim] embed are kept; lm_head is stored [dim,vocab] and is transposed like the other projections), cast to BF16 (xserv's qwen3 forward is BF16-only), and write config.json + model.safetensors + a copy of the gpt2 tokenizer.json. Sized exactly like bin/train.rs. safetensors 0.5 to match xserv. GPU body gated behind not(no_cuda). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
12
Cargo.lock
generated
12
Cargo.lock
generated
@@ -109,6 +109,16 @@ version = "0.8.11"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
|
checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "safetensors"
|
||||||
|
version = "0.5.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cc0cdb7198d738a111f6df8fef42cb175412c311d0c4ac9126ff4e550ad1a0e8"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde"
|
name = "serde"
|
||||||
version = "1.0.228"
|
version = "1.0.228"
|
||||||
@@ -249,6 +259,8 @@ dependencies = [
|
|||||||
name = "xtrain-train"
|
name = "xtrain-train"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"half",
|
||||||
|
"safetensors",
|
||||||
"xserv-tokenizer",
|
"xserv-tokenizer",
|
||||||
"xtrain-autodiff",
|
"xtrain-autodiff",
|
||||||
"xtrain-cuda",
|
"xtrain-cuda",
|
||||||
|
|||||||
@@ -14,7 +14,14 @@ xtrain-cuda = { path = "../xtrain-cuda" }
|
|||||||
# crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads
|
# crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads
|
||||||
# the target package's workspace, not ours.
|
# the target package's workspace, not ours.
|
||||||
xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" }
|
xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" }
|
||||||
|
# T9 export to xserv: HF Qwen3 safetensors + BF16 weight cast.
|
||||||
|
half.workspace = true
|
||||||
|
safetensors = "0.5"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "train"
|
name = "train"
|
||||||
path = "src/bin/train.rs"
|
path = "src/bin/train.rs"
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "export_safetensors"
|
||||||
|
path = "src/bin/export_safetensors.rs"
|
||||||
|
|||||||
258
crates/xtrain-train/src/bin/export_safetensors.rs
Normal file
258
crates/xtrain-train/src/bin/export_safetensors.rs
Normal file
@@ -0,0 +1,258 @@
|
|||||||
|
//! Phase T9 — export a trained xtrain checkpoint into the format xserv loads:
|
||||||
|
//! an HF Qwen3-style `config.json` + `model.safetensors` (+ a copy of the GPT-2
|
||||||
|
//! `tokenizer.json`), so xserv's `Qwen3` loader can serve the same weights.
|
||||||
|
//!
|
||||||
|
//! xtrain's `TinyTransformer` is (after T9) architecturally a tiny Qwen3:
|
||||||
|
//! RoPE (rotate_half, pos=row) + RMSNorm + per-head QK-norm + SwiGLU + separate
|
||||||
|
//! lm_head, MHA (n_kv_heads = n_heads). The only deltas to xserv are mechanical:
|
||||||
|
//! - tensor NAMES → HF Qwen3 names (`model.layers.{i}.self_attn.q_proj.weight` …)
|
||||||
|
//! - 2D proj LAYOUT → xtrain stores `[in,out]` (computes `x@W`); xserv/HF want
|
||||||
|
//! `[out,in]` (computes `x@Wᵀ`) → transpose every 2D projection weight.
|
||||||
|
//! 1D norms and the `[vocab,dim]` embedding rows are unchanged; lm_head is stored `[dim,vocab]` and is transposed like the other projections.
|
||||||
|
//! - DTYPE → xserv's Qwen3 forward is BF16-only, so weights are written as BF16.
|
||||||
|
//!
|
||||||
|
//! See `docs/08-export-xserv.md` for the full architecture diff + mapping table.
|
||||||
|
//!
|
||||||
|
//! Run on dash5 (needs a GPU to materialise the checkpoint params):
|
||||||
|
//! export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
|
||||||
|
//! cargo run -p xtrain-train --release --bin export_safetensors -- \
|
||||||
|
//! /tmp/xtrain_tinystories.ckpt \
|
||||||
|
//! /opt/wjh/models/gpt2/tokenizer.json \
|
||||||
|
//! /tmp/xtrain_export
|
||||||
|
|
||||||
|
#[cfg(no_cuda)]
fn main() {
    // Stub entry point for builds configured with `no_cuda`: the real exporter
    // is compiled out, so just tell the user where to run it instead.
    let notice = "export_safetensors: built without CUDA (no_cuda); run on a GPU host (dash5).";
    eprintln!("{notice}");
}
|
||||||
|
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
use half::bf16;
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
use xtrain_autodiff::tape::Var;
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
use xtrain_cuda::device;
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
use xtrain_model::{Config, TinyTransformer, param_to_host};
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
use xtrain_tensor::Device;
|
||||||
|
|
||||||
|
// Deterministic pseudo-random initialiser (same scheme as bin/train.rs). The
// values only give a freshly-built model the right shapes; `load_into` then
// overwrites them with the checkpoint contents.
//
// Returns `n` values derived from a 64-bit LCG seeded from `seed`. Each draw
// takes the top 31 bits of the state, maps them onto the unit interval, then
// recentres and scales them so the result lies in roughly `[-scale, scale]`.
#[cfg(not(no_cuda))]
fn fill(n: usize, seed: u64, scale: f32) -> Vec<f32> {
    // Seed scrambling step, applied once.
    const SEED_MUL: u64 = 2862933555777941757;
    const SEED_ADD: u64 = 3037000493;
    // Per-draw LCG step.
    const STEP_MUL: u64 = 6364136223846793005;
    const STEP_ADD: u64 = 1442695040888963407;

    let mut lcg = seed.wrapping_mul(SEED_MUL).wrapping_add(SEED_ADD);
    let mut values = Vec::with_capacity(n);
    for _ in 0..n {
        lcg = lcg.wrapping_mul(STEP_MUL).wrapping_add(STEP_ADD);
        // Top 31 bits of the state, normalised by 2^31.
        let unit = (lcg >> 33) as f32 / (1u64 << 31) as f32;
        values.push((unit - 0.5) * 2.0 * scale);
    }
    values
}
|
||||||
|
|
||||||
|
/// A param ready to serialize: HF name + the (possibly transposed) row-major
|
||||||
|
/// data + its shape. Stored as BF16 (xserv's Qwen3 forward is BF16-only).
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
struct Export {
|
||||||
|
name: String,
|
||||||
|
data: Vec<bf16>,
|
||||||
|
shape: Vec<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// 1D norm / embedding row-table: keep layout, just cast to BF16.
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
fn keep(name: &str, v: &Var) -> Export {
|
||||||
|
let host = param_to_host(v);
|
||||||
|
let shape = v.value().shape().to_vec();
|
||||||
|
Export {
|
||||||
|
name: name.to_string(),
|
||||||
|
data: host.iter().map(|&x| bf16::from_f32(x)).collect(),
|
||||||
|
shape,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// 2D projection weight: xtrain `[in,out]` (x@W) → HF `[out,in]` (x@Wᵀ). Transpose
|
||||||
|
/// the row-major matrix and cast to BF16.
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
fn transpose(name: &str, v: &Var) -> Export {
|
||||||
|
let host = param_to_host(v);
|
||||||
|
let shape = v.value().shape().to_vec();
|
||||||
|
assert_eq!(shape.len(), 2, "transpose expects a 2D weight: {name}");
|
||||||
|
let (rows, cols) = (shape[0], shape[1]); // [in, out]
|
||||||
|
let mut out = vec![bf16::ZERO; rows * cols];
|
||||||
|
for r in 0..rows {
|
||||||
|
for c in 0..cols {
|
||||||
|
// out[c, r] = in[r, c]
|
||||||
|
out[c * rows + r] = bf16::from_f32(host[r * cols + c]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Export {
|
||||||
|
name: name.to_string(),
|
||||||
|
data: out,
|
||||||
|
shape: vec![cols, rows], // [out, in]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Assemble every export tensor in HF Qwen3 naming, reading the xtrain params in
|
||||||
|
/// their stable `params()` order:
|
||||||
|
/// embed → per block [attn_norm, wq, wk, wv, q_norm, k_norm, wo, ffn_norm,
|
||||||
|
/// w_gate, w_up, w_down] → final_norm → lm_head
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
fn build_exports(model: &TinyTransformer) -> Vec<Export> {
|
||||||
|
let cfg = model.config();
|
||||||
|
let p = model.params();
|
||||||
|
let mut it = p.iter();
|
||||||
|
let mut next = || it.next().expect("params() ran short");
|
||||||
|
|
||||||
|
let mut ex = Vec::new();
|
||||||
|
ex.push(keep("model.embed_tokens.weight", next())); // [vocab, dim]
|
||||||
|
for l in 0..cfg.n_layers {
|
||||||
|
let b = format!("model.layers.{l}");
|
||||||
|
ex.push(keep(&format!("{b}.input_layernorm.weight"), next()));
|
||||||
|
ex.push(transpose(&format!("{b}.self_attn.q_proj.weight"), next()));
|
||||||
|
ex.push(transpose(&format!("{b}.self_attn.k_proj.weight"), next()));
|
||||||
|
ex.push(transpose(&format!("{b}.self_attn.v_proj.weight"), next()));
|
||||||
|
ex.push(keep(&format!("{b}.self_attn.q_norm.weight"), next()));
|
||||||
|
ex.push(keep(&format!("{b}.self_attn.k_norm.weight"), next()));
|
||||||
|
ex.push(transpose(&format!("{b}.self_attn.o_proj.weight"), next()));
|
||||||
|
ex.push(keep(
|
||||||
|
&format!("{b}.post_attention_layernorm.weight"),
|
||||||
|
next(),
|
||||||
|
));
|
||||||
|
ex.push(transpose(&format!("{b}.mlp.gate_proj.weight"), next()));
|
||||||
|
ex.push(transpose(&format!("{b}.mlp.up_proj.weight"), next()));
|
||||||
|
ex.push(transpose(&format!("{b}.mlp.down_proj.weight"), next()));
|
||||||
|
}
|
||||||
|
ex.push(keep("model.norm.weight", next())); // [dim]
|
||||||
|
ex.push(transpose("lm_head.weight", next())); // [dim,vocab] → [vocab,dim]
|
||||||
|
assert!(it.next().is_none(), "params() had extra tensors");
|
||||||
|
ex
|
||||||
|
}
|
||||||
|
|
||||||
|
/// config.json matching xserv's `ModelConfig` for a Qwen3 with xtrain's dims and
|
||||||
|
/// reconciled fields (eps, rope theta, head_dim, n_kv_heads = n_heads, untied).
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
fn config_json(cfg: &Config) -> String {
|
||||||
|
format!(
|
||||||
|
r#"{{
|
||||||
|
"architectures": ["Qwen3ForCausalLM"],
|
||||||
|
"model_type": "qwen3",
|
||||||
|
"vocab_size": {vocab},
|
||||||
|
"hidden_size": {dim},
|
||||||
|
"intermediate_size": {ffn},
|
||||||
|
"num_hidden_layers": {layers},
|
||||||
|
"num_attention_heads": {heads},
|
||||||
|
"num_key_value_heads": {kv_heads},
|
||||||
|
"head_dim": {head_dim},
|
||||||
|
"max_position_embeddings": 2048,
|
||||||
|
"rms_norm_eps": {eps},
|
||||||
|
"rope_theta": {theta},
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"attention_bias": false,
|
||||||
|
"hidden_act": "silu"
|
||||||
|
}}
|
||||||
|
"#,
|
||||||
|
vocab = cfg.vocab,
|
||||||
|
dim = cfg.dim,
|
||||||
|
ffn = cfg.ffn_hidden,
|
||||||
|
layers = cfg.n_layers,
|
||||||
|
heads = cfg.n_heads,
|
||||||
|
kv_heads = cfg.n_heads, // xtrain is MHA → kv heads == query heads
|
||||||
|
head_dim = cfg.head_dim,
|
||||||
|
eps = cfg.eps,
|
||||||
|
theta = cfg.rope_theta,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(no_cuda))]
|
||||||
|
fn main() {
|
||||||
|
use safetensors::tensor::{Dtype, TensorView};
|
||||||
|
use xserv_tokenizer::Tokenizer;
|
||||||
|
|
||||||
|
let args: Vec<String> = std::env::args().collect();
|
||||||
|
let ckpt = args
|
||||||
|
.get(1)
|
||||||
|
.map(PathBuf::from)
|
||||||
|
.unwrap_or_else(|| PathBuf::from("/tmp/xtrain_tinystories.ckpt"));
|
||||||
|
let tok_path = args
|
||||||
|
.get(2)
|
||||||
|
.map(PathBuf::from)
|
||||||
|
.unwrap_or_else(|| PathBuf::from("/opt/wjh/models/gpt2/tokenizer.json"));
|
||||||
|
let out_dir = args
|
||||||
|
.get(3)
|
||||||
|
.map(PathBuf::from)
|
||||||
|
.unwrap_or_else(|| PathBuf::from("/tmp/xtrain_export"));
|
||||||
|
|
||||||
|
assert!(device::device_count().unwrap() > 0, "no CUDA device");
|
||||||
|
device::set_device(0).unwrap();
|
||||||
|
let dev = Device::Cuda(0);
|
||||||
|
|
||||||
|
// Size the model exactly like bin/train.rs: gpt2 vocab + n_layers = 4.
|
||||||
|
let tok = Tokenizer::from_file(&tok_path);
|
||||||
|
let vocab = tok.vocab_size();
|
||||||
|
let mut cfg = Config::tiny();
|
||||||
|
cfg.vocab = vocab;
|
||||||
|
cfg.n_layers = 4;
|
||||||
|
println!(
|
||||||
|
"export: ckpt {} → {} (vocab {}, dim {}, layers {}, heads {}, head_dim {})",
|
||||||
|
ckpt.display(),
|
||||||
|
out_dir.display(),
|
||||||
|
cfg.vocab,
|
||||||
|
cfg.dim,
|
||||||
|
cfg.n_layers,
|
||||||
|
cfg.n_heads,
|
||||||
|
cfg.head_dim,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut seed = 1u64;
|
||||||
|
let model = TinyTransformer::new(cfg, dev, |shape| {
|
||||||
|
seed = seed.wrapping_add(1);
|
||||||
|
let n: usize = shape.iter().product();
|
||||||
|
if shape.len() == 1 {
|
||||||
|
fill(n, seed, 0.02).iter().map(|v| v + 1.0).collect()
|
||||||
|
} else {
|
||||||
|
fill(n, seed, 0.04)
|
||||||
|
}
|
||||||
|
});
|
||||||
|
xtrain_train::checkpoint::load_into(&ckpt, &model.params()).expect("load checkpoint");
|
||||||
|
|
||||||
|
let exports = build_exports(&model);
|
||||||
|
println!("export: {} tensors", exports.len());
|
||||||
|
|
||||||
|
// Serialize to safetensors. Each TensorView borrows the raw BF16 bytes.
|
||||||
|
let views: Vec<(String, TensorView)> = exports
|
||||||
|
.iter()
|
||||||
|
.map(|e| {
|
||||||
|
let bytes = unsafe {
|
||||||
|
std::slice::from_raw_parts(e.data.as_ptr() as *const u8, e.data.len() * 2)
|
||||||
|
};
|
||||||
|
let view = TensorView::new(Dtype::BF16, e.shape.clone(), bytes)
|
||||||
|
.unwrap_or_else(|err| panic!("bad tensor view {}: {err}", e.name));
|
||||||
|
(e.name.clone(), view)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
std::fs::create_dir_all(&out_dir).expect("mkdir out_dir");
|
||||||
|
let st = safetensors::tensor::serialize(views.iter().map(|(n, v)| (n.as_str(), v)), &None)
|
||||||
|
.expect("serialize safetensors");
|
||||||
|
std::fs::write(out_dir.join("model.safetensors"), st).expect("write model.safetensors");
|
||||||
|
std::fs::write(out_dir.join("config.json"), config_json(&cfg)).expect("write config.json");
|
||||||
|
copy_tokenizer(&tok_path, &out_dir);
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"export: wrote config.json + model.safetensors + tokenizer.json to {}",
|
||||||
|
out_dir.display()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Copy the tokenizer file at `tok_path` into `out_dir` as `tokenizer.json`,
/// so it sits beside the exported weights for xserv to load from the model dir.
///
/// # Panics
/// Panics if the copy fails.
#[cfg(not(no_cuda))]
fn copy_tokenizer(tok_path: &Path, out_dir: &Path) {
    let dest = out_dir.join("tokenizer.json");
    std::fs::copy(tok_path, dest).expect("copy tokenizer.json");
}
|
||||||
Reference in New Issue
Block a user