diff --git a/Cargo.lock b/Cargo.lock
index e3512af..594dfba 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -109,6 +109,16 @@ version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
 
+[[package]]
+name = "safetensors"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc0cdb7198d738a111f6df8fef42cb175412c311d0c4ac9126ff4e550ad1a0e8"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "serde"
 version = "1.0.228"
@@ -249,6 +259,8 @@ dependencies = [
 name = "xtrain-train"
 version = "0.1.0"
 dependencies = [
+ "half",
+ "safetensors",
  "xserv-tokenizer",
  "xtrain-autodiff",
  "xtrain-cuda",
diff --git a/crates/xtrain-train/Cargo.toml b/crates/xtrain-train/Cargo.toml
index 07bcd42..c983f18 100644
--- a/crates/xtrain-train/Cargo.toml
+++ b/crates/xtrain-train/Cargo.toml
@@ -14,7 +14,14 @@ xtrain-cuda = { path = "../xtrain-cuda" }
 # crate inherits xserv's workspace for its own deps (serde/regex) — Cargo reads
 # the target package's workspace, not ours.
 xserv-tokenizer = { path = "../../../xserv/crates/xserv-tokenizer" }
+# T9 export to xserv: HF Qwen3 safetensors + BF16 weight cast.
+half.workspace = true
+safetensors = "0.5"
 
 [[bin]]
 name = "train"
 path = "src/bin/train.rs"
+
+[[bin]]
+name = "export_safetensors"
+path = "src/bin/export_safetensors.rs"
diff --git a/crates/xtrain-train/src/bin/export_safetensors.rs b/crates/xtrain-train/src/bin/export_safetensors.rs
new file mode 100644
index 0000000..37f3a71
--- /dev/null
+++ b/crates/xtrain-train/src/bin/export_safetensors.rs
@@ -0,0 +1,258 @@
+//! Phase T9 — export a trained xtrain checkpoint into the format xserv loads:
+//! an HF Qwen3-style `config.json` + `model.safetensors` (+ a copy of the GPT-2
+//! `tokenizer.json`), so xserv's `Qwen3` loader can serve the same weights.
+//!
+//! xtrain's `TinyTransformer` is (after T9) architecturally a tiny Qwen3:
+//! RoPE (rotate_half, pos=row) + RMSNorm + per-head QK-norm + SwiGLU + separate
+//! lm_head, MHA (n_kv_heads = n_heads). The only deltas to xserv are mechanical:
+//!   - tensor NAMES  → HF Qwen3 names (`model.layers.{i}.self_attn.q_proj.weight` …)
+//!   - 2D proj LAYOUT → xtrain stores `[in,out]` (computes `x@W`); xserv/HF want
+//!     `[out,in]` (computes `x@Wᵀ`) → transpose every 2D projection weight.
+//!     1D norms and the `[vocab,dim]` embedding are unchanged; `[dim,vocab]` lm_head is transposed.
+//!   - DTYPE → xserv's Qwen3 forward is BF16-only, so weights are written as BF16.
+//!
+//! See `docs/08-export-xserv.md` for the full architecture diff + mapping table.
+//!
+//! Run on dash5 (needs a GPU to materialise the checkpoint params):
+//!   export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
+//!   cargo run -p xtrain-train --release --bin export_safetensors -- \
+//!       /tmp/xtrain_tinystories.ckpt \
+//!       /opt/wjh/models/gpt2/tokenizer.json \
+//!       /tmp/xtrain_export
+
+#[cfg(no_cuda)] // stub for builds without CUDA: only reports that a GPU host is needed
+fn main() {
+    eprintln!("export_safetensors: built without CUDA (no_cuda); run on a GPU host (dash5).");
+}
+
+#[cfg(not(no_cuda))]
+use std::path::{Path, PathBuf};
+
+#[cfg(not(no_cuda))]
+use half::bf16;
+#[cfg(not(no_cuda))]
+use xtrain_autodiff::tape::Var;
+#[cfg(not(no_cuda))]
+use xtrain_cuda::device;
+#[cfg(not(no_cuda))]
+use xtrain_model::{Config, TinyTransformer, param_to_host};
+#[cfg(not(no_cuda))]
+use xtrain_tensor::Device;
+
+// Deterministic pseudo-random filler, the same init scheme as bin/train.rs: it gives a
+// fresh model correctly-shaped tensors whose values `load_into` then overwrites.
+#[cfg(not(no_cuda))]
+fn fill(n: usize, seed: u64, scale: f32) -> Vec<f32> {
+    const MUL: u64 = 6364136223846793005;
+    const INC: u64 = 1442695040888963407;
+    let mut rng = seed.wrapping_mul(2862933555777941757).wrapping_add(3037000493);
+    let mut values = Vec::with_capacity(n);
+    for _ in 0..n {
+        rng = rng.wrapping_mul(MUL).wrapping_add(INC);
+        // Top 31 bits scaled by 2^-31, then centred on zero and stretched by 2 * scale.
+        let unit = (rng >> 33) as f32 / (1u64 << 31) as f32;
+        values.push((unit - 0.5) * 2.0 * scale);
+    }
+    values
+}
+
+/// One tensor ready to serialize: its HF name, the row-major (possibly transposed)
+/// values, and their shape. Values are BF16 (xserv's Qwen3 forward is BF16-only).
+#[cfg(not(no_cuda))]
+struct Export {
+    name: String, // tensor name used as the key in the safetensors file
+    data: Vec<bf16>, // row-major elements, already cast to BF16
+    shape: Vec<usize>, // dimensions handed to the `TensorView` for `data`
+}
+
+/// Layout-preserving export for 1D norms and the embedding row-table: only cast to BF16.
+#[cfg(not(no_cuda))]
+fn keep(name: &str, v: &Var) -> Export {
+    let values = param_to_host(v);
+    let data: Vec<bf16> = values.iter().copied().map(bf16::from_f32).collect();
+    Export {
+        name: name.to_owned(),
+        data,
+        shape: v.value().shape().to_vec(),
+    }
+}
+
+/// Export a 2D projection weight in HF layout: xtrain keeps `[in,out]` (x@W), HF wants
+/// `[out,in]` (x@Wᵀ), so the row-major matrix is transposed and cast to BF16.
+#[cfg(not(no_cuda))]
+fn transpose(name: &str, v: &Var) -> Export {
+    let src = param_to_host(v);
+    let dims = v.value().shape().to_vec();
+    assert_eq!(dims.len(), 2, "transpose expects a 2D weight: {name}");
+    let (n_in, n_out) = (dims[0], dims[1]);
+    // Produce the destination in its own row-major order: dst[o, i] = src[i, o].
+    let mut dst = Vec::with_capacity(n_in * n_out);
+    for o in 0..n_out {
+        for i in 0..n_in {
+            dst.push(bf16::from_f32(src[i * n_out + o]));
+        }
+    }
+    Export {
+        name: name.to_owned(),
+        data: dst,
+        shape: vec![n_out, n_in],
+    }
+}
+
+/// Build every tensor to export under its HF Qwen3 name by walking the xtrain params
+/// in the order `params()` yields them:
+///   embed → per block [attn_norm, wq, wk, wv, q_norm, k_norm, wo, ffn_norm,
+///                      w_gate, w_up, w_down] → final_norm → lm_head
+#[cfg(not(no_cuda))]
+fn build_exports(model: &TinyTransformer) -> Vec<Export> {
+    let n_layers = model.config().n_layers;
+    let params = model.params();
+    let mut remaining = params.iter();
+    let mut take = || remaining.next().expect("params() ran short");
+
+    let mut out = vec![keep("model.embed_tokens.weight", take())]; // [vocab, dim]
+    for layer in 0..n_layers {
+        let pre = format!("model.layers.{layer}");
+        // Array elements are evaluated left to right, so each `take()` sees the next param.
+        out.extend([
+            keep(&format!("{pre}.input_layernorm.weight"), take()),
+            transpose(&format!("{pre}.self_attn.q_proj.weight"), take()),
+            transpose(&format!("{pre}.self_attn.k_proj.weight"), take()),
+            transpose(&format!("{pre}.self_attn.v_proj.weight"), take()),
+            keep(&format!("{pre}.self_attn.q_norm.weight"), take()),
+            keep(&format!("{pre}.self_attn.k_norm.weight"), take()),
+            transpose(&format!("{pre}.self_attn.o_proj.weight"), take()),
+            keep(&format!("{pre}.post_attention_layernorm.weight"), take()),
+            transpose(&format!("{pre}.mlp.gate_proj.weight"), take()),
+            transpose(&format!("{pre}.mlp.up_proj.weight"), take()),
+            transpose(&format!("{pre}.mlp.down_proj.weight"), take()),
+        ]);
+    }
+    out.push(keep("model.norm.weight", take())); // [dim]
+    out.push(transpose("lm_head.weight", take())); // [dim,vocab] → [vocab,dim]
+    // Every param must have been consumed by the walk above.
+    assert!(remaining.next().is_none(), "params() had extra tensors");
+    out
+}
+
+/// Render `config.json` in the shape xserv's `ModelConfig` reads for a Qwen3: xtrain's
+/// dims plus the reconciled fields (eps, rope theta, head_dim, kv heads = heads, untied).
+#[cfg(not(no_cuda))]
+fn config_json(cfg: &Config) -> String {
+    // Borrow each field under the placeholder name the template below captures.
+    let (vocab, dim, ffn) = (&cfg.vocab, &cfg.dim, &cfg.ffn_hidden);
+    let (layers, head_dim) = (&cfg.n_layers, &cfg.head_dim);
+    let (eps, theta) = (&cfg.eps, &cfg.rope_theta);
+    // xtrain is MHA, so there are as many key/value heads as query heads.
+    let heads = &cfg.n_heads;
+    let kv_heads = heads;
+
+    // The raw string is emitted verbatim, so its whitespace and field order matter.
+    format!(
+        r#"{{
+  "architectures": ["Qwen3ForCausalLM"],
+  "model_type": "qwen3",
+  "vocab_size": {vocab},
+  "hidden_size": {dim},
+  "intermediate_size": {ffn},
+  "num_hidden_layers": {layers},
+  "num_attention_heads": {heads},
+  "num_key_value_heads": {kv_heads},
+  "head_dim": {head_dim},
+  "max_position_embeddings": 2048,
+  "rms_norm_eps": {eps},
+  "rope_theta": {theta},
+  "tie_word_embeddings": false,
+  "attention_bias": false,
+  "hidden_act": "silu"
+}}
+"#
+    )
+}
+
+#[cfg(not(no_cuda))]
+fn main() {
+    use safetensors::tensor::{Dtype, TensorView};
+    use xserv_tokenizer::Tokenizer;
+
+    // Positional args, each optional: <checkpoint> <tokenizer.json> <output dir>.
+    let args: Vec<String> = std::env::args().collect();
+    let path_arg = |idx: usize, default: &str| {
+        args.get(idx)
+            .map(PathBuf::from)
+            .unwrap_or_else(|| PathBuf::from(default))
+    };
+    let ckpt = path_arg(1, "/tmp/xtrain_tinystories.ckpt");
+    let tok_path = path_arg(2, "/opt/wjh/models/gpt2/tokenizer.json");
+    let out_dir = path_arg(3, "/tmp/xtrain_export");
+
+    assert!(device::device_count().unwrap() > 0, "no CUDA device");
+    device::set_device(0).unwrap();
+    let dev = Device::Cuda(0);
+
+    // Size the model exactly like bin/train.rs: gpt2 vocab + n_layers = 4.
+    let tok = Tokenizer::from_file(&tok_path);
+    let vocab = tok.vocab_size();
+    let mut cfg = Config::tiny();
+    cfg.vocab = vocab;
+    cfg.n_layers = 4;
+    println!(
+        "export: ckpt {} → {} (vocab {}, dim {}, layers {}, heads {}, head_dim {})",
+        ckpt.display(),
+        out_dir.display(),
+        cfg.vocab,
+        cfg.dim,
+        cfg.n_layers,
+        cfg.n_heads,
+        cfg.head_dim,
+    );
+
+    // Placeholder init: it only shapes the tensors, `load_into` then overwrites the values.
+    let mut seed = 1u64;
+    let model = TinyTransformer::new(cfg, dev, |shape| {
+        seed = seed.wrapping_add(1);
+        let n: usize = shape.iter().product();
+        if shape.len() == 1 {
+            fill(n, seed, 0.02).iter().map(|v| v + 1.0).collect()
+        } else {
+            fill(n, seed, 0.04)
+        }
+    });
+    xtrain_train::checkpoint::load_into(&ckpt, &model.params()).expect("load checkpoint");
+
+    let exports = build_exports(&model);
+    println!("export: {} tensors", exports.len());
+
+    // Serialize to safetensors. Each TensorView borrows the raw BF16 bytes.
+    let views: Vec<(String, TensorView)> = exports
+        .iter()
+        .map(|e| {
+            let len = std::mem::size_of_val(e.data.as_slice());
+            // SAFETY: pointer and byte length both describe `e.data`, which outlives the
+            // view; `u8` has alignment 1 and every byte of a `bf16` is initialised.
+            let bytes = unsafe { std::slice::from_raw_parts(e.data.as_ptr().cast::<u8>(), len) };
+            let view = TensorView::new(Dtype::BF16, e.shape.clone(), bytes)
+                .unwrap_or_else(|err| panic!("bad tensor view {}: {err}", e.name));
+            (e.name.clone(), view)
+        })
+        .collect();
+
+    // Emit the three artifacts side by side in `out_dir`.
+    std::fs::create_dir_all(&out_dir).expect("mkdir out_dir");
+    let st = safetensors::tensor::serialize(views.iter().map(|(n, v)| (n.as_str(), v)), &None)
+        .expect("serialize safetensors");
+    std::fs::write(out_dir.join("model.safetensors"), st).expect("write model.safetensors");
+    std::fs::write(out_dir.join("config.json"), config_json(&cfg)).expect("write config.json");
+    copy_tokenizer(&tok_path, &out_dir);
+
+    println!(
+        "export: wrote config.json + model.safetensors + tokenizer.json to {}",
+        out_dir.display()
+    );
+}
+
+/// Copy `tok_path` to `out_dir/tokenizer.json` (beside the weights, for xserv); panics on error.
+#[cfg(not(no_cuda))] // NOTE(review): confirm `tok_path` is never that destination (self-copy).
+fn copy_tokenizer(tok_path: &Path, out_dir: &Path) {
+    std::fs::copy(tok_path, out_dir.join("tokenizer.json")).expect("copy tokenizer.json");
+}