phase 10: add Qwen3-8B benchmark + performance fix

Benchmark infrastructure:
- bench-qwen3 binary: 50 prompts × 20 tokens with KV cache
- bench_compare_qwen3.py: comparison against HF transformers (BF16)

Performance fix:
- Precompute transposed weights at model load time (eliminates the per-token weight-transpose CPU round-trip: previously 252 transposes × 32MB each ≈ 8GB/token)
- Result: from "infinite" (>10 min/token) to 144ms/token

Results (50 prompts):
- Prefill top-1: 42/50 (84%), top-5: 50/50 (100%) vs HF transformers
- Greedy sequence: 0/50 exact match (BF16 precision drift over 36 layers)
- Performance: TTFT=138ms, TBT=144ms, 6.9 tok/s (HF: 21ms, 45.6 tok/s)
- All outputs are coherent English/Chinese

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 10:25:33 +08:00
parent 246ae1c590
commit 268e40d764
4 changed files with 389 additions and 30 deletions
--- a/crates/xserv-model/src/bin/bench-qwen3.rs
+++ b/crates/xserv-model/src/bin/bench-qwen3.rs
@@ -0,0 +1,160 @@
+use std::path::PathBuf;
+use std::time::Instant;
+use xserv_model::qwen3::sample_greedy;
+use xserv_model::{loader, KVCache, ModelConfig, Qwen3};
+use xserv_tensor::{DType, Device};
+use xserv_tokenizer::Tokenizer;
+
+/// Benchmark driver for Qwen3 greedy decoding.
+///
+/// Usage: `bench-qwen3 <model-dir> [--gen-tokens N]` (N defaults to 20 when the flag is
+/// absent or its value does not parse).
+///
+/// For every built-in prompt a fresh KV cache is created, the prompt is prefilled, and
+/// tokens are generated greedily until `N` tokens exist or the tokenizer's EOS id is
+/// produced during decode. The token sampled from the prefill logits is always kept, so
+/// at least one token is generated per prompt.
+///
+/// Output:
+/// - stdout: a JSON array with one object per prompt
+///   (`prompt`, `input_len`, `num_generated`, `generated_ids`, `generated_text`,
+///   `ttft_us`, `tbt_us`, `tpot_us`).
+/// - stderr: loading/progress messages and a one-line summary per prompt.
+fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    if args.len() < 2 {
+        eprintln!("Usage: bench-qwen3 <model-dir> [--gen-tokens N]");
+        std::process::exit(1);
+    }
+    let model_dir = PathBuf::from(&args[1]);
+    let gen_tokens: usize = args
+        .iter()
+        .position(|a| a == "--gen-tokens")
+        .and_then(|i| args.get(i + 1))
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(20);
+
+    xserv_cuda::device::set_device(0).unwrap();
+
+    let config = ModelConfig::from_file(&model_dir.join("config.json"));
+    eprintln!("Loading Qwen3-8B weights...");
+    let weights = loader::load_model_dir(&model_dir, Device::Cuda(0));
+    eprintln!("Loaded {} tensors", weights.len());
+    let model = Qwen3::from_weights(config.clone(), weights);
+    let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));
+
+    // Every run (warmup and each prompt) starts from an empty cache built the same way.
+    let new_cache = || {
+        KVCache::new(
+            config.num_layers(), config.num_kv_heads(), config.head_dim(),
+            DType::BF16, Device::Cuda(0),
+        )
+    };
+
+    // Warmup: one untimed forward pass so first-call overhead is not charged to prompt 1.
+    {
+        let ids = tokenizer.encode("warmup");
+        let mut cache = new_cache();
+        let _ = model.forward_with_cache(&ids, &mut cache);
+    }
+    eprintln!("Warmup done. Running benchmark...");
+
+    let prompts = [
+        "The capital of France is",
+        "Once upon a time in a land far away",
+        "Hello, how are you doing today",
+        "In a shocking finding, scientists discovered a",
+        "The weather today is sunny, so I decided to",
+        "Alan Turing was a British mathematician who",
+        "The best way to learn programming is",
+        "Artificial intelligence will change the world because",
+        "The history of the internet began in the",
+        "A good morning routine starts with",
+        "The stock market crashed because investors",
+        "Deep learning is a subset of machine learning that",
+        "The president of the United States announced",
+        "In the year 2050, humans will",
+        "The secret to happiness is",
+        "When I was a child, I used to",
+        "The most important scientific discovery of the century",
+        "Climate change is caused by",
+        "The recipe for chocolate cake requires",
+        "In conclusion, the evidence suggests that",
+        "The cat sat on the mat and",
+        "According to recent studies, exercise can",
+        "The first step in solving any problem is",
+        "Technology has transformed the way we",
+        "The novel begins with the protagonist",
+        "Education is the most powerful weapon",
+        "The ocean covers more than seventy percent of",
+        "Last night I had a dream about",
+        "The company announced its quarterly earnings",
+        "Music has the power to",
+        "The difference between success and failure is",
+        "In the beginning, there was nothing but",
+        "The doctor told me that I should",
+        "Python is a popular programming language because",
+        "The ancient Romans built roads that",
+        "A balanced diet should include",
+        "The movie received mixed reviews from critics",
+        "Space exploration has led to many",
+        "The teacher asked the students to",
+        "Global warming is one of the most",
+        "The bridge collapsed due to structural",
+        "Quantum computing promises to revolutionize",
+        "The new policy will affect millions of",
+        "During the winter months, it is important to",
+        "The human brain contains approximately",
+        "Democracy depends on the active participation of",
+        "The train arrived at the station exactly",
+        "Researchers at MIT have developed a new",
+        "The smartphone has become an essential part of",
+        "After careful consideration, the committee decided to",
+    ];
+
+    // Queried once; the value is compared against every decoded token below.
+    let eos_token_id = tokenizer.eos_token_id();
+
+    println!("[");
+    for (i, prompt) in prompts.iter().enumerate() {
+        let input_ids = tokenizer.encode(prompt);
+        let input_len = input_ids.len();
+
+        let mut cache = new_cache();
+
+        // Prefill: TTFT covers the full-prompt forward pass plus the first greedy sample.
+        let t0 = Instant::now();
+        let logits = model.forward_with_cache(&input_ids, &mut cache);
+        let first_token = sample_greedy(&logits);
+        let ttft_us = t0.elapsed().as_micros();
+
+        let mut generated = vec![first_token];
+        let mut token_times: Vec<u128> = Vec::with_capacity(gen_tokens.saturating_sub(1));
+
+        // Decode: feed back one token at a time; each step is timed individually.
+        for _ in 1..gen_tokens {
+            let last = *generated.last().unwrap();
+            let t_start = Instant::now();
+            let logits = model.forward_with_cache(&[last], &mut cache);
+            let next = sample_greedy(&logits);
+            token_times.push(t_start.elapsed().as_micros());
+            generated.push(next);
+            if eos_token_id == Some(next) { break; }
+        }
+
+        let num_generated = generated.len();
+        let generated_text = tokenizer.decode(&generated);
+
+        // tbt  = mean decode-step latency (0 when no decode step ran).
+        // tpot = (prefill + all decode steps) / tokens produced.
+        let decode_total_us: u128 = token_times.iter().sum();
+        let tbt_us = if token_times.is_empty() {
+            0
+        } else {
+            decode_total_us / token_times.len() as u128
+        };
+        // `generated` always holds the prefill token, so the divisor is never zero.
+        let tpot_us = (ttft_us + decode_total_us) / num_generated as u128;
+
+        let gen_ids_str: Vec<String> = generated.iter().map(|id| id.to_string()).collect();
+
+        print!("  {{\"prompt\": \"{}\", ", json_escape(prompt));
+        print!("\"input_len\": {input_len}, ");
+        print!("\"num_generated\": {num_generated}, ");
+        print!("\"generated_ids\": [{}], ", gen_ids_str.join(", "));
+        print!("\"generated_text\": \"{}\", ", json_escape(&generated_text));
+        print!("\"ttft_us\": {ttft_us}, ");
+        print!("\"tbt_us\": {tbt_us}, ");
+        print!("\"tpot_us\": {tpot_us}}}");
+        if i < prompts.len() - 1 { println!(","); } else { println!(); }
+
+        eprintln!(
+            "[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
+            i + 1, prompts.len(),
+            ttft_us as f64 / 1000.0,
+            tbt_us as f64 / 1000.0,
+            preview(&generated_text, 60)
+        );
+    }
+    println!("]");
+}
+
+/// Escapes `s` so it can be placed between double quotes in a JSON document.
+///
+/// Handles backslash, double quote, and every control character below U+0020
+/// (the set JSON requires to be escaped); everything else is passed through.
+fn json_escape(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    for c in s.chars() {
+        match c {
+            '\\' => out.push_str("\\\\"),
+            '"' => out.push_str("\\\""),
+            '\n' => out.push_str("\\n"),
+            '\r' => out.push_str("\\r"),
+            '\t' => out.push_str("\\t"),
+            c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)),
+            c => out.push(c),
+        }
+    }
+    out
+}
+
+/// Returns at most `max_chars` characters of `s` with newlines replaced by spaces.
+///
+/// Truncation is done per `char`, not per byte, so multi-byte UTF-8 text (e.g. Chinese
+/// output) can never be cut in the middle of a character, which would panic when slicing.
+fn preview(s: &str, max_chars: usize) -> String {
+    s.chars()
+        .take(max_chars)
+        .map(|c| if c == '\n' { ' ' } else { c })
+        .collect()
+}
--- a/crates/xserv-model/src/qwen3.rs
+++ b/crates/xserv-model/src/qwen3.rs
@@ -11,22 +11,22 @@ pub struct Qwen3 {
    embed_tokens: Tensor,
    layers: Vec<Qwen3Block>,
    norm: Tensor,
-    lm_head: Tensor,
+    lm_head_t: Tensor,  // precomputed transpose
    rope_cache: RopeCache,
 }

 struct Qwen3Block {
    input_norm: Tensor,       // [hidden]
-    q_proj_w: Tensor,         // [num_heads*head_dim, hidden]
-    k_proj_w: Tensor,         // [num_kv_heads*head_dim, hidden]
-    v_proj_w: Tensor,
-    o_proj_w: Tensor,         // [hidden, num_heads*head_dim]
-    q_norm: Tensor,           // [head_dim] — per-head QK norm
+    q_proj_wt: Tensor,        // TRANSPOSED: [hidden, num_heads*head_dim]
+    k_proj_wt: Tensor,        // TRANSPOSED: [hidden, num_kv_heads*head_dim]
+    v_proj_wt: Tensor,        // TRANSPOSED at load time; shape presumably matches k_proj_wt — TODO confirm
+    o_proj_wt: Tensor,        // TRANSPOSED: [num_heads*head_dim, hidden]
+    q_norm: Tensor,           // [head_dim]
    k_norm: Tensor,           // [head_dim]
    post_norm: Tensor,        // [hidden]
-    gate_proj_w: Tensor,      // [intermediate, hidden]
-    up_proj_w: Tensor,
-    down_proj_w: Tensor,      // [hidden, intermediate]
+    gate_proj_wt: Tensor,     // TRANSPOSED: [hidden, intermediate]
+    up_proj_wt: Tensor,       // TRANSPOSED at load time; shape presumably matches gate_proj_wt — TODO confirm
+    down_proj_wt: Tensor,     // TRANSPOSED: [intermediate, hidden]
 }

 impl Qwen3 {
@@ -37,7 +37,7 @@ impl Qwen3 {

        let embed_tokens = take(&mut w, "model.embed_tokens.weight");
        let norm = take(&mut w, "model.norm.weight");
-        let lm_head = take(&mut w, "lm_head.weight");
+        let lm_head_raw = take(&mut w, "lm_head.weight");

        let rope_cache = RopeCache::new(
            config.max_seq_len().min(8192), // limit for memory
@@ -45,26 +45,33 @@ impl Qwen3 {
            config.rope_theta.unwrap_or(1_000_000.0) as f32,
        );

+        // Precompute transposed weights: [out, in] → [in, out] so we can do x @ wt directly
+        let transpose_w = |t: Tensor| -> Tensor {
+            t.transpose(0, 1).contiguous()
+        };
+
        let num_layers = config.num_layers();
        let mut layers = Vec::with_capacity(num_layers);
+        eprintln!("Transposing weights for {} layers...", num_layers);
        for i in 0..num_layers {
            let p = format!("model.layers.{i}");
            layers.push(Qwen3Block {
                input_norm: take(&mut w, &format!("{p}.input_layernorm.weight")),
-                q_proj_w: take(&mut w, &format!("{p}.self_attn.q_proj.weight")),
-                k_proj_w: take(&mut w, &format!("{p}.self_attn.k_proj.weight")),
-                v_proj_w: take(&mut w, &format!("{p}.self_attn.v_proj.weight")),
-                o_proj_w: take(&mut w, &format!("{p}.self_attn.o_proj.weight")),
+                q_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.q_proj.weight"))),
+                k_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.k_proj.weight"))),
+                v_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.v_proj.weight"))),
+                o_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.o_proj.weight"))),
                q_norm: take(&mut w, &format!("{p}.self_attn.q_norm.weight")),
                k_norm: take(&mut w, &format!("{p}.self_attn.k_norm.weight")),
                post_norm: take(&mut w, &format!("{p}.post_attention_layernorm.weight")),
-                gate_proj_w: take(&mut w, &format!("{p}.mlp.gate_proj.weight")),
-                up_proj_w: take(&mut w, &format!("{p}.mlp.up_proj.weight")),
-                down_proj_w: take(&mut w, &format!("{p}.mlp.down_proj.weight")),
+                gate_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.gate_proj.weight"))),
+                up_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.up_proj.weight"))),
+                down_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.down_proj.weight"))),
            });
        }

-        Self { config, embed_tokens, layers, norm, lm_head, rope_cache }
+        let lm_head_t = transpose_w(lm_head_raw);
+        Self { config, embed_tokens, layers, norm, lm_head_t, rope_cache }
    }

    pub fn forward_with_cache(&self, token_ids: &[u32], cache: &mut KVCache) -> Tensor {
@@ -83,10 +90,10 @@ impl Qwen3 {
            let residual = x.clone();
            let normed = rmsnorm(&x, &layer.input_norm, eps);

-            // Q/K/V projections (no bias, weight is [out, in])
-            let q = linear_t(&normed, &layer.q_proj_w);
-            let k = linear_t(&normed, &layer.k_proj_w);
-            let v = linear_t(&normed, &layer.v_proj_w);
+            // Q/K/V projections (pre-transposed weights, x @ wt)
+            let q = matmul_2d(&normed, &layer.q_proj_wt);
+            let k = matmul_2d(&normed, &layer.k_proj_wt);
+            let v = matmul_2d(&normed, &layer.v_proj_wt);

            // Reshape to [1, heads, seq, head_dim]
            let q = reshape_heads(&q, new_tokens, num_heads, head_dim);
@@ -121,30 +128,31 @@ impl Qwen3 {
            // Attention
            let attn_out = attention(&q, &k_full, &v_full, true);
            let attn_merged = merge_heads_any(&attn_out, new_tokens, hidden);
-            let attn_proj = linear_t(&attn_merged, &layer.o_proj_w);
+            let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
            x = add_any(&residual, &attn_proj);

            // SwiGLU FFN
            let residual = x.clone();
            let normed = rmsnorm(&x, &layer.post_norm, eps);
-            let gate = linear_t(&normed, &layer.gate_proj_w);
-            let up = linear_t(&normed, &layer.up_proj_w);
+            let gate = matmul_2d(&normed, &layer.gate_proj_wt);
+            let up = matmul_2d(&normed, &layer.up_proj_wt);
            let gate_activated = silu(&gate);
            let hidden_states = mul_any(&gate_activated, &up);
-            let down = linear_t(&hidden_states, &layer.down_proj_w);
+            let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
            x = add_any(&residual, &down);
        }

        let x = rmsnorm(&x, &self.norm, eps);
-        linear_t(&x, &self.lm_head)
+        matmul_2d(&x, &self.lm_head_t)
    }
 }

 // --- Helpers ---

-fn linear_t(x: &Tensor, weight: &Tensor) -> Tensor {
-    let w_t = weight.transpose(0, 1).contiguous();
-    matmul(x, &w_t, GemmBackend::CuBlas)
+fn matmul_2d(a: &Tensor, b: &Tensor) -> Tensor { // plain a @ b; unlike the removed linear_t, no transpose happens here
+    assert_eq!(a.ndim(), 2); // panics unless `a` is rank 2
+    assert_eq!(b.ndim(), 2); // panics unless `b` is rank 2
+    matmul(a, b, GemmBackend::CuBlas) // multiplication is delegated to the CuBlas GEMM backend
 }

 fn reshape_heads(x: &Tensor, seq_len: usize, num_heads: usize, head_dim: usize) -> Tensor {