phase 10: add Qwen3-8B benchmark + performance fix

Benchmark infrastructure:
- bench-qwen3 binary: 50 prompts × 20 tokens with KV cache
- bench_compare_qwen3.py: comparison against HF transformers (BF16)

Performance fix:
- Precompute transposed weights once at model load time, eliminating the
  per-token weight-transpose CPU round-trip (previously 252 transposes ×
  32MB each ≈ 8GB/token)
- Result: from "infinite" (>10 min/token) to 144ms/token

Results (50 prompts):
- Prefill top-1: 42/50 (84%), top-5: 50/50 (100%) vs HF transformers
- Greedy sequence: 0/50 exact match (BF16 precision drift over 36 layers)
- Performance: TTFT=138ms, TBT=144ms, 6.9 tok/s (HF: 21ms, 45.6 tok/s)
- All outputs are coherent English/Chinese

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-22 10:25:33 +08:00
parent 246ae1c590
commit 268e40d764
4 changed files with 389 additions and 30 deletions

View File

@@ -0,0 +1,160 @@
use std::path::PathBuf;
use std::time::Instant;
use xserv_model::qwen3::sample_greedy;
use xserv_model::{loader, KVCache, ModelConfig, Qwen3};
use xserv_tensor::{DType, Device};
use xserv_tokenizer::Tokenizer;
/// Benchmark driver: loads a Qwen3 model from `<model-dir>`, greedily generates
/// up to `--gen-tokens N` tokens (default 20) for each built-in prompt using a
/// fresh KV cache, and reports per-prompt timings.
///
/// Output:
/// - stdout: a JSON array with one object per prompt (`prompt`, `input_len`,
///   `num_generated`, `generated_ids`, `generated_text`, `ttft_us`, `tbt_us`,
///   `tpot_us`).
/// - stderr: human-readable progress lines.
///
/// Exits with status 1 when the model directory argument is missing.
fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: bench-qwen3 <model-dir> [--gen-tokens N]");
        std::process::exit(1);
    }
    let model_dir = PathBuf::from(&args[1]);
    // A missing or unparsable value after `--gen-tokens` falls back to 20.
    let gen_tokens: usize = args
        .iter()
        .position(|a| a == "--gen-tokens")
        .and_then(|i| args.get(i + 1))
        .and_then(|s| s.parse().ok())
        .unwrap_or(20);

    xserv_cuda::device::set_device(0).unwrap();
    let config = ModelConfig::from_file(&model_dir.join("config.json"));
    eprintln!("Loading Qwen3-8B weights...");
    let weights = loader::load_model_dir(&model_dir, Device::Cuda(0));
    eprintln!("Loaded {} tensors", weights.len());
    let model = Qwen3::from_weights(config.clone(), weights);
    let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));

    // Every run (warmup and each prompt) starts from an empty BF16 cache on GPU 0.
    let new_cache = || {
        KVCache::new(
            config.num_layers(),
            config.num_kv_heads(),
            config.head_dim(),
            DType::BF16,
            Device::Cuda(0),
        )
    };

    // Warmup: one untimed forward pass before any measurement is taken.
    {
        let ids = tokenizer.encode("warmup");
        let mut cache = new_cache();
        let _ = model.forward_with_cache(&ids, &mut cache);
    }
    eprintln!("Warmup done. Running benchmark...");

    let prompts: Vec<&str> = vec![
        "The capital of France is",
        "Once upon a time in a land far away",
        "Hello, how are you doing today",
        "In a shocking finding, scientists discovered a",
        "The weather today is sunny, so I decided to",
        "Alan Turing was a British mathematician who",
        "The best way to learn programming is",
        "Artificial intelligence will change the world because",
        "The history of the internet began in the",
        "A good morning routine starts with",
        "The stock market crashed because investors",
        "Deep learning is a subset of machine learning that",
        "The president of the United States announced",
        "In the year 2050, humans will",
        "The secret to happiness is",
        "When I was a child, I used to",
        "The most important scientific discovery of the century",
        "Climate change is caused by",
        "The recipe for chocolate cake requires",
        "In conclusion, the evidence suggests that",
        "The cat sat on the mat and",
        "According to recent studies, exercise can",
        "The first step in solving any problem is",
        "Technology has transformed the way we",
        "The novel begins with the protagonist",
        "Education is the most powerful weapon",
        "The ocean covers more than seventy percent of",
        "Last night I had a dream about",
        "The company announced its quarterly earnings",
        "Music has the power to",
        "The difference between success and failure is",
        "In the beginning, there was nothing but",
        "The doctor told me that I should",
        "Python is a popular programming language because",
        "The ancient Romans built roads that",
        "A balanced diet should include",
        "The movie received mixed reviews from critics",
        "Space exploration has led to many",
        "The teacher asked the students to",
        "Global warming is one of the most",
        "The bridge collapsed due to structural",
        "Quantum computing promises to revolutionize",
        "The new policy will affect millions of",
        "During the winter months, it is important to",
        "The human brain contains approximately",
        "Democracy depends on the active participation of",
        "The train arrived at the station exactly",
        "Researchers at MIT have developed a new",
        "The smartphone has become an essential part of",
        "After careful consideration, the committee decided to",
    ];

    println!("[");
    for (i, prompt) in prompts.iter().enumerate() {
        let input_ids = tokenizer.encode(prompt);
        let input_len = input_ids.len();
        let mut cache = new_cache();

        // Prefill: TTFT covers the full-prompt forward pass plus the first greedy pick.
        let t0 = Instant::now();
        let logits = model.forward_with_cache(&input_ids, &mut cache);
        let first_token = sample_greedy(&logits);
        let ttft_us = t0.elapsed().as_micros();

        let mut generated = vec![first_token];
        let mut token_times = Vec::new();

        // Decode: feed back one token at a time; the first token is already
        // counted, hence the range starts at 1. Stops early on EOS.
        for _ in 1..gen_tokens {
            let last = *generated.last().unwrap();
            let t_start = Instant::now();
            let logits = model.forward_with_cache(&[last], &mut cache);
            let next = sample_greedy(&logits);
            token_times.push(t_start.elapsed().as_micros());
            generated.push(next);
            if tokenizer.eos_token_id() == Some(next) {
                break;
            }
        }

        let num_generated = generated.len();
        let generated_text = tokenizer.decode(&generated);

        // tbt = mean decode-step latency; tpot = (prefill + decode) / tokens produced.
        let decode_total_us: u128 = token_times.iter().sum::<u128>();
        let tbt_us = if !token_times.is_empty() {
            decode_total_us / token_times.len() as u128
        } else {
            0
        };
        let total_gen_us: u128 = ttft_us + decode_total_us;
        let tpot_us = if num_generated > 0 {
            total_gen_us / num_generated as u128
        } else {
            0
        };

        // Both strings go through the same escaper so the emitted array is
        // always parseable JSON, whatever the model generates.
        let prompt_escaped = json_escape(prompt);
        let gen_text_escaped = json_escape(&generated_text);
        let gen_ids_str: Vec<String> = generated.iter().map(|id| id.to_string()).collect();

        print!("  {{\"prompt\": \"{}\", ", prompt_escaped);
        print!("\"input_len\": {input_len}, ");
        print!("\"num_generated\": {num_generated}, ");
        print!("\"generated_ids\": [{}], ", gen_ids_str.join(", "));
        print!("\"generated_text\": \"{gen_text_escaped}\", ");
        print!("\"ttft_us\": {ttft_us}, ");
        print!("\"tbt_us\": {tbt_us}, ");
        print!("\"tpot_us\": {tpot_us}}}");
        if i < prompts.len() - 1 {
            println!(",");
        } else {
            println!();
        }

        eprintln!(
            "[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
            i + 1,
            prompts.len(),
            ttft_us as f64 / 1000.0,
            tbt_us as f64 / 1000.0,
            preview(&generated_text, 60)
        );
    }
    println!("]");
}

/// Escapes `text` for embedding inside a double-quoted JSON string literal.
///
/// Handles backslash, double quote, `\n`, `\r`, `\t`, and emits every other
/// control character below U+0020 as `\u00XX`, since JSON forbids them raw.
fn json_escape(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\\' => out.push_str("\\\\"),
            '"' => out.push_str("\\\""),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            c if u32::from(c) < 0x20 => out.push_str(&format!("\\u{:04x}", u32::from(c))),
            c => out.push(c),
        }
    }
    out
}

/// Returns at most the first `max_bytes` bytes of `text` with newlines replaced
/// by spaces, for single-line progress output.
///
/// The cut is moved back to the nearest UTF-8 character boundary so that
/// multi-byte text (e.g. Chinese) cannot trigger a slicing panic.
fn preview(text: &str, max_bytes: usize) -> String {
    let mut end = text.len().min(max_bytes);
    // Index 0 is always a boundary, so this loop terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    text[..end].replace('\n', " ")
}

View File

@@ -11,22 +11,22 @@ pub struct Qwen3 {
embed_tokens: Tensor,
layers: Vec<Qwen3Block>,
norm: Tensor,
lm_head: Tensor,
lm_head_t: Tensor, // precomputed transpose
rope_cache: RopeCache,
}
struct Qwen3Block {
input_norm: Tensor, // [hidden]
q_proj_w: Tensor, // [num_heads*head_dim, hidden]
k_proj_w: Tensor, // [num_kv_heads*head_dim, hidden]
v_proj_w: Tensor,
o_proj_w: Tensor, // [hidden, num_heads*head_dim]
q_norm: Tensor, // [head_dim] — per-head QK norm
q_proj_wt: Tensor, // TRANSPOSED: [hidden, num_heads*head_dim]
k_proj_wt: Tensor, // TRANSPOSED: [hidden, num_kv_heads*head_dim]
v_proj_wt: Tensor,
o_proj_wt: Tensor, // TRANSPOSED: [num_heads*head_dim, hidden]
q_norm: Tensor, // [head_dim]
k_norm: Tensor, // [head_dim]
post_norm: Tensor, // [hidden]
gate_proj_w: Tensor, // [intermediate, hidden]
up_proj_w: Tensor,
down_proj_w: Tensor, // [hidden, intermediate]
gate_proj_wt: Tensor, // TRANSPOSED: [hidden, intermediate]
up_proj_wt: Tensor,
down_proj_wt: Tensor, // TRANSPOSED: [intermediate, hidden]
}
impl Qwen3 {
@@ -37,7 +37,7 @@ impl Qwen3 {
let embed_tokens = take(&mut w, "model.embed_tokens.weight");
let norm = take(&mut w, "model.norm.weight");
let lm_head = take(&mut w, "lm_head.weight");
let lm_head_raw = take(&mut w, "lm_head.weight");
let rope_cache = RopeCache::new(
config.max_seq_len().min(8192), // limit for memory
@@ -45,26 +45,33 @@ impl Qwen3 {
config.rope_theta.unwrap_or(1_000_000.0) as f32,
);
// Precompute transposed weights: [out, in] → [in, out] so we can do x @ wt directly
let transpose_w = |t: Tensor| -> Tensor {
t.transpose(0, 1).contiguous()
};
let num_layers = config.num_layers();
let mut layers = Vec::with_capacity(num_layers);
eprintln!("Transposing weights for {} layers...", num_layers);
for i in 0..num_layers {
let p = format!("model.layers.{i}");
layers.push(Qwen3Block {
input_norm: take(&mut w, &format!("{p}.input_layernorm.weight")),
q_proj_w: take(&mut w, &format!("{p}.self_attn.q_proj.weight")),
k_proj_w: take(&mut w, &format!("{p}.self_attn.k_proj.weight")),
v_proj_w: take(&mut w, &format!("{p}.self_attn.v_proj.weight")),
o_proj_w: take(&mut w, &format!("{p}.self_attn.o_proj.weight")),
q_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.q_proj.weight"))),
k_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.k_proj.weight"))),
v_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.v_proj.weight"))),
o_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.o_proj.weight"))),
q_norm: take(&mut w, &format!("{p}.self_attn.q_norm.weight")),
k_norm: take(&mut w, &format!("{p}.self_attn.k_norm.weight")),
post_norm: take(&mut w, &format!("{p}.post_attention_layernorm.weight")),
gate_proj_w: take(&mut w, &format!("{p}.mlp.gate_proj.weight")),
up_proj_w: take(&mut w, &format!("{p}.mlp.up_proj.weight")),
down_proj_w: take(&mut w, &format!("{p}.mlp.down_proj.weight")),
gate_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.gate_proj.weight"))),
up_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.up_proj.weight"))),
down_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.down_proj.weight"))),
});
}
Self { config, embed_tokens, layers, norm, lm_head, rope_cache }
let lm_head_t = transpose_w(lm_head_raw);
Self { config, embed_tokens, layers, norm, lm_head_t, rope_cache }
}
pub fn forward_with_cache(&self, token_ids: &[u32], cache: &mut KVCache) -> Tensor {
@@ -83,10 +90,10 @@ impl Qwen3 {
let residual = x.clone();
let normed = rmsnorm(&x, &layer.input_norm, eps);
// Q/K/V projections (no bias, weight is [out, in])
let q = linear_t(&normed, &layer.q_proj_w);
let k = linear_t(&normed, &layer.k_proj_w);
let v = linear_t(&normed, &layer.v_proj_w);
// Q/K/V projections (pre-transposed weights, x @ wt)
let q = matmul_2d(&normed, &layer.q_proj_wt);
let k = matmul_2d(&normed, &layer.k_proj_wt);
let v = matmul_2d(&normed, &layer.v_proj_wt);
// Reshape to [1, heads, seq, head_dim]
let q = reshape_heads(&q, new_tokens, num_heads, head_dim);
@@ -121,30 +128,31 @@ impl Qwen3 {
// Attention
let attn_out = attention(&q, &k_full, &v_full, true);
let attn_merged = merge_heads_any(&attn_out, new_tokens, hidden);
let attn_proj = linear_t(&attn_merged, &layer.o_proj_w);
let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
x = add_any(&residual, &attn_proj);
// SwiGLU FFN
let residual = x.clone();
let normed = rmsnorm(&x, &layer.post_norm, eps);
let gate = linear_t(&normed, &layer.gate_proj_w);
let up = linear_t(&normed, &layer.up_proj_w);
let gate = matmul_2d(&normed, &layer.gate_proj_wt);
let up = matmul_2d(&normed, &layer.up_proj_wt);
let gate_activated = silu(&gate);
let hidden_states = mul_any(&gate_activated, &up);
let down = linear_t(&hidden_states, &layer.down_proj_w);
let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
x = add_any(&residual, &down);
}
let x = rmsnorm(&x, &self.norm, eps);
linear_t(&x, &self.lm_head)
matmul_2d(&x, &self.lm_head_t)
}
}
// --- Helpers ---
fn linear_t(x: &Tensor, weight: &Tensor) -> Tensor {
let w_t = weight.transpose(0, 1).contiguous();
matmul(x, &w_t, GemmBackend::CuBlas)
/// Multiplies two rank-2 tensors using the cuBLAS GEMM backend.
///
/// Panics if either operand is not 2-dimensional. Call sites in this file pass
/// an activation as `a` and a pre-transposed weight (`*_wt` / `lm_head_t`) as
/// `b`, so no transpose is performed here.
fn matmul_2d(a: &Tensor, b: &Tensor) -> Tensor {
assert_eq!(a.ndim(), 2);
assert_eq!(b.ndim(), 2);
matmul(a, b, GemmBackend::CuBlas)
}
fn reshape_heads(x: &Tensor, seq_len: usize, num_heads: usize, head_dim: usize) -> Tensor {