phase 10: add Qwen3-8B benchmark + performance fix
Benchmark infrastructure: - bench-qwen3 binary: 50 prompts × 20 tokens with KV cache - bench_compare_qwen3.py: comparison against HF transformers (BF16) Performance fix: - Precompute transposed weights at model load time (eliminated per-token weight transpose CPU round-trip: was 252 transposes × 32MB each = 8GB/token) - Result: from "infinite" (>10 min/token) to 144ms/token Results (50 prompts): - Prefill top-1: 42/50 (84%), top-5: 50/50 (100%) vs HF transformers - Greedy sequence: 0/50 exact match (BF16 precision drift over 36 layers) - Performance: TTFT=138ms, TBT=144ms, 6.9 tok/s (HF: 21ms, 45.6 tok/s) - All outputs are coherent English/Chinese Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
160
crates/xserv-model/src/bin/bench-qwen3.rs
Normal file
160
crates/xserv-model/src/bin/bench-qwen3.rs
Normal file
@@ -0,0 +1,160 @@
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
use xserv_model::qwen3::sample_greedy;
|
||||
use xserv_model::{loader, KVCache, ModelConfig, Qwen3};
|
||||
use xserv_tensor::{DType, Device};
|
||||
use xserv_tokenizer::Tokenizer;
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = std::env::args().collect();
|
||||
if args.len() < 2 {
|
||||
eprintln!("Usage: bench-qwen3 <model-dir> [--gen-tokens N]");
|
||||
std::process::exit(1);
|
||||
}
|
||||
let model_dir = PathBuf::from(&args[1]);
|
||||
let gen_tokens: usize = args
|
||||
.iter()
|
||||
.position(|a| a == "--gen-tokens")
|
||||
.and_then(|i| args.get(i + 1))
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(20);
|
||||
|
||||
xserv_cuda::device::set_device(0).unwrap();
|
||||
|
||||
let config = ModelConfig::from_file(&model_dir.join("config.json"));
|
||||
eprintln!("Loading Qwen3-8B weights...");
|
||||
let weights = loader::load_model_dir(&model_dir, Device::Cuda(0));
|
||||
eprintln!("Loaded {} tensors", weights.len());
|
||||
let model = Qwen3::from_weights(config.clone(), weights);
|
||||
let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));
|
||||
|
||||
// Warmup
|
||||
{
|
||||
let ids = tokenizer.encode("warmup");
|
||||
let mut cache = KVCache::new(
|
||||
config.num_layers(), config.num_kv_heads(), config.head_dim(),
|
||||
DType::BF16, Device::Cuda(0),
|
||||
);
|
||||
let _ = model.forward_with_cache(&ids, &mut cache);
|
||||
}
|
||||
eprintln!("Warmup done. Running benchmark...");
|
||||
|
||||
let prompts: Vec<&str> = vec![
|
||||
"The capital of France is",
|
||||
"Once upon a time in a land far away",
|
||||
"Hello, how are you doing today",
|
||||
"In a shocking finding, scientists discovered a",
|
||||
"The weather today is sunny, so I decided to",
|
||||
"Alan Turing was a British mathematician who",
|
||||
"The best way to learn programming is",
|
||||
"Artificial intelligence will change the world because",
|
||||
"The history of the internet began in the",
|
||||
"A good morning routine starts with",
|
||||
"The stock market crashed because investors",
|
||||
"Deep learning is a subset of machine learning that",
|
||||
"The president of the United States announced",
|
||||
"In the year 2050, humans will",
|
||||
"The secret to happiness is",
|
||||
"When I was a child, I used to",
|
||||
"The most important scientific discovery of the century",
|
||||
"Climate change is caused by",
|
||||
"The recipe for chocolate cake requires",
|
||||
"In conclusion, the evidence suggests that",
|
||||
"The cat sat on the mat and",
|
||||
"According to recent studies, exercise can",
|
||||
"The first step in solving any problem is",
|
||||
"Technology has transformed the way we",
|
||||
"The novel begins with the protagonist",
|
||||
"Education is the most powerful weapon",
|
||||
"The ocean covers more than seventy percent of",
|
||||
"Last night I had a dream about",
|
||||
"The company announced its quarterly earnings",
|
||||
"Music has the power to",
|
||||
"The difference between success and failure is",
|
||||
"In the beginning, there was nothing but",
|
||||
"The doctor told me that I should",
|
||||
"Python is a popular programming language because",
|
||||
"The ancient Romans built roads that",
|
||||
"A balanced diet should include",
|
||||
"The movie received mixed reviews from critics",
|
||||
"Space exploration has led to many",
|
||||
"The teacher asked the students to",
|
||||
"Global warming is one of the most",
|
||||
"The bridge collapsed due to structural",
|
||||
"Quantum computing promises to revolutionize",
|
||||
"The new policy will affect millions of",
|
||||
"During the winter months, it is important to",
|
||||
"The human brain contains approximately",
|
||||
"Democracy depends on the active participation of",
|
||||
"The train arrived at the station exactly",
|
||||
"Researchers at MIT have developed a new",
|
||||
"The smartphone has become an essential part of",
|
||||
"After careful consideration, the committee decided to",
|
||||
];
|
||||
|
||||
println!("[");
|
||||
for (i, prompt) in prompts.iter().enumerate() {
|
||||
let input_ids = tokenizer.encode(prompt);
|
||||
let input_len = input_ids.len();
|
||||
|
||||
let mut cache = KVCache::new(
|
||||
config.num_layers(), config.num_kv_heads(), config.head_dim(),
|
||||
DType::BF16, Device::Cuda(0),
|
||||
);
|
||||
|
||||
// Prefill
|
||||
let t0 = Instant::now();
|
||||
let logits = model.forward_with_cache(&input_ids, &mut cache);
|
||||
let first_token = sample_greedy(&logits);
|
||||
let ttft_us = t0.elapsed().as_micros();
|
||||
|
||||
let mut generated = vec![first_token];
|
||||
let mut token_times = Vec::new();
|
||||
|
||||
// Decode
|
||||
for _ in 1..gen_tokens {
|
||||
let last = *generated.last().unwrap();
|
||||
let t_start = Instant::now();
|
||||
let logits = model.forward_with_cache(&[last], &mut cache);
|
||||
let next = sample_greedy(&logits);
|
||||
token_times.push(t_start.elapsed().as_micros());
|
||||
generated.push(next);
|
||||
if tokenizer.eos_token_id() == Some(next) { break; }
|
||||
}
|
||||
|
||||
let num_generated = generated.len();
|
||||
let generated_text = tokenizer.decode(&generated);
|
||||
let tbt_us = if !token_times.is_empty() {
|
||||
token_times.iter().sum::<u128>() / token_times.len() as u128
|
||||
} else { 0 };
|
||||
let total_gen_us: u128 = ttft_us + token_times.iter().sum::<u128>();
|
||||
let tpot_us = if num_generated > 0 { total_gen_us / num_generated as u128 } else { 0 };
|
||||
|
||||
let gen_text_escaped = generated_text
|
||||
.replace('\\', "\\\\")
|
||||
.replace('"', "\\\"")
|
||||
.replace('\n', "\\n")
|
||||
.replace('\r', "\\r")
|
||||
.replace('\t', "\\t");
|
||||
let gen_ids_str: Vec<String> = generated.iter().map(|id| id.to_string()).collect();
|
||||
|
||||
print!(" {{\"prompt\": \"{}\", ", prompt.replace('"', "\\\""));
|
||||
print!("\"input_len\": {input_len}, ");
|
||||
print!("\"num_generated\": {num_generated}, ");
|
||||
print!("\"generated_ids\": [{}], ", gen_ids_str.join(", "));
|
||||
print!("\"generated_text\": \"{gen_text_escaped}\", ");
|
||||
print!("\"ttft_us\": {ttft_us}, ");
|
||||
print!("\"tbt_us\": {tbt_us}, ");
|
||||
print!("\"tpot_us\": {tpot_us}}}");
|
||||
if i < prompts.len() - 1 { println!(","); } else { println!(); }
|
||||
|
||||
eprintln!(
|
||||
"[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
|
||||
i + 1, prompts.len(),
|
||||
ttft_us as f64 / 1000.0,
|
||||
tbt_us as f64 / 1000.0,
|
||||
&generated_text.replace('\n', " ")[..generated_text.len().min(60)]
|
||||
);
|
||||
}
|
||||
println!("]");
|
||||
}
|
||||
@@ -11,22 +11,22 @@ pub struct Qwen3 {
|
||||
embed_tokens: Tensor,
|
||||
layers: Vec<Qwen3Block>,
|
||||
norm: Tensor,
|
||||
lm_head: Tensor,
|
||||
lm_head_t: Tensor, // precomputed transpose
|
||||
rope_cache: RopeCache,
|
||||
}
|
||||
|
||||
struct Qwen3Block {
|
||||
input_norm: Tensor, // [hidden]
|
||||
q_proj_w: Tensor, // [num_heads*head_dim, hidden]
|
||||
k_proj_w: Tensor, // [num_kv_heads*head_dim, hidden]
|
||||
v_proj_w: Tensor,
|
||||
o_proj_w: Tensor, // [hidden, num_heads*head_dim]
|
||||
q_norm: Tensor, // [head_dim] — per-head QK norm
|
||||
q_proj_wt: Tensor, // TRANSPOSED: [hidden, num_heads*head_dim]
|
||||
k_proj_wt: Tensor, // TRANSPOSED: [hidden, num_kv_heads*head_dim]
|
||||
v_proj_wt: Tensor,
|
||||
o_proj_wt: Tensor, // TRANSPOSED: [num_heads*head_dim, hidden]
|
||||
q_norm: Tensor, // [head_dim]
|
||||
k_norm: Tensor, // [head_dim]
|
||||
post_norm: Tensor, // [hidden]
|
||||
gate_proj_w: Tensor, // [intermediate, hidden]
|
||||
up_proj_w: Tensor,
|
||||
down_proj_w: Tensor, // [hidden, intermediate]
|
||||
gate_proj_wt: Tensor, // TRANSPOSED: [hidden, intermediate]
|
||||
up_proj_wt: Tensor,
|
||||
down_proj_wt: Tensor, // TRANSPOSED: [intermediate, hidden]
|
||||
}
|
||||
|
||||
impl Qwen3 {
|
||||
@@ -37,7 +37,7 @@ impl Qwen3 {
|
||||
|
||||
let embed_tokens = take(&mut w, "model.embed_tokens.weight");
|
||||
let norm = take(&mut w, "model.norm.weight");
|
||||
let lm_head = take(&mut w, "lm_head.weight");
|
||||
let lm_head_raw = take(&mut w, "lm_head.weight");
|
||||
|
||||
let rope_cache = RopeCache::new(
|
||||
config.max_seq_len().min(8192), // limit for memory
|
||||
@@ -45,26 +45,33 @@ impl Qwen3 {
|
||||
config.rope_theta.unwrap_or(1_000_000.0) as f32,
|
||||
);
|
||||
|
||||
// Precompute transposed weights: [out, in] → [in, out] so we can do x @ wt directly
|
||||
let transpose_w = |t: Tensor| -> Tensor {
|
||||
t.transpose(0, 1).contiguous()
|
||||
};
|
||||
|
||||
let num_layers = config.num_layers();
|
||||
let mut layers = Vec::with_capacity(num_layers);
|
||||
eprintln!("Transposing weights for {} layers...", num_layers);
|
||||
for i in 0..num_layers {
|
||||
let p = format!("model.layers.{i}");
|
||||
layers.push(Qwen3Block {
|
||||
input_norm: take(&mut w, &format!("{p}.input_layernorm.weight")),
|
||||
q_proj_w: take(&mut w, &format!("{p}.self_attn.q_proj.weight")),
|
||||
k_proj_w: take(&mut w, &format!("{p}.self_attn.k_proj.weight")),
|
||||
v_proj_w: take(&mut w, &format!("{p}.self_attn.v_proj.weight")),
|
||||
o_proj_w: take(&mut w, &format!("{p}.self_attn.o_proj.weight")),
|
||||
q_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.q_proj.weight"))),
|
||||
k_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.k_proj.weight"))),
|
||||
v_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.v_proj.weight"))),
|
||||
o_proj_wt: transpose_w(take(&mut w, &format!("{p}.self_attn.o_proj.weight"))),
|
||||
q_norm: take(&mut w, &format!("{p}.self_attn.q_norm.weight")),
|
||||
k_norm: take(&mut w, &format!("{p}.self_attn.k_norm.weight")),
|
||||
post_norm: take(&mut w, &format!("{p}.post_attention_layernorm.weight")),
|
||||
gate_proj_w: take(&mut w, &format!("{p}.mlp.gate_proj.weight")),
|
||||
up_proj_w: take(&mut w, &format!("{p}.mlp.up_proj.weight")),
|
||||
down_proj_w: take(&mut w, &format!("{p}.mlp.down_proj.weight")),
|
||||
gate_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.gate_proj.weight"))),
|
||||
up_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.up_proj.weight"))),
|
||||
down_proj_wt: transpose_w(take(&mut w, &format!("{p}.mlp.down_proj.weight"))),
|
||||
});
|
||||
}
|
||||
|
||||
Self { config, embed_tokens, layers, norm, lm_head, rope_cache }
|
||||
let lm_head_t = transpose_w(lm_head_raw);
|
||||
Self { config, embed_tokens, layers, norm, lm_head_t, rope_cache }
|
||||
}
|
||||
|
||||
pub fn forward_with_cache(&self, token_ids: &[u32], cache: &mut KVCache) -> Tensor {
|
||||
@@ -83,10 +90,10 @@ impl Qwen3 {
|
||||
let residual = x.clone();
|
||||
let normed = rmsnorm(&x, &layer.input_norm, eps);
|
||||
|
||||
// Q/K/V projections (no bias, weight is [out, in])
|
||||
let q = linear_t(&normed, &layer.q_proj_w);
|
||||
let k = linear_t(&normed, &layer.k_proj_w);
|
||||
let v = linear_t(&normed, &layer.v_proj_w);
|
||||
// Q/K/V projections (pre-transposed weights, x @ wt)
|
||||
let q = matmul_2d(&normed, &layer.q_proj_wt);
|
||||
let k = matmul_2d(&normed, &layer.k_proj_wt);
|
||||
let v = matmul_2d(&normed, &layer.v_proj_wt);
|
||||
|
||||
// Reshape to [1, heads, seq, head_dim]
|
||||
let q = reshape_heads(&q, new_tokens, num_heads, head_dim);
|
||||
@@ -121,30 +128,31 @@ impl Qwen3 {
|
||||
// Attention
|
||||
let attn_out = attention(&q, &k_full, &v_full, true);
|
||||
let attn_merged = merge_heads_any(&attn_out, new_tokens, hidden);
|
||||
let attn_proj = linear_t(&attn_merged, &layer.o_proj_w);
|
||||
let attn_proj = matmul_2d(&attn_merged, &layer.o_proj_wt);
|
||||
x = add_any(&residual, &attn_proj);
|
||||
|
||||
// SwiGLU FFN
|
||||
let residual = x.clone();
|
||||
let normed = rmsnorm(&x, &layer.post_norm, eps);
|
||||
let gate = linear_t(&normed, &layer.gate_proj_w);
|
||||
let up = linear_t(&normed, &layer.up_proj_w);
|
||||
let gate = matmul_2d(&normed, &layer.gate_proj_wt);
|
||||
let up = matmul_2d(&normed, &layer.up_proj_wt);
|
||||
let gate_activated = silu(&gate);
|
||||
let hidden_states = mul_any(&gate_activated, &up);
|
||||
let down = linear_t(&hidden_states, &layer.down_proj_w);
|
||||
let down = matmul_2d(&hidden_states, &layer.down_proj_wt);
|
||||
x = add_any(&residual, &down);
|
||||
}
|
||||
|
||||
let x = rmsnorm(&x, &self.norm, eps);
|
||||
linear_t(&x, &self.lm_head)
|
||||
matmul_2d(&x, &self.lm_head_t)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
fn linear_t(x: &Tensor, weight: &Tensor) -> Tensor {
|
||||
let w_t = weight.transpose(0, 1).contiguous();
|
||||
matmul(x, &w_t, GemmBackend::CuBlas)
|
||||
fn matmul_2d(a: &Tensor, b: &Tensor) -> Tensor {
|
||||
assert_eq!(a.ndim(), 2);
|
||||
assert_eq!(b.ndim(), 2);
|
||||
matmul(a, b, GemmBackend::CuBlas)
|
||||
}
|
||||
|
||||
fn reshape_heads(x: &Tensor, seq_len: usize, num_heads: usize, head_dim: usize) -> Tensor {
|
||||
|
||||
Reference in New Issue
Block a user