phase 8: add benchmark framework + baseline results

- bench-gpt2 binary: runs 50 prompts, measures TTFT/TBT per prompt, outputs JSON
- bench_compare.py: compares xserv vs transformers token-by-token + timing
- Baseline results: 50/50 correctness, 400ms TTFT / 407ms TBT (100x slower than PyTorch)
- Bottlenecks documented: no KV cache, CPU round-trips, cuBLAS handle churn

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-21 23:29:41 +08:00
parent e1e75fc7f6
commit cb12250ef0
3 changed files with 344 additions and 0 deletions

View File

@@ -0,0 +1,155 @@
use std::path::PathBuf;
use std::time::Instant;
use xserv_model::gpt2::sample_greedy;
use xserv_model::{loader, GPT2, ModelConfig};
use xserv_tensor::Device;
use xserv_tokenizer::Tokenizer;
fn main() {
let args: Vec<String> = std::env::args().collect();
if args.len() < 2 {
eprintln!("Usage: bench-gpt2 <model-dir> [--gen-tokens N]");
std::process::exit(1);
}
let model_dir = PathBuf::from(&args[1]);
let gen_tokens: usize = args
.iter()
.position(|a| a == "--gen-tokens")
.and_then(|i| args.get(i + 1))
.and_then(|s| s.parse().ok())
.unwrap_or(20);
xserv_cuda::device::set_device(0).unwrap();
let config = ModelConfig::from_file(&model_dir.join("config.json"));
let weights = loader::load_model_dir(&model_dir, Device::Cuda(0));
let model = GPT2::from_weights(config, weights);
let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));
// Warmup
{
let ids = tokenizer.encode("warmup");
let _ = model.forward(&ids);
}
let prompts = vec![
"The capital of France is",
"Once upon a time in a land far away",
"Hello, how are you doing today",
"In a shocking finding, scientists discovered a",
"The weather today is sunny, so I decided to",
"Alan Turing was a British mathematician who",
"The best way to learn programming is",
"Artificial intelligence will change the world because",
"The history of the internet began in the",
"A good morning routine starts with",
"The stock market crashed because investors",
"Deep learning is a subset of machine learning that",
"The president of the United States announced",
"In the year 2050, humans will",
"The secret to happiness is",
"When I was a child, I used to",
"The most important scientific discovery of the century",
"Climate change is caused by",
"The recipe for chocolate cake requires",
"In conclusion, the evidence suggests that",
"The cat sat on the mat and",
"According to recent studies, exercise can",
"The first step in solving any problem is",
"Technology has transformed the way we",
"The novel begins with the protagonist",
"Education is the most powerful weapon",
"The ocean covers more than seventy percent of",
"Last night I had a dream about",
"The company announced its quarterly earnings",
"Music has the power to",
"The difference between success and failure is",
"In the beginning, there was nothing but",
"The doctor told me that I should",
"Python is a popular programming language because",
"The ancient Romans built roads that",
"A balanced diet should include",
"The movie received mixed reviews from critics",
"Space exploration has led to many",
"The teacher asked the students to",
"Global warming is one of the most",
"The bridge collapsed due to structural",
"Quantum computing promises to revolutionize",
"The new policy will affect millions of",
"During the winter months, it is important to",
"The human brain contains approximately",
"Democracy depends on the active participation of",
"The train arrived at the station exactly",
"Researchers at MIT have developed a new",
"The smartphone has become an essential part of",
"After careful consideration, the committee decided to",
];
// JSON output
println!("[");
for (i, prompt) in prompts.iter().enumerate() {
let input_ids = tokenizer.encode(prompt);
let input_len = input_ids.len();
let mut all_ids = input_ids.clone();
// TTFT: time for first forward pass (prefill)
let t0 = Instant::now();
let logits = model.forward(&all_ids);
let first_token = sample_greedy(&logits);
let ttft_us = t0.elapsed().as_micros();
all_ids.push(first_token);
// Generate remaining tokens, measure each
let mut token_times_us = Vec::new();
for _ in 1..gen_tokens {
let t_start = Instant::now();
let logits = model.forward(&all_ids);
let next = sample_greedy(&logits);
let elapsed = t_start.elapsed().as_micros();
token_times_us.push(elapsed);
all_ids.push(next);
if tokenizer.eos_token_id() == Some(next) {
break;
}
}
let generated_ids: Vec<u32> = all_ids[input_len..].to_vec();
let generated_text = tokenizer.decode(&generated_ids);
let num_generated = generated_ids.len();
let total_gen_us: u128 = ttft_us + token_times_us.iter().sum::<u128>();
let tpot_us = if num_generated > 0 { total_gen_us / num_generated as u128 } else { 0 };
let tbt_us = if !token_times_us.is_empty() {
token_times_us.iter().sum::<u128>() / token_times_us.len() as u128
} else { 0 };
let gen_text_escaped = generated_text
.replace('\\', "\\\\")
.replace('"', "\\\"")
.replace('\n', "\\n")
.replace('\r', "\\r")
.replace('\t', "\\t");
let gen_ids_str: Vec<String> = generated_ids.iter().map(|id| id.to_string()).collect();
print!(" {{\"prompt\": \"{}\", ", prompt.replace('"', "\\\""));
print!("\"input_len\": {input_len}, ");
print!("\"num_generated\": {num_generated}, ");
print!("\"generated_ids\": [{}], ", gen_ids_str.join(", "));
print!("\"generated_text\": \"{gen_text_escaped}\", ");
print!("\"ttft_us\": {ttft_us}, ");
print!("\"tbt_us\": {tbt_us}, ");
print!("\"tpot_us\": {tpot_us}}}");
if i < prompts.len() - 1 { println!(","); } else { println!(); }
eprintln!(
"[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
i + 1, prompts.len(),
ttft_us as f64 / 1000.0,
tbt_us as f64 / 1000.0,
&generated_text.replace('\n', " ")[..generated_text.len().min(60)]
);
}
println!("]");
}