phase 8: add benchmark framework + baseline results

- bench-gpt2 binary: runs 50 prompts, measures TTFT/TBT per prompt, outputs JSON
- bench_compare.py: compares xserv vs transformers token-by-token + timing
- Baseline results: 50/50 correctness, 400ms TTFT / 407ms TBT (100x slower than PyTorch)
- Bottlenecks documented: no KV cache, CPU round-trips, cuBLAS handle churn

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 23:29:41 +08:00
parent e1e75fc7f6
commit cb12250ef0
3 changed files with 344 additions and 0 deletions
--- a/crates/xserv-model/src/bin/bench-gpt2.rs
+++ b/crates/xserv-model/src/bin/bench-gpt2.rs
@@ -0,0 +1,155 @@
 use std::path::PathBuf;
 use std::time::Instant;
 use xserv_model::gpt2::sample_greedy;
 use xserv_model::{loader, GPT2, ModelConfig};
 use xserv_tensor::Device;
 use xserv_tokenizer::Tokenizer;
 /// Benchmarks greedy GPT-2 generation over a fixed list of 50 prompts.
 ///
 /// Usage: `bench-gpt2 <model-dir> [--gen-tokens N]`. `N` defaults to 20 when the
 /// flag is absent or its value does not parse. At least one token is always
 /// generated per prompt, because the first (prefill) step is unconditional.
 ///
 /// Output:
 /// * stdout — a JSON array with one object per prompt (`prompt`, `input_len`,
 ///   `num_generated`, `generated_ids`, `generated_text`, `ttft_us`, `tbt_us`,
 ///   `tpot_us`).
 /// * stderr — one human-readable progress line per prompt.
 ///
 /// Timing definitions (all in microseconds):
 /// * `ttft_us` — first `forward` call plus greedy sampling of its logits.
 /// * `tbt_us`  — mean duration of the subsequent forward+sample steps.
 /// * `tpot_us` — (`ttft_us` + sum of subsequent steps) / generated token count.
 fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: bench-gpt2 <model-dir> [--gen-tokens N]");
        std::process::exit(1);
    }
    let model_dir = PathBuf::from(&args[1]);
    // Optional `--gen-tokens N`; a missing or unparsable value falls back to 20.
    let gen_tokens: usize = args
        .iter()
        .position(|a| a == "--gen-tokens")
        .and_then(|i| args.get(i + 1))
        .and_then(|s| s.parse().ok())
        .unwrap_or(20);

    xserv_cuda::device::set_device(0).expect("failed to select CUDA device 0");
    let config = ModelConfig::from_file(&model_dir.join("config.json"));
    let weights = loader::load_model_dir(&model_dir, Device::Cuda(0));
    let model = GPT2::from_weights(config, weights);
    let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));

    // Warmup: one throwaway forward pass so one-time setup cost is not
    // attributed to the first measured prompt.
    {
        let ids = tokenizer.encode("warmup");
        let _ = model.forward(&ids);
    }

    let prompts = [
        "The capital of France is",
        "Once upon a time in a land far away",
        "Hello, how are you doing today",
        "In a shocking finding, scientists discovered a",
        "The weather today is sunny, so I decided to",
        "Alan Turing was a British mathematician who",
        "The best way to learn programming is",
        "Artificial intelligence will change the world because",
        "The history of the internet began in the",
        "A good morning routine starts with",
        "The stock market crashed because investors",
        "Deep learning is a subset of machine learning that",
        "The president of the United States announced",
        "In the year 2050, humans will",
        "The secret to happiness is",
        "When I was a child, I used to",
        "The most important scientific discovery of the century",
        "Climate change is caused by",
        "The recipe for chocolate cake requires",
        "In conclusion, the evidence suggests that",
        "The cat sat on the mat and",
        "According to recent studies, exercise can",
        "The first step in solving any problem is",
        "Technology has transformed the way we",
        "The novel begins with the protagonist",
        "Education is the most powerful weapon",
        "The ocean covers more than seventy percent of",
        "Last night I had a dream about",
        "The company announced its quarterly earnings",
        "Music has the power to",
        "The difference between success and failure is",
        "In the beginning, there was nothing but",
        "The doctor told me that I should",
        "Python is a popular programming language because",
        "The ancient Romans built roads that",
        "A balanced diet should include",
        "The movie received mixed reviews from critics",
        "Space exploration has led to many",
        "The teacher asked the students to",
        "Global warming is one of the most",
        "The bridge collapsed due to structural",
        "Quantum computing promises to revolutionize",
        "The new policy will affect millions of",
        "During the winter months, it is important to",
        "The human brain contains approximately",
        "Democracy depends on the active participation of",
        "The train arrived at the station exactly",
        "Researchers at MIT have developed a new",
        "The smartphone has become an essential part of",
        "After careful consideration, the committee decided to",
    ];

    // JSON output
    println!("[");
    for (i, prompt) in prompts.iter().enumerate() {
        let mut all_ids = tokenizer.encode(prompt);
        let input_len = all_ids.len();

        // TTFT: time for first forward pass (prefill) plus greedy sampling.
        let t0 = Instant::now();
        let logits = model.forward(&all_ids);
        let first_token = sample_greedy(&logits);
        let ttft_us = t0.elapsed().as_micros();
        all_ids.push(first_token);

        // Generate remaining tokens, timing each step. Every step re-runs the
        // model on the whole sequence so far.
        let mut token_times_us = Vec::new();
        for _ in 1..gen_tokens {
            let t_start = Instant::now();
            let logits = model.forward(&all_ids);
            let next = sample_greedy(&logits);
            token_times_us.push(t_start.elapsed().as_micros());
            all_ids.push(next);
            if tokenizer.eos_token_id() == Some(next) {
                break;
            }
        }

        let generated_ids: Vec<u32> = all_ids[input_len..].to_vec();
        let generated_text = tokenizer.decode(&generated_ids);
        let num_generated = generated_ids.len();

        let decode_total_us: u128 = token_times_us.iter().sum();
        let tpot_us = if num_generated > 0 {
            (ttft_us + decode_total_us) / num_generated as u128
        } else {
            0
        };
        let tbt_us = if token_times_us.is_empty() {
            0
        } else {
            decode_total_us / token_times_us.len() as u128
        };

        let gen_ids_str: Vec<String> = generated_ids.iter().map(|id| id.to_string()).collect();
        print!("  {{\"prompt\": \"{}\", ", json_escape(prompt));
        print!("\"input_len\": {input_len}, ");
        print!("\"num_generated\": {num_generated}, ");
        print!("\"generated_ids\": [{}], ", gen_ids_str.join(", "));
        print!("\"generated_text\": \"{}\", ", json_escape(&generated_text));
        print!("\"ttft_us\": {ttft_us}, ");
        print!("\"tbt_us\": {tbt_us}, ");
        print!("\"tpot_us\": {tpot_us}}}");
        // JSON forbids a trailing comma after the last array element.
        if i + 1 < prompts.len() { println!(","); } else { println!(); }

        eprintln!(
            "[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
            i + 1, prompts.len(),
            ttft_us as f64 / 1000.0,
            tbt_us as f64 / 1000.0,
            preview(&generated_text, 60)
        );
    }
    println!("]");
 }

 /// Escapes `s` so it can be placed between double quotes in a JSON document.
 ///
 /// Escapes `"`, `\`, newline, carriage return and tab with their short forms,
 /// and any other character below U+0020 as `\u00XX`, since JSON does not allow
 /// raw control characters inside strings.
 fn json_escape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)),
            c => out.push(c),
        }
    }
    out
 }

 /// Returns the first `max_chars` characters of `text` with newlines replaced by
 /// spaces, for single-line progress output.
 ///
 /// Truncation is done per character rather than per byte, so it cannot panic
 /// by cutting through a multi-byte UTF-8 sequence.
 fn preview(text: &str, max_chars: usize) -> String {
    text.chars()
        .take(max_chars)
        .map(|c| if c == '\n' { ' ' } else { c })
        .collect()
 }
--- a/docs/benchmarks/phase8-gpt2-baseline.md
+++ b/docs/benchmarks/phase8-gpt2-baseline.md
@@ -0,0 +1,35 @@
 # Phase 8 Benchmark: GPT-2 124M Baseline
 **Date**: 2026-05-21
 **Hardware**: RTX 5090 (32GB, CC 12.0, 170 SMs)
 **Model**: GPT-2 124M (FP32)
 **Config**: 50 prompts × 20 generated tokens, greedy decoding, no KV cache (both xserv and transformers re-run the full sequence for every generated token)
 ## Correctness
 | Metric | Result |
 |--------|--------|
 | Prompts tested | 50 |
 | Token-level match vs transformers | **50/50 (100.0%)** |
 | Mismatches | 0 |
 ## Performance
 | Metric | xserv | transformers (PyTorch) | Ratio |
 |--------|-------|----------------------|-------|
 | TTFT (avg) | 400.6 ms | 4.0 ms | 100x slower |
 | TBT (avg) | 407.2 ms | 3.8 ms | 106x slower |
 | Throughput | 2.5 tok/s | 260 tok/s | 0.01x |
 ## Known Bottlenecks
 1. **No KV Cache**: full recompute per token (O(S²) attention every step)
 2. **CPU round-trips**: ~100 GPU→CPU→GPU transfers per forward pass for add/bias/split_qkv/merge_heads
 3. **cuBLAS handle per matmul**: ~50 handle create/destroy per forward pass
 4. **No kernel fusion**: every op is a separate kernel launch + sync
 ## Tracking
 | Phase | TTFT (ms) | TBT (ms) | tok/s | Correctness | Notes |
 |-------|-----------|----------|-------|-------------|-------|
 | 8 (baseline) | 400.6 | 407.2 | 2.5 | 50/50 | No KV cache, CPU round-trips |
--- a/tools/bench_compare.py
+++ b/tools/bench_compare.py
@@ -0,0 +1,154 @@
 """
 Compare xserv GPT-2 output against HuggingFace transformers.
 Reads xserv results from JSON, runs same prompts through transformers, compares token-by-token.
 Also measures transformers timing for performance comparison.
 Usage:
    python3 tools/bench_compare.py <xserv_results.json> <model_dir>
 """
 import json
 import sys
 import time
 import torch
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 def main():
    """Compare xserv benchmark results against HuggingFace transformers.

    Reads ``sys.argv[1]`` (the JSON array produced by the xserv benchmark) and
    ``sys.argv[2]`` (a model directory loadable by transformers). For every
    entry it regenerates the same number of tokens greedily with transformers,
    re-running the full sequence on each step, and compares the generated token
    ids. It prints a per-prompt table followed by correctness and timing
    summaries.

    Exits with status 1 when arguments are missing or the results file contains
    no entries.
    """
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <xserv_results.json> <model_dir>")
        sys.exit(1)
    xserv_path = sys.argv[1]
    model_dir = sys.argv[2]
    with open(xserv_path) as f:
        xserv_results = json.load(f)

    total = len(xserv_results)
    if total == 0:
        # Bail out before loading the model: there is nothing to compare, and
        # the summary averages below would otherwise divide by zero.
        print(f"No results found in {xserv_path}; nothing to compare.")
        sys.exit(1)

    print(f"Loading transformers model from {model_dir}...")
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    model.eval()
    model.cuda()
    # Looked up once; it does not change between prompts.
    eos_id = tokenizer.eos_token_id

    # Warmup
    with torch.no_grad():
        model(torch.tensor([[tokenizer.encode("warmup")[0]]]).cuda())
    torch.cuda.synchronize()

    match_count = 0
    mismatch_count = 0
    xserv_ttft_sum = 0.0
    xserv_tbt_sum = 0.0
    hf_ttft_sum = 0.0
    hf_tbt_sum = 0.0
    # Each side's TBT mean is taken over the prompts where that side actually
    # produced inter-token samples.
    num_with_tbt = 0
    hf_num_with_tbt = 0

    print(f"\n{'='*100}")
    print(f"{'#':>3} {'Match':>5} {'Prompt':<45} {'xserv TTFT':>10} {'HF TTFT':>10} {'xserv TBT':>10} {'HF TBT':>10}")
    print(f"{'='*100}")

    for i, xr in enumerate(xserv_results):
        prompt = xr["prompt"]
        gen_tokens = xr["num_generated"]
        xserv_ids = xr["generated_ids"]
        input_ids = tokenizer.encode(prompt)
        input_tensor = torch.tensor([input_ids]).cuda()

        # Generate with transformers, measuring timing
        hf_generated = []
        hf_token_times = []
        with torch.no_grad():
            all_ids = input_tensor.clone()
            # TTFT: synchronize around the forward pass so the wall-clock
            # interval covers the GPU work.
            torch.cuda.synchronize()
            t0 = time.perf_counter()
            out = model(all_ids)
            torch.cuda.synchronize()
            hf_ttft_us = (time.perf_counter() - t0) * 1e6
            next_id = out.logits[0, -1].argmax().item()
            hf_generated.append(next_id)
            all_ids = torch.cat([all_ids, torch.tensor([[next_id]]).cuda()], dim=1)

            # Remaining tokens
            for _ in range(1, gen_tokens):
                torch.cuda.synchronize()
                t_start = time.perf_counter()
                out = model(all_ids)
                torch.cuda.synchronize()
                hf_token_times.append((time.perf_counter() - t_start) * 1e6)
                next_id = out.logits[0, -1].argmax().item()
                hf_generated.append(next_id)
                all_ids = torch.cat([all_ids, torch.tensor([[next_id]]).cuda()], dim=1)
                if eos_id is not None and next_id == eos_id:
                    break

        hf_tbt_us = sum(hf_token_times) / len(hf_token_times) if hf_token_times else 0

        # Compare
        match = xserv_ids == hf_generated
        if match:
            match_count += 1
            status = "  OK "
        else:
            mismatch_count += 1
            status = "FAIL!"

        xserv_ttft_ms = xr["ttft_us"] / 1000.0
        xserv_tbt_ms = xr["tbt_us"] / 1000.0
        hf_ttft_ms = hf_ttft_us / 1000.0
        hf_tbt_ms = hf_tbt_us / 1000.0
        prompt_short = prompt[:43] + ".." if len(prompt) > 45 else prompt
        print(f"{i+1:>3} {status} {prompt_short:<45} {xserv_ttft_ms:>8.1f}ms {hf_ttft_ms:>8.1f}ms {xserv_tbt_ms:>8.1f}ms {hf_tbt_ms:>8.1f}ms")

        if not match:
            # Show first divergence
            for j in range(max(len(xserv_ids), len(hf_generated))):
                x = xserv_ids[j] if j < len(xserv_ids) else None
                h = hf_generated[j] if j < len(hf_generated) else None
                if x != h:
                    x_tok = tokenizer.decode([x]) if x is not None else "<none>"
                    h_tok = tokenizer.decode([h]) if h is not None else "<none>"
                    print(f"      ↳ diverge at token {j}: xserv={x}({repr(x_tok)}) vs hf={h}({repr(h_tok)})")
                    break

        xserv_ttft_sum += xr["ttft_us"]
        xserv_tbt_sum += xr["tbt_us"]
        hf_ttft_sum += hf_ttft_us
        hf_tbt_sum += hf_tbt_us
        if xr["tbt_us"] > 0:
            num_with_tbt += 1
        if hf_token_times:
            hf_num_with_tbt += 1

    print(f"{'='*100}")
    print("\n=== CORRECTNESS ===")
    print(f"Total prompts: {total}")
    print(f"Match:    {match_count}/{total} ({match_count/total*100:.1f}%)")
    print(f"Mismatch: {mismatch_count}/{total}")

    print("\n=== PERFORMANCE (average) ===")
    print(f"{'Metric':<20} {'xserv':>12} {'transformers':>12} {'ratio':>10}")
    print(f"{'-'*54}")
    avg_x_ttft = xserv_ttft_sum / total / 1000
    avg_h_ttft = hf_ttft_sum / total / 1000
    avg_x_tbt = xserv_tbt_sum / num_with_tbt / 1000 if num_with_tbt > 0 else 0
    avg_h_tbt = hf_tbt_sum / hf_num_with_tbt / 1000 if hf_num_with_tbt > 0 else 0
    # Ratios fall back to 0 when the denominator is 0 instead of raising.
    ttft_ratio = avg_x_ttft / avg_h_ttft if avg_h_ttft > 0 else 0
    tbt_ratio = avg_x_tbt / avg_h_tbt if avg_h_tbt > 0 else 0
    print(f"{'TTFT (ms)':<20} {avg_x_ttft:>10.1f}ms {avg_h_ttft:>10.1f}ms {ttft_ratio:>9.1f}x")
    print(f"{'TBT (ms)':<20} {avg_x_tbt:>10.1f}ms {avg_h_tbt:>10.1f}ms {tbt_ratio:>9.1f}x")
    xserv_tps = 1000.0 / avg_x_tbt if avg_x_tbt > 0 else 0
    hf_tps = 1000.0 / avg_h_tbt if avg_h_tbt > 0 else 0
    print(f"{'Throughput (tok/s)':<20} {xserv_tps:>10.1f}   {hf_tps:>10.1f}   {xserv_tps/hf_tps if hf_tps > 0 else 0:>9.2f}x")
    print("\nNote: xserv currently has no KV cache — full recompute per token.")
    print("      transformers also runs without KV cache in this benchmark for fair comparison.")
 # Run the comparison only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()