phase 8: add benchmark framework + baseline results
- bench-gpt2 binary: runs 50 prompts, measures TTFT/TBT per prompt, outputs JSON
- bench_compare.py: compares xserv vs transformers token-by-token + timing
- Baseline results: 50/50 correctness, 400ms TTFT / 407ms TBT (100x slower than PyTorch)
- Bottlenecks documented: no KV cache, CPU round-trips, cuBLAS handle churn

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
155
crates/xserv-model/src/bin/bench-gpt2.rs
Normal file
155
crates/xserv-model/src/bin/bench-gpt2.rs
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
use std::time::Instant;
|
||||||
|
use xserv_model::gpt2::sample_greedy;
|
||||||
|
use xserv_model::{loader, GPT2, ModelConfig};
|
||||||
|
use xserv_tensor::Device;
|
||||||
|
use xserv_tokenizer::Tokenizer;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let args: Vec<String> = std::env::args().collect();
|
||||||
|
if args.len() < 2 {
|
||||||
|
eprintln!("Usage: bench-gpt2 <model-dir> [--gen-tokens N]");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
let model_dir = PathBuf::from(&args[1]);
|
||||||
|
let gen_tokens: usize = args
|
||||||
|
.iter()
|
||||||
|
.position(|a| a == "--gen-tokens")
|
||||||
|
.and_then(|i| args.get(i + 1))
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.unwrap_or(20);
|
||||||
|
|
||||||
|
xserv_cuda::device::set_device(0).unwrap();
|
||||||
|
|
||||||
|
let config = ModelConfig::from_file(&model_dir.join("config.json"));
|
||||||
|
let weights = loader::load_model_dir(&model_dir, Device::Cuda(0));
|
||||||
|
let model = GPT2::from_weights(config, weights);
|
||||||
|
let tokenizer = Tokenizer::from_file(&model_dir.join("tokenizer.json"));
|
||||||
|
|
||||||
|
// Warmup
|
||||||
|
{
|
||||||
|
let ids = tokenizer.encode("warmup");
|
||||||
|
let _ = model.forward(&ids);
|
||||||
|
}
|
||||||
|
|
||||||
|
let prompts = vec![
|
||||||
|
"The capital of France is",
|
||||||
|
"Once upon a time in a land far away",
|
||||||
|
"Hello, how are you doing today",
|
||||||
|
"In a shocking finding, scientists discovered a",
|
||||||
|
"The weather today is sunny, so I decided to",
|
||||||
|
"Alan Turing was a British mathematician who",
|
||||||
|
"The best way to learn programming is",
|
||||||
|
"Artificial intelligence will change the world because",
|
||||||
|
"The history of the internet began in the",
|
||||||
|
"A good morning routine starts with",
|
||||||
|
"The stock market crashed because investors",
|
||||||
|
"Deep learning is a subset of machine learning that",
|
||||||
|
"The president of the United States announced",
|
||||||
|
"In the year 2050, humans will",
|
||||||
|
"The secret to happiness is",
|
||||||
|
"When I was a child, I used to",
|
||||||
|
"The most important scientific discovery of the century",
|
||||||
|
"Climate change is caused by",
|
||||||
|
"The recipe for chocolate cake requires",
|
||||||
|
"In conclusion, the evidence suggests that",
|
||||||
|
"The cat sat on the mat and",
|
||||||
|
"According to recent studies, exercise can",
|
||||||
|
"The first step in solving any problem is",
|
||||||
|
"Technology has transformed the way we",
|
||||||
|
"The novel begins with the protagonist",
|
||||||
|
"Education is the most powerful weapon",
|
||||||
|
"The ocean covers more than seventy percent of",
|
||||||
|
"Last night I had a dream about",
|
||||||
|
"The company announced its quarterly earnings",
|
||||||
|
"Music has the power to",
|
||||||
|
"The difference between success and failure is",
|
||||||
|
"In the beginning, there was nothing but",
|
||||||
|
"The doctor told me that I should",
|
||||||
|
"Python is a popular programming language because",
|
||||||
|
"The ancient Romans built roads that",
|
||||||
|
"A balanced diet should include",
|
||||||
|
"The movie received mixed reviews from critics",
|
||||||
|
"Space exploration has led to many",
|
||||||
|
"The teacher asked the students to",
|
||||||
|
"Global warming is one of the most",
|
||||||
|
"The bridge collapsed due to structural",
|
||||||
|
"Quantum computing promises to revolutionize",
|
||||||
|
"The new policy will affect millions of",
|
||||||
|
"During the winter months, it is important to",
|
||||||
|
"The human brain contains approximately",
|
||||||
|
"Democracy depends on the active participation of",
|
||||||
|
"The train arrived at the station exactly",
|
||||||
|
"Researchers at MIT have developed a new",
|
||||||
|
"The smartphone has become an essential part of",
|
||||||
|
"After careful consideration, the committee decided to",
|
||||||
|
];
|
||||||
|
|
||||||
|
// JSON output
|
||||||
|
println!("[");
|
||||||
|
for (i, prompt) in prompts.iter().enumerate() {
|
||||||
|
let input_ids = tokenizer.encode(prompt);
|
||||||
|
let input_len = input_ids.len();
|
||||||
|
let mut all_ids = input_ids.clone();
|
||||||
|
|
||||||
|
// TTFT: time for first forward pass (prefill)
|
||||||
|
let t0 = Instant::now();
|
||||||
|
let logits = model.forward(&all_ids);
|
||||||
|
let first_token = sample_greedy(&logits);
|
||||||
|
let ttft_us = t0.elapsed().as_micros();
|
||||||
|
all_ids.push(first_token);
|
||||||
|
|
||||||
|
// Generate remaining tokens, measure each
|
||||||
|
let mut token_times_us = Vec::new();
|
||||||
|
for _ in 1..gen_tokens {
|
||||||
|
let t_start = Instant::now();
|
||||||
|
let logits = model.forward(&all_ids);
|
||||||
|
let next = sample_greedy(&logits);
|
||||||
|
let elapsed = t_start.elapsed().as_micros();
|
||||||
|
token_times_us.push(elapsed);
|
||||||
|
all_ids.push(next);
|
||||||
|
|
||||||
|
if tokenizer.eos_token_id() == Some(next) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let generated_ids: Vec<u32> = all_ids[input_len..].to_vec();
|
||||||
|
let generated_text = tokenizer.decode(&generated_ids);
|
||||||
|
let num_generated = generated_ids.len();
|
||||||
|
|
||||||
|
let total_gen_us: u128 = ttft_us + token_times_us.iter().sum::<u128>();
|
||||||
|
let tpot_us = if num_generated > 0 { total_gen_us / num_generated as u128 } else { 0 };
|
||||||
|
let tbt_us = if !token_times_us.is_empty() {
|
||||||
|
token_times_us.iter().sum::<u128>() / token_times_us.len() as u128
|
||||||
|
} else { 0 };
|
||||||
|
|
||||||
|
let gen_text_escaped = generated_text
|
||||||
|
.replace('\\', "\\\\")
|
||||||
|
.replace('"', "\\\"")
|
||||||
|
.replace('\n', "\\n")
|
||||||
|
.replace('\r', "\\r")
|
||||||
|
.replace('\t', "\\t");
|
||||||
|
|
||||||
|
let gen_ids_str: Vec<String> = generated_ids.iter().map(|id| id.to_string()).collect();
|
||||||
|
|
||||||
|
print!(" {{\"prompt\": \"{}\", ", prompt.replace('"', "\\\""));
|
||||||
|
print!("\"input_len\": {input_len}, ");
|
||||||
|
print!("\"num_generated\": {num_generated}, ");
|
||||||
|
print!("\"generated_ids\": [{}], ", gen_ids_str.join(", "));
|
||||||
|
print!("\"generated_text\": \"{gen_text_escaped}\", ");
|
||||||
|
print!("\"ttft_us\": {ttft_us}, ");
|
||||||
|
print!("\"tbt_us\": {tbt_us}, ");
|
||||||
|
print!("\"tpot_us\": {tpot_us}}}");
|
||||||
|
if i < prompts.len() - 1 { println!(","); } else { println!(); }
|
||||||
|
|
||||||
|
eprintln!(
|
||||||
|
"[{}/{}] input={input_len}tok gen={num_generated}tok ttft={:.1}ms tbt={:.1}ms | {}",
|
||||||
|
i + 1, prompts.len(),
|
||||||
|
ttft_us as f64 / 1000.0,
|
||||||
|
tbt_us as f64 / 1000.0,
|
||||||
|
&generated_text.replace('\n', " ")[..generated_text.len().min(60)]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
println!("]");
|
||||||
|
}
|
||||||
35
docs/benchmarks/phase8-gpt2-baseline.md
Normal file
35
docs/benchmarks/phase8-gpt2-baseline.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Phase 8 Benchmark: GPT-2 124M Baseline
|
||||||
|
|
||||||
|
**Date**: 2026-05-21
|
||||||
|
**Hardware**: RTX 5090 (32GB, CC 12.0, 170 SMs)
|
||||||
|
**Model**: GPT-2 124M (FP32)
|
||||||
|
**Config**: 50 prompts × 20 generated tokens, greedy decoding, no KV cache
|
||||||
|
|
||||||
|
## Correctness
|
||||||
|
|
||||||
|
| Metric | Result |
|
||||||
|
|--------|--------|
|
||||||
|
| Prompts tested | 50 |
|
||||||
|
| Token-level match vs transformers | **50/50 (100.0%)** |
|
||||||
|
| Mismatches | 0 |
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
|
| Metric | xserv | transformers (PyTorch) | Ratio |
|
||||||
|
|--------|-------|----------------------|-------|
|
||||||
|
| TTFT (avg) | 400.6 ms | 4.0 ms | 100x slower |
|
||||||
|
| TBT (avg) | 407.2 ms | 3.8 ms | 106x slower |
|
||||||
|
| Throughput | 2.5 tok/s | 260 tok/s | 0.01x |
|
||||||
|
|
||||||
|
## Known Bottlenecks
|
||||||
|
|
||||||
|
1. **No KV Cache**: full recompute per token (O(S²) attention every step)
|
||||||
|
2. **CPU round-trips**: ~100 GPU→CPU→GPU transfers per forward pass for add/bias/split_qkv/merge_heads
|
||||||
|
3. **cuBLAS handle per matmul**: ~50 handle create/destroy per forward pass
|
||||||
|
4. **No kernel fusion**: every op is a separate kernel launch + sync
|
||||||
|
|
||||||
|
## Tracking
|
||||||
|
|
||||||
|
| Phase | TTFT (ms) | TBT (ms) | tok/s | Correctness | Notes |
|
||||||
|
|-------|-----------|----------|-------|-------------|-------|
|
||||||
|
| 8 (baseline) | 400.6 | 407.2 | 2.5 | 50/50 | No KV cache, CPU round-trips |
|
||||||
154
tools/bench_compare.py
Normal file
154
tools/bench_compare.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
"""
|
||||||
|
Compare xserv GPT-2 output against HuggingFace transformers.
|
||||||
|
Reads xserv results from JSON, runs same prompts through transformers, compares token-by-token.
|
||||||
|
Also measures transformers timing for performance comparison.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 tools/bench_compare.py <xserv_results.json> <model_dir>
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import torch
|
||||||
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Compare xserv benchmark results against HuggingFace transformers.

    Command line: ``bench_compare.py <xserv_results.json> <model_dir>``.

    Loads the JSON array produced by the xserv benchmark, re-runs every prompt
    through ``GPT2LMHeadModel`` with greedy decoding for the same number of
    generated tokens, and prints:

    * a per-prompt table with match status and TTFT/TBT for both engines,
    * the first diverging token for every mismatching prompt,
    * a correctness summary and averaged performance figures.

    Exits with status 1 on bad usage or when the results file is empty.
    """
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <xserv_results.json> <model_dir>")
        sys.exit(1)

    xserv_path = sys.argv[1]
    model_dir = sys.argv[2]

    with open(xserv_path) as f:
        xserv_results = json.load(f)

    # Every average below divides by the number of results; bail out early
    # instead of failing with ZeroDivisionError after loading the model.
    if not xserv_results:
        print(f"No results found in {xserv_path}; nothing to compare.")
        sys.exit(1)

    print(f"Loading transformers model from {model_dir}...")
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    model.eval()
    model.cuda()
    device = "cuda"
    eos_id = tokenizer.eos_token_id

    def timed_step(ids):
        """Run one full forward pass over ``ids``; return (argmax token id, elapsed µs).

        ``use_cache=False`` makes the run match the "no KV cache" claim
        printed at the end: the whole sequence is recomputed each step and no
        key/value cache is built.
        """
        torch.cuda.synchronize()
        start = time.perf_counter()
        out = model(ids, use_cache=False)
        torch.cuda.synchronize()
        elapsed_us = (time.perf_counter() - start) * 1e6
        return out.logits[0, -1].argmax().item(), elapsed_us

    # Warmup
    with torch.no_grad():
        model(torch.tensor([[tokenizer.encode("warmup")[0]]], device=device))
    torch.cuda.synchronize()

    total = len(xserv_results)
    match_count = 0
    mismatch_count = 0
    xserv_ttft_sum = 0.0
    xserv_tbt_sum = 0.0
    hf_ttft_sum = 0.0
    hf_tbt_sum = 0.0
    num_with_tbt = 0

    print(f"\n{'='*100}")
    print(f"{'#':>3} {'Match':>5} {'Prompt':<45} {'xserv TTFT':>10} {'HF TTFT':>10} {'xserv TBT':>10} {'HF TBT':>10}")
    print(f"{'='*100}")

    for i, xr in enumerate(xserv_results):
        prompt = xr["prompt"]
        gen_tokens = xr["num_generated"]
        xserv_ids = xr["generated_ids"]

        input_ids = tokenizer.encode(prompt)
        all_ids = torch.tensor([input_ids], device=device)

        # Generate with transformers, measuring timing
        hf_generated = []
        hf_token_times = []

        with torch.no_grad():
            # TTFT: first forward pass over the prompt.
            next_id, hf_ttft_us = timed_step(all_ids)
            hf_generated.append(next_id)
            all_ids = torch.cat([all_ids, torch.tensor([[next_id]], device=device)], dim=1)

            # Remaining tokens: same count as xserv generated, stopping early
            # on EOS exactly like the xserv benchmark does.
            for _ in range(1, gen_tokens):
                next_id, elapsed = timed_step(all_ids)
                hf_token_times.append(elapsed)
                hf_generated.append(next_id)
                all_ids = torch.cat([all_ids, torch.tensor([[next_id]], device=device)], dim=1)

                if eos_id is not None and next_id == eos_id:
                    break

        hf_tbt_us = sum(hf_token_times) / len(hf_token_times) if hf_token_times else 0

        # Compare
        match = xserv_ids == hf_generated
        if match:
            match_count += 1
            status = " OK "
        else:
            mismatch_count += 1
            status = "FAIL!"

        xserv_ttft_ms = xr["ttft_us"] / 1000.0
        xserv_tbt_ms = xr["tbt_us"] / 1000.0
        hf_ttft_ms = hf_ttft_us / 1000.0
        hf_tbt_ms = hf_tbt_us / 1000.0

        prompt_short = prompt[:43] + ".." if len(prompt) > 45 else prompt
        print(f"{i+1:>3} {status} {prompt_short:<45} {xserv_ttft_ms:>8.1f}ms {hf_ttft_ms:>8.1f}ms {xserv_tbt_ms:>8.1f}ms {hf_tbt_ms:>8.1f}ms")

        if not match:
            # Show first divergence
            for j in range(max(len(xserv_ids), len(hf_generated))):
                x = xserv_ids[j] if j < len(xserv_ids) else None
                h = hf_generated[j] if j < len(hf_generated) else None
                if x != h:
                    x_tok = tokenizer.decode([x]) if x is not None else "<none>"
                    h_tok = tokenizer.decode([h]) if h is not None else "<none>"
                    print(f" ↳ diverge at token {j}: xserv={x}({repr(x_tok)}) vs hf={h}({repr(h_tok)})")
                    break

        xserv_ttft_sum += xr["ttft_us"]
        xserv_tbt_sum += xr["tbt_us"]
        hf_ttft_sum += hf_ttft_us
        hf_tbt_sum += hf_tbt_us
        # Only prompts with at least one decode step contribute to TBT averages.
        if xr["tbt_us"] > 0:
            num_with_tbt += 1

    print(f"{'='*100}")
    print(f"\n=== CORRECTNESS ===")
    print(f"Total prompts: {total}")
    print(f"Match: {match_count}/{total} ({match_count/total*100:.1f}%)")
    print(f"Mismatch: {mismatch_count}/{total}")

    print(f"\n=== PERFORMANCE (average) ===")
    print(f"{'Metric':<20} {'xserv':>12} {'transformers':>12} {'ratio':>10}")
    print(f"{'-'*54}")
    avg_x_ttft = xserv_ttft_sum / total / 1000
    avg_h_ttft = hf_ttft_sum / total / 1000
    avg_x_tbt = xserv_tbt_sum / num_with_tbt / 1000 if num_with_tbt > 0 else 0
    avg_h_tbt = hf_tbt_sum / num_with_tbt / 1000 if num_with_tbt > 0 else 0
    # All ratios fall back to 0 rather than dividing by a zero denominator.
    ttft_ratio = avg_x_ttft / avg_h_ttft if avg_h_ttft > 0 else 0
    tbt_ratio = avg_x_tbt / avg_h_tbt if avg_h_tbt > 0 else 0
    print(f"{'TTFT (ms)':<20} {avg_x_ttft:>10.1f}ms {avg_h_ttft:>10.1f}ms {ttft_ratio:>9.1f}x")
    print(f"{'TBT (ms)':<20} {avg_x_tbt:>10.1f}ms {avg_h_tbt:>10.1f}ms {tbt_ratio:>9.1f}x")
    xserv_tps = 1000.0 / avg_x_tbt if avg_x_tbt > 0 else 0
    hf_tps = 1000.0 / avg_h_tbt if avg_h_tbt > 0 else 0
    tps_ratio = xserv_tps / hf_tps if hf_tps > 0 else 0
    print(f"{'Throughput (tok/s)':<20} {xserv_tps:>10.1f} {hf_tps:>10.1f} {tps_ratio:>9.2f}x")

    print(f"\nNote: xserv currently has no KV cache — full recompute per token.")
    print(f" transformers also runs without KV cache in this benchmark for fair comparison.")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user