Files
xserv/tools/bench/speed.py
Gahow Wang 49c7653222 tools: add llama.cpp comparison baseline + standard benchmark suite
Vendor llama.cpp as a submodule pinned to b9371 and add a one-click
benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the
  GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00

170 lines
6.2 KiB
Python

"""Speed suite: TTFT, TPOT, throughput; serial and concurrent.
Single-stream and concurrent throughput are reported separately because they
stress different things — TTFT/TPOT are kernel/latency bound (single stream),
throughput at high concurrency is scheduler/batching bound.
"""
from __future__ import annotations
import asyncio
import statistics
from dataclasses import asdict, dataclass
from typing import Any
from .client import StreamResult, chat_concurrent
from .config import BenchConfig, SystemEndpoint
# Prompt buckets for the speed suite, keyed by bucket name. Three lengths
# cover the common interesting points:
#   short  - greeting-style one-liner
#   medium - question/answer sized request
#   long   - summarize-ish request (prefill-heavy)
SPEED_PROMPTS = {
    "short": "What is 2 + 2?",
    "medium": "Explain the difference between TCP and UDP, briefly.",
    "long": (
        "Write a detailed comparison of Python and Rust for systems programming. "
        "Cover memory management, performance, ergonomics, ecosystem, and typical "
        "use cases. Be specific."
    ),
}
@dataclass
class SpeedRow:
    """Aggregated speed metrics for one (system, scenario) combination.

    Field order is significant: it defines both the positional constructor
    signature and the key order produced by ``dataclasses.asdict``.
    """

    # --- identification ---
    system: str
    scenario: str  # e.g. "single/short", "concurrent-4"

    # --- volume ---
    requests: int
    completion_tokens_total: int
    wall_s: float

    # --- latency percentiles, in milliseconds ---
    ttft_ms_p50: float
    ttft_ms_p95: float
    tpot_ms_p50: float
    tpot_ms_p95: float

    # --- throughput, in tokens per second ---
    throughput_tok_s: float  # aggregate completion_tokens / wall
    per_req_throughput_tok_s_mean: float

    # --- failures ---
    errors: int
def _percentile(values: list[float], p: float) -> float:
if not values:
return -1.0
s = sorted(values)
idx = max(0, min(len(s) - 1, int(round((p / 100.0) * (len(s) - 1)))))
return s[idx]
def _summarize(system: str, scenario: str, results: list[StreamResult], wall_s: float) -> SpeedRow:
    """Collapse the per-request results of one scenario into a ``SpeedRow``.

    Only results whose ``error`` is ``None`` contribute samples. Among those,
    negative TTFT/TPOT values and non-positive per-request throughputs are
    left out (presumably "not measured" sentinels from the client; confirm
    against ``StreamResult``).

    Args:
        system: Name of the system under test.
        scenario: Scenario label stored on the row.
        results: Every per-request result of the scenario, failures included.
        wall_s: Elapsed time for the whole scenario, used as the divisor for
            aggregate throughput.

    Returns:
        A ``SpeedRow`` in which ``requests`` counts all results and ``errors``
        counts those with ``error`` set. Metrics that have no usable samples
        (or a non-positive ``wall_s``) are reported as ``-1.0``.
    """
    succeeded = [res for res in results if res.error is None]

    ttft_samples_ms: list[float] = []
    tpot_samples_ms: list[float] = []
    request_rates: list[float] = []
    token_sum = 0
    for res in succeeded:
        if res.ttft_s >= 0:
            ttft_samples_ms.append(res.ttft_s * 1000)
        if res.tpot_s >= 0:
            tpot_samples_ms.append(res.tpot_s * 1000)
        if res.throughput_tok_s > 0:
            request_rates.append(res.throughput_tok_s)
        token_sum += res.completion_tokens

    if wall_s > 0:
        aggregate_rate = token_sum / wall_s
    else:
        aggregate_rate = -1.0

    if request_rates:
        mean_request_rate = statistics.mean(request_rates)
    else:
        mean_request_rate = -1.0

    return SpeedRow(
        system=system,
        scenario=scenario,
        requests=len(results),
        completion_tokens_total=token_sum,
        wall_s=wall_s,
        ttft_ms_p50=_percentile(ttft_samples_ms, 50),
        ttft_ms_p95=_percentile(ttft_samples_ms, 95),
        tpot_ms_p50=_percentile(tpot_samples_ms, 50),
        tpot_ms_p95=_percentile(tpot_samples_ms, 95),
        throughput_tok_s=aggregate_rate,
        per_req_throughput_tok_s_mean=mean_request_rate,
        errors=len(results) - len(succeeded),
    )
async def run_single_stream(
    ep: SystemEndpoint, cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Measure every prompt bucket with a single request in flight.

    For each entry of ``SPEED_PROMPTS`` the prompt is sent
    ``cfg.speed_prompts`` times through ``chat_concurrent`` with
    ``concurrency=1`` and ``temperature=0.0``.

    Args:
        ep: Endpoint under test (base URL, model id, API key, display name).
        cfg: Supplies the repeat count, ``max_tokens`` and request timeout.

    Returns:
        ``(rows, raw)``: one summary ``SpeedRow`` per bucket, labelled
        ``"single/<bucket>"``, and one raw dict per individual request.
    """
    summaries: list[SpeedRow] = []
    records: list[dict[str, Any]] = []

    for bucket, prompt in SPEED_PROMPTS.items():
        scenario = f"single/{bucket}"
        conversation = [{"role": "user", "content": prompt}]
        # Every slot refers to the same conversation object.
        batch = [conversation] * cfg.speed_prompts

        results, wall = await chat_concurrent(
            ep.base_url,
            ep.model_id,
            batch,
            max_tokens=cfg.speed_max_tokens,
            temperature=0.0,
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=1,
        )

        summaries.append(_summarize(ep.name, scenario, results, wall))
        records.extend(
            {
                "system": ep.name,
                "scenario": scenario,
                "i": idx,
                "ttft_s": res.ttft_s,
                "tpot_s": res.tpot_s,
                "completion_tokens": res.completion_tokens,
                "e2e_s": res.e2e_s,
                "error": res.error,
                "finish_reason": res.finish_reason,
            }
            for idx, res in enumerate(results)
        )

    return summaries, records
async def run_concurrent(
    ep: SystemEndpoint, cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Sweep the concurrency levels in ``cfg.speed_concurrency``.

    Every level uses the fixed ``"medium"`` prompt from ``SPEED_PROMPTS``
    with ``temperature=0.0``.

    Args:
        ep: Endpoint under test (base URL, model id, API key, display name).
        cfg: Supplies the concurrency levels, ``max_tokens`` and timeout.

    Returns:
        ``(rows, raw)``: one summary ``SpeedRow`` per concurrency level,
        labelled ``"concurrent-<c>"``, and one raw dict per individual request.
    """
    summaries: list[SpeedRow] = []
    records: list[dict[str, Any]] = []
    conversation = [{"role": "user", "content": SPEED_PROMPTS["medium"]}]

    for level in cfg.speed_concurrency:
        scenario = f"concurrent-{level}"
        # Issue four times the concurrency (never fewer than 8 requests) so
        # the scheduler sees sustained load rather than a single wave.
        request_count = max(level * 4, 8)
        # Every slot refers to the same conversation object.
        batch = [conversation] * request_count

        results, wall = await chat_concurrent(
            ep.base_url,
            ep.model_id,
            batch,
            max_tokens=cfg.speed_max_tokens,
            temperature=0.0,
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=level,
        )

        summaries.append(_summarize(ep.name, scenario, results, wall))
        records.extend(
            {
                "system": ep.name,
                "scenario": scenario,
                "i": idx,
                "ttft_s": res.ttft_s,
                "tpot_s": res.tpot_s,
                "completion_tokens": res.completion_tokens,
                "e2e_s": res.e2e_s,
                "error": res.error,
                "finish_reason": res.finish_reason,
            }
            for idx, res in enumerate(results)
        )

    return summaries, records
def run_speed(
    endpoints: list[SystemEndpoint], cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Run the full speed suite against each endpoint in turn.

    Per endpoint: send one small warm-up request (result discarded), run the
    single-stream buckets, then run the concurrency sweep. Each phase runs in
    its own ``asyncio.run`` call, and a progress line is printed per row.

    Args:
        endpoints: Systems to benchmark, processed in the given order.
        cfg: Benchmark configuration forwarded to both phases.

    Returns:
        ``(rows, raw)`` accumulated over all endpoints, single-stream results
        before concurrent results for each endpoint.
    """
    collected_rows: list[SpeedRow] = []
    collected_raw: list[dict[str, Any]] = []

    for ep in endpoints:
        print(f"[speed] === {ep.name} ===")

        # Tiny warm-up so first-request setup cost (e.g. lazy cache
        # allocation) does not land in the first measured row.
        warmup_batch = [[{"role": "user", "content": "Hello"}]]
        asyncio.run(
            chat_concurrent(
                ep.base_url,
                ep.model_id,
                warmup_batch,
                max_tokens=8,
                temperature=0.0,
                api_key=ep.api_key,
                timeout=120,
                concurrency=1,
            )
        )

        # Phase 1: one request at a time, per prompt bucket.
        single_rows, single_raw = asyncio.run(run_single_stream(ep, cfg))
        collected_rows.extend(single_rows)
        collected_raw.extend(single_raw)
        for row in single_rows:
            print(f" {row.scenario:18s} ttft p50={row.ttft_ms_p50:7.1f}ms "
                  f"tpot p50={row.tpot_ms_p50:6.2f}ms thpt={row.throughput_tok_s:6.1f} tok/s")

        # Phase 2: concurrency sweep.
        sweep_rows, sweep_raw = asyncio.run(run_concurrent(ep, cfg))
        collected_rows.extend(sweep_rows)
        collected_raw.extend(sweep_raw)
        for row in sweep_rows:
            print(f" {row.scenario:18s} reqs={row.requests:3d} thpt={row.throughput_tok_s:6.1f} tok/s "
                  f"ttft p95={row.ttft_ms_p95:7.1f}ms errs={row.errors}")

    return collected_rows, collected_raw
def rows_to_dicts(rows: list[SpeedRow]) -> list[dict[str, Any]]:
    """Convert each row to a plain dict via ``dataclasses.asdict``.

    Args:
        rows: Dataclass instances to convert.

    Returns:
        A new list with one dict per row, in the same order.
    """
    return list(map(asdict, rows))