Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads: - setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline. - tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K. - fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON. - sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
170 lines
6.2 KiB
Python
170 lines
6.2 KiB
Python
"""Speed suite: TTFT, TPOT, throughput; serial and concurrent.
|
|
|
|
Single-stream and concurrent throughput are reported separately because they
|
|
stress different things — TTFT/TPOT are kernel/latency bound (single stream),
|
|
throughput at high concurrency is scheduler/batching bound.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import statistics
|
|
from dataclasses import asdict, dataclass
|
|
from typing import Any
|
|
|
|
from .client import StreamResult, chat_concurrent
|
|
from .config import BenchConfig, SystemEndpoint
|
|
|
|
|
|
# Prompt buckets keyed by relative length. Iteration order (short, medium,
# long) is the order in which the single-stream scenarios are run.
#   short  = greeting-style
#   medium = QA
#   long   = summarize-ish (prefill-heavy)
SPEED_PROMPTS = {
    "short": "What is 2 + 2?",
    "medium": "Explain the difference between TCP and UDP, briefly.",
    "long": (
        "Write a detailed comparison of Python and Rust for systems programming. "
        "Cover memory management, performance, ergonomics, ecosystem, and typical "
        "use cases. Be specific."
    ),
}
|
|
|
|
|
|
@dataclass
class SpeedRow:
    """Aggregated speed metrics for one system under one scenario.

    Latency and throughput fields hold -1.0 when `_summarize` had no usable
    samples to compute them from.
    """

    # Name of the benchmarked system.
    system: str
    # Scenario label, e.g. "single/short", "concurrent-4".
    scenario: str
    # Number of requests issued (successful and failed).
    requests: int
    # Completion tokens summed over successful requests.
    completion_tokens_total: int
    # Elapsed wall time for the whole scenario, in seconds.
    wall_s: float
    # Time-to-first-token percentiles, in milliseconds.
    ttft_ms_p50: float
    ttft_ms_p95: float
    # Time-per-output-token percentiles, in milliseconds.
    tpot_ms_p50: float
    tpot_ms_p95: float
    # Aggregate throughput: completion_tokens / wall.
    throughput_tok_s: float
    # Mean of the per-request throughput values.
    per_req_throughput_tok_s_mean: float
    # Number of requests that reported an error.
    errors: int
|
|
|
|
|
|
def _percentile(values: list[float], p: float) -> float:
|
|
if not values:
|
|
return -1.0
|
|
s = sorted(values)
|
|
idx = max(0, min(len(s) - 1, int(round((p / 100.0) * (len(s) - 1)))))
|
|
return s[idx]
|
|
|
|
|
|
def _summarize(system: str, scenario: str, results: list[StreamResult], wall_s: float) -> SpeedRow:
    """Collapse the per-request results of one scenario into a `SpeedRow`.

    Only results whose `error` is None contribute to the metrics. Within
    those, negative `ttft_s` / `tpot_s` values and non-positive per-request
    throughputs are excluded from their respective statistics. Metrics with
    no usable samples (or a non-positive `wall_s`) are reported as -1.0.
    """
    ok_count = 0
    token_total = 0
    ttft_samples_ms: list[float] = []
    tpot_samples_ms: list[float] = []
    request_tps: list[float] = []

    for res in results:
        if res.error is not None:
            continue
        ok_count += 1
        if res.ttft_s >= 0:
            ttft_samples_ms.append(res.ttft_s * 1000)
        if res.tpot_s >= 0:
            tpot_samples_ms.append(res.tpot_s * 1000)
        if res.throughput_tok_s > 0:
            request_tps.append(res.throughput_tok_s)
        token_total += res.completion_tokens

    if wall_s > 0:
        aggregate_tps = token_total / wall_s
    else:
        aggregate_tps = -1.0

    if request_tps:
        mean_request_tps = statistics.mean(request_tps)
    else:
        mean_request_tps = -1.0

    return SpeedRow(
        system=system,
        scenario=scenario,
        requests=len(results),
        completion_tokens_total=token_total,
        wall_s=wall_s,
        ttft_ms_p50=_percentile(ttft_samples_ms, 50),
        ttft_ms_p95=_percentile(ttft_samples_ms, 95),
        tpot_ms_p50=_percentile(tpot_samples_ms, 50),
        tpot_ms_p95=_percentile(tpot_samples_ms, 95),
        throughput_tok_s=aggregate_tps,
        per_req_throughput_tok_s_mean=mean_request_tps,
        errors=len(results) - ok_count,
    )
|
|
|
|
|
|
async def run_single_stream(
    ep: SystemEndpoint, cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Benchmark each prompt bucket with one request in flight at a time.

    Every prompt in `SPEED_PROMPTS` is sent `cfg.speed_prompts` times with
    `concurrency=1` and `temperature=0.0`.

    Returns:
        A pair `(rows, raw)`: one summary `SpeedRow` per bucket (scenario
        "single/<bucket>") and one dict per individual request.
    """
    summaries: list[SpeedRow] = []
    records: list[dict[str, Any]] = []

    for bucket, prompt in SPEED_PROMPTS.items():
        scenario = f"single/{bucket}"
        conversation = [{"role": "user", "content": prompt}]
        batch = [conversation] * cfg.speed_prompts

        results, wall = await chat_concurrent(
            ep.base_url,
            ep.model_id,
            batch,
            max_tokens=cfg.speed_max_tokens,
            temperature=0.0,
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=1,
        )

        summaries.append(_summarize(ep.name, scenario, results, wall))
        records.extend(
            {
                "system": ep.name,
                "scenario": scenario,
                "i": idx,
                "ttft_s": res.ttft_s,
                "tpot_s": res.tpot_s,
                "completion_tokens": res.completion_tokens,
                "e2e_s": res.e2e_s,
                "error": res.error,
                "finish_reason": res.finish_reason,
            }
            for idx, res in enumerate(results)
        )

    return summaries, records
|
|
|
|
|
|
async def run_concurrent(
    ep: SystemEndpoint, cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Sweep concurrency levels using the fixed medium-length prompt.

    For each level `c` in `cfg.speed_concurrency`, `max(4 * c, 8)` identical
    requests are issued with `concurrency=c` and `temperature=0.0`.

    Returns:
        A pair `(rows, raw)`: one summary `SpeedRow` per concurrency level
        (scenario "concurrent-<c>") and one dict per individual request.
    """
    summaries: list[SpeedRow] = []
    records: list[dict[str, Any]] = []
    conversation = [{"role": "user", "content": SPEED_PROMPTS["medium"]}]

    for level in cfg.speed_concurrency:
        scenario = f"concurrent-{level}"

        # Send 4x concurrency requests (never fewer than 8) so the scheduler
        # sees sustained load, not just one wave.
        request_count = max(level * 4, 8)
        batch = [conversation] * request_count

        results, wall = await chat_concurrent(
            ep.base_url,
            ep.model_id,
            batch,
            max_tokens=cfg.speed_max_tokens,
            temperature=0.0,
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=level,
        )

        summaries.append(_summarize(ep.name, scenario, results, wall))
        records.extend(
            {
                "system": ep.name,
                "scenario": scenario,
                "i": idx,
                "ttft_s": res.ttft_s,
                "tpot_s": res.tpot_s,
                "completion_tokens": res.completion_tokens,
                "e2e_s": res.e2e_s,
                "error": res.error,
                "finish_reason": res.finish_reason,
            }
            for idx, res in enumerate(results)
        )

    return summaries, records
|
|
|
|
|
|
def run_speed(
    endpoints: list[SystemEndpoint], cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Run the whole speed suite against each endpoint, printing progress.

    Per endpoint: one small warmup request (its result is discarded), then
    the single-stream scenarios, then the concurrency sweep. Each phase runs
    in its own `asyncio.run` call.

    Returns:
        A pair `(rows, raw)` accumulating the summary rows and per-request
        records of every endpoint, in the order they were run.
    """
    collected_rows: list[SpeedRow] = []
    collected_raw: list[dict[str, Any]] = []

    for ep in endpoints:
        print(f"[speed] === {ep.name} ===")

        # Tiny warmup so the first row isn't penalized by lazy cache allocation.
        warmup_batch = [[{"role": "user", "content": "Hello"}]]
        asyncio.run(
            chat_concurrent(
                ep.base_url,
                ep.model_id,
                warmup_batch,
                max_tokens=8,
                temperature=0.0,
                api_key=ep.api_key,
                timeout=120,
                concurrency=1,
            )
        )

        single_rows, single_raw = asyncio.run(run_single_stream(ep, cfg))
        collected_rows.extend(single_rows)
        collected_raw.extend(single_raw)
        for row in single_rows:
            print(
                f" {row.scenario:18s} ttft p50={row.ttft_ms_p50:7.1f}ms "
                f"tpot p50={row.tpot_ms_p50:6.2f}ms thpt={row.throughput_tok_s:6.1f} tok/s"
            )

        sweep_rows, sweep_raw = asyncio.run(run_concurrent(ep, cfg))
        collected_rows.extend(sweep_rows)
        collected_raw.extend(sweep_raw)
        for row in sweep_rows:
            print(
                f" {row.scenario:18s} reqs={row.requests:3d} thpt={row.throughput_tok_s:6.1f} tok/s "
                f"ttft p95={row.ttft_ms_p95:7.1f}ms errs={row.errors}"
            )

    return collected_rows, collected_raw
|
|
|
|
|
|
def rows_to_dicts(rows: list[SpeedRow]) -> list[dict[str, Any]]:
    """Convert each row to a plain dict via `dataclasses.asdict`, preserving order."""
    return list(map(asdict, rows))
|