"""Speed suite: TTFT, TPOT, throughput; serial and concurrent. Single-stream and concurrent throughput are reported separately because they stress different things — TTFT/TPOT are kernel/latency bound (single stream), throughput at high concurrency is scheduler/batching bound. """ from __future__ import annotations import asyncio import statistics from dataclasses import asdict, dataclass from typing import Any from .client import StreamResult, chat_concurrent from .config import BenchConfig, SystemEndpoint # Three prompt-length buckets cover the common interesting points: # short = greeting-style; medium = QA; long = summarize-ish (prefill-heavy). SPEED_PROMPTS = { "short": "What is 2 + 2?", "medium": "Explain the difference between TCP and UDP, briefly.", "long": ( "Write a detailed comparison of Python and Rust for systems programming. " "Cover memory management, performance, ergonomics, ecosystem, and typical " "use cases. Be specific." ), } @dataclass class SpeedRow: system: str scenario: str # e.g. "single/short", "concurrent-4" requests: int completion_tokens_total: int wall_s: float ttft_ms_p50: float ttft_ms_p95: float tpot_ms_p50: float tpot_ms_p95: float throughput_tok_s: float # aggregate completion_tokens / wall per_req_throughput_tok_s_mean: float errors: int def _percentile(values: list[float], p: float) -> float: if not values: return -1.0 s = sorted(values) idx = max(0, min(len(s) - 1, int(round((p / 100.0) * (len(s) - 1))))) return s[idx] def _summarize(system: str, scenario: str, results: list[StreamResult], wall_s: float) -> SpeedRow: ok = [r for r in results if r.error is None] ttft_ms = [r.ttft_s * 1000 for r in ok if r.ttft_s >= 0] tpot_ms = [r.tpot_s * 1000 for r in ok if r.tpot_s >= 0] per_req_tps = [r.throughput_tok_s for r in ok if r.throughput_tok_s > 0] total_tokens = sum(r.completion_tokens for r in ok) return SpeedRow( system=system, scenario=scenario, requests=len(results), completion_tokens_total=total_tokens, wall_s=wall_s, ttft_ms_p50=_percentile(ttft_ms, 50), ttft_ms_p95=_percentile(ttft_ms, 95), tpot_ms_p50=_percentile(tpot_ms, 50), tpot_ms_p95=_percentile(tpot_ms, 95), throughput_tok_s=total_tokens / wall_s if wall_s > 0 else -1.0, per_req_throughput_tok_s_mean=statistics.mean(per_req_tps) if per_req_tps else -1.0, errors=len(results) - len(ok), ) async def run_single_stream( ep: SystemEndpoint, cfg: BenchConfig, ) -> tuple[list[SpeedRow], list[dict[str, Any]]]: """One request at a time, three prompt lengths. Repeat each `cfg.speed_prompts` times.""" rows: list[SpeedRow] = [] raw: list[dict[str, Any]] = [] for bucket, prompt in SPEED_PROMPTS.items(): messages = [[{"role": "user", "content": prompt}]] * cfg.speed_prompts results, wall = await chat_concurrent( ep.base_url, ep.model_id, messages, max_tokens=cfg.speed_max_tokens, temperature=0.0, api_key=ep.api_key, timeout=cfg.request_timeout_s, concurrency=1, ) rows.append(_summarize(ep.name, f"single/{bucket}", results, wall)) for i, r in enumerate(results): raw.append({ "system": ep.name, "scenario": f"single/{bucket}", "i": i, "ttft_s": r.ttft_s, "tpot_s": r.tpot_s, "completion_tokens": r.completion_tokens, "e2e_s": r.e2e_s, "error": r.error, "finish_reason": r.finish_reason, }) return rows, raw async def run_concurrent( ep: SystemEndpoint, cfg: BenchConfig, ) -> tuple[list[SpeedRow], list[dict[str, Any]]]: """Fixed medium-length prompt, sweep concurrency.""" rows: list[SpeedRow] = [] raw: list[dict[str, Any]] = [] prompt = SPEED_PROMPTS["medium"] for c in cfg.speed_concurrency: # Send 4x concurrency requests so the scheduler sees sustained load, # not just one wave. n = max(c * 4, 8) messages = [[{"role": "user", "content": prompt}]] * n results, wall = await chat_concurrent( ep.base_url, ep.model_id, messages, max_tokens=cfg.speed_max_tokens, temperature=0.0, api_key=ep.api_key, timeout=cfg.request_timeout_s, concurrency=c, ) rows.append(_summarize(ep.name, f"concurrent-{c}", results, wall)) for i, r in enumerate(results): raw.append({ "system": ep.name, "scenario": f"concurrent-{c}", "i": i, "ttft_s": r.ttft_s, "tpot_s": r.tpot_s, "completion_tokens": r.completion_tokens, "e2e_s": r.e2e_s, "error": r.error, "finish_reason": r.finish_reason, }) return rows, raw def run_speed( endpoints: list[SystemEndpoint], cfg: BenchConfig, ) -> tuple[list[SpeedRow], list[dict[str, Any]]]: all_rows: list[SpeedRow] = [] all_raw: list[dict[str, Any]] = [] for ep in endpoints: print(f"[speed] === {ep.name} ===") # Tiny warmup so the first row isn't penalized by lazy cache allocation. warm_messages = [[{"role": "user", "content": "Hello"}]] asyncio.run(chat_concurrent( ep.base_url, ep.model_id, warm_messages, max_tokens=8, temperature=0.0, api_key=ep.api_key, timeout=120, concurrency=1, )) rows1, raw1 = asyncio.run(run_single_stream(ep, cfg)) all_rows.extend(rows1); all_raw.extend(raw1) for r in rows1: print(f" {r.scenario:18s} ttft p50={r.ttft_ms_p50:7.1f}ms " f"tpot p50={r.tpot_ms_p50:6.2f}ms thpt={r.throughput_tok_s:6.1f} tok/s") rows2, raw2 = asyncio.run(run_concurrent(ep, cfg)) all_rows.extend(rows2); all_raw.extend(raw2) for r in rows2: print(f" {r.scenario:18s} reqs={r.requests:3d} thpt={r.throughput_tok_s:6.1f} tok/s " f"ttft p95={r.ttft_ms_p95:7.1f}ms errs={r.errors}") return all_rows, all_raw def rows_to_dicts(rows: list[SpeedRow]) -> list[dict[str, Any]]: return [asdict(r) for r in rows]