# xserv/tools/bench/speed.py

"""Speed suite: TTFT, TPOT, throughput; serial and concurrent.

Single-stream and concurrent throughput are reported separately because they
stress different things — TTFT/TPOT are kernel/latency bound (single stream),
throughput at high concurrency is scheduler/batching bound.
"""

from __future__ import annotations

import asyncio
import statistics
from dataclasses import asdict, dataclass
from typing import Any

from .client import StreamResult, chat_concurrent
from .config import BenchConfig, SystemEndpoint


# Prompt-length buckets used by the speed suite, keyed by bucket name:
#   short  - greeting-style
#   medium - QA
#   long   - summarize-ish (prefill-heavy)
# Key order is short -> medium -> long.
SPEED_PROMPTS = dict(
    short="What is 2 + 2?",
    medium="Explain the difference between TCP and UDP, briefly.",
    long=(
        "Write a detailed comparison of Python and Rust "
        "for systems programming. "
        "Cover memory management, performance, ergonomics, ecosystem, "
        "and typical use cases. "
        "Be specific."
    ),
)


@dataclass
class SpeedRow:
    """Aggregated speed metrics for one (system, scenario) pair.

    Rows are built by `_summarize`. Float metrics use -1.0 as a "no data"
    sentinel: the percentiles when there are no usable samples, the aggregate
    throughput when `wall_s` is not positive, and the per-request mean when no
    request reported a positive throughput.
    """

    # Name of the system under test (callers pass the endpoint's `name`).
    system: str
    # Scenario label, e.g. "single/short", "concurrent-4".
    scenario: str          # e.g. "single/short", "concurrent-4"
    # Number of requests issued, including failed ones.
    requests: int
    # Sum of completion tokens over the requests that finished without error.
    completion_tokens_total: int
    # Wall-clock duration of the whole scenario as reported by the client
    # (seconds, judging by the `_s` suffix -- confirm against the client).
    wall_s: float
    # Time-to-first-token percentiles in milliseconds, over successful
    # requests with a non-negative TTFT.
    ttft_ms_p50: float
    ttft_ms_p95: float
    # Time-per-output-token percentiles in milliseconds, over successful
    # requests with a non-negative TPOT.
    tpot_ms_p50: float
    tpot_ms_p95: float
    # Aggregate rate: completion_tokens_total divided by wall_s.
    throughput_tok_s: float    # aggregate completion_tokens / wall
    # Mean of the per-request throughput values that are strictly positive.
    per_req_throughput_tok_s_mean: float
    # Number of requests whose result carried an error.
    errors: int


def _percentile(values: list[float], p: float) -> float:
    if not values:
        return -1.0
    s = sorted(values)
    idx = max(0, min(len(s) - 1, int(round((p / 100.0) * (len(s) - 1)))))
    return s[idx]


def _summarize(system: str, scenario: str, results: list[StreamResult], wall_s: float) -> SpeedRow:
    """Collapse the per-request results of one scenario into a `SpeedRow`.

    Only results whose `error` is None contribute to the metrics. Latency
    samples are converted from seconds to milliseconds; negative TTFT/TPOT
    values and non-positive per-request throughputs are left out of their
    respective statistics. Metrics with no data are reported as -1.0.
    """
    ttft_samples_ms = []
    tpot_samples_ms = []
    rate_samples = []
    token_sum = 0
    succeeded = 0

    for res in results:
        if res.error is not None:
            continue
        succeeded += 1
        token_sum += res.completion_tokens
        if res.ttft_s >= 0:
            ttft_samples_ms.append(res.ttft_s * 1000)
        if res.tpot_s >= 0:
            tpot_samples_ms.append(res.tpot_s * 1000)
        if res.throughput_tok_s > 0:
            rate_samples.append(res.throughput_tok_s)

    # Aggregate rate is undefined without a positive wall-clock duration.
    if wall_s > 0:
        aggregate_rate = token_sum / wall_s
    else:
        aggregate_rate = -1.0

    if rate_samples:
        mean_rate = statistics.mean(rate_samples)
    else:
        mean_rate = -1.0

    issued = len(results)
    return SpeedRow(
        system=system,
        scenario=scenario,
        requests=issued,
        completion_tokens_total=token_sum,
        wall_s=wall_s,
        ttft_ms_p50=_percentile(ttft_samples_ms, 50),
        ttft_ms_p95=_percentile(ttft_samples_ms, 95),
        tpot_ms_p50=_percentile(tpot_samples_ms, 50),
        tpot_ms_p95=_percentile(tpot_samples_ms, 95),
        throughput_tok_s=aggregate_rate,
        per_req_throughput_tok_s_mean=mean_rate,
        errors=issued - succeeded,
    )


async def run_single_stream(
    ep: SystemEndpoint, cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """One request at a time, three prompt lengths. Repeat each `cfg.speed_prompts` times.

    For every bucket in `SPEED_PROMPTS`, the prompt is sent `cfg.speed_prompts`
    times through `chat_concurrent` with `concurrency=1` and temperature 0.

    Returns:
        A pair `(rows, raw)`: one summarized `SpeedRow` per bucket (scenario
        label ``"single/<bucket>"``), and one plain dict per individual
        request carrying its latency, token count, error and finish reason.
    """
    rows: list[SpeedRow] = []
    raw: list[dict[str, Any]] = []
    for bucket, prompt in SPEED_PROMPTS.items():
        scenario = f"single/{bucket}"
        # Build an independent conversation per request. `[[...]] * n` would
        # make every request share the same list and dict objects, so any
        # in-place edit by the client would leak into all later requests.
        messages = [
            [{"role": "user", "content": prompt}]
            for _ in range(cfg.speed_prompts)
        ]
        results, wall = await chat_concurrent(
            ep.base_url, ep.model_id, messages,
            max_tokens=cfg.speed_max_tokens,
            temperature=0.0,
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=1,
        )
        rows.append(_summarize(ep.name, scenario, results, wall))
        raw.extend(
            {
                "system": ep.name, "scenario": scenario, "i": i,
                "ttft_s": r.ttft_s, "tpot_s": r.tpot_s,
                "completion_tokens": r.completion_tokens,
                "e2e_s": r.e2e_s, "error": r.error,
                "finish_reason": r.finish_reason,
            }
            for i, r in enumerate(results)
        )
    return rows, raw


async def run_concurrent(
    ep: SystemEndpoint, cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Fixed medium-length prompt, sweep concurrency.

    For every level `c` in `cfg.speed_concurrency`, `max(4 * c, 8)` copies of
    the "medium" prompt are sent through `chat_concurrent` with
    `concurrency=c` and temperature 0.

    Returns:
        A pair `(rows, raw)`: one summarized `SpeedRow` per concurrency level
        (scenario label ``"concurrent-<c>"``), and one plain dict per
        individual request carrying its latency, token count, error and
        finish reason.
    """
    rows: list[SpeedRow] = []
    raw: list[dict[str, Any]] = []
    prompt = SPEED_PROMPTS["medium"]
    for c in cfg.speed_concurrency:
        scenario = f"concurrent-{c}"
        # Send 4x concurrency requests so the scheduler sees sustained load,
        # not just one wave (with a floor of 8 requests).
        n = max(c * 4, 8)
        # Build an independent conversation per request. `[[...]] * n` would
        # make every request share the same list and dict objects, so any
        # in-place edit by the client would leak into all other requests.
        messages = [[{"role": "user", "content": prompt}] for _ in range(n)]
        results, wall = await chat_concurrent(
            ep.base_url, ep.model_id, messages,
            max_tokens=cfg.speed_max_tokens,
            temperature=0.0,
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=c,
        )
        rows.append(_summarize(ep.name, scenario, results, wall))
        raw.extend(
            {
                "system": ep.name, "scenario": scenario, "i": i,
                "ttft_s": r.ttft_s, "tpot_s": r.tpot_s,
                "completion_tokens": r.completion_tokens,
                "e2e_s": r.e2e_s, "error": r.error,
                "finish_reason": r.finish_reason,
            }
            for i, r in enumerate(results)
        )
    return rows, raw


def run_speed(
    endpoints: list[SystemEndpoint], cfg: BenchConfig,
) -> tuple[list[SpeedRow], list[dict[str, Any]]]:
    """Run the full speed suite against every endpoint, printing progress.

    Per endpoint: a small warmup request whose result is discarded, then the
    single-stream pass, then the concurrency sweep. Each phase runs under its
    own `asyncio.run` call.

    Returns:
        A pair `(rows, raw)` holding the summary rows and the per-request
        records of all endpoints, in execution order.
    """
    summary_rows: list[SpeedRow] = []
    request_records: list[dict[str, Any]] = []

    for ep in endpoints:
        print(f"[speed] === {ep.name} ===")

        # Tiny warmup so the first row isn't penalized by lazy cache allocation.
        warmup = chat_concurrent(
            ep.base_url, ep.model_id,
            [[{"role": "user", "content": "Hello"}]],
            max_tokens=8, temperature=0.0, api_key=ep.api_key,
            timeout=120, concurrency=1,
        )
        asyncio.run(warmup)

        # Phase 1: one request at a time.
        single_rows, single_raw = asyncio.run(run_single_stream(ep, cfg))
        summary_rows += single_rows
        request_records += single_raw
        for r in single_rows:
            print(f"  {r.scenario:18s} ttft p50={r.ttft_ms_p50:7.1f}ms  "
                  f"tpot p50={r.tpot_ms_p50:6.2f}ms  thpt={r.throughput_tok_s:6.1f} tok/s")

        # Phase 2: concurrency sweep.
        sweep_rows, sweep_raw = asyncio.run(run_concurrent(ep, cfg))
        summary_rows += sweep_rows
        request_records += sweep_raw
        for r in sweep_rows:
            print(f"  {r.scenario:18s} reqs={r.requests:3d}  thpt={r.throughput_tok_s:6.1f} tok/s  "
                  f"ttft p95={r.ttft_ms_p95:7.1f}ms  errs={r.errors}")

    return summary_rows, request_records


def rows_to_dicts(rows: list[SpeedRow]) -> list[dict[str, Any]]:
    """Convert each `SpeedRow` into a plain dict via `dataclasses.asdict`."""
    converted: list[dict[str, Any]] = []
    for row in rows:
        converted.append(asdict(row))
    return converted