tools: add llama.cpp comparison baseline + standard benchmark suite

Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads: - setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline. - tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K. - fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON. - sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00
parent 9bb5c5c328
commit 49c7653222
20 changed files with 1690 additions and 14 deletions
--- a/tools/bench/client.py
+++ b/tools/bench/client.py
@@ -0,0 +1,154 @@
+"""HTTP client for OpenAI-compatible /v1/chat/completions.
+
+Records per-request: TTFT (time to first content token), TPOT (mean
+inter-token latency over the decode phase), and end-to-end throughput.
+
+We don't care about parsing exact OpenAI envelope semantics, just enough to
+get the deltas + finish_reason + token counts.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+
+@dataclass
+class StreamResult:
+    text: str = ""
+    completion_tokens: int = 0
+    prompt_tokens: int = 0
+    finish_reason: str | None = None
+    # Timings (seconds; -1 means not measured)
+    ttft_s: float = -1.0
+    e2e_s: float = -1.0
+    chunk_times: list[float] = field(default_factory=list)   # absolute monotonic times of content chunks
+    error: str | None = None
+
+    @property
+    def tpot_s(self) -> float:
+        """Mean inter-content-chunk latency after the first chunk (seconds/token)."""
+        if len(self.chunk_times) < 2:
+            return -1.0
+        deltas = [self.chunk_times[i] - self.chunk_times[i - 1] for i in range(1, len(self.chunk_times))]
+        return sum(deltas) / len(deltas)
+
+    @property
+    def throughput_tok_s(self) -> float:
+        if self.e2e_s <= 0 or self.completion_tokens <= 0:
+            return -1.0
+        return self.completion_tokens / self.e2e_s
+
+
+async def chat_stream(
+    client: httpx.AsyncClient,
+    base_url: str,
+    model: str,
+    messages: list[dict[str, str]],
+    *,
+    max_tokens: int,
+    temperature: float = 0.0,
+    api_key: str | None = None,
+    timeout: float = 1800.0,
+) -> StreamResult:
+    payload: dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "stream": True,
+    }
+    # llama-server returns usage in the final stream chunk when this is set;
+    # xserv ignores unknown fields, so this is harmless there.
+    payload["stream_options"] = {"include_usage": True}
+
+    headers = {"Content-Type": "application/json"}
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    url = base_url.rstrip("/") + "/v1/chat/completions"
+    res = StreamResult()
+    t_start = time.perf_counter()
+
+    try:
+        async with client.stream(
+            "POST", url, json=payload, headers=headers, timeout=timeout,
+        ) as resp:
+            if resp.status_code != 200:
+                body = await resp.aread()
+                res.error = f"HTTP {resp.status_code}: {body.decode(errors='replace')[:400]}"
+                res.e2e_s = time.perf_counter() - t_start
+                return res
+
+            async for line in resp.aiter_lines():
+                if not line or not line.startswith("data:"):
+                    continue
+                data = line[len("data:"):].strip()
+                if data == "[DONE]":
+                    break
+                try:
+                    chunk = json.loads(data)
+                except json.JSONDecodeError:
+                    continue
+
+                if "usage" in chunk and chunk["usage"]:
+                    usage = chunk["usage"]
+                    res.prompt_tokens = usage.get("prompt_tokens", res.prompt_tokens)
+                    res.completion_tokens = usage.get("completion_tokens", res.completion_tokens)
+
+                choices = chunk.get("choices") or []
+                if not choices:
+                    continue
+                choice = choices[0]
+                delta = choice.get("delta") or {}
+                content = delta.get("content")
+                if content:
+                    now = time.perf_counter()
+                    if res.ttft_s < 0:
+                        res.ttft_s = now - t_start
+                    res.chunk_times.append(now)
+                    res.text += content
+                if choice.get("finish_reason"):
+                    res.finish_reason = choice["finish_reason"]
+    except Exception as e:  # noqa: BLE001 — surface any failure to the report
+        res.error = f"{type(e).__name__}: {e}"
+
+    res.e2e_s = time.perf_counter() - t_start
+    # Fall back to chunk count when server doesn't report usage (xserv stream path).
+    if res.completion_tokens == 0:
+        res.completion_tokens = len(res.chunk_times)
+    return res
+
+
+async def chat_concurrent(
+    base_url: str,
+    model: str,
+    prompts: list[list[dict[str, str]]],
+    *,
+    max_tokens: int,
+    temperature: float = 0.0,
+    api_key: str | None = None,
+    timeout: float = 1800.0,
+    concurrency: int,
+) -> tuple[list[StreamResult], float]:
+    """Fire `concurrency` requests in parallel waves. Returns per-request results
+    plus wall-clock elapsed time of the entire batch."""
+    sem = asyncio.Semaphore(concurrency)
+    limits = httpx.Limits(max_connections=concurrency * 2, max_keepalive_connections=concurrency)
+    async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
+        async def one(messages: list[dict[str, str]]) -> StreamResult:
+            async with sem:
+                return await chat_stream(
+                    client, base_url, model, messages,
+                    max_tokens=max_tokens, temperature=temperature,
+                    api_key=api_key, timeout=timeout,
+                )
+        t0 = time.perf_counter()
+        results = await asyncio.gather(*(one(p) for p in prompts))
+        wall = time.perf_counter() - t0
+    return results, wall