bench: run one server at a time, match thinking mode, fix tools package

Refinements from end-to-end bring-up on the GPU host:

- Run each system start→suites→stop in sequence. Two BF16 8B models don't co-reside on one 32GB GPU, and a resident idle engine would distort the other's latency/throughput.
- Match generation mode: xserv hardcodes Qwen3 thinking off, so send chat_template_kwargs={enable_thinking:false} to llama.cpp via a per-endpoint extra_body. --enable-thinking opts back into thinking mode.
- Add tools/__init__.py so `python3 -m tools.bench.runner` resolves our package instead of a site-packages `tools` (nvfuser ships one that shadowed it).
- Document offline-GPU-host workflow, thinking-match, and the xserv 8192 OOM finding that the bench surfaced.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:40:07 +08:00
parent 49c7653222
commit 7cb9ee3870
7 changed files with 102 additions and 28 deletions
--- a/tools/__init__.py
+++ b/tools/__init__.py
--- a/tools/bench/client.py
+++ b/tools/bench/client.py
@@ -55,6 +55,7 @@ async def chat_stream(
    temperature: float = 0.0,
    api_key: str | None = None,
    timeout: float = 1800.0,
+    extra_body: dict | None = None,
 ) -> StreamResult:
    payload: dict[str, Any] = {
        "model": model,
@@ -66,6 +67,8 @@ async def chat_stream(
    # llama-server returns usage in the final stream chunk when this is set;
    # xserv ignores unknown fields, so this is harmless there.
    payload["stream_options"] = {"include_usage": True}
+    if extra_body:
+        payload.update(extra_body)

    headers = {"Content-Type": "application/json"}
    if api_key:
@@ -135,6 +138,7 @@ async def chat_concurrent(
    api_key: str | None = None,
    timeout: float = 1800.0,
    concurrency: int,
+    extra_body: dict | None = None,
 ) -> tuple[list[StreamResult], float]:
    """Fire `concurrency` requests in parallel waves. Returns per-request results
    plus wall-clock elapsed time of the entire batch."""
@@ -146,7 +150,7 @@ async def chat_concurrent(
                return await chat_stream(
                    client, base_url, model, messages,
                    max_tokens=max_tokens, temperature=temperature,
-                    api_key=api_key, timeout=timeout,
+                    api_key=api_key, timeout=timeout, extra_body=extra_body,
                )
        t0 = time.perf_counter()
        results = await asyncio.gather(*(one(p) for p in prompts))
--- a/tools/bench/config.py
+++ b/tools/bench/config.py
@@ -24,6 +24,12 @@ class SystemEndpoint:
    base_url: str                  # http://host:port  (OpenAI-compatible root, no /v1)
    model_id: str                  # what to put in the request body's "model" field
    api_key: str | None = None     # llama-server doesn't need one; xserv ignores it
+    # Extra fields merged into every request body for this system. Used to keep
+    # the two engines in the SAME generation mode — xserv hardcodes Qwen3
+    # thinking OFF (empty <think></think> in its prompt builder), so we disable
+    # thinking on llama-server via chat_template_kwargs to match. Both engines
+    # ignore unknown fields, so this is safe.
+    extra_body: dict | None = None
    # Process supervision is optional — if base_url is already serving, we skip launch.
    launch_cmd: list[str] | None = None
    launch_env: dict[str, str] = field(default_factory=dict)
--- a/tools/bench/quality.py
+++ b/tools/bench/quality.py
@@ -81,6 +81,7 @@ async def _run_one_task(
                temperature=cfg.quality_temperature,
                api_key=ep.api_key,
                timeout=cfg.request_timeout_s,
+                extra_body=ep.extra_body,
            )
            pred = task_mod.extract_answer(r.text) if r.error is None else None
            correct = task_mod.score(pred, prob["answer"]) if r.error is None else False
--- a/tools/bench/runner.py
+++ b/tools/bench/runner.py
@@ -24,7 +24,6 @@ import os
 import platform
 import subprocess
 import sys
-from contextlib import ExitStack
 from typing import Any

 # Allow running as `python3 tools/bench/runner.py` from repo root.
@@ -35,7 +34,7 @@ from tools.bench.config import (
    BenchConfig, SystemEndpoint, SYSTEM_XSERV, SYSTEM_LLAMA_CPP,
 )
 from tools.bench.servers import (
-    ServerHandle, start_server, stop_server,
+    start_server, stop_server,
    xserv_launch_cmd, llama_cpp_launch_cmd,
 )
 from tools.bench.speed import run_speed, rows_to_dicts as speed_rows_to_dicts
@@ -70,6 +69,9 @@ def parse_args() -> argparse.Namespace:
    p.add_argument("--max-seq-len", type=int, default=8192)
    p.add_argument("--systems", default="xserv,llama.cpp",
                   help="Comma-separated subset to run, e.g. 'xserv' to skip llama.cpp")
+    p.add_argument("--enable-thinking", action="store_true",
+                   help="Enable Qwen3 thinking on llama.cpp. Default OFF to match "
+                        "xserv, which hardcodes thinking off in its prompt builder.")

    # Suites
    p.add_argument("--suite", choices=["speed", "quality", "all"], default="all")
@@ -110,11 +112,17 @@ def build_endpoints(args) -> list[SystemEndpoint]:
                ready_timeout_s=900.0,
            ))

+    # Match xserv's hardcoded thinking-OFF mode unless explicitly overridden.
+    llama_extra_body = None if args.enable_thinking else {
+        "chat_template_kwargs": {"enable_thinking": False}
+    }
+
    if SYSTEM_LLAMA_CPP in wanted:
        if args.llama_base_url:
            eps.append(SystemEndpoint(
                name=SYSTEM_LLAMA_CPP, base_url=args.llama_base_url,
                model_id=args.llama_model_id, launch_cmd=None,
+                extra_body=llama_extra_body,
            ))
        else:
            gguf = args.llama_gguf or os.environ.get("LLAMA_GGUF")
@@ -131,6 +139,7 @@ def build_endpoints(args) -> list[SystemEndpoint]:
                # llama-server's health endpoint also returns 200 only when model is loaded.
                health_path="/health",
                ready_timeout_s=900.0,
+                extra_body=llama_extra_body,
            ))
    return eps

@@ -169,24 +178,28 @@ def main() -> None:
    os.makedirs(args.out_dir, exist_ok=True)
    log_dir = os.path.join(args.out_dir, "logs")

-    handles: list[ServerHandle] = []
    speed_rows: list[Any] = []
    speed_raw: list[dict[str, Any]] = []
    quality_rows: list[Any] = []
    quality_cases: list[Any] = []
+    tasks = [t.strip() for t in args.quality_tasks.split(",") if t.strip()]

-    with ExitStack() as stack:
-        for ep in endpoints:
-            h = start_server(ep, log_dir)
-            handles.append(h)
-            stack.callback(stop_server, h)
-
-        if args.suite in ("speed", "all"):
-            speed_rows, speed_raw = run_speed(endpoints, cfg)
-
-        if args.suite in ("quality", "all"):
-            tasks = [t.strip() for t in args.quality_tasks.split(",") if t.strip()]
-            quality_rows, quality_cases = run_quality(endpoints, cfg, tasks)
+    # One server at a time. Two BF16 8B models (~16GB each) do not co-reside on a
+    # single 32GB GPU, and even if they did, a resident idle engine would distort
+    # the other's measurements. Start → run all suites → stop, then next system.
+    for ep in endpoints:
+        h = start_server(ep, log_dir)
+        try:
+            if args.suite in ("speed", "all"):
+                rows, raw = run_speed([ep], cfg)
+                speed_rows.extend(rows)
+                speed_raw.extend(raw)
+            if args.suite in ("quality", "all"):
+                rows, cases = run_quality([ep], cfg, tasks)
+                quality_rows.extend(rows)
+                quality_cases.extend(cases)
+        finally:
+            stop_server(h)

    write_report(
        out_dir=args.out_dir,
--- a/tools/bench/speed.py
+++ b/tools/bench/speed.py
@@ -90,6 +90,7 @@ async def run_single_stream(
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=1,
+            extra_body=ep.extra_body,
        )
        rows.append(_summarize(ep.name, f"single/{bucket}", results, wall))
        for i, r in enumerate(results):
@@ -122,6 +123,7 @@ async def run_concurrent(
            api_key=ep.api_key,
            timeout=cfg.request_timeout_s,
            concurrency=c,
+            extra_body=ep.extra_body,
        )
        rows.append(_summarize(ep.name, f"concurrent-{c}", results, wall))
        for i, r in enumerate(results):
@@ -147,7 +149,7 @@ def run_speed(
        asyncio.run(chat_concurrent(
            ep.base_url, ep.model_id, warm_messages,
            max_tokens=8, temperature=0.0, api_key=ep.api_key,
-            timeout=120, concurrency=1,
+            timeout=120, concurrency=1, extra_body=ep.extra_body,
        ))

        rows1, raw1 = asyncio.run(run_single_stream(ep, cfg))