From 7cb9ee3870f1e6b9ff3d7080b07a572fdacf2ac2 Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Thu, 28 May 2026 11:40:07 +0800
Subject: [PATCH] bench: run one server at a time, match thinking mode, fix
 tools package
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refinements from end-to-end bring-up on the GPU host:

- Run each system start→suites→stop in sequence. Two BF16 8B models don't
  co-reside on one 32GB GPU, and a resident idle engine would distort the
  other's latency/throughput.
- Match generation mode: xserv hardcodes Qwen3 thinking off, so send
  chat_template_kwargs={enable_thinking:false} to llama.cpp via a per-endpoint
  extra_body. --enable-thinking opts back into thinking mode.
- Add tools/__init__.py so `python3 -m tools.bench.runner` resolves our package
  instead of a site-packages `tools` (nvfuser ships one that shadowed it).
- Document offline-GPU-host workflow, thinking-match, and the xserv 8192 OOM
  finding that the bench surfaced.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 docs/16-llama-cpp-comparison.md | 70 +++++++++++++++++++++++++++------
 tools/__init__.py               |  0
 tools/bench/client.py           |  6 ++-
 tools/bench/config.py           |  6 +++
 tools/bench/quality.py          |  1 +
 tools/bench/runner.py           | 43 +++++++++++++-------
 tools/bench/speed.py            |  4 +-
 7 files changed, 102 insertions(+), 28 deletions(-)
 create mode 100644 tools/__init__.py
diff --git a/docs/16-llama-cpp-comparison.md b/docs/16-llama-cpp-comparison.md
index 406053a..a3fc127 100644
--- a/docs/16-llama-cpp-comparison.md
+++ b/docs/16-llama-cpp-comparison.md
@@ -52,18 +52,33 @@ isolates the test harness from internal API churn on either side.
 
 ## Workflow
 
+The GPU host (dash5) has **no outbound network and no rsync**, so anything from
+the internet is fetched locally and shipped over via tar-over-ssh.
+
 ```
-local repo                            dash5 (GPU host)
-──────────                            ────────────────
-tools/sync-and-build.sh bench   →  rsync project (excl. target, third_party, bench-out)
-                                   →  setup-llama-cpp.sh    (no-op if built)
-                                   →  convert-to-gguf.sh    (no-op if .gguf exists)
-                                   →  cargo build --release
-                                   →  python3 -m tools.bench.runner ...
-                                   →  bench-out/comparison-<stamp>.md
-tools/sync-and-build.sh fetch-bench-out  ←  rsync bench-out back
+local repo (has network)              dash5 (GPU host, no network)
+────────────────────────              ────────────────────────────
+# one-time, on a networked machine:
+python3 -m tools.bench.fetch_datasets  →  tools/bench/data/{aime2025,gsm8k}.json
+git submodule update --init …          →  third_party/llama.cpp source
+
+tools/sync-and-build.sh bench   →  tar project   (excl. target, third_party, bench-out)
+                                →  tar llama.cpp source (excl. build, .git)
+                                →  setup-llama-cpp.sh   (build-only; no-op if built)
+                                →  convert-to-gguf.sh   (no-op if .gguf exists)
+                                →  cargo build --release
+                                →  python3 -m tools.bench.runner ...
+                                →  bench-out/comparison-<stamp>.md
+tools/sync-and-build.sh fetch-bench-out  ←  tar bench-out back
 ```
 
+Behind a flaky proxy, fetch datasets through the HF mirror:
+`HF_ENDPOINT=https://hf-mirror.com python3 -m tools.bench.fetch_datasets`.
+
+`tools/__init__.py` exists so `python3 -m tools.bench.runner` resolves our
+package: some site-packages (e.g. nvfuser) ship a regular top-level `tools`
+package, which Python prefers over our `tools` if ours is a namespace package.
+
 ## What gets measured
 
 ### Speed (TTFT / TPOT / throughput)
@@ -78,12 +93,19 @@ tools/sync-and-build.sh fetch-bench-out  ←  rsync bench-out back
 
 | Task | N | Source | Scoring | Why |
 |---|---|---|---|---|
-| AIME 2025 | 30 | `MathArena/aime_2025` (HF) | exact-match boxed integer (0..999) | reasoning + math, hard signal |
+| AIME 2025 | 30 | `MathArena/aime_2025`, fallback `yentinglin/aime_2025` (HF) | exact-match boxed integer (0..999) | reasoning + math, hard signal |
 | GSM8K | 1319 | `openai/gsm8k` (HF), `test` split | exact-match `\boxed{n}` or last number | broad sanity, decimals allowed |
 
 Same `temperature=0` sampling across both systems. Max tokens: 16384 for AIME
 (reasoning long), 2048 for GSM8K. Subsample with `--quality-limit N` for smoke.
 
+**Generation mode must match.** xserv's prompt builder hardcodes Qwen3 thinking
+OFF (it appends an empty `<think></think>` block). llama-server applies the
+GGUF's Qwen3 jinja template, which has thinking ON by default. The driver
+therefore sends `chat_template_kwargs={"enable_thinking": false}` to llama.cpp
+so both engines run the model in the same mode. `--enable-thinking` drops that
+override (llama.cpp thinks; xserv stays off until it gains a matching switch).
+
 ### Report
 
 `bench-out/comparison-<stamp>.md` contains:
@@ -96,9 +118,16 @@ A sibling `.json` holds all per-request raw rows and per-problem case detail
 
 ## Running it
 
+**One-time prerequisites (on a networked machine):**
+```bash
+git submodule update --init third_party/llama.cpp     # pinned to b9371
+HF_ENDPOINT=https://hf-mirror.com python3 -m tools.bench.fetch_datasets
+```
+
 **Full sweep on dash5 (recommended):**
 ```bash
-./tools/sync-and-build.sh bench
+# 4096 ctx because xserv OOMs at 8192 (see Known constraints)
+./tools/sync-and-build.sh bench -- --max-seq-len 4096 --quality-limit 50
 ./tools/sync-and-build.sh fetch-bench-out
 open bench-out/comparison-*.md
 ```
@@ -142,6 +171,25 @@ python3 -m tools.bench.runner \
    own process group and SIGTERM the group on exit so half-dead llama-server
    children don't survive. If the user is already running a server somewhere,
    pass `--xserv-base-url` / `--llama-base-url` to skip launch.
+6. **One server at a time.** The driver starts a system, runs every suite
+   against it, stops it, then moves to the next. Two BF16 8B models (~16GB each)
+   do not co-reside on a single 32GB GPU, and a resident idle engine would
+   distort the other's latency/throughput. This serialization is why the report
+   is assembled from per-system passes rather than a single interleaved run.
+
+## Known constraints / findings
+
+- **xserv OOMs at `--max-seq-len 8192` + `--max-batch 4`.** xserv eagerly
+  pre-allocates its paged-KV pool (`total_blocks = blocks_per_seq · max_batch ·
+  2`, ≈9GB at 8192) on top of the 16GB weights, exceeding 32GB at startup
+  (`paged_kv_cache.rs` `alloc paged K pool: OutOfMemory`). llama.cpp allocates
+  KV lazily and fits 8192 easily. Until xserv's sizing is fixed, run the
+  comparison at `--max-seq-len 4096` (xserv peaks ~28GB there). The benchmark
+  surfaced this — it's tracked as a follow-up fix.
+- When the xserv engine thread dies, the request handler panics on the poisoned
+  `engine_sender` mutex and every subsequent request fails with "server
+  disconnected". The driver records these as per-request errors (no crash), so a
+  broken engine shows up as `errs=N` / `accuracy 0%` rather than a hung run.
 
 ## Future extensions
 
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/bench/client.py b/tools/bench/client.py
index 977c32e..df40685 100644
--- a/tools/bench/client.py
+++ b/tools/bench/client.py
@@ -55,6 +55,7 @@ async def chat_stream(
     temperature: float = 0.0,
     api_key: str | None = None,
     timeout: float = 1800.0,
+    extra_body: dict | None = None,
 ) -> StreamResult:
     payload: dict[str, Any] = {
         "model": model,
@@ -66,6 +67,8 @@ async def chat_stream(
     # llama-server returns usage in the final stream chunk when this is set;
     # xserv ignores unknown fields, so this is harmless there.
     payload["stream_options"] = {"include_usage": True}
+    if extra_body:
+        payload.update(extra_body)
 
     headers = {"Content-Type": "application/json"}
     if api_key:
@@ -135,6 +138,7 @@ async def chat_concurrent(
     api_key: str | None = None,
     timeout: float = 1800.0,
     concurrency: int,
+    extra_body: dict | None = None,
 ) -> tuple[list[StreamResult], float]:
     """Fire `concurrency` requests in parallel waves. Returns per-request results
     plus wall-clock elapsed time of the entire batch."""
@@ -146,7 +150,7 @@ async def chat_concurrent(
                 return await chat_stream(
                     client, base_url, model, messages,
                     max_tokens=max_tokens, temperature=temperature,
-                    api_key=api_key, timeout=timeout,
+                    api_key=api_key, timeout=timeout, extra_body=extra_body,
                 )
         t0 = time.perf_counter()
         results = await asyncio.gather(*(one(p) for p in prompts))
diff --git a/tools/bench/config.py b/tools/bench/config.py
index 7309231..3b905b5 100644
--- a/tools/bench/config.py
+++ b/tools/bench/config.py
@@ -24,6 +24,12 @@ class SystemEndpoint:
     base_url: str                  # http://host:port  (OpenAI-compatible root, no /v1)
     model_id: str                  # what to put in the request body's "model" field
     api_key: str | None = None     # llama-server doesn't need one; xserv ignores it
+    # Extra fields merged into every request body for this system. Used to keep
+    # the two engines in the SAME generation mode — xserv hardcodes Qwen3
+    # thinking OFF (empty <think></think> in its prompt builder), so we disable
+    # thinking on llama-server via chat_template_kwargs to match. Fields an
+    # engine does not recognise are ignored, so stray keys are harmless.
+    extra_body: dict | None = None
     # Process supervision is optional — if base_url is already serving, we skip launch.
     launch_cmd: list[str] | None = None
     launch_env: dict[str, str] = field(default_factory=dict)
diff --git a/tools/bench/quality.py b/tools/bench/quality.py
index 082e3de..e27ee16 100644
--- a/tools/bench/quality.py
+++ b/tools/bench/quality.py
@@ -81,6 +81,7 @@ async def _run_one_task(
                 temperature=cfg.quality_temperature,
                 api_key=ep.api_key,
                 timeout=cfg.request_timeout_s,
+                extra_body=ep.extra_body,
             )
             pred = task_mod.extract_answer(r.text) if r.error is None else None
             correct = task_mod.score(pred, prob["answer"]) if r.error is None else False
diff --git a/tools/bench/runner.py b/tools/bench/runner.py
index 44e0b17..3118f28 100644
--- a/tools/bench/runner.py
+++ b/tools/bench/runner.py
@@ -24,7 +24,6 @@ import os
 import platform
 import subprocess
 import sys
-from contextlib import ExitStack
 from typing import Any
 
 # Allow running as `python3 tools/bench/runner.py` from repo root.
@@ -35,7 +34,7 @@ from tools.bench.config import (
     BenchConfig, SystemEndpoint, SYSTEM_XSERV, SYSTEM_LLAMA_CPP,
 )
 from tools.bench.servers import (
-    ServerHandle, start_server, stop_server,
+    start_server, stop_server,
     xserv_launch_cmd, llama_cpp_launch_cmd,
 )
 from tools.bench.speed import run_speed, rows_to_dicts as speed_rows_to_dicts
@@ -70,6 +69,9 @@ def parse_args() -> argparse.Namespace:
     p.add_argument("--max-seq-len", type=int, default=8192)
     p.add_argument("--systems", default="xserv,llama.cpp",
                    help="Comma-separated subset to run, e.g. 'xserv' to skip llama.cpp")
+    p.add_argument("--enable-thinking", action="store_true",
+                   help="Enable Qwen3 thinking on llama.cpp. Default OFF to match "
+                        "xserv, which hardcodes thinking off in its prompt builder.")
 
     # Suites
     p.add_argument("--suite", choices=["speed", "quality", "all"], default="all")
@@ -110,11 +112,17 @@ def build_endpoints(args) -> list[SystemEndpoint]:
                 ready_timeout_s=900.0,
             ))
 
+    # Match xserv's hardcoded thinking-OFF mode unless explicitly overridden.
+    llama_extra_body = None if args.enable_thinking else {
+        "chat_template_kwargs": {"enable_thinking": False}
+    }
+
     if SYSTEM_LLAMA_CPP in wanted:
         if args.llama_base_url:
             eps.append(SystemEndpoint(
                 name=SYSTEM_LLAMA_CPP, base_url=args.llama_base_url,
                 model_id=args.llama_model_id, launch_cmd=None,
+                extra_body=llama_extra_body,
             ))
         else:
             gguf = args.llama_gguf or os.environ.get("LLAMA_GGUF")
@@ -131,6 +139,7 @@ def build_endpoints(args) -> list[SystemEndpoint]:
                 # llama-server's health endpoint also returns 200 only when model is loaded.
                 health_path="/health",
                 ready_timeout_s=900.0,
+                extra_body=llama_extra_body,
             ))
     return eps
 
@@ -169,24 +178,28 @@ def main() -> None:
     os.makedirs(args.out_dir, exist_ok=True)
     log_dir = os.path.join(args.out_dir, "logs")
 
-    handles: list[ServerHandle] = []
     speed_rows: list[Any] = []
     speed_raw: list[dict[str, Any]] = []
     quality_rows: list[Any] = []
     quality_cases: list[Any] = []
+    tasks = [t.strip() for t in args.quality_tasks.split(",") if t.strip()]
 
-    with ExitStack() as stack:
-        for ep in endpoints:
-            h = start_server(ep, log_dir)
-            handles.append(h)
-            stack.callback(stop_server, h)
-
-        if args.suite in ("speed", "all"):
-            speed_rows, speed_raw = run_speed(endpoints, cfg)
-
-        if args.suite in ("quality", "all"):
-            tasks = [t.strip() for t in args.quality_tasks.split(",") if t.strip()]
-            quality_rows, quality_cases = run_quality(endpoints, cfg, tasks)
+    # One server at a time. Two BF16 8B models (~16GB each) do not co-reside on a
+    # single 32GB GPU, and even if they did, a resident idle engine would distort
+    # the other's measurements. Start → run all suites → stop, then next system.
+    for ep in endpoints:
+        h = start_server(ep, log_dir)
+        try:
+            if args.suite in ("speed", "all"):
+                rows, raw = run_speed([ep], cfg)
+                speed_rows.extend(rows)
+                speed_raw.extend(raw)
+            if args.suite in ("quality", "all"):
+                rows, cases = run_quality([ep], cfg, tasks)
+                quality_rows.extend(rows)
+                quality_cases.extend(cases)
+        finally:
+            stop_server(h)
 
     write_report(
         out_dir=args.out_dir,
diff --git a/tools/bench/speed.py b/tools/bench/speed.py
index 2c57b0b..256d8ac 100644
--- a/tools/bench/speed.py
+++ b/tools/bench/speed.py
@@ -90,6 +90,7 @@ async def run_single_stream(
             api_key=ep.api_key,
             timeout=cfg.request_timeout_s,
             concurrency=1,
+            extra_body=ep.extra_body,
         )
         rows.append(_summarize(ep.name, f"single/{bucket}", results, wall))
         for i, r in enumerate(results):
@@ -122,6 +123,7 @@ async def run_concurrent(
             api_key=ep.api_key,
             timeout=cfg.request_timeout_s,
             concurrency=c,
+            extra_body=ep.extra_body,
         )
         rows.append(_summarize(ep.name, f"concurrent-{c}", results, wall))
         for i, r in enumerate(results):
@@ -147,7 +149,7 @@ def run_speed(
         asyncio.run(chat_concurrent(
             ep.base_url, ep.model_id, warm_messages,
             max_tokens=8, temperature=0.0, api_key=ep.api_key,
-            timeout=120, concurrency=1,
+            timeout=120, concurrency=1, extra_body=ep.extra_body,
         ))
 
         rows1, raw1 = asyncio.run(run_single_stream(ep, cfg))