From 7cb9ee3870f1e6b9ff3d7080b07a572fdacf2ac2 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Thu, 28 May 2026 11:40:07 +0800 Subject: [PATCH] bench: run one server at a time, match thinking mode, fix tools package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refinements from end-to-end bring-up on the GPU host: - Run each system start→suites→stop in sequence. Two BF16 8B models don't co-reside on one 32GB GPU, and a resident idle engine would distort the other's latency/throughput. - Match generation mode: xserv hardcodes Qwen3 thinking off, so send chat_template_kwargs={enable_thinking:false} to llama.cpp via a per-endpoint extra_body. --enable-thinking opts back into thinking mode. - Add tools/__init__.py so `python3 -m tools.bench.runner` resolves our package instead of a site-packages `tools` (nvfuser ships one that shadowed it). - Document offline-GPU-host workflow, thinking-match, and the xserv 8192 OOM finding that the bench surfaced. Co-Authored-By: Claude Opus 4.7 --- docs/16-llama-cpp-comparison.md | 70 +++++++++++++++++++++++++++------ tools/__init__.py | 0 tools/bench/client.py | 6 ++- tools/bench/config.py | 6 +++ tools/bench/quality.py | 1 + tools/bench/runner.py | 43 +++++++++++++------- tools/bench/speed.py | 4 +- 7 files changed, 102 insertions(+), 28 deletions(-) create mode 100644 tools/__init__.py diff --git a/docs/16-llama-cpp-comparison.md b/docs/16-llama-cpp-comparison.md index 406053a..a3fc127 100644 --- a/docs/16-llama-cpp-comparison.md +++ b/docs/16-llama-cpp-comparison.md @@ -52,18 +52,33 @@ isolates the test harness from internal API churn on either side. ## Workflow +The GPU host (dash5) has **no outbound network and no rsync**, so anything from +the internet is fetched locally and shipped over via tar-over-ssh. + ``` -local repo dash5 (GPU host) -────────── ──────────────── -tools/sync-and-build.sh bench → rsync project (excl. 
target, third_party, bench-out) - → setup-llama-cpp.sh (no-op if built) - → convert-to-gguf.sh (no-op if .gguf exists) - → cargo build --release - → python3 -m tools.bench.runner ... - → bench-out/comparison-.md -tools/sync-and-build.sh fetch-bench-out ← rsync bench-out back +local repo (has network) dash5 (GPU host, no network) +──────────────────────── ──────────────────────────── +# one-time, on a networked machine: +python3 -m tools.bench.fetch_datasets → tools/bench/data/{aime2025,gsm8k}.json +git submodule update --init … → third_party/llama.cpp source + +tools/sync-and-build.sh bench → tar project (excl. target, third_party, bench-out) + → tar llama.cpp source (excl. build, .git) + → setup-llama-cpp.sh (build-only; no-op if built) + → convert-to-gguf.sh (no-op if .gguf exists) + → cargo build --release + → python3 -m tools.bench.runner ... + → bench-out/comparison-.md +tools/sync-and-build.sh fetch-bench-out ← tar bench-out back ``` +Behind a flaky proxy, fetch datasets through the HF mirror: +`HF_ENDPOINT=https://hf-mirror.com python3 -m tools.bench.fetch_datasets`. + +`tools/__init__.py` exists so `python3 -m tools.bench.runner` resolves our +package: some site-packages (e.g. nvfuser) ship a regular top-level `tools` +package that would otherwise shadow a namespace `tools`. + ## What gets measured ### Speed (TTFT / TPOT / throughput) @@ -78,12 +93,19 @@ tools/sync-and-build.sh fetch-bench-out ← rsync bench-out back | Task | N | Source | Scoring | Why | |---|---|---|---|---| -| AIME 2025 | 30 | `MathArena/aime_2025` (HF) | exact-match boxed integer (0..999) | reasoning + math, hard signal | +| AIME 2025 | 30 | `MathArena/aime_2025`, fallback `yentinglin/aime_2025` (HF) | exact-match boxed integer (0..999) | reasoning + math, hard signal | | GSM8K | 1319 | `openai/gsm8k` (HF), `test` split | exact-match `\boxed{n}` or last number | broad sanity, decimals allowed | Same `temperature=0` sampling across both systems. 
Max tokens: 16384 for AIME (reasoning long), 2048 for GSM8K. Subsample with `--quality-limit N` for smoke. +**Generation mode must match.** xserv's prompt builder hardcodes Qwen3 thinking +OFF (it appends an empty `<think></think>` block). llama-server applies the +GGUF's Qwen3 jinja template, which has thinking ON by default. The driver +therefore sends `chat_template_kwargs={"enable_thinking": false}` to llama.cpp +so both engines run the model in the same mode. Pass `--enable-thinking` to +compare in thinking mode instead (xserv would need a matching change first). + ### Report `bench-out/comparison-.md` contains: @@ -96,9 +118,16 @@ A sibling `.json` holds all per-request raw rows and per-problem case detail ## Running it +**One-time prerequisites (on a networked machine):** +```bash +git submodule update --init third_party/llama.cpp # pinned to b9371 +HF_ENDPOINT=https://hf-mirror.com python3 -m tools.bench.fetch_datasets +``` + **Full sweep on dash5 (recommended):** ```bash -./tools/sync-and-build.sh bench +# 4096 ctx because xserv OOMs at 8192 (see Known constraints) +./tools/sync-and-build.sh bench -- --max-seq-len 4096 --quality-limit 50 ./tools/sync-and-build.sh fetch-bench-out open bench-out/comparison-*.md ``` @@ -142,6 +171,25 @@ python3 -m tools.bench.runner \ own process group and SIGTERM the group on exit so half-dead llama-server children don't survive. If the user is already running a server somewhere, pass `--xserv-base-url` / `--llama-base-url` to skip launch. +6. **One server at a time.** The driver starts a system, runs every suite + against it, stops it, then moves to the next. Two BF16 8B models (~16GB each) + do not co-reside on a single 32GB GPU, and a resident idle engine would + distort the other's latency/throughput. This serialization is why the report + is assembled from per-system passes rather than a single interleaved run. 
+ +## Known constraints / findings + +- **xserv OOMs at `--max-seq-len 8192` + `--max-batch 4`.** xserv eagerly + pre-allocates its paged-KV pool (`total_blocks = blocks_per_seq · max_batch · + 2`, ≈9GB at 8192) on top of the 16GB weights, exceeding 32GB at startup + (`paged_kv_cache.rs` `alloc paged K pool: OutOfMemory`). llama.cpp allocates + KV lazily and fits 8192 easily. Until xserv's sizing is fixed, run the + comparison at `--max-seq-len 4096` (xserv peaks ~28GB there). The benchmark + surfaced this — it's tracked as a follow-up fix. +- When the xserv engine thread dies, the request handler panics on the poisoned + `engine_sender` mutex and every subsequent request fails with "server + disconnected". The driver records these as per-request errors (no crash), so a + broken engine shows up as `errs=N` / `accuracy 0%` rather than a hung run. ## Future extensions diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tools/bench/client.py b/tools/bench/client.py index 977c32e..df40685 100644 --- a/tools/bench/client.py +++ b/tools/bench/client.py @@ -55,6 +55,7 @@ async def chat_stream( temperature: float = 0.0, api_key: str | None = None, timeout: float = 1800.0, + extra_body: dict | None = None, ) -> StreamResult: payload: dict[str, Any] = { "model": model, @@ -66,6 +67,8 @@ async def chat_stream( # llama-server returns usage in the final stream chunk when this is set; # xserv ignores unknown fields, so this is harmless there. payload["stream_options"] = {"include_usage": True} + if extra_body: + payload.update(extra_body) headers = {"Content-Type": "application/json"} if api_key: @@ -135,6 +138,7 @@ async def chat_concurrent( api_key: str | None = None, timeout: float = 1800.0, concurrency: int, + extra_body: dict | None = None, ) -> tuple[list[StreamResult], float]: """Fire `concurrency` requests in parallel waves. 
Returns per-request results plus wall-clock elapsed time of the entire batch.""" @@ -146,7 +150,7 @@ async def chat_concurrent( return await chat_stream( client, base_url, model, messages, max_tokens=max_tokens, temperature=temperature, - api_key=api_key, timeout=timeout, + api_key=api_key, timeout=timeout, extra_body=extra_body, ) t0 = time.perf_counter() results = await asyncio.gather(*(one(p) for p in prompts)) diff --git a/tools/bench/config.py b/tools/bench/config.py index 7309231..3b905b5 100644 --- a/tools/bench/config.py +++ b/tools/bench/config.py @@ -24,6 +24,12 @@ class SystemEndpoint: base_url: str # http://host:port (OpenAI-compatible root, no /v1) model_id: str # what to put in the request body's "model" field api_key: str | None = None # llama-server doesn't need one; xserv ignores it + # Extra fields merged into every request body for this system. Used to keep + # the two engines in the SAME generation mode — xserv hardcodes Qwen3 + # thinking OFF (empty <think></think> in its prompt builder), so we disable + # thinking on llama-server via chat_template_kwargs to match. Both engines + # ignore unknown fields, so this is safe. + extra_body: dict | None = None # Process supervision is optional — if base_url is already serving, we skip launch. 
launch_cmd: list[str] | None = None launch_env: dict[str, str] = field(default_factory=dict) diff --git a/tools/bench/quality.py b/tools/bench/quality.py index 082e3de..e27ee16 100644 --- a/tools/bench/quality.py +++ b/tools/bench/quality.py @@ -81,6 +81,7 @@ async def _run_one_task( temperature=cfg.quality_temperature, api_key=ep.api_key, timeout=cfg.request_timeout_s, + extra_body=ep.extra_body, ) pred = task_mod.extract_answer(r.text) if r.error is None else None correct = task_mod.score(pred, prob["answer"]) if r.error is None else False diff --git a/tools/bench/runner.py b/tools/bench/runner.py index 44e0b17..3118f28 100644 --- a/tools/bench/runner.py +++ b/tools/bench/runner.py @@ -24,7 +24,6 @@ import os import platform import subprocess import sys -from contextlib import ExitStack from typing import Any # Allow running as `python3 tools/bench/runner.py` from repo root. @@ -35,7 +34,7 @@ from tools.bench.config import ( BenchConfig, SystemEndpoint, SYSTEM_XSERV, SYSTEM_LLAMA_CPP, ) from tools.bench.servers import ( - ServerHandle, start_server, stop_server, + start_server, stop_server, xserv_launch_cmd, llama_cpp_launch_cmd, ) from tools.bench.speed import run_speed, rows_to_dicts as speed_rows_to_dicts @@ -70,6 +69,9 @@ def parse_args() -> argparse.Namespace: p.add_argument("--max-seq-len", type=int, default=8192) p.add_argument("--systems", default="xserv,llama.cpp", help="Comma-separated subset to run, e.g. 'xserv' to skip llama.cpp") + p.add_argument("--enable-thinking", action="store_true", + help="Enable Qwen3 thinking on llama.cpp. Default OFF to match " + "xserv, which hardcodes thinking off in its prompt builder.") # Suites p.add_argument("--suite", choices=["speed", "quality", "all"], default="all") @@ -110,11 +112,17 @@ def build_endpoints(args) -> list[SystemEndpoint]: ready_timeout_s=900.0, )) + # Match xserv's hardcoded thinking-OFF mode unless explicitly overridden. 
+ llama_extra_body = None if args.enable_thinking else { + "chat_template_kwargs": {"enable_thinking": False} + } + if SYSTEM_LLAMA_CPP in wanted: if args.llama_base_url: eps.append(SystemEndpoint( name=SYSTEM_LLAMA_CPP, base_url=args.llama_base_url, model_id=args.llama_model_id, launch_cmd=None, + extra_body=llama_extra_body, )) else: gguf = args.llama_gguf or os.environ.get("LLAMA_GGUF") @@ -131,6 +139,7 @@ def build_endpoints(args) -> list[SystemEndpoint]: # llama-server's health endpoint also returns 200 only when model is loaded. health_path="/health", ready_timeout_s=900.0, + extra_body=llama_extra_body, )) return eps @@ -169,24 +178,28 @@ def main() -> None: os.makedirs(args.out_dir, exist_ok=True) log_dir = os.path.join(args.out_dir, "logs") - handles: list[ServerHandle] = [] speed_rows: list[Any] = [] speed_raw: list[dict[str, Any]] = [] quality_rows: list[Any] = [] quality_cases: list[Any] = [] + tasks = [t.strip() for t in args.quality_tasks.split(",") if t.strip()] - with ExitStack() as stack: - for ep in endpoints: - h = start_server(ep, log_dir) - handles.append(h) - stack.callback(stop_server, h) - - if args.suite in ("speed", "all"): - speed_rows, speed_raw = run_speed(endpoints, cfg) - - if args.suite in ("quality", "all"): - tasks = [t.strip() for t in args.quality_tasks.split(",") if t.strip()] - quality_rows, quality_cases = run_quality(endpoints, cfg, tasks) + # One server at a time. Two BF16 8B models (~16GB each) do not co-reside on a + # single 32GB GPU, and even if they did, a resident idle engine would distort + # the other's measurements. Start → run all suites → stop, then next system. 
+ for ep in endpoints: + h = start_server(ep, log_dir) + try: + if args.suite in ("speed", "all"): + rows, raw = run_speed([ep], cfg) + speed_rows.extend(rows) + speed_raw.extend(raw) + if args.suite in ("quality", "all"): + rows, cases = run_quality([ep], cfg, tasks) + quality_rows.extend(rows) + quality_cases.extend(cases) + finally: + stop_server(h) write_report( out_dir=args.out_dir, diff --git a/tools/bench/speed.py b/tools/bench/speed.py index 2c57b0b..256d8ac 100644 --- a/tools/bench/speed.py +++ b/tools/bench/speed.py @@ -90,6 +90,7 @@ async def run_single_stream( api_key=ep.api_key, timeout=cfg.request_timeout_s, concurrency=1, + extra_body=ep.extra_body, ) rows.append(_summarize(ep.name, f"single/{bucket}", results, wall)) for i, r in enumerate(results): @@ -122,6 +123,7 @@ async def run_concurrent( api_key=ep.api_key, timeout=cfg.request_timeout_s, concurrency=c, + extra_body=ep.extra_body, ) rows.append(_summarize(ep.name, f"concurrent-{c}", results, wall)) for i, r in enumerate(results): @@ -147,7 +149,7 @@ def run_speed( asyncio.run(chat_concurrent( ep.base_url, ep.model_id, warm_messages, max_tokens=8, temperature=0.0, api_key=ep.api_key, - timeout=120, concurrency=1, + timeout=120, concurrency=1, extra_body=ep.extra_body, )) rows1, raw1 = asyncio.run(run_single_stream(ep, cfg))