Refinements from end-to-end bring-up on the GPU host:
- Run each system start→suites→stop in sequence. Two BF16 8B models don't
co-reside on one 32GB GPU, and a resident idle engine would distort the
other's latency/throughput.
- Match generation mode: xserv hardcodes Qwen3 thinking off, so send
chat_template_kwargs={enable_thinking:false} to llama.cpp via a per-endpoint
extra_body. --enable-thinking opts back into thinking mode.
- Add tools/__init__.py so `python3 -m tools.bench.runner` resolves our package
instead of a site-packages `tools` (nvfuser ships one that shadowed it).
- Document offline-GPU-host workflow, thinking-match, and the xserv 8192 OOM
finding that the bench surfaced.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
216 lines
8.4 KiB
Python
216 lines
8.4 KiB
Python
"""One-click entrypoint: spin up each server in turn, run suites, write report.

Usage examples:

    # Full sweep against both systems
    python3 -m tools.bench.runner \
        --xserv-bin ./target/release/xserv-server \
        --xserv-model /opt/wjh/models/qwen3-8b \
        --llama-bin third_party/llama.cpp/build/bin/llama-server \
        --llama-gguf /opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf \
        --suite all

    # Speed-only smoke test
    python3 -m tools.bench.runner ... --suite speed

    # Quality with 5-problem subsample
    python3 -m tools.bench.runner ... --suite quality --quality-limit 5
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import platform
|
|
import subprocess
|
|
import sys
|
|
from typing import Any
|
|
|
|
# Allow running as `python3 tools/bench/runner.py` from repo root.
|
|
if __package__ in (None, ""):
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
|
|
|
from tools.bench.config import (
|
|
BenchConfig, SystemEndpoint, SYSTEM_XSERV, SYSTEM_LLAMA_CPP,
|
|
)
|
|
from tools.bench.servers import (
|
|
start_server, stop_server,
|
|
xserv_launch_cmd, llama_cpp_launch_cmd,
|
|
)
|
|
from tools.bench.speed import run_speed, rows_to_dicts as speed_rows_to_dicts
|
|
from tools.bench.quality import (
|
|
run_quality, rows_to_dicts as q_rows_to_dicts, cases_to_dicts,
|
|
)
|
|
from tools.bench.report import write_report
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the benchmark runner's command-line options.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the CLI entry point uses.
            Passing an explicit list lets tests and other scripts build a
            namespace without touching process-global state.

    Returns:
        The populated ``argparse.Namespace``.
    """
    p = argparse.ArgumentParser(description="xserv vs llama.cpp benchmark suite")

    # Targets: xserv
    p.add_argument("--xserv-bin", default="./target/release/xserv-server")
    p.add_argument("--xserv-model", required=False,
                   help="HF model directory for xserv-server (defaults to $XSERV_MODEL_DIR)")
    p.add_argument("--xserv-port", type=int, default=18080)
    p.add_argument("--xserv-base-url", default=None,
                   help="If set, skip launching xserv and target this URL.")
    p.add_argument("--xserv-model-id", default="qwen3-8b")

    # Targets: llama.cpp
    p.add_argument("--llama-bin", default="third_party/llama.cpp/build/bin/llama-server")
    p.add_argument("--llama-gguf", required=False,
                   help="GGUF model for llama-server (defaults to $LLAMA_GGUF)")
    p.add_argument("--llama-port", type=int, default=18081)
    p.add_argument("--llama-base-url", default=None,
                   help="If set, skip launching llama-server and target this URL.")
    p.add_argument("--llama-model-id", default="qwen3-8b",
                   help="String to send in OpenAI 'model' field; llama-server is permissive.")

    # Shared
    p.add_argument("--max-batch", type=int, default=4)
    p.add_argument("--max-seq-len", type=int, default=8192)
    p.add_argument("--systems", default="xserv,llama.cpp",
                   help="Comma-separated subset to run, e.g. 'xserv' to skip llama.cpp")
    p.add_argument("--enable-thinking", action="store_true",
                   help="Enable Qwen3 thinking on llama.cpp. Default OFF to match "
                        "xserv, which hardcodes thinking off in its prompt builder.")

    # Suites
    p.add_argument("--suite", choices=["speed", "quality", "all"], default="all")
    p.add_argument("--quality-tasks", default="aime2025,gsm8k")
    p.add_argument("--quality-limit", type=int, default=None,
                   help="Cap problems per task (smoke test). None = all problems.")
    p.add_argument("--speed-prompts", type=int, default=8)
    p.add_argument("--speed-max-tokens", type=int, default=128)
    p.add_argument("--speed-concurrency", default="1,2,4,8")

    p.add_argument("--out-dir", default="bench-out")
    return p.parse_args(argv)
|
|
|
|
|
|
def build_endpoints(args) -> list[SystemEndpoint]:
    """Translate parsed CLI options into the list of systems to benchmark.

    For each system named in ``args.systems`` this builds one
    ``SystemEndpoint``. When the matching ``--*-base-url`` is given, the
    endpoint targets that URL with ``launch_cmd=None``; otherwise it carries a
    launch command and points at ``127.0.0.1`` on the configured port.

    The xserv endpoint, when selected, always precedes the llama.cpp endpoint,
    regardless of the order the names appear in ``--systems``.

    Args:
        args: Namespace produced by ``parse_args``.

    Returns:
        The selected endpoints; empty when ``--systems`` names nothing.

    Raises:
        SystemExit: If ``--systems`` contains an unrecognised name, or a
            system has to be launched but neither its model flag nor the
            fallback environment variable is set.
    """
    wanted = {name.strip() for name in args.systems.split(",") if name.strip()}

    # Reject typos up front: silently dropping an unknown name would produce a
    # one-sided benchmark with no indication that a system was skipped.
    unknown = wanted - {SYSTEM_XSERV, SYSTEM_LLAMA_CPP}
    if unknown:
        raise SystemExit(
            f"unknown system(s) in --systems: {', '.join(sorted(unknown))} "
            f"(valid: {SYSTEM_XSERV}, {SYSTEM_LLAMA_CPP})"
        )

    eps: list[SystemEndpoint] = []

    if SYSTEM_XSERV in wanted:
        if args.xserv_base_url:
            # Pre-existing server: nothing to launch.
            eps.append(SystemEndpoint(
                name=SYSTEM_XSERV, base_url=args.xserv_base_url,
                model_id=args.xserv_model_id, launch_cmd=None,
            ))
        else:
            model_dir = args.xserv_model or os.environ.get("XSERV_MODEL_DIR")
            if not model_dir:
                raise SystemExit("--xserv-model or XSERV_MODEL_DIR required (or pass --xserv-base-url)")
            eps.append(SystemEndpoint(
                name=SYSTEM_XSERV,
                base_url=f"http://127.0.0.1:{args.xserv_port}",
                model_id=args.xserv_model_id,
                launch_cmd=xserv_launch_cmd(
                    args.xserv_bin, model_dir, args.xserv_port,
                    max_batch=args.max_batch, max_seq_len=args.max_seq_len,
                ),
                health_path="/health",
                ready_timeout_s=900.0,
            ))

    # Match xserv's hardcoded thinking-OFF mode unless explicitly overridden.
    llama_extra_body = None if args.enable_thinking else {
        "chat_template_kwargs": {"enable_thinking": False}
    }

    if SYSTEM_LLAMA_CPP in wanted:
        if args.llama_base_url:
            # Pre-existing server: nothing to launch, but still send the
            # thinking-mode override with every request.
            eps.append(SystemEndpoint(
                name=SYSTEM_LLAMA_CPP, base_url=args.llama_base_url,
                model_id=args.llama_model_id, launch_cmd=None,
                extra_body=llama_extra_body,
            ))
        else:
            gguf = args.llama_gguf or os.environ.get("LLAMA_GGUF")
            if not gguf:
                raise SystemExit("--llama-gguf or LLAMA_GGUF required (or pass --llama-base-url)")
            eps.append(SystemEndpoint(
                name=SYSTEM_LLAMA_CPP,
                base_url=f"http://127.0.0.1:{args.llama_port}",
                model_id=args.llama_model_id,
                launch_cmd=llama_cpp_launch_cmd(
                    args.llama_bin, gguf, args.llama_port,
                    n_parallel=args.max_batch, ctx_size=args.max_seq_len,
                ),
                # llama-server's health endpoint also returns 200 only when model is loaded.
                health_path="/health",
                ready_timeout_s=900.0,
                extra_body=llama_extra_body,
            ))
    return eps
|
|
|
|
|
|
def collect_env() -> dict[str, Any]:
    """Gather best-effort environment metadata for the report.

    Always records ``platform`` and ``python``. Additionally runs
    ``nvidia-smi`` (key ``gpu``) and ``git rev-parse HEAD`` (key
    ``xserv_commit``) in the current working directory, keeping the first line
    of each command's output.

    Returns:
        A dict of metadata. A probe that cannot be run, fails, times out
        (5 s), or prints nothing is recorded as ``"?"`` rather than raising.
    """
    env: dict[str, Any] = {
        "platform": platform.platform(),
        "python": sys.version.split()[0],
    }
    probes = [
        (["nvidia-smi", "--query-gpu=name,driver_version,memory.total", "--format=csv,noheader"], "gpu"),
        (["git", "rev-parse", "HEAD"], "xserv_commit"),
    ]
    for cmd, key in probes:
        try:
            out = subprocess.check_output(
                cmd, text=True, stderr=subprocess.DEVNULL, timeout=5,
            ).strip()
        except (OSError, subprocess.SubprocessError):
            # OSError covers a missing binary as well as e.g. a non-executable
            # one; SubprocessError covers non-zero exit and timeout. This runs
            # after the benchmarks, so a failed probe must never discard them.
            env[key] = "?"
        else:
            env[key] = out.splitlines()[0] if out else "?"
    return env
|
|
|
|
|
|
def main() -> None:
    """Benchmark each selected system in turn, then write the combined report.

    Parses the CLI, builds the endpoint list, and for every endpoint starts
    the server, runs the requested suites against it, and stops it again
    (also when a suite raises). Results from all systems are accumulated and
    handed to ``write_report`` together with the collected environment info.

    Raises:
        SystemExit: If no system ends up selected.
    """
    args = parse_args()
    endpoints = build_endpoints(args)
    if not endpoints:
        raise SystemExit("no systems selected (check --systems)")

    concurrency_levels = tuple(
        int(level) for level in args.speed_concurrency.split(",") if level.strip()
    )
    cfg = BenchConfig(
        out_dir=args.out_dir,
        speed_prompts=args.speed_prompts,
        speed_max_tokens=args.speed_max_tokens,
        speed_concurrency=concurrency_levels,
        quality_limit=args.quality_limit,
    )

    os.makedirs(args.out_dir, exist_ok=True)
    log_dir = os.path.join(args.out_dir, "logs")

    # Accumulators shared across all systems.
    speed_rows: list[Any] = []
    speed_raw: list[dict[str, Any]] = []
    quality_rows: list[Any] = []
    quality_cases: list[Any] = []
    tasks = [name.strip() for name in args.quality_tasks.split(",") if name.strip()]

    want_speed = args.suite in ("speed", "all")
    want_quality = args.suite in ("quality", "all")

    # One server at a time. Two BF16 8B models (~16GB each) do not co-reside on a
    # single 32GB GPU, and even if they did, a resident idle engine would distort
    # the other's measurements. Start → run all suites → stop, then next system.
    for ep in endpoints:
        handle = start_server(ep, log_dir)
        try:
            if want_speed:
                ep_speed_rows, ep_speed_raw = run_speed([ep], cfg)
                speed_rows.extend(ep_speed_rows)
                speed_raw.extend(ep_speed_raw)
            if want_quality:
                ep_quality_rows, ep_quality_cases = run_quality([ep], cfg, tasks)
                quality_rows.extend(ep_quality_rows)
                quality_cases.extend(ep_quality_cases)
        finally:
            # Always release the GPU before moving on to the next system.
            stop_server(handle)

    # Convert only non-empty result sets; a suite that did not run reports [].
    speed_payload = speed_rows_to_dicts(speed_rows) if speed_rows else []
    quality_payload = q_rows_to_dicts(quality_rows) if quality_rows else []
    cases_payload = cases_to_dicts(quality_cases) if quality_cases else []
    env_info = collect_env()

    write_report(
        out_dir=args.out_dir,
        speed_rows=speed_payload,
        speed_raw=speed_raw,
        quality_rows=quality_payload,
        quality_cases=cases_payload,
        env=env_info,
    )
|
|
|
|
|
|
# Script entry point: run the benchmark only when executed directly
# (`python3 -m tools.bench.runner` or `python3 tools/bench/runner.py`),
# never on import.
if __name__ == "__main__":
    main()
|