"""One-click entrypoint: spin up both servers, run suites, write report. Usage examples: # Full sweep against both systems python3 -m tools.bench.runner \ --xserv-bin ./target/release/xserv-server \ --xserv-model /opt/wjh/models/qwen3-8b \ --llama-bin third_party/llama.cpp/build/bin/llama-server \ --llama-gguf /opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf \ --suite all # Speed-only smoke test python3 -m tools.bench.runner ... --suite speed # Quality with 5-problem subsample python3 -m tools.bench.runner ... --suite quality --quality-limit 5 """ from __future__ import annotations import argparse import os import platform import subprocess import sys from contextlib import ExitStack from typing import Any # Allow running as `python3 tools/bench/runner.py` from repo root. if __package__ in (None, ""): sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from tools.bench.config import ( BenchConfig, SystemEndpoint, SYSTEM_XSERV, SYSTEM_LLAMA_CPP, ) from tools.bench.servers import ( ServerHandle, start_server, stop_server, xserv_launch_cmd, llama_cpp_launch_cmd, ) from tools.bench.speed import run_speed, rows_to_dicts as speed_rows_to_dicts from tools.bench.quality import ( run_quality, rows_to_dicts as q_rows_to_dicts, cases_to_dicts, ) from tools.bench.report import write_report def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser(description="xserv vs llama.cpp benchmark suite") # Targets p.add_argument("--xserv-bin", default="./target/release/xserv-server") p.add_argument("--xserv-model", required=False, help="HF model directory for xserv-server (defaults to $XSERV_MODEL_DIR)") p.add_argument("--xserv-port", type=int, default=18080) p.add_argument("--xserv-base-url", default=None, help="If set, skip launching xserv and target this URL.") p.add_argument("--xserv-model-id", default="qwen3-8b") p.add_argument("--llama-bin", default="third_party/llama.cpp/build/bin/llama-server") p.add_argument("--llama-gguf", required=False, help="GGUF model for llama-server (defaults to $LLAMA_GGUF)") p.add_argument("--llama-port", type=int, default=18081) p.add_argument("--llama-base-url", default=None, help="If set, skip launching llama-server and target this URL.") p.add_argument("--llama-model-id", default="qwen3-8b", help="String to send in OpenAI 'model' field; llama-server is permissive.") # Shared p.add_argument("--max-batch", type=int, default=4) p.add_argument("--max-seq-len", type=int, default=8192) p.add_argument("--systems", default="xserv,llama.cpp", help="Comma-separated subset to run, e.g. 'xserv' to skip llama.cpp") # Suites p.add_argument("--suite", choices=["speed", "quality", "all"], default="all") p.add_argument("--quality-tasks", default="aime2025,gsm8k") p.add_argument("--quality-limit", type=int, default=None, help="Cap problems per task (smoke test). None = all problems.") p.add_argument("--speed-prompts", type=int, default=8) p.add_argument("--speed-max-tokens", type=int, default=128) p.add_argument("--speed-concurrency", default="1,2,4,8") p.add_argument("--out-dir", default="bench-out") return p.parse_args() def build_endpoints(args) -> list[SystemEndpoint]: wanted = set(s.strip() for s in args.systems.split(",") if s.strip()) eps: list[SystemEndpoint] = [] if SYSTEM_XSERV in wanted: if args.xserv_base_url: eps.append(SystemEndpoint( name=SYSTEM_XSERV, base_url=args.xserv_base_url, model_id=args.xserv_model_id, launch_cmd=None, )) else: model_dir = args.xserv_model or os.environ.get("XSERV_MODEL_DIR") if not model_dir: raise SystemExit("--xserv-model or XSERV_MODEL_DIR required (or pass --xserv-base-url)") eps.append(SystemEndpoint( name=SYSTEM_XSERV, base_url=f"http://127.0.0.1:{args.xserv_port}", model_id=args.xserv_model_id, launch_cmd=xserv_launch_cmd( args.xserv_bin, model_dir, args.xserv_port, max_batch=args.max_batch, max_seq_len=args.max_seq_len, ), health_path="/health", ready_timeout_s=900.0, )) if SYSTEM_LLAMA_CPP in wanted: if args.llama_base_url: eps.append(SystemEndpoint( name=SYSTEM_LLAMA_CPP, base_url=args.llama_base_url, model_id=args.llama_model_id, launch_cmd=None, )) else: gguf = args.llama_gguf or os.environ.get("LLAMA_GGUF") if not gguf: raise SystemExit("--llama-gguf or LLAMA_GGUF required (or pass --llama-base-url)") eps.append(SystemEndpoint( name=SYSTEM_LLAMA_CPP, base_url=f"http://127.0.0.1:{args.llama_port}", model_id=args.llama_model_id, launch_cmd=llama_cpp_launch_cmd( args.llama_bin, gguf, args.llama_port, n_parallel=args.max_batch, ctx_size=args.max_seq_len, ), # llama-server's health endpoint also returns 200 only when model is loaded. health_path="/health", ready_timeout_s=900.0, )) return eps def collect_env() -> dict[str, Any]: env: dict[str, Any] = { "platform": platform.platform(), "python": sys.version.split()[0], } for cmd, key in [ (["nvidia-smi", "--query-gpu=name,driver_version,memory.total", "--format=csv,noheader"], "gpu"), (["git", "rev-parse", "HEAD"], "xserv_commit"), ]: try: out = subprocess.check_output(cmd, text=True, stderr=subprocess.DEVNULL, timeout=5).strip() env[key] = out.splitlines()[0] if out else "?" except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired): env[key] = "?" return env def main() -> None: args = parse_args() endpoints = build_endpoints(args) if not endpoints: raise SystemExit("no systems selected (check --systems)") cfg = BenchConfig( out_dir=args.out_dir, speed_prompts=args.speed_prompts, speed_max_tokens=args.speed_max_tokens, speed_concurrency=tuple(int(c) for c in args.speed_concurrency.split(",") if c.strip()), quality_limit=args.quality_limit, ) os.makedirs(args.out_dir, exist_ok=True) log_dir = os.path.join(args.out_dir, "logs") handles: list[ServerHandle] = [] speed_rows: list[Any] = [] speed_raw: list[dict[str, Any]] = [] quality_rows: list[Any] = [] quality_cases: list[Any] = [] with ExitStack() as stack: for ep in endpoints: h = start_server(ep, log_dir) handles.append(h) stack.callback(stop_server, h) if args.suite in ("speed", "all"): speed_rows, speed_raw = run_speed(endpoints, cfg) if args.suite in ("quality", "all"): tasks = [t.strip() for t in args.quality_tasks.split(",") if t.strip()] quality_rows, quality_cases = run_quality(endpoints, cfg, tasks) write_report( out_dir=args.out_dir, speed_rows=speed_rows_to_dicts(speed_rows) if speed_rows else [], speed_raw=speed_raw, quality_rows=q_rows_to_dicts(quality_rows) if quality_rows else [], quality_cases=cases_to_dicts(quality_cases) if quality_cases else [], env=collect_env(), ) if __name__ == "__main__": main()