Files
xserv/tools/bench/config.py
Gahow Wang 7cb9ee3870 bench: run one server at a time, match thinking mode, fix tools package
Refinements from end-to-end bring-up on the GPU host:

- Run each system start→suites→stop in sequence. Two BF16 8B models don't
  co-reside on one 32GB GPU, and a resident idle engine would distort the
  other's latency/throughput.
- Match generation mode: xserv hardcodes Qwen3 thinking off, so send
  chat_template_kwargs={enable_thinking:false} to llama.cpp via a per-endpoint
  extra_body. --enable-thinking opts back into thinking mode.
- Add tools/__init__.py so `python3 -m tools.bench.runner` resolves our package
  instead of a site-packages `tools` (nvfuser ships one that shadowed it).
- Document offline-GPU-host workflow, thinking-match, and the xserv 8192 OOM
  finding that the bench surfaced.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:40:07 +08:00

58 lines
2.1 KiB
Python

"""Defaults + CLI argument shapes for the benchmark driver.
All paths default to the dash5 layout (/opt/wjh/...) because that's where the
GPU lives — see docs/16-llama-cpp-comparison.md.
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
# Canonical system identifiers: they label report output and double as the
# logical lookup keys everywhere else in the driver.
SYSTEM_XSERV: str = "xserv"
SYSTEM_LLAMA_CPP: str = "llama.cpp"
# The default pair of systems, xserv first.
DEFAULT_SYSTEMS: tuple[str, ...] = (SYSTEM_XSERV, SYSTEM_LLAMA_CPP)
@dataclass
class SystemEndpoint:
"""How to reach (or how to start) one of the systems under test."""
name: str
base_url: str # http://host:port (OpenAI-compatible root, no /v1)
model_id: str # what to put in the request body's "model" field
api_key: str | None = None # llama-server doesn't need one; xserv ignores it
# Extra fields merged into every request body for this system. Used to keep
# the two engines in the SAME generation mode — xserv hardcodes Qwen3
# thinking OFF (empty <think></think> in its prompt builder), so we disable
# thinking on llama-server via chat_template_kwargs to match. Both engines
# ignore unknown fields, so this is safe.
extra_body: dict | None = None
# Process supervision is optional — if base_url is already serving, we skip launch.
launch_cmd: list[str] | None = None
launch_env: dict[str, str] = field(default_factory=dict)
launch_cwd: str | None = None
health_path: str = "/health"
ready_timeout_s: float = 600.0 # cold loads of 8B BF16 take a while
@dataclass
class BenchConfig:
out_dir: str = "bench-out"
# Speed suite
speed_prompts: int = 8 # synthetic prompts per length bucket
speed_max_tokens: int = 128
speed_concurrency: tuple[int, ...] = (1, 2, 4, 8)
# Quality suite
quality_max_tokens_aime: int = 16384
quality_max_tokens_gsm8k: int = 2048
quality_limit: int | None = None # subsample for smoke tests; None = all
quality_temperature: float = 0.0
request_timeout_s: float = 1800.0
def env_default(key: str, fallback: str) -> str:
    """Return the environment variable *key*, or *fallback* if it is unset.

    A variable that is set to the empty string counts as set: the empty
    string is returned, not *fallback*.
    """
    return os.environ.get(key, fallback)