Files
xserv/tools/bench/config.py
Gahow Wang 49c7653222 tools: add llama.cpp comparison baseline + standard benchmark suite
Vendor llama.cpp as a submodule pinned to b9371 and add a one-click
benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the
  GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00

52 lines
1.7 KiB
Python

"""Defaults + CLI argument shapes for the benchmark driver.
All paths default to the dash5 layout (/opt/wjh/...) because that's where the
GPU lives — see docs/16-llama-cpp-comparison.md.
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
# Canonical system names. Per the original note, these strings appear in
# reports and double as logical keys throughout the driver, so they must be
# kept stable.
SYSTEM_XSERV: str = "xserv"
SYSTEM_LLAMA_CPP: str = "llama.cpp"

# Default set of systems, in the order (xserv, llama.cpp).
DEFAULT_SYSTEMS: tuple[str, ...] = (SYSTEM_XSERV, SYSTEM_LLAMA_CPP)
@dataclass
class SystemEndpoint:
    """Connection and optional launch settings for one system under test.

    The first three fields are required; everything else has a default.
    Field order is part of the constructor signature and must not change.
    """

    # --- Addressing -------------------------------------------------------
    name: str
    # OpenAI-compatible root of the form http://host:port (no /v1 suffix).
    base_url: str
    # Value placed in the request body's "model" field.
    model_id: str
    # llama-server doesn't need a key; xserv ignores it.
    api_key: str | None = None

    # --- Optional process supervision --------------------------------------
    # Per the original note: if base_url is already serving, launch is skipped.
    launch_cmd: list[str] | None = None
    # default_factory gives every instance its own dict (no shared mutable).
    launch_env: dict[str, str] = field(default_factory=dict)
    launch_cwd: str | None = None
    # Presumably probed relative to base_url to detect readiness -- the
    # probing code is not in this file; confirm against the launcher.
    health_path: str = "/health"
    # Generous default because cold loads of 8B BF16 take a while.
    ready_timeout_s: float = 600.0
@dataclass
class BenchConfig:
    """Tunable defaults for a benchmark run (speed suite + quality suite).

    Every field has a default, so ``BenchConfig()`` is a valid configuration.
    """

    out_dir: str = "bench-out"

    # --- Speed suite --------------------------------------------------------
    # Number of synthetic prompts per length bucket.
    speed_prompts: int = 8
    speed_max_tokens: int = 128
    speed_concurrency: tuple[int, ...] = (1, 2, 4, 8)

    # --- Quality suite ------------------------------------------------------
    quality_max_tokens_aime: int = 16384
    quality_max_tokens_gsm8k: int = 2048
    # Subsample size for smoke tests; None means use everything.
    quality_limit: int | None = None
    quality_temperature: float = 0.0

    request_timeout_s: float = 1800.0
def env_default(key: str, fallback: str) -> str:
    """Return the environment variable *key*, or *fallback* when it is unset.

    A variable that is set to the empty string counts as set, so ``""`` is
    returned in that case rather than *fallback*.
    """
    return os.getenv(key, fallback)