Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads: - setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline. - tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K. - fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON. - sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
52 lines
1.7 KiB
Python
52 lines
1.7 KiB
Python
"""Defaults + CLI argument shapes for the benchmark driver.
|
|
|
|
All paths default to the dash5 layout (/opt/wjh/...) because that's where the
|
|
GPU lives — see docs/16-llama-cpp-comparison.md.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
# Canonical system names: they appear in reports and act as logical keys
# throughout the driver.
SYSTEM_XSERV: str = "xserv"
SYSTEM_LLAMA_CPP: str = "llama.cpp"

# Both known systems, xserv first.
# NOTE(review): presumably the set compared when no explicit selection is
# made — confirm against the CLI.
DEFAULT_SYSTEMS: tuple[str, ...] = (SYSTEM_XSERV, SYSTEM_LLAMA_CPP)
|
|
|
|
|
|
@dataclass
class SystemEndpoint:
    """Describe how to reach (or how to start) one system under test.

    Attributes:
        name: Label for the system.
            NOTE(review): presumably one of the ``SYSTEM_*`` names — confirm.
        base_url: OpenAI-compatible root such as ``http://host:port``,
            without the ``/v1`` suffix.
        model_id: Value to place in the request body's ``"model"`` field.
        api_key: Optional API key; llama-server doesn't need one and xserv
            ignores it.
        launch_cmd: Command (argument list) for starting the system.
            Process supervision is optional: if ``base_url`` is already
            serving, launch is skipped.
        launch_env: Environment variables associated with the launch.
            NOTE(review): whether these replace or extend the parent
            environment is decided by the launcher — verify there.
        launch_cwd: Working directory associated with the launch.
            NOTE(review): ``None`` presumably means "inherit" — confirm.
        health_path: URL path for the health endpoint.
            NOTE(review): presumably appended to ``base_url`` when probing
            readiness — confirm against the supervisor code.
        ready_timeout_s: Readiness timeout in seconds; generous because
            cold loads of 8B BF16 take a while.
    """

    # Required connection details.
    name: str
    base_url: str
    model_id: str
    api_key: str | None = None

    # Optional process-supervision settings.
    launch_cmd: list[str] | None = None
    launch_env: dict[str, str] = field(default_factory=dict)
    launch_cwd: str | None = None
    health_path: str = "/health"
    ready_timeout_s: float = 10 * 60.0  # 600 s
|
|
|
|
|
|
@dataclass
class BenchConfig:
    """Tunable defaults for the benchmark run (speed and quality suites).

    Attributes:
        out_dir: Output directory name.
        speed_prompts: Number of synthetic prompts per length bucket.
        speed_max_tokens: Token cap used by the speed suite.
        speed_concurrency: Concurrency levels for the speed suite.
        quality_max_tokens_aime: Token cap for the AIME quality task.
        quality_max_tokens_gsm8k: Token cap for the GSM8K quality task.
        quality_limit: Subsample size for smoke tests; ``None`` means all.
        quality_temperature: Sampling temperature for the quality suite.
        request_timeout_s: Request timeout in seconds.
            NOTE(review): listed with the quality settings, but may apply
            to every request — confirm against the HTTP client.
    """

    out_dir: str = "bench-out"

    # --- Speed suite ---
    speed_prompts: int = 8
    speed_max_tokens: int = 128
    speed_concurrency: tuple[int, ...] = (1, 2, 4, 8)

    # --- Quality suite ---
    quality_max_tokens_aime: int = 16 * 1024  # 16384
    quality_max_tokens_gsm8k: int = 2 * 1024  # 2048
    quality_limit: int | None = None
    quality_temperature: float = 0.0
    request_timeout_s: float = 30 * 60.0  # 1800 s
|
|
|
|
|
|
def env_default(key: str, fallback: str) -> str:
    """Return the environment variable *key*, or *fallback* when it is unset.

    A variable that is set to the empty string counts as set and is
    returned as-is.
    """
    try:
        return os.environ[key]
    except KeyError:
        return fallback
|