"""Defaults + CLI argument shapes for the benchmark driver. All paths default to the dash5 layout (/opt/wjh/...) because that's where the GPU lives — see docs/16-llama-cpp-comparison.md. """ from __future__ import annotations import os from dataclasses import dataclass, field # Names used in reports and as logical keys throughout the driver. SYSTEM_XSERV = "xserv" SYSTEM_LLAMA_CPP = "llama.cpp" DEFAULT_SYSTEMS = (SYSTEM_XSERV, SYSTEM_LLAMA_CPP) @dataclass class SystemEndpoint: """How to reach (or how to start) one of the systems under test.""" name: str base_url: str # http://host:port (OpenAI-compatible root, no /v1) model_id: str # what to put in the request body's "model" field api_key: str | None = None # llama-server doesn't need one; xserv ignores it # Process supervision is optional — if base_url is already serving, we skip launch. launch_cmd: list[str] | None = None launch_env: dict[str, str] = field(default_factory=dict) launch_cwd: str | None = None health_path: str = "/health" ready_timeout_s: float = 600.0 # cold loads of 8B BF16 take a while @dataclass class BenchConfig: out_dir: str = "bench-out" # Speed suite speed_prompts: int = 8 # synthetic prompts per length bucket speed_max_tokens: int = 128 speed_concurrency: tuple[int, ...] = (1, 2, 4, 8) # Quality suite quality_max_tokens_aime: int = 16384 quality_max_tokens_gsm8k: int = 2048 quality_limit: int | None = None # subsample for smoke tests; None = all quality_temperature: float = 0.0 request_timeout_s: float = 1800.0 def env_default(key: str, fallback: str) -> str: return os.environ.get(key, fallback)