# xserv/tools/bench/config.py

"""Defaults + CLI argument shapes for the benchmark driver.

All paths default to the dash5 layout (/opt/wjh/...) because that's where the
GPU lives — see docs/16-llama-cpp-comparison.md.
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field


# Canonical identifiers for the systems under test. They appear in reports and
# double as the logical keys used throughout the driver.
SYSTEM_XSERV: str = "xserv"
SYSTEM_LLAMA_CPP: str = "llama.cpp"
DEFAULT_SYSTEMS: tuple[str, ...] = (SYSTEM_XSERV, SYSTEM_LLAMA_CPP)


@dataclass
class SystemEndpoint:
    """Describes how to reach, or optionally start, one system under test.

    Attributes:
        name: Identifier for this system.
        base_url: OpenAI-compatible root in the form ``http://host:port``,
            without the ``/v1`` suffix.
        model_id: Value placed in the request body's ``"model"`` field.
        api_key: Optional API key. llama-server doesn't need one and xserv
            ignores it.
        extra_body: Additional fields merged into every request body for this
            system. It exists to keep the two engines in the SAME generation
            mode: xserv hardcodes Qwen3 thinking OFF (an empty
            ``<think></think>`` in its prompt builder), so thinking is
            disabled on llama-server through ``chat_template_kwargs`` to
            match. Both engines ignore unknown fields, so this is safe.
        launch_cmd: Optional command for process supervision. When
            ``base_url`` is already serving, launch is skipped.
        launch_env: Environment mapping associated with the launch
            (presumably applied to the spawned process -- confirm against
            the launcher). Each instance gets its own fresh dict.
        launch_cwd: Optional working directory associated with the launch
            (presumably for the spawned process -- confirm against the
            launcher).
        health_path: Health-check path; defaults to ``/health``.
        ready_timeout_s: Readiness timeout. The default is generous because
            cold loads of 8B BF16 take a while.
    """

    # Required connection details.
    name: str
    base_url: str
    model_id: str

    # Optional per-request settings.
    api_key: str | None = None
    extra_body: dict | None = None

    # Optional process-supervision settings.
    launch_cmd: list[str] | None = None
    launch_env: dict[str, str] = field(default_factory=dict)
    launch_cwd: str | None = None
    health_path: str = "/health"
    ready_timeout_s: float = 600.0


@dataclass
class BenchConfig:
    """Tunable settings for a benchmark run.

    Attributes:
        out_dir: Output directory; defaults to ``bench-out``.
        speed_prompts: Speed suite -- number of synthetic prompts per length
            bucket.
        speed_max_tokens: Speed suite -- max-token setting.
        speed_concurrency: Speed suite -- concurrency levels.
        quality_max_tokens_aime: Quality suite -- max-token setting for AIME.
        quality_max_tokens_gsm8k: Quality suite -- max-token setting for
            GSM8K.
        quality_limit: Quality suite -- subsample size for smoke tests;
            ``None`` means use everything.
        quality_temperature: Quality suite -- temperature setting.
        request_timeout_s: Request timeout setting.
    """

    out_dir: str = "bench-out"

    # --- Speed suite ---
    speed_prompts: int = 8
    speed_max_tokens: int = 128
    speed_concurrency: tuple[int, ...] = (1, 2, 4, 8)

    # --- Quality suite ---
    quality_max_tokens_aime: int = 16_384
    quality_max_tokens_gsm8k: int = 2_048
    quality_limit: int | None = None
    quality_temperature: float = 0.0
    request_timeout_s: float = 1_800.0


def env_default(key: str, fallback: str) -> str:
    """Return the value of environment variable *key*, or *fallback* if unset.

    A variable that is set to the empty string counts as set, so ``""`` is
    returned rather than *fallback*.
    """
    try:
        return os.environ[key]
    except KeyError:
        return fallback