Files
xserv/tools/bench/servers.py
Gahow Wang 49c7653222 tools: add llama.cpp comparison baseline + standard benchmark suite
Vendor llama.cpp as a submodule pinned to b9371 and add a one-click
benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the
  GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00

146 lines
4.2 KiB
Python

"""Start/stop xserv-server and llama-server as subprocesses.
The benchmark driver treats both systems as black-box HTTP servers — it does
not import their Rust/C++ code. This keeps the comparison fair (same wire
protocol, no in-process shortcut) and avoids coupling the bench harness to
internal APIs.
"""
from __future__ import annotations
import contextlib
import os
import signal
import subprocess
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from .config import SystemEndpoint
@dataclass
class ServerHandle:
    """Handle for one benchmarked server, as returned by `start_server`."""

    # Endpoint description this handle was created for.
    endpoint: SystemEndpoint
    # Child process launched by the harness; None when an already-running
    # server is being reused (`stop_server` is then a no-op).
    proc: subprocess.Popen[bytes] | None
    # File receiving the child's stdout/stderr; None when nothing was launched.
    log_path: str | None
def _wait_ready(base_url: str, health_path: str, timeout_s: float) -> bool:
    """Poll a health endpoint until it answers HTTP 200 or time runs out.

    Args:
        base_url: Server root URL; a trailing slash is stripped before
            ``health_path`` is appended.
        health_path: Path of the health endpoint, appended verbatim.
        timeout_s: Overall time budget in seconds.

    Returns:
        True as soon as one probe returns status 200. False once the budget
        is spent, after printing the last observed failure to stderr.
    """
    # Function-scope import: only the exception type is needed here.
    import http.client

    url = base_url.rstrip("/") + health_path
    deadline = time.monotonic() + timeout_s
    last_err = ""
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as r:
                if r.status == 200:
                    return True
                # Keep the final diagnostic meaningful for non-200 successes.
                last_err = f"unexpected HTTP status {r.status}"
        except (OSError, http.client.HTTPException) as e:
            # OSError covers URLError/HTTPError, ConnectionError and socket
            # timeouts; HTTPException covers malformed or truncated replies
            # from a server that is still starting up.
            last_err = repr(e)
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(1.0, remaining))
    print(f"[servers] not ready after {timeout_s}s ({url}): {last_err}", file=sys.stderr)
    return False
def start_server(ep: SystemEndpoint, log_dir: str) -> ServerHandle:
    """Launch `ep.launch_cmd` if set; otherwise assume it's already running.

    Args:
        ep: Endpoint description. This function reads `name`, `base_url`,
            `health_path`, `launch_cmd`, `launch_env`, `launch_cwd` and
            `ready_timeout_s` from it.
        log_dir: Directory (created if missing) for the child's log file.

    Returns:
        A handle whose `proc` is None when an existing server is reused.

    Raises:
        RuntimeError: If no launch command is configured and the server is
            unreachable, or the launched server never becomes ready.
    """
    if ep.launch_cmd is None:
        if _wait_ready(ep.base_url, ep.health_path, timeout_s=10.0):
            print(f"[servers] reusing already-running {ep.name} at {ep.base_url}")
            return ServerHandle(endpoint=ep, proc=None, log_path=None)
        raise RuntimeError(f"{ep.name}: no launch_cmd and not reachable at {ep.base_url}")

    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, f"{ep.name.replace('.', '_')}.log")
    env = os.environ.copy()
    # Tolerates an unset launch_env; NOTE(review): confirm whether
    # SystemEndpoint can actually carry None here.
    env.update(ep.launch_env or {})
    print(f"[servers] launching {ep.name}: {' '.join(ep.launch_cmd)}")
    print(f"[servers] log: {log_path}")

    # The child inherits its own copy of the descriptor, so the parent's
    # handle is closed as soon as Popen returns (or raises).
    with open(log_path, "wb") as log_f:
        proc = subprocess.Popen(
            ep.launch_cmd,
            cwd=ep.launch_cwd,
            env=env,
            stdout=log_f,
            stderr=subprocess.STDOUT,
            # Own session/process group so SIGTERM kills children
            # (llama-server in particular); replaces preexec_fn=os.setsid.
            start_new_session=True,
        )

    handle = ServerHandle(endpoint=ep, proc=proc, log_path=log_path)
    ready = False
    try:
        ready = _wait_ready(ep.base_url, ep.health_path, timeout_s=ep.ready_timeout_s)
    finally:
        if not ready:
            # Also runs on KeyboardInterrupt: terminate and reap the child
            # rather than leaving it behind. A child that survives SIGKILL
            # must not mask the error being raised.
            with contextlib.suppress(subprocess.TimeoutExpired):
                stop_server(handle)
    if not ready:
        raise RuntimeError(
            f"{ep.name} failed to become ready (see {log_path}). "
            "Common causes: model path wrong, port already in use, OOM."
        )
    return handle
def stop_server(h: ServerHandle, *, grace_s: float = 10.0) -> None:
    """Terminate the process group behind `h`, escalating to SIGKILL.

    Does nothing for handles without a launched process. Otherwise sends
    SIGTERM to the child's process group, waits up to `grace_s` seconds,
    and on timeout sends SIGKILL and waits up to five more seconds.
    """
    child = h.proc
    if child is None:
        return

    name = h.endpoint.name
    print(f"[servers] stopping {name} (pid {child.pid})")

    # The process group is already gone: nothing left to signal.
    try:
        os.killpg(child.pid, signal.SIGTERM)
    except ProcessLookupError:
        return

    try:
        child.wait(timeout=grace_s)
        return
    except subprocess.TimeoutExpired:
        print(f"[servers] {name} did not exit, sending SIGKILL")

    try:
        os.killpg(child.pid, signal.SIGKILL)
    except ProcessLookupError:
        # The group exited between the timeout and the kill.
        pass
    child.wait(timeout=5)
# ---------- launch-command builders ----------
def xserv_launch_cmd(
    bin_path: str,
    model_dir: str,
    port: int,
    *,
    max_batch: int,
    max_seq_len: int,
) -> list[str]:
    """Build the argv list for launching the xserv server binary.

    The binary path and model directory come first, followed by the
    `--port`, `--max-batch` and `--max-seq-len` flags, each with its
    stringified value.
    """
    argv = [bin_path, model_dir]
    flag_values = (
        ("--port", port),
        ("--max-batch", max_batch),
        ("--max-seq-len", max_seq_len),
    )
    for flag, value in flag_values:
        argv += [flag, str(value)]
    return argv
def llama_cpp_launch_cmd(
    bin_path: str,
    gguf_path: str,
    port: int,
    *,
    n_parallel: int,
    ctx_size: int,
    n_gpu_layers: int = 99,
    host: str = "0.0.0.0",
    quiet: bool = True,
) -> list[str]:
    """Build the argv list for launching llama-server.

    Args:
        bin_path: Path to the llama-server binary.
        gguf_path: Model file passed via ``-m``.
        port: Listening port.
        n_parallel: Value for ``--parallel``.
        ctx_size: Value for ``-c``.
        n_gpu_layers: Value for ``-ngl``.
        host: Bind address for ``--host``. The default keeps the original
            all-interfaces behaviour; pass ``"127.0.0.1"`` to keep the
            server off the network when the driver runs on the same host.
        quiet: When true (the default), append ``--log-disable``.

    Returns:
        The command as a list of strings, suitable for `subprocess.Popen`.
    """
    cmd = [
        bin_path,
        "-m", gguf_path,
        "--port", str(port),
        "--host", host,
        "-c", str(ctx_size),
        "-ngl", str(n_gpu_layers),
        "--parallel", str(n_parallel),
    ]
    if quiet:
        # Be quiet by default.
        # NOTE(review): --log-disable appears to silence llama-server's own
        # logging, so the captured log file may be empty when startup
        # fails. Confirm against the pinned llama.cpp version and pass
        # quiet=False when debugging.
        cmd.append("--log-disable")
    return cmd