Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark
driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU
  host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
146 lines
4.2 KiB
Python
146 lines
4.2 KiB
Python
"""Start/stop xserv-server and llama-server as subprocesses.

The benchmark driver treats both systems as black-box HTTP servers — it does
not import their Rust/C++ code. This keeps the comparison fair (same wire
protocol, no in-process shortcut) and avoids coupling the bench harness to
internal APIs.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import contextlib
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
from dataclasses import dataclass
|
|
|
|
from .config import SystemEndpoint
|
|
|
|
|
|
@dataclass
class ServerHandle:
    """Record returned by `start_server` and consumed by `stop_server`.

    Bundles the endpoint that was started (or reused) with the subprocess
    and log file backing it, if any.
    """

    # Endpoint description this handle refers to.
    endpoint: SystemEndpoint
    # Launched server process; None when an already-running server is reused,
    # in which case `stop_server` does nothing.
    proc: subprocess.Popen[bytes] | None
    # File receiving the server's stdout/stderr; None when nothing was launched.
    log_path: str | None
|
|
|
|
|
|
def _wait_ready(base_url: str, health_path: str, timeout_s: float) -> bool:
    """Poll a health URL until it answers HTTP 200 or the deadline passes.

    Args:
        base_url: Server base URL; a trailing slash is stripped before
            `health_path` is appended.
        health_path: Path of the health endpoint, appended verbatim.
        timeout_s: Overall budget in seconds for the whole wait.

    Returns:
        True as soon as a request returns status 200; False once the budget
        is exhausted (the last observed failure is printed to stderr).
    """
    # Function-scope import keeps this block self-contained; the module is
    # already loaded as a dependency of urllib.request.
    import http.client

    url = base_url.rstrip("/") + health_path
    deadline = time.monotonic() + timeout_s
    last_err = ""
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            # Cap the per-request timeout so a hung connection cannot push
            # the total wait past the caller's deadline.
            with urllib.request.urlopen(url, timeout=min(5.0, remaining)) as r:
                if r.status == 200:
                    return True
                last_err = f"HTTP {r.status}"
        except (OSError, http.client.HTTPException) as e:
            # OSError covers URLError/HTTPError, connection errors and socket
            # timeouts on every Python version; HTTPException covers malformed
            # responses (e.g. BadStatusLine) from a server that is still
            # starting up. Both are "not ready yet", not fatal.
            last_err = repr(e)
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(1.0, remaining))
    print(f"[servers] not ready after {timeout_s}s ({url}): {last_err}", file=sys.stderr)
    return False
|
|
|
|
|
|
def start_server(ep: SystemEndpoint, log_dir: str) -> ServerHandle:
    """Launch `ep.launch_cmd` if set; otherwise assume it's already running.

    Args:
        ep: Endpoint description. Its `launch_cmd`, `launch_cwd`,
            `launch_env`, `base_url`, `health_path`, `ready_timeout_s` and
            `name` attributes are read here.
        log_dir: Directory (created if missing) that receives the server's
            combined stdout/stderr log.

    Returns:
        A handle for the running server. `proc` and `log_path` are None when
        an already-running server is reused.

    Raises:
        RuntimeError: If no launch command is configured and the endpoint is
            unreachable, or if the launched server never becomes ready.
    """
    if ep.launch_cmd is None:
        if _wait_ready(ep.base_url, ep.health_path, timeout_s=10.0):
            print(f"[servers] reusing already-running {ep.name} at {ep.base_url}")
            return ServerHandle(endpoint=ep, proc=None, log_path=None)
        raise RuntimeError(f"{ep.name}: no launch_cmd and not reachable at {ep.base_url}")

    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, f"{ep.name.replace('.', '_')}.log")
    env = os.environ.copy()
    env.update(ep.launch_env)

    print(f"[servers] launching {ep.name}: {' '.join(ep.launch_cmd)}")
    print(f"[servers] log: {log_path}")
    # The child gets its own duplicate of the log descriptor, so the parent's
    # handle is closed as soon as Popen returns (or raises) instead of leaking.
    with open(log_path, "wb") as log_f:
        proc = subprocess.Popen(
            ep.launch_cmd,
            cwd=ep.launch_cwd,
            env=env,
            stdout=log_f,
            stderr=subprocess.STDOUT,
            # Own session/process group so SIGTERM kills children
            # (llama-server in particular). This is the documented,
            # thread-safe replacement for preexec_fn=os.setsid.
            start_new_session=True,
        )

    if _wait_ready(ep.base_url, ep.health_path, timeout_s=ep.ready_timeout_s):
        return ServerHandle(endpoint=ep, proc=proc, log_path=log_path)

    # Startup failed: tear the whole process group down and reap the child so
    # no zombie or stray GPU process outlives the benchmark run.
    with contextlib.suppress(ProcessLookupError):
        os.killpg(proc.pid, signal.SIGTERM)
    try:
        proc.wait(timeout=10.0)
    except subprocess.TimeoutExpired:
        with contextlib.suppress(ProcessLookupError):
            os.killpg(proc.pid, signal.SIGKILL)
        # Best effort: never let a reaping timeout mask the real error below.
        with contextlib.suppress(subprocess.TimeoutExpired):
            proc.wait(timeout=5)
    # The message carries the log path so the caller can inspect the output.
    raise RuntimeError(
        f"{ep.name} failed to become ready (see {log_path}). "
        "Common causes: model path wrong, port already in use, OOM."
    )
|
|
|
|
|
|
def stop_server(h: ServerHandle, *, grace_s: float = 10.0) -> None:
    """Terminate the process group behind a handle, escalating if needed.

    Does nothing for handles without a process (reused servers). Otherwise
    sends SIGTERM to the process group, waits up to `grace_s` seconds, and
    falls back to SIGKILL followed by a 5-second wait.

    Args:
        h: Handle whose `proc` should be stopped.
        grace_s: Seconds to wait after SIGTERM before sending SIGKILL.
    """
    child = h.proc
    if child is None:
        return

    print(f"[servers] stopping {h.endpoint.name} (pid {child.pid})")
    pgid = child.pid
    try:
        os.killpg(pgid, signal.SIGTERM)
    except ProcessLookupError:
        # Group is already gone; nothing left to wait for.
        return

    try:
        child.wait(timeout=grace_s)
    except subprocess.TimeoutExpired:
        print(f"[servers] {h.endpoint.name} did not exit, sending SIGKILL")
        try:
            os.killpg(pgid, signal.SIGKILL)
        except ProcessLookupError:
            pass
        child.wait(timeout=5)
|
|
|
|
|
|
# ---------- launch-command builders ----------
|
|
|
|
|
|
def xserv_launch_cmd(
    bin_path: str,
    model_dir: str,
    port: int,
    *,
    max_batch: int,
    max_seq_len: int,
) -> list[str]:
    """Build the argv list used to launch xserv-server.

    The result is `bin_path`, then `model_dir` as a positional argument,
    followed by `--port`, `--max-batch` and `--max-seq-len` with their values
    rendered as strings.
    """
    argv = [bin_path, model_dir]
    numeric_flags = (
        ("--port", port),
        ("--max-batch", max_batch),
        ("--max-seq-len", max_seq_len),
    )
    for flag, value in numeric_flags:
        argv += [flag, str(value)]
    return argv
|
|
|
|
|
|
def llama_cpp_launch_cmd(
    bin_path: str,
    gguf_path: str,
    port: int,
    *,
    n_parallel: int,
    ctx_size: int,
    n_gpu_layers: int = 99,
) -> list[str]:
    """Build the argv list used to launch llama-server.

    The command passes the GGUF model via `-m`, the port via `--port`, a
    fixed `--host 0.0.0.0`, then `-c`, `-ngl` and `--parallel` with their
    values rendered as strings, and ends with `--log-disable`.
    """
    argv = [bin_path, "-m", gguf_path, "--port", str(port), "--host", "0.0.0.0"]
    tuning_flags = (
        ("-c", ctx_size),
        ("-ngl", n_gpu_layers),
        ("--parallel", n_parallel),
    )
    for flag, value in tuning_flags:
        argv.extend((flag, str(value)))
    # Be quiet by default; the log file already captures stderr.
    argv.append("--log-disable")
    return argv
|