Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads: - setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline. - tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K. - fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON. - sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
123 lines
4.1 KiB
Python
123 lines
4.1 KiB
Python
"""Combined speed + quality report (markdown + json side-cars)."""
|
||
|
||
from __future__ import annotations
|
||
|
||
import datetime as dt
|
||
import json
|
||
import os
|
||
from typing import Any
|
||
|
||
from .config import DEFAULT_SYSTEMS
|
||
|
||
|
||
def _fmt(x: float | None, nd: int = 1) -> str:
    """Format a metric for a table cell, or "—" when it is unavailable.

    Args:
        x: Value to render. ``None``, NaN and any negative number are
            treated as "no data" (callers in this module pass ``-1.0`` as
            a missing-value sentinel).
        nd: Number of digits after the decimal point.

    Returns:
        ``x`` in fixed-point notation with ``nd`` decimals, or an em dash
        when the value is unavailable.
    """
    # ``not x >= 0`` instead of ``x < 0``: every ordering comparison with
    # NaN is False, so this spelling also routes NaN to the placeholder
    # rather than printing "nan" into the report.
    if x is None or not x >= 0:
        return "—"
    return f"{x:.{nd}f}"
|
||
|
||
|
||
def _speed_table(rows: list[dict[str, Any]]) -> str:
    """Render the speed summary as a markdown table.

    One table row is emitted per (scenario, metric) pair, with one column
    per system followed by a speedup column. Scenarios and systems appear
    in the order in which they are first seen in *rows*.

    Args:
        rows: Summary dicts. Each must carry ``"system"`` and
            ``"scenario"``. The metric keys (``ttft_ms_p50``,
            ``ttft_ms_p95``, ``tpot_ms_p50``, ``throughput_tok_s``) may be
            missing or ``None``; such cells are rendered as "—".

    Returns:
        The markdown table terminated by a newline, or a placeholder line
        when *rows* is empty.
    """
    if not rows:
        return "_(no speed results)_\n"

    # dict.fromkeys de-duplicates while preserving first-seen order.
    scenarios = list(dict.fromkeys(r["scenario"] for r in rows))
    systems = list(dict.fromkeys(r["system"] for r in rows))

    # If a (system, scenario) pair occurs more than once, the last row wins.
    by = {(r["system"], r["scenario"]): r for r in rows}

    out = [
        "| scenario | metric | " + " | ".join(systems) + " | speedup (xserv ÷ llama.cpp) |",
        "|---|---|" + "|".join(["---"] * (len(systems) + 1)) + "|",
    ]

    # (row key, column label, which direction counts as "better")
    metrics = [
        ("ttft_ms_p50", "TTFT p50 (ms)", "lower"),
        ("ttft_ms_p95", "TTFT p95 (ms)", "lower"),
        ("tpot_ms_p50", "TPOT p50 (ms/tok)", "lower"),
        ("throughput_tok_s", "Throughput (tok/s)", "higher"),
    ]
    for sc in scenarios:
        for key, label, direction in metrics:
            decimals = 2 if "tpot" in key else 1
            cells = []
            vals = {}
            for s in systems:
                row = by.get((s, sc))
                v = row.get(key) if row else None
                if v is None:
                    # Normalise "no data" to the numeric sentinel so the
                    # speedup comparison below never sees None.
                    v = -1.0
                vals[s] = v
                cells.append(_fmt(v, decimals))

            xserv_v = vals.get("xserv", -1.0)
            llama_v = vals.get("llama.cpp", -1.0)
            if xserv_v > 0 and llama_v > 0:
                # The ratio is oriented so that > 1 always means xserv is
                # better: xserv/llama.cpp for higher-is-better metrics,
                # llama.cpp/xserv for lower-is-better ones (note that the
                # header label only spells out the former).
                if direction == "higher":
                    ratio = xserv_v / llama_v
                else:
                    ratio = llama_v / xserv_v
                cells.append(f"{ratio:.2f}×")
            else:
                cells.append("—")
            out.append(f"| {sc} | {label} | " + " | ".join(cells) + " |")
    return "\n".join(out) + "\n"
|
||
|
||
|
||
def _quality_table(rows: list[dict[str, Any]]) -> str:
    """Render the quality summary as a markdown table.

    Rows are grouped by their ``"task"`` value; tasks appear in first-seen
    order and, within a task, rows keep their input order.

    Args:
        rows: Summary dicts providing ``task``, ``system``, ``n_total``,
            ``n_correct``, ``accuracy``, ``mean_completion_tokens``,
            ``mean_ttft_ms``, ``mean_tpot_ms`` and ``wall_s``.

    Returns:
        The markdown table terminated by a newline, or a placeholder line
        when *rows* is empty.
    """
    if not rows:
        return "_(no quality results)_\n"

    # Bucket rows per task; dict insertion order gives first-seen ordering.
    grouped: dict[str, list[dict[str, Any]]] = {}
    for entry in rows:
        name = entry["task"]
        if name not in grouped:
            grouped[name] = []
        grouped[name].append(entry)

    header = "| task | system | n | correct | accuracy | mean tokens | TTFT (ms) | TPOT (ms/tok) | wall (s) |"
    divider = "|---|---|---|---|---|---|---|---|---|"
    body = [
        f"| {name} | {entry['system']} | {entry['n_total']} | {entry['n_correct']} | "
        f"{entry['accuracy'] * 100:.1f}% | {entry['mean_completion_tokens']:.0f} | "
        f"{_fmt(entry['mean_ttft_ms'])} | {_fmt(entry['mean_tpot_ms'], 2)} | {entry['wall_s']:.1f} |"
        for name, members in grouped.items()
        for entry in members
    ]
    return "\n".join([header, divider, *body]) + "\n"
|
||
|
||
|
||
def write_report(
    out_dir: str,
    speed_rows: list[dict[str, Any]],
    speed_raw: list[dict[str, Any]],
    quality_rows: list[dict[str, Any]],
    quality_cases: list[dict[str, Any]],
    env: dict[str, Any],
) -> str:
    """Write the comparison report as markdown plus a JSON side-car.

    Both files are created in *out_dir* (which is created if necessary) and
    share the stem ``comparison-<YYYYmmdd-HHMMSS>`` taken from the current
    local time. The JSON file is written first, then the markdown.

    Args:
        out_dir: Destination directory.
        speed_rows: Speed summary rows; rendered as the "Speed" table and
            stored in the JSON.
        speed_raw: Raw speed data; stored in the JSON only.
        quality_rows: Quality summary rows; rendered as the "Quality" table
            and stored in the JSON.
        quality_cases: Per-case quality data; stored in the JSON only.
        env: Environment description; rendered as a bullet list and stored
            in the JSON.

    Returns:
        Path of the markdown report.

    Note:
        All arguments are passed to ``json.dump`` and must therefore be
        JSON-serializable.
    """
    os.makedirs(out_dir, exist_ok=True)
    stamp = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
    md_path = os.path.join(out_dir, f"comparison-{stamp}.md")
    json_path = os.path.join(out_dir, f"comparison-{stamp}.json")

    payload = {
        "stamp": stamp,
        "env": env,
        "speed": {"summary": speed_rows, "raw": speed_raw},
        "quality": {"summary": quality_rows, "cases": quality_cases},
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    lines: list[str] = [
        "# xserv vs llama.cpp — comparison\n",
        f"_Generated: {stamp}_\n",
        "## Environment\n",
    ]
    lines.extend(f"- **{k}**: {v}" for k, v in env.items())
    lines.append("")
    lines.append("## Speed\n")
    lines.append(_speed_table(speed_rows))
    lines.append("\n## Quality\n")
    lines.append(_quality_table(quality_rows))
    lines.append(f"\n_Raw results: `{os.path.basename(json_path)}`_\n")

    # The report contains non-ASCII characters ("—", "×", "÷"); pin the
    # encoding so the write does not depend on the host's locale.
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"\n[report] wrote {md_path}")
    print(f"[report] wrote {json_path}")
    return md_path
|