Files
xserv/tools/bench/report.py
Gahow Wang 49c7653222 tools: add llama.cpp comparison baseline + standard benchmark suite
Vendor llama.cpp as a submodule pinned to b9371 and add a one-click
benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the
  GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00

123 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Combined speed + quality report (markdown + json side-cars)."""
from __future__ import annotations
import datetime as dt
import json
import os
from typing import Any
from .config import DEFAULT_SYSTEMS
def _fmt(x: float, nd: int = 1) -> str:
if x is None or x < 0:
return ""
return f"{x:.{nd}f}"
def _speed_table(rows: list[dict[str, Any]]) -> str:
    """Render the speed summary rows as a markdown table.

    One table line is emitted per (scenario, metric) pair, with one column per
    system followed by a speedup column. Scenarios and systems are listed in
    the order they first appear in ``rows``.

    The speedup column is only filled when both the ``"xserv"`` and
    ``"llama.cpp"`` systems have a positive value for the metric. It is
    oriented so that a larger number always favours xserv: ``xserv / llama``
    for higher-is-better metrics, ``llama / xserv`` for lower-is-better ones.

    Args:
        rows: Summary dicts. Each must carry ``"system"`` and ``"scenario"``;
            the metric keys listed in ``metrics`` are optional. A metric that
            is missing, ``None`` or negative is rendered as an empty cell.

    Returns:
        The markdown table terminated by a newline, or a placeholder line when
        ``rows`` is empty.
    """
    if not rows:
        return "_(no speed results)_\n"

    # dict.fromkeys de-duplicates while keeping first-seen order.
    scenarios = list(dict.fromkeys(r["scenario"] for r in rows))
    systems = list(dict.fromkeys(r["system"] for r in rows))
    # If a (system, scenario) pair occurs more than once, the last row wins.
    by = {(r["system"], r["scenario"]): r for r in rows}

    out = [
        "| scenario | metric | " + " | ".join(systems) + " | speedup (xserv ÷ llama.cpp) |",
        # One separator cell per system plus one for the speedup column.
        "|---|---|" + "|".join(["---"] * (len(systems) + 1)) + "|",
    ]
    metrics = [
        ("ttft_ms_p50", "TTFT p50 (ms)", "lower"),
        ("ttft_ms_p95", "TTFT p95 (ms)", "lower"),
        ("tpot_ms_p50", "TPOT p50 (ms/tok)", "lower"),
        ("throughput_tok_s", "Throughput (tok/s)", "higher"),
    ]
    for sc in scenarios:
        for key, label, direction in metrics:
            digits = 2 if "tpot" in key else 1
            vals: dict[str, float] = {}
            for s in systems:
                row = by.get((s, sc))
                v = row.get(key) if row else None
                # Normalise "no measurement" to the -1.0 sentinel so the
                # numeric comparisons below never see None.
                vals[s] = -1.0 if v is None else v
            cells = [_fmt(vals[s], digits) for s in systems]

            xserv_v = vals.get("xserv", -1.0)
            llama_v = vals.get("llama.cpp", -1.0)
            if xserv_v > 0 and llama_v > 0:
                if direction == "higher":
                    ratio = xserv_v / llama_v
                else:
                    ratio = llama_v / xserv_v
                cells.append(f"{ratio:.2f}×")
            else:
                cells.append("")
            out.append(f"| {sc} | {label} | " + " | ".join(cells) + " |")
    return "\n".join(out) + "\n"
def _quality_table(rows: list[dict[str, Any]]) -> str:
    """Render the quality summary rows as a markdown table.

    Rows are grouped by their ``"task"`` value; tasks appear in first-seen
    order and rows within a task keep their input order.

    Args:
        rows: Summary dicts carrying the keys ``task``, ``system``,
            ``n_total``, ``n_correct``, ``accuracy``,
            ``mean_completion_tokens``, ``mean_ttft_ms``, ``mean_tpot_ms``
            and ``wall_s``.

    Returns:
        The markdown table terminated by a newline, or a placeholder line when
        ``rows`` is empty.
    """
    if not rows:
        return "_(no quality results)_\n"

    # Group rows per task; a plain dict keeps tasks in first-seen order.
    grouped: dict[str, list[dict[str, Any]]] = {}
    for entry in rows:
        name = entry["task"]
        if name in grouped:
            grouped[name].append(entry)
        else:
            grouped[name] = [entry]

    def render(task_name: str, rec: dict[str, Any]) -> str:
        # Accuracy is multiplied by 100 and shown as a percentage.
        head = f"| {task_name} | {rec['system']} | {rec['n_total']} | {rec['n_correct']} | "
        middle = f"{rec['accuracy'] * 100:.1f}% | {rec['mean_completion_tokens']:.0f} | "
        tail = f"{_fmt(rec['mean_ttft_ms'])} | {_fmt(rec['mean_tpot_ms'], 2)} | {rec['wall_s']:.1f} |"
        return head + middle + tail

    table = [
        "| task | system | n | correct | accuracy | mean tokens | TTFT (ms) | TPOT (ms/tok) | wall (s) |",
        "|---|---|---|---|---|---|---|---|---|",
    ]
    for task_name, members in grouped.items():
        table.extend(render(task_name, rec) for rec in members)
    return "\n".join(table) + "\n"
def write_report(
    out_dir: str,
    speed_rows: list[dict[str, Any]],
    speed_raw: list[dict[str, Any]],
    quality_rows: list[dict[str, Any]],
    quality_cases: list[dict[str, Any]],
    env: dict[str, Any],
) -> str:
    """Write the comparison report as a markdown file plus a JSON side-car.

    Both files are created in ``out_dir`` (which is created if needed) and
    share a ``comparison-<YYYYmmdd-HHMMSS>`` stem built from the current local
    time. The stamp has one-second resolution, so a second call within the
    same second and directory overwrites the earlier files.

    Args:
        out_dir: Directory to write the report files into.
        speed_rows: Speed summary rows, rendered into the markdown table and
            stored under ``speed.summary`` in the JSON.
        speed_raw: Raw speed results, stored under ``speed.raw`` in the JSON.
        quality_rows: Quality summary rows, rendered into the markdown table
            and stored under ``quality.summary`` in the JSON.
        quality_cases: Per-case quality results, stored under
            ``quality.cases`` in the JSON.
        env: Environment description; listed key by key in the markdown and
            stored under ``env`` in the JSON.

    Returns:
        The path of the markdown report.
    """
    os.makedirs(out_dir, exist_ok=True)
    stamp = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
    md_path = os.path.join(out_dir, f"comparison-{stamp}.md")
    json_path = os.path.join(out_dir, f"comparison-{stamp}.json")

    payload = {
        "stamp": stamp,
        "env": env,
        "speed": {"summary": speed_rows, "raw": speed_raw},
        "quality": {"summary": quality_rows, "cases": quality_cases},
    }
    # Explicit UTF-8 keeps the output independent of the host locale.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

    lines: list[str] = [
        "# xserv vs llama.cpp — comparison\n",
        f"_Generated: {stamp}_\n",
        "## Environment\n",
    ]
    lines.extend(f"- **{k}**: {v}" for k, v in env.items())
    lines.append("")
    lines.append("## Speed\n")
    lines.append(_speed_table(speed_rows))
    lines.append("\n## Quality\n")
    lines.append(_quality_table(quality_rows))
    lines.append(f"\n_Raw results: `{os.path.basename(json_path)}`_\n")

    # The report contains non-ASCII characters ("—", "×", "÷"); writing with
    # the locale default encoding can fail or produce a non-UTF-8 file.
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"\n[report] wrote {md_path}")
    print(f"[report] wrote {json_path}")
    return md_path