"""Combined speed + quality report (markdown + json side-cars).""" from __future__ import annotations import datetime as dt import json import os from typing import Any from .config import DEFAULT_SYSTEMS def _fmt(x: float, nd: int = 1) -> str: if x is None or x < 0: return "—" return f"{x:.{nd}f}" def _speed_table(rows: list[dict[str, Any]]) -> str: if not rows: return "_(no speed results)_\n" # scenarios in stable order scenarios: list[str] = [] for r in rows: if r["scenario"] not in scenarios: scenarios.append(r["scenario"]) systems: list[str] = [] for r in rows: if r["system"] not in systems: systems.append(r["system"]) by = {(r["system"], r["scenario"]): r for r in rows} out = [] out.append("| scenario | metric | " + " | ".join(systems) + " | speedup (xserv ÷ llama.cpp) |") out.append("|---|---|" + "|".join(["---"] * (len(systems) + 1)) + "|") metrics = [ ("ttft_ms_p50", "TTFT p50 (ms)", "lower"), ("ttft_ms_p95", "TTFT p95 (ms)", "lower"), ("tpot_ms_p50", "TPOT p50 (ms/tok)", "lower"), ("throughput_tok_s", "Throughput (tok/s)", "higher"), ] for sc in scenarios: for key, label, direction in metrics: cells = [] vals = {} for s in systems: row = by.get((s, sc)) v = row[key] if row else -1.0 vals[s] = v cells.append(_fmt(v, 2 if "tpot" in key else 1)) x = vals.get("xserv", -1.0) l = vals.get("llama.cpp", -1.0) if x > 0 and l > 0: ratio = (x / l) if direction == "higher" else (l / x) cells.append(f"{ratio:.2f}×") else: cells.append("—") out.append(f"| {sc} | {label} | " + " | ".join(cells) + " |") return "\n".join(out) + "\n" def _quality_table(rows: list[dict[str, Any]]) -> str: if not rows: return "_(no quality results)_\n" by_task: dict[str, list[dict[str, Any]]] = {} for r in rows: by_task.setdefault(r["task"], []).append(r) out: list[str] = [] out.append("| task | system | n | correct | accuracy | mean tokens | TTFT (ms) | TPOT (ms/tok) | wall (s) |") out.append("|---|---|---|---|---|---|---|---|---|") for task, task_rows in by_task.items(): for r in task_rows: out.append( f"| {task} | {r['system']} | {r['n_total']} | {r['n_correct']} | " f"{r['accuracy'] * 100:.1f}% | {r['mean_completion_tokens']:.0f} | " f"{_fmt(r['mean_ttft_ms'])} | {_fmt(r['mean_tpot_ms'], 2)} | {r['wall_s']:.1f} |" ) return "\n".join(out) + "\n" def write_report( out_dir: str, speed_rows: list[dict[str, Any]], speed_raw: list[dict[str, Any]], quality_rows: list[dict[str, Any]], quality_cases: list[dict[str, Any]], env: dict[str, Any], ) -> str: os.makedirs(out_dir, exist_ok=True) stamp = dt.datetime.now().strftime("%Y%m%d-%H%M%S") md_path = os.path.join(out_dir, f"comparison-{stamp}.md") json_path = os.path.join(out_dir, f"comparison-{stamp}.json") with open(json_path, "w") as f: json.dump({ "stamp": stamp, "env": env, "speed": {"summary": speed_rows, "raw": speed_raw}, "quality": {"summary": quality_rows, "cases": quality_cases}, }, f, indent=2) lines: list[str] = [] lines.append(f"# xserv vs llama.cpp — comparison\n") lines.append(f"_Generated: {stamp}_\n") lines.append("## Environment\n") for k, v in env.items(): lines.append(f"- **{k}**: {v}") lines.append("") lines.append("## Speed\n") lines.append(_speed_table(speed_rows)) lines.append("\n## Quality\n") lines.append(_quality_table(quality_rows)) lines.append(f"\n_Raw results: `{os.path.basename(json_path)}`_\n") with open(md_path, "w") as f: f.write("\n".join(lines)) print(f"\n[report] wrote {md_path}") print(f"[report] wrote {json_path}") return md_path