runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
25 lines
1.0 KiB
Python
25 lines
1.0 KiB
Python
"""Summarize the concurrent PP sweep: bench-out/pp{1,2,4}-{xserv,llama}."""
|
|
import glob
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
# Pipeline-parallel degrees and engine names that make up the sweep; each
# (pp, engine) pair maps to the directory "<base>/pp<pp>-<engine>".
PP_DEGREES = (1, 2, 4)
ENGINES = ("xserv", "llama")

HEADER_FMT = "%-3s %-7s %-9s %-9s %7s %9s %9s %10s %9s"
ROW_FMT = "%-3d %-7s %-9s %-9s %6.1f%% %9.0f %9.1f %10.2f %9.0f"


def collect_rows(base, pp_degrees=PP_DEGREES, engines=ENGINES):
    """Gather one result tuple per quality-summary entry found under *base*.

    For every (pp, engine) combination the directory
    ``<base>/pp<pp>-<engine>`` is searched for ``comparison-*.json``; only the
    file whose name sorts last is read.  Combinations with no matching file
    are skipped silently.

    Args:
        base: Root directory holding the per-run sub-directories.
        pp_degrees: Pipeline-parallel degrees to look for.
        engines: Engine name suffixes to look for.

    Returns:
        A list of tuples ``(pp, engine, task, n_correct, n_total,
        accuracy_percent, mean_completion_tokens, mean_ttft_ms, mean_tpot_ms,
        wall_s)`` ordered by pp, then engine, then file order.

    Raises:
        KeyError: If a result file lacks ``quality.summary`` or one of the
            per-task fields read below.
    """
    rows = []
    for pp in pp_degrees:
        for sysname in engines:
            pattern = os.path.join(base, f"pp{pp}-{sysname}", "comparison-*.json")
            files = sorted(glob.glob(pattern))
            if not files:
                continue
            # Context manager closes the handle deterministically; JSON is
            # decoded as UTF-8 instead of whatever the locale default is.
            with open(files[-1], encoding="utf-8") as fh:
                d = json.load(fh)
            for r in d["quality"]["summary"]:
                rows.append((pp, sysname, r["task"], r["n_correct"], r["n_total"],
                             r["accuracy"] * 100, r["mean_completion_tokens"],
                             r["mean_ttft_ms"], r["mean_tpot_ms"], r["wall_s"]))
    return rows


def format_table(rows):
    """Render *rows* (as produced by ``collect_rows``) into table lines.

    Returns:
        A list of strings: the header line followed by one line per row.
    """
    lines = [HEADER_FMT % ("PP", "engine", "task", "correct", "acc%",
                           "mean_tok", "TTFT_ms", "TPOT_ms", "wall_s")]
    for (pp, s, task, nc, nt, acc, tok, ttft, tpot, wall) in rows:
        lines.append(ROW_FMT %
                     (pp, s, task, f"{nc}/{nt}", acc, tok, ttft, tpot, wall))
    return lines


# Script body: the optional first CLI argument overrides the results root.
base = sys.argv[1] if len(sys.argv) > 1 else "bench-out"
rows = collect_rows(base)
for line in format_table(rows):
    print(line)