Reuse and concurrency axes redone with proper controlled variables, plus
the orchestration used to run them on dash0:
- run_reuse_fixed.sh: hold REAL prefill work (delta) constant, vary only
cached prefix -> reuse = C/(C+U). Supersedes old fig1 (which held
input=8192 and sliced prefix out, confounding "more reuse" with "less
prefill").
- run_conc.sh: agentic-corner config (in=32768, delta=512, reuse=0.984,
out=128) that exposes PD's structural KV-transfer tax. Supersedes old fig3.
- run_campaign{,2,3}.sh, backfill_d2048o128.sh: serial campaign drivers
(strictly one driver at a time), out=128 sweeps, PD wall-cap for
collapse-draining high-reuse arms, and flaked-arm backfill.
- mb5_run_gpu.sh: per-config bring-up / replay / teardown orchestrator.
- plot_pd_crossover.py: render the reuse_compare figures from fig_agg dumps.
- fig_agg.py: tolerate null stats from fully-collapsed arms (0 successes
write the stat keys as null; `dict.get(k, {})` returns null, not {}).
Data: fig1_reuse_fixed.json, fig1_reuse_d{1024,2048}_o128.json
Figs: reuse_compare_AB.png, reuse_compare_ABC.png
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
143 lines
5.2 KiB
Python
143 lines
5.2 KiB
Python
"""Aggregate a set of MB5 run dirs into one comparison table.
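
Pulls the three core metrics the analysis cares about, per run:
  - E2E latency (from replay_metrics.summary.json: latency_stats_s)
  - TPS         (output tokens / wall_clock_s)
  - GPU util by workers (gpu_util.csv over run_window, split prefill/decode by role)
plus honest reuse (producer-side APC from instance_apc.txt) and TTFT/TPOT for logs.

Arm + GPU role split + producer APC ports are inferred from the dir name.

New driver naming (run_conc.sh / run_reuse_fixed.sh, "..._<CONFIG>_rep<r>"),
checked first:
  *8C-proxy*  -> 8 kv_both        ; apc ports 8000-8007 (all keep prefix)
  *6P+2D*     -> 6P+2D P0-5/D6-7  ; apc 8000-8005
  *2P+6D*     -> 2P+6D P0-1/D2-7  ; apc 8000-8001
  *4P+4D*     -> 4P+4D P0-3/D4-7  ; apc 8000-8003

Legacy naming:
  *_colo_*    -> 8 kv_both        ; apc ports 8000-8007 (all keep prefix)
  *_pd6_*     -> 6P+2D P0-5/D6-7  ; apc 8000-8005
  *_pd_*      -> 4P+4D P0-3/D4-7  ; apc 8000-8003   (note: "pd" not "pd4";
                                                     "_pd4_" is accepted too)
  *_pd2_*     -> 2P+6D P0-1/D2-7  ; apc 8000-8001

A name matching none of the above is reported as arm "?" with all 8 GPUs
counted on the prefill side and ports 8000-8007.

Usage: fig_agg.py [--json] <run_dir> [<run_dir> ...]

Run dirs without a replay_metrics.summary.json are skipped. With --json the
rows are printed as one JSON list instead of the text table.
"""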
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import json
|
|
import re
|
|
import statistics
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def arm_of(name: str):
    """Infer the serving arm and its GPU/port layout from a run-dir name.

    Returns a 4-tuple ``(arm, prefill_gpus, decode_gpus, apc_ports)``:
    the arm label, the GPU indices counted as prefill (or colocated)
    workers, the GPU indices counted as decode workers, and the ports
    whose APC counters are aggregated.  New-driver tags are matched
    before legacy tags; an unrecognised name yields the label ``"?"``
    with the same layout as the colocated arm.
    """

    def colocated(label="colo"):
        # All 8 GPUs on the "prefill" side, no decode GPUs, ports 8000-8007.
        return label, list(range(8)), [], list(range(8000, 8008))

    def disaggregated(n_prefill):
        # First n_prefill GPUs prefill, the rest of the 8 decode; one port
        # per prefill GPU starting at 8000.
        return (
            f"{n_prefill}P+{8 - n_prefill}D",
            list(range(n_prefill)),
            list(range(n_prefill, 8)),
            list(range(8000, 8000 + n_prefill)),
        )

    # New driver naming (run_conc.sh / run_reuse_fixed.sh): "..._<CONFIG>_rep<r>".
    if "8C-proxy" in name:
        return colocated()
    for n_prefill in (6, 2, 4):
        if f"{n_prefill}P+{8 - n_prefill}D" in name:
            return disaggregated(n_prefill)

    # Legacy naming (original May-30 corrected runs).
    if "_colo_" in name or name.endswith("_colo"):
        return colocated()
    legacy_tags = (
        (("_pd6_",), 6),
        (("_pd2_",), 2),
        (("_pd4_", "_pd_"), 4),
    )
    for tags, n_prefill in legacy_tags:
        if any(tag in name for tag in tags):
            return disaggregated(n_prefill)

    return colocated("?")
|
|
|
|
|
|
def util_split(run: Path, pgpus, dgpus):
    """Return mean GPU utilisation for the prefill and decode GPU groups.

    Reads ``<run>/gpu_util.csv`` (columns ``timestamp``, ``gpu``,
    ``util_pct``) and, when ``<run>/run_window.json`` provides a truthy
    ``t_start_unix``, keeps only samples inside the run window.  A missing
    ``t_end_unix`` is treated as an open-ended window rather than an error.

    Args:
        run: Run directory.
        pgpus: GPU indices counted as prefill workers.
        dgpus: GPU indices counted as decode workers.

    Returns:
        ``(prefill_mean, decode_mean)``.  Each mean pools every kept sample
        of every GPU in the group; it is ``None`` when the group has no
        samples.  ``(None, None)`` when ``gpu_util.csv`` does not exist.
    """
    window = {}
    window_path = run / "run_window.json"
    if window_path.exists():
        with open(window_path) as fh:
            window = json.load(fh)
    t0, t1 = window.get("t_start_unix"), window.get("t_end_unix")
    # A window with a start but no recorded end is open-ended; comparing a
    # float timestamp against None would raise TypeError.
    t_hi = float("inf") if t1 is None else t1

    csv_path = run / "gpu_util.csv"
    if not csv_path.exists():
        return None, None

    samples = {}
    # newline="" is what the csv module asks for so it can handle line
    # endings itself.
    with open(csv_path, newline="") as fh:
        for row in csv.DictReader(fh):
            try:
                ts = float(row["timestamp"])
                gpu = int(row["gpu"])
                util = float(row["util_pct"])
            except (ValueError, KeyError, TypeError):
                # TypeError: DictReader fills missing trailing fields of a
                # short (e.g. truncated) row with None.
                continue
            if t0 and not (t0 <= ts <= t_hi):
                continue
            samples.setdefault(gpu, []).append(util)

    prefill = [u for g in pgpus for u in samples.get(g, [])]
    decode = [u for g in dgpus for u in samples.get(g, [])]
    return (statistics.fmean(prefill) if prefill else None,
            statistics.fmean(decode) if decode else None)
|
|
|
|
|
|
def apc(run: Path, ports):
    """Return the aggregate prefix-cache hit ratio for the given ports.

    Scans ``<run>/instance_apc.txt`` for ``key=value`` tokens on each line
    and sums ``queries`` and ``hits`` over the lines whose ``port`` is in
    *ports*.  Lines whose ``port``, ``queries`` or ``hits`` value is not
    numeric are skipped, so one malformed line cannot abort the whole
    aggregation.

    Args:
        run: Run directory.
        ports: Container of integer ports to include.

    Returns:
        ``hits / queries`` as a float, or ``None`` when the file is missing
        or no queries were counted for the selected ports.
    """
    path = run / "instance_apc.txt"
    if not path.exists():
        return None

    total_queries = total_hits = 0.0
    with open(path) as fh:
        for line in fh:
            fields = dict(re.findall(r"(\w+)=(\S+)", line))
            try:
                port = int(fields.get("port", -1))
                # Parse both counters before accumulating so a bad value
                # never leaves a half-counted line behind.
                queries = float(fields.get("queries", 0))
                hits = float(fields.get("hits", 0))
            except ValueError:
                continue
            if port in ports:
                total_queries += queries
                total_hits += hits
    return (total_hits / total_queries) if total_queries else None
|
|
|
|
|
|
def _fmt_cell(value, width=7, precision=1):
    """Right-align *value* as fixed-point, or ``-`` when it is not a number."""
    if isinstance(value, (int, float)):
        return f"{value:>{width}.{precision}f}"
    return f"{'-':>{width}}"


def _scaled(value, factor):
    """Multiply *value* by *factor*, passing ``None`` (unknown) through."""
    return None if value is None else value * factor


def _summarize_run(run):
    """Build one result row for *run*, or ``None`` if it has no summary file."""
    summary_path = run / "replay_metrics.summary.json"
    if not summary_path.exists():
        return None
    with open(summary_path) as fh:
        summary = json.load(fh)

    arm, prefill_gpus, decode_gpus, ports = arm_of(run.name)
    # `or {}` / `or 0` because a fully-collapsed arm (0 successes) writes these
    # as null, and dict.get(k, default) returns null (not the default) when the
    # key exists with value null.
    lat = summary.get("latency_stats_s") or {}
    ttft = summary.get("ttft_stats_s") or {}
    tpot = summary.get("tpot_stats_s") or {}
    out = summary.get("actual_output_tokens_stats") or {}
    # Fall back to 1.0 so the TPS division below cannot divide by zero/None.
    wall = summary.get("wall_clock_s") or 1.0
    total_out_tokens = (out.get("count") or 0) * (out.get("mean") or 0)

    prefill_util, decode_util = util_split(run, prefill_gpus, decode_gpus)
    hit_ratio = apc(run, ports)
    return {
        "name": run.name, "arm": arm,
        "n": summary.get("success_count", 0), "req": summary.get("request_count", 0),
        "e2e_p50": lat.get("p50"), "e2e_p90": lat.get("p90"), "e2e_p99": lat.get("p99"),
        "e2e_mean": lat.get("mean"),
        "ttft_p90": ttft.get("p90"), "tpot_p99": tpot.get("p99"),
        "tps": total_out_tokens / wall, "wall": wall,
        "pu": prefill_util, "du": decode_util, "apc": hit_ratio,
    }


def _print_table(rows):
    """Print *rows* as a fixed-width text table, sorted by run name."""
    hdr = (f"{'run':<34}{'arm':>7}{'ok/req':>9}{'E2Ep50':>8}{'E2Ep90':>8}{'E2Ep99':>8}"
           f"{'TPS':>8}{'Putil':>7}{'Dutil':>7}{'APC%':>7}{'TTFTp90':>9}{'TPOTp99ms':>10}")
    print(hdr)
    print("-" * len(hdr))
    for r in sorted(rows, key=lambda r: r["name"]):
        ok_req = str(r["n"]) + "/" + str(r["req"])
        # Unknown APC / TPOT stay None so they render as '-' like every other
        # missing metric, instead of a misleading 0.0.
        apc_pct = _scaled(r["apc"], 100)
        tpot_ms = _scaled(r["tpot_p99"], 1000)
        print(f"{r['name']:<34}{r['arm']:>7}{ok_req:>9}"
              f"{_fmt_cell(r['e2e_p50'])}{_fmt_cell(r['e2e_p90'])}{_fmt_cell(r['e2e_p99'])}"
              f"{_fmt_cell(r['tps'], 8, 1)}{_fmt_cell(r['pu'])}{_fmt_cell(r['du'])}"
              f"{_fmt_cell(apc_pct)}{_fmt_cell(r['ttft_p90'], 9, 2)}"
              f"{_fmt_cell(tpot_ms, 10, 1)}")


def main():
    """Aggregate the run dirs named on the command line and print the result.

    Every argument other than ``--json`` is taken as a run directory; dirs
    without ``replay_metrics.summary.json`` are skipped.  With ``--json``
    the rows are printed as a single JSON list (in argument order);
    otherwise a text table sorted by run name is printed.
    """
    args = sys.argv[1:]
    as_json = "--json" in args
    run_dirs = [a for a in args if a != "--json"]

    rows = []
    for d in run_dirs:
        row = _summarize_run(Path(d))
        if row is not None:
            rows.append(row)

    if as_json:
        print(json.dumps(rows))
        return
    _print_table(rows)
|
|
|
|
|
|
# Script entry point: aggregate only when executed directly, so importing this
# module has no side effects.
if __name__ == "__main__":
    main()
|