Files
agentic-kvc/microbench/fresh_setup/fig_agg.py
Gahow Wang fafc44da79 MB5 PD reuse-centric ablation: tooling, data, Fig 1-3
Three-axis controlled ablation of PD-colo vs PD-disagg on synthetic regular
traces (closed-loop, controlled reuse via REPLAY_NO_REALIZED_PREFIX) on the
clean stack (e13391e gated off).

  Axis 1 (Fig 1) -- reuse 6%->94% at N=8, in8192/out256
  Axis 2 (Fig 2) -- shape in2048/out2048 -> in32768/out64 at N=8, reuse~70%
  Axis 3 (Fig 3) -- concurrency N=8/16/32/64 at reuse~71%, in8192/out256

Findings:
  * APC parity colo=PD at every reuse (5.5/22/44/66/77/82%) -- contamination
    fix validated.
  * PD edge erodes 1.57x->1.10x with reuse; prefill GPUs strand 26%->9%.
  * Shape: PD-best peaks mid-sweep (1.34x at in8192/out512); wrong PD ratio
    catastrophic at prefill extreme (in32768/out64 pd2 = 378/400, p99 432s).
  * Concurrency: PD wins N<=32 (1.23-1.29x), TIPS at N=64 -- pd2/pd4
    crater (APC 71%->1.4%, TPS -30%) while colo scales cleanly.

Infrastructure:
  * replayer: --max-inflight-sessions, --inter-turn-think, --no-realized-prefix
    (env-defaulted via REPLAY_MAX_INFLIGHT, REPLAY_INTER_TURN_THINK_S,
    REPLAY_NO_REALIZED_PREFIX).
  * mb5_run.sh: writes bench_config.json + gpu_util.csv + run_window.json +
    instance_apc.txt + metrics.jsonl for bench_report/fig_agg ingest.
  * fig_agg.py: per-arm GPU role split + producer-side APC; --json mode.
  * gpu_util_report.py: companion per-GPU util report from gpu_util.csv.
  * partial_summary.py: stats from in-flight replay_metrics.jsonl
    (works before metrics.summary.json exists).

Data: analysis/mb5_pd_ablation/fig{1,2,3}.json (24 + 20 + 16 rows).
Figures: figs/mb5_pd_ablation/fig{1_reuse,2_shape,3_concurrency}_axis.png.
2026-05-31 20:14:46 +08:00

141 lines
5.1 KiB
Python

"""Aggregate a set of MB5 run dirs into one comparison table.
Pulls the three core metrics the analysis cares about, per run:
- E2E latency (from replay_metrics.summary.json: latency_stats_s)
- TPS (output tokens / wall_clock_s)
- GPU util by workers (gpu_util.csv over run_window, split prefill/decode by role)
plus honest reuse (producer-side APC from instance_apc.txt) and TTFT/TPOT for logs.
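Arm + GPU role split + producer APC ports are inferred from the dir name.
Legacy names use an underscore-delimited token:
    *_colo_*         -> 8 kv_both        ; apc ports 8000-8007 (all keep prefix)
    *_pd6_*          -> 6P+2D P0-5/D6-7  ; apc 8000-8005
    *_pd_* / *_pd4_* -> 4P+4D P0-3/D4-7  ; apc 8000-8003 (legacy runs say "pd", not "pd4")
    *_pd2_*          -> 2P+6D P0-1/D2-7  ; apc 8000-8001
New driver names contain the config literally ("8C-proxy", "6P+2D", "4P+4D",
"2P+6D") and map to the same four layouts.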
Usage: fig_agg.py [--json] <run_dir> [<run_dir> ...]
"""
from __future__ import annotations
import csv
import json
import re
import statistics
import sys
from pathlib import Path
def arm_of(name: str):
    """Infer the benchmark arm and its GPU/port layout from a run-dir name.

    New driver naming (run_conc.sh / run_reuse_fixed.sh, "..._<CONFIG>_rep<r>")
    embeds the config literally ("8C-proxy", "6P+2D", "2P+6D", "4P+4D") and is
    checked first.  Legacy naming (original May-30 corrected runs) uses an
    underscore-delimited token: "colo", "pd6", "pd2", "pd4" or plain "pd".
    A legacy token is recognised in the middle of the name ("_pd6_") or, as a
    fallback, as the final component ("..._pd6").

    Returns:
        A 4-tuple ``(arm, pgpus, dgpus, apc_ports)``: the arm label, the GPU
        indices of the first (prefill) group, the GPU indices of the second
        (decode) group, and the instance ports whose APC counters are summed.
        For "colo" all eight GPUs are in the first group and the second is
        empty.  Unrecognised names yield arm ``"?"`` with the colo layout.
    """

    def layout(arm, n_first):
        # First n_first GPUs form the first group, the rest the second; the
        # APC ports are the first n_first ports starting at 8000.
        gpus = list(range(8))
        return (arm, gpus[:n_first], gpus[n_first:],
                list(range(8000, 8000 + n_first)))

    # New driver naming: the config string appears verbatim in the name.
    for marker, arm, n_first in (
        ("8C-proxy", "colo", 8),
        ("6P+2D", "6P+2D", 6),
        ("2P+6D", "2P+6D", 2),
        ("4P+4D", "4P+4D", 4),
    ):
        if marker in name:
            return layout(arm, n_first)

    # Legacy naming.
    if "_colo_" in name or name.endswith("_colo"):
        return layout("colo", 8)

    legacy_pd = (
        ("pd6", "6P+2D", 6),
        ("pd2", "2P+6D", 2),
        ("pd4", "4P+4D", 4),
        ("pd", "4P+4D", 4),
    )
    for token, arm, n_first in legacy_pd:
        if f"_{token}_" in name:
            return layout(arm, n_first)
    # Fallback: accept the token as the last component too, mirroring the
    # "_colo" suffix handling.  Runs only after every in-the-middle match has
    # been tried, so previously recognised names resolve exactly as before.
    for token, arm, n_first in legacy_pd:
        if name.endswith(f"_{token}"):
            return layout(arm, n_first)

    return layout("?", 8)
def util_split(run: Path, pgpus, dgpus):
    """Mean GPU utilisation for two GPU groups, restricted to the run window.

    Reads ``gpu_util.csv`` in *run* (columns ``timestamp``, ``gpu``,
    ``util_pct``).  If ``run_window.json`` exists and carries a truthy
    ``t_start_unix``, only samples with ``t_start_unix <= timestamp`` are
    kept; ``t_end_unix``, when present, is applied as an inclusive upper
    bound.  Rows with missing or non-numeric fields are skipped.

    Args:
        run: Run directory.
        pgpus: GPU indices whose samples are pooled into the first mean.
        dgpus: GPU indices whose samples are pooled into the second mean.

    Returns:
        ``(mean_p, mean_d)``.  Each element is ``None`` when its group has no
        samples; both are ``None`` when ``gpu_util.csv`` does not exist.
    """
    window = {}
    window_path = run / "run_window.json"
    if window_path.exists():
        with open(window_path) as fh:
            window = json.load(fh)
    t_start, t_end = window.get("t_start_unix"), window.get("t_end_unix")

    csv_path = run / "gpu_util.csv"
    if not csv_path.exists():
        return None, None

    samples = {}
    # newline="" lets the csv module do its own newline handling.
    with open(csv_path, newline="") as fh:
        for row in csv.DictReader(fh):
            try:
                ts = float(row["timestamp"])
                gpu = int(row["gpu"])
                util = float(row["util_pct"])
            except (KeyError, TypeError, ValueError):
                # TypeError: DictReader fills cells missing from a short row
                # with None.
                continue
            # The window is active only when a start time was recorded; a
            # missing end time leaves it open-ended instead of raising on a
            # float-vs-None comparison.
            if t_start and (ts < t_start or (t_end is not None and ts > t_end)):
                continue
            samples.setdefault(gpu, []).append(util)

    first = [u for g in pgpus for u in samples.get(g, [])]
    second = [u for g in dgpus for u in samples.get(g, [])]
    return (statistics.fmean(first) if first else None,
            statistics.fmean(second) if second else None)
def apc(run: Path, ports):
f = run / "instance_apc.txt"
if not f.exists():
return None
q = h = 0
for line in open(f):
m = dict(re.findall(r"(\w+)=(\S+)", line))
try:
p = int(m.get("port", -1))
except ValueError:
continue
if p in ports:
q += float(m.get("queries", 0)); h += float(m.get("hits", 0))
return (h / q) if q else None
def main():
    """CLI entry point: aggregate MB5 run dirs into one comparison table.

    Every command-line argument other than ``--json`` is treated as a run
    directory.  A directory without ``replay_metrics.summary.json`` is skipped
    with a note on stderr.  For each remaining run one row is built from the
    summary (E2E latency percentiles, TTFT p90, TPOT p99, output-token
    throughput), the per-group GPU utilisation and the APC ratio.

    With ``--json`` the rows are printed as a single JSON array; otherwise a
    fixed-width table sorted by run name is printed, with ``-`` in place of
    any metric that is unavailable.
    """
    args = sys.argv[1:]
    as_json = "--json" in args
    run_dirs = [a for a in args if a != "--json"]

    rows = []
    for d in run_dirs:
        run = Path(d)
        summary_path = run / "replay_metrics.summary.json"
        if not summary_path.exists():
            # Keep stdout clean for --json consumers; report on stderr only.
            print(f"fig_agg: skipping {d}: no replay_metrics.summary.json",
                  file=sys.stderr)
            continue
        with open(summary_path) as fh:
            s = json.load(fh)

        arm, pg, dg, ports = arm_of(run.name)
        # "or {}" also covers a stats block that is present but null.
        lat = s.get("latency_stats_s") or {}
        ttft = s.get("ttft_stats_s") or {}
        tpot = s.get("tpot_stats_s") or {}
        out = s.get("actual_output_tokens_stats") or {}
        # A missing/zero wall clock falls back to 1.0 to avoid dividing by
        # zero; TPS then equals the total output-token count.
        wall = s.get("wall_clock_s") or 1.0
        n = s.get("success_count", 0)
        req = s.get("request_count", 0)
        # Total output tokens reconstructed as count * mean.
        tot_out = (out.get("count") or 0) * (out.get("mean") or 0)
        tps = tot_out / wall
        pu, du = util_split(run, pg, dg)
        a = apc(run, ports)
        rows.append({
            "name": run.name, "arm": arm, "n": n, "req": req,
            "e2e_p50": lat.get("p50"), "e2e_p90": lat.get("p90"), "e2e_p99": lat.get("p99"),
            "e2e_mean": lat.get("mean"),
            "ttft_p90": ttft.get("p90"), "tpot_p99": tpot.get("p99"),
            "tps": tps, "wall": wall, "pu": pu, "du": du, "apc": a,
        })

    if as_json:
        print(json.dumps(rows))
        return

    def fmt(x, w=7, p=1):
        # Right-aligned fixed-point number, or "-" for anything non-numeric.
        return f"{x:>{w}.{p}f}" if isinstance(x, (int, float)) else f"{'-':>{w}}"

    def scaled(x, factor):
        # Scale only real numbers so a missing metric stays missing and is
        # rendered as "-" rather than a misleading 0.0.
        return x * factor if isinstance(x, (int, float)) else None

    hdr = (f"{'run':<34}{'arm':>7}{'ok/req':>9}{'E2Ep50':>8}{'E2Ep90':>8}{'E2Ep99':>8}"
           f"{'TPS':>8}{'Putil':>7}{'Dutil':>7}{'APC%':>7}{'TTFTp90':>9}{'TPOTp99ms':>10}")
    print(hdr)
    print("-" * len(hdr))
    for r in sorted(rows, key=lambda r: r["name"]):
        ok_req = f"{r['n']}/{r['req']}"
        print(f"{r['name']:<34}{r['arm']:>7}{ok_req:>9}"
              f"{fmt(r['e2e_p50'])}{fmt(r['e2e_p90'])}{fmt(r['e2e_p99'])}"
              f"{fmt(r['tps'], 8, 1)}{fmt(r['pu'])}{fmt(r['du'])}"
              f"{fmt(scaled(r['apc'], 100))}{fmt(r['ttft_p90'], 9, 2)}"
              f"{fmt(scaled(r['tpot_p99'], 1000), 10, 1)}")
# Run the aggregation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()