Three-axis controlled ablation of PD-colo vs PD-disagg on synthetic regular
traces (closed-loop, controlled reuse via REPLAY_NO_REALIZED_PREFIX) on the
clean stack (e13391e gated off).
Axis 1 (Fig 1) -- reuse 6%->94% at N=8, in8192/out256
Axis 2 (Fig 2) -- shape in2048/out2048 -> in32768/out64 at N=8, reuse~70%
Axis 3 (Fig 3) -- concurrency N=8/16/32/64 at reuse~71%, in8192/out256
Findings:
* APC parity colo=PD at every reuse (5.5/22/44/66/77/82%) -- contamination
fix validated.
* PD edge erodes 1.57x->1.10x with reuse; prefill GPUs strand 26%->9%.
* Shape: PD-best peaks mid-sweep (1.34x at in8192/out512); wrong PD ratio
catastrophic at prefill extreme (in32768/out64 pd2 = 378/400, p99 432s).
* Concurrency: PD wins at N<=32 (1.23-1.29x), but the balance tips to colo at N=64 -- pd2/pd4
crater (APC 71%->1.4%, TPS -30%) while colo scales cleanly.
Infrastructure:
* replayer: --max-inflight-sessions, --inter-turn-think, --no-realized-prefix
(env-defaulted via REPLAY_MAX_INFLIGHT, REPLAY_INTER_TURN_THINK_S,
REPLAY_NO_REALIZED_PREFIX).
* mb5_run.sh: writes bench_config.json + gpu_util.csv + run_window.json +
instance_apc.txt + metrics.jsonl for bench_report/fig_agg ingest.
* fig_agg.py: per-arm GPU role split + producer-side APC; --json mode.
* gpu_util_report.py: companion per-GPU util report from gpu_util.csv.
* partial_summary.py: stats from in-flight replay_metrics.jsonl
(works before metrics.summary.json exists).
Data: analysis/mb5_pd_ablation/fig{1,2,3}.json (24 + 20 + 16 rows).
Figures: figs/mb5_pd_ablation/fig{1_reuse,2_shape,3_concurrency}_axis.png.
99 lines
3.3 KiB
Python
99 lines
3.3 KiB
Python
"""Compute a per-run summary directly from replay_metrics.jsonl (for partial / in-flight runs).
|
|
|
|
Used when the replayer hasn't completed (so replay_metrics.summary.json doesn't exist
|
|
yet) but enough records have streamed to disk to read out the per-arm result.
|
|
|
|
Also accepts a finished run's directory and prints the same one-line summary for
|
|
apples-to-apples comparison.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import statistics
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def stats(xs):
    """Summarise a collection of numbers.

    Args:
        xs: any iterable of mutually comparable numbers.

    Returns:
        ``None`` when *xs* is empty; otherwise a dict with the keys
        ``n`` (count), ``mean``, ``p50``, ``p90`` and ``p99``.  ``p50`` is the
        element at index ``n // 2`` of the sorted data; ``p90``/``p99`` are the
        elements at the truncated index ``q * (n - 1)``.
    """
    ordered = sorted(xs)
    count = len(ordered)
    if not count:
        return None

    def at(fraction):
        # Truncating (not rounding) picks the lower neighbouring sample.
        return ordered[int(fraction * (count - 1))]

    summary = {"n": count, "mean": statistics.fmean(ordered)}
    summary["p50"] = ordered[count // 2]
    summary["p90"] = at(0.9)
    summary["p99"] = at(0.99)
    return summary
|
|
|
|
|
|
def apc(run: Path, producer_ports):
    """Return the aggregate cache hit ratio over a set of instance ports.

    Reads ``run / "instance_apc.txt"``.  Every line is scanned for
    ``key=value`` tokens; a line whose ``port`` value is one of
    *producer_ports* contributes its ``queries`` and ``hits`` values to the
    running totals.

    Args:
        run: run directory expected to contain ``instance_apc.txt``.
        producer_ports: iterable of integer ports whose lines are counted.

    Returns:
        ``hits / queries`` as a float, or ``None`` when the file does not
        exist or no queries were counted for the selected ports.
    """
    path = run / "instance_apc.txt"
    if not path.exists():
        return None

    wanted = set(producer_ports)  # O(1) membership per line
    key_value = re.compile(r"(\w+)=(\S+)")
    queries = hits = 0.0
    # Context manager so the handle is closed even if parsing raises.
    with open(path) as fh:
        for line in fh:
            fields = dict(key_value.findall(line))
            try:
                port = int(fields.get("port", -1))
                if port not in wanted:
                    continue
                # Parse both numbers before touching the totals so a malformed
                # line is skipped as a whole instead of half-counted.
                line_queries = float(fields.get("queries", 0))
                line_hits = float(fields.get("hits", 0))
            except ValueError:
                continue
            queries += line_queries
            hits += line_hits
    return (hits / queries) if queries else None
|
|
|
|
|
|
def _load_records(path):
    """Parse a JSONL file into a list of dict records, tolerating partial writes."""
    records = []
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                rec = json.loads(raw)
            except json.JSONDecodeError:
                # The file may still be being written; a truncated line is
                # expected there and must not abort the whole summary.
                continue
            if isinstance(rec, dict):
                records.append(rec)
    return records


def _producer_ports(run_name):
    """Map the arm tag embedded in a run directory name to its port list."""
    for tag, count in (("_colo_", 8), ("_pd6_", 6), ("_pd2_", 2)):
        if tag in run_name:
            return list(range(8000, 8000 + count))
    # Any other run name falls back to four ports (8000-8003).
    return list(range(8000, 8004))


def _present(records, key):
    """Collect the values stored under *key*, skipping absent or null entries."""
    return [r[key] for r in records if r.get(key) is not None]


def main():
    """Print a summary for every run directory named on the command line.

    For each directory the first existing file among
    ``replay_metrics.partial.jsonl``, ``replay_metrics.jsonl`` and
    ``metrics.jsonl`` is read as JSON lines.  Records whose ``error`` field is
    absent or null count as successful; latency / TTFT / TPOT statistics,
    output-token throughput and the producer-side APC ratio are printed for
    them.  Directories with none of the files print ``<name>: no records``.
    """
    # Hard-coded record count the progress percentage is measured against
    # (presumably the size of the full trace -- confirm).
    target = 1214
    # Preference order: a partial dump first, then the live
    # replay_metrics.jsonl, then metrics.jsonl.
    candidates = ("replay_metrics.partial.jsonl", "replay_metrics.jsonl", "metrics.jsonl")

    for d in sys.argv[1:]:
        run = Path(d)
        rec_path = next((run / fn for fn in candidates if (run / fn).exists()), None)
        if rec_path is None:
            print(f"{run.name}: no records")
            continue

        recs = _load_records(rec_path)
        oks = [r for r in recs if r.get("error") is None]
        lat = stats(_present(oks, "latency_s"))
        ttft = stats(_present(oks, "ttft_s"))
        tpot = stats(_present(oks, "tpot_s"))
        out = sum(r.get("actual_output_tokens", r.get("output_length", 0)) for r in oks)

        # Throughput window: earliest dispatch to latest finish among the
        # successful records.
        ts = _present(oks, "t_dispatch_unix")
        tf = _present(oks, "t_finish_unix")
        span = max(tf) - min(ts) if ts and tf else 0
        tps = out / span if span else 0

        a = apc(run, _producer_ports(run.name))

        progress = (
            f" (target={target} -> {len(oks)*100/target:.1f}%)"
            if len(recs) < target
            else ""
        )
        print(f"{run.name}")
        print(f" n_ok={len(oks)}/{len(recs)}" + progress)
        if lat:
            print(f" E2E mean={lat['mean']:.2f} p50={lat['p50']:.2f} p90={lat['p90']:.2f} p99={lat['p99']:.2f}")
        if ttft:
            print(f" TTFT mean={ttft['mean']:.2f} p50={ttft['p50']:.2f} p90={ttft['p90']:.2f} p99={ttft['p99']:.2f}")
        if tpot:
            print(f" TPOT mean={tpot['mean']*1000:.1f}ms p90={tpot['p90']*1000:.1f}ms p99={tpot['p99']*1000:.1f}ms")
        print(f" output_tokens={out:.0f} span={span:.0f}s TPS={tps:.0f}")
        if a is not None:
            print(f" producer APC={a*100:.1f}%")
|
|
|
|
|
|
# Run the summary only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|