Three CPU-only analysis pieces that turn raw Window 1 artifacts into publishable numbers and figures. scripts/compute_apc_upper_bound.py Block-level trie walk over hash_ids to compute the theoretical APC ceiling on a trace, decomposed into intra-session / any-session / shared-prefix-only. Gives a fixed reference for what each routing policy could *possibly* achieve. w600 result: 79.6% intra-session, 80.3% any-session, 0.1% shared-prefix. analysis/characterization/b2_sweep_analysis.py (rewrite) Previous version used joined_analysis.interference_index() which labeled overlap = "any prefill in any other request during this decode". With short-prompt decode load this is always true (everyone's prefill overlaps everyone else's decode); n_overlap was 239/240 even in the different-worker control. New version labels overlap iff the decode's [t_first_token, t_finish] intersects an actual large *injection* window, computed from the cell's "prefill"-tagged metric rows. Different-worker control now cleanly sits at idx ≈ 1.0, same-worker scales monotonically. analysis/characterization/render_window1_figures.py Renders 8 PNGs from the result JSONs: B3 latency / APC vs ceiling / APC vs hotspot scatter / per-worker TTFT / failure breakdown, B2 TPOT and TTFT curves (overlap vs clean and idx), reuse decomposition, KV footprint. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
127 lines
4.8 KiB
Python
127 lines
4.8 KiB
Python
"""Aggregate B2 microbench cells: same- vs different-worker prefill overlap.
|
||
|
||
For each (variant × prefill_size) cell we have:
|
||
- 240 short-prompt decode requests at qps=4
|
||
- 4 large-prompt one-token "prefill injections"
|
||
|
||
The interesting question is *not* "does any other request's prefill overlap
|
||
this decode" (the answer is always yes — every decode begins with its own
|
||
short prefill, and at qps=4 they overlap each other constantly). The
|
||
interesting question is "does an injected large prefill on the *same* worker
|
||
materially slow this decode down?".
|
||
|
||
So we:
|
||
1) extract each cell's injection windows = [(t_dispatch, t_finish)
|
||
for r in metrics if r.workload=="prefill"];
|
||
2) label each decode request as overlap iff its
|
||
[t_first_token, t_finish] intersects at least one injection window;
|
||
3) compute TPOT p50/p90/p99 for overlap vs clean;
|
||
4) the per-cell interference index = TPOT_p90(overlap) /
|
||
TPOT_p90(clean). For "different" variant this should hover near 1.0;
|
||
for "same" it should rise with prefill_size.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
|
||
from analysis.characterization.joined_analysis import (
|
||
_percentile,
|
||
load_jsonl,
|
||
write_json,
|
||
)
|
||
|
||
|
||
def _overlaps(a_start: float, a_end: float, b_start: float, b_end: float) -> bool:
|
||
return a_start <= b_end and b_start <= a_end
|
||
|
||
|
||
def _analyze_cell(metrics_rows: list[dict]) -> dict:
|
||
prefills = [r for r in metrics_rows if r.get("workload") == "prefill"
|
||
and r.get("error") is None]
|
||
decodes = [r for r in metrics_rows if r.get("workload") == "decode"
|
||
and r.get("error") is None]
|
||
|
||
inj_windows: list[tuple[float, float]] = []
|
||
for p in prefills:
|
||
ts = p.get("t_dispatch_unix")
|
||
te = p.get("t_finish_unix")
|
||
if ts is None or te is None:
|
||
continue
|
||
inj_windows.append((float(ts), float(te)))
|
||
|
||
overlap_tpots: list[float] = []
|
||
clean_tpots: list[float] = []
|
||
overlap_ttfts: list[float] = []
|
||
clean_ttfts: list[float] = []
|
||
for d in decodes:
|
||
ts = d.get("t_dispatch_unix")
|
||
te = d.get("t_finish_unix")
|
||
if ts is None or te is None:
|
||
continue
|
||
is_overlap = any(_overlaps(ts, te, ws, we) for ws, we in inj_windows)
|
||
tpot = d.get("tpot_s")
|
||
ttft = d.get("ttft_s")
|
||
if tpot is not None:
|
||
(overlap_tpots if is_overlap else clean_tpots).append(float(tpot))
|
||
if ttft is not None:
|
||
(overlap_ttfts if is_overlap else clean_ttfts).append(float(ttft))
|
||
|
||
p90_overlap = _percentile(overlap_tpots, 0.90) if overlap_tpots else None
|
||
p90_clean = _percentile(clean_tpots, 0.90) if clean_tpots else None
|
||
idx = (p90_overlap / p90_clean) if (p90_overlap and p90_clean) else None
|
||
return {
|
||
"n_prefill_injections": len(prefills),
|
||
"n_decode_total": len(decodes),
|
||
"n_decode_overlap": len(overlap_tpots),
|
||
"n_decode_clean": len(clean_tpots),
|
||
"tpot_p50_overlap_s": _percentile(overlap_tpots, 0.50),
|
||
"tpot_p90_overlap_s": p90_overlap,
|
||
"tpot_p99_overlap_s": _percentile(overlap_tpots, 0.99),
|
||
"tpot_p50_clean_s": _percentile(clean_tpots, 0.50),
|
||
"tpot_p90_clean_s": p90_clean,
|
||
"tpot_p99_clean_s": _percentile(clean_tpots, 0.99),
|
||
"ttft_p90_overlap_s": _percentile(overlap_ttfts, 0.90)
|
||
if overlap_ttfts else None,
|
||
"ttft_p90_clean_s": _percentile(clean_ttfts, 0.90)
|
||
if clean_ttfts else None,
|
||
"interference_index": idx,
|
||
}
|
||
|
||
|
||
def main() -> None:
    """Aggregate every cell under ``--sweep-dir`` into one JSON summary.

    Walks ``<sweep-dir>/<variant>/p*/``, skipping the ``logs`` directory and
    any cell that lacks ``run_window.json`` or ``metrics.jsonl``. Each
    remaining cell is analysed with ``_analyze_cell`` and merged with the
    variant name plus the ``prefill_size``, ``decode_endpoint`` and
    ``prefill_endpoint`` fields read from ``run_window.json``. The result is
    written to ``--out`` (default ``<sweep-dir>/b2_sweep_summary.json``) and
    the rows are echoed to stdout.
    """
    parser = argparse.ArgumentParser(description="B2 sweep aggregation (window-overlap)")
    parser.add_argument("--sweep-dir", type=Path, required=True)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    rows: list[dict] = []
    for variant_dir in sorted(args.sweep_dir.glob("*/")):
        # A trailing "/" in the pattern only restricts matches to directories
        # from Python 3.11 on; check explicitly so older interpreters behave
        # the same way.
        if not variant_dir.is_dir() or variant_dir.name in ("logs",):
            continue
        for cell_dir in sorted(variant_dir.glob("p*/")):
            window_path = cell_dir / "run_window.json"
            metrics_path = cell_dir / "metrics.jsonl"
            if not window_path.exists() or not metrics_path.exists():
                continue
            # Decode as UTF-8 explicitly instead of relying on the locale's
            # default encoding.
            window = json.loads(window_path.read_text(encoding="utf-8"))
            metrics_rows = load_jsonl(metrics_path)
            cell = _analyze_cell(metrics_rows)
            rows.append({
                "variant": variant_dir.name,
                "prefill_size": int(window["prefill_size"]),
                "decode_endpoint": window["decode_endpoint"],
                "prefill_endpoint": window["prefill_endpoint"],
                **cell,
            })

    out_path = args.out or args.sweep_dir / "b2_sweep_summary.json"
    write_json(out_path, {"rows": rows})
    print(json.dumps(rows, indent=2))


if __name__ == "__main__":
    main()
|