Files
agentic-kvc/analysis/characterization/b2_sweep_analysis.py
Gahow Wang b7902061d1 Window 1 analysis: APC upper bound, B2 window-overlap, figure renderer
Three CPU-only analysis pieces that turn raw Window 1 artifacts into
publishable numbers and figures.

scripts/compute_apc_upper_bound.py
  Block-level trie walk over hash_ids to compute the theoretical APC
  ceiling on a trace, decomposed into intra-session / any-session /
  shared-prefix-only. Gives a fixed reference for what each routing
  policy could *possibly* achieve. w600 result: 79.6% intra-session,
  80.3% any-session, 0.1% shared-prefix.

analysis/characterization/b2_sweep_analysis.py (rewrite)
  Previous version used joined_analysis.interference_index() which
  labeled overlap = "any prefill in any other request during this
  decode". With short-prompt decode load this is always true
  (everyone's prefill overlaps everyone else's decode); n_overlap
  was 239/240 even in the different-worker control.

  New version labels overlap iff the decode's [t_first_token, t_finish]
  intersects an actual large *injection* window, computed from the
  cell's "prefill"-tagged metric rows. Different-worker control now
  cleanly sits at idx ≈ 1.0, same-worker scales monotonically.

analysis/characterization/render_window1_figures.py
  Renders 8 PNGs from the result JSONs: B3 latency / APC vs ceiling
  / APC vs hotspot scatter / per-worker TTFT / failure breakdown,
  B2 TPOT and TTFT curves (overlap vs clean and idx), reuse
  decomposition, KV footprint.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 23:24:54 +08:00

127 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Aggregate B2 microbench cells: same- vs different-worker prefill overlap.
For each (variant × prefill_size) cell we have:
- 240 short-prompt decode requests at qps=4
- 4 large-prompt one-token "prefill injections"
The interesting question is *not* "does any other request's prefill overlap
this decode" (the answer is always yes — every decode begins with its own
short prefill, and at qps=4 they overlap each other constantly). The
interesting question is "does an injected large prefill on the *same* worker
materially slow this decode down?".
So we:
1) extract each cell's injection windows = [(t_dispatch, t_finish)
for r in metrics if r.workload=="prefill"];
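2) label each decode request as overlap iff its request window
[t_dispatch, t_finish] (what _analyze_cell uses; it starts before
[t_first_token, t_finish] and so also covers the decode's own short
prefill) intersects at least one injection window;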
3) compute TPOT p50/p90/p99 for overlap vs clean;
4) the per-cell interference index = TPOT_p90(overlap) /
TPOT_p90(clean). For "different" variant this should hover near 1.0;
for "same" it should rise with prefill_size.
"""
from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
from analysis.characterization.joined_analysis import (
_percentile,
load_jsonl,
write_json,
)
def _overlaps(a_start: float, a_end: float, b_start: float, b_end: float) -> bool:
return a_start <= b_end and b_start <= a_end
def _analyze_cell(metrics_rows: list[dict]) -> dict:
prefills = [r for r in metrics_rows if r.get("workload") == "prefill"
and r.get("error") is None]
decodes = [r for r in metrics_rows if r.get("workload") == "decode"
and r.get("error") is None]
inj_windows: list[tuple[float, float]] = []
for p in prefills:
ts = p.get("t_dispatch_unix")
te = p.get("t_finish_unix")
if ts is None or te is None:
continue
inj_windows.append((float(ts), float(te)))
overlap_tpots: list[float] = []
clean_tpots: list[float] = []
overlap_ttfts: list[float] = []
clean_ttfts: list[float] = []
for d in decodes:
ts = d.get("t_dispatch_unix")
te = d.get("t_finish_unix")
if ts is None or te is None:
continue
is_overlap = any(_overlaps(ts, te, ws, we) for ws, we in inj_windows)
tpot = d.get("tpot_s")
ttft = d.get("ttft_s")
if tpot is not None:
(overlap_tpots if is_overlap else clean_tpots).append(float(tpot))
if ttft is not None:
(overlap_ttfts if is_overlap else clean_ttfts).append(float(ttft))
p90_overlap = _percentile(overlap_tpots, 0.90) if overlap_tpots else None
p90_clean = _percentile(clean_tpots, 0.90) if clean_tpots else None
idx = (p90_overlap / p90_clean) if (p90_overlap and p90_clean) else None
return {
"n_prefill_injections": len(prefills),
"n_decode_total": len(decodes),
"n_decode_overlap": len(overlap_tpots),
"n_decode_clean": len(clean_tpots),
"tpot_p50_overlap_s": _percentile(overlap_tpots, 0.50),
"tpot_p90_overlap_s": p90_overlap,
"tpot_p99_overlap_s": _percentile(overlap_tpots, 0.99),
"tpot_p50_clean_s": _percentile(clean_tpots, 0.50),
"tpot_p90_clean_s": p90_clean,
"tpot_p99_clean_s": _percentile(clean_tpots, 0.99),
"ttft_p90_overlap_s": _percentile(overlap_ttfts, 0.90)
if overlap_ttfts else None,
"ttft_p90_clean_s": _percentile(clean_ttfts, 0.90)
if clean_ttfts else None,
"interference_index": idx,
}
def main() -> None:
    """Aggregate every cell under ``--sweep-dir`` into one summary JSON.

    Walks ``<sweep-dir>/<variant>/p*/`` directories (skipping ``logs``), and
    for each cell that has both ``run_window.json`` and ``metrics.jsonl``
    appends one row combining the cell's window metadata with the result of
    ``_analyze_cell``. The rows are written to ``--out`` (default:
    ``<sweep-dir>/b2_sweep_summary.json``) and also printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description="B2 sweep aggregation (window-overlap)"
    )
    parser.add_argument("--sweep-dir", type=Path, required=True)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    rows: list[dict] = []
    # Filter with is_dir() explicitly: a trailing "/" in a glob pattern only
    # restricts matches to directories on Python 3.11+, and the summary file
    # written below may itself live directly inside the sweep directory.
    variant_dirs = sorted(d for d in args.sweep_dir.glob("*") if d.is_dir())
    for variant_dir in variant_dirs:
        if variant_dir.name in ("logs",):
            continue
        cell_dirs = sorted(d for d in variant_dir.glob("p*") if d.is_dir())
        for cell_dir in cell_dirs:
            window_path = cell_dir / "run_window.json"
            metrics_path = cell_dir / "metrics.jsonl"
            # Incomplete cells (missing either artifact) are skipped.
            if not window_path.exists() or not metrics_path.exists():
                continue
            window = json.loads(window_path.read_text(encoding="utf-8"))
            cell = _analyze_cell(load_jsonl(metrics_path))
            rows.append({
                "variant": variant_dir.name,
                "prefill_size": int(window["prefill_size"]),
                "decode_endpoint": window["decode_endpoint"],
                "prefill_endpoint": window["prefill_endpoint"],
                **cell,
            })

    if args.out is not None:
        out_path = args.out
    else:
        out_path = args.sweep_dir / "b2_sweep_summary.json"
    write_json(out_path, {"rows": rows})
    print(json.dumps(rows, indent=2))


if __name__ == "__main__":
    main()