Files
agentic-kvc/analysis/characterization/render_window1_figures.py
Gahow Wang 922d79ac95 Add full latency grid (mean/p50/p90/p99 × TTFT/TPOT/E2E) as f6 companion
The headline f6_e2e_latency_bars only shows p90, hiding three regimes:
  - mean: unified dominates (3.3s TTFT, 7.0s E2E vs sticky 5.6s / 12.1s)
  - p50: sticky and unified are tied on first-turn TTFT (0.5s each) —
    sticky's first turn of each session is free, after which queues
    accumulate. Unified beats sticky everywhere else.
  - p99: tail amplification reveals unified's biggest gap —
    TTFT 42.3s vs sticky 74.1s; E2E 68.8s vs sticky 139.7s.

The 12-panel figure is the honest full picture; the 3-panel headline
stays for slide-friendly summary.

- analysis/characterization/window_1_results/raw_stats/{policy}.json:
  cached ttft/tpot/e2e {mean,p50,p90,p99} pulled from dash0
  /home/admin/cpfs/wjh/agentic-kv/outputs/b3_sweep_20260525_095043/
  (b3_policy_comparison.json doesn't record mean, only percentiles).
- analysis/characterization/render_window1_figures.py:
  new fig_b3_latency_full_grid renders the 4×3 grid from the cache.
- figs/f6_e2e_latency_full_grid.png: 12-panel companion.
- PAPER_OUTLINE.md §5.2: both figures embedded; main table column
  renamed from "Hotspot idx" to "Worker p90 (median / max)" to match
  the new metric convention.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 11:15:18 +08:00

369 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Render PNG figures for Window 1 results (B1', B2, B3).
Inputs (all expected under <results-dir>):
- b3_policy_comparison.json (per-policy table)
- b2_sweep_summary.json (per-cell B2 sweep)
- apc_upper_w600.json (theoretical bounds)
- lmetric_reuse.json (intra/cross/shared decomp)
- kv_footprint_summary.json (full trace KV stats)
- raw_stats/<policy>.json (optional; per-policy mean/p50/p90/p99 cache)
- per_worker_<policy>.json (optional; per-worker TTFT p90)
Outputs (under <out-dir>):
- fig_b3_apc_vs_hotspot.png
- fig_b3_latency_bars.png
- fig_b3_latency_full_grid.png (only when raw_stats/ holds at least one policy)
- fig_b3_apc_vs_upper.png
- fig_b3_failure_breakdown.png
- fig_b3_per_worker_ttft_p90.png
- fig_b2_tpot_vs_prefill.png
- fig_b2_ttft_vs_prefill.png
- fig_reuse_decomposition.png
- fig_kv_footprint_cdf.png
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Left-to-right ordering of policies in every per-policy figure. main() may
# rebind this list to drop policies named in --exclude-policies.
POLICY_ORDER = ["lmetric", "load_only", "sticky", "unified", "capped"]

# Fixed colour per policy so a policy is drawn the same way in every figure.
# Callers fall back to "gray" for a policy that has no entry here.
POLICY_COLOR = dict(
    lmetric="#1f77b4",
    load_only="#ff7f0e",
    sticky="#d62728",
    unified="#2ca02c",
    capped="#9467bd",
)
def _load(results_dir: Path, name: str) -> dict:
    """Read the file ``name`` inside ``results_dir`` and return its parsed JSON."""
    payload = results_dir.joinpath(name).read_text()
    return json.loads(payload)
def fig_b3_apc_vs_hotspot(comp: dict, upper: dict, out: Path) -> None:
    """Scatter achieved APC against the TTFT-p90 hotspot index, one point per policy.

    Args:
        comp: parsed policy comparison; ``comp["rows"]`` is iterated and each
            plotted row supplies ``policy``, ``apc_ratio`` (multiplied by 100
            for the x axis) and ``hotspot_index_ttft_p90``.
        upper: parsed APC bounds; only ``apc_upper_intra_session`` is read and
            drawn as a dashed vertical line.
        out: path the PNG is saved to.

    Rows whose policy is not in ``POLICY_ORDER`` are skipped.
    """
    upper_intra = upper["apc_upper_intra_session"]
    fig, ax = plt.subplots(figsize=(6, 4.5))
    for r in comp["rows"]:
        pol = r["policy"]
        if pol not in POLICY_ORDER:
            continue
        x = r["apc_ratio"] * 100
        y = r["hotspot_index_ttft_p90"]
        # Each point is named by its annotation, so it carries no legend
        # label; that keeps the legend down to the ceiling line alone.
        ax.scatter(x, y, s=180, color=POLICY_COLOR.get(pol, "gray"),
                   edgecolors="black", linewidths=0.5)
        ax.annotate(pol, (x, y), xytext=(7, 7), textcoords="offset points",
                    fontsize=9)
    ax.axvline(upper_intra * 100, linestyle="--", color="gray", alpha=0.6,
               label=f"intra-session APC upper {upper_intra * 100:.1f}%")
    ax.set_xlabel("APC achieved (%)")
    ax.set_ylabel("hotspot_index = max(worker TTFT p90) / median")
    ax.set_title("B3: APC vs hot-spot tradeoff across policies")
    ax.grid(alpha=0.3)
    # Without a legend the ceiling line's label (and its numeric value) was
    # never rendered anywhere on the figure.
    ax.legend(fontsize=9, loc="best")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_latency_bars(comp: dict, out: Path) -> None:
    """Draw three side-by-side bar panels of p90 TTFT, TPOT and E2E per policy.

    Args:
        comp: parsed policy comparison; rows are keyed by ``policy`` and
            supply ``ttft_p90_s``, ``tpot_p90_s`` and ``e2e_p90_s``.
        out: path the PNG is saved to.

    Policies appear in ``POLICY_ORDER`` order; ones absent from ``comp`` are
    left out. The TPOT value is multiplied by 1000 before plotting.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in policies]
    # (panel title, row key, multiplier applied before plotting)
    panels = (
        ("TTFT p90 (s)", "ttft_p90_s", 1),
        ("TPOT p90 (ms)", "tpot_p90_s", 1000),
        ("E2E p90 (s)", "e2e_p90_s", 1),
    )
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    for ax, (title, key, factor) in zip(axes, panels):
        heights = [rows_by_policy[name][key] * factor for name in policies]
        ax.bar(policies, heights, color=bar_colors,
               edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=20)
        # Print each bar's value just above it.
        for pos, height in enumerate(heights):
            ax.text(pos, height, f"{height:.1f}", ha="center", va="bottom",
                    fontsize=9)
        ax.grid(alpha=0.3, axis="y")
    fig.suptitle("B3 headline latencies per policy")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_latency_full_grid(results_dir: Path, out: Path) -> None:
    """Render a 4 x 3 bar grid: mean / p50 / p90 / p99 rows by TTFT / TPOT / E2E columns.

    Data comes from ``<results_dir>/raw_stats/<policy>.json``, one file per
    policy, each indexed as ``stats[metric][aggregate]`` with metric in
    ``ttft`` / ``tpot`` / ``e2e``. These caches carry the mean as well as
    the percentiles (b3_policy_comparison.json does not record mean).

    Args:
        results_dir: directory that contains the ``raw_stats`` folder.
        out: path the PNG is saved to.

    Only policies in ``POLICY_ORDER`` that have a cache file are drawn. When
    none has one, the function returns without writing anything.
    """
    cache_dir = results_dir / "raw_stats"
    policies = [name for name in POLICY_ORDER
                if (cache_dir / f"{name}.json").exists()]
    if len(policies) == 0:
        return
    per_policy = {}
    for name in policies:
        per_policy[name] = json.loads((cache_dir / f"{name}.json").read_text())
    # Each aggregate name is both the row label and the key inside the cache.
    aggregates = ("mean", "p50", "p90", "p99")
    # (column title, metric key, multiplier applied before plotting)
    columns = (
        ("TTFT (s)", "ttft", 1.0),
        ("TPOT (ms)", "tpot", 1000.0),
        ("E2E (s)", "e2e", 1.0),
    )
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in policies]
    fig, axes = plt.subplots(len(aggregates), len(columns), figsize=(11, 11),
                             sharex=True)
    for row_idx, agg in enumerate(aggregates):
        for col_idx, (heading, metric, factor) in enumerate(columns):
            ax = axes[row_idx][col_idx]
            heights = [per_policy[name][metric][agg] * factor
                       for name in policies]
            ax.bar(policies, heights, color=bar_colors,
                   edgecolor="black", linewidth=0.5)
            for pos, height in enumerate(heights):
                ax.text(pos, height, f"{height:.1f}", ha="center",
                        va="bottom", fontsize=8)
            # Row labels go on the first column only, headings on the first row only.
            if col_idx == 0:
                ax.set_ylabel(agg, fontsize=11)
            if row_idx == 0:
                ax.set_title(heading, fontsize=11)
            ax.grid(alpha=0.3, axis="y")
            ax.tick_params(axis="x", rotation=20, labelsize=9)
            # Extra vertical margin, giving the value labels room above the bars.
            ax.margins(y=0.18)
    fig.suptitle("B3 latencies per policy — mean / p50 / p90 / p99")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_apc_vs_upper(comp: dict, upper: dict, out: Path) -> None:
    """Bar chart of achieved APC (%) per policy with two ceiling lines overlaid.

    Args:
        comp: parsed policy comparison; each row supplies ``policy`` and
            ``apc_ratio`` (multiplied by 100 for plotting).
        upper: parsed APC bounds; ``apc_upper_intra_session`` is drawn dashed
            and ``apc_upper_any_session`` dotted.
        out: path the PNG is saved to.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]
    achieved_pct = [rows_by_policy[name]["apc_ratio"] * 100 for name in policies]
    fig, ax = plt.subplots(figsize=(6.5, 4))
    rects = ax.bar(
        policies,
        achieved_pct,
        color=[POLICY_COLOR.get(name, "gray") for name in policies],
        edgecolor="black",
        linewidth=0.5,
    )
    intra_label = (
        f"intra-session ceiling {upper['apc_upper_intra_session'] * 100:.1f}%"
    )
    ax.axhline(upper["apc_upper_intra_session"] * 100, linestyle="--",
               color="black", alpha=0.7, label=intra_label)
    any_label = f"any-session ceiling {upper['apc_upper_any_session'] * 100:.1f}%"
    ax.axhline(upper["apc_upper_any_session"] * 100, linestyle=":",
               color="darkgray", alpha=0.7, label=any_label)
    # Value label centred over each bar, one unit above its top.
    for rect, pct in zip(rects, achieved_pct):
        centre_x = rect.get_x() + rect.get_width() / 2
        ax.text(centre_x, pct + 1, f"{pct:.1f}%", ha="center", fontsize=9)
    ax.set_ylim(0, 100)
    ax.set_ylabel("APC ratio (%)")
    ax.set_title("B3: APC achieved vs theoretical ceiling")
    ax.legend(loc="upper right", fontsize=9)
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_failure_breakdown(comp: dict, out: Path) -> None:
    """Stacked bars of slow-request counts per policy, split by cause.

    Args:
        comp: parsed policy comparison; each row supplies ``policy`` and may
            carry a ``failure_counts`` mapping of cause name to count. A
            missing or empty mapping, or a missing cause, counts as 0.
        out: path the PNG is saved to.

    The stacked total is printed above each bar as ``n=<total>``.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]
    # Stacking order (bottom to top) paired with each segment's colour.
    cause_palette = (
        ("same_worker_prefill_overlap", "#d62728"),
        ("hot_worker_queue", "#ff7f0e"),
        ("cache_miss_large_append", "#1f77b4"),
        ("high_kv_occupancy", "#8c564b"),
        ("unknown", "#7f7f7f"),
    )
    fig, ax = plt.subplots(figsize=(7, 4.5))
    stack_tops = [0.0] * len(policies)
    for cause, colour in cause_palette:
        counts = []
        for name in policies:
            per_cause = rows_by_policy[name].get("failure_counts") or {}
            counts.append(per_cause.get(cause, 0))
        ax.bar(policies, counts, bottom=stack_tops,
               label=cause.replace("_", " "), color=colour,
               edgecolor="black", linewidth=0.3)
        stack_tops = [top + count for top, count in zip(stack_tops, counts)]
    for pos, total in enumerate(stack_tops):
        ax.text(pos, total + 3, f"n={int(total)}", ha="center", fontsize=9)
    ax.set_ylabel("slow request count (TTFT > 2× p90 threshold)")
    ax.set_title("B3: slow-request cause breakdown per policy")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_per_worker_ttft(results_dir: Path, comp: dict, out: Path) -> None:
    """Per-worker TTFT p90 grouped bars; title shows median + max worker p90.

    We deliberately do NOT report a max/median 'hotspot index' here: it is a
    ratio and treats unified (most workers fast, one hot) as worse than
    sticky (all workers slow), which inverts the actual user-facing p90.

    Args:
        results_dir: directory holding ``per_worker_<policy>.json`` files,
            whose ``per_worker_ttft_p90_s`` entry maps a worker key to a value.
        comp: parsed policy comparison; only used to decide which policies
            from ``POLICY_ORDER`` get a panel.
        out: path the PNG is saved to.

    A policy whose file is missing, or whose per-worker mapping is missing or
    empty, gets a "<policy>: no data" placeholder panel. When no policy
    qualifies at all, the function returns without writing a figure.
    """
    import statistics

    by = {r["policy"]: r for r in comp["rows"]}
    pols = [p for p in POLICY_ORDER if p in by]
    if not pols:
        # A zero-column subplot grid is rejected by matplotlib and axes[0]
        # below would not exist, so there is nothing to draw.
        return
    fig, axes = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4),
                             sharey=True)
    if len(pols) == 1:
        # With a single column plt.subplots returns a bare Axes, not an array.
        axes = [axes]
    for ax, pol in zip(axes, pols):
        path = results_dir / f"per_worker_{pol}.json"
        per = {}
        if path.exists():
            per = json.loads(path.read_text()).get("per_worker_ttft_p90_s") or {}
        if not per:
            # An empty mapping would otherwise crash statistics.median() and
            # max() below, so it is shown like a missing file.
            ax.text(0.5, 0.5, f"{pol}: no data", ha="center", va="center",
                    transform=ax.transAxes)
            continue
        # Worker keys are ordered by the integer after their last ':'
        # (presumably a port; TODO confirm). Labels show that number minus 8000.
        items = sorted(per.items(), key=lambda kv: int(kv[0].rsplit(":", 1)[1]))
        labels = [f"e{int(k.rsplit(':', 1)[1]) - 8000}" for k, _ in items]
        vals = [v for _, v in items]
        ax.bar(labels, vals, color=POLICY_COLOR.get(pol, "gray"),
               edgecolor="black", linewidth=0.5)
        for i, v in enumerate(vals):
            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
        median_v = statistics.median(vals)
        max_v = max(vals)
        ax.set_title(
            f"{pol}\nmedian {median_v:.1f}s · max {max_v:.1f}s",
            fontsize=10,
        )
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(alpha=0.3, axis="y")
    axes[0].set_ylabel("worker TTFT p90 (s)")
    fig.suptitle("B3 per-worker TTFT p90 distribution")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b2_curves(b2: dict, out_tpot: Path, out_ttft: Path) -> None:
    """Render the B2 prefill-interference figures, one for TPOT and one for TTFT.

    Each figure has two panels: absolute p90 (overlap vs clean) on the left
    and the interference index ``overlap / clean`` on the right, both against
    prefill injection size, with one curve per variant ("different", "same").

    Args:
        b2: parsed B2 sweep summary; each entry of ``b2["rows"]`` supplies
            ``prefill_size``, ``variant`` and the
            ``{tpot,ttft}_p90_{overlap,clean}_s`` values.
        out_tpot: path the TPOT PNG is saved to.
        out_ttft: path the TTFT PNG is saved to.

    Points where either the overlap or the clean value is missing are
    skipped. A clean value of 0 keeps its absolute point but gets a NaN
    index, since the ratio is undefined.
    """
    sizes = sorted({r["prefill_size"] for r in b2["rows"]})
    by_var = {"same": {}, "different": {}}
    for r in b2["rows"]:
        by_var[r["variant"]][r["prefill_size"]] = r
    for name, key, ylabel, ymax_log, out in [
        ("TPOT", "tpot_p90", "TPOT p90 (ms)", True, out_tpot),
        ("TTFT", "ttft_p90", "TTFT p90 (s)", True, out_ttft),
    ]:
        # TPOT is multiplied by 1000 to match its "(ms)" axis label.
        # The factor is fixed per figure, so it is computed once here.
        scale = 1000 if name == "TPOT" else 1.0
        fig, axes = plt.subplots(1, 2, figsize=(11, 4))
        ax_abs, ax_idx = axes
        for variant in ("different", "same"):
            xs, ys_o, ys_c, idxs = [], [], [], []
            for sz in sizes:
                r = by_var[variant].get(sz)
                if not r:
                    continue
                ov = r.get(f"{key}_overlap_s")
                cl = r.get(f"{key}_clean_s")
                if ov is None or cl is None:
                    continue
                xs.append(sz)
                ys_o.append(ov * scale)
                ys_c.append(cl * scale)
                # A zero clean baseline used to raise ZeroDivisionError;
                # NaN leaves a gap in the index curve instead.
                idxs.append(ov / cl if cl else float("nan"))
            color = "#d62728" if variant == "same" else "#1f77b4"
            ax_abs.plot(xs, ys_o, "o-", color=color,
                        label=f"{variant} (overlap)")
            ax_abs.plot(xs, ys_c, "s--", color=color, alpha=0.5,
                        label=f"{variant} (clean)")
            ax_idx.plot(xs, idxs, "o-", color=color, label=variant,
                        linewidth=2)
        ax_abs.set_xscale("log", base=2)
        ax_abs.set_yscale("log")
        ax_abs.set_xlabel("prefill injection size (tokens)")
        ax_abs.set_ylabel(ylabel + " (log)")
        ax_abs.set_title(f"B2 {name} absolute (overlap vs clean)")
        ax_abs.legend(fontsize=8)
        ax_abs.grid(alpha=0.3, which="both")
        ax_idx.set_xscale("log", base=2)
        if ymax_log:
            ax_idx.set_yscale("log")
        # Reference line at index 1.0, where overlap equals clean.
        ax_idx.axhline(1.0, color="black", linestyle=":", alpha=0.5)
        ax_idx.set_xlabel("prefill injection size (tokens)")
        ax_idx.set_ylabel(f"{name} idx = overlap / clean")
        ax_idx.set_title(f"B2 {name} interference index (same vs different worker)")
        ax_idx.legend()
        ax_idx.grid(alpha=0.3, which="both")
        fig.tight_layout()
        fig.savefig(out, dpi=120)
        plt.close(fig)
def fig_reuse_decomposition(reuse: dict, out: Path) -> None:
    """Single horizontal stacked bar splitting cached-token reuse by category.

    Args:
        reuse: parsed reuse summary; ``reuse["fractions"]`` may hold
            ``intra``, ``cross``, ``shared`` and ``unclassified`` values.
            A missing mapping or key is treated as 0.
        out: path the PNG is saved to.
    """
    fractions = reuse.get("fractions") or {}
    # (legend name, key in ``fractions``, colour), in stacking order
    segments = (
        ("intra-session", "intra", "#2ca02c"),
        ("cross-session", "cross", "#ff7f0e"),
        ("shared-prefix", "shared", "#9467bd"),
        ("unclassified", "unclassified", "#7f7f7f"),
    )
    fig, ax = plt.subplots(figsize=(6, 3))
    offset = 0.0
    for legend_name, key, colour in segments:
        share = fractions.get(key, 0)
        ax.barh(["lmetric run"], [share], left=[offset], color=colour,
                edgecolor="black", linewidth=0.5,
                label=f"{legend_name} ({share * 100:.1f}%)")
        # Each segment starts where the previous one ended.
        offset += share
    ax.set_xlabel("fraction of cached_tokens")
    ax.set_xlim(0, 1)
    ax.set_title("Real reuse decomposition (w600 lmetric run)")
    ax.legend(fontsize=9, loc="lower right")
    ax.grid(alpha=0.3, axis="x")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_kv_footprint_cdf(kv: dict, out: Path) -> None:
    """Bar chart of per-request KV footprint at p50 / p90 / p95 / p99.

    Args:
        kv: parsed KV footprint summary; ``kv["kv_mib_per_request"]`` may
            hold ``p50``, ``p90``, ``p95`` and ``p99`` values, plotted as MiB.
        out: path the PNG is saved to.

    Percentiles that are absent or ``None`` are left out of the chart. A red
    dashed reference line is drawn at ``95 * 1024``.
    """
    s = kv.get("kv_mib_per_request") or {}
    # Keep only the percentiles that have a value: a None height used to
    # break both ax.bar and the "{v:.0f}" label formatting.
    present = [(k, s[k]) for k in ("p50", "p90", "p95", "p99")
               if s.get(k) is not None]
    labels = [k for k, _ in present]
    vals = [v for _, v in present]
    fig, ax = plt.subplots(figsize=(6, 3.5))
    ax.bar(labels, vals, color="#1f77b4", edgecolor="black", linewidth=0.5)
    for i, v in enumerate(vals):
        ax.text(i, v, f"{v:.0f} MiB", ha="center", va="bottom", fontsize=9)
    ax.axhline(95 * 1024, color="red", linestyle="--", alpha=0.5,
               label="H20 ~95 GiB usable")
    ax.set_ylabel("KV bytes per request (MiB)")
    ax.set_title("B1' Per-request KV footprint (Qwen3-Coder-30B-A3B, 98304 B/token)")
    ax.legend()
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def main() -> None:
    """Parse CLI arguments, load the result JSON files and render every figure.

    Flags:
        --results-dir: directory containing the input JSON files (required).
        --out-dir: directory the PNGs are written to; created if missing
            (required).
        --exclude-policies: comma-separated policies removed from
            ``POLICY_ORDER`` before rendering.

    Side effect: rebinds the module-level ``POLICY_ORDER`` when policies are
    excluded.
    """
    global POLICY_ORDER

    p = argparse.ArgumentParser()
    p.add_argument("--results-dir", type=Path, required=True)
    p.add_argument("--out-dir", type=Path, required=True)
    p.add_argument("--exclude-policies", default="",
                   help="Comma-separated policies to drop from per-policy figures")
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
    if excluded:
        POLICY_ORDER = [p for p in POLICY_ORDER if p not in excluded]
        print(f"excluding policies: {sorted(excluded)} -> kept {POLICY_ORDER}")
    comp = _load(args.results_dir, "b3_policy_comparison.json")
    upper = _load(args.results_dir, "apc_upper_w600.json")
    b2 = _load(args.results_dir, "b2_sweep_summary.json")
    reuse = _load(args.results_dir, "lmetric_reuse.json")
    kv = _load(args.results_dir, "kv_footprint_summary.json")

    out_dir = args.out_dir
    apc_vs_hotspot = out_dir / "fig_b3_apc_vs_hotspot.png"
    latency_bars = out_dir / "fig_b3_latency_bars.png"
    latency_full_grid = out_dir / "fig_b3_latency_full_grid.png"
    apc_vs_upper = out_dir / "fig_b3_apc_vs_upper.png"
    failure_breakdown = out_dir / "fig_b3_failure_breakdown.png"
    per_worker_ttft = out_dir / "fig_b3_per_worker_ttft_p90.png"
    b2_tpot = out_dir / "fig_b2_tpot_vs_prefill.png"
    b2_ttft = out_dir / "fig_b2_ttft_vs_prefill.png"
    reuse_decomposition = out_dir / "fig_reuse_decomposition.png"
    kv_footprint = out_dir / "fig_kv_footprint_cdf.png"

    fig_b3_apc_vs_hotspot(comp, upper, apc_vs_hotspot)
    fig_b3_latency_bars(comp, latency_bars)
    fig_b3_latency_full_grid(args.results_dir, latency_full_grid)
    fig_b3_apc_vs_upper(comp, upper, apc_vs_upper)
    fig_b3_failure_breakdown(comp, failure_breakdown)
    fig_b3_per_worker_ttft(args.results_dir, comp, per_worker_ttft)
    fig_b2_curves(b2, b2_tpot, b2_ttft)
    fig_reuse_decomposition(reuse, reuse_decomposition)
    fig_kv_footprint_cdf(kv, kv_footprint)

    # The old hard-coded "8" was wrong: ten outputs are targeted and the
    # full-grid figure is skipped when raw_stats/ is empty. Count the target
    # PNGs that exist after rendering instead.
    targets = [
        apc_vs_hotspot, latency_bars, latency_full_grid, apc_vs_upper,
        failure_breakdown, per_worker_ttft, b2_tpot, b2_ttft,
        reuse_decomposition, kv_footprint,
    ]
    written = [path for path in targets if path.exists()]
    print(f"wrote {len(written)} figures to {args.out_dir}")


if __name__ == "__main__":
    main()