Files
agentic-kvc/analysis/characterization/render_window1_figures.py
Gahow Wang 09ff1069c3 Drop 'capped' from per-policy figures (f4a, f4c×2, f6)
'capped' is not a routing policy — it's lmetric run on a separately
truncated trace (sessions capped to 8 turns via build_capped_trace.py).
Putting it alongside lmetric/load_only/sticky/unified in per-policy
comparison figures is misleading because the workload differs, not
the routing decision. Comparing apples to a different-trace orange
inflates/deflates apparent policy gaps for the wrong reasons.

Regenerated 4 figures with --exclude-policies capped on
analysis/characterization/render_window1_figures.py:
  - f4a_apc_loss.png                 (APC bars)
  - f4c_apc_vs_hotspot_tradeoff.png  (APC vs hotspot scatter)
  - f4c_per_worker_ttft.png          (per-worker TTFT panel)
  - f6_e2e_latency_bars.png          (TTFT/TPOT/E2E bars)

Added --exclude-policies CLI flag to the renderer so this is a
reversible choice, not a permanent script mutation. capped data remains
in b3_policy_comparison.json and can be brought back in workload-
sensitivity sections (where it actually belongs) by omitting the flag.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 10:57:43 +08:00

314 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Render PNG figures for Window 1 results (B1', B2, B3).
Inputs (all expected under <results-dir>):
- b3_policy_comparison.json (per-policy table)
- b2_sweep_summary.json (per-cell B2 sweep)
- apc_upper_w600.json (theoretical bounds)
- lmetric_reuse.json (intra/cross/shared decomp)
- kv_footprint_summary.json (full trace KV stats)
Outputs (under <out-dir>):
- fig_b3_apc_vs_hotspot.png
- fig_b3_latency_bars.png
- fig_b3_apc_vs_upper.png
- fig_b3_failure_breakdown.png
- fig_b3_per_worker_ttft_p90.png
- fig_b2_tpot_vs_prefill.png
- fig_b2_ttft_vs_prefill.png
- fig_reuse_decomposition.png
- fig_kv_footprint_cdf.png
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Colour assigned to each policy in every per-policy figure.  Insertion order
# is significant: it doubles as the left-to-right display order below.
POLICY_COLOR = {
    "lmetric": "#1f77b4",
    "load_only": "#ff7f0e",
    "sticky": "#d62728",
    "unified": "#2ca02c",
    "capped": "#9467bd",
}

# Policies drawn in per-policy figures, in display order.  main() rebinds this
# list when --exclude-policies is given.
POLICY_ORDER = list(POLICY_COLOR)
def _load(results_dir: Path, name: str) -> dict:
    """Parse the JSON file ``name`` located directly under ``results_dir``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    return json.loads((results_dir / name).read_text(encoding="utf-8"))
def fig_b3_apc_vs_hotspot(comp: dict, upper: dict, out: Path) -> None:
    """Scatter each policy's APC ratio against its TTFT-p90 hotspot index.

    Args:
        comp: Parsed policy-comparison table.  Every entry of ``comp["rows"]``
            must provide ``policy``, ``apc_ratio`` and
            ``hotspot_index_ttft_p90``.
        upper: Parsed bounds; ``upper["apc_upper_intra_session"]`` is drawn as
            a vertical reference line.
        out: Destination path of the PNG.

    Rows whose policy is not listed in ``POLICY_ORDER`` are skipped.  Points
    are drawn in the order they appear in ``comp["rows"]``.
    """
    upper_intra_pct = upper["apc_upper_intra_session"] * 100
    fig, ax = plt.subplots(figsize=(6, 4.5))
    for row in comp["rows"]:
        policy = row["policy"]
        if policy not in POLICY_ORDER:
            continue
        x = row["apc_ratio"] * 100
        y = row["hotspot_index_ttft_p90"]
        # No legend label on the points: each one is annotated with its policy
        # name directly, so the legend is reserved for the reference line.
        ax.scatter(x, y, s=180, color=POLICY_COLOR.get(policy, "gray"),
                   edgecolors="black", linewidths=0.5)
        ax.annotate(policy, (x, y), xytext=(7, 7),
                    textcoords="offset points", fontsize=9)
    ax.axvline(upper_intra_pct, linestyle="--", color="gray", alpha=0.6,
               label=f"intra-session APC upper {upper_intra_pct:.1f}%")
    ax.set_xlabel("APC achieved (%)")
    ax.set_ylabel("hotspot_index = max(worker TTFT p90) / median")
    ax.set_title("B3: APC vs hot-spot tradeoff across policies")
    # The reference line's label carries the numeric bound; without a legend
    # it was never rendered.
    ax.legend(fontsize=9, loc="best")
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_latency_bars(comp: dict, out: Path) -> None:
    """Draw three side-by-side bar panels (TTFT, TPOT, E2E p90) per policy.

    Args:
        comp: Parsed policy-comparison table; rows are looked up by their
            ``policy`` field and must provide ``ttft_p90_s``, ``tpot_p90_s``
            and ``e2e_p90_s``.
        out: Destination path of the PNG.

    Only policies present both in ``POLICY_ORDER`` and in ``comp`` are drawn,
    in ``POLICY_ORDER`` order.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in policies]

    # (panel title, row key, multiplier applied before plotting).
    # TPOT is stored under a "_s" key and shown in milliseconds.
    panels = (
        ("TTFT p90 (s)", "ttft_p90_s", 1),
        ("TPOT p90 (ms)", "tpot_p90_s", 1000),
        ("E2E p90 (s)", "e2e_p90_s", 1),
    )

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    for ax, (title, key, factor) in zip(axes, panels):
        heights = [rows_by_policy[name][key] * factor for name in policies]
        ax.bar(policies, heights, color=bar_colors,
               edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=20)
        # Print the numeric value just above each bar.
        for pos, height in enumerate(heights):
            ax.text(pos, height, f"{height:.1f}",
                    ha="center", va="bottom", fontsize=9)
        ax.grid(alpha=0.3, axis="y")

    fig.suptitle("B3 headline latencies per policy")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_apc_vs_upper(comp: dict, upper: dict, out: Path) -> None:
    """Plot the APC ratio achieved by each policy against two ceilings.

    Args:
        comp: Parsed policy-comparison table; each row supplies ``policy``
            and ``apc_ratio``.
        upper: Parsed bounds providing ``apc_upper_intra_session`` and
            ``apc_upper_any_session``; each is drawn as a horizontal line.
        out: Destination path of the PNG.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]
    achieved_pct = [rows_by_policy[name]["apc_ratio"] * 100 for name in policies]

    fig, ax = plt.subplots(figsize=(6.5, 4))
    bars = ax.bar(
        policies,
        achieved_pct,
        color=[POLICY_COLOR.get(name, "gray") for name in policies],
        edgecolor="black",
        linewidth=0.5,
    )

    # (bounds key, legend caption, line style, line colour)
    ceilings = (
        ("apc_upper_intra_session", "intra-session ceiling", "--", "black"),
        ("apc_upper_any_session", "any-session ceiling", ":", "darkgray"),
    )
    for key, caption, dash, shade in ceilings:
        level = upper[key] * 100
        ax.axhline(level, linestyle=dash, color=shade, alpha=0.7,
                   label=f"{caption} {level:.1f}%")

    # Percentage printed one unit above the top of each bar, centred on it.
    for bar, pct in zip(bars, achieved_pct):
        centre = bar.get_x() + bar.get_width() / 2
        ax.text(centre, pct + 1, f"{pct:.1f}%", ha="center", fontsize=9)

    ax.set_ylim(0, 100)
    ax.set_ylabel("APC ratio (%)")
    ax.set_title("B3: APC achieved vs theoretical ceiling")
    ax.legend(loc="upper right", fontsize=9)
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_failure_breakdown(comp: dict, out: Path) -> None:
    """Stack slow-request counts by attributed cause, one bar per policy.

    Args:
        comp: Parsed policy-comparison table.  Each row may carry a
            ``failure_counts`` mapping from cause name to count; a missing
            (or empty/None) mapping and any missing cause are plotted as 0.
        out: Destination path of the PNG.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]

    # Cause -> colour.  Segments are stacked bottom-to-top in this order.
    palette = {
        "same_worker_prefill_overlap": "#d62728",
        "hot_worker_queue": "#ff7f0e",
        "cache_miss_large_append": "#1f77b4",
        "high_kv_occupancy": "#8c564b",
        "unknown": "#7f7f7f",
    }

    fig, ax = plt.subplots(figsize=(7, 4.5))
    stack_top = [0.0 for _ in policies]
    for cause, shade in palette.items():
        counts = []
        for name in policies:
            per_cause = rows_by_policy[name].get("failure_counts") or {}
            counts.append(per_cause.get(cause, 0))
        ax.bar(policies, counts, bottom=stack_top,
               label=cause.replace("_", " "), color=shade,
               edgecolor="black", linewidth=0.3)
        # Next segment starts where this one ended.
        stack_top = [base + n for base, n in zip(stack_top, counts)]

    # Total count printed slightly above each finished stack.
    for pos, total in enumerate(stack_top):
        ax.text(pos, total + 3, f"n={int(total)}", ha="center", fontsize=9)

    ax.set_ylabel("slow request count (TTFT > 2× p90 threshold)")
    ax.set_title("B3: slow-request cause breakdown per policy")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_per_worker_ttft(results_dir: Path, comp: dict, out: Path) -> None:
    """Draw one bar panel per policy showing each worker's TTFT p90.

    Args:
        results_dir: Directory searched for ``per_worker_<policy>.json``.
            Each file's ``per_worker_ttft_p90_s`` mapping (worker key ->
            seconds) supplies the bars; a policy whose file is absent gets a
            "no data" placeholder panel.
        comp: Parsed policy-comparison table; selects which policies get a
            panel and supplies ``hotspot_index_ttft_p90`` for panel titles.
        out: Destination path of the PNG.

    Worker keys must end in ``:<integer>``; workers are ordered by that
    integer and labelled ``e<integer - 8000>``.
    NOTE(review): the integer is presumably a port with 8000 as the first
    worker's port -- confirm against the run configuration.

    If no policy is left to plot, a single placeholder panel is written
    instead of failing inside ``plt.subplots``.
    """
    by = {r["policy"]: r for r in comp["rows"]}
    pols = [p for p in POLICY_ORDER if p in by]

    if not pols:
        # plt.subplots(1, 0) raises ValueError, so emit a placeholder figure.
        fig, ax = plt.subplots(figsize=(3, 4))
        ax.text(0.5, 0.5, "no policies to plot", ha="center", va="center",
                transform=ax.transAxes)
        fig.suptitle("B3 per-worker TTFT p90 distribution")
        fig.tight_layout()
        fig.savefig(out, dpi=120)
        plt.close(fig)
        return

    # squeeze=False always returns a 2-D array, so the single-policy case
    # needs no special handling.
    fig, grid = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4),
                             sharey=True, squeeze=False)
    axes = grid[0]
    for ax, pol in zip(axes, pols):
        path = results_dir / f"per_worker_{pol}.json"
        if not path.exists():
            ax.text(0.5, 0.5, f"{pol}: no data", ha="center", va="center",
                    transform=ax.transAxes)
            continue
        per = json.loads(path.read_text()).get("per_worker_ttft_p90_s") or {}
        # Parse the trailing integer once per worker, then order by it
        # (stable sort, so equal integers keep their file order).
        entries = [(int(key.rsplit(":", 1)[1]), value)
                   for key, value in per.items()]
        entries.sort(key=lambda entry: entry[0])
        labels = [f"e{number - 8000}" for number, _ in entries]
        vals = [value for _, value in entries]
        ax.bar(labels, vals, color=POLICY_COLOR.get(pol, "gray"),
               edgecolor="black", linewidth=0.5)
        for i, v in enumerate(vals):
            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
        ax.set_title(f"{pol}\nhotspot={by[pol]['hotspot_index_ttft_p90']:.2f}",
                     fontsize=10)
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(alpha=0.3, axis="y")
    axes[0].set_ylabel("worker TTFT p90 (s)")
    fig.suptitle("B3 per-worker TTFT p90 distribution")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b2_curves(b2: dict, out_tpot: Path, out_ttft: Path) -> None:
    """Render the B2 sweep as two figures, one for TPOT and one for TTFT.

    Each figure has two panels plotted against prefill size: absolute p90
    values (overlap vs clean) and the interference index ``overlap / clean``.

    Args:
        b2: Parsed sweep summary.  Every entry of ``b2["rows"]`` must provide
            ``prefill_size`` and ``variant`` (``"same"`` or ``"different"``;
            any other value raises ``KeyError``), and may provide
            ``{tpot,ttft}_p90_overlap_s`` / ``{tpot,ttft}_p90_clean_s``.
        out_tpot: Destination path of the TPOT figure.
        out_ttft: Destination path of the TTFT figure.

    Cells lacking either the overlap or the clean value are skipped.  A clean
    value of 0 yields a NaN index (a gap in the curve) rather than a
    ZeroDivisionError.
    """
    sizes = sorted({row["prefill_size"] for row in b2["rows"]})
    by_variant = {"same": {}, "different": {}}
    for row in b2["rows"]:
        by_variant[row["variant"]][row["prefill_size"]] = row

    # (metric name, row-key prefix, y label, display multiplier, output path).
    # TPOT is stored in seconds and displayed in milliseconds.
    panels = [
        ("TPOT", "tpot_p90", "TPOT p90 (ms)", 1000, out_tpot),
        ("TTFT", "ttft_p90", "TTFT p90 (s)", 1.0, out_ttft),
    ]
    for name, key, ylabel, scale, out in panels:
        fig, (ax_abs, ax_idx) = plt.subplots(1, 2, figsize=(11, 4))
        for variant in ("different", "same"):
            xs, overlap_vals, clean_vals, ratios = [], [], [], []
            for size in sizes:
                cell = by_variant[variant].get(size)
                if not cell:
                    continue
                overlap = cell.get(f"{key}_overlap_s")
                clean = cell.get(f"{key}_clean_s")
                if overlap is None or clean is None:
                    continue
                xs.append(size)
                overlap_vals.append(overlap * scale)
                clean_vals.append(clean * scale)
                # Guard the ratio: a zero baseline has no meaningful index.
                ratios.append(overlap / clean if clean else float("nan"))
            color = "#d62728" if variant == "same" else "#1f77b4"
            ax_abs.plot(xs, overlap_vals, "o-", color=color,
                        label=f"{variant} (overlap)")
            ax_abs.plot(xs, clean_vals, "s--", color=color, alpha=0.5,
                        label=f"{variant} (clean)")
            ax_idx.plot(xs, ratios, "o-", color=color, label=variant,
                        linewidth=2)

        ax_abs.set_xscale("log", base=2)
        ax_abs.set_yscale("log")
        ax_abs.set_xlabel("prefill injection size (tokens)")
        ax_abs.set_ylabel(ylabel + " (log)")
        ax_abs.set_title(f"B2 {name} absolute (overlap vs clean)")
        ax_abs.legend(fontsize=8)
        ax_abs.grid(alpha=0.3, which="both")

        ax_idx.set_xscale("log", base=2)
        ax_idx.set_yscale("log")
        # index == 1 means overlap had no effect.
        ax_idx.axhline(1.0, color="black", linestyle=":", alpha=0.5)
        ax_idx.set_xlabel("prefill injection size (tokens)")
        ax_idx.set_ylabel(f"{name} idx = overlap / clean")
        ax_idx.set_title(f"B2 {name} interference index (same vs different worker)")
        ax_idx.legend()
        ax_idx.grid(alpha=0.3, which="both")

        fig.tight_layout()
        fig.savefig(out, dpi=120)
        plt.close(fig)
def fig_reuse_decomposition(reuse: dict, out: Path) -> None:
    """Draw one horizontal stacked bar splitting cached tokens by reuse type.

    Args:
        reuse: Parsed reuse decomposition; ``reuse["fractions"]`` may hold
            ``intra``, ``cross``, ``shared`` and ``unclassified``.  Missing
            entries (or a missing/empty mapping) count as 0.
        out: Destination path of the PNG.
    """
    fractions = reuse.get("fractions") or {}

    # (legend caption, fractions key, colour), stacked left to right.
    segments = (
        ("intra-session", "intra", "#2ca02c"),
        ("cross-session", "cross", "#ff7f0e"),
        ("shared-prefix", "shared", "#9467bd"),
        ("unclassified", "unclassified", "#7f7f7f"),
    )
    shares = [fractions.get(key, 0) for _, key, _ in segments]

    fig, ax = plt.subplots(figsize=(6, 3))
    left_edge = 0.0
    for (caption, _, shade), share in zip(segments, shares):
        ax.barh(["lmetric run"], [share], left=[left_edge], color=shade,
                edgecolor="black", linewidth=0.5,
                label=f"{caption} ({share * 100:.1f}%)")
        left_edge += share

    ax.set_xlabel("fraction of cached_tokens")
    ax.set_xlim(0, 1)
    ax.set_title("Real reuse decomposition (w600 lmetric run)")
    ax.legend(fontsize=9, loc="lower right")
    ax.grid(alpha=0.3, axis="x")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_kv_footprint_cdf(kv: dict, out: Path) -> None:
    """Plot per-request KV footprint percentiles (p50/p90/p95/p99) as bars.

    Despite the name, this draws a bar per percentile, not a CDF curve.

    Args:
        kv: Parsed KV footprint summary; ``kv["kv_mib_per_request"]`` may hold
            ``p50``, ``p90``, ``p95`` and ``p99`` in MiB.
        out: Destination path of the PNG.

    Percentiles that are absent (or None) are omitted from the chart instead
    of crashing the bar call and the value formatting.
    """
    stats = kv.get("kv_mib_per_request") or {}
    present = [(name, stats[name])
               for name in ("p50", "p90", "p95", "p99")
               if stats.get(name) is not None]
    labels = [name for name, _ in present]
    vals = [value for _, value in present]

    fig, ax = plt.subplots(figsize=(6, 3.5))
    ax.bar(labels, vals, color="#1f77b4", edgecolor="black", linewidth=0.5)
    for i, v in enumerate(vals):
        ax.text(i, v, f"{v:.0f} MiB", ha="center", va="bottom", fontsize=9)
    # Reference line at 95 GiB expressed in MiB, matching the y-axis unit.
    ax.axhline(95 * 1024, color="red", linestyle="--", alpha=0.5,
               label="H20 ~95 GiB usable")
    ax.set_ylabel("KV bytes per request (MiB)")
    ax.set_title("B1' Per-request KV footprint (Qwen3-Coder-30B-A3B, 98304 B/token)")
    ax.legend()
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def main() -> None:
    """Parse CLI arguments, load the result JSON files and render every figure.

    Flags:
        --results-dir: directory holding the input JSON files (required).
        --out-dir: directory receiving the PNGs; created if missing (required).
        --exclude-policies: comma-separated policy names removed from the
            module-level ``POLICY_ORDER`` before any per-policy figure is drawn.
    """
    global POLICY_ORDER

    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--exclude-policies", default="",
                        help="Comma-separated policies to drop from per-policy figures")
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
    if excluded:
        # A typo here would otherwise silently exclude nothing.
        unknown = sorted(excluded - set(POLICY_ORDER))
        if unknown:
            print(f"warning: unknown policies in --exclude-policies: {unknown}")
        POLICY_ORDER = [name for name in POLICY_ORDER if name not in excluded]
        print(f"excluding policies: {sorted(excluded)} -> kept {POLICY_ORDER}")

    comp = _load(args.results_dir, "b3_policy_comparison.json")
    upper = _load(args.results_dir, "apc_upper_w600.json")
    b2 = _load(args.results_dir, "b2_sweep_summary.json")
    reuse = _load(args.results_dir, "lmetric_reuse.json")
    kv = _load(args.results_dir, "kv_footprint_summary.json")

    # One entry per output file, so the final summary count cannot drift from
    # what is actually rendered.
    targets = {
        stem: args.out_dir / f"{stem}.png"
        for stem in (
            "fig_b3_apc_vs_hotspot",
            "fig_b3_latency_bars",
            "fig_b3_apc_vs_upper",
            "fig_b3_failure_breakdown",
            "fig_b3_per_worker_ttft_p90",
            "fig_b2_tpot_vs_prefill",
            "fig_b2_ttft_vs_prefill",
            "fig_reuse_decomposition",
            "fig_kv_footprint_cdf",
        )
    }

    fig_b3_apc_vs_hotspot(comp, upper, targets["fig_b3_apc_vs_hotspot"])
    fig_b3_latency_bars(comp, targets["fig_b3_latency_bars"])
    fig_b3_apc_vs_upper(comp, upper, targets["fig_b3_apc_vs_upper"])
    fig_b3_failure_breakdown(comp, targets["fig_b3_failure_breakdown"])
    fig_b3_per_worker_ttft(args.results_dir, comp,
                           targets["fig_b3_per_worker_ttft_p90"])
    fig_b2_curves(b2,
                  targets["fig_b2_tpot_vs_prefill"],
                  targets["fig_b2_ttft_vs_prefill"])
    fig_reuse_decomposition(reuse, targets["fig_reuse_decomposition"])
    fig_kv_footprint_cdf(kv, targets["fig_kv_footprint_cdf"])
    print(f"wrote {len(targets)} figures to {args.out_dir}")


if __name__ == "__main__":
    main()