Files
agentic-kvc/analysis/characterization/render_window1_figures.py
Gahow Wang 555cabcf1f f2c: switch to per-instance decode-concurrency view; correct KV pool ceiling
Old f2c plotted per-request KV footprint MiB against an "H20 ~95 GiB
usable" reference line. That ceiling was wrong — a 30B-A3B bf16
deployment burns roughly:
  ~50% HBM for model params (~48 GiB on 96 GiB H20)
  ~10% for runtime activation buffers
  ~40% left for the KV cache pool (~38.4 GiB)
so 95 GiB was overstating the available pool by 2.5×.

New f2c reframes the same data into the answer that actually motivates
the paper: how many concurrent decodes does a single instance hold,
and how does PD-disagg change that? Grouped bars per percentile show
system-wide concurrent decode capacity for three 8-GPU deployments:
  Combined 8C, PD-disagg 4P+4D (N_D=4), PD-disagg 6P+2D (N_D=2)

Key reads off the figure:
  p50 (1.8 GiB/req): 20 fit/inst → 160 / 80 / 40 system-wide
  p90 (8.0 GiB/req):  4 fit/inst →  32 / 16 /  8
  p95 (9.6 GiB/req):  4 fit/inst →  32 / 16 /  8
  p99 (11.5 GiB/req): 3 fit/inst →  24 / 12 /  6

PD-disagg 4P+4D literally halves the decode population at the same
per-request KV pressure — this is the concrete §3.2 "KV memory wall"
penalty stated in terms users care about (concurrency).

- analysis/characterization/render_window1_figures.py:
  fig_kv_footprint_cdf rewritten; reads same kv_footprint_summary.json
  but computes floor(KV_pool / req_size) × N_D and annotates the
  per-instance fit count below each percentile group.
- figs/f2c_kv_footprint_cdf.png: regenerated.
- MEETING.md / PAPER_OUTLINE.md §2.1, §2.4: prose updated with the
  new ceiling and the "3 p99 decodes per instance / halved by PD-disagg"
  framing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 11:28:47 +08:00

417 lines
17 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Render PNG figures for Window 1 results (B1', B2, B3).
Inputs (all expected under <results-dir>):
- b3_policy_comparison.json (per-policy table)
- b2_sweep_summary.json (per-cell B2 sweep)
- apc_upper_w600.json (theoretical bounds)
- lmetric_reuse.json (intra/cross/shared decomp)
- kv_footprint_summary.json (full trace KV stats)
- raw_stats/<policy>.json (optional; per-policy latency stats incl. mean)
- per_worker_<policy>.json (optional; per-worker TTFT p90)
Outputs (under <out-dir>):
- fig_b3_apc_vs_hotspot.png
- fig_b3_latency_bars.png
- fig_b3_latency_full_grid.png (only when raw_stats/ has per-policy files)
- fig_b3_apc_vs_upper.png
- fig_b3_failure_breakdown.png
- fig_b3_per_worker_ttft_p90.png
- fig_b2_tpot_vs_prefill.png
- fig_b2_ttft_vs_prefill.png
- fig_reuse_decomposition.png
- fig_kv_footprint_cdf.png
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Display order of routing policies in every per-policy figure. main() may
# rebind this list to drop policies named via --exclude-policies.
POLICY_ORDER = [
    "lmetric",
    "load_only",
    "sticky",
    "unified",
    "capped",
]

# Fixed bar/marker colour per policy so a policy looks the same in all figures.
POLICY_COLOR = dict(
    lmetric="#1f77b4",
    load_only="#ff7f0e",
    sticky="#d62728",
    unified="#2ca02c",
    capped="#9467bd",
)
def _load(results_dir: Path, name: str) -> dict:
return json.loads((results_dir / name).read_text())
def fig_b3_apc_vs_hotspot(comp: dict, upper: dict, out: Path) -> None:
    """Scatter achieved APC (%) against the TTFT-p90 hot-spot index per policy.

    Args:
        comp: Policy comparison table. Each entry of ``comp["rows"]`` supplies
            ``policy``, ``apc_ratio`` (plotted x100) and
            ``hotspot_index_ttft_p90``.
        upper: Theoretical bounds; ``upper["apc_upper_intra_session"]`` (x100)
            is drawn as a dashed vertical reference line.
        out: Path of the PNG to write.

    Rows whose policy is not listed in ``POLICY_ORDER`` are skipped.
    """
    ceiling_pct = upper["apc_upper_intra_session"] * 100

    fig, ax = plt.subplots(figsize=(6, 4.5))
    for row in comp["rows"]:
        policy = row["policy"]
        if policy not in POLICY_ORDER:
            continue
        point = (row["apc_ratio"] * 100, row["hotspot_index_ttft_p90"])
        ax.scatter(*point, s=180, color=POLICY_COLOR.get(policy, "gray"),
                   label=policy, edgecolors="black", linewidths=0.5)
        # Each point is named in place, so the policies need no legend entry.
        ax.annotate(policy, point, xytext=(7, 7), textcoords="offset points",
                    fontsize=9)

    ceiling_line = ax.axvline(
        ceiling_pct, linestyle="--", color="gray", alpha=0.6,
        label=f"intra-session APC upper {ceiling_pct:.1f}%",
    )
    ax.set_xlabel("APC achieved (%)")
    ax.set_ylabel("hotspot_index = max(worker TTFT p90) / median")
    ax.set_title("B3: APC vs hot-spot tradeoff across policies")
    # Without a legend the reference line's label never reaches the figure.
    # Only the ceiling is listed because the points are already annotated.
    ax.legend(handles=[ceiling_line], loc="best", fontsize=9)
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_latency_bars(comp: dict, out: Path) -> None:
    """Draw three side-by-side bar panels of p90 TTFT / TPOT / E2E per policy.

    Args:
        comp: Policy comparison table; rows are looked up by ``policy`` and
            must carry ``ttft_p90_s``, ``tpot_p90_s`` and ``e2e_p90_s``.
        out: Path of the PNG to write.

    Policies appear in ``POLICY_ORDER``; ones absent from ``comp`` are omitted.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in policies]

    # (panel title, row field, display factor): TPOT is stored in seconds
    # but shown in milliseconds.
    panels = (
        ("TTFT p90 (s)", "ttft_p90_s", 1),
        ("TPOT p90 (ms)", "tpot_p90_s", 1000),
        ("E2E p90 (s)", "e2e_p90_s", 1),
    )

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    for ax, (title, field, factor) in zip(axes, panels):
        heights = [rows_by_policy[name][field] * factor for name in policies]
        ax.bar(policies, heights, color=bar_colors,
               edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=20)
        # Print each value just above its bar.
        for pos, height in enumerate(heights):
            ax.text(pos, height, f"{height:.1f}", ha="center", va="bottom",
                    fontsize=9)
        ax.grid(alpha=0.3, axis="y")
    fig.suptitle("B3 headline latencies per policy")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_latency_full_grid(results_dir: Path, out: Path) -> None:
    """4 rows (mean / p50 / p90 / p99) × 3 cols (TTFT / TPOT / E2E) per policy.

    Reads the per-policy JSON files ``raw_stats/<policy>.json`` under
    ``results_dir``, which expose mean alongside the percentiles
    (b3_policy_comparison.json does not record mean). Each file is indexed as
    ``stats[metric][aggregate]`` with metric in ttft / tpot / e2e.

    Args:
        results_dir: Directory containing the ``raw_stats`` sub-directory.
        out: Path of the PNG to write.

    Policies without a stats file are left out. When no policy has one, the
    function returns without writing ``out``.
    """
    raw_dir = results_dir / "raw_stats"
    stats = {}
    for policy in POLICY_ORDER:
        path = raw_dir / f"{policy}.json"
        if path.exists():
            # Explicit UTF-8: do not depend on the host's locale encoding.
            stats[policy] = json.loads(path.read_text(encoding="utf-8"))
    if not stats:
        return
    pols = list(stats)  # insertion order == POLICY_ORDER order
    colors = [POLICY_COLOR.get(p, "gray") for p in pols]

    # Each aggregate is both the row label and the key inside a metric's stats.
    aggregates = ("mean", "p50", "p90", "p99")
    # (column title, metric key, display factor); TPOT is shown in ms.
    cols = [
        ("TTFT (s)", "ttft", 1.0),
        ("TPOT (ms)", "tpot", 1000.0),
        ("E2E (s)", "e2e", 1.0),
    ]

    fig, axes = plt.subplots(len(aggregates), len(cols), figsize=(11, 11),
                             sharex=True)
    for i, agg in enumerate(aggregates):
        for j, (col_label, metric_key, scale) in enumerate(cols):
            ax = axes[i][j]
            vals = [stats[p][metric_key][agg] * scale for p in pols]
            ax.bar(pols, vals, color=colors, edgecolor="black", linewidth=0.5)
            for k, v in enumerate(vals):
                ax.text(k, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
            if j == 0:
                ax.set_ylabel(agg, fontsize=11)
            if i == 0:
                ax.set_title(col_label, fontsize=11)
            ax.grid(alpha=0.3, axis="y")
            ax.tick_params(axis="x", rotation=20, labelsize=9)
            # Head-room so the value labels are not clipped by the frame.
            ax.margins(y=0.18)
    fig.suptitle("B3 latencies per policy — mean / p50 / p90 / p99")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_apc_vs_upper(comp: dict, upper: dict, out: Path) -> None:
    """Bar chart of achieved APC ratio per policy against two ceilings.

    Args:
        comp: Policy comparison table; ``apc_ratio`` of each row is shown x100.
        upper: Theoretical bounds; ``apc_upper_intra_session`` and
            ``apc_upper_any_session`` are drawn x100 as horizontal lines.
        out: Path of the PNG to write.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]
    achieved_pct = [rows_by_policy[name]["apc_ratio"] * 100 for name in policies]

    fig, ax = plt.subplots(figsize=(6.5, 4))
    bars = ax.bar(
        policies,
        achieved_pct,
        color=[POLICY_COLOR.get(name, "gray") for name in policies],
        edgecolor="black",
        linewidth=0.5,
    )

    intra_pct = upper["apc_upper_intra_session"] * 100
    ax.axhline(intra_pct, linestyle="--", color="black", alpha=0.7,
               label=f"intra-session ceiling {intra_pct:.1f}%")
    any_pct = upper["apc_upper_any_session"] * 100
    ax.axhline(any_pct, linestyle=":", color="darkgray", alpha=0.7,
               label=f"any-session ceiling {any_pct:.1f}%")

    # Percentage label centred one unit above each bar.
    for bar, pct in zip(bars, achieved_pct):
        centre_x = bar.get_x() + bar.get_width() / 2
        ax.text(centre_x, pct + 1, f"{pct:.1f}%", ha="center", fontsize=9)

    ax.set_ylim(0, 100)
    ax.set_ylabel("APC ratio (%)")
    ax.set_title("B3: APC achieved vs theoretical ceiling")
    ax.legend(loc="upper right", fontsize=9)
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_failure_breakdown(comp: dict, out: Path) -> None:
    """Stacked bars of slow-request counts per policy, split by cause.

    Args:
        comp: Policy comparison table; each row may carry a
            ``failure_counts`` mapping of cause name to count. A missing or
            empty mapping contributes zero to every cause.
        out: Path of the PNG to write.
    """
    rows_by_policy = {row["policy"]: row for row in comp["rows"]}
    policies = [name for name in POLICY_ORDER if name in rows_by_policy]

    # Stacking order (bottom to top) and colour of each cause.
    cause_color = {
        "same_worker_prefill_overlap": "#d62728",
        "hot_worker_queue": "#ff7f0e",
        "cache_miss_large_append": "#1f77b4",
        "high_kv_occupancy": "#8c564b",
        "unknown": "#7f7f7f",
    }

    fig, ax = plt.subplots(figsize=(7, 4.5))
    stack_top = [0.0] * len(policies)
    for cause, color in cause_color.items():
        counts = [
            (rows_by_policy[name].get("failure_counts") or {}).get(cause, 0)
            for name in policies
        ]
        ax.bar(policies, counts, bottom=stack_top,
               label=cause.replace("_", " "),
               color=color, edgecolor="black", linewidth=0.3)
        stack_top = [base + add for base, add in zip(stack_top, counts)]

    # After the last cause, stack_top holds each policy's total.
    for pos, total in enumerate(stack_top):
        ax.text(pos, total + 3, f"n={int(total)}", ha="center", fontsize=9)

    ax.set_ylabel("slow request count (TTFT > 2× p90 threshold)")
    ax.set_title("B3: slow-request cause breakdown per policy")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b3_per_worker_ttft(results_dir: Path, comp: dict, out: Path) -> None:
    """Per-worker TTFT p90 grouped bars; title shows median + max worker p90.

    We deliberately do NOT report a max/median 'hotspot index' here: it is a
    ratio and treats unified (most workers fast, one hot) as worse than
    sticky (all workers slow), which inverts the actual user-facing p90.

    Args:
        results_dir: Directory holding ``per_worker_<policy>.json`` files whose
            ``per_worker_ttft_p90_s`` maps a worker key to its TTFT p90 (s).
            Keys are split on the last ``:`` and the integer suffix is used
            for ordering and labelling (label = ``e<suffix - 8000>``).
        comp: Policy comparison table; only used to pick which policies to show.
        out: Path of the PNG to write.

    A policy whose file is missing, or whose per-worker mapping is missing or
    empty, gets a "no data" placeholder panel. When no policy is selected at
    all, nothing is written.
    """
    import statistics

    by = {r["policy"]: r for r in comp["rows"]}
    pols = [p for p in POLICY_ORDER if p in by]
    if not pols:
        # plt.subplots(1, 0) would raise; there is nothing to draw anyway.
        return

    fig, axes = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4),
                             sharey=True)
    if len(pols) == 1:
        # A single subplot is returned bare, not as an array.
        axes = [axes]

    for ax, pol in zip(axes, pols):
        path = results_dir / f"per_worker_{pol}.json"
        per = {}
        if path.exists():
            doc = json.loads(path.read_text(encoding="utf-8"))
            per = doc.get("per_worker_ttft_p90_s") or {}
        if not per:
            # Also covers an existing file with no per-worker entries, which
            # would otherwise crash median()/max() on an empty list.
            ax.text(0.5, 0.5, f"{pol}: no data", ha="center", va="center",
                    transform=ax.transAxes)
            continue

        items = sorted(per.items(), key=lambda kv: int(kv[0].rsplit(":", 1)[1]))
        labels = [f"e{int(k.rsplit(':', 1)[1]) - 8000}" for k, _ in items]
        vals = [v for _, v in items]
        ax.bar(labels, vals, color=POLICY_COLOR.get(pol, "gray"),
               edgecolor="black", linewidth=0.5)
        for i, v in enumerate(vals):
            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
        median_v = statistics.median(vals)
        max_v = max(vals)
        ax.set_title(
            f"{pol}\nmedian {median_v:.1f}s · max {max_v:.1f}s",
            fontsize=10,
        )
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(alpha=0.3, axis="y")

    axes[0].set_ylabel("worker TTFT p90 (s)")
    fig.suptitle("B3 per-worker TTFT p90 distribution")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_b2_curves(b2: dict, out_tpot: Path, out_ttft: Path) -> None:
    """Plot B2 prefill-interference curves for TPOT and TTFT (one PNG each).

    Each figure has two panels: absolute p90 under overlap vs clean (log-log),
    and the interference index ``overlap / clean`` for the "same" and
    "different" worker variants.

    Args:
        b2: Sweep summary; each entry of ``b2["rows"]`` carries ``variant``,
            ``prefill_size`` and the ``<metric>_p90_overlap_s`` /
            ``<metric>_p90_clean_s`` values for metric in tpot / ttft.
        out_tpot: Path of the TPOT PNG to write.
        out_ttft: Path of the TTFT PNG to write.

    Cells with a missing overlap or clean value are skipped. A clean value of
    0 yields a NaN index (a gap in the line) rather than a ZeroDivisionError.
    Rows with a variant other than "same"/"different" are ignored.
    """
    sizes = sorted({r["prefill_size"] for r in b2["rows"]})
    by_var = {"same": {}, "different": {}}
    for r in b2["rows"]:
        # setdefault: an unexpected variant must not abort the whole render.
        by_var.setdefault(r["variant"], {})[r["prefill_size"]] = r

    # (metric name, row-key prefix, y label, display factor, output path);
    # TPOT is stored in seconds but shown in milliseconds.
    for name, key, ylabel, scale, out in [
        ("TPOT", "tpot_p90", "TPOT p90 (ms)", 1000, out_tpot),
        ("TTFT", "ttft_p90", "TTFT p90 (s)", 1.0, out_ttft),
    ]:
        fig, axes = plt.subplots(1, 2, figsize=(11, 4))
        ax_abs, ax_idx = axes
        for variant in ("different", "same"):
            xs, ys_o, ys_c, idxs = [], [], [], []
            for sz in sizes:
                r = by_var[variant].get(sz)
                if not r:
                    continue
                ov = r.get(f"{key}_overlap_s")
                cl = r.get(f"{key}_clean_s")
                if ov is None or cl is None:
                    continue
                xs.append(sz)
                ys_o.append(ov * scale)
                ys_c.append(cl * scale)
                # The ratio is undefined for a zero baseline; NaN leaves a gap.
                idxs.append(ov / cl if cl else float("nan"))
            color = "#d62728" if variant == "same" else "#1f77b4"
            ax_abs.plot(xs, ys_o, "o-", color=color,
                        label=f"{variant} (overlap)")
            ax_abs.plot(xs, ys_c, "s--", color=color, alpha=0.5,
                        label=f"{variant} (clean)")
            ax_idx.plot(xs, idxs, "o-", color=color, label=variant,
                        linewidth=2)

        ax_abs.set_xscale("log", base=2)
        ax_abs.set_yscale("log")
        ax_abs.set_xlabel("prefill injection size (tokens)")
        ax_abs.set_ylabel(ylabel + " (log)")
        ax_abs.set_title(f"B2 {name} absolute (overlap vs clean)")
        ax_abs.legend(fontsize=8)
        ax_abs.grid(alpha=0.3, which="both")

        ax_idx.set_xscale("log", base=2)
        ax_idx.set_yscale("log")
        # idx == 1 means overlap costs nothing relative to the clean run.
        ax_idx.axhline(1.0, color="black", linestyle=":", alpha=0.5)
        ax_idx.set_xlabel("prefill injection size (tokens)")
        ax_idx.set_ylabel(f"{name} idx = overlap / clean")
        ax_idx.set_title(f"B2 {name} interference index (same vs different worker)")
        ax_idx.legend()
        ax_idx.grid(alpha=0.3, which="both")

        fig.tight_layout()
        fig.savefig(out, dpi=120)
        plt.close(fig)
def fig_reuse_decomposition(reuse: dict, out: Path) -> None:
    """Single stacked horizontal bar splitting cached tokens by reuse class.

    Args:
        reuse: Reuse decomposition; ``reuse["fractions"]`` maps ``intra``,
            ``cross``, ``shared`` and ``unclassified`` to a fraction. A
            missing mapping or key counts as 0.
        out: Path of the PNG to write.
    """
    fractions = reuse.get("fractions") or {}
    # (legend label, key in fractions, colour), stacked left to right.
    segments = (
        ("intra-session", "intra", "#2ca02c"),
        ("cross-session", "cross", "#ff7f0e"),
        ("shared-prefix", "shared", "#9467bd"),
        ("unclassified", "unclassified", "#7f7f7f"),
    )

    fig, ax = plt.subplots(figsize=(6, 3))
    left_edge = 0.0
    for label, field, color in segments:
        share = fractions.get(field, 0)
        ax.barh(["lmetric run"], [share], left=[left_edge], color=color,
                edgecolor="black", linewidth=0.5,
                label=f"{label} ({share * 100:.1f}%)")
        left_edge += share  # the next segment starts where this one ends

    ax.set_xlabel("fraction of cached_tokens")
    ax.set_xlim(0, 1)
    ax.set_title("Real reuse decomposition (w600 lmetric run)")
    ax.legend(fontsize=9, loc="lower right")
    ax.grid(alpha=0.3, axis="x")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_kv_footprint_cdf(kv: dict, out: Path) -> None:
    """How many concurrent decodes fit per percentile, under three deployments.

    KV pool assumption: 96 GiB H20 HBM split ~50% model params (Qwen3-Coder-
    30B-A3B in bf16 + headroom), ~10% runtime activations, leaving ~40% for
    the KV cache pool — i.e. ~38.4 GiB per instance.

    For each request-size percentile, we report system-wide concurrent
    decode capacity = N_D × floor(KV_pool / req_size_MiB) under three 8-GPU
    deployments: all-combined, 4P+4D, 6P+2D. The point is that going from
    combined 8C to 4P+4D halves the system's decode population at the
    same per-request KV pressure.

    Args:
        kv: KV footprint summary; ``kv["kv_mib_per_request"]`` maps ``p50``,
            ``p90``, ``p95`` and ``p99`` to a per-request KV size in MiB. A
            missing or non-positive size is reported as a capacity of 0.
        out: Path of the PNG to write.
    """
    import numpy as _np

    s = kv.get("kv_mib_per_request") or {}
    pct_keys = ["p50", "p90", "p95", "p99"]
    req_mib = [float(s.get(k, 0.0)) for k in pct_keys]
    req_gib = [v / 1024 for v in req_mib]

    hbm_gib = 96.0
    kv_pool_frac = 0.40
    kv_pool_mib = hbm_gib * kv_pool_frac * 1024  # ≈ 39322 MiB per instance

    # Whole requests that fit in one instance's pool. This depends only on
    # the request size, not on the deployment, so it is computed once.
    per_inst = [int(kv_pool_mib // r) if r > 0 else 0 for r in req_mib]

    # (legend label, number of decode-capable instances N_D, bar colour)
    deploys = [
        ("Combined 8C", 8, "#2ca02c"),
        ("PD-disagg 4P+4D", 4, "#ff7f0e"),
        ("PD-disagg 6P+2D", 2, "#d62728"),
    ]

    x = _np.arange(len(pct_keys))
    bar_w = 0.26
    # Centre each percentile's group of bars on its tick.
    centre = (len(deploys) - 1) / 2
    fig, ax = plt.subplots(figsize=(9, 5.2))
    for i, (label, n_d, color) in enumerate(deploys):
        sys_cap = [n_d * fit for fit in per_inst]
        bars = ax.bar(x + (i - centre) * bar_w, sys_cap, bar_w,
                      label=f"{label} (N_D={n_d})",
                      color=color, edgecolor="black", linewidth=0.5)
        for b, n in zip(bars, sys_cap):
            ax.text(b.get_x() + b.get_width() / 2, n, str(n),
                    ha="center", va="bottom", fontsize=9, color="#333")

    # Tick labels carry the per-request KV size and the per-instance fit.
    annot = [
        f"{pct}\n{rg:.1f} GiB / req\nfits {pi}/inst"
        for pct, rg, pi in zip(pct_keys, req_gib, per_inst)
    ]
    ax.set_xticks(x)
    ax.set_xticklabels(annot, fontsize=10)
    ax.set_ylabel("System-wide concurrent decodes")
    ax.set_title(
        f"Per-instance KV pool ≈ {kv_pool_mib / 1024:.1f} GiB "
        f"(0.4 × H20 96 GiB; remaining 0.5 model + 0.1 activation)\n"
        f"PD-disagg halves the decode population at p90+ "
        f"(Qwen3-Coder-30B-A3B, 98304 B/token)"
    )
    ax.legend(loc="upper right")
    ax.grid(alpha=0.3, axis="y")
    ax.margins(y=0.15)
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def main() -> None:
    """Command-line entry point: load the result JSONs and render every figure.

    Flags:
        --results-dir: Directory holding the input JSON files (required).
        --out-dir: Directory for the PNGs; created if missing (required).
        --exclude-policies: Comma-separated policies to drop from the
            per-policy figures. This rebinds the module-level
            ``POLICY_ORDER`` for the rest of the run.

    The closing message reports how many of the expected PNGs exist in the
    output directory. Some figures may be skipped when their optional input is
    absent, so the count is not hard-coded. A PNG left over from an earlier
    run is included in the count.
    """
    global POLICY_ORDER

    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--exclude-policies", default="",
                        help="Comma-separated policies to drop from per-policy figures")
    args = parser.parse_args()
    results_dir = args.results_dir
    out_dir = args.out_dir
    out_dir.mkdir(parents=True, exist_ok=True)

    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
    if excluded:
        POLICY_ORDER = [pol for pol in POLICY_ORDER if pol not in excluded]
        print(f"excluding policies: {sorted(excluded)} -> kept {POLICY_ORDER}")

    comp = _load(results_dir, "b3_policy_comparison.json")
    upper = _load(results_dir, "apc_upper_w600.json")
    b2 = _load(results_dir, "b2_sweep_summary.json")
    reuse = _load(results_dir, "lmetric_reuse.json")
    kv = _load(results_dir, "kv_footprint_summary.json")

    # One entry per PNG this script can produce, keyed by file stem.
    png = {
        stem: out_dir / f"{stem}.png"
        for stem in (
            "fig_b3_apc_vs_hotspot",
            "fig_b3_latency_bars",
            "fig_b3_latency_full_grid",
            "fig_b3_apc_vs_upper",
            "fig_b3_failure_breakdown",
            "fig_b3_per_worker_ttft_p90",
            "fig_b2_tpot_vs_prefill",
            "fig_b2_ttft_vs_prefill",
            "fig_reuse_decomposition",
            "fig_kv_footprint_cdf",
        )
    }

    fig_b3_apc_vs_hotspot(comp, upper, png["fig_b3_apc_vs_hotspot"])
    fig_b3_latency_bars(comp, png["fig_b3_latency_bars"])
    fig_b3_latency_full_grid(results_dir, png["fig_b3_latency_full_grid"])
    fig_b3_apc_vs_upper(comp, upper, png["fig_b3_apc_vs_upper"])
    fig_b3_failure_breakdown(comp, png["fig_b3_failure_breakdown"])
    fig_b3_per_worker_ttft(results_dir, comp, png["fig_b3_per_worker_ttft_p90"])
    fig_b2_curves(b2, png["fig_b2_tpot_vs_prefill"], png["fig_b2_ttft_vs_prefill"])
    fig_reuse_decomposition(reuse, png["fig_reuse_decomposition"])
    fig_kv_footprint_cdf(kv, png["fig_kv_footprint_cdf"])

    # Count what is on disk rather than assuming every renderer wrote a file.
    written = [path for path in png.values() if path.exists()]
    print(f"wrote {len(written)} figures to {out_dir}")
# Render the figures only when executed as a script, not when imported.
if __name__ == "__main__":
    main()