Old f2c plotted the per-request KV footprint (in MiB) against an "H20 ~95 GiB usable" reference line. That ceiling was wrong — a 30B-A3B bf16 deployment burns roughly:

  - ~50% of HBM for model params (~48 GiB on a 96 GiB H20)
  - ~10% for runtime activation buffers
  - ~40% left for the KV cache pool (~38.4 GiB)

so the 95 GiB line was overstating the available pool by 2.5×.

New f2c reframes the same data into the answer that actually motivates the paper: how many concurrent decodes does a single instance hold, and how does PD-disagg change that? Grouped bars per percentile show system-wide concurrent decode capacity for three 8-GPU deployments: Combined 8C (N_D=8), PD-disagg 4P+4D (N_D=4), and PD-disagg 6P+2D (N_D=2).

Key reads off the figure (system-wide capacity listed as 8C / 4P+4D / 6P+2D):

  - p50 (1.8 GiB/req): 20 fit/inst → 160 / 80 / 40 system-wide
  - p90 (8.0 GiB/req): 4 fit/inst → 32 / 16 / 8
  - p95 (9.6 GiB/req): 4 fit/inst → 32 / 16 / 8
  - p99 (11.5 GiB/req): 3 fit/inst → 24 / 12 / 6

PD-disagg 4P+4D literally halves the decode population at the same per-request KV pressure — this is the concrete §3.2 "KV memory wall" penalty stated in terms users care about (concurrency).

  - analysis/characterization/render_window1_figures.py: fig_kv_footprint_cdf rewritten; it reads the same kv_footprint_summary.json but computes floor(KV_pool / req_size) × N_D and annotates the per-instance fit count below each percentile group.
  - figs/f2c_kv_footprint_cdf.png: regenerated.
  - MEETING.md / PAPER_OUTLINE.md §2.1, §2.4: prose updated with the new ceiling and the "3 p99 decodes per instance / halved by PD-disagg" framing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
417 lines
17 KiB
Python
417 lines
17 KiB
Python
"""Render PNG figures for Window 1 results (B1', B2, B3).

Inputs (all expected under <results-dir>):
  - b3_policy_comparison.json       (per-policy table)
  - b2_sweep_summary.json           (per-cell B2 sweep)
  - apc_upper_w600.json             (theoretical bounds)
  - lmetric_reuse.json              (intra/cross/shared decomp)
  - kv_footprint_summary.json       (full trace KV stats)
  - raw_stats/<policy>.json         (optional; per-policy latency summaries
                                     including the mean)
  - per_worker_<policy>.json        (optional; per-worker TTFT p90)

Outputs (under <out-dir>):
  - fig_b3_apc_vs_hotspot.png
  - fig_b3_latency_bars.png
  - fig_b3_latency_full_grid.png    (only when raw_stats/ files are present)
  - fig_b3_apc_vs_upper.png
  - fig_b3_failure_breakdown.png
  - fig_b3_per_worker_ttft_p90.png
  - fig_b2_tpot_vs_prefill.png
  - fig_b2_ttft_vs_prefill.png
  - fig_reuse_decomposition.png
  - fig_kv_footprint_cdf.png
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
from pathlib import Path
|
||
|
||
import matplotlib
|
||
matplotlib.use("Agg")
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Left-to-right order in which policies appear in every per-policy figure.
# main() may rebind this list to drop policies named in --exclude-policies.
POLICY_ORDER = ["lmetric", "load_only", "sticky", "unified", "capped"]

# Fixed colour per policy so a given policy looks the same across figures.
POLICY_COLOR = dict(
    lmetric="#1f77b4",
    load_only="#ff7f0e",
    sticky="#d62728",
    unified="#2ca02c",
    capped="#9467bd",
)
|
||
|
||
|
||
def _load(results_dir: Path, name: str) -> dict:
|
||
return json.loads((results_dir / name).read_text())
|
||
|
||
|
||
def fig_b3_apc_vs_hotspot(comp: dict, upper: dict, out: Path) -> None:
    """Scatter achieved APC (%) against the TTFT-p90 hotspot index per policy.

    Args:
        comp: policy comparison; each entry of ``comp["rows"]`` supplies
            ``policy``, ``apc_ratio`` and ``hotspot_index_ttft_p90``.
        upper: bounds dict; ``apc_upper_intra_session`` is drawn as a dashed
            vertical reference line.
        out: path of the PNG to write.

    Rows whose policy is not in POLICY_ORDER are skipped.
    """
    ceiling_pct = upper["apc_upper_intra_session"] * 100
    fig, ax = plt.subplots(figsize=(6, 4.5))

    plotted_rows = (row for row in comp["rows"] if row["policy"] in POLICY_ORDER)
    for row in plotted_rows:
        name = row["policy"]
        point = (row["apc_ratio"] * 100, row["hotspot_index_ttft_p90"])
        ax.scatter(
            point[0],
            point[1],
            s=180,
            color=POLICY_COLOR.get(name, "gray"),
            label=name,
            edgecolors="black",
            linewidths=0.5,
        )
        # Each point is labelled in place, next to the marker.
        ax.annotate(
            name,
            point,
            xytext=(7, 7),
            textcoords="offset points",
            fontsize=9,
        )

    # NOTE(review): labels are attached to the points and to this line, but no
    # legend is drawn, so the ceiling label is not visible — confirm intent.
    ax.axvline(
        ceiling_pct,
        linestyle="--",
        color="gray",
        alpha=0.6,
        label=f"intra-session APC upper {ceiling_pct:.1f}%",
    )

    ax.set_xlabel("APC achieved (%)")
    ax.set_ylabel("hotspot_index = max(worker TTFT p90) / median")
    ax.set_title("B3: APC vs hot-spot tradeoff across policies")
    ax.grid(alpha=0.3)

    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def fig_b3_latency_bars(comp: dict, out: Path) -> None:
    """Draw three side-by-side bar panels: p90 TTFT, TPOT and E2E per policy.

    Policies follow POLICY_ORDER, restricted to those present in
    ``comp["rows"]``. TPOT values are multiplied by 1000 before plotting
    (its panel is titled in ms); the other panels plot the stored values.

    Args:
        comp: policy comparison with ``ttft_p90_s``, ``tpot_p90_s`` and
            ``e2e_p90_s`` on every row.
        out: path of the PNG to write.
    """
    row_of = {row["policy"]: row for row in comp["rows"]}
    names = [name for name in POLICY_ORDER if name in row_of]
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in names]

    # (panel title, row field, display multiplier)
    panels = (
        ("TTFT p90 (s)", "ttft_p90_s", 1),
        ("TPOT p90 (ms)", "tpot_p90_s", 1000),
        ("E2E p90 (s)", "e2e_p90_s", 1),
    )

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    for ax, (title, field, factor) in zip(axes, panels):
        heights = [row_of[name][field] * factor for name in names]
        ax.bar(names, heights, color=bar_colors, edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=20)
        # Print the numeric value on top of every bar.
        for pos, height in enumerate(heights):
            ax.text(pos, height, f"{height:.1f}", ha="center", va="bottom", fontsize=9)
        ax.grid(alpha=0.3, axis="y")

    fig.suptitle("B3 headline latencies per policy")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def fig_b3_latency_full_grid(results_dir: Path, out: Path) -> None:
    """Render a 4 x 3 grid of per-policy latency bars (aggregate x metric).

    Rows are mean / p50 / p90 / p99; columns are TTFT (s), TPOT (ms) and
    E2E (s). The data comes from ``<results_dir>/raw_stats/<policy>.json``,
    which carries the mean alongside the percentiles
    (b3_policy_comparison.json does not record the mean).

    Only policies in POLICY_ORDER that have a raw_stats file are drawn; when
    none has one, the function returns without writing ``out``.
    """
    stats_dir = results_dir / "raw_stats"
    present = [name for name in POLICY_ORDER if (stats_dir / f"{name}.json").exists()]
    if len(present) == 0:
        return

    summaries = {}
    for name in present:
        summaries[name] = json.loads((stats_dir / f"{name}.json").read_text())

    # Each aggregate doubles as the row label and the key inside a metric dict.
    aggregates = ("mean", "p50", "p90", "p99")
    # (column title, metric key, display multiplier)
    metrics = (
        ("TTFT (s)", "ttft", 1.0),
        ("TPOT (ms)", "tpot", 1000.0),
        ("E2E (s)", "e2e", 1.0),
    )
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in present]

    fig, axes = plt.subplots(len(aggregates), len(metrics), figsize=(11, 11), sharex=True)
    for row_no, (axes_row, agg) in enumerate(zip(axes, aggregates)):
        for col_no, (ax, (heading, metric, factor)) in enumerate(zip(axes_row, metrics)):
            heights = [summaries[name][metric][agg] * factor for name in present]
            ax.bar(present, heights, color=bar_colors, edgecolor="black", linewidth=0.5)
            for pos, height in enumerate(heights):
                ax.text(pos, height, f"{height:.1f}", ha="center", va="bottom", fontsize=8)
            # Row labels only on the left edge, column titles only on top.
            if col_no == 0:
                ax.set_ylabel(agg, fontsize=11)
            if row_no == 0:
                ax.set_title(heading, fontsize=11)
            ax.grid(alpha=0.3, axis="y")
            ax.tick_params(axis="x", rotation=20, labelsize=9)
            # Extra vertical margin around the bars and their value labels.
            ax.margins(y=0.18)

    fig.suptitle("B3 latencies per policy — mean / p50 / p90 / p99")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def fig_b3_apc_vs_upper(comp: dict, upper: dict, out: Path) -> None:
    """Bar chart of achieved APC (%) per policy with two ceiling lines.

    Args:
        comp: policy comparison; ``apc_ratio`` is read from every row.
        upper: bounds dict providing ``apc_upper_intra_session`` (dashed
            line) and ``apc_upper_any_session`` (dotted line).
        out: path of the PNG to write.
    """
    row_of = {row["policy"]: row for row in comp["rows"]}
    names = [name for name in POLICY_ORDER if name in row_of]
    achieved_pct = [row_of[name]["apc_ratio"] * 100 for name in names]

    fig, ax = plt.subplots(figsize=(6.5, 4))
    rects = ax.bar(
        names,
        achieved_pct,
        color=[POLICY_COLOR.get(name, "gray") for name in names],
        edgecolor="black",
        linewidth=0.5,
    )

    # (key in `upper`, line style, colour, legend caption)
    ceilings = (
        ("apc_upper_intra_session", "--", "black", "intra-session ceiling"),
        ("apc_upper_any_session", ":", "darkgray", "any-session ceiling"),
    )
    for field, dash, shade, caption in ceilings:
        level = upper[field] * 100
        ax.axhline(level, linestyle=dash, color=shade, alpha=0.7,
                   label=f"{caption} {level:.1f}%")

    # Value label centred one unit above each bar.
    for rect, pct in zip(rects, achieved_pct):
        centre = rect.get_x() + rect.get_width() / 2
        ax.text(centre, pct + 1, f"{pct:.1f}%", ha="center", fontsize=9)

    ax.set_ylim(0, 100)
    ax.set_ylabel("APC ratio (%)")
    ax.set_title("B3: APC achieved vs theoretical ceiling")
    ax.legend(loc="upper right", fontsize=9)
    ax.grid(alpha=0.3, axis="y")

    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def fig_b3_failure_breakdown(comp: dict, out: Path) -> None:
    """Stacked bars of slow-request counts per policy, split by cause.

    For every policy the optional ``failure_counts`` mapping of its row is
    read; a missing mapping or a missing cause counts as 0. The stack total
    is printed above each bar as ``n=<total>``.

    Args:
        comp: policy comparison (``comp["rows"]``).
        out: path of the PNG to write.
    """
    row_of = {row["policy"]: row for row in comp["rows"]}
    names = [name for name in POLICY_ORDER if name in row_of]

    # Insertion order is the stacking order, bottom to top.
    palette = {
        "same_worker_prefill_overlap": "#d62728",
        "hot_worker_queue": "#ff7f0e",
        "cache_miss_large_append": "#1f77b4",
        "high_kv_occupancy": "#8c564b",
        "unknown": "#7f7f7f",
    }

    fig, ax = plt.subplots(figsize=(7, 4.5))
    stack_top = [0.0] * len(names)
    for cause, shade in palette.items():
        counts = [
            (row_of[name].get("failure_counts") or {}).get(cause, 0)
            for name in names
        ]
        ax.bar(names, counts, bottom=stack_top, label=cause.replace("_", " "),
               color=shade, edgecolor="black", linewidth=0.3)
        # Build a new list so the one already handed to ax.bar is untouched.
        stack_top = [base + extra for base, extra in zip(stack_top, counts)]

    for pos, total in enumerate(stack_top):
        ax.text(pos, total + 3, f"n={int(total)}", ha="center", fontsize=9)

    ax.set_ylabel("slow request count (TTFT > 2× p90 threshold)")
    ax.set_title("B3: slow-request cause breakdown per policy")
    ax.legend(fontsize=8, loc="upper right")
    ax.grid(alpha=0.3, axis="y")

    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def fig_b3_per_worker_ttft(results_dir: Path, comp: dict, out: Path) -> None:
    """Per-worker TTFT p90 grouped bars; title shows median + max worker p90.

    One panel is drawn per policy (POLICY_ORDER, restricted to policies in
    ``comp["rows"]``) from ``<results_dir>/per_worker_<policy>.json``, whose
    ``per_worker_ttft_p90_s`` mapping is keyed by strings ending in
    ``:<int>``. Bars are sorted by that integer and labelled
    ``e<int - 8000>``.

    We deliberately do NOT report a max/median 'hotspot index' here: it is a
    ratio and treats unified (most workers fast, one hot) as worse than
    sticky (all workers slow), which inverts the actual user-facing p90.

    A panel whose file is missing, or whose mapping is empty, shows
    "<policy>: no data". When no policy is left to plot, nothing is written.

    Args:
        results_dir: directory holding the per_worker_<policy>.json files.
        comp: policy comparison; only the ``policy`` field of each row is used.
        out: path of the PNG to write.
    """
    import statistics

    by = {r["policy"]: r for r in comp["rows"]}
    pols = [p for p in POLICY_ORDER if p in by]
    if not pols:
        # plt.subplots() rejects a zero-column grid, so there is nothing to
        # draw; skip the figure like fig_b3_latency_full_grid does.
        return

    # squeeze=False always yields a 2-D axes array, so the single-policy case
    # needs no special handling.
    fig, grid = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4),
                             sharey=True, squeeze=False)
    axes = grid[0]

    for ax, pol in zip(axes, pols):
        path = results_dir / f"per_worker_{pol}.json"
        per = {}
        if path.exists():
            payload = json.loads(path.read_text(encoding="utf-8"))
            per = payload.get("per_worker_ttft_p90_s") or {}
        if not per:
            # Covers a missing file and a file with no per-worker entries;
            # the latter would otherwise make median()/max() raise.
            ax.text(0.5, 0.5, f"{pol}: no data", ha="center", va="center",
                    transform=ax.transAxes)
            continue

        items = sorted(per.items(), key=lambda kv: int(kv[0].rsplit(":", 1)[1]))
        labels = [f"e{int(k.rsplit(':', 1)[1]) - 8000}" for k, _ in items]
        vals = [v for _, v in items]

        ax.bar(labels, vals, color=POLICY_COLOR.get(pol, "gray"),
               edgecolor="black", linewidth=0.5)
        for i, v in enumerate(vals):
            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)

        median_v = statistics.median(vals)
        max_v = max(vals)
        ax.set_title(
            f"{pol}\nmedian {median_v:.1f}s · max {max_v:.1f}s",
            fontsize=10,
        )
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(alpha=0.3, axis="y")

    axes[0].set_ylabel("worker TTFT p90 (s)")
    fig.suptitle("B3 per-worker TTFT p90 distribution")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def fig_b2_curves(b2: dict, out_tpot: Path, out_ttft: Path) -> None:
    """Plot B2 interference curves against prefill size; one PNG per metric.

    Each figure has two panels. The left panel shows the absolute p90 value
    with and without overlap for the "same" and "different" variants on
    log-log axes. The right panel shows the interference index
    ``overlap / clean`` with a reference line at 1.0.

    Args:
        b2: sweep summary; each entry of ``b2["rows"]`` has ``variant``
            ("same" or "different"), ``prefill_size`` and the fields
            ``{tpot,ttft}_p90_overlap_s`` / ``{tpot,ttft}_p90_clean_s``.
        out_tpot: path of the TPOT PNG (values multiplied by 1000, axis
            labelled in ms).
        out_ttft: path of the TTFT PNG.

    Cells are skipped when either value is missing, or when the clean value
    is 0: the ratio is undefined there and 0 cannot be shown on a log axis.
    """
    sizes = sorted({r["prefill_size"] for r in b2["rows"]})
    by_var = {"same": {}, "different": {}}
    for r in b2["rows"]:
        by_var[r["variant"]][r["prefill_size"]] = r

    for name, key, ylabel, ymax_log, out in [
        ("TPOT", "tpot_p90", "TPOT p90 (ms)", True, out_tpot),
        ("TTFT", "ttft_p90", "TTFT p90 (s)", True, out_ttft),
    ]:
        # Display multiplier depends only on the metric, so compute it once.
        scale = 1000 if name == "TPOT" else 1.0

        fig, axes = plt.subplots(1, 2, figsize=(11, 4))
        ax_abs, ax_idx = axes
        for variant in ("different", "same"):
            xs, ys_o, ys_c, idxs = [], [], [], []
            for sz in sizes:
                r = by_var[variant].get(sz)
                if not r:
                    continue
                ov = r.get(f"{key}_overlap_s")
                cl = r.get(f"{key}_clean_s")
                if ov is None or cl is None:
                    continue
                if cl == 0:
                    # Avoid ZeroDivisionError in the index below.
                    continue
                xs.append(sz)
                ys_o.append(ov * scale)
                ys_c.append(cl * scale)
                idxs.append(ov / cl)

            color = "#d62728" if variant == "same" else "#1f77b4"
            ax_abs.plot(xs, ys_o, "o-", color=color,
                        label=f"{variant} (overlap)")
            ax_abs.plot(xs, ys_c, "s--", color=color, alpha=0.5,
                        label=f"{variant} (clean)")
            ax_idx.plot(xs, idxs, "o-", color=color, label=variant,
                        linewidth=2)

        ax_abs.set_xscale("log", base=2)
        ax_abs.set_yscale("log")
        ax_abs.set_xlabel("prefill injection size (tokens)")
        ax_abs.set_ylabel(ylabel + " (log)")
        ax_abs.set_title(f"B2 {name} absolute (overlap vs clean)")
        ax_abs.legend(fontsize=8)
        ax_abs.grid(alpha=0.3, which="both")

        ax_idx.set_xscale("log", base=2)
        if ymax_log:
            ax_idx.set_yscale("log")
        ax_idx.axhline(1.0, color="black", linestyle=":", alpha=0.5)
        ax_idx.set_xlabel("prefill injection size (tokens)")
        ax_idx.set_ylabel(f"{name} idx = overlap / clean")
        ax_idx.set_title(f"B2 {name} interference index (same vs different worker)")
        ax_idx.legend()
        ax_idx.grid(alpha=0.3, which="both")

        fig.tight_layout()
        fig.savefig(out, dpi=120)
        plt.close(fig)
|
||
|
||
|
||
def fig_reuse_decomposition(reuse: dict, out: Path) -> None:
    """Draw one horizontal stacked bar splitting cached tokens by reuse kind.

    Args:
        reuse: dict whose optional ``fractions`` mapping holds ``intra``,
            ``cross``, ``shared`` and ``unclassified``; an absent mapping or
            key counts as 0.
        out: path of the PNG to write.
    """
    fractions = reuse.get("fractions") or {}

    # (legend caption, key in `fractions`, colour), drawn left to right.
    segments = (
        ("intra-session", "intra", "#2ca02c"),
        ("cross-session", "cross", "#ff7f0e"),
        ("shared-prefix", "shared", "#9467bd"),
        ("unclassified", "unclassified", "#7f7f7f"),
    )
    shares = [fractions.get(field, 0) for _, field, _ in segments]

    fig, ax = plt.subplots(figsize=(6, 3))
    offset = 0.0
    for (caption, _, shade), share in zip(segments, shares):
        ax.barh(
            ["lmetric run"],
            [share],
            left=[offset],
            color=shade,
            edgecolor="black",
            linewidth=0.5,
            label=f"{caption} ({share * 100:.1f}%)",
        )
        offset += share

    ax.set_xlabel("fraction of cached_tokens")
    ax.set_xlim(0, 1)
    ax.set_title("Real reuse decomposition (w600 lmetric run)")
    ax.legend(fontsize=9, loc="lower right")
    ax.grid(alpha=0.3, axis="x")

    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def fig_kv_footprint_cdf(kv: dict, out: Path) -> None:
    """How many concurrent decodes fit per percentile, under three deployments.

    KV pool assumption: 96 GiB H20 HBM split ~50% model params (Qwen3-Coder-
    30B-A3B in bf16 + headroom), ~10% runtime activations, leaving ~40% for
    the KV cache pool — i.e. ~38.4 GiB per instance.

    For each request-size percentile, we report system-wide concurrent
    decode capacity = N_D × floor(KV_pool / req_size_MiB) under three 8-GPU
    deployments: all-combined, 4P+4D, 6P+2D. The point is that going from
    combined 8C to 4P+4D halves the system's decode population at the
    same per-request KV pressure.

    Args:
        kv: KV footprint summary; its optional ``kv_mib_per_request``
            mapping supplies p50 / p90 / p95 / p99 request sizes in MiB.
            A missing, null or zero percentile is plotted as capacity 0.
        out: path of the PNG to write.
    """
    import numpy as _np

    s = kv.get("kv_mib_per_request") or {}
    pct_keys = ["p50", "p90", "p95", "p99"]
    # `or 0.0` also covers an explicit null, which float() would reject.
    req_mib = [float(s.get(k) or 0.0) for k in pct_keys]
    req_gib = [v / 1024 for v in req_mib]

    hbm_gib = 96.0
    kv_pool_frac = 0.40
    kv_pool_mib = hbm_gib * kv_pool_frac * 1024  # ≈ 39322 MiB per instance

    # Requests that fit in one instance's KV pool. This does not depend on
    # the deployment, so it is computed once and reused for the bars and for
    # the tick annotations.
    per_inst = [int(kv_pool_mib // r) if r > 0 else 0 for r in req_mib]

    # (legend label, number of decode-capable instances N_D, colour)
    deploys = [
        ("Combined 8C", 8, "#2ca02c"),
        ("PD-disagg 4P+4D", 4, "#ff7f0e"),
        ("PD-disagg 6P+2D", 2, "#d62728"),
    ]

    x = _np.arange(len(pct_keys))
    bar_w = 0.26

    fig, ax = plt.subplots(figsize=(9, 5.2))
    for i, (label, n_d, color) in enumerate(deploys):
        sys_cap = [n_d * pi for pi in per_inst]
        # (i - 1) centres the three bars of a group on the tick position.
        bars = ax.bar(x + (i - 1) * bar_w, sys_cap, bar_w,
                      label=f"{label} (N_D={n_d})",
                      color=color, edgecolor="black", linewidth=0.5)
        for b, n in zip(bars, sys_cap):
            ax.text(b.get_x() + b.get_width() / 2, n, str(n),
                    ha="center", va="bottom", fontsize=9, color="#333")

    # Per-request KV size and per-instance fit go into the x tick labels.
    annot = [
        f"{pct}\n{rg:.1f} GiB / req\nfits {pi}/inst"
        for pct, rg, pi in zip(pct_keys, req_gib, per_inst)
    ]
    ax.set_xticks(x)
    ax.set_xticklabels(annot, fontsize=10)

    ax.set_ylabel("System-wide concurrent decodes")
    ax.set_title(
        f"Per-instance KV pool ≈ {kv_pool_mib / 1024:.1f} GiB "
        f"(0.4 × H20 96 GiB; remaining 0.5 model + 0.1 activation)\n"
        f"PD-disagg halves the decode population at p90+ "
        f"(Qwen3-Coder-30B-A3B, 98304 B/token)"
    )
    ax.legend(loc="upper right")
    ax.grid(alpha=0.3, axis="y")
    ax.margins(y=0.15)
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: load the result JSON files and render figures.

    Options:
        --results-dir       directory containing the input JSON files.
        --out-dir           directory for the PNGs (created if missing).
        --exclude-policies  comma-separated policies to drop from the
                            per-policy figures; this rebinds the module-level
                            POLICY_ORDER for the rest of the run.

    Prints how many of the expected figure files exist in --out-dir once
    rendering has finished.
    """
    global POLICY_ORDER

    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--exclude-policies", default="",
                        help="Comma-separated policies to drop from per-policy figures")
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
    if excluded:
        POLICY_ORDER = [pol for pol in POLICY_ORDER if pol not in excluded]
        print(f"excluding policies: {sorted(excluded)} -> kept {POLICY_ORDER}")

    comp = _load(args.results_dir, "b3_policy_comparison.json")
    upper = _load(args.results_dir, "apc_upper_w600.json")
    b2 = _load(args.results_dir, "b2_sweep_summary.json")
    reuse = _load(args.results_dir, "lmetric_reuse.json")
    kv = _load(args.results_dir, "kv_footprint_summary.json")

    # (renderer, leading positional arguments, output file names). Every
    # renderer takes its output path(s) as the trailing positional arguments.
    jobs = [
        (fig_b3_apc_vs_hotspot, (comp, upper), ["fig_b3_apc_vs_hotspot.png"]),
        (fig_b3_latency_bars, (comp,), ["fig_b3_latency_bars.png"]),
        (fig_b3_latency_full_grid, (args.results_dir,),
         ["fig_b3_latency_full_grid.png"]),
        (fig_b3_apc_vs_upper, (comp, upper), ["fig_b3_apc_vs_upper.png"]),
        (fig_b3_failure_breakdown, (comp,), ["fig_b3_failure_breakdown.png"]),
        (fig_b3_per_worker_ttft, (args.results_dir, comp),
         ["fig_b3_per_worker_ttft_p90.png"]),
        (fig_b2_curves, (b2,),
         ["fig_b2_tpot_vs_prefill.png", "fig_b2_ttft_vs_prefill.png"]),
        (fig_reuse_decomposition, (reuse,), ["fig_reuse_decomposition.png"]),
        (fig_kv_footprint_cdf, (kv,), ["fig_kv_footprint_cdf.png"]),
    ]

    written = []
    for render, leading_args, file_names in jobs:
        targets = [args.out_dir / file_name for file_name in file_names]
        render(*leading_args, *targets)
        # A renderer may skip its figure (e.g. no raw_stats data), so count
        # only files that exist. NOTE: a stale file left by an earlier run
        # in the same --out-dir is counted as well.
        written.extend(t for t in targets if t.is_file())

    print(f"wrote {len(written)} figures to {args.out_dir}")
|
||
|
||
|
||
# Render the figures only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|