Files
agentic-kvc/v2/exp_b_capacity_knee/analyze_and_plot.py
Gahow Wang ad754cfe0b v2 exp(b): GPU KV-capacity APC/latency knee + writeup
Sweeps GPU KV-cache capacity (--num-gpu-blocks-override) under a closed-loop
replay (concurrency 4) of a controlled multi-turn workload (cumulative
intra-session prefix, gen_synth_trace.py), measuring realized APC
(prefix_cache hits/queries delta) and latency per capacity.

Result: a sharp knee at 3.6 GB = exactly the active working set
(4 sessions x 0.91 GB). APC rises 7->12->36->80% then saturates at the
~71% intra-session ceiling; TTFT p90 collapses 13.0 s -> 0.53 s at the same
point; dead flat to 14.5 GB, 100% completion throughout. So only the active
working set needs HBM; capacity beyond it -- and the CPU/storage tier built
to chase the reuse tail -- buys ~0. Knee scales linearly with concurrency
= cluster GPU count.

README.md ties exp(a)+exp(b) into the section-2.2 GPU-hit-first argument
with tables, conclusions, and caveats. Raw per-request dumps gitignored;
summary/m0/m1 deltas kept.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 11:23:31 +08:00

72 lines
2.7 KiB
Python

"""Analyze + plot exp (b): realized APC and latency vs GPU KV capacity (the knee)."""
import json
import statistics
import sys
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Command-line configuration, falling back to repo-relative defaults:
#   argv[1] -> results directory that is scanned for inputs and receives summary.json
#   argv[2] -> path of the PNG figure to write
R = Path("v2/exp_b_capacity_knee/results" if len(sys.argv) <= 1 else sys.argv[1])
FIG = Path("v2/figs/exp_b_capacity_knee.png" if len(sys.argv) <= 2 else sys.argv[2])

# Bytes of GPU KV cache per block: 16 * 98304 = 1,572,864 B (~1.573 MB).
# NOTE(review): presumably 16 tokens/block x 98304 B of KV per token -- confirm
# against the serving configuration used for the sweep.
BLOCK_BYTES = 16 * 98304
def pct(v, q):
    """Return the ``q``-quantile of ``v`` by index selection (no interpolation).

    Args:
        v: Iterable of mutually comparable samples. It is sorted into a new
            list; the caller's sequence is left untouched.
        q: Quantile as a fraction, e.g. ``0.9`` for p90.

    Returns:
        The element at index ``int(q * len(v))`` of the sorted samples, with
        the index clamped to ``[0, len(v) - 1]``, or ``0.0`` when ``v`` is
        empty.
    """
    ordered = sorted(v)
    if not ordered:
        return 0.0
    # Clamp both ends: q >= 1 must not run past the last sample, and a
    # negative q must not wrap around to the largest samples through
    # Python's negative indexing.
    idx = max(0, min(int(q * len(ordered)), len(ordered) - 1))
    return ordered[idx]
# Build one summary row per swept capacity point. Each point is identified by
# the integer after "blk" in the metrics file name, and points are processed
# in ascending block count so the table and the plot run left to right.
rows = []
for mf in sorted(R.glob("metrics_blk*.jsonl"), key=lambda p: int(p.stem.split("blk")[1])):
    blk = int(mf.stem.split("blk")[1])
    gb = blk * BLOCK_BYTES / 1e9  # capacity in decimal GB for this block count

    # Per-request records, one JSON object per line. Blank lines (e.g. a
    # trailing newline at end of file) are skipped instead of crashing the parse.
    with open(mf) as fh:
        recs = [json.loads(line) for line in fh if line.strip()]

    # A request counts as completed when it carries no truthy "error" field.
    ok = [r for r in recs if not r.get("error")]
    # Truthiness filter: requests whose latency field is missing, None or 0
    # are left out of the percentile inputs.
    ttft = [r["ttft_s"] for r in ok if r.get("ttft_s")]
    e2e = [r["latency_s"] for r in ok if r.get("latency_s")]

    # Counter files for this capacity point; realized APC is the hit delta
    # over the query delta between them.
    # NOTE(review): presumably m0/m1 are before/after snapshots of the run -- confirm.
    with open(R / f"m0_blk{blk}.json") as fh:
        m0 = json.load(fh)
    with open(R / f"m1_blk{blk}.json") as fh:
        m1 = json.load(fh)
    dq = m1["gpu_queries"] - m0["gpu_queries"]
    dh = m1["gpu_hits"] - m0["gpu_hits"]
    apc = dh / dq if dq > 0 else 0.0  # no queries recorded -> report 0 rather than divide by zero

    rows.append({
        "blocks": blk, "gb": gb,
        "apc": apc,
        "completion": len(ok) / len(recs) if recs else 0,
        "n_ok": len(ok), "n": len(recs),
        "ttft_p50": pct(ttft, .5), "ttft_p90": pct(ttft, .9),
        "e2e_p50": pct(e2e, .5), "e2e_p90": pct(e2e, .9),
    })

# Human-readable table on stdout, one line per capacity point.
print(f"{'GB':>6} {'blocks':>7} {'APC':>7} {'compl':>6} {'TTFTp50':>8} {'TTFTp90':>8} {'E2Ep90':>8}")
for r in rows:
    print(f"{r['gb']:>6.1f} {r['blocks']:>7} {r['apc']:>6.1%} {r['completion']:>6.0%} "
          f"{r['ttft_p50']:>8.3f} {r['ttft_p90']:>8.3f} {r['e2e_p90']:>8.3f}")

# Persist the same rows next to the inputs. The context manager guarantees the
# file is flushed and closed before the script moves on.
with open(R / "summary.json", "w") as fh:
    json.dump(rows, fh, indent=2)
# Draw the knee figure: realized APC on the left axis and TTFT p90 on a twin
# right axis, both against GPU KV-cache capacity. Nothing is drawn or written
# when no capacity point was collected.
if rows:
    apc_color, ttft_color = "#2ca02c", "#d62728"
    gb = [row["gb"] for row in rows]
    apc_percent = [row["apc"] * 100 for row in rows]  # fraction -> percent
    ttft_p90 = [row["ttft_p90"] for row in rows]

    fig, ax1 = plt.subplots(figsize=(7.4, 5.0))

    # Left axis: realized APC in percent, pinned to the 0-100 range.
    ax1.plot(
        gb, apc_percent, "o-",
        color=apc_color, linewidth=2.2, markersize=8, label="Realized APC",
    )
    ax1.set_xlabel("GPU KV-cache capacity (GB)")
    ax1.set_ylabel("Realized APC (%)", color=apc_color)
    ax1.tick_params(axis="y", labelcolor=apc_color)
    ax1.set_ylim(0, 100)
    ax1.grid(True, alpha=0.3)

    # Right axis: TTFT p90 in seconds, sharing the capacity x-axis.
    ax2 = ax1.twinx()
    ax2.plot(
        gb, ttft_p90, "s--",
        color=ttft_color, linewidth=2, markersize=7, label="TTFT p90",
    )
    ax2.set_ylabel("TTFT p90 (s)", color=ttft_color)
    ax2.tick_params(axis="y", labelcolor=ttft_color)

    title = ("APC and latency saturate at small GPU KV capacity\n"
             "Qwen3-Coder-30B-A3B, 1xH20, agentic trace replay")
    ax1.set_title(title)
    fig.tight_layout()

    # Create the output directory on demand so a fresh checkout can write the figure.
    FIG.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(FIG, dpi=140)
    print("wrote", FIG)