Sweeps GPU KV-cache capacity (--num-gpu-blocks-override) under a closed-loop replay (concurrency 4) of a controlled multi-turn workload (cumulative intra-session prefix, gen_synth_trace.py), measuring realized APC (prefix_cache hits/queries delta) and latency per capacity. Result: a sharp knee at 3.6 GB = exactly the active working set (4 sessions x 0.91 GB). APC rises 7->12->36->80% then saturates at the ~71% intra-session ceiling; TTFT p90 collapses 13.0 s -> 0.53 s at the same point; dead flat to 14.5 GB, 100% completion throughout. So only the active working set needs HBM; capacity beyond it -- and the CPU/storage tier built to chase the reuse tail -- buys ~0. Knee scales linearly with concurrency = cluster GPU count. README.md ties exp(a)+exp(b) into the section-2.2 GPU-hit-first argument with tables, conclusions, and caveats. Raw per-request dumps gitignored; summary/m0/m1 deltas kept. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
72 lines
2.7 KiB
Python
72 lines
2.7 KiB
Python
"""Analyze + plot exp (b): realized APC and latency vs GPU KV capacity (the knee)."""
|
|
import json
|
|
import statistics
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Optional positional overrides: first argument is the results directory,
# second is the output figure path.
_cli = sys.argv[1:]
R = Path(_cli[0] if _cli else "v2/exp_b_capacity_knee/results")
FIG = Path(_cli[1] if len(_cli) > 1 else "v2/figs/exp_b_capacity_knee.png")

# Bytes of KV cache per block: 16 * 98304 = 1,572,864 (~1.573 MB).
# NOTE(review): presumably 16 tokens per block x 98304 bytes of KV per token;
# confirm against the serving configuration used for the sweep.
BLOCK_BYTES = 98304 * 16
|
|
|
|
|
|
def pct(v, q):
    """Return the q-quantile of *v* by rank lookup in the sorted values.

    The element at index ``int(q * len(v))`` of the sorted data is returned,
    with the index clamped to the valid range, so ``q >= 1`` yields the
    maximum and ``q <= 0`` yields the minimum.  No interpolation is done.

    Args:
        v: Iterable of mutually comparable values (may be empty).
        q: Quantile as a fraction, normally in ``[0, 1]``.

    Returns:
        The selected element, or ``0.0`` when *v* is empty.
    """
    ordered = sorted(v)
    if not ordered:
        return 0.0
    last = len(ordered) - 1
    # Clamp on both sides: without the lower bound a negative q would wrap
    # around to the end of the list (or raise IndexError).
    idx = max(0, min(int(q * len(ordered)), last))
    return ordered[idx]
|
|
|
|
|
|
def _read_json(path):
    """Parse a single JSON document from *path*, closing the file afterwards."""
    with open(path) as fh:
        return json.load(fh)


def _read_jsonl(path):
    """Parse a JSON-lines file into a list of records, skipping blank lines."""
    with open(path) as fh:
        return [json.loads(line) for line in fh if line.strip()]


def _block_count(path):
    """Return the integer that follows 'blk' in the file stem of *path*."""
    return int(path.stem.split("blk")[1])


# One summary row per metrics_blk<N>.jsonl file, ordered by N numerically
# (a plain string sort would put blk1000 before blk200).
rows = []
for mf in sorted(R.glob("metrics_blk*.jsonl"), key=_block_count):
    blk = _block_count(mf)
    gb = blk * BLOCK_BYTES / 1e9

    recs = _read_jsonl(mf)
    ok = [r for r in recs if not r.get("error")]
    # Truthiness filter: records whose value is missing, None or 0 are left
    # out of the latency samples.
    ttft = [r["ttft_s"] for r in ok if r.get("ttft_s")]
    e2e = [r["latency_s"] for r in ok if r.get("latency_s")]

    # Realized APC is the hit/query ratio over the counter deltas between the
    # m0 and m1 files (presumably snapshots taken before and after the run).
    m0 = _read_json(R / f"m0_blk{blk}.json")
    m1 = _read_json(R / f"m1_blk{blk}.json")
    dq = m1["gpu_queries"] - m0["gpu_queries"]
    dh = m1["gpu_hits"] - m0["gpu_hits"]
    apc = dh / dq if dq > 0 else 0.0

    rows.append({
        "blocks": blk, "gb": gb,
        "apc": apc,
        "completion": len(ok) / len(recs) if recs else 0,
        "n_ok": len(ok), "n": len(recs),
        "ttft_p50": pct(ttft, .5), "ttft_p90": pct(ttft, .9),
        "e2e_p50": pct(e2e, .5), "e2e_p90": pct(e2e, .9),
    })

# Human-readable table on stdout.
print(f"{'GB':>6} {'blocks':>7} {'APC':>7} {'compl':>6} {'TTFTp50':>8} {'TTFTp90':>8} {'E2Ep90':>8}")
for r in rows:
    print(f"{r['gb']:>6.1f} {r['blocks']:>7} {r['apc']:>6.1%} {r['completion']:>6.0%} "
          f"{r['ttft_p50']:>8.3f} {r['ttft_p90']:>8.3f} {r['e2e_p90']:>8.3f}")

# Machine-readable copy of the same rows; the context manager guarantees the
# file is flushed and closed before the script moves on.
with open(R / "summary.json", "w") as fh:
    json.dump(rows, fh, indent=2)
|
|
|
|
# Draw the figure only when at least one capacity point was collected.
if rows:
    # Each curve shares its colour with its own y-axis so the two scales can
    # be told apart.
    apc_color = "#2ca02c"
    ttft_color = "#d62728"

    capacity_gb = [row["gb"] for row in rows]
    apc_percent = [row["apc"] * 100 for row in rows]
    ttft_p90_s = [row["ttft_p90"] for row in rows]

    fig, ax1 = plt.subplots(figsize=(7.4, 5.0))

    # Left axis: realized APC as a percentage, fixed to the 0-100 range.
    ax1.plot(capacity_gb, apc_percent, "o-", color=apc_color,
             linewidth=2.2, markersize=8, label="Realized APC")
    ax1.set_xlabel("GPU KV-cache capacity (GB)")
    ax1.set_ylabel("Realized APC (%)", color=apc_color)
    ax1.tick_params(axis="y", labelcolor=apc_color)
    ax1.set_ylim(0, 100)
    ax1.grid(True, alpha=0.3)

    # Right axis: TTFT p90 in seconds, sharing the capacity x-axis.
    ax2 = ax1.twinx()
    ax2.plot(capacity_gb, ttft_p90_s, "s--", color=ttft_color,
             linewidth=2, markersize=7, label="TTFT p90")
    ax2.set_ylabel("TTFT p90 (s)", color=ttft_color)
    ax2.tick_params(axis="y", labelcolor=ttft_color)

    ax1.set_title("APC and latency saturate at small GPU KV capacity\n"
                  "Qwen3-Coder-30B-A3B, 1xH20, agentic trace replay")
    fig.tight_layout()

    # Create the output directory on demand before writing the PNG.
    FIG.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(FIG, dpi=140)
    print("wrote", FIG)
|