Files
agentic-kvc/v2/exp_a_tier_latency/plot.py
Gahow Wang 837df6bc9e v2 exp(a): three-tier KV-hit latency microbench (GPU >> CPU >> miss)
Measures TTFT to serve a reused prefix of length L from each KV tier on a
single H20 (Qwen3-Coder-30B-A3B, vLLM 0.18.1): miss (recompute), CPU-tier
hit (native DRAM offload), GPU-tier hit (HBM prefix cache). Each measured
request is bracketed by /metrics scrapes so the tier is verified
(vllm:prefix_cache_hits vs external_prefix_cache_hits), not assumed.

Result: GPU hit is ~flat (42->111 ms over 1k->64k tokens); CPU hit is
transfer-bound (PCIe H2D ~54 GB/s, 57->272 ms); miss grows superlinearly
(78 ms -> 15.2 s). GPU beats CPU 1.4-2.5x (gap grows with context);
miss/CPU up to 56x, miss/GPU up to 137x. pcie_transfer.py is the
independent CPU-hit floor backstop. Evidence for the GPU-hit-first
principle (paper section 2.2).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 11:23:04 +08:00

69 lines
2.6 KiB
Python

"""Plot exp (a): TTFT vs prefix length for miss / gpu-hit / cpu-hit (+ PCIe floor)."""
import json
import sys
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Directory the per-tier result JSON files are read from: first CLI argument
# when given, otherwise the repository-relative default below.
R = Path(sys.argv[1] if len(sys.argv) > 1 else "v2/exp_a_tier_latency/results")
# Path the rendered figure is saved to: second CLI argument when given,
# otherwise the repository-relative default below.
FIG = Path(sys.argv[2] if len(sys.argv) > 2 else "v2/figs/exp_a_tier_latency.png")
# NOTE(review): not referenced anywhere in the visible script. Presumably the
# KV-cache footprint of one token in bytes for the benchmarked model — confirm
# against the model config before relying on it.
KV_BYTES_PER_TOKEN = 98304
def load(name):
    """Read one result file from the results directory ``R``.

    Args:
        name: File name relative to ``R`` (for example ``"miss.json"``).

    Returns:
        The decoded JSON document, or ``None`` when the file does not exist,
        so callers can skip tiers that were not measured.
    """
    p = R / name
    if not p.exists():
        return None
    # Open through a context manager so the handle is closed deterministically
    # rather than whenever the garbage collector gets to it.
    with open(p, encoding="utf-8") as fh:
        return json.load(fh)
# Load each tier's results; any of these is None when its file is absent.
miss = load("miss.json")
gpu = load("gpu.json")
cpu = load("cpu.json")
pcie = load("pcie.json")
def series(d):
    """Turn one tier's result document into plottable x/y lists.

    Args:
        d: Decoded result document holding a ``"by_length"`` mapping from
            prefix length (string key) to a stats dict, or a falsy value
            when the tier has no results.

    Returns:
        Two parallel lists: the prefix lengths as ints in ascending numeric
        order, and the matching ``ttft_p50`` values. Both are empty when
        ``d`` is falsy. Entries whose ``ttft_p50`` is missing or ``None`` are
        skipped instead of raising ``KeyError``, matching how the summary
        table tolerates lengths without a p50.
    """
    if not d:
        return [], []
    points = sorted(
        (
            (int(length), stats["ttft_p50"])
            for length, stats in d["by_length"].items()
            if stats.get("ttft_p50") is not None
        ),
        # Sort on the numeric length only; string keys would order "16384"
        # before "4096".
        key=lambda point: point[0],
    )
    lengths = [length for length, _ in points]
    values = [value for _, value in points]
    return lengths, values
# Figure: one TTFT-vs-prefix-length curve per tier that has data, plus the
# PCIe transfer floor as a dashed reference line when pcie.json was loaded.
fig, ax = plt.subplots(figsize=(7.2, 5.0))

# (result document, legend label, marker, colour), drawn in this order.
tier_styles = (
    (miss, "miss (recompute)", "o", "#d62728"),
    (cpu, "CPU-tier hit (DRAM offload)", "s", "#ff7f0e"),
    (gpu, "GPU-tier hit (HBM APC)", "^", "#2ca02c"),
)
for tier_doc, tier_label, tier_marker, tier_color in tier_styles:
    lengths, ttfts = series(tier_doc)
    if not lengths:
        # Nothing loaded for this tier; leave it out of the plot and legend.
        continue
    ax.plot(
        lengths,
        ttfts,
        marker=tier_marker,
        label=tier_label,
        color=tier_color,
        linewidth=2,
        markersize=7,
    )

if pcie:
    floor_points = sorted(
        (int(k), v["transfer_s"]) for k, v in pcie["by_length"].items()
    )
    floor_lengths = [length for length, _ in floor_points]
    floor_seconds = [seconds for _, seconds in floor_points]
    ax.plot(
        floor_lengths,
        floor_seconds,
        "--",
        color="#7f7f7f",
        linewidth=1.4,
        label="CPU-hit transfer floor (PCIe H2D)",
    )

# Log-log axes: base 2 on x, default base on y.
ax.set_xscale("log", base=2)
ax.set_yscale("log")
ax.set_xlabel("Reused prefix length (tokens)")
ax.set_ylabel("TTFT (s, log)")
ax.set_title("Cost of serving a reused prefix from each KV tier\nQwen3-Coder-30B-A3B, 1xH20")
ax.grid(True, which="both", alpha=0.3)
ax.legend()

# Create the output directory on demand before writing the image.
FIG.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(FIG, dpi=140)
print("wrote", FIG)
# Table: one row per prefix length present in any tier, showing each tier's
# p50 TTFT and the miss/cpu and cpu/gpu ratios.
def _p50(doc, length):
    """Return ``ttft_p50`` for ``length`` in ``doc``, or ``None`` if unavailable."""
    if not doc:
        return None
    return doc["by_length"].get(str(length), {}).get("ttft_p50")


def _fmt_secs(value):
    """Format a latency with four decimals, or a dash placeholder for ``None``."""
    return f"{value:.4f}" if value is not None else " - "


def _fmt_ratio(num, den):
    """Format ``num/den`` as e.g. ``2.5x``, or a dash when it cannot be computed."""
    # Only a missing operand or a zero denominator suppresses the ratio; a
    # numerator of exactly 0.0 is a real measurement and is still reported.
    if num is None or not den:
        return " -"
    return f"{num/den:.1f}x"


print(f"\n{'L':>7} {'miss(s)':>10} {'cpu(s)':>10} {'gpu(s)':>10} {'miss/cpu':>9} {'cpu/gpu':>9}")
allL = sorted({int(k) for d in (miss, gpu, cpu) if d for k in d["by_length"]})
for length in allL:
    m = _p50(miss, length)
    c = _p50(cpu, length)
    g = _p50(gpu, length)
    r1 = _fmt_ratio(m, c)
    r2 = _fmt_ratio(c, g)
    print(f"{length:>7} {_fmt_secs(m):>10} {_fmt_secs(c):>10} {_fmt_secs(g):>10} {r1:>9} {r2:>9}")

if cpu:
    # Per-length "verified_frac" values exactly as stored in cpu.json (None when absent).
    vf = {k: v.get("verified_frac") for k, v in cpu["by_length"].items()}
    print("\nCPU-tier verified fraction (ext_hits>0):", vf)