# agentic-kvc/v2/exp_b_capacity_knee/analyze_and_plot.py

"""Analyze + plot exp (b): realized APC and latency vs GPU KV capacity (the knee)."""
import json
import statistics
import sys
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Optional positional CLI arguments: argv[1] is the results directory and
# argv[2] is the output figure path. Each falls back to a relative default.
R = Path(sys.argv[1]) if len(sys.argv) >= 2 else Path("v2/exp_b_capacity_knee/results")
FIG = Path(sys.argv[2]) if len(sys.argv) >= 3 else Path("v2/figs/exp_b_capacity_knee.png")
# Bytes per KV block: 98304 * 16 = 1,572,864 (about 1.573 MB / block).
BLOCK_BYTES = 98304 * 16


def pct(v, q):
    """Return the value of *v* found at sorted rank ``int(q * len(v))``.

    The values are sorted ascending and the rank is clamped to the last
    index, so ``q=1.0`` yields the maximum. No interpolation is performed.
    Returns ``0.0`` when *v* has no elements.
    """
    ordered = sorted(v)
    if not ordered:
        return 0.0
    last = len(ordered) - 1
    rank = int(q * len(ordered))
    return ordered[rank if rank < last else last]


# Build one summary row per capacity point. A capacity point is identified by
# the block count <N> embedded in its file names, and three files are read
# from R for each one:
#   metrics_blk<N>.jsonl  - one JSON record per line
#   m0_blk<N>.json        - counter snapshot used as the baseline
#   m1_blk<N>.json        - counter snapshot used as the end point
# NOTE(review): m0/m1 are presumably captured before/after the run -- confirm
# against the script that collects them.
rows = []
for mf in sorted(R.glob("metrics_blk*.jsonl"), key=lambda p: int(p.stem.split("blk")[1])):
    blk = int(mf.stem.split("blk")[1])
    gb = blk * BLOCK_BYTES / 1e9  # capacity in decimal gigabytes
    # The context manager closes the file deterministically; blank lines are
    # skipped because json.loads raises on an empty document.
    with open(mf, encoding="utf-8") as fh:
        recs = [json.loads(line) for line in fh if line.strip()]
    # A record counts as successful when its "error" field is absent or falsy.
    ok = [r for r in recs if not r.get("error")]
    # Truthiness filters: missing, None and zero latencies are all excluded.
    ttft = [r["ttft_s"] for r in ok if r.get("ttft_s")]
    e2e = [r["latency_s"] for r in ok if r.get("latency_s")]
    with open(R / f"m0_blk{blk}.json", encoding="utf-8") as fh:
        m0 = json.load(fh)
    with open(R / f"m1_blk{blk}.json", encoding="utf-8") as fh:
        m1 = json.load(fh)
    # APC is the hit ratio over the counter deltas between the two snapshots.
    dq = m1["gpu_queries"] - m0["gpu_queries"]
    dh = m1["gpu_hits"] - m0["gpu_hits"]
    apc = dh / dq if dq > 0 else 0.0  # no queries recorded -> report 0
    rows.append({
        "blocks": blk, "gb": gb,
        "apc": apc,
        "completion": len(ok) / len(recs) if recs else 0,
        "n_ok": len(ok), "n": len(recs),
        "ttft_p50": pct(ttft, .5), "ttft_p90": pct(ttft, .9),
        "e2e_p50": pct(e2e, .5), "e2e_p90": pct(e2e, .9),
    })

# Print a fixed-width table with one line per capacity point, then persist
# the same rows as JSON next to the inputs.
print(f"{'GB':>6} {'blocks':>7} {'APC':>7} {'compl':>6} {'TTFTp50':>8} {'TTFTp90':>8} {'E2Ep90':>8}")
for r in rows:
    # Every field width matches its header width above; the APC field is 7
    # wide so the columns to its right stay aligned under their headers.
    print(f"{r['gb']:>6.1f} {r['blocks']:>7} {r['apc']:>7.1%} {r['completion']:>6.0%} "
          f"{r['ttft_p50']:>8.3f} {r['ttft_p90']:>8.3f} {r['e2e_p90']:>8.3f}")
# The context manager flushes and closes summary.json before the script moves on.
with open(R / "summary.json", "w", encoding="utf-8") as fh:
    json.dump(rows, fh, indent=2)

# Render the figure: realized APC (%) on the left axis and TTFT p90 (s) on a
# twinned right axis, both plotted against capacity in GB. Nothing is drawn
# or written when no capacity points were found.
if rows:
    apc_color = "#2ca02c"
    ttft_color = "#d62728"
    gb = [row["gb"] for row in rows]
    apc_percent = [row["apc"] * 100 for row in rows]
    ttft_p90 = [row["ttft_p90"] for row in rows]

    # Left axis: APC as a percentage, pinned to the 0-100 range.
    fig, ax1 = plt.subplots(figsize=(7.4, 5.0))
    ax1.plot(
        gb, apc_percent, "o-",
        color=apc_color, linewidth=2.2, markersize=8, label="Realized APC",
    )
    ax1.set_xlabel("GPU KV-cache capacity (GB)")
    ax1.set_ylabel("Realized APC (%)", color=apc_color)
    ax1.tick_params(axis="y", labelcolor=apc_color)
    ax1.set_ylim(0, 100)
    ax1.grid(True, alpha=0.3)

    # Right axis: shares the x axis with the APC curve, autoscaled in y.
    ax2 = ax1.twinx()
    ax2.plot(
        gb, ttft_p90, "s--",
        color=ttft_color, linewidth=2, markersize=7, label="TTFT p90",
    )
    ax2.set_ylabel("TTFT p90 (s)", color=ttft_color)
    ax2.tick_params(axis="y", labelcolor=ttft_color)

    ax1.set_title("APC and latency saturate at small GPU KV capacity\n"
                  "Qwen3-Coder-30B-A3B, 1xH20, agentic trace replay")
    fig.tight_layout()

    # Create the output directory on demand so a fresh checkout can save.
    FIG.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(FIG, dpi=140)
    print("wrote", FIG)