# agentic-kvc/scripts/analyze_h4_results.py

"""Analyze H4 cache-aware gate experiment results."""
import json
import sys
from collections import Counter

# Results directory: first command-line argument, or the default output path.
outdir = sys.argv[1] if len(sys.argv) > 1 else "outputs/h4_cache_gate"

# metrics.jsonl holds one JSON object per line. The context manager closes the
# file promptly, and blank lines (such as a trailing newline) are skipped
# instead of crashing json.loads.
with open(f"{outdir}/metrics.jsonl", encoding="utf-8") as metrics_file:
    rows = [json.loads(line) for line in metrics_file if line.strip()]

# A request counts as failed when its "error" field is present and truthy.
ok = [r for r in rows if not r.get("error")]
fail = [r for r in rows if r.get("error")]


def p(v, q):
    """Return the q-quantile of v, or 0 when v is empty.

    The values are sorted and the element at index int(q * len(v)) is
    returned, clamped to the last element so that q == 1 stays in range.
    """
    if not v:
        return 0
    ordered = sorted(v)
    return ordered[min(int(q * len(ordered)), len(ordered) - 1)]


# Missing or zero TTFT/TPOT samples are left out of the latency lists;
# end-to-end latency is taken from every successful request.
ttfts = sorted(r["ttft_s"] for r in ok if r.get("ttft_s"))
tpots = sorted(r["tpot_s"] for r in ok if r.get("tpot_s") and r["tpot_s"] > 0)
e2es = sorted(r["latency_s"] for r in ok)

# Report banner, then the headline percentiles computed from the sorted
# TTFT / TPOT / end-to-end latency lists.
rule = "=" * 70
print(rule, "H4 Cache-Aware Offload Gate Results", rule, sep="\n")
summary = (
    f"OK={len(ok)}/{len(rows)}  "
    f"TTFT50={p(ttfts, 0.5):.3f} TTFT90={p(ttfts, 0.9):.3f}  "
    f"TPOT90={p(tpots, 0.9):.4f}  "
    f"E2E50={p(e2es, 0.5):.3f} E2E90={p(e2es, 0.9):.3f}"
)
print(summary)

# Per-class breakdown
# Successful requests are bucketed by input_length into half-open
# [lower, upper) ranges; only requests with a truthy ttft_s are counted, and
# a class with no such requests prints nothing.
length_classes = (
    ("WARM", 0, 5000),
    ("MED", 5000, 20000),
    ("HEAVY", 20000, 200000),
)
for label, lower, upper in length_classes:
    bucket = [r for r in ok if lower <= r["input_length"] < upper and r.get("ttft_s")]
    if not bucket:
        continue
    bucket_ttft = sorted(r["ttft_s"] for r in bucket)
    bucket_tpot = sorted(r["tpot_s"] for r in bucket if r.get("tpot_s") and r["tpot_s"] > 0)
    bucket_e2e = sorted(r["latency_s"] for r in bucket)
    print(
        f"  {label:6s} n={len(bucket):3d}  "
        f"TTFT50={p(bucket_ttft, 0.5):.3f} TTFT90={p(bucket_ttft, 0.9):.3f}  "
        f"TPOT90={p(bucket_tpot, 0.9):.4f}  "
        f"E2E50={p(bucket_e2e, 0.5):.3f} E2E90={p(bucket_e2e, 0.9):.3f}"
    )

def cache_ratio_of(record):
    """Return the cache-hit ratio of one breakdown record.

    A non-null "cache_ratio" field is returned as-is. Otherwise the ratio is
    derived as cache_hit / input_length, where missing or null operands fall
    back to 0 and 1 and the denominator is clamped to at least 1. The
    fallback is only computed when it is needed, so a record that already
    carries "cache_ratio" cannot fail on its other fields.
    """
    ratio = record.get("cache_ratio")
    if ratio is not None:
        return ratio
    hits = record.get("cache_hit") or 0
    length = record.get("input_length") or 1
    return hits / max(length, 1)


# Route distribution from breakdown
try:
    with open(f"{outdir}/breakdown.json", encoding="utf-8") as breakdown_file:
        bd = json.load(breakdown_file)

    # A missing or null route_class is counted under "" so that sorting the
    # class names and the prefix test below cannot fail on None.
    routes = [b.get("route_class") or "" for b in bd]
    rc = Counter(routes)
    print("\nRoute class distribution:")
    for cls, cnt in sorted(rc.items()):
        print(f"  {cls}: {cnt}")

    heavy = [b for b, route in zip(bd, routes) if route.startswith("HEAVY")]
    reasons = Counter(b.get("offload_reason", "") for b in heavy)
    print(f"\nHEAVY offload reasons: {dict(reasons)}")

    colo = [b for b, route in zip(bd, routes) if route == "HEAVY_COLO"]
    offloaded = [b for b, route in zip(bd, routes) if route == "HEAVY_OFFLOAD"]
    print(f"\nHEAVY_COLO (cold, no RDMA): {len(colo)}")
    print(f"HEAVY_OFFLOAD (cached, RDMA): {len(offloaded)}")

    # Cache ratio distribution for HEAVY (summary only, no per-request lines)
    print("\nCache ratio distribution for HEAVY:")
    ratios = sorted(cache_ratio_of(b) for b in heavy)
    if ratios:
        print(f"  min={min(ratios):.2f} p50={p(ratios,.5):.2f} mean={sum(ratios)/len(ratios):.2f} max={max(ratios):.2f}")
        print(f"  >=0.3 (would offload): {sum(1 for r in ratios if r >= 0.3)}")
        print(f"  <0.3 (stays colo):     {sum(1 for r in ratios if r < 0.3)}")

    # TTFT comparison: time from t_proxy_recv to t_first_token per route class
    if colo:
        colo_ttft = sorted(b["t_first_token"] - b["t_proxy_recv"] for b in colo if b.get("t_first_token"))
        if colo_ttft:
            print(f"\n  HEAVY_COLO TTFT: p50={p(colo_ttft,.5):.2f}s p90={p(colo_ttft,.9):.2f}s")
    if offloaded:
        off_ttft = sorted(b["t_first_token"] - b["t_proxy_recv"] for b in offloaded if b.get("t_first_token"))
        if off_ttft:
            print(f"  HEAVY_OFFLOAD TTFT: p50={p(off_ttft,.5):.2f}s p90={p(off_ttft,.9):.2f}s")
        # Offloaded requests are further split into the prefill span
        # (t_prefill_sent -> t_prefill_done) and the span from prefill
        # completion to the first token, which the report labels "KV xfer".
        pf = sorted(
            b["t_prefill_done"] - b["t_prefill_sent"]
            for b in offloaded
            if b.get("t_prefill_done") and b.get("t_prefill_sent")
        )
        kv = sorted(
            b["t_first_token"] - b["t_prefill_done"]
            for b in offloaded
            if b.get("t_first_token") and b.get("t_prefill_done")
        )
        if pf:
            print(f"  Offload prefill: p50={p(pf,.5):.2f}s p90={p(pf,.9):.2f}s")
        if kv:
            print(f"  Offload KV xfer: p50={p(kv,.5):.2f}s p90={p(kv,.9):.2f}s")

except Exception as exc:
    # Best-effort section: a missing or malformed breakdown file is reported
    # and the rest of the report still prints.
    print(f"Breakdown analysis error: {exc}")

# List at most the first five failed requests, with each error message cut
# to 80 characters.
if fail:
    print(f"\nFailed requests ({len(fail)}):")
    for failed in fail[:5]:
        message = str(failed["error"])[:80]
        print(f"  input={failed['input_length']} error={message}")

# Closing section: hard-coded reference lines printed under the
# "prior experiments" heading, followed by this run's headline numbers.
divider = "=" * 70
print()
print(divider)
print("Comparison with all prior experiments")
print(divider)
prior_results = (
    "Baseline 8C plain:    OK=198/200  TTFT50=1.075  TTFT90=9.384   TPOT90=0.0761  E2E50=5.075",
    "Phase0A 7C kv_both:   OK=198/200  TTFT50=1.073  TPOT90=0.0738  E2E50=5.096",
    "V2 all-offload:       OK=179/185  TTFT50=0.762  TPOT90=0.0746  E2E50=4.628",
)
for prior_line in prior_results:
    print(prior_line)
h4_line = (
    f"H4 cache-aware gate:  OK={len(ok)}/{len(rows)}  "
    f"TTFT50={p(ttfts, 0.5):.3f}  TTFT90={p(ttfts, 0.9):.3f}   "
    f"TPOT90={p(tpots, 0.9):.4f}  "
    f"E2E50={p(e2es, 0.5):.3f}"
)
print(h4_line)