Files
agentic-kvc/scripts/analyze_h4_results.py
Gahow Wang 3bc37cc6d5 PS experiments + H4 cache-gate + GPU profiling + Mooncake elif→if fix
Experiments run:
- Phase 0: kv_both has zero idle overhead (TPOT +1.3%, noise)
- PS V1 (cold prefill): REJECTED — PS always slower than cached C
- PS V1+flexD: 92.5% OK, HEAVY TTFT 7.8s (baseline 5.0s) — PS bottleneck
- V2 (C_s prefill + flexible D): E2E -9% but 6 errors, RDMA bimodal
- H4 (cache-gate): 198/200 OK, GPU imbalance 4.0x→2.0x, but HEAVY_OFFLOAD
  TTFT=11.5s due to RDMA. HEAVY_COLO improved 10.5% from better balance.
- H5: Mooncake RDMA transfer R²=0.095, bimodal (0.6s or 18-30s)

Key findings:
- Mooncake lacks layerwise KV transfer → RDMA is pure sequential overhead
- 92% of HEAVY are turn-1 cold → offloading cold requests always loses
- GPU balance improvement from routing IS real (-10.5% HEAVY_COLO TTFT)
- RDMA transfer negates the routing benefit for offloaded requests

Code changes:
- bench.sh: add GPU timeline monitoring (gpu_monitor.sh during benchmark)
- cache_aware_proxy.py: H4 cache-gate, flexible D, PS routing
- mooncake_connector.py: elif→if fix (allow dual prefill+decode flags)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-23 02:14:37 +08:00

97 lines
4.5 KiB
Python

"""Analyze H4 cache-aware gate experiment results."""
import json
import sys
from collections import Counter
# Results directory: first CLI argument if given, otherwise the default
# H4 output location.
outdir = sys.argv[1] if len(sys.argv) > 1 else "outputs/h4_cache_gate"

# metrics.jsonl holds one JSON object per line. Read it inside a context
# manager so the handle is closed, and skip blank lines (e.g. a trailing
# newline at the end of the file) instead of crashing in json.loads.
with open(f"{outdir}/metrics.jsonl", encoding="utf-8") as metrics_file:
    rows = [json.loads(line) for line in metrics_file if line.strip()]

# A row counts as failed when it carries a truthy "error" field.
ok = [r for r in rows if not r.get("error")]
fail = [r for r in rows if r.get("error")]
def p(v, q):
    """Return the q-quantile of *v*, or 0 when *v* is empty.

    The values are sorted and the element at index ``int(q * len(v))`` is
    returned, with the index clamped to the last element. No interpolation
    is performed.
    """
    if not v:
        return 0
    ordered = sorted(v)
    rank = min(int(q * len(v)), len(v) - 1)
    return ordered[rank]
# Latency samples over successful requests. TTFT and TPOT samples are only
# taken from rows that report a truthy value (TPOT additionally must be > 0);
# end-to-end latency is taken from every successful row.
ttfts = sorted(r["ttft_s"] for r in ok if r.get("ttft_s"))
tpots = sorted(r["tpot_s"] for r in ok if r.get("tpot_s") and r["tpot_s"] > 0)
e2es = sorted(r["latency_s"] for r in ok)

# Report header followed by the one-line overall summary.
rule = "=" * 70
print(rule)
print("H4 Cache-Aware Offload Gate Results")
print(rule)
print(
    f"OK={len(ok)}/{len(rows)} "
    f"TTFT50={p(ttfts, 0.5):.3f} TTFT90={p(ttfts, 0.9):.3f} "
    f"TPOT90={p(tpots, 0.9):.4f} "
    f"E2E50={p(e2es, 0.5):.3f} E2E90={p(e2es, 0.9):.3f}"
)
# Per-class breakdown: bucket successful requests that reported a TTFT by
# input length (half-open [lower, upper) ranges) and print the same latency
# percentiles as the overall summary for each non-empty bucket.
length_buckets = (
    ("WARM", 0, 5000),
    ("MED", 5000, 20000),
    ("HEAVY", 20000, 200000),
)
for label, lower, upper in length_buckets:
    members = [
        rec for rec in ok
        if lower <= rec["input_length"] < upper and rec.get("ttft_s")
    ]
    if not members:
        continue  # nothing to report for an empty bucket
    ttft_vals = sorted(rec["ttft_s"] for rec in members)
    tpot_vals = sorted(
        rec["tpot_s"] for rec in members
        if rec.get("tpot_s") and rec["tpot_s"] > 0
    )
    e2e_vals = sorted(rec["latency_s"] for rec in members)
    print(
        f" {label:6s} n={len(members):3d} "
        f"TTFT50={p(ttft_vals, 0.5):.3f} TTFT90={p(ttft_vals, 0.9):.3f} "
        f"TPOT90={p(tpot_vals, 0.9):.4f} "
        f"E2E50={p(e2e_vals, 0.5):.3f} E2E90={p(e2e_vals, 0.9):.3f}"
    )
def cache_ratio_of(entry):
    """Return the cache ratio recorded for one breakdown entry.

    Uses ``entry["cache_ratio"]`` when it is present and not ``None``.
    Otherwise the ratio is derived as ``cache_hit / input_length``, where a
    missing or ``None`` ``cache_hit`` counts as 0 and the denominator is
    clamped to at least 1 so a zero or missing length cannot divide by zero.
    The fallback is only evaluated when it is actually needed.
    """
    ratio = entry.get("cache_ratio")
    if ratio is not None:
        return ratio
    hit = entry.get("cache_hit") or 0
    length = entry.get("input_length") or 1
    return hit / max(length, 1)


def sorted_spans(entries, start_key, end_key):
    """Return the sorted list of ``entry[end_key] - entry[start_key]``.

    Entries where either timestamp is missing or falsy are skipped, so one
    incomplete record cannot abort the whole report with a KeyError.
    """
    spans = []
    for entry in entries:
        start, end = entry.get(start_key), entry.get(end_key)
        if start and end:
            spans.append(end - start)
    spans.sort()
    return spans


# Route distribution from breakdown.json. This section is best-effort: the
# file may be absent or incomplete, in which case the error is reported and
# the rest of the script still runs.
try:
    with open(f"{outdir}/breakdown.json", encoding="utf-8") as breakdown_file:
        bd = json.load(breakdown_file)

    # A None route_class is treated like a missing one so that counting,
    # sorting and the prefix test below all operate on strings.
    rc = Counter(b.get("route_class") or "" for b in bd)
    print("\nRoute class distribution:")
    for cls, cnt in sorted(rc.items()):
        print(f" {cls}: {cnt}")

    heavy = [b for b in bd if (b.get("route_class") or "").startswith("HEAVY")]
    reasons = Counter(b.get("offload_reason", "") for b in heavy)
    print(f"\nHEAVY offload reasons: {dict(reasons)}")

    colo = [b for b in bd if b.get("route_class") == "HEAVY_COLO"]
    offloaded = [b for b in bd if b.get("route_class") == "HEAVY_OFFLOAD"]
    print(f"\nHEAVY_COLO (cold, no RDMA): {len(colo)}")
    print(f"HEAVY_OFFLOAD (cached, RDMA): {len(offloaded)}")

    # Cache ratio distribution for HEAVY: summarized, not printed per entry.
    print("\nCache ratio distribution for HEAVY:")
    ratios = sorted(cache_ratio_of(b) for b in heavy)
    if ratios:
        print(f" min={min(ratios):.2f} p50={p(ratios,.5):.2f} mean={sum(ratios)/len(ratios):.2f} max={max(ratios):.2f}")
        # NOTE(review): 0.3 is presumably the proxy's offload gate threshold;
        # confirm it matches the value used by the router.
        print(f" >=0.3 (would offload): {sum(1 for r in ratios if r >= 0.3)}")
        print(f" <0.3 (stays colo): {sum(1 for r in ratios if r < 0.3)}")

    # TTFT per HEAVY route, measured from proxy receipt to first token.
    colo_ttft = sorted_spans(colo, "t_proxy_recv", "t_first_token")
    if colo_ttft:
        print(f"\n HEAVY_COLO TTFT: p50={p(colo_ttft,.5):.2f}s p90={p(colo_ttft,.9):.2f}s")

    off_ttft = sorted_spans(offloaded, "t_proxy_recv", "t_first_token")
    if off_ttft:
        print(f" HEAVY_OFFLOAD TTFT: p50={p(off_ttft,.5):.2f}s p90={p(off_ttft,.9):.2f}s")

    # Offloaded requests are split into the prefill span and the span from
    # prefill completion to first token (reported as the KV transfer time).
    pf = sorted_spans(offloaded, "t_prefill_sent", "t_prefill_done")
    kv = sorted_spans(offloaded, "t_prefill_done", "t_first_token")
    if pf:
        print(f" Offload prefill: p50={p(pf,.5):.2f}s p90={p(pf,.9):.2f}s")
    if kv:
        print(f" Offload KV xfer: p50={p(kv,.5):.2f}s p90={p(kv,.9):.2f}s")
except Exception as exc:  # deliberate best-effort boundary; report and continue
    print(f"Breakdown analysis error: {exc}")
# Show at most five failed requests, with the error text truncated to 80
# characters. A failed row may never have recorded its input length, so it
# is read with .get() and shown as "?" rather than raising KeyError and
# stopping the script before the comparison table is printed.
if fail:
    print(f"\nFailed requests ({len(fail)}):")
    for r in fail[:5]:
        print(f" input={r.get('input_length', '?')} error={str(r['error'])[:80]}")
# Comparison table: fixed reference lines for earlier experiments, followed
# by the figures computed from this run.
separator = "=" * 70
prior_results = (
    "Baseline 8C plain: OK=198/200 TTFT50=1.075 TTFT90=9.384 TPOT90=0.0761 E2E50=5.075",
    "Phase0A 7C kv_both: OK=198/200 TTFT50=1.073 TPOT90=0.0738 E2E50=5.096",
    "V2 all-offload: OK=179/185 TTFT50=0.762 TPOT90=0.0746 E2E50=4.628",
)

print()
print(separator)
print("Comparison with all prior experiments")
print(separator)
for reference_line in prior_results:
    print(reference_line)
print(
    f"H4 cache-aware gate: OK={len(ok)}/{len(rows)} "
    f"TTFT50={p(ttfts, 0.5):.3f} TTFT90={p(ttfts, 0.9):.3f} "
    f"TPOT90={p(tpots, 0.9):.4f} "
    f"E2E50={p(e2es, 0.5):.3f}"
)