Experiments run:
- Phase 0: kv_both has zero idle overhead (TPOT +1.3%, noise)
- PS V1 (cold prefill): REJECTED — PS always slower than cached C
- PS V1+flexD: 92.5% OK, HEAVY TTFT 7.8s (baseline 5.0s) — PS bottleneck
- V2 (C_s prefill + flexible D): E2E -9% but 6 errors, RDMA bimodal
- H4 (cache-gate): 198/200 OK, GPU imbalance 4.0x→2.0x, but HEAVY_OFFLOAD TTFT=11.5s due to RDMA. HEAVY_COLO improved 10.5% from better balance.
- H5: Mooncake RDMA transfer R²=0.095, bimodal (0.6s or 18-30s)

Key findings:
- Mooncake lacks layerwise KV transfer → RDMA is pure sequential overhead
- 92% of HEAVY are turn-1 cold → offloading cold requests always loses
- GPU balance improvement from routing IS real (-10.5% HEAVY_COLO TTFT)
- RDMA transfer negates the routing benefit for offloaded requests

Code changes:
- bench.sh: add GPU timeline monitoring (gpu_monitor.sh during benchmark)
- cache_aware_proxy.py: H4 cache-gate, flexible D, PS routing
- mooncake_connector.py: elif→if fix (allow dual prefill+decode flags)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
97 lines
4.5 KiB
Python
97 lines
4.5 KiB
Python
"""Analyze H4 cache-aware gate experiment results.

Usage: python <this script> [OUTDIR]

Reads ``OUTDIR/metrics.jsonl`` (one JSON object per line) and, on a
best-effort basis, ``OUTDIR/breakdown.json`` (an iterable of JSON objects),
then prints latency summaries to stdout.  OUTDIR defaults to
``outputs/h4_cache_gate``.
"""
import json
import sys
from collections import Counter

DEFAULT_OUTDIR = "outputs/h4_cache_gate"
RULE = "=" * 70
# (inclusive lower bound, exclusive upper bound, label) applied to ``input_length``.
LENGTH_CLASSES = [(0, 5000, "WARM"), (5000, 20000, "MED"), (20000, 200000, "HEAVY")]
# Cache ratio at or above which the report labels a request "would offload".
OFFLOAD_RATIO = 0.3


def p(v, q):
    """Return the element of *v* at rank ``int(q * len(v))`` once sorted.

    The rank is clamped to the last index.  An empty *v* yields 0 so that a
    metric without samples can still be formatted in the report.
    """
    if not v:
        return 0
    ordered = sorted(v)
    return ordered[min(int(q * len(ordered)), len(ordered) - 1)]


def parse_jsonl(lines):
    """Decode an iterable of JSON Lines strings, skipping blank lines."""
    return [json.loads(line) for line in lines if line.strip()]


def load_jsonl(path):
    """Read a JSON Lines file, closing the handle when done."""
    with open(path, encoding="utf-8") as fh:
        return parse_jsonl(fh)


def load_json(path):
    """Read a single JSON document, closing the handle when done."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def latency_series(rows):
    """Return sorted ``(ttft, tpot, e2e)`` sample lists taken from *rows*.

    Falsy ``ttft_s`` values and non-positive ``tpot_s`` values are dropped;
    rows without ``latency_s`` are skipped instead of raising ``KeyError``.
    """
    ttft = sorted(r["ttft_s"] for r in rows if r.get("ttft_s"))
    tpot = sorted(r["tpot_s"] for r in rows if r.get("tpot_s") and r["tpot_s"] > 0)
    e2e = sorted(r["latency_s"] for r in rows if r.get("latency_s") is not None)
    return ttft, tpot, e2e


def stats_text(rows, with_e2e90=True):
    """Format the TTFT/TPOT/E2E percentile summary for *rows* as one string."""
    ttft, tpot, e2e = latency_series(rows)
    text = (
        f"TTFT50={p(ttft, .5):.3f} TTFT90={p(ttft, .9):.3f} "
        f"TPOT90={p(tpot, .9):.4f} E2E50={p(e2e, .5):.3f}"
    )
    if with_e2e90:
        text += f" E2E90={p(e2e, .9):.3f}"
    return text


def class_lines(ok):
    """Yield one summary line per non-empty ``input_length`` class."""
    for lo, hi, label in LENGTH_CLASSES:
        sub = [
            r for r in ok
            if r.get("ttft_s")
            # Rows lacking input_length belong to no class rather than crashing.
            and r.get("input_length") is not None
            and lo <= r["input_length"] < hi
        ]
        if sub:
            yield f" {label:6s} n={len(sub):3d} {stats_text(sub)}"


def _route_class(entry):
    """Return the entry's ``route_class``, treating missing/None as ``""``."""
    return entry.get("route_class") or ""


def _cache_ratio(entry):
    """Return ``cache_ratio``, or ``cache_hit / input_length`` when it is absent."""
    ratio = entry.get("cache_ratio")
    if ratio is not None:
        return ratio
    # max(..., 1) guards against division by zero for empty inputs.
    return (entry.get("cache_hit") or 0) / max(entry.get("input_length") or 1, 1)


def _elapsed(entries, start_key, end_key):
    """Return sorted ``end - start`` spans for entries that have both timestamps."""
    return sorted(
        b[end_key] - b[start_key]
        for b in entries
        if b.get(end_key) and b.get(start_key)
    )


def breakdown_lines(bd):
    """Yield the routing-breakdown section of the report, line by line.

    This is a generator so that lines produced before a failure are still
    printed by the caller, as they were when this was inline script code.
    """
    route_counts = Counter(_route_class(b) for b in bd)
    yield "\nRoute class distribution:"
    for cls, cnt in sorted(route_counts.items()):
        yield f" {cls}: {cnt}"

    heavy = [b for b in bd if _route_class(b).startswith("HEAVY")]
    reasons = Counter(b.get("offload_reason", "") for b in heavy)
    yield f"\nHEAVY offload reasons: {dict(reasons)}"

    colo = [b for b in bd if b.get("route_class") == "HEAVY_COLO"]
    offloaded = [b for b in bd if b.get("route_class") == "HEAVY_OFFLOAD"]
    yield f"\nHEAVY_COLO (cold, no RDMA): {len(colo)}"
    yield f"HEAVY_OFFLOAD (cached, RDMA): {len(offloaded)}"

    # Summarize cache ratios instead of listing every HEAVY request.
    yield "\nCache ratio distribution for HEAVY:"
    ratios = sorted(_cache_ratio(b) for b in heavy)
    if ratios:
        mean = sum(ratios) / len(ratios)
        yield (
            f" min={ratios[0]:.2f} p50={p(ratios, .5):.2f} "
            f"mean={mean:.2f} max={ratios[-1]:.2f}"
        )
        yield f" >={OFFLOAD_RATIO} (would offload): {sum(1 for r in ratios if r >= OFFLOAD_RATIO)}"
        yield f" <{OFFLOAD_RATIO} (stays colo): {sum(1 for r in ratios if r < OFFLOAD_RATIO)}"

    # TTFT as observed at the proxy: first token minus proxy receive time.
    colo_ttft = _elapsed(colo, "t_proxy_recv", "t_first_token")
    if colo_ttft:
        yield f"\n HEAVY_COLO TTFT: p50={p(colo_ttft, .5):.2f}s p90={p(colo_ttft, .9):.2f}s"

    off_ttft = _elapsed(offloaded, "t_proxy_recv", "t_first_token")
    if off_ttft:
        yield f" HEAVY_OFFLOAD TTFT: p50={p(off_ttft, .5):.2f}s p90={p(off_ttft, .9):.2f}s"
    prefill = _elapsed(offloaded, "t_prefill_sent", "t_prefill_done")
    if prefill:
        yield f" Offload prefill: p50={p(prefill, .5):.2f}s p90={p(prefill, .9):.2f}s"
    kv_xfer = _elapsed(offloaded, "t_prefill_done", "t_first_token")
    if kv_xfer:
        yield f" Offload KV xfer: p50={p(kv_xfer, .5):.2f}s p90={p(kv_xfer, .9):.2f}s"


def failure_lines(fail, limit=5):
    """Yield a header plus up to *limit* failed requests (error text cut at 80 chars)."""
    if not fail:
        return
    yield f"\nFailed requests ({len(fail)}):"
    for r in fail[:limit]:
        yield f" input={r.get('input_length')} error={str(r['error'])[:80]}"


def comparison_lines(ok, rows):
    """Yield the comparison table: recorded prior results plus this run."""
    yield ""
    yield RULE
    yield "Comparison with all prior experiments"
    yield RULE
    yield "Baseline 8C plain: OK=198/200 TTFT50=1.075 TTFT90=9.384 TPOT90=0.0761 E2E50=5.075"
    yield "Phase0A 7C kv_both: OK=198/200 TTFT50=1.073 TPOT90=0.0738 E2E50=5.096"
    yield "V2 all-offload: OK=179/185 TTFT50=0.762 TPOT90=0.0746 E2E50=4.628"
    yield f"H4 cache-aware gate: OK={len(ok)}/{len(rows)} {stats_text(ok, with_e2e90=False)}"


def main(argv=None):
    """Print the full report for the output directory named in *argv*.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  The first item, if any, is the output directory.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    outdir = args[0] if args else DEFAULT_OUTDIR

    rows = load_jsonl(f"{outdir}/metrics.jsonl")
    ok = [r for r in rows if not r.get("error")]
    fail = [r for r in rows if r.get("error")]

    print(RULE)
    print("H4 Cache-Aware Offload Gate Results")
    print(RULE)
    print(f"OK={len(ok)}/{len(rows)} {stats_text(ok)}")

    for line in class_lines(ok):
        print(line)

    # The breakdown file is optional: any problem with it is reported and the
    # rest of the report is still printed (deliberate best-effort boundary).
    try:
        for line in breakdown_lines(load_json(f"{outdir}/breakdown.json")):
            print(line)
    except Exception as exc:
        print(f"Breakdown analysis error: {exc}")

    for line in failure_lines(fail):
        print(line)
    for line in comparison_lines(ok, rows):
        print(line)


if __name__ == "__main__":
    main()