#!/bin/bash
# H4: Cache-aware offload gate — only offload HEAVY when cache_ratio >= 0.3
#   Cold turn-1 HEAVY stays co-located (no RDMA overhead)
#   Cached turn-2+ HEAVY offloads to flexible D
#
# NOTE(review): the 0.3 threshold is not passed on any command line below;
# presumably it is configured inside scripts/cache_aware_proxy.py — confirm.
#
# Steps:
#   1. Kill leftover vLLM / proxy / GPU-holding processes.
#   2. Launch 8 combined (kv_both) vLLM instances, one per GPU.
#   3. Wait for every instance's /health and bootstrap /query endpoints.
#   4. Launch the cache-aware proxy with --offload and wait until it answers.
#   5. Replay the trace through the proxy.
#   6. Dump the proxy's /breakdown and /stats, then print an analysis.
#   7. Tear everything down (also on any early exit, via an EXIT trap).
set -euo pipefail

cd /home/admin/cpfs/wjh/agentic-kv
source .venv/bin/activate

MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
OUTDIR=outputs/h4_cache_gate

readonly NUM_INSTANCES=8
readonly BASE_PORT=8000            # vLLM HTTP ports: 8000..8007
readonly BASE_BOOTSTRAP_PORT=8998  # Mooncake bootstrap ports: 8998..9005
readonly BASE_MASTER_PORT=29500    # MASTER_PORT per instance: 29500..29507
readonly PROXY_PORT=9090
readonly PROXY_TIMEOUT_S=600       # upper bound on proxy start-up wait

#######################################
# Hard-kill every vLLM server / proxy process, then anything that still has
# an NVIDIA device node open. SIGKILL is used directly, as in the original
# script: this is a reset between experiments, not a graceful shutdown.
# WARNING: the second pass kills ANY process holding /dev/nvidia*, not only
# processes started by this script.
# Globals:   none
# Arguments: none
# Returns:   0 always (individual kill failures are ignored on purpose)
#######################################
cleanup() {
  local pid
  # pgrep exits non-zero when nothing matches; that is not an error here.
  for pid in $(pgrep -f 'vllm serve|cache_aware_proxy' || true); do
    # Never kill ourselves if the script's own path happens to match.
    if [[ "$pid" == "$$" ]]; then
      continue
    fi
    kill -9 "$pid" 2>/dev/null || true
  done
  sleep 3
  # fuser prints the PIDs on stdout (file names go to stderr, discarded).
  for pid in $(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u | grep -v '^$' || true); do
    kill -9 "$pid" 2>/dev/null || true
  done
  sleep 5
}

cleanup
mkdir -p "$OUTDIR"

# From here on, any exit (set -e failure, timeout, Ctrl-C) must release the
# GPUs instead of leaving 8 servers running.
trap cleanup EXIT

echo "=== Verifying GPUs free ==="
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader

# ---- Launch 8 Combined instances (GPUs 0-7, all kv_both) ----
echo "=== Launching 8 Combined instances ==="
combined_urls=()
bootstrap_ports=()
for ((i = 0; i < NUM_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  bootstrap_port=$((BASE_BOOTSTRAP_PORT + i))
  combined_urls+=("http://127.0.0.1:${port}")
  bootstrap_ports+=("$bootstrap_port")

  echo "Starting C instance $i on GPU $i, port $port, bootstrap $bootstrap_port"
  VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port \
  MASTER_PORT=$((BASE_MASTER_PORT + i)) \
  CUDA_VISIBLE_DEVICES=$i \
    .venv/bin/vllm serve "$MODEL" --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching --enforce-eager \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
    > "$OUTDIR/vllm_$i.log" 2>&1 &
  sleep 2
done

# ---- Wait for all instances healthy ----
# A timeout here fails the script (set -e); the EXIT trap then tears down.
echo "=== Waiting for all instances to be healthy ==="
for ((i = 0; i < NUM_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  echo -n " Waiting for port $port..."
  timeout 600 bash -c "until curl -sf http://127.0.0.1:$port/health > /dev/null 2>&1; do sleep 5; done"
  echo " OK"
done

# Wait for bootstrap ports (any HTTP response counts: no -f on curl)
for bp in "${bootstrap_ports[@]}"; do
  timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done"
done
echo "=== All instances healthy ==="
sleep 5

# ---- Launch Proxy with H4 cache-aware offload ----
echo "=== Launching cache_aware_proxy with H4 cache-aware offload gate ==="
bootstrap_csv=$(IFS=,; printf '%s' "${bootstrap_ports[*]}")
.venv/bin/python scripts/cache_aware_proxy.py \
  --combined "${combined_urls[@]}" \
  --bootstrap-ports "$bootstrap_csv" \
  --offload --port "$PROXY_PORT" \
  > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!
echo "Proxy PID: $PROXY_PID"

# Wait for proxy ready, but give up if it died or takes too long instead of
# spinning forever.
echo -n " Waiting for proxy..."
proxy_deadline=$((SECONDS + PROXY_TIMEOUT_S))
until curl -sf "http://127.0.0.1:${PROXY_PORT}/stats" > /dev/null 2>&1; do
  if ! kill -0 "$PROXY_PID" 2>/dev/null; then
    echo " FAILED"
    echo "Proxy exited during start-up; see $OUTDIR/proxy.log" >&2
    exit 1
  fi
  if ((SECONDS >= proxy_deadline)); then
    echo " TIMEOUT"
    echo "Proxy not ready after ${PROXY_TIMEOUT_S}s; see $OUTDIR/proxy.log" >&2
    exit 1
  fi
  sleep 2
  echo -n "."
done
echo " OK"

# ---- Run benchmark ----
echo "=== Running benchmark: 200 req, time_scale=20, max-inflight-sessions=8 ==="
.venv/bin/python -m replayer --trace traces/sampled_1000req_seed42.jsonl \
  --output "$OUTDIR/metrics.jsonl" \
  --endpoint "http://localhost:${PROXY_PORT}" --model "$MODEL" \
  --time-scale 20 --max-inflight-sessions 8 --request-limit 200 -v

# ---- Save proxy data BEFORE cleanup ----
# Best effort: a failed fetch leaves an empty file, which the analysis below
# reports as a breakdown error rather than aborting.
echo "=== Saving proxy breakdown and stats ==="
curl -sf "http://127.0.0.1:${PROXY_PORT}/breakdown" > "$OUTDIR/breakdown.json" 2>/dev/null || true
curl -sf "http://127.0.0.1:${PROXY_PORT}/stats" > "$OUTDIR/stats.json" 2>/dev/null || true

# ---- Analysis ----
# The program is fed on stdin through a quoted heredoc so the shell expands
# nothing inside it; the output directory is passed as argv[1].
echo "=== H4 Cache-Aware Gate Results ==="
.venv/bin/python - "$OUTDIR" <<'PY'
import json
import sys
from collections import Counter

outdir = sys.argv[1]


def p(v, q):
    """Nearest-rank style quantile q of v; 0 when v is empty."""
    return sorted(v)[min(int(q * len(v)), len(v) - 1)] if v else 0


with open(f'{outdir}/metrics.jsonl') as fh:
    rows = [json.loads(l) for l in fh if l.strip()]
ok = [r for r in rows if not r.get('error')]
fail = [r for r in rows if r.get('error')]

ttfts = sorted([r['ttft_s'] for r in ok if r.get('ttft_s')])
tpots = sorted([r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s']>0])
e2es = sorted([r['latency_s'] for r in ok])
print(f'OK={len(ok)}/{len(rows)} TTFT50={p(ttfts,.5):.3f} TTFT90={p(ttfts,.9):.3f} TPOT90={p(tpots,.9):.4f} E2E50={p(e2es,.5):.3f}')

# Per-class breakdown (classes are input-length buckets)
for lo,hi,cl in [(0,5000,'WARM'),(5000,20000,'MED'),(20000,200000,'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = sorted([r['ttft_s'] for r in sub])
        tp = sorted([r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s']>0])
        e = sorted([r['latency_s'] for r in sub])
        print(f' {cl:6s} n={len(sub):3d} TTFT50={p(t,.5):.3f} TTFT90={p(t,.9):.3f} TPOT90={p(tp,.9):.4f} E2E50={p(e,.5):.3f}')

# Route distribution from breakdown
try:
    with open(f'{outdir}/breakdown.json') as fh:
        bd = json.load(fh)
    rc = Counter(b.get('route_class','') for b in bd)
    print(f'\nRoute class distribution:')
    for cls, cnt in sorted(rc.items()):
        print(f' {cls}: {cnt}')

    # Offload decisions for HEAVY
    heavy = [b for b in bd if b.get('route_class','').startswith('HEAVY')]
    reasons = Counter(b.get('offload_reason','') for b in heavy)
    print(f'\nHEAVY offload reasons: {dict(reasons)}')

    colo = [b for b in bd if b.get('route_class') == 'HEAVY_COLO']
    offloaded = [b for b in bd if b.get('route_class') == 'HEAVY_OFFLOAD']
    print(f'\nHEAVY_COLO (cold, no RDMA): {len(colo)}')
    print(f'HEAVY_OFFLOAD (cached, RDMA): {len(offloaded)}')

    # Timing comparison: HEAVY_COLO vs HEAVY_OFFLOAD
    if colo:
        colo_ttft = sorted([b['t_first_token']-b['t_proxy_recv'] for b in colo if b.get('t_first_token')])
        if colo_ttft:
            print(f' HEAVY_COLO TTFT: p50={p(colo_ttft,.5):.2f}s p90={p(colo_ttft,.9):.2f}s')
    if offloaded:
        off_ttft = sorted([b['t_first_token']-b['t_proxy_recv'] for b in offloaded if b.get('t_first_token')])
        if off_ttft:
            print(f' HEAVY_OFFLOAD TTFT: p50={p(off_ttft,.5):.2f}s p90={p(off_ttft,.9):.2f}s')

    # Offload timing breakdown
    if offloaded:
        pf = [b['t_prefill_done']-b['t_prefill_sent'] for b in offloaded if b.get('t_prefill_done') and b.get('t_prefill_sent')]
        kv = [b['t_first_token']-b['t_prefill_done'] for b in offloaded if b.get('t_first_token') and b.get('t_prefill_done')]
        if pf:
            pf.sort()
            print(f' Offload prefill: p50={p(pf,.5):.2f}s p90={p(pf,.9):.2f}s')
        if kv:
            kv.sort()
            print(f' Offload KV xfer+decode start: p50={p(kv,.5):.2f}s p90={p(kv,.9):.2f}s')
except Exception as e:
    print(f'Breakdown analysis error: {e}')

if fail:
    print(f'\nFailed requests ({len(fail)}):')
    for r in fail[:5]:
        # str(): the error field is not guaranteed to be a string.
        print(f' input={r.get("input_length")} error={str(r["error"])[:80]}')

print()
print('=== Baselines for comparison ===')
print('Baseline 8C plain: OK=198/200 TTFT50=1.075 TTFT90=9.384 TPOT90=0.0761 E2E50=5.075')
print('Phase0A 7C kv_both: OK=198/200 TTFT50=1.073 TPOT90=0.0738 E2E50=5.096')
print('V2 all-offload: OK=179/185 TTFT50=0.762 TPOT90=0.0746 E2E50=4.628')
PY

cleanup
# Teardown already done; disarm the trap so it does not run a second time.
trap - EXIT
echo "=== DONE $(date) ==="