D2: run_benchmark.sh and run_experiments.sh still pass --time-scale and --max-inflight-sessions to the replayer, but those flags were removed when the project moved to trace-driven dispatch. The scripts cannot run as-is. D3: ~25 ad-hoc analyze_* / compare_* / profile_* / final_* scripts and a handful of single-experiment run_*.sh point at /home/admin/cpfs paths, deleted output directories, or a sampled trace file that no longer exists. Keep them in scripts/legacy/ for historical reference; the scripts that remain in scripts/ (analyze_trace, analyze_breakdown, analyze_cache_hit, analyze_eviction, compare_results, compute_roofline, sample_trace, analyze_agentic_patterns, simulate_cache_policies, plus launch_*.sh, gpu_monitor.sh, bench.sh) cover the current workflow. Adds scripts/legacy/README.md to document the archival policy.
173 lines
7.0 KiB
Bash
173 lines
7.0 KiB
Bash
#!/bin/bash
# H4: Cache-aware offload gate — only offload HEAVY when cache_ratio >= 0.3
#   Cold turn-1 HEAVY stays co-located (no RDMA overhead)
#   Cached turn-2+ HEAVY offloads to flexible D
#
# The script runs from the repository checkout with its .venv activated; every
# relative path used later (.venv/bin/..., scripts/..., traces/..., outputs/...)
# is resolved against that directory.
#
# Optional environment overrides (defaults are the original hard-coded values):
#   REPO_DIR  repository checkout to cd into
#   MODEL     model path handed to `vllm serve` and to the replayer
#   OUTDIR    directory that receives logs, metrics and proxy dumps
set -euo pipefail

REPO_DIR=${REPO_DIR:-/home/admin/cpfs/wjh/agentic-kv}
cd "$REPO_DIR" || {
  echo "ERROR: cannot cd to $REPO_DIR" >&2
  exit 1
}
# shellcheck disable=SC1091
source .venv/bin/activate

MODEL=${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}
OUTDIR=${OUTDIR:-outputs/h4_cache_gate}
#######################################
# Best-effort teardown of the serving stack.
#   1. Send SIGTERM to every process whose full command line matches
#      'vllm serve' or 'cache_aware_proxy', wait 3 s, then SIGKILL whatever
#      still matches.
#   2. SIGKILL every process that fuser lists for /dev/nvidia*.
#   3. Wait 5 s before returning.
# Matching is by command line / device file, so it is not limited to
# processes started by this script.
# Globals:   none
# Arguments: none
# Returns:   0 always; failures to find or signal a process are ignored on
#            purpose because the targets may already be gone.
#######################################
cleanup() {
  local pattern='vllm serve|cache_aware_proxy'
  local sig pid

  for sig in TERM KILL; do
    while read -r pid; do
      # Skip our own PID in case this script's command line matches the pattern.
      if [[ -n "$pid" && "$pid" != "$$" ]]; then
        kill "-$sig" "$pid" 2>/dev/null || true
      fi
    done < <(pgrep -f "$pattern" 2>/dev/null || true)
    if [[ "$sig" == TERM ]]; then
      sleep 3
    fi
  done

  # fuser prints the PIDs on stdout separated by spaces; put one per line and
  # de-duplicate before signalling.
  while read -r pid; do
    if [[ -n "$pid" ]]; then
      kill -KILL "$pid" 2>/dev/null || true
    fi
  done < <(fuser /dev/nvidia* 2>/dev/null | tr -s ' ' '\n' | sort -u || true)

  sleep 5
  return 0
}
# Start from a clean slate: remove any leftover servers before launching.
cleanup

# Under `set -e` any failing step below ends the script immediately. Run the
# teardown on every non-zero exit so aborted runs do not leave servers behind.
# A successful run keeps relying on the explicit cleanup call at the end of the
# script, so nothing is torn down twice.
teardown_on_failure() {
  local rc=$?
  if ((rc != 0)); then
    echo "=== Exiting with status $rc - running cleanup ===" >&2
    cleanup
  fi
}
trap teardown_on_failure EXIT

mkdir -p "$OUTDIR"

echo "=== Verifying GPUs free ==="
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader
# ---- Launch 8 Combined instances (GPUs 0-7, all kv_both) ----
# Instance i is pinned to GPU i via CUDA_VISIBLE_DEVICES and gets its own HTTP
# port (8000+i), Mooncake bootstrap port (8998+i) and MASTER_PORT (29500+i).
# Each server runs in the background and logs to $OUTDIR/vllm_<i>.log.
echo "=== Launching 8 Combined instances ==="
for ((i = 0; i < 8; i++)); do
  http_port=$((8000 + i))
  bootstrap_port=$((8998 + i))
  echo "Starting C instance $i on GPU $i, port $http_port, bootstrap $bootstrap_port"
  VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port MASTER_PORT=$((29500 + i)) CUDA_VISIBLE_DEVICES=$i \
    .venv/bin/vllm serve "$MODEL" --host 0.0.0.0 --port "$http_port" --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching --enforce-eager \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
    > "$OUTDIR/vllm_$i.log" 2>&1 &
  # Brief pause between consecutive launches.
  sleep 2
done
# ---- Wait for all instances healthy ----
# Poll each server's /health endpoint every 5 s, for at most 600 s per port.
# On timeout, say which port failed and where its log is instead of letting
# `set -e` end the script without explanation.
echo "=== Waiting for all instances to be healthy ==="
for ((port = 8000; port <= 8007; port++)); do
  echo -n " Waiting for port $port..."
  if ! timeout 600 bash -c "until curl -sf http://127.0.0.1:$port/health > /dev/null 2>&1; do sleep 5; done"; then
    echo " FAILED"
    echo "ERROR: port $port not healthy within 600s; see $OUTDIR/vllm_$((port - 8000)).log" >&2
    exit 1
  fi
  echo " OK"
done

# Wait for bootstrap ports
# Any HTTP answer on /query counts here (curl runs without -f), polled every
# 2 s for at most 120 s per port.
for ((bp = 8998; bp <= 9005; bp++)); do
  if ! timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done"; then
    echo "ERROR: bootstrap port $bp did not answer /query within 120s" >&2
    exit 1
  fi
done
echo "=== All instances healthy ==="
sleep 5
# ---- Launch Proxy with H4 cache-aware offload ----
# The proxy listens on port 9090, fronts the eight servers on ports 8000-8007
# and is given their bootstrap ports; its output goes to $OUTDIR/proxy.log.
echo "=== Launching cache_aware_proxy with H4 cache-aware offload gate ==="
.venv/bin/python scripts/cache_aware_proxy.py \
  --combined http://127.0.0.1:8000 http://127.0.0.1:8001 http://127.0.0.1:8002 \
  http://127.0.0.1:8003 http://127.0.0.1:8004 http://127.0.0.1:8005 \
  http://127.0.0.1:8006 http://127.0.0.1:8007 \
  --bootstrap-ports 8998,8999,9000,9001,9002,9003,9004,9005 \
  --offload --port 9090 \
  > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!
echo "Proxy PID: $PROXY_PID"

# Wait for proxy ready
# Poll /stats every 2 s. Stop waiting as soon as the proxy process is gone or
# the deadline (PROXY_WAIT_S seconds, default 120) passes, rather than looping
# forever on a proxy that will never answer.
echo -n " Waiting for proxy..."
proxy_deadline=$((SECONDS + ${PROXY_WAIT_S:-120}))
until curl -sf "http://127.0.0.1:9090/stats" > /dev/null 2>&1; do
  if ! kill -0 "$PROXY_PID" 2>/dev/null; then
    echo " FAILED"
    echo "ERROR: proxy (PID $PROXY_PID) exited before becoming ready; see $OUTDIR/proxy.log" >&2
    exit 1
  fi
  if ((SECONDS >= proxy_deadline)); then
    echo " FAILED"
    echo "ERROR: proxy not ready before the deadline; see $OUTDIR/proxy.log" >&2
    exit 1
  fi
  sleep 2
  echo -n "."
done
echo " OK"
# ---- Run benchmark ----
# Replays up to 200 requests from the sampled trace through the proxy and
# writes per-request metrics to $OUTDIR/metrics.jsonl.
# NOTE(review): the replayer may no longer accept --time-scale /
# --max-inflight-sessions, and this sampled trace file may no longer exist -
# confirm against the current replayer CLI before running.
echo "=== Running benchmark: 200 req, time_scale=20, max-inflight-sessions=8 ==="
# Capture the status instead of aborting here, so the proxy data below is
# still collected when the replayer fails.
bench_rc=0
.venv/bin/python -m replayer --trace traces/sampled_1000req_seed42.jsonl \
  --output "$OUTDIR/metrics.jsonl" \
  --endpoint http://localhost:9090 --model "$MODEL" \
  --time-scale 20 --max-inflight-sessions 8 --request-limit 200 -v ||
  bench_rc=$?

# ---- Save proxy data BEFORE cleanup ----
# Best effort: a failed fetch is tolerated here and leaves an empty file.
echo "=== Saving proxy breakdown and stats ==="
curl -sf "http://127.0.0.1:9090/breakdown" > "$OUTDIR/breakdown.json" 2>/dev/null || true
curl -sf "http://127.0.0.1:9090/stats" > "$OUTDIR/stats.json" 2>/dev/null || true

if ((bench_rc != 0)); then
  echo "ERROR: replayer exited with status $bench_rc" >&2
  exit "$bench_rc"
fi
# ---- Analysis ----
# Summarises $OUTDIR/metrics.jsonl (replayer output) and, when it can be
# parsed, $OUTDIR/breakdown.json (proxy dump). The Python source is fed through
# a quoted here-document so the shell never interpolates into it; OUTDIR is
# passed as the first argument instead.
echo "=== H4 Cache-Aware Gate Results ==="
.venv/bin/python - "$OUTDIR" <<'PY'
import json
import sys
from collections import Counter

outdir = sys.argv[1]


def p(v, q):
    """Return the element at index min(int(q*len(v)), len(v)-1) of sorted v, or 0 if v is empty."""
    return sorted(v)[min(int(q * len(v)), len(v) - 1)] if v else 0


# One JSON object per non-blank line.
with open(f'{outdir}/metrics.jsonl') as fh:
    rows = [json.loads(line) for line in fh if line.strip()]
ok = [r for r in rows if not r.get('error')]
fail = [r for r in rows if r.get('error')]

ttfts = sorted([r['ttft_s'] for r in ok if r.get('ttft_s')])
tpots = sorted([r['tpot_s'] for r in ok if r.get('tpot_s') and r['tpot_s'] > 0])
e2es = sorted([r['latency_s'] for r in ok])

print(f'OK={len(ok)}/{len(rows)} TTFT50={p(ttfts,.5):.3f} TTFT90={p(ttfts,.9):.3f} TPOT90={p(tpots,.9):.4f} E2E50={p(e2es,.5):.3f}')

# Per-class breakdown: successful requests bucketed by input_length.
for lo, hi, cl in [(0, 5000, 'WARM'), (5000, 20000, 'MED'), (20000, 200000, 'HEAVY')]:
    sub = [r for r in ok if lo <= r['input_length'] < hi and r.get('ttft_s')]
    if sub:
        t = sorted([r['ttft_s'] for r in sub])
        tp = sorted([r['tpot_s'] for r in sub if r.get('tpot_s') and r['tpot_s'] > 0])
        lat = sorted([r['latency_s'] for r in sub])
        print(f' {cl:6s} n={len(sub):3d} TTFT50={p(t,.5):.3f} TTFT90={p(t,.9):.3f} TPOT90={p(tp,.9):.4f} E2E50={p(lat,.5):.3f}')

# Route distribution from breakdown. Any problem with the proxy dump (missing,
# empty or malformed file, missing fields) is reported and skipped so the rest
# of the summary still prints.
try:
    with open(f'{outdir}/breakdown.json') as fh:
        bd = json.load(fh)
    rc = Counter(b.get('route_class', '') for b in bd)
    print(f'\nRoute class distribution:')
    for cls, cnt in sorted(rc.items()):
        print(f' {cls}: {cnt}')

    # Offload decisions recorded for entries whose route_class starts with HEAVY.
    heavy = [b for b in bd if b.get('route_class', '').startswith('HEAVY')]
    reasons = Counter(b.get('offload_reason', '') for b in heavy)
    print(f'\nHEAVY offload reasons: {dict(reasons)}')

    colo = [b for b in bd if b.get('route_class') == 'HEAVY_COLO']
    offloaded = [b for b in bd if b.get('route_class') == 'HEAVY_OFFLOAD']
    print(f'\nHEAVY_COLO (cold, no RDMA): {len(colo)}')
    print(f'HEAVY_OFFLOAD (cached, RDMA): {len(offloaded)}')

    # Timing comparison: HEAVY_COLO vs HEAVY_OFFLOAD, using the proxy's
    # t_first_token - t_proxy_recv timestamps.
    if colo:
        colo_ttft = sorted([b['t_first_token'] - b['t_proxy_recv'] for b in colo if b.get('t_first_token')])
        if colo_ttft:
            print(f' HEAVY_COLO TTFT: p50={p(colo_ttft,.5):.2f}s p90={p(colo_ttft,.9):.2f}s')
    if offloaded:
        off_ttft = sorted([b['t_first_token'] - b['t_proxy_recv'] for b in offloaded if b.get('t_first_token')])
        if off_ttft:
            print(f' HEAVY_OFFLOAD TTFT: p50={p(off_ttft,.5):.2f}s p90={p(off_ttft,.9):.2f}s')

    # Offload timing breakdown: prefill duration, then the gap from prefill
    # completion to the first token.
    if offloaded:
        pf = sorted(b['t_prefill_done'] - b['t_prefill_sent'] for b in offloaded if b.get('t_prefill_done') and b.get('t_prefill_sent'))
        kv = sorted(b['t_first_token'] - b['t_prefill_done'] for b in offloaded if b.get('t_first_token') and b.get('t_prefill_done'))
        if pf:
            print(f' Offload prefill: p50={p(pf,.5):.2f}s p90={p(pf,.9):.2f}s')
        if kv:
            print(f' Offload KV xfer+decode start: p50={p(kv,.5):.2f}s p90={p(kv,.9):.2f}s')

except Exception as exc:
    print(f'Breakdown analysis error: {exc}')

if fail:
    print(f'\nFailed requests ({len(fail)}):')
    for r in fail[:5]:
        # str() keeps the slice valid when the error field is not a string.
        print(f' input={r.get("input_length")} error={str(r["error"])[:80]}')

print()
print('=== Baselines for comparison ===')
print('Baseline 8C plain: OK=198/200 TTFT50=1.075 TTFT90=9.384 TPOT90=0.0761 E2E50=5.075')
print('Phase0A 7C kv_both: OK=198/200 TTFT50=1.073 TPOT90=0.0738 E2E50=5.096')
print('V2 all-offload: OK=179/185 TTFT50=0.762 TPOT90=0.0746 E2E50=4.628')
PY
# Final teardown: stop the servers and the proxy now that all data is saved
# and analysed.
cleanup
echo "=== DONE $(date) ==="