#!/usr/bin/env bash
# Phase 1: Dedicated Prefill Service (7C + 1PS)
#   7 Combined instances       (GPUs 0-6, ports 8000-8006, kv_both)
#   1 Prefill Service instance (GPU 7,    port 8007,       kv_both)
#
# Flow: kill stale servers -> launch 8 vLLM instances -> wait for /health ->
# launch cache_aware_proxy -> replay the trace through the proxy -> save the
# proxy's /breakdown and /stats -> print a quick metrics summary.
#
# Environment overrides:
#   PROJECT_DIR      project checkout to run from (default: $HOME/phd/agentic-kv)
#   MODEL_PATH       model directory passed to `vllm serve`
#   STARTUP_TIMEOUT  max seconds to wait for each service to become ready
#                    (default: 1800)
#
# On success the vLLM instances and the proxy are left running; on any
# non-zero exit the services started by this script are stopped.

set -euo pipefail

PROJECT_DIR="${PROJECT_DIR:-$HOME/phd/agentic-kv}"
cd "$PROJECT_DIR"
# shellcheck source=/dev/null
source .venv/bin/activate

MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
OUTDIR=outputs/phase1_ps
STARTUP_TIMEOUT="${STARTUP_TIMEOUT:-1800}"
mkdir -p "$OUTDIR"

# PIDs of the background services started by this script, indexed by the
# port each one listens on.
declare -a PID_BY_PORT=()

# Print an error message to stderr and abort the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

[[ "$STARTUP_TIMEOUT" =~ ^[0-9]+$ ]] \
  || die "STARTUP_TIMEOUT must be a non-negative integer, got '$STARTUP_TIMEOUT'"

# EXIT handler: when the script is exiting with a non-zero status, stop the
# services it launched so a failed run does not leave the GPUs occupied.
# A successful run leaves everything running.
cleanup_on_failure() {
  local status=$?
  local pid
  if (( status != 0 )); then
    echo "=== Exit status $status: stopping launched services ===" >&2
    # The +"..." form keeps `set -u` happy when nothing was launched yet.
    for pid in ${PID_BY_PORT[@]+"${PID_BY_PORT[@]}"}; do
      kill "$pid" 2>/dev/null || true
    done
  fi
}
trap cleanup_on_failure EXIT

# Start one vLLM server in the background and record its PID.
# Arguments: $1 GPU index, $2 HTTP port, $3 Mooncake bootstrap port,
#            $4 MASTER_PORT, $5 log file
launch_vllm() {
  local gpu=$1 port=$2 bootstrap_port=$3 master_port=$4 log_file=$5
  VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap_port MASTER_PORT=$master_port CUDA_VISIBLE_DEVICES=$gpu \
    .venv/bin/vllm serve "$MODEL" --host 0.0.0.0 --port "$port" --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
    > "$log_file" 2>&1 &
  PID_BY_PORT[$port]=$!
  sleep 2
}

# Poll a URL until it answers successfully.
# Aborts if the backing process exits or STARTUP_TIMEOUT seconds elapse,
# instead of polling forever.
# Arguments: $1 label for progress output, $2 URL, $3 PID of the service,
#            $4 seconds between polls, $5 log file to mention on failure
wait_until_ready() {
  local label=$1 url=$2 pid=$3 interval=$4 log_file=$5
  local deadline=$(( SECONDS + STARTUP_TIMEOUT ))
  echo -n " Waiting for $label..."
  until curl -sf "$url" > /dev/null 2>&1; do
    if ! kill -0 "$pid" 2>/dev/null; then
      echo " FAILED"
      die "$label (pid $pid) exited before becoming ready; see $log_file"
    fi
    if (( SECONDS >= deadline )); then
      echo " TIMEOUT"
      die "$label not ready after ${STARTUP_TIMEOUT}s; see $log_file"
    fi
    sleep "$interval"
    echo -n "."
  done
  echo " OK"
}

# ---- Cleanup ----
echo "=== Killing existing processes ==="
# pkill exits non-zero when nothing matched; that is not an error here.
pkill -f "vllm serve" 2>/dev/null || true
pkill -f "cache_aware_proxy" 2>/dev/null || true
sleep 3

echo "=== Verifying GPUs free ==="
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader
sleep 2

# ---- Launch 7 Combined instances (GPUs 0-6) ----
echo "=== Launching 7 Combined instances ==="
for i in {0..6}; do
  echo "Starting C instance $i on GPU $i, port $((8000+i)), bootstrap $((8998+i))"
  launch_vllm "$i" "$((8000+i))" "$((8998+i))" "$((29500+i))" "$OUTDIR/vllm_c_$i.log"
done

# ---- Launch 1 PS instance (GPU 7) ----
echo "=== Launching PS instance on GPU 7, port 8007, bootstrap 9005 ==="
launch_vllm 7 8007 9005 29507 "$OUTDIR/vllm_ps_0.log"

# ---- Wait for all instances healthy ----
echo "=== Waiting for all instances to be healthy ==="
for port in 8000 8001 8002 8003 8004 8005 8006 8007; do
  if (( port == 8007 )); then
    instance_log="$OUTDIR/vllm_ps_0.log"
  else
    instance_log="$OUTDIR/vllm_c_$((port-8000)).log"
  fi
  wait_until_ready "port $port" "http://127.0.0.1:$port/health" \
    "${PID_BY_PORT[$port]}" 5 "$instance_log"
done
echo "=== All instances healthy ==="
sleep 5

# ---- Launch Proxy ----
echo "=== Launching cache_aware_proxy with PS ==="
.venv/bin/python scripts/cache_aware_proxy.py \
  --combined http://127.0.0.1:8000 http://127.0.0.1:8001 http://127.0.0.1:8002 \
  http://127.0.0.1:8003 http://127.0.0.1:8004 http://127.0.0.1:8005 http://127.0.0.1:8006 \
  --ps-instances http://127.0.0.1:8007 \
  --ps-bootstrap-ports 9005 \
  --bootstrap-ports 8998,8999,9000,9001,9002,9003,9004 \
  --port 9090 \
  > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!
PID_BY_PORT[9090]=$PROXY_PID
echo "Proxy PID: $PROXY_PID"

# Wait for proxy ready
wait_until_ready "proxy" "http://127.0.0.1:9090/stats" "$PROXY_PID" 2 "$OUTDIR/proxy.log"

echo "=== Running benchmark ==="
.venv/bin/python -m replayer --trace traces/sampled_1000req_seed42.jsonl \
  --output "$OUTDIR/metrics.jsonl" \
  --endpoint http://localhost:9090 --model "$MODEL" \
  --time-scale 20 --max-inflight-sessions 7 --request-limit 200 -v

echo "=== Saving proxy breakdown and stats ==="
# -f keeps HTTP error pages out of the JSON files; a failed fetch is only a
# warning so the metrics summary below still runs.
curl -sf "http://127.0.0.1:9090/breakdown" > "$OUTDIR/breakdown.json" \
  || echo "WARNING: could not fetch /breakdown from proxy" >&2
curl -sf "http://127.0.0.1:9090/stats" > "$OUTDIR/stats.json" \
  || echo "WARNING: could not fetch /stats from proxy" >&2

echo "=== Benchmark complete ==="
echo "Results in $OUTDIR/"
echo ""

# ---- Quick analysis ----
echo "=== Quick metrics summary ==="
# The output directory is passed as argv and the heredoc is quoted, so the
# shell never splices text into the Python source.
python3 - "$OUTDIR" <<'PY'
import json
import sys

outdir = sys.argv[1]

records = []
with open(f'{outdir}/metrics.jsonl') as f:
    for line in f:
        if line.strip():  # tolerate blank lines
            records.append(json.loads(line))

ok = [r for r in records if r.get('status') == 'ok']
fail = [r for r in records if r.get('status') != 'ok']
print(f'Total: {len(records)}, OK: {len(ok)}, Failed: {len(fail)}')
if records:
    print(f'Success rate: {len(ok)/len(records)*100:.1f}%')
else:
    # An empty metrics file would otherwise raise ZeroDivisionError.
    print('Success rate: n/a (no records)')

if ok:
    ttfts = sorted([r['ttft'] for r in ok])
    tpots = sorted([r['tpot'] for r in ok])
    e2es = sorted([r['e2e'] for r in ok])

    def pct(vals, p):
        # Index-based percentile on an already sorted list, clamped to the
        # last element.
        idx = int(len(vals) * p / 100)
        return vals[min(idx, len(vals)-1)]

    print(f'TTFT p50={pct(ttfts,50):.3f} p90={pct(ttfts,90):.3f} p99={pct(ttfts,99):.3f}')
    print(f'TPOT p50={pct(tpots,50):.4f} p90={pct(tpots,90):.4f} p99={pct(tpots,99):.4f}')
    print(f'E2E p50={pct(e2es,50):.3f} p90={pct(e2es,90):.3f} p99={pct(e2es,99):.3f}')

# Breakdown analysis
try:
    with open(f'{outdir}/breakdown.json') as f:
        bd = json.load(f)
    classes = {}
    for r in bd:
        rc = r.get('route_class', 'UNKNOWN')
        classes[rc] = classes.get(rc, 0) + 1
    print(f'\nRoute class breakdown:')
    for rc, cnt in sorted(classes.items()):
        print(f' {rc}: {cnt}')

    # PS utilization
    ps_reqs = [r for r in bd if r.get('route_class') == 'HEAVY_PS']
    print(f'\nPS offloaded: {len(ps_reqs)} requests')

    # Offload reasons for HEAVY
    heavy = [r for r in bd if r.get('route_class', '').startswith('HEAVY')]
    reasons = {}
    for r in heavy:
        reason = r.get('offload_reason', 'unknown')
        reasons[reason] = reasons.get(reason, 0) + 1
    if reasons:
        print(f'HEAVY offload reasons:')
        for reason, cnt in sorted(reasons.items()):
            print(f' {reason}: {cnt}')
except Exception as e:
    print(f'Breakdown analysis error: {e}')
PY

echo ""
echo "=== Phase 1 experiment complete ==="