#!/bin/bash
# Elastic P2P offload: 8× TP=1 kv_both instances + cache_aware_proxy --offload.
#
# Architecture:
#   All 8 instances run kv_role=kv_both (Mooncake connector).
#   The proxy classifies requests as WARM/MEDIUM/HEAVY.
#   HEAVY requests: prefill on a different instance (P), KV via Mooncake RDMA,
#                   decode on session-sticky instance (D).
#   WARM/MEDIUM:    co-located prefill+decode on session-sticky instance.
#
# Usage:
#   bash scripts/launch_elastic_p2p.sh              # default: this machine
#   HOST=dash1 bash scripts/launch_elastic_p2p.sh   # launch on dash1 via ssh
#
# NOTE(review): nothing in this script reads HOST or invokes ssh; confirm
# whether the remote-launch path lives in a wrapper or was never implemented.
#
# Environment overrides:
#   MODEL_PATH       model directory passed to `vllm serve`
#   HEAVY_THRESHOLD  forwarded to the proxy as --heavy-threshold (default 20000)
#   MAX_OFFLOAD      printed in the banner only (default 4)
#
# NOTE(review): MAX_OFFLOAD is never forwarded to the proxy; confirm whether
# cache_aware_proxy.py has a matching option that should receive it.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"

MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
N_INSTANCES=8
BASE_PORT=8000
BOOTSTRAP_BASE_PORT=8998
MASTER_BASE_PORT=29500
PROXY_PORT=9090
HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}"
MAX_OFFLOAD="${MAX_OFFLOAD:-4}"

# Print an error to stderr and exit non-zero (the EXIT trap then cleans up).
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Stop everything this script started. Runs once, from the EXIT trap.
# The pkill patterns are machine-wide on purpose: they also catch worker
# processes that are not direct children of this shell, but they will hit any
# unrelated "vllm serve" / EngineCore / cache_aware_proxy process as well.
cleanup() {
  trap - EXIT INT TERM # guard against re-entry while cleaning up
  echo "Cleaning up..."
  local job_pids
  job_pids=$(jobs -p) || true
  if [[ -n "$job_pids" ]]; then
    # shellcheck disable=SC2086 # word splitting intended: one PID per word
    kill $job_pids 2>/dev/null || true
  fi
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "EngineCore" 2>/dev/null || true
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  wait 2>/dev/null || true
}
trap cleanup EXIT
# Turn signals into a normal exit so cleanup runs exactly once via EXIT.
trap 'exit 130' INT
trap 'exit 143' TERM

#######################################
# Poll a URL until curl gets any HTTP response.
# Arguments:
#   $1 - URL to poll
#   $2 - overall time limit in seconds
#   $3 - seconds to sleep between attempts
#   $4 - optional PID; give up early once this process is gone
# Returns:
#   0 when the URL answered, 1 on timeout, 2 if the watched PID exited.
#######################################
wait_for_url() {
  local url=$1 limit=$2 interval=$3 pid=${4:-}
  local deadline=$((SECONDS + limit))
  # --max-time bounds a single hung request so the deadline stays meaningful.
  until curl -s --max-time 10 "$url" >/dev/null 2>&1; do
    if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then
      return 2
    fi
    if ((SECONDS >= deadline)); then
      return 1
    fi
    sleep "$interval"
  done
  return 0
}

echo "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ==="
echo " Model: $MODEL"
echo " Instances: $N_INSTANCES × TP=1"
echo " Proxy: port $PROXY_PORT"
echo " Heavy threshold: $HEAVY_THRESHOLD tokens"
echo " Max offload: $MAX_OFFLOAD concurrent"
echo ""

# Step 1: Launch all instances with kv_role=kv_both
instance_urls=()
instance_pids=()
instance_logs=()
bootstrap_ports=""
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  bootstrap=$((BOOTSTRAP_BASE_PORT + i))
  master_port=$((MASTER_BASE_PORT + i))
  logfile="/tmp/elastic_inst_${i}.log"
  echo " Instance $i: GPU $i, port $port, bootstrap $bootstrap"

  # One instance per GPU, each with its own bootstrap and MASTER_PORT.
  VLLM_MOONCAKE_BOOTSTRAP_PORT=$bootstrap \
    MASTER_PORT=$master_port \
    CUDA_VISIBLE_DEVICES=$i \
    "$VLLM" serve "$MODEL" \
    --host 0.0.0.0 --port "$port" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --kv-transfer-config \
    '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
    >"$logfile" 2>&1 &
  instance_pids+=("$!")
  instance_logs+=("$logfile")
  instance_urls+=("http://127.0.0.1:$port")
  bootstrap_ports="${bootstrap_ports:+$bootstrap_ports,}$bootstrap"
  sleep 2 # stagger startup to avoid port collision
done

# Step 2: Wait for all instances
echo ""
echo "Waiting for instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  rc=0
  wait_for_url "localhost:$port/v1/models" 600 5 "${instance_pids[i]}" || rc=$?
  case "$rc" in
    0) ;;
    2) die "Instance $i (port $port) exited before becoming ready; see ${instance_logs[i]}" ;;
    *) die "Instance $i (port $port) not ready after 600s; see ${instance_logs[i]}" ;;
  esac
  echo " Instance $i (port $port) ready"
done

# Step 3: Wait for bootstrap servers
echo "Waiting for bootstrap servers..."
for ((i = 0; i < N_INSTANCES; i++)); do
  bp=$((BOOTSTRAP_BASE_PORT + i))
  rc=0
  wait_for_url "localhost:$bp/query" 120 2 "${instance_pids[i]}" || rc=$?
  case "$rc" in
    0) ;;
    2) die "Instance $i exited while waiting for bootstrap $bp; see ${instance_logs[i]}" ;;
    *) die "Bootstrap $bp not ready after 120s; see ${instance_logs[i]}" ;;
  esac
  echo " Bootstrap $bp ready"
done

# Step 4: Start proxy with --offload
echo ""
echo "Starting proxy (offload mode)..."
"$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
  --combined "${instance_urls[@]}" \
  --bootstrap-ports "$bootstrap_ports" \
  --offload \
  --policy unified \
  --heavy-threshold "$HEAVY_THRESHOLD" \
  --port "$PROXY_PORT" &
proxy_pid=$!
sleep 5
kill -0 "$proxy_pid" 2>/dev/null || die "Proxy exited during startup"

# Step 5: Smoke test
echo ""
echo "Smoke test..."
printf -v smoke_payload \
  '{"model":"%s","prompt":[100,200,300],"max_tokens":3,"temperature":0}' \
  "$MODEL"
# `|| true`: a failing curl must reach the WARNING branch below instead of
# tripping `set -e` and tearing the whole deployment down.
result=$(curl -s -m 120 "http://localhost:$PROXY_PORT/v1/completions" \
  -X POST -H "Content-Type: application/json" \
  -d "$smoke_payload" 2>&1) || true
if [[ "$result" == *choices* ]]; then
  echo " Smoke test passed!"
else
  echo " WARNING: Smoke test failed: $result"
fi

echo ""
echo "=== Elastic P2P ready ==="
echo " Endpoint: http://localhost:$PROXY_PORT"
echo " Breakdown: curl http://localhost:$PROXY_PORT/breakdown"
echo " Instance logs: /tmp/elastic_inst_*.log"
echo ""
echo "Run benchmark:"
echo " python -m replayer --trace traces/sampled_1000req_seed42.jsonl \\"
echo " --output outputs/elastic_p2p/metrics.jsonl \\"
echo " --endpoint http://localhost:$PROXY_PORT \\"
echo " --time-scale 20 --max-inflight-sessions 8 -v"
echo ""

# Block until the background instances and proxy exit (or a signal arrives).
wait