Critical:
- cache_aware_proxy: _handle_pd_sep leaked p_inst.num_requests (never decremented) and never managed d_inst.num_requests; fix media_type from application/json to text/event-stream for the SSE stream

High:
- b3_sweep/b3_isolated_policy/b3_analyze: replace hardcoded /home/admin/cpfs/wjh/ ROOT with script-relative $(dirname "$0")/..
- b3_analyze: replace hardcoded 8-port WORKER_MAP with dynamic generation from BASE_PORT and N_INSTANCES

Medium:
- analyze_breakdown: warn on stderr when records are skipped (was silent)
- deploy_vllm_patches: fail fast on SSH/SCP errors instead of continuing with an empty VENV_SITE
- pyproject.toml: declare fastapi and uvicorn as runtime dependencies
- launch_elastic_p2p: kill EngineCore and proxy in the trap handler to prevent GPU memory leaks on exit
124 lines
4.2 KiB
Bash
Executable File
124 lines
4.2 KiB
Bash
Executable File
#!/bin/bash
|
||
# Elastic P2P offload: 8× TP=1 kv_both instances + cache_aware_proxy --offload.
|
||
#
|
||
# Architecture:
|
||
# All 8 instances run kv_role=kv_both (Mooncake connector).
|
||
# The proxy classifies requests as WARM/MEDIUM/HEAVY.
|
||
# HEAVY requests: prefill on a different instance (P), KV via Mooncake RDMA,
|
||
# decode on session-sticky instance (D).
|
||
# WARM/MEDIUM: co-located prefill+decode on session-sticky instance.
|
||
#
|
||
# Usage:
|
||
# bash scripts/launch_elastic_p2p.sh # default: this machine
|
||
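# HOST=dash1 bash scripts/launch_elastic_p2p.sh # launch on dash1 via ssh
# NOTE(review): HOST is not read anywhere in this script as shown; confirm the ssh path exists or drop this example.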
|
||
|
||
# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Resolve all paths relative to this script so it can be run from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"

# Tunables. Every value can be overridden from the environment; the defaults
# are the values that used to be hard-coded.
#   MODEL_PATH       model directory passed to `vllm serve`
#   N_INSTANCES      number of instances (instance i is pinned to GPU i)
#   BASE_PORT        HTTP port of instance 0; instance i uses BASE_PORT + i
#   PROXY_PORT       port the cache-aware proxy listens on
#   HEAVY_THRESHOLD  value passed to the proxy as --heavy-threshold
#   MAX_OFFLOAD      see NOTE below
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
N_INSTANCES="${N_INSTANCES:-8}"
BASE_PORT="${BASE_PORT:-8000}"
PROXY_PORT="${PROXY_PORT:-9090}"
HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}"
# NOTE(review): MAX_OFFLOAD is printed in the startup banner but is not passed
# to the proxy command in this script; confirm whether it should be.
MAX_OFFLOAD="${MAX_OFFLOAD:-4}"

# N_INSTANCES drives every loop below; reject anything that is not a
# positive integer before any process is started.
if ! [[ "$N_INSTANCES" =~ ^[1-9][0-9]*$ ]]; then
  echo "ERROR: N_INSTANCES must be a positive integer (got '$N_INSTANCES')" >&2
  exit 2
fi
|
||
|
||
# Tear down everything this launcher started: background jobs of this shell,
# then any leftover vLLM server, EngineCore worker and proxy processes.
#
# Every step is guarded with `|| true`: the script runs under `set -e`, and
# kill/pkill return non-zero when nothing matched, which would otherwise
# abort the handler before the later kills (EngineCore, proxy) run.
#
# NOTE(review): the pkill patterns match by command line, so they also hit
# matching processes that were not started by this script.
cleanup() {
  echo "Cleaning up..."
  local pids
  pids=$(jobs -p) || true
  if [[ -n "$pids" ]]; then
    # shellcheck disable=SC2086  # one PID per word; splitting is intended
    kill $pids 2>/dev/null || true
  fi
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "EngineCore" 2>/dev/null || true
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  wait 2>/dev/null || true
}

# Run cleanup exactly once, on exit. INT/TERM are turned into a real exit so
# the script stops instead of resuming after the handler returns.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
||
|
||
# Print the launch configuration, one line per setting, then a blank line.
printf '%s\n' \
  "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ===" \
  " Model: $MODEL" \
  " Instances: $N_INSTANCES × TP=1" \
  " Proxy: port $PROXY_PORT" \
  " Heavy threshold: $HEAVY_THRESHOLD tokens" \
  " Max offload: $MAX_OFFLOAD concurrent" \
  ""
|
||
|
||
# Step 1: Launch all instances with kv_role=kv_both
#
# Starts N_INSTANCES `vllm serve` processes in the background, one per GPU
# index, each with its own HTTP port, Mooncake bootstrap port and MASTER_PORT.
# Output of instance i goes to /tmp/elastic_inst_<i>.log.
#
# Produces two strings used when the proxy is started:
#   combined_args    space-separated instance base URLs (with a leading space)
#   bootstrap_ports  comma-separated bootstrap ports, in instance order
combined_args=""
bootstrap_ports=""
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  bootstrap=$((8998 + i))
  master_port=$((29500 + i))
  logfile="/tmp/elastic_inst_${i}.log"

  echo " Instance $i: GPU $i, port $port, bootstrap $bootstrap"

  # "$VLLM" is quoted: it is derived from the script location, which may
  # contain whitespace.
  VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap" \
  MASTER_PORT="$master_port" \
  CUDA_VISIBLE_DEVICES="$i" \
    "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
      > "$logfile" 2>&1 &

  combined_args="$combined_args http://127.0.0.1:$port"
  # ${var:+...} inserts the comma only once the list is non-empty.
  bootstrap_ports="${bootstrap_ports:+$bootstrap_ports,}$bootstrap"

  sleep 2 # stagger startup to avoid port collision
done
|
||
|
||
# Step 2: Wait for all instances
#
# Polls each instance's /v1/models endpoint every 5 s, for at most 600 s per
# instance. On timeout the script reports which instance is stuck and where
# its log is, then exits with the status returned by `timeout`, instead of
# dying silently through `set -e`.
echo ""
echo "Waiting for instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  rc=0
  timeout 600 bash -c "until curl -s localhost:$port/v1/models > /dev/null 2>&1; do sleep 5; done" || rc=$?
  if ((rc != 0)); then
    echo " ERROR: instance $i (port $port) not ready after 600s; see /tmp/elastic_inst_${i}.log" >&2
    exit "$rc"
  fi
  echo " Instance $i (port $port) ready"
done
|
||
|
||
# Step 3: Wait for bootstrap servers
#
# Polls each bootstrap port's /query endpoint every 2 s, for at most 120 s per
# port. On timeout the script names the port that never answered and exits
# with the status returned by `timeout`, instead of dying silently through
# `set -e`.
echo "Waiting for bootstrap servers..."
for ((i = 0; i < N_INSTANCES; i++)); do
  bp=$((8998 + i))
  rc=0
  timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done" || rc=$?
  if ((rc != 0)); then
    echo " ERROR: bootstrap server on port $bp (instance $i) not ready after 120s" >&2
    exit "$rc"
  fi
  echo " Bootstrap $bp ready"
done
|
||
|
||
# Step 4: Start proxy with --offload
#
# Runs cache_aware_proxy.py in the background, pointed at every instance URL
# and bootstrap port collected during launch.
echo ""
echo "Starting proxy (offload mode)..."
# combined_args is a space-separated URL list built as a plain string, so it
# must stay unquoted here to expand into one argument per URL. All other
# expansions are quoted so paths containing whitespace survive.
# shellcheck disable=SC2086
"$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
  --combined $combined_args \
  --bootstrap-ports "$bootstrap_ports" \
  --offload \
  --policy unified \
  --heavy-threshold "$HEAVY_THRESHOLD" \
  --port "$PROXY_PORT" &
# Fixed grace period before the proxy is first contacted.
# NOTE(review): nothing checks that the proxy is actually listening after
# this delay; a slow start will make the smoke test report a failure.
sleep 5
|
||
|
||
# Step 5: Smoke test
#
# Sends one tiny completion request through the proxy and checks that the
# reply contains "choices". A failure is reported as a warning only; the
# script keeps running either way.
echo ""
echo "Smoke test..."
# `|| true`: without it a curl failure (connection refused, timeout) makes the
# assignment fail and `set -e` kills the script before the WARNING branch can
# run. -S keeps curl's error message so it shows up in that warning.
result=$(curl -sS -m 120 "http://localhost:$PROXY_PORT/v1/completions" \
  -X POST -H "Content-Type: application/json" \
  -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
if grep -q "choices" <<<"$result"; then
  echo " Smoke test passed!"
else
  echo " WARNING: Smoke test failed: $result"
fi
|
||
|
||
# Tell the operator where the stack is reachable and how to drive a benchmark.
# Single-quoted lines end in a literal backslash so the printed command can be
# copy-pasted as a multi-line shell command.
printf '%s\n' \
  "" \
  "=== Elastic P2P ready ===" \
  " Endpoint: http://localhost:$PROXY_PORT" \
  " Breakdown: curl http://localhost:$PROXY_PORT/breakdown" \
  " Instance logs: /tmp/elastic_inst_*.log" \
  "" \
  "Run benchmark:" \
  ' python -m replayer --trace traces/sampled_1000req_seed42.jsonl \' \
  ' --output outputs/elastic_p2p/metrics.jsonl \' \
  " --endpoint http://localhost:$PROXY_PORT \\" \
  ' --time-scale 20 --max-inflight-sessions 8 -v' \
  ""

# Block until the background jobs of this shell have exited.
wait
|