Files
agentic-kvc/scripts/launch_elastic_p2p.sh
Gahow Wang 645b067dd4 Fix review bugs: PD-sep counter leaks, hardcoded paths, missing deps
Critical:
- cache_aware_proxy: _handle_pd_sep leaked p_inst.num_requests (never
  decremented) and never managed d_inst.num_requests; fix media_type
  from application/json to text/event-stream for SSE stream

High:
- b3_sweep/b3_isolated_policy/b3_analyze: replace hardcoded
  /home/admin/cpfs/wjh/ ROOT with script-relative $(dirname "$0")/..
- b3_analyze: replace hardcoded 8-port WORKER_MAP with dynamic
  generation from BASE_PORT and N_INSTANCES

Medium:
- analyze_breakdown: warn on stderr when records are skipped (was silent)
- deploy_vllm_patches: fail-fast on SSH/SCP errors instead of
  continuing with empty VENV_SITE
- pyproject.toml: declare fastapi and uvicorn as runtime dependencies
- launch_elastic_p2p: kill EngineCore and proxy in trap handler to
  prevent GPU memory leaks on exit
2026-05-26 15:54:55 +08:00

124 lines
4.2 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Elastic P2P offload: N_INSTANCES (default 8) × TP=1 kv_both instances
# + cache_aware_proxy --offload.
#
# Architecture:
#   All instances run kv_role=kv_both (Mooncake connector).
#   The proxy classifies requests as WARM/MEDIUM/HEAVY.
#   HEAVY requests: prefill on a different instance (P), KV via Mooncake RDMA,
#   decode on session-sticky instance (D).
#   WARM/MEDIUM: co-located prefill+decode on session-sticky instance.
#
# Usage:
#   bash scripts/launch_elastic_p2p.sh              # default: this machine
#   HOST=dash1 bash scripts/launch_elastic_p2p.sh   # launch on dash1 via ssh
#   NOTE(review): nothing in this script reads HOST, so the second form
#   currently behaves exactly like the first — confirm whether a wrapper is
#   expected to handle the ssh hop.
#
# Environment overrides (default in parentheses):
#   MODEL_PATH       model directory ($HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   N_INSTANCES      number of instances; instance i is pinned to GPU i (8)
#   BASE_PORT        HTTP port of instance 0; instance i uses BASE_PORT+i (8000)
#   PROXY_PORT       port the proxy listens on (9090)
#   HEAVY_THRESHOLD  value passed to the proxy as --heavy-threshold (20000)
#   MAX_OFFLOAD      printed in the startup banner (4)
#                    NOTE(review): not passed to the proxy on its command
#                    line by this script — confirm how the proxy receives it.
set -euo pipefail

# Resolve every path relative to this file so the script works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"

MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
N_INSTANCES="${N_INSTANCES:-8}"
BASE_PORT="${BASE_PORT:-8000}"
PROXY_PORT="${PROXY_PORT:-9090}"
HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}"
MAX_OFFLOAD="${MAX_OFFLOAD:-4}"

# These values feed $(( ... )) arithmetic and loop bounds further down; reject
# anything that is not a plain positive integer before any process is started.
for _name in N_INSTANCES BASE_PORT PROXY_PORT; do
  if [[ ! "${!_name}" =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: $_name must be a positive integer (got '${!_name}')" >&2
    exit 1
  fi
done
unset _name
# Set once cleanup has run, so the EXIT trap that follows an INT/TERM exit
# does not repeat the teardown.
cleanup_done=0

#######################################
# Best-effort teardown of everything this launcher started.
# Sends SIGTERM to the shell's background jobs and to any process whose
# command line matches "vllm serve", "EngineCore" or "cache_aware_proxy",
# waits up to ~5 seconds for them to go away, then sends SIGKILL.
# NOTE: the pkill patterns are host-wide, not limited to children of this
# script; other matching processes owned by the same user are hit as well.
# Globals:   cleanup_done (read/written)
# Outputs:   "Cleaning up..." on stdout
# Returns:   0 always
#######################################
cleanup() {
  (( cleanup_done == 0 )) || return 0
  cleanup_done=1
  # A second Ctrl-C during the grace period must not cut the teardown short.
  trap '' INT TERM
  echo "Cleaning up..."

  # errexit is still active inside trap handlers, and kill/pkill return
  # non-zero when nothing matched; every step is therefore guarded so one
  # miss cannot skip the remaining kills.
  local pids pattern tries
  pids=$(jobs -p) || true
  if [[ -n "$pids" ]]; then
    # shellcheck disable=SC2086 # one PID per word is intended
    kill $pids 2>/dev/null || true
  fi

  local -a patterns=("vllm serve" "EngineCore" "cache_aware_proxy")
  for pattern in "${patterns[@]}"; do
    pkill -f "$pattern" 2>/dev/null || true
  done

  # Grace period: leave early as soon as no matching process remains.
  for (( tries = 0; tries < 5; tries++ )); do
    pgrep -f "vllm serve|EngineCore|cache_aware_proxy" > /dev/null 2>&1 || break
    sleep 1
  done

  # Anything still alive is force-killed.
  for pattern in "${patterns[@]}"; do
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
  wait 2>/dev/null || true
}

trap cleanup EXIT
# Turn signals into a real exit (which fires the EXIT trap above); a handler
# that merely returns would let the script carry on after the teardown.
trap 'exit 130' INT
trap 'exit 143' TERM
# Print the effective configuration before anything is launched.
printf '%s\n' \
  "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ===" \
  " Model: $MODEL" \
  " Instances: $N_INSTANCES × TP=1" \
  " Proxy: port $PROXY_PORT" \
  " Heavy threshold: $HEAVY_THRESHOLD tokens" \
  " Max offload: $MAX_OFFLOAD concurrent" \
  ""
# Step 1: Launch all instances with kv_role=kv_both
#
# Each instance i runs in the background, pinned to GPU i, and gets its own
# HTTP port (BASE_PORT+i), Mooncake bootstrap port (8998+i) and MASTER_PORT
# (29500+i). Its stdout/stderr go to /tmp/elastic_inst_<i>.log.
# Two strings are accumulated for the proxy command line:
#   combined_args    space-separated instance URLs
#   bootstrap_ports  comma-separated bootstrap ports

# Background jobs are not covered by errexit: without this check a missing
# binary would only surface as a readiness timeout much later.
if [[ ! -x "$VLLM" ]]; then
  echo "ERROR: vllm executable not found or not executable: $VLLM" >&2
  exit 1
fi

combined_args=""
bootstrap_ports=""
for (( i = 0; i < N_INSTANCES; i++ )); do
  port=$((BASE_PORT + i))
  bootstrap=$((8998 + i))
  master_port=$((29500 + i))
  logfile="/tmp/elastic_inst_${i}.log"
  echo " Instance $i: GPU $i, port $port, bootstrap $bootstrap"
  VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap" \
  MASTER_PORT="$master_port" \
  CUDA_VISIBLE_DEVICES="$i" \
  "$VLLM" serve "$MODEL" \
    --host 0.0.0.0 --port "$port" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --kv-transfer-config \
    '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
    > "$logfile" 2>&1 &
  combined_args="$combined_args http://127.0.0.1:$port"
  # ${var:+...} inserts the comma only once the list is non-empty.
  bootstrap_ports="${bootstrap_ports:+$bootstrap_ports,}$bootstrap"
  sleep 2 # stagger startup to avoid port collision
done
# Step 2: Wait for all instances
#
# Polls each instance's /v1/models endpoint every 5 seconds, for at most 600
# seconds per instance. On timeout the script stops with timeout's own exit
# status after naming the instance and its log file, instead of dying
# silently under errexit.
echo ""
echo "Waiting for instances..."
for (( i = 0; i < N_INSTANCES; i++ )); do
  port=$((BASE_PORT + i))
  rc=0
  timeout 600 bash -c "until curl -s localhost:$port/v1/models > /dev/null 2>&1; do sleep 5; done" || rc=$?
  if (( rc != 0 )); then
    echo "ERROR: instance $i (port $port) not ready after 600s; see /tmp/elastic_inst_${i}.log" >&2
    exit "$rc"
  fi
  echo " Instance $i (port $port) ready"
done
# Step 3: Wait for bootstrap servers
#
# Polls each bootstrap port's /query endpoint (ports 8998+i, the same
# numbering used when the instances were launched) every 2 seconds, for at
# most 120 seconds per port. On timeout the script stops with timeout's own
# exit status after naming the port, instead of dying silently under errexit.
echo "Waiting for bootstrap servers..."
for (( i = 0; i < N_INSTANCES; i++ )); do
  bp=$((8998 + i))
  rc=0
  timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done" || rc=$?
  if (( rc != 0 )); then
    echo "ERROR: bootstrap server on port $bp (instance $i) not ready after 120s" >&2
    exit "$rc"
  fi
  echo " Bootstrap $bp ready"
done
# Step 4: Start proxy with --offload
#
# Runs cache_aware_proxy.py in the background with the instance URLs and
# bootstrap ports collected during launch, then gives it 5 seconds to come up.
echo ""
echo "Starting proxy (offload mode)..."
# shellcheck disable=SC2086 # combined_args is a space-separated URL list; word splitting is intended
"$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
  --combined $combined_args \
  --bootstrap-ports "$bootstrap_ports" \
  --offload \
  --policy unified \
  --heavy-threshold "$HEAVY_THRESHOLD" \
  --port "$PROXY_PORT" &
proxy_pid=$!
sleep 5
# A background job that dies is invisible to errexit; check explicitly so a
# proxy that crashed on startup is reported here rather than later.
if ! kill -0 "$proxy_pid" 2>/dev/null; then
  echo "ERROR: cache_aware_proxy (pid $proxy_pid) exited during startup" >&2
  exit 1
fi
# Step 5: Smoke test

#######################################
# Sends one tiny completion request (three prompt token ids, max_tokens=3)
# through the proxy and reports whether the reply contains "choices".
# Globals:   PROXY_PORT (read), MODEL (read)
# Outputs:   progress and result lines on stdout
# Returns:   0 always; a failed smoke test is a warning, not a fatal error
#######################################
smoke_test() {
  local payload result
  printf -v payload \
    '{"model":"%s","prompt":[100,200,300],"max_tokens":3,"temperature":0}' \
    "$MODEL"
  echo ""
  echo "Smoke test..."
  # "|| true": under errexit a failing curl inside a plain assignment would
  # terminate the script (and, through the EXIT trap, everything it started)
  # before the warning below could be printed. -S keeps curl's own error text
  # so the warning says why the request failed.
  result=$(curl -sS -m 120 "http://localhost:$PROXY_PORT/v1/completions" \
    -X POST -H "Content-Type: application/json" \
    -d "$payload" 2>&1) || true
  # Pattern match instead of `echo | grep -q`: with pipefail, grep exiting
  # early can make the pipeline report failure via SIGPIPE on the writer.
  if [[ "$result" == *choices* ]]; then
    echo " Smoke test passed!"
  else
    echo " WARNING: Smoke test failed: $result"
  fi
}
smoke_test
# Final summary: where to reach the proxy, where the logs are, and a sample
# benchmark command. The trailing "\\" prints a literal backslash so the
# sample command can be pasted as a multi-line shell command.
printf '%s\n' \
  "" \
  "=== Elastic P2P ready ===" \
  " Endpoint: http://localhost:$PROXY_PORT" \
  " Breakdown: curl http://localhost:$PROXY_PORT/breakdown" \
  " Instance logs: /tmp/elastic_inst_*.log" \
  "" \
  "Run benchmark:" \
  " python -m replayer --trace traces/sampled_1000req_seed42.jsonl \\" \
  " --output outputs/elastic_p2p/metrics.jsonl \\" \
  " --endpoint http://localhost:$PROXY_PORT \\" \
  " --time-scale 20 --max-inflight-sessions 8 -v" \
  ""
# Stay in the foreground until every background job has exited.
wait