Files
agentic-kvc/scripts/launch_elastic_p2p.sh
Gahow Wang 3fdcec9c0f Fix review P2s: lockfile, model path convention, trap robustness
- Regenerate uv.lock after adding fastapi/uvicorn deps so uv sync
  --locked no longer fails
- B3 scripts: default MODEL to $HOME/models/... matching documented
  convention and other launch scripts (repo has no models/ directory)
- launch_elastic_p2p: append || true to each trap command so set -e
  doesn't abort cleanup when jobs -p is empty and EngineCore orphans
  remain
2026-05-26 16:05:43 +08:00

124 lines
4.2 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Elastic P2P offload: 8× TP=1 kv_both instances + cache_aware_proxy --offload.
#
# Architecture:
# All 8 instances run kv_role=kv_both (Mooncake connector).
# The proxy classifies requests as WARM/MEDIUM/HEAVY.
# HEAVY requests: prefill on a different instance (P), KV via Mooncake RDMA,
# decode on session-sticky instance (D).
# WARM/MEDIUM: co-located prefill+decode on session-sticky instance.
#
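# Usage:
#   bash scripts/launch_elastic_p2p.sh   # launch on this machine with defaults
#   MODEL_PATH=/path/to/model HEAVY_THRESHOLD=20000 MAX_OFFLOAD=4 \
#     bash scripts/launch_elastic_p2p.sh # override the environment-tunable settings
# NOTE: this script does not read HOST and does not invoke ssh itself; to launch
# on another machine (e.g. dash1), run the script on that machine, e.g. via
#   ssh dash1 bash <project>/scripts/launch_elastic_p2p.sh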
# Strict mode: abort on failing commands, unset variables, and failing
# pipeline stages.
set -euo pipefail

# Locate the project relative to this script so it can be run from any
# working directory. CDPATH is cleared for the `cd`: with CDPATH set, a
# relative script directory (e.g. "scripts") could be resolved against a
# CDPATH entry instead of the current directory, and `cd` would also echo the
# directory to stdout, corrupting the captured value.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname -- "$SCRIPT_DIR")"

# Executables come from the project's virtualenv.
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"

# Model to serve; override with MODEL_PATH.
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# Instance i is pinned to GPU i and listens on BASE_PORT + i.
N_INSTANCES=8
BASE_PORT=8000
# Port the cache-aware proxy listens on.
PROXY_PORT=9090

# Forwarded to the proxy as --heavy-threshold; override via environment.
HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}"
# NOTE(review): MAX_OFFLOAD is only echoed in the startup banner by this
# script; it is not passed to the proxy. Confirm whether that is intended.
MAX_OFFLOAD="${MAX_OFFLOAD:-4}"
#######################################
# Tear down everything this launcher started.
# Kills this shell's background jobs, then force-kills any process whose
# command line matches "vllm serve", "EngineCore" or "cache_aware_proxy", and
# finally reaps the children.
# NOTE: `pkill -f` matches on the full command line, so matching processes
# that were not started by this script are killed as well.
# Every step ends in `|| true` so one failing step (e.g. no jobs left, nothing
# matching) cannot abort the remaining cleanup under `set -e`.
#######################################
cleanup() {
  # Run at most once: drop the EXIT trap and ignore further INT/TERM so a
  # second Ctrl-C cannot interrupt the teardown half-way.
  trap - EXIT
  trap '' INT TERM
  echo "Cleaning up..."
  local pids
  pids=$(jobs -p) || true
  if [[ -n "$pids" ]]; then
    # shellcheck disable=SC2086 # one PID per word; splitting is intended
    kill $pids 2>/dev/null || true
  fi
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "EngineCore" 2>/dev/null || true
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  wait 2>/dev/null || true
}
# Cleanup lives on EXIT only. INT/TERM just exit (with the conventional
# 128+signal status), which in turn fires the EXIT trap; without an explicit
# exit the script could keep running after the handler returned.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Startup banner: show the effective configuration before launching anything.
printf '%s\n' \
  "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ===" \
  " Model: $MODEL" \
  " Instances: $N_INSTANCES × TP=1" \
  " Proxy: port $PROXY_PORT" \
  " Heavy threshold: $HEAVY_THRESHOLD tokens" \
  " Max offload: $MAX_OFFLOAD concurrent" \
  ""
# Step 1: Launch all instances with kv_role=kv_both
# Instance i is pinned to GPU i and gets its own HTTP port (BASE_PORT + i),
# Mooncake bootstrap port (8998 + i) and MASTER_PORT (29500 + i). Each runs in
# the background with stdout/stderr sent to /tmp/elastic_inst_<i>.log.
combined_args=""   # space-separated instance URLs, passed to the proxy's --combined
bootstrap_ports="" # comma-separated bootstrap ports, passed to --bootstrap-ports
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  bootstrap=$((8998 + i))
  master_port=$((29500 + i))
  logfile="/tmp/elastic_inst_${i}.log"
  echo " Instance $i: GPU $i, port $port, bootstrap $bootstrap"
  # "$VLLM" is quoted so a project path containing spaces still resolves to
  # a single command word.
  VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap" \
    MASTER_PORT="$master_port" \
    CUDA_VISIBLE_DEVICES="$i" \
    "$VLLM" serve "$MODEL" \
    --host 0.0.0.0 --port "$port" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --kv-transfer-config \
    '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
    > "$logfile" 2>&1 &
  combined_args="$combined_args http://127.0.0.1:$port"
  # ${var:+...} inserts the comma separator only once the list is non-empty.
  bootstrap_ports="${bootstrap_ports:+$bootstrap_ports,}$bootstrap"
  sleep 2 # stagger startup to avoid port collision
done
# Step 2: Wait for all instances
# Poll each instance's /v1/models endpoint every 5s, for at most 600s.
echo ""
echo "Waiting for instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  # Capture the status instead of letting `set -e` kill the script silently,
  # so a timeout reports which instance failed and where its log is.
  rc=0
  timeout 600 bash -c "until curl -s localhost:$port/v1/models > /dev/null 2>&1; do sleep 5; done" || rc=$?
  if ((rc != 0)); then
    echo "ERROR: instance $i (port $port) did not become ready (exit $rc; 124 means the 600s timeout expired). See /tmp/elastic_inst_${i}.log" >&2
    exit "$rc"
  fi
  echo " Instance $i (port $port) ready"
done
# Step 3: Wait for bootstrap servers
# Poll each bootstrap port's /query endpoint every 2s, for at most 120s.
# Ports follow the same 8998 + i scheme used when the instances were launched.
echo "Waiting for bootstrap servers..."
for ((i = 0; i < N_INSTANCES; i++)); do
  bp=$((8998 + i))
  # Capture the status instead of letting `set -e` kill the script silently,
  # so a timeout reports which bootstrap server failed.
  rc=0
  timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done" || rc=$?
  if ((rc != 0)); then
    echo "ERROR: bootstrap server on port $bp (instance $i) did not become ready (exit $rc; 124 means the 120s timeout expired). See /tmp/elastic_inst_${i}.log" >&2
    exit "$rc"
  fi
  echo " Bootstrap $bp ready"
done
# Step 4: Start proxy with --offload
# The proxy runs in the background; it receives every instance URL via
# --combined and the matching bootstrap ports via --bootstrap-ports.
echo ""
echo "Starting proxy (offload mode)..."
# NOTE(review): MAX_OFFLOAD is configured and shown in the banner but is not
# forwarded to the proxy here — confirm whether the proxy should receive it.
# $combined_args is deliberately unquoted: it is a space-separated URL list
# that must split into one argument per URL (the URLs contain no glob chars).
# shellcheck disable=SC2086
"$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
  --combined $combined_args \
  --bootstrap-ports "$bootstrap_ports" \
  --offload \
  --policy unified \
  --heavy-threshold "$HEAVY_THRESHOLD" \
  --port "$PROXY_PORT" &
# Fixed grace period for the proxy to start listening before the smoke test.
sleep 5
# Step 5: Smoke test
# Send one tiny completion request (token-id prompt, 3 output tokens) through
# the proxy and look for a "choices" field in the reply. A failure only warns.
echo ""
echo "Smoke test..."
# `|| true`: a failing curl (proxy not listening yet, 120s timeout, ...) must
# end up in the warning branch below. Without it, `set -e` aborts the script
# on the assignment and the EXIT trap tears down every instance.
result=$(curl -s -m 120 "http://localhost:$PROXY_PORT/v1/completions" \
  -X POST -H "Content-Type: application/json" \
  -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
# Substring match in the shell itself: `echo | grep -q` under `pipefail` can
# report failure when grep exits early and echo is hit by SIGPIPE.
if [[ "$result" == *choices* ]]; then
  echo " Smoke test passed!"
else
  echo " WARNING: Smoke test failed: $result"
fi
# Final summary: where the proxy is reachable and where the logs live.
printf '%s\n' \
  "" \
  "=== Elastic P2P ready ===" \
  " Endpoint: http://localhost:$PROXY_PORT" \
  " Breakdown: curl http://localhost:$PROXY_PORT/breakdown" \
  " Instance logs: /tmp/elastic_inst_*.log" \
  ""

# Copy-pasteable benchmark command; each line but the last ends in a literal
# backslash continuation.
printf '%s\n' \
  "Run benchmark:" \
  ' python -m replayer --trace traces/sampled_1000req_seed42.jsonl \' \
  ' --output outputs/elastic_p2p/metrics.jsonl \' \
  " --endpoint http://localhost:$PROXY_PORT \\" \
  " --time-scale 20 --max-inflight-sessions 8 -v" \
  ""

# Stay in the foreground until the background jobs (instances and proxy) exit.
wait