Files
agentic-kvc/scripts/launch_elastic_p2p.sh
Gahow Wang 2b0ac70ee7 Phase 1 milestone: system-level analysis + reproducible report
- REPORT.md: self-contained milestone report covering baseline vs elastic
  setup, exact launch commands, benchmark params, results, log locations,
  and repo structure — sufficient for anyone to reproduce
- analysis/pd_separation_analysis.md §5: elastic P2P system-level breakdown
  (KV cache hit ratio, per-class TTFT, GPU util paradox explanation)
- scripts/cache_aware_proxy.py: round-robin P-instance selection replacing
  argmin(ongoing_tokens) to fix GPU load imbalance (3.0x → expected ~2x)
- scripts/launch_elastic_p2p.sh: one-command launch for elastic P2P config

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 16:17:41 +08:00

123 lines
4.0 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Elastic P2P offload: 8× TP=1 kv_both instances + cache_aware_proxy --offload.
#
# Architecture:
# All 8 instances run kv_role=kv_both (Mooncake connector).
# The proxy classifies requests as WARM/MEDIUM/HEAVY.
# HEAVY requests: prefill on a different instance (P), KV via Mooncake RDMA,
# decode on session-sticky instance (D).
# WARM/MEDIUM: co-located prefill+decode on session-sticky instance.
#
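# Usage:
# bash scripts/launch_elastic_p2p.sh # default: this machine
# HOST=dash1 bash scripts/launch_elastic_p2p.sh # intended: launch on dash1 via ssh
# NOTE: HOST is not read anywhere in this script yet, so the second form
# currently behaves exactly like the first; to launch on another machine,
# run the script on that machine (e.g. through ssh).
#
# Environment overrides: MODEL_PATH, HEAVY_THRESHOLD, MAX_OFFLOAD.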
set -euo pipefail

# --- Paths ------------------------------------------------------------------
# This script sits in <project>/scripts, so the project root is its parent dir.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"

# --- Settings ---------------------------------------------------------------
# MODEL_PATH, HEAVY_THRESHOLD and MAX_OFFLOAD may be overridden from the
# environment; the remaining values are fixed.
# NOTE(review): MAX_OFFLOAD is printed in the startup banner but nothing in
# this script forwards it to the proxy - confirm whether that is intended.
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090
HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}"
MAX_OFFLOAD="${MAX_OFFLOAD:-4}"

#######################################
# Terminate every background job started by this script, then exit with the
# status the script was already exiting with.
# Runs once: the traps are cleared first so a signal that arrives during
# cleanup cannot re-enter it.
# Outputs: "Cleaning up..." on stdout.
#######################################
cleanup() {
  local status=$?
  trap - EXIT INT TERM
  echo "Cleaning up..."
  local pids
  pids="$(jobs -p)"
  if [[ -n "$pids" ]]; then
    # Word splitting is intended: `jobs -p` prints one PID per line.
    # shellcheck disable=SC2086
    kill $pids 2> /dev/null || true
  fi
  wait 2> /dev/null || true
  exit "$status"
}

# Signals are turned into a normal exit so that the EXIT trap is the single
# place where cleanup happens (previously the script resumed after INT/TERM).
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Startup banner: show the effective configuration before anything is launched.
printf '%s\n' \
  "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ===" \
  " Model: $MODEL" \
  " Instances: $N_INSTANCES × TP=1" \
  " Proxy: port $PROXY_PORT" \
  " Heavy threshold: $HEAVY_THRESHOLD tokens" \
  " Max offload: $MAX_OFFLOAD concurrent" \
  ""
# Step 1: Launch all instances with kv_role=kv_both
#
# Starts one background `vllm serve` process per index i in [0, N_INSTANCES):
#   - pinned to GPU i through CUDA_VISIBLE_DEVICES,
#   - HTTP API on BASE_PORT + i,
#   - Mooncake bootstrap port 8998 + i,
#   - MASTER_PORT 29500 + i (presumably so the per-process rendezvous ports
#     do not clash - confirm),
#   - stdout/stderr captured in /tmp/elastic_inst_<i>.log.
# While looping it accumulates two strings consumed by the proxy step:
#   combined_args   - space-separated instance URLs,
#   bootstrap_ports - comma-separated bootstrap ports.
combined_args=""
bootstrap_ports=""
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  bootstrap=$((8998 + i))
  master_port=$((29500 + i))
  logfile="/tmp/elastic_inst_${i}.log"
  echo " Instance $i: GPU $i, port $port, bootstrap $bootstrap"
  # "$VLLM" is quoted so a checkout path containing whitespace still works.
  VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap" \
  MASTER_PORT="$master_port" \
  CUDA_VISIBLE_DEVICES="$i" \
    "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
      > "$logfile" 2>&1 &
  combined_args="$combined_args http://127.0.0.1:$port"
  # ${var:+...} inserts the comma separator only once the list is non-empty.
  bootstrap_ports="${bootstrap_ports:+$bootstrap_ports,}$bootstrap"
  sleep 2 # stagger startup to avoid port collision
done
#######################################
# Poll an HTTP endpoint until curl can fetch it or the time budget runs out.
# Arguments:
#   $1 - URL to probe
#   $2 - overall budget in seconds
#   $3 - pause between probes in seconds
# Returns: 0 once a probe succeeds, 1 if the budget is exhausted first.
#######################################
wait_for_http() {
  local url=$1 limit=$2 pause=$3
  local deadline=$((SECONDS + limit))
  # -m bounds a single probe so a hung connection cannot stall the loop.
  until curl -s -m 10 "$url" > /dev/null 2>&1; do
    if ((SECONDS >= deadline)); then
      return 1
    fi
    sleep "$pause"
  done
}

# Step 2: Wait for all instances
echo ""
echo "Waiting for instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  if ! wait_for_http "localhost:$port/v1/models" 600 5; then
    # Say which instance is stuck and where its log is, instead of letting
    # `set -e` end the script without any explanation.
    echo "ERROR: instance $i (port $port) not ready after 600s;" \
      "see /tmp/elastic_inst_${i}.log" >&2
    exit 1
  fi
  echo " Instance $i (port $port) ready"
done

# Step 3: Wait for bootstrap servers
echo "Waiting for bootstrap servers..."
for ((i = 0; i < N_INSTANCES; i++)); do
  bp=$((8998 + i))
  if ! wait_for_http "localhost:$bp/query" 120 2; then
    echo "ERROR: bootstrap server on port $bp not ready after 120s;" \
      "see /tmp/elastic_inst_${i}.log" >&2
    exit 1
  fi
  echo " Bootstrap $bp ready"
done
# Step 4: Start proxy with --offload
#
# Runs cache_aware_proxy.py in the background, giving it the instance URLs,
# the matching bootstrap ports, the heavy-request threshold and the listen port.
# NOTE(review): MAX_OFFLOAD is not forwarded here - confirm whether the proxy
# has a corresponding option.
echo ""
echo "Starting proxy (offload mode)..."
# $combined_args is a space-separated URL list; it is left unquoted on purpose
# so that every URL becomes its own argument.
# shellcheck disable=SC2086
"$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
  --combined $combined_args \
  --bootstrap-ports "$bootstrap_ports" \
  --offload \
  --heavy-threshold "$HEAVY_THRESHOLD" \
  --port "$PROXY_PORT" &
proxy_pid=$!
sleep 5
# A proxy that died during its startup window (bad arguments, port in use, ...)
# is reported here instead of surfacing later as an unexplained failure.
if ! kill -0 "$proxy_pid" 2> /dev/null; then
  echo "ERROR: proxy (pid $proxy_pid) exited during startup" >&2
  exit 1
fi
#######################################
# Send one tiny completion request through the proxy and report the outcome.
# A failure is only a warning: the function always returns 0 so the launched
# stack stays up.
# Globals: PROXY_PORT (read), MODEL (read)
# Outputs: one status line on stdout.
#######################################
run_smoke_test() {
  local reply
  # `|| true`: under `set -e` a failing curl inside the assignment would abort
  # the whole script before the warning below could be printed.
  # -S keeps curl's error text (captured through 2>&1) so the warning says why.
  reply=$(curl -sS -m 120 "http://localhost:$PROXY_PORT/v1/completions" \
    -X POST -H "Content-Type: application/json" \
    -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
  # Pattern match instead of `echo | grep -q`, which can misreport under
  # pipefail when grep exits before echo has finished writing.
  if [[ "$reply" == *choices* ]]; then
    echo " Smoke test passed!"
  else
    echo " WARNING: Smoke test failed: $reply"
  fi
}

# Step 5: Smoke test
echo ""
echo "Smoke test..."
run_smoke_test
# Final banner: where the proxy listens, where the logs are, and a ready-made
# benchmark command line.
printf '%s\n' \
  "" \
  "=== Elastic P2P ready ===" \
  " Endpoint: http://localhost:$PROXY_PORT" \
  " Breakdown: curl http://localhost:$PROXY_PORT/breakdown" \
  " Instance logs: /tmp/elastic_inst_*.log" \
  "" \
  "Run benchmark:" \
  " python -m replayer --trace traces/sampled_1000req_seed42.jsonl \\" \
  " --output outputs/elastic_p2p/metrics.jsonl \\" \
  " --endpoint http://localhost:$PROXY_PORT \\" \
  " --time-scale 20 --max-inflight-sessions 8 -v" \
  ""
# Block until every background job started by this script has exited.
wait