agentic-kvc/scripts/launch_elastic_p2p.sh

#!/bin/bash
# Elastic P2P offload: 8× TP=1 kv_both instances + cache_aware_proxy --offload.
#
# Architecture:
#   All 8 instances run kv_role=kv_both (Mooncake connector).
#   The proxy classifies requests as WARM/MEDIUM/HEAVY.
#   HEAVY requests: prefill on a different instance (P), KV via Mooncake RDMA,
#     decode on session-sticky instance (D).
#   WARM/MEDIUM: co-located prefill+decode on session-sticky instance.
#
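# Usage:
#   bash scripts/launch_elastic_p2p.sh              # default: this machine
#   MODEL_PATH=/path/to/model bash scripts/launch_elastic_p2p.sh   # override the model
#
# Environment overrides: MODEL_PATH, HEAVY_THRESHOLD (default 20000), MAX_OFFLOAD (default 4).
# NOTE: HOST is not read by this script; to launch on another machine (e.g. dash1),
#   run the script on that machine, for instance over ssh.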

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# --- Locations ---------------------------------------------------------------
# Absolute directory containing this script, and the project root one level up.
SCRIPT_DIR=$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)
PROJECT_DIR=$(dirname "$SCRIPT_DIR")

# Executables are taken from the project-local virtualenv.
VENV="${PROJECT_DIR}/.venv/bin"
PYTHON="${VENV}/python"
VLLM="${VENV}/vllm"

# --- Fixed topology ----------------------------------------------------------
# Instance i listens on BASE_PORT + i; the proxy listens on PROXY_PORT.
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090

# --- Environment-overridable settings ------------------------------------------
# MODEL comes from MODEL_PATH when set, otherwise from the default under $HOME.
MODEL="${MODEL_PATH:-${HOME}/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Keep caller-supplied values; fall back to the defaults when unset or empty.
: "${HEAVY_THRESHOLD:=20000}"
: "${MAX_OFFLOAD:=4}"

# Terminate every background job started by this shell (the vLLM instances and
# the proxy) and reap them. Safe to call when no jobs are running.
cleanup() {
    echo "Cleaning up..."
    local pids
    pids=$(jobs -p)
    if [[ -n "$pids" ]]; then
        # Word-splitting is intentional: $pids is a whitespace-separated PID list.
        # shellcheck disable=SC2086
        kill $pids 2>/dev/null || true
    fi
    # `|| true` keeps a non-zero wait status from cutting the handler short
    # under `set -e`.
    wait 2>/dev/null || true
}

# EXIT performs the actual cleanup, exactly once. INT/TERM only turn the signal
# into an exit (conventional 128+signal status) so the script stops instead of
# resuming after the handler returns; the EXIT trap then runs cleanup.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Startup banner: echo the effective configuration before anything is launched.
cat <<EOF
=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ===
  Model:           $MODEL
  Instances:       $N_INSTANCES × TP=1
  Proxy:           port $PROXY_PORT
  Heavy threshold: $HEAVY_THRESHOLD tokens
  Max offload:     $MAX_OFFLOAD concurrent

EOF

# Step 1: Launch all instances with kv_role=kv_both
#
# Starts N_INSTANCES background `vllm serve` processes, one per GPU index, and
# builds two values consumed when the proxy is started:
#   combined_args   - space-separated list of instance base URLs
#   bootstrap_ports - comma-separated list of per-instance bootstrap ports

# Fail fast: background launch failures are not caught by `set -e` and would
# otherwise only surface as a readiness timeout much later.
if [[ ! -x "$VLLM" ]]; then
    echo "ERROR: vllm executable not found or not executable: $VLLM" >&2
    exit 1
fi

combined_args=""
bootstrap_ports=""
for ((i = 0; i < N_INSTANCES; i++)); do
    # Every instance gets its own HTTP port, bootstrap port, MASTER_PORT and log.
    port=$((BASE_PORT + i))
    bootstrap=$((8998 + i))
    master_port=$((29500 + i))
    logfile="/tmp/elastic_inst_${i}.log"

    echo "  Instance $i: GPU $i, port $port, bootstrap $bootstrap"

    # The environment assignments apply to this one command only.
    # "$VLLM" is quoted so a checkout path containing whitespace still works.
    VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap" \
    MASTER_PORT="$master_port" \
    CUDA_VISIBLE_DEVICES="$i" \
    "$VLLM" serve "$MODEL" \
        --host 0.0.0.0 --port "$port" \
        --tensor-parallel-size 1 \
        --trust-remote-code --enable-prefix-caching --enforce-eager \
        --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
        --kv-transfer-config \
        '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
        > "$logfile" 2>&1 &

    combined_args="$combined_args http://127.0.0.1:$port"
    # ${var:+...,} inserts the comma separator only after the first entry.
    bootstrap_ports="${bootstrap_ports:+$bootstrap_ports,}$bootstrap"

    sleep 2  # stagger startup to avoid port collision
done

# Step 2: Wait for all instances
#
# Polls each instance's /v1/models endpoint every 5 s, for at most 600 s per
# instance. On timeout the script stops with an explicit error naming the
# instance and its log file, instead of aborting silently via `set -e`.
echo ""
echo "Waiting for instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    if ! timeout 600 bash -c "until curl -s localhost:$port/v1/models > /dev/null 2>&1; do sleep 5; done"; then
        echo "ERROR: instance $i (port $port) not ready after 600s; see /tmp/elastic_inst_${i}.log" >&2
        exit 1
    fi
    echo "  Instance $i (port $port) ready"
done

# Step 3: Wait for bootstrap servers
#
# Polls each bootstrap port's /query endpoint every 2 s, for at most 120 s per
# port. The ports are taken from the comma-separated bootstrap_ports list built
# while launching the instances, so the port numbering is defined in one place.
echo "Waiting for bootstrap servers..."
IFS=',' read -r -a bootstrap_port_list <<< "$bootstrap_ports"
for bp in "${bootstrap_port_list[@]}"; do
    if ! timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done"; then
        # Report the failure explicitly; `set -e` alone would exit without a message.
        echo "ERROR: bootstrap server on port $bp not ready after 120s" >&2
        exit 1
    fi
    echo "  Bootstrap $bp ready"
done

# Step 4: Start proxy with --offload
#
# Launches cache_aware_proxy.py in the background, then waits until it accepts
# HTTP connections (or fails fast if it exits) rather than sleeping a fixed time.
#
# NOTE(review): MAX_OFFLOAD is printed in the startup banner but is not passed
# to the proxy here — confirm whether the proxy has a matching option.
echo ""
echo "Starting proxy (offload mode)..."

# combined_args is a space-separated URL list; split it into an array so each
# URL is passed as exactly one argument without relying on unquoted expansion.
read -r -a instance_urls <<< "$combined_args"

"$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${instance_urls[@]}" \
    --bootstrap-ports "$bootstrap_ports" \
    --offload \
    --heavy-threshold "$HEAVY_THRESHOLD" \
    --port "$PROXY_PORT" &
proxy_pid=$!

# Poll for up to ~60 s. Any HTTP response counts as "up" (no -f); -m bounds
# each probe so a hung connection cannot stall the loop.
proxy_ready=0
for ((attempt = 0; attempt < 60; attempt++)); do
    if ! kill -0 "$proxy_pid" 2>/dev/null; then
        echo "ERROR: proxy exited during startup" >&2
        exit 1
    fi
    if curl -s -m 5 -o /dev/null "http://localhost:$PROXY_PORT/breakdown"; then
        proxy_ready=1
        break
    fi
    sleep 1
done
if ((proxy_ready == 0)); then
    echo "ERROR: proxy not accepting connections on port $PROXY_PORT after 60s" >&2
    exit 1
fi

# Step 5: Smoke test
#
# Sends one tiny completion request (three prompt token IDs, three output
# tokens) through the proxy and reports whether the response contains "choices".
# A failure is reported as a warning only; the script keeps running.
echo ""
echo "Smoke test..."
# `|| true`: under `set -e` a failing curl inside the assignment would abort the
# script before the warning below could be printed.
# -S keeps curl's error message despite -s; 2>&1 captures it into $result so
# the warning shows why the request failed.
result=$(curl -sS -m 120 "http://localhost:$PROXY_PORT/v1/completions" \
    -X POST -H "Content-Type: application/json" \
    -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
# Substring match in the shell avoids an `echo | grep -q` pipeline, which can
# fail spuriously under `pipefail` when grep exits before echo finishes writing.
if [[ "$result" == *choices* ]]; then
    echo "  Smoke test passed!"
else
    echo "  WARNING: Smoke test failed: $result"
fi

# Final summary: where to reach the proxy, where the instance logs are, and an
# example benchmark command (printed with trailing backslashes so it can be
# pasted as one multi-line command).
printf '%s\n' \
    "" \
    "=== Elastic P2P ready ===" \
    "  Endpoint:    http://localhost:$PROXY_PORT" \
    "  Breakdown:   curl http://localhost:$PROXY_PORT/breakdown" \
    "  Instance logs: /tmp/elastic_inst_*.log" \
    "" \
    "Run benchmark:" \
    "  python -m replayer --trace traces/sampled_1000req_seed42.jsonl \\" \
    "      --output outputs/elastic_p2p/metrics.jsonl \\" \
    "      --endpoint http://localhost:$PROXY_PORT \\" \
    "      --time-scale 20 --max-inflight-sessions 8 -v" \
    ""

# Stay in the foreground until the background jobs exit.
wait