agentic-kvc/microbench/lifecycle/launch_pd_pair.sh

#!/bin/bash
# Launch PD-separated pair (TP=1 each) for lifecycle microbenchmark.
# Uses GPUs 1 (prefill) and 2 (decode) to avoid conflicting with Microbench 1 on GPU 0.
#
# Usage: [MODEL_PATH=/path/to/model] bash launch_pd_pair.sh
#   Requires: ~/agentic-kv/.venv with vLLM 0.18.1 + Mooncake; curl on PATH

# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Python interpreter taken from the project virtualenv.
VENV="${HOME}/agentic-kv/.venv/bin"
PYTHON="${VENV}/python"

# Model directory; a non-empty MODEL_PATH already present in the environment
# takes precedence over the default below.
: "${MODEL_PATH:=${HOME}/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# HTTP ports of the two API servers, plus the Mooncake bootstrap port that the
# prefill side is started with and the decode side is pointed at.
PREFILL_PORT=8010
DECODE_PORT=8020
BOOTSTRAP_PORT=8998

# GPU index exposed to each instance through CUDA_VISIBLE_DEVICES.
PREFILL_GPU=1
DECODE_GPU=2

# Directory that receives the per-instance logs and pid files.
LOG_DIR="$HOME/agentic-kv/microbench/lifecycle/logs"
mkdir -p "$LOG_DIR"

#######################################
# Terminate every background job started by this script and reap it.
# Safe to call when no jobs exist; never fails, so it cannot replace the
# script's real exit status when invoked from the EXIT trap under `set -e`.
# Globals:   none
# Arguments: none
# Outputs:   writes "Cleaning up..." to stdout
# Returns:   0 always
#######################################
cleanup() {
    echo "Cleaning up..."
    local pids
    pids=$(jobs -p)
    if [[ -n "$pids" ]]; then
        # Word splitting is intentional: jobs -p prints one PID per line.
        # shellcheck disable=SC2086
        kill $pids 2>/dev/null || true
    fi
    wait 2>/dev/null || true
}

# Cleanup runs exactly once, from the EXIT trap. INT/TERM only convert the
# signal into an exit; without the explicit exit the script would resume
# after the handler (e.g. keep polling a server that was just killed).
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Print the launch configuration: model path, then GPU and port(s) per role,
# followed by a blank separator line.
printf '%s\n' \
    "=== PD Lifecycle Microbench: PD-separated pair ===" \
    "  Model:     $MODEL_PATH" \
    "  Prefill:   GPU $PREFILL_GPU, port $PREFILL_PORT, bootstrap $BOOTSTRAP_PORT" \
    "  Decode:    GPU $DECODE_GPU, port $DECODE_PORT" \
    ""

# Start prefill instance (KV producer).
# stdout+stderr are sent to tee through a process substitution instead of a
# pipeline: after `cmd | tee &` the value of $! is the PID of tee, whereas
# here $! is the PID of the vLLM server process itself.
echo "[1/2] Starting prefill instance on GPU $PREFILL_GPU..."
VLLM_MOONCAKE_BOOTSTRAP_PORT="$BOOTSTRAP_PORT" \
CUDA_VISIBLE_DEVICES="$PREFILL_GPU" \
"$PYTHON" -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --host 0.0.0.0 \
    --port "$PREFILL_PORT" \
    --tensor-parallel-size 1 \
    --trust-remote-code \
    --enable-prefix-caching \
    --dtype auto \
    --gpu-memory-utilization 0.9 \
    --max-model-len 200000 \
    --no-enable-log-requests \
    --kv-transfer-config \
    '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}' \
    > >(tee "$LOG_DIR/prefill.log") 2>&1 &
PREFILL_PID=$!
echo "  Prefill PID=$PREFILL_PID"

# Wait for prefill to be ready.
# READY_TIMEOUT (optional, environment) overrides the default of 180 attempts,
# made roughly one second apart.
prefill_timeout="${READY_TIMEOUT:-180}"
echo "  Waiting for prefill instance..."
for ((i = 1; i <= prefill_timeout; i++)); do
    # Fail fast if the server process is gone instead of polling a dead port
    # (or a stale listener) until the timeout.
    if ! kill -0 "$PREFILL_PID" 2>/dev/null; then
        echo "  ERROR: Prefill exited during startup; see $LOG_DIR/prefill.log" >&2
        exit 1
    fi
    # --max-time keeps a single hung request from stalling the whole loop.
    if curl -s --max-time 5 "http://127.0.0.1:$PREFILL_PORT/v1/models" > /dev/null 2>&1; then
        echo "  Prefill ready after ${i}s"
        break
    fi
    if ((i == prefill_timeout)); then
        echo "  ERROR: Prefill did not start within ${prefill_timeout}s" >&2
        exit 1
    fi
    sleep 1
done

# Start decode instance (KV consumer).
# The consumer's connector config points prefill_addr at the bootstrap port
# on localhost; printf keeps the JSON readable without escaped quotes.
decode_kv_config=$(printf \
    '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer","kv_connector_extra_config":{"prefill_addr":"127.0.0.1:%s"}}' \
    "$BOOTSTRAP_PORT")

# stdout+stderr are sent to tee through a process substitution instead of a
# pipeline: after `cmd | tee &` the value of $! is the PID of tee, whereas
# here $! is the PID of the vLLM server process itself.
echo "[2/2] Starting decode instance on GPU $DECODE_GPU..."
CUDA_VISIBLE_DEVICES="$DECODE_GPU" \
"$PYTHON" -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_PATH" \
    --host 0.0.0.0 \
    --port "$DECODE_PORT" \
    --tensor-parallel-size 1 \
    --trust-remote-code \
    --enable-prefix-caching \
    --dtype auto \
    --gpu-memory-utilization 0.9 \
    --max-model-len 200000 \
    --no-enable-log-requests \
    --kv-transfer-config \
    "$decode_kv_config" \
    > >(tee "$LOG_DIR/decode.log") 2>&1 &
DECODE_PID=$!
echo "  Decode PID=$DECODE_PID"

# Wait for decode to be ready.
# READY_TIMEOUT (optional, environment) overrides the default of 180 attempts,
# made roughly one second apart.
decode_timeout="${READY_TIMEOUT:-180}"
echo "  Waiting for decode instance..."
for ((i = 1; i <= decode_timeout; i++)); do
    # Fail fast if the server process is gone instead of polling a dead port
    # (or a stale listener) until the timeout.
    if ! kill -0 "$DECODE_PID" 2>/dev/null; then
        echo "  ERROR: Decode exited during startup; see $LOG_DIR/decode.log" >&2
        exit 1
    fi
    # --max-time keeps a single hung request from stalling the whole loop.
    if curl -s --max-time 5 "http://127.0.0.1:$DECODE_PORT/v1/models" > /dev/null 2>&1; then
        echo "  Decode ready after ${i}s"
        break
    fi
    if ((i == decode_timeout)); then
        echo "  ERROR: Decode did not start within ${decode_timeout}s" >&2
        exit 1
    fi
    sleep 1
done

# Report both endpoints together with the PIDs captured at launch.
printf '\n'
printf '%s\n' "=== Both instances ready ==="
printf '  Prefill: http://127.0.0.1:%s (PID %s)\n' "$PREFILL_PORT" "$PREFILL_PID"
printf '  Decode:  http://127.0.0.1:%s (PID %s)\n' "$DECODE_PORT" "$DECODE_PID"
printf '\n'

# Persist the PIDs under $LOG_DIR: one combined, labelled file plus one
# bare-PID file per instance.
{
    printf '  Prefill PID: %s\n' "$PREFILL_PID"
    printf '  Decode PID: %s\n' "$DECODE_PID"
} > "$LOG_DIR/.pids"
printf '%s\n' "$PREFILL_PID" > "$LOG_DIR/.prefill.pid"
printf '%s\n' "$DECODE_PID" > "$LOG_DIR/.decode.pid"

# Block until the background jobs exit; Ctrl+C is handled by the trap
# installed earlier in the script.
printf '%s\n' "Press Ctrl+C to stop both instances."
wait