Files
agentic-kvc/microbench/lifecycle/launch_pd_pair.sh
Gahow Wang f784e49c07 Microbench: prefill-decode interference + PD transfer lifecycle
Two microbenchmarks quantifying the elastic offload decision:

1. Interference (corrected): cold prefill causes 14-214x TPOT p90
   degradation on same-worker decode (D∈{1,2,4,8} × P∈{2k,8k,16k,32k}).
   Earlier run had a prefix-cache bug (deterministic prompts hit cache
   after rep 0); fixed with uuid+time_ns unique prompts.

2. Transfer lifecycle: PD-sep TTFT breakdown via Mooncake proxy,
   measuring prefill→RDMA→decode startup overhead.

Key finding: offload wins at all P≥2048 operating points —
transfer cost is 25-50% of interference cost even with bulk Mooncake.
2026-05-26 00:57:06 +08:00

113 lines
3.4 KiB
Bash

#!/bin/bash
# Launch PD-separated pair (TP=1 each) for lifecycle microbenchmark.
# Uses GPUs 1 (prefill) and 2 (decode) by default to avoid conflicting with
# Microbench 1 on GPU 0.
#
# Usage: bash launch_pd_pair.sh
# Requires: ~/agentic-kv/.venv with vLLM 0.18.1 + Mooncake
#
# Optional environment overrides (default in parentheses):
#   MODEL_PATH      ($HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   PREFILL_PORT    (8010)   DECODE_PORT  (8020)   BOOTSTRAP_PORT (8998)
#   PREFILL_GPU     (1)      DECODE_GPU   (2)
set -euo pipefail

VENV="$HOME/agentic-kv/.venv/bin"
PYTHON="$VENV/python"
MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
PREFILL_PORT="${PREFILL_PORT:-8010}"
DECODE_PORT="${DECODE_PORT:-8020}"
BOOTSTRAP_PORT="${BOOTSTRAP_PORT:-8998}"
PREFILL_GPU="${PREFILL_GPU:-1}"
DECODE_GPU="${DECODE_GPU:-2}"
LOG_DIR="$HOME/agentic-kv/microbench/lifecycle/logs"
mkdir -p "$LOG_DIR"

# Stop every background job started by this script, then reap them.
# The traps are cleared first so the handler runs exactly once, even when a
# signal handler's `exit` is followed by the EXIT trap.
cleanup() {
  trap - EXIT INT TERM
  echo "Cleaning up..."
  local pids
  pids=$(jobs -p)
  if [[ -n "$pids" ]]; then
    # Word splitting is intentional: jobs -p prints one PID per line.
    # shellcheck disable=SC2086
    kill $pids 2>/dev/null || true
  fi
  wait 2>/dev/null || true
}
trap cleanup EXIT
# Turn INT/TERM into a real exit. If the handler merely returned, the script
# would resume where it was interrupted (e.g. keep polling a server that the
# handler has just killed). The EXIT trap then performs the cleanup.
trap 'exit 130' INT
trap 'exit 143' TERM

echo "=== PD Lifecycle Microbench: PD-separated pair ==="
echo " Model: $MODEL_PATH"
echo " Prefill: GPU $PREFILL_GPU, port $PREFILL_PORT, bootstrap $BOOTSTRAP_PORT"
echo " Decode: GPU $DECODE_GPU, port $DECODE_PORT"
echo ""
# Start prefill instance (KV producer).
# Output is sent to tee through a process substitution instead of a
# backgrounded `cmd | tee &` pipeline: for a pipeline, $! is the PID of the
# last stage (tee), so PREFILL_PID would not identify the vLLM server.
echo "[1/2] Starting prefill instance on GPU $PREFILL_GPU..."
VLLM_MOONCAKE_BOOTSTRAP_PORT="$BOOTSTRAP_PORT" \
CUDA_VISIBLE_DEVICES="$PREFILL_GPU" \
"$PYTHON" -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_PATH" \
  --host 0.0.0.0 \
  --port "$PREFILL_PORT" \
  --tensor-parallel-size 1 \
  --trust-remote-code \
  --enable-prefix-caching \
  --dtype auto \
  --gpu-memory-utilization 0.9 \
  --max-model-len 200000 \
  --no-enable-log-requests \
  --kv-transfer-config \
  '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}' \
  > >(tee "$LOG_DIR/prefill.log") 2>&1 &
# PID of the server process itself (see note above).
PREFILL_PID=$!
echo " Prefill PID=$PREFILL_PID"
# Wait for prefill to be ready: probe /v1/models about once per second,
# giving up after 180 attempts.
echo " Waiting for prefill instance..."
for ((i = 1; i <= 180; i++)); do
  # -f: an HTTP error status counts as "not ready yet";
  # --max-time: a stalled connection cannot hang the loop.
  if curl -sf --max-time 5 "http://127.0.0.1:$PREFILL_PORT/v1/models" > /dev/null 2>&1; then
    echo " Prefill ready after ${i}s"
    break
  fi
  # Fail fast when the launched process is already gone instead of polling
  # a dead server until the timeout.
  if ! kill -0 "$PREFILL_PID" 2>/dev/null; then
    echo " ERROR: Prefill process (PID $PREFILL_PID) exited before becoming ready; see $LOG_DIR/prefill.log" >&2
    exit 1
  fi
  if ((i == 180)); then
    echo " ERROR: Prefill did not start within 180s" >&2
    exit 1
  fi
  sleep 1
done
# Start decode instance (KV consumer); it is pointed at the prefill
# instance's bootstrap port via kv_connector_extra_config.prefill_addr.
# Output is sent to tee through a process substitution instead of a
# backgrounded `cmd | tee &` pipeline: for a pipeline, $! is the PID of the
# last stage (tee), so DECODE_PID would not identify the vLLM server.
echo "[2/2] Starting decode instance on GPU $DECODE_GPU..."
CUDA_VISIBLE_DEVICES="$DECODE_GPU" \
"$PYTHON" -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_PATH" \
  --host 0.0.0.0 \
  --port "$DECODE_PORT" \
  --tensor-parallel-size 1 \
  --trust-remote-code \
  --enable-prefix-caching \
  --dtype auto \
  --gpu-memory-utilization 0.9 \
  --max-model-len 200000 \
  --no-enable-log-requests \
  --kv-transfer-config \
  "{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_consumer\",\"kv_connector_extra_config\":{\"prefill_addr\":\"127.0.0.1:$BOOTSTRAP_PORT\"}}" \
  > >(tee "$LOG_DIR/decode.log") 2>&1 &
# PID of the server process itself (see note above).
DECODE_PID=$!
echo " Decode PID=$DECODE_PID"
# Wait for decode to be ready: probe /v1/models about once per second,
# giving up after 180 attempts.
echo " Waiting for decode instance..."
for ((i = 1; i <= 180; i++)); do
  # -f: an HTTP error status counts as "not ready yet";
  # --max-time: a stalled connection cannot hang the loop.
  if curl -sf --max-time 5 "http://127.0.0.1:$DECODE_PORT/v1/models" > /dev/null 2>&1; then
    echo " Decode ready after ${i}s"
    break
  fi
  # Fail fast when the launched process is already gone instead of polling
  # a dead server until the timeout.
  if ! kill -0 "$DECODE_PID" 2>/dev/null; then
    echo " ERROR: Decode process (PID $DECODE_PID) exited before becoming ready; see $LOG_DIR/decode.log" >&2
    exit 1
  fi
  if ((i == 180)); then
    echo " ERROR: Decode did not start within 180s" >&2
    exit 1
  fi
  sleep 1
done
# Both readiness probes succeeded: print the endpoints and the PIDs captured
# at launch.
printf '\n'
printf '%s\n' "=== Both instances ready ==="
printf ' Prefill: http://127.0.0.1:%s (PID %s)\n' "$PREFILL_PORT" "$PREFILL_PID"
printf ' Decode: http://127.0.0.1:%s (PID %s)\n' "$DECODE_PORT" "$DECODE_PID"
printf '\n'
# Labelled summary of both PIDs in a single file.
{
  printf ' Prefill PID: %s\n' "$PREFILL_PID"
  printf ' Decode PID: %s\n' "$DECODE_PID"
} > "$LOG_DIR/.pids"
# One bare PID per file.
printf '%s\n' "$PREFILL_PID" > "$LOG_DIR/.prefill.pid"
printf '%s\n' "$DECODE_PID" > "$LOG_DIR/.decode.pid"
printf '%s\n' "Press Ctrl+C to stop both instances."
# Block until the background jobs have exited.
wait