Two microbenchmarks quantifying the elastic offload decision:
1. Interference (corrected): cold prefill causes 14-214x TPOT p90
degradation on same-worker decode (D∈{1,2,4,8} × P∈{2k,8k,16k,32k}).
Earlier run had a prefix-cache bug (deterministic prompts hit cache
after rep 0); fixed with uuid+time_ns unique prompts.
2. Transfer lifecycle: PD-sep TTFT breakdown via Mooncake proxy,
measuring prefill→RDMA→decode startup overhead.
Key finding: offload wins at all P≥2048 operating points —
transfer cost is 25-50% of interference cost even with bulk Mooncake.
113 lines
3.4 KiB
Bash
113 lines
3.4 KiB
Bash
#!/bin/bash
#
# Launch a PD-separated pair (TP=1 each) for the lifecycle microbenchmark.
# Uses GPUs 1 (prefill) and 2 (decode) to avoid conflicting with Microbench 1 on GPU 0.
#
# The prefill instance is started as the Mooncake KV producer, the decode
# instance as the KV consumer pointing at the prefill bootstrap port. The
# script blocks until both servers exit or it receives Ctrl+C / SIGTERM, at
# which point both servers are terminated.
#
# Usage: bash launch_pd_pair.sh
# Requires: ~/agentic-kv/.venv with vLLM 0.18.1 + Mooncake, and curl on PATH.
#
# Environment:
#   MODEL_PATH  Model to serve
#               (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#
# Outputs (under $HOME/agentic-kv/microbench/lifecycle/logs):
#   prefill.log, decode.log        combined stdout/stderr of each server
#   .pids, .prefill.pid, .decode.pid   PIDs of the two server processes

set -euo pipefail

VENV="$HOME/agentic-kv/.venv/bin"
PYTHON="$VENV/python"
MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

PREFILL_PORT=8010
DECODE_PORT=8020
BOOTSTRAP_PORT=8998

PREFILL_GPU=1
DECODE_GPU=2

# Maximum time to wait for each server's HTTP endpoint to come up.
READY_TIMEOUT_S=180

LOG_DIR="$HOME/agentic-kv/microbench/lifecycle/logs"

# PIDs of the server processes started by this script; consumed by cleanup().
SERVER_PIDS=()

# Print an error message to stderr and abort the script.
die() {
  printf ' ERROR: %s\n' "$*" >&2
  exit 1
}

# Terminate every server started so far and reap it.
# Installed as the EXIT handler, so it runs exactly once on any exit path.
cleanup() {
  # Disarm the traps first so a signal arriving mid-cleanup cannot re-enter.
  trap - EXIT INT TERM
  echo "Cleaning up..."
  if (( ${#SERVER_PIDS[@]} > 0 )); then
    kill "${SERVER_PIDS[@]}" 2>/dev/null || true
  fi
  wait 2>/dev/null || true
}

#######################################
# Poll a server's /v1/models endpoint until it answers successfully.
# Globals:   READY_TIMEOUT_S (read), LOG_DIR (read), SECONDS (read)
# Arguments: $1 - label used in messages (e.g. "Prefill")
#            $2 - TCP port the server listens on
#            $3 - PID of the server process
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 once the endpoint responds; 1 if the process exits first or
#            the timeout elapses
#######################################
wait_for_ready() {
  local label=$1 port=$2 pid=$3
  local start=$SECONDS

  while (( SECONDS - start < READY_TIMEOUT_S )); do
    # -f: treat HTTP error statuses as "not ready"; --max-time: never let a
    # single hung request stall the loop.
    if curl -sf --max-time 2 -o /dev/null \
        "http://127.0.0.1:${port}/v1/models" 2>/dev/null; then
      echo " ${label} ready after $(( SECONDS - start ))s"
      return 0
    fi
    # Fail fast when the server died during startup instead of polling a
    # dead port until the timeout.
    if ! kill -0 "$pid" 2>/dev/null; then
      echo " ERROR: ${label} exited before becoming ready (see logs in $LOG_DIR)" >&2
      return 1
    fi
    sleep 1
  done

  echo " ERROR: ${label} did not start within ${READY_TIMEOUT_S}s" >&2
  return 1
}

# Fail early on missing prerequisites rather than timing out later.
[[ -x "$PYTHON" ]] || die "python interpreter not found or not executable: $PYTHON"
command -v curl > /dev/null || die "curl is required for readiness checks"

mkdir -p "$LOG_DIR"

trap cleanup EXIT
# Turn signals into a normal exit so the EXIT handler performs the cleanup.
trap 'exit 130' INT
trap 'exit 143' TERM

# Server arguments shared by the prefill and decode instances.
VLLM_COMMON_ARGS=(
  --model "$MODEL_PATH"
  --host 0.0.0.0
  --tensor-parallel-size 1
  --trust-remote-code
  --enable-prefix-caching
  --dtype auto
  --gpu-memory-utilization 0.9
  --max-model-len 200000
  --no-enable-log-requests
)

PREFILL_KV_CONFIG='{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}'
printf -v DECODE_KV_CONFIG \
  '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer","kv_connector_extra_config":{"prefill_addr":"127.0.0.1:%s"}}' \
  "$BOOTSTRAP_PORT"

echo "=== PD Lifecycle Microbench: PD-separated pair ==="
echo " Model: $MODEL_PATH"
echo " Prefill: GPU $PREFILL_GPU, port $PREFILL_PORT, bootstrap $BOOTSTRAP_PORT"
echo " Decode: GPU $DECODE_GPU, port $DECODE_PORT"
echo ""

# Start prefill instance (KV producer).
# Output is logged through a process substitution rather than `| tee` so
# that $! is the PID of the server itself, not of tee.
echo "[1/2] Starting prefill instance on GPU $PREFILL_GPU..."
VLLM_MOONCAKE_BOOTSTRAP_PORT="$BOOTSTRAP_PORT" \
CUDA_VISIBLE_DEVICES="$PREFILL_GPU" \
  "$PYTHON" -m vllm.entrypoints.openai.api_server \
    "${VLLM_COMMON_ARGS[@]}" \
    --port "$PREFILL_PORT" \
    --kv-transfer-config "$PREFILL_KV_CONFIG" \
    > >(tee "$LOG_DIR/prefill.log") 2>&1 &
PREFILL_PID=$!
SERVER_PIDS+=("$PREFILL_PID")
echo " Prefill PID=$PREFILL_PID"

# Wait for prefill to be ready
echo " Waiting for prefill instance..."
wait_for_ready "Prefill" "$PREFILL_PORT" "$PREFILL_PID" || exit 1

# Start decode instance (KV consumer)
echo "[2/2] Starting decode instance on GPU $DECODE_GPU..."
CUDA_VISIBLE_DEVICES="$DECODE_GPU" \
  "$PYTHON" -m vllm.entrypoints.openai.api_server \
    "${VLLM_COMMON_ARGS[@]}" \
    --port "$DECODE_PORT" \
    --kv-transfer-config "$DECODE_KV_CONFIG" \
    > >(tee "$LOG_DIR/decode.log") 2>&1 &
DECODE_PID=$!
SERVER_PIDS+=("$DECODE_PID")
echo " Decode PID=$DECODE_PID"

# Wait for decode to be ready
echo " Waiting for decode instance..."
wait_for_ready "Decode" "$DECODE_PORT" "$DECODE_PID" || exit 1

echo ""
echo "=== Both instances ready ==="
echo " Prefill: http://127.0.0.1:$PREFILL_PORT (PID $PREFILL_PID)"
echo " Decode: http://127.0.0.1:$DECODE_PORT (PID $DECODE_PID)"
echo ""

# Record the server PIDs in the log directory.
{
  echo " Prefill PID: $PREFILL_PID"
  echo " Decode PID: $DECODE_PID"
} > "$LOG_DIR/.pids"
echo "$PREFILL_PID" > "$LOG_DIR/.prefill.pid"
echo "$DECODE_PID" > "$LOG_DIR/.decode.pid"

echo "Press Ctrl+C to stop both instances."
# Block until the servers exit; Ctrl+C / SIGTERM triggers cleanup via the traps.
wait