Two microbenchmarks quantifying the elastic offload decision:
1. Interference (corrected): cold prefill causes 14-214x TPOT p90
degradation on same-worker decode (D∈{1,2,4,8} × P∈{2k,8k,16k,32k}).
An earlier run had a prefix-cache bug (deterministic prompts hit the cache
after rep 0); fixed by making each prompt unique with uuid + time_ns.
2. Transfer lifecycle: PD-sep TTFT breakdown via Mooncake proxy,
measuring prefill→RDMA→decode startup overhead.
Key finding: offload wins at all P≥2048 operating points —
transfer cost is 25-50% of interference cost even with bulk Mooncake.
58 lines
1.5 KiB
Bash
58 lines
1.5 KiB
Bash
#!/bin/bash
# Launch a single vLLM instance on one GPU for the interference microbenchmark.
# Runs with TP=1 and prefix caching enabled; the chunk size is handed to vLLM
# as --max-num-batched-tokens.
# NOTE(review): --enable-chunked-prefill is not passed explicitly below; this
# presumably relies on the installed vLLM enabling chunked prefill by default
# -- confirm against the vLLM version in use.
#
# Usage: bash launch_microbench1.sh [chunk_size] [port]
#   chunk_size: max_num_batched_tokens (default: 8192)
#   port:       serving port (default: 8000)
#
# Environment (all optional):
#   MODEL            model path
#                    (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   GPU_ID           value exported as CUDA_VISIBLE_DEVICES (default: 0)
#   STARTUP_TIMEOUT  seconds to wait for the server to answer (default: 120)
#
# Side effects: writes the server PID to .vllm_microbench1.pid and the server
# output to vllm_microbench1_chunk<chunk_size>.log in the current directory.
# Exit status: 0 once /v1/models answers successfully; 1 if the server exits
# early or does not answer within STARTUP_TIMEOUT seconds.

set -euo pipefail

CHUNK_SIZE=${1:-8192}
PORT=${2:-8000}
MODEL="${MODEL:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
GPU_ID=${GPU_ID:-0}
STARTUP_TIMEOUT=${STARTUP_TIMEOUT:-120}
LOG_FILE="vllm_microbench1_chunk${CHUNK_SIZE}.log"
PID_FILE=".vllm_microbench1.pid"

echo "=== Interference Microbench vLLM Instance ==="
echo "Model: $MODEL"
echo "GPU: $GPU_ID"
echo "Port: $PORT"
echo "Chunk size (max_num_batched_tokens): $CHUNK_SIZE"
echo "Log: $LOG_FILE"
echo ""

# Kill any existing vLLM on this port. The trailing "( |$)" keeps PORT=8000
# from also matching e.g. "--port 80001". pkill exits non-zero when nothing
# matched, which is expected here, hence the "|| true".
pkill -f "vllm.*--port ${PORT}( |\$)" 2>/dev/null || true
sleep 2

# Output goes through tee via process substitution rather than a pipeline:
# with "cmd | tee &", $! would be the PID of tee, not of the server.
CUDA_VISIBLE_DEVICES=$GPU_ID python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL" \
  --tensor-parallel-size 1 \
  --enable-prefix-caching \
  --dtype auto \
  --gpu-memory-utilization 0.9 \
  --max-model-len 200000 \
  --max-num-batched-tokens "$CHUNK_SIZE" \
  --port "$PORT" \
  --trust-remote-code \
  --disable-log-requests \
  > >(tee "$LOG_FILE") 2>&1 &

VLLM_PID=$!
echo "vLLM PID: $VLLM_PID"
echo "$VLLM_PID" > "$PID_FILE"

# Wait for server to be ready
echo "Waiting for server to start..."
for ((i = 1; i <= STARTUP_TIMEOUT; i++)); do
  # -f: treat HTTP >= 400 as failure so an error response is not "ready".
  # --max-time: a hung connection must not stall the loop past its budget.
  if curl -sf --max-time 2 "http://127.0.0.1:$PORT/v1/models" \
    > /dev/null 2>&1; then
    echo "Server ready after ${i}s!"
    exit 0
  fi
  # Fail fast if the server process has already gone away.
  if ! kill -0 "$VLLM_PID" 2> /dev/null; then
    echo "ERROR: vLLM (PID $VLLM_PID) exited before becoming ready;" \
      "see $LOG_FILE" >&2
    exit 1
  fi
  sleep 1
done

echo "ERROR: Server did not start within ${STARTUP_TIMEOUT}s" >&2
exit 1