Systematic study of prefill-decode disaggregation for agentic LLM workloads using a production GLM-5.1 coder trace (2.1M requests, 71B input tokens).

Key findings:
- Cache-aware routing improves TPOT p90 by 15% and APC from 20.8% to 44.7% without PD separation, matching PD-Sep's decode isolation benefit
- PD separation adds +72% TTFT overhead (KV transfer) with no TPOT gain when using the same cache-aware scheduler
- Prefill remains compute-bound even at 95% KV cache reuse (AI >1000x vs decode AI <2), but absolute FLOPs drop 71% from cache hits
- For agentic MoE workloads, cache-aware routing > PD separation

Infrastructure:
- Trace sampler preserving session structure + hash_ids for prefix sharing
- Async trace replayer with streaming TTFT/TPOT/E2E measurement
- Unified cache-aware + token-level load-balanced global scheduler proxy supporting both PD-colocated and PD-disaggregated (Mooncake/RDMA) modes
- vLLM 0.18.1 scheduler patch for KV transfer abort race condition
- Roofline analysis tool for prefill/decode compute characterization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
255 lines
9.0 KiB
Bash
Executable File
255 lines
9.0 KiB
Bash
Executable File
#!/bin/bash
# Run the complete experiment matrix:
#   1. Combined TP=2 DP=4 (4 instances, baseline)
#   2. Combined TP=1 DP=8 (8 instances, max throughput)
#   3. PD-Sep TP=1: P×4 + D×4 via Mooncake/RDMA
#
# All use the same trace, same concurrency, same timeout.
#
# Environment overrides (each falls back to the default shown):
#   MODEL_PATH       model directory handed to `vllm serve`
#                    (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   MAX_SESSIONS     value for the replayer's --max-inflight-sessions (8)
#   MAX_CONCURRENT   value for the replayer's --concurrency-limit      (16)
#   TIME_SCALE       value for the replayer's --time-scale             (10)
#   REQUEST_TIMEOUT  value for the replayer's --request-timeout        (300)
#   REQUEST_LIMIT    value for the replayer's --request-limit          (unset)

set -euo pipefail

# Project root is the parent of the directory that contains this script.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"

MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl"

# Uniform benchmark params
MAX_SESSIONS="${MAX_SESSIONS:-8}"
MAX_CONCURRENT="${MAX_CONCURRENT:-16}"
# Overridable like its siblings; the default keeps the previous fixed value.
TIME_SCALE="${TIME_SCALE:-10}"
REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-300}"
REQUEST_LIMIT="${REQUEST_LIMIT:-}"  # empty = all 1000

#######################################
# Force-kill leftovers from a previous run so the next experiment starts with
# free ports and free GPUs.
# Globals:   none
# Arguments: none
# Outputs:   nothing (stderr of every kill step is discarded)
# Returns:   every kill step is followed by `|| true`, so a step that finds
#            nothing to kill does not fail the function.
#######################################
cleanup_gpu() {
  # Any process whose full command line mentions vllm.
  pkill -9 -f "vllm" 2>/dev/null || true

  # pkill patterns are extended regular expressions, where alternation is a
  # bare `|`. The previous `\|` spelling is a *literal* pipe in ERE, so this
  # step never matched any proxy process.
  pkill -9 -f "cache_aware_proxy|mooncake_connector_proxy|uvicorn" 2>/dev/null || true

  # Whatever still holds the proxy port (9090) or the first server port (8000).
  fuser 9090/tcp 8000/tcp 2>/dev/null | xargs -r kill -9 2>/dev/null || true
  sleep 5

  # Whatever still has an NVIDIA device node open; PIDs are split one per line
  # and de-duplicated before being handed to kill.
  fuser /dev/nvidia* 2>/dev/null | tr " " "\n" | sort -u | xargs -r kill -9 2>/dev/null || true
  sleep 10
}

#######################################
# Block until an HTTP server on localhost answers GET /v1/models.
# Polls with curl every 5 seconds, bounded by timeout(1).
# Arguments: $1 - TCP port to probe
#            $2 - maximum wait handed to timeout(1), default 600
# Outputs:   one diagnostic line on stderr if the wait fails
# Returns:   0 once curl succeeds; otherwise the exit status of timeout(1)
#######################################
wait_for_server() {
  local port=$1
  local max_wait=${2:-600}
  local rc=0

  # The port is passed as a positional parameter to the inner shell instead of
  # being spliced into the command string.
  timeout "$max_wait" bash -c \
    'until curl -s "localhost:$1/v1/models" >/dev/null 2>&1; do sleep 5; done' \
    _ "$port" || rc=$?

  if (( rc != 0 )); then
    # Without this the caller (running under set -e) dies with no hint about
    # which server never came up.
    printf 'ERROR: wait for server on port %s failed (limit %s, exit %d)\n' \
      "$port" "$max_wait" "$rc" >&2
  fi
  return "$rc"
}

#######################################
# Replay the trace against an endpoint and report how many metric lines
# were written.
# Globals:   PROJECT_DIR, PYTHON, TRACE, MODEL, TIME_SCALE, MAX_SESSIONS,
#            MAX_CONCURRENT, REQUEST_TIMEOUT, REQUEST_LIMIT (all read)
# Arguments: $1 - tag; results go to $PROJECT_DIR/outputs/<tag>/metrics.jsonl
#            $2 - endpoint URL passed to the replayer
#            $3 - optional, whitespace-separated extra replayer arguments
#                 (previously accepted but silently dropped)
# Outputs:   progress lines on stdout
# Returns:   non-zero if the replayer or the line count fails
#######################################
run_benchmark() {
  local tag=$1
  local endpoint=$2
  local extra_args="${3:-}"
  local outdir="$PROJECT_DIR/outputs/$tag"
  local metrics="$outdir/metrics.jsonl"

  echo "  Running benchmark -> $outdir"
  # Make sure the final line count cannot fail just because nobody created
  # the directory.
  mkdir -p "$outdir"

  # Build the command as an array so paths containing whitespace stay intact.
  local -a cmd=(
    "$PYTHON" -m replayer
    --trace "$TRACE"
    --output "$metrics"
    --endpoint "$endpoint"
    --model "$MODEL"
    --time-scale "$TIME_SCALE"
    --max-inflight-sessions "$MAX_SESSIONS"
    --concurrency-limit "$MAX_CONCURRENT"
    --request-timeout "$REQUEST_TIMEOUT"
  )
  # An empty REQUEST_LIMIT means "no --request-limit flag at all".
  if [[ -n "$REQUEST_LIMIT" ]]; then
    cmd+=(--request-limit "$REQUEST_LIMIT")
  fi
  cmd+=(-v)
  if [[ -n "$extra_args" ]]; then
    local -a extra=()
    read -r -a extra <<<"$extra_args"
    if (( ${#extra[@]} > 0 )); then
      cmd+=("${extra[@]}")
    fi
  fi

  "${cmd[@]}"

  echo "  Done: $(wc -l < "$metrics") requests"
}

#######################################################################
# Experiment 1: Combined TP=2 DP=4
#######################################################################
# Launch four vLLM servers (tensor-parallel size 2, GPU pairs 0-1 .. 6-7,
# ports 8000-8003), put the cache-aware proxy in --combined mode in front of
# them on port 9090, and run the benchmark under the tag
# "exp1_combined_tp2_dp4".
# Globals:   VLLM, PYTHON, MODEL, PROJECT_DIR (read)
# Arguments: none
# Outputs:   progress lines on stdout
# Returns:   non-zero if a server wait or the benchmark fails
run_combined_tp2_dp4() {
  local i gpu_start gpu_end port
  local -a backends=()

  echo ""
  echo "================================================================"
  echo "  Experiment 1: Combined TP=2 DP=4 (4 instances on 8 GPUs)"
  echo "================================================================"
  cleanup_gpu

  for i in 0 1 2 3; do
    gpu_start=$((i * 2))
    gpu_end=$((gpu_start + 1))
    port=$((8000 + i))
    echo "  Starting instance $i: GPUs $gpu_start,$gpu_end, port $port"
    # Quoted so a project/model path containing spaces stays a single word.
    CUDA_VISIBLE_DEVICES="$gpu_start,$gpu_end" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 2 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 &
    # Collected here so the proxy's backend list cannot drift from the ports
    # that were actually launched.
    backends+=("http://127.0.0.1:$port")
  done

  for i in 0 1 2 3; do
    wait_for_server "$((8000 + i))"
    echo "  Instance $i ready"
  done
  echo "  All 4 instances ready"

  # Start global scheduler (cache-aware proxy in combined mode)
  echo "  Starting global scheduler..."
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --port 9090 &
  # Fixed grace period; the proxy is not polled for readiness.
  sleep 5

  run_benchmark "exp1_combined_tp2_dp4" "http://localhost:9090"
}

#######################################################################
# Experiment 2: Combined TP=1 DP=8
#######################################################################
# Launch eight single-GPU vLLM servers (GPU i on port 8000+i), put the
# cache-aware proxy in --combined mode in front of them on port 9090, and run
# the benchmark under the tag "exp2_combined_tp1_dp8".
# Globals:   VLLM, PYTHON, MODEL, PROJECT_DIR (read)
# Arguments: none
# Outputs:   progress lines on stdout
# Returns:   non-zero if a server wait or the benchmark fails
run_combined_tp1_dp8() {
  local i port
  local -a backends=()

  echo ""
  echo "================================================================"
  echo "  Experiment 2: Combined TP=1 DP=8 (8 instances on 8 GPUs)"
  echo "================================================================"
  cleanup_gpu

  for i in {0..7}; do
    port=$((8000 + i))
    echo "  Starting instance $i: GPU $i, port $port"
    # Quoted so a project/model path containing spaces stays a single word.
    CUDA_VISIBLE_DEVICES="$i" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 &
    # Collected here so the proxy's backend list cannot drift from the ports
    # that were actually launched.
    backends+=("http://127.0.0.1:$port")
  done

  for i in {0..7}; do
    wait_for_server "$((8000 + i))"
    echo "  Instance $i ready"
  done
  echo "  All 8 instances ready"

  # Start global scheduler (cache-aware proxy in combined mode)
  echo "  Starting global scheduler..."
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --port 9090 &
  # Fixed grace period; the proxy is not polled for readiness.
  sleep 5

  run_benchmark "exp2_combined_tp1_dp8" "http://localhost:9090"
}

#######################################################################
# Experiment 3: PD-Sep TP=1 P×4 D×4 (Mooncake/RDMA)
#######################################################################
# Launch four prefill servers (GPUs 0-3, ports 8010-8013, bootstrap ports
# 8998-9001, kv_role=kv_producer) and four decode servers (GPUs 4-7, ports
# 8020-8023, kv_role=kv_consumer), start the proxy on port 9090 with the
# --prefill/--decode lists, smoke-test it, and run the benchmark under the tag
# "exp3_pd_sep_tp1_mooncake".
# Globals:   VLLM, PYTHON, MODEL, PROJECT_DIR (read)
# Arguments: none
# Outputs:   progress lines on stdout; a warning on stderr when every smoke
#            test attempt fails (the benchmark still runs, as before)
# Returns:   non-zero if a server/bootstrap wait or the benchmark fails
run_pd_sep_tp1() {
  local proxy_script="$PROJECT_DIR/scripts/cache_aware_proxy.py"
  local i gpu port bootstrap bp attempt
  local result=""
  local smoke_ok=0
  local -a prefill_args=() decode_args=() bootstrap_ports=()

  echo ""
  echo "================================================================"
  echo "  Experiment 3: PD-Sep TP=1 P×4 + D×4 (Mooncake/RDMA)"
  echo "================================================================"
  cleanup_gpu

  # Start 4 prefill instances (GPUs 0-3)
  for i in 0 1 2 3; do
    port=$((8010 + i))
    bootstrap=$((8998 + i))
    echo "  Prefill $i: GPU $i, port $port, bootstrap $bootstrap"
    VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap" \
    CUDA_VISIBLE_DEVICES="$i" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}' &
    # Each prefill backend is handed to the proxy as "<url> <bootstrap port>".
    prefill_args+=(--prefill "http://127.0.0.1:$port" "$bootstrap")
    bootstrap_ports+=("$bootstrap")
  done

  # Start 4 decode instances (GPUs 4-7)
  for i in 0 1 2 3; do
    gpu=$((4 + i))
    port=$((8020 + i))
    echo "  Decode $i: GPU $gpu, port $port"
    CUDA_VISIBLE_DEVICES="$gpu" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer","kv_load_failure_policy":"recompute"}' &
    decode_args+=(--decode "http://127.0.0.1:$port")
  done

  # Wait for all instances
  for i in 0 1 2 3; do
    wait_for_server "$((8010 + i))"
    echo "  Prefill $i ready"
  done
  for i in 0 1 2 3; do
    wait_for_server "$((8020 + i))"
    echo "  Decode $i ready"
  done

  # Start proxy (wait for bootstrap to be queryable first)
  echo "  Waiting for bootstrap servers..."
  for bp in "${bootstrap_ports[@]}"; do
    timeout 120 bash -c \
      'until curl -s "localhost:$1/query" > /dev/null 2>&1; do sleep 2; done' \
      _ "$bp"
    echo "  Bootstrap $bp ready"
  done

  # The message now names the port the proxy is really given (9090); 9000 is
  # one of the bootstrap ports.
  echo "  Starting proxy on port 9090..."
  "$PYTHON" "$proxy_script" "${prefill_args[@]}" "${decode_args[@]}" \
    --host 0.0.0.0 --port 9090 &
  sleep 15

  # Smoke test with retry
  echo "  Smoke test..."
  for attempt in 1 2 3; do
    # `|| true`: under set -e a refused or timed-out connection would
    # otherwise abort the whole script instead of counting as a failed attempt.
    result=$(curl -s -m 120 http://localhost:9090/v1/completions \
      -X POST -H "Content-Type: application/json" \
      -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
    # Here-string instead of `echo | grep -q`, which can fail under pipefail
    # when grep exits before echo has finished writing.
    if grep -q "choices" <<<"$result"; then
      echo "  Smoke test passed!"
      smoke_ok=1
      break
    fi
    echo "  Attempt $attempt failed, retrying..."
    sleep 10
  done
  if (( ! smoke_ok )); then
    echo "  WARNING: smoke test failed after 3 attempts; running benchmark anyway" >&2
  fi

  run_benchmark "exp3_pd_sep_tp1_mooncake" "http://localhost:9090"
}

#######################################################################
# Main
#######################################################################
# Print the run configuration, run the experiment(s) selected by the first
# command-line argument (default: all three, in order), then clean up.
echo "Starting experiment matrix on $(hostname)"
echo "Model: $MODEL"
echo "Trace: $TRACE"
echo "Params: sessions=$MAX_SESSIONS, concurrent=$MAX_CONCURRENT, time_scale=$TIME_SCALE"
echo ""

case "${1:-all}" in
  1 | tp2dp4)
    run_combined_tp2_dp4
    ;;
  2 | tp1dp8)
    run_combined_tp1_dp8
    ;;
  3 | pdsep)
    run_pd_sep_tp1
    ;;
  all)
    run_combined_tp2_dp4
    run_combined_tp1_dp8
    run_pd_sep_tp1
    ;;
  *)
    # Usage errors are diagnostics, so they go to stderr.
    echo "Usage: $0 {1|2|3|all|tp2dp4|tp1dp8|pdsep}" >&2
    exit 1
    ;;
esac

echo ""
echo "================================================================"
echo "  All experiments complete!"
echo "================================================================"
# Leave no servers behind once the selected experiments have finished.
cleanup_gpu