Files
agentic-kvc/scripts/run_experiments.sh
Gahow Wang 05592e6adc Agentic workload PD separation analysis with trace-driven benchmarks
Systematic study of prefill-decode disaggregation for agentic LLM workloads
using production GLM-5.1 coder trace (2.1M requests, 71B input tokens).

Key findings:
- Cache-aware routing improves TPOT p90 by 15% and APC from 20.8% to 44.7%
  without PD separation, matching PD-Sep's decode isolation benefit
- PD separation adds +72% TTFT overhead (KV transfer) with no TPOT gain
  when using the same cache-aware scheduler
- Prefill remains compute-bound even at 95% KV cache reuse (AI >1000x
  vs decode AI <2), but absolute FLOPs drop 71% from cache hits
- For agentic MoE workloads, cache-aware routing > PD separation

Infrastructure:
- Trace sampler preserving session structure + hash_ids for prefix sharing
- Async trace replayer with streaming TTFT/TPOT/E2E measurement
- Unified cache-aware + token-level load-balanced global scheduler proxy
  supporting both PD-colocated and PD-disaggregated (Mooncake/RDMA) modes
- vLLM 0.18.1 scheduler patch for KV transfer abort race condition
- Roofline analysis tool for prefill/decode compute characterization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-21 21:21:57 +08:00

255 lines
9.0 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Run the complete experiment matrix:
# 1. Combined TP=2 DP=4 (4 instances, baseline)
# 2. Combined TP=1 DP=8 (8 instances, max throughput)
# 3. PD-Sep TP=1: P×4 + D×4 via Mooncake/RDMA
#
# All use the same trace, same concurrency, same timeout.
#
# Usage: run_experiments.sh [1|2|3|all|tp2dp4|tp1dp8|pdsep]   (default: all)
#
# Environment overrides (each falls back to the default shown below):
#   MODEL_PATH       model directory handed to `vllm serve` and the replayer
#   MAX_SESSIONS     value for the replayer's --max-inflight-sessions
#   MAX_CONCURRENT   value for the replayer's --concurrency-limit
#   TIME_SCALE       value for the replayer's --time-scale
#   REQUEST_TIMEOUT  value for the replayer's --request-timeout
#   REQUEST_LIMIT    value for the replayer's --request-limit (empty = omit)
set -euo pipefail

# Repository root = parent of the directory that holds this script.
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
VENV="$PROJECT_DIR/.venv/bin"
VLLM="$VENV/vllm"
PYTHON="$VENV/python"
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="$PROJECT_DIR/traces/sampled_1000req_seed42.jsonl"

# Uniform benchmark params
MAX_SESSIONS=${MAX_SESSIONS:-8}
MAX_CONCURRENT=${MAX_CONCURRENT:-16}
# Overridable like the other knobs; the default keeps the previous fixed value.
TIME_SCALE=${TIME_SCALE:-10}
REQUEST_TIMEOUT=${REQUEST_TIMEOUT:-300}
REQUEST_LIMIT="${REQUEST_LIMIT:-}" # empty = all 1000
#######################################
# Forcefully tear down whatever a previous experiment left running.
# Sends SIGKILL, in order, to:
#   - processes whose command line matches "vllm"
#   - the proxy processes (cache_aware_proxy / mooncake_connector_proxy /
#     uvicorn)
#   - whatever fuser reports on TCP ports 9090 and 8000
#   - whatever fuser reports as still holding a /dev/nvidia* device
# Every step is best-effort: errors are discarded and "nothing matched" is
# not treated as a failure, so this is safe to call on a clean machine.
# Arguments: none
# Returns:   0
#######################################
cleanup_gpu() {
  pkill -9 -f "vllm" 2>/dev/null || true
  # pkill interprets the pattern as an *extended* regex, where alternation is
  # a bare '|'. The escaped form '\|' matches a literal pipe character, so
  # the previous pattern never matched any of these three process names.
  pkill -9 -f "cache_aware_proxy|mooncake_connector_proxy|uvicorn" 2>/dev/null || true
  fuser 9090/tcp 8000/tcp 2>/dev/null | xargs -r kill -9 2>/dev/null || true
  sleep 5
  # One PID per line, de-duplicated, before handing them to kill.
  fuser /dev/nvidia* 2>/dev/null | tr " " "\n" | sort -u | xargs -r kill -9 2>/dev/null || true
  # NOTE(review): presumably a grace period for GPU memory to be released
  # before the next experiment starts — confirm the duration is still needed.
  sleep 10
}
#######################################
# Block until an HTTP server on localhost answers GET /v1/models.
# The endpoint is polled with curl every 5 seconds inside a `timeout` guard.
# Arguments:
#   $1 - TCP port to poll
#   $2 - overall deadline passed to `timeout` (default: 600)
# Returns:
#   exit status of `timeout` (non-zero if the deadline expires first)
#######################################
wait_for_server() {
  local port=$1
  local deadline=${2:-600}
  # The loop is run by a child bash so that `timeout` can bound it.
  local poll_loop="until curl -s localhost:$port/v1/models >/dev/null 2>&1; do sleep 5; done"
  timeout "$deadline" bash -c "$poll_loop"
}
#######################################
# Replay the trace against an endpoint and store the per-request metrics.
# Globals (read):
#   PROJECT_DIR, PYTHON, TRACE, MODEL, TIME_SCALE, MAX_SESSIONS,
#   MAX_CONCURRENT, REQUEST_TIMEOUT, REQUEST_LIMIT
# Arguments:
#   $1 - experiment tag; metrics go to $PROJECT_DIR/outputs/<tag>/metrics.jsonl
#   $2 - base URL the replayer sends requests to
#   $3 - optional extra replayer arguments as one whitespace-separated
#        string (no quoting support inside the string)
# Outputs:
#   progress lines on stdout, including the line count of the metrics file
# Returns:
#   non-zero if the replayer fails or the metrics file cannot be read
#######################################
run_benchmark() {
  local tag=$1
  local endpoint=$2
  local extra_args="${3:-}"
  local outdir="$PROJECT_DIR/outputs/$tag"
  local metrics="$outdir/metrics.jsonl"

  echo " Running benchmark -> $outdir"
  # Make sure the output path exists up front rather than relying on the
  # replayer to create it.
  mkdir -p -- "$outdir"

  # Build the command as an array so paths containing whitespace survive.
  local -a cmd=(
    "$PYTHON" -m replayer
    --trace "$TRACE"
    --output "$metrics"
    --endpoint "$endpoint"
    --model "$MODEL"
    --time-scale "$TIME_SCALE"
    --max-inflight-sessions "$MAX_SESSIONS"
    --concurrency-limit "$MAX_CONCURRENT"
    --request-timeout "$REQUEST_TIMEOUT"
  )
  if [[ -n "$REQUEST_LIMIT" ]]; then
    cmd+=(--request-limit "$REQUEST_LIMIT")
  fi
  # The third parameter used to be accepted but silently dropped; forward it.
  if [[ -n "$extra_args" ]]; then
    local -a extra_tokens=()
    read -r -a extra_tokens <<< "$extra_args"
    # The +-guard keeps `set -u` happy on older bash when the string was
    # only whitespace and the array is therefore empty.
    cmd+=(${extra_tokens[@]+"${extra_tokens[@]}"})
  fi
  cmd+=(-v)

  "${cmd[@]}"
  echo " Done: $(wc -l < "$metrics") requests"
}
#######################################################################
# Experiment 1: Combined TP=2 DP=4
#######################################################################
# Launches four `vllm serve` instances in the background, instance i using
# GPUs 2i and 2i+1 and listening on port 8000+i, waits for each to answer,
# starts cache_aware_proxy.py in --combined mode on port 9090 in front of
# them, and runs the benchmark through the proxy.
# Globals (read): VLLM, PYTHON, MODEL, PROJECT_DIR
# Arguments:      none
# Returns:        non-zero if a wait or the benchmark fails
run_combined_tp2_dp4() {
  echo ""
  echo "================================================================"
  echo " Experiment 1: Combined TP=2 DP=4 (4 instances on 8 GPUs)"
  echo "================================================================"
  cleanup_gpu

  local i gpu_start gpu_end port
  local -a backends=()
  for i in 0 1 2 3; do
    gpu_start=$((i * 2))
    gpu_end=$((gpu_start + 1))
    port=$((8000 + i))
    echo " Starting instance $i: GPUs $gpu_start,$gpu_end, port $port"
    # "$VLLM" is quoted: it lives under PROJECT_DIR, which may contain spaces.
    CUDA_VISIBLE_DEVICES="$gpu_start,$gpu_end" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 2 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 &
    # Collect the URL here so the proxy's backend list cannot drift from
    # the ports actually used above.
    backends+=("http://127.0.0.1:$port")
  done

  for i in 0 1 2 3; do
    wait_for_server $((8000 + i))
    echo " Instance $i ready"
  done
  echo " All 4 instances ready"

  # Start global scheduler (cache-aware proxy in combined mode)
  echo " Starting global scheduler..."
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --port 9090 &
  # Fixed pause to let the proxy start; its readiness is not probed here.
  sleep 5
  run_benchmark "exp1_combined_tp2_dp4" "http://localhost:9090"
}
#######################################################################
# Experiment 2: Combined TP=1 DP=8
#######################################################################
# Launches eight `vllm serve` instances in the background, instance i pinned
# to GPU i and listening on port 8000+i, waits for each to answer, starts
# cache_aware_proxy.py in --combined mode on port 9090 in front of them, and
# runs the benchmark through the proxy.
# Globals (read): VLLM, PYTHON, MODEL, PROJECT_DIR
# Arguments:      none
# Returns:        non-zero if a wait or the benchmark fails
run_combined_tp1_dp8() {
  echo ""
  echo "================================================================"
  echo " Experiment 2: Combined TP=1 DP=8 (8 instances on 8 GPUs)"
  echo "================================================================"
  cleanup_gpu

  local i port
  local -a backends=()
  for ((i = 0; i < 8; i++)); do
    port=$((8000 + i))
    echo " Starting instance $i: GPU $i, port $port"
    # "$VLLM" is quoted: it lives under PROJECT_DIR, which may contain spaces.
    CUDA_VISIBLE_DEVICES="$i" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 &
    # Collect the URL here so the proxy's backend list cannot drift from
    # the ports actually used above.
    backends+=("http://127.0.0.1:$port")
  done

  for ((i = 0; i < 8; i++)); do
    wait_for_server $((8000 + i))
    echo " Instance $i ready"
  done
  echo " All 8 instances ready"

  # Start global scheduler (cache-aware proxy in combined mode)
  echo " Starting global scheduler..."
  "$PYTHON" "$PROJECT_DIR/scripts/cache_aware_proxy.py" \
    --combined "${backends[@]}" \
    --port 9090 &
  # Fixed pause to let the proxy start; its readiness is not probed here.
  sleep 5
  run_benchmark "exp2_combined_tp1_dp8" "http://localhost:9090"
}
#######################################################################
# Experiment 3: PD-Sep TP=1 P×4 D×4 (Mooncake/RDMA)
#######################################################################
# Launches four prefill instances (GPUs 0-3, ports 8010-8013, started with
# kv_role=kv_producer and VLLM_MOONCAKE_BOOTSTRAP_PORT=8998+i) and four
# decode instances (GPUs 4-7, ports 8020-8023, kv_role=kv_consumer), waits
# for all of them and for the bootstrap ports, starts cache_aware_proxy.py
# on port 9090 with --prefill/--decode backends, runs a small smoke request
# and finally the benchmark.
# Globals (read):    VLLM, PYTHON, MODEL, PROJECT_DIR
# Globals (written): PROXY_SCRIPT
# Arguments:         none
# Returns:           non-zero if a wait or the benchmark fails
run_pd_sep_tp1() {
  echo ""
  echo "================================================================"
  echo " Experiment 3: PD-Sep TP=1 P×4 + D×4 (Mooncake/RDMA)"
  echo "================================================================"
  cleanup_gpu
  PROXY_SCRIPT="$PROJECT_DIR/scripts/cache_aware_proxy.py"

  local i gpu port bootstrap bp attempt result
  local smoke_ok=0
  # Arrays instead of concatenated strings: no reliance on word splitting.
  local -a prefill_args=() decode_args=() bootstrap_ports=()

  # Start 4 prefill instances (GPUs 0-3)
  for i in 0 1 2 3; do
    port=$((8010 + i))
    bootstrap=$((8998 + i))
    echo " Prefill $i: GPU $i, port $port, bootstrap $bootstrap"
    VLLM_MOONCAKE_BOOTSTRAP_PORT="$bootstrap" \
      CUDA_VISIBLE_DEVICES="$i" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_producer"}' &
    prefill_args+=(--prefill "http://127.0.0.1:$port" "$bootstrap")
    bootstrap_ports+=("$bootstrap")
  done

  # Start 4 decode instances (GPUs 4-7)
  for i in 0 1 2 3; do
    gpu=$((4 + i))
    port=$((8020 + i))
    echo " Decode $i: GPU $gpu, port $port"
    CUDA_VISIBLE_DEVICES="$gpu" "$VLLM" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching --enforce-eager \
      --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
      --kv-transfer-config \
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_consumer","kv_load_failure_policy":"recompute"}' &
    decode_args+=(--decode "http://127.0.0.1:$port")
  done

  # Wait for all instances
  for i in 0 1 2 3; do
    wait_for_server $((8010 + i))
    echo " Prefill $i ready"
  done
  for i in 0 1 2 3; do
    wait_for_server $((8020 + i))
    echo " Decode $i ready"
  done

  # Start proxy (wait for bootstrap to be queryable first)
  echo " Waiting for bootstrap servers..."
  for bp in "${bootstrap_ports[@]}"; do
    timeout 120 bash -c "until curl -s localhost:$bp/query > /dev/null 2>&1; do sleep 2; done"
    echo " Bootstrap $bp ready"
  done

  # The proxy listens on 9090; the message used to say 9000, which is in
  # fact one of the bootstrap ports.
  echo " Starting proxy on port 9090..."
  "$PYTHON" "$PROXY_SCRIPT" "${prefill_args[@]}" "${decode_args[@]}" --host 0.0.0.0 --port 9090 &
  sleep 15

  # Smoke test with retry
  echo " Smoke test..."
  for attempt in 1 2 3; do
    # `|| true`: under `set -e` a failing curl inside a plain assignment
    # would abort the whole script on the first attempt, defeating the
    # retry loop.
    result=$(curl -s -m 120 http://localhost:9090/v1/completions \
      -X POST -H "Content-Type: application/json" \
      -d "{\"model\":\"$MODEL\",\"prompt\":[100,200,300],\"max_tokens\":3,\"temperature\":0}" 2>&1) || true
    # Here-string instead of `echo | grep -q`: with pipefail, grep exiting
    # early can make the pipeline report failure on a matching response.
    if grep -q "choices" <<< "$result"; then
      echo " Smoke test passed!"
      smoke_ok=1
      break
    fi
    echo " Attempt $attempt failed, retrying..."
    sleep 10
  done
  if [[ "$smoke_ok" -eq 0 ]]; then
    # Still best-effort (the benchmark runs regardless, as before), but the
    # failure is no longer silent.
    echo " WARNING: smoke test did not pass after 3 attempts; running benchmark anyway" >&2
  fi

  run_benchmark "exp3_pd_sep_tp1_mooncake" "http://localhost:9090"
}
#######################################################################
# Main
#######################################################################
# Prints the run parameters, dispatches on the first CLI argument
# (default "all") to one or all experiment functions, prints the closing
# banner and performs a final cleanup_gpu. An unrecognised argument prints
# the usage line and exits with status 1.
main() {
  local selection="${1:-all}"

  printf '%s\n' \
    "Starting experiment matrix on $(hostname)" \
    "Model: $MODEL" \
    "Trace: $TRACE" \
    "Params: sessions=$MAX_SESSIONS, concurrent=$MAX_CONCURRENT, time_scale=$TIME_SCALE" \
    ""

  case "$selection" in
    1 | tp2dp4)
      run_combined_tp2_dp4
      ;;
    2 | tp1dp8)
      run_combined_tp1_dp8
      ;;
    3 | pdsep)
      run_pd_sep_tp1
      ;;
    all)
      run_combined_tp2_dp4
      run_combined_tp1_dp8
      run_pd_sep_tp1
      ;;
    *)
      printf '%s\n' "Usage: $0 {1|2|3|all|tp2dp4|tp1dp8|pdsep}"
      exit 1
      ;;
  esac

  printf '%s\n' \
    "" \
    "================================================================" \
    " All experiments complete!" \
    "================================================================"
  cleanup_gpu
}

main "$@"