8×TP1 + load_only proxy, shape 512×64, rates 32/64/128 req/s total:
Rate=32 (non-saturated, thr=0.95-0.97):
plain TTFT p90=64ms, mooncake_both=65ms → +2% (noise)
Rate=64 (non-saturated, thr=0.96):
plain TTFT p90=114ms, mooncake_both=107ms → -6% (noise)
Rate=128 (saturated, thr=0.70-0.71):
plain TTFT p90=702ms, mooncake_both=822ms → +17%
plain TTFT p50=339ms, mooncake_both=470ms → +39%
Conclusion: The elastic_migration_v2 +45% is a saturation artifact.
Under SLO-compliant load (TTFT<10s, thr_ratio>0.9), mooncake_both's
1.4ms/step build_connector_meta overhead is completely masked by the
scheduler-model async pipeline. The tax only manifests when the system
is already saturated and queueing amplifies per-step differences.
For practical deployment: enabling kv_role=kv_both has effectively zero
cost as long as the serving system stays within SLO capacity bounds.
170 lines
5.7 KiB
Bash
Executable File
170 lines
5.7 KiB
Bash
Executable File
#!/bin/bash
# 8-instance connector tax microbench: plain vs mooncake_both
#
# Launches 8×TP1 vLLM instances + cache_aware_proxy, same topology as
# elastic_migration_v2. Runs open-loop bench at rates 32,64,128 req/s
# with short shape (512 input, 64 output) to maximize decode concurrency.
#
# Usage:
#   bash run_8instance.sh --mode plain      # no Mooncake
#   bash run_8instance.sh --mode mooncake   # kv_role=kv_both
#
# Environment:
#   MODEL_PATH  optional; model path handed to `vllm serve`
#               (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#
# Results go to results/8inst_<mode>_<date>/

set -euo pipefail

# Absolute directory of this script; the results/ tree lives beneath it.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Project root, two directory levels above this script.
PROJ="$(cd "$HERE/../.." && pwd)"
PYTHON="$PROJ/.venv/bin/python"
VLLM="$PROJ/.venv/bin/vllm"
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Append the inherited PYTHONPATH only when it is non-empty, so an unset
# PYTHONPATH does not leave a trailing ':' (an empty search-path entry).
export PYTHONPATH="$PROJ${PYTHONPATH:+:$PYTHONPATH}"

N_INSTANCES=8   # instance i is pinned to GPU i via CUDA_VISIBLE_DEVICES
BASE_PORT=8000  # instance i listens on BASE_PORT + i
PROXY_PORT=9090

# Parse --mode plain|mooncake.
# MODE starts empty so that a missing --mode falls through to the usage
# check below instead of carrying a placeholder flag string around.
MODE=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --mode)
      # Under `set -u`, reading "$2" when no value follows --mode would
      # abort with a cryptic "unbound variable"; report usage instead.
      if [[ $# -lt 2 ]]; then
        echo "Usage: $0 --mode plain|mooncake" >&2
        exit 1
      fi
      MODE="$2"
      shift 2
      ;;
    *)
      echo "Unknown arg: $1" >&2
      exit 1
      ;;
  esac
done

# Only the two benchmark configurations are accepted.
if [[ "$MODE" != "plain" && "$MODE" != "mooncake" ]]; then
  echo "Usage: $0 --mode plain|mooncake" >&2
  exit 1
fi

# Per-run output directory: results/8inst_<mode>_<YYYYmmdd_HHMM>/ next to
# this script. The timestamp has minute resolution, so two runs of the same
# mode started within the same minute share (and overwrite) one directory.
DATE=$(date +%Y%m%d_%H%M)
OUTDIR="$HERE/results/8inst_${MODE}_${DATE}"
mkdir -p "$OUTDIR"

echo "=== 8-Instance Connector Tax Microbench ==="
echo "Mode: $MODE"
echo "Output: $OUTDIR"
echo ""

# ── Cleanup ───────────────────────────────────────────────────────────────
#######################################
# Kill every vLLM / proxy process matched by command line, then poll until
# the summed GPU memory usage reported by nvidia-smi drops below 1000 (MiB,
# per nvidia-smi's `nounits` output).
# NOTE: the pkill patterns match by command line, so they also hit matching
# processes that were not started by this script.
# Globals:   none
# Arguments: none
# Outputs:   progress on stdout, warnings on stderr
# Returns:   0 always (best-effort; must not change the script's exit path
#            when run from the EXIT trap)
#######################################
cleanup() {
  local total_used

  echo "[cleanup] Killing all vLLM/proxy processes..."
  # pkill exits non-zero when nothing matched; that is not an error here.
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
  pkill -9 -f "cache_aware_proxy" 2>/dev/null || true
  sleep 5

  # Up to 20 polls, 3 s apart, re-killing lingering engine cores each time.
  for _ in $(seq 1 20); do
    # Guard the pipeline explicitly: with `set -e` + `pipefail`, a failing
    # nvidia-smi would otherwise abort the script silently from in here.
    if ! total_used=$(nvidia-smi --query-gpu=memory.used \
        --format=csv,noheader,nounits | awk '{s+=$1} END {print s+0}'); then
      echo "[cleanup] WARNING: nvidia-smi failed; cannot verify GPU memory release" >&2
      return 0
    fi
    if [[ "$total_used" -lt 1000 ]]; then
      return 0
    fi
    pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
    sleep 3
  done
  echo "[cleanup] WARNING: GPU memory not fully released" >&2
}
trap cleanup EXIT

cleanup # ensure clean start

# ── Launch 8 instances ────────────────────────────────────────────────────
# Start one background `vllm serve` per GPU. Environment and arguments are
# built as arrays so that paths containing spaces and the JSON
# --kv-transfer-config value are passed as single, unmodified words.
echo "[launch] Starting $N_INSTANCES vLLM instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  master=$((29500 + i))
  logfile="$OUTDIR/vllm_inst_${i}.log"
  step_log="$OUTDIR/engine_step_${i}.jsonl"

  # Per-instance environment: distinct step log, worker id, MASTER_PORT and
  # a single visible GPU.
  launch_env=(
    "AGENTIC_STEP_LOG_PATH=$step_log"
    "AGENTIC_WORKER_ID=engine_${i}"
    "MASTER_PORT=$master"
    "CUDA_VISIBLE_DEVICES=$i"
  )
  serve_args=(
    serve "$MODEL"
    --host 0.0.0.0 --port "$port"
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
    --no-enable-log-requests
  )
  if [[ "$MODE" == "mooncake" ]]; then
    # Mooncake mode: enable the connector with kv_role=kv_both and give each
    # instance its own bootstrap port.
    launch_env+=("VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))")
    serve_args+=(
      --kv-transfer-config
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    )
  fi

  env "${launch_env[@]}" "$VLLM" "${serve_args[@]}" > "$logfile" 2>&1 &

  echo " inst_$i: GPU=$i port=$port"
  sleep 2 # stagger the launches
done

# Wait for all instances to be ready
# Each instance is polled on /v1/models about once per second, for at most
# 240 attempts; the first instance that never answers aborts the script
# (the EXIT trap then tears everything down).
echo "[launch] Waiting for all instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  inst_ready=0
  for ((t = 1; t <= 240; t++)); do
    if curl -sf "http://127.0.0.1:$port/v1/models" >/dev/null 2>&1; then
      echo " inst_$i ready after ${t}s"
      inst_ready=1
      break
    fi
    sleep 1
  done
  if [[ "$inst_ready" -ne 1 ]]; then
    echo " ERROR: inst_$i did not start within 240s" >&2
    exit 1
  fi
done

# ── Launch proxy ──────────────────────────────────────────────────────────
# Start cache_aware_proxy in the background in front of all instances.
# The backend URLs are collected in an array so each one reaches the proxy
# as exactly one argument after --combined.
echo "[proxy] Starting cache_aware_proxy (policy=load_only)..."
combined_args=()
for ((i = 0; i < N_INSTANCES; i++)); do
  combined_args+=("http://127.0.0.1:$((BASE_PORT + i))")
done

"$PYTHON" "$PROJ/scripts/cache_aware_proxy.py" \
  --combined "${combined_args[@]}" \
  --port "$PROXY_PORT" \
  --policy load_only \
  > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!

# Wait for proxy
# Poll the proxy's /v1/models for up to 30 attempts. Unlike a silent
# fall-through, a proxy that died or never became ready aborts the run so
# the benchmark is never pointed at a dead endpoint.
proxy_ready=0
for ((t = 1; t <= 30; t++)); do
  if curl -sf "http://127.0.0.1:$PROXY_PORT/v1/models" >/dev/null 2>&1; then
    echo "[proxy] Ready on port $PROXY_PORT"
    proxy_ready=1
    break
  fi
  # Fail fast when the background proxy process is already gone.
  if ! kill -0 "$PROXY_PID" 2>/dev/null; then
    echo "[proxy] ERROR: proxy process exited during startup (see $OUTDIR/proxy.log)" >&2
    exit 1
  fi
  sleep 1
done
if [[ "$proxy_ready" -ne 1 ]]; then
  echo "[proxy] ERROR: proxy not ready within 30s (see $OUTDIR/proxy.log)" >&2
  exit 1
fi

# ── Run benchmark ─────────────────────────────────────────────────────────
# Drive bench_loop.py against the proxy endpoint; its output lands in
# $OUTDIR. Under `set -e`, a failing benchmark aborts the script here.
echo ""
echo "[bench] Running open-loop bench (512 input, 64 output, rates=32,64,128)..."
"$PYTHON" "$HERE/bench_loop.py" \
  --url "http://127.0.0.1:$PROXY_PORT/v1/chat/completions" \
  --model "$MODEL" \
  --phase A \
  --rates "32,64,128" \
  --shape "512,64" \
  --duration 60 \
  --min-completed 200 \
  --warmup 10 \
  --output-dir "$OUTDIR"

echo ""
echo "[done] Results in $OUTDIR"
echo ""

# Print summary
# Render $OUTDIR/summary_A.json as a table with one row per entry. The
# Python program comes from a quoted heredoc (no shell expansion, no
# backslash-escaped quotes) and receives the JSON path as its argument.
"$PYTHON" - "$OUTDIR/summary_A.json" <<'PY'
import json, sys

with open(sys.argv[1]) as fh:
    data = json.load(fh)
print('Rate | TTFT p50 TTFT p90 TTFT p99 | TPOT p50 TPOT p90 TPOT p99 | Thr ratio')
print('-' * 100)
for c in data:
    r = c['rate_target']
    # Metrics missing from an entry are shown as 0 rather than raising.
    print(f'{r:>5.0f} | {c.get("ttft_ms_p50",0):>7.0f}ms {c.get("ttft_ms_p90",0):>7.0f}ms {c.get("ttft_ms_p99",0):>7.0f}ms | {c.get("tpot_ms_p50",0):>7.1f}ms {c.get("tpot_ms_p90",0):>7.1f}ms {c.get("tpot_ms_p99",0):>7.1f}ms | {c.get("throughput_ratio",0):>9.2f}')
PY