8-instance connector tax: +2% at non-saturated load, +17% only at saturation

8×TP1 + load_only proxy, shape 512×64, rates 32/64/128 req/s total:

  Rate=32 (non-saturated, thr=0.95-0.97):
    plain TTFT p90=64ms,  mooncake_both=65ms  → +2% (noise)
  Rate=64 (non-saturated, thr=0.96):
    plain TTFT p90=114ms, mooncake_both=107ms → -6% (noise)
  Rate=128 (saturated, thr=0.70-0.71):
    plain TTFT p90=702ms, mooncake_both=822ms → +17%
    plain TTFT p50=339ms, mooncake_both=470ms → +39%

Conclusion: The elastic_migration_v2 +45% is a saturation artifact.
Under SLO-compliant load (TTFT<10s, thr_ratio>0.9), mooncake_both's
1.4ms/step build_connector_meta overhead is completely masked by the
scheduler-model async pipeline. The tax only manifests when the system
is already saturated and queueing amplifies per-step differences.

For practical deployment: enabling kv_role=kv_both has effectively zero
cost as long as the serving system stays within SLO capacity bounds.
This commit is contained in:
2026-05-26 21:32:46 +08:00
parent c8ec73c548
commit e3480f7d28

View File

@@ -0,0 +1,169 @@
#!/bin/bash
# 8-instance connector tax microbench: plain vs mooncake_both
#
# Launches 8×TP1 vLLM instances + cache_aware_proxy, same topology as
# elastic_migration_v2. Runs open-loop bench at rates 32,64,128 req/s
# with short shape (512 input, 64 output) to maximize decode concurrency.
#
# Usage:
# bash run_8instance.sh --mode plain # no Mooncake
# bash run_8instance.sh --mode mooncake # kv_role=kv_both
#
# Results go to results/8inst_<mode>_<date>/
set -euo pipefail

# Paths are resolved relative to this script's own location, so the script
# can be invoked from any working directory.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJ="$(cd "$HERE/../.." && pwd)"
PYTHON="$PROJ/.venv/bin/python"
VLLM="$PROJ/.venv/bin/vllm"
# MODEL_PATH overrides the default model location.
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
export PYTHONPATH="$PROJ:${PYTHONPATH:-}"

N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090

# Parse --mode plain|mooncake. MODE starts empty so that a missing --mode
# falls through to the usage message below.
MODE=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --mode)
      # Guard the value explicitly: with `set -u`, touching an absent $2
      # would abort with "unbound variable" instead of printing usage.
      if [[ $# -lt 2 ]]; then
        echo "Usage: $0 --mode plain|mooncake" >&2
        exit 1
      fi
      MODE="$2"
      shift 2
      ;;
    *)
      echo "Unknown arg: $1" >&2
      exit 1
      ;;
  esac
done

if [[ "$MODE" != "plain" && "$MODE" != "mooncake" ]]; then
  echo "Usage: $0 --mode plain|mooncake" >&2
  exit 1
fi

# Each run gets its own timestamped results directory.
DATE=$(date +%Y%m%d_%H%M)
OUTDIR="$HERE/results/8inst_${MODE}_${DATE}"
mkdir -p "$OUTDIR"

echo "=== 8-Instance Connector Tax Microbench ==="
echo "Mode: $MODE"
echo "Output: $OUTDIR"
echo ""
# ── Cleanup ───────────────────────────────────────────────────────────────
#######################################
# Best-effort teardown: SIGKILL every vLLM server, engine-core and proxy
# process matched by command line, then poll nvidia-smi until the summed
# memory.used across GPUs drops below 1000 (the unit is whatever nvidia-smi
# reports with `nounits`; presumably MiB).
# Globals:   none
# Arguments: none
# Outputs:   progress on stdout, warnings on stderr
# Returns:   0 in every handled case, so that running from the EXIT trap
#            never replaces the script's real exit status.
#######################################
cleanup() {
  local raw total_used attempt

  echo "[cleanup] Killing all vLLM/proxy processes..."
  # pkill exits non-zero when nothing matches; that is expected here.
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
  pkill -9 -f "cache_aware_proxy" 2>/dev/null || true
  sleep 5

  for ((attempt = 1; attempt <= 20; attempt++)); do
    # Capture nvidia-smi separately from awk so that its failure is detected
    # whether or not `pipefail` is active, and handled instead of tripping
    # `set -e` in the middle of cleanup.
    if ! raw=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null); then
      echo "[cleanup] WARNING: nvidia-smi failed; cannot verify GPU memory release" >&2
      return 0
    fi
    # `s + 0` makes awk print 0 rather than an empty string on empty input.
    total_used=$(awk '{ s += $1 } END { print s + 0 }' <<<"$raw")
    if (( total_used < 1000 )); then
      return 0
    fi
    # Engine-core workers can outlive the first kill; retry before re-polling.
    pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
    sleep 3
  done

  echo "[cleanup] WARNING: GPU memory not fully released" >&2
}
# Tear everything down on any exit path, and also once up front so the run
# starts from a clean state.
trap cleanup EXIT
cleanup # ensure clean start

# ── Launch 8 instances ────────────────────────────────────────────────────
# One vLLM server per GPU index. Environment variables and CLI flags are
# collected in arrays so that every value reaches the child as exactly one
# argument: no word-splitting of the JSON config, and paths containing
# whitespace keep working.
echo "[launch] Starting $N_INSTANCES vLLM instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  master=$((29500 + i)) # distinct MASTER_PORT per instance
  logfile="$OUTDIR/vllm_inst_${i}.log"
  step_log="$OUTDIR/engine_step_${i}.jsonl"

  instance_env=(
    "AGENTIC_STEP_LOG_PATH=$step_log"
    "AGENTIC_WORKER_ID=engine_${i}"
    "MASTER_PORT=$master"
    "CUDA_VISIBLE_DEVICES=$i"
  )
  serve_args=(
    --host 0.0.0.0 --port "$port"
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
    --no-enable-log-requests
  )

  # Mooncake mode adds the KV connector config and a per-instance bootstrap
  # port; plain mode launches without either.
  if [[ "$MODE" == "mooncake" ]]; then
    instance_env+=("VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))")
    serve_args+=(
      --kv-transfer-config
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    )
  fi

  env "${instance_env[@]}" \
    "$VLLM" serve "$MODEL" "${serve_args[@]}" \
    > "$logfile" 2>&1 &
  echo " inst_$i: GPU=$i port=$port"
  # Stagger launches slightly rather than starting all instances at once.
  sleep 2
done
# Block until every instance answers on /v1/models. Instances are polled in
# order, once per second, with at most 240 attempts each; the first one that
# exhausts its attempts aborts the whole run.
echo "[launch] Waiting for all instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  t=1
  until curl -sf "http://127.0.0.1:$port/v1/models" >/dev/null 2>&1; do
    if [[ $t -eq 240 ]]; then
      echo " ERROR: inst_$i did not start within 240s"
      exit 1
    fi
    sleep 1
    t=$((t + 1))
  done
  echo " inst_$i ready after ${t}s"
done
# ── Launch proxy ──────────────────────────────────────────────────────────
# Start cache_aware_proxy in front of all instances with the load_only
# policy, then wait until it answers on /v1/models.
echo "[proxy] Starting cache_aware_proxy (policy=load_only)..."
backend_urls=()
for ((i = 0; i < N_INSTANCES; i++)); do
  backend_urls+=("http://127.0.0.1:$((BASE_PORT + i))")
done

"$PYTHON" "$PROJ/scripts/cache_aware_proxy.py" \
  --combined "${backend_urls[@]}" \
  --port "$PROXY_PORT" \
  --policy load_only \
  > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!

# Wait for proxy. Unlike a plain bounded poll, this fails the run when the
# proxy dies or never becomes ready, so the benchmark is never pointed at a
# dead endpoint.
proxy_ready=0
for ((t = 1; t <= 30; t++)); do
  if curl -sf "http://127.0.0.1:$PROXY_PORT/v1/models" >/dev/null 2>&1; then
    echo "[proxy] Ready on port $PROXY_PORT"
    proxy_ready=1
    break
  fi
  # Signal 0 only checks that the process still exists.
  if ! kill -0 "$PROXY_PID" 2>/dev/null; then
    echo "[proxy] ERROR: proxy exited during startup; see $OUTDIR/proxy.log" >&2
    exit 1
  fi
  sleep 1
done
if (( ! proxy_ready )); then
  echo "[proxy] ERROR: proxy not ready within 30s; see $OUTDIR/proxy.log" >&2
  exit 1
fi
# ── Run benchmark ─────────────────────────────────────────────────────────
# Drive bench_loop.py through the proxy; it writes its results under $OUTDIR.
echo ""
echo "[bench] Running open-loop bench (512 input, 64 output, rates=32,64,128)..."
"$PYTHON" "$HERE/bench_loop.py" \
  --url "http://127.0.0.1:$PROXY_PORT/v1/chat/completions" \
  --model "$MODEL" \
  --phase A \
  --rates "32,64,128" \
  --shape "512,64" \
  --duration 60 \
  --min-completed 200 \
  --warmup 10 \
  --output-dir "$OUTDIR"

echo ""
echo "[done] Results in $OUTDIR"
echo ""

# Print summary: one table row per entry in summary_A.json. The Python source
# is fed through a quoted heredoc (no shell expansion, no quote escaping) and
# the JSON path is passed as an argument, since stdin carries the program.
"$PYTHON" - "$OUTDIR/summary_A.json" <<'PY'
import json
import sys

with open(sys.argv[1]) as fh:
    data = json.load(fh)

print('Rate | TTFT p50 TTFT p90 TTFT p99 | TPOT p50 TPOT p90 TPOT p99 | Thr ratio')
print('-' * 100)
for c in data:
    r = c['rate_target']
    # Metrics missing from an entry are shown as 0.
    print(f'{r:>5.0f} | {c.get("ttft_ms_p50",0):>7.0f}ms {c.get("ttft_ms_p90",0):>7.0f}ms {c.get("ttft_ms_p99",0):>7.0f}ms | {c.get("tpot_ms_p50",0):>7.1f}ms {c.get("tpot_ms_p90",0):>7.1f}ms {c.get("tpot_ms_p99",0):>7.1f}ms | {c.get("throughput_ratio",0):>9.2f}')
PY