8-instance connector tax: +2% under non-saturated load, +17% only at saturation

8×TP1 + load_only proxy, shape 512×64 (512 input, 64 output tokens), rates 32/64/128 req/s total:

- Rate=32 (non-saturated, thr=0.95-0.97): plain TTFT p90=64ms, mooncake_both=65ms → +2% (noise)
- Rate=64 (non-saturated, thr=0.96): plain TTFT p90=114ms, mooncake_both=107ms → -6% (noise)
- Rate=128 (saturated, thr=0.70-0.71):
  - plain TTFT p90=702ms, mooncake_both=822ms → +17%
  - plain TTFT p50=339ms, mooncake_both=470ms → +39%

Conclusion: The elastic_migration_v2 +45% is a saturation artifact. Under SLO-compliant load (TTFT<10s, thr_ratio>0.9), mooncake_both's 1.4ms/step build_connector_meta overhead is completely masked by the scheduler-model async pipeline. The tax only manifests when the system is already saturated and queueing amplifies per-step differences.

For practical deployment: enabling kv_role=kv_both has effectively zero cost as long as the serving system stays within SLO capacity bounds.
2026-05-26 21:32:46 +08:00
parent c8ec73c548
commit e3480f7d28
1 changed files with 169 additions and 0 deletions
--- a/microbench/connector_tax/run_8instance.sh
+++ b/microbench/connector_tax/run_8instance.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+# 8-instance connector tax microbench: plain vs mooncake_both
+#
+# Launches 8×TP1 vLLM instances + cache_aware_proxy, same topology as
+# elastic_migration_v2. Runs open-loop bench at rates 32,64,128 req/s
+# with short shape (512 input, 64 output) to maximize decode concurrency.
+#
+# Usage:
+#   bash run_8instance.sh --mode plain     # no Mooncake
+#   bash run_8instance.sh --mode mooncake  # kv_role=kv_both
+#
+# Environment:
+#   MODEL_PATH  model directory to serve
+#               (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
+#
+# Results go to results/8inst_<mode>_<date>/
+
+set -euo pipefail
+
+# All paths are derived from this script's own location, not the caller's cwd.
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJ="$(cd "$HERE/../.." && pwd)"
+PYTHON="$PROJ/.venv/bin/python"
+VLLM="$PROJ/.venv/bin/vllm"
+MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
+export PYTHONPATH="$PROJ:${PYTHONPATH:-}"
+
+N_INSTANCES=8
+BASE_PORT=8000
+PROXY_PORT=9090
+MODE=""  # set by --mode below; left empty (-> usage error) if never given
+
+# Parse --mode plain|mooncake.
+# ${2:-} keeps a bare trailing "--mode" from aborting on an unbound $2 under
+# `set -u`; the empty value is then rejected by the usage check instead.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --mode)
+            MODE="${2:-}"
+            # Consume the flag and its value, or just the flag if it was last.
+            shift $(( $# > 1 ? 2 : 1 ))
+            ;;
+        *)
+            echo "Unknown arg: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+if [[ "$MODE" != "plain" && "$MODE" != "mooncake" ]]; then
+    echo "Usage: $0 --mode plain|mooncake" >&2
+    exit 1
+fi
+
+# One output directory per run, stamped to the minute.
+DATE=$(date +%Y%m%d_%H%M)
+OUTDIR="$HERE/results/8inst_${MODE}_${DATE}"
+mkdir -p "$OUTDIR"
+
+echo "=== 8-Instance Connector Tax Microbench ==="
+echo "Mode: $MODE"
+echo "Output: $OUTDIR"
+echo ""
+
+# ── Cleanup ───────────────────────────────────────────────────────────────
+# Force-kill every process whose command line matches the vLLM server, the
+# vLLM engine core, or the proxy, then poll nvidia-smi until the memory.used
+# values summed over all GPUs drop below 1000. Gives up with a warning after
+# 20 polls. Installed as the EXIT trap and also run once before launching.
+cleanup() {
+    local pattern used_total attempt
+    echo "[cleanup] Killing all vLLM/proxy processes..."
+    for pattern in "vllm serve" "VLLM::EngineCore" "cache_aware_proxy"; do
+        # pkill exits non-zero when nothing matched; that is not an error here.
+        pkill -9 -f "$pattern" 2>/dev/null || true
+    done
+    sleep 5
+    for ((attempt = 0; attempt < 20; attempt++)); do
+        used_total=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk '{s+=$1}END{print s}')
+        if [[ "$used_total" -lt 1000 ]]; then
+            return 0
+        fi
+        # Re-issue the engine-core kill on every poll (presumably to catch
+        # stragglers that are still holding GPU memory).
+        pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
+        sleep 3
+    done
+    echo "[cleanup] WARNING: GPU memory not fully released"
+}
+trap cleanup EXIT
+
+cleanup  # start from a clean slate before launching anything
+
+# ── Launch 8 instances ────────────────────────────────────────────────────
+echo "[launch] Starting $N_INSTANCES vLLM instances..."
+for i in $(seq 0 $((N_INSTANCES - 1))); do
+    port=$((BASE_PORT + i))
+    master=$((29500 + i))
+    logfile="$OUTDIR/vllm_inst_${i}.log"
+    step_log="$OUTDIR/engine_step_${i}.jsonl"
+
+    kv_args=""
+    mooncake_env=""
+    if [[ "$MODE" == "mooncake" ]]; then
+        kv_args="--kv-transfer-config {\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_both\"}"
+        mooncake_env="VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))"
+    fi
+
+    env $mooncake_env \
+        AGENTIC_STEP_LOG_PATH="$step_log" \
+        AGENTIC_WORKER_ID="engine_${i}" \
+        MASTER_PORT=$master \
+        CUDA_VISIBLE_DEVICES=$i \
+    $VLLM serve "$MODEL" \
+        --host 0.0.0.0 --port $port \
+        --tensor-parallel-size 1 \
+        --trust-remote-code --enable-prefix-caching \
+        --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
+        --no-enable-log-requests \
+        $kv_args \
+        > "$logfile" 2>&1 &
+
+    echo "  inst_$i: GPU=$i port=$port"
+    sleep 2
+done
+
+# Wait for all instances to be ready
+echo "[launch] Waiting for all instances..."
+for i in $(seq 0 $((N_INSTANCES - 1))); do
+    port=$((BASE_PORT + i))
+    for t in $(seq 1 240); do
+        if curl -sf "http://127.0.0.1:$port/v1/models" >/dev/null 2>&1; then
+            echo "  inst_$i ready after ${t}s"
+            break
+        fi
+        if [[ $t -eq 240 ]]; then
+            echo "  ERROR: inst_$i did not start within 240s"
+            exit 1
+        fi
+        sleep 1
+    done
+done
+
+# ── Launch proxy ──────────────────────────────────────────────────────────
+echo "[proxy] Starting cache_aware_proxy (policy=load_only)..."
+combined_args=""
+for i in $(seq 0 $((N_INSTANCES - 1))); do
+    combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
+done
+
+$PYTHON "$PROJ/scripts/cache_aware_proxy.py" \
+    --combined $combined_args \
+    --port $PROXY_PORT \
+    --policy load_only \
+    > "$OUTDIR/proxy.log" 2>&1 &
+PROXY_PID=$!
+
+# Wait for proxy
+for t in $(seq 1 30); do
+    if curl -sf "http://127.0.0.1:$PROXY_PORT/v1/models" >/dev/null 2>&1; then
+        echo "[proxy] Ready on port $PROXY_PORT"
+        break
+    fi
+    sleep 1
+done
+
+# ── Run benchmark ─────────────────────────────────────────────────────────
+echo ""
+echo "[bench] Running open-loop bench (512 input, 64 output, rates=32,64,128)..."
+$PYTHON "$HERE/bench_loop.py" \
+    --url "http://127.0.0.1:$PROXY_PORT/v1/chat/completions" \
+    --model "$MODEL" \
+    --phase A \
+    --rates "32,64,128" \
+    --shape "512,64" \
+    --duration 60 \
+    --min-completed 200 \
+    --warmup 10 \
+    --output-dir "$OUTDIR"
+
+echo ""
+echo "[done] Results in $OUTDIR"
+echo ""
+
+# Print summary
+cat "$OUTDIR/summary_A.json" | $PYTHON -c "
+import json, sys
+data = json.load(sys.stdin)
+print('Rate   | TTFT p50    TTFT p90    TTFT p99  | TPOT p50    TPOT p90    TPOT p99  | Thr ratio')
+print('-' * 100)
+for c in data:
+    r = c['rate_target']
+    print(f'{r:>5.0f}  | {c.get(\"ttft_ms_p50\",0):>7.0f}ms  {c.get(\"ttft_ms_p90\",0):>7.0f}ms  {c.get(\"ttft_ms_p99\",0):>7.0f}ms | {c.get(\"tpot_ms_p50\",0):>7.1f}ms  {c.get(\"tpot_ms_p90\",0):>7.1f}ms  {c.get(\"tpot_ms_p99\",0):>7.1f}ms | {c.get(\"throughput_ratio\",0):>9.2f}')
+"