8-instance connector tax: +2% (noise) below saturation, +17% only at saturation
8×TP1 + load_only proxy, shape 512×64, rates 32/64/128 req/s total:
Rate=32 (non-saturated, thr=0.95-0.97):
plain TTFT p90=64ms, mooncake_both=65ms → +2% (noise)
Rate=64 (non-saturated, thr=0.96):
plain TTFT p90=114ms, mooncake_both=107ms → -6% (noise)
Rate=128 (saturated, thr=0.70-0.71):
plain TTFT p90=702ms, mooncake_both=822ms → +17%
plain TTFT p50=339ms, mooncake_both=470ms → +39%
Conclusion: The elastic_migration_v2 +45% is a saturation artifact.
Under SLO-compliant load (TTFT<10s, thr_ratio>0.9), mooncake_both's
1.4ms/step build_connector_meta overhead is completely masked by the
scheduler-model async pipeline. The tax only manifests when the system
is already saturated and queueing amplifies per-step differences.
For practical deployment: enabling kv_role=kv_both has effectively zero
cost as long as the serving system stays within SLO capacity bounds.
This commit is contained in:
169
microbench/connector_tax/run_8instance.sh
Executable file
169
microbench/connector_tax/run_8instance.sh
Executable file
@@ -0,0 +1,169 @@
|
||||
#!/bin/bash
# 8-instance connector tax microbench: plain vs mooncake_both
#
# Launches 8×TP1 vLLM instances + cache_aware_proxy, same topology as
# elastic_migration_v2. Runs open-loop bench at rates 32,64,128 req/s
# with short shape (512 input, 64 output) to maximize decode concurrency.
#
# Usage:
#   bash run_8instance.sh --mode plain      # no Mooncake
#   bash run_8instance.sh --mode mooncake   # kv_role=kv_both
#
# Environment:
#   MODEL_PATH  optional override for the model directory (see MODEL below)
#
# Results go to results/8inst_<mode>_<date>/

set -euo pipefail

# Directory containing this script, and the project root two levels above it.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJ="$(cd "$HERE/../.." && pwd)"

# Tools are taken from the project's virtualenv rather than from PATH.
PYTHON="$PROJ/.venv/bin/python"
VLLM="$PROJ/.venv/bin/vllm"

# Model to serve; MODEL_PATH wins when set and non-empty.
MODEL="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# Prepend the project root so project modules are importable by child processes.
export PYTHONPATH="$PROJ:${PYTHONPATH:-}"

# Topology: instance i listens on BASE_PORT+i; the proxy listens on PROXY_PORT.
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090
|
||||
# Benchmark mode, filled in by the argument loop below. It starts empty so
# that running with no arguments falls through to the usage message.
MODE=""

# Parse --mode plain|mooncake
while [[ $# -gt 0 ]]; do
  case "$1" in
    --mode)
      # Check for the value explicitly: under `set -u` a bare `--mode`
      # would otherwise abort with an "unbound variable" error on $2
      # instead of telling the user how to call the script.
      if [[ $# -lt 2 ]]; then
        echo "Usage: $0 --mode plain|mooncake" >&2
        exit 1
      fi
      MODE="$2"
      shift 2
      ;;
    *)
      echo "Unknown arg: $1" >&2
      exit 1
      ;;
  esac
done

# Only the two known modes are accepted; anything else (including no
# --mode at all) prints usage and exits non-zero.
if [[ "$MODE" != "plain" && "$MODE" != "mooncake" ]]; then
  echo "Usage: $0 --mode plain|mooncake" >&2
  exit 1
fi
|
||||
|
||||
# Per-run output directory: results/8inst_<mode>_<YYYYmmdd_HHMM> next to
# this script. Two runs of the same mode within one minute share a directory.
DATE=$(date +%Y%m%d_%H%M)
OUTDIR="$HERE/results/8inst_${MODE}_${DATE}"
mkdir -p "$OUTDIR"

echo "=== 8-Instance Connector Tax Microbench ==="
echo "Mode: $MODE"
echo "Output: $OUTDIR"
echo ""
|
||||
|
||||
# ── Cleanup ───────────────────────────────────────────────────────────────
#######################################
# Kill vLLM / proxy processes and wait for GPU memory to be released.
#
# SIGKILLs every process whose command line matches "vllm serve",
# "VLLM::EngineCore" or "cache_aware_proxy" (host-wide, not only children
# of this script), then polls nvidia-smi up to 20 times until the summed
# memory.used over all GPUs drops below 1000 (MiB, per the `nounits` CSV
# output).
#
# Outputs:  progress on stdout, warnings on stderr.
# Returns:  always 0; failure to free memory is reported, not fatal.
#######################################
cleanup() {
  local total_used attempt

  echo "[cleanup] Killing all vLLM/proxy processes..."
  # pkill exits non-zero when nothing matched; that is not an error here.
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
  pkill -9 -f "cache_aware_proxy" 2>/dev/null || true
  sleep 5

  for ((attempt = 1; attempt <= 20; attempt++)); do
    # Testing the assignment inside `if` keeps a failing nvidia-smi from
    # tripping `set -e`/`pipefail` and aborting the script (or the EXIT
    # trap) halfway through cleanup.
    if ! total_used=$(nvidia-smi --query-gpu=memory.used \
      --format=csv,noheader,nounits 2>/dev/null |
      awk '{s+=$1} END {print s+0}'); then
      echo "[cleanup] WARNING: nvidia-smi query failed; cannot verify GPU memory release" >&2
      return 0
    fi

    if [[ "$total_used" -lt 1000 ]]; then
      return 0
    fi

    # Engine cores may outlive the first kill round; hit them again.
    pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
    sleep 3
  done

  echo "[cleanup] WARNING: GPU memory not fully released" >&2
}
|
||||
# Run cleanup on every exit path of the script (normal end or `exit 1`).
trap cleanup EXIT

# Also run it once up front so the benchmark does not start on top of
# leftover vLLM/proxy processes.
cleanup # ensure clean start
|
||||
|
||||
# ── Launch 8 instances ────────────────────────────────────────────────────
# Start N_INSTANCES background `vllm serve` processes. Instance i gets
# CUDA_VISIBLE_DEVICES=i, HTTP port BASE_PORT+i, MASTER_PORT 29500+i, and
# its own server log and step log under OUTDIR. In mooncake mode each
# instance additionally gets the MooncakeConnector kv-transfer config and
# its own VLLM_MOONCAKE_BOOTSTRAP_PORT (8998+i).
echo "[launch] Starting $N_INSTANCES vLLM instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  master=$((29500 + i))
  logfile="$OUTDIR/vllm_inst_${i}.log"
  step_log="$OUTDIR/engine_step_${i}.jsonl"

  # Arrays instead of flat strings: the JSON config and any path stay a
  # single argv word regardless of spaces or glob characters.
  instance_env=(
    "AGENTIC_STEP_LOG_PATH=$step_log"
    "AGENTIC_WORKER_ID=engine_${i}"
    "MASTER_PORT=$master"
    "CUDA_VISIBLE_DEVICES=$i"
  )
  kv_args=()
  if [[ "$MODE" == "mooncake" ]]; then
    kv_args=(
      --kv-transfer-config
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    )
    instance_env+=("VLLM_MOONCAKE_BOOTSTRAP_PORT=$((8998 + i))")
  fi

  # ${kv_args[@]+...} expands to nothing for an empty array without
  # tripping `set -u` on older bash releases.
  env "${instance_env[@]}" \
    "$VLLM" serve "$MODEL" \
    --host 0.0.0.0 --port "$port" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --no-enable-log-requests \
    ${kv_args[@]+"${kv_args[@]}"} \
    > "$logfile" 2>&1 &

  echo " inst_$i: GPU=$i port=$port"
  # Short pause between launches so the instances do not all start at once.
  sleep 2
done
|
||||
|
||||
# Wait for all instances to be ready
# Each instance is polled once per second on /v1/models, for at most 240
# attempts; the first instance that never answers aborts the whole run
# (the EXIT trap then tears everything down).
echo "[launch] Waiting for all instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  for ((t = 1; t <= 240; t++)); do
    if curl -sf "http://127.0.0.1:$port/v1/models" >/dev/null 2>&1; then
      # t counts polling attempts for this instance, so the reported
      # figure is approximate wall-clock seconds.
      echo " inst_$i ready after ${t}s"
      break
    fi
    if ((t == 240)); then
      echo " ERROR: inst_$i did not start within 240s" >&2
      exit 1
    fi
    sleep 1
  done
done
|
||||
|
||||
# ── Launch proxy ──────────────────────────────────────────────────────────
# Start cache_aware_proxy.py in the background on PROXY_PORT with the
# load_only policy, handing it the base URL of every instance via
# --combined. Its output goes to $OUTDIR/proxy.log.
echo "[proxy] Starting cache_aware_proxy (policy=load_only)..."
backend_urls=()
for ((i = 0; i < N_INSTANCES; i++)); do
  backend_urls+=("http://127.0.0.1:$((BASE_PORT + i))")
done

"$PYTHON" "$PROJ/scripts/cache_aware_proxy.py" \
  --combined "${backend_urls[@]}" \
  --port "$PROXY_PORT" \
  --policy load_only \
  > "$OUTDIR/proxy.log" 2>&1 &
# PID of the background proxy process.
PROXY_PID=$!
|
||||
|
||||
# Wait for proxy
# Poll /v1/models on the proxy once per second, up to 30 attempts. If it
# never answers, stop here instead of benchmarking a dead endpoint.
proxy_ready=0
for ((t = 1; t <= 30; t++)); do
  if curl -sf "http://127.0.0.1:$PROXY_PORT/v1/models" >/dev/null 2>&1; then
    echo "[proxy] Ready on port $PROXY_PORT"
    proxy_ready=1
    break
  fi
  sleep 1
done

if ((proxy_ready == 0)); then
  echo "[proxy] ERROR: proxy did not become ready within 30s (see $OUTDIR/proxy.log)" >&2
  exit 1
fi
|
||||
|
||||
# ── Run benchmark ─────────────────────────────────────────────────────────
# Drive the proxy with bench_loop.py: phase A, target rates 32/64/128,
# shape 512,64. The remaining flags (--duration, --min-completed, --warmup)
# are passed through as-is; their exact semantics are defined by
# bench_loop.py. Results are written into OUTDIR.
echo ""
echo "[bench] Running open-loop bench (512 input, 64 output, rates=32,64,128)..."
"$PYTHON" "$HERE/bench_loop.py" \
  --url "http://127.0.0.1:$PROXY_PORT/v1/chat/completions" \
  --model "$MODEL" \
  --phase A \
  --rates "32,64,128" \
  --shape "512,64" \
  --duration 60 \
  --min-completed 200 \
  --warmup 10 \
  --output-dir "$OUTDIR"

echo ""
echo "[done] Results in $OUTDIR"
echo ""
|
||||
|
||||
# Print summary
# Render $OUTDIR/summary_A.json as one table row per entry. The program is
# fed through a quoted heredoc (no shell expansion, no escaped quotes) and
# the JSON path is passed as argv[1]. Missing metric keys print as 0.
"$PYTHON" - "$OUTDIR/summary_A.json" <<'PY'
import json, sys

with open(sys.argv[1]) as fh:
    data = json.load(fh)

print('Rate | TTFT p50 TTFT p90 TTFT p99 | TPOT p50 TPOT p90 TPOT p99 | Thr ratio')
print('-' * 100)
for c in data:
    r = c['rate_target']
    print(f'{r:>5.0f} | {c.get("ttft_ms_p50",0):>7.0f}ms {c.get("ttft_ms_p90",0):>7.0f}ms {c.get("ttft_ms_p99",0):>7.0f}ms | {c.get("tpot_ms_p50",0):>7.1f}ms {c.get("tpot_ms_p90",0):>7.1f}ms {c.get("tpot_ms_p99",0):>7.1f}ms | {c.get("throughput_ratio",0):>9.2f}')
PY
|
||||
Reference in New Issue
Block a user