Files
agentic-kvc/microbench/connector_tax/run_8instance.sh
Gahow Wang e3480f7d28 8-instance connector tax: +2% at non-saturated, +17% only at saturation
8×TP1 + load_only proxy, shape 512×64, rates 32/64/128 req/s total:

  Rate=32 (non-saturated, thr=0.95-0.97):
    plain TTFT p90=64ms,  mooncake_both=65ms  → +2% (noise)
  Rate=64 (non-saturated, thr=0.96):
    plain TTFT p90=114ms, mooncake_both=107ms → -6% (noise)
  Rate=128 (saturated, thr=0.70-0.71):
    plain TTFT p90=702ms, mooncake_both=822ms → +17%
    plain TTFT p50=339ms, mooncake_both=470ms → +39%

Conclusion: The elastic_migration_v2 +45% is a saturation artifact.
Under SLO-compliant load (TTFT<10s, thr_ratio>0.9), mooncake_both's
1.4ms/step build_connector_meta overhead is completely masked by the
scheduler-model async pipeline. The tax only manifests when the system
is already saturated and queueing amplifies per-step differences.

For practical deployment: enabling kv_role=kv_both has effectively zero
cost as long as the serving system stays within SLO capacity bounds.
2026-05-26 21:32:46 +08:00

170 lines
5.7 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# 8-instance connector tax microbench: plain vs mooncake_both
#
# Launches 8×TP1 vLLM instances + cache_aware_proxy, same topology as
# elastic_migration_v2. Runs open-loop bench at rates 32,64,128 req/s
# with short shape (512 input, 64 output) to maximize decode concurrency.
#
# Usage:
#   bash run_8instance.sh --mode plain      # no Mooncake
#   bash run_8instance.sh --mode mooncake   # kv_role=kv_both
#
# Results go to results/8inst_<mode>_<YYYYmmdd_HHMM>/ next to this script.
# Strict mode: stop on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Topology constants: number of vLLM instances, the port of instance 0
# (instance i listens on BASE_PORT + i), and the proxy's listening port.
N_INSTANCES=8
BASE_PORT=8000
PROXY_PORT=9090

# Absolute directory of this script, and the project root two levels above it.
HERE=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJ=$(cd "$HERE/../.." && pwd)

# Interpreter and vLLM entry point from the project's virtualenv.
PYTHON="$PROJ/.venv/bin/python"
VLLM="$PROJ/.venv/bin/vllm"

# Model location: MODEL_PATH wins when it is set and non-empty.
if [[ -n "${MODEL_PATH:-}" ]]; then
  MODEL="$MODEL_PATH"
else
  MODEL="$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct"
fi

# Prepend the project root to the Python import path for child processes.
PYTHONPATH="$PROJ:${PYTHONPATH:-}"
export PYTHONPATH
# Parse the command line. The only accepted option is --mode plain|mooncake.
# MODE starts empty so that running with no arguments falls through to the
# usage message in the validation step below.
MODE=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --mode)
      # Check for the value explicitly: with `set -u` active, reading a
      # missing $2 aborts with "unbound variable" instead of usage text.
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --mode"
        echo "Usage: $0 --mode plain|mooncake"
        exit 1
      fi
      MODE="$2"
      shift 2
      ;;
    *)
      echo "Unknown arg: $1"
      exit 1
      ;;
  esac
done

# Reject anything other than the two supported modes (including no mode).
if [[ "$MODE" != "plain" && "$MODE" != "mooncake" ]]; then
  echo "Usage: $0 --mode plain|mooncake"
  exit 1
fi

# Each run gets its own output directory, keyed by mode and start time
# (minute resolution).
DATE=$(date +%Y%m%d_%H%M)
OUTDIR="$HERE/results/8inst_${MODE}_${DATE}"
mkdir -p "$OUTDIR"

echo "=== 8-Instance Connector Tax Microbench ==="
echo "Mode: $MODE"
echo "Output: $OUTDIR"
echo ""
# ── Cleanup ───────────────────────────────────────────────────────────────
#######################################
# Kill vLLM servers, engine-core workers and the proxy, then wait for the
# GPUs to release their memory.
#
# Processes are matched by command-line pattern (pkill -f), so matching
# processes that were not started by this script are killed as well.
#
# Globals:   none
# Arguments: none
# Outputs:   progress and warning messages on stdout
# Returns:   0; an unverifiable or incomplete memory release only warns
#######################################
cleanup() {
  local total_used attempt

  echo "[cleanup] Killing all vLLM/proxy processes..."
  # pkill exits non-zero when nothing matches; that is expected here.
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
  pkill -9 -f "cache_aware_proxy" 2>/dev/null || true
  sleep 5

  # Poll up to 20 times, 3 s apart, until the memory.used values summed over
  # all GPUs drop below 1000 (nvidia-smi reports MiB when units are omitted).
  for ((attempt = 1; attempt <= 20; attempt++)); do
    # Run the query inside a condition: with `set -e` and `pipefail` active,
    # a failing nvidia-smi would otherwise abort the script from within
    # cleanup, including when cleanup runs as the EXIT trap.
    if ! total_used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits \
      | awk '{s+=$1}END{print s+0}'); then
      echo "[cleanup] WARNING: nvidia-smi failed; cannot verify GPU memory release"
      return 0
    fi
    [[ "$total_used" -lt 1000 ]] && return 0
    # Engine-core workers may still be around; kill them again and re-check.
    pkill -9 -f "VLLM::EngineCore" 2>/dev/null || true
    sleep 3
  done
  echo "[cleanup] WARNING: GPU memory not fully released"
}
# Always tear everything down on exit, whether the run succeeds or fails, and
# start from a clean machine.
trap cleanup EXIT
cleanup # ensure clean start

# ── Launch 8 instances ────────────────────────────────────────────────────
# Start one vLLM server per GPU in the background. Instance i uses GPU i,
# HTTP port BASE_PORT + i and MASTER_PORT 29500 + i, and writes its server
# log and engine step log into OUTDIR.
echo "[launch] Starting $N_INSTANCES vLLM instances..."
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  master=$((29500 + i))
  logfile="$OUTDIR/vllm_inst_${i}.log"
  step_log="$OUTDIR/engine_step_${i}.jsonl"

  # Build the command as an array so every argument (paths, JSON config)
  # reaches the child as exactly one word, whatever characters it contains.
  launch_cmd=(
    env
    AGENTIC_STEP_LOG_PATH="$step_log"
    AGENTIC_WORKER_ID="engine_${i}"
    MASTER_PORT="$master"
    CUDA_VISIBLE_DEVICES="$i"
  )
  if [[ "$MODE" == "mooncake" ]]; then
    # Each instance gets its own Mooncake bootstrap port.
    launch_cmd+=(VLLM_MOONCAKE_BOOTSTRAP_PORT="$((8998 + i))")
  fi
  launch_cmd+=(
    "$VLLM" serve "$MODEL"
    --host 0.0.0.0 --port "$port"
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
    --no-enable-log-requests
  )
  if [[ "$MODE" == "mooncake" ]]; then
    # The only difference between the two modes on the vLLM command line.
    launch_cmd+=(
      --kv-transfer-config
      '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    )
  fi

  "${launch_cmd[@]}" > "$logfile" 2>&1 &
  echo " inst_$i: GPU=$i port=$port"
  # Brief pause before starting the next instance.
  sleep 2
done
# Block until every instance answers on /v1/models. Each instance is polled
# once per second; the script exits with status 1 if the 240th probe of any
# instance still fails.
echo "[launch] Waiting for all instances..."
for ((idx = 0; idx < N_INSTANCES; idx++)); do
  port=$((BASE_PORT + idx))
  attempt=1
  until curl -sf "http://127.0.0.1:$port/v1/models" >/dev/null 2>&1; do
    if ((attempt == 240)); then
      echo " ERROR: inst_$idx did not start within 240s"
      exit 1
    fi
    sleep 1
    attempt=$((attempt + 1))
  done
  # The reported number is the count of probes made for this instance.
  echo " inst_$idx ready after ${attempt}s"
done
# ── Launch proxy ──────────────────────────────────────────────────────────
# Start cache_aware_proxy in front of all instances with the load_only
# policy, then wait until it answers on /v1/models.
echo "[proxy] Starting cache_aware_proxy (policy=load_only)..."

# Build the command as an array: one backend URL per instance after
# --combined, each passed as its own argument.
proxy_cmd=("$PYTHON" "$PROJ/scripts/cache_aware_proxy.py" --combined)
for ((i = 0; i < N_INSTANCES; i++)); do
  proxy_cmd+=("http://127.0.0.1:$((BASE_PORT + i))")
done
proxy_cmd+=(--port "$PROXY_PORT" --policy load_only)

"${proxy_cmd[@]}" > "$OUTDIR/proxy.log" 2>&1 &
PROXY_PID=$!

# Wait for proxy: probe once per second, up to 30 times. Unlike a silent
# fall-through, a proxy that died or never became ready stops the run here,
# so the benchmark never measures a dead endpoint.
proxy_ready=0
for ((t = 1; t <= 30; t++)); do
  if curl -sf "http://127.0.0.1:$PROXY_PORT/v1/models" >/dev/null 2>&1; then
    echo "[proxy] Ready on port $PROXY_PORT"
    proxy_ready=1
    break
  fi
  # kill -0 only tests whether the process still exists.
  if ! kill -0 "$PROXY_PID" 2>/dev/null; then
    echo "[proxy] ERROR: proxy exited during startup (see $OUTDIR/proxy.log)"
    exit 1
  fi
  sleep 1
done
if ((proxy_ready == 0)); then
  echo "[proxy] ERROR: proxy not ready within 30s (see $OUTDIR/proxy.log)"
  exit 1
fi
# ── Run benchmark ─────────────────────────────────────────────────────────
# Drive the proxy with bench_loop.py: rates 32, 64 and 128 with shape
# 512 input / 64 output, writing results into OUTDIR. With `set -e` active,
# a non-zero exit from the benchmark ends the script (the EXIT trap still
# runs cleanup).
echo ""
echo "[bench] Running open-loop bench (512 input, 64 output, rates=32,64,128)..."
# "$PYTHON" is quoted so a project path containing whitespace still works.
"$PYTHON" "$HERE/bench_loop.py" \
  --url "http://127.0.0.1:$PROXY_PORT/v1/chat/completions" \
  --model "$MODEL" \
  --phase A \
  --rates "32,64,128" \
  --shape "512,64" \
  --duration 60 \
  --min-completed 200 \
  --warmup 10 \
  --output-dir "$OUTDIR"
echo ""
echo "[done] Results in $OUTDIR"
echo ""
#######################################
# Print a per-rate latency/throughput table from a benchmark summary file.
# The file is fed to Python on stdin and must hold a JSON list of objects,
# each with a numeric "rate_target"; missing percentile or ratio keys are
# printed as 0.
# Globals:   PYTHON (read) - interpreter used to render the table
# Arguments: $1 - path to the summary JSON file
# Outputs:   the table on stdout
# Returns:   non-zero if the file cannot be read or Python fails
#######################################
print_summary() {
  local summary_file=$1
  # The program is single-quoted, so the shell expands nothing inside it and
  # no quote escaping is needed; it uses double-quoted Python strings only.
  "$PYTHON" -c '
import json, sys

rows = json.load(sys.stdin)
print("Rate | TTFT p50 TTFT p90 TTFT p99 | TPOT p50 TPOT p90 TPOT p99 | Thr ratio")
print("-" * 100)
for row in rows:
    rate = row["rate_target"]
    ttft_p50 = row.get("ttft_ms_p50", 0)
    ttft_p90 = row.get("ttft_ms_p90", 0)
    ttft_p99 = row.get("ttft_ms_p99", 0)
    tpot_p50 = row.get("tpot_ms_p50", 0)
    tpot_p90 = row.get("tpot_ms_p90", 0)
    tpot_p99 = row.get("tpot_ms_p99", 0)
    ratio = row.get("throughput_ratio", 0)
    print(
        f"{rate:>5.0f} | "
        f"{ttft_p50:>7.0f}ms {ttft_p90:>7.0f}ms {ttft_p99:>7.0f}ms | "
        f"{tpot_p50:>7.1f}ms {tpot_p90:>7.1f}ms {tpot_p99:>7.1f}ms | "
        f"{ratio:>9.2f}"
    )
' < "$summary_file"
}

# Print summary
print_summary "$OUTDIR/summary_A.json"