Files
Gahow Wang 1262c9c22e Migration transfer-cost study: KV transfer is slow on busy GPUs
MIGRATION_TRANSFER_COST.md: under real load, migration KV transfer runs at
~3 GB/s vs ~10 GB/s idle. Decomposed (instruments + MB6 microbench) into
~55% RDMA-actual (HBM/PCIe contention with running kernels: 7.6->4.0 GB/s)
+ ~45% control-plane GIL starvation during long prefills. Reproduced on a
fresh upstream venv (byte-identical transfer path) -> upstream/hardware
inherent, not our patch. Layerwise is the wrong lever; the tax is structural
on a loaded agentic cluster. Includes mb6_transfer_under_load + run_mb6,
instrument_dst_migration/mooncake, and the dst/transfer decomposition analyzers.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-29 11:53:01 +08:00

110 lines
3.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# MB6 launcher: 2 vLLM instances (kv_both, Mooncake) + transfer-under-load
# sweep. Parameterized by VENV so it runs on either the patched main venv
# or the fresh upstream venv, to test whether the bandwidth degradation is
# our patch or inherent to upstream mooncake.
#
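# Usage:
# VENV=/home/admin/cpfs/wjh/agentic-kv/.venv bash run_mb6.sh # main
# VENV=/home/admin/cpfs/wjh/agentic-kv-fresh/.venv bash run_mb6.sh # fresh
#
# Optional environment overrides (defaults are assigned right after this
# header): PROJ_DIR, LABEL, MODEL, GPUS (space-separated device ids; the first
# two are used), SIZES, BG_LOADS, REPEATS, OUTDIR.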
# -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
# -e is not enabled, so a failing command does not abort the script by itself.
set -uo pipefail

#######################################
# Print the default run label for a venv: the name of the directory that
# contains it (e.g. /a/agentic-kv/.venv -> agentic-kv).
# Arguments: $1 - path to the virtualenv directory
# Outputs:   the label on stdout
# Returns:   non-zero if dirname/basename fail
#######################################
venv_label() {
  local parent
  parent=$(dirname -- "$1") || return
  basename -- "$parent"
}

# --- Tunables: each of these may be overridden from the environment. ---
PROJ_DIR="${PROJ_DIR:-/home/admin/cpfs/wjh/agentic-kv}"
VENV="${VENV:-$PROJ_DIR/.venv}"
# Quoted at every level so a venv path containing spaces still yields one label.
LABEL="${LABEL:-$(venv_label "$VENV")}"
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
GPUS="${GPUS:-0 1}"
SIZES="${SIZES:-16384,65536}"
BG_LOADS="${BG_LOADS:-0,8,24}"
REPEATS="${REPEATS:-4}"
DATE="$(date +%Y%m%d_%H%M)"
OUTDIR="${OUTDIR:-$PROJ_DIR/outputs/mb6_${LABEL}_${DATE}}"

# --- Derived paths inside the project and the selected venv. ---
PYTHON="$VENV/bin/python"
MC_INSTR="$PROJ_DIR/microbench/fresh_setup/instrument_mooncake.py"
DRIVER="$PROJ_DIR/microbench/fresh_setup/mb6_transfer_under_load.py"
MC_FILE="$VENV/lib/python3.12/site-packages/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py"

# HTTP ports and Mooncake bootstrap ports, indexed by instance number.
PORTS=(8000 8001); BPS=(8998 8999)

# Split GPUS on whitespace without glob expansion.
read -r -a gpu_arr <<< "$GPUS"
# Two instances are launched and waited on later; fail fast instead of
# stalling in the health check when fewer than two devices were given.
if (( ${#gpu_arr[@]} < 2 )); then
  echo "FATAL GPUS must list at least 2 space-separated devices (got: '$GPUS')" >&2
  exit 1
fi

mkdir -p "$OUTDIR/logs"
XFER_LOG_DIR="$OUTDIR/xfer_log"; mkdir -p "$XFER_LOG_DIR"
echo "=== MB6 transfer-under-load ($LABEL) ==="
echo "VENV : $VENV"
echo "Out : $OUTDIR"
echo ""
#######################################
# Best-effort teardown: force-kill matching server processes, pause, then ask
# the instrument script to revert its change. Every step tolerates failure.
# Globals:   PYTHON, MC_INSTR, MC_FILE (read)
# Arguments: none
# Returns:   0 always
# NOTE: pkill -f matches on the full command line, so any process whose
# command line contains one of these patterns is killed, not only ours.
#######################################
cleanup() {
  local pattern
  for pattern in "vllm serve" "EngineCore"; do
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
  # Pause before touching the instrumented file.
  sleep 4
  "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --revert 2>/dev/null || true
}
# Run cleanup on every exit path, and once up front so the run starts from a
# state with no leftover servers and a reverted connector file.
trap cleanup EXIT
cleanup
echo "[0] apply MB2 mooncake instrument to $LABEL venv"
# Without the instrument the transfer logs cannot be produced, so a failed
# --apply aborts the run (the EXIT trap still performs the revert).
if ! "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --apply; then
  echo "FATAL could not apply mooncake instrument to $MC_FILE" >&2
  exit 1
fi
# NOTE(review): assumes --check exits non-zero when the instrument is not in
# place — confirm against instrument_mooncake.py. Kept non-fatal until then.
if ! "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --check; then
  echo "WARN mooncake instrument check failed for $MC_FILE" >&2
fi
echo "[1] launch 2 instances"
# Start one vLLM server per GPU, using the first two entries of gpu_arr.
# Instance i gets HTTP port PORTS[i], bootstrap port BPS[i] and
# MASTER_PORT 29600+i; its output goes to $OUTDIR/logs/vllm_<i>_gpu<gpu>.log.
i=0
for gpu in "${gpu_arr[@]:0:2}"; do
  port=${PORTS[i]}
  bp=${BPS[i]}
  master=$((29600 + i))
  # Arguments are kept in an array so each one stays a single word.
  serve_args=(
    serve "$MODEL"
    --host 0.0.0.0 --port "$port"
    --tensor-parallel-size 1 --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    --enable-prompt-tokens-details
  )
  PYTHONHASHSEED=42 \
  VLLM_MOONCAKE_BOOTSTRAP_PORT="$bp" \
  MB2_LOG_DIR="$XFER_LOG_DIR" \
  CUDA_VISIBLE_DEVICES="$gpu" \
  MASTER_PORT="$master" \
    nohup "$VENV/bin/vllm" "${serve_args[@]}" \
    > "$OUTDIR/logs/vllm_${i}_gpu${gpu}.log" 2>&1 &
  # Detach the background job from this shell's job table.
  disown
  # Short stagger between the two launches.
  sleep 2
  i=$((i + 1))
done
echo "[2] wait for health"

#######################################
# Poll a URL every 2 seconds until curl succeeds.
# Arguments: $1 - URL to fetch
#            $2 - number of failed attempts tolerated; the attempt after
#                 that one gives up
# Returns:   0 once curl succeeds, 1 after more than $2 failed attempts
#######################################
wait_for_url() {
  local url=$1 max_retries=$2
  local tries=0
  # -s: silent; -f: treat HTTP error statuses as failure.
  until curl -sf "$url" >/dev/null 2>&1; do
    tries=$((tries + 1))
    if (( tries > max_retries )); then
      return 1
    fi
    sleep 2
  done
  return 0
}

# Both servers must answer /health; a server that never does is fatal.
for i in 0 1; do
  if ! wait_for_url "http://127.0.0.1:${PORTS[i]}/health" 180; then
    echo "FATAL inst_$i not healthy" >&2
    exit 1
  fi
  echo " inst_$i ready"
done
# bootstrap /query reachable? Only a warning: the run continues either way.
for i in 0 1; do
  bp=${BPS[i]}
  if ! wait_for_url "http://127.0.0.1:$bp/query" 60; then
    echo "WARN bootstrap $bp not ready" >&2
  fi
done
echo "[3] run MB6 driver"
# Driver output (stdout and stderr) is shown live and saved to mb6_run.txt.
"$PYTHON" "$DRIVER" \
  --src-port "${PORTS[0]}" --dst-port "${PORTS[1]}" \
  --src-bp "${BPS[0]}" --dst-bp "${BPS[1]}" \
  --sizes "$SIZES" --bg-loads "$BG_LOADS" --repeats "$REPEATS" \
  --label "$LABEL" --out "$OUTDIR/mb6_result.json" \
  2>&1 | tee "$OUTDIR/mb6_run.txt"
# PIPESTATUS[0] is the driver's own exit status (not tee's). It must be read
# immediately, before another command overwrites PIPESTATUS.
driver_rc=${PIPESTATUS[0]}

echo "[4] teardown + revert"
# Teardown runs regardless of the driver's result so servers never linger.
pkill -9 -f "vllm serve" 2>/dev/null || true
pkill -9 -f "EngineCore" 2>/dev/null || true
sleep 4
# Unlike the best-effort cleanup, errors from this revert are left visible.
"$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --revert
echo ""
# Propagate a driver failure instead of reporting success and exiting 0.
if (( driver_rc != 0 )); then
  echo "FATAL MB6 driver exited with status $driver_rc; partial artifacts in $OUTDIR/" >&2
  exit "$driver_rc"
fi
echo "Done. Artifacts in $OUTDIR/"
echo " mb6_result.json mb6_run.txt xfer_log/ logs/"