MIGRATION_TRANSFER_COST.md: under real load, migration KV transfer runs at ~3 GB/s vs ~10 GB/s idle. Decomposed (instruments + MB6 microbench) into ~55% RDMA-actual (HBM/PCIe contention with running kernels: 7.6->4.0 GB/s) + ~45% control-plane GIL starvation during long prefills. Reproduced on a fresh upstream venv (byte-identical transfer path) -> upstream/hardware inherent, not our patch. Layerwise is the wrong lever; the tax is structural on a loaded agentic cluster. Includes mb6_transfer_under_load + run_mb6, instrument_dst_migration/mooncake, and the dst/transfer decomposition analyzers. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
110 lines
3.6 KiB
Bash
Executable File
110 lines
3.6 KiB
Bash
Executable File
#!/usr/bin/env bash
# MB6 launcher: 2 vLLM instances (kv_both, Mooncake) + transfer-under-load
# sweep. Parameterized by VENV so it runs on either the patched main venv
# or the fresh upstream venv, to test whether the bandwidth degradation is
# our patch or inherent to upstream mooncake.
#
# Usage:
#   VENV=/home/admin/cpfs/wjh/agentic-kv/.venv bash run_mb6.sh        # main
#   VENV=/home/admin/cpfs/wjh/agentic-kv-fresh/.venv bash run_mb6.sh  # fresh
#
# Optional environment overrides (defaults are set in the configuration
# section below): PROJ_DIR, VENV, LABEL, MODEL, GPUS, SIZES, BG_LOADS,
# REPEATS, OUTDIR.

# -u: referencing an unset variable is an error.
# -o pipefail: a pipeline fails if any stage fails.
# Note that -e is NOT enabled: a failing command does not abort the script
# by itself, so steps whose failure matters must be checked explicitly.
set -uo pipefail

# ---- Configuration -------------------------------------------------------
# Values written as "${NAME:-default}" can be overridden from the environment.

# Project checkout; the instrument and driver scripts are resolved under it.
PROJ_DIR="${PROJ_DIR:-/home/admin/cpfs/wjh/agentic-kv}"
# Virtualenv under test (provides bin/python and bin/vllm).
VENV="${VENV:-$PROJ_DIR/.venv}"
# Run label; defaults to the name of the directory that contains the venv.
# Every expansion is quoted so a VENV path with whitespace or glob characters
# still produces the right label.
if [[ -z "${LABEL:-}" ]]; then
  LABEL="$(basename -- "$(dirname -- "$VENV")")"
fi
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Whitespace-separated GPU ids; the launcher uses the first two.
GPUS="${GPUS:-0 1}"
# Sweep parameters, forwarded verbatim to the driver's --sizes, --bg-loads
# and --repeats options.
SIZES="${SIZES:-16384,65536}"
BG_LOADS="${BG_LOADS:-0,8,24}"
REPEATS="${REPEATS:-4}"
# Timestamp (minute resolution) used to make the default OUTDIR unique.
DATE="$(date +%Y%m%d_%H%M)"
OUTDIR="${OUTDIR:-$PROJ_DIR/outputs/mb6_${LABEL}_${DATE}}"

# Derived paths (not overridable).
PYTHON="$VENV/bin/python"
MC_INSTR="$PROJ_DIR/microbench/fresh_setup/instrument_mooncake.py"
DRIVER="$PROJ_DIR/microbench/fresh_setup/mb6_transfer_under_load.py"
# NOTE(review): the python3.12 path component is hard-coded; confirm that
# every venv this script is pointed at was built with Python 3.12.
MC_FILE="$VENV/lib/python3.12/site-packages/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py"

# Create the output tree: per-instance server logs go to logs/, and
# XFER_LOG_DIR is handed to the vLLM instances as MB2_LOG_DIR.
mkdir -p "$OUTDIR/logs"
XFER_LOG_DIR="$OUTDIR/xfer_log"
mkdir -p "$XFER_LOG_DIR"

echo "=== MB6 transfer-under-load ($LABEL) ==="
echo "VENV : $VENV"
echo "Out : $OUTDIR"
echo ""

# Index i of each array belongs to instance i: HTTP port and Mooncake
# bootstrap port.
PORTS=(8000 8001)
BPS=(8998 8999)
# Split GPUS on whitespace into an array. read -a is used instead of an
# unquoted $GPUS expansion so the value is never subject to pathname
# expansion.
read -r -a gpu_arr <<<"$GPUS"

#######################################
# Best-effort teardown; every step tolerates failure.
# SIGKILLs all processes whose command line matches "vllm serve" or
# "EngineCore" (pkill -f matches on the full command line, so this is not
# limited to processes started by this script), waits 4 seconds, then asks
# the instrument script to revert its change to MC_FILE.
# Globals:   PYTHON, MC_INSTR, MC_FILE (read)
# Arguments: none
# Returns:   0 always (errors are suppressed)
#######################################
cleanup() {
  local pattern
  for pattern in "vllm serve" "EngineCore"; do
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
  # Presumably gives the killed servers time to go away before the file is
  # touched -- TODO confirm the reason for the 4 s pause.
  sleep 4
  # NOTE(review): --venv is given the connector file path (MC_FILE), not the
  # venv root; confirm this matches instrument_mooncake.py's interface.
  "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --revert 2>/dev/null || true
}

# Run cleanup on every exit path, and once up front to clear leftovers
# (stale servers, a still-applied instrument) from a previous run.
trap cleanup EXIT
cleanup

echo "[0] apply MB2 mooncake instrument to $LABEL venv"
# The script runs without `set -e`, so the instrument steps are checked
# explicitly. A failed --apply aborts the run; the EXIT trap then reverts.
if ! "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --apply; then
  echo "FATAL: could not apply mooncake instrument to $MC_FILE" >&2
  exit 1
fi
# NOTE(review): --check is assumed to exit non-zero when the instrument is
# not in place -- confirm against instrument_mooncake.py. It is reported as
# a warning only, because the driver can still run without the instrument.
if ! "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --check; then
  echo "WARN: mooncake instrument --check failed for $MC_FILE; transfer logs may be missing" >&2
fi

echo "[1] launch 2 instances"
# Both instances are required. Without this check a short GPUS list would
# start a single server and the health wait below would only give up after
# its full retry budget.
if (( ${#gpu_arr[@]} < 2 )); then
  echo "FATAL GPUS must list at least 2 devices (got: '$GPUS')" >&2
  exit 1
fi

# Instance i runs on the i-th GPU with its own HTTP port, Mooncake bootstrap
# port and MASTER_PORT (29600 + i). Each server is started in the background
# under nohup and disowned; its stdout/stderr go to a per-instance log file.
for i in 0 1; do
  gpu=${gpu_arr[$i]}
  port=${PORTS[$i]}
  bp=${BPS[$i]}
  master=$((29600 + i))
  PYTHONHASHSEED=42 \
    VLLM_MOONCAKE_BOOTSTRAP_PORT="$bp" \
    MB2_LOG_DIR="$XFER_LOG_DIR" \
    CUDA_VISIBLE_DEVICES="$gpu" \
    MASTER_PORT="$master" \
    nohup "$VENV/bin/vllm" serve "$MODEL" \
    --host 0.0.0.0 --port "$port" \
    --tensor-parallel-size 1 --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
    --enable-prompt-tokens-details \
    >"$OUTDIR/logs/vllm_${i}_gpu${gpu}.log" 2>&1 &
  disown
  # Short stagger between the two launches.
  sleep 2
done

#######################################
# Poll a URL every 2 seconds until `curl -sf` succeeds.
# Each request is capped with --max-time so that a connection which is
# accepted but never answered cannot stall the loop past its retry budget.
# Arguments: $1 - URL to poll
#            $2 - give up once more than this many polls have failed
# Returns:   0 when the URL answered, 1 when the budget was exhausted
#######################################
wait_for_url() {
  local url=$1
  local max_tries=$2
  local tries=0
  until curl -sf --max-time 5 "$url" >/dev/null 2>&1; do
    tries=$((tries + 1))
    if (( tries > max_tries )); then
      return 1
    fi
    sleep 2
  done
  return 0
}

echo "[2] wait for health"
# An instance that never becomes healthy is fatal for the run.
for i in 0 1; do
  if ! wait_for_url "http://127.0.0.1:${PORTS[$i]}/health" 180; then
    echo "FATAL inst_$i not healthy"
    exit 1
  fi
  echo " inst_$i ready"
done

# bootstrap /query reachable? A miss here is only a warning; the run goes on.
for i in 0 1; do
  bp=${BPS[$i]}
  if ! wait_for_url "http://127.0.0.1:$bp/query" 60; then
    echo "WARN bootstrap $bp not ready"
  fi
done

echo "[3] run MB6 driver"
# Instance 0 is passed as the source and instance 1 as the destination.
# Driver output (stdout + stderr) is shown live and saved to mb6_run.txt.
"$PYTHON" "$DRIVER" \
  --src-port "${PORTS[0]}" --dst-port "${PORTS[1]}" \
  --src-bp "${BPS[0]}" --dst-bp "${BPS[1]}" \
  --sizes "$SIZES" --bg-loads "$BG_LOADS" --repeats "$REPEATS" \
  --label "$LABEL" --out "$OUTDIR/mb6_result.json" \
  2>&1 | tee "$OUTDIR/mb6_run.txt"
# Keep the driver's own exit status (not tee's) so that a failed benchmark
# is not reported as a success after teardown.
driver_rc=${PIPESTATUS[0]}

echo "[4] teardown + revert"
pkill -9 -f "vllm serve" 2>/dev/null || true
pkill -9 -f "EngineCore" 2>/dev/null || true
sleep 4
# Unlike the EXIT-trap cleanup, this revert is not silenced, so a failure to
# restore the connector file is visible in the run output.
"$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --revert

echo ""
if (( driver_rc != 0 )); then
  echo "FAILED: MB6 driver exited with status $driver_rc. Partial artifacts in $OUTDIR/" >&2
  exit "$driver_rc"
fi
echo "Done. Artifacts in $OUTDIR/"
echo " mb6_result.json mb6_run.txt xfer_log/ logs/"