agentic-kvc/microbench/fresh_setup/run_mb6.sh

#!/usr/bin/env bash
# MB6 launcher: starts 2 vLLM instances (kv_both, Mooncake) and runs the
# transfer-under-load sweep. Parameterized by VENV so it can run against
# either the patched main venv or the fresh upstream venv, to test whether
# the bandwidth degradation is caused by our patch or is inherent to
# upstream Mooncake.
#
# Usage:
#   VENV=/home/admin/cpfs/wjh/agentic-kv/.venv       bash run_mb6.sh   # main
#   VENV=/home/admin/cpfs/wjh/agentic-kv-fresh/.venv bash run_mb6.sh   # fresh

# Unset variables are errors and a pipeline fails if any stage fails.
# `-e` is NOT enabled, so a failing command does not abort the script on
# its own; steps that must succeed have to be checked explicitly.
set -uo pipefail

# ---- Configuration ---------------------------------------------------------
# Every ${NAME:-default} below can be overridden from the environment.
PROJ_DIR="${PROJ_DIR:-/home/admin/cpfs/wjh/agentic-kv}"
VENV="${VENV:-$PROJ_DIR/.venv}"
# Default label is the name of the directory that contains the venv
# (e.g. ".../agentic-kv-fresh/.venv" -> "agentic-kv-fresh"). Both command
# substitutions are quoted so a VENV path containing whitespace or glob
# characters is handled as a single argument.
LABEL="${LABEL:-$(basename -- "$(dirname -- "$VENV")")}"
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Whitespace-separated GPU ids; the first two are used, one per instance.
GPUS="${GPUS:-0 1}"
# Forwarded verbatim to the driver as --sizes / --bg-loads / --repeats.
SIZES="${SIZES:-16384,65536}"
BG_LOADS="${BG_LOADS:-0,8,24}"
REPEATS="${REPEATS:-4}"
# Minute-resolution timestamp used in the default output directory name.
DATE="$(date +%Y%m%d_%H%M)"
OUTDIR="${OUTDIR:-$PROJ_DIR/outputs/mb6_${LABEL}_${DATE}}"

# ---- Derived paths ---------------------------------------------------------
PYTHON="$VENV/bin/python"
MC_INSTR="$PROJ_DIR/microbench/fresh_setup/instrument_mooncake.py"
DRIVER="$PROJ_DIR/microbench/fresh_setup/mb6_transfer_under_load.py"
# Connector source file handed to the instrument script.
# NOTE(review): the python3.12 path component is hard-coded; a venv built on
# another Python minor version will not have this file at this location.
MC_FILE="$VENV/lib/python3.12/site-packages/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py"

# Create the output tree up front: server logs go to $OUTDIR/logs and
# XFER_LOG_DIR is handed to the servers as MB2_LOG_DIR. Abort immediately if
# it cannot be created; otherwise the servers' log redirects would fail and
# the script would only notice after the health-check timeout.
XFER_LOG_DIR="$OUTDIR/xfer_log"
if ! mkdir -p -- "$OUTDIR/logs" "$XFER_LOG_DIR"; then
    echo "FATAL cannot create output directories under $OUTDIR" >&2
    exit 1
fi

# Banner identifying which venv this run measures and where results land.
echo "=== MB6 transfer-under-load ($LABEL) ==="
echo "VENV : $VENV"
echo "Out  : $OUTDIR"
echo ""

# Per-instance port tables; index i belongs to instance i.
#   PORTS - HTTP port each `vllm serve` listens on
#   BPS   - value exported as VLLM_MOONCAKE_BOOTSTRAP_PORT for that instance
PORTS=(8000 8001)
BPS=(8998 8999)
# GPUS is a whitespace-separated list, so the unquoted expansion (and the
# word-splitting it triggers) is intentional here.
# shellcheck disable=SC2206
gpu_arr=($GPUS)

#######################################
# Best-effort teardown: SIGKILL leftover server processes, then ask the
# instrument script to revert its patch. Every step ignores failure and
# hides stderr, so the function always returns 0.
# Globals:   PYTHON, MC_INSTR, MC_FILE (read)
# Arguments: none
# NOTE: `pkill -f` matches against full command lines, so this kills ANY
#       process on the machine whose command line contains "vllm serve" or
#       "EngineCore", not only the ones started by this script.
# NOTE(review): --venv is given the connector *file* path (MC_FILE), not the
#       venv root - confirm this is what instrument_mooncake.py expects.
#######################################
cleanup() {
    local proc_pattern
    for proc_pattern in "vllm serve" "EngineCore"; do
        pkill -9 -f "$proc_pattern" 2>/dev/null || true
    done
    # Pause after the kills before touching the venv.
    sleep 4
    "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --revert 2>/dev/null || true
}

# Run cleanup on every exit path ...
trap cleanup EXIT
# ... and once right now, so the run starts from a clean state.
cleanup

echo "[0] apply MB2 mooncake instrument to $LABEL venv"
# The script does not run under `set -e`, so both steps are checked
# explicitly: continuing after a failed apply/check would benchmark a venv
# whose instrumentation state is unknown. Exiting here still runs the EXIT
# trap, which attempts a revert.
# NOTE(review): assumes instrument_mooncake.py follows the usual convention
# of a non-zero exit status on failure for --apply and --check - confirm.
if ! "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --apply; then
    echo "FATAL failed to apply mooncake instrument to $MC_FILE" >&2
    exit 1
fi
if ! "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --check; then
    echo "FATAL mooncake instrument check failed for $MC_FILE" >&2
    exit 1
fi

echo "[1] launch 2 instances"
# Two instances are required (the health check and the driver both address
# instance 0 and instance 1). Fail fast instead of launching a single server
# and discovering the problem only after the health-check timeout.
if (( ${#gpu_arr[@]} < 2 )); then
    echo "FATAL need at least 2 GPU ids in GPUS, got ${#gpu_arr[@]}: '$GPUS'" >&2
    exit 1
fi

# Start one `vllm serve` per GPU (first two ids only). Each instance gets its
# own HTTP port, Mooncake bootstrap port and MASTER_PORT, pins a single GPU
# via CUDA_VISIBLE_DEVICES, and writes transfer logs to the shared
# MB2_LOG_DIR. nohup + disown detach the server from this shell; stdout and
# stderr go to a per-instance log file.
# NOTE(review): PYTHONHASHSEED is pinned to the same value on both instances,
# presumably so hash-derived values agree between them - confirm.
i=0
for gpu in "${gpu_arr[@]:0:2}"; do
    port="${PORTS[$i]}"
    bp="${BPS[$i]}"
    master=$((29600 + i))
    PYTHONHASHSEED=42 \
    VLLM_MOONCAKE_BOOTSTRAP_PORT="$bp" \
    MB2_LOG_DIR="$XFER_LOG_DIR" \
    CUDA_VISIBLE_DEVICES="$gpu" \
    MASTER_PORT="$master" \
    nohup "$VENV/bin/vllm" serve "$MODEL" \
        --host 0.0.0.0 --port "$port" \
        --tensor-parallel-size 1 --trust-remote-code --enable-prefix-caching \
        --dtype auto --gpu-memory-utilization 0.9 --max-model-len 200000 \
        --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
        --enable-prompt-tokens-details \
        > "$OUTDIR/logs/vllm_${i}_gpu${gpu}.log" 2>&1 &
    disown
    # Stagger the two launches slightly.
    sleep 2
    i=$((i + 1))
done

echo "[2] wait for health"

#######################################
# Poll a URL every 2 s until `curl -f` succeeds.
# Arguments: $1 - URL to probe
#            $2 - number of failed attempts tolerated; the next failure
#                 after that gives up
# Returns:   0 once a probe succeeds, 1 after more than $2 failed probes
#######################################
wait_http_ok() {
    local url=$1 max_tries=$2 tries=0
    # --max-time bounds every probe; without it a connection that is accepted
    # but never answered would block forever and defeat the retry limit.
    while ! curl -sf --max-time 5 "$url" >/dev/null 2>&1; do
        tries=$((tries + 1))
        if (( tries > max_tries )); then
            return 1
        fi
        sleep 2
    done
    return 0
}

# Both servers must answer /health; anything else is fatal.
for i in 0 1; do
    if ! wait_http_ok "http://127.0.0.1:${PORTS[$i]}/health" 180; then
        echo "FATAL inst_$i not healthy"
        exit 1
    fi
    echo "  inst_$i ready"
done
# bootstrap /query reachable? Only a warning: the run proceeds regardless.
for i in 0 1; do
    if ! wait_http_ok "http://127.0.0.1:${BPS[$i]}/query" 60; then
        echo "WARN bootstrap ${BPS[$i]} not ready"
    fi
done

echo "[3] run MB6 driver"
# Run the sweep from instance 0 (src) to instance 1 (dst). Output is shown
# live and also saved to mb6_run.txt via tee.
"$PYTHON" "$DRIVER" \
    --src-port "${PORTS[0]}" --dst-port "${PORTS[1]}" \
    --src-bp "${BPS[0]}" --dst-bp "${BPS[1]}" \
    --sizes "$SIZES" --bg-loads "$BG_LOADS" --repeats "$REPEATS" \
    --label "$LABEL" --out "$OUTDIR/mb6_result.json" \
    2>&1 | tee "$OUTDIR/mb6_run.txt"
# PIPESTATUS is overwritten by the next command, so capture the driver's own
# status (stage 0 of the pipeline) immediately.
driver_rc=${PIPESTATUS[0]}
if (( driver_rc != 0 )); then
    # Without this check the script would report "Done" and exit 0 after a
    # failed run. The EXIT trap still tears down the servers and reverts the
    # instrument on this path.
    echo "FATAL MB6 driver exited with status $driver_rc (see $OUTDIR/mb6_run.txt)" >&2
    exit "$driver_rc"
fi

echo "[4] teardown + revert"
# Same hard stop as cleanup(), but the revert's stderr is left visible here
# so a problem can be seen.
pkill -9 -f "vllm serve" 2>/dev/null || true
pkill -9 -f "EngineCore" 2>/dev/null || true
sleep 4
if "$PYTHON" "$MC_INSTR" --venv "$MC_FILE" --revert; then
    # Teardown is complete; the EXIT trap would only repeat the same
    # pkill + sleep + revert sequence, so disarm it.
    trap - EXIT
else
    # Leave the EXIT trap armed so cleanup() retries the revert on exit, and
    # make sure the operator knows the venv may still be instrumented.
    echo "WARN mooncake instrument revert failed; $MC_FILE may still be patched" >&2
fi

# Closing summary: blank separator line, output directory, artifact names.
printf '\nDone. Artifacts in %s/\n' "$OUTDIR"
printf '%s\n' "  mb6_result.json  mb6_run.txt  xfer_log/  logs/"