agentic-kvc/microbench/fresh_setup/start_vllm_pair.sh

#!/usr/bin/env bash
# Start 2 vLLM instances with Mooncake kv_connector (kv_both) for MB2.
#
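# Default config: both instances run on the local host, on GPU 0 and GPU 1
# (intra-node A/B test). Override the GPUs via the GPU_A / GPU_B env vars;
# ROLE_A / ROLE_B, MODEL and LOGS_DIR can be overridden the same way.
# (HOST_A / HOST_B are not read by this script.)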
#
# This uses the FRESH venv at /home/admin/cpfs/wjh/agentic-kv-fresh/.venv
# (vanilla vllm 0.18.1 + vanilla mooncake-transfer-engine 0.3.11), NOT
# the dash0 patched build.
#
# Usage:
#   GPU_A=0 GPU_B=1 bash microbench/fresh_setup/start_vllm_pair.sh
#   bash microbench/fresh_setup/start_vllm_pair.sh status
#   bash microbench/fresh_setup/start_vllm_pair.sh stop

# Abort on the first failing command or failing pipeline stage.
set -eo pipefail

# Root of the fresh install and the virtualenv inside it.
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
VENV="${FRESH_ROOT}/.venv"
# Model to serve and directory for the vLLM stdout/stderr logs
# (both overridable from the environment).
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
LOGS_DIR="${LOGS_DIR:-${FRESH_ROOT}/mb2_logs}"
mkdir -p "${LOGS_DIR}"

# Per-instance settings. Every value below may be overridden from the
# environment and falls back to the default shown, so a second pair can be
# pointed at different GPUs/ports without editing this file.
GPU_A="${GPU_A:-0}"                # CUDA_VISIBLE_DEVICES value for instance A
GPU_B="${GPU_B:-1}"                # CUDA_VISIBLE_DEVICES value for instance B
PORT_A="${PORT_A:-8000}"           # HTTP port of instance A
PORT_B="${PORT_B:-8001}"           # HTTP port of instance B
BP_A="${BP_A:-8998}"               # VLLM_MOONCAKE_BOOTSTRAP_PORT for instance A
BP_B="${BP_B:-8999}"               # VLLM_MOONCAKE_BOOTSTRAP_PORT for instance B
MASTER_A="${MASTER_A:-29500}"      # MASTER_PORT for instance A
MASTER_B="${MASTER_B:-29501}"      # MASTER_PORT for instance B
ROLE_A="${ROLE_A:-kv_both}"  # kv_both (works) or kv_producer (hits vllm 0.18.1 bootstrap_server bug on D-side counterpart)
ROLE_B="${ROLE_B:-kv_both}"  # kv_both / kv_consumer

# Parent of the per-instance directories handed to each server as MB2_LOG_DIR.
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"
# The instrumentation helper is expected next to this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTRUMENT="${SCRIPT_DIR}/instrument_mooncake.py"

# Kill running vLLM server / engine processes and undo the Mooncake
# instrumentation patch. Every step is best-effort.
#
# Globals:   INSTRUMENT (read) - path to instrument_mooncake.py
#            VENV (read)       - virtualenv handed to the patch script via --venv
# Arguments: none
# Outputs:   a warning on stderr if the revert command fails
# Returns:   0 (failures of the individual steps are ignored)
stop_all() {
    # pkill exits non-zero when nothing matches; tolerate that under `set -e`.
    # NOTE: -f matches on the full command line, so this hits every matching
    # process this user may signal, not only the pair started by this script.
    pkill -9 -f "vllm serve" 2>/dev/null || true
    pkill -9 -f "EngineCore" 2>/dev/null || true
    # Brief pause before continuing (presumably to let the killed processes
    # release their resources -- TODO confirm).
    sleep 2
    if [[ -f "${INSTRUMENT}" ]]; then
        # The `stop` sub-command runs before the venv is activated, so a bare
        # `python` may be absent or a different interpreter. Prefer the venv's
        # own binary and fall back to PATH only when it is not there.
        local py="python"
        if [[ -x "${VENV}/bin/python" ]]; then
            py="${VENV}/bin/python"
        fi
        if ! "${py}" "${INSTRUMENT}" --revert --venv "${VENV}" 2>/dev/null; then
            echo "[mb2] WARN instrumentation revert returned non-zero (ignored)" >&2
        fi
    fi
}

# Print "port <p>: UP" or "port <p>: DOWN" for each instance, based on whether
# its /health endpoint answers successfully.
#
# Globals:   PORT_A, PORT_B (read)
# Outputs:   one line per port on stdout
status_all() {
    local p
    for p in "${PORT_A}" "${PORT_B}"; do
        # --max-time bounds each probe so a server that accepts the connection
        # but never answers cannot hang the status command.
        if curl -sf --max-time 5 "http://127.0.0.1:${p}/health" >/dev/null 2>&1; then
            echo "port ${p}: UP"
        else
            echo "port ${p}: DOWN"
        fi
    done
}

# Sub-command dispatch. With no argument the script behaves as `start` and
# falls through to the launch sequence below; `stop` and `status` exit here.
case "${1:-start}" in
    stop)
        stop_all
        exit 0
        ;;
    status)
        status_all
        exit 0
        ;;
    start)
        ;;
    *)
        # Diagnostics belong on stderr so they are not mistaken for output.
        echo "Unknown command: $1" >&2
        echo "Usage: ${0##*/} [start|stop|status]" >&2
        exit 1
        ;;
esac

# Begin from a clean slate: stop anything left over from an earlier run.
stop_all

# From here on, commands resolve inside the fresh virtualenv.
source "${VENV}/bin/activate"

# Apply the instrumentation patch when the helper script is present;
# otherwise carry on with a warning (the message notes that transfer logs
# will then be missing).
if [[ ! -f "${INSTRUMENT}" ]]; then
    printf '%s\n' "[mb2] WARN instrument_mooncake.py not found at ${INSTRUMENT}; transfer logs will be absent"
else
    printf '%s\n' "[mb2] applying instrumentation patch"
    python "${INSTRUMENT}" --apply --venv "${VENV}"
fi

# One log directory per instance (A and B).
mkdir -p "${MB2_LOG_ROOT}"/{A,B}

# Start one vLLM server in the background with the Mooncake KV connector.
#
# Globals:   MODEL, LOGS_DIR, MB2_LOG_ROOT (read)
#            MB2_MAX_MODEL_LEN (read, optional) - --max-model-len, default 200000
#            MB2_GPU_MEM_UTIL  (read, optional) - --gpu-memory-utilization,
#                                                 default 0.9
# Arguments: $1 idx    - instance label, used in the log file / log dir names
#            $2 gpu    - value for CUDA_VISIBLE_DEVICES
#            $3 port   - HTTP port passed to --port
#            $4 bp     - value for VLLM_MOONCAKE_BOOTSTRAP_PORT
#            $5 master - value for MASTER_PORT
#            $6 role   - kv_role written into --kv-transfer-config
# Outputs:   one "[mb2] launching ..." line on stdout; the server's own
#            stdout/stderr go to ${LOGS_DIR}/vllm_<idx>_gpu<gpu>.log
launch() {
    local idx="$1" gpu="$2" port="$3" bp="$4" master="$5" role="$6"
    local cfg="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"${role}\"}"
    # Previously hard-coded; the defaults keep the original behaviour.
    local max_len="${MB2_MAX_MODEL_LEN:-200000}"
    local mem_util="${MB2_GPU_MEM_UTIL:-0.9}"
    # Keep the argument list in an array so every value stays one word.
    local -a serve_args=(
        "${MODEL}"
        --host 0.0.0.0 --port "${port}"
        --tensor-parallel-size 1
        --trust-remote-code --enable-prefix-caching
        --dtype auto --gpu-memory-utilization "${mem_util}"
        --max-model-len "${max_len}"
        --kv-transfer-config "${cfg}"
        --enable-prompt-tokens-details
    )
    echo "[mb2] launching ${idx}: gpu=${gpu} port=${port} bp=${bp} role=${role}"
    # The assignments below are scoped to this one command only.
    PYTHONHASHSEED=42 \
    VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
    CUDA_VISIBLE_DEVICES="${gpu}" \
    MASTER_PORT="${master}" \
    MB2_LOG_DIR="${MB2_LOG_ROOT}/${idx}" \
    nohup vllm serve "${serve_args[@]}" \
        > "${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log" 2>&1 &
    # Drop the job from this shell's job table so the server outlives the script.
    disown
}

# Bring up instance A, then B after a short stagger (presumably to avoid
# start-up contention between the two servers -- TODO confirm).
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}" "${ROLE_A}"
sleep 3
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}" "${ROLE_B}"

# Poll each /health endpoint in turn. Each port has its own budget of 180
# retries with a 2 s pause between them (about 6 min of waiting).
echo "[mb2] waiting for both /health endpoints..."
for port in "${PORT_A}" "${PORT_B}"; do
    tries=0
    # --max-time bounds each probe: without it, a socket that accepts the
    # connection but never answers would block curl indefinitely and the
    # retry budget below would never be reached.
    while ! curl -sf --max-time 5 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; do
        tries=$((tries+1))
        if (( tries > 180 )); then
            # Diagnostics go to stderr, followed by the tail of the server
            # logs so the failure cause is visible without opening the files.
            echo "[mb2] FATAL port ${port} did not come up in 6 min" >&2
            tail -40 "${LOGS_DIR}/vllm_"*"_gpu"*".log" >&2 || true
            exit 1
        fi
        sleep 2
    done
    echo "  port=${port} ready"
done

# Summary of where each instance listens and where its logs are written.
echo "[mb2] both instances UP"
echo "  A: 127.0.0.1:${PORT_A} (GPU ${GPU_A}, bp ${BP_A}, log_dir ${MB2_LOG_ROOT}/A)"
echo "  B: 127.0.0.1:${PORT_B} (GPU ${GPU_B}, bp ${BP_B}, log_dir ${MB2_LOG_ROOT}/B)"
echo "  vllm stdout: ${LOGS_DIR}"