#!/usr/bin/env bash
# agentic-kvc/microbench/fresh_setup/start_vllm_single.sh
# Start ONE vLLM instance with Mooncake kv_connector, for inter-node MB2.
# Run separately on each host (dash1, dash2) before kicking off the bench.
#
# Usage on each host:
#   INSTANCE=A GPU=0 PORT=8000 BP=8998 MASTER=29500 ROLE=kv_both \
#     bash microbench/fresh_setup/start_vllm_single.sh start
#   bash microbench/fresh_setup/start_vllm_single.sh status
#   bash microbench/fresh_setup/start_vllm_single.sh stop
#
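# All hosts share cpfs, so the venv at FRESH_ROOT/.venv is installed once
# and the instrumentation patch applies to every host. The per-instance
# directory FRESH_ROOT/mb2_transfer_logs/{INSTANCE}/ is exported to vLLM as
# MB2_LOG_DIR; vLLM server output goes to LOGS_DIR (default
# FRESH_ROOT/mb2_logs). Both are visible from any host.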

set -eo pipefail

# Shared-filesystem locations. FRESH_ROOT can be overridden from the
# environment (like MODEL and LOGS_DIR); the venv and the transfer-log root
# are always derived from it.
FRESH_ROOT="${FRESH_ROOT:-/home/admin/cpfs/wjh/agentic-kv-fresh}"
VENV="${FRESH_ROOT}/.venv"
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
LOGS_DIR="${LOGS_DIR:-${FRESH_ROOT}/mb2_logs}"
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"

# Per-instance knobs, all overridable from the environment:
#   INSTANCE  label used in log file and log directory names
#   GPU       value exported as CUDA_VISIBLE_DEVICES
#   PORT      HTTP port passed to `vllm serve` and probed via /health
#   BP        value exported as VLLM_MOONCAKE_BOOTSTRAP_PORT
#   MASTER    value exported as MASTER_PORT
#   ROLE      kv_role placed in the --kv-transfer-config JSON
INSTANCE="${INSTANCE:-A}"
GPU="${GPU:-0}"
PORT="${PORT:-8000}"
BP="${BP:-8998}"
MASTER="${MASTER:-29500}"
ROLE="${ROLE:-kv_both}"

# Fail fast on malformed ports: PORT is spliced into a `pkill -9 -f` regex and
# a URL further down, so a stray space or regex metacharacter must not get
# that far.
for _port_var in PORT BP MASTER; do
    if [[ ! "${!_port_var}" =~ ^[0-9]+$ ]]; then
        echo "[mb2-single] ${_port_var} must be a numeric port, got '${!_port_var}'" >&2
        exit 2
    fi
done
unset _port_var

# Make sure both log destinations exist: the server-log directory and this
# instance's own subdirectory under the transfer-log root.
instance_log_dir="${MB2_LOG_ROOT}/${INSTANCE}"
mkdir -p "${LOGS_DIR}" "${instance_log_dir}"

# Resolve the absolute directory containing this script so that the
# instrumentation helper sitting next to it is found from any cwd.
this_script="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd "$(dirname "${this_script}")" && pwd)"
INSTRUMENT="${SCRIPT_DIR}/instrument_mooncake.py"

# Force-kill (SIGKILL) processes left over from a previous launch, then pause
# briefly before returning.
# Globals:  PORT (read) - selects which `vllm serve` command line to match.
# Returns:  exit status of the final sleep; "no match" from pkill is ignored.
# NOTE: the "EngineCore" pattern is not port-scoped, so it matches any such
# process on this host, not only those belonging to the server on PORT.
stop_local() {
    local victim_pattern
    for victim_pattern in "vllm serve.*--port ${PORT} " "EngineCore"; do
        # pkill exits non-zero when nothing matched; that is not an error here.
        pkill -9 -f "${victim_pattern}" 2>/dev/null || true
    done
    sleep 2
}

# Command dispatch. `stop` and `status` are handled here and exit; `start`
# (also the default when no argument is given) falls through to the launch
# code that follows. Anything else is rejected with exit status 1.
case "${1:-start}" in
    stop)
        stop_local
        # Patch revert is only safe to do when no other instance is using
        # the venv — for a shared cpfs venv we leave it applied until all
        # instances are stopped. Do it manually with:
        #   python instrument_mooncake.py --revert
        exit 0
        ;;
    status)
        # --max-time bounds the probe so a server that accepts the connection
        # but never answers cannot hang `status`. The command always exits 0;
        # the UP/DOWN line on stdout carries the result.
        if curl -sf --max-time 5 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
            echo "port ${PORT}: UP"
        else
            echo "port ${PORT}: DOWN"
        fi
        exit 0
        ;;
    start)
        ;;
    *)
        # Diagnostics belong on stderr so they are not mistaken for output.
        echo "Unknown command: $1" >&2
        echo "Usage: $0 [start|stop|status]" >&2
        exit 1
        ;;
esac

# ---- start: relaunch the server for this INSTANCE and wait until healthy ----

# Clear out anything left over from a previous launch on this port.
stop_local

source "${VENV}/bin/activate"

# Apply instrumentation on first launch (it's idempotent / safe to re-apply).
# This step stays best-effort and never blocks the launch, but a failure is
# reported instead of being swallowed, so a run is not silently started with
# the patch step having failed.
if [[ -f "${INSTRUMENT}" ]]; then
    if ! python "${INSTRUMENT}" --apply --venv "${VENV}" 2>&1; then
        echo "[mb2-single] WARNING: instrumentation step failed; launching anyway" >&2
    fi
else
    echo "[mb2-single] WARNING: ${INSTRUMENT} not found; skipping instrumentation step" >&2
fi

# JSON passed to --kv-transfer-config; ROLE is inserted verbatim.
printf -v cfg '{"kv_connector":"MooncakeConnector","kv_role":"%s"}' "${ROLE}"
echo "[mb2-single] launching ${INSTANCE}: gpu=${GPU} port=${PORT} bp=${BP} role=${ROLE}"

# Compute the log path once so the failure paths below tail exactly the file
# this launch writes to. LOGS_DIR may be shared between hosts, so a host
# wildcard could also pick up another host's log.
log_file="${LOGS_DIR}/vllm_${INSTANCE}_$(hostname -s)_gpu${GPU}.log"

PYTHONHASHSEED=42 \
VLLM_MOONCAKE_BOOTSTRAP_PORT="${BP}" \
CUDA_VISIBLE_DEVICES="${GPU}" \
MASTER_PORT="${MASTER}" \
MB2_LOG_DIR="${MB2_LOG_ROOT}/${INSTANCE}" \
nohup vllm serve "${MODEL}" \
    --host 0.0.0.0 --port "${PORT}" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 \
    --max-model-len 200000 \
    --kv-transfer-config "${cfg}" \
    --enable-prompt-tokens-details \
    > "${log_file}" 2>&1 &
vllm_pid=$!
disown

echo "[mb2-single] waiting for /health on port ${PORT}..."
# Poll /health every 2 s; give up after max_tries polls (~6 min).
max_tries=180
tries=0
# --max-time keeps a single stuck probe from defeating the overall limit.
until curl -sf --max-time 5 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; do
    # kill -0 only checks that the PID still exists. If the server has already
    # exited, stop now instead of polling until the timeout.
    if ! kill -0 "${vllm_pid}" 2>/dev/null; then
        echo "[mb2-single] FATAL vllm (pid ${vllm_pid}) exited before port ${PORT} came up" >&2
        tail -40 "${log_file}" >&2 || true
        exit 1
    fi
    tries=$((tries + 1))
    if (( tries > max_tries )); then
        echo "[mb2-single] FATAL port ${PORT} did not come up in 6 min" >&2
        tail -40 "${log_file}" >&2 || true
        exit 1
    fi
    sleep 2
done
echo "[mb2-single] ${INSTANCE} UP on $(hostname -s):${PORT} (bp ${BP}, gpu ${GPU})"