Files
agentic-kvc/microbench/fresh_setup/mb5_launch.sh

248 lines
9.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Launch vLLM instances + (optional) proxy for one MB5 config.
#
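#   CONFIG=8C       : 8 kv_both instances on GPU 0-7 (replayer talks to them
#                     directly via round-robin)
#   CONFIG=8C-proxy : the same 8 kv_both instances, fronted by a passthrough
#                     proxy on PROXY_PORT
#   CONFIG=6P+2D    : 6 kv_producer (GPU 0-5) + 2 kv_consumer (GPU 6-7)
#   CONFIG=5P+3D    : 5 producer (GPU 0-4) + 3 consumer (GPU 5-7)
#   CONFIG=4P+4D    : 4 producer (GPU 0-3) + 4 consumer (GPU 4-7)
#   CONFIG=3P+5D    : 3 producer (GPU 0-2) + 5 consumer (GPU 3-7)
#   CONFIG=2P+6D    : 2 producer (GPU 0-1) + 6 consumer (GPU 2-7)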
#
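# All configs use the fresh venv (vanilla vLLM 0.18.1 + Mooncake 0.3.11),
# kv_both/kv_producer/kv_consumer roles, with the MB5 scheduler
# instrumentation applied. PD configs are launched per the official vLLM
# example (run_mooncake_connector.sh) — round-robin P / round-robin D through
# a proxy on PROXY_PORT. The proxy started by this script is mb5_pd_proxy.py
# from this directory (the official example uses mooncake_connector_proxy.py);
# producer routing can be overridden with MB5_P_ROUTING (default: rr).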
#
# Usage:
# CONFIG=4P+4D RUN_LABEL=run1 bash mb5_launch.sh start
# bash mb5_launch.sh status
# bash mb5_launch.sh stop
#
# After "start", grep "ENDPOINTS=" from this script's output to get the
# URL(s) the replayer should target.
# Abort on the first failing command and let a failing pipeline stage fail the
# whole pipeline.
set -e -o pipefail

# ---- locations --------------------------------------------------------------
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# Directory holding this script; the helper Python files sit next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTRUMENT="${SCRIPT_DIR}/instrument_kv_snapshot.py"
PROXY_SRC="${SCRIPT_DIR}/mb5_pd_proxy.py"

# ---- caller-overridable knobs (environment wins, otherwise the default) -----
# MB5_VENV points a second host at its own venv clone (e.g. .venv_dash0) so two
# boxes can run in parallel without racing on the patch applied to the shared
# cpfs venv.
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
: "${MODEL:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
: "${CONFIG:=8C}"
: "${RUN_LABEL:=default}"
: "${PROXY_PORT:=8100}"

# ---- per-run output tree: every artefact of this run lives below RUN_ROOT ---
RUN_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_LABEL}_${CONFIG}"
LOGS_DIR="${RUN_ROOT}/vllm_logs"
SNAPSHOT_DIR="${RUN_ROOT}/kv_snapshots"

# ---- port bases; instance idx is added to each ------------------------------
BASE_HTTP=8000
BASE_BP=8998
BASE_MASTER=29500
# Succeed (status 0) when some TCP socket is listening on port $1.
# The match is done in one awk pass over the local-address column: a trailing
# `| grep -q` can exit early and, under `set -o pipefail`, turn a real match
# into a failed pipeline.
_mb5_port_listening() {
  ss -ltn 2>/dev/null \
    | awk -v port="$1" '$4 ~ ("[:.]" port "$") { hit = 1 } END { exit(hit ? 0 : 1) }'
}

# Kill every MB5-related process on this host, then verify that the ports a
# fresh start needs are free.
#
# Globals:  BASE_HTTP (read; 8000 if unset), PROXY_PORT (read)
# Outputs:  on a stuck port, a FATAL line plus the matching `ss -ltnp` rows
# Returns:  0 when all ports are free; otherwise exits the script with status 1
stop_all() {
  local pattern port i
  local -a required_ports=()

  # pkill -f matches against full command lines; SIGKILL is sent straight
  # away and "nothing matched" is not treated as an error.
  for pattern in "mb5_pd_proxy.py" "mooncake_connector_proxy.py" "vllm serve" "EngineCore"; do
    pkill -9 -f "${pattern}" 2>/dev/null || true
  done
  sleep 3

  # Hard guarantee: required ports must be free before we start. If they
  # aren't, an earlier run left a stale process holding the socket and the
  # readiness check would (silently) probe the stale proxy.
  for ((i = 0; i < 8; i++)); do
    required_ports+=( "$(( ${BASE_HTTP:-8000} + i ))" )
  done
  required_ports+=( "${PROXY_PORT}" )

  for port in "${required_ports[@]}"; do
    if _mb5_port_listening "${port}"; then
      echo "[mb5] FATAL port ${port} still in use after stop_all; manual cleanup needed"
      # With -p each row ends in the process column, so the port has to be
      # matched on the local-address field rather than at end of line.
      ss -ltnp 2>/dev/null \
        | awk -v port="${port}" '$4 ~ ("[:.]" port "$")' || true
      exit 1
    fi
  done
}
# --- sub-command dispatch: stop | status | start (default) --------------------
# "stop" and "status" finish here; "start" falls through to the rest of the
# script.
case "${1:-start}" in
  stop)
    stop_all
    # Best-effort revert of the instrumentation patch. The venv is not
    # activated on this path, so prefer the venv's own interpreter when it
    # exists; bare `python` may be absent from PATH, and the failure would be
    # swallowed by the `|| true` below.
    revert_python="python"
    if [[ -x "${VENV}/bin/python" ]]; then
      revert_python="${VENV}/bin/python"
    fi
    "${revert_python}" "${INSTRUMENT}" --revert --venv "${VENV}" 2>/dev/null || true
    exit 0
    ;;
  status)
    ports=()
    proxy_fronted=1
    case "${CONFIG}" in
      8C)
        # Direct mode: one HTTP port per instance.
        proxy_fronted=0
        for i in 0 1 2 3 4 5 6 7; do ports+=( $((BASE_HTTP+i)) ); done
        ;;
      *)
        # Every other config is reached through the proxy.
        ports=( "${PROXY_PORT}" )
        ;;
    esac
    for p in "${ports[@]}"; do
      state="DOWN"
      if curl -sf --max-time 5 "http://127.0.0.1:${p}/health" >/dev/null 2>&1 \
        || curl -sf --max-time 5 "http://127.0.0.1:${p}/v1/models" >/dev/null 2>&1; then
        state="UP"
      elif (( proxy_fronted )); then
        # The proxy may answer 404 on /health and /v1/models, so apply the
        # same rule the start path uses: any HTTP response means "alive".
        # curl reports 000 when no response was received.
        code="$(curl -s -o /dev/null --max-time 5 -w '%{http_code}' \
          "http://127.0.0.1:${p}/" 2>/dev/null || true)"
        if [[ "${code}" =~ ^[0-9]{3}$ && "${code}" != "000" ]]; then
          state="UP"
        fi
      fi
      echo "port ${p}: ${state}"
    done
    exit 0
    ;;
  start)
    ;;
  *)
    echo "Unknown command: $1"
    exit 1
    ;;
esac
# --- translate CONFIG into ROLES plus the GPU lists for each role -----------
# Combined configs use COMBINED_GPUS; PD configs split GPUs 0-7 so the first
# N (the number before "P") become producers and the rest consumers.
USE_COLO_PROXY=0
P_GPUS=""
D_GPUS=""
case "${CONFIG}" in
  8C | 8C-proxy)
    ROLES="combined"
    COMBINED_GPUS="0,1,2,3,4,5,6,7"
    if [[ "${CONFIG}" == "8C-proxy" ]]; then
      USE_COLO_PROXY=1
    fi
    ;;
  6P+2D | 5P+3D | 4P+4D | 3P+5D | 2P+6D)
    ROLES="pd"
    _mb5_n_prefill="${CONFIG%%P*}"
    for ((_mb5_gpu = 0; _mb5_gpu < 8; _mb5_gpu++)); do
      if (( _mb5_gpu < _mb5_n_prefill )); then
        P_GPUS="${P_GPUS:+${P_GPUS},}${_mb5_gpu}"
      else
        D_GPUS="${D_GPUS:+${D_GPUS},}${_mb5_gpu}"
      fi
    done
    ;;
  *)
    echo "Unknown CONFIG=${CONFIG} (expected: 8C, 8C-proxy, 6P+2D, 5P+3D, 4P+4D, 3P+5D, 2P+6D)"
    exit 1
    ;;
esac
# Pre-flight: refuse to tear anything down when this launch cannot proceed
# anyway. stop_all kills running instances, so a mistyped MB5_VENV or a
# missing helper script should be caught before it runs, not after.
if [[ ! -r "${VENV}/bin/activate" ]]; then
  echo "[mb5] FATAL venv activate script not readable: ${VENV}/bin/activate"
  exit 1
fi
if [[ ! -f "${INSTRUMENT}" ]]; then
  echo "[mb5] FATAL instrumentation script not found: ${INSTRUMENT}"
  exit 1
fi
stop_all
mkdir -p "${LOGS_DIR}" "${SNAPSHOT_DIR}"
# shellcheck disable=SC1091  # path is only known at run time
source "${VENV}/bin/activate"
# Apply MB5 patch (idempotent). It modifies the venv that VENV points at:
# the shared cpfs venv by default, or the clone selected through MB5_VENV.
python "${INSTRUMENT}" --apply --venv "${VENV}"
# Start one `vllm serve` instance in the background, detached from this shell.
#
# Arguments: $1 instance index, $2 GPU id, $3 HTTP port, $4 kv_role,
#            $5 Mooncake bootstrap port ("-" or empty when the caller has none)
# Globals:   MODEL, LOGS_DIR, SNAPSHOT_DIR, BASE_MASTER (all read)
# Outputs:   one "[mb5] launching ..." line on stdout; the server's own
#            stdout/stderr go to a per-instance file under LOGS_DIR
launch_vllm() {
  local idx="$1" gpu="$2" port="$3" role="$4" bp="$5"
  local kv_cfg master_port bootstrap_port log_file
  local -a serve_args

  printf -v kv_cfg '{"kv_connector":"MooncakeConnector","kv_role":"%s"}' "${role}"
  master_port=$((BASE_MASTER + idx))
  log_file="${LOGS_DIR}/vllm_idx${idx}_gpu${gpu}_${role}.log"

  # The bootstrap port variable is always exported. Per the original note,
  # vllm only consumes it in producer/kv_both mode and kv_consumer ignores
  # it, so callers without a port get a fixed placeholder instead of an
  # env-prefix that is only sometimes present.
  case "${bp}" in
    "" | "-") bootstrap_port="9999" ;;
    *)        bootstrap_port="${bp}" ;;
  esac

  serve_args=(
    "${MODEL}"
    --host 0.0.0.0 --port "${port}"
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization 0.9
    --max-model-len 200000
    --max-num-batched-tokens 8192
    --kv-transfer-config "${kv_cfg}"
    --enable-prompt-tokens-details
  )

  echo "[mb5] launching idx=${idx} gpu=${gpu} port=${port} role=${role} bp=${bp:-none}"
  PYTHONHASHSEED=42 \
  MB5_LOG_DIR="${SNAPSHOT_DIR}" \
  CUDA_VISIBLE_DEVICES="${gpu}" \
  MASTER_PORT="${master_port}" \
  VLLM_MOONCAKE_BOOTSTRAP_PORT="${bootstrap_port}" \
    nohup vllm serve "${serve_args[@]}" > "${log_file}" 2>&1 &
  disown
}
# --- launch one vLLM instance per GPU ----------------------------------------
# idx counts launched instances and doubles as the offset added to BASE_HTTP
# and BASE_BP. proxy_args / colo_args collect the command-line arguments for
# the proxy; ENDPOINTS collects the direct URLs for combined mode.
idx=0
proxy_args=()
colo_args=()
ENDPOINTS=""

if [[ "${ROLES}" == "combined" ]]; then
  IFS=',' read -ra combined_gpu_ids <<< "${COMBINED_GPUS}"
  for gpu in "${combined_gpu_ids[@]}"; do
    port=$((BASE_HTTP + idx))
    bp=$((BASE_BP + idx))
    url="http://127.0.0.1:${port}"
    launch_vllm "${idx}" "${gpu}" "${port}" "kv_both" "${bp}"
    ENDPOINTS="${ENDPOINTS:+${ENDPOINTS},}${url}"
    colo_args+=( --colo "${url}" )
    idx=$((idx + 1))
    sleep 1
  done
elif [[ "${ROLES}" == "pd" ]]; then
  # Producers come first; each one gets its own bootstrap port, which is
  # also handed to the proxy.
  IFS=',' read -ra prefill_gpu_ids <<< "${P_GPUS}"
  for gpu in "${prefill_gpu_ids[@]}"; do
    port=$((BASE_HTTP + idx))
    bp=$((BASE_BP + idx))
    launch_vllm "${idx}" "${gpu}" "${port}" "kv_producer" "${bp}"
    proxy_args+=( --prefill "http://127.0.0.1:${port}" "${bp}" )
    idx=$((idx + 1))
    sleep 1
  done
  # Consumers continue the same idx sequence and are launched with "-" in
  # place of a bootstrap port.
  IFS=',' read -ra decode_gpu_ids <<< "${D_GPUS}"
  for gpu in "${decode_gpu_ids[@]}"; do
    port=$((BASE_HTTP + idx))
    launch_vllm "${idx}" "${gpu}" "${port}" "kv_consumer" "-"
    proxy_args+=( --decode "http://127.0.0.1:${port}" )
    idx=$((idx + 1))
    sleep 1
  done
fi
# --- block until every launched instance answers /v1/models -------------------
# Ports are BASE_HTTP .. BASE_HTTP+idx-1, i.e. exactly the instances launched
# above. Each port gets its own budget of 300 failed probes with a 2 s pause
# between them.
echo "[mb5] waiting for all vllm /v1/models endpoints..."
all_ports=()
for ((p=0; p<idx; p++)); do all_ports+=( $((BASE_HTTP+p)) ); done
for port in "${all_ports[@]}"; do
  tries=0
  # The timeouts bound every probe: without them a connection that is
  # accepted but never answered would hang curl, and with it the whole
  # readiness loop, indefinitely.
  until curl -sf --connect-timeout 2 --max-time 5 \
    "http://127.0.0.1:${port}/v1/models" >/dev/null 2>&1; do
    tries=$((tries+1))
    if (( tries > 300 )); then
      echo "[mb5] FATAL port ${port} did not come up in 10 min"
      tail -n 40 "${LOGS_DIR}/vllm_idx"*"_gpu"*".log" || true
      exit 1
    fi
    sleep 2
  done
  echo " port=${port} ready"
done
# Block until the proxy on PROXY_PORT returns any HTTP status, or abort the
# script after 60 failed probes (2 s apart).
#
# Arguments: $1 label used in the messages ("proxy" or "colo proxy")
# Globals:   PROXY_PORT, LOGS_DIR (read)
# Returns:   0 once the proxy answers; exits the script with status 1 on timeout
_mb5_wait_for_proxy() {
  local label="$1"
  local tries=0 code=""
  while :; do
    # The proxy may answer 404 on "/", so any status code counts as alive.
    # curl prints 000 when it received no HTTP response at all; that must
    # be rejected explicitly rather than relying on curl's exit status
    # leaking through `pipefail`.
    code="$(curl -s -o /dev/null --max-time 5 -w '%{http_code}' \
      "http://127.0.0.1:${PROXY_PORT}/" 2>/dev/null || true)"
    if [[ "${code}" =~ ^[0-9]{3}$ && "${code}" != "000" ]]; then
      echo " ${label} port=${PROXY_PORT} ready (HTTP responding)"
      return 0
    fi
    tries=$((tries+1))
    if (( tries > 60 )); then
      echo "[mb5] FATAL ${label} did not come up in 2 min"
      tail -n 40 "${LOGS_DIR}/proxy.log" || true
      exit 1
    fi
    sleep 2
  done
}

# PD mode: the replayer reaches producers and consumers through the proxy.
if [ "${ROLES}" = "pd" ]; then
  P_ROUTING="${MB5_P_ROUTING:-rr}"
  echo "[mb5] launching mooncake_connector_proxy on ${PROXY_PORT} (P routing=${P_ROUTING})"
  MB5_P_ROUTING="${P_ROUTING}" \
    nohup python "${PROXY_SRC}" "${proxy_args[@]}" --port "${PROXY_PORT}" --host 0.0.0.0 \
    > "${LOGS_DIR}/proxy.log" 2>&1 &
  disown
  _mb5_wait_for_proxy "proxy"
  ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
fi

# 8C-proxy mode: the same proxy script in front of the kv_both instances.
if [ "${USE_COLO_PROXY}" = "1" ]; then
  echo "[mb5] launching colo passthrough proxy on ${PROXY_PORT} (8 kv_both instances)"
  nohup python "${PROXY_SRC}" "${colo_args[@]}" --port "${PROXY_PORT}" --host 0.0.0.0 \
    > "${LOGS_DIR}/proxy.log" 2>&1 &
  disown
  _mb5_wait_for_proxy "colo proxy"
  ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
fi
# Final summary. The KEY=value lines are meant to be grepped by the caller
# (ENDPOINTS= gives the URL(s) the replayer should target).
printf '%s\n' \
  "[mb5] CONFIG=${CONFIG} RUN_LABEL=${RUN_LABEL} UP" \
  "ENDPOINTS=${ENDPOINTS}" \
  "RUN_ROOT=${RUN_ROOT}" \
  "SNAPSHOT_DIR=${SNAPSHOT_DIR}" \
  "VLLM_LOGS=${LOGS_DIR}"