#!/usr/bin/env bash
# Launch vLLM instances + (optional) proxy for one MB5 config.
#
#   CONFIG=8C       : 8 kv_both instances on GPU 0-7 (replayer
#                     talks directly via round-robin)
#   CONFIG=8C-proxy : the same 8 kv_both instances, fronted by a
#                     passthrough proxy on PROXY_PORT
#   CONFIG=6P+2D    : 6 kv_producer (GPU 0-5) + 2 kv_consumer (GPU 6-7)
#   CONFIG=5P+3D    : 5 producer (GPU 0-4) + 3 consumer (GPU 5-7)
#   CONFIG=4P+4D    : 4 producer (GPU 0-3) + 4 consumer (GPU 4-7)
#   CONFIG=3P+5D    : 3 producer (GPU 0-2) + 5 consumer (GPU 3-7)
#   CONFIG=2P+6D    : 2 producer (GPU 0-1) + 6 consumer (GPU 2-7)
#
# All configs use the fresh venv (vanilla vLLM 0.18.1 + Mooncake 0.3.11),
# kv_both/kv_producer/kv_consumer roles, MB5 scheduler instrumentation
# applied. PD configs are launched per the official vLLM example
# (run_mooncake_connector.sh) — round-robin P / round-robin D, modelled on
# mooncake_connector_proxy.py. The proxy this script actually starts is
# mb5_pd_proxy.py (next to this script), listening on PROXY_PORT; P routing
# can be overridden with MB5_P_ROUTING (default: rr).
#
# Usage:
#   CONFIG=4P+4D RUN_LABEL=run1 bash mb5_launch.sh start
#   bash mb5_launch.sh status
#   bash mb5_launch.sh stop
#
# After "start", grep "ENDPOINTS=" from this script's output to get the
# URL(s) the replayer should target.

# Abort on the first failing command; a pipeline fails if any stage fails.
set -eo pipefail

# --- locations -------------------------------------------------------------
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# MB5_VENV lets a second host use an isolated venv clone (e.g. .venv_dash0) so
# two boxes can run in parallel without racing on the shared cpfs venv patch.
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Directory containing this script; the helper scripts below live next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTRUMENT="${SCRIPT_DIR}/instrument_kv_snapshot.py"
PROXY_SRC="${SCRIPT_DIR}/mb5_pd_proxy.py"

# --- run selection (both overridable from the environment) ------------------
CONFIG="${CONFIG:-8C}"
RUN_LABEL="${RUN_LABEL:-default}"

# All artefacts for this run live here
RUN_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_LABEL}_${CONFIG}"
LOGS_DIR="${RUN_ROOT}/vllm_logs"
SNAPSHOT_DIR="${RUN_ROOT}/kv_snapshots"

# --- ports -------------------------------------------------------------------
PROXY_PORT="${PROXY_PORT:-8100}"
# PROXY_PORT is later spliced into a regex and into URLs, so reject anything
# that is not a plain decimal number up front instead of failing obscurely.
if ! [[ "${PROXY_PORT}" =~ ^[0-9]+$ ]]; then
  echo "[mb5] FATAL PROXY_PORT must be a decimal port number, got '${PROXY_PORT}'" >&2
  exit 1
fi
# Instance idx gets HTTP port BASE_HTTP+idx, bootstrap port BASE_BP+idx and
# MASTER_PORT BASE_MASTER+idx.
BASE_HTTP=8000
BASE_BP=8998
BASE_MASTER=29500

#######################################
# Kill every proxy / vLLM process from a previous run and verify that the
# ports this script needs are free again.
# Globals:   BASE_HTTP (read, default 8000), PROXY_PORT (read, default 8100)
# Arguments: none
# Outputs:   on failure, a FATAL line plus the offending `ss` rows on stdout
# Returns:   0 when all ports are free; otherwise EXITS the script with 1
#######################################
stop_all() {
  local pattern
  for pattern in \
    "mb5_pd_proxy.py" \
    "mooncake_connector_proxy.py" \
    "vllm serve" \
    "EngineCore"; do
    # Best effort: pkill returns non-zero when nothing matched.
    pkill -9 -f "${pattern}" 2>/dev/null || true
  done
  sleep 3

  # Hard guarantee: required ports must be free before we start. If they
  # aren't, an earlier run left a stale process holding the socket and the
  # readiness check would (silently) probe the stale proxy.
  local base_http="${BASE_HTTP:-8000}"
  local proxy_port="${PROXY_PORT:-8100}"
  local -a required_ports=()
  local i
  for ((i = 0; i < 8; i++)); do
    required_ports+=("$((base_http + i))")
  done
  required_ports+=("${proxy_port}")

  # Take one snapshot of the listening sockets. Capturing it (rather than
  # piping ss into `grep -q`) avoids a spurious pipefail failure when grep
  # exits early, and lets the same rows serve as the diagnostic output.
  local listeners
  listeners="$(ss -ltnp 2>/dev/null)" || true

  local port holders
  for port in "${required_ports[@]}"; do
    # Column 4 is the local address:port; match the port at its end only.
    holders="$(awk -v port="${port}" '$4 ~ ("[:.]" port "$")' <<< "${listeners}")" || true
    if [[ -n "${holders}" ]]; then
      echo "[mb5] FATAL port ${port} still in use after stop_all; manual cleanup needed"
      printf '%s\n' "${holders}"
      exit 1
    fi
  done
}

# Sub-command dispatch: "stop" and "status" finish here; "start" (the default
# when no argument is given) falls through to the launch logic below.
case "${1:-start}" in
  stop)
    stop_all
    # Best-effort removal of the MB5 instrumentation patch; failures are
    # deliberately ignored so "stop" always succeeds.
    # NOTE(review): this runs whatever `python` is on PATH — the venv is not
    # activated on this code path. Confirm that is intended.
    python "${INSTRUMENT}" --revert --venv "${VENV}" 2>/dev/null || true
    exit 0
    ;;
  status)
    # 8C is probed on each of its 8 instance ports; every other CONFIG is
    # probed on the proxy port only.
    ports=()
    case "${CONFIG}" in
      8C)
        for ((i = 0; i < 8; i++)); do
          ports+=("$((BASE_HTTP + i))")
        done
        ;;
      *)
        ports=("${PROXY_PORT}")
        ;;
    esac
    for p in "${ports[@]}"; do
      # --max-time keeps a wedged endpoint from hanging the status report.
      if curl -sf --max-time 5 "http://127.0.0.1:${p}/health" >/dev/null 2>&1 \
        || curl -sf --max-time 5 "http://127.0.0.1:${p}/v1/models" >/dev/null 2>&1; then
        echo "port ${p}: UP"
      else
        echo "port ${p}: DOWN"
      fi
    done
    exit 0
    ;;
  start) ;;
  *)
    echo "Unknown command: $1" >&2
    exit 1
    ;;
esac

# --- parse CONFIG into (prefill_gpus, decode_gpus) ----------------
# Result: ROLES is "combined" or "pd". Combined configs set COMBINED_GPUS
# (and blank P_GPUS/D_GPUS); PD configs set P_GPUS and D_GPUS as
# comma-separated GPU id lists. USE_COLO_PROXY is 1 only for 8C-proxy.
USE_COLO_PROXY=0
case "${CONFIG}" in
  8C | 8C-proxy)
    ROLES="combined"
    P_GPUS=""
    D_GPUS=""
    COMBINED_GPUS="0,1,2,3,4,5,6,7"
    if [ "${CONFIG}" = "8C-proxy" ]; then
      USE_COLO_PROXY=1
    fi
    ;;
  6P+2D | 5P+3D | 4P+4D | 3P+5D | 2P+6D)
    ROLES="pd"
    # The first N GPUs prefill, the rest decode. In "0,1,...,7" the first N
    # ids occupy 2N-1 characters and the remainder starts at offset 2N.
    _mb5_all_gpus="0,1,2,3,4,5,6,7"
    _mb5_n_prefill="${CONFIG%%P*}"
    P_GPUS="${_mb5_all_gpus:0:$((2 * _mb5_n_prefill - 1))}"
    D_GPUS="${_mb5_all_gpus:$((2 * _mb5_n_prefill))}"
    unset _mb5_all_gpus _mb5_n_prefill
    ;;
  *)
    echo "Unknown CONFIG=${CONFIG} (expected: 8C, 8C-proxy, 6P+2D, 5P+3D, 4P+4D, 3P+5D, 2P+6D)"
    exit 1
    ;;
esac

# Start from a clean slate, then create this run's output directories.
stop_all
mkdir -p "${LOGS_DIR}"
mkdir -p "${SNAPSHOT_DIR}"

# Enter the venv so `python` and `vllm` below resolve inside it.
. "${VENV}/bin/activate"

# Apply MB5 patch (idempotent — affects entire shared cpfs venv).
python "${INSTRUMENT}" --apply --venv "${VENV}"

#######################################
# Start one `vllm serve` instance in the background, detached from this shell.
# Globals:   BASE_MASTER, MODEL, SNAPSHOT_DIR, LOGS_DIR (all read)
# Arguments: $1 - instance index (also offsets MASTER_PORT)
#            $2 - GPU id exported as CUDA_VISIBLE_DEVICES
#            $3 - HTTP port to listen on
#            $4 - kv_role (kv_both | kv_producer | kv_consumer)
#            $5 - Mooncake bootstrap port, or "-" / empty for none
# Outputs:   one "[mb5] launching ..." line on stdout; the server's own
#            output goes to ${LOGS_DIR}/vllm_idx<idx>_gpu<gpu>_<role>.log
# Returns:   does not wait for the server; readiness is checked by the caller
#######################################
launch_vllm() {
  local idx="$1" gpu="$2" port="$3" role="$4" bp="${5:-}"
  local cfg="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"${role}\"}"
  local master=$((BASE_MASTER + idx))

  # Bootstrap port: always set (vllm only consumes it for producer/kv_both
  # modes; kv_consumer mode just ignores it). Use a placeholder if the
  # caller didn't supply one — bash one-shot env-prefix with a dynamic
  # variable is fragile, so we always export a literal.
  local bp_use bp_label
  if [[ -z "${bp}" || "${bp}" == "-" ]]; then
    bp_use="9999"
    bp_label="none" # "-" means "no bootstrap port"; log it as such
  else
    bp_use="${bp}"
    bp_label="${bp}"
  fi

  echo "[mb5] launching idx=${idx} gpu=${gpu} port=${port} role=${role} bp=${bp_label}"
  PYTHONHASHSEED=42 \
    MB5_LOG_DIR="${SNAPSHOT_DIR}" \
    CUDA_VISIBLE_DEVICES="${gpu}" \
    MASTER_PORT="${master}" \
    VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp_use}" \
    nohup vllm serve "${MODEL}" \
    --host 0.0.0.0 --port "${port}" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 \
    --max-model-len 200000 \
    --max-num-batched-tokens 8192 \
    --kv-transfer-config "${cfg}" \
    --enable-prompt-tokens-details \
    > "${LOGS_DIR}/vllm_idx${idx}_gpu${gpu}_${role}.log" 2>&1 &
  # Drop the job from this shell's job table so it outlives the script.
  disown
}

# Launch one vLLM instance per configured GPU. idx counts the instances
# started so far and doubles as the port offset for each of them.
idx=0
proxy_args=()
colo_args=()
ENDPOINTS=""

if [ "${ROLES}" = "combined" ]; then
  # Every GPU runs a kv_both instance; each is a direct replayer endpoint
  # and also a --colo backend for the optional passthrough proxy.
  IFS=',' read -r -a GPUS <<< "${COMBINED_GPUS}"
  for gpu in "${GPUS[@]}"; do
    port=$((BASE_HTTP + idx))
    bp=$((BASE_BP + idx))
    launch_vllm "${idx}" "${gpu}" "${port}" "kv_both" "${bp}"
    if [ -n "${ENDPOINTS}" ]; then
      ENDPOINTS="${ENDPOINTS},http://127.0.0.1:${port}"
    else
      ENDPOINTS="http://127.0.0.1:${port}"
    fi
    colo_args+=(--colo "http://127.0.0.1:${port}")
    ((idx += 1))
    sleep 1
  done
elif [ "${ROLES}" = "pd" ]; then
  # producers: registered with the proxy as URL + bootstrap port
  IFS=',' read -r -a PG <<< "${P_GPUS}"
  for gpu in "${PG[@]}"; do
    port=$((BASE_HTTP + idx))
    bp=$((BASE_BP + idx))
    launch_vllm "${idx}" "${gpu}" "${port}" "kv_producer" "${bp}"
    proxy_args+=(--prefill "http://127.0.0.1:${port}" "${bp}")
    ((idx += 1))
    sleep 1
  done
  # consumers: no bootstrap port ("-"), registered by URL only
  IFS=',' read -r -a DG <<< "${D_GPUS}"
  for gpu in "${DG[@]}"; do
    port=$((BASE_HTTP + idx))
    launch_vllm "${idx}" "${gpu}" "${port}" "kv_consumer" "-"
    proxy_args+=(--decode "http://127.0.0.1:${port}")
    ((idx += 1))
    sleep 1
  done
fi

#######################################
# Probe: succeed only when URL answers with a non-error HTTP status
# (curl -f semantics). Bounded by --max-time so a wedged socket cannot stall.
# Arguments: $1 - URL
#######################################
mb5_probe_ok() {
  curl -sf --max-time 5 "$1" >/dev/null 2>&1
}

#######################################
# Probe: succeed when URL returns ANY real HTTP status, 404 included.
# curl reports "000" when it got no HTTP response at all, so both curl's exit
# status and the code are checked instead of relying on pipefail.
# Arguments: $1 - URL
#######################################
mb5_probe_alive() {
  local code
  code="$(curl -s -o /dev/null --max-time 5 -w '%{http_code}' "$1" 2>/dev/null)" || return 1
  [[ "${code}" =~ ^[1-9][0-9][0-9]$ ]]
}

#######################################
# Poll a probe every 2 s until it succeeds.
# Arguments: $1 - probe function name, called as: probe URL
#            $2 - URL
#            $3 - max failed attempts tolerated before giving up
# Returns:   0 once the probe succeeds, 1 after more than $3 failures
#######################################
mb5_wait_ready() {
  local probe="$1" url="$2" max_tries="$3"
  local tries=0
  until "${probe}" "${url}"; do
    tries=$((tries + 1))
    if ((tries > max_tries)); then
      return 1
    fi
    sleep 2
  done
  return 0
}

#######################################
# Wait for the proxy on PROXY_PORT and make it the sole replayer endpoint.
# Globals:   PROXY_PORT, LOGS_DIR (read); ENDPOINTS (written)
# Arguments: $1 - label prefix for messages ("" or "colo ")
# Returns:   0 when the proxy responds; EXITS the script with 1 on timeout
#######################################
mb5_await_proxy() {
  local label="$1"
  # wait for proxy. Official mooncake_connector_proxy only handles
  # /v1/completions, so /health and /v1/models return 404 — accept any
  # HTTP response as "alive".
  if ! mb5_wait_ready mb5_probe_alive "http://127.0.0.1:${PROXY_PORT}/" 60; then
    echo "[mb5] FATAL ${label}proxy did not come up in 2 min"
    tail -40 "${LOGS_DIR}/proxy.log" || true
    exit 1
  fi
  echo " ${label}proxy port=${PROXY_PORT} ready (HTTP responding)"
  ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
}

echo "[mb5] waiting for all vllm /v1/models endpoints..."
all_ports=()
for ((p = 0; p < idx; p++)); do
  all_ports+=("$((BASE_HTTP + p))")
done
for port in "${all_ports[@]}"; do
  if ! mb5_wait_ready mb5_probe_ok "http://127.0.0.1:${port}/v1/models" 300; then
    echo "[mb5] FATAL port ${port} did not come up in 10 min"
    tail -40 "${LOGS_DIR}/vllm_idx"*"_gpu"*".log" || true
    exit 1
  fi
  echo " port=${port} ready"
done

if [ "${ROLES}" = "pd" ]; then
  P_ROUTING="${MB5_P_ROUTING:-rr}"
  echo "[mb5] launching mooncake_connector_proxy on ${PROXY_PORT} (P routing=${P_ROUTING})"
  MB5_P_ROUTING="${P_ROUTING}" \
    nohup python "${PROXY_SRC}" "${proxy_args[@]}" --port "${PROXY_PORT}" --host 0.0.0.0 \
    > "${LOGS_DIR}/proxy.log" 2>&1 &
  disown
  mb5_await_proxy ""
fi

if [ "${USE_COLO_PROXY}" = "1" ]; then
  echo "[mb5] launching colo passthrough proxy on ${PROXY_PORT} (8 kv_both instances)"
  nohup python "${PROXY_SRC}" "${colo_args[@]}" --port "${PROXY_PORT}" --host 0.0.0.0 \
    > "${LOGS_DIR}/proxy.log" 2>&1 &
  disown
  mb5_await_proxy "colo "
fi

# Final summary. The ENDPOINTS= line is what callers grep for to find the
# URL(s) the replayer should target.
printf '%s\n' \
  "[mb5] CONFIG=${CONFIG} RUN_LABEL=${RUN_LABEL} UP" \
  "ENDPOINTS=${ENDPOINTS}" \
  "RUN_ROOT=${RUN_ROOT}" \
  "SNAPSHOT_DIR=${SNAPSHOT_DIR}" \
  "VLLM_LOGS=${LOGS_DIR}"