Adds the pieces needed to run the producer on dash1 and the consumer on dash2 with the same shared cpfs venv.

start_vllm_single.sh: configured through the INSTANCE / GPU / PORT / BP / MASTER / ROLE env vars; brings up ONE vLLM instance and applies the mooncake instrumentation patch (idempotent: since the venv is cpfs-shared, the first invocation applies it and the second is a no-op). Per-instance MB2_LOG_DIR keeps producer/consumer events separate even though both directories live on the same cpfs path visible to both hosts.

mb2_kv_transfer.py: new --src-host / --dst-host args. Defaults stay 127.0.0.1 for backward compatibility with the intra-node sweep. The /v1/completions URLs and /query URLs now use the supplied hosts. remote_bootstrap_addr is built as http://<src_host>:<src_bp> so the consumer's do_remote_prefill request carries a routable address.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
100 lines
3.3 KiB
Bash
Executable File
100 lines
3.3 KiB
Bash
Executable File
#!/usr/bin/env bash
# Start ONE vLLM instance with Mooncake kv_connector, for inter-node MB2.
# Run separately on each host (dash1, dash2) before kicking off the bench.
#
# Usage on each host:
#   INSTANCE=A GPU=0 PORT=8000 BP=8998 MASTER=29500 ROLE=kv_both \
#     bash microbench/fresh_setup/start_vllm_single.sh start
#   bash microbench/fresh_setup/start_vllm_single.sh status
#   bash microbench/fresh_setup/start_vllm_single.sh stop
#
# `status` and `stop` act on PORT, so pass the same PORT that was used for
# `start` when it differs from the default (8000).
#
# All hosts share cpfs, so the venv at FRESH_ROOT/.venv is single-installed
# and the instrumentation patch is global. Per-instance logs go under
# FRESH_ROOT/mb2_transfer_logs/{INSTANCE}/ which is visible from any host.

# Exit on the first failing command; a pipeline fails if any stage fails.
set -eo pipefail

# ---- Fixed layout on the shared cpfs filesystem ----------------------------
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# Python virtualenv activated before launching vLLM.
VENV="${FRESH_ROOT}/.venv"
# Model path passed to `vllm serve`; override with MODEL=...
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Directory receiving the vLLM stdout/stderr log; override with LOGS_DIR=...
LOGS_DIR="${LOGS_DIR:-${FRESH_ROOT}/mb2_logs}"
# Parent of the per-instance MB2_LOG_DIR directories.
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"

# ---- Per-instance knobs (all overridable from the environment) -------------
# Label used in log file names and as the MB2_LOG_DIR sub-directory.
INSTANCE="${INSTANCE:-A}"
# Exported to the server as CUDA_VISIBLE_DEVICES.
GPU="${GPU:-0}"
# HTTP port of the server; also used by `status` and `stop`.
PORT="${PORT:-8000}"
# Exported to the server as VLLM_MOONCAKE_BOOTSTRAP_PORT.
BP="${BP:-8998}"
# Exported to the server as MASTER_PORT.
MASTER="${MASTER:-29500}"
# Value of "kv_role" in the --kv-transfer-config JSON.
ROLE="${ROLE:-kv_both}"

# Make sure both log locations exist before anything writes to them.
mkdir -p "${LOGS_DIR}" "${MB2_LOG_ROOT}/${INSTANCE}"

# The instrumentation helper is expected to sit next to this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTRUMENT="${SCRIPT_DIR}/instrument_mooncake.py"

#######################################
# Force-stop the vLLM server bound to PORT on this host.
#
# Kills (SIGKILL) the `vllm serve ... --port PORT ` process, every descendant
# of it, and any EngineCore process whose parent is PID 1. Unlike a blanket
# `pkill -f EngineCore`, this leaves the engine processes of other live
# servers on the same host (different PORT / GPU) untouched.
#
# Globals:   PORT (read)
# Arguments: none
# Returns:   0 always; "nothing to kill" is not an error.
#######################################
stop_local() {
  local -a queue=() kids=() victims=()
  local pid

  # Servers are identified by their command line. The trailing space in the
  # pattern keeps e.g. port 8000 from also matching port 80001.
  mapfile -t queue < <(pgrep -f "vllm serve.*--port ${PORT} " 2>/dev/null || true)

  # Breadth-first walk of the process tree. The children must be collected
  # BEFORE the parent is killed, otherwise they get re-parented and can no
  # longer be found through `pgrep -P`.
  while (( ${#queue[@]} > 0 )); do
    pid="${queue[0]}"
    queue=("${queue[@]:1}")
    victims+=("${pid}")
    mapfile -t kids < <(pgrep -P "${pid}" 2>/dev/null || true)
    queue+=("${kids[@]}")
  done

  # EngineCore processes left behind by a server that already died.
  # NOTE(review): assumes orphans are re-parented to PID 1; under a
  # sub-reaper (some container / session setups) they would be missed.
  mapfile -t kids < <(pgrep -P 1 -f "EngineCore" 2>/dev/null || true)
  victims+=("${kids[@]}")

  if (( ${#victims[@]} > 0 )); then
    # A target may already be gone by the time we signal it; that is fine.
    kill -9 "${victims[@]}" 2>/dev/null || true
  fi

  # Short pause after the kill, kept from the original script.
  sleep 2
}

#######################################
# Dispatch the sub-command given on the command line.
#
#   stop    - stop the local instance, then exit 0
#   status  - print "port PORT: UP|DOWN" based on GET /health, then exit 0
#   start   - return to the caller, which continues with the launch sequence
#   other   - print an error on stderr and exit 1
#
# Globals:   PORT (read)
# Arguments: $1 - sub-command
# Returns:   only returns (status 0) for `start`; every other path exits.
#######################################
handle_command() {
  local cmd="$1"

  case "${cmd}" in
    stop)
      stop_local
      # Patch revert is only safe to do when no other instance is using
      # the venv — for a shared cpfs venv we leave it applied until all
      # instances are stopped. Do it manually with:
      #   python instrument_mooncake.py --revert
      exit 0
      ;;
    status)
      # --max-time keeps `status` from hanging on a port that accepts the
      # connection but never answers.
      if curl -sf --max-time 5 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
        echo "port ${PORT}: UP"
      else
        echo "port ${PORT}: DOWN"
      fi
      # Exit status stays 0 for DOWN as well; callers read the printed text.
      exit 0
      ;;
    start)
      ;;
    *)
      echo "Unknown command: ${cmd}" >&2
      exit 1
      ;;
  esac
}

handle_command "${1:-start}"

# ---- start: replace any previous instance, launch, wait until healthy ------

# Make sure nothing from a previous run is still holding the port / GPU.
stop_local

# shellcheck disable=SC1091  # path only exists on the target hosts
source "${VENV}/bin/activate"

# Apply the Mooncake instrumentation patch to the venv. Re-applying is
# expected to be harmless (the helper is described as idempotent), and a
# failure must not block the launch — but it is reported, because without
# the patch the run may not produce the expected MB2 transfer logs.
if [[ -f "${INSTRUMENT}" ]]; then
  if ! python "${INSTRUMENT}" --apply --venv "${VENV}" 2>&1; then
    echo "[mb2-single] WARNING: ${INSTRUMENT} --apply failed; continuing" >&2
  fi
else
  echo "[mb2-single] WARNING: ${INSTRUMENT} not found; skipping instrumentation" >&2
fi

# JSON handed to --kv-transfer-config; ROLE is inserted verbatim.
printf -v cfg '{"kv_connector":"MooncakeConnector","kv_role":"%s"}' "${ROLE}"
echo "[mb2-single] launching ${INSTANCE}: gpu=${GPU} port=${PORT} bp=${BP} role=${ROLE}"

# Resolve the log path once so the launch and the failure report below refer
# to exactly the same file. LOGS_DIR may be shared between hosts, so the host
# name is part of the file name.
log_file="${LOGS_DIR}/vllm_${INSTANCE}_$(hostname -s)_gpu${GPU}.log"

PYTHONHASHSEED=42 \
VLLM_MOONCAKE_BOOTSTRAP_PORT="${BP}" \
CUDA_VISIBLE_DEVICES="${GPU}" \
MASTER_PORT="${MASTER}" \
MB2_LOG_DIR="${MB2_LOG_ROOT}/${INSTANCE}" \
nohup vllm serve "${MODEL}" \
  --host 0.0.0.0 --port "${PORT}" \
  --tensor-parallel-size 1 \
  --trust-remote-code --enable-prefix-caching \
  --dtype auto --gpu-memory-utilization 0.9 \
  --max-model-len 200000 \
  --kv-transfer-config "${cfg}" \
  --enable-prompt-tokens-details \
  > "${log_file}" 2>&1 &
server_pid=$!
# Detach the job so the server keeps running after this script exits.
disown

echo "[mb2-single] waiting for /health on port ${PORT}..."
attempt=0
until curl -sf --max-time 5 "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; do
  attempt=$((attempt + 1))

  # Fail fast when the server process is already gone (bad flag, OOM, ...)
  # instead of polling a dead port until the timeout.
  if ! kill -0 "${server_pid}" 2>/dev/null; then
    echo "[mb2-single] FATAL vllm (pid ${server_pid}) exited before port ${PORT} became healthy" >&2
    tail -40 "${log_file}" >&2 || true
    exit 1
  fi

  # 180 attempts with a 2 s pause between them: roughly 6 minutes.
  if (( attempt > 180 )); then
    echo "[mb2-single] FATAL port ${PORT} did not come up in 6 min" >&2
    tail -40 "${log_file}" >&2 || true
    exit 1
  fi

  sleep 2
done

echo "[mb2-single] ${INSTANCE} UP on $(hostname -s):${PORT} (bp ${BP}, gpu ${GPU})"