This commit closes the loop on the fresh-venv MB2 path. Three corrections
on top of the previous scaffold made the bench fire successfully on
dash1 GPU 0+1 with kv_both connector roles:
1. Re-target instrumentation patch to vLLM's shipped MooncakeConnector
(vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_connector.py).
The mooncake-package's own mooncake_connector_v1.py turned out not to
be the implementation vLLM 0.18.1 loads — the
'{"kv_connector": "MooncakeConnector"}' config picks up the vLLM-shipped
one. Patches go at _send_blocks (P-side) and receive_kv_from_single_worker
(D-side, async, both entry and FINISH branch).
2. /query lives on the mooncake bootstrap port, not the vLLM HTTP port.
Add --src-bp / --dst-bp args; default 8998 / 8999.
3. kv_transfer_params schema for the vanilla connector:
do_remote_decode → {transfer_id}
do_remote_prefill → {transfer_id, remote_engine_id, remote_bootstrap_addr}
where remote_bootstrap_addr must include the http:// scheme. The dash0
smoke_test_migrate_cache.py was written for the patched build, which
used a different field-name set (remote_host, remote_port,
remote_block_ids); those are rejected here.
Also discovered (and worked around): vLLM 0.18.1 with kv_role=kv_consumer
raises AttributeError on `self.bootstrap_server` because that attribute
is only assigned conditionally inside `if not self.is_kv_consumer`. We
sidestep by running kv_both for the microbench — transfer mechanics are
identical (same batch_transfer_sync_write call); the role gate only
affects which request types each instance accepts. For §5 strict PD-disagg
baseline we'll need either to fix this bug or front the pair with a
role-aware proxy.
Sanity smoke (3 sizes × 2 repeats, dash1 GPU 0+1, kv_both intra-node):
  input (tokens)  KV-MiB  send_blocks_ms (P)  receive_kv_ms (D)  client_step2_ms
             512      48                5–23               7–33            18–91
            2048     192                  21                 23               37
            8192     768                  85                 88              110
=> intra-node bandwidth ~9 GB/s on the actual transfer for 768 MiB,
which is well below NVLink p2p; likely PCIe-staged. Worth verifying.
Next step (in flight): full sweep 512..128k tokens × 5 repeats with
the per-stage analyzer.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
126 lines
3.9 KiB
Bash
Executable File
126 lines
3.9 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Start 2 vLLM instances with Mooncake kv_connector (kv_both) for MB2.
#
# Default config: both on local GPU 0 and 1 (intra-node A/B test).
# Override via GPU_A / GPU_B env vars (ROLE_A / ROLE_B, MODEL and LOGS_DIR
# are honoured too). NOTE: HOST_A / HOST_B are not read anywhere in this
# script; both instances are started locally and bind 0.0.0.0.
#
# This uses the FRESH venv at /home/admin/cpfs/wjh/agentic-kv-fresh/.venv
# (vanilla vllm 0.18.1 + vanilla mooncake-transfer-engine 0.3.11), NOT
# the dash0 patched build.
#
# Usage:
#   GPU_A=0 GPU_B=1 bash microbench/fresh_setup/start_vllm_pair.sh
#   bash microbench/fresh_setup/start_vllm_pair.sh status
#   bash microbench/fresh_setup/start_vllm_pair.sh stop
|
|
|
|
# Abort on the first failing command and on failures inside pipelines.
# (-u is intentionally not enabled: the venv activate script is sourced later
# and is not guaranteed to be nounset-clean.)
set -eo pipefail

# Root of the fresh (unpatched) checkout; override FRESH_ROOT to run the same
# script on a host with a different layout.
FRESH_ROOT="${FRESH_ROOT:-/home/admin/cpfs/wjh/agentic-kv-fresh}"
# Virtualenv that provides `vllm` and the mooncake transfer engine.
VENV="${FRESH_ROOT}/.venv"
# Model directory handed to `vllm serve`.
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Where each instance's combined stdout/stderr log is written.
LOGS_DIR="${LOGS_DIR:-${FRESH_ROOT}/mb2_logs}"
mkdir -p "${LOGS_DIR}"
|
|
|
|
# Per-instance settings. Every value can be overridden from the environment;
# the defaults reproduce the intra-node A/B layout (GPU 0 and GPU 1).

# CUDA device exposed to each instance.
GPU_A="${GPU_A:-0}"
GPU_B="${GPU_B:-1}"
# vLLM HTTP ports (also used for the /health probes).
PORT_A="${PORT_A:-8000}"
PORT_B="${PORT_B:-8001}"
# Mooncake bootstrap ports, exported as VLLM_MOONCAKE_BOOTSTRAP_PORT.
BP_A="${BP_A:-8998}"
BP_B="${BP_B:-8999}"
# MASTER_PORT per instance, kept distinct so the two servers do not collide.
MASTER_A="${MASTER_A:-29500}"
MASTER_B="${MASTER_B:-29501}"
ROLE_A="${ROLE_A:-kv_both}"   # kv_both (works) or kv_producer (hits vllm 0.18.1 bootstrap_server bug on D-side counterpart)
ROLE_B="${ROLE_B:-kv_both}"   # kv_both / kv_consumer

# Parent directory of the per-instance MB2_LOG_DIR directories (A/ and B/).
MB2_LOG_ROOT="${MB2_LOG_ROOT:-${FRESH_ROOT}/mb2_transfer_logs}"
# Directory containing this script; the instrumentation helper lives next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTRUMENT="${SCRIPT_DIR}/instrument_mooncake.py"
|
|
|
|
#######################################
# Stop every running vLLM server / engine-core process and revert the
# Mooncake instrumentation patch (best effort).
# Globals:   INSTRUMENT (read), VENV (read),
#            STOP_GRACE_SECS (read, optional, default 10)
# Arguments: none
# Returns:   0 always; every step is best-effort.
#######################################
stop_all() {
  local -r grace_secs="${STOP_GRACE_SECS:-10}"
  local pattern py
  local -i signalled=0 waited=0

  # Ask politely first so the servers get a chance to shut down cleanly.
  for pattern in "vllm serve" "EngineCore"; do
    if pkill -TERM -f "${pattern}" 2>/dev/null; then
      signalled=1
    fi
  done

  if (( signalled )); then
    # Wait (bounded) for the processes to leave, then force-kill leftovers.
    while (( waited < grace_secs )) \
      && { pgrep -f "vllm serve" >/dev/null 2>&1 \
        || pgrep -f "EngineCore" >/dev/null 2>&1; }; do
      sleep 1
      waited=$(( waited + 1 ))
    done
    pkill -KILL -f "vllm serve" 2>/dev/null || true
    pkill -KILL -f "EngineCore" 2>/dev/null || true
    # Short settle time after the kill before anything is relaunched.
    sleep 2
  fi

  if [[ -f "${INSTRUMENT}" ]]; then
    # stop_all always runs before the venv is activated, so prefer the venv's
    # own interpreter and only fall back to whatever `python` is on PATH.
    py="${VENV}/bin/python"
    [[ -x "${py}" ]] || py="python"
    # Deliberately best-effort and silent: a failed revert must not block
    # stop/start.
    "${py}" "${INSTRUMENT}" --revert --venv "${VENV}" 2>/dev/null || true
  fi
}
|
|
|
|
#######################################
# Print UP/DOWN for the /health endpoint of both instances.
# Globals:   PORT_A, PORT_B (read)
# Outputs:   one "port <n>: UP|DOWN" line per instance on stdout
#######################################
show_status() {
  local p
  for p in "${PORT_A}" "${PORT_B}"; do
    # --max-time keeps a wedged server from hanging the status command.
    if curl -sf --max-time 5 "http://127.0.0.1:${p}/health" >/dev/null 2>&1; then
      echo "port ${p}: UP"
    else
      echo "port ${p}: DOWN"
    fi
  done
}

# Sub-command dispatch: `stop` and `status` finish here; `start` (the default
# when no argument is given) falls through to the launch sequence below.
case "${1:-start}" in
  stop)
    stop_all
    exit 0
    ;;
  status)
    show_status
    exit 0
    ;;
  start)
    ;;
  *)
    echo "Unknown command: $1" >&2
    echo "Usage: ${0##*/} [start|stop|status]" >&2
    exit 1
    ;;
esac
|
|
|
|
# Begin from a clean slate: stop leftover servers and revert any previously
# applied instrumentation.
stop_all

# Fail with a clear message instead of a bare "No such file" from `source`.
if [[ ! -f "${VENV}/bin/activate" ]]; then
  echo "[mb2] FATAL venv not found at ${VENV}" >&2
  exit 1
fi
# shellcheck disable=SC1091
source "${VENV}/bin/activate"

# Patch the connector so per-transfer timings are logged; the servers still
# start without the patch, just with no transfer logs.
if [[ -f "${INSTRUMENT}" ]]; then
  echo "[mb2] applying instrumentation patch"
  python "${INSTRUMENT}" --apply --venv "${VENV}"
else
  echo "[mb2] WARN instrument_mooncake.py not found at ${INSTRUMENT}; transfer logs will be absent" >&2
fi

# One transfer-log directory per instance (handed over as MB2_LOG_DIR).
mkdir -p "${MB2_LOG_ROOT}/A" "${MB2_LOG_ROOT}/B"
|
|
|
|
#######################################
# Start one vLLM server in the background with the Mooncake KV connector.
# Globals:   MODEL, LOGS_DIR, MB2_LOG_ROOT (read);
#            MAX_MODEL_LEN, GPU_MEM_UTIL (read, optional)
# Arguments: $1 - instance label (used in log and MB2_LOG_DIR names)
#            $2 - value for CUDA_VISIBLE_DEVICES
#            $3 - HTTP port
#            $4 - Mooncake bootstrap port (VLLM_MOONCAKE_BOOTSTRAP_PORT)
#            $5 - MASTER_PORT for this instance
#            $6 - kv_role placed in the --kv-transfer-config JSON
# Outputs:   one progress line on stdout; server output goes to
#            ${LOGS_DIR}/vllm_<label>_gpu<gpu>.log
# Returns:   1 if `vllm` is not on PATH, otherwise 0 (the server itself keeps
#            running detached and is not waited for).
#######################################
launch() {
  local idx="$1" gpu="$2" port="$3" bp="$4" master="$5" role="$6"
  local cfg="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"${role}\"}"
  local log_file="${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log"
  local -a serve_args=(
    "${MODEL}"
    --host 0.0.0.0 --port "${port}"
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization "${GPU_MEM_UTIL:-0.9}"
    --max-model-len "${MAX_MODEL_LEN:-200000}"
    --kv-transfer-config "${cfg}"
    --enable-prompt-tokens-details
  )

  # Fail fast: a missing binary would otherwise only surface after the
  # caller's health-check timeout.
  if ! command -v vllm >/dev/null 2>&1; then
    echo "[mb2] FATAL 'vllm' not found on PATH (is the venv activated?)" >&2
    return 1
  fi

  echo "[mb2] launching ${idx}: gpu=${gpu} port=${port} bp=${bp} role=${role}"
  PYTHONHASHSEED=42 \
  VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
  CUDA_VISIBLE_DEVICES="${gpu}" \
  MASTER_PORT="${master}" \
  MB2_LOG_DIR="${MB2_LOG_ROOT}/${idx}" \
    nohup vllm serve "${serve_args[@]}" > "${log_file}" 2>&1 &
  # Detach the job so it is not tied to this shell's job table.
  disown
}
|
|
|
|
# Bring up both instances; B is started a few seconds after A.
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}" "${ROLE_A}"
sleep 3
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}" "${ROLE_B}"

# Poll each /health endpoint every 2 s, giving up after 180 failed probes
# (~6 minutes) per port.
echo "[mb2] waiting for both /health endpoints..."
for port in "${PORT_A}" "${PORT_B}"; do
  tries=0
  # --max-time bounds each probe so a wedged server cannot stall the loop.
  while ! curl -sf --max-time 5 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; do
    tries=$((tries + 1))
    if (( tries > 180 )); then
      echo "[mb2] FATAL port ${port} did not come up in 6 min" >&2
      # Show only this run's logs, not stale ones left by earlier GPU layouts.
      tail -n 40 \
        "${LOGS_DIR}/vllm_A_gpu${GPU_A}.log" \
        "${LOGS_DIR}/vllm_B_gpu${GPU_B}.log" >&2 || true
      exit 1
    fi
    sleep 2
  done
  echo " port=${port} ready"
done

# Summary of where each instance listens and where its logs go.
echo "[mb2] both instances UP"
echo " A: 127.0.0.1:${PORT_A} (GPU ${GPU_A}, bp ${BP_A}, log_dir ${MB2_LOG_ROOT}/A)"
echo " B: 127.0.0.1:${PORT_B} (GPU ${GPU_B}, bp ${BP_B}, log_dir ${MB2_LOG_ROOT}/B)"
echo " vllm stdout: ${LOGS_DIR}"
|