Per-stage breakdown of "step 2" (the B-side do_remote_prefill) requires
vLLM/mooncake-internal timing — we cannot infer it from black-box HTTP
E2E. This commit adds the four pieces to do that breakdown:
instrument_mooncake.py
apply / revert / check patches on mooncake_connector_v1.py to emit
structured JSONL transfer events at two key sites:
send_blocks (P-side, on batch_transfer_sync_write):
{event, remote_session, total_bytes, duration_s, t_start_unix,
ret, tp_rank, t_log_unix}
receive_kv (D-side, on the ZMQ-driven pull request):
{event, path, local_req_ids, remote_req_ids, duration_s,
t_start_unix, tp_rank, t_log_unix}
All injected code is bracketed by `# MB2_INSTRUMENT_START/END` so the
--revert pass is a single regex scan. Apply-revert round-trip
validated on dash1 (PATCHED → py_compile ok → revert → CLEAN → ok).
start_vllm_pair.sh (updated)
- Picks up instrument_mooncake.py via SCRIPT_DIR.
- On `start`: applies patch before launching the two vLLM instances.
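- On `stop`: reverts patch. (The script installs no exit trap; the only
other revert happens in stop_all at the beginning of the next `start`.)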
- Sets per-instance MB2_LOG_DIR = $FRESH_ROOT/mb2_transfer_logs/{A,B}/
so send-side and receive-side events land in cleanly separated dirs.
deploy.sh
tar-over-ssh sync of microbench/fresh_setup/ → cpfs
/home/admin/cpfs/wjh/agentic-kv-fresh/scripts/ so dash1 / dash2 see
the same scripts (dash{1,2} don't have rsync; tar pipe works).
The mb2_kv_transfer.py client still uses black-box E2E timing — the
next commit will teach it to ingest the per-instance JSONL logs to
produce the 4-way breakdown (queueing / setup / transfer / decode).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
123 lines
3.6 KiB
Bash
Executable File
123 lines
3.6 KiB
Bash
Executable File
#!/usr/bin/env bash
# Start 2 vLLM instances with Mooncake kv_connector (kv_both) for MB2.
#
# Default config: both on local GPU 0 and 1 (intra-node A/B test).
# Override via the GPU_A / GPU_B env vars (MODEL and LOGS_DIR can be
# overridden the same way). HOST_A / HOST_B are NOT read by this script:
# both instances always bind 0.0.0.0 and are probed on 127.0.0.1.
#
# This uses the FRESH venv at /home/admin/cpfs/wjh/agentic-kv-fresh/.venv
# (vanilla vllm 0.18.1 + vanilla mooncake-transfer-engine 0.3.11), NOT
# the dash0 patched build.
#
# Usage:
#   GPU_A=0 GPU_B=1 bash microbench/fresh_setup/start_vllm_pair.sh   # start (default)
#   bash microbench/fresh_setup/start_vllm_pair.sh status
#   bash microbench/fresh_setup/start_vllm_pair.sh stop

# Abort on the first failing command or failing pipeline stage.
set -o errexit -o pipefail

# --- Filesystem layout -------------------------------------------------------
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
VENV="${FRESH_ROOT}/.venv"
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"

# --- Caller-overridable settings (an env value wins when set and non-empty) --
: "${MODEL:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
: "${LOGS_DIR:=${FRESH_ROOT}/mb2_logs}"
: "${GPU_A:=0}"
: "${GPU_B:=1}"

# --- Fixed per-instance ports ------------------------------------------------
# PORT_*   : HTTP port handed to `vllm serve --port`
# BP_*     : exported to the instance as VLLM_MOONCAKE_BOOTSTRAP_PORT
# MASTER_* : exported to the instance as MASTER_PORT
PORT_A=8000
PORT_B=8001
BP_A=8998
BP_B=8999
MASTER_A=29500
MASTER_B=29501

# --- Instrumentation helper lives next to this script ------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTRUMENT="${SCRIPT_DIR}/instrument_mooncake.py"

# vLLM stdout/stderr logs are written here; make sure the directory exists.
mkdir -p "${LOGS_DIR}"

#######################################
# Hard-stop every vLLM server / engine process and, best effort, revert the
# mooncake instrumentation patch.
# Globals:   INSTRUMENT (read) - path to instrument_mooncake.py
#            VENV       (read) - venv root passed to the helper via --venv
# Arguments: none
# Outputs:   a WARN line on stderr when the revert could not be run or failed
# Returns:   0 always (cleanup must never abort the caller under `set -e`)
#######################################
stop_all() {
  # pkill -f matches against the full command line, so this hits every
  # matching process the current user may signal, not only the pair started
  # by this script. A non-zero pkill status (e.g. nothing matched) is ignored.
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "EngineCore" 2>/dev/null || true
  sleep 2

  # Nothing to revert when the helper script is not deployed.
  [[ -f "${INSTRUMENT}" ]] || return 0

  # This function runs before the venv is activated (both for `stop` and at
  # the top of `start`), so a bare `python` may not exist on PATH. Prefer the
  # venv interpreter and only then fall back to whatever PATH offers.
  local py="${VENV}/bin/python"
  if [[ ! -x "${py}" ]]; then
    py="$(command -v python || command -v python3 || true)"
  fi
  if [[ -z "${py}" ]]; then
    echo "[mb2] WARN no python interpreter found; instrumentation patch NOT reverted" >&2
    return 0
  fi

  # The helper's own stderr stays suppressed as before, but a failed revert
  # is no longer completely silent.
  if ! "${py}" "${INSTRUMENT}" --revert --venv "${VENV}" 2>/dev/null; then
    echo "[mb2] WARN instrumentation revert exited non-zero; check with: ${py} ${INSTRUMENT} --revert --venv ${VENV}" >&2
  fi
  return 0
}

# Sub-command dispatch. With no argument the script behaves as `start`.
#   stop   - kill the instances, revert the patch, exit 0
#   status - report UP/DOWN per HTTP port, exit 0 regardless of the result
#   start  - continue with the launch sequence that follows this block
case "${1:-start}" in
  stop)
    stop_all
    exit 0
    ;;
  status)
    for p in "${PORT_A}" "${PORT_B}"; do
      # --max-time bounds the probe so a server that accepts the connection
      # but never answers is reported DOWN instead of hanging `status`.
      if curl -sf --max-time 5 "http://127.0.0.1:${p}/health" >/dev/null 2>&1; then
        echo "port ${p}: UP"
      else
        echo "port ${p}: DOWN"
      fi
    done
    exit 0
    ;;
  start)
    # Nothing to do here; execution continues after `esac`.
    ;;
  *)
    # Diagnostics belong on stderr so they do not pollute captured stdout.
    echo "Unknown command: $1" >&2
    echo "Usage: ${0##*/} [start|status|stop]" >&2
    exit 1
    ;;
esac

# Begin from a clean slate: stop_all kills any running instances and reverts
# a previously applied instrumentation patch.
stop_all

# From here on `python` and `vllm` resolve inside the fresh venv.
# shellcheck disable=SC1091
. "${VENV}/bin/activate"

# Patch the connector before any instance starts; without the helper script
# the instances still run, they just emit no transfer logs.
if [[ ! -f "${INSTRUMENT}" ]]; then
  echo "[mb2] WARN instrument_mooncake.py not found at ${INSTRUMENT}; transfer logs will be absent"
else
  echo "[mb2] applying instrumentation patch"
  python "${INSTRUMENT}" --apply --venv "${VENV}"
fi

# One transfer-log directory per instance (A and B).
mkdir -p "${MB2_LOG_ROOT}/A" "${MB2_LOG_ROOT}/B"

#######################################
# Start one vLLM server in the background with the Mooncake KV connector
# (kv_role=kv_both) and detach it from this shell.
# Globals:
#   MODEL, LOGS_DIR, MB2_LOG_ROOT  (read)
#   MAX_MODEL_LEN  (read, optional) - overrides --max-model-len, default 200000
#   GPU_MEM_UTIL   (read, optional) - overrides --gpu-memory-utilization,
#                                     default 0.9
# Arguments:
#   $1 idx    - instance label; used in the log file name and MB2_LOG_DIR
#   $2 gpu    - value exported as CUDA_VISIBLE_DEVICES
#   $3 port   - HTTP port passed to `vllm serve --port`
#   $4 bp     - value exported as VLLM_MOONCAKE_BOOTSTRAP_PORT
#   $5 master - value exported as MASTER_PORT
# Outputs:
#   one progress line on stdout; the server's stdout+stderr are written to
#   ${LOGS_DIR}/vllm_<idx>_gpu<gpu>.log
# Returns:
#   2 when not called with exactly five arguments, otherwise 0. Readiness is
#   NOT checked here; the caller polls /health.
#######################################
launch() {
  if (( $# != 5 )); then
    echo "usage: launch IDX GPU PORT BOOTSTRAP_PORT MASTER_PORT" >&2
    return 2
  fi
  local idx="$1" gpu="$2" port="$3" bp="$4" master="$5"
  local log_file="${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log"

  # Build the command line as an array so every argument stays one word.
  local -a serve_args=(
    "${MODEL}"
    --host 0.0.0.0 --port "${port}"
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization "${GPU_MEM_UTIL:-0.9}"
    --max-model-len "${MAX_MODEL_LEN:-200000}"
    --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}'
    --enable-prompt-tokens-details
  )

  echo "[mb2] launching instance ${idx} on GPU ${gpu}, port ${port}, bp ${bp}"

  # Per-instance environment:
  #   PYTHONHASHSEED - fixed seed; presumably so both instances hash prefix
  #                    blocks identically (TODO confirm)
  #   MB2_LOG_DIR    - per-instance directory for the instrumentation's
  #                    transfer-event logs
  PYTHONHASHSEED=42 \
    VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
    CUDA_VISIBLE_DEVICES="${gpu}" \
    MASTER_PORT="${master}" \
    MB2_LOG_DIR="${MB2_LOG_ROOT}/${idx}" \
    nohup vllm serve "${serve_args[@]}" \
    > "${log_file}" 2>&1 &
  # Drop the job from the shell's job table so it outlives this script.
  disown
}

# Bring up instance A, pause briefly, then instance B. The 3 s gap is
# presumably there to stagger start-up (TODO confirm the exact reason).
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}"
sleep 3
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}"

# Poll each instance's /health endpoint until it answers. Every port gets its
# own 6-minute wall-clock budget, and B's budget only starts once A is ready.
echo "[mb2] waiting for both /health endpoints..."
for port in "${PORT_A}" "${PORT_B}"; do
  # SECONDS is bash's elapsed-time counter. A wall-clock deadline, together
  # with --max-time on curl, keeps the limit meaningful even when a probe
  # stalls instead of failing fast.
  deadline=$(( SECONDS + 360 ))
  until curl -sf --max-time 5 "http://127.0.0.1:${port}/health" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      echo "[mb2] FATAL port ${port} did not come up in 6 min" >&2
      # Show only the two logs written by this run, not stale logs that an
      # earlier run with different GPU ids may have left in LOGS_DIR.
      tail -n 40 \
        "${LOGS_DIR}/vllm_A_gpu${GPU_A}.log" \
        "${LOGS_DIR}/vllm_B_gpu${GPU_B}.log" >&2 || true
      # The instances and the applied patch are left in place on failure;
      # run the script with `stop` to clean up.
      exit 1
    fi
    sleep 2
  done
  echo " port=${port} ready"
done

# Final summary: where each instance listens and where its logs are written.
printf '%s\n' \
  "[mb2] both instances UP" \
  " A: 127.0.0.1:${PORT_A} (GPU ${GPU_A}, bp ${BP_A}, log_dir ${MB2_LOG_ROOT}/A)" \
  " B: 127.0.0.1:${PORT_B} (GPU ${GPU_B}, bp ${BP_B}, log_dir ${MB2_LOG_ROOT}/B)" \
  " vllm stdout: ${LOGS_DIR}"