scripts/b3_isolated_policy.sh:
Recognize unified_v3 as a kv_both-requiring policy; respect explicit
KV_CONNECTOR=Nixl override (so unified_v2 / unified_v3 / unified_kv_both
can run against either Mooncake or Nixl back-end). When Nixl is
selected, skip the bootstrap-ports plumbing — Nixl uses its own UCX
side-channel and the proxy forwards kv_transfer_params from the src
response body instead of pre-baking engine_id/bootstrap_addr.
scripts/cache_aware_proxy.py:
- New unified_v3 policy (~250 lines): prefill stays on session-affinity
host (preserves intra-session prefix-cache reuse), decode is migrated
to a lower-load target when the affinity host is busy with concurrent
decodes. KV transfer flows prefill_host → decode_target, opposite of
v2. Knobs: v3_min_new_tokens, v3_min_prefill_decode_busy,
v3_target_load_ratio, v3_min_load_gap, v3_rotate_affinity,
v3_prefer_cache_target. cache_miss_audit found that rotation hurts
cross-turn locality (9.5% hit rate with rotation vs ~80% without), so
v3_rotate_affinity defaults to False.
- New connector_type setting ("mooncake" | "nixl") gating the PD-sep
handshake form: mooncake uses pre-baked kv_transfer_params,
nixl forwards them from the response body.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
212 lines
7.9 KiB
Bash
Executable File
212 lines
7.9 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Run a single B3 policy with a cold-start vLLM (clean APC).
|
|
#
|
|
# Usage:
|
|
# bash scripts/b3_isolated_policy.sh <policy> <trace> <rundir>
|
|
#
|
|
# Launches one fresh vLLM instance per GPU listed in GPU_INDICES (8 by
# default), captures their engine_state into <rundir>/engine_state/, runs
# the policy through the proxy on <trace>, then kills everything. Distinct
# from b3_sweep.sh, which shares one set of vLLM instances across all five
# policies (faster, but the cache is warm).
|
|
|
|
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration. Every value below except VENV and N_INSTANCES can be
# overridden from the environment.
# ---------------------------------------------------------------------------

# Repository root; defaults to the parent of the directory holding this script.
ROOT="${ROOT:-$(cd -- "$(dirname -- "$0")/.." && pwd)}"
# Virtualenv bin directory that provides the `vllm` and `python` executables.
VENV="$ROOT/.venv/bin"
# Model path handed to `vllm serve` and to the replayer's --model flag.
MODEL="${MODEL:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Port passed to the cache-aware proxy via --port.
PROXY_PORT="${PROXY_PORT:-9300}"
# vLLM instance i serves HTTP on BASE_PORT + i.
BASE_PORT="${BASE_PORT:-8000}"
# Whitespace-separated GPU ids; one vLLM instance is launched per entry.
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
# Extra flags appended (word-split) to every `vllm serve` command line.
EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:---enable-prompt-tokens-details}"

# Instance count = number of words in GPU_INDICES. The value is quoted so it
# is counted verbatim (no glob expansion), and passed through $(( )) so any
# padding emitted by `wc` is stripped.
N_INSTANCES=$(wc -w <<<"$GPU_INDICES")
N_INSTANCES=$((N_INSTANCES))
if [ "$N_INSTANCES" -lt 1 ]; then
  echo "[isolated] FATAL: GPU_INDICES contains no GPU index" >&2
  exit 1
fi

# When ENABLE_KV_BOTH=1, vLLM launches with a KV connector (Mooncake by
# default, or Nixl when KV_CONNECTOR=Nixl) in kv_both role, and the proxy is
# given the matching handshake settings (--bootstrap-ports for Mooncake,
# --connector-type nixl for Nixl). It is switched on automatically for the
# policies that need per-request PD-sep (unified_v2, unified_v3,
# unified_kv_both, unified_nixl_both) but is disabled by default because it
# adds always-on KV-transfer overhead even when not triggered.
ENABLE_KV_BOTH="${ENABLE_KV_BOTH:-0}"
# Mooncake bootstrap port of instance i is BOOTSTRAP_BASE_PORT + i.
BOOTSTRAP_BASE_PORT="${BOOTSTRAP_BASE_PORT:-8998}"
|
|
|
|
# Required positional arguments: <policy> <trace> <rundir>.
# A missing or empty argument aborts the script with the usage line.
_usage="usage: $0 <policy> <trace> <rundir>"
POLICY="${1:?$_usage}"
TRACE="${2:?$_usage}"
RUNDIR="${3:?$_usage}"
|
|
|
|
# Auto-enable kv_both when the policy requires it.
# KV_CONNECTOR (Mooncake|Nixl) selects the underlying connector when
# ENABLE_KV_BOTH=1. An explicit KV_CONNECTOR from the environment is honoured
# for unified_v2 / unified_v3 / unified_kv_both; without one the connector
# defaults to Mooncake.

# Records whether the caller supplied KV_CONNECTOR (empty when it did not).
_KV_CONNECTOR_EXPLICIT="${KV_CONNECTOR:-}"
KV_CONNECTOR="${KV_CONNECTOR:-Mooncake}"

#######################################
# Derive ENABLE_KV_BOTH / KV_CONNECTOR from the selected policy.
# Globals:
#   POLICY          (read)    policy name given on the command line
#   ENABLE_KV_BOTH  (written) forced to 1 for kv_both-requiring policies
#   KV_CONNECTOR    (written) canonicalised to "Mooncake" or "Nixl"
# Returns:
#   0 on success; 1 when kv_both is enabled but KV_CONNECTOR names neither
#   supported connector. The launch code compares against the exact strings
#   "Mooncake" and "Nixl", so any other value would start vLLM without a
#   connector at all.
#######################################
resolve_kv_settings() {
  local kv_lower

  case "${POLICY:-}" in
    unified_v2 | unified_v3 | unified_kv_both)
      ENABLE_KV_BOTH=1
      ;;
    unified_nixl_both)
      # This policy always runs on Nixl, whatever KV_CONNECTOR says.
      ENABLE_KV_BOTH=1
      KV_CONNECTOR="Nixl"
      ;;
  esac

  # Accept any letter case (e.g. KV_CONNECTOR=nixl) and map it onto the
  # canonical spelling the rest of the script tests for.
  kv_lower=$(printf '%s' "$KV_CONNECTOR" | tr '[:upper:]' '[:lower:]')
  case "$kv_lower" in
    mooncake)
      KV_CONNECTOR="Mooncake"
      ;;
    nixl)
      KV_CONNECTOR="Nixl"
      ;;
    *)
      # The connector only matters when kv_both is on; otherwise it is ignored.
      if [ "${ENABLE_KV_BOTH:-0}" = "1" ]; then
        echo "[isolated] FATAL: unsupported KV_CONNECTOR='$KV_CONNECTOR' (expected Mooncake or Nixl)" >&2
        return 1
      fi
      ;;
  esac
}

resolve_kv_settings || exit 1
|
|
|
|
# Create the output tree: per-engine step logs go under engine_state/,
# vLLM stdout/stderr under logs/.
mkdir -p -- "$RUNDIR/engine_state" "$RUNDIR/logs"
# TRACE is quoted so a trace path containing spaces is still reduced to its
# file name instead of being split into several basename arguments.
echo "[isolated] policy=$POLICY trace=$(basename -- "$TRACE") rundir=$RUNDIR"
|
|
|
|
#######################################
# Stop the proxy and every vLLM process started by (or left over from) a run.
# Matches on the full command line (pkill -f) for the proxy script,
# `vllm serve`, and EngineCore. Processes first get SIGTERM and a short grace
# period so they can shut down on their own; anything still alive is then
# SIGKILLed.
# Globals:
#   CLEANUP_GRACE_SECS (read, optional) seconds to wait between SIGTERM and
#                      SIGKILL; defaults to 2.
# Returns:
#   Always 0 in practice: "nothing matched" from pkill is not an error.
#######################################
cleanup() {
  local -r grace_secs="${CLEANUP_GRACE_SECS:-2}"
  local -a patterns=(cache_aware_proxy "vllm serve" EngineCore)
  local pattern
  local signalled=0

  # Polite pass: pkill exits 0 only when it signalled at least one process.
  for pattern in "${patterns[@]}"; do
    if pkill -TERM -f "$pattern" 2>/dev/null; then
      signalled=1
    fi
  done

  # Only pay for the grace period when something was actually running.
  if [ "$signalled" = "1" ]; then
    sleep "$grace_secs"
  fi

  # Hard pass: whatever ignored SIGTERM is killed outright.
  for pattern in "${patterns[@]}"; do
    pkill -9 -f "$pattern" 2>/dev/null || true
  done

  # Settle time before the caller continues (presumably so ports and GPU
  # memory are released -- the original gives no reason; confirm).
  sleep 3
}
|
|
# Tear everything down whenever the script exits, on success or failure.
trap 'cleanup' EXIT

# Hard reset before launching anything, so leftovers from an earlier run
# cannot interfere with this one.
cleanup
|
|
|
|
echo "[isolated] launching $N_INSTANCES vLLM on GPUs $GPU_INDICES ENABLE_KV_BOTH=$ENABLE_KV_BOTH KV_CONNECTOR=$KV_CONNECTOR ..."

# Launch one vLLM instance per GPU. All instances share a single command
# line; only the connector-specific environment variables and the
# --kv-transfer-config flag differ between the Mooncake, Nixl and plain
# (no connector) cases.
i=0
for gpu in $GPU_INDICES; do
  port=$((BASE_PORT + i))
  master=$((29500 + i))

  # Environment common to every instance.
  launch_env=(
    "AGENTIC_STEP_LOG_PATH=$RUNDIR/engine_state/engine_${i}.jsonl"
    "AGENTIC_WORKER_ID=engine_${i}"
    "CUDA_VISIBLE_DEVICES=$gpu"
    "MASTER_PORT=$master"
  )
  kv_args=()

  if [ "$ENABLE_KV_BOTH" = "1" ] && [ "$KV_CONNECTOR" = "Mooncake" ]; then
    bp=$((BOOTSTRAP_BASE_PORT + i))
    # NOTE(review): PYTHONHASHSEED=42 is set on the Mooncake path only;
    # confirm whether the Nixl path needs a fixed hash seed as well.
    launch_env+=("PYTHONHASHSEED=42" "VLLM_MOONCAKE_BOOTSTRAP_PORT=$bp")
    kv_args=(--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}')
  elif [ "$ENABLE_KV_BOTH" = "1" ] && [ "$KV_CONNECTOR" = "Nixl" ]; then
    # NixlConnector's handshake listener binds to a fixed default port 5600
    # unless VLLM_NIXL_SIDE_CHANNEL_PORT is overridden. Multiple instances
    # on the same host MUST use distinct ports or only one will start; the
    # rest hit `zmq.error.ZMQError: Address already in use`.
    nixl_port=$((5600 + i))
    launch_env+=("VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port")
    kv_args=(--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}')
  fi

  # ${kv_args[@]+...} expands to nothing for an empty array without tripping
  # `set -u` on older bash. EXTRA_VLLM_ARGS is deliberately left unquoted so
  # it word-splits into separate flags.
  # shellcheck disable=SC2086
  nohup env "${launch_env[@]}" "$VENV/vllm" serve "$MODEL" \
    --host 0.0.0.0 --port "$port" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 \
    --max-model-len 200000 \
    ${kv_args[@]+"${kv_args[@]}"} \
    $EXTRA_VLLM_ARGS \
    > "$RUNDIR/logs/vllm_inst_${i}_gpu${gpu}.log" 2>&1 &
  disown
  # Stagger the launches by two seconds.
  sleep 2
  i=$((i + 1))
done
|
|
|
|
# NIXL init takes ~100-150s per instance even with concurrent launches;
# Mooncake is closer to ~30-60s. Each instance is polled every 2s for up to
# HEALTH_MAX_TRIES attempts; the default of 180 gives a generous 360s per
# instance, which covers both connectors.
HEALTH_MAX_TRIES="${HEALTH_MAX_TRIES:-180}"

#######################################
# Block until every vLLM instance answers GET /health.
# Globals:
#   N_INSTANCES       (read) number of instances to wait for
#   BASE_PORT         (read) instance i listens on BASE_PORT + i
#   HEALTH_MAX_TRIES  (read) failed polls tolerated per instance
#   RUNDIR            (read) used to locate the instance log on failure
# Outputs:
#   One "ready" line per instance; on failure a FATAL line followed by the
#   last 30 lines of that instance's log (same diagnostics style as the
#   proxy start-up check).
# Returns:
#   0 when all instances are healthy, 1 as soon as one exceeds its budget.
#######################################
wait_for_vllm_health() {
  local i port tries
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    tries=0
    while ! curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
      tries=$((tries + 1))
      if [ "$tries" -gt "$HEALTH_MAX_TRIES" ]; then
        echo "[isolated] FATAL: inst_$i not healthy after $((HEALTH_MAX_TRIES * 2))s"
        # The log name embeds the GPU id, which is not known here, so glob
        # for it; a missing log must not mask the real failure.
        tail -30 "$RUNDIR"/logs/vllm_inst_"${i}"_gpu*.log 2>/dev/null || true
        return 1
      fi
      sleep 2
    done
    echo " inst_$i ready"
  done
}

echo "[isolated] waiting for vLLM health ..."
wait_for_vllm_health || exit 1
|
|
|
|
echo "[isolated] launching proxy with --policy $POLICY ..."

# Backend URLs handed to --combined, one per vLLM instance.
backend_urls=()
for ((idx = 0; idx < N_INSTANCES; idx++)); do
  backend_urls+=("http://127.0.0.1:$((BASE_PORT + idx))")
done

# Connector-specific proxy flags. Bootstrap ports are only needed for the
# Mooncake handshake. Nixl uses its own UCX side-channel, and the proxy
# forwards kv_transfer_params from the source's response body instead of
# pre-baking engine_id/bootstrap_addr.
connector_flags=()
if [ "$ENABLE_KV_BOTH" = "1" ]; then
  case "$KV_CONNECTOR" in
    Mooncake)
      # Comma-separated list: BOOTSTRAP_BASE_PORT + i for each instance.
      bootstrap_csv=""
      for ((idx = 0; idx < N_INSTANCES; idx++)); do
        bootstrap_csv="${bootstrap_csv:+$bootstrap_csv,}$((BOOTSTRAP_BASE_PORT + idx))"
      done
      connector_flags=(--bootstrap-ports ${bootstrap_csv:+"$bootstrap_csv"})
      ;;
    Nixl)
      connector_flags=(--connector-type nixl)
      ;;
  esac
fi

# EXTRA_PROXY_ARGS is deliberately left unquoted so it word-splits into
# separate flags; the ${arr[@]+...} form keeps empty arrays safe under
# `set -u` on older bash.
# shellcheck disable=SC2086
nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
  --port "$PROXY_PORT" \
  --combined ${backend_urls[@]+"${backend_urls[@]}"} \
  --policy "$POLICY" \
  ${connector_flags[@]+"${connector_flags[@]}"} \
  ${EXTRA_PROXY_ARGS:-} \
  > "$RUNDIR/proxy.log" 2>&1 &
disown
|
|
# Poll the proxy's /stats endpoint every 2s until it answers. After more
# than 30 failed polls, give up, show the tail of the proxy log and abort.
tries=0
while ! curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
  tries=$((tries + 1))
  if ((tries > 30)); then
    echo "[isolated] FATAL: proxy did not come up in 60s"
    tail -30 "$RUNDIR/proxy.log"
    exit 1
  fi
  sleep 2
done
|
|
|
|
# Replay the trace through the proxy, recording the wall-clock window
# (t_start / t_end, seconds with a fractional part) around the replay.
t_start=$(date +%s.%N)
echo "[isolated] running replayer ..."
replayer_cmd=(
  "$VENV/python" -m replayer
  --trace "$TRACE"
  --output "$RUNDIR/metrics.jsonl"
  --endpoint "http://127.0.0.1:$PROXY_PORT"
  --model "$MODEL"
)
# The full output goes to replayer.log; only its last three lines are shown.
PYTHONPATH="$ROOT" "${replayer_cmd[@]}" 2>&1 \
  | tee "$RUNDIR/replayer.log" \
  | tail -3
t_end=$(date +%s.%N)
|
|
|
|
#######################################
# Write <rundir>/run_window.json describing the replay window.
# Arguments:
#   $1 - run directory that receives run_window.json
#   $2 - policy name
#   $3 - trace path
#   $4 - replay start, Unix seconds (may carry a fractional part)
#   $5 - replay end, Unix seconds (may carry a fractional part)
# Returns:
#   python3's exit status; non-zero when a timestamp is not a valid float or
#   the file cannot be written.
#######################################
write_run_window() {
  local out_dir=$1 policy=$2 trace=$3 started=$4 ended=$5
  python3 - "$out_dir" "$policy" "$trace" "$started" "$ended" <<'PY'
import json, sys

rundir, policy, trace, t_start, t_end = sys.argv[1:]
# Build the record (including the float conversions) before opening the
# output, so a malformed timestamp fails without leaving an empty
# run_window.json behind.
window = {
    "policy": policy, "trace": trace,
    "t_start_unix": float(t_start),
    "t_end_unix": float(t_end),
    "isolated": True,
}
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump(window, f, indent=2)
PY
}

write_run_window "$RUNDIR" "$POLICY" "$TRACE" "$t_start" "$t_end"
|
|
|
|
# Snapshot the proxy's reporting endpoints into the run directory; each
# endpoint /<name> is saved as <name>.json.
for endpoint in breakdown worker_state stats; do
  curl -s "http://127.0.0.1:$PROXY_PORT/$endpoint" > "$RUNDIR/$endpoint.json"
done
echo "[isolated] $POLICY done: $(wc -l < "$RUNDIR/metrics.jsonl") metric rows"
|