#!/usr/bin/env bash
# Run a single B3 policy with a cold-start vLLM (clean APC).
#
# Usage:
#   bash scripts/b3_isolated_policy.sh <policy> <trace> <rundir>
#
# Launches one fresh vLLM instance per entry in GPU_INDICES (8 by default),
# captures their engine_state into <rundir>/engine_state/, runs the policy
# through the proxy on <trace>, then kills everything. Distinct from
# b3_sweep.sh which shares one vLLM-set across all five policies (faster but
# warm-cache).
#
# Optional environment overrides:
#   ROOT                 repo root (default: parent of this script's dir)
#   MODEL                model path passed to `vllm serve` and the replayer
#   PROXY_PORT           proxy listen port (default 9300)
#   BASE_PORT            first vLLM port; instance i uses BASE_PORT+i (default 8000)
#   GPU_INDICES          whitespace-separated GPU ids, one instance per id
#   EXTRA_VLLM_ARGS      extra words appended to every `vllm serve` command
#   EXTRA_PROXY_ARGS     extra words appended to the proxy command
#   ENABLE_KV_BOTH       1 to launch with a kv_both KV connector (default 0)
#   KV_CONNECTOR         Mooncake | Nixl, used when ENABLE_KV_BOTH=1
#   BOOTSTRAP_BASE_PORT  first Mooncake bootstrap port (default 8998)
#   HEALTH_MAX_TRIES     health polls per instance, 2s apart (default 180)

set -euo pipefail

ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
VENV="$ROOT/.venv/bin"
MODEL="${MODEL:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
PROXY_PORT="${PROXY_PORT:-9300}"
BASE_PORT="${BASE_PORT:-8000}"
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:---enable-prompt-tokens-details}"

# GPU_INDICES is a whitespace-separated list by contract, so splitting it
# into an array is intentional.
# shellcheck disable=SC2206
GPU_LIST=($GPU_INDICES)
N_INSTANCES=${#GPU_LIST[@]}
if ((N_INSTANCES == 0)); then
  echo "[isolated] FATAL: GPU_INDICES is empty; need at least one GPU" >&2
  exit 1
fi

# When ENABLE_KV_BOTH=1, vLLM launches with the Mooncake KV connector in
# kv_both role and the proxy is given bootstrap ports. This is required
# for --policy unified_v2 (per-request PD-sep) but disabled by default
# because it adds always-on KV-transfer overhead even when not triggered.
ENABLE_KV_BOTH="${ENABLE_KV_BOTH:-0}"
BOOTSTRAP_BASE_PORT="${BOOTSTRAP_BASE_PORT:-8998}"

USAGE="usage: $0 <policy> <trace> <rundir>"
POLICY="${1:?$USAGE}"
TRACE="${2:?$USAGE}"
RUNDIR="${3:?$USAGE}"

# Auto-enable kv_both when the policy requires it.
# KV_CONNECTOR (Mooncake|Nixl) selects the underlying connector when
# ENABLE_KV_BOTH=1. An explicit KV_CONNECTOR (e.g. =Nixl) is honored for the
# unified_v2/v3/kv_both policies; unified_nixl_both always forces Nixl.
KV_CONNECTOR="${KV_CONNECTOR:-Mooncake}"
case "$POLICY" in
  unified_v2 | unified_v3 | unified_kv_both)
    ENABLE_KV_BOTH=1
    ;;
  unified_nixl_both)
    ENABLE_KV_BOTH=1
    KV_CONNECTOR="Nixl"
    ;;
esac

# Fail fast on a connector name we do not know how to launch. Without this
# check a typo (e.g. "nixl") would silently start vLLM with no KV connector
# at all. Done before the cleanup trap is armed so nothing gets killed.
if [[ "$ENABLE_KV_BOTH" == "1" ]]; then
  case "$KV_CONNECTOR" in
    Mooncake | Nixl) ;;
    *)
      echo "[isolated] FATAL: unsupported KV_CONNECTOR='$KV_CONNECTOR' (expected Mooncake or Nixl)" >&2
      exit 1
      ;;
  esac
fi

mkdir -p "$RUNDIR/engine_state" "$RUNDIR/logs"
echo "[isolated] policy=$POLICY trace=$(basename -- "$TRACE") rundir=$RUNDIR"

# Hard-kill the GPU monitor, proxy and every vLLM process whose command line
# matches the patterns below. NOTE: this is pattern-based, so it also kills
# matching processes that this script did not start.
cleanup() {
  pkill -f gpu_monitor.sh 2>/dev/null || true
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  pkill -9 -f "vllm serve" 2>/dev/null || true
  pkill -9 -f "EngineCore" 2>/dev/null || true
  sleep 3
}
trap cleanup EXIT

# Hard reset first
cleanup

echo "[isolated] launching $N_INSTANCES vLLM on GPUs $GPU_INDICES ENABLE_KV_BOTH=$ENABLE_KV_BOTH KV_CONNECTOR=$KV_CONNECTOR ..."
VLLM_PIDS=()
for ((i = 0; i < N_INSTANCES; i++)); do
  gpu="${GPU_LIST[i]}"
  port=$((BASE_PORT + i))
  master=$((29500 + i))

  # Environment and arguments shared by every launch mode.
  launch_env=(
    "AGENTIC_STEP_LOG_PATH=$RUNDIR/engine_state/engine_${i}.jsonl"
    "AGENTIC_WORKER_ID=engine_${i}"
    "CUDA_VISIBLE_DEVICES=$gpu"
    "MASTER_PORT=$master"
  )
  vllm_args=(
    serve "$MODEL"
    --host 0.0.0.0 --port "$port"
    --tensor-parallel-size 1
    --trust-remote-code --enable-prefix-caching
    --dtype auto --gpu-memory-utilization 0.9
    --max-model-len 200000
  )

  if [[ "$ENABLE_KV_BOTH" == "1" ]]; then
    case "$KV_CONNECTOR" in
      Mooncake)
        launch_env+=(
          "PYTHONHASHSEED=42"
          "VLLM_MOONCAKE_BOOTSTRAP_PORT=$((BOOTSTRAP_BASE_PORT + i))"
        )
        vllm_args+=(--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}')
        ;;
      Nixl)
        # NixlConnector's handshake listener binds to a fixed default
        # port 5600 unless VLLM_NIXL_SIDE_CHANNEL_PORT is overridden.
        # Multiple instances on the same host MUST use distinct ports
        # or only one will start; the rest hit
        # `zmq.error.ZMQError: Address already in use`.
        launch_env+=("VLLM_NIXL_SIDE_CHANNEL_PORT=$((5600 + i))")
        vllm_args+=(--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}')
        ;;
    esac
  fi

  # EXTRA_VLLM_ARGS is deliberately unquoted: it is a list of extra words.
  # shellcheck disable=SC2086
  env "${launch_env[@]}" \
    nohup "$VENV/vllm" "${vllm_args[@]}" $EXTRA_VLLM_ARGS \
    > "$RUNDIR/logs/vllm_inst_${i}_gpu${gpu}.log" 2>&1 &
  VLLM_PIDS+=("$!")
  disown
  sleep 2
done

echo "[isolated] waiting for vLLM health ..."
# NIXL init takes ~100-150s per instance even with concurrent launches;
# Mooncake is closer to ~30-60s. The default of 180 polls at 2s intervals
# gives each instance a generous ~360s. Override via env for slow nodes
# (e.g. HEALTH_MAX_TRIES=300 -> 600s).
HEALTH_MAX_TRIES="${HEALTH_MAX_TRIES:-180}"
for ((i = 0; i < N_INSTANCES; i++)); do
  port=$((BASE_PORT + i))
  inst_log="$RUNDIR/logs/vllm_inst_${i}_gpu${GPU_LIST[i]}.log"
  tries=0
  until curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
    # If the server process is already gone it can never become healthy;
    # bail out now instead of polling for the whole timeout.
    if ! kill -0 "${VLLM_PIDS[i]}" 2>/dev/null; then
      echo "[isolated] FATAL: inst_$i exited before becoming healthy (see $inst_log)"
      tail -30 "$inst_log" || true
      exit 1
    fi
    tries=$((tries + 1))
    if ((tries > HEALTH_MAX_TRIES)); then
      echo "[isolated] FATAL: inst_$i not healthy after $((HEALTH_MAX_TRIES * 2))s"
      exit 1
    fi
    sleep 2
  done
  echo " inst_$i ready"
done

echo "[isolated] launching proxy with --policy $POLICY ..."
proxy_args=(--port "$PROXY_PORT" --combined)
for ((i = 0; i < N_INSTANCES; i++)); do
  proxy_args+=("http://127.0.0.1:$((BASE_PORT + i))")
done
proxy_args+=(--policy "$POLICY")

# Bootstrap ports only needed for Mooncake handshake. Nixl uses its own
# UCX side-channel and the proxy forwards kv_transfer_params from src's
# response body instead of pre-baking engine_id/bootstrap_addr.
if [[ "$ENABLE_KV_BOTH" == "1" ]]; then
  case "$KV_CONNECTOR" in
    Mooncake)
      bp_list=""
      for ((i = 0; i < N_INSTANCES; i++)); do
        # ${bp_list:+,} emits the separator only after the first element.
        bp_list+="${bp_list:+,}$((BOOTSTRAP_BASE_PORT + i))"
      done
      proxy_args+=(--bootstrap-ports "$bp_list")
      ;;
    Nixl)
      proxy_args+=(--connector-type nixl)
      ;;
  esac
fi

# EXTRA_PROXY_ARGS is deliberately unquoted: it is a list of extra words.
# shellcheck disable=SC2086
nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
  "${proxy_args[@]}" \
  ${EXTRA_PROXY_ARGS:-} \
  > "$RUNDIR/proxy.log" 2>&1 &
disown

tries=0
until curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
  tries=$((tries + 1))
  if ((tries > 30)); then
    echo "[isolated] FATAL: proxy did not come up in 60s"
    tail -30 "$RUNDIR/proxy.log"
    exit 1
  fi
  sleep 2
done

# Unified bench infra: record worker->GPU mapping + sample per-GPU util during
# the replay so bench_report.py can emit per-worker GPU util / TPS for every run.
python3 - "$RUNDIR" "$BASE_PORT" "$PROXY_PORT" "$GPU_INDICES" "$N_INSTANCES" <<'PY'
import json, sys
rundir, base_port, proxy_port, gpu_indices, n = sys.argv[1:]
with open(f"{rundir}/bench_config.json", "w") as f:
    json.dump({"base_port": int(base_port), "proxy_port": int(proxy_port),
               "gpu_indices": [int(x) for x in gpu_indices.split()],
               "n_instances": int(n)}, f, indent=2)
PY
bash "$ROOT/scripts/gpu_monitor.sh" "$RUNDIR/gpu_util.csv" 5 >/dev/null 2>&1 &
GPU_MON_PID=$!

t_start=$(date +%s.%N)
echo "[isolated] running replayer ..."
# Full replayer output goes to replayer.log; only the last 3 lines are echoed.
# With `set -o pipefail` a replayer failure aborts the script here.
PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
  --trace "$TRACE" \
  --output "$RUNDIR/metrics.jsonl" \
  --endpoint "http://127.0.0.1:$PROXY_PORT" \
  --model "$MODEL" \
  2>&1 | tee "$RUNDIR/replayer.log" | tail -3
t_end=$(date +%s.%N)
kill "$GPU_MON_PID" 2>/dev/null || true

# Record the wall-clock window of the replay alongside the run metadata.
python3 - "$RUNDIR" "$POLICY" "$TRACE" "$t_start" "$t_end" <<'PY'
import json, sys
rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump({
        "policy": policy,
        "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
        "isolated": True,
    }, f, indent=2)
PY

# Snapshot the proxy's view of the run before the EXIT trap tears it down.
curl -s "http://127.0.0.1:$PROXY_PORT/breakdown" > "$RUNDIR/breakdown.json"
curl -s "http://127.0.0.1:$PROXY_PORT/worker_state" > "$RUNDIR/worker_state.json"
curl -s "http://127.0.0.1:$PROXY_PORT/stats" > "$RUNDIR/stats.json"

echo "[isolated] $POLICY done: $(wc -l < "$RUNDIR/metrics.jsonl") metric rows"