B3: cold-start isolated policy runner (clean APC per cell)

scripts/b3_isolated_policy.sh wraps one policy run in a fresh 8-instance vLLM lifecycle: hard reset -> launch -> health -> proxy -> replayer -> snapshot artifacts -> cleanup. Use it when cross-policy APC contamination matters more than the ~25-min vLLM warmup overhead per policy. It is the counterpart to the existing b3_sweep.sh, which keeps vLLM warm across all policies (faster, but warm-cache); the sticky pre-flight showed that contamination is < 1% on this trace, so b3_sweep.sh stays the default. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 20:33:44 +08:00
parent 08530b3915
commit 1d87082ca1
1 changed files with 125 additions and 0 deletions
--- a/scripts/b3_isolated_policy.sh
+++ b/scripts/b3_isolated_policy.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+# Run a single B3 policy with a cold-start vLLM (clean APC).
+#
+# Usage:
+#   bash scripts/b3_isolated_policy.sh <policy> <trace> <rundir>
+#
+# Launches 8 fresh vLLM instances, captures their engine_state into
+# <rundir>/engine_state/, runs the policy through the proxy on
+# <trace>, then kills everything. Distinct from b3_sweep.sh which
+# shares one vLLM-set across all five policies (faster but warm-cache).
+
+set -euo pipefail
+
+ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"  # repo checkout; override via env
+VENV="$ROOT/.venv/bin"
+MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
+PROXY_PORT="${PROXY_PORT:-9300}"
+BASE_PORT="${BASE_PORT:-8000}"  # instance i listens on BASE_PORT + i
+GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"  # space-separated; one instance per entry
+EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:---enable-prompt-tokens-details}"
+N_INSTANCES=$(wc -w <<<"$GPU_INDICES")  # quoted here-string: count words without glob expansion
+# All three positional arguments are mandatory; ${N:?msg} aborts with the usage text.
+POLICY="${1:?usage: $0 <policy> <trace> <rundir>}"
+TRACE="${2:?usage: $0 <policy> <trace> <rundir>}"
+RUNDIR="${3:?usage: $0 <policy> <trace> <rundir>}"
+# $TRACE is quoted below so a path with spaces or glob characters survives basename.
+mkdir -p "$RUNDIR/engine_state" "$RUNDIR/logs"
+echo "[isolated] policy=$POLICY trace=$(basename -- "$TRACE") rundir=$RUNDIR"
+
+cleanup() {
+    pkill -9 -f cache_aware_proxy 2>/dev/null || true
+    pkill -9 -f "vllm serve" 2>/dev/null || true
+    pkill -9 -f "EngineCore" 2>/dev/null || true
+    sleep 3
+}
+trap cleanup EXIT
+
+# Hard reset first
+cleanup  # kill any matching proxy / vLLM processes before launching new ones
+
+echo "[isolated] launching $N_INSTANCES vLLM on GPUs $GPU_INDICES ..."
+i=0  # instance index; advanced once per GPU at the bottom of the loop
+for gpu in $GPU_INDICES; do
+    port=$((BASE_PORT + i))  # HTTP port for this instance
+    master=$((29500 + i))  # exported as MASTER_PORT; unique per instance -- presumably avoids port clashes, TODO confirm
+    AGENTIC_STEP_LOG_PATH="$RUNDIR/engine_state/engine_${i}.jsonl" \
+    AGENTIC_WORKER_ID="engine_${i}" \
+    CUDA_VISIBLE_DEVICES=$gpu \
+    MASTER_PORT=$master \
+    nohup "$VENV/vllm" serve "$MODEL" \
+        --host 0.0.0.0 --port "$port" \
+        --tensor-parallel-size 1 \
+        --trust-remote-code --enable-prefix-caching \
+        --dtype auto --gpu-memory-utilization 0.9 \
+        --max-model-len 200000 \
+        $EXTRA_VLLM_ARGS \
+        > "$RUNDIR/logs/vllm_inst_${i}_gpu${gpu}.log" 2>&1 &
+    disown  # drop the job just backgrounded from this shell's job table
+    sleep 2  # stagger successive launches by 2s
+    i=$((i + 1))
+done
+
+echo "[isolated] waiting for vLLM health ..."
+for i in $(seq 0 $((N_INSTANCES - 1))); do
+    port=$((BASE_PORT + i))
+    tries=0
+    while ! curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
+        tries=$((tries + 1))
+        if [ $tries -gt 90 ]; then
+            echo "[isolated] FATAL: inst_$i not healthy after 180s"
+            exit 1
+        fi
+        sleep 2
+    done
+    echo "  inst_$i ready"
+done
+
+echo "[isolated] launching proxy with --policy $POLICY ..."
+combined_args=""
+for i in $(seq 0 $((N_INSTANCES - 1))); do
+    combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
+done
+nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
+    --port "$PROXY_PORT" \
+    --combined $combined_args \
+    --policy "$POLICY" \
+    > "$RUNDIR/proxy.log" 2>&1 &
+disown
+tries=0
+until curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
+    tries=$((tries + 1))
+    if [ $tries -gt 30 ]; then
+        echo "[isolated] FATAL: proxy did not come up in 60s"
+        tail -30 "$RUNDIR/proxy.log"
+        exit 1
+    fi
+    sleep 2
+done
+# Replay the trace through the proxy, bracketing the run with wall-clock timestamps.
+t_start=$(date +%s.%N)
+echo "[isolated] running replayer ..."
+PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
+    --trace "$TRACE" \
+    --output "$RUNDIR/metrics.jsonl" \
+    --endpoint "http://127.0.0.1:$PROXY_PORT" \
+    --model "$MODEL" \
+    2>&1 | tee "$RUNDIR/replayer.log" | tail -n 3
+t_end=$(date +%s.%N)
+# Write run_window.json with the venv interpreter, like every other Python call in this script.
+"$VENV/python" - "$RUNDIR" "$POLICY" "$TRACE" "$t_start" "$t_end" <<'PY'
+import json, sys
+rundir, policy, trace, t_start, t_end = sys.argv[1:]
+with open(f"{rundir}/run_window.json", "w") as f:
+    json.dump({
+        "policy": policy, "trace": trace,
+        "t_start_unix": float(t_start),
+        "t_end_unix": float(t_end),
+        "isolated": True,
+    }, f, indent=2)
+PY
+# Snapshot the proxy's endpoints now; the EXIT trap kills the proxy when the script ends.
+curl -s "http://127.0.0.1:$PROXY_PORT/breakdown" > "$RUNDIR/breakdown.json"
+curl -s "http://127.0.0.1:$PROXY_PORT/worker_state" > "$RUNDIR/worker_state.json"
+curl -s "http://127.0.0.1:$PROXY_PORT/stats" > "$RUNDIR/stats.json"
+echo "[isolated] $POLICY done: $(wc -l < "$RUNDIR/metrics.jsonl") metric rows"