#!/usr/bin/env bash
# B3 routing sweep: 5 policies on 8x TP1 instances with full instrumentation.
#
# Policies:
#   lmetric   — cache-aware P_tokens × BS routing (main baseline)
#   load_only — pure min-num_requests (B3 control: no cache)
#   sticky    — hard session affinity (B3 control: perfect locality)
#   unified   — hybrid affinity + LMetric fallback
#   capped    — lmetric on a per-session turn-capped trace
#
# Each policy run produces metrics.jsonl + breakdown.json + worker_state.json
# + stats.json + run_window.json (start/end unix timestamps so the analyzer
# can slice the shared engine_*.jsonl by time).
#
# Optional environment overrides: ROOT, MODEL, TRACE, OUTDIR, PROXY_PORT,
# BASE_PORT, GPU_INDICES, POLICIES, MAX_TURNS_CAP, EXTRA_VLLM_ARGS.
set -euo pipefail

ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
VENV="$ROOT/.venv/bin"
MODEL="${MODEL:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-$ROOT/traces/w600_r0.0015_st30.jsonl}"
OUTDIR="${OUTDIR:-$ROOT/outputs/b3_sweep_$(date +%Y%m%d_%H%M%S)}"
PROXY_PORT="${PROXY_PORT:-9300}"
BASE_PORT="${BASE_PORT:-8000}"
# Space-separated list of GPU indices to use, one vLLM instance per index.
# Override via GPU_INDICES="1 2 3 4 5 6 7" when GPU 0 holds ghost memory.
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
POLICIES="${POLICIES:-lmetric load_only sticky unified}"
MAX_TURNS_CAP="${MAX_TURNS_CAP:-8}"
EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:---enable-prompt-tokens-details}"

# Print a FATAL diagnostic on stderr and abort. The EXIT trap (once installed)
# still runs, so already-launched servers are torn down.
die() {
  printf '[b3_sweep] FATAL: %s\n' "$*" >&2
  exit 1
}

# Split the whitespace-separated settings into arrays exactly once, so later
# expansions can be quoted (no accidental globbing). `read -d ''` consumes the
# whole value (newlines included) and returns non-zero at EOF, hence `|| true`.
read -r -d '' -a GPU_LIST <<< "$GPU_INDICES" || true
read -r -d '' -a POLICY_LIST <<< "$POLICIES" || true
read -r -d '' -a VLLM_EXTRA <<< "$EXTRA_VLLM_ARGS" || true

# Derive N_INSTANCES from GPU_INDICES
N_INSTANCES=${#GPU_LIST[@]}

# Fail before anything is launched rather than after the servers are up.
if (( N_INSTANCES == 0 )); then
  die "GPU_INDICES is empty; need at least one GPU index"
fi
if [[ ! -f "$TRACE" ]]; then
  die "trace not found: $TRACE"
fi

mkdir -p "$OUTDIR/engine_state" "$OUTDIR/logs"
echo "[b3_sweep] OUTDIR=$OUTDIR"

# Kill every server this sweep may have started. Runs on any exit path.
# NOTE: the patterns match on the full command line, so any other
# "vllm serve" / EngineCore / cache_aware_proxy process this user is allowed
# to signal is killed as well.
cleanup() {
  pkill -9 -f "vllm serve" 2>/dev/null || true
  # vLLM spawns an EngineCore child whose process name is
  # "VLLM::EngineCor" — pkill -f "vllm serve" misses it and leaves
  # the GPU memory locked by a dead-but-tracked-by-driver context.
  pkill -9 -f "EngineCore" 2>/dev/null || true
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  sleep 3
}
trap cleanup EXIT

# 1) Launch one vLLM per GPU index in GPU_INDICES; each emits engine_<i>.jsonl
#
# Instance i listens on BASE_PORT+i, uses MASTER_PORT 29500+i and logs to
# $OUTDIR/logs/vllm_inst_<i>_gpu<gpu>.log. Blocks until every instance answers
# /health; aborts if an instance exits early or is not healthy after ~180s.
launch_vllm() {
  echo "[b3_sweep] launching $N_INSTANCES vLLM instances on GPUs $GPU_INDICES ..."
  local -a pids=() logs=()
  local idx gpu port log tries
  for idx in "${!GPU_LIST[@]}"; do
    gpu=${GPU_LIST[idx]}
    port=$((BASE_PORT + idx))
    log="$OUTDIR/logs/vllm_inst_${idx}_gpu${gpu}.log"
    # ${arr[@]+...} keeps `set -u` quiet on bash < 4.4 when the array is empty.
    AGENTIC_STEP_LOG_PATH="$OUTDIR/engine_state/engine_${idx}.jsonl" \
    AGENTIC_WORKER_ID="engine_${idx}" \
    CUDA_VISIBLE_DEVICES="$gpu" \
    MASTER_PORT=$((29500 + idx)) \
      nohup "$VENV/vllm" serve "$MODEL" \
        --host 0.0.0.0 --port "$port" \
        --tensor-parallel-size 1 \
        --trust-remote-code --enable-prefix-caching \
        --dtype auto --gpu-memory-utilization 0.9 \
        --max-model-len 200000 \
        ${VLLM_EXTRA[@]+"${VLLM_EXTRA[@]}"} \
        > "$log" 2>&1 &
    pids+=("$!")
    logs+=("$log")
    disown
    sleep 2
  done

  echo "[b3_sweep] waiting for vLLM health ..."
  for idx in "${!GPU_LIST[@]}"; do
    port=$((BASE_PORT + idx))
    tries=0
    while ! curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
      # Fail fast: a server that already exited will never become healthy,
      # so do not burn the whole timeout polling it.
      if ! kill -0 "${pids[idx]}" 2>/dev/null; then
        tail -30 "${logs[idx]}" >&2 || true
        die "inst_$idx (port $port) exited during startup; see ${logs[idx]}"
      fi
      tries=$((tries + 1))
      if (( tries > 90 )); then
        die "inst_$idx (port $port) not healthy after 180s"
      fi
      sleep 2
    done
    echo " inst_$idx ready"
  done
}

# Start the routing proxy in front of all instances and wait for /stats.
# Arguments: $1 - value passed to the proxy's --policy
#            $2 - file receiving the proxy's stdout/stderr
# Aborts (after dumping the log tail) if the proxy is not up within ~60s.
launch_proxy() {
  local policy="$1"
  local logfile="$2"
  local -a backends=()
  local idx
  for ((idx = 0; idx < N_INSTANCES; idx++)); do
    backends+=("http://127.0.0.1:$((BASE_PORT + idx))")
  done
  nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
    --port "$PROXY_PORT" \
    --combined "${backends[@]}" \
    --policy "$policy" \
    > "$logfile" 2>&1 &
  disown
  local tries=0
  until curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
    tries=$((tries + 1))
    if (( tries > 30 )); then
      tail -30 "$logfile" >&2 || true
      die "proxy did not come up in 60s"
    fi
    sleep 2
  done
}

# Kill any running proxy and give it a moment to release the port.
stop_proxy() {
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  sleep 2
}

# Write the final run_window.json for one run.
# Arguments: $1 run dir, $2 policy label, $3 trace path, $4 start, $5 end
#            (start/end are unix timestamps as printed by `date +%s.%N`).
write_run_window() {
  # Use the venv interpreter like every other Python call in this script.
  "$VENV/python" - "$@" <<'PY'
import json, sys
rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump({
        "policy": policy,
        "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
    }, f, indent=2)
PY
}

# Save the proxy's /breakdown, /worker_state and /stats responses as
# <endpoint>.json inside the run directory given as $1.
snapshot_proxy_state() {
  local rundir="$1"
  local endpoint
  for endpoint in breakdown worker_state stats; do
    curl -s "http://127.0.0.1:$PROXY_PORT/$endpoint" > "$rundir/$endpoint.json"
  done
}

# Replay one trace through a freshly started proxy and collect its artifacts
# under $OUTDIR/<label>.
# Arguments: $1 - run label; names the output directory and the "policy"
#                 field of run_window.json
#            $2 - trace file to replay
#            $3 - optional value for the proxy's --policy when it differs
#                 from the label (defaults to $1)
# run_window.json.partial is written before the replay and removed only after
# run_window.json exists, so a leftover .partial marks an interrupted run.
run_policy() {
  local policy="$1"
  local trace="$2"
  local picker="${3:-$policy}"
  local rundir="$OUTDIR/$policy"
  mkdir -p "$rundir"
  if [[ "$picker" == "$policy" ]]; then
    echo "[b3_sweep] === policy=$policy trace=$(basename "$trace") ==="
  else
    echo "[b3_sweep] === policy=$policy (picker=$picker) trace=$(basename "$trace") ==="
  fi

  stop_proxy
  launch_proxy "$picker" "$rundir/proxy.log"

  local t_start t_end
  t_start=$(date +%s.%N)
  echo "{\"policy\": \"$policy\", \"trace\": \"$trace\", \"t_start_unix\": $t_start}" \
    > "$rundir/run_window.json.partial"

  # pipefail + set -e: a failing replayer aborts the sweep here.
  PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
    --trace "$trace" \
    --output "$rundir/metrics.jsonl" \
    --endpoint "http://127.0.0.1:$PROXY_PORT" \
    --model "$MODEL" \
    2>&1 | tee "$rundir/replayer.log" | tail -3
  t_end=$(date +%s.%N)

  write_run_window "$rundir" "$policy" "$trace" "$t_start" "$t_end"
  rm -f "$rundir/run_window.json.partial"

  snapshot_proxy_state "$rundir"
  echo "[b3_sweep] $policy done: $(wc -l < "$rundir/metrics.jsonl") metric rows"
}

# 2) Run each policy
launch_vllm
for policy in ${POLICY_LIST[@]+"${POLICY_LIST[@]}"}; do
  run_policy "$policy" "$TRACE"
done

# 3) Capped variant: lmetric picker on a per-session turn-capped trace.
#    The directory label is "capped" but the proxy must launch with
#    --policy lmetric (the proxy's argparse has no "capped" choice).
echo "[b3_sweep] building capped trace (max_turns=$MAX_TURNS_CAP) ..."
CAPPED_TRACE="$OUTDIR/capped/trace.jsonl"
mkdir -p "$OUTDIR/capped"
"$VENV/python" "$ROOT/scripts/build_capped_trace.py" \
  --input "$TRACE" \
  --output "$CAPPED_TRACE" \
  --max-turns "$MAX_TURNS_CAP" | tee "$OUTDIR/capped/build.log"

run_policy capped "$CAPPED_TRACE" lmetric

# 4) Snapshot final engine state file sizes for the analyzer
ls -l "$OUTDIR/engine_state/" > "$OUTDIR/engine_state_files.txt"
echo "[b3_sweep] sweep complete. OUTDIR=$OUTDIR"