agentic-kvc/scripts/b3_sweep.sh

#!/usr/bin/env bash
# B3 routing sweep: 5 policies on 8x TP1 instances with full instrumentation.
#
# Policies:
#   lmetric    — cache-aware P_tokens × BS routing (main baseline)
#   load_only  — pure min-num_requests (B3 control: no cache)
#   sticky     — hard session affinity (B3 control: perfect locality)
#   unified    — hybrid affinity + LMetric fallback
#   capped     — lmetric on a per-session turn-capped trace
#
# Each policy run produces metrics.jsonl + breakdown.json + worker_state.json
# + stats.json + run_window.json (start/end unix timestamps so the analyzer
# can slice the shared engine_*.jsonl by time), plus proxy.log and
# replayer.log.

set -euo pipefail

# --- Configuration -----------------------------------------------------------
# Every setting except VENV can be overridden from the environment. Because the
# defaults use ${VAR:-default}, exporting a variable as the empty string also
# selects the default.

# ROOT defaults to the parent of the directory that contains this script.
ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
VENV="$ROOT/.venv/bin"
MODEL="${MODEL:-$ROOT/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-$ROOT/traces/w600_r0.0015_st30.jsonl}"
OUTDIR="${OUTDIR:-$ROOT/outputs/b3_sweep_$(date +%Y%m%d_%H%M%S)}"
# Port the routing proxy listens on.
PROXY_PORT="${PROXY_PORT:-9300}"
# vLLM instance i listens on BASE_PORT + i.
BASE_PORT="${BASE_PORT:-8000}"
# Space-separated list of GPU indices to use, one vLLM instance per index.
# Override via GPU_INDICES="1 2 3 4 5 6 7" when GPU 0 holds ghost memory.
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
POLICIES="${POLICIES:-lmetric load_only sticky unified}"
# Passed as --max-turns when building the capped trace.
MAX_TURNS_CAP="${MAX_TURNS_CAP:-8}"
# Appended unquoted to every `vllm serve` command line, so it is word-split.
EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:---enable-prompt-tokens-details}"

# Derive N_INSTANCES from GPU_INDICES using the same word-splitting the launch
# loop applies, instead of piping through `wc -w` (whose output is padded with
# spaces on some platforms).
# shellcheck disable=SC2206  # word-splitting of the index list is intentional
GPU_INDEX_LIST=($GPU_INDICES)
N_INSTANCES=${#GPU_INDEX_LIST[@]}

# A whitespace-only GPU_INDICES would otherwise start zero instances and only
# surface later as a proxy start-up timeout.
if (( N_INSTANCES == 0 )); then
    echo "[b3_sweep] FATAL: GPU_INDICES contains no GPU index"
    exit 1
fi

mkdir -p "$OUTDIR/engine_state" "$OUTDIR/logs"
echo "[b3_sweep] OUTDIR=$OUTDIR"

# Stop every process this sweep may have started. Installed as the EXIT trap
# below, so it runs on every exit path of the script.
#
# Processes are matched by command line (pkill -f), not by PID, so anything on
# the machine matching these patterns is signalled.
cleanup() {
    # vLLM spawns an EngineCore child whose process name is
    # "VLLM::EngineCor" — pkill -f "vllm serve" misses it and leaves
    # the GPU memory locked by a dead-but-tracked-by-driver context,
    # hence the separate "EngineCore" pattern.
    local -a patterns=("vllm serve" "EngineCore" "cache_aware_proxy")
    local pat
    local signalled=0

    # Ask politely first so processes get a chance to shut down on their own.
    # pkill exits non-zero when nothing matched; that is not an error here.
    for pat in "${patterns[@]}"; do
        if pkill -TERM -f "$pat" 2>/dev/null; then
            signalled=1
        fi
    done

    # Only pay for the grace period when something was actually signalled.
    if (( signalled )); then
        sleep 3
    fi

    # Force-kill whatever is still around (best effort, errors ignored).
    for pat in "${patterns[@]}"; do
        pkill -9 -f "$pat" 2>/dev/null || true
    done
    sleep 3
}
trap cleanup EXIT

# 1) Launch one vLLM per GPU index in GPU_INDICES; each emits engine_<i>.jsonl
#
# Instance i (0-based position in GPU_INDICES) listens on BASE_PORT + i, gets
# MASTER_PORT 29500 + i, and logs to $OUTDIR/logs/vllm_inst_<i>_gpu<gpu>.log.
# After all instances are started, each one's /health endpoint is polled every
# 2s. The script exits with status 1 (after printing the tail of the instance
# log) if an instance process disappears or is still unhealthy after 180s.
#
# Globals read: N_INSTANCES, GPU_INDICES, BASE_PORT, OUTDIR, VENV, MODEL,
#               EXTRA_VLLM_ARGS
launch_vllm() {
    echo "[b3_sweep] launching $N_INSTANCES vLLM instances on GPUs $GPU_INDICES ..."
    local -a pids=() logs=()
    local i=0 gpu port master log
    for gpu in $GPU_INDICES; do
        port=$((BASE_PORT + i))
        master=$((29500 + i))
        log="$OUTDIR/logs/vllm_inst_${i}_gpu${gpu}.log"
        # EXTRA_VLLM_ARGS is expanded unquoted on purpose so that it can carry
        # several flags.
        # shellcheck disable=SC2086
        AGENTIC_STEP_LOG_PATH="$OUTDIR/engine_state/engine_${i}.jsonl" \
        AGENTIC_WORKER_ID="engine_${i}" \
        CUDA_VISIBLE_DEVICES="$gpu" \
        MASTER_PORT="$master" \
        nohup "$VENV/vllm" serve "$MODEL" \
            --host 0.0.0.0 --port "$port" \
            --tensor-parallel-size 1 \
            --trust-remote-code --enable-prefix-caching \
            --dtype auto --gpu-memory-utilization 0.9 \
            --max-model-len 200000 \
            $EXTRA_VLLM_ARGS \
            > "$log" 2>&1 &
        pids+=("$!")
        logs+=("$log")
        # If the child already exited (e.g. missing binary) the job may be gone
        # and disown fails; the liveness check below reports that case instead
        # of letting set -e abort here without a message.
        disown 2>/dev/null || true
        sleep 2
        i=$((i + 1))
    done

    echo "[b3_sweep] waiting for vLLM health ..."
    local tries
    for ((i = 0; i < ${#pids[@]}; i++)); do
        port=$((BASE_PORT + i))
        tries=0
        while ! curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
            # Fail fast when the server process is gone rather than polling a
            # dead port for the full timeout.
            if ! kill -0 "${pids[i]}" 2>/dev/null; then
                echo "[b3_sweep] FATAL: inst_$i (port $port) exited before becoming healthy"
                tail -30 "${logs[i]}" || true
                exit 1
            fi
            tries=$((tries + 1))
            if [ "$tries" -gt 90 ]; then
                echo "[b3_sweep] FATAL: inst_$i (port $port) not healthy after 180s"
                tail -30 "${logs[i]}" || true
                exit 1
            fi
            sleep 2
        done
        echo "  inst_$i ready"
    done
}

# Start cache_aware_proxy.py in the background and wait until it answers /stats.
#
# Arguments:
#   $1  value for the proxy's --policy flag
#   $2  file that receives the proxy's stdout and stderr
# Globals read: VENV, ROOT, PROXY_PORT, BASE_PORT, N_INSTANCES
#
# The proxy is given one backend URL per vLLM instance
# (http://127.0.0.1:BASE_PORT+i). /stats is polled every 2s; the script exits
# with status 1 (after printing the tail of the log) if the proxy process
# disappears or has not answered after 60s.
launch_proxy() {
    local policy="$1"
    local logfile="$2"

    # Build the command line as an array rather than by string concatenation.
    local -a cmd=(
        "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py"
        --port "$PROXY_PORT"
        --combined
    )
    local i
    for ((i = 0; i < N_INSTANCES; i++)); do
        cmd+=("http://127.0.0.1:$((BASE_PORT + i))")
    done
    cmd+=(--policy "$policy")

    nohup "${cmd[@]}" > "$logfile" 2>&1 &
    local pid=$!
    # The job may already be gone if the proxy died instantly; the liveness
    # check below reports that case, so a failing disown is not fatal.
    disown 2>/dev/null || true

    local tries=0
    until curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
        # Fail fast when the proxy process has exited (e.g. rejected arguments).
        if ! kill -0 "$pid" 2>/dev/null; then
            echo "[b3_sweep] FATAL: proxy exited before serving /stats"
            tail -30 "$logfile" || true
            exit 1
        fi
        tries=$((tries + 1))
        if [ "$tries" -gt 30 ]; then
            echo "[b3_sweep] FATAL: proxy did not come up in 60s"
            tail -30 "$logfile" || true
            exit 1
        fi
        sleep 2
    done
}

# Replay one trace through a freshly started proxy and collect the artifacts.
#
# Arguments:
#   $1  label  — output subdirectory under $OUTDIR, and the "policy" value
#                recorded in run_window.json
#   $2  trace  — trace file handed to the replayer
#   $3  picker — optional; value for the proxy's --policy flag. Defaults to $1.
#                Lets a run be labelled differently from the picker it uses
#                (the "capped" run uses the lmetric picker).
# Globals read: OUTDIR, ROOT, VENV, MODEL, PROXY_PORT
#
# Files written under $OUTDIR/<label>/: proxy.log, metrics.jsonl, replayer.log,
# run_window.json, breakdown.json, worker_state.json, stats.json.
# run_window.json.partial exists only while the replay is in flight.
run_policy() {
    local policy="$1"
    local trace="$2"
    local picker="${3:-$policy}"
    local rundir="$OUTDIR/$policy"
    mkdir -p "$rundir"
    if [[ "$picker" == "$policy" ]]; then
        echo "[b3_sweep] === policy=$policy trace=$(basename "$trace") ==="
    else
        echo "[b3_sweep] === policy=$policy (picker=$picker) trace=$(basename "$trace") ==="
    fi

    # Each run gets its own proxy process: kill any previous one first.
    pkill -9 -f cache_aware_proxy 2>/dev/null || true
    sleep 2
    launch_proxy "$picker" "$rundir/proxy.log"

    local t_start
    t_start=$(date +%s.%N)
    # Marker with the start time only; replaced by run_window.json on success.
    echo "{\"policy\": \"$policy\", \"trace\": \"$trace\", \"t_start_unix\": $t_start}" \
        > "$rundir/run_window.json.partial"

    # With pipefail + set -e a failing replayer aborts the sweep here, leaving
    # the .partial marker behind.
    PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
        --trace "$trace" \
        --output "$rundir/metrics.jsonl" \
        --endpoint "http://127.0.0.1:$PROXY_PORT" \
        --model "$MODEL" \
        2>&1 | tee "$rundir/replayer.log" | tail -3

    local t_end
    t_end=$(date +%s.%N)
    # Use the venv interpreter (already required for the proxy and replayer)
    # instead of depending on a system-wide python3. The values are passed as
    # argv so the JSON encoder handles quoting of the trace path.
    "$VENV/python" - "$rundir" "$policy" "$trace" "$t_start" "$t_end" <<'PY'
import json, sys
rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump({
        "policy": policy, "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
    }, f, indent=2)
PY
    rm -f "$rundir/run_window.json.partial"

    # Snapshot the proxy's counters before the next run replaces the process.
    local endpoint
    for endpoint in breakdown worker_state stats; do
        curl -s "http://127.0.0.1:$PROXY_PORT/$endpoint" > "$rundir/$endpoint.json"
    done
    echo "[b3_sweep] $policy done: $(wc -l < "$rundir/metrics.jsonl") metric rows"
}

# 2) Run each policy
launch_vllm

for policy in $POLICIES; do
    run_policy "$policy" "$TRACE"
done

# 3) Capped variant: lmetric picker on a per-session turn-capped trace.
# The directory label is "capped" but the proxy must launch with
# --policy lmetric (the proxy's argparse has no "capped" choice), so the
# picker is passed explicitly as run_policy's third argument.
echo "[b3_sweep] building capped trace (max_turns=$MAX_TURNS_CAP) ..."
CAPPED_TRACE="$OUTDIR/capped/trace.jsonl"
mkdir -p "$OUTDIR/capped"
"$VENV/python" "$ROOT/scripts/build_capped_trace.py" \
    --input "$TRACE" \
    --output "$CAPPED_TRACE" \
    --max-turns "$MAX_TURNS_CAP" | tee "$OUTDIR/capped/build.log"

run_policy capped "$CAPPED_TRACE" lmetric

# 4) Snapshot final engine state file sizes for the analyzer
ls -l "$OUTDIR/engine_state/" > "$OUTDIR/engine_state_files.txt"

echo "[b3_sweep] sweep complete. OUTDIR=$OUTDIR"