agentic-kvc/scripts/b3_isolated_policy.sh

#!/usr/bin/env bash
# Run a single B3 policy with a cold-start vLLM (clean APC).
#
# Usage:
#   bash scripts/b3_isolated_policy.sh <policy> <trace> <rundir>
#
# Launches one fresh vLLM instance per GPU listed in GPU_INDICES (8 by
# default), captures their engine_state into <rundir>/engine_state/, runs
# the policy through the proxy on <trace>, then kills everything. Unlike
# b3_sweep.sh, which shares one set of vLLM instances across all five
# policies (faster, but with a warm cache), every run here starts cold.

# Strict mode: stop on a failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Repository root: the caller's ROOT if given, else the parent directory of
# the directory holding this script. VENV is the virtualenv's bin directory.
ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
VENV="$ROOT/.venv/bin"

# Environment-overridable settings. `: "${NAME:=default}"` keeps a non-empty
# value inherited from the caller and otherwise assigns the default shown.
: "${MODEL:=$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
: "${PROXY_PORT:=9300}"
: "${BASE_PORT:=8000}"
: "${GPU_INDICES:=0 1 2 3 4 5 6 7}"
: "${EXTRA_VLLM_ARGS:=--enable-prompt-tokens-details}"
: "${BOOTSTRAP_BASE_PORT:=8998}"

# ENABLE_KV_BOTH=1 launches vLLM with a KV connector in the kv_both role and
# hands the proxy its bootstrap ports. --policy unified_v2 (per-request
# PD-sep) needs it; it stays off by default because it adds always-on
# KV-transfer overhead even when no transfer is triggered.
: "${ENABLE_KV_BOTH:=0}"

# Number of instances = number of whitespace-separated entries in GPU_INDICES.
# GPU_INDICES is left unquoted on purpose so that it is split into words.
N_INSTANCES="$(printf '%s ' $GPU_INDICES | wc -w)"

# Required positional arguments: <policy> <trace> <rundir>. If any of them is
# missing or empty, `${N:?...}` aborts the script with the usage text.
usage_text="usage: $0 <policy> <trace> <rundir>"
POLICY="${1:?$usage_text}"
TRACE="${2:?$usage_text}"
RUNDIR="${3:?$usage_text}"

#######################################
# Resolve the KV-transfer setup implied by the selected policy.
#
# KV_CONNECTOR (Mooncake|Nixl) selects the connector used when
# ENABLE_KV_BOTH=1; it defaults to Mooncake when unset or empty.
#   unified_v2 | unified_v3 | unified_kv_both
#       force ENABLE_KV_BOTH=1 and honor whatever KV_CONNECTOR the caller
#       exported (e.g. KV_CONNECTOR=Nixl).
#   unified_nixl_both
#       force ENABLE_KV_BOTH=1 and KV_CONNECTOR=Nixl.
# Any other policy leaves ENABLE_KV_BOTH as it was.
#
# The rest of the script only recognises the exact strings "Mooncake" and
# "Nixl"; any other value would quietly launch vLLM with no KV connector at
# all. That is rejected here instead of surfacing later as a confusing run.
#
# Globals:  POLICY, ENABLE_KV_BOTH, KV_CONNECTOR (read)
#           ENABLE_KV_BOTH, KV_CONNECTOR (written)
# Outputs:  an error message on stderr for an unsupported connector
# Returns:  0 on success, 2 when kv_both is enabled with an unknown connector
#######################################
_resolve_kv_connector() {
    KV_CONNECTOR="${KV_CONNECTOR:-Mooncake}"

    case "${POLICY:-}" in
        unified_v2 | unified_v3 | unified_kv_both)
            ENABLE_KV_BOTH=1
            ;;
        unified_nixl_both)
            ENABLE_KV_BOTH=1
            KV_CONNECTOR="Nixl"
            ;;
    esac

    if [ "${ENABLE_KV_BOTH:-0}" = "1" ]; then
        case "$KV_CONNECTOR" in
            Mooncake | Nixl)
                ;;
            *)
                echo "[isolated] FATAL: unsupported KV_CONNECTOR='$KV_CONNECTOR' (expected Mooncake or Nixl)" >&2
                return 2
                ;;
        esac
    fi
}
_resolve_kv_connector || exit 2

# Create the run directory layout up front: engine_state/ receives the
# per-engine step logs, logs/ receives the per-instance vLLM logs.
mkdir -p "$RUNDIR/engine_state" "$RUNDIR/logs"
# $TRACE is quoted so a path containing spaces reaches basename as a single
# operand; unquoted, the second word would be taken as a suffix to strip and
# the banner would show the wrong name. `--` guards a leading dash.
echo "[isolated] policy=$POLICY trace=$(basename -- "$TRACE") rundir=$RUNDIR"

#######################################
# Tear down every process this script may have started.
# Sends the default signal to anything whose command line matches
# gpu_monitor.sh, then SIGKILLs anything matching cache_aware_proxy,
# "vllm serve" or EngineCore. pkill failures (for example "nothing
# matched") are deliberately ignored, then the function pauses 3 seconds.
# Arguments: none
# Returns:   0
#######################################
cleanup() {
    local target

    pkill -f gpu_monitor.sh 2>/dev/null || true

    for target in cache_aware_proxy "vllm serve" "EngineCore"; do
        pkill -9 -f "$target" 2>/dev/null || true
    done

    # Presumably gives the killed processes time to exit and release their
    # ports/GPU memory before anything is relaunched -- confirm if tuning.
    sleep 3
}
# Always tear everything down on the way out, whatever the exit path.
trap cleanup EXIT

# Hard reset first
cleanup

echo "[isolated] launching $N_INSTANCES vLLM on GPUs $GPU_INDICES ENABLE_KV_BOTH=$ENABLE_KV_BOTH KV_CONNECTOR=$KV_CONNECTOR ..."

# Decide the connector flavour once. Anything other than kv_both with
# Mooncake or Nixl launches plain vLLM without a --kv-transfer-config.
kv_mode="none"
if [ "$ENABLE_KV_BOTH" = "1" ]; then
    case "$KV_CONNECTOR" in
        Mooncake) kv_mode="mooncake" ;;
        Nixl) kv_mode="nixl" ;;
    esac
fi

# One instance per GPU. Instance i listens on BASE_PORT+i, uses
# MASTER_PORT 29500+i, and writes its step log to engine_state/engine_<i>.jsonl.
# The three launch flavours share one command line; only the extra
# environment variables and the --kv-transfer-config flag differ.
i=0
for gpu in $GPU_INDICES; do
    port=$((BASE_PORT + i))
    master=$((29500 + i))

    launch_env=(
        AGENTIC_STEP_LOG_PATH="$RUNDIR/engine_state/engine_${i}.jsonl"
        AGENTIC_WORKER_ID="engine_${i}"
        CUDA_VISIBLE_DEVICES="$gpu"
        MASTER_PORT="$master"
    )
    kv_args=()

    case "$kv_mode" in
        mooncake)
            # PYTHONHASHSEED is pinned only for the Mooncake flavour, exactly
            # as before. NOTE(review): confirm whether the Nixl and plain
            # launches are meant to run without it.
            launch_env+=(
                PYTHONHASHSEED=42
                VLLM_MOONCAKE_BOOTSTRAP_PORT="$((BOOTSTRAP_BASE_PORT + i))"
            )
            kv_args=(--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}')
            ;;
        nixl)
            # NixlConnector's handshake listener binds to a fixed default
            # port 5600 unless VLLM_NIXL_SIDE_CHANNEL_PORT is overridden.
            # Multiple instances on the same host MUST use distinct ports
            # or only one will start; the rest hit
            # `zmq.error.ZMQError: Address already in use`.
            launch_env+=(VLLM_NIXL_SIDE_CHANNEL_PORT="$((5600 + i))")
            kv_args=(--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}')
            ;;
    esac

    # EXTRA_VLLM_ARGS is intentionally unquoted: it is a whitespace-separated
    # list of extra flags. The ${arr[@]+...} form keeps an empty kv_args from
    # tripping `set -u` on older bash releases.
    # shellcheck disable=SC2086
    env "${launch_env[@]}" \
        nohup "$VENV/vllm" serve "$MODEL" \
            --host 0.0.0.0 --port "$port" \
            --tensor-parallel-size 1 \
            --trust-remote-code --enable-prefix-caching \
            --dtype auto --gpu-memory-utilization 0.9 \
            --max-model-len 200000 \
            ${kv_args[@]+"${kv_args[@]}"} \
            $EXTRA_VLLM_ARGS \
            > "$RUNDIR/logs/vllm_inst_${i}_gpu${gpu}.log" 2>&1 &
    disown
    # Stagger the launches by two seconds.
    sleep 2
    i=$((i + 1))
done

echo "[isolated] waiting for vLLM health ..."
# Poll each instance's /health endpoint every 2s, allowing HEALTH_MAX_TRIES
# retries (180 x 2s = 360s) per instance. NIXL init takes ~100-150s per
# instance even with concurrent launches and Mooncake ~30-60s, so 360s
# leaves generous headroom for both.
HEALTH_MAX_TRIES=180
for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    tries=0
    # --max-time bounds every probe, so a socket that accepts the connection
    # but never answers cannot stall the loop beyond its retry budget.
    while ! curl -sf --max-time 5 "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
        tries=$((tries + 1))
        if [ "$tries" -gt "$HEALTH_MAX_TRIES" ]; then
            echo "[isolated] FATAL: inst_$i not healthy after $((HEALTH_MAX_TRIES * 2))s"
            # Show the end of this instance's log, as the proxy wait does for
            # proxy.log. The GPU suffix of the file name is not known here,
            # hence the glob; a missing log must not mask the real failure.
            tail -n 30 "$RUNDIR/logs/vllm_inst_${i}_gpu"*.log 2>/dev/null || true
            exit 1
        fi
        sleep 2
    done
    echo "  inst_$i ready"
done

echo "[isolated] launching proxy with --policy $POLICY ..."
# Backend URLs (one per instance) and the matching comma-separated list of
# Mooncake bootstrap ports, collected in a single pass.
backend_urls=()
bp_list=""
for ((i = 0; i < N_INSTANCES; i++)); do
    backend_urls+=("http://127.0.0.1:$((BASE_PORT + i))")
    bp_list="${bp_list:+$bp_list,}$((BOOTSTRAP_BASE_PORT + i))"
done

# Bootstrap ports only needed for Mooncake handshake. Nixl uses its own
# UCX side-channel and the proxy forwards kv_transfer_params from src's
# response body instead of pre-baking engine_id/bootstrap_addr.
proxy_extra=()
if [ "$ENABLE_KV_BOTH" = "1" ]; then
    case "$KV_CONNECTOR" in
        Mooncake) proxy_extra=(--bootstrap-ports "$bp_list") ;;
        Nixl) proxy_extra=(--connector-type nixl) ;;
    esac
fi

# Arguments are passed as arrays so each one reaches the proxy as exactly one
# word. EXTRA_PROXY_ARGS stays unquoted on purpose: it is a whitespace-
# separated list of extra flags. The ${arr[@]+...} form keeps an empty array
# from tripping `set -u` on older bash releases.
# shellcheck disable=SC2086
nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
    --port "$PROXY_PORT" \
    --combined ${backend_urls[@]+"${backend_urls[@]}"} \
    --policy "$POLICY" \
    ${proxy_extra[@]+"${proxy_extra[@]}"} \
    ${EXTRA_PROXY_ARGS:-} \
    > "$RUNDIR/proxy.log" 2>&1 &
disown

# Wait for the proxy to answer /stats: 30 retries, 2s apart. --max-time keeps
# a probe that connects but never gets a reply from hanging the loop.
tries=0
until curl -sf --max-time 5 "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
    tries=$((tries + 1))
    if [ "$tries" -gt 30 ]; then
        echo "[isolated] FATAL: proxy did not come up in 60s"
        tail -30 "$RUNDIR/proxy.log"
        exit 1
    fi
    sleep 2
done

# Unified bench infra: record worker->GPU mapping + sample per-GPU util during
# the replay so bench_report.py can emit per-worker GPU util / TPS for every run.
#
# bench_config.json holds base_port, proxy_port, the GPU index list and the
# instance count. The file is written inside a `with` block so it is flushed
# and closed explicitly rather than whenever the interpreter shuts down.
python3 - "$RUNDIR" "$BASE_PORT" "$PROXY_PORT" "$GPU_INDICES" "$N_INSTANCES" <<'PY'
import json, sys
rundir, base_port, proxy_port, gpu_indices, n = sys.argv[1:]
config = {"base_port": int(base_port), "proxy_port": int(proxy_port),
          "gpu_indices": [int(x) for x in gpu_indices.split()],
          "n_instances": int(n)}
with open(f"{rundir}/bench_config.json", "w") as f:
    json.dump(config, f, indent=2)
PY
# Start the GPU sampler in the background and remember its PID so it can be
# stopped after the replay. The trailing 5 is presumably the sampling
# interval in seconds -- confirm against gpu_monitor.sh.
bash "$ROOT/scripts/gpu_monitor.sh" "$RUNDIR/gpu_util.csv" 5 >/dev/null 2>&1 &
GPU_MON_PID=$!

# Replay the trace through the proxy, bracketing it with wall-clock
# timestamps (epoch seconds with a nanosecond fraction).
t_start="$(date +%s.%N)"
echo "[isolated] running replayer ..."
replayer_cmd=(
    "$VENV/python" -m replayer
    --trace "$TRACE"
    --output "$RUNDIR/metrics.jsonl"
    --endpoint "http://127.0.0.1:$PROXY_PORT"
    --model "$MODEL"
)
# The complete replayer output (stdout and stderr) is saved to replayer.log;
# only its last three lines are echoed to this script's stdout.
PYTHONPATH="$ROOT" "${replayer_cmd[@]}" 2>&1 \
    | tee "$RUNDIR/replayer.log" \
    | tail -3
t_end="$(date +%s.%N)"
# Stop the GPU sampler; a missing or already-exited PID is not an error.
kill "${GPU_MON_PID:-}" 2>/dev/null || :

# Persist the replay window (policy, trace, start/end timestamps) as
# run_window.json, flagged as an isolated run.
python3 - "$RUNDIR" "$POLICY" "$TRACE" "$t_start" "$t_end" <<'PY'
import json
import sys

rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as out:
    window = {
        "policy": policy,
        "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
        "isolated": True,
    }
    json.dump(window, out, indent=2)
PY

# Snapshot the proxy's /breakdown, /worker_state and /stats responses, each
# into <rundir>/<endpoint>.json.
for endpoint in breakdown worker_state stats; do
    curl -s "http://127.0.0.1:$PROXY_PORT/${endpoint}" > "$RUNDIR/${endpoint}.json"
done
echo "[isolated] $POLICY done: $(wc -l < "$RUNDIR/metrics.jsonl") metric rows"