Files
agentic-kvc/scripts/b3_sweep.sh
Gahow Wang 3fdcec9c0f Fix review P2s: lockfile, model path convention, trap robustness
- Regenerate uv.lock after adding fastapi/uvicorn deps so uv sync
  --locked no longer fails
- B3 scripts: default MODEL to $HOME/models/... matching documented
  convention and other launch scripts (repo has no models/ directory)
- launch_elastic_p2p: append || true to each trap command so set -e
  doesn't abort cleanup when jobs -p is empty and EngineCore orphans
  remain
2026-05-26 16:05:43 +08:00

207 lines
7.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# B3 routing sweep: 5 policies on 8x TP1 instances with full instrumentation.
#
# Policies:
# lmetric — cache-aware P_tokens × BS routing (main baseline)
# load_only — pure min-num_requests (B3 control: no cache)
# sticky — hard session affinity (B3 control: perfect locality)
# unified — hybrid affinity + LMetric fallback
# capped — lmetric on a per-session turn-capped trace
#
# Each policy run produces metrics.jsonl + breakdown.json + worker_state.json
# + stats.json + run_window.json (start/end unix timestamps so the analyzer
# can slice the shared engine_*.jsonl by time), plus proxy.log and
# replayer.log.
# Strict mode: abort on command errors, unset variables, and failed pipeline
# stages.
set -euo pipefail

# --- Configuration (every value except VENV can be overridden via env) -----

# Repository root; defaults to the parent of the directory holding this script.
if [[ -z "${ROOT:-}" ]]; then
  ROOT="$(cd "$(dirname "$0")/.." && pwd)"
fi
VENV="${ROOT}/.venv/bin"

: "${MODEL:=$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
: "${TRACE:=$ROOT/traces/w600_r0.0015_st30.jsonl}"

# Timestamped output directory so repeated sweeps never collide.
if [[ -z "${OUTDIR:-}" ]]; then
  OUTDIR="$ROOT/outputs/b3_sweep_$(date +%Y%m%d_%H%M%S)"
fi

: "${PROXY_PORT:=9300}"
: "${BASE_PORT:=8000}"

# Space-separated list of GPU indices to use, one vLLM instance per index.
# Override via GPU_INDICES="1 2 3 4 5 6 7" when GPU 0 holds ghost memory.
: "${GPU_INDICES:=0 1 2 3 4 5 6 7}"
: "${POLICIES:=lmetric load_only sticky unified}"
: "${MAX_TURNS_CAP:=8}"
: "${EXTRA_VLLM_ARGS:=--enable-prompt-tokens-details}"

# Derive N_INSTANCES from GPU_INDICES (one instance per listed index).
N_INSTANCES=$(wc -w <<<"$GPU_INDICES")

mkdir -p "$OUTDIR/engine_state"
mkdir -p "$OUTDIR/logs"
printf '[b3_sweep] OUTDIR=%s\n' "$OUTDIR"
# Force-kill everything the sweep may have started. Installed as the EXIT
# trap below, so it runs on every exit path of the script.
cleanup() {
  local pattern
  # "vllm serve" alone is not enough: vLLM spawns an EngineCore child whose
  # process name is "VLLM::EngineCor" — pkill -f "vllm serve" misses it and
  # leaves the GPU memory locked by a dead-but-tracked-by-driver context.
  for pattern in "vllm serve" "EngineCore" "cache_aware_proxy"; do
    # pkill exits non-zero when nothing matched; that is not an error here.
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
  sleep 3
}
trap cleanup EXIT
# 1) Launch one vLLM per GPU index in GPU_INDICES; each emits engine_<i>.jsonl
#
# Instance i (0-based position in GPU_INDICES) listens on BASE_PORT+i, gets
# MASTER_PORT 29500+i, and logs to $OUTDIR/logs/vllm_inst_<i>_gpu<gpu>.log.
# Once all instances are started, each /health endpoint is polled every 2 s.
#
# Globals read: GPU_INDICES, N_INSTANCES, BASE_PORT, OUTDIR, VENV, MODEL,
#   EXTRA_VLLM_ARGS, and optionally VLLM_HEALTH_TIMEOUT_S (per-instance
#   health budget in seconds; default 180).
# Exits the script with status 1, after printing the last 30 lines of the
# instance log, when either
#   - the launched process has exited without ever answering /health, or
#   - the health budget is exhausted.
launch_vllm() {
  echo "[b3_sweep] launching $N_INSTANCES vLLM instances on GPUs $GPU_INDICES ..."
  local -a pids=() logs=()
  local i=0 gpu port master log
  # Unquoted on purpose: GPU_INDICES is a whitespace-separated list.
  for gpu in $GPU_INDICES; do
    port=$((BASE_PORT + i))
    master=$((29500 + i))
    log="$OUTDIR/logs/vllm_inst_${i}_gpu${gpu}.log"
    # EXTRA_VLLM_ARGS is unquoted so it splits into individual flags.
    AGENTIC_STEP_LOG_PATH="$OUTDIR/engine_state/engine_${i}.jsonl" \
    AGENTIC_WORKER_ID="engine_${i}" \
    CUDA_VISIBLE_DEVICES=$gpu \
    MASTER_PORT=$master \
      nohup "$VENV/vllm" serve "$MODEL" \
        --host 0.0.0.0 --port "$port" \
        --tensor-parallel-size 1 \
        --trust-remote-code --enable-prefix-caching \
        --dtype auto --gpu-memory-utilization 0.9 \
        --max-model-len 200000 \
        $EXTRA_VLLM_ARGS \
        > "$log" 2>&1 &
    # Remember PID and log path so the health wait can detect an early crash
    # and show the relevant log.
    pids[i]=$!
    logs[i]=$log
    disown
    sleep 2
    i=$((i + 1))
  done

  local timeout_s="${VLLM_HEALTH_TIMEOUT_S:-180}"
  local max_tries=$((timeout_s / 2)) # one probe every 2 s
  local tries pid failure
  echo "[b3_sweep] waiting for vLLM health ..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    pid="${pids[i]:-}"
    log="${logs[i]:-}"
    tries=0
    until curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
      tries=$((tries + 1))
      failure=""
      # Fail fast instead of polling a dead process for the whole budget.
      # NOTE(review): assumes `vllm serve` keeps running under the PID we
      # launched (i.e. it does not daemonize) — confirm if the CLI changes.
      if [[ -n "$pid" ]] && ! kill -0 "$pid" 2>/dev/null; then
        failure="exited before becoming healthy"
      elif ((tries > max_tries)); then
        failure="not healthy after ${timeout_s}s"
      fi
      if [[ -n "$failure" ]]; then
        echo "[b3_sweep] FATAL: inst_$i (port $port) $failure"
        # Same diagnostics launch_proxy gives for a proxy that fails to start.
        if [[ -n "$log" ]]; then
          tail -30 "$log" || true
        fi
        exit 1
      fi
      sleep 2
    done
    echo " inst_$i ready"
  done
}
# Start cache_aware_proxy.py in the background and block until it answers.
#
# Arguments:
#   $1 - routing policy passed to the proxy's --policy flag
#   $2 - file receiving the proxy's stdout and stderr
# Globals read: N_INSTANCES, BASE_PORT, PROXY_PORT, VENV, ROOT
# The proxy is given one backend URL per vLLM instance (ports BASE_PORT ..
# BASE_PORT+N_INSTANCES-1). Its /stats endpoint is probed every 2 s; after
# more than 30 failed probes the tail of the log is printed and the script
# exits with status 1.
launch_proxy() {
  local policy="$1"
  local logfile="$2"
  local backends=""
  local idx
  for ((idx = 0; idx < N_INSTANCES; idx++)); do
    backends+=" http://127.0.0.1:$((BASE_PORT + idx))"
  done

  # $backends is expanded unquoted on purpose: each URL must become its own
  # argument after --combined.
  nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
    --port "$PROXY_PORT" \
    --combined $backends \
    --policy "$policy" \
    > "$logfile" 2>&1 &
  disown

  local attempts=0
  while ! curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
    attempts=$((attempts + 1))
    if ((attempts > 30)); then
      echo "[b3_sweep] FATAL: proxy did not come up in 60s"
      tail -30 "$logfile"
      exit 1
    fi
    sleep 2
  done
}
# Replay one trace through a freshly started proxy and collect its artifacts.
#
# Arguments:
#   $1 - policy name; used both as the proxy's --policy value and as the
#        name of the result directory under $OUTDIR
#   $2 - path of the trace to replay
# Globals read: OUTDIR, PROXY_PORT, ROOT, VENV, MODEL
# Files written to $OUTDIR/<policy>/: proxy.log, replayer.log, metrics.jsonl,
# run_window.json, breakdown.json, worker_state.json, stats.json.
# run_window.json.partial exists only while the replay is in flight, so a
# leftover .partial file marks a run that did not finish.
run_policy() {
  local policy="$1"
  local trace="$2"
  local out_dir="$OUTDIR/$policy"
  local proxy_url="http://127.0.0.1:$PROXY_PORT"
  local started_at finished_at endpoint

  mkdir -p "$out_dir"
  echo "[b3_sweep] === policy=$policy trace=$(basename "$trace") ==="

  # Any proxy left from a previous policy is killed before starting a new one.
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  sleep 2
  launch_proxy "$policy" "$out_dir/proxy.log"

  started_at=$(date +%s.%N)
  printf '{"policy": "%s", "trace": "%s", "t_start_unix": %s}\n' \
    "$policy" "$trace" "$started_at" \
    > "$out_dir/run_window.json.partial"

  # Full replayer output goes to replayer.log; only its last 3 lines are
  # echoed to the console.
  PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
    --trace "$trace" \
    --output "$out_dir/metrics.jsonl" \
    --endpoint "$proxy_url" \
    --model "$MODEL" \
    2>&1 | tee "$out_dir/replayer.log" | tail -3

  finished_at=$(date +%s.%N)
  # Write the final window file with Python so the JSON is properly encoded.
  python3 - "$out_dir" "$policy" "$trace" "$started_at" "$finished_at" <<'PY'
import json
import sys

out_dir, policy, trace, t_start, t_end = sys.argv[1:]
window = {
    "policy": policy,
    "trace": trace,
    "t_start_unix": float(t_start),
    "t_end_unix": float(t_end),
}
with open(f"{out_dir}/run_window.json", "w") as fh:
    json.dump(window, fh, indent=2)
PY
  rm -f "$out_dir/run_window.json.partial"

  # Snapshot the proxy's counters before the next policy kills it.
  for endpoint in breakdown worker_state stats; do
    curl -s "$proxy_url/$endpoint" > "$out_dir/$endpoint.json"
  done
  echo "[b3_sweep] $policy done: $(wc -l < "$out_dir/metrics.jsonl") metric rows"
}
# 2) Run each policy
launch_vllm
for policy in $POLICIES; do
  # "capped" is a result-directory label, not a proxy policy: run_policy
  # would start the proxy with --policy capped, which the proxy's argparse
  # rejects (see stage 3), and the sweep would die after the 60 s proxy wait.
  # Stage 3 always runs the capped variant, so skip it here.
  if [[ "$policy" == "capped" ]]; then
    echo "[b3_sweep] skipping 'capped' in POLICIES: it is not a proxy policy and always runs as stage 3"
    continue
  fi
  run_policy "$policy" "$TRACE"
done

# 3) Capped variant: lmetric picker on a per-session turn-capped trace.
# The directory label is "capped" but the proxy must launch with
# --policy lmetric (the proxy's argparse has no "capped" choice).
echo "[b3_sweep] building capped trace (max_turns=$MAX_TURNS_CAP) ..."
CAPPED_TRACE="$OUTDIR/capped/trace.jsonl"
mkdir -p "$OUTDIR/capped"
"$VENV/python" "$ROOT/scripts/build_capped_trace.py" \
  --input "$TRACE" \
  --output "$CAPPED_TRACE" \
  --max-turns "$MAX_TURNS_CAP" | tee "$OUTDIR/capped/build.log"

# Inline equivalent of run_policy "capped" but using --policy lmetric.
echo "[b3_sweep] === policy=capped (picker=lmetric) trace=$(basename "$CAPPED_TRACE") ==="
pkill -9 -f cache_aware_proxy 2>/dev/null || true
sleep 2
launch_proxy lmetric "$OUTDIR/capped/proxy.log"

t_cap_start=$(date +%s.%N)
# Full replayer output goes to replayer.log; only the last 3 lines are shown.
PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
  --trace "$CAPPED_TRACE" \
  --output "$OUTDIR/capped/metrics.jsonl" \
  --endpoint "http://127.0.0.1:$PROXY_PORT" \
  --model "$MODEL" \
  2>&1 | tee "$OUTDIR/capped/replayer.log" | tail -3
t_cap_end=$(date +%s.%N)

# Record the run window under the "capped" label (not "lmetric").
python3 - "$OUTDIR/capped" capped "$CAPPED_TRACE" "$t_cap_start" "$t_cap_end" <<'PY'
import json, sys
rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump({
        "policy": policy, "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
    }, f, indent=2)
PY

curl -s "http://127.0.0.1:$PROXY_PORT/breakdown" > "$OUTDIR/capped/breakdown.json"
curl -s "http://127.0.0.1:$PROXY_PORT/worker_state" > "$OUTDIR/capped/worker_state.json"
curl -s "http://127.0.0.1:$PROXY_PORT/stats" > "$OUTDIR/capped/stats.json"
echo "[b3_sweep] capped done: $(wc -l < "$OUTDIR/capped/metrics.jsonl") metric rows"

# 4) Snapshot final engine state file sizes for the analyzer
ls -l "$OUTDIR/engine_state/" > "$OUTDIR/engine_state_files.txt"
echo "[b3_sweep] sweep complete. OUTDIR=$OUTDIR"