Files
agentic-kvc/scripts/b3_sweep.sh
Gahow Wang 0e82612100 Fix B3 analysis bugs from subagent audit (median + percentile + sweep)
Three fixes from the B3 audit:

1) joined_analysis.hotspot_index used sorted[n//2] as median, which
   returns the ~60th percentile for n=8 (even-length). Systematically
   under-states the hotspot index. Recomputed values:
       lmetric   2.238 -> 2.253  (+0.7%)
       load_only 1.140 -> 1.294  (+13.5%)
       sticky    2.349 -> 2.728  (+16.1%)
       unified   3.350 -> 3.667  (+9.5%)
       capped    1.937 -> 2.020  (+4.3%)
   Qualitative ranking preserved; "capped only modestly reduces hotspot"
   story holds with ~10% drop instead of the previously reported 13%.
   Added test_hotspot_index_uses_true_median_for_even_n to lock in the
   fix.

2) b3_analyze.sh's pct() helper used floor-indexed percentile
   sorted[int(p*(n-1))], inconsistent with metrics._percentile and
   joined_analysis._percentile which both use linear interpolation.
   Now matches.

3) b3_sweep.sh's capped step called run_policy "capped", but the
   proxy's argparse has no "capped" choice, so the hot-sweep variant
   would have crashed on this step. The actual capped data was
   produced via b3_isolated_policy.sh with --policy lmetric. Replace
   the broken inline call with an explicit launch_proxy lmetric +
   inline replayer block so the sweep script matches the data path
   it documents.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-26 01:08:37 +08:00

207 lines
7.5 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# B3 routing sweep: 5 policies on 8x TP1 instances with full instrumentation.
#
# Policies:
# lmetric — cache-aware P_tokens × BS routing (main baseline)
# load_only — pure min-num_requests (B3 control: no cache)
# sticky — hard session affinity (B3 control: perfect locality)
# unified — hybrid affinity + LMetric fallback
# capped — lmetric on a per-session turn-capped trace
#
# Each policy run produces metrics.jsonl + breakdown.json + worker_state.json
# + stats.json + run_window.json (start/end unix timestamps so the analyzer
# can slice the shared engine_*.jsonl by time).
set -euo pipefail
# ---------------------------------------------------------------------------
# Configuration. Every ${VAR:-default} below can be overridden from the
# environment; VENV and N_INSTANCES are always derived.
# ---------------------------------------------------------------------------
ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"
VENV="$ROOT/.venv/bin"
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-$ROOT/traces/w600_r0.0015_st30.jsonl}"
OUTDIR="${OUTDIR:-$ROOT/outputs/b3_sweep_$(date +%Y%m%d_%H%M%S)}"
PROXY_PORT="${PROXY_PORT:-9300}"
BASE_PORT="${BASE_PORT:-8000}"
# Space-separated list of GPU indices to use, one vLLM instance per index.
# Override via GPU_INDICES="1 2 3 4 5 6 7" when GPU 0 holds ghost memory.
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
POLICIES="${POLICIES:-lmetric load_only sticky unified}"
MAX_TURNS_CAP="${MAX_TURNS_CAP:-8}"
EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:---enable-prompt-tokens-details}"
# Derive N_INSTANCES from GPU_INDICES. The here-string keeps the list quoted
# (no glob expansion of its contents); the arithmetic pass strips any padding
# some wc implementations put around the count.
N_INSTANCES=$(wc -w <<<"$GPU_INDICES")
N_INSTANCES=$((N_INSTANCES))
# A whitespace-only GPU_INDICES slips past the :- default and would otherwise
# start a sweep with zero backends; stop before creating any output.
if ((N_INSTANCES == 0)); then
  echo "[b3_sweep] FATAL: GPU_INDICES is empty; need at least one GPU index" >&2
  exit 1
fi
mkdir -p "$OUTDIR/engine_state" "$OUTDIR/logs"
echo "[b3_sweep] OUTDIR=$OUTDIR"
# Tear down every process the sweep may have started. Installed as the EXIT
# trap, so it runs on normal completion and on any early exit.
cleanup() {
  local pattern
  # "vllm serve" alone is not enough: vLLM spawns an EngineCore child whose
  # process name is "VLLM::EngineCor" — pkill -f "vllm serve" misses it and
  # leaves the GPU memory locked by a dead-but-tracked-by-driver context.
  for pattern in "vllm serve" "EngineCore" cache_aware_proxy; do
    # Best effort: "nothing matched" is not an error here.
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
  sleep 3
}
trap cleanup EXIT
# 1) Launch one vLLM per GPU index in GPU_INDICES; each emits engine_<i>.jsonl
#
# Globals read: GPU_INDICES, N_INSTANCES, BASE_PORT, OUTDIR, VENV, MODEL,
#               EXTRA_VLLM_ARGS
# Outputs:      one log per instance under $OUTDIR/logs; progress on stdout,
#               fatal diagnostics (plus the instance's log tail) on stderr
# Exits 1 when an instance dies before answering /health, or is still not
# healthy after ~180s of polling.
launch_vllm() {
  echo "[b3_sweep] launching $N_INSTANCES vLLM instances on GPUs $GPU_INDICES ..."
  local -a pids=() logs=()
  local idx=0 gpu port master log tries
  # GPU_INDICES is a space-separated list: word splitting is intentional.
  for gpu in $GPU_INDICES; do
    port=$((BASE_PORT + idx))
    master=$((29500 + idx))
    log="$OUTDIR/logs/vllm_inst_${idx}_gpu${gpu}.log"
    # EXTRA_VLLM_ARGS is left unquoted on purpose so it can carry several flags.
    AGENTIC_STEP_LOG_PATH="$OUTDIR/engine_state/engine_${idx}.jsonl" \
    AGENTIC_WORKER_ID="engine_${idx}" \
    CUDA_VISIBLE_DEVICES=$gpu \
    MASTER_PORT=$master \
    nohup "$VENV/vllm" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching \
      --dtype auto --gpu-memory-utilization 0.9 \
      --max-model-len 200000 \
      $EXTRA_VLLM_ARGS \
      > "$log" 2>&1 &
    disown
    # $! still names the job we just disowned; remember it (and its log) so
    # the health loop can tell "still starting" apart from "already dead".
    pids+=("$!")
    logs+=("$log")
    sleep 2
    idx=$((idx + 1))
  done
  echo "[b3_sweep] waiting for vLLM health ..."
  for ((idx = 0; idx < ${#pids[@]}; idx++)); do
    port=$((BASE_PORT + idx))
    tries=0
    while ! curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
      # Fail fast: a server that has exited will never answer /health, so
      # there is no point polling for the full timeout.
      if ! kill -0 "${pids[idx]}" 2>/dev/null; then
        echo "[b3_sweep] FATAL: inst_$idx (port $port) exited before becoming healthy" >&2
        tail -30 "${logs[idx]}" >&2 || true
        exit 1
      fi
      tries=$((tries + 1))
      if [ "$tries" -gt 90 ]; then
        echo "[b3_sweep] FATAL: inst_$idx (port $port) not healthy after 180s" >&2
        tail -30 "${logs[idx]}" >&2 || true
        exit 1
      fi
      sleep 2
    done
    echo " inst_$idx ready"
  done
}
launch_proxy() {
local policy="$1"
local logfile="$2"
local combined_args=""
for i in $(seq 0 $((N_INSTANCES - 1))); do
combined_args="$combined_args http://127.0.0.1:$((BASE_PORT + i))"
done
nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
--port "$PROXY_PORT" \
--combined $combined_args \
--policy "$policy" \
> "$logfile" 2>&1 &
disown
local tries=0
until curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
tries=$((tries + 1))
if [ $tries -gt 30 ]; then
echo "[b3_sweep] FATAL: proxy did not come up in 60s"
tail -30 "$logfile"
exit 1
fi
sleep 2
done
}
# Run one replay against a freshly started proxy and collect its artifacts.
#
# Arguments: $1 - value passed to the proxy's --policy flag
#            $2 - trace file handed to the replayer
#            $3 - optional run label (default: $1). Names the output directory
#                 $OUTDIR/<label> and is the "policy" recorded in
#                 run_window.json, so a variant such as "capped" can reuse an
#                 existing proxy policy without the proxy knowing the label.
# Globals read: OUTDIR, ROOT, VENV, MODEL, PROXY_PORT
# Writes into $OUTDIR/<label>: proxy.log, metrics.jsonl, replayer.log,
#   run_window.json, breakdown.json, worker_state.json, stats.json
run_policy() {
  local policy="$1"
  local trace="$2"
  local label="${3:-$policy}"
  local rundir="$OUTDIR/$label"
  mkdir -p "$rundir"
  if [[ "$label" == "$policy" ]]; then
    echo "[b3_sweep] === policy=$policy trace=$(basename "$trace") ==="
  else
    echo "[b3_sweep] === policy=$label (picker=$policy) trace=$(basename "$trace") ==="
  fi
  # Kill any proxy left over from the previous run before starting a new one.
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  sleep 2
  launch_proxy "$policy" "$rundir/proxy.log"
  local t_start t_end
  t_start=$(date +%s.%N)
  # Written before the replay starts and removed once run_window.json exists.
  echo "{\"policy\": \"$label\", \"trace\": \"$trace\", \"t_start_unix\": $t_start}" \
    > "$rundir/run_window.json.partial"
  PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
    --trace "$trace" \
    --output "$rundir/metrics.jsonl" \
    --endpoint "http://127.0.0.1:$PROXY_PORT" \
    --model "$MODEL" \
    2>&1 | tee "$rundir/replayer.log" | tail -3
  t_end=$(date +%s.%N)
  # Let Python serialize run_window.json from argv rather than interpolating
  # the values into a JSON string in the shell.
  python3 - "$rundir" "$label" "$trace" "$t_start" "$t_end" <<'PY'
import json, sys
rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump({
        "policy": policy, "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
    }, f, indent=2)
PY
  rm -f "$rundir/run_window.json.partial"
  # Snapshot the proxy's counters for this run.
  curl -s "http://127.0.0.1:$PROXY_PORT/breakdown" > "$rundir/breakdown.json"
  curl -s "http://127.0.0.1:$PROXY_PORT/worker_state" > "$rundir/worker_state.json"
  curl -s "http://127.0.0.1:$PROXY_PORT/stats" > "$rundir/stats.json"
  echo "[b3_sweep] $label done: $(wc -l < "$rundir/metrics.jsonl") metric rows"
}
# 2) Run each policy
launch_vllm
# POLICIES is a space-separated list: word splitting is intentional.
for policy in $POLICIES; do
  run_policy "$policy" "$TRACE"
done
# 3) Capped variant: lmetric picker on a per-session turn-capped trace.
# The directory label is "capped" but the proxy must launch with
# --policy lmetric (the proxy's argparse has no "capped" choice).
echo "[b3_sweep] building capped trace (max_turns=$MAX_TURNS_CAP) ..."
CAPPED_TRACE="$OUTDIR/capped/trace.jsonl"
mkdir -p "$OUTDIR/capped"
"$VENV/python" "$ROOT/scripts/build_capped_trace.py" \
  --input "$TRACE" \
  --output "$CAPPED_TRACE" \
  --max-turns "$MAX_TURNS_CAP" | tee "$OUTDIR/capped/build.log"
# Same path as every other policy; only the label differs from the picker.
run_policy lmetric "$CAPPED_TRACE" capped
# 4) Snapshot final engine state file sizes for the analyzer
ls -l "$OUTDIR/engine_state/" > "$OUTDIR/engine_state_files.txt"
echo "[b3_sweep] sweep complete. OUTDIR=$OUTDIR"