Files
agentic-kvc/scripts/b3_sweep.sh
Gahow Wang 645b067dd4 Fix review bugs: PD-sep counter leaks, hardcoded paths, missing deps
Critical:
- cache_aware_proxy: _handle_pd_sep leaked p_inst.num_requests (never
  decremented) and never managed d_inst.num_requests; fix media_type
  from application/json to text/event-stream for SSE stream

High:
- b3_sweep/b3_isolated_policy/b3_analyze: replace hardcoded
  /home/admin/cpfs/wjh/ ROOT with script-relative $(dirname "$0")/..
- b3_analyze: replace hardcoded 8-port WORKER_MAP with dynamic
  generation from BASE_PORT and N_INSTANCES

Medium:
- analyze_breakdown: warn on stderr when records are skipped (was silent)
- deploy_vllm_patches: fail-fast on SSH/SCP errors instead of
  continuing with empty VENV_SITE
- pyproject.toml: declare fastapi and uvicorn as runtime dependencies
- launch_elastic_p2p: kill EngineCore and proxy in trap handler to
  prevent GPU memory leaks on exit
2026-05-26 15:54:55 +08:00

207 lines
7.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# B3 routing sweep: 5 policies on TP1 vLLM instances (one per GPU in GPU_INDICES, 8 by default) with full instrumentation.
#
# Policies:
# lmetric — cache-aware P_tokens × BS routing (main baseline)
# load_only — pure min-num_requests (B3 control: no cache)
# sticky — hard session affinity (B3 control: perfect locality)
# unified — hybrid affinity + LMetric fallback
# capped — lmetric on a per-session turn-capped trace
#
# Each policy run produces metrics.jsonl + breakdown.json + worker_state.json
# + stats.json + run_window.json (start/end Unix timestamps so the analyzer can
# slice the shared engine_state/engine_*.jsonl files by time).
set -euo pipefail
# --- Run configuration -------------------------------------------------------
# Settings written as ${NAME:-default} can be overridden from the environment;
# an empty value selects the default as well.
# ROOT defaults to the parent of the directory that contains this script.
ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
VENV="$ROOT/.venv/bin"
MODEL="${MODEL:-$ROOT/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
TRACE="${TRACE:-$ROOT/traces/w600_r0.0015_st30.jsonl}"
OUTDIR="${OUTDIR:-$ROOT/outputs/b3_sweep_$(date +%Y%m%d_%H%M%S)}"
PROXY_PORT="${PROXY_PORT:-9300}"
BASE_PORT="${BASE_PORT:-8000}"
# Space-separated list of GPU indices to use, one vLLM instance per index.
# Override via GPU_INDICES="1 2 3 4 5 6 7" when GPU 0 holds ghost memory.
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
POLICIES="${POLICIES:-lmetric load_only sticky unified}"
MAX_TURNS_CAP="${MAX_TURNS_CAP:-8}"
EXTRA_VLLM_ARGS="${EXTRA_VLLM_ARGS:---enable-prompt-tokens-details}"

# Derive N_INSTANCES from GPU_INDICES. The list is passed quoted so it is never
# glob-expanded, and whitespace is stripped because some wc implementations pad
# their output.
N_INSTANCES=$(printf '%s\n' "$GPU_INDICES" | wc -w | tr -d '[:space:]')
if [ "$N_INSTANCES" -lt 1 ]; then
  # Without this check a blank list would only fail later, when the proxy is
  # started with no upstream instances.
  echo "[b3_sweep] FATAL: GPU_INDICES is empty; need at least one GPU index" >&2
  exit 1
fi

mkdir -p "$OUTDIR/engine_state" "$OUTDIR/logs"
echo "[b3_sweep] OUTDIR=$OUTDIR"
# EXIT handler: force-kill every process this sweep may have started.
cleanup() {
  # Patterns are matched with pkill -f, in this order:
  #   "vllm serve"        - the vLLM server processes
  #   "EngineCore"        - vLLM spawns an EngineCore child whose process name
  #                         is "VLLM::EngineCor"; the "vllm serve" pattern
  #                         misses it and leaves the GPU memory locked by a
  #                         dead-but-tracked-by-driver context
  #   "cache_aware_proxy" - the routing proxy
  local pattern
  for pattern in "vllm serve" "EngineCore" cache_aware_proxy; do
    # pkill's status is ignored so a "nothing matched" result never fails
    # the handler.
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
  # Pause after the kills (presumably so the processes can wind down).
  sleep 3
}
trap cleanup EXIT
# 1) Launch one vLLM per GPU index in GPU_INDICES; each emits engine_<i>.jsonl
#######################################
# Start one tensor-parallel-size-1 vLLM server per entry of GPU_INDICES and
# block until every server answers /health.
# Globals (read):
#   GPU_INDICES, N_INSTANCES, BASE_PORT, OUTDIR, VENV, MODEL, EXTRA_VLLM_ARGS
#   VLLM_HEALTH_TRIES - optional; number of 2-second health polls allowed per
#                       instance (default 90, i.e. 180s)
# Outputs:
#   progress on stdout, fatal diagnostics on stderr; each server's output goes
#   to $OUTDIR/logs/vllm_inst_<i>_gpu<gpu>.log
# Exits:
#   1 when an instance dies during startup or never becomes healthy.
#######################################
launch_vllm() {
  echo "[b3_sweep] launching $N_INSTANCES vLLM instances on GPUs $GPU_INDICES ..."
  local max_tries="${VLLM_HEALTH_TRIES:-90}"
  local -a pids=() logs=()
  local i=0 gpu port master log tries

  for gpu in $GPU_INDICES; do
    # Instance i listens on BASE_PORT + i and gets its own MASTER_PORT
    # (presumably to keep the instances' distributed setup from colliding).
    port=$((BASE_PORT + i))
    master=$((29500 + i))
    log="$OUTDIR/logs/vllm_inst_${i}_gpu${gpu}.log"
    # EXTRA_VLLM_ARGS is left unquoted on purpose: it holds a space-separated
    # list of extra flags that must be split into separate arguments.
    # shellcheck disable=SC2086
    AGENTIC_STEP_LOG_PATH="$OUTDIR/engine_state/engine_${i}.jsonl" \
    AGENTIC_WORKER_ID="engine_${i}" \
    CUDA_VISIBLE_DEVICES=$gpu \
    MASTER_PORT=$master \
    nohup "$VENV/vllm" serve "$MODEL" \
      --host 0.0.0.0 --port "$port" \
      --tensor-parallel-size 1 \
      --trust-remote-code --enable-prefix-caching \
      --dtype auto --gpu-memory-utilization 0.9 \
      --max-model-len 200000 \
      $EXTRA_VLLM_ARGS \
      > "$log" 2>&1 &
    # Remember the PID and log so a startup crash can be reported right away
    # instead of after the whole health-check timeout.
    pids[i]=$!
    logs[i]=$log
    disown
    sleep 2
    i=$((i + 1))
  done

  echo "[b3_sweep] waiting for vLLM health ..."
  for ((i = 0; i < N_INSTANCES; i++)); do
    port=$((BASE_PORT + i))
    tries=0
    while ! curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
      if ! kill -0 "${pids[i]}" 2>/dev/null; then
        echo "[b3_sweep] FATAL: inst_$i (port $port) exited during startup" >&2
        tail -30 "${logs[i]}" >&2 || true
        exit 1
      fi
      tries=$((tries + 1))
      if [ "$tries" -gt "$max_tries" ]; then
        echo "[b3_sweep] FATAL: inst_$i (port $port) not healthy after $((max_tries * 2))s" >&2
        exit 1
      fi
      sleep 2
    done
    echo " inst_$i ready"
  done
}
#######################################
# Start cache_aware_proxy.py in the background and wait until it serves /stats.
# Arguments:
#   $1 - value passed to the proxy's --policy flag
#   $2 - file that receives the proxy's stdout and stderr
# Globals (read): VENV, ROOT, PROXY_PORT, BASE_PORT, N_INSTANCES
# Exits:
#   1, after printing the log tail to stderr, when the proxy process dies
#   during startup or /stats has not answered after 60s of polling.
#######################################
launch_proxy() {
  local policy="$1"
  local logfile="$2"
  local -a upstreams=()
  local i
  # One upstream URL per vLLM instance, on consecutive ports from BASE_PORT.
  for ((i = 0; i < N_INSTANCES; i++)); do
    upstreams+=("http://127.0.0.1:$((BASE_PORT + i))")
  done

  # The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
  # older bash versions.
  nohup "$VENV/python" "$ROOT/scripts/cache_aware_proxy.py" \
    --port "$PROXY_PORT" \
    --combined ${upstreams[@]+"${upstreams[@]}"} \
    --policy "$policy" \
    > "$logfile" 2>&1 &
  local proxy_pid=$!
  disown

  local tries=0
  until curl -sf "http://127.0.0.1:$PROXY_PORT/stats" >/dev/null 2>&1; do
    # Report a crashed proxy (e.g. rejected arguments) immediately instead of
    # polling a dead process for the full timeout.
    if ! kill -0 "$proxy_pid" 2>/dev/null; then
      echo "[b3_sweep] FATAL: proxy exited during startup (policy=$policy)" >&2
      tail -30 "$logfile" >&2 || true
      exit 1
    fi
    tries=$((tries + 1))
    if [ "$tries" -gt 30 ]; then
      echo "[b3_sweep] FATAL: proxy did not come up in 60s" >&2
      tail -30 "$logfile" >&2 || true
      exit 1
    fi
    sleep 2
  done
}
#######################################
# Replay one trace through a freshly started proxy and collect the artifacts.
# Arguments:
#   $1 - run label: names the output directory $OUTDIR/<label> and is stored
#        as "policy" in run_window.json
#   $2 - path of the trace file to replay
#   $3 - optional; value for the proxy's --policy flag. Defaults to $1. Lets a
#        run carry a label that is not itself a proxy policy.
# Globals (read): OUTDIR, ROOT, VENV, MODEL, PROXY_PORT
# Outputs:
#   proxy.log, replayer.log, metrics.jsonl, run_window.json, breakdown.json,
#   worker_state.json and stats.json under $OUTDIR/<label>; the last three
#   lines of replayer output and progress messages on stdout.
#######################################
run_policy() {
  local policy="$1"
  local trace="$2"
  local picker="${3:-$policy}"
  local rundir="$OUTDIR/$policy"
  local banner="policy=$policy"
  if [ "$picker" != "$policy" ]; then
    banner="$banner (picker=$picker)"
  fi

  mkdir -p "$rundir"
  echo "[b3_sweep] === $banner trace=$(basename "$trace") ==="

  # Kill any proxy left over from the previous run before starting a new one.
  pkill -9 -f cache_aware_proxy 2>/dev/null || true
  sleep 2
  launch_proxy "$picker" "$rundir/proxy.log"

  local t_start
  t_start=$(date +%s.%N)
  # Marker holding only the start time; it is removed once the complete
  # run_window.json has been written.
  echo "{\"policy\": \"$policy\", \"trace\": \"$trace\", \"t_start_unix\": $t_start}" \
    > "$rundir/run_window.json.partial"

  PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
    --trace "$trace" \
    --output "$rundir/metrics.jsonl" \
    --endpoint "http://127.0.0.1:$PROXY_PORT" \
    --model "$MODEL" \
    2>&1 | tee "$rundir/replayer.log" | tail -3

  local t_end
  t_end=$(date +%s.%N)
  # Written with the venv interpreter, like every other Python step here,
  # rather than whatever python3 happens to be on PATH.
  "$VENV/python" - "$rundir" "$policy" "$trace" "$t_start" "$t_end" <<'PY'
import json, sys
rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump({
        "policy": policy, "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
    }, f, indent=2)
PY
  rm -f "$rundir/run_window.json.partial"

  # Snapshot the proxy's end-of-run endpoints.
  local endpoint
  for endpoint in breakdown worker_state stats; do
    curl -s "http://127.0.0.1:$PROXY_PORT/$endpoint" > "$rundir/$endpoint.json"
  done
  echo "[b3_sweep] $policy done: $(wc -l < "$rundir/metrics.jsonl") metric rows"
}
# 2) Run each policy
launch_vllm
for policy in $POLICIES; do
  if [ "$policy" = "capped" ]; then
    # "capped" is a run label, not a proxy --policy value (the proxy's
    # argparse has no "capped" choice); step 3 below always performs that run.
    echo "[b3_sweep] skipping 'capped' in POLICIES: it is run by the capped-trace step" >&2
    continue
  fi
  run_policy "$policy" "$TRACE"
done

# 3) Capped variant: lmetric picker on a per-session turn-capped trace.
# The directory label is "capped" but the proxy must launch with
# --policy lmetric (the proxy's argparse has no "capped" choice).
echo "[b3_sweep] building capped trace (max_turns=$MAX_TURNS_CAP) ..."
CAPPED_TRACE="$OUTDIR/capped/trace.jsonl"
mkdir -p "$OUTDIR/capped"
"$VENV/python" "$ROOT/scripts/build_capped_trace.py" \
  --input "$TRACE" \
  --output "$CAPPED_TRACE" \
  --max-turns "$MAX_TURNS_CAP" | tee "$OUTDIR/capped/build.log"

# Inline equivalent of run_policy "capped" but using --policy lmetric.
echo "[b3_sweep] === policy=capped (picker=lmetric) trace=$(basename "$CAPPED_TRACE") ==="
pkill -9 -f cache_aware_proxy 2>/dev/null || true
sleep 2
launch_proxy lmetric "$OUTDIR/capped/proxy.log"

t_cap_start=$(date +%s.%N)
# Same start-time marker the other runs write; removed once the complete
# run_window.json exists.
printf '{"policy": "%s", "trace": "%s", "t_start_unix": %s}\n' \
  capped "$CAPPED_TRACE" "$t_cap_start" \
  > "$OUTDIR/capped/run_window.json.partial"

PYTHONPATH="$ROOT" "$VENV/python" -m replayer \
  --trace "$CAPPED_TRACE" \
  --output "$OUTDIR/capped/metrics.jsonl" \
  --endpoint "http://127.0.0.1:$PROXY_PORT" \
  --model "$MODEL" \
  2>&1 | tee "$OUTDIR/capped/replayer.log" | tail -3

t_cap_end=$(date +%s.%N)
# Use the venv interpreter, like the other Python steps of this script.
"$VENV/python" - "$OUTDIR/capped" capped "$CAPPED_TRACE" "$t_cap_start" "$t_cap_end" <<'PY'
import json, sys
rundir, policy, trace, t_start, t_end = sys.argv[1:]
with open(f"{rundir}/run_window.json", "w") as f:
    json.dump({
        "policy": policy, "trace": trace,
        "t_start_unix": float(t_start),
        "t_end_unix": float(t_end),
    }, f, indent=2)
PY
rm -f "$OUTDIR/capped/run_window.json.partial"

curl -s "http://127.0.0.1:$PROXY_PORT/breakdown" > "$OUTDIR/capped/breakdown.json"
curl -s "http://127.0.0.1:$PROXY_PORT/worker_state" > "$OUTDIR/capped/worker_state.json"
curl -s "http://127.0.0.1:$PROXY_PORT/stats" > "$OUTDIR/capped/stats.json"
echo "[b3_sweep] capped done: $(wc -l < "$OUTDIR/capped/metrics.jsonl") metric rows"

# 4) Snapshot final engine state file sizes for the analyzer
ls -l "$OUTDIR/engine_state/" > "$OUTDIR/engine_state_files.txt"
echo "[b3_sweep] sweep complete. OUTDIR=$OUTDIR"