Reuse and concurrency axes redone with proper controlled variables, plus
the orchestration used to run them on dash0:
- run_reuse_fixed.sh: hold REAL prefill work (delta) constant, vary only
cached prefix -> reuse = C/(C+U). Supersedes old fig1 (which held
input=8192 and sliced prefix out, confounding "more reuse" with "less
prefill").
- run_conc.sh: agentic-corner config (in=32768, delta=512, reuse=0.984,
out=128) that exposes PD's structural KV-transfer tax. Supersedes old fig3.
- run_campaign{,2,3}.sh, backfill_d2048o128.sh: serial campaign drivers
(strictly one driver at a time), out=128 sweeps, PD wall-cap for
collapse-draining high-reuse arms, and flaked-arm backfill.
- mb5_run_gpu.sh: per-config bring-up / replay / teardown orchestrator.
- plot_pd_crossover.py: render the reuse_compare figures from fig_agg dumps.
- fig_agg.py: tolerate null stats from fully-collapsed arms (an arm with 0
successes writes its stat keys as null, so `dict.get(k, {})` returns None
rather than the `{}` default).
Data: fig1_reuse_fixed.json, fig1_reuse_d{1024,2048}_o128.json
Figs: reuse_compare_AB.png, reuse_compare_ABC.png
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
36 lines
2.0 KiB
Bash
36 lines
2.0 KiB
Bash
#!/usr/bin/env bash
# Backfill the d2048/o128 reuse arms that vLLM startup-flaked out (transient
# "Engine core initialization failed", intermittent). Retry up to 4x each with a
# clean teardown between attempts; HEALTH_MAX_TRIES=180 so a crashed launch fails
# in ~6min (not 10) before retrying. Then re-aggregate the figure JSON.
#
# Environment:
#   MISS  Space-separated list of "<prefix_len>:<config>" arms to backfill.
#         Defaults to the four arms listed below.
#
# Exit status: 0 when the figure JSON was re-aggregated, 1 otherwise.
#
# NOTE: `set -e` is deliberately not enabled. A failing run is the expected
# case here and is handled by the retry loop, not by aborting the script.

ROOT=/home/admin/cpfs/wjh/agentic-kv-fresh
# Every path below is relative to the checkout; running elsewhere is never OK.
cd "$ROOT" || { echo "[backfill] cannot cd to $ROOT" >&2; exit 1; }

export MB5_VENV=$PWD/.venv_dash0
export HEALTH_MAX_TRIES=180
VPY=$MB5_VENV/bin/python

DELTA=2048
OL=128
N=8
THINK=0.5
TURNS=8
NSESS=48
MISS="${MISS:-4096:6P+2D 18432:6P+2D 38912:8C-proxy 38912:6P+2D}"

echo "=== BACKFILL START $(date) miss='$MISS' ==="

# MISS is a whitespace-separated list, so word-splitting is intentional here.
# shellcheck disable=SC2086
for pc in $MISS; do
  pfx=${pc%%:*}   # text before the first ':'  -> cached prefix length
  cfg=${pc##*:}   # text after the last ':'    -> deployment config name
  tag="reuse_p${pfx}_d${DELTA}_o${OL}"
  trace="traces_synth/${tag}.jsonl"

  # Regenerate the synthetic trace for this arm. Generator output is discarded
  # to keep the log compact, but a failure must not go unnoticed: launching
  # the engine against a missing or partial trace would waste every attempt.
  if ! "$VPY" scripts/gen_synthetic_trace.py --out "$trace" --mode regular \
      --qps "$NSESS" --duration-s 1 --turns "$TURNS" --prefix-len "$pfx" \
      --delta-len "$DELTA" --output-len "$OL" --seed 42 >/dev/null 2>&1; then
    echo "[backfill] trace generation failed for $tag; skipping $cfg" >&2
    echo "[backfill] GAVE UP $tag $cfg"
    continue
  fi

  # Every config except 8C-proxy gets a replay duration cap of 500; 8C-proxy
  # passes an empty value. (Units are defined by mb5_run_gpu.sh, presumably
  # seconds; confirm there.)
  if [[ "$cfg" != "8C-proxy" ]]; then
    dur=500
  else
    dur=""
  fi

  summary="mb5_runs/${tag}_${cfg}_rep1/replay_metrics.summary.json"
  ok=0
  for attempt in 1 2 3 4; do
    echo "[backfill] $tag $cfg attempt=$attempt $(date +%T)"
    # The orchestrator's own exit status is not trusted; success is judged
    # solely by whether the summary file exists afterwards.
    MB5_P_ROUTING=session MB5_COLO_ROUTING=session \
    REPLAY_MAX_INFLIGHT=$N REPLAY_INTER_TURN_THINK_S=$THINK \
    REPLAY_NO_REALIZED_PREFIX=1 REPLAY_MAX_DURATION="$dur" \
    CONFIGS="$cfg" REPS=1 TRACE="$trace" RUN_TAG="$tag" \
      bash scripts/mb5_run_gpu.sh >/dev/null 2>&1

    if [[ -f "$summary" ]]; then
      echo "  OK $cfg pfx=$pfx attempt=$attempt"
      ok=1
      break
    fi

    echo "  FAILED attempt=$attempt; cleanup+retry"
    # Tear down whatever the failed launch left behind, then give it a moment
    # to settle. MB5_VENV is already exported above.
    bash scripts/mb5_launch.sh stop >/dev/null 2>&1
    sleep 5
  done

  if [[ "$ok" == 0 ]]; then
    echo "[backfill] GAVE UP $tag $cfg"
  fi
done

# Collect every rep1 run directory of this delta/output-length sweep that
# actually produced a summary. The -f test also filters out the literal,
# unexpanded pattern when the glob matches nothing.
dirs=()
for d in mb5_runs/reuse_*_d"${DELTA}"_o"${OL}"_*_rep1; do
  if [[ -f "$d/replay_metrics.summary.json" ]]; then
    dirs+=("$d")
  fi
done

out="analysis/mb5_pd_ablation/fig1_reuse_d${DELTA}_o${OL}.json"
tmp="${out}.tmp.$$"
rc=0

# Aggregate into a sibling temp file and rename only on success. Redirecting
# straight into $out would truncate the existing figure data before fig_agg.py
# even starts, so any failure would destroy it.
if (( ${#dirs[@]} == 0 )); then
  echo "[backfill] no completed runs found; leaving $out untouched" >&2
  rc=1
elif "$VPY" scripts/fig_agg.py --json "${dirs[@]}" >"$tmp"; then
  mv -f -- "$tmp" "$out" || rc=1
else
  echo "[backfill] fig_agg.py failed; leaving $out untouched" >&2
  rm -f -- "$tmp"
  rc=1
fi

echo "=== BACKFILL DONE dirs=${#dirs[@]}/24 $(date) ==="
exit "$rc"