MB5 PD ablation: controlled-variable reuse/conc redo + campaign tooling

Reuse and concurrency axes redone with proper controlled variables, plus
the orchestration used to run them on dash0:

- run_reuse_fixed.sh: hold REAL prefill work (delta) constant, vary only
  cached prefix -> reuse = C/(C+U). Supersedes old fig1 (which held
  input=8192 and sliced prefix out, confounding "more reuse" with "less
  prefill").
- run_conc.sh: agentic-corner config (in=32768, delta=512, reuse=0.984,
  out=128) that exposes PD's structural KV-transfer tax. Supersedes old fig3.
- run_campaign{,2,3}.sh, backfill_d2048o128.sh: serial campaign drivers
  (strictly one driver at a time), out=128 sweeps, PD wall-cap for
  collapse-draining high-reuse arms, and flaked-arm backfill.
- mb5_run_gpu.sh: per-config bring-up / replay / teardown orchestrator.
- plot_pd_crossover.py: render the reuse_compare figures from fig_agg dumps.
- fig_agg.py: tolerate null stats from fully-collapsed arms (an arm with 0
  successes writes the stat keys with value null, and `dict.get(k, {})`
  returns that null rather than the `{}` default; use `... or {}` instead).

Data: fig1_reuse_fixed.json, fig1_reuse_d{1024,2048}_o128.json
Figs: reuse_compare_AB.png, reuse_compare_ABC.png

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-01 01:03:27 +08:00
parent 32f7f55990
commit 9c105cf05a
14 changed files with 586 additions and 4 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 114 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 99 KiB

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Backfill the d2048/o128 reuse arms that vLLM startup-flaked out (transient
# "Engine core initialization failed", intermittent). Retry up to 4x each with a
# clean teardown between attempts; HEALTH_MAX_TRIES=180 so a crashed launch fails
# in ~6min (not 10) before retrying. Then re-aggregate the figure JSON.
#
# Env:
#   MISS : space-separated "<prefix_len>:<config>" arms to retry
#          (default: the four arms listed in the assignment below).
#
# This script does not use `set -e`; failures are handled per arm so that one
# bad arm does not prevent the remaining arms from being retried.

# Every path below is repo-relative, so running from the wrong directory is fatal.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "[backfill] cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh" >&2
  exit 1
}
export MB5_VENV="$PWD/.venv_dash0"
export HEALTH_MAX_TRIES=180
VPY="$MB5_VENV/bin/python"
DELTA=2048; OL=128; N=8; THINK=0.5; TURNS=8; NSESS=48
MAX_ATTEMPTS=4
MISS="${MISS:-4096:6P+2D 18432:6P+2D 38912:8C-proxy 38912:6P+2D}"

echo "=== BACKFILL START $(date) miss='$MISS' ==="
# $MISS is intentionally unquoted: it is a whitespace-separated list of arms.
for pc in $MISS; do
  pfx=${pc%%:*}; cfg=${pc##*:}
  tag="reuse_p${pfx}_d${DELTA}_o${OL}"; trace="traces_synth/${tag}.jsonl"
  if ! "$VPY" scripts/gen_synthetic_trace.py --out "$trace" --mode regular --qps "$NSESS" --duration-s 1 \
      --turns "$TURNS" --prefix-len "$pfx" --delta-len "$DELTA" --output-len "$OL" --seed 42 >/dev/null 2>&1; then
    echo "[backfill] WARN trace generation failed for $tag" >&2
    # Without a trace file every attempt would fail immediately; skip the arm
    # instead of cycling through pointless launch/teardown retries.
    if [ ! -f "$trace" ]; then
      echo "[backfill] GAVE UP $tag $cfg (no trace)"
      continue
    fi
  fi
  # PD arms get a 500 s wall cap; the colo arm (8C-proxy) runs uncapped.
  dur=""
  if [ "$cfg" != "8C-proxy" ]; then dur=500; fi
  ok=0
  for ((attempt = 1; attempt <= MAX_ATTEMPTS; attempt++)); do
    echo "[backfill] $tag $cfg attempt=$attempt $(date +%T)"
    MB5_P_ROUTING=session MB5_COLO_ROUTING=session \
      REPLAY_MAX_INFLIGHT="$N" REPLAY_INTER_TURN_THINK_S="$THINK" REPLAY_NO_REALIZED_PREFIX=1 REPLAY_MAX_DURATION="$dur" \
      CONFIGS="$cfg" REPS=1 TRACE="$trace" RUN_TAG="$tag" \
      bash scripts/mb5_run_gpu.sh >/dev/null 2>&1
    # Success is judged by the summary file, not by the runner's exit status.
    if [ -f "mb5_runs/${tag}_${cfg}_rep1/replay_metrics.summary.json" ]; then
      echo " OK $cfg pfx=$pfx attempt=$attempt"; ok=1; break
    fi
    echo " FAILED attempt=$attempt; cleanup+retry"
    bash scripts/mb5_launch.sh stop >/dev/null 2>&1; sleep 5
  done
  if [ "$ok" = 0 ]; then echo "[backfill] GAVE UP $tag $cfg"; fi
done

# Re-aggregate only run dirs that actually produced a summary.
dirs=()
for d in mb5_runs/reuse_*_d2048_o128_*_rep1; do
  if [ -f "$d/replay_metrics.summary.json" ]; then dirs+=("$d"); fi
done
if [ "${#dirs[@]}" -gt 0 ]; then
  "$VPY" scripts/fig_agg.py --json "${dirs[@]}" > analysis/mb5_pd_ablation/fig1_reuse_d2048_o128.json
else
  # Do not truncate an existing figure JSON when there is nothing to aggregate.
  echo "[backfill] no completed d2048/o128 run dirs; figure JSON left untouched" >&2
fi
echo "=== BACKFILL DONE dirs=${#dirs[@]}/24 $(date) ==="

View File

@@ -100,11 +100,13 @@ def main():
continue continue
s = json.load(open(sp)) s = json.load(open(sp))
arm, pg, dg, ports = arm_of(run.name) arm, pg, dg, ports = arm_of(run.name)
lat = s.get("latency_stats_s", {}) # `or {}` because a fully-collapsed arm (0 successes) writes these as null,
ttft = s.get("ttft_stats_s", {}) # and dict.get(k, {}) returns null (not {}) when the key exists with value null.
tpot = s.get("tpot_stats_s", {}) lat = s.get("latency_stats_s") or {}
ttft = s.get("ttft_stats_s") or {}
tpot = s.get("tpot_stats_s") or {}
wall = s.get("wall_clock_s") or 1.0 wall = s.get("wall_clock_s") or 1.0
out = s.get("actual_output_tokens_stats", {}) out = s.get("actual_output_tokens_stats") or {}
n = s.get("success_count", 0); req = s.get("request_count", 0) n = s.get("success_count", 0); req = s.get("request_count", 0)
tot_out = out.get("count", 0) * out.get("mean", 0) tot_out = out.get("count", 0) * out.get("mean", 0)
tps = tot_out / wall tps = tot_out / wall

View File

@@ -0,0 +1,144 @@
#!/usr/bin/env bash
# MB5 orchestrator. For every CONFIG x rep it brings the stack up, replays a
# trace against it, collects KV snapshots and replayer metrics, and tears the
# stack down again.
#
# Designed to be run on dash1 (or any host with cpfs mounted at
# /home/admin/cpfs/wjh/).
#
# Environment (defaults in parentheses):
#   CONFIGS       : space-separated MB5 configs ("8C 6P+2D 4P+4D 2P+6D")
#   REPS          : reps per config (3)
#   TRACE         : trace JSONL path
#                   (/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl)
#   RUN_TAG       : output root tag ($(date +%Y%m%d_%H%M%S))
#   REQUEST_LIMIT : optional cap on replayed requests (none)
set -eo pipefail

FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# MB5_VENV lets a second host use an isolated venv clone (see mb5_launch.sh).
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
REPLAYER_DIR="${FRESH_ROOT}/replayer"

# Caller-overridable knobs: assign the default only when unset or empty.
: "${CONFIGS:=8C 6P+2D 4P+4D 2P+6D}"
: "${REPS:=3}"
: "${TRACE:=/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl}"
: "${RUN_TAG:=$(date +%Y%m%d_%H%M%S)}"
: "${MODEL_NAME:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# Expands to "--request-limit <n>" only when REQUEST_LIMIT is set and non-empty.
REQUEST_LIMIT_ARG="${REQUEST_LIMIT:+--request-limit ${REQUEST_LIMIT}}"

OUT_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_TAG}"
mkdir -p "${OUT_ROOT}"

printf '[mb5-run] %s\n' \
  "RUN_TAG=${RUN_TAG}" \
  "OUT_ROOT=${OUT_ROOT}" \
  "CONFIGS=${CONFIGS}" \
  "REPS=${REPS}" \
  "TRACE=${TRACE}"
#######################################
# Run one (config, rep): start the stack, replay the trace against it, record
# wall-clock timing and per-instance prefix-cache counters, then stop the stack.
# Globals (read): RUN_TAG FRESH_ROOT OUT_ROOT LAUNCH VENV TRACE MODEL_NAME
#                 REQUEST_LIMIT_ARG
# Arguments: $1 - config name, $2 - rep number
# Outputs:   progress lines on stdout; launch/replay/stop logs under OUT_ROOT;
#            run artifacts under ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_<config>_rep<rep>/
# Returns:   0 on success; 1 if launch, endpoint discovery, venv activation
#            or the replay itself fails
#######################################
run_one() {
  local config="$1" rep="$2"
  local label="${RUN_TAG}_${config}_rep${rep}"
  local rundir="${FRESH_ROOT}/mb5_runs/${label}"
  local logbase="${OUT_ROOT}/${config}_rep${rep}"
  echo ""
  echo "======== ${config} rep${rep} ========"

  # Launch
  if ! CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" start > "${logbase}_launch.log" 2>&1; then
    echo "[mb5-run] LAUNCH FAILED for ${config} rep${rep}; see ${logbase}_launch.log"
    # Best-effort teardown so a partially started stack cannot linger into
    # the next config/rep (same call the other failure paths below use).
    bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi

  # Extract ENDPOINTS line emitted by mb5_launch.sh (last one wins)
  local endpoints
  endpoints=$(grep "^ENDPOINTS=" "${logbase}_launch.log" | tail -1 | cut -d= -f2-)
  if [ -z "${endpoints}" ]; then
    echo "[mb5-run] ERROR: no ENDPOINTS in launch log"
    bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi
  echo "[mb5-run] endpoints: ${endpoints}"

  # Replay. Fail loudly if the venv is missing instead of silently falling
  # back to whatever `python` is on PATH.
  if [ ! -f "${VENV}/bin/activate" ]; then
    echo "[mb5-run] ERROR: missing ${VENV}/bin/activate"
    bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi
  source "${VENV}/bin/activate"
  local replay_out="${rundir}/replay_metrics.jsonl"
  mkdir -p "$(dirname "${replay_out}")"

  # per-GPU utilization timeseries over the replay window (2s sampling)
  bash "${FRESH_ROOT}/microbench/fresh_setup/gpu_monitor.sh" "${rundir}/gpu_util.csv" 2 >/dev/null 2>&1 &
  local GPU_MON=$!

  local t0 t1
  t0=$(date +%s.%N)
  # ${REQUEST_LIMIT_ARG} is intentionally unquoted: it is either empty or two words.
  if ! PYTHONPATH="${FRESH_ROOT}" python -m replayer \
      --endpoint "${endpoints}" \
      --trace "${TRACE}" \
      --output "${replay_out}" \
      --model "${MODEL_NAME}" \
      ${REQUEST_LIMIT_ARG} \
      > "${logbase}_replay.log" 2>&1; then
    t1=$(date +%s.%N)
    # Declaration split from assignment so a python failure is not masked by `local`.
    local wall
    wall=$(python -c "print(${t1} - ${t0})")
    echo "[mb5-run] REPLAY FAILED after ${wall} s; see ${logbase}_replay.log"
    kill "${GPU_MON}" 2>/dev/null || true
    bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi
  t1=$(date +%s.%N)
  local wall_clock_s
  wall_clock_s=$(python -c "print(${t1} - ${t0})")
  echo "[mb5-run] replay done in ${wall_clock_s}s"
  echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"
  kill "${GPU_MON}" 2>/dev/null || true
  printf '{"t_start_unix":%s,"t_end_unix":%s}\n' "${t0}" "${t1}" > "${rundir}/run_window.json"

  # Per-instance prefix-cache counters, scraped from each backend BEFORE
  # teardown. For PD this is the only honest reuse signal: producer ports
  # (the low ones) show cross-turn prefix-cache hits; the consumer's
  # per-request cached_tokens is meaningless (it counts transferred KV).
  local p m q h
  {
    for p in 8000 8001 8002 8003 8004 8005 8006 8007; do
      # Ports with nothing listening are skipped.
      m=$(curl -s --noproxy '*' "http://127.0.0.1:${p}/metrics" 2>/dev/null) || continue
      q=$(printf '%s' "$m" | awk '/^vllm:prefix_cache_queries_total/{print $2; exit}')
      h=$(printf '%s' "$m" | awk '/^vllm:prefix_cache_hits_total/{print $2; exit}')
      [ -n "${q}" ] && echo "port=${p} queries=${q} hits=${h}"
    done
  } > "${rundir}/instance_apc.txt" 2>/dev/null || true

  # Stop launch (cleans up vllm + proxy; reverts patch on last call)
  CONFIG="${config}" RUN_LABEL="${label}" \
    bash "${LAUNCH}" stop > "${logbase}_stop.log" 2>&1 || true
  sleep 10 # cooldown so GPUs settle before next config
  echo "[mb5-run] DONE ${config} rep${rep}"
}
# Pre-flight: the launch script, the replayer package dir and the trace must
# all exist before any GPU time is spent.
if [ ! -f "${LAUNCH}" ]; then echo "missing ${LAUNCH}"; exit 1; fi
if [ ! -d "${REPLAYER_DIR}" ]; then echo "missing ${REPLAYER_DIR}"; exit 1; fi
if [ ! -f "${TRACE}" ]; then echo "missing trace ${TRACE}"; exit 1; fi

# Iterate over every config x rep; a failed run is counted, not fatal, so the
# remaining runs still execute. (${CONFIGS} is intentionally word-split.)
failures=0
for config in ${CONFIGS}; do
  for ((rep = 1; rep <= REPS; rep++)); do
    if ! run_one "${config}" "${rep}"; then
      failures=$((failures + 1))
    fi
  done
done

# Final patch revert (defensive — mb5_launch.sh stop also reverts)
python "${SCRIPT_DIR}/instrument_kv_snapshot.py" --revert --venv "${VENV}" 2>/dev/null || true

echo ""
echo "======== ALL CONFIGS DONE ========"
echo "failures: ${failures}"
echo "results under: ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_*"
echo "to plot: python plot_kv_pool_timeline.py --run-tag ${RUN_TAG}"

# Surface failed runs through the exit status so that callers using
# `bash mb5_run_gpu.sh ... || <warn>` actually see them.
if [ "${failures}" -gt 0 ]; then
  exit 1
fi

View File

@@ -0,0 +1,184 @@
"""Render the three PD-vs-colo crossover figures from fig_agg JSON dumps.
Inputs (produced by `fig_agg.py --json`):
analysis/mb5_pd_ablation/fig1_reuse_fixed.json reuse axis (N=8, FIXED real
prefill delta=2048; vary cached prefix -> reuse = pfx/(pfx+delta).
Controlled-variable: real new-prefill work is constant across the sweep,
only the cached fraction (and total context) grows. Supersedes the old
fig1.json, which held input=8192 and sliced prefix out of it so delta
shrank 15x as reuse rose — a confound, not a pure reuse axis.)
analysis/mb5_pd_ablation/fig2.json shape axis (N=8, reuse~70%)
analysis/mb5_pd_ablation/fig3_conc32k.json concurrency (in32768/out128,
reuse~0.984 = 32256 resident + 512 real new-prefill per turn; retuned
2026-05-31 to the agentic corner so PD pays the full-context per-turn
KV-transfer tax while colo keeps it resident; vary N by step 8 up to the
mean-E2E<=10s SLO ceiling)
Each figure overlays colo + the three PD ratios and marks the PD-best advantage.
All three share the corrected (uncontaminated, e13391e-gated-off) stack.
"""
from __future__ import annotations
import json
import re
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# ROOT is parents[2] of this file's resolved path, i.e. two levels above the
# directory that contains this file.
ROOT = Path(__file__).resolve().parents[2]
# Directory the fig_agg JSON dumps are read from.
DATA = ROOT / "analysis" / "mb5_pd_ablation"
# Directory the rendered PNGs are written to; created at import time if missing.
OUT = ROOT / "figs" / "mb5_pd_ablation"
OUT.mkdir(parents=True, exist_ok=True)
# PD arm names, in the order they are plotted and searched by pd_best().
PD_ARMS = ["2P+6D", "4P+4D", "6P+2D"]
# Per-arm matplotlib line kwargs, keyed by arm name; splatted into ax.plot().
STYLE = {
"colo": dict(color="k", marker="o", lw=2.4, ls="-", label="colo (8×kv_both)"),
"2P+6D": dict(color="#1f77b4", marker="s", lw=1.6, ls="--", label="PD 2P+6D"),
"4P+4D": dict(color="#2ca02c", marker="^", lw=1.6, ls="--", label="PD 4P+4D"),
"6P+2D": dict(color="#ff7f0e", marker="v", lw=1.6, ls="--", label="PD 6P+2D"),
}
def load(name):
    """Parse and return the JSON document stored at ``DATA / name``.

    The file is opened in a ``with`` block so the handle is closed
    deterministically instead of being left to the garbage collector.
    """
    with open(DATA / name) as fh:
        return json.load(fh)
def by_axis(rows, keyfn):
    """Bucket rows as ``{axis_val: {arm: row}}``.

    ``keyfn`` maps a row's ``"name"`` to its axis value; rows for which it
    returns ``None`` are dropped. A later row with the same axis value and
    arm replaces the earlier one.
    """
    grouped = {}
    for row in rows:
        axis_val = keyfn(row["name"])
        if axis_val is not None:
            bucket = grouped.setdefault(axis_val, {})
            bucket[row["arm"]] = row
    return grouped
def pd_best(armmap, metric="e2e_p90"):
    """Return ``(arm, value)`` for the PD arm with the smallest ``metric``.

    Only arms listed in PD_ARMS that are present in ``armmap`` with a
    non-None metric are considered; on a tie the arm that comes first in
    PD_ARMS wins. Returns ``(None, None)`` when no arm qualifies.
    """
    best_arm, best_val = None, None
    for arm in PD_ARMS:
        if arm not in armmap:
            continue
        val = armmap[arm].get(metric)
        if val is None:
            continue
        # Strict '<' keeps the earliest arm on ties, like min() does.
        if best_arm is None or val < best_val:
            best_arm, best_val = arm, val
    return (best_arm, best_val)
def series(grp, xs, arm, metric):
    """Collect ``metric`` for ``arm`` at each axis value in ``xs``.

    Yields ``None`` at a point where the arm is absent (or where the arm's
    row has no such metric), so the result always has ``len(xs)`` entries.
    """
    values = []
    for x in xs:
        arms_here = grp[x]
        if arm in arms_here:
            values.append(arms_here[arm].get(metric))
        else:
            values.append(None)
    return values
# ---------- Fig 1: reuse axis ----------
def _reuse_pct(name):
"""Reuse % from a `reuse_p{pfx}_d{delta}_{arm}` run name: pfx/(pfx+delta)."""
m = re.search(r"_p(\d+)_d(\d+)", name)
if not m:
return None
pfx, delta = int(m.group(1)), int(m.group(2))
return round(pfx / (pfx + delta) * 100)
def fig_reuse():
    """Render the reuse-axis figure to ``OUT / "fig1_reuse_axis.png"``.

    Left panel: E2E p90 per arm against reuse %. Right panel: the ratio
    colo-p90 / best-PD-p90, with the best PD arm's ``"pu"`` value on a twin
    y-axis. A point where the colo arm is missing or has a null p90 (e.g. a
    flaked or fully collapsed run), or where no PD arm has data, is plotted
    as a gap (None) instead of raising.
    """
    g = by_axis(load("fig1_reuse_fixed.json"), _reuse_pct)
    xs = sorted(g)
    reuse = xs
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4.2))
    for arm in ["colo", *PD_ARMS]:
        ax1.plot(reuse, series(g, xs, arm, "e2e_p90"), **STYLE[arm])
    ax1.set_xlabel("intra-session KV reuse (%) [fixed real prefill, delta=2048]")
    ax1.set_ylabel("E2E latency p90 (s)")
    ax1.set_title("(a) E2E-p90 vs reuse (N=8, delta=2048/out256)")
    ax1.legend(fontsize=8)
    ax1.grid(alpha=.3)

    adv, putil = [], []
    for x in xs:
        # .get() chain: the colo arm may be absent, or present with a null stat.
        colo_p90 = g[x].get("colo", {}).get("e2e_p90")
        best_arm, best_p90 = pd_best(g[x])
        adv.append(colo_p90 / best_p90 if colo_p90 is not None and best_p90 else None)
        putil.append(g[x][best_arm].get("pu") if best_arm else None)

    ax2.plot(reuse, adv, color="purple", marker="D", lw=2, label="PD-best advantage (colo/PD)")
    ax2.axhline(1.0, color="grey", ls=":", lw=1)
    ax2.set_xlabel("intra-session KV reuse (%)")
    ax2.set_ylabel("advantage (>1 = PD wins)")
    ax2b = ax2.twinx()
    ax2b.plot(reuse, putil, color="brown", marker="x", lw=1.4, ls="-.", label="PD-best prefill-GPU util")
    ax2b.set_ylabel("prefill-GPU util (%)", color="brown")
    ax2b.tick_params(axis="y", colors="brown")
    ax2.set_title("(b) advantage erodes; prefill GPUs go idle")
    # Merge the legends of the two y-axes into a single box.
    l1, la1 = ax2.get_legend_handles_labels()
    l2, la2 = ax2b.get_legend_handles_labels()
    ax2.legend(l1 + l2, la1 + la2, fontsize=8, loc="center right")
    ax2.grid(alpha=.3)
    fig.suptitle("Fig 1 — Reuse axis (fixed real prefill delta=2048): PD's edge vs rising cache reuse",
                 fontsize=11, y=1.02)
    fig.tight_layout()
    p = OUT / "fig1_reuse_axis.png"
    fig.savefig(p, dpi=130, bbox_inches="tight")
    print("wrote", p)
# ---------- Fig 2: shape axis ----------
def fig_shape():
    """Render the shape-axis figure to ``OUT / "fig2_shape_axis.png"``.

    Points are keyed by the ``(in, out)`` pair parsed from each run name and
    ordered by ascending input length. Left panel: E2E p90 per arm. Right
    panel: colo-p90 / best-PD-p90, with the lowest PD-arm completion
    percentage (n / req) on a twin y-axis. Missing or null data at a point
    (no colo arm, null colo p90, no PD arm, or req == 0) yields a gap (None)
    instead of an exception.
    """
    g = by_axis(load("fig2.json"),
                lambda n: ((int(m.group(1)), int(m.group(2)))
                           if (m := re.search(r"_in(\d+)_out(\d+)_", n)) else None))
    xs = sorted(g, key=lambda t: t[0])  # ascending input
    labels = [f"in{i}\nout{o}" for i, o in xs]
    xi = list(range(len(xs)))
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4.2))
    for arm in ["colo", *PD_ARMS]:
        ax1.plot(xi, series(g, xs, arm, "e2e_p90"), **STYLE[arm])
    ax1.set_xticks(xi)
    ax1.set_xticklabels(labels, fontsize=7)
    ax1.set_xlabel("shape (decode-heavy → prefill-heavy)")
    ax1.set_ylabel("E2E latency p90 (s)")
    ax1.set_title("(a) E2E-p90 vs shape (N=8, reuse~70%)")
    ax1.legend(fontsize=8)
    ax1.grid(alpha=.3)

    adv, comp = [], []
    for x in xs:
        colo_p90 = g[x].get("colo", {}).get("e2e_p90")
        _, best_p90 = pd_best(g[x])
        adv.append(colo_p90 / best_p90 if colo_p90 is not None and best_p90 else None)
        # completion of the worst PD arm (exposes catastrophic ratio);
        # arms with req == 0 are skipped to avoid dividing by zero.
        ratios = [g[x][arm]["n"] / g[x][arm]["req"]
                  for arm in PD_ARMS if arm in g[x] and g[x][arm].get("req")]
        comp.append(min(ratios) * 100 if ratios else None)

    ax2.plot(xi, adv, color="purple", marker="D", lw=2, label="PD-best advantage (colo/PD)")
    ax2.axhline(1.0, color="grey", ls=":", lw=1)
    ax2.set_xticks(xi)
    ax2.set_xticklabels(labels, fontsize=7)
    ax2.set_xlabel("shape")
    ax2.set_ylabel("advantage (>1 = PD wins)")
    ax2b = ax2.twinx()
    ax2b.plot(xi, comp, color="red", marker="x", lw=1.4, ls="-.", label="worst-PD-arm completion %")
    ax2b.set_ylabel("worst PD completion (%)", color="red")
    ax2b.tick_params(axis="y", colors="red")
    ax2b.set_ylim(80, 101)
    ax2.set_title("(b) advantage peaks mid-sweep; wrong ratio catastrophic at prefill extreme")
    # Merge the legends of the two y-axes into a single box.
    l1, la1 = ax2.get_legend_handles_labels()
    l2, la2 = ax2b.get_legend_handles_labels()
    ax2.legend(l1 + l2, la1 + la2, fontsize=8, loc="lower left")
    ax2.grid(alpha=.3)
    fig.suptitle("Fig 2 — Shape axis: PD wins decode-heavy, ties prefill-heavy; optimal ratio rotates",
                 fontsize=11, y=1.02)
    fig.tight_layout()
    p = OUT / "fig2_shape_axis.png"
    fig.savefig(p, dpi=130, bbox_inches="tight")
    print("wrote", p)
# ---------- Fig 3: concurrency axis ----------
def fig_conc():
    """Render the concurrency-axis figure to ``OUT / "fig3_concurrency_axis.png"``.

    Three panels against the session count N parsed from each run name:
    (a) mean E2E on a log scale with a 10 s reference line, (b) the ``"tps"``
    metric, (c) the ``"apc"`` metric scaled by 100.
    """
    def n_of(run_name):
        # Axis key: the integer between "_N" and the next "_", if present.
        hit = re.search(r"_N(\d+)_", run_name)
        return int(hit.group(1)) if hit else None

    g = by_axis(load("fig3_conc32k.json"), n_of)
    xs = sorted(g)
    arms = ["colo", *PD_ARMS]
    fig, axes = plt.subplots(1, 3, figsize=(15, 4.2))
    ax1, ax2, ax3 = axes

    def label_n_axis(ax):
        # Shared x-axis treatment: one tick per measured N.
        ax.set_xticks(xs)
        ax.set_xticklabels(xs, fontsize=7)
        ax.set_xlabel("concurrent sessions N")

    # (a) mean E2E latency
    for arm in arms:
        ax1.plot(xs, series(g, xs, arm, "e2e_mean"), **STYLE[arm])
    ax1.axhline(10.0, color="red", ls=":", lw=1, label="SLO (mean E2E 10s)")
    ax1.set_yscale("log")
    label_n_axis(ax1)
    ax1.set_ylabel("E2E latency mean (s, log)")
    ax1.set_title("(a) mean-E2E vs concurrency")
    ax1.legend(fontsize=8)
    ax1.grid(alpha=.3, which="both")

    # (b) throughput
    for arm in arms:
        ax2.plot(xs, series(g, xs, arm, "tps"), **STYLE[arm])
    label_n_axis(ax2)
    ax2.set_ylabel("throughput (tok/s)")
    ax2.set_title("(b) TPS: colo scales, PD plateaus/drops")
    ax2.legend(fontsize=8)
    ax2.grid(alpha=.3)

    # (c) prefix-cache hit rate, fraction -> percent (None stays None)
    for arm in arms:
        pct = [None if v is None else v * 100 for v in series(g, xs, arm, "apc")]
        ax3.plot(xs, pct, **STYLE[arm])
    label_n_axis(ax3)
    ax3.set_ylabel("producer prefix-cache hit-rate (%)")
    ax3.set_title("(c) APC vs concurrency")
    ax3.legend(fontsize=8)
    ax3.grid(alpha=.3)

    fig.suptitle("Fig 3 — Concurrency axis (in32768/out128, reuse~0.984): sweep N by 8 to the 10s-SLO ceiling",
                 fontsize=11, y=1.02)
    fig.tight_layout()
    target = OUT / "fig3_concurrency_axis.png"
    fig.savefig(target, dpi=130, bbox_inches="tight")
    print("wrote", target)
if __name__ == "__main__":
    # Render the three figures in order: reuse, shape, concurrency.
    for render in (fig_reuse, fig_shape, fig_conc):
        render()

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Unattended serial PD-ablation campaign: reuse sweep -> conc sweep.
# STRICTLY one driver at a time (the hard lesson): each inner driver brings up and
# tears down its own vLLM per config via scripts/mb5_run_gpu.sh, and the two sweeps
# run sequentially (reuse fully finishes + tears down before conc starts). We verify
# GPUs are clear between sweeps. NO set -e here: a sub-sweep nonzero must NOT skip the
# other sweep; rc is captured and reported. Detached launch writes a DONE marker.

# Without set -e a failed cd would go unnoticed and every relative path below
# would resolve against the wrong directory, so check it explicitly.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "=== CAMPAIGN ABORT: cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh ===" >&2
  exit 1
}
export MB5_VENV="${MB5_VENV:-/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
FS=microbench/fresh_setup

echo "=== CAMPAIGN START $(date) ==="
echo "=== [1/2] REUSE SWEEP (fixed real prefill delta=2048, out=256, reuse 20-95%, N=8) $(date) ==="
bash "$FS/run_reuse_fixed.sh"; rc_reuse=$?
echo "=== reuse sweep rc=$rc_reuse $(date) ==="

sleep 15
echo "--- GPU mem after reuse sweep (expect ~0 before conc) ---"
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [2/2] CONC SWEEP (in=32768 reuse=0.984, balanced N grid 8 16 32 48 64 96 128) $(date) ==="
NLIST="8 16 32 48 64 96 128" bash "$FS/run_conc.sh"; rc_conc=$?
echo "=== conc sweep rc=$rc_conc $(date) ==="
echo "=== CAMPAIGN DONE reuse_rc=$rc_reuse conc_rc=$rc_conc $(date) ==="

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Campaign 2 (2026-05-31): two extra reuse sweeps at out=128 (user request:
# delta=1024/out=128 and delta=2048/out=128), then the capped conc restart.
# STRICTLY one driver at a time; reuse sweeps run uncapped (mild collapse, matches
# the existing d2048/o256 sweep), conc runs with the PD-arm wall-cap. NO set -e.

# Without set -e a failed cd would go unnoticed and every relative path below
# would resolve against the wrong directory, so check it explicitly.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "=== CAMPAIGN2 ABORT: cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh ===" >&2
  exit 1
}
export MB5_VENV="${MB5_VENV:-/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
FS=microbench/fresh_setup

echo "=== CAMPAIGN2 START $(date) ==="
echo "=== [1/3] REUSE delta=1024 out=128 (reuse 0.33-0.97) $(date) ==="
DELTA=1024 OL=128 bash "$FS/run_reuse_fixed.sh"; rc1=$?
echo "=== reuse d1024 o128 rc=$rc1 $(date) ==="
# Pause, then print per-GPU memory use between sweeps.
sleep 12; nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [2/3] REUSE delta=2048 out=128 (reuse 0.20-0.95) $(date) ==="
DELTA=2048 OL=128 bash "$FS/run_reuse_fixed.sh"; rc2=$?
echo "=== reuse d2048 o128 rc=$rc2 $(date) ==="
sleep 12; nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [3/3] CONC capped (PD wall=${CONC_PD_MAXDUR:-600}s, colo uncapped), N 8..128 $(date) ==="
NLIST="8 16 32 48 64 96 128" bash "$FS/run_conc.sh"; rc3=$?
echo "=== conc rc=$rc3 $(date) ==="
echo "=== CAMPAIGN2 DONE reuse_d1024_o128=$rc1 reuse_d2048_o128=$rc2 conc=$rc3 $(date) ==="

View File

@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Campaign 3 (2026-05-31): the uncapped d2048/o128 reuse sweep stalled on a
# collapse-draining high-reuse PD arm (4P+4D @ reuse 0.90, ~1 req/several-min).
# Finish it by re-running ONLY the high-reuse points (0.90, 0.95) WITH the PD
# wall-cap (low-reuse arms already completed and are cap-insensitive). Then run
# the capped conc sweep. STRICTLY serial. NO set -e.

# Without set -e a failed cd would go unnoticed and every relative path below
# would resolve against the wrong directory, so check it explicitly.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "=== CAMPAIGN3 ABORT: cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh ===" >&2
  exit 1
}
export MB5_VENV="${MB5_VENV:-/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
FS=microbench/fresh_setup

echo "=== CAMPAIGN3 START $(date) ==="
echo "=== [1/2] finish reuse d2048/o128: re-run pts pfx=18432,38912 (PD capped 500s) $(date) ==="
DELTA=2048 OL=128 PFXS="18432 38912" REUSE_PD_MAXDUR=500 bash "$FS/run_reuse_fixed.sh"; rc1=$?
echo "=== reuse d2048 o128 finish rc=$rc1 $(date) ==="
# Pause, then print per-GPU memory use before the conc sweep.
sleep 12; nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [2/2] CONC capped (PD wall=600s, colo uncapped), N 8..128 $(date) ==="
NLIST="8 16 32 48 64 96 128" CONC_PD_MAXDUR=600 bash "$FS/run_conc.sh"; rc2=$?
echo "=== conc rc=$rc2 $(date) ==="
echo "=== CAMPAIGN3 DONE reuse_finish=$rc1 conc=$rc2 $(date) ==="

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env bash
# Concurrency axis, agentic-corner config. Supersedes old fig3 (in~8192/out256).
#
# RETUNED 2026-05-31 for realism (C2): total context stays at in=32768, but the
# real per-turn new-prefill shrinks to delta=512 and reuse rises to 0.984 (real
# agentic reuse ->99.6%): prefix 32256 + delta 512, out=128. This corner exposes
# PD's structural tax — colo keeps the 32k resident KV local, while PD must
# KV-transfer the whole 32k context every turn although only 512 tokens are new
# (C2 PD-tax ~250-450x). Closed-loop N is swept by step 8 up to the
# mean-E2E<=SLO ceiling.
#
# Wiring per memory project-mb5-pd-ablation-wiring: .venv_dash0, traces_synth/,
# CONFIG 8C-proxy + PD, MB5_P_ROUTING=session + MB5_COLO_ROUTING=session,
# N=REPLAY_MAX_INFLIGHT closed loop + REPLAY_INTER_TURN_THINK_S,
# REPLAY_NO_REALIZED_PREFIX=1. RUN ONLY ONE DRIVER AT A TIME (shared GPUs/ports).
set -eo pipefail
cd /home/admin/cpfs/wjh/agentic-kv-fresh

: "${MB5_VENV:=/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
export MB5_VENV
VPY="$MB5_VENV/bin/python"

# Workload shape: reuse=0.984, in=32768.
: "${PFX:=32256}"
: "${DELTA:=512}"
: "${OL:=128}"
: "${THINK:=0.5}"
: "${TURNS:=8}"

# N grid: either NSTART..NMAX by NSTEP, or an explicit list in NLIST
# (overrides NSTART/STEP/MAX), e.g. "8 16 32 48 64 96 128".
: "${NSTART:=8}"
: "${NSTEP:=8}"
: "${NMAX:=128}"
: "${NLIST:=}"

# Wall-deadline (s) for PD arms only; bounds collapsed-arm drain (un-run turns
# = failures). colo (8C-proxy) runs UNCAPPED so the headline reference is
# always fully measured.
: "${CONC_PD_MAXDUR:=600}"

: "${SLO:=10.0}"
: "${SESS_PER_N:=4}"
: "${CFGS:=8C-proxy 2P+6D 4P+4D 6P+2D}"
: "${ONLY_N:=}"
#######################################
# Run every config in CFGS at one concurrency level and report the colo
# mean-E2E for that level.
# Globals (read): VPY SESS_PER_N TURNS PFX DELTA OL CFGS CONC_PD_MAXDUR THINK
# Arguments: $1 - N, the closed-loop concurrency (REPLAY_MAX_INFLIGHT)
# Outputs:   stdout carries ONLY the result: the colo arm's e2e_mean as
#            printed by the fig_agg pipeline, or "nan" when the colo arm left
#            no summary. Progress and warnings go to stderr, because callers
#            capture stdout with $(run_N ...) and parse it as a number.
#######################################
run_N() {
  local N="$1"
  local sess=$(( SESS_PER_N * N ))
  local tag="conc32k_N${N}"
  local trace="traces_synth/${tag}.jsonl"
  "$VPY" scripts/gen_synthetic_trace.py --out "$trace" --mode regular \
    --qps "$sess" --duration-s 1 --turns "$TURNS" \
    --prefix-len "$PFX" --delta-len "$DELTA" --output-len "$OL" --seed 42 >/dev/null
  echo "[conc32k] N=$N sess=$sess in=$((PFX+DELTA)) out=$OL -> $trace" >&2

  local cfg dur
  for cfg in $CFGS; do
    echo " -> $cfg" >&2
    dur=""
    if [ "$cfg" != "8C-proxy" ]; then dur="$CONC_PD_MAXDUR"; fi # colo uncapped
    MB5_P_ROUTING=session MB5_COLO_ROUTING=session \
      REPLAY_MAX_INFLIGHT="$N" REPLAY_INTER_TURN_THINK_S="$THINK" REPLAY_NO_REALIZED_PREFIX=1 \
      REPLAY_MAX_DURATION="$dur" \
      CONFIGS="$cfg" REPS=1 TRACE="$trace" RUN_TAG="$tag" \
      bash scripts/mb5_run_gpu.sh >/dev/null 2>&1 || echo " [warn] ${tag}_${cfg} failed" >&2
  done

  # The colo arm is the headline reference: emit its mean E2E (or nan).
  local d="mb5_runs/${tag}_8C-proxy_rep1"
  if [ -f "$d/replay_metrics.summary.json" ]; then
    "$VPY" scripts/fig_agg.py --json "$d" 2>/dev/null \
      | "$VPY" -c "import sys,json;r=json.load(sys.stdin);print(r[0].get('e2e_mean') if r else 'nan')"
  else
    echo nan
  fi
}
#######################################
# Run one concurrency level and isolate its metric.
# run_N's stdout is captured; only the LAST line is the colo mean-E2E. Any
# preceding lines are progress output and are passed through to stdout, so a
# multi-line capture can never end up inside the numeric SLO comparison.
# Arguments: $1 - N
# Globals:   m (written) - the colo mean-E2E token for this N
#######################################
measure_N() {
  local captured
  captured=$(run_N "$1")
  m=${captured##*$'\n'}
  if [ "$captured" != "$m" ]; then
    printf '%s\n' "${captured%$'\n'*}"
  fi
}

# Smoke mode: one N, print the per-arm comparison table, then exit.
if [ -n "$ONLY_N" ]; then
  echo "[conc32k] SMOKE N=$ONLY_N cfgs='$CFGS'"
  t0=$(date +%s); measure_N "$ONLY_N"; t1=$(date +%s)
  echo "[conc32k] SMOKE N=$ONLY_N colo mean-E2E=${m}s wall=$(( t1 - t0 ))s; compare:"
  # Glob intentionally unquoted: expands to every arm's run dir for this N.
  "$VPY" scripts/fig_agg.py mb5_runs/conc32k_N${ONLY_N}_*_rep1 2>&1
  exit 0
fi

if [ -n "$NLIST" ]; then NSEQ="$NLIST"; else NSEQ=$(seq "$NSTART" "$NSTEP" "$NMAX"); fi
for N in $NSEQ; do
  echo "[conc32k] === N=$N ==="
  measure_N "$N"
  echo "[conc32k] N=$N colo mean-E2E=${m}s"
  # Values go in via argv rather than being spliced into the Python source.
  # An unparseable metric (e.g. "None") makes float() fail -> treated as "not over".
  over=$("$VPY" -c 'import sys; print(1 if float(sys.argv[1]) > float(sys.argv[2]) else 0)' \
    "$m" "$SLO" 2>/dev/null || echo 0)
  if [ "$over" = "1" ]; then
    echo "[conc32k] colo crossed SLO ${SLO}s at N=$N -> stop"
    break
  fi
done

# Aggregate every conc32k run dir into the figure JSON.
dirs=()
for d in mb5_runs/conc32k_N*_rep1; do
  if [ -d "$d" ]; then dirs+=("$d"); fi
done
"$VPY" scripts/fig_agg.py --json "${dirs[@]}" > analysis/mb5_pd_ablation/fig3_conc32k.json
echo "[conc32k] done -> analysis/mb5_pd_ablation/fig3_conc32k.json (${#dirs[@]} dirs)"

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env bash
# Reuse axis, DONE RIGHT (controlled variable). Supersedes old fig1.
#
# The REAL (uncached) prefill work is held constant: --delta-len = U is fixed
# and only --prefix-len = C varies, so reuse = C/(C+U). Context grows with
# reuse, but the tokens that must actually be prefilled each turn stay = U.
# Old fig1 held input=8192 and sliced the prefix out of it, so delta shrank 15x
# as reuse rose -> it confounded "more reuse" with "less prefill". This fixes that.
#
# Wiring matches the corrected MB5 stack (see memory project-mb5-pd-ablation-wiring):
# .venv_dash0, traces_synth/, CONFIG 8C-proxy + PD, MB5_P_ROUTING=session,
# N injected via REPLAY_MAX_INFLIGHT (closed loop) + REPLAY_INTER_TURN_THINK_S,
# REPLAY_NO_REALIZED_PREFIX=1 (reuse governed by hash_ids, required for this sweep).
set -eo pipefail
cd /home/admin/cpfs/wjh/agentic-kv-fresh

: "${MB5_VENV:=/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
export MB5_VENV
VPY="$MB5_VENV/bin/python"

# Fixed real prefill per turn (USER-CHOSEN).
: "${DELTA:=2048}"
: "${OL:=256}"
: "${N:=8}"
: "${THINK:=0.5}"
: "${TURNS:=8}"
# Number of sessions (closed-loop: arrival rate is irrelevant, only the count
# matters; ~6 waves at N=8).
: "${NSESS:=48}"
# Cached-prefix lengths to sweep -> reuse .20 .50 .67 .80 .90 .95
: "${PFXS:=512 2048 4096 8192 18432 38912}"
: "${CFGS:=8C-proxy 2P+6D 4P+4D 6P+2D}"
# Wall-deadline (s) for PD arms only (colo uncapped): bounds the collapse-drain
# that stalls high-reuse PD arms (un-run turns = failures, honest
# completion%). 0/empty = off.
: "${REUSE_PD_MAXDUR:=500}"
# Smoke a single prefix then exit.
: "${ONLY_PFX:=}"
#######################################
# Run every config in CFGS at one cached-prefix length.
# Globals (read): VPY DELTA OL NSESS TURNS CFGS REUSE_PD_MAXDUR N THINK
# Arguments: $1 - prefix length (cached tokens C); reuse = C/(C+DELTA)
# Outputs:   progress lines on stdout, per-config failure warnings on stderr
#######################################
run_point() { # <pfx>
  local pfx="$1"
  # The reuse ratio is only used in the log line below. Compute it with the
  # pinned interpreter ($VPY, the one every other step uses) rather than
  # whatever python3 is on PATH, and never let this cosmetic value abort the
  # sweep under `set -e`.
  local reuse
  reuse=$("$VPY" -c "print(f'{$pfx/($pfx+$DELTA):.3f}')" 2>/dev/null) || reuse="?"
  local tag="reuse_p${pfx}_d${DELTA}_o${OL}" # _o${OL} so different output lens don't collide
  local trace="traces_synth/${tag}.jsonl"

  # Closed-loop: pass NSESS as qps with duration 1 so n_sessions = NSESS
  # exactly (gen_regular: n_sessions = int(duration_s * session_qps)).
  "$VPY" scripts/gen_synthetic_trace.py --out "$trace" --mode regular \
    --qps "$NSESS" --duration-s 1 --turns "$TURNS" \
    --prefix-len "$pfx" --delta-len "$DELTA" --output-len "$OL" --seed 42 >/dev/null
  echo "[reuse] pfx=$pfx delta=$DELTA reuse=$reuse in=$((pfx+DELTA)) -> $trace"

  local cfg dur
  for cfg in $CFGS; do
    echo " -> $cfg"
    # Both routings set to session so BOTH colo (kv_both) and PD producers
    # pin a session's turns to one instance and reuse its prefix cache — the
    # fair cache-aware comparison. P_ROUTING is ignored by colo, COLO_ROUTING
    # by PD, so setting both is harmless and symmetric.
    dur=""
    if [ "$cfg" != "8C-proxy" ]; then dur="$REUSE_PD_MAXDUR"; fi # colo uncapped
    MB5_P_ROUTING=session MB5_COLO_ROUTING=session \
      REPLAY_MAX_INFLIGHT="$N" REPLAY_INTER_TURN_THINK_S="$THINK" \
      REPLAY_NO_REALIZED_PREFIX=1 REPLAY_MAX_DURATION="$dur" \
      CONFIGS="$cfg" REPS=1 TRACE="$trace" RUN_TAG="$tag" \
      bash scripts/mb5_run_gpu.sh >/dev/null 2>&1 || echo " [warn] $cfg failed" >&2
  done
}
# Smoke mode: run a single prefix point, print the comparison table, and exit.
if [[ -n "$ONLY_PFX" ]]; then
  echo "[reuse] SMOKE pfx=$ONLY_PFX cfgs='$CFGS'"
  smoke_start=$(date +%s)
  run_point "$ONLY_PFX"
  smoke_end=$(date +%s)
  echo "[reuse] SMOKE done wall=$(( smoke_end - smoke_start ))s; compare:"
  # Glob left unquoted on purpose so it expands to each arm's run dir.
  "$VPY" scripts/fig_agg.py mb5_runs/reuse_p${ONLY_PFX}_d${DELTA}_o${OL}_*_rep1
  exit 0
fi

# Full sweep over every prefix length.
for pfx in $PFXS; do
  run_point "$pfx"
done

# Aggregate ONLY this sweep's dirs (matched by delta+output) so the three
# reuse figures (d2048/o256, d1024/o128, d2048/o128) never cross-contaminate.
dirs=()
for run_dir in mb5_runs/reuse_*_d${DELTA}_o${OL}_*_rep1; do
  if [[ -d "$run_dir" ]]; then
    dirs+=("$run_dir")
  fi
done
OUTJSON="analysis/mb5_pd_ablation/fig1_reuse_d${DELTA}_o${OL}.json"
"$VPY" scripts/fig_agg.py --json "${dirs[@]}" > "$OUTJSON"
echo "[reuse] done -> $OUTJSON (${#dirs[@]} dirs)"