MB5 PD ablation: controlled-variable reuse/conc redo + campaign tooling
Reuse and concurrency axes redone with proper controlled variables, plus
the orchestration used to run them on dash0:
- run_reuse_fixed.sh: hold REAL prefill work (delta) constant, vary only
cached prefix -> reuse = C/(C+U). Supersedes old fig1 (which held
input=8192 and sliced prefix out, confounding "more reuse" with "less
prefill").
- run_conc.sh: agentic-corner config (in=32768, delta=512, reuse=0.984,
out=128) that exposes PD's structural KV-transfer tax. Supersedes old fig3.
- run_campaign{,2,3}.sh, backfill_d2048o128.sh: serial campaign drivers
(strictly one driver at a time), out=128 sweeps, PD wall-cap for
collapse-draining high-reuse arms, and flaked-arm backfill.
- mb5_run_gpu.sh: per-config bring-up / replay / teardown orchestrator.
- plot_pd_crossover.py: render the reuse_compare figures from fig_agg dumps.
- fig_agg.py: tolerate null stats from fully-collapsed arms (an arm with 0
successes writes its stat keys as null, and `dict.get(k, {})` then returns that
null value rather than the `{}` default).
Data: fig1_reuse_fixed.json, fig1_reuse_d{1024,2048}_o128.json
Figs: reuse_compare_AB.png, reuse_compare_ABC.png
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
1
analysis/mb5_pd_ablation/fig1_reuse_d1024_o128.json
Normal file
1
analysis/mb5_pd_ablation/fig1_reuse_d1024_o128.json
Normal file
File diff suppressed because one or more lines are too long
1
analysis/mb5_pd_ablation/fig1_reuse_d2048_o128.json
Normal file
1
analysis/mb5_pd_ablation/fig1_reuse_d2048_o128.json
Normal file
File diff suppressed because one or more lines are too long
1
analysis/mb5_pd_ablation/fig1_reuse_fixed.json
Normal file
1
analysis/mb5_pd_ablation/fig1_reuse_fixed.json
Normal file
File diff suppressed because one or more lines are too long
BIN
figs/mb5_pd_ablation/reuse_compare_AB.png
Normal file
BIN
figs/mb5_pd_ablation/reuse_compare_AB.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 114 KiB |
BIN
figs/mb5_pd_ablation/reuse_compare_ABC.png
Normal file
BIN
figs/mb5_pd_ablation/reuse_compare_ABC.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 99 KiB |
35
microbench/fresh_setup/backfill_d2048o128.sh
Normal file
35
microbench/fresh_setup/backfill_d2048o128.sh
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/usr/bin/env bash
# Backfill the d2048/o128 reuse arms that vLLM startup-flaked out (transient
# "Engine core initialization failed", intermittent). Retry up to 4x each with a
# clean teardown between attempts; HEALTH_MAX_TRIES=180 so a crashed launch fails
# in ~6min (not 10) before retrying. Then re-aggregate the figure JSON.
#
# Env:
#   MISS  space-separated "<prefix-len>:<config>" pairs to backfill
#         (default: the four arms listed below).

# Every path below is relative to the repo root; bail out rather than run
# the whole backfill from the wrong directory.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "[backfill] cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh" >&2
  exit 1
}
export MB5_VENV=$PWD/.venv_dash0
export HEALTH_MAX_TRIES=180
VPY=$MB5_VENV/bin/python
DELTA=2048; OL=128; N=8; THINK=0.5; TURNS=8; NSESS=48
MISS="${MISS:-4096:6P+2D 18432:6P+2D 38912:8C-proxy 38912:6P+2D}"
OUT_JSON=analysis/mb5_pd_ablation/fig1_reuse_d2048_o128.json
echo "=== BACKFILL START $(date) miss='$MISS' ==="

# MISS is intentionally word-split: one "<pfx>:<cfg>" token per arm.
for pc in $MISS; do
  pfx=${pc%%:*}; cfg=${pc##*:}
  tag="reuse_p${pfx}_d${DELTA}_o${OL}"; trace="traces_synth/${tag}.jsonl"
  # Without a trace every attempt below is doomed, so skip the arm instead of
  # burning four launch timeouts on it.
  if ! "$VPY" scripts/gen_synthetic_trace.py --out "$trace" --mode regular --qps "$NSESS" --duration-s 1 \
      --turns "$TURNS" --prefix-len "$pfx" --delta-len "$DELTA" --output-len "$OL" --seed 42 >/dev/null 2>&1; then
    echo "[backfill] trace generation failed for $tag; skipping $cfg" >&2
    continue
  fi
  # Only the PD arms get a wall-clock cap; the colo arm (8C-proxy) runs uncapped.
  dur=""
  if [ "$cfg" != "8C-proxy" ]; then dur=500; fi
  ok=0
  for attempt in 1 2 3 4; do
    echo "[backfill] $tag $cfg attempt=$attempt $(date +%T)"
    MB5_P_ROUTING=session MB5_COLO_ROUTING=session \
    REPLAY_MAX_INFLIGHT=$N REPLAY_INTER_TURN_THINK_S=$THINK REPLAY_NO_REALIZED_PREFIX=1 REPLAY_MAX_DURATION="$dur" \
    CONFIGS="$cfg" REPS=1 TRACE="$trace" RUN_TAG="$tag" \
    bash scripts/mb5_run_gpu.sh >/dev/null 2>&1
    # Success is judged by the presence of the summary file, not the exit code.
    if [ -f "mb5_runs/${tag}_${cfg}_rep1/replay_metrics.summary.json" ]; then
      echo " OK $cfg pfx=$pfx attempt=$attempt"; ok=1; break
    fi
    echo " FAILED attempt=$attempt; cleanup+retry"
    MB5_VENV=$PWD/.venv_dash0 bash scripts/mb5_launch.sh stop >/dev/null 2>&1; sleep 5
  done
  if [ "$ok" = 0 ]; then echo "[backfill] GAVE UP $tag $cfg"; fi
done

# Re-aggregate every completed d2048/o128 run directory.
dirs=()
for d in mb5_runs/reuse_*_d2048_o128_*_rep1; do
  if [ -f "$d/replay_metrics.summary.json" ]; then dirs+=("$d"); fi
done
# Write to a temp file first so a failed or empty aggregation never truncates
# the previously committed figure data.
if [ "${#dirs[@]}" -eq 0 ]; then
  echo "[backfill] no completed run dirs; leaving $OUT_JSON untouched" >&2
elif "$VPY" scripts/fig_agg.py --json "${dirs[@]}" > "${OUT_JSON}.tmp"; then
  mv -- "${OUT_JSON}.tmp" "$OUT_JSON"
else
  echo "[backfill] fig_agg failed; leaving $OUT_JSON untouched" >&2
  rm -f -- "${OUT_JSON}.tmp"
fi
echo "=== BACKFILL DONE dirs=${#dirs[@]}/24 $(date) ==="
|
||||||
@@ -100,11 +100,13 @@ def main():
|
|||||||
continue
|
continue
|
||||||
s = json.load(open(sp))
|
s = json.load(open(sp))
|
||||||
arm, pg, dg, ports = arm_of(run.name)
|
arm, pg, dg, ports = arm_of(run.name)
|
||||||
lat = s.get("latency_stats_s", {})
|
# `or {}` because a fully-collapsed arm (0 successes) writes these as null,
|
||||||
ttft = s.get("ttft_stats_s", {})
|
# and dict.get(k, {}) returns null (not {}) when the key exists with value null.
|
||||||
tpot = s.get("tpot_stats_s", {})
|
lat = s.get("latency_stats_s") or {}
|
||||||
|
ttft = s.get("ttft_stats_s") or {}
|
||||||
|
tpot = s.get("tpot_stats_s") or {}
|
||||||
wall = s.get("wall_clock_s") or 1.0
|
wall = s.get("wall_clock_s") or 1.0
|
||||||
out = s.get("actual_output_tokens_stats", {})
|
out = s.get("actual_output_tokens_stats") or {}
|
||||||
n = s.get("success_count", 0); req = s.get("request_count", 0)
|
n = s.get("success_count", 0); req = s.get("request_count", 0)
|
||||||
tot_out = out.get("count", 0) * out.get("mean", 0)
|
tot_out = out.get("count", 0) * out.get("mean", 0)
|
||||||
tps = tot_out / wall
|
tps = tot_out / wall
|
||||||
|
|||||||
144
microbench/fresh_setup/mb5_run_gpu.sh
Executable file
144
microbench/fresh_setup/mb5_run_gpu.sh
Executable file
@@ -0,0 +1,144 @@
|
|||||||
|
#!/usr/bin/env bash
# MB5 orchestrator: walks every CONFIG x rep cell; each cell brings the stack
# up, replays a trace against it, gathers KV snapshots + replayer metrics, and
# tears the stack down again.
#
# Intended host: dash1, or any machine with cpfs mounted at
# /home/admin/cpfs/wjh/.
#
# Environment (defaults in parentheses):
#   CONFIGS        space-separated MB5 configs ("8C 6P+2D 4P+4D 2P+6D")
#   REPS           repetitions per config (3)
#   TRACE          trace JSONL path
#                  (/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl)
#   RUN_TAG        output root tag ($(date +%Y%m%d_%H%M%S))
#   REQUEST_LIMIT  optional cap on replayed requests (unset/empty = no cap)
#   MB5_VENV       virtualenv to use (${FRESH_ROOT}/.venv)
#   MODEL_NAME     model path handed to the replayer

set -eo pipefail

FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# A second host can point MB5_VENV at its own isolated venv clone
# (see mb5_launch.sh).
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
REPLAYER_DIR="${FRESH_ROOT}/replayer"

# Fill in defaults for anything the caller left unset or empty.
: "${CONFIGS:=8C 6P+2D 4P+4D 2P+6D}"
: "${REPS:=3}"
: "${TRACE:=/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl}"
: "${RUN_TAG:=$(date +%Y%m%d_%H%M%S)}"
: "${MODEL_NAME:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Expands to "--request-limit <n>" only when REQUEST_LIMIT is non-empty;
# consumers word-split it on purpose.
REQUEST_LIMIT_ARG="${REQUEST_LIMIT:+--request-limit ${REQUEST_LIMIT}}"

OUT_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_TAG}"
mkdir -p "${OUT_ROOT}"

# Echo the effective settings so they land at the top of the driver log.
for setting in RUN_TAG OUT_ROOT CONFIGS REPS TRACE; do
  echo "[mb5-run] ${setting}=${!setting}"
done
|
||||||
|
|
||||||
|
#######################################
# Run one (config, rep) cell: start the stack, replay the trace against it,
# record wall-clock and per-instance prefix-cache counters, then tear down.
# Globals (read): RUN_TAG FRESH_ROOT OUT_ROOT LAUNCH VENV TRACE MODEL_NAME
#                 REQUEST_LIMIT_ARG
# Arguments:
#   $1 - MB5 config name (e.g. 8C, 6P+2D)
#   $2 - repetition index
# Outputs:
#   Progress lines on stdout; launch/replay/stop logs under OUT_ROOT; replay
#   metrics, gpu_util.csv, wall_clock_s.txt, run_window.json and
#   instance_apc.txt under mb5_runs/<RUN_TAG>_<config>_rep<rep>/.
# Returns:
#   0 on success; 1 if launch fails, no ENDPOINTS line is found, or the
#   replay exits non-zero.
#######################################
run_one() {
  local config="$1" rep="$2"
  local label="${RUN_TAG}_${config}_rep${rep}"
  local rundir="${FRESH_ROOT}/mb5_runs/${label}"
  local logbase="${OUT_ROOT}/${config}_rep${rep}"
  echo ""
  echo "======== ${config} rep${rep} ========"

  # Launch
  if ! CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" start > "${logbase}_launch.log" 2>&1; then
    echo "[mb5-run] LAUNCH FAILED for ${config} rep${rep}; see ${logbase}_launch.log"
    return 1
  fi

  # Extract ENDPOINTS line emitted by mb5_launch.sh. `|| true` keeps a
  # no-match grep (non-zero under pipefail) from aborting before the
  # diagnostic below can be printed.
  local endpoints
  endpoints=$(grep "^ENDPOINTS=" "${logbase}_launch.log" | tail -1 | cut -d= -f2-) || true
  if [ -z "${endpoints}" ]; then
    echo "[mb5-run] ERROR: no ENDPOINTS in launch log"
    # Tear down with the same CONFIG/RUN_LABEL the normal path uses.
    CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" stop > "${logbase}_stop.log" 2>&1 || true
    return 1
  fi
  echo "[mb5-run] endpoints: ${endpoints}"

  # Replay
  source "${VENV}/bin/activate"
  local replay_out="${rundir}/replay_metrics.jsonl"
  mkdir -p "$(dirname "${replay_out}")"
  # per-GPU utilization timeseries over the replay window (2s sampling)
  bash "${FRESH_ROOT}/microbench/fresh_setup/gpu_monitor.sh" "${rundir}/gpu_util.csv" 2 >/dev/null 2>&1 &
  local gpu_mon=$!

  local t0 t1 wall_clock_s replay_rc=0
  t0=$(date +%s.%N)
  # REQUEST_LIMIT_ARG is left unquoted on purpose: it is either empty or
  # the two words "--request-limit <n>".
  PYTHONPATH="${FRESH_ROOT}" python -m replayer \
    --endpoint "${endpoints}" \
    --trace "${TRACE}" \
    --output "${replay_out}" \
    --model "${MODEL_NAME}" \
    ${REQUEST_LIMIT_ARG} \
    > "${logbase}_replay.log" 2>&1 || replay_rc=$?
  t1=$(date +%s.%N)
  # Declared separately from the assignment so `local` cannot mask a failure.
  wall_clock_s=$(python -c "print(${t1} - ${t0})")
  kill "${gpu_mon}" 2>/dev/null || true

  if [ "${replay_rc}" -ne 0 ]; then
    echo "[mb5-run] REPLAY FAILED after ${wall_clock_s} s; see ${logbase}_replay.log"
    CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" stop > "${logbase}_stop.log" 2>&1 || true
    return 1
  fi

  echo "[mb5-run] replay done in ${wall_clock_s}s"
  echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"
  printf '{"t_start_unix":%s,"t_end_unix":%s}\n' "${t0}" "${t1}" > "${rundir}/run_window.json"

  # Per-instance prefix-cache counters, scraped from each backend BEFORE
  # teardown. For PD this is the only honest reuse signal: producer ports
  # (the low ones) show cross-turn prefix-cache hits; the consumer's
  # per-request cached_tokens is meaningless (it counts transferred KV).
  # Best effort: ports that do not answer are skipped, and the whole scrape
  # is allowed to fail.
  local p m q h
  {
    for p in 8000 8001 8002 8003 8004 8005 8006 8007; do
      m=$(curl -s --noproxy '*' "http://127.0.0.1:${p}/metrics" 2>/dev/null) || continue
      q=$(printf '%s' "$m" | awk '/^vllm:prefix_cache_queries_total/{print $2; exit}')
      h=$(printf '%s' "$m" | awk '/^vllm:prefix_cache_hits_total/{print $2; exit}')
      [ -n "${q}" ] && echo "port=${p} queries=${q} hits=${h}"
    done
  } > "${rundir}/instance_apc.txt" 2>/dev/null || true

  # Stop launch (cleans up vllm + proxy; reverts patch on last call)
  CONFIG="${config}" RUN_LABEL="${label}" \
    bash "${LAUNCH}" stop > "${logbase}_stop.log" 2>&1 || true

  sleep 10 # cooldown so GPUs settle before next config
  echo "[mb5-run] DONE ${config} rep${rep}"
}
|
||||||
|
|
||||||
|
# Pre-flight: bail out early when the launcher, the replayer package, or the
# trace file cannot be found.
[ -f "${LAUNCH}" ] || { echo "missing ${LAUNCH}"; exit 1; }
[ -d "${REPLAYER_DIR}" ] || { echo "missing ${REPLAYER_DIR}"; exit 1; }
[ -f "${TRACE}" ] || { echo "missing trace ${TRACE}"; exit 1; }

# Walk the CONFIG x rep grid. A failing cell is counted, never fatal.
failures=0
for config in ${CONFIGS}; do
  rep=1
  while (( rep <= REPS )); do
    run_one "${config}" "${rep}" || failures=$((failures+1))
    rep=$((rep+1))
  done
done

# Defensive final patch revert; mb5_launch.sh stop is expected to revert too.
python "${SCRIPT_DIR}/instrument_kv_snapshot.py" --revert --venv "${VENV}" 2>/dev/null || true

echo ""
echo "======== ALL CONFIGS DONE ========"
echo "failures: ${failures}"
echo "results under: ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_*"
echo "to plot: python plot_kv_pool_timeline.py --run-tag ${RUN_TAG}"
|
||||||
184
microbench/fresh_setup/plot_pd_crossover.py
Normal file
184
microbench/fresh_setup/plot_pd_crossover.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
"""Render the three PD-vs-colo crossover figures from fig_agg JSON dumps.
|
||||||
|
|
||||||
|
Inputs (produced by `fig_agg.py --json`):
|
||||||
|
analysis/mb5_pd_ablation/fig1_reuse_fixed.json reuse axis (N=8, FIXED real
|
||||||
|
prefill delta=2048; vary cached prefix -> reuse = pfx/(pfx+delta).
|
||||||
|
Controlled-variable: real new-prefill work is constant across the sweep,
|
||||||
|
only the cached fraction (and total context) grows. Supersedes the old
|
||||||
|
fig1.json, which held input=8192 and sliced prefix out of it so delta
|
||||||
|
shrank 15x as reuse rose — a confound, not a pure reuse axis.)
|
||||||
|
analysis/mb5_pd_ablation/fig2.json shape axis (N=8, reuse~70%)
|
||||||
|
analysis/mb5_pd_ablation/fig3_conc32k.json concurrency (in32768/out128,
|
||||||
|
reuse~0.984 = 32256 resident + 512 real new-prefill per turn; retuned
|
||||||
|
2026-05-31 to the agentic corner so PD pays the full-context per-turn
|
||||||
|
KV-transfer tax while colo keeps it resident; vary N by step 8 up to the
|
||||||
|
mean-E2E<=10s SLO ceiling)
|
||||||
|
|
||||||
|
Each figure overlays colo + the three PD ratios and marks the PD-best advantage.
|
||||||
|
All three share the corrected (uncontaminated, e13391e-gated-off) stack.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import matplotlib
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# Repository root: two directory levels above the folder holding this script.
ROOT = Path(__file__).resolve().parents[2]
# fig_agg JSON dumps are read from DATA; rendered PNGs are written to OUT.
DATA = ROOT.joinpath("analysis", "mb5_pd_ablation")
OUT = ROOT.joinpath("figs", "mb5_pd_ablation")
OUT.mkdir(parents=True, exist_ok=True)

# PD arms in the order they are plotted and searched for the best arm.
PD_ARMS = ["2P+6D", "4P+4D", "6P+2D"]
# Per-arm matplotlib line kwargs, shared by every figure.
STYLE = {
    "colo": {"color": "k", "marker": "o", "lw": 2.4, "ls": "-", "label": "colo (8×kv_both)"},
    "2P+6D": {"color": "#1f77b4", "marker": "s", "lw": 1.6, "ls": "--", "label": "PD 2P+6D"},
    "4P+4D": {"color": "#2ca02c", "marker": "^", "lw": 1.6, "ls": "--", "label": "PD 4P+4D"},
    "6P+2D": {"color": "#ff7f0e", "marker": "v", "lw": 1.6, "ls": "--", "label": "PD 6P+2D"},
}
|
||||||
|
|
||||||
|
|
||||||
|
def load(name):
    """Parse and return the JSON document stored at ``DATA / name``.

    The file is opened in a ``with`` block so the handle is closed
    deterministically instead of being left to the garbage collector.
    """
    with open(DATA / name) as fh:
        return json.load(fh)
|
||||||
|
|
||||||
|
|
||||||
|
def by_axis(rows, keyfn):
    """Bucket rows into ``{axis_val: {arm: row}}``.

    ``keyfn`` maps a row's ``"name"`` to its axis value; rows for which it
    returns ``None`` are dropped. A later row with the same axis value and
    arm replaces the earlier one.
    """
    grouped = {}
    for row in rows:
        axis_val = keyfn(row["name"])
        if axis_val is not None:
            bucket = grouped.setdefault(axis_val, {})
            bucket[row["arm"]] = row
    return grouped
|
||||||
|
|
||||||
|
|
||||||
|
def pd_best(armmap, metric="e2e_p90"):
    """Return ``(arm, value)`` for the PD arm with the smallest ``metric``.

    Arms missing from ``armmap`` and arms whose metric is absent or ``None``
    are ignored. Ties go to the arm listed first in ``PD_ARMS``. Returns
    ``(None, None)`` when no PD arm has a usable value.
    """
    best_arm, best_val = None, None
    for arm in PD_ARMS:
        if arm not in armmap:
            continue
        val = armmap[arm].get(metric)
        if val is None:
            continue
        if best_arm is None or val < best_val:
            best_arm, best_val = arm, val
    return best_arm, best_val
|
||||||
|
|
||||||
|
|
||||||
|
def series(grp, xs, arm, metric):
    """Collect ``metric`` for ``arm`` at each axis value in ``xs``.

    A point is ``None`` where the arm has no row at that axis value, or where
    the row lacks the metric.
    """
    values = []
    for x in xs:
        cell = grp[x]
        values.append(cell[arm].get(metric) if arm in cell else None)
    return values
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- Fig 1: reuse axis ----------
|
||||||
|
def _reuse_pct(name):
|
||||||
|
"""Reuse % from a `reuse_p{pfx}_d{delta}_{arm}` run name: pfx/(pfx+delta)."""
|
||||||
|
m = re.search(r"_p(\d+)_d(\d+)", name)
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
pfx, delta = int(m.group(1)), int(m.group(2))
|
||||||
|
return round(pfx / (pfx + delta) * 100)
|
||||||
|
|
||||||
|
|
||||||
|
def fig_reuse():
    """Render Fig 1 (reuse axis) from ``fig1_reuse_fixed.json``.

    Left panel: E2E-p90 vs reuse % for colo and every PD arm.
    Right panel: PD-best advantage (colo p90 / best PD p90), with the ``pu``
    value of the best PD arm on a twin y-axis.

    A reuse point contributes ``None`` (a gap in the line) to the advantage
    curve when the colo arm is missing, when colo's ``e2e_p90`` is null (a
    fully collapsed arm), or when no PD arm has a usable value. Previously the
    first two cases raised ``KeyError`` / ``TypeError`` and aborted the render.
    Writes ``fig1_reuse_axis.png`` into ``OUT``.
    """
    g = by_axis(load("fig1_reuse_fixed.json"), _reuse_pct)
    xs = sorted(g)
    reuse = xs
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4.2))
    for arm in ["colo", *PD_ARMS]:
        ax1.plot(reuse, series(g, xs, arm, "e2e_p90"), **STYLE[arm])
    ax1.set_xlabel("intra-session KV reuse (%) [fixed real prefill, delta=2048]")
    ax1.set_ylabel("E2E latency p90 (s)")
    ax1.set_title("(a) E2E-p90 vs reuse (N=8, delta=2048/out256)")
    ax1.legend(fontsize=8); ax1.grid(alpha=.3)

    adv, putil = [], []
    for x in xs:
        # Tolerate a missing colo row and a null colo metric alike.
        co = g[x].get("colo", {}).get("e2e_p90")
        # One lookup yields both the winning arm and its value.
        best_arm, best = pd_best(g[x])
        adv.append(co / best if co is not None and best else None)
        putil.append(g[x][best_arm].get("pu") if best_arm else None)
    ax2.plot(reuse, adv, color="purple", marker="D", lw=2, label="PD-best advantage (colo/PD)")
    ax2.axhline(1.0, color="grey", ls=":", lw=1)
    ax2.set_xlabel("intra-session KV reuse (%)"); ax2.set_ylabel("advantage (>1 = PD wins)")
    ax2b = ax2.twinx()
    ax2b.plot(reuse, putil, color="brown", marker="x", lw=1.4, ls="-.", label="PD-best prefill-GPU util")
    ax2b.set_ylabel("prefill-GPU util (%)", color="brown"); ax2b.tick_params(axis="y", colors="brown")
    ax2.set_title("(b) advantage erodes; prefill GPUs go idle")
    # Merge both y-axes' handles into a single legend.
    l1, la1 = ax2.get_legend_handles_labels(); l2, la2 = ax2b.get_legend_handles_labels()
    ax2.legend(l1 + l2, la1 + la2, fontsize=8, loc="center right"); ax2.grid(alpha=.3)
    fig.suptitle("Fig 1 — Reuse axis (fixed real prefill delta=2048): PD's edge vs rising cache reuse",
                 fontsize=11, y=1.02)
    fig.tight_layout(); p = OUT / "fig1_reuse_axis.png"; fig.savefig(p, dpi=130, bbox_inches="tight")
    print("wrote", p)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- Fig 2: shape axis ----------
|
||||||
|
def fig_shape():
    """Render Fig 2 (shape axis) from ``fig2.json``.

    Runs are keyed by the ``(input, output)`` pair parsed from
    ``_in<digits>_out<digits>_`` in the run name and ordered by ascending
    input length.

    Left panel: E2E-p90 per shape for colo and every PD arm.
    Right panel: PD-best advantage (colo p90 / best PD p90), with the
    completion % (``n / req``) of the worst PD arm on a twin y-axis.

    Points are left as ``None`` (a gap) instead of raising when the colo row
    or its p90 is missing/null, when no PD arm exists at a shape, or when a PD
    arm reports ``req == 0``. Writes ``fig2_shape_axis.png`` into ``OUT``.
    """
    g = by_axis(load("fig2.json"),
                lambda n: ((int(m.group(1)), int(m.group(2)))
                           if (m := re.search(r"_in(\d+)_out(\d+)_", n)) else None))
    xs = sorted(g, key=lambda t: t[0])  # ascending input
    labels = [f"in{i}\nout{o}" for i, o in xs]
    xi = list(range(len(xs)))
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 4.2))
    for arm in ["colo", *PD_ARMS]:
        ax1.plot(xi, series(g, xs, arm, "e2e_p90"), **STYLE[arm])
    ax1.set_xticks(xi); ax1.set_xticklabels(labels, fontsize=7)
    ax1.set_xlabel("shape (decode-heavy → prefill-heavy)"); ax1.set_ylabel("E2E latency p90 (s)")
    ax1.set_title("(a) E2E-p90 vs shape (N=8, reuse~70%)")
    ax1.legend(fontsize=8); ax1.grid(alpha=.3)

    adv, comp = [], []
    for x in xs:
        # Tolerate a missing colo row and a null colo metric alike.
        co = g[x].get("colo", {}).get("e2e_p90")
        _, b = pd_best(g[x])
        adv.append(co / b if co is not None and b else None)
        # completion of the worst PD arm (exposes catastrophic ratio); arms
        # with no issued requests have no defined completion and are skipped.
        ratios = [g[x][arm]["n"] / g[x][arm]["req"]
                  for arm in PD_ARMS if arm in g[x] and g[x][arm].get("req")]
        comp.append(min(ratios) * 100 if ratios else None)
    ax2.plot(xi, adv, color="purple", marker="D", lw=2, label="PD-best advantage (colo/PD)")
    ax2.axhline(1.0, color="grey", ls=":", lw=1)
    ax2.set_xticks(xi); ax2.set_xticklabels(labels, fontsize=7)
    ax2.set_xlabel("shape"); ax2.set_ylabel("advantage (>1 = PD wins)")
    ax2b = ax2.twinx()
    ax2b.plot(xi, comp, color="red", marker="x", lw=1.4, ls="-.", label="worst-PD-arm completion %")
    ax2b.set_ylabel("worst PD completion (%)", color="red"); ax2b.tick_params(axis="y", colors="red")
    ax2b.set_ylim(80, 101)
    ax2.set_title("(b) advantage peaks mid-sweep; wrong ratio catastrophic at prefill extreme")
    # Merge both y-axes' handles into a single legend.
    l1, la1 = ax2.get_legend_handles_labels(); l2, la2 = ax2b.get_legend_handles_labels()
    ax2.legend(l1 + l2, la1 + la2, fontsize=8, loc="lower left"); ax2.grid(alpha=.3)
    fig.suptitle("Fig 2 — Shape axis: PD wins decode-heavy, ties prefill-heavy; optimal ratio rotates",
                 fontsize=11, y=1.02)
    fig.tight_layout(); p = OUT / "fig2_shape_axis.png"; fig.savefig(p, dpi=130, bbox_inches="tight")
    print("wrote", p)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------- Fig 3: concurrency axis ----------
|
||||||
|
def fig_conc():
    """Render Fig 3 (concurrency axis) from ``fig3_conc32k.json``.

    Runs are keyed by the integer N parsed from ``_N<digits>_`` in the run
    name. Three panels share the N axis: mean E2E latency (log scale, with
    the 10 s SLO line), throughput in tok/s, and the ``apc`` value scaled to
    percent. Writes ``fig3_concurrency_axis.png`` into ``OUT``.
    """
    def n_of(run_name):
        hit = re.search(r"_N(\d+)_", run_name)
        return int(hit.group(1)) if hit else None

    grouped = by_axis(load("fig3_conc32k.json"), n_of)
    ns = sorted(grouped)
    arms = ["colo", *PD_ARMS]
    fig, (ax_lat, ax_tps, ax_apc) = plt.subplots(1, 3, figsize=(15, 4.2))

    # (a) mean end-to-end latency, log scale, with the SLO ceiling
    for arm in arms:
        ax_lat.plot(ns, series(grouped, ns, arm, "e2e_mean"), **STYLE[arm])
    ax_lat.axhline(10.0, color="red", ls=":", lw=1, label="SLO (mean E2E 10s)")
    ax_lat.set_yscale("log")
    ax_lat.set_xticks(ns)
    ax_lat.set_xticklabels(ns, fontsize=7)
    ax_lat.set_xlabel("concurrent sessions N")
    ax_lat.set_ylabel("E2E latency mean (s, log)")
    ax_lat.set_title("(a) mean-E2E vs concurrency")
    ax_lat.legend(fontsize=8)
    ax_lat.grid(alpha=.3, which="both")

    # (b) throughput
    for arm in arms:
        ax_tps.plot(ns, series(grouped, ns, arm, "tps"), **STYLE[arm])
    ax_tps.set_xticks(ns)
    ax_tps.set_xticklabels(ns, fontsize=7)
    ax_tps.set_xlabel("concurrent sessions N")
    ax_tps.set_ylabel("throughput (tok/s)")
    ax_tps.set_title("(b) TPS: colo scales, PD plateaus/drops")
    ax_tps.legend(fontsize=8)
    ax_tps.grid(alpha=.3)

    # (c) prefix-cache hit-rate, converted from fraction to percent
    for arm in arms:
        hit_pct = []
        for frac in series(grouped, ns, arm, "apc"):
            hit_pct.append(frac * 100 if frac is not None else None)
        ax_apc.plot(ns, hit_pct, **STYLE[arm])
    ax_apc.set_xticks(ns)
    ax_apc.set_xticklabels(ns, fontsize=7)
    ax_apc.set_xlabel("concurrent sessions N")
    ax_apc.set_ylabel("producer prefix-cache hit-rate (%)")
    ax_apc.set_title("(c) APC vs concurrency")
    ax_apc.legend(fontsize=8)
    ax_apc.grid(alpha=.3)

    fig.suptitle("Fig 3 — Concurrency axis (in32768/out128, reuse~0.984): sweep N by 8 to the 10s-SLO ceiling",
                 fontsize=11, y=1.02)
    fig.tight_layout()
    target = OUT / "fig3_concurrency_axis.png"
    fig.savefig(target, dpi=130, bbox_inches="tight")
    print("wrote", target)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Render the figures in order: reuse, shape, concurrency.
    for render in (fig_reuse, fig_shape, fig_conc):
        render()
|
||||||
26
microbench/fresh_setup/run_campaign.sh
Normal file
26
microbench/fresh_setup/run_campaign.sh
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env bash
# Unattended serial PD-ablation campaign: reuse sweep -> conc sweep.
# STRICTLY one driver at a time (the hard lesson): each inner driver brings up and
# tears down its own vLLM per config via scripts/mb5_run_gpu.sh, and the two sweeps
# run sequentially (reuse fully finishes + tears down before conc starts). GPU
# memory is printed between sweeps so a leftover process is visible in the log.
# NO set -e here: a sub-sweep nonzero must NOT skip the other sweep; rc is captured
# and reported. The final "CAMPAIGN DONE" line doubles as the completion marker
# for a detached launch.

# Without set -e a failed cd would go unnoticed and both sweeps would be
# attempted from the wrong directory, so guard it explicitly.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "[campaign] cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh" >&2
  exit 1
}
export MB5_VENV="${MB5_VENV:-/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
FS=microbench/fresh_setup

echo "=== CAMPAIGN START $(date) ==="

echo "=== [1/2] REUSE SWEEP (fixed real prefill delta=2048, out=256, reuse 20-95%, N=8) $(date) ==="
bash "$FS/run_reuse_fixed.sh"; rc_reuse=$?
echo "=== reuse sweep rc=$rc_reuse $(date) ==="

sleep 15
echo "--- GPU mem after reuse sweep (expect ~0 before conc) ---"
nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [2/2] CONC SWEEP (in=32768 reuse=0.984, balanced N grid 8 16 32 48 64 96 128) $(date) ==="
NLIST="8 16 32 48 64 96 128" bash "$FS/run_conc.sh"; rc_conc=$?
echo "=== conc sweep rc=$rc_conc $(date) ==="

echo "=== CAMPAIGN DONE reuse_rc=$rc_reuse conc_rc=$rc_conc $(date) ==="
|
||||||
26
microbench/fresh_setup/run_campaign2.sh
Normal file
26
microbench/fresh_setup/run_campaign2.sh
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env bash
# Campaign 2 (2026-05-31): two extra reuse sweeps at out=128 (user request:
# delta=1024/out=128 and delta=2048/out=128), then the capped conc restart.
# STRICTLY one driver at a time; reuse sweeps were meant to run uncapped (mild
# collapse, matches the existing d2048/o256 sweep), conc runs with the PD-arm
# wall-cap. NO set -e.
# NOTE(review): run_reuse_fixed.sh defaults REUSE_PD_MAXDUR to 500, so the reuse
# sweeps below are capped unless the caller overrides it; its comment lists 0 as
# "off" -- confirm against run_point before relying on that.

# Without set -e a failed cd would go unnoticed and all three sweeps would
# be attempted from the wrong directory, so guard it explicitly.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "[campaign2] cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh" >&2
  exit 1
}
export MB5_VENV="${MB5_VENV:-/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
FS=microbench/fresh_setup

echo "=== CAMPAIGN2 START $(date) ==="

echo "=== [1/3] REUSE delta=1024 out=128 (reuse 0.33-0.97) $(date) ==="
DELTA=1024 OL=128 bash "$FS/run_reuse_fixed.sh"; rc1=$?
echo "=== reuse d1024 o128 rc=$rc1 $(date) ==="
# Short settle, then print GPU memory so a leftover process shows in the log.
sleep 12; nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [2/3] REUSE delta=2048 out=128 (reuse 0.20-0.95) $(date) ==="
DELTA=2048 OL=128 bash "$FS/run_reuse_fixed.sh"; rc2=$?
echo "=== reuse d2048 o128 rc=$rc2 $(date) ==="
sleep 12; nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [3/3] CONC capped (PD wall=${CONC_PD_MAXDUR:-600}s, colo uncapped), N 8..128 $(date) ==="
NLIST="8 16 32 48 64 96 128" bash "$FS/run_conc.sh"; rc3=$?
echo "=== conc rc=$rc3 $(date) ==="

echo "=== CAMPAIGN2 DONE reuse_d1024_o128=$rc1 reuse_d2048_o128=$rc2 conc=$rc3 $(date) ==="
|
||||||
20
microbench/fresh_setup/run_campaign3.sh
Normal file
20
microbench/fresh_setup/run_campaign3.sh
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
#!/usr/bin/env bash
# Campaign 3 (2026-05-31): the uncapped d2048/o128 reuse sweep stalled on a
# collapse-draining high-reuse PD arm (4P+4D @ reuse 0.90, ~1 req/several-min).
# Finish it by re-running ONLY the high-reuse points (0.90, 0.95) WITH the PD
# wall-cap (low-reuse arms already completed and are cap-insensitive). Then run
# the capped conc sweep. STRICTLY serial. NO set -e.

# Without set -e a failed cd would go unnoticed and both steps would be
# attempted from the wrong directory, so guard it explicitly.
cd /home/admin/cpfs/wjh/agentic-kv-fresh || {
  echo "[campaign3] cannot cd to /home/admin/cpfs/wjh/agentic-kv-fresh" >&2
  exit 1
}
export MB5_VENV="${MB5_VENV:-/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
FS=microbench/fresh_setup
echo "=== CAMPAIGN3 START $(date) ==="

echo "=== [1/2] finish reuse d2048/o128: re-run pts pfx=18432,38912 (PD capped 500s) $(date) ==="
DELTA=2048 OL=128 PFXS="18432 38912" REUSE_PD_MAXDUR=500 bash "$FS/run_reuse_fixed.sh"; rc1=$?
echo "=== reuse d2048 o128 finish rc=$rc1 $(date) ==="
# Short settle, then print GPU memory so a leftover process shows in the log.
sleep 12; nvidia-smi --query-gpu=index,memory.used --format=csv,noheader | head -8

echo "=== [2/2] CONC capped (PD wall=600s, colo uncapped), N 8..128 $(date) ==="
NLIST="8 16 32 48 64 96 128" CONC_PD_MAXDUR=600 bash "$FS/run_conc.sh"; rc2=$?
echo "=== conc rc=$rc2 $(date) ==="
echo "=== CAMPAIGN3 DONE reuse_finish=$rc1 conc=$rc2 $(date) ==="
|
||||||
70
microbench/fresh_setup/run_conc.sh
Normal file
70
microbench/fresh_setup/run_conc.sh
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
#!/usr/bin/env bash
# Concurrency axis at the agentic corner; replaces the old fig3 (in~8192/out256).
# RETUNED 2026-05-31 for realism (C2): total context stays in=32768, but the real
# per-turn new-prefill shrinks to delta=512 and reuse rises to 0.984 (real agentic
# reuse ->99.6%), i.e. prefix 32256 + delta 512, out=128. This corner exposes PD's
# structural tax: colo keeps the 32k resident KV local, while PD must KV-transfer
# the whole 32k context every turn even though only 512 tokens are new
# (C2 PD-tax ~250-450x). Closed-loop N is swept by step 8 up to the
# mean-E2E<=SLO ceiling.
# Wiring per memory project-mb5-pd-ablation-wiring: .venv_dash0, traces_synth/,
#   CONFIG 8C-proxy + PD, MB5_P_ROUTING=session + MB5_COLO_ROUTING=session,
#   N=REPLAY_MAX_INFLIGHT closed loop + REPLAY_INTER_TURN_THINK_S,
#   REPLAY_NO_REALIZED_PREFIX=1. RUN ONLY ONE DRIVER AT A TIME (shared GPUs/ports).
set -eo pipefail
cd /home/admin/cpfs/wjh/agentic-kv-fresh
export MB5_VENV="${MB5_VENV:-/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
VPY="$MB5_VENV/bin/python"

# Workload shape: prefix + delta = 32768 input tokens, reuse = 0.984.
: "${PFX:=32256}"
: "${DELTA:=512}"
: "${OL:=128}"
: "${THINK:=0.5}"
: "${TURNS:=8}"
# N grid: NSTART..NMAX in NSTEP increments, unless NLIST names an explicit
# grid (e.g. "8 16 32 48 64 96 128"), which then takes precedence.
: "${NSTART:=8}"
: "${NSTEP:=8}"
: "${NMAX:=128}"
: "${NLIST:=}"
# Wall deadline (s) applied to PD arms only; it bounds the drain of a collapsed
# arm (turns never run count as failures). colo (8C-proxy) runs UNCAPPED so the
# headline reference is always fully measured.
: "${CONC_PD_MAXDUR:=600}"
: "${SLO:=10.0}"
: "${SESS_PER_N:=4}"
: "${CFGS:=8C-proxy 2P+6D 4P+4D 6P+2D}"
: "${ONLY_N:=}"
|
||||||
|
|
||||||
|
#######################################
# Run every config in CFGS at closed-loop concurrency N and report the colo
# (8C-proxy) mean E2E latency.
# Globals (read): VPY SESS_PER_N TURNS PFX DELTA OL CFGS CONC_PD_MAXDUR THINK
# Arguments:
#   $1 - N, the number of concurrent sessions (REPLAY_MAX_INFLIGHT)
# Outputs:
#   stdout - exactly one line: the colo e2e_mean, or "nan" when the colo run
#            left no summary file. Callers capture this with $(run_N ...).
#   stderr - progress and warning lines. Progress used to go to stdout, which
#            polluted the captured value: the caller's float() parse then
#            failed and the SLO early-stop could never trigger.
#######################################
run_N() {
  local N="$1"; local sess=$(( SESS_PER_N * N ))
  local tag="conc32k_N${N}"; local trace="traces_synth/${tag}.jsonl"
  "$VPY" scripts/gen_synthetic_trace.py --out "$trace" --mode regular \
    --qps "$sess" --duration-s 1 --turns "$TURNS" \
    --prefix-len "$PFX" --delta-len "$DELTA" --output-len "$OL" --seed 42 >/dev/null
  echo "[conc32k] N=$N sess=$sess in=$((PFX+DELTA)) out=$OL -> $trace" >&2
  local cfg dur
  for cfg in $CFGS; do
    echo " -> $cfg" >&2
    # PD arms get the wall deadline; colo stays uncapped.
    dur=""
    if [ "$cfg" != "8C-proxy" ]; then dur="$CONC_PD_MAXDUR"; fi
    MB5_P_ROUTING=session MB5_COLO_ROUTING=session \
    REPLAY_MAX_INFLIGHT="$N" REPLAY_INTER_TURN_THINK_S="$THINK" REPLAY_NO_REALIZED_PREFIX=1 \
    REPLAY_MAX_DURATION="$dur" \
    CONFIGS="$cfg" REPS=1 TRACE="$trace" RUN_TAG="$tag" \
    bash scripts/mb5_run_gpu.sh >/dev/null 2>&1 || echo " [warn] ${tag}_${cfg} failed" >&2
  done
  # The colo summary is the only value the caller needs on stdout.
  local d="mb5_runs/${tag}_8C-proxy_rep1"
  if [ -f "$d/replay_metrics.summary.json" ]; then
    "$VPY" scripts/fig_agg.py --json "$d" 2>/dev/null \
      | "$VPY" -c "import sys,json;r=json.load(sys.stdin);print(r[0].get('e2e_mean') if r else 'nan')"
  else
    echo nan
  fi
}
|
||||||
|
|
||||||
|
# Smoke mode: run a single N, print the per-arm comparison, and stop.
if [ -n "$ONLY_N" ]; then
  echo "[conc32k] SMOKE N=$ONLY_N cfgs='$CFGS'"
  t0=$(date +%s)
  m=$(run_N "$ONLY_N")
  t1=$(date +%s)
  echo "[conc32k] SMOKE N=$ONLY_N colo mean-E2E=${m}s wall=$(( t1 - t0 ))s; compare:"
  "$VPY" scripts/fig_agg.py mb5_runs/conc32k_N${ONLY_N}_*_rep1 2>&1
  exit 0
fi

# Full sweep: an explicit NLIST wins over the NSTART/NSTEP/NMAX range.
if [ -n "$NLIST" ]; then
  NSEQ="$NLIST"
else
  NSEQ=$(seq "$NSTART" "$NSTEP" "$NMAX")
fi
for N in $NSEQ; do
  echo "[conc32k] === N=$N ==="
  m=$(run_N "$N")
  echo "[conc32k] N=$N colo mean-E2E=${m}s"
  # over=1 once the colo mean E2E exceeds the SLO; an unparsable value is
  # treated as "not over".
  over=$("$VPY" -c "print(1 if float('${m}')>${SLO} else 0)" 2>/dev/null || echo 0)
  if [ "$over" = "1" ]; then
    echo "[conc32k] colo crossed SLO ${SLO}s at N=$N -> stop"
    break
  fi
done

# Aggregate every conc32k run directory into the figure JSON.
dirs=()
for d in mb5_runs/conc32k_N*_rep1; do
  if [ -d "$d" ]; then
    dirs+=("$d")
  fi
done
"$VPY" scripts/fig_agg.py --json "${dirs[@]}" > analysis/mb5_pd_ablation/fig3_conc32k.json
echo "[conc32k] done -> analysis/mb5_pd_ablation/fig3_conc32k.json (${#dirs[@]} dirs)"
|
||||||
72
microbench/fresh_setup/run_reuse_fixed.sh
Normal file
72
microbench/fresh_setup/run_reuse_fixed.sh
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/env bash
# Reuse-axis sweep with a properly controlled variable; replaces the old fig1.
#   * --delta-len (U), the REAL uncached prefill work per turn, is held fixed.
#   * --prefix-len (C) is the only thing varied, so reuse = C/(C+U).
# Context length therefore grows with reuse, while the tokens that must
# actually be prefilled each turn stay at U.
# The old fig1 pinned input=8192 and carved the prefix out of it, so delta
# shrank 15x as reuse rose and "more reuse" was confounded with "less prefill".
#
# Wiring follows the corrected MB5 stack (see memory project-mb5-pd-ablation-wiring):
#   .venv_dash0, traces_synth/, CONFIG 8C-proxy + PD, MB5_P_ROUTING=session,
#   N injected via REPLAY_MAX_INFLIGHT (closed loop) + REPLAY_INTER_TURN_THINK_S,
#   REPLAY_NO_REALIZED_PREFIX=1 (reuse governed by hash_ids, required for this sweep).
set -o errexit -o pipefail
cd /home/admin/cpfs/wjh/agentic-kv-fresh

# Interpreter used for every helper script; MB5_VENV may be overridden by the caller.
: "${MB5_VENV:=/home/admin/cpfs/wjh/agentic-kv-fresh/.venv_dash0}"
export MB5_VENV
VPY="$MB5_VENV/bin/python"

# ---- Tunables: each one keeps a non-empty value inherited from the environment.
: "${DELTA:=2048}"   # fixed real prefill per turn (USER-CHOSEN)
: "${OL:=256}"       # passed to the trace generator as --output-len
: "${N:=8}"          # exported to the runner as REPLAY_MAX_INFLIGHT
: "${THINK:=0.5}"    # exported to the runner as REPLAY_INTER_TURN_THINK_S
: "${TURNS:=8}"      # passed to the trace generator as --turns
# Session count. The loop is closed, so arrival rate is irrelevant and only the
# count matters (~6 waves at N=8).
: "${NSESS:=48}"
# Cached-prefix lengths to sweep. With the default DELTA=2048 these give
# reuse .20 .50 .67 .80 .90 .95; a different DELTA shifts the ratios.
: "${PFXS:=512 2048 4096 8192 18432 38912}"
: "${CFGS:=8C-proxy 2P+6D 4P+4D 6P+2D}"
# Wall deadline (s) applied to PD arms only (colo stays uncapped). It bounds the
# collapse-drain that stalls high-reuse PD arms (un-run turns = failures, honest
# completion%). 0/empty = off.
: "${REUSE_PD_MAXDUR:=500}"
# Smoke a single prefix, then exit.
: "${ONLY_PFX:=}"
|
||||||
|
|
||||||
|
#######################################
# Run one reuse point: generate the synthetic trace for a given cached-prefix
# length and replay it once against every config in CFGS.
# Globals (read): VPY, DELTA, OL, NSESS, TURNS, CFGS, N, THINK, REUSE_PD_MAXDUR
# Arguments:      $1 - cached prefix length C (tokens); reuse = C/(C+DELTA)
# Outputs:        progress lines on stdout; one "[warn]" line on stderr for
#                 each config whose mb5_run_gpu.sh invocation exits non-zero
# Returns:        0 even if individual configs fail (best-effort sweep); a
#                 failing trace generation is not caught here.
#######################################
run_point() { # <pfx>
  local pfx="$1"
  local reuse tag trace cfg dur
  # Reuse ratio, for the log line only. awk -v passes the numbers as data
  # instead of splicing them into interpreter source, and avoids depending on
  # whichever python3 happens to be first on PATH.
  reuse=$(awk -v c="$pfx" -v u="$DELTA" 'BEGIN { printf "%.3f", c / (c + u) }')
  tag="reuse_p${pfx}_d${DELTA}_o${OL}" # _o${OL} so different output lens don't collide
  trace="traces_synth/${tag}.jsonl"
  # Closed-loop: pass NSESS as qps with duration 1 so n_sessions = NSESS
  # exactly (gen_regular: n_sessions = int(duration_s * session_qps)).
  "$VPY" scripts/gen_synthetic_trace.py --out "$trace" --mode regular \
    --qps "$NSESS" --duration-s 1 --turns "$TURNS" \
    --prefix-len "$pfx" --delta-len "$DELTA" --output-len "$OL" --seed 42 >/dev/null
  echo "[reuse] pfx=$pfx delta=$DELTA reuse=$reuse in=$((pfx+DELTA)) -> $trace"
  for cfg in $CFGS; do # intentional word-splitting: CFGS is a space-separated list
    echo " -> $cfg"
    # Both routings set to session so BOTH colo (kv_both) and PD producers
    # pin a session's turns to one instance and reuse its prefix cache — the
    # fair cache-aware comparison. P_ROUTING is ignored by colo, COLO_ROUTING
    # by PD, so setting both is harmless and symmetric.
    dur="" # colo uncapped
    if [ "$cfg" != "8C-proxy" ]; then dur="$REUSE_PD_MAXDUR"; fi
    # Runner output is discarded on purpose; the warning carries the run tag so
    # a failure can be attributed to its prefix when stderr is logged separately.
    MB5_P_ROUTING=session MB5_COLO_ROUTING=session \
      REPLAY_MAX_INFLIGHT="$N" REPLAY_INTER_TURN_THINK_S="$THINK" \
      REPLAY_NO_REALIZED_PREFIX=1 REPLAY_MAX_DURATION="$dur" \
      CONFIGS="$cfg" REPS=1 TRACE="$trace" RUN_TAG="$tag" \
      bash scripts/mb5_run_gpu.sh >/dev/null 2>&1 || echo " [warn] ${tag}_${cfg} failed" >&2
  done
}
|
||||||
|
|
||||||
|
# Smoke mode: when ONLY_PFX is set, run that single prefix point across CFGS,
# report the wall time, show the fig_agg comparison for that point's run dirs,
# and exit before the full sweep.
if [ -n "$ONLY_PFX" ]; then
  echo "[reuse] SMOKE pfx=$ONLY_PFX cfgs='$CFGS'"
  t0=$(date +%s)
  run_point "$ONLY_PFX"
  t1=$(date +%s)
  echo "[reuse] SMOKE done wall=$(( t1 - t0 ))s; compare:"
  # Only the wildcard is left unquoted, so the variable parts of the path can
  # never be word-split or glob-expanded.
  "$VPY" scripts/fig_agg.py "mb5_runs/reuse_p${ONLY_PFX}_d${DELTA}_o${OL}_"*_rep1
  exit 0
fi
|
||||||
|
|
||||||
|
# Full sweep: one run_point per prefix length in PFXS.
for pfx in $PFXS; do run_point "$pfx"; done # intentional word-splitting of the list

# Aggregate ONLY this sweep's dirs (matched by delta+output) so the three
# reuse figures (d2048/o256, d1024/o128, d2048/o128) never cross-contaminate.
dirs=()
for d in "mb5_runs/reuse_"*"_d${DELTA}_o${OL}_"*_rep1; do
  if [ -d "$d" ]; then dirs+=("$d"); fi
done
OUTJSON="analysis/mb5_pd_ablation/fig1_reuse_d${DELTA}_o${OL}.json"
# Write to a sibling temp file and rename on success: redirecting straight
# into $OUTJSON would truncate the previous results before fig_agg runs and
# leave the file empty if the aggregation fails.
tmp_json="${OUTJSON}.tmp.$$"
if "$VPY" scripts/fig_agg.py --json "${dirs[@]}" > "$tmp_json"; then
  mv -f -- "$tmp_json" "$OUTJSON"
else
  rm -f -- "$tmp_json"
  echo "[reuse] aggregation failed; $OUTJSON left untouched" >&2
  exit 1
fi
echo "[reuse] done -> $OUTJSON (${#dirs[@]} dirs)"
|
||||||
Reference in New Issue
Block a user