Three-axis controlled ablation of PD-colo vs PD-disagg on synthetic regular
traces (closed-loop, controlled reuse via REPLAY_NO_REALIZED_PREFIX) on the
clean stack (e13391e gated off).
Axis 1 (Fig 1) -- reuse 6%->94% at N=8, in8192/out256
Axis 2 (Fig 2) -- shape in2048/out2048 -> in32768/out64 at N=8, reuse~70%
Axis 3 (Fig 3) -- concurrency N=8/16/32/64 at reuse~71%, in8192/out256
Findings:
* APC parity colo=PD at every reuse (5.5/22/44/66/77/82%) -- contamination
fix validated.
* PD edge erodes 1.57x->1.10x with reuse; prefill GPUs strand 26%->9%.
* Shape: PD-best peaks mid-sweep (1.34x at in8192/out512); wrong PD ratio
catastrophic at prefill extreme (in32768/out64 pd2 = 378/400, p99 432s).
* Concurrency: PD wins at N<=32 (1.23-1.29x), but the balance tips to colo at N=64 -- pd2/pd4
crater (APC 71%->1.4%, TPS -30%) while colo scales cleanly.
Infrastructure:
* replayer: --max-inflight-sessions, --inter-turn-think, --no-realized-prefix
(env-defaulted via REPLAY_MAX_INFLIGHT, REPLAY_INTER_TURN_THINK_S,
REPLAY_NO_REALIZED_PREFIX).
* mb5_run.sh: writes bench_config.json + gpu_util.csv + run_window.json +
instance_apc.txt + metrics.jsonl for bench_report/fig_agg ingest.
* fig_agg.py: per-arm GPU role split + producer-side APC; --json mode.
* gpu_util_report.py: companion per-GPU util report from gpu_util.csv.
* partial_summary.py: stats from in-flight replay_metrics.jsonl
(works before metrics.summary.json exists).
Data: analysis/mb5_pd_ablation/fig{1,2,3}.json (24 + 20 + 16 rows).
Figures: figs/mb5_pd_ablation/fig{1_reuse,2_shape,3_concurrency}_axis.png.
150 lines
6.1 KiB
Bash
Executable File
150 lines
6.1 KiB
Bash
Executable File
#!/usr/bin/env bash
# MB5 orchestrator.
#
# For every (CONFIG, rep) pair: start the serving stack, replay a trace
# against it, gather KV snapshots plus replayer metrics, then stop the stack.
#
# Intended host: dash1, or any machine that has cpfs mounted at
# /home/admin/cpfs/wjh/.
#
# Environment overrides (defaults in parentheses):
#   CONFIGS        space-separated MB5 configs ("8C 6P+2D 4P+4D 2P+6D")
#   REPS           repetitions per config (3)
#   TRACE          trace JSONL path
#                  (/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl)
#   RUN_TAG        output root tag ($(date +%Y%m%d_%H%M%S))
#   REQUEST_LIMIT  optional cap on replayed requests (unset = no cap)
#   MODEL_NAME     model path handed to the replayer via --model
#                  (/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   MB5_VENV       virtualenv to use instead of ${FRESH_ROOT}/.venv, so a second
#                  host can run from an isolated venv clone (see mb5_launch.sh)

set -eo pipefail

# Fixed locations.
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
REPLAYER_DIR="${FRESH_ROOT}/replayer"

# Tunables: keep the caller's value when set and non-empty, else fall back.
: "${CONFIGS:=8C 6P+2D 4P+4D 2P+6D}"
: "${REPS:=3}"
: "${TRACE:=/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl}"
: "${RUN_TAG:=$(date +%Y%m%d_%H%M%S)}"
: "${MODEL_NAME:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# Expands to "--request-limit N" only when REQUEST_LIMIT is set and non-empty;
# otherwise stays the empty string.
REQUEST_LIMIT_ARG="${REQUEST_LIMIT:+--request-limit ${REQUEST_LIMIT}}"

OUT_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_TAG}"
mkdir -p "${OUT_ROOT}"

# Banner: echo the effective settings so they end up in the run log.
for setting in RUN_TAG OUT_ROOT CONFIGS REPS TRACE; do
  printf '[mb5-run] %s=%s\n' "${setting}" "${!setting}"
done
|
||
#######################################
# Best-effort stack teardown followed by a GPU cooldown.
# Globals:   LAUNCH, OUT_ROOT (read); MB5_COOLDOWN_S (read, default 10)
# Arguments: $1 - config name, $2 - rep number, $3 - run label
# Outputs:   stop output goes to ${OUT_ROOT}/<config>_rep<rep>_stop.log
# Returns:   always 0 (a failing stop must not abort the sweep)
#######################################
_mb5_stop_stack() {
  local config="$1" rep="$2" label="$3"
  # Stop launch (cleans up vllm + proxy; reverts patch on last call).
  CONFIG="${config}" RUN_LABEL="${label}" \
    bash "${LAUNCH}" stop > "${OUT_ROOT}/${config}_rep${rep}_stop.log" 2>&1 || true
  # Cooldown so GPUs settle before the next config.
  sleep "${MB5_COOLDOWN_S:-10}"
  return 0
}

#######################################
# Run one (config, rep): launch the stack, replay the trace, collect
# artifacts, tear the stack down.
# Globals:   RUN_TAG, FRESH_ROOT, OUT_ROOT, LAUNCH, VENV, SCRIPT_DIR, TRACE,
#            MODEL_NAME, REQUEST_LIMIT_ARG (all read)
# Arguments: $1 - config name (e.g. 8C, 4P+4D), $2 - rep number
# Outputs:   progress lines on stdout; per-step logs under ${OUT_ROOT};
#            run artifacts under ${FRESH_ROOT}/mb5_runs/<label>/
# Returns:   0 on success; 1 if launch fails, no ENDPOINTS line is found,
#            or the replay fails. Every failure path after the launch
#            attempt still tears the stack down and cools off.
#######################################
run_one() {
  local config="$1" rep="$2"
  local label="${RUN_TAG}_${config}_rep${rep}"
  local rundir="${FRESH_ROOT}/mb5_runs/${label}"
  local log_prefix="${OUT_ROOT}/${config}_rep${rep}"

  echo ""
  echo "======== ${config} rep${rep} ========"

  # Launch. A failed start can leave some backends running, so stop
  # explicitly before giving up; otherwise they would hold GPUs/ports and
  # poison the next config.
  if ! CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" start > "${log_prefix}_launch.log" 2>&1; then
    echo "[mb5-run] LAUNCH FAILED for ${config} rep${rep}; see ${log_prefix}_launch.log"
    _mb5_stop_stack "${config}" "${rep}" "${label}"
    return 1
  fi

  # Extract the (last) ENDPOINTS line emitted by mb5_launch.sh. `|| true`
  # keeps a no-match grep from tripping errexit/pipefail if the caller ever
  # invokes us outside a conditional.
  local endpoints
  endpoints=$(grep "^ENDPOINTS=" "${log_prefix}_launch.log" | tail -1 | cut -d= -f2-) || true
  if [[ -z "${endpoints}" ]]; then
    echo "[mb5-run] ERROR: no ENDPOINTS in launch log"
    _mb5_stop_stack "${config}" "${rep}" "${label}"
    return 1
  fi
  echo "[mb5-run] endpoints: ${endpoints}"

  # Replay
  # shellcheck disable=SC1091
  source "${VENV}/bin/activate"
  local replay_out="${rundir}/replay_metrics.jsonl"
  mkdir -p "${rundir}"

  # bench_report.py inputs: worker->gpu map (worker i == gpu i for every config;
  # for PD, workers 0-3 are producers on gpu0-3, 4-7 consumers on gpu4-7).
  printf '{"base_port":8000,"n_instances":8,"gpu_indices":[0,1,2,3,4,5,6,7]}\n' \
    > "${rundir}/bench_config.json"

  # Per-GPU utilization timeseries over the replay window (2s sampling).
  local gpu_sample_s=2
  bash "${SCRIPT_DIR}/gpu_monitor.sh" "${rundir}/gpu_util.csv" "${gpu_sample_s}" \
    > /dev/null 2>&1 &
  local gpu_mon_pid=$!

  # Split the optional "--request-limit N" string into words without
  # exposing it to globbing.
  local -a limit_args=()
  read -r -a limit_args <<< "${REQUEST_LIMIT_ARG:-}"

  local t0 t1 replay_rc=0
  t0=$(date +%s.%N)
  PYTHONPATH="${FRESH_ROOT}" python -m replayer \
    --endpoint "${endpoints}" \
    --trace "${TRACE}" \
    --output "${replay_out}" \
    --model "${MODEL_NAME}" \
    "${limit_args[@]}" \
    > "${log_prefix}_replay.log" 2>&1 || replay_rc=$?
  t1=$(date +%s.%N)

  # Stop the sampler and reap it so it neither lingers nor prints a
  # "Terminated" job notice into the run log.
  kill "${gpu_mon_pid}" 2>/dev/null || true
  wait "${gpu_mon_pid}" 2>/dev/null || true

  # Declared separately from the assignment so `local` cannot mask a failure.
  local wall_clock_s
  wall_clock_s=$(python -c "print(${t1} - ${t0})") || wall_clock_s=""

  if (( replay_rc != 0 )); then
    echo "[mb5-run] REPLAY FAILED after ${wall_clock_s} s; see ${log_prefix}_replay.log"
    _mb5_stop_stack "${config}" "${rep}" "${label}"
    return 1
  fi

  echo "[mb5-run] replay done in ${wall_clock_s}s"
  echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"
  printf '{"t_start_unix":%s,"t_end_unix":%s}\n' "${t0}" "${t1}" > "${rundir}/run_window.json"
  cp -f "${replay_out}" "${rundir}/metrics.jsonl"  # bench_report.py expects metrics.jsonl

  # Per-instance prefix-cache counters, scraped from each backend BEFORE
  # teardown. For PD this is the only honest reuse signal: producer ports
  # (the low ones) show cross-turn prefix-cache hits; the consumer's
  # per-request cached_tokens is meaningless (it counts transferred KV).
  local port metrics queries hits
  {
    for port in 8000 8001 8002 8003 8004 8005 8006 8007; do
      metrics=$(curl -s --noproxy '*' "http://127.0.0.1:${port}/metrics" 2>/dev/null) || continue
      queries=$(printf '%s' "${metrics}" | awk '/^vllm:prefix_cache_queries_total/{print $2; exit}')
      hits=$(printf '%s' "${metrics}" | awk '/^vllm:prefix_cache_hits_total/{print $2; exit}')
      # Ports with no query counter are skipped rather than logged empty.
      if [[ -n "${queries}" ]]; then
        echo "port=${port} queries=${queries} hits=${hits}"
      fi
    done
  } > "${rundir}/instance_apc.txt" 2>/dev/null || true

  _mb5_stop_stack "${config}" "${rep}" "${label}"
  echo "[mb5-run] DONE ${config} rep${rep}"
}
|
||
# Pre-flight: bail out before touching any GPU if the launch script, the
# replayer directory, or the trace file cannot be found.
[[ -f "${LAUNCH}" ]] || { echo "missing ${LAUNCH}"; exit 1; }
[[ -d "${REPLAYER_DIR}" ]] || { echo "missing ${REPLAYER_DIR}"; exit 1; }
[[ -f "${TRACE}" ]] || { echo "missing trace ${TRACE}"; exit 1; }
|
||
# Sweep: each config is run REPS times (rep numbering starts at 1). A failed
# run is tallied in `failures` and the sweep carries on with the next one.
failures=0
# CONFIGS is intentionally unquoted: it is a space-separated list.
for config in ${CONFIGS}; do
  rep=1
  while (( rep <= REPS )); do
    run_one "${config}" "${rep}" || failures=$((failures+1))
    rep=$((rep+1))
  done
done
|
||
# Final patch revert (defensive — mb5_launch.sh stop also reverts).
# Best-effort on purpose: a failure here must not hide the summary below.
python "${SCRIPT_DIR}/instrument_kv_snapshot.py" --revert --venv "${VENV}" 2>/dev/null || true

echo ""
echo "======== ALL CONFIGS DONE ========"
echo "failures: ${failures}"
echo "results under: ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_*"
echo "to plot: python plot_kv_pool_timeline.py --run-tag ${RUN_TAG}"

# Surface failed runs through the exit status so wrappers and schedulers can
# detect a partially failed sweep instead of always seeing success.
if (( failures > 0 )); then
  exit 1
fi