#!/usr/bin/env bash
# Orchestrator for MB5: for each CONFIG × rep, bring up the stack, run a
# trace replay against it, collect KV snapshots and replayer metrics,
# tear down.
#
# Designed to be run on dash1 (or any host with cpfs mounted at
# /home/admin/cpfs/wjh/).
#
# Env vars (with defaults):
#   CONFIGS       : space-separated MB5 configs (default: "8C 6P+2D 4P+4D 2P+6D")
#   REPS          : reps per config (default: 3)
#   TRACE         : trace JSONL path
#                   (default: /home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl)
#   RUN_TAG       : output root tag (default: $(date +%Y%m%d_%H%M%S))
#   REQUEST_LIMIT : optional, cap replay requests (default: none)
#   MODEL_NAME    : value passed to the replayer's --model flag
#                   (default: /home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   MB5_VENV      : virtualenv to activate (default: ${FRESH_ROOT}/.venv)
#
# Exit status: 0 when every config/rep succeeded, 1 when a pre-flight check
# failed or at least one run failed.

# NOTE: -u is intentionally not enabled; the venv activate script sourced
# below is outside this file and may reference unset variables.
set -eo pipefail

FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# MB5_VENV lets a second host use an isolated venv clone (see mb5_launch.sh).
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
REPLAYER_DIR="${FRESH_ROOT}/replayer"

CONFIGS="${CONFIGS:-8C 6P+2D 4P+4D 2P+6D}"
REPS="${REPS:-3}"
TRACE="${TRACE:-/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl}"
RUN_TAG="${RUN_TAG:-$(date +%Y%m%d_%H%M%S)}"
MODEL_NAME="${MODEL_NAME:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# Optional replayer flags are kept in an array so they expand to zero words
# when REQUEST_LIMIT is unset and to exactly two words when it is set.
REQUEST_LIMIT_ARGS=()
if [[ -n "${REQUEST_LIMIT:-}" ]]; then
  REQUEST_LIMIT_ARGS=(--request-limit "${REQUEST_LIMIT}")
fi

# Launch/replay/stop logs live under OUT_ROOT; per-run artifacts live in
# sibling directories named ${RUN_TAG}_<config>_rep<N>.
OUT_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_TAG}"
mkdir -p "${OUT_ROOT}"

echo "[mb5-run] RUN_TAG=${RUN_TAG}"
echo "[mb5-run] OUT_ROOT=${OUT_ROOT}"
echo "[mb5-run] CONFIGS=${CONFIGS}"
echo "[mb5-run] REPS=${REPS}"
echo "[mb5-run] TRACE=${TRACE}"

#######################################
# Print a message to stderr and abort the whole script.
# Arguments: $* - message
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Best-effort teardown of the stack for one config/rep.
# CONFIG and RUN_LABEL are passed on every path so that failure-path
# teardown invokes mb5_launch.sh the same way as the normal path does.
# Globals:   RUN_TAG, LAUNCH, OUT_ROOT (read)
# Arguments: $1 - config name, $2 - rep number
# Outputs:   writes ${OUT_ROOT}/<config>_rep<N>_stop.log
# Returns:   always 0 (a failing stop must not mask the caller's status)
#######################################
stop_stack() {
  local config="$1" rep="$2"
  CONFIG="${config}" RUN_LABEL="${RUN_TAG}_${config}_rep${rep}" \
    bash "${LAUNCH}" stop > "${OUT_ROOT}/${config}_rep${rep}_stop.log" 2>&1 || true
}

#######################################
# Run one config/rep: launch the stack, replay the trace, collect
# artifacts, tear down.
# Globals:   RUN_TAG, FRESH_ROOT, OUT_ROOT, LAUNCH, VENV, SCRIPT_DIR, TRACE,
#            MODEL_NAME, REQUEST_LIMIT_ARGS (read)
# Arguments: $1 - config name (e.g. "4P+4D"), $2 - rep number
# Outputs:   progress lines on stdout; logs under OUT_ROOT; artifacts under
#            ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_<config>_rep<N>/
# Returns:   0 on success; 1 if launch failed, no ENDPOINTS line was found,
#            or the replayer exited non-zero
# NOTE: callers invoke this inside `if ! run_one ...`, which disables
# `set -e` for the whole body, so critical steps are checked explicitly.
#######################################
run_one() {
  local config="$1" rep="$2"
  local label="${RUN_TAG}_${config}_rep${rep}"
  local rundir="${FRESH_ROOT}/mb5_runs/${label}"
  local log_prefix="${OUT_ROOT}/${config}_rep${rep}"

  echo ""
  echo "======== ${config} rep${rep} ========"

  # Launch
  if ! CONFIG="${config}" RUN_LABEL="${label}" \
    bash "${LAUNCH}" start > "${log_prefix}_launch.log" 2>&1; then
    echo "[mb5-run] LAUNCH FAILED for ${config} rep${rep}; see ${log_prefix}_launch.log"
    return 1
  fi

  # Extract ENDPOINTS line emitted by mb5_launch.sh (last one wins).
  # `|| true`: under pipefail a no-match grep fails the pipeline; the empty
  # check below is the real error handling.
  local endpoints
  endpoints=$(grep "^ENDPOINTS=" "${log_prefix}_launch.log" | tail -1 | cut -d= -f2-) || true
  if [[ -z "${endpoints}" ]]; then
    echo "[mb5-run] ERROR: no ENDPOINTS in launch log"
    stop_stack "${config}" "${rep}"
    return 1
  fi
  echo "[mb5-run] endpoints: ${endpoints}"

  # Replay
  # shellcheck disable=SC1091  # activate script lives outside this repo
  source "${VENV}/bin/activate"
  local replay_out="${rundir}/replay_metrics.jsonl"
  mkdir -p "${rundir}"

  # bench_report.py inputs: worker->gpu map (worker i == gpu i for every config;
  # for PD, workers 0-3 are producers on gpu0-3, 4-7 consumers on gpu4-7).
  # NOTE(review): that producer/consumer split matches 4P+4D; confirm what
  # it should say for 6P+2D and 2P+6D.
  printf '{"base_port":8000,"n_instances":8,"gpu_indices":[0,1,2,3,4,5,6,7]}\n' \
    > "${rundir}/bench_config.json"

  # per-GPU utilization timeseries over the replay window (2s sampling).
  # The trailing "2" is the second positional argument to gpu_monitor.sh,
  # not a file-descriptor number; both stdout and stderr are discarded.
  bash "${SCRIPT_DIR}/gpu_monitor.sh" "${rundir}/gpu_util.csv" 2 \
    > /dev/null 2>&1 &
  local gpu_mon_pid=$!

  local t0 t1 wall_clock_s replay_rc=0
  t0=$(date +%s.%N)
  PYTHONPATH="${FRESH_ROOT}" python -m replayer \
    --endpoint "${endpoints}" \
    --trace "${TRACE}" \
    --output "${replay_out}" \
    --model "${MODEL_NAME}" \
    "${REQUEST_LIMIT_ARGS[@]}" \
    > "${log_prefix}_replay.log" 2>&1 || replay_rc=$?
  t1=$(date +%s.%N)

  # Stop sampling as soon as the replay window closes, on both paths.
  kill "${gpu_mon_pid}" 2>/dev/null || true

  # Timestamps are passed as argv rather than interpolated into the code.
  wall_clock_s=$(python -c 'import sys; print(float(sys.argv[2]) - float(sys.argv[1]))' \
    "${t0}" "${t1}")

  if ((replay_rc != 0)); then
    echo "[mb5-run] REPLAY FAILED after ${wall_clock_s} s; see ${log_prefix}_replay.log"
    stop_stack "${config}" "${rep}"
    return 1
  fi

  echo "[mb5-run] replay done in ${wall_clock_s}s"
  echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"
  printf '{"t_start_unix":%s,"t_end_unix":%s}\n' "${t0}" "${t1}" > "${rundir}/run_window.json"
  cp -f "${replay_out}" "${rundir}/metrics.jsonl" # bench_report.py expects metrics.jsonl

  # Per-instance prefix-cache counters, scraped from each backend BEFORE
  # teardown. For PD this is the only honest reuse signal: producer ports
  # (the low ones) show cross-turn prefix-cache hits; the consumer's
  # per-request cached_tokens is meaningless (it counts transferred KV).
  # Best-effort: an unreachable port is skipped, and the timeouts keep one
  # wedged backend from hanging the whole sweep.
  local port metrics queries hits
  {
    for port in 8000 8001 8002 8003 8004 8005 8006 8007; do
      metrics=$(curl -s --noproxy '*' --connect-timeout 5 --max-time 30 \
        "http://127.0.0.1:${port}/metrics" 2>/dev/null) || continue
      # Here-strings instead of `printf | awk`: awk exits after the first
      # match, which could SIGPIPE the writer and fail under pipefail.
      queries=$(awk '/^vllm:prefix_cache_queries_total/{print $2; exit}' <<< "${metrics}")
      hits=$(awk '/^vllm:prefix_cache_hits_total/{print $2; exit}' <<< "${metrics}")
      if [[ -n "${queries}" ]]; then
        echo "port=${port} queries=${queries} hits=${hits}"
      fi
    done
  } > "${rundir}/instance_apc.txt" 2>/dev/null || true

  # Stop launch (cleans up vllm + proxy; reverts patch on last call)
  stop_stack "${config}" "${rep}"

  sleep 10 # cooldown so GPUs settle before next config
  echo "[mb5-run] DONE ${config} rep${rep}"
}

# Quick check that the launch script, replayer, trace and venv are reachable
# before any stack is launched.
[[ -f "${LAUNCH}" ]] || die "missing ${LAUNCH}"
[[ -d "${REPLAYER_DIR}" ]] || die "missing ${REPLAYER_DIR}"
[[ -f "${TRACE}" ]] || die "missing trace ${TRACE}"
[[ -f "${VENV}/bin/activate" ]] || die "missing venv activate script ${VENV}/bin/activate"
# A non-numeric REPS would evaluate to 0 in the arithmetic loop below and
# silently run nothing.
[[ "${REPS}" =~ ^[0-9]+$ ]] || die "REPS must be a non-negative integer, got '${REPS}'"

# Iterate. A failed run is counted and the sweep continues with the next one.
failures=0
# CONFIGS is a space-separated list; word splitting here is intentional.
for config in ${CONFIGS}; do
  for ((rep = 1; rep <= REPS; rep++)); do
    if ! run_one "${config}" "${rep}"; then
      failures=$((failures + 1))
    fi
  done
done

# Final patch revert (defensive — mb5_launch.sh stop also reverts)
python "${SCRIPT_DIR}/instrument_kv_snapshot.py" --revert --venv "${VENV}" 2>/dev/null || true

echo ""
echo "======== ALL CONFIGS DONE ========"
echo "failures: ${failures}"
echo "results under: ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_*"
echo "to plot: python plot_kv_pool_timeline.py --run-tag ${RUN_TAG}"

# Surface failed runs to the caller instead of always exiting 0.
if ((failures > 0)); then
  exit 1
fi