Files
agentic-kvc/microbench/fresh_setup/mb5_run.sh

126 lines
4.6 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Orchestrator for MB5: for each CONFIG × rep, bring up the stack, run a
# trace replay against it, collect KV snapshots and replayer metrics,
# tear down.
#
# Designed to be run on dash1 (or any host with cpfs mounted at
# /home/admin/cpfs/wjh/).
#
# Env vars (with defaults):
# CONFIGS : space-separated MB5 configs (default: "8C 6P+2D 4P+4D 2P+6D")
# REPS : reps per config (default: 3)
# TRACE : trace JSONL path
# (default: /home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl)
# RUN_TAG : output root tag (default: $(date +%Y%m%d_%H%M%S))
# REQUEST_LIMIT : optional, cap replay requests (default: none)
set -eo pipefail
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# MB5_VENV lets a second host use an isolated venv clone (see mb5_launch.sh).
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
REPLAYER_DIR="${FRESH_ROOT}/replayer"
CONFIGS="${CONFIGS:-8C 6P+2D 4P+4D 2P+6D}"
REPS="${REPS:-3}"
TRACE="${TRACE:-/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl}"
RUN_TAG="${RUN_TAG:-$(date +%Y%m%d_%H%M%S)}"
MODEL_NAME="${MODEL_NAME:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
REQUEST_LIMIT_ARG=""
if [ -n "${REQUEST_LIMIT:-}" ]; then
REQUEST_LIMIT_ARG="--request-limit ${REQUEST_LIMIT}"
fi
OUT_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_TAG}"
mkdir -p "${OUT_ROOT}"
echo "[mb5-run] RUN_TAG=${RUN_TAG}"
echo "[mb5-run] OUT_ROOT=${OUT_ROOT}"
echo "[mb5-run] CONFIGS=${CONFIGS}"
echo "[mb5-run] REPS=${REPS}"
echo "[mb5-run] TRACE=${TRACE}"
run_one() {
local config="$1" rep="$2"
local label="${RUN_TAG}_${config}_rep${rep}"
local rundir="${FRESH_ROOT}/mb5_runs/${label}"
echo ""
echo "======== ${config} rep${rep} ========"
# Launch
if ! CONFIG="${config}" RUN_LABEL="${RUN_TAG}_${config}_rep${rep}" \
bash "${LAUNCH}" start > "${OUT_ROOT}/${config}_rep${rep}_launch.log" 2>&1; then
echo "[mb5-run] LAUNCH FAILED for ${config} rep${rep}; see ${OUT_ROOT}/${config}_rep${rep}_launch.log"
return 1
fi
# Extract ENDPOINTS line emitted by mb5_launch.sh
local endpoints
endpoints=$(grep "^ENDPOINTS=" "${OUT_ROOT}/${config}_rep${rep}_launch.log" | tail -1 | cut -d= -f2-)
if [ -z "${endpoints}" ]; then
echo "[mb5-run] ERROR: no ENDPOINTS in launch log"
bash "${LAUNCH}" stop > /dev/null 2>&1 || true
return 1
fi
echo "[mb5-run] endpoints: ${endpoints}"
# Replay
source "${VENV}/bin/activate"
local replay_out="${rundir}/replay_metrics.jsonl"
mkdir -p "$(dirname "${replay_out}")"
local t0
t0=$(date +%s.%N)
if ! PYTHONPATH="${FRESH_ROOT}" python -m replayer \
--endpoint "${endpoints}" \
--trace "${TRACE}" \
--output "${replay_out}" \
--model "${MODEL_NAME}" \
${REQUEST_LIMIT_ARG} \
> "${OUT_ROOT}/${config}_rep${rep}_replay.log" 2>&1; then
local t1
t1=$(date +%s.%N)
local wall=$(python -c "print(${t1} - ${t0})")
echo "[mb5-run] REPLAY FAILED after ${wall} s; see ${OUT_ROOT}/${config}_rep${rep}_replay.log"
bash "${LAUNCH}" stop > /dev/null 2>&1 || true
return 1
fi
local t1
t1=$(date +%s.%N)
local wall_clock_s
wall_clock_s=$(python -c "print(${t1} - ${t0})")
echo "[mb5-run] replay done in ${wall_clock_s}s"
echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"
# Stop launch (cleans up vllm + proxy; reverts patch on last call)
CONFIG="${config}" RUN_LABEL="${RUN_TAG}_${config}_rep${rep}" \
bash "${LAUNCH}" stop > "${OUT_ROOT}/${config}_rep${rep}_stop.log" 2>&1 || true
sleep 10 # cooldown so GPUs settle before next config
echo "[mb5-run] DONE ${config} rep${rep}"
}
# Quick check that the launch script and replayer are reachable
if [ ! -f "${LAUNCH}" ]; then echo "missing ${LAUNCH}"; exit 1; fi
if [ ! -d "${REPLAYER_DIR}" ]; then echo "missing ${REPLAYER_DIR}"; exit 1; fi
if [ ! -f "${TRACE}" ]; then echo "missing trace ${TRACE}"; exit 1; fi
# Iterate
failures=0
for config in ${CONFIGS}; do
for ((rep=1; rep<=REPS; rep++)); do
if ! run_one "${config}" "${rep}"; then
failures=$((failures+1))
fi
done
done
# Final patch revert (defensive — mb5_launch.sh stop also reverts)
python "${SCRIPT_DIR}/instrument_kv_snapshot.py" --revert --venv "${VENV}" 2>/dev/null || true
echo ""
echo "======== ALL CONFIGS DONE ========"
echo "failures: ${failures}"
echo "results under: ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_*"
echo "to plot: python plot_kv_pool_timeline.py --run-tag ${RUN_TAG}"