Files
agentic-kvc/microbench/fresh_setup/mb5_run.sh
Gahow Wang e532e83d3e mb5_run: scrape per-instance prefix-cache counters before teardown
Per-port vllm:prefix_cache_{queries,hits}_total -> instance_apc.txt. For PD
this is the only honest reuse signal: producer ports show cross-turn prefix
hits, while the consumer's per-request cached_tokens just counts transferred KV.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-29 11:56:43 +08:00

139 lines
5.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Orchestrator for MB5: for each CONFIG × rep, bring up the stack, run a
# trace replay against it, collect KV snapshots and replayer metrics,
# tear down.
#
# Designed to be run on dash1 (or any host with cpfs mounted at
# /home/admin/cpfs/wjh/).
#
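# Env vars (with defaults):
# CONFIGS : space-separated MB5 configs (default: "8C 6P+2D 4P+4D 2P+6D")
# REPS : reps per config (default: 3)
# TRACE : trace JSONL path
# (default: /home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl)
# RUN_TAG : output root tag (default: $(date +%Y%m%d_%H%M%S))
# REQUEST_LIMIT : optional, cap replay requests (default: none)
# MODEL_NAME : model passed to the replayer via --model
# (default: /home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
# MB5_VENV : virtualenv activated before each replay
# (default: /home/admin/cpfs/wjh/agentic-kv-fresh/.venv)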
set -eo pipefail

# ---- Fixed locations -------------------------------------------------------
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
# MB5_VENV lets a second host use an isolated venv clone (see mb5_launch.sh).
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
REPLAYER_DIR="${FRESH_ROOT}/replayer"

# ---- Tunables: keep the caller's value, otherwise fall back to the default --
: "${CONFIGS:=8C 6P+2D 4P+4D 2P+6D}"
: "${REPS:=3}"
: "${TRACE:=/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl}"
: "${RUN_TAG:=$(date +%Y%m%d_%H%M%S)}"
: "${MODEL_NAME:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"

# Extra replayer flags: empty unless REQUEST_LIMIT is set and non-empty.
# Kept as a plain string because run_one expands it unquoted.
REQUEST_LIMIT_ARG="${REQUEST_LIMIT:+--request-limit ${REQUEST_LIMIT}}"

# ---- Output root for launch/replay/stop logs --------------------------------
OUT_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_TAG}"
mkdir -p "${OUT_ROOT}"

# Banner: one "[mb5-run] KEY=value" line per effective setting.
printf '[mb5-run] %s\n' \
  "RUN_TAG=${RUN_TAG}" \
  "OUT_ROOT=${OUT_ROOT}" \
  "CONFIGS=${CONFIGS}" \
  "REPS=${REPS}" \
  "TRACE=${TRACE}"
# Run one (config, rep) cell: start the stack, replay the trace against it,
# scrape per-port prefix-cache counters, then stop the stack.
#
# Globals read: RUN_TAG, FRESH_ROOT, OUT_ROOT, LAUNCH, VENV, TRACE,
#               MODEL_NAME, REQUEST_LIMIT_ARG
# Arguments:    $1 - config name (passed to mb5_launch.sh as CONFIG)
#               $2 - repetition number
# Files:        ${OUT_ROOT}/<config>_rep<rep>_{launch,replay,stop}.log
#               ${FRESH_ROOT}/mb5_runs/<label>/{replay_metrics.jsonl,
#                 wall_clock_s.txt,instance_apc.txt}
# Returns:      0 on success; 1 if launch, venv activation, output-dir
#               creation or replay fails.
#
# The caller invokes this as `if ! run_one ...`, which suppresses `set -e`
# for the whole function body, so every step that matters is checked
# explicitly here.
run_one() {
  local config="$1" rep="$2"
  local label="${RUN_TAG}_${config}_rep${rep}"
  local rundir="${FRESH_ROOT}/mb5_runs/${label}"
  local log_prefix="${OUT_ROOT}/${config}_rep${rep}"

  echo ""
  echo "======== ${config} rep${rep} ========"

  # Launch
  if ! CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" start > "${log_prefix}_launch.log" 2>&1; then
    echo "[mb5-run] LAUNCH FAILED for ${config} rep${rep}; see ${log_prefix}_launch.log"
    return 1
  fi

  # Extract ENDPOINTS line emitted by mb5_launch.sh. `|| true` keeps a
  # no-match grep (non-zero under pipefail) from pre-empting the error below.
  local endpoints
  endpoints=$(grep "^ENDPOINTS=" "${log_prefix}_launch.log" | tail -1 | cut -d= -f2-) || true
  if [ -z "${endpoints}" ]; then
    echo "[mb5-run] ERROR: no ENDPOINTS in launch log"
    # Pass the same CONFIG/RUN_LABEL as the normal stop so teardown targets
    # the stack that was just started.
    CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi
  echo "[mb5-run] endpoints: ${endpoints}"

  # Replay environment. A failed activation would otherwise go unnoticed
  # and the replay would run under whatever python is on PATH.
  # shellcheck disable=SC1091
  if ! source "${VENV}/bin/activate"; then
    echo "[mb5-run] ERROR: cannot activate venv ${VENV}"
    CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi

  local replay_out="${rundir}/replay_metrics.jsonl"
  if ! mkdir -p "${rundir}"; then
    echo "[mb5-run] ERROR: cannot create ${rundir}"
    CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi

  # Replay, timed with wall-clock timestamps taken on either side.
  local t0 t1 wall_clock_s
  local replay_rc=0
  t0=$(date +%s.%N)
  # REQUEST_LIMIT_ARG is intentionally unquoted: it is either empty or
  # "--request-limit N" and must split into two words.
  # shellcheck disable=SC2086
  PYTHONPATH="${FRESH_ROOT}" python -m replayer \
    --endpoint "${endpoints}" \
    --trace "${TRACE}" \
    --output "${replay_out}" \
    --model "${MODEL_NAME}" \
    ${REQUEST_LIMIT_ARG} \
    > "${log_prefix}_replay.log" 2>&1 || replay_rc=$?
  t1=$(date +%s.%N)
  wall_clock_s=$(python -c "print(${t1} - ${t0})")

  if [ "${replay_rc}" -ne 0 ]; then
    echo "[mb5-run] REPLAY FAILED after ${wall_clock_s} s; see ${log_prefix}_replay.log"
    CONFIG="${config}" RUN_LABEL="${label}" \
      bash "${LAUNCH}" stop > /dev/null 2>&1 || true
    return 1
  fi
  echo "[mb5-run] replay done in ${wall_clock_s}s"
  echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"

  # Per-instance prefix-cache counters, scraped from each backend BEFORE
  # teardown. For PD this is the only honest reuse signal: producer ports
  # (the low ones) show cross-turn prefix-cache hits; the consumer's
  # per-request cached_tokens is meaningless (it counts transferred KV).
  # Best-effort: ports that do not answer are skipped, and --max-time keeps
  # a hung backend from blocking teardown.
  local port metrics queries hits
  {
    for port in 8000 8001 8002 8003 8004 8005 8006 8007; do
      metrics=$(curl -s --noproxy '*' --max-time 10 \
        "http://127.0.0.1:${port}/metrics" 2>/dev/null) || continue
      # First matching sample line only; field 2 is the counter value.
      queries=$(awk '/^vllm:prefix_cache_queries_total/{print $2; exit}' <<<"${metrics}")
      hits=$(awk '/^vllm:prefix_cache_hits_total/{print $2; exit}' <<<"${metrics}")
      if [ -n "${queries}" ]; then
        echo "port=${port} queries=${queries} hits=${hits}"
      fi
    done
  } > "${rundir}/instance_apc.txt" 2>/dev/null || true

  # Stop launch (cleans up vllm + proxy; reverts patch on last call)
  CONFIG="${config}" RUN_LABEL="${label}" \
    bash "${LAUNCH}" stop > "${log_prefix}_stop.log" 2>&1 || true
  sleep 10 # cooldown so GPUs settle before next config
  echo "[mb5-run] DONE ${config} rep${rep}"
}
# Preflight: fail fast (diagnostics on stderr) before any stack is launched.
if [ ! -f "${LAUNCH}" ]; then echo "missing ${LAUNCH}" >&2; exit 1; fi
if [ ! -d "${REPLAYER_DIR}" ]; then echo "missing ${REPLAYER_DIR}" >&2; exit 1; fi
if [ ! -f "${TRACE}" ]; then echo "missing trace ${TRACE}" >&2; exit 1; fi
# A non-numeric REPS would evaluate to 0 in the arithmetic loop below and
# silently run nothing, so reject it up front.
if ! [[ "${REPS}" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "REPS must be a non-negative integer, got '${REPS}'" >&2
  exit 1
fi

# Iterate over every config x rep. A failed run is counted but does not stop
# the remaining runs.
failures=0
for config in ${CONFIGS}; do
  for ((rep = 1; rep <= REPS; rep++)); do
    if ! run_one "${config}" "${rep}"; then
      failures=$((failures + 1))
    fi
  done
done

# Final patch revert (defensive — mb5_launch.sh stop also reverts)
python "${SCRIPT_DIR}/instrument_kv_snapshot.py" --revert --venv "${VENV}" 2>/dev/null || true

echo ""
echo "======== ALL CONFIGS DONE ========"
echo "failures: ${failures}"
echo "results under: ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_*"
echo "to plot: python plot_kv_pool_timeline.py --run-tag ${RUN_TAG}"

# Reflect failures in the exit status so wrappers can detect a partially
# failed sweep; the summary above is always printed first.
if ((failures > 0)); then
  exit 1
fi