agentic-kvc/microbench/fresh_setup/mb5_run_gpu.sh

#!/usr/bin/env bash
# Orchestrator for MB5: for each CONFIG × rep, bring up the stack, run a
# trace replay against it, collect KV snapshots and replayer metrics,
# tear down.
#
# Designed to be run on dash1 (or any host with cpfs mounted at
# /home/admin/cpfs/wjh/).
#
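# Env vars (with defaults):
#   CONFIGS    : space-separated MB5 configs (default: "8C 6P+2D 4P+4D 2P+6D")
#   REPS       : reps per config (default: 3)
#   TRACE      : trace JSONL path
#                (default: /home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl)
#   RUN_TAG    : output root tag (default: $(date +%Y%m%d_%H%M%S))
#   MODEL_NAME : model passed to the replayer via --model
#                (default: /home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   MB5_VENV   : virtualenv activated before the replay
#                (default: /home/admin/cpfs/wjh/agentic-kv-fresh/.venv)
#   REQUEST_LIMIT : optional, cap replay requests (default: none)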

# Abort on unhandled command failures and on failures inside pipelines.
set -eo pipefail

# Directory containing this script; the launcher is resolved relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"

# Fixed locations on the shared filesystem.
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
REPLAYER_DIR="${FRESH_ROOT}/replayer"
# MB5_VENV lets a second host use an isolated venv clone (see mb5_launch.sh).
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"

# Caller-overridable knobs; an unset or empty value falls back to the default.
: "${CONFIGS:=8C 6P+2D 4P+4D 2P+6D}"
: "${REPS:=3}"
: "${TRACE:=/home/admin/cpfs/wjh/agentic-kv/traces/w600_r0.0015_st30.jsonl}"
: "${MODEL_NAME:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
# Plain assignment (not ':=') so a failing date still aborts under 'set -e'.
RUN_TAG="${RUN_TAG:-$(date +%Y%m%d_%H%M%S)}"

# Empty unless REQUEST_LIMIT is set and non-empty; expanded unquoted by run_one.
REQUEST_LIMIT_ARG="${REQUEST_LIMIT:+--request-limit ${REQUEST_LIMIT}}"

# Launch/replay/stop logs for every (config, rep) of this tag are written here.
OUT_ROOT="${FRESH_ROOT}/mb5_runs/${RUN_TAG}"
mkdir -p "${OUT_ROOT}"

# Banner: printf reuses the format for each NAME/value pair.
printf '[mb5-run] %s=%s\n' \
    RUN_TAG "${RUN_TAG}" \
    OUT_ROOT "${OUT_ROOT}" \
    CONFIGS "${CONFIGS}" \
    REPS "${REPS}" \
    TRACE "${TRACE}"

#######################################
# Best-effort teardown of the stack started for one (config, rep).
# Globals:   LAUNCH (read)
# Arguments: $1 - config, $2 - run label, $3 - file receiving stop output
# Returns:   always 0, so it is safe to call on error paths
#######################################
_mb5_stop_stack() {
    local stop_log="$3"
    # If the log cannot be created, still run the stop rather than skipping
    # it because of a failed redirection.
    { : > "${stop_log}"; } 2>/dev/null || stop_log=/dev/null
    CONFIG="$1" RUN_LABEL="$2" bash "${LAUNCH}" stop > "${stop_log}" 2>&1 || true
}

#######################################
# Run one (config, rep): start the stack, replay the trace against it,
# record timing and per-instance prefix-cache counters, then stop the stack.
# Globals:   RUN_TAG, FRESH_ROOT, OUT_ROOT, LAUNCH, VENV, TRACE, MODEL_NAME,
#            REQUEST_LIMIT_ARG (all read)
# Arguments: $1 - config name (e.g. 8C, 6P+2D), $2 - repetition number
# Outputs:   progress lines on stdout; logs under OUT_ROOT; results under
#            ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_<config>_rep<rep>/
# Returns:   0 on success, 1 if launch, setup or replay failed
#######################################
run_one() {
    local config="$1" rep="$2"
    local label="${RUN_TAG}_${config}_rep${rep}"
    local rundir="${FRESH_ROOT}/mb5_runs/${label}"
    local log_prefix="${OUT_ROOT}/${config}_rep${rep}"
    local launch_log="${log_prefix}_launch.log"
    local replay_log="${log_prefix}_replay.log"
    local stop_log="${log_prefix}_stop.log"
    echo ""
    echo "======== ${config} rep${rep} ========"

    # Launch
    if ! CONFIG="${config}" RUN_LABEL="${label}" \
            bash "${LAUNCH}" start > "${launch_log}" 2>&1; then
        echo "[mb5-run] LAUNCH FAILED for ${config} rep${rep}; see ${launch_log}"
        # A failed start may have left some processes up; try to clear them
        # so the next config does not inherit them.
        _mb5_stop_stack "${config}" "${label}" "${stop_log}"
        return 1
    fi

    # Extract ENDPOINTS line emitted by mb5_launch.sh. '|| true' keeps a
    # no-match (non-zero under pipefail) from aborting before the check below.
    local endpoints
    endpoints=$(grep "^ENDPOINTS=" "${launch_log}" | tail -1 | cut -d= -f2-) || true
    if [ -z "${endpoints}" ]; then
        echo "[mb5-run] ERROR: no ENDPOINTS in launch log"
        _mb5_stop_stack "${config}" "${label}" "${stop_log}"
        return 1
    fi
    echo "[mb5-run] endpoints: ${endpoints}"

    # Replay environment. Checked explicitly: errexit is suppressed when this
    # function is called from an 'if' condition.
    # shellcheck disable=SC1091
    if ! source "${VENV}/bin/activate"; then
        echo "[mb5-run] ERROR: cannot activate venv ${VENV}"
        _mb5_stop_stack "${config}" "${label}" "${stop_log}"
        return 1
    fi
    local replay_out="${rundir}/replay_metrics.jsonl"
    if ! mkdir -p "${rundir}"; then
        echo "[mb5-run] ERROR: cannot create ${rundir}"
        _mb5_stop_stack "${config}" "${label}" "${stop_log}"
        return 1
    fi

    # per-GPU utilization timeseries over the replay window (2s sampling)
    local gpu_sample_s=2
    bash "${FRESH_ROOT}/microbench/fresh_setup/gpu_monitor.sh" \
        "${rundir}/gpu_util.csv" "${gpu_sample_s}" > /dev/null 2>&1 &
    local gpu_mon_pid=$!

    local t0 t1 wall_clock_s replay_rc=0
    t0=$(date +%s.%N)
    # REQUEST_LIMIT_ARG is intentionally unquoted: it is either empty or
    # "--request-limit N" and must split into separate words.
    # shellcheck disable=SC2086
    PYTHONPATH="${FRESH_ROOT}" python -m replayer \
            --endpoint "${endpoints}" \
            --trace "${TRACE}" \
            --output "${replay_out}" \
            --model "${MODEL_NAME}" \
            ${REQUEST_LIMIT_ARG} \
            > "${replay_log}" 2>&1 || replay_rc=$?
    t1=$(date +%s.%N)
    kill "${gpu_mon_pid}" 2>/dev/null || true
    # Timestamps are passed as arguments rather than spliced into the code.
    wall_clock_s=$(python -c 'import sys; print(float(sys.argv[2]) - float(sys.argv[1]))' \
        "${t0}" "${t1}") || wall_clock_s=""

    if [ "${replay_rc}" -ne 0 ]; then
        echo "[mb5-run] REPLAY FAILED after ${wall_clock_s} s; see ${replay_log}"
        _mb5_stop_stack "${config}" "${label}" "${stop_log}"
        return 1
    fi
    echo "[mb5-run] replay done in ${wall_clock_s}s"
    echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"
    printf '{"t_start_unix":%s,"t_end_unix":%s}\n' "${t0}" "${t1}" > "${rundir}/run_window.json"

    # Per-instance prefix-cache counters, scraped from each backend BEFORE
    # teardown. For PD this is the only honest reuse signal: producer ports
    # (the low ones) show cross-turn prefix-cache hits; the consumer's
    # per-request cached_tokens is meaningless (it counts transferred KV).
    local port metrics queries hits
    {
        for port in 8000 8001 8002 8003 8004 8005 8006 8007; do
            # --max-time bounds the scrape so one hung backend cannot stall
            # the whole sweep; unreachable ports are skipped.
            metrics=$(curl -s --max-time 10 --noproxy '*' \
                "http://127.0.0.1:${port}/metrics" 2>/dev/null) || continue
            # Here-strings avoid SIGPIPE when awk exits after the first match.
            queries=$(awk '/^vllm:prefix_cache_queries_total/{print $2; exit}' <<< "${metrics}")
            hits=$(awk '/^vllm:prefix_cache_hits_total/{print $2; exit}' <<< "${metrics}")
            if [ -n "${queries}" ]; then
                echo "port=${port} queries=${queries} hits=${hits}"
            fi
        done
    } > "${rundir}/instance_apc.txt" 2>/dev/null || true

    # Stop launch (cleans up vllm + proxy; reverts patch on last call)
    _mb5_stop_stack "${config}" "${label}" "${stop_log}"

    sleep 10  # cooldown so GPUs settle before next config
    echo "[mb5-run] DONE ${config} rep${rep}"
}

# Preflight: fail fast, before any stack is brought up, if the launcher,
# the replayer directory or the trace file is missing.
if [ ! -f "${LAUNCH}" ]; then echo "missing ${LAUNCH}"; exit 1; fi
if [ ! -d "${REPLAYER_DIR}" ]; then echo "missing ${REPLAYER_DIR}"; exit 1; fi
if [ ! -f "${TRACE}" ]; then echo "missing trace ${TRACE}"; exit 1; fi

# REPS drives an arithmetic for-loop. A non-numeric value would otherwise be
# evaluated as an (unset) variable name, run zero reps, and still report
# "failures: 0", so reject it up front.
case "${REPS}" in
    '' | *[!0-9]*)
        echo "REPS must be a non-negative integer, got '${REPS}'"
        exit 1
        ;;
esac
# Force base 10 so values such as "08" are not parsed as invalid octal.
reps_total=$((10#${REPS}))

# Iterate over every config x rep. A failed run is counted and the sweep
# continues with the next one.
failures=0
# CONFIGS is a space-separated list and is split on purpose.
# shellcheck disable=SC2086
for config in ${CONFIGS}; do
    for ((rep = 1; rep <= reps_total; rep++)); do
        if ! run_one "${config}" "${rep}"; then
            failures=$((failures + 1))
        fi
    done
done

# Final patch revert (defensive — mb5_launch.sh stop also reverts)
python "${SCRIPT_DIR}/instrument_kv_snapshot.py" --revert --venv "${VENV}" 2>/dev/null || true

echo ""
echo "======== ALL CONFIGS DONE ========"
echo "failures: ${failures}"
echo "results under: ${FRESH_ROOT}/mb5_runs/${RUN_TAG}_*"
echo "to plot:  python plot_kv_pool_timeline.py --run-tag ${RUN_TAG}"

# Exit non-zero when any run failed, so wrappers and schedulers can detect it.
if [ "${failures}" -gt 0 ]; then
    exit 1
fi