mb5_run: scrape per-instance prefix-cache counters before teardown

Scrape each port's vllm:prefix_cache_{queries,hits}_total counters into instance_apc.txt. For PD this is the only honest reuse signal: producer ports show cross-turn prefix hits, while the consumer's per-request cached_tokens just counts transferred KV.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-29 11:56:43 +08:00
parent d376d91fe1
commit e532e83d3e
1 changed file with 13 additions and 0 deletions
--- a/microbench/fresh_setup/mb5_run.sh
+++ b/microbench/fresh_setup/mb5_run.sh
@@ -92,6 +92,19 @@ run_one() {
    echo "[mb5-run] replay done in ${wall_clock_s}s"
    echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"

+    # Per-instance prefix-cache counters, scraped from each backend BEFORE
+    # teardown. For PD this is the only honest reuse signal: producer ports
+    # (the low ones) show cross-turn prefix-cache hits; the consumer's
+    # per-request cached_tokens is meaningless (it counts transferred KV).
+    {
+        for p in 8000 8001 8002 8003 8004 8005 8006 8007; do
+            m=$(curl -s --noproxy '*' "http://127.0.0.1:${p}/metrics" 2>/dev/null) || continue
+            q=$(printf '%s' "$m" | awk '/^vllm:prefix_cache_queries_total/{print $2; exit}')
+            h=$(printf '%s' "$m" | awk '/^vllm:prefix_cache_hits_total/{print $2; exit}')
+            [ -n "${q}" ] && echo "port=${p} queries=${q} hits=${h}"
+        done
+    } > "${rundir}/instance_apc.txt" 2>/dev/null || true
+
    # Stop launch (cleans up vllm + proxy; reverts patch on last call)
    CONFIG="${config}" RUN_LABEL="${RUN_TAG}_${config}_rep${rep}" \
        bash "${LAUNCH}" stop > "${OUT_ROOT}/${config}_rep${rep}_stop.log" 2>&1 || true