mb5_run: scrape per-instance prefix-cache counters before teardown
Per-port vllm:prefix_cache_{queries,hits}_total -> instance_apc.txt. For PD
this is the only honest reuse signal: producer ports show cross-turn prefix
hits, while the consumer's per-request cached_tokens just counts transferred KV.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -92,6 +92,19 @@ run_one() {
|
||||
echo "[mb5-run] replay done in ${wall_clock_s}s"
|
||||
echo "${wall_clock_s}" > "${rundir}/wall_clock_s.txt"
|
||||
|
||||
# Per-instance prefix-cache counters, scraped from each backend BEFORE
|
||||
# teardown. For PD this is the only honest reuse signal: producer ports
|
||||
# (the low ones) show cross-turn prefix-cache hits; the consumer's
|
||||
# per-request cached_tokens is meaningless (it counts transferred KV).
|
||||
# Write one "port=<p> queries=<n> hits=<n>" line per reachable backend to
# ${rundir}/instance_apc.txt.
#
# Runs in a subshell so the scratch variables (p, m) cannot clobber
# same-named variables in the enclosing shell. This is best-effort: a
# failure here must never prevent the stop step that follows, hence the
# trailing `|| true`.
(
  for p in 8000 8001 8002 8003 8004 8005 8006 8007; do
    # -f: treat an HTTP error status as a failure instead of parsing the
    #     error page as if it were a metrics payload.
    # --connect-timeout / --max-time: a wedged backend must not stall the
    #     stop step indefinitely; unreachable ports are simply skipped.
    m=$(curl -sf --noproxy '*' --connect-timeout 2 --max-time 10 \
      "http://127.0.0.1:${p}/metrics" 2>/dev/null) || continue
    # Single awk pass: take the first sample line of each counter (the
    # '# HELP' / '# TYPE' lines do not start with 'vllm:' and are ignored)
    # and emit a line only when the queries counter is present.
    printf '%s\n' "${m}" | awk -v port="${p}" '
      !seen_q && /^vllm:prefix_cache_queries_total/ { q = $2; seen_q = 1 }
      !seen_h && /^vllm:prefix_cache_hits_total/    { h = $2; seen_h = 1 }
      END { if (q != "") printf "port=%s queries=%s hits=%s\n", port, q, h }
    '
  done
) > "${rundir}/instance_apc.txt" 2>/dev/null || true
|
||||
|
||||
# Stop launch (cleans up vllm + proxy; reverts patch on last call)
|
||||
CONFIG="${config}" RUN_LABEL="${RUN_TAG}_${config}_rep${rep}" \
|
||||
bash "${LAUNCH}" stop > "${OUT_ROOT}/${config}_rep${rep}_stop.log" 2>&1 || true
|
||||
|
||||
Reference in New Issue
Block a user