diff --git a/microbench/fresh_setup/instrument_kv_snapshot.py b/microbench/fresh_setup/instrument_kv_snapshot.py index d90d257..7343668 100644 --- a/microbench/fresh_setup/instrument_kv_snapshot.py +++ b/microbench/fresh_setup/instrument_kv_snapshot.py @@ -36,6 +36,7 @@ MOONCAKE_REL = ( "lib/python3.12/site-packages/vllm/distributed/kv_transfer/" "kv_connector/v1/mooncake/mooncake_connector.py" ) +LOGGERS_REL = "lib/python3.12/site-packages/vllm/v1/metrics/loggers.py" START_MARK = "# MB5_INSTRUMENT_START" END_MARK = "# MB5_INSTRUMENT_END" @@ -192,9 +193,37 @@ MOONCAKE_PATCHES = [ MOONCAKE_ANCHOR + MOONCAKE_INSERT), ] +# ---------- Patch 4: vLLM 0.18.1 PD-consumer metrics counter underflow ------ +# In PromptTokenStats.update_from_output, local_cache_hit is computed as +# (num_cached_tokens + recomputed - num_external_computed_tokens). On a +# kv_consumer, a remote KV transfer can report more external-computed tokens +# than the scheduler's cached count (especially on a KV-load failure for a +# large request), driving local_cache_hit negative. loggers.record() then calls +# Counter.inc() with that negative value and prometheus_client raises +# "Counters can only be incremented by non-negative amounts.", which kills the +# EngineCore — turning one failed request into a total config collapse. +# Clamp local_cache_hit/computed/external_kv_transfer to >= 0 before recording. +LOGGERS_ANCHOR = " pts = iteration_stats.prompt_token_stats\n" +LOGGERS_INSERT = ( + f" {START_MARK}\n" + f" if pts.local_cache_hit < 0:\n" + f" pts.local_cache_hit = 0\n" + f" if pts.computed < 0:\n" + f" pts.computed = 0\n" + f" if pts.external_kv_transfer < 0:\n" + f" pts.external_kv_transfer = 0\n" + f" {END_MARK}\n" +) + +LOGGERS_PATCHES = [ + ("PD-consumer counter underflow clamp", LOGGERS_ANCHOR, + LOGGERS_ANCHOR + LOGGERS_INSERT), +] + PATCH_FILES = [ (TARGET_REL, SCHED_PATCHES), (MOONCAKE_REL, MOONCAKE_PATCHES), + (LOGGERS_REL, LOGGERS_PATCHES), ]