vLLM scheduler publishes real state (running/waiting, KV free, and the max-in-progress-prefill signal /metrics lacks) to a tmpfs/redis store ~20Hz; router reads it and avoids GIL-stall (mid-large-prefill) + KV-capacity-wall targets, using real load over 30s-stale shadow counters. Components: engine_state.py (canonical+reader), instrument_engine_state.py (scheduler patch, file/redis writer), migration_target.py (scorer), proxy wiring (--engine-state-uri, off=unchanged). All unit-tested without GPU; not yet run live. See P2_ENGINE_STATE.md. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
235 lines
7.8 KiB
Python
235 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Patch vLLM V1 scheduler to publish REAL engine state to a shared store,
|
|
so the global router reads ground truth instead of its own stale shadow
|
|
counters (reconciled only every 30s).
|
|
|
|
Published per engine (key = AGENTIC_ENGINE_ID), throttled ~20 Hz from a
|
|
daemon thread (off the forward hot path):
|
|
|
|
{ts, num_running, num_waiting, gpu_blocks_total, gpu_blocks_free,
|
|
gpu_kv_used_frac, pending_prefill_tokens, ongoing_decode_tokens,
|
|
num_prefilling, max_prefill_remaining}
|
|
|
|
`max_prefill_remaining` is the key signal /metrics does NOT expose: the
|
|
largest in-progress prefill on the engine. A big in-progress prefill holds
|
|
the GIL and stalls the mooncake receiver_loop — so the router should avoid
|
|
migrating KV to such an instance (P2).
|
|
|
|
Transport (env AGENTIC_ENGINE_STATE_URI; unset or empty = publisher not started):
  file:///dev/shm/agentic_engine_state   (recommended; atomic temp+rename)
  redis://host:port/0                     (optional; needs redis-py + server)
|
|
|
|
Self-contained (inlined writer) so the engine process needs no repo import.
|
|
Apply/revert markers: # ES_INSTRUMENT_START / # ES_INSTRUMENT_END.
|
|
|
|
Usage:
|
|
python instrument_engine_state.py --apply [--venv PATH]
|
|
python instrument_engine_state.py --revert [--venv PATH]
|
|
python instrument_engine_state.py --check [--venv PATH]
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
|
|
DEFAULT_VENV = Path("/home/admin/cpfs/wjh/agentic-kv/.venv")
TARGET_REL = "lib/python3.12/site-packages/vllm/v1/core/sched/scheduler.py"
START = "# ES_INSTRUMENT_START"
END = "# ES_INSTRUMENT_END"

# ---- Patch 1: header (writer + publisher thread), before class Scheduler ----
# HEADER is an f-string template: START/END are interpolated here, and every
# doubled brace below is emitted as a single literal brace in the injected
# source.  The text ends with two blank lines so the class that follows keeps
# its usual spacing.
HEADER_ANCHOR = "class Scheduler(SchedulerInterface):"
HEADER = f'''{START}
import json as _es_json
import os as _es_os
import threading as _es_threading
import time as _es_time

_ES_URI = _es_os.environ.get("AGENTIC_ENGINE_STATE_URI", "")
_ES_ID = _es_os.environ.get("AGENTIC_ENGINE_ID") or _es_os.environ.get(
    "AGENTIC_WORKER_ID", f"engine_{{_es_os.getpid()}}")
try:
    _ES_PERIOD_S = float(
        _es_os.environ.get("AGENTIC_ENGINE_STATE_PERIOD_MS", "50")) / 1000.0
except ValueError:
    # A malformed override must not break importing the scheduler module.
    _ES_PERIOD_S = 0.05
if not _ES_PERIOD_S >= 0.001:
    # Zero, negative or NaN periods would busy-spin or fail the wait call.
    _ES_PERIOD_S = 0.001


class _ESWriter:
    """Pluggable state writer: file:// (atomic temp+rename) or redis://."""

    def __init__(self, uri: str, engine_id: str):
        self.engine_id = engine_id
        self.kind = None
        self.last_error = None  # most recent publish failure, for debugging
        if uri.startswith("file://"):
            self.kind = "file"
            self.dir = uri[len("file://"):]
            _es_os.makedirs(self.dir, exist_ok=True)
            self.path = _es_os.path.join(self.dir, f"{{engine_id}}.json")
            self.tmp = self.path + f".tmp.{{_es_os.getpid()}}"
        elif uri.startswith("redis://"):
            self.kind = "redis"
            import redis  # lazy
            self.r = redis.Redis.from_url(uri)
            self.key = f"engine_state:{{engine_id}}"
        else:
            # Fail loudly at construction instead of running a publisher
            # thread that silently writes nothing.
            raise ValueError(f"unsupported engine-state URI: {{uri!r}}")

    def publish(self, state: dict):
        """Best-effort write of one snapshot; never raises."""
        try:
            payload = _es_json.dumps(state)
            if self.kind == "file":
                with open(self.tmp, "w") as f:
                    f.write(payload)
                _es_os.replace(self.tmp, self.path)  # atomic
            elif self.kind == "redis":
                self.r.set(self.key, payload, ex=5)
        except Exception as e:
            # Publishing must not disturb the engine; keep the error around.
            self.last_error = e


def _es_compute_snapshot(scheduler) -> dict:
    """Cheap O(batch) state read from the live scheduler."""
    try:
        kvm = scheduler.kv_cache_manager
        pool = kvm.block_pool
        total = int(pool.num_gpu_blocks)
        free = int(pool.get_num_free_blocks())
    except Exception:
        total = free = -1
    n_run = 0
    pend = 0
    dec = 0
    n_pref = 0
    max_pref = 0
    try:
        # Copy first: the scheduler thread mutates this collection while the
        # publisher thread reads it.
        for r in list(scheduler.running):
            n_run += 1
            npr = int(getattr(r, "num_prompt_tokens", 0))
            nct = int(getattr(r, "num_computed_tokens", 0))
            if nct < npr:  # still prefilling
                rem = npr - nct
                pend += rem
                n_pref += 1
                if rem > max_pref:
                    max_pref = rem
            else:  # decoding
                dec += int(getattr(r, "num_tokens", 0))
    except Exception:
        pass
    n_wait = 0
    try:
        n_wait = len(scheduler.waiting) + len(getattr(scheduler, "skipped_waiting", []))
        for r in list(scheduler.waiting):
            pend += max(0, int(getattr(r, "num_prompt_tokens", 0))
                        - int(getattr(r, "num_computed_tokens", 0)))
    except Exception:
        pass
    used_frac = ((total - free) / total) if (total and total > 0) else -1.0
    return {{
        "ts": _es_time.time(),
        "engine_id": _ES_ID,
        "num_running": n_run,
        "num_waiting": int(n_wait),
        "gpu_blocks_total": total,
        "gpu_blocks_free": free,
        "gpu_kv_used_frac": used_frac,
        "pending_prefill_tokens": int(pend),
        "ongoing_decode_tokens": int(dec),
        "num_prefilling": n_pref,
        "max_prefill_remaining": int(max_pref),
    }}


class _ESPublisher:
    """Daemon thread that publishes one snapshot every _ES_PERIOD_S seconds."""

    def __init__(self, scheduler):
        self._sched = scheduler
        self._writer = _ESWriter(_ES_URI, _ES_ID)
        self._stop = _es_threading.Event()
        self._t = _es_threading.Thread(target=self._loop, daemon=True)
        self._t.start()

    def stop(self):
        """Ask the publisher thread to exit; returns immediately."""
        self._stop.set()

    def _loop(self):
        while not self._stop.is_set():
            try:
                self._writer.publish(_es_compute_snapshot(self._sched))
            except Exception:
                pass
            # Event.wait doubles as the throttle and wakes early on stop().
            self._stop.wait(_ES_PERIOD_S)
{END}


'''
|
|
|
|
# ---- Patch 2: start the publisher at the end of Scheduler.__init__ ----------
# Anchor on the existing agentic step-log block tail in __init__.
# INIT_INSERT ends with the anchor line itself, so replacing the anchor with
# INIT_INSERT places the marker-delimited start-up block directly before it.
# The start-up block only runs when _ES_URI is non-empty, and any failure to
# construct the publisher is logged as a warning rather than raised.
# NOTE(review): the leading whitespace inside these literals is assumed to be
# the 8-space method-body indent of Scheduler.__init__ — confirm against the
# target scheduler.py before applying.
INIT_ANCHOR = """        _step_path = _os.environ.get("AGENTIC_STEP_LOG_PATH")"""
INIT_INSERT = f"""        {START}
        if _ES_URI:
            try:
                self._es_publisher = _ESPublisher(self)
                logger.info("agentic engine-state publisher: uri=%s id=%s",
                            _ES_URI, _ES_ID)
            except Exception as _e:
                logger.warning("engine-state publisher disabled (%r)", _e)
        {END}
        _step_path = _os.environ.get("AGENTIC_STEP_LOG_PATH")"""

# Each entry is (name, anchor text to find, replacement text); apply() walks
# the list in order and replaces the first occurrence of each anchor.
PATCHES = [
    ("header", HEADER_ANCHOR, HEADER + HEADER_ANCHOR),
    ("init", INIT_ANCHOR, INIT_INSERT),
]
|
|
|
|
|
|
def find_target(venv: Path) -> Path:
    """Locate the scheduler source file that the patches are applied to.

    The file is looked up under *venv* first and under DEFAULT_VENV second;
    the first candidate that exists as a regular file is returned.

    Raises:
        FileNotFoundError: if neither location contains the file.
    """
    candidates = (root / TARGET_REL for root in (venv, DEFAULT_VENV))
    found = next((path for path in candidates if path.is_file()), None)
    if found is None:
        raise FileNotFoundError(f"cannot find {TARGET_REL} under {venv}")
    return found
|
|
|
|
|
|
def is_patched(t: str) -> bool:
    """Return True when the text *t* contains the instrumentation start marker."""
    return t.find(START) != -1
|
|
|
|
|
|
def apply(target: Path):
    """Insert every entry of PATCHES into *target*, once.

    Does nothing (beyond printing a notice) when the start marker is already
    present.  All anchors are resolved on an in-memory copy, so a missing
    anchor aborts before the file is touched.

    Raises:
        RuntimeError: if a patch anchor is not present in the file.
    """
    # Python source is UTF-8 by default; do not depend on the process locale.
    text = target.read_text(encoding="utf-8")
    if is_patched(text):
        print(f"[es-instr] already patched: {target}")
        return
    new = text
    for name, src, dst in PATCHES:
        if src not in new:
            raise RuntimeError(f"patch {name!r}: anchor not found in {target}")
        # Only the first occurrence of the anchor is replaced.
        new = new.replace(src, dst, 1)
    target.write_text(new, encoding="utf-8")
    print(f"[es-instr] applied {len(PATCHES)} patches -> {target}")
|
|
|
|
|
|
def revert(target: Path):
    """Remove every START..END instrumentation block from *target*.

    Does nothing (beyond printing a notice) when the start marker is absent.
    The header block is removed together with the two blank lines it added in
    front of ``class Scheduler(``, so the spacing that preceded the class
    before patching is left exactly as it was.

    Raises:
        RuntimeError: if a start marker is still present after removal, which
            means a block has no matching end marker; the file is not written.
    """
    # Python source is UTF-8 by default; do not depend on the process locale.
    text = target.read_text(encoding="utf-8")
    if not is_patched(text):
        print(f"[es-instr] not patched: {target}")
        return
    pat = re.compile(
        r"[ \t]*" + re.escape(START) + r".*?" + re.escape(END) + r"\n"
        # Blank lines owned by the header patch, only when the class follows.
        r"(?:\n\n(?=class Scheduler\())?",
        flags=re.DOTALL,
    )
    new = pat.sub("", text)
    if is_patched(new):
        raise RuntimeError(
            f"unbalanced instrumentation markers in {target}; not modified")
    target.write_text(new, encoding="utf-8")
    print(f"[es-instr] reverted: {target}")
|
|
|
|
|
|
def main():
    """Command-line entry point: apply, revert or check the scheduler patch.

    Exactly one of --apply / --revert / --check must be given.  The flags are
    validated before the target file is looked up, so a usage mistake is
    reported as a usage error rather than a missing-file traceback.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--apply", action="store_true")
    p.add_argument("--revert", action="store_true")
    p.add_argument("--check", action="store_true")
    p.add_argument("--venv", type=Path, default=DEFAULT_VENV)
    a = p.parse_args()

    chosen = sum((a.apply, a.revert, a.check))
    if chosen == 0:
        p.error("specify --apply/--revert/--check")
    if chosen > 1:
        p.error("--apply/--revert/--check are mutually exclusive")

    t = find_target(a.venv)
    if a.apply:
        apply(t)
    elif a.revert:
        revert(t)
    else:
        state = "PATCHED" if is_patched(t.read_text(encoding="utf-8")) else "CLEAN"
        print(f"[es-instr] {state}: {t}")


if __name__ == "__main__":
    main()
|