# agentic-kvc/microbench/connector_tax/layerwise/instrument_engine_state.py

#!/usr/bin/env python3
"""Patch vLLM V1 scheduler to publish REAL engine state to a shared store,
so the global router reads ground truth instead of its own stale shadow
counters (reconciled only every 30s).

Published per engine (key = AGENTIC_ENGINE_ID), throttled ~20 Hz from a
daemon thread (off the forward hot path):

  {ts, num_running, num_waiting, gpu_blocks_total, gpu_blocks_free,
   gpu_kv_used_frac, pending_prefill_tokens, ongoing_decode_tokens,
   num_prefilling, max_prefill_remaining}

`max_prefill_remaining` is the key signal /metrics does NOT expose: the
largest in-progress prefill on the engine. A big in-progress prefill holds
the GIL and stalls the mooncake receiver_loop — so the router should avoid
migrating KV to such an instance (P2).

Transport (env AGENTIC_ENGINE_STATE_URI; the publisher stays disabled when the
variable is unset or empty):
  file:///dev/shm/agentic_engine_state   (file transport; atomic temp+rename)
  redis://host:port/0                      (optional; needs redis-py + server)

Self-contained (inlined writer) so the engine process needs no repo import.
Apply/revert markers: # ES_INSTRUMENT_START / # ES_INSTRUMENT_END.

Usage:
  python instrument_engine_state.py --apply  [--venv PATH]
  python instrument_engine_state.py --revert [--venv PATH]
  python instrument_engine_state.py --check  [--venv PATH]
"""
from __future__ import annotations

import argparse
import re
from pathlib import Path

# Virtualenv patched when --venv is not given; also the fallback search root.
DEFAULT_VENV = Path("/home/admin/cpfs/wjh/agentic-kv/.venv")
# Scheduler module to patch, relative to the venv root.
TARGET_REL = "lib/python3.12/site-packages/vllm/v1/core/sched/scheduler.py"
# Marker comments bracketing every injected region (used by --check/--revert).
START = "# ES_INSTRUMENT_START"
END = "# ES_INSTRUMENT_END"

# ---- Patch 1: header (writer + publisher thread), before class Scheduler ----
HEADER_ANCHOR = "class Scheduler(SchedulerInterface):"
# HEADER is an f-string only so START/END can be interpolated; every literal
# brace that belongs to the injected code is therefore doubled.
HEADER = f'''{START}
import json as _es_json
import os as _es_os
import threading as _es_threading
import time as _es_time

_ES_URI = _es_os.environ.get("AGENTIC_ENGINE_STATE_URI", "")
_ES_ID = _es_os.environ.get("AGENTIC_ENGINE_ID") or _es_os.environ.get(
    "AGENTIC_WORKER_ID", f"engine_{{_es_os.getpid()}}")
_ES_DEFAULT_PERIOD_MS = 50.0
_ES_MIN_PERIOD_MS = 1.0


def _es_period_s() -> float:
    """Publish period in seconds, from AGENTIC_ENGINE_STATE_PERIOD_MS.

    A malformed, non-finite or non-positive value falls back to the default,
    so a typo can neither break the scheduler import nor turn the publisher
    thread into a busy loop. Valid values are clamped to a 1 ms minimum.
    """
    try:
        ms = float(_es_os.environ.get("AGENTIC_ENGINE_STATE_PERIOD_MS", ""))
    except ValueError:
        ms = _ES_DEFAULT_PERIOD_MS
    if not (0.0 < ms < float("inf")):  # also rejects NaN
        ms = _ES_DEFAULT_PERIOD_MS
    return max(ms, _ES_MIN_PERIOD_MS) / 1000.0


_ES_PERIOD_S = _es_period_s()


class _ESWriter:
    """Pluggable state writer: file:// (atomic temp+rename) or redis://."""
    def __init__(self, uri: str, engine_id: str):
        self.engine_id = engine_id
        self.kind = None
        if uri.startswith("file://"):
            self.kind = "file"
            self.dir = uri[len("file://"):]
            _es_os.makedirs(self.dir, exist_ok=True)
            self.path = _es_os.path.join(self.dir, f"{{engine_id}}.json")
            # Per-process temp name so two engines never share a temp file.
            self.tmp = self.path + f".tmp.{{_es_os.getpid()}}"
        elif uri.startswith("redis://"):
            self.kind = "redis"
            import redis  # lazy
            self.r = redis.Redis.from_url(uri)
            self.key = f"engine_state:{{engine_id}}"
        else:
            # Fail loudly at construction instead of running a publisher
            # thread that silently writes nothing.
            raise ValueError(
                f"unsupported AGENTIC_ENGINE_STATE_URI: {{uri!r}}")

    def publish(self, state: dict):
        """Best-effort write of one snapshot; errors are swallowed on purpose
        so instrumentation can never take the engine down."""
        try:
            if self.kind == "file":
                with open(self.tmp, "w") as f:
                    f.write(_es_json.dumps(state))
                _es_os.replace(self.tmp, self.path)  # atomic
            elif self.kind == "redis":
                self.r.set(self.key, _es_json.dumps(state), ex=5)
        except Exception:
            pass


def _es_compute_snapshot(scheduler) -> dict:
    """Cheap O(batch) state read from the live scheduler."""
    try:
        kvm = scheduler.kv_cache_manager
        pool = kvm.block_pool
        total = int(pool.num_gpu_blocks)
        free = int(pool.get_num_free_blocks())
    except Exception:
        total = free = -1
    n_run = 0
    pend = 0
    dec = 0
    n_pref = 0
    max_pref = 0
    try:
        # Iterate a copy: the scheduler thread mutates `running` while this
        # runs on the publisher thread.
        for r in list(scheduler.running):
            n_run += 1
            npr = int(getattr(r, "num_prompt_tokens", 0))
            nct = int(getattr(r, "num_computed_tokens", 0))
            if nct < npr:  # still prefilling
                rem = npr - nct
                pend += rem
                n_pref += 1
                if rem > max_pref:
                    max_pref = rem
            else:  # decoding
                dec += int(getattr(r, "num_tokens", 0))
    except Exception:
        pass
    n_wait = 0
    try:
        n_wait = len(scheduler.waiting) + len(getattr(scheduler, "skipped_waiting", []))
        for r in list(scheduler.waiting):
            pend += max(0, int(getattr(r, "num_prompt_tokens", 0))
                        - int(getattr(r, "num_computed_tokens", 0)))
    except Exception:
        pass
    # -1.0 marks "unknown" when the block pool could not be read.
    used_frac = ((total - free) / total) if (total and total > 0) else -1.0
    return {{
        "ts": _es_time.time(),
        "engine_id": _ES_ID,
        "num_running": n_run,
        "num_waiting": int(n_wait),
        "gpu_blocks_total": total,
        "gpu_blocks_free": free,
        "gpu_kv_used_frac": used_frac,
        "pending_prefill_tokens": int(pend),
        "ongoing_decode_tokens": int(dec),
        "num_prefilling": n_pref,
        "max_prefill_remaining": int(max_pref),
    }}


class _ESPublisher:
    """Daemon thread that publishes a scheduler snapshot every _ES_PERIOD_S."""
    def __init__(self, scheduler):
        self._sched = scheduler
        self._writer = _ESWriter(_ES_URI, _ES_ID)
        self._stop = _es_threading.Event()
        self._t = _es_threading.Thread(target=self._loop, daemon=True)
        self._t.start()

    def stop(self):
        """Ask the publisher thread to exit; the loop wakes immediately."""
        self._stop.set()

    def _loop(self):
        while not self._stop.is_set():
            try:
                self._writer.publish(_es_compute_snapshot(self._sched))
            except Exception:
                pass
            # Event.wait is the throttle and, unlike sleep, is interruptible.
            self._stop.wait(_ES_PERIOD_S)
{END}


'''

# ---- Patch 2: start the publisher at the end of Scheduler.__init__ ----------
# Anchor on the existing agentic step-log block tail in __init__.
# NOTE(review): this anchor line is not inserted by this script; presumably it
# comes from a separate step-log instrumentation patch that must already be
# applied to scheduler.py — confirm. apply() raises if the anchor is missing.
INIT_ANCHOR = """        _step_path = _os.environ.get("AGENTIC_STEP_LOG_PATH")"""
# Replacement text: the guarded publisher start-up (between the START/END
# markers) followed by the anchor line itself, so the anchor stays in the file.
# The publisher is only created when _ES_URI is non-empty, and any exception
# during start-up is logged as a warning instead of propagating.
INIT_INSERT = f"""        {START}
        if _ES_URI:
            try:
                self._es_publisher = _ESPublisher(self)
                logger.info("agentic engine-state publisher: uri=%s id=%s",
                            _ES_URI, _ES_ID)
            except Exception as _e:
                logger.warning("engine-state publisher disabled (%r)", _e)
        {END}
        _step_path = _os.environ.get("AGENTIC_STEP_LOG_PATH")"""

# Ordered (name, anchor text, replacement text) triples. Each replacement
# contains its own anchor, so applying a patch inserts code in front of the
# anchor without removing it.
PATCHES = [
    ("header", HEADER_ANCHOR, HEADER + HEADER_ANCHOR),
    ("init", INIT_ANCHOR, INIT_INSERT),
]


def find_target(venv: Path) -> Path:
    """Locate the scheduler module to patch.

    Looks under *venv* first and then under DEFAULT_VENV. Because this tool
    rewrites an installed package, using the fallback is announced on stdout
    rather than done silently.

    Args:
        venv: Root of the virtualenv requested on the command line.

    Returns:
        Path of the first existing ``TARGET_REL`` file.

    Raises:
        FileNotFoundError: if the file exists in neither location; the
            message lists every path that was tried.
    """
    requested = venv / TARGET_REL
    candidates = [requested]
    fallback = DEFAULT_VENV / TARGET_REL
    if fallback != requested:
        candidates.append(fallback)
    for c in candidates:
        if c.is_file():
            if c != requested:
                print(f"[es-instr] {TARGET_REL} not found under {venv}; "
                      f"falling back to {c}")
            return c
    searched = ", ".join(str(c) for c in candidates)
    raise FileNotFoundError(
        f"cannot find {TARGET_REL} under {venv} (searched: {searched})")


def is_patched(t: str) -> bool:
    """Report whether the text *t* contains the instrumentation START marker."""
    marker_pos = t.find(START)
    return marker_pos != -1


def apply(target: Path):
    """Insert every entry of PATCHES into *target*, at most once.

    The file is left untouched when it already contains the START marker.
    All replacements are built in memory first, so a missing anchor aborts
    before anything is written.

    Args:
        target: Path of the scheduler source file to patch.

    Raises:
        RuntimeError: if any patch anchor is absent from the file.
    """
    # Python source is UTF-8 by default; do not depend on the locale encoding.
    text = target.read_text(encoding="utf-8")
    if is_patched(text):
        print(f"[es-instr] already patched: {target}")
        return
    new = text
    for name, src, dst in PATCHES:
        if src not in new:
            raise RuntimeError(f"patch {name!r}: anchor not found in {target}")
        # Only the first occurrence of each anchor is patched.
        new = new.replace(src, dst, 1)
    target.write_text(new, encoding="utf-8")
    print(f"[es-instr] applied {len(PATCHES)} patches -> {target}")


def revert(target: Path):
    """Remove the instrumentation from *target*, restoring the pre-patch text.

    First undoes each PATCHES entry exactly (replacement text -> anchor), which
    makes apply() followed by revert() a byte-for-byte round trip. If markers
    remain afterwards (for example the file was patched by a different revision
    of this script), falls back to stripping every START..END region, plus
    the two blank lines the header adds in front of ``class Scheduler(``.

    Args:
        target: Path of the scheduler source file to restore.
    """
    text = target.read_text(encoding="utf-8")
    if not is_patched(text):
        print(f"[es-instr] not patched: {target}")
        return
    new = text
    # Exact inverse of apply(), in reverse application order.
    for _name, src, dst in reversed(PATCHES):
        new = new.replace(dst, src, 1)
    if is_patched(new):
        # Best-effort fallback keyed only on the markers. The optional group
        # consumes just the blank lines that follow the header's END marker,
        # so the spacing that originally preceded the class is preserved.
        pat = re.compile(
            r"[ \t]*" + re.escape(START) + r".*?" + re.escape(END) + r"\n"
            r"(?:\n\n(?=class Scheduler\())?",
            flags=re.DOTALL)
        new = pat.sub("", new)
    target.write_text(new, encoding="utf-8")
    print(f"[es-instr] reverted: {target}")


def main():
    """Command-line entry point: apply, revert or check the instrumentation.

    When several action flags are given, precedence is --apply, then
    --revert, then --check. The action is validated before the target file
    is looked up, so a missing flag always produces the usage error rather
    than a FileNotFoundError from the lookup.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--apply", action="store_true")
    p.add_argument("--revert", action="store_true")
    p.add_argument("--check", action="store_true")
    p.add_argument("--venv", type=Path, default=DEFAULT_VENV)
    a = p.parse_args()
    if not (a.apply or a.revert or a.check):
        p.error("specify --apply/--revert/--check")  # exits with status 2
    t = find_target(a.venv)
    if a.apply:
        apply(t)
    elif a.revert:
        revert(t)
    else:  # --check
        print(f"[es-instr] {'PATCHED' if is_patched(t.read_text()) else 'CLEAN'}: {t}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()