Files
agentic-kvc/microbench/connector_tax/layerwise/migration_target.py
Gahow Wang be948d32b8 P2: real engine-state feed replaces stale shadow counters for migration targeting
vLLM scheduler publishes real state (running/waiting, KV free, and the
max-in-progress-prefill signal /metrics lacks) to a tmpfs/redis store ~20Hz;
router reads it and avoids GIL-stall (mid-large-prefill) + KV-capacity-wall
targets, using real load over 30s-stale shadow counters. Components:
engine_state.py (canonical+reader), instrument_engine_state.py (scheduler
patch, file/redis writer), migration_target.py (scorer), proxy wiring
(--engine-state-uri, off=unchanged). All unit-tested without GPU; not yet
run live. See P2_ENGINE_STATE.md.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:01:26 +08:00

80 lines
2.9 KiB
Python

#!/usr/bin/env python3
"""P2: real-state-aware migration target selection.
Pure helpers (no proxy deps) so they're unit-testable. The router calls
`rank_migration_targets` to pick the decode target, using REAL engine state
(from the engine-state store) when available, falling back to shadow counters.
Key fix over the shadow-only Mechanism B: deprioritise targets that are
mid-large-prefill (`max_prefill_remaining` high) — those hold the GIL and
stall the mooncake receiver_loop, which is the ~45% control-plane residual
that layer-wise transfer does NOT fix. Also avoid targets near the KV
capacity wall (`gpu_kv_used_frac` high).
"""
from __future__ import annotations
from dataclasses import dataclass
@dataclass
class TargetCandidate:
    """One possible migration destination as seen by the router.

    Bundles the proxy's shadow counters with, when available, the
    engine-state record for the same target.
    """

    # Identifier/index of the candidate; not interpreted by this module.
    idx: int
    # Cache-hit estimate in tokens; a larger hit means less to transfer.
    cache_hit: int
    # Proxy-side shadow request count; load fallback when real_state is None.
    shadow_num_req: int
    # Shadow token count; last tie-break component of the sort key.
    ongoing_tokens: int
    # Engine-state record, or None when it is stale or missing.
    real_state: dict | None = None
def real_load(c: TargetCandidate) -> float:
    """Return the effective load of candidate *c*.

    When an engine-state record is attached, the load is
    ``num_running + num_waiting`` taken from that record; otherwise the
    proxy's shadow request counter is used.

    A counter that is absent from the record, or present with the value
    ``None``, contributes 0 instead of raising ``TypeError``.
    """
    rs = c.real_state
    if rs is None:
        # No real state available: fall back to the shadow counter.
        return float(c.shadow_num_req)
    # `or 0` covers both a missing key and an explicit None value.
    running = rs.get("num_running") or 0
    waiting = rs.get("num_waiting") or 0
    return float(running + waiting)
def big_prefill_remaining(c: TargetCandidate) -> int:
    """Return the largest in-progress prefill on the candidate.

    The value is read from ``max_prefill_remaining`` in the engine-state
    record and serves as the GIL-stall predictor. It is 0 whenever the
    figure is unknown, so a blind candidate is not over-penalised. Unknown
    means: no real state, the key is missing, or the key is ``None``.
    """
    rs = c.real_state
    if rs is None:
        return 0
    # `or 0` keeps an explicit None from reaching int(), which would raise.
    return int(rs.get("max_prefill_remaining") or 0)
def kv_used_frac(c: TargetCandidate) -> float:
    """Return the candidate's ``gpu_kv_used_frac`` as a float.

    Yields 0.0 when there is no engine-state record, when the key is
    missing, or when the stored value is ``None`` or fails the ``>= 0``
    check (e.g. a negative value).
    """
    state = c.real_state
    if state is None:
        return 0.0
    frac = state.get("gpu_kv_used_frac", -1.0)
    # Only a non-None value that satisfies `>= 0` is reported as-is.
    usable = frac is not None and frac >= 0
    return float(frac) if usable else 0.0
def target_sort_key(
    c: TargetCandidate,
    big_prefill_threshold: int = 16000,
    kv_wall_frac: float = 0.90,
):
    """Build the comparison tuple for a candidate (smaller sorts first).

    Tuple components, in priority order:
      1. 1 if the largest in-progress prefill is at or above
         ``big_prefill_threshold`` (GIL-stall destination), else 0
      2. 1 if KV usage is at or above ``kv_wall_frac``, else 0
      3. ``-cache_hit``, so the most cache-rich candidate wins
      4. real load (shadow request count when no real state)
      5. ``ongoing_tokens`` as the shadow tie-break
    """
    prefill_penalty = int(big_prefill_remaining(c) >= big_prefill_threshold)
    wall_penalty = int(kv_used_frac(c) >= kv_wall_frac)
    return (
        prefill_penalty,
        wall_penalty,
        -c.cache_hit,
        real_load(c),
        c.ongoing_tokens,
    )
def rank_migration_targets(
    candidates: list[TargetCandidate],
    big_prefill_threshold: int = 16000,
    kv_wall_frac: float = 0.90,
) -> TargetCandidate | None:
    """Pick the candidate with the smallest ``target_sort_key``.

    Returns ``None`` when ``candidates`` is empty. Both thresholds are
    passed through to ``target_sort_key`` unchanged.
    """

    def _key(candidate: TargetCandidate):
        return target_sort_key(candidate, big_prefill_threshold, kv_wall_frac)

    return min(candidates, key=_key) if candidates else None