vLLM scheduler publishes real state (running/waiting, KV free, and the max-in-progress-prefill signal /metrics lacks) to a tmpfs/redis store ~20Hz; router reads it and avoids GIL-stall (mid-large-prefill) + KV-capacity-wall targets, using real load over 30s-stale shadow counters. Components: engine_state.py (canonical+reader), instrument_engine_state.py (scheduler patch, file/redis writer), migration_target.py (scorer), proxy wiring (--engine-state-uri, off=unchanged). All unit-tested without GPU; not yet run live. See P2_ENGINE_STATE.md. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
80 lines
2.9 KiB
Python
80 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
"""P2: real-state-aware migration target selection.
|
|
|
|
Pure helpers (no proxy deps) so they're unit-testable. The router calls
|
|
`rank_migration_targets` to pick the decode target, using REAL engine state
|
|
(from the engine-state store) when available, falling back to shadow counters.
|
|
|
|
Key fix over the shadow-only Mechanism B: deprioritise targets that are
|
|
mid-large-prefill (`max_prefill_remaining` high) — those hold the GIL and
|
|
stall the mooncake receiver_loop, which is the ~45% control-plane residual
|
|
that layer-wise transfer does NOT fix. Also avoid targets near the KV
|
|
capacity wall (`gpu_kv_used_frac` high).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass
class TargetCandidate:
    """One possible migration target together with the signals used to score it.

    Carries the proxy-side shadow counters plus, when available, the
    engine-state record for the same target. The scoring helpers in this
    module read these fields; nothing here mutates them.
    """

    # Identifier of the candidate (presumably its index in the router's
    # instance list -- confirm against the caller).
    idx: int
    # Cache-hit estimate, in tokens per the original note; a larger value
    # means less data has to be transferred to this target.
    cache_hit: int
    # Proxy shadow request counter; used as the load when no real state exists.
    shadow_num_req: int
    # Shadow token counter; only used as the last tie-break when sorting.
    ongoing_tokens: int
    # Engine-state record for this target, or None when it is stale/missing.
    real_state: dict | None = None
|
|
|
|
|
|
def real_load(c: TargetCandidate) -> float:
    """Return the candidate's effective request load (lower is better).

    When an engine-state record is attached, the load is the sum of its
    ``num_running`` and ``num_waiting`` entries. Without a record, the proxy's
    shadow request counter is used instead.

    Args:
        c: Candidate to evaluate.

    Returns:
        The load as a float.
    """
    rs = c.real_state
    if rs is None:
        # No real state: fall back to the shadow counter.
        return float(c.shadow_num_req)
    # A counter may be absent or explicitly null in the record; count it as 0
    # rather than raising TypeError on ``None + int``.
    running = rs.get("num_running") or 0
    waiting = rs.get("num_waiting") or 0
    return float(running + waiting)
|
|
|
|
|
|
def big_prefill_remaining(c: TargetCandidate) -> int:
    """Return the largest in-progress prefill on the candidate.

    This is the GIL-stall predictor used when ranking targets. The value is
    read from the ``max_prefill_remaining`` entry of the engine-state record.

    Args:
        c: Candidate to evaluate.

    Returns:
        The recorded value as an int, or 0 when it is unknown (no record, key
        missing, or value null) so that blind candidates are not over-penalised.
    """
    rs = c.real_state
    if rs is None:
        return 0
    remaining = rs.get("max_prefill_remaining")
    # A null entry means "unknown"; int(None) would raise TypeError.
    return int(remaining) if remaining is not None else 0
|
|
|
|
|
|
def kv_used_frac(c: TargetCandidate) -> float:
    """Return the candidate's reported KV-cache usage fraction.

    The value comes from the ``gpu_kv_used_frac`` entry of the engine-state
    record. Anything that does not yield a non-negative number (no record,
    key missing, null, or a negative value) is reported as 0.0.
    """
    state = c.real_state
    if state is None:
        return 0.0

    # -1.0 stands in for "not reported" when the key is absent.
    reported = state.get("gpu_kv_used_frac", -1.0)
    if reported is None:
        return 0.0
    return float(reported) if reported >= 0 else 0.0
|
|
|
|
|
|
def target_sort_key(
    c: TargetCandidate,
    big_prefill_threshold: int = 16000,
    kv_wall_frac: float = 0.90,
):
    """Build the comparison tuple for a candidate; smaller tuples rank better.

    The tuple fields, in priority order:
      1. 1 if the largest in-progress prefill is at or above
         ``big_prefill_threshold`` (the GIL-stall destination), else 0
      2. 1 if KV usage is at or above ``kv_wall_frac``, else 0
      3. ``-cache_hit``, so the most cache-rich candidate sorts first
      4. real load (falls back to the shadow counter)
      5. ``ongoing_tokens`` (shadow tertiary tie-break)
    """
    # int(bool) yields the same 0/1 flags as an explicit conditional.
    prefill_penalty = int(big_prefill_remaining(c) >= big_prefill_threshold)
    kv_wall_penalty = int(kv_used_frac(c) >= kv_wall_frac)
    return (
        prefill_penalty,
        kv_wall_penalty,
        -c.cache_hit,
        real_load(c),
        c.ongoing_tokens,
    )
|
|
|
|
|
|
def rank_migration_targets(
    candidates: list[TargetCandidate],
    big_prefill_threshold: int = 16000,
    kv_wall_frac: float = 0.90,
) -> TargetCandidate | None:
    """Pick the best-scoring candidate.

    Candidates are compared by ``target_sort_key`` using the given thresholds,
    and the one with the smallest key wins.

    Returns:
        The winning candidate, or None when ``candidates`` is empty.
    """

    def _score(candidate):
        return target_sort_key(candidate, big_prefill_threshold, kv_wall_frac)

    return min(candidates, key=_score) if candidates else None
|