agentic-kvc/microbench/connector_tax/layerwise/migration_target.py

#!/usr/bin/env python3
"""P2: real-state-aware migration target selection.

Pure helpers (no proxy deps) so they're unit-testable. The router calls
`rank_migration_targets` to pick the decode target, using REAL engine state
(from the engine-state store) when available, falling back to shadow counters.

Key fix over the shadow-only Mechanism B: deprioritise targets that are
mid-large-prefill (`max_prefill_remaining` high) — those hold the GIL and
stall the mooncake receiver_loop, which is the ~45% control-plane residual
that layer-wise transfer does NOT fix. Also avoid targets near the KV
capacity wall (`gpu_kv_used_frac` high).
"""
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class TargetCandidate:
    """One possible decode target considered for a migration.

    Carries the proxy-side (shadow) bookkeeping for the target plus, when
    one is available, the engine-state record reported for that target.
    The helpers in this module read the record with ``dict.get`` using the
    keys ``num_running``, ``num_waiting``, ``max_prefill_remaining`` and
    ``gpu_kv_used_frac``.
    """

    # Identifier of the candidate; presumably its index in the router's
    # engine list -- confirm against the caller.
    idx: int
    # Estimated transfer saved by cache reuse on this target (the original
    # note gives the unit as tokens). Larger is better: the sort key
    # negates this value.
    cache_hit: int                 # estimated transfer bytes saved (tokens)
    # Request count tracked by the proxy; used as the load only when
    # ``real_state`` is None.
    shadow_num_req: int            # proxy shadow counter (fallback)
    # Proxy-tracked token count; the last tie-break in the sort key.
    ongoing_tokens: int            # shadow tertiary
    # Engine-state record for this target, or None when the record is
    # stale or missing.
    real_state: dict | None = None # engine-state record, or None if stale/missing


def real_load(c: TargetCandidate) -> float:
    """Return the effective load of a candidate (lower means less busy).

    When an engine-state record is attached, the load is
    ``num_running + num_waiting`` read from that record; otherwise the
    proxy's shadow request counter is used.

    Args:
        c: Candidate to score.

    Returns:
        The load as a float.
    """
    rs = c.real_state
    if rs is None:
        # No real state: fall back to the shadow counter.
        return float(c.shadow_num_req)
    # A counter that is absent *or* explicitly None counts as 0, so a
    # partially filled record cannot raise TypeError on the addition
    # (same tolerance kv_used_frac applies to its own field).
    running = rs.get("num_running") or 0
    waiting = rs.get("num_waiting") or 0
    return float(running + waiting)


def big_prefill_remaining(c: TargetCandidate) -> int:
    """Return the largest in-progress prefill on the candidate.

    The value is read from ``max_prefill_remaining`` in the engine-state
    record and serves as the GIL-stall predictor for the target.

    Args:
        c: Candidate to inspect.

    Returns:
        The remaining prefill size as an int, or 0 when it is unknown
        (no engine-state record, key absent, or value None) so that a
        target is not over-penalised blind.
    """
    rs = c.real_state
    if rs is None:
        return 0
    # `or 0` also covers a key that is present with a None value, which
    # would otherwise make int() raise TypeError.
    return int(rs.get("max_prefill_remaining") or 0)


def kv_used_frac(c: TargetCandidate) -> float:
    """Return the candidate's GPU KV-cache usage fraction.

    Reads ``gpu_kv_used_frac`` from the engine-state record.

    Args:
        c: Candidate to inspect.

    Returns:
        The reported fraction as a float, or 0.0 when there is no
        engine-state record or the reported value is missing, None, or
        not ``>= 0``.
    """
    state = c.real_state
    if state is None:
        return 0.0
    frac = state.get("gpu_kv_used_frac", -1.0)
    # Written as `not frac >= 0` (rather than `frac < 0`) so that NaN,
    # for which every comparison is False, is rejected as well.
    if frac is None or not frac >= 0:
        return 0.0
    return float(frac)


def target_sort_key(
    c: TargetCandidate,
    big_prefill_threshold: int = 16000,
    kv_wall_frac: float = 0.90,
):
    """Build the comparison tuple for a candidate (smaller sorts first).

    The tuple is compared element by element, so earlier entries dominate:
      1. 1 if the largest in-progress prefill is at or above
         `big_prefill_threshold` (the GIL-stall case), else 0
      2. 1 if KV usage is at or above `kv_wall_frac`, else 0
      3. `-cache_hit`, so the most cache-rich target wins
      4. real load (shadow request count when no real state exists)
      5. `ongoing_tokens` as the final shadow tie-break

    Args:
        c: Candidate to score.
        big_prefill_threshold: Remaining-prefill size from which a target
            counts as mid-large-prefill.
        kv_wall_frac: KV usage fraction from which a target counts as
            near the capacity wall.

    Returns:
        A 5-tuple suitable as a `key=` for `min` / `sorted`.
    """
    in_big_prefill = big_prefill_remaining(c) >= big_prefill_threshold
    at_kv_wall = kv_used_frac(c) >= kv_wall_frac
    return (
        int(in_big_prefill),
        int(at_kv_wall),
        -c.cache_hit,
        real_load(c),
        c.ongoing_tokens,
    )


def rank_migration_targets(
    candidates: list[TargetCandidate],
    big_prefill_threshold: int = 16000,
    kv_wall_frac: float = 0.90,
) -> TargetCandidate | None:
    """Pick the single best migration target.

    Despite the name, no ranked list is produced: only the candidate with
    the smallest `target_sort_key` is returned. On a tie, `min` keeps the
    candidate that appears first in `candidates`.

    Args:
        candidates: Targets to choose from.
        big_prefill_threshold: Forwarded to `target_sort_key`.
        kv_wall_frac: Forwarded to `target_sort_key`.

    Returns:
        The best candidate, or None when `candidates` is empty.
    """

    def _score(candidate):
        return target_sort_key(candidate, big_prefill_threshold, kv_wall_frac)

    return min(candidates, key=_score) if candidates else None