Files
agentic-kvc/tests/test_proxy_pick.py
Gahow Wang ac6534c3ff Cleanup: retire dead PUSH path + extract hybrid picker
- Delete unreachable best_needs_push block in _handle_combined and the
  four orphaned helpers (_handle_cached_prefill_offload,
  _handle_direct_read_offload, _query_bootstrap_hit,
  _get_bootstrap_client). Their only caller was the retired PUSH gate;
  see REPORT §3.9 errata for the rejected experiments (cc6e562, 4c583f2).

- Extract pick_instance_unified_hybrid as a pure function returning
  (chosen, idx, decision_dict). The decision dict carries the review #7
  breakdown fields (decision, affinity_idx/chosen_idx, cache_hit/ratio,
  avg_num_requests, fallback_score, tie_break_used).

- Add LMetric-fallback tie-breaker (primary score, then new_uncached,
  num_requests, round-robin) so new sessions don't all pin to inst 0
  when BS=0 across the board.

- Drop the lmetric-policy affinity write so --policy lmetric stays
  affinity-free per review #3.

- Mark --max-offload-inflight / --offload-mode / --cache-gate-ratio /
  --decode-iteration-s as [DEPRECATED] in --help; flags remain accepted
  so scripts/bench.sh and legacy launchers don't break.

- Revert uncommitted overload_factor 2.0->1.5 default; H7 sweep already
  rejected this knob (within noise). Future sweeps should go via CLI.

Tests: add 6 hybrid-policy tests in tests/test_proxy_pick.py covering
affinity-hit, overload break, low-cache fallback, tie-break rotation,
lmetric purity, and breakdown field shape. 19/19 pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 10:46:57 +08:00

316 lines
12 KiB
Python

"""Minimal coverage for scripts/cache_aware_proxy pick_instance + cache LRU (S1)."""
from __future__ import annotations
import importlib.util
import sys
import types
from pathlib import Path
import pytest
PROXY_PATH = Path(__file__).resolve().parent.parent / "scripts" / "cache_aware_proxy.py"
def _install_stub_modules() -> None:
"""Provide minimal stand-ins for fastapi/uvicorn/httpx so the proxy
module imports cleanly without the full server deps."""
if "uvicorn" not in sys.modules:
sys.modules["uvicorn"] = types.ModuleType("uvicorn")
if "fastapi" not in sys.modules:
fastapi_mod = types.ModuleType("fastapi")
class _FastAPI:
def __init__(self, *a, **kw):
self.state = types.SimpleNamespace()
def post(self, *a, **kw):
def deco(fn): return fn
return deco
def get(self, *a, **kw):
def deco(fn): return fn
return deco
class _HTTPException(Exception):
def __init__(self, status_code=500, detail=""):
self.status_code = status_code
self.detail = detail
class _Request: # not actually instantiated by the routing tests
pass
fastapi_mod.FastAPI = _FastAPI
fastapi_mod.HTTPException = _HTTPException
fastapi_mod.Request = _Request
sys.modules["fastapi"] = fastapi_mod
responses_mod = types.ModuleType("fastapi.responses")
class _StreamingResponse:
def __init__(self, *a, **kw): pass
responses_mod.StreamingResponse = _StreamingResponse
sys.modules["fastapi.responses"] = responses_mod
if "httpx" not in sys.modules:
httpx_mod = types.ModuleType("httpx")
class _AsyncClient:
def __init__(self, *a, **kw): pass
async def aclose(self): pass
class _Limits:
def __init__(self, *a, **kw): pass
httpx_mod.AsyncClient = _AsyncClient
httpx_mod.Limits = _Limits
sys.modules["httpx"] = httpx_mod
@pytest.fixture(scope="module")
def proxy():
_install_stub_modules()
spec = importlib.util.spec_from_file_location("cache_aware_proxy", PROXY_PATH)
if spec is None or spec.loader is None:
pytest.skip(f"cannot load proxy module at {PROXY_PATH}")
mod = importlib.util.module_from_spec(spec)
sys.modules["cache_aware_proxy"] = mod
try:
spec.loader.exec_module(mod)
except ModuleNotFoundError as exc:
pytest.skip(f"proxy dependency missing: {exc}")
return mod
def _make_inst(proxy, url: str, ongoing_tokens: int = 0,
active_p_offloads: int = 0):
inst = proxy.InstanceState(url)
inst.ongoing_tokens = ongoing_tokens
inst.active_p_offloads = active_p_offloads
return inst
def test_record_prefix_evicts_oldest_block(proxy):
"""LRU bound on cached_blocks must evict the oldest entry once full."""
inst = proxy.InstanceState("http://x")
saved = proxy.SETTINGS.cache_capacity_blocks
proxy.SETTINGS.cache_capacity_blocks = 2
try:
block_size = proxy.BLOCK_SIZE
# Three distinct one-block prefixes; first must be evicted.
prefix_a = [1] * block_size
prefix_b = [2] * block_size
prefix_c = [3] * block_size
inst.record_prefix(prefix_a)
inst.record_prefix(prefix_b)
inst.record_prefix(prefix_c)
assert len(inst.cached_blocks) == 2
# A should have been evicted.
assert inst.estimate_cache_hit(prefix_a) == 0
assert inst.estimate_cache_hit(prefix_b) == block_size
assert inst.estimate_cache_hit(prefix_c) == block_size
finally:
proxy.SETTINGS.cache_capacity_blocks = saved
def test_estimate_cache_hit_touches_lru(proxy):
"""A cache hit must move the block to the MRU position."""
inst = proxy.InstanceState("http://x")
saved = proxy.SETTINGS.cache_capacity_blocks
proxy.SETTINGS.cache_capacity_blocks = 2
try:
block_size = proxy.BLOCK_SIZE
a = [1] * block_size
b = [2] * block_size
c = [3] * block_size
inst.record_prefix(a)
inst.record_prefix(b)
# Touch A so it becomes MRU; B is now LRU.
assert inst.estimate_cache_hit(a) == block_size
# Insert C: B should be evicted, A should remain.
inst.record_prefix(c)
assert inst.estimate_cache_hit(a) == block_size
assert inst.estimate_cache_hit(b) == 0
finally:
proxy.SETTINGS.cache_capacity_blocks = saved
def test_pick_instance_session_affinity_sticks(proxy):
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
affinity = {"sess1": 1}
chosen, idx = proxy.pick_instance(insts, None, "sess1", 1000, affinity)
assert idx == 1 and chosen is insts[1]
def test_pick_instance_session_affinity_breaks_on_overload(proxy):
"""When the pinned instance is heavily overloaded, fallback to load-aware pick."""
insts = [
_make_inst(proxy, "http://a", ongoing_tokens=100),
_make_inst(proxy, "http://b", ongoing_tokens=1_000_000),
_make_inst(proxy, "http://c", ongoing_tokens=100),
]
affinity = {"sess1": 1}
chosen, idx = proxy.pick_instance(insts, None, "sess1", 1000, affinity)
# avg ~333k; B at 1M is ~3x avg, well above OVERLOAD_FACTOR=2.0 -> fallback.
assert idx != 1
assert chosen is not insts[1]
def test_pick_instance_p_offload_penalty_steers_away(proxy):
"""Instances actively running offloaded HEAVY prefills get penalized."""
insts = [
_make_inst(proxy, "http://a", ongoing_tokens=0, active_p_offloads=2),
_make_inst(proxy, "http://b", ongoing_tokens=1000),
]
chosen, idx = proxy.pick_instance(insts, None, None, 5000, {})
# B's 1000-token load is much smaller than A's 2 * HEAVY_THRESHOLD penalty.
assert idx == 1 and chosen is insts[1]
def test_pick_instance_lmetric_picks_lowest_score(proxy):
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
insts[0].pending_prefill_tokens = 0
insts[0].num_requests = 0
insts[1].pending_prefill_tokens = 5000
insts[1].num_requests = 4
chosen, idx = proxy.pick_instance_lmetric(insts, None, None, 1000, {})
# Empty instance has score = 1000 * 0 = 0; busy one has (5000+1000)*4.
assert idx == 0 and chosen is insts[0]
def test_pick_instance_lmetric_ignores_session_affinity(proxy):
"""Review #3: pure --policy lmetric must remain affinity-free."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
# Make inst[1] look much busier than inst[0]; LMetric must still pick 0
# even though affinity points at 1.
insts[0].pending_prefill_tokens = 0
insts[0].num_requests = 0
insts[1].pending_prefill_tokens = 5000
insts[1].num_requests = 4
affinity = {"sess1": 1}
chosen, idx = proxy.pick_instance_lmetric(insts, None, "sess1", 1000, affinity)
assert idx == 0
# Picker must not mutate the affinity dict either.
assert affinity == {"sess1": 1}
def _record_n_blocks(proxy, inst, n: int) -> list[int]:
"""Record n distinct one-block prefixes on inst; return token_ids covering them."""
block_size = proxy.BLOCK_SIZE
tokens: list[int] = []
for b in range(n):
tokens.extend([1000 + b] * block_size)
inst.record_prefix(tokens)
return tokens
def test_hybrid_high_cache_session_sticks_to_affinity(proxy):
"""Hybrid: affinity instance with cache_ratio > 0.5 and no overload → stick."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
tokens = _record_n_blocks(proxy, insts[1], 2) # 2 blocks cached on inst[1]
affinity = {"sess1": 1}
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, tokens, "sess1", len(tokens), affinity)
assert idx == 1 and chosen is insts[1]
assert decision["decision"] == "affinity"
assert decision["affinity_idx"] == 1
assert decision["chosen_idx"] == 1
assert decision["affinity_cache_ratio"] > 0.5
assert decision["tie_break_used"] is False
def test_hybrid_high_cache_breaks_on_overload(proxy):
"""Hybrid: affinity num_requests > avg * overload_factor → fall back to LMetric,
and with realistic new-token tail the LMetric fallback steers off the hot instance."""
insts = [
_make_inst(proxy, "http://a"),
_make_inst(proxy, "http://b"),
_make_inst(proxy, "http://c"),
]
cached = _record_n_blocks(proxy, insts[1], 2)
# Append one more uncached block so LMetric sees a real prefill cost on the
# cached instance too (BS multiplier becomes visible). Without this, the
# cached instance scores 0 * BS = 0 regardless of how loaded it is.
tokens = cached + [999_999] * proxy.BLOCK_SIZE
insts[1].num_requests = 300 # avg = 100; 300 > 100 * 2.0 ✓ breaks the gate
affinity = {"sess1": 1}
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, tokens, "sess1", len(tokens), affinity)
assert decision["decision"] == "lmetric_fallback"
assert decision["affinity_idx"] == 1
assert idx != 1, "affinity instance is overloaded; fallback should steer away"
def test_hybrid_low_cache_falls_back(proxy):
"""Hybrid: cache_ratio <= 0.5 on affinity → fall back to LMetric."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
tokens = [1] * (proxy.BLOCK_SIZE * 2) # 1024 tokens, nothing cached anywhere
affinity = {"sess1": 1}
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, tokens, "sess1", len(tokens), affinity)
assert decision["decision"] == "lmetric_fallback"
assert decision["affinity_cache_ratio"] == 0.0
def test_hybrid_new_session_tie_break_does_not_always_pick_index_0(proxy):
"""Review #4: when all instances tie (e.g. BS=0), tie-break must rotate."""
insts = [_make_inst(proxy, "http://a") for _ in range(3)]
seen = set()
for _ in range(12):
# No session_id, all empty → score = 0 for everyone → ties → rotate.
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, None, None, 100, {})
seen.add(idx)
assert decision["decision"] == "lmetric_fallback"
assert decision["tie_break_used"] is True
assert seen == {0, 1, 2}, f"tie-breaker did not rotate; only saw {seen}"
def test_hybrid_decision_fields_populated(proxy):
"""Review #7: decision dict must carry the breakdown fields."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
_, _, decision = proxy.pick_instance_unified_hybrid(
insts, None, None, 100, {})
expected_keys = {
"decision", "affinity_idx", "chosen_idx",
"affinity_cache_hit", "affinity_cache_ratio", "affinity_num_requests",
"avg_num_requests", "fallback_score", "tie_break_used",
}
assert expected_keys.issubset(decision.keys())
def test_settings_has_runtime_knobs(proxy):
"""D5/B4/M3: Settings dataclass exposes the previously-hardcoded knobs."""
s = proxy.SETTINGS
for field in (
"heavy_threshold",
"overload_factor",
"max_offload_inflight",
"cache_gate_ratio",
"prefill_throughput",
"rdma_overhead_s",
"cache_capacity_blocks",
):
assert hasattr(s, field), f"SETTINGS missing {field}"
# Runtime mutability matters for tests + __main__ override.
saved = s.cache_gate_ratio
s.cache_gate_ratio = 0.55
assert proxy.SETTINGS.cache_gate_ratio == 0.55
s.cache_gate_ratio = saved
def test_p_offload_penalty_uses_settings_heavy_threshold(proxy):
"""M2: tweaking SETTINGS.heavy_threshold changes the P-offload penalty."""
inst = proxy.InstanceState("http://x")
inst.active_p_offloads = 3
saved = proxy.SETTINGS.heavy_threshold
try:
proxy.SETTINGS.heavy_threshold = 10000
assert proxy._p_offload_penalty(inst) == 30000
proxy.SETTINGS.heavy_threshold = 50000
assert proxy._p_offload_penalty(inst) == 150000
finally:
proxy.SETTINGS.heavy_threshold = saved