- Delete unreachable best_needs_push block in _handle_combined and the four orphaned helpers (_handle_cached_prefill_offload, _handle_direct_read_offload, _query_bootstrap_hit, _get_bootstrap_client). Their only caller was the retired PUSH gate; see REPORT §3.9 errata for the rejected experiments (cc6e562, 4c583f2).
- Extract pick_instance_unified_hybrid as a pure function returning (chosen, idx, decision_dict). The decision dict carries the review #7 breakdown fields (decision, affinity_idx/chosen_idx, cache_hit/ratio, avg_num_requests, fallback_score, tie_break_used).
- Add LMetric-fallback tie-breaker (primary score, then new_uncached, num_requests, round-robin) so new sessions don't all pin to inst 0 when BS=0 across the board.
- Drop the lmetric-policy affinity write so --policy lmetric stays affinity-free per review #3.
- Mark --max-offload-inflight / --offload-mode / --cache-gate-ratio / --decode-iteration-s as [DEPRECATED] in --help; flags remain accepted so scripts/bench.sh and legacy launchers don't break.
- Revert uncommitted overload_factor 2.0->1.5 default; H7 sweep already rejected this knob (within noise). Future sweeps should go via CLI.

Tests: add 6 hybrid-policy tests in tests/test_proxy_pick.py covering affinity-hit, overload break, low-cache fallback, tie-break rotation, lmetric purity, and breakdown field shape. 19/19 pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
316 lines
12 KiB
Python
316 lines
12 KiB
Python
"""Minimal coverage for scripts/cache_aware_proxy pick_instance + cache LRU (S1)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import sys
|
|
import types
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Absolute path of the module under test: starting from the parent of this
# file's directory, then scripts/cache_aware_proxy.py. The `proxy` fixture
# loads the module directly from this location.
PROXY_PATH = Path(__file__).resolve().parent.parent / "scripts" / "cache_aware_proxy.py"
|
|
|
|
|
|
def _install_stub_modules() -> None:
    """Register minimal stand-ins for uvicorn, fastapi and httpx.

    Each stub is installed into ``sys.modules`` only when no module of that
    name is registered yet, so the proxy module imports cleanly without the
    full server dependencies. The stubs expose just the names assigned
    below; they are not functional replacements for the real packages.
    """
    if "uvicorn" not in sys.modules:
        sys.modules["uvicorn"] = types.ModuleType("uvicorn")

    if "fastapi" not in sys.modules:
        fastapi_mod = types.ModuleType("fastapi")

        class _FastAPI:
            """App stand-in: carries ``state`` and pass-through route decorators."""

            def __init__(self, *a, **kw):
                self.state = types.SimpleNamespace()

            def post(self, *a, **kw):
                # Route registration is a no-op: hand the handler back unchanged.
                def deco(fn):
                    return fn
                return deco

            def get(self, *a, **kw):
                def deco(fn):
                    return fn
                return deco

        class _HTTPException(Exception):
            """Exception stand-in carrying ``status_code`` and ``detail``."""

            def __init__(self, status_code=500, detail=""):
                self.status_code = status_code
                self.detail = detail

        class _Request:  # not actually instantiated by the routing tests
            pass

        fastapi_mod.FastAPI = _FastAPI
        fastapi_mod.HTTPException = _HTTPException
        fastapi_mod.Request = _Request
        sys.modules["fastapi"] = fastapi_mod

        responses_mod = types.ModuleType("fastapi.responses")

        class _StreamingResponse:
            def __init__(self, *a, **kw):
                pass

        responses_mod.StreamingResponse = _StreamingResponse
        sys.modules["fastapi.responses"] = responses_mod
        # A submodule served straight from the sys.modules cache is not bound
        # on its parent by the import system, so bind it here. Without this,
        # `import fastapi.responses` followed by `fastapi.responses.X` raises
        # AttributeError against the stub.
        fastapi_mod.responses = responses_mod

    if "httpx" not in sys.modules:
        httpx_mod = types.ModuleType("httpx")

        class _AsyncClient:
            def __init__(self, *a, **kw):
                pass

            async def aclose(self):
                pass

        class _Limits:
            def __init__(self, *a, **kw):
                pass

        httpx_mod.AsyncClient = _AsyncClient
        httpx_mod.Limits = _Limits
        sys.modules["httpx"] = httpx_mod
|
|
|
|
|
|
@pytest.fixture(scope="module")
def proxy():
    """Load ``PROXY_PATH`` as the module ``cache_aware_proxy`` for this test module.

    Stub server dependencies are installed first. The test module is skipped
    when no import spec/loader can be built for the path or when executing
    the proxy raises ``ModuleNotFoundError``; any other load error
    propagates. Returns the loaded module object.
    """
    _install_stub_modules()
    spec = importlib.util.spec_from_file_location("cache_aware_proxy", PROXY_PATH)
    if spec is None or spec.loader is None:
        pytest.skip(f"cannot load proxy module at {PROXY_PATH}")
    mod = importlib.util.module_from_spec(spec)
    # Register before executing, as in the importlib "import a source file
    # directly" recipe, so module-level code can find itself in sys.modules.
    sys.modules["cache_aware_proxy"] = mod
    try:
        spec.loader.exec_module(mod)
    except ModuleNotFoundError as exc:
        # Do not leave a half-initialised module behind for later importers.
        sys.modules.pop("cache_aware_proxy", None)
        pytest.skip(f"proxy dependency missing: {exc}")
    except BaseException:
        # Covers every other load failure; the original error still propagates.
        sys.modules.pop("cache_aware_proxy", None)
        raise
    return mod
|
|
|
|
|
|
def _make_inst(proxy, url: str, ongoing_tokens: int = 0,
               active_p_offloads: int = 0):
    """Build a ``proxy.InstanceState`` for *url* with the given load counters.

    ``ongoing_tokens`` and ``active_p_offloads`` are written onto the new
    instance as attributes of the same name; the instance is returned.
    """
    state = proxy.InstanceState(url)
    state.ongoing_tokens, state.active_p_offloads = ongoing_tokens, active_p_offloads
    return state
|
|
|
|
|
|
def test_record_prefix_evicts_oldest_block(proxy):
    """With capacity 2, recording a third one-block prefix drops the first."""
    state = proxy.InstanceState("http://x")
    settings = proxy.SETTINGS
    original_capacity = settings.cache_capacity_blocks
    settings.cache_capacity_blocks = 2
    try:
        size = proxy.BLOCK_SIZE
        # Three distinct single-block prefixes, recorded oldest first.
        oldest, middle, newest = ([fill] * size for fill in (1, 2, 3))
        for prefix in (oldest, middle, newest):
            state.record_prefix(prefix)

        assert len(state.cached_blocks) == 2
        # Only the oldest prefix should be gone.
        assert state.estimate_cache_hit(oldest) == 0
        assert state.estimate_cache_hit(middle) == size
        assert state.estimate_cache_hit(newest) == size
    finally:
        # SETTINGS is shared by every test using the module-scoped fixture.
        settings.cache_capacity_blocks = original_capacity
|
|
|
|
|
|
def test_estimate_cache_hit_touches_lru(proxy):
    """A hit reported by estimate_cache_hit must protect that block from eviction."""
    state = proxy.InstanceState("http://x")
    settings = proxy.SETTINGS
    original_capacity = settings.cache_capacity_blocks
    settings.cache_capacity_blocks = 2
    try:
        size = proxy.BLOCK_SIZE
        first, second, third = [1] * size, [2] * size, [3] * size
        state.record_prefix(first)
        state.record_prefix(second)

        # Hitting `first` makes it most-recently-used, leaving `second` as LRU.
        assert state.estimate_cache_hit(first) == size

        # Recording a third block must therefore evict `second`, not `first`.
        state.record_prefix(third)
        assert state.estimate_cache_hit(first) == size
        assert state.estimate_cache_hit(second) == 0
    finally:
        settings.cache_capacity_blocks = original_capacity
|
|
|
|
|
|
def test_pick_instance_session_affinity_sticks(proxy):
    """A session mapped to index 1 in the affinity dict is routed to instance 1."""
    pool = [_make_inst(proxy, url) for url in ("http://a", "http://b")]
    pinned = {"sess1": 1}
    picked, picked_idx = proxy.pick_instance(pool, None, "sess1", 1000, pinned)
    assert picked_idx == 1
    assert picked is pool[1]
|
|
|
|
|
|
def test_pick_instance_session_affinity_breaks_on_overload(proxy):
    """A heavily overloaded pinned instance must lose the session to another one."""
    loads = (("http://a", 100), ("http://b", 1_000_000), ("http://c", 100))
    pool = [_make_inst(proxy, url, ongoing_tokens=load) for url, load in loads]
    pinned = {"sess1": 1}
    picked, picked_idx = proxy.pick_instance(pool, None, "sess1", 1000, pinned)
    # Mean load is ~333k, so instance B at 1M sits near 3x the mean, well
    # above OVERLOAD_FACTOR=2.0, and the picker should fall back.
    assert picked_idx != 1
    assert picked is not pool[1]
|
|
|
|
|
|
def test_pick_instance_p_offload_penalty_steers_away(proxy):
    """An instance running offloaded HEAVY prefills is penalised in the pick."""
    offloading = _make_inst(proxy, "http://a", ongoing_tokens=0, active_p_offloads=2)
    lightly_loaded = _make_inst(proxy, "http://b", ongoing_tokens=1000)
    pool = [offloading, lightly_loaded]
    picked, picked_idx = proxy.pick_instance(pool, None, None, 5000, {})
    # B's 1000 ongoing tokens are far cheaper than A's 2 * HEAVY_THRESHOLD penalty.
    assert picked_idx == 1
    assert picked is lightly_loaded
|
|
|
|
|
|
def test_pick_instance_lmetric_picks_lowest_score(proxy):
    """The LMetric picker must prefer the idle instance over the busy one."""
    idle = _make_inst(proxy, "http://a")
    busy = _make_inst(proxy, "http://b")
    idle.pending_prefill_tokens, idle.num_requests = 0, 0
    busy.pending_prefill_tokens, busy.num_requests = 5000, 4
    picked, picked_idx = proxy.pick_instance_lmetric(
        [idle, busy], None, None, 1000, {})
    # Idle instance scores 1000 * 0 = 0; the busy one scores (5000 + 1000) * 4.
    assert picked_idx == 0
    assert picked is idle
|
|
|
|
|
|
def test_pick_instance_lmetric_ignores_session_affinity(proxy):
    """Review #3: the pure LMetric picker neither follows nor edits affinity."""
    idle = _make_inst(proxy, "http://a")
    busy = _make_inst(proxy, "http://b")
    # The affinity map points at the busy instance (index 1); LMetric must
    # still route to the idle one at index 0.
    idle.pending_prefill_tokens, idle.num_requests = 0, 0
    busy.pending_prefill_tokens, busy.num_requests = 5000, 4
    pinned = {"sess1": 1}
    _picked, picked_idx = proxy.pick_instance_lmetric(
        [idle, busy], None, "sess1", 1000, pinned)
    assert picked_idx == 0
    # The caller's affinity dict must come back untouched.
    assert pinned == {"sess1": 1}
|
|
|
|
|
|
def _record_n_blocks(proxy, inst, n: int) -> list[int]:
    """Grow a token list one block at a time, recording each prefix on *inst*.

    Block ``b`` (0-based) consists of ``proxy.BLOCK_SIZE`` copies of the
    value ``1000 + b``. After each block is appended, the accumulated list
    is passed to ``inst.record_prefix``. Returns the full ``n``-block list
    (empty when ``n`` is 0).
    """
    size = proxy.BLOCK_SIZE
    accumulated: list[int] = []
    for marker in range(1000, 1000 + n):
        accumulated += [marker] * size
        inst.record_prefix(accumulated)
    return accumulated
|
|
|
|
|
|
def test_hybrid_high_cache_session_sticks_to_affinity(proxy):
    """Hybrid: a well-cached, non-overloaded affinity instance keeps the session."""
    cold = _make_inst(proxy, "http://a")
    warm = _make_inst(proxy, "http://b")
    prompt = _record_n_blocks(proxy, warm, 2)  # both blocks cached on `warm`
    pinned = {"sess1": 1}
    picked, picked_idx, info = proxy.pick_instance_unified_hybrid(
        [cold, warm], prompt, "sess1", len(prompt), pinned)

    assert picked_idx == 1
    assert picked is warm
    for key, expected in (("decision", "affinity"),
                          ("affinity_idx", 1),
                          ("chosen_idx", 1)):
        assert info[key] == expected
    assert info["affinity_cache_ratio"] > 0.5
    assert info["tie_break_used"] is False
|
|
|
|
|
|
def test_hybrid_high_cache_breaks_on_overload(proxy):
    """Hybrid: an overloaded affinity instance triggers the LMetric fallback,
    and the fallback routes the request to a different instance."""
    pool = [_make_inst(proxy, url) for url in ("http://a", "http://b", "http://c")]
    hot = pool[1]
    warm_prefix = _record_n_blocks(proxy, hot, 2)
    # Add one uncached block so the cached instance also has a non-zero
    # prefill cost; otherwise its LMetric score would be 0 * num_requests = 0
    # however loaded it is.
    prompt = warm_prefix + [999_999] * proxy.BLOCK_SIZE
    # Mean num_requests is 300 / 3 = 100, and 300 > 100 * 2.0 trips the
    # overload gate.
    hot.num_requests = 300
    pinned = {"sess1": 1}
    _picked, picked_idx, info = proxy.pick_instance_unified_hybrid(
        pool, prompt, "sess1", len(prompt), pinned)

    assert info["decision"] == "lmetric_fallback"
    assert info["affinity_idx"] == 1
    assert picked_idx != 1, "affinity instance is overloaded; fallback should steer away"
|
|
|
|
|
|
def test_hybrid_low_cache_falls_back(proxy):
    """Hybrid: an affinity instance with nothing cached yields the LMetric fallback."""
    pool = [_make_inst(proxy, url) for url in ("http://a", "http://b")]
    # Two blocks' worth of tokens; no prefix was recorded on any instance.
    prompt = [1] * (proxy.BLOCK_SIZE * 2)
    pinned = {"sess1": 1}
    _picked, _picked_idx, info = proxy.pick_instance_unified_hybrid(
        pool, prompt, "sess1", len(prompt), pinned)

    assert info["decision"] == "lmetric_fallback"
    assert info["affinity_cache_ratio"] == 0.0
|
|
|
|
|
|
def test_hybrid_new_session_tie_break_does_not_always_pick_index_0(proxy):
    """Review #4: fully tied instances must be rotated through, not pinned to 0."""
    pool = [_make_inst(proxy, "http://a") for _ in range(3)]
    picked_indices = set()
    for _ in range(12):
        # No session id and three empty instances: every score is 0, so each
        # call has to be resolved by the tie-breaker.
        _picked, picked_idx, info = proxy.pick_instance_unified_hybrid(
            pool, None, None, 100, {})
        picked_indices.add(picked_idx)
        assert info["decision"] == "lmetric_fallback"
        assert info["tie_break_used"] is True
    seen = picked_indices
    assert seen == {0, 1, 2}, f"tie-breaker did not rotate; only saw {seen}"
|
|
|
|
|
|
def test_hybrid_decision_fields_populated(proxy):
    """Review #7: the decision dict must expose every breakdown field."""
    pool = [_make_inst(proxy, url) for url in ("http://a", "http://b")]
    info = proxy.pick_instance_unified_hybrid(pool, None, None, 100, {})[2]
    required = {
        "decision",
        "affinity_idx",
        "chosen_idx",
        "affinity_cache_hit",
        "affinity_cache_ratio",
        "affinity_num_requests",
        "avg_num_requests",
        "fallback_score",
        "tie_break_used",
    }
    # Extra keys are allowed; only the required ones are checked.
    assert required <= set(info.keys())
|
|
|
|
|
|
def test_settings_has_runtime_knobs(proxy):
    """D5/B4/M3: Settings dataclass exposes the previously-hardcoded knobs.

    Also checks that a knob can be reassigned at runtime through
    ``proxy.SETTINGS``.
    """
    s = proxy.SETTINGS
    for field in (
        "heavy_threshold",
        "overload_factor",
        "max_offload_inflight",
        "cache_gate_ratio",
        "prefill_throughput",
        "rdma_overhead_s",
        "cache_capacity_blocks",
    ):
        assert hasattr(s, field), f"SETTINGS missing {field}"
    # Runtime mutability matters for tests + __main__ override.
    saved = s.cache_gate_ratio
    try:
        s.cache_gate_ratio = 0.55
        assert proxy.SETTINGS.cache_gate_ratio == 0.55
    finally:
        # Restore even when the assert fails: SETTINGS is shared by every
        # test that uses the module-scoped `proxy` fixture.
        s.cache_gate_ratio = saved
|
|
|
|
|
|
def test_p_offload_penalty_uses_settings_heavy_threshold(proxy):
    """M2: the P-offload penalty follows SETTINGS.heavy_threshold at call time."""
    state = proxy.InstanceState("http://x")
    state.active_p_offloads = 3
    original_threshold = proxy.SETTINGS.heavy_threshold
    try:
        # With 3 active offloads the expected penalty is 3 * heavy_threshold.
        for threshold, expected_penalty in ((10000, 30000), (50000, 150000)):
            proxy.SETTINGS.heavy_threshold = threshold
            assert proxy._p_offload_penalty(state) == expected_penalty
    finally:
        proxy.SETTINGS.heavy_threshold = original_threshold
|