Cleanup: retire dead PUSH path + extract hybrid picker
- Delete unreachable best_needs_push block in _handle_combined and the four orphaned helpers (_handle_cached_prefill_offload, _handle_direct_read_offload, _query_bootstrap_hit, _get_bootstrap_client). Their only caller was the retired PUSH gate; see REPORT §3.9 errata for the rejected experiments (cc6e562,4c583f2). - Extract pick_instance_unified_hybrid as a pure function returning (chosen, idx, decision_dict). The decision dict carries the review #7 breakdown fields (decision, affinity_idx/chosen_idx, cache_hit/ratio, avg_num_requests, fallback_score, tie_break_used). - Add LMetric-fallback tie-breaker (primary score, then new_uncached, num_requests, round-robin) so new sessions don't all pin to inst 0 when BS=0 across the board. - Drop the lmetric-policy affinity write so --policy lmetric stays affinity-free per review #3. - Mark --max-offload-inflight / --offload-mode / --cache-gate-ratio / --decode-iteration-s as [DEPRECATED] in --help; flags remain accepted so scripts/bench.sh and legacy launchers don't break. - Revert uncommitted overload_factor 2.0->1.5 default; H7 sweep already rejected this knob (within noise). Future sweeps should go via CLI. Tests: add 6 hybrid-policy tests in tests/test_proxy_pick.py covering affinity-hit, overload break, low-cache fallback, tie-break rotation, lmetric purity, and breakdown field shape. 19/19 pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -180,6 +180,107 @@ def test_pick_instance_lmetric_picks_lowest_score(proxy):
|
||||
assert idx == 0 and chosen is insts[0]
|
||||
|
||||
|
||||
def test_pick_instance_lmetric_ignores_session_affinity(proxy):
|
||||
"""Review #3: pure --policy lmetric must remain affinity-free."""
|
||||
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
|
||||
# Make inst[1] look much busier than inst[0]; LMetric must still pick 0
|
||||
# even though affinity points at 1.
|
||||
insts[0].pending_prefill_tokens = 0
|
||||
insts[0].num_requests = 0
|
||||
insts[1].pending_prefill_tokens = 5000
|
||||
insts[1].num_requests = 4
|
||||
affinity = {"sess1": 1}
|
||||
chosen, idx = proxy.pick_instance_lmetric(insts, None, "sess1", 1000, affinity)
|
||||
assert idx == 0
|
||||
# Picker must not mutate the affinity dict either.
|
||||
assert affinity == {"sess1": 1}
|
||||
|
||||
|
||||
def _record_n_blocks(proxy, inst, n: int) -> list[int]:
|
||||
"""Record n distinct one-block prefixes on inst; return token_ids covering them."""
|
||||
block_size = proxy.BLOCK_SIZE
|
||||
tokens: list[int] = []
|
||||
for b in range(n):
|
||||
tokens.extend([1000 + b] * block_size)
|
||||
inst.record_prefix(tokens)
|
||||
return tokens
|
||||
|
||||
|
||||
def test_hybrid_high_cache_session_sticks_to_affinity(proxy):
|
||||
"""Hybrid: affinity instance with cache_ratio > 0.5 and no overload → stick."""
|
||||
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
|
||||
tokens = _record_n_blocks(proxy, insts[1], 2) # 2 blocks cached on inst[1]
|
||||
affinity = {"sess1": 1}
|
||||
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
|
||||
insts, tokens, "sess1", len(tokens), affinity)
|
||||
assert idx == 1 and chosen is insts[1]
|
||||
assert decision["decision"] == "affinity"
|
||||
assert decision["affinity_idx"] == 1
|
||||
assert decision["chosen_idx"] == 1
|
||||
assert decision["affinity_cache_ratio"] > 0.5
|
||||
assert decision["tie_break_used"] is False
|
||||
|
||||
|
||||
def test_hybrid_high_cache_breaks_on_overload(proxy):
|
||||
"""Hybrid: affinity num_requests > avg * overload_factor → fall back to LMetric,
|
||||
and with realistic new-token tail the LMetric fallback steers off the hot instance."""
|
||||
insts = [
|
||||
_make_inst(proxy, "http://a"),
|
||||
_make_inst(proxy, "http://b"),
|
||||
_make_inst(proxy, "http://c"),
|
||||
]
|
||||
cached = _record_n_blocks(proxy, insts[1], 2)
|
||||
# Append one more uncached block so LMetric sees a real prefill cost on the
|
||||
# cached instance too (BS multiplier becomes visible). Without this, the
|
||||
# cached instance scores 0 * BS = 0 regardless of how loaded it is.
|
||||
tokens = cached + [999_999] * proxy.BLOCK_SIZE
|
||||
insts[1].num_requests = 300 # avg = 100; 300 > 100 * 2.0 ✓ breaks the gate
|
||||
affinity = {"sess1": 1}
|
||||
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
|
||||
insts, tokens, "sess1", len(tokens), affinity)
|
||||
assert decision["decision"] == "lmetric_fallback"
|
||||
assert decision["affinity_idx"] == 1
|
||||
assert idx != 1, "affinity instance is overloaded; fallback should steer away"
|
||||
|
||||
|
||||
def test_hybrid_low_cache_falls_back(proxy):
|
||||
"""Hybrid: cache_ratio <= 0.5 on affinity → fall back to LMetric."""
|
||||
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
|
||||
tokens = [1] * (proxy.BLOCK_SIZE * 2) # 1024 tokens, nothing cached anywhere
|
||||
affinity = {"sess1": 1}
|
||||
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
|
||||
insts, tokens, "sess1", len(tokens), affinity)
|
||||
assert decision["decision"] == "lmetric_fallback"
|
||||
assert decision["affinity_cache_ratio"] == 0.0
|
||||
|
||||
|
||||
def test_hybrid_new_session_tie_break_does_not_always_pick_index_0(proxy):
|
||||
"""Review #4: when all instances tie (e.g. BS=0), tie-break must rotate."""
|
||||
insts = [_make_inst(proxy, "http://a") for _ in range(3)]
|
||||
seen = set()
|
||||
for _ in range(12):
|
||||
# No session_id, all empty → score = 0 for everyone → ties → rotate.
|
||||
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
|
||||
insts, None, None, 100, {})
|
||||
seen.add(idx)
|
||||
assert decision["decision"] == "lmetric_fallback"
|
||||
assert decision["tie_break_used"] is True
|
||||
assert seen == {0, 1, 2}, f"tie-breaker did not rotate; only saw {seen}"
|
||||
|
||||
|
||||
def test_hybrid_decision_fields_populated(proxy):
|
||||
"""Review #7: decision dict must carry the breakdown fields."""
|
||||
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
|
||||
_, _, decision = proxy.pick_instance_unified_hybrid(
|
||||
insts, None, None, 100, {})
|
||||
expected_keys = {
|
||||
"decision", "affinity_idx", "chosen_idx",
|
||||
"affinity_cache_hit", "affinity_cache_ratio", "affinity_num_requests",
|
||||
"avg_num_requests", "fallback_score", "tie_break_used",
|
||||
}
|
||||
assert expected_keys.issubset(decision.keys())
|
||||
|
||||
|
||||
def test_settings_has_runtime_knobs(proxy):
|
||||
"""D5/B4/M3: Settings dataclass exposes the previously-hardcoded knobs."""
|
||||
s = proxy.SETTINGS
|
||||
|
||||
Reference in New Issue
Block a user