Cleanup: retire dead PUSH path + extract hybrid picker

- Delete unreachable best_needs_push block in _handle_combined and the
  four orphaned helpers (_handle_cached_prefill_offload,
  _handle_direct_read_offload, _query_bootstrap_hit,
  _get_bootstrap_client). Their only caller was the retired PUSH gate;
  see REPORT §3.9 errata for the rejected experiments (cc6e562, 4c583f2).

- Extract pick_instance_unified_hybrid as a pure function returning
  (chosen, idx, decision_dict). The decision dict carries the review #7
  breakdown fields (decision, affinity_idx/chosen_idx, cache_hit/ratio,
  avg_num_requests, fallback_score, tie_break_used).

- Add LMetric-fallback tie-breaker (primary score, then new_uncached,
  num_requests, round-robin) so new sessions don't all pin to inst 0
  when BS=0 across the board.

- Drop the lmetric-policy affinity write so --policy lmetric stays
  affinity-free per review #3.

- Mark --max-offload-inflight / --offload-mode / --cache-gate-ratio /
  --decode-iteration-s as [DEPRECATED] in --help; flags remain accepted
  so scripts/bench.sh and legacy launchers don't break.

- Revert uncommitted overload_factor 2.0->1.5 default; H7 sweep already
  rejected this knob (within noise). Future sweeps should go via CLI.

Tests: add 6 hybrid-policy tests in tests/test_proxy_pick.py covering
affinity-hit, overload break, low-cache fallback, tie-break rotation,
lmetric purity, and breakdown field shape. 19/19 pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 10:46:57 +08:00
parent 255c8e6884
commit ac6534c3ff
2 changed files with 220 additions and 332 deletions

View File

@@ -180,6 +180,107 @@ def test_pick_instance_lmetric_picks_lowest_score(proxy):
assert idx == 0 and chosen is insts[0]
def test_pick_instance_lmetric_ignores_session_affinity(proxy):
"""Review #3: pure --policy lmetric must remain affinity-free."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
# Make inst[1] look much busier than inst[0]; LMetric must still pick 0
# even though affinity points at 1.
insts[0].pending_prefill_tokens = 0
insts[0].num_requests = 0
insts[1].pending_prefill_tokens = 5000
insts[1].num_requests = 4
affinity = {"sess1": 1}
chosen, idx = proxy.pick_instance_lmetric(insts, None, "sess1", 1000, affinity)
assert idx == 0
# Picker must not mutate the affinity dict either.
assert affinity == {"sess1": 1}
def _record_n_blocks(proxy, inst, n: int) -> list[int]:
"""Record n distinct one-block prefixes on inst; return token_ids covering them."""
block_size = proxy.BLOCK_SIZE
tokens: list[int] = []
for b in range(n):
tokens.extend([1000 + b] * block_size)
inst.record_prefix(tokens)
return tokens
def test_hybrid_high_cache_session_sticks_to_affinity(proxy):
"""Hybrid: affinity instance with cache_ratio > 0.5 and no overload → stick."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
tokens = _record_n_blocks(proxy, insts[1], 2) # 2 blocks cached on inst[1]
affinity = {"sess1": 1}
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, tokens, "sess1", len(tokens), affinity)
assert idx == 1 and chosen is insts[1]
assert decision["decision"] == "affinity"
assert decision["affinity_idx"] == 1
assert decision["chosen_idx"] == 1
assert decision["affinity_cache_ratio"] > 0.5
assert decision["tie_break_used"] is False
def test_hybrid_high_cache_breaks_on_overload(proxy):
"""Hybrid: affinity num_requests > avg * overload_factor → fall back to LMetric,
and with realistic new-token tail the LMetric fallback steers off the hot instance."""
insts = [
_make_inst(proxy, "http://a"),
_make_inst(proxy, "http://b"),
_make_inst(proxy, "http://c"),
]
cached = _record_n_blocks(proxy, insts[1], 2)
# Append one more uncached block so LMetric sees a real prefill cost on the
# cached instance too (BS multiplier becomes visible). Without this, the
# cached instance scores 0 * BS = 0 regardless of how loaded it is.
tokens = cached + [999_999] * proxy.BLOCK_SIZE
insts[1].num_requests = 300 # avg = 100; 300 > 100 * 2.0 ✓ breaks the gate
affinity = {"sess1": 1}
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, tokens, "sess1", len(tokens), affinity)
assert decision["decision"] == "lmetric_fallback"
assert decision["affinity_idx"] == 1
assert idx != 1, "affinity instance is overloaded; fallback should steer away"
def test_hybrid_low_cache_falls_back(proxy):
"""Hybrid: cache_ratio <= 0.5 on affinity → fall back to LMetric."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
tokens = [1] * (proxy.BLOCK_SIZE * 2) # 1024 tokens, nothing cached anywhere
affinity = {"sess1": 1}
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, tokens, "sess1", len(tokens), affinity)
assert decision["decision"] == "lmetric_fallback"
assert decision["affinity_cache_ratio"] == 0.0
def test_hybrid_new_session_tie_break_does_not_always_pick_index_0(proxy):
"""Review #4: when all instances tie (e.g. BS=0), tie-break must rotate."""
insts = [_make_inst(proxy, "http://a") for _ in range(3)]
seen = set()
for _ in range(12):
# No session_id, all empty → score = 0 for everyone → ties → rotate.
chosen, idx, decision = proxy.pick_instance_unified_hybrid(
insts, None, None, 100, {})
seen.add(idx)
assert decision["decision"] == "lmetric_fallback"
assert decision["tie_break_used"] is True
assert seen == {0, 1, 2}, f"tie-breaker did not rotate; only saw {seen}"
def test_hybrid_decision_fields_populated(proxy):
"""Review #7: decision dict must carry the breakdown fields."""
insts = [_make_inst(proxy, "http://a"), _make_inst(proxy, "http://b")]
_, _, decision = proxy.pick_instance_unified_hybrid(
insts, None, None, 100, {})
expected_keys = {
"decision", "affinity_idx", "chosen_idx",
"affinity_cache_hit", "affinity_cache_ratio", "affinity_num_requests",
"avg_num_requests", "fallback_score", "tie_break_used",
}
assert expected_keys.issubset(decision.keys())
def test_settings_has_runtime_knobs(proxy):
"""D5/B4/M3: Settings dataclass exposes the previously-hardcoded knobs."""
s = proxy.SETTINGS