# Rotating cursor for the round-robin tie-break in pick_instance_leastwork.
# It only advances when two or more instances are tied on (work, num_requests).
_leastwork_rr_counter = 0


def pick_instance_leastwork(instances: list[InstanceState], token_ids: list[int] | None,
                            session_id: str | None, input_length: int,
                            affinity: dict[str, int]) -> tuple[InstanceState, int]:
    """Route to the instance with the Least Prefill Work Left (LPWL).

    Parameter-free policy. Each instance is scored as

        score = pending_prefill_tokens + max(0, input_length - cache_hit_here)

    and the lowest score wins (pure argmin, no tuned constants).

    Rationale (agentic pattern, as stated by the original author):
      - Decode is cheap (I/O ~217x), so the only load worth modelling is
        outstanding *prefill* token-work. Hence there is no decode weight and
        no multiplication by ``num_requests``: that factor both swallowed the
        cache signal and zeroed idle-but-decoding hosts. Without it such a
        host scores ``input_length`` (its true marginal cost) instead of 0.
      - Cache-awareness IS the affinity mechanism: a returning session's owner
        has close to zero uncached tokens, so it wins unless its prefill
        backlog exceeds the cache saving. The stick-vs-spill crossover is
        therefore computed from real token-work rather than an
        overload factor or a cache-ratio gate.
      - Session skew degrades gracefully: a heavy session inflates its owner's
        ``pending_prefill_tokens``, diverting *other* sessions while the heavy
        one stays put (avoiding a cold re-prefill).

    Tie-break: among instances with equal work, the one with the fewest
    ``num_requests`` wins; if several are still tied, a module-level cursor
    rotates through them so that instance 0 is not pinned on cold start.

    Args:
        instances: Candidate instances. Each must expose
            ``estimate_cache_hit(token_ids)``, ``pending_prefill_tokens`` and
            ``num_requests``.
        token_ids: Prompt token ids, forwarded unchanged to
            ``estimate_cache_hit``.
            NOTE(review): ``None`` is forwarded as-is; this assumes
            ``estimate_cache_hit`` accepts ``None`` - confirm.
        session_id: Unused by this policy; accepted so the signature matches
            the other ``pick_instance_*`` policies, which are called with the
            same arguments.
        input_length: Number of input tokens in the request.
        affinity: Unused by this policy (neither read nor written); accepted
            for signature parity.

    Returns:
        ``(instance, index)`` where ``index`` is the winner's position in
        ``instances``.

    Raises:
        ValueError: If ``instances`` is empty.
    """
    global _leastwork_rr_counter
    if not instances:
        # Without this guard min() below fails with an opaque
        # "min() arg is an empty sequence" message.
        raise ValueError("pick_instance_leastwork: no instances to route to")

    # One (work, num_requests, index) key per instance. The index is carried
    # only to map the winner back; it never takes part in the comparison.
    keys: list[tuple[int, int, int]] = [
        (
            inst.pending_prefill_tokens
            + max(0, input_length - inst.estimate_cache_hit(token_ids)),
            inst.num_requests,
            i,
        )
        for i, inst in enumerate(instances)
    ]

    best_pair = min(key[:2] for key in keys)
    tied = [key for key in keys if key[:2] == best_pair]
    if len(tied) > 1:
        # Advance the cursor before use, and only on a real tie, so
        # consecutive tied decisions step through the tied instances.
        _leastwork_rr_counter += 1
        winner = tied[_leastwork_rr_counter % len(tied)]
    else:
        winner = tied[0]

    best_idx = winner[2]
    return instances[best_idx], best_idx
help="Comma-separated bootstrap ports for combined instances (for offload mode)") p.add_argument("--policy", type=str, default="linear", choices=["linear", "lmetric", "load_only", "sticky", + "leastwork", "unified", "unified_kv_both", "unified_nixl_both", "unified_v2", "unified_v3"],