Add parameter-free LPWL routing policy (--policy leastwork)
Least-Prefill-Work-Left: score = pending_prefill_tokens + max(0, input_length - cache_hit_here), pure argmin with a (num_requests, round-robin) tie-break. Zero hyperparameters — derived from the agentic pattern: decode is cheap (I/O ~217x), so outstanding prefill-token-work is the only load worth modelling. Dropping LMetric's `× num_requests` factor (a) un-swallows the cache signal so affinity emerges with no gate, and (b) makes an idle-but-decoding host score `input_length` (its true marginal cost) instead of 0, removing the empty-batch degeneracy. The stick-vs-spill crossover is computed from real token-work, replacing the overload_factor + cache_ratio gate. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -378,6 +378,52 @@ def pick_instance_lmetric(instances: list[InstanceState], token_ids: list[int] |
|
|||||||
return instances[best_idx], best_idx
|
return instances[best_idx], best_idx
|
||||||
|
|
||||||
|
|
||||||
|
# Rotation counter used by pick_instance_leastwork to spread requests across
# instances that tie on both work and num_requests. Module-level so the
# rotation persists across calls.
_leastwork_rr_counter = 0


def pick_instance_leastwork(instances: list[InstanceState], token_ids: list[int] | None,
                            session_id: str | None, input_length: int,
                            affinity: dict[str, int]) -> tuple[InstanceState, int]:
    """Least-Prefill-Work-Left routing. Parameter-free.

        score = pending_prefill_tokens + max(0, input_length - cache_hit_here)

    Rationale (agentic pattern, no tuned constants):
      - Decode is cheap (I/O ~217x), so the only load worth modelling is
        outstanding *prefill* token-work. Hence no decode weight, and no
        `x num_requests` (that factor both swallowed the cache signal and
        zeroed idle-but-decoding hosts; dropping it makes such a host score
        `input_length`, its true marginal cost, not 0).
      - Cache-awareness IS the affinity mechanism: a returning session's
        owner has new_uncached ~ 0, so it wins unless its prefill backlog
        exceeds the cache saving (input tokens). The stick-vs-spill crossover
        is therefore computed from real token-work, not an overload_factor or
        a cache_ratio gate.
      - Session skew degrades gracefully: a heavy session inflates its owner's
        pending_prefill, auto-diverting *other* sessions while the heavy one
        stays put (avoiding a cold re-prefill).

    Tie-break (cold start, all work equal): fewest num_requests, then
    round-robin to avoid degenerate instance-0 pinning. The round-robin
    counter is incremented *before* it is used, so the very first tie picks
    the second tied instance.

    Args:
        instances: Candidate instances. Each must provide
            ``estimate_cache_hit(token_ids)``, ``pending_prefill_tokens`` and
            ``num_requests``.
        token_ids: Prompt tokens, forwarded unchanged to
            ``estimate_cache_hit`` (including ``None``).
        session_id: Not used by this policy; kept so the call signature
            matches the other ``pick_instance_*`` policies.
        input_length: Number of input tokens of the incoming request.
        affinity: Not read or modified by this policy; kept for signature
            parity with the other policies.

    Returns:
        ``(chosen_instance, index_of_chosen_instance_in_instances)``.

    Raises:
        ValueError: If ``instances`` is empty.
    """
    global _leastwork_rr_counter

    # Fail with an actionable message instead of min()'s opaque
    # "empty sequence" error further down.
    if not instances:
        raise ValueError("pick_instance_leastwork: no instances to route to")

    # One (work, num_requests, index) key per instance.
    keys: list[tuple[int, int, int]] = []
    for i, inst in enumerate(instances):
        cache_hit = inst.estimate_cache_hit(token_ids)
        # Clamp: a cache-hit estimate larger than the input must not yield
        # negative work.
        new_uncached = max(0, input_length - cache_hit)
        work = inst.pending_prefill_tokens + new_uncached
        keys.append((work, inst.num_requests, i))

    # Argmin over (work, num_requests); the index is deliberately excluded so
    # that exact ties are detected rather than always resolved to the lowest
    # index.
    best_pair = min(k[:2] for k in keys)
    tied = [k[2] for k in keys if k[:2] == best_pair]

    if len(tied) > 1:
        _leastwork_rr_counter += 1
        best_idx = tied[_leastwork_rr_counter % len(tied)]
    else:
        best_idx = tied[0]
    return instances[best_idx], best_idx
|
||||||
|
|
||||||
|
|
||||||
_unified_fallback_rr_counter = 0
|
_unified_fallback_rr_counter = 0
|
||||||
|
|
||||||
|
|
||||||
@@ -1139,6 +1185,10 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
|
|||||||
chosen, best_idx = pick_instance_load_only(
|
chosen, best_idx = pick_instance_load_only(
|
||||||
combined_instances, token_ids, session_id, input_length,
|
combined_instances, token_ids, session_id, input_length,
|
||||||
session_affinity_combined)
|
session_affinity_combined)
|
||||||
|
elif policy == "leastwork":
|
||||||
|
chosen, best_idx = pick_instance_leastwork(
|
||||||
|
combined_instances, token_ids, session_id, input_length,
|
||||||
|
session_affinity_combined)
|
||||||
elif policy == "sticky":
|
elif policy == "sticky":
|
||||||
chosen, best_idx = pick_instance_sticky(
|
chosen, best_idx = pick_instance_sticky(
|
||||||
combined_instances, token_ids, session_id, input_length,
|
combined_instances, token_ids, session_id, input_length,
|
||||||
@@ -1592,6 +1642,7 @@ def parse_args():
|
|||||||
help="Comma-separated bootstrap ports for combined instances (for offload mode)")
|
help="Comma-separated bootstrap ports for combined instances (for offload mode)")
|
||||||
p.add_argument("--policy", type=str, default="linear",
|
p.add_argument("--policy", type=str, default="linear",
|
||||||
choices=["linear", "lmetric", "load_only", "sticky",
|
choices=["linear", "lmetric", "load_only", "sticky",
|
||||||
|
"leastwork",
|
||||||
"unified", "unified_kv_both",
|
"unified", "unified_kv_both",
|
||||||
"unified_nixl_both", "unified_v2",
|
"unified_nixl_both", "unified_v2",
|
||||||
"unified_v3"],
|
"unified_v3"],
|
||||||
|
|||||||
Reference in New Issue
Block a user