Fix cost model: accurate push_cost + aligned hard gate

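1. push_cost now models both C (the best-cache-hit instance) and D (the
   instance being costed): push_cost = max(c_cost, d_cost), where c_cost
   is C's request queue + pending prefill + the pushed prefill, and
   d_cost is D's queue + RDMA overhead. The old formula only had D's
   contention + the pushed prefill + RDMA overhead.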
2. Hard gate uses num_requests instead of ongoing_tokens, aligning
   with the contention-based cost model.
3. Fix migration_discount: the turn discount is now min(cap, 5) decode
   iterations instead of min(cap, 3).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-25 01:01:03 +08:00
parent 1d2148cf65
commit bf4469a150

View File

@@ -537,22 +537,26 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
if (offload_enabled and best_cache_hit > 0 and _push_allowed(best_cache_hit)
and i != best_cache_idx and local_hit < best_cache_hit):
push_new = max(0, input_length - best_cache_hit)
target_contention = inst.num_requests * SETTINGS.decode_iteration_s
push_cost = target_contention + push_new / throughput + SETTINGS.rdma_overhead_s
c_inst = combined_instances[best_cache_idx]
c_cost = (c_inst.num_requests * SETTINGS.decode_iteration_s
+ c_inst.pending_prefill_tokens / throughput
+ push_new / throughput)
d_cost = contention + prefill_queue + SETTINGS.rdma_overhead_s
push_cost = max(c_cost, d_cost)
if session_id and session_id in session_affinity_combined:
turn_discount = min(SETTINGS.migration_discount_cap, 3) * SETTINGS.decode_iteration_s
turn_discount = min(SETTINGS.migration_discount_cap, 5) * SETTINGS.decode_iteration_s
push_cost -= turn_discount
if push_cost < local_cost:
return push_cost, True
return local_cost, False
# Session affinity: prefer the last-used instance if its cost is reasonable
avg_load = max(sum(i.ongoing_tokens for i in combined_instances) / len(combined_instances), 1.0)
avg_reqs = max(sum(i.num_requests for i in combined_instances) / len(combined_instances), 1.0)
affinity_idx = session_affinity_combined.get(session_id) if session_id else None
if affinity_idx is not None and affinity_idx < len(combined_instances):
affinity_inst = combined_instances[affinity_idx]
# Hard gate: break affinity if instance is overloaded regardless of cache
if affinity_inst.ongoing_tokens <= avg_load * SETTINGS.overload_factor:
if affinity_inst.num_requests <= avg_reqs * SETTINGS.overload_factor:
affinity_cost, affinity_push = _instance_cost(affinity_idx)
all_costs = [_instance_cost(i) for i in range(len(combined_instances))]
global_best_cost = min(c for c, _ in all_costs)