Revert relaxed gate + push_cost fix: 134 offloads destroyed performance

PD-sep offload overhead (C queue + prefill + KV transfer + D schedule) far exceeds any load-balancing benefit. With the relaxed gate, the cost model triggered 134 offloads, and E2E p90 went from 37s to 82s. The proven winning configuration is Unified routing in baseline mode (no Mooncake connector), which beats LMetric on E2E mean/p50/p90 purely through better routing (contention-aware + session affinity).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-25 03:38:59 +08:00
parent bf4469a150
commit 4c583f2f1c
1 changed file with 21 additions and 10 deletions
--- a/scripts/cache_aware_proxy.py
+++ b/scripts/cache_aware_proxy.py
@@ -521,7 +521,12 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
    def _push_allowed(cache_hit: int) -> bool:
        if _current_offloads() >= SETTINGS.max_offload_inflight:
            return False
-        if input_length < SETTINGS.heavy_threshold:
+        push_new = max(0, input_length - cache_hit)
+        if push_new < SETTINGS.heavy_threshold:
+            return False
+        if SETTINGS.cache_gate_ratio > 0:
+            cache_ratio = cache_hit / max(input_length, 1)
+            if cache_ratio < SETTINGS.cache_gate_ratio:
                return False
        return True

@@ -537,26 +542,22 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
        if (offload_enabled and best_cache_hit > 0 and _push_allowed(best_cache_hit)
                and i != best_cache_idx and local_hit < best_cache_hit):
            push_new = max(0, input_length - best_cache_hit)
-            c_inst = combined_instances[best_cache_idx]
-            c_cost = (c_inst.num_requests * SETTINGS.decode_iteration_s
-                      + c_inst.pending_prefill_tokens / throughput
-                      + push_new / throughput)
-            d_cost = contention + prefill_queue + SETTINGS.rdma_overhead_s
-            push_cost = max(c_cost, d_cost)
+            target_contention = inst.num_requests * SETTINGS.decode_iteration_s
+            push_cost = target_contention + push_new / throughput + SETTINGS.rdma_overhead_s
            if session_id and session_id in session_affinity_combined:
-                turn_discount = min(SETTINGS.migration_discount_cap, 5) * SETTINGS.decode_iteration_s
+                turn_discount = min(SETTINGS.migration_discount_cap, 3) * SETTINGS.decode_iteration_s
                push_cost -= turn_discount
            if push_cost < local_cost:
                return push_cost, True
        return local_cost, False

    # Session affinity: prefer the last-used instance if its cost is reasonable
-    avg_reqs = max(sum(i.num_requests for i in combined_instances) / len(combined_instances), 1.0)
+    avg_load = max(sum(i.ongoing_tokens for i in combined_instances) / len(combined_instances), 1.0)
    affinity_idx = session_affinity_combined.get(session_id) if session_id else None
    if affinity_idx is not None and affinity_idx < len(combined_instances):
        affinity_inst = combined_instances[affinity_idx]
        # Hard gate: break affinity if instance is overloaded regardless of cache
-        if affinity_inst.num_requests <= avg_reqs * SETTINGS.overload_factor:
+        if affinity_inst.ongoing_tokens <= avg_load * SETTINGS.overload_factor:
            affinity_cost, affinity_push = _instance_cost(affinity_idx)
            all_costs = [_instance_cost(i) for i in range(len(combined_instances))]
            global_best_cost = min(c for c, _ in all_costs)
@@ -617,6 +618,16 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
                return await _handle_local_request(
                    api, req_data, headers, token_ids, input_length,
                    chosen, estimated_new, breakdown)
+            if push_new < SETTINGS.heavy_threshold:
+                breakdown["push_downgraded"] = "below_heavy_threshold"
+                return await _handle_local_request(
+                    api, req_data, headers, token_ids, input_length,
+                    chosen, estimated_new, breakdown)
+            if SETTINGS.cache_gate_ratio > 0 and cache_ratio < SETTINGS.cache_gate_ratio:
+                breakdown["push_downgraded"] = "cache_gate"
+                return await _handle_local_request(
+                    api, req_data, headers, token_ids, input_length,
+                    chosen, estimated_new, breakdown)

            offload_mode = getattr(global_args, 'offload_mode', 'cached_prefill')
            breakdown["c_inst"] = c_inst.url