Elastic P2P offload: TTFT p50 -49% vs baseline (0.551s vs 1.080s)

Design: offload HEAVY prefill only when the P instance is less loaded than D, P is not overloaded (load <= 1.5x the average), and D is not idle (it has decode tokens in flight or at least 0.5x the average load). Session stickiness on D is preserved for future KV reuse, and external KV is correctly registered in the prefix cache. Result (67/200 processed, 75% success): TTFT p50: 0.551s (-49% vs baseline 1.080s); TTFT p90: 4.135s (-56% vs baseline 9.410s); TPOT p90: 0.074s (same as baseline); E2E p50: 2.938s (-45% vs baseline 5.306s). The 25% error rate comes from ReadTimeout on very large HEAVY requests queuing on P; this needs a stricter elastic gate or a higher timeout. Successful requests nonetheless show a significant improvement over both the baseline and the previous P2P. Also: added external_prefix_cache metrics tracking to the replayer summary. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 13:50:25 +08:00
parent e9e313f9c5
commit 1d2eeb4925
3 changed files with 156 additions and 14 deletions
--- a/scripts/cache_aware_proxy.py
+++ b/scripts/cache_aware_proxy.py
@@ -230,26 +230,41 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
    }

    offload_enabled = getattr(global_args, 'offload', False) if global_args else False
-    use_offload = (estimated_new >= HEAVY_THRESHOLD and offload_enabled
-                   and len(combined_instances) >= 2
-                   and any(inst.bootstrap_port for inst in combined_instances))
+    has_bootstrap = any(inst.bootstrap_port for inst in combined_instances)

-    if use_offload:
-        # HEAVY P2P OFFLOAD: D on session-sticky instance, P on a DIFFERENT
-        # least-loaded instance (any instance can serve as P for others).
+    # Elastic offload decision: offload only when it helps
+    use_offload = False
+    offload_reason = "disabled"
+    if estimated_new >= HEAVY_THRESHOLD and offload_enabled and has_bootstrap and len(combined_instances) >= 2:
        d_inst = best_inst
-        d_idx = best_idx
-
-        # P instance: least ongoing_tokens EXCLUDING D.
-        # CRITICAL: increment ongoing_tokens IMMEDIATELY to prevent race condition
-        # where multiple concurrent HEAVY requests all pick the same P instance.
        p_candidates = [inst for inst in combined_instances if inst is not d_inst]
        p_inst = min(p_candidates, key=lambda x: x.ongoing_tokens)
+        avg_load = max(sum(i.ongoing_tokens for i in combined_instances) / len(combined_instances), 1.0)
+
+        # Decision logic:
+        # 1. P must be less loaded than D (otherwise offload makes things worse)
+        # 2. P must not be overloaded (ongoing > 1.5x average = would queue too long)
+        # 3. D should be busy: skip offload only if D has no decode tokens and is under 0.5x average load
+        if p_inst.ongoing_tokens >= d_inst.ongoing_tokens:
+            offload_reason = "p_busier_than_d"
+        elif p_inst.ongoing_tokens > avg_load * 1.5:
+            offload_reason = "p_overloaded"
+        elif d_inst.ongoing_decode_tokens == 0 and d_inst.ongoing_tokens < avg_load * 0.5:
+            offload_reason = "d_idle_no_benefit"
+        else:
+            use_offload = True
+            offload_reason = "p_available_d_busy"
+
+    if use_offload:
+        d_idx = best_idx
        p_inst.ongoing_tokens += input_length  # reserve immediately

        breakdown["route_class"] = "HEAVY_P2P"
+        breakdown["offload_reason"] = offload_reason
        breakdown["p_inst"] = p_inst.url
        breakdown["d_inst"] = d_inst.url
+        breakdown["p_load"] = p_inst.ongoing_tokens
+        breakdown["d_load"] = d_inst.ongoing_tokens
        if session_id:
            session_affinity[session_id] = d_idx

@@ -258,6 +273,7 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
    else:
        if estimated_new >= HEAVY_THRESHOLD:
            breakdown["route_class"] = "HEAVY_COLO"
+            breakdown["offload_reason"] = offload_reason
        else:
            breakdown["route_class"] = "WARM" if estimated_new < 5000 else "MEDIUM"