Phase 1 milestone: system-level analysis + reproducible report

- REPORT.md: self-contained milestone report covering baseline vs. elastic setup, exact launch commands, benchmark parameters, results, log locations, and repo structure — sufficient for anyone to reproduce the results
- analysis/pd_separation_analysis.md §5: system-level breakdown of elastic P2P (KV cache hit ratio, per-class TTFT, explanation of the GPU utilization paradox)
- scripts/cache_aware_proxy.py: round-robin P-instance selection replaces argmin(ongoing_tokens) to fix GPU load imbalance (3.0x → expected ~2x)
- scripts/launch_elastic_p2p.sh: one-command launch for the elastic P2P config

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 16:17:41 +08:00
parent 1e8628581b
commit 2b0ac70ee7
5 changed files with 617 additions and 14 deletions
--- a/scripts/cache_aware_proxy.py
+++ b/scripts/cache_aware_proxy.py
@@ -26,6 +26,7 @@ from fastapi.responses import StreamingResponse
 BLOCK_SIZE = 512
 CACHE_HIT_ALPHA = 1.0
 HEAVY_THRESHOLD = 20000  # default; overridden by --heavy-threshold
+OVERLOAD_FACTOR = 2.0


 class InstanceState:
@@ -81,7 +82,6 @@ def pick_instance(instances: list[InstanceState], token_ids: list[int] | None,
        _inst_cumulative_tokens = [0] * len(instances)

    avg_load = max(sum(i.ongoing_tokens for i in instances) / len(instances), 1.0)
-    OVERLOAD_FACTOR = 2.0

    # Session affinity for turn 2+ (with load override)
    if session_id and session_id in affinity:
@@ -118,6 +118,7 @@ is_pd_sep = False
 _breakdown_log: list[dict] = []
 _offload_inflight = 0  # number of currently in-flight offloaded HEAVY requests
 MAX_OFFLOAD_INFLIGHT = 4  # cap concurrent offloads to prevent P overload
+_p_round_robin_idx = 0  # round-robin counter for P-instance selection


 async def init_prefill_bootstrap(instances: list[InstanceState], ready: asyncio.Event):
@@ -239,18 +240,21 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
    offload_reason = "disabled"
    if estimated_new >= HEAVY_THRESHOLD and offload_enabled and has_bootstrap and len(combined_instances) >= 2:
        d_inst = best_inst
-        p_candidates = [inst for inst in combined_instances if inst is not d_inst]
-        p_inst = min(p_candidates, key=lambda x: x.ongoing_tokens)
+        p_candidates = [(i, inst) for i, inst in enumerate(combined_instances) if inst is not d_inst]
        avg_load = max(sum(i.ongoing_tokens for i in combined_instances) / len(combined_instances), 1.0)

-        # Decision logic:
-        # 1. Global cap: max N concurrent offloads (prevents all-offload storm)
-        # 2. P must not already be saturated with heavy prefills
-        # 3. D must be doing something (otherwise no benefit from offloading)
-        # NOTE: We do NOT require P < D. P can be busier than D — the point
-        # is to keep heavy prefill OFF the session-sticky D instance so D's
-        # decode is not disrupted and D's KV cache is available for future turns.
-        global _offload_inflight
+        # Round-robin P selection with overload skip (spreads P-role evenly)
+        global _offload_inflight, _p_round_robin_idx
+        p_inst = None
+        for _ in range(len(p_candidates)):
+            _p_round_robin_idx = (_p_round_robin_idx + 1) % len(p_candidates)
+            candidate = p_candidates[_p_round_robin_idx][1]
+            if candidate.ongoing_tokens < avg_load * OVERLOAD_FACTOR:
+                p_inst = candidate
+                break
+        if p_inst is None:
+            p_inst = min(p_candidates, key=lambda x: x[1].ongoing_tokens)[1]
+
        if _offload_inflight >= MAX_OFFLOAD_INFLIGHT:
            offload_reason = "max_concurrent_reached"
        elif p_inst.ongoing_tokens >= HEAVY_THRESHOLD * 2: