Invalidate prior A/B results + add proper experiment harness

Prior cross-machine comparison (commit 1e86285) was invalid: dash0 baseline used warm instances with residual KV cache, inflating TTFT by 2x. Evidence: inst_7 APC=68.3% impossible from 25 cold-start requests; WARM TTFT p90=3.3s vs fresh=0.26s. Fair same-machine comparison (both fresh restart on dash0): Baseline: TTFT50=1.075 TPOT90=0.076 E2E50=5.075 OK=198/200 Elastic P2P: TTFT50=1.018 TPOT90=0.085 E2E50=6.977 OK=195/200 Elastic is WORSE due to Mooncake kv_both memory overhead. Changes: - REPORT.md: rewrite §3-4 with corrected results, add §3.5 errata - pd_separation_analysis.md: update elastic TL;DR with correct numbers - cache_aware_proxy.py: fix double-decrement bugs in offload path, add 120s prefill timeout with co-located fallback (HEAVY_COLO_FALLBACK) - bench.sh: standardized experiment harness with guaranteed GPU cleanup and fresh-state verification (nvidia-smi check before start) - run_elastic_stability_test.sh: two-phase elastic vs baseline test Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 17:54:21 +08:00
parent e4fa56cb1e
commit fc92410ec9
5 changed files with 775 additions and 116 deletions
--- a/scripts/cache_aware_proxy.py
+++ b/scripts/cache_aware_proxy.py
@@ -361,13 +361,22 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
        return StreamingResponse(generate(), media_type="text/event-stream")


+PREFILL_TIMEOUT_S = 120  # max seconds to wait for P-instance prefill
+
+
 async def _handle_heavy_offload(api, req_data, headers, token_ids, input_length,
                                 p_inst, d_inst, breakdown):
-    """HEAVY request: prefill on p_inst, KV via Mooncake, decode on d_inst."""
+    """HEAVY request: prefill on p_inst, KV via Mooncake, decode on d_inst.
+
+    On prefill timeout/failure, falls back to co-located decode on d_inst.
+    """
+    global _offload_inflight
    request_id = headers.get("X-Request-Id", "")
+    estimated_new = breakdown.get("estimated_new_tokens", 0)

    # Step 1: Await prefill on p_inst (ongoing_tokens already reserved by caller)
    breakdown["t_prefill_sent"] = _time.monotonic()
+    prefill_ok = False
    try:
        prefill_data = req_data.copy()
        prefill_data["kv_transfer_params"] = {
@@ -381,25 +390,56 @@ async def _handle_heavy_offload(api, req_data, headers, token_ids, input_length,
        prefill_data.pop("stream_options", None)

        p_headers = {**headers, "X-data-parallel-rank": "0"}
-        resp = await p_inst.client.post(api, json=prefill_data, headers=p_headers)
+        resp = await asyncio.wait_for(
+            p_inst.client.post(api, json=prefill_data, headers=p_headers),
+            timeout=PREFILL_TIMEOUT_S,
+        )
        resp.raise_for_status()
        await resp.aclose()
        p_inst.record_prefix(token_ids)
        breakdown["t_prefill_done"] = _time.monotonic()
+        prefill_ok = True
    except Exception as e:
        breakdown["t_prefill_done"] = _time.monotonic()
-        breakdown["error"] = str(e)
-        _breakdown_log.append(breakdown)
-        global _offload_inflight
-        _offload_inflight = max(0, _offload_inflight - 1)
-        p_inst.num_requests -= 1
-        raise HTTPException(status_code=502, detail="Prefill failed: %s" % e)
+        breakdown["prefill_error"] = str(e)
    finally:
+        # Always release P-instance resources exactly once
        p_inst.ongoing_tokens -= input_length
-        p_inst.pending_prefill_tokens -= breakdown.get("estimated_new_tokens", 0)
+        p_inst.pending_prefill_tokens -= estimated_new
+        p_inst.num_requests -= 1
        _offload_inflight = max(0, _offload_inflight - 1)

-    p_inst.num_requests -= 1
+    if not prefill_ok:
+        # Fallback: co-located prefill+decode on d_inst (no KV transfer)
+        breakdown["route_class"] = "HEAVY_COLO_FALLBACK"
+        d_inst.ongoing_tokens += input_length
+        d_inst.pending_prefill_tokens += estimated_new
+        d_inst.num_requests += 1
+
+        async def generate_fallback():
+            prefill_done = False
+            try:
+                async with d_inst.client.stream("POST", api, json=req_data, headers=headers) as resp:
+                    resp.raise_for_status()
+                    async for chunk in resp.aiter_bytes():
+                        if not prefill_done:
+                            d_inst.pending_prefill_tokens -= estimated_new
+                            d_inst.ongoing_decode_tokens += input_length
+                            breakdown["t_first_token"] = _time.monotonic()
+                            prefill_done = True
+                        yield chunk
+                d_inst.record_prefix(token_ids)
+            finally:
+                if not prefill_done:
+                    d_inst.pending_prefill_tokens -= estimated_new
+                else:
+                    d_inst.ongoing_decode_tokens -= input_length
+                d_inst.ongoing_tokens -= input_length
+                d_inst.num_requests -= 1
+                breakdown["t_done"] = _time.monotonic()
+                _breakdown_log.append(breakdown)
+
+        return StreamingResponse(generate_fallback(), media_type="text/event-stream")

    # Step 2: Stream decode on d_inst (pulls KV from Mooncake)
    d_inst.ongoing_tokens += input_length