Invalidate prior A/B results + add proper experiment harness

Prior cross-machine comparison (commit 1e86285) was invalid: dash0
baseline used warm instances with residual KV cache, inflating TTFT
by 2x. Evidence: inst_7 APC=68.3% impossible from 25 cold-start
requests; WARM TTFT p90=3.3s vs fresh=0.26s.

Fair same-machine comparison (both fresh restart on dash0):
  Baseline:    TTFT50=1.075  TPOT90=0.076  E2E50=5.075  OK=198/200
  Elastic P2P: TTFT50=1.018  TPOT90=0.085  E2E50=6.977  OK=195/200
Elastic is WORSE due to Mooncake kv_both memory overhead.

Changes:
- REPORT.md: rewrite §3-4 with corrected results, add §3.5 errata
- pd_separation_analysis.md: update elastic TL;DR with correct numbers
- cache_aware_proxy.py: fix double-decrement bugs in offload path,
  add 120s prefill timeout with co-located fallback (HEAVY_COLO_FALLBACK)
- bench.sh: standardized experiment harness with guaranteed GPU cleanup
  and fresh-state verification (nvidia-smi check before start)
- run_elastic_stability_test.sh: two-phase elastic vs baseline test

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-22 17:54:21 +08:00
parent e4fa56cb1e
commit fc92410ec9
5 changed files with 775 additions and 116 deletions

View File

@@ -361,13 +361,22 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
return StreamingResponse(generate(), media_type="text/event-stream")
PREFILL_TIMEOUT_S = 120 # max seconds to wait for P-instance prefill
async def _handle_heavy_offload(api, req_data, headers, token_ids, input_length,
p_inst, d_inst, breakdown):
"""HEAVY request: prefill on p_inst, KV via Mooncake, decode on d_inst."""
"""HEAVY request: prefill on p_inst, KV via Mooncake, decode on d_inst.
On prefill timeout/failure, falls back to co-located decode on d_inst.
"""
global _offload_inflight
request_id = headers.get("X-Request-Id", "")
estimated_new = breakdown.get("estimated_new_tokens", 0)
# Step 1: Await prefill on p_inst (ongoing_tokens already reserved by caller)
breakdown["t_prefill_sent"] = _time.monotonic()
prefill_ok = False
try:
prefill_data = req_data.copy()
prefill_data["kv_transfer_params"] = {
@@ -381,25 +390,56 @@ async def _handle_heavy_offload(api, req_data, headers, token_ids, input_length,
prefill_data.pop("stream_options", None)
p_headers = {**headers, "X-data-parallel-rank": "0"}
resp = await p_inst.client.post(api, json=prefill_data, headers=p_headers)
resp = await asyncio.wait_for(
p_inst.client.post(api, json=prefill_data, headers=p_headers),
timeout=PREFILL_TIMEOUT_S,
)
resp.raise_for_status()
await resp.aclose()
p_inst.record_prefix(token_ids)
breakdown["t_prefill_done"] = _time.monotonic()
prefill_ok = True
except Exception as e:
breakdown["t_prefill_done"] = _time.monotonic()
breakdown["error"] = str(e)
_breakdown_log.append(breakdown)
global _offload_inflight
_offload_inflight = max(0, _offload_inflight - 1)
p_inst.num_requests -= 1
raise HTTPException(status_code=502, detail="Prefill failed: %s" % e)
breakdown["prefill_error"] = str(e)
finally:
# Always release P-instance resources exactly once
p_inst.ongoing_tokens -= input_length
p_inst.pending_prefill_tokens -= breakdown.get("estimated_new_tokens", 0)
p_inst.pending_prefill_tokens -= estimated_new
p_inst.num_requests -= 1
_offload_inflight = max(0, _offload_inflight - 1)
p_inst.num_requests -= 1
if not prefill_ok:
# Fallback: co-located prefill+decode on d_inst (no KV transfer)
breakdown["route_class"] = "HEAVY_COLO_FALLBACK"
d_inst.ongoing_tokens += input_length
d_inst.pending_prefill_tokens += estimated_new
d_inst.num_requests += 1
async def generate_fallback():
prefill_done = False
try:
async with d_inst.client.stream("POST", api, json=req_data, headers=headers) as resp:
resp.raise_for_status()
async for chunk in resp.aiter_bytes():
if not prefill_done:
d_inst.pending_prefill_tokens -= estimated_new
d_inst.ongoing_decode_tokens += input_length
breakdown["t_first_token"] = _time.monotonic()
prefill_done = True
yield chunk
d_inst.record_prefix(token_ids)
finally:
if not prefill_done:
d_inst.pending_prefill_tokens -= estimated_new
else:
d_inst.ongoing_decode_tokens -= input_length
d_inst.ongoing_tokens -= input_length
d_inst.num_requests -= 1
breakdown["t_done"] = _time.monotonic()
_breakdown_log.append(breakdown)
return StreamingResponse(generate_fallback(), media_type="text/event-stream")
# Step 2: Stream decode on d_inst (pulls KV from Mooncake)
d_inst.ongoing_tokens += input_length