Invalidate prior A/B results + add proper experiment harness
Prior cross-machine comparison (commit 1e86285) was invalid: dash0
baseline used warm instances with residual KV cache, inflating TTFT
by 2x. Evidence: inst_7 APC=68.3% impossible from 25 cold-start
requests; WARM TTFT p90=3.3s vs fresh=0.26s.
Fair same-machine comparison (both fresh restart on dash0):
Baseline: TTFT50=1.075 TPOT90=0.076 E2E50=5.075 OK=198/200
Elastic P2P: TTFT50=1.018 TPOT90=0.085 E2E50=6.977 OK=195/200
Elastic is WORSE due to Mooncake kv_both memory overhead.
Changes:
- REPORT.md: rewrite §3-4 with corrected results, add §3.5 errata
- pd_separation_analysis.md: update elastic TL;DR with correct numbers
- cache_aware_proxy.py: fix double-decrement bugs in offload path,
add 120s prefill timeout with co-located fallback (HEAVY_COLO_FALLBACK)
- bench.sh: standardized experiment harness with guaranteed GPU cleanup
and fresh-state verification (nvidia-smi check before start)
- run_elastic_stability_test.sh: two-phase elastic vs baseline test
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -361,13 +361,22 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
|
||||
return StreamingResponse(generate(), media_type="text/event-stream")
|
||||
|
||||
|
||||
PREFILL_TIMEOUT_S = 120 # max seconds to wait for P-instance prefill
|
||||
|
||||
|
||||
async def _handle_heavy_offload(api, req_data, headers, token_ids, input_length,
|
||||
p_inst, d_inst, breakdown):
|
||||
"""HEAVY request: prefill on p_inst, KV via Mooncake, decode on d_inst."""
|
||||
"""HEAVY request: prefill on p_inst, KV via Mooncake, decode on d_inst.
|
||||
|
||||
On prefill timeout/failure, falls back to co-located decode on d_inst.
|
||||
"""
|
||||
global _offload_inflight
|
||||
request_id = headers.get("X-Request-Id", "")
|
||||
estimated_new = breakdown.get("estimated_new_tokens", 0)
|
||||
|
||||
# Step 1: Await prefill on p_inst (ongoing_tokens already reserved by caller)
|
||||
breakdown["t_prefill_sent"] = _time.monotonic()
|
||||
prefill_ok = False
|
||||
try:
|
||||
prefill_data = req_data.copy()
|
||||
prefill_data["kv_transfer_params"] = {
|
||||
@@ -381,25 +390,56 @@ async def _handle_heavy_offload(api, req_data, headers, token_ids, input_length,
|
||||
prefill_data.pop("stream_options", None)
|
||||
|
||||
p_headers = {**headers, "X-data-parallel-rank": "0"}
|
||||
resp = await p_inst.client.post(api, json=prefill_data, headers=p_headers)
|
||||
resp = await asyncio.wait_for(
|
||||
p_inst.client.post(api, json=prefill_data, headers=p_headers),
|
||||
timeout=PREFILL_TIMEOUT_S,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
await resp.aclose()
|
||||
p_inst.record_prefix(token_ids)
|
||||
breakdown["t_prefill_done"] = _time.monotonic()
|
||||
prefill_ok = True
|
||||
except Exception as e:
|
||||
breakdown["t_prefill_done"] = _time.monotonic()
|
||||
breakdown["error"] = str(e)
|
||||
_breakdown_log.append(breakdown)
|
||||
global _offload_inflight
|
||||
_offload_inflight = max(0, _offload_inflight - 1)
|
||||
p_inst.num_requests -= 1
|
||||
raise HTTPException(status_code=502, detail="Prefill failed: %s" % e)
|
||||
breakdown["prefill_error"] = str(e)
|
||||
finally:
|
||||
# Always release P-instance resources exactly once
|
||||
p_inst.ongoing_tokens -= input_length
|
||||
p_inst.pending_prefill_tokens -= breakdown.get("estimated_new_tokens", 0)
|
||||
p_inst.pending_prefill_tokens -= estimated_new
|
||||
p_inst.num_requests -= 1
|
||||
_offload_inflight = max(0, _offload_inflight - 1)
|
||||
|
||||
p_inst.num_requests -= 1
|
||||
if not prefill_ok:
|
||||
# Fallback: co-located prefill+decode on d_inst (no KV transfer)
|
||||
breakdown["route_class"] = "HEAVY_COLO_FALLBACK"
|
||||
d_inst.ongoing_tokens += input_length
|
||||
d_inst.pending_prefill_tokens += estimated_new
|
||||
d_inst.num_requests += 1
|
||||
|
||||
async def generate_fallback():
|
||||
prefill_done = False
|
||||
try:
|
||||
async with d_inst.client.stream("POST", api, json=req_data, headers=headers) as resp:
|
||||
resp.raise_for_status()
|
||||
async for chunk in resp.aiter_bytes():
|
||||
if not prefill_done:
|
||||
d_inst.pending_prefill_tokens -= estimated_new
|
||||
d_inst.ongoing_decode_tokens += input_length
|
||||
breakdown["t_first_token"] = _time.monotonic()
|
||||
prefill_done = True
|
||||
yield chunk
|
||||
d_inst.record_prefix(token_ids)
|
||||
finally:
|
||||
if not prefill_done:
|
||||
d_inst.pending_prefill_tokens -= estimated_new
|
||||
else:
|
||||
d_inst.ongoing_decode_tokens -= input_length
|
||||
d_inst.ongoing_tokens -= input_length
|
||||
d_inst.num_requests -= 1
|
||||
breakdown["t_done"] = _time.monotonic()
|
||||
_breakdown_log.append(breakdown)
|
||||
|
||||
return StreamingResponse(generate_fallback(), media_type="text/event-stream")
|
||||
|
||||
# Step 2: Stream decode on d_inst (pulls KV from Mooncake)
|
||||
d_inst.ongoing_tokens += input_length
|
||||
|
||||
Reference in New Issue
Block a user