Phase 1 milestone: system-level analysis + reproducible report

- REPORT.md: self-contained milestone report covering baseline vs. elastic
  setup, exact launch commands, benchmark params, results, log locations,
  and repo structure — sufficient for anyone to reproduce the results
- analysis/pd_separation_analysis.md §5: elastic P2P system-level breakdown
  (KV cache hit ratio, per-class TTFT, GPU util paradox explanation)
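- scripts/cache_aware_proxy.py: round-robin P-instance selection (skipping
  overloaded instances, with an argmin(ongoing_tokens) fallback when every
  candidate is overloaded) replacing plain argmin(ongoing_tokens) to fix
  GPU load imbalance (3.0x → expected ~2x)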
- scripts/launch_elastic_p2p.sh: one-command launch for elastic P2P config

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-22 16:17:41 +08:00
parent 1e8628581b
commit 2b0ac70ee7
5 changed files with 617 additions and 14 deletions

View File

@@ -26,6 +26,7 @@ from fastapi.responses import StreamingResponse
BLOCK_SIZE = 512
CACHE_HIT_ALPHA = 1.0
HEAVY_THRESHOLD = 20000 # default; overridden by --heavy-threshold
OVERLOAD_FACTOR = 2.0
class InstanceState:
@@ -81,7 +82,6 @@ def pick_instance(instances: list[InstanceState], token_ids: list[int] | None,
_inst_cumulative_tokens = [0] * len(instances)
avg_load = max(sum(i.ongoing_tokens for i in instances) / len(instances), 1.0)
OVERLOAD_FACTOR = 2.0
# Session affinity for turn 2+ (with load override)
if session_id and session_id in affinity:
@@ -118,6 +118,7 @@ is_pd_sep = False
_breakdown_log: list[dict] = []
_offload_inflight = 0 # number of currently in-flight offloaded HEAVY requests
MAX_OFFLOAD_INFLIGHT = 4 # cap concurrent offloads to prevent P overload
_p_round_robin_idx = 0 # round-robin counter for P-instance selection
async def init_prefill_bootstrap(instances: list[InstanceState], ready: asyncio.Event):
@@ -239,18 +240,21 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
offload_reason = "disabled"
if estimated_new >= HEAVY_THRESHOLD and offload_enabled and has_bootstrap and len(combined_instances) >= 2:
d_inst = best_inst
p_candidates = [inst for inst in combined_instances if inst is not d_inst]
p_inst = min(p_candidates, key=lambda x: x.ongoing_tokens)
p_candidates = [(i, inst) for i, inst in enumerate(combined_instances) if inst is not d_inst]
avg_load = max(sum(i.ongoing_tokens for i in combined_instances) / len(combined_instances), 1.0)
# Decision logic:
# 1. Global cap: max N concurrent offloads (prevents all-offload storm)
# 2. P must not already be saturated with heavy prefills
# 3. D must be doing something (otherwise no benefit from offloading)
# NOTE: We do NOT require P < D. P can be busier than D — the point
# is to keep heavy prefill OFF the session-sticky D instance so D's
# decode is not disrupted and D's KV cache is available for future turns.
global _offload_inflight
# Round-robin P selection with overload skip (spreads P-role evenly)
global _offload_inflight, _p_round_robin_idx
p_inst = None
for _ in range(len(p_candidates)):
_p_round_robin_idx = (_p_round_robin_idx + 1) % len(p_candidates)
candidate = p_candidates[_p_round_robin_idx][1]
if candidate.ongoing_tokens < avg_load * OVERLOAD_FACTOR:
p_inst = candidate
break
if p_inst is None:
p_inst = min(p_candidates, key=lambda x: x[1].ongoing_tokens)[1]
if _offload_inflight >= MAX_OFFLOAD_INFLIGHT:
offload_reason = "max_concurrent_reached"
elif p_inst.ongoing_tokens >= HEAVY_THRESHOLD * 2: