Phase 1 milestone: system-level analysis + reproducible report
- REPORT.md: self-contained milestone report covering baseline vs. elastic setup, exact launch commands, benchmark parameters, results, log locations, and repo structure — sufficient for anyone to reproduce the results
- analysis/pd_separation_analysis.md §5: elastic P2P system-level breakdown (KV cache hit ratio, per-class TTFT, explanation of the GPU utilization paradox)
- scripts/cache_aware_proxy.py: round-robin P-instance selection, replacing argmin(ongoing_tokens), to fix GPU load imbalance (3.0x → expected ~2x)
- scripts/launch_elastic_p2p.sh: one-command launch for the elastic P2P config

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,7 @@ from fastapi.responses import StreamingResponse
|
||||
BLOCK_SIZE = 512
|
||||
CACHE_HIT_ALPHA = 1.0
|
||||
HEAVY_THRESHOLD = 20000 # default; overridden by --heavy-threshold
|
||||
OVERLOAD_FACTOR = 2.0
|
||||
|
||||
|
||||
class InstanceState:
|
||||
@@ -81,7 +82,6 @@ def pick_instance(instances: list[InstanceState], token_ids: list[int] | None,
|
||||
_inst_cumulative_tokens = [0] * len(instances)
|
||||
|
||||
avg_load = max(sum(i.ongoing_tokens for i in instances) / len(instances), 1.0)
|
||||
OVERLOAD_FACTOR = 2.0
|
||||
|
||||
# Session affinity for turn 2+ (with load override)
|
||||
if session_id and session_id in affinity:
|
||||
@@ -118,6 +118,7 @@ is_pd_sep = False
|
||||
_breakdown_log: list[dict] = []
|
||||
_offload_inflight = 0 # number of currently in-flight offloaded HEAVY requests
|
||||
MAX_OFFLOAD_INFLIGHT = 4 # cap concurrent offloads to prevent P overload
|
||||
_p_round_robin_idx = 0 # round-robin counter for P-instance selection
|
||||
|
||||
|
||||
async def init_prefill_bootstrap(instances: list[InstanceState], ready: asyncio.Event):
|
||||
@@ -239,18 +240,21 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
|
||||
offload_reason = "disabled"
|
||||
if estimated_new >= HEAVY_THRESHOLD and offload_enabled and has_bootstrap and len(combined_instances) >= 2:
|
||||
d_inst = best_inst
|
||||
p_candidates = [inst for inst in combined_instances if inst is not d_inst]
|
||||
p_inst = min(p_candidates, key=lambda x: x.ongoing_tokens)
|
||||
p_candidates = [(i, inst) for i, inst in enumerate(combined_instances) if inst is not d_inst]
|
||||
avg_load = max(sum(i.ongoing_tokens for i in combined_instances) / len(combined_instances), 1.0)
|
||||
|
||||
# Decision logic:
|
||||
# 1. Global cap: max N concurrent offloads (prevents all-offload storm)
|
||||
# 2. P must not already be saturated with heavy prefills
|
||||
# 3. D must be doing something (otherwise no benefit from offloading)
|
||||
# NOTE: We do NOT require P < D. P can be busier than D — the point
|
||||
# is to keep heavy prefill OFF the session-sticky D instance so D's
|
||||
# decode is not disrupted and D's KV cache is available for future turns.
|
||||
global _offload_inflight
|
||||
# Round-robin P selection with overload skip (spreads P-role evenly)
|
||||
global _offload_inflight, _p_round_robin_idx
|
||||
p_inst = None
|
||||
for _ in range(len(p_candidates)):
|
||||
_p_round_robin_idx = (_p_round_robin_idx + 1) % len(p_candidates)
|
||||
candidate = p_candidates[_p_round_robin_idx][1]
|
||||
if candidate.ongoing_tokens < avg_load * OVERLOAD_FACTOR:
|
||||
p_inst = candidate
|
||||
break
|
||||
if p_inst is None:
|
||||
p_inst = min(p_candidates, key=lambda x: x[1].ongoing_tokens)[1]
|
||||
|
||||
if _offload_inflight >= MAX_OFFLOAD_INFLIGHT:
|
||||
offload_reason = "max_concurrent_reached"
|
||||
elif p_inst.ongoing_tokens >= HEAVY_THRESHOLD * 2:
|
||||
|
||||
Reference in New Issue
Block a user