From 4b833d33b7d79537bbeb05dfb7707d58a0c04cdf Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Tue, 26 May 2026 10:40:57 +0800 Subject: [PATCH] unified_v2.1: relax gates + add unified_kv_both isolation control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2.0 ran on B3 and triggered PD-sep only 2 / 1214 times (0.2%). The gates were too conservative; the v2-vs-v1 latency gap (TTFT p90 7.35 -> 8.96 s) is therefore probably attributable to kv_both always-on overhead, not to the PD-sep mechanism itself. v2.1 has two fixes plus an isolation control. Bug fix: - The "chosen has live decodes worth protecting" gate combined num_requests and ongoing_decode_tokens with AND, falling through when EITHER was small. Under agentic workloads each worker rarely stacks more than 1-2 concurrent requests, so the gate killed 84% of v2.0 candidates that reached it. Replace with a pure ongoing_decode_tokens == 0 check ("chosen_no_active_decode") — same semantic, much higher recall. Threshold relaxation (B2 microbench is the calibration source): - pd_sep_min_new_tokens: 16000 -> 8000 (B2 TPOT idx 1.9x already at 8k, TTFT idx 12x — strictly worth migrating) - pd_sep_min_decodes_protected: 2 -> 1 - pd_sep_min_src_cache_tokens: 8000 -> 4000 - pd_sep_min_extra_cache_tokens: 4000 -> 2000 Isolation control: - New --policy unified_kv_both option. Uses the exact same picker as --policy unified but the vLLMs are launched in kv_role=kv_both (the same launch mode unified_v2 requires). PD-sep never fires. Compares against unified_v2 to attribute any v2 effect to the PD-sep branch alone, not the kv_both always-on overhead. - Both unified_kv_both and unified_v2 auto-enable kv_both launch in b3_isolated_policy.sh. Tests: - Updated the existing "chosen has no decodes" test for the new gate name and semantic. - All 24 proxy tests pass. Refs: window_1_results/v2_breakdown analysis (88.7% of candidates caught by old new_local_below_threshold; 84% of the remainder caught by the old few_decodes gate). Co-Authored-By: Claude Opus 4.7 --- scripts/b3_isolated_policy.sh | 2 +- scripts/cache_aware_proxy.py | 46 ++++++++++++++++++++++++----------- tests/test_proxy_pick.py | 4 +-- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/scripts/b3_isolated_policy.sh b/scripts/b3_isolated_policy.sh index f77e64b..a5d0dce 100755 --- a/scripts/b3_isolated_policy.sh +++ b/scripts/b3_isolated_policy.sh @@ -31,7 +31,7 @@ TRACE="${2:?usage: $0 }" RUNDIR="${3:?usage: $0 }" # Auto-enable kv_both when the policy requires it. -if [ "$POLICY" = "unified_v2" ]; then +if [ "$POLICY" = "unified_v2" ] || [ "$POLICY" = "unified_kv_both" ]; then ENABLE_KV_BOTH=1 fi diff --git a/scripts/cache_aware_proxy.py b/scripts/cache_aware_proxy.py index 4b8d46f..680b898 100644 --- a/scripts/cache_aware_proxy.py +++ b/scripts/cache_aware_proxy.py @@ -70,11 +70,14 @@ class Settings: # 2 × 48 × 4 × 128 × 2 = 98304 bytes per token. kv_bytes_per_token: int = 98304 - # --- unified_v2 gating knobs --- - pd_sep_min_new_tokens: int = 16000 # B2 idx ≥ 3× starts here - pd_sep_min_decodes_protected: int = 2 # require src has live decodes to protect - pd_sep_min_src_cache_tokens: int = 8000 # require non-trivial cache to transfer - pd_sep_min_extra_cache_tokens: int = 4000 # src must have meaningfully more cache than chosen + # --- unified_v2 gating knobs (relaxed in v2.1 after the v1 0.2% trigger rate) --- + # B2 microbench shows TPOT idx 1.9x already at new_tokens=8k and TTFT + # idx ~12x; the previous 16k threshold was too conservative and + # rejected 88.7% of candidates (window_1_results/v2_breakdown). + pd_sep_min_new_tokens: int = 8000 + pd_sep_min_decodes_protected: int = 1 # any in-flight work on chosen counts + pd_sep_min_src_cache_tokens: int = 4000 # half a block; was 8000 + pd_sep_min_extra_cache_tokens: int = 2000 # half a block; was 4000 pd_sep_margin_s: float = 0.2 # require cost gap > 0.2 s before migrating # Patch 6.6: per-request KV-xfer wall-clock timeout (proxy side). pd_sep_xfer_timeout_s: float = 60.0 @@ -461,12 +464,18 @@ def pick_instance_unified_v2( decision["v2_reason"] = f"new_local_below_threshold ({new_local} < {SETTINGS.pd_sep_min_new_tokens})" return chosen, chosen_idx, decision, None - # Hard gate 2: chosen must have live decodes worth protecting. - if chosen.ongoing_decode_tokens // max(1, SETTINGS.heavy_threshold) < 1 \ - and chosen.num_requests < SETTINGS.pd_sep_min_decodes_protected: - # Heuristic for "num_decodes": prefer num_requests as an upper - # bound since we don't track decode count separately at route time. - decision["v2_reason"] = f"chosen_few_decodes ({chosen.num_requests})" + # Hard gate 2: chosen must have live decoding work to protect. + # v2.1 simplification: pure ongoing_decode_tokens check. The previous + # gate combined num_requests and decode_tokens with AND, but + # num_requests includes requests still in prefill — adding a prefill + # to a chosen that has only its own prefill running doesn't disrupt + # any decode, so skipping makes sense. The right semantic is "skip + # iff no decode is currently happening on chosen". + if chosen.ongoing_decode_tokens == 0: + decision["v2_reason"] = ( + f"chosen_no_active_decode " + f"(num_req={chosen.num_requests} decode_tok={chosen.ongoing_decode_tokens})" + ) return chosen, chosen_idx, decision, None # Find best alternative cache source. @@ -684,7 +693,10 @@ async def lifespan(app: FastAPI): # Bootstrap combined instances for offload (need engine_ids for KV transfer) policy = getattr(global_args, 'policy', 'linear') - needs_bootstrap = global_args.offload or policy == "unified_v2" + needs_bootstrap = ( + global_args.offload + or policy in ("unified_v2", "unified_kv_both") + ) if needs_bootstrap and bp_list: await init_prefill_bootstrap(combined_instances, app.state.ready) elif needs_bootstrap and not bp_list: @@ -846,7 +858,11 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h chosen, best_idx = pick_instance_sticky( combined_instances, token_ids, session_id, input_length, session_affinity_combined) - elif policy == "unified": + elif policy == "unified" or policy == "unified_kv_both": + # unified_kv_both: same picker as `unified`, but the vLLMs are + # launched in kv_role=kv_both. Use this as an isolation control + # for `unified_v2` so the v2-vs-v1 gap reflects only the PD-sep + # branch, not the kv_both always-on overhead. chosen, best_idx, decision = pick_instance_unified_hybrid( combined_instances, token_ids, session_id, input_length, session_affinity_combined) @@ -1206,11 +1222,13 @@ def parse_args(): help="Comma-separated bootstrap ports for combined instances (for offload mode)") p.add_argument("--policy", type=str, default="linear", choices=["linear", "lmetric", "load_only", "sticky", - "unified", "unified_v2"], + "unified", "unified_kv_both", "unified_v2"], help="Routing policy: linear (cache-aware), lmetric (P_tokens × BS), " "load_only (B3 control: pure min-num_requests), " "sticky (B3 control: hard session affinity), " "unified (hybrid affinity + LMetric fallback), " + "unified_kv_both (unified on kv_both vLLMs; isolation " + "control for unified_v2; PD-sep never triggers), " "or unified_v2 (unified + selective per-request PD-sep " "via Mooncake; requires --bootstrap-ports and " "kv_role=kv_both vLLM launch)") diff --git a/tests/test_proxy_pick.py b/tests/test_proxy_pick.py index fd4d4f6..b051f09 100644 --- a/tests/test_proxy_pick.py +++ b/tests/test_proxy_pick.py @@ -410,12 +410,12 @@ def test_unified_v2_triggers_when_src_has_meaningful_cache_and_chosen_has_decode def test_unified_v2_falls_through_when_chosen_has_no_decodes(proxy): - """No decodes on chosen → no benefit from PD-sep.""" + """No decoding work on chosen → no benefit from PD-sep.""" insts, prefix = _setup_v2_scene(proxy, chosen_decodes=0, src_cache_blocks=128) chosen, idx, decision, pd_sep = proxy.pick_instance_unified_v2( insts, prefix, None, len(prefix), {}) assert pd_sep is None - assert "few_decodes" in decision["v2_reason"] + assert "no_active_decode" in decision["v2_reason"] def test_estimate_transfer_cost_is_calibrated_function(proxy):