unified_v2.1: relax gates + add unified_kv_both isolation control
v2.0 ran on B3 and triggered PD-sep only 2 / 1214 times (0.2%). The
gates were too conservative; the v2-vs-v1 latency gap (TTFT p90
7.35 -> 8.96 s) is therefore probably attributable to kv_both
always-on overhead, not to the PD-sep mechanism itself. v2.1 has two
fixes plus an isolation control.
Bug fix:
- The "chosen has live decodes worth protecting" gate combined
num_requests and ongoing_decode_tokens with AND, falling through
when EITHER was small. Under agentic workloads each worker rarely
stacks more than 1-2 concurrent requests, so the gate killed 84%
of v2.0 candidates that reached it. Replace with a pure
ongoing_decode_tokens == 0 check ("chosen_no_active_decode") —
same semantic, much higher recall.
Threshold relaxation (B2 microbench is the calibration source):
- pd_sep_min_new_tokens: 16000 -> 8000 (B2 TPOT idx 1.9x already
at 8k, TTFT idx 12x — strictly worth migrating)
- pd_sep_min_decodes_protected: 2 -> 1
- pd_sep_min_src_cache_tokens: 8000 -> 4000
- pd_sep_min_extra_cache_tokens: 4000 -> 2000
Isolation control:
- New --policy unified_kv_both option. Uses the exact same picker as
--policy unified but the vLLMs are launched in kv_role=kv_both
(the same launch mode unified_v2 requires). PD-sep never fires.
Compares against unified_v2 to attribute any v2 effect to the
PD-sep branch alone, not the kv_both always-on overhead.
- Both unified_kv_both and unified_v2 auto-enable kv_both launch in
b3_isolated_policy.sh.
Tests:
- Updated the existing "chosen has no decodes" test for the new
gate name and semantic.
- All 24 proxy tests pass.
Refs: window_1_results/v2_breakdown analysis (88.7% of candidates
caught by old new_local_below_threshold; 84% of the remainder
caught by the old few_decodes gate).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -31,7 +31,7 @@ TRACE="${2:?usage: $0 <policy> <trace> <rundir>}"
|
|||||||
RUNDIR="${3:?usage: $0 <policy> <trace> <rundir>}"
|
RUNDIR="${3:?usage: $0 <policy> <trace> <rundir>}"
|
||||||
|
|
||||||
# Auto-enable kv_both when the policy requires it.
|
# Auto-enable kv_both when the policy requires it.
|
||||||
if [ "$POLICY" = "unified_v2" ]; then
|
if [ "$POLICY" = "unified_v2" ] || [ "$POLICY" = "unified_kv_both" ]; then
|
||||||
ENABLE_KV_BOTH=1
|
ENABLE_KV_BOTH=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -70,11 +70,14 @@ class Settings:
|
|||||||
# 2 × 48 × 4 × 128 × 2 = 98304 bytes per token.
|
# 2 × 48 × 4 × 128 × 2 = 98304 bytes per token.
|
||||||
kv_bytes_per_token: int = 98304
|
kv_bytes_per_token: int = 98304
|
||||||
|
|
||||||
# --- unified_v2 gating knobs ---
|
# --- unified_v2 gating knobs (relaxed in v2.1 after the v1 0.2% trigger rate) ---
|
||||||
pd_sep_min_new_tokens: int = 16000 # B2 idx ≥ 3× starts here
|
# B2 microbench shows TPOT idx 1.9x already at new_tokens=8k and TTFT
|
||||||
pd_sep_min_decodes_protected: int = 2 # require src has live decodes to protect
|
# idx ~12x; the previous 16k threshold was too conservative and
|
||||||
pd_sep_min_src_cache_tokens: int = 8000 # require non-trivial cache to transfer
|
# rejected 88.7% of candidates (window_1_results/v2_breakdown).
|
||||||
pd_sep_min_extra_cache_tokens: int = 4000 # src must have meaningfully more cache than chosen
|
pd_sep_min_new_tokens: int = 8000
|
||||||
|
pd_sep_min_decodes_protected: int = 1 # any in-flight work on chosen counts
|
||||||
|
pd_sep_min_src_cache_tokens: int = 4000 # half a block; was 8000
|
||||||
|
pd_sep_min_extra_cache_tokens: int = 2000 # half a block; was 4000
|
||||||
pd_sep_margin_s: float = 0.2 # require cost gap > 0.2 s before migrating
|
pd_sep_margin_s: float = 0.2 # require cost gap > 0.2 s before migrating
|
||||||
# Patch 6.6: per-request KV-xfer wall-clock timeout (proxy side).
|
# Patch 6.6: per-request KV-xfer wall-clock timeout (proxy side).
|
||||||
pd_sep_xfer_timeout_s: float = 60.0
|
pd_sep_xfer_timeout_s: float = 60.0
|
||||||
@@ -461,12 +464,18 @@ def pick_instance_unified_v2(
|
|||||||
decision["v2_reason"] = f"new_local_below_threshold ({new_local} < {SETTINGS.pd_sep_min_new_tokens})"
|
decision["v2_reason"] = f"new_local_below_threshold ({new_local} < {SETTINGS.pd_sep_min_new_tokens})"
|
||||||
return chosen, chosen_idx, decision, None
|
return chosen, chosen_idx, decision, None
|
||||||
|
|
||||||
# Hard gate 2: chosen must have live decodes worth protecting.
|
# Hard gate 2: chosen must have live decoding work to protect.
|
||||||
if chosen.ongoing_decode_tokens // max(1, SETTINGS.heavy_threshold) < 1 \
|
# v2.1 simplification: pure ongoing_decode_tokens check. The previous
|
||||||
and chosen.num_requests < SETTINGS.pd_sep_min_decodes_protected:
|
# gate combined num_requests and decode_tokens with AND, but
|
||||||
# Heuristic for "num_decodes": prefer num_requests as an upper
|
# num_requests includes requests still in prefill — adding a prefill
|
||||||
# bound since we don't track decode count separately at route time.
|
# to a chosen that has only its own prefill running doesn't disrupt
|
||||||
decision["v2_reason"] = f"chosen_few_decodes ({chosen.num_requests})"
|
# any decode, so skipping makes sense. The right semantic is "skip
|
||||||
|
# iff no decode is currently happening on chosen".
|
||||||
|
if chosen.ongoing_decode_tokens == 0:
|
||||||
|
decision["v2_reason"] = (
|
||||||
|
f"chosen_no_active_decode "
|
||||||
|
f"(num_req={chosen.num_requests} decode_tok={chosen.ongoing_decode_tokens})"
|
||||||
|
)
|
||||||
return chosen, chosen_idx, decision, None
|
return chosen, chosen_idx, decision, None
|
||||||
|
|
||||||
# Find best alternative cache source.
|
# Find best alternative cache source.
|
||||||
@@ -684,7 +693,10 @@ async def lifespan(app: FastAPI):
|
|||||||
|
|
||||||
# Bootstrap combined instances for offload (need engine_ids for KV transfer)
|
# Bootstrap combined instances for offload (need engine_ids for KV transfer)
|
||||||
policy = getattr(global_args, 'policy', 'linear')
|
policy = getattr(global_args, 'policy', 'linear')
|
||||||
needs_bootstrap = global_args.offload or policy == "unified_v2"
|
needs_bootstrap = (
|
||||||
|
global_args.offload
|
||||||
|
or policy in ("unified_v2", "unified_kv_both")
|
||||||
|
)
|
||||||
if needs_bootstrap and bp_list:
|
if needs_bootstrap and bp_list:
|
||||||
await init_prefill_bootstrap(combined_instances, app.state.ready)
|
await init_prefill_bootstrap(combined_instances, app.state.ready)
|
||||||
elif needs_bootstrap and not bp_list:
|
elif needs_bootstrap and not bp_list:
|
||||||
@@ -846,7 +858,11 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
|
|||||||
chosen, best_idx = pick_instance_sticky(
|
chosen, best_idx = pick_instance_sticky(
|
||||||
combined_instances, token_ids, session_id, input_length,
|
combined_instances, token_ids, session_id, input_length,
|
||||||
session_affinity_combined)
|
session_affinity_combined)
|
||||||
elif policy == "unified":
|
elif policy == "unified" or policy == "unified_kv_both":
|
||||||
|
# unified_kv_both: same picker as `unified`, but the vLLMs are
|
||||||
|
# launched in kv_role=kv_both. Use this as an isolation control
|
||||||
|
# for `unified_v2` so the v2-vs-v1 gap reflects only the PD-sep
|
||||||
|
# branch, not the kv_both always-on overhead.
|
||||||
chosen, best_idx, decision = pick_instance_unified_hybrid(
|
chosen, best_idx, decision = pick_instance_unified_hybrid(
|
||||||
combined_instances, token_ids, session_id, input_length,
|
combined_instances, token_ids, session_id, input_length,
|
||||||
session_affinity_combined)
|
session_affinity_combined)
|
||||||
@@ -1206,11 +1222,13 @@ def parse_args():
|
|||||||
help="Comma-separated bootstrap ports for combined instances (for offload mode)")
|
help="Comma-separated bootstrap ports for combined instances (for offload mode)")
|
||||||
p.add_argument("--policy", type=str, default="linear",
|
p.add_argument("--policy", type=str, default="linear",
|
||||||
choices=["linear", "lmetric", "load_only", "sticky",
|
choices=["linear", "lmetric", "load_only", "sticky",
|
||||||
"unified", "unified_v2"],
|
"unified", "unified_kv_both", "unified_v2"],
|
||||||
help="Routing policy: linear (cache-aware), lmetric (P_tokens × BS), "
|
help="Routing policy: linear (cache-aware), lmetric (P_tokens × BS), "
|
||||||
"load_only (B3 control: pure min-num_requests), "
|
"load_only (B3 control: pure min-num_requests), "
|
||||||
"sticky (B3 control: hard session affinity), "
|
"sticky (B3 control: hard session affinity), "
|
||||||
"unified (hybrid affinity + LMetric fallback), "
|
"unified (hybrid affinity + LMetric fallback), "
|
||||||
|
"unified_kv_both (unified on kv_both vLLMs; isolation "
|
||||||
|
"control for unified_v2; PD-sep never triggers), "
|
||||||
"or unified_v2 (unified + selective per-request PD-sep "
|
"or unified_v2 (unified + selective per-request PD-sep "
|
||||||
"via Mooncake; requires --bootstrap-ports and "
|
"via Mooncake; requires --bootstrap-ports and "
|
||||||
"kv_role=kv_both vLLM launch)")
|
"kv_role=kv_both vLLM launch)")
|
||||||
|
|||||||
@@ -410,12 +410,12 @@ def test_unified_v2_triggers_when_src_has_meaningful_cache_and_chosen_has_decode
|
|||||||
|
|
||||||
|
|
||||||
def test_unified_v2_falls_through_when_chosen_has_no_decodes(proxy):
|
def test_unified_v2_falls_through_when_chosen_has_no_decodes(proxy):
|
||||||
"""No decodes on chosen → no benefit from PD-sep."""
|
"""No decoding work on chosen → no benefit from PD-sep."""
|
||||||
insts, prefix = _setup_v2_scene(proxy, chosen_decodes=0, src_cache_blocks=128)
|
insts, prefix = _setup_v2_scene(proxy, chosen_decodes=0, src_cache_blocks=128)
|
||||||
chosen, idx, decision, pd_sep = proxy.pick_instance_unified_v2(
|
chosen, idx, decision, pd_sep = proxy.pick_instance_unified_v2(
|
||||||
insts, prefix, None, len(prefix), {})
|
insts, prefix, None, len(prefix), {})
|
||||||
assert pd_sep is None
|
assert pd_sep is None
|
||||||
assert "few_decodes" in decision["v2_reason"]
|
assert "no_active_decode" in decision["v2_reason"]
|
||||||
|
|
||||||
|
|
||||||
def test_estimate_transfer_cost_is_calibrated_function(proxy):
|
def test_estimate_transfer_cost_is_calibrated_function(proxy):
|
||||||
|
|||||||
Reference in New Issue
Block a user