From 976115ea5eb463f084f1d98760fc63a12df5e13f Mon Sep 17 00:00:00 2001 From: tim Date: Tue, 12 May 2026 11:17:16 +0800 Subject: [PATCH] Revert "feat(policy): cold-D bonus to break overlap-pinning death spiral" Implementation jumped ahead of design. The cold-D bonus is one of several candidates for the overlap-pinning fix (others: load-floor bonus, idle-D bonus, capacity-aware overlap discount, pre-warming boilerplate). Need to evaluate the design space first, including whether a single bonus is even the right shape vs a separate term in the lex score, before committing to a specific knob. This reverts commit 786cbb8 cleanly (forensic docs in bf4da28 and 7f2ebf3 are kept since they record observations, not designs). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agentic_pd_hybrid/benchmark.py | 3 --- src/agentic_pd_hybrid/cli.py | 28 ------------------------ src/agentic_pd_hybrid/policies.py | 34 +++--------------------------- src/agentic_pd_hybrid/replay.py | 5 ----- 4 files changed, 3 insertions(+), 67 deletions(-) diff --git a/src/agentic_pd_hybrid/benchmark.py b/src/agentic_pd_hybrid/benchmark.py index 39062d4..055247c 100644 --- a/src/agentic_pd_hybrid/benchmark.py +++ b/src/agentic_pd_hybrid/benchmark.py @@ -48,7 +48,6 @@ class BenchmarkConfig: enable_backpressure: bool = False backpressure_max_pause_s: float = 2.0 kvcache_migration_reject_threshold: int = 3 - kvcache_cold_d_bonus: int = 0 sample_profile: str = "default" min_initial_input_tokens: int | None = None max_initial_input_tokens: int | None = None @@ -201,7 +200,6 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts: enable_backpressure=config.enable_backpressure, backpressure_max_pause_s=config.backpressure_max_pause_s, kvcache_migration_reject_threshold=config.kvcache_migration_reject_threshold, - kvcache_cold_d_bonus=config.kvcache_cold_d_bonus, ) if config.request_timeout_s is not None: replay_config = replace( @@ -263,7 +261,6 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts: "enable_backpressure": config.enable_backpressure, "backpressure_max_pause_s": config.backpressure_max_pause_s, "kvcache_migration_reject_threshold": config.kvcache_migration_reject_threshold, - "kvcache_cold_d_bonus": config.kvcache_cold_d_bonus, "sample_profile": config.sample_profile, "min_initial_input_tokens": config.min_initial_input_tokens, "max_initial_input_tokens": config.max_initial_input_tokens, diff --git a/src/agentic_pd_hybrid/cli.py b/src/agentic_pd_hybrid/cli.py index 3c4fc30..ca2777c 100644 --- a/src/agentic_pd_hybrid/cli.py +++ b/src/agentic_pd_hybrid/cli.py @@ -270,19 +270,6 @@ def main() -> None: "See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1." ), ) - replay.add_argument( - "--kvcache-cold-d-bonus", - type=int, - default=0, - help=( - "When > 0, fresh sessions (sticky=0) get a synthetic boost added to " - "the lex-score of any D worker that has never been assigned a session " - "yet. Set above max expected cross-session boilerplate overlap " - "(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning " - "death spiral on workloads with shared system prompts. " - "See docs/E1_E2_RESULTS_ZH.md §5d." - ), - ) sample = subparsers.add_parser( "sample-sessions", @@ -534,19 +521,6 @@ def main() -> None: "See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1." ), ) - benchmark.add_argument( - "--kvcache-cold-d-bonus", - type=int, - default=0, - help=( - "When > 0, fresh sessions (sticky=0) get a synthetic boost added to " - "the lex-score of any D worker that has never been assigned a session " - "yet. Set above max expected cross-session boilerplate overlap " - "(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning " - "death spiral on workloads with shared system prompts. " - "See docs/E1_E2_RESULTS_ZH.md §5d." - ), - ) benchmark.add_argument( "--sample-profile", choices=["default", "small-append"], @@ -633,7 +607,6 @@ def main() -> None: enable_backpressure=args.enable_backpressure, backpressure_max_pause_s=args.backpressure_max_pause_s, kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold, - kvcache_cold_d_bonus=args.kvcache_cold_d_bonus, ) results = asyncio.run(replay_trace(config)) print( @@ -781,7 +754,6 @@ def main() -> None: enable_backpressure=args.enable_backpressure, backpressure_max_pause_s=args.backpressure_max_pause_s, kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold, - kvcache_cold_d_bonus=args.kvcache_cold_d_bonus, sample_profile=args.sample_profile, min_initial_input_tokens=args.min_initial_input_tokens, max_initial_input_tokens=args.max_initial_input_tokens, diff --git a/src/agentic_pd_hybrid/policies.py b/src/agentic_pd_hybrid/policies.py index 6344f02..162bf84 100644 --- a/src/agentic_pd_hybrid/policies.py +++ b/src/agentic_pd_hybrid/policies.py @@ -161,24 +161,6 @@ class KvAwarePolicy: # 0 disables the mechanism. Default 3 picked empirically to allow brief # transient saturation without panicking, but to reroute persistent starvation. migration_reject_threshold: int = 3 - # Cold-D bonus: workloads with shared cross-session prefixes (e.g. all - # sessions begin with the same "permissions instructions" boilerplate, as - # in Inferact codex_swebenchpro) cause every D that has hosted any session - # to win the `overlap` term against any D that has not. The result is - # permanent imbalance — D2 stays unused for the whole run, the migration - # mechanism above never fires (since it only triggers on capacity rejects, - # and you have to actually try a D for it to reject you), and D0/D1 - # eventually saturate (see docs/E1_E2_RESULTS_ZH.md §5d). - # - # When > 0, a D worker that has never been assigned a session yet - # receives a synthetic boost added to position 0 of the lex score. - # This boost ONLY applies when the request has no sticky preference for - # any other D (i.e. sticky == 0) — so turn 1+ requests of an existing - # session continue to stick, and only fresh sessions are diverted to cold - # D's. Set above the maximum cross-session boilerplate overlap you expect - # (Inferact's shared system prompt overlaps ~50 24-token blocks → set to - # 1000 to be safe). - cold_d_bonus: int = 0 def select( self, @@ -208,10 +190,8 @@ class KvAwarePolicy: sticky = int(session is not None and session.last_decode_worker == worker.worker_id) inflight_penalty = -state.inflight_decode.get(worker.worker_id, 0) assignment_penalty = -state.decode_assignment_counts.get(worker.worker_id, 0) - is_cold = state.decode_assignment_counts.get(worker.worker_id, 0) == 0 - cold_boost = self.cold_d_bonus if (is_cold and not sticky) else 0 score = ( - overlap + sticky * self.sticky_bonus + cold_boost, + overlap + sticky * self.sticky_bonus, sticky, inflight_penalty, assignment_penalty, @@ -243,22 +223,14 @@ class KvAwarePolicy: ) -def create_policy( - name: str, - *, - migration_reject_threshold: int = 3, - cold_d_bonus: int = 0, -) -> RoutingPolicy: +def create_policy(name: str, *, migration_reject_threshold: int = 3) -> RoutingPolicy: normalized = name.strip().lower() if normalized == "default": return DefaultPolicy() if normalized == "sticky": return StickyDecodePolicy() if normalized in {"kv-aware", "kv_aware", "kv"}: - return KvAwarePolicy( - migration_reject_threshold=migration_reject_threshold, - cold_d_bonus=cold_d_bonus, - ) + return KvAwarePolicy(migration_reject_threshold=migration_reject_threshold) raise ValueError(f"Unsupported policy: {name}") diff --git a/src/agentic_pd_hybrid/replay.py b/src/agentic_pd_hybrid/replay.py index 15a7c24..9e065ef 100644 --- a/src/agentic_pd_hybrid/replay.py +++ b/src/agentic_pd_hybrid/replay.py @@ -111,10 +111,6 @@ class ReplayConfig: # KvAwarePolicy skips that D for the session (forcing migration). Default 3. # Set 0 to disable. See REFACTOR_PLAN_V1 §6.2. kvcache_migration_reject_threshold: int = 3 - # Cold-D bonus: synthetic boost to lex-score position 0 for any D worker - # that has never been assigned a session yet, applied only when the request - # has no sticky preference. 0 disables. See docs/E1_E2_RESULTS_ZH.md §5d. - kvcache_cold_d_bonus: int = 0 structural_log_dir: Path | None = None @@ -202,7 +198,6 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]: policy = create_policy( config.policy_name, migration_reject_threshold=config.kvcache_migration_reject_threshold, - cold_d_bonus=config.kvcache_cold_d_bonus, ) state = RoutingState.create(config.topology) state_lock = asyncio.Lock()