From 976115ea5eb463f084f1d98760fc63a12df5e13f Mon Sep 17 00:00:00 2001
From: tim <VtimothyeveeE@europe.com>
Date: Tue, 12 May 2026 11:17:16 +0800
Subject: [PATCH] Revert "feat(policy): cold-D bonus to break overlap-pinning
 death spiral"

The implementation jumped ahead of the design. The cold-D bonus is
one of several candidate fixes for the overlap-pinning death spiral
(others: load-floor bonus, idle-D bonus, capacity-aware overlap
discount, pre-warming boilerplate). We need to evaluate the design
space first, including whether a single bonus is even the right shape
versus a separate term in the lexicographic score, before committing
to a specific knob.

This reverts commit 786cbb8 cleanly (forensic docs in bf4da28 and
7f2ebf3 are kept since they record observations, not designs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/agentic_pd_hybrid/benchmark.py |  3 ---
 src/agentic_pd_hybrid/cli.py       | 28 ------------------------
 src/agentic_pd_hybrid/policies.py  | 34 +++---------------------------
 src/agentic_pd_hybrid/replay.py    |  5 -----
 4 files changed, 3 insertions(+), 67 deletions(-)

diff --git a/src/agentic_pd_hybrid/benchmark.py b/src/agentic_pd_hybrid/benchmark.py
index 39062d4..055247c 100644
--- a/src/agentic_pd_hybrid/benchmark.py
+++ b/src/agentic_pd_hybrid/benchmark.py
@@ -48,7 +48,6 @@ class BenchmarkConfig:
     enable_backpressure: bool = False
     backpressure_max_pause_s: float = 2.0
     kvcache_migration_reject_threshold: int = 3
-    kvcache_cold_d_bonus: int = 0
     sample_profile: str = "default"
     min_initial_input_tokens: int | None = None
     max_initial_input_tokens: int | None = None
@@ -201,7 +200,6 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts:
             enable_backpressure=config.enable_backpressure,
             backpressure_max_pause_s=config.backpressure_max_pause_s,
             kvcache_migration_reject_threshold=config.kvcache_migration_reject_threshold,
-            kvcache_cold_d_bonus=config.kvcache_cold_d_bonus,
         )
         if config.request_timeout_s is not None:
             replay_config = replace(
@@ -263,7 +261,6 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts:
                 "enable_backpressure": config.enable_backpressure,
                 "backpressure_max_pause_s": config.backpressure_max_pause_s,
                 "kvcache_migration_reject_threshold": config.kvcache_migration_reject_threshold,
-                "kvcache_cold_d_bonus": config.kvcache_cold_d_bonus,
                 "sample_profile": config.sample_profile,
                 "min_initial_input_tokens": config.min_initial_input_tokens,
                 "max_initial_input_tokens": config.max_initial_input_tokens,
diff --git a/src/agentic_pd_hybrid/cli.py b/src/agentic_pd_hybrid/cli.py
index 3c4fc30..ca2777c 100644
--- a/src/agentic_pd_hybrid/cli.py
+++ b/src/agentic_pd_hybrid/cli.py
@@ -270,19 +270,6 @@ def main() -> None:
             "See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1."
         ),
     )
-    replay.add_argument(
-        "--kvcache-cold-d-bonus",
-        type=int,
-        default=0,
-        help=(
-            "When > 0, fresh sessions (sticky=0) get a synthetic boost added to "
-            "the lex-score of any D worker that has never been assigned a session "
-            "yet. Set above max expected cross-session boilerplate overlap "
-            "(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning "
-            "death spiral on workloads with shared system prompts. "
-            "See docs/E1_E2_RESULTS_ZH.md §5d."
-        ),
-    )
 
     sample = subparsers.add_parser(
         "sample-sessions",
@@ -534,19 +521,6 @@ def main() -> None:
             "See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1."
         ),
     )
-    benchmark.add_argument(
-        "--kvcache-cold-d-bonus",
-        type=int,
-        default=0,
-        help=(
-            "When > 0, fresh sessions (sticky=0) get a synthetic boost added to "
-            "the lex-score of any D worker that has never been assigned a session "
-            "yet. Set above max expected cross-session boilerplate overlap "
-            "(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning "
-            "death spiral on workloads with shared system prompts. "
-            "See docs/E1_E2_RESULTS_ZH.md §5d."
-        ),
-    )
     benchmark.add_argument(
         "--sample-profile",
         choices=["default", "small-append"],
@@ -633,7 +607,6 @@ def main() -> None:
             enable_backpressure=args.enable_backpressure,
             backpressure_max_pause_s=args.backpressure_max_pause_s,
             kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold,
-            kvcache_cold_d_bonus=args.kvcache_cold_d_bonus,
         )
         results = asyncio.run(replay_trace(config))
         print(
@@ -781,7 +754,6 @@ def main() -> None:
                 enable_backpressure=args.enable_backpressure,
                 backpressure_max_pause_s=args.backpressure_max_pause_s,
                 kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold,
-            kvcache_cold_d_bonus=args.kvcache_cold_d_bonus,
                 sample_profile=args.sample_profile,
                 min_initial_input_tokens=args.min_initial_input_tokens,
                 max_initial_input_tokens=args.max_initial_input_tokens,
diff --git a/src/agentic_pd_hybrid/policies.py b/src/agentic_pd_hybrid/policies.py
index 6344f02..162bf84 100644
--- a/src/agentic_pd_hybrid/policies.py
+++ b/src/agentic_pd_hybrid/policies.py
@@ -161,24 +161,6 @@ class KvAwarePolicy:
     # 0 disables the mechanism. Default 3 picked empirically to allow brief
     # transient saturation without panicking, but to reroute persistent starvation.
     migration_reject_threshold: int = 3
-    # Cold-D bonus: workloads with shared cross-session prefixes (e.g. all
-    # sessions begin with the same "permissions instructions" boilerplate, as
-    # in Inferact codex_swebenchpro) cause every D that has hosted any session
-    # to win the `overlap` term against any D that has not. The result is
-    # permanent imbalance — D2 stays unused for the whole run, the migration
-    # mechanism above never fires (since it only triggers on capacity rejects,
-    # and you have to actually try a D for it to reject you), and D0/D1
-    # eventually saturate (see docs/E1_E2_RESULTS_ZH.md §5d).
-    #
-    # When > 0, a D worker that has never been assigned a session yet
-    # receives a synthetic boost added to position 0 of the lex score.
-    # This boost ONLY applies when the request has no sticky preference for
-    # any other D (i.e. sticky == 0) — so turn 1+ requests of an existing
-    # session continue to stick, and only fresh sessions are diverted to cold
-    # D's. Set above the maximum cross-session boilerplate overlap you expect
-    # (Inferact's shared system prompt overlaps ~50 24-token blocks → set to
-    # 1000 to be safe).
-    cold_d_bonus: int = 0
 
     def select(
         self,
@@ -208,10 +190,8 @@ class KvAwarePolicy:
             sticky = int(session is not None and session.last_decode_worker == worker.worker_id)
             inflight_penalty = -state.inflight_decode.get(worker.worker_id, 0)
             assignment_penalty = -state.decode_assignment_counts.get(worker.worker_id, 0)
-            is_cold = state.decode_assignment_counts.get(worker.worker_id, 0) == 0
-            cold_boost = self.cold_d_bonus if (is_cold and not sticky) else 0
             score = (
-                overlap + sticky * self.sticky_bonus + cold_boost,
+                overlap + sticky * self.sticky_bonus,
                 sticky,
                 inflight_penalty,
                 assignment_penalty,
@@ -243,22 +223,14 @@ class KvAwarePolicy:
         )
 
 
-def create_policy(
-    name: str,
-    *,
-    migration_reject_threshold: int = 3,
-    cold_d_bonus: int = 0,
-) -> RoutingPolicy:
+def create_policy(name: str, *, migration_reject_threshold: int = 3) -> RoutingPolicy:
     normalized = name.strip().lower()
     if normalized == "default":
         return DefaultPolicy()
     if normalized == "sticky":
         return StickyDecodePolicy()
     if normalized in {"kv-aware", "kv_aware", "kv"}:
-        return KvAwarePolicy(
-            migration_reject_threshold=migration_reject_threshold,
-            cold_d_bonus=cold_d_bonus,
-        )
+        return KvAwarePolicy(migration_reject_threshold=migration_reject_threshold)
     raise ValueError(f"Unsupported policy: {name}")
 
 
diff --git a/src/agentic_pd_hybrid/replay.py b/src/agentic_pd_hybrid/replay.py
index 15a7c24..9e065ef 100644
--- a/src/agentic_pd_hybrid/replay.py
+++ b/src/agentic_pd_hybrid/replay.py
@@ -111,10 +111,6 @@ class ReplayConfig:
     # KvAwarePolicy skips that D for the session (forcing migration). Default 3.
     # Set 0 to disable. See REFACTOR_PLAN_V1 §6.2.
     kvcache_migration_reject_threshold: int = 3
-    # Cold-D bonus: synthetic boost to lex-score position 0 for any D worker
-    # that has never been assigned a session yet, applied only when the request
-    # has no sticky preference. 0 disables. See docs/E1_E2_RESULTS_ZH.md §5d.
-    kvcache_cold_d_bonus: int = 0
     structural_log_dir: Path | None = None
 
 
@@ -202,7 +198,6 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
     policy = create_policy(
         config.policy_name,
         migration_reject_threshold=config.kvcache_migration_reject_threshold,
-        cold_d_bonus=config.kvcache_cold_d_bonus,
     )
     state = RoutingState.create(config.topology)
     state_lock = asyncio.Lock()