feat(policy): cold-D bonus to break overlap-pinning death spiral

KvAwarePolicy now accepts an optional cold_d_bonus int. When > 0,
fresh requests (sticky=0, i.e. no prior D for this session) receive
the bonus added to lex-score position 0 (overlap+sticky_bonus) for
any D worker that has never been assigned a session yet
(decode_assignment_counts == 0). This breaks the pathology
documented in docs/E1_E2_RESULTS_ZH.md §5d where workloads with
shared cross-session prefix (e.g. Inferact's "permissions
instructions" boilerplate) cause every D that has hosted any session
to dominate the overlap term against any cold D, leaving the cold D
permanently unused.

Sticky behavior is preserved: turn 1+ requests of an existing
session continue to stick to their original D because the bonus is
gated on `not sticky`.

Plumbed through ReplayConfig.kvcache_cold_d_bonus (default 0,
keeping current behavior unchanged), BenchmarkConfig, and CLI flag
--kvcache-cold-d-bonus on both `replay` and `benchmark-live`
subcommands. Set above max expected boilerplate overlap (Inferact's
~50 24-token blocks → 1000 is safe).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
tim
2026-05-12 11:14:00 +08:00
parent bf4da281c0
commit 786cbb8d91
4 changed files with 67 additions and 3 deletions

View File

@@ -48,6 +48,7 @@ class BenchmarkConfig:
enable_backpressure: bool = False
backpressure_max_pause_s: float = 2.0
kvcache_migration_reject_threshold: int = 3
kvcache_cold_d_bonus: int = 0
sample_profile: str = "default"
min_initial_input_tokens: int | None = None
max_initial_input_tokens: int | None = None
@@ -200,6 +201,7 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts:
enable_backpressure=config.enable_backpressure,
backpressure_max_pause_s=config.backpressure_max_pause_s,
kvcache_migration_reject_threshold=config.kvcache_migration_reject_threshold,
kvcache_cold_d_bonus=config.kvcache_cold_d_bonus,
)
if config.request_timeout_s is not None:
replay_config = replace(
@@ -261,6 +263,7 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts:
"enable_backpressure": config.enable_backpressure,
"backpressure_max_pause_s": config.backpressure_max_pause_s,
"kvcache_migration_reject_threshold": config.kvcache_migration_reject_threshold,
"kvcache_cold_d_bonus": config.kvcache_cold_d_bonus,
"sample_profile": config.sample_profile,
"min_initial_input_tokens": config.min_initial_input_tokens,
"max_initial_input_tokens": config.max_initial_input_tokens,

View File

@@ -270,6 +270,19 @@ def main() -> None:
"See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1."
),
)
replay.add_argument(
"--kvcache-cold-d-bonus",
type=int,
default=0,
help=(
"When > 0, fresh sessions (sticky=0) get a synthetic boost added to "
"the lex-score of any D worker that has never been assigned a session "
"yet. Set above max expected cross-session boilerplate overlap "
"(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning "
"death spiral on workloads with shared system prompts. "
"See docs/E1_E2_RESULTS_ZH.md §5d."
),
)
sample = subparsers.add_parser(
"sample-sessions",
@@ -521,6 +534,19 @@ def main() -> None:
"See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1."
),
)
benchmark.add_argument(
"--kvcache-cold-d-bonus",
type=int,
default=0,
help=(
"When > 0, fresh sessions (sticky=0) get a synthetic boost added to "
"the lex-score of any D worker that has never been assigned a session "
"yet. Set above max expected cross-session boilerplate overlap "
"(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning "
"death spiral on workloads with shared system prompts. "
"See docs/E1_E2_RESULTS_ZH.md §5d."
),
)
benchmark.add_argument(
"--sample-profile",
choices=["default", "small-append"],
@@ -607,6 +633,7 @@ def main() -> None:
enable_backpressure=args.enable_backpressure,
backpressure_max_pause_s=args.backpressure_max_pause_s,
kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold,
kvcache_cold_d_bonus=args.kvcache_cold_d_bonus,
)
results = asyncio.run(replay_trace(config))
print(
@@ -754,6 +781,7 @@ def main() -> None:
enable_backpressure=args.enable_backpressure,
backpressure_max_pause_s=args.backpressure_max_pause_s,
kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold,
kvcache_cold_d_bonus=args.kvcache_cold_d_bonus,
sample_profile=args.sample_profile,
min_initial_input_tokens=args.min_initial_input_tokens,
max_initial_input_tokens=args.max_initial_input_tokens,

View File

@@ -161,6 +161,24 @@ class KvAwarePolicy:
# 0 disables the mechanism. Default 3 picked empirically to allow brief
# transient saturation without panicking, but to reroute persistent starvation.
migration_reject_threshold: int = 3
# Cold-D bonus: workloads with shared cross-session prefixes (e.g. all
# sessions begin with the same "permissions instructions" boilerplate, as
# in Inferact codex_swebenchpro) cause every D that has hosted any session
# to win the `overlap` term against any D that has not. The result is
# permanent imbalance — D2 stays unused for the whole run, the migration
# mechanism above never fires (since it only triggers on capacity rejects,
# and you have to actually try a D for it to reject you), and D0/D1
# eventually saturate (see docs/E1_E2_RESULTS_ZH.md §5d).
#
# When > 0, a D worker that has never been assigned a session yet
# receives a synthetic boost added to position 0 of the lex score.
# This boost ONLY applies when the request has no sticky preference for
# any other D (i.e. sticky == 0) — so turn 1+ requests of an existing
# session continue to stick, and only fresh sessions are diverted to cold
# D's. Set above the maximum cross-session boilerplate overlap you expect
# (Inferact's shared system prompt overlaps ~50 24-token blocks → set to
# 1000 to be safe).
cold_d_bonus: int = 0
def select(
self,
@@ -190,8 +208,10 @@ class KvAwarePolicy:
sticky = int(session is not None and session.last_decode_worker == worker.worker_id)
inflight_penalty = -state.inflight_decode.get(worker.worker_id, 0)
assignment_penalty = -state.decode_assignment_counts.get(worker.worker_id, 0)
is_cold = state.decode_assignment_counts.get(worker.worker_id, 0) == 0
cold_boost = self.cold_d_bonus if (is_cold and not sticky) else 0
score = (
overlap + sticky * self.sticky_bonus,
overlap + sticky * self.sticky_bonus + cold_boost,
sticky,
inflight_penalty,
assignment_penalty,
@@ -223,14 +243,22 @@ class KvAwarePolicy:
)
def create_policy(name: str, *, migration_reject_threshold: int = 3) -> RoutingPolicy:
def create_policy(
name: str,
*,
migration_reject_threshold: int = 3,
cold_d_bonus: int = 0,
) -> RoutingPolicy:
normalized = name.strip().lower()
if normalized == "default":
return DefaultPolicy()
if normalized == "sticky":
return StickyDecodePolicy()
if normalized in {"kv-aware", "kv_aware", "kv"}:
return KvAwarePolicy(migration_reject_threshold=migration_reject_threshold)
return KvAwarePolicy(
migration_reject_threshold=migration_reject_threshold,
cold_d_bonus=cold_d_bonus,
)
raise ValueError(f"Unsupported policy: {name}")

View File

@@ -111,6 +111,10 @@ class ReplayConfig:
# KvAwarePolicy skips that D for the session (forcing migration). Default 3.
# Set 0 to disable. See REFACTOR_PLAN_V1 §6.2.
kvcache_migration_reject_threshold: int = 3
# Cold-D bonus: synthetic boost to lex-score position 0 for any D worker
# that has never been assigned a session yet, applied only when the request
# has no sticky preference. 0 disables. See docs/E1_E2_RESULTS_ZH.md §5d.
kvcache_cold_d_bonus: int = 0
structural_log_dir: Path | None = None
@@ -198,6 +202,7 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
policy = create_policy(
config.policy_name,
migration_reject_threshold=config.kvcache_migration_reject_threshold,
cold_d_bonus=config.kvcache_cold_d_bonus,
)
state = RoutingState.create(config.topology)
state_lock = asyncio.Lock()