Revert "feat(policy): cold-D bonus to break overlap-pinning death spiral"
Implementation jumped ahead of design. The cold-D bonus is one of several candidates for the overlap-pinning fix (others: load-floor bonus, idle-D bonus, capacity-aware overlap discount, pre-warming boilerplate). Need to evaluate the design space first, including whether a single bonus is even the right shape vs a separate term in the lex score, before committing to a specific knob. This reverts commit786cbb8cleanly (forensic docs inbf4da28and7f2ebf3are kept since they record observations, not designs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -48,7 +48,6 @@ class BenchmarkConfig:
|
||||
enable_backpressure: bool = False
|
||||
backpressure_max_pause_s: float = 2.0
|
||||
kvcache_migration_reject_threshold: int = 3
|
||||
kvcache_cold_d_bonus: int = 0
|
||||
sample_profile: str = "default"
|
||||
min_initial_input_tokens: int | None = None
|
||||
max_initial_input_tokens: int | None = None
|
||||
@@ -201,7 +200,6 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts:
|
||||
enable_backpressure=config.enable_backpressure,
|
||||
backpressure_max_pause_s=config.backpressure_max_pause_s,
|
||||
kvcache_migration_reject_threshold=config.kvcache_migration_reject_threshold,
|
||||
kvcache_cold_d_bonus=config.kvcache_cold_d_bonus,
|
||||
)
|
||||
if config.request_timeout_s is not None:
|
||||
replay_config = replace(
|
||||
@@ -263,7 +261,6 @@ def run_live_benchmark(config: BenchmarkConfig) -> BenchmarkArtifacts:
|
||||
"enable_backpressure": config.enable_backpressure,
|
||||
"backpressure_max_pause_s": config.backpressure_max_pause_s,
|
||||
"kvcache_migration_reject_threshold": config.kvcache_migration_reject_threshold,
|
||||
"kvcache_cold_d_bonus": config.kvcache_cold_d_bonus,
|
||||
"sample_profile": config.sample_profile,
|
||||
"min_initial_input_tokens": config.min_initial_input_tokens,
|
||||
"max_initial_input_tokens": config.max_initial_input_tokens,
|
||||
|
||||
@@ -270,19 +270,6 @@ def main() -> None:
|
||||
"See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1."
|
||||
),
|
||||
)
|
||||
replay.add_argument(
|
||||
"--kvcache-cold-d-bonus",
|
||||
type=int,
|
||||
default=0,
|
||||
help=(
|
||||
"When > 0, fresh sessions (sticky=0) get a synthetic boost added to "
|
||||
"the lex-score of any D worker that has never been assigned a session "
|
||||
"yet. Set above max expected cross-session boilerplate overlap "
|
||||
"(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning "
|
||||
"death spiral on workloads with shared system prompts. "
|
||||
"See docs/E1_E2_RESULTS_ZH.md §5d."
|
||||
),
|
||||
)
|
||||
|
||||
sample = subparsers.add_parser(
|
||||
"sample-sessions",
|
||||
@@ -534,19 +521,6 @@ def main() -> None:
|
||||
"See REFACTOR_PLAN_V1 §6.2 / TEAM_REPORT §2.1."
|
||||
),
|
||||
)
|
||||
benchmark.add_argument(
|
||||
"--kvcache-cold-d-bonus",
|
||||
type=int,
|
||||
default=0,
|
||||
help=(
|
||||
"When > 0, fresh sessions (sticky=0) get a synthetic boost added to "
|
||||
"the lex-score of any D worker that has never been assigned a session "
|
||||
"yet. Set above max expected cross-session boilerplate overlap "
|
||||
"(~50 blocks for Inferact → use 1000). Breaks the overlap-pinning "
|
||||
"death spiral on workloads with shared system prompts. "
|
||||
"See docs/E1_E2_RESULTS_ZH.md §5d."
|
||||
),
|
||||
)
|
||||
benchmark.add_argument(
|
||||
"--sample-profile",
|
||||
choices=["default", "small-append"],
|
||||
@@ -633,7 +607,6 @@ def main() -> None:
|
||||
enable_backpressure=args.enable_backpressure,
|
||||
backpressure_max_pause_s=args.backpressure_max_pause_s,
|
||||
kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold,
|
||||
kvcache_cold_d_bonus=args.kvcache_cold_d_bonus,
|
||||
)
|
||||
results = asyncio.run(replay_trace(config))
|
||||
print(
|
||||
@@ -781,7 +754,6 @@ def main() -> None:
|
||||
enable_backpressure=args.enable_backpressure,
|
||||
backpressure_max_pause_s=args.backpressure_max_pause_s,
|
||||
kvcache_migration_reject_threshold=args.kvcache_migration_reject_threshold,
|
||||
kvcache_cold_d_bonus=args.kvcache_cold_d_bonus,
|
||||
sample_profile=args.sample_profile,
|
||||
min_initial_input_tokens=args.min_initial_input_tokens,
|
||||
max_initial_input_tokens=args.max_initial_input_tokens,
|
||||
|
||||
@@ -161,24 +161,6 @@ class KvAwarePolicy:
|
||||
# 0 disables the mechanism. Default 3 picked empirically to allow brief
|
||||
# transient saturation without panicking, but to reroute persistent starvation.
|
||||
migration_reject_threshold: int = 3
|
||||
# Cold-D bonus: workloads with shared cross-session prefixes (e.g. all
|
||||
# sessions begin with the same "permissions instructions" boilerplate, as
|
||||
# in Inferact codex_swebenchpro) cause every D that has hosted any session
|
||||
# to win the `overlap` term against any D that has not. The result is
|
||||
# permanent imbalance — D2 stays unused for the whole run, the migration
|
||||
# mechanism above never fires (since it only triggers on capacity rejects,
|
||||
# and you have to actually try a D for it to reject you), and D0/D1
|
||||
# eventually saturate (see docs/E1_E2_RESULTS_ZH.md §5d).
|
||||
#
|
||||
# When > 0, a D worker that has never been assigned a session yet
|
||||
# receives a synthetic boost added to position 0 of the lex score.
|
||||
# This boost ONLY applies when the request has no sticky preference for
|
||||
# any other D (i.e. sticky == 0) — so turn 1+ requests of an existing
|
||||
# session continue to stick, and only fresh sessions are diverted to cold
|
||||
# D's. Set above the maximum cross-session boilerplate overlap you expect
|
||||
# (Inferact's shared system prompt overlaps ~50 24-token blocks → set to
|
||||
# 1000 to be safe).
|
||||
cold_d_bonus: int = 0
|
||||
|
||||
def select(
|
||||
self,
|
||||
@@ -208,10 +190,8 @@ class KvAwarePolicy:
|
||||
sticky = int(session is not None and session.last_decode_worker == worker.worker_id)
|
||||
inflight_penalty = -state.inflight_decode.get(worker.worker_id, 0)
|
||||
assignment_penalty = -state.decode_assignment_counts.get(worker.worker_id, 0)
|
||||
is_cold = state.decode_assignment_counts.get(worker.worker_id, 0) == 0
|
||||
cold_boost = self.cold_d_bonus if (is_cold and not sticky) else 0
|
||||
score = (
|
||||
overlap + sticky * self.sticky_bonus + cold_boost,
|
||||
overlap + sticky * self.sticky_bonus,
|
||||
sticky,
|
||||
inflight_penalty,
|
||||
assignment_penalty,
|
||||
@@ -243,22 +223,14 @@ class KvAwarePolicy:
|
||||
)
|
||||
|
||||
|
||||
def create_policy(
|
||||
name: str,
|
||||
*,
|
||||
migration_reject_threshold: int = 3,
|
||||
cold_d_bonus: int = 0,
|
||||
) -> RoutingPolicy:
|
||||
def create_policy(name: str, *, migration_reject_threshold: int = 3) -> RoutingPolicy:
|
||||
normalized = name.strip().lower()
|
||||
if normalized == "default":
|
||||
return DefaultPolicy()
|
||||
if normalized == "sticky":
|
||||
return StickyDecodePolicy()
|
||||
if normalized in {"kv-aware", "kv_aware", "kv"}:
|
||||
return KvAwarePolicy(
|
||||
migration_reject_threshold=migration_reject_threshold,
|
||||
cold_d_bonus=cold_d_bonus,
|
||||
)
|
||||
return KvAwarePolicy(migration_reject_threshold=migration_reject_threshold)
|
||||
raise ValueError(f"Unsupported policy: {name}")
|
||||
|
||||
|
||||
|
||||
@@ -111,10 +111,6 @@ class ReplayConfig:
|
||||
# KvAwarePolicy skips that D for the session (forcing migration). Default 3.
|
||||
# Set 0 to disable. See REFACTOR_PLAN_V1 §6.2.
|
||||
kvcache_migration_reject_threshold: int = 3
|
||||
# Cold-D bonus: synthetic boost to lex-score position 0 for any D worker
|
||||
# that has never been assigned a session yet, applied only when the request
|
||||
# has no sticky preference. 0 disables. See docs/E1_E2_RESULTS_ZH.md §5d.
|
||||
kvcache_cold_d_bonus: int = 0
|
||||
structural_log_dir: Path | None = None
|
||||
|
||||
|
||||
@@ -202,7 +198,6 @@ async def replay_trace(config: ReplayConfig) -> list[RequestMetrics]:
|
||||
policy = create_policy(
|
||||
config.policy_name,
|
||||
migration_reject_threshold=config.kvcache_migration_reject_threshold,
|
||||
cold_d_bonus=config.kvcache_cold_d_bonus,
|
||||
)
|
||||
state = RoutingState.create(config.topology)
|
||||
state_lock = asyncio.Lock()
|
||||
|
||||
Reference in New Issue
Block a user