Add profile-driven harness planner
This commit is contained in:
@@ -0,0 +1,73 @@
|
|||||||
|
# Profile-Driven Harness Implementation Log
|
||||||
|
|
||||||
|
Date: 2026-05-12
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
The harness should accelerate AITuner as a general tuning system, not as a collection of case-specific rules. The current implementation moves the harness toward a performance-engineering loop:
|
||||||
|
|
||||||
|
1. extract a compact profile from each measured trial;
|
||||||
|
2. rank bottleneck hypotheses from workload and probe evidence;
|
||||||
|
3. generate generic candidate actions from a knob-effect model;
|
||||||
|
4. score candidates by expected bottleneck relief, information gain, launch safety, and regression risk;
|
||||||
|
5. block early stop while a high-value untested candidate remains.
|
||||||
|
|
||||||
|
This is intended to apply across qwen3.5-27b chat, qwen3-235b prefill-only, qwen3-235b decode-only, and different SLOs without encoding model names, SLO constants, or known winning configs.
|
||||||
|
|
||||||
|
## Code Changes
|
||||||
|
|
||||||
|
- `src/aituner/harness.py`
|
||||||
|
- Added `trial_profiles` to normalize trial topology, performance, probe failures, latency quantiles, and launch failure evidence.
|
||||||
|
- Added `bottleneck_hypotheses`, a ranked list instead of a single active bottleneck label.
|
||||||
|
- Added `candidate_actions`, generated from topology and runtime knob families.
|
||||||
|
- Added `experiment_plan`, which selects the next high-score candidate or declares stop readiness.
|
||||||
|
- Updated harness proposal generation to prefer the profile-driven next action before falling back to legacy deterministic proposal code.
|
||||||
|
- Updated harness stop logic so convergence/validation stop is blocked when the planner still has a high-value untested candidate.
|
||||||
|
|
||||||
|
- `tests/test_core_flow.py`
|
||||||
|
- Added coverage that a strong TP=2 incumbent with TTFT pressure still selects an unmeasured TP=4 topology candidate.
|
||||||
|
- Added coverage that decode-only TPOT pressure at max TP can prefer lowering `max-num-seqs` instead of blindly lowering TP.
|
||||||
|
|
||||||
|
## Current Scoring Model
|
||||||
|
|
||||||
|
The candidate score is intentionally generic:
|
||||||
|
|
||||||
|
```text
|
||||||
|
score = expected_bottleneck_relief * bottleneck_confidence
|
||||||
|
+ information_gain
|
||||||
|
+ launch_safety
|
||||||
|
- regression_risk
|
||||||
|
```
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
- TTFT/prefill bottleneck: increasing TP and prefill batching candidates receive relief score.
|
||||||
|
- Decode TPOT bottleneck: increasing TP is useful if a higher legal TP exists; if already at high TP, lowering decode concurrency can become the higher-value candidate.
|
||||||
|
- Admission/queueing bottleneck: more DP or higher safe concurrency receives relief score.
|
||||||
|
|
||||||
|
The scores are not tied to qwen27b/qwen235b or a fixed TPOT/TTFT threshold. They are tied to the measured bottleneck class and legal tunable space.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
Local:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m compileall -q src tests
|
||||||
|
PYTHONPATH=src python3 -m unittest tests.test_core_flow
|
||||||
|
```
|
||||||
|
|
||||||
|
Result: `93` tests passed.
|
||||||
|
|
||||||
|
## Next Experiment
|
||||||
|
|
||||||
|
Run the same qwen3.5-27b chat 0-8k setup as the current ablation baseline:
|
||||||
|
|
||||||
|
- workload: chat, input length 0-8k
|
||||||
|
- SLO: TTFT p95 <= 4000ms, TPOT p95 <= 25ms, target pass rate 0.95
|
||||||
|
- search: full range, `inherit_incumbent_floor=false`
|
||||||
|
- budget: 12 total tuning iterations
|
||||||
|
- LLM model: `gpt-5.4`
|
||||||
|
- variant: harness enabled with profile-driven planner
|
||||||
|
|
||||||
|
The no-harness min-prompt baseline is already available and only needs to be reused for comparison unless the setup changes.
|
||||||
|
|
||||||
@@ -32,21 +32,45 @@ def build_harness_context(
|
|||||||
state: StudyState,
|
state: StudyState,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
recent_diagnostics = _recent_trial_diagnostics(state)
|
recent_diagnostics = _recent_trial_diagnostics(state)
|
||||||
|
trial_profiles = _trial_profiles(study, recent_diagnostics)
|
||||||
|
bottleneck_hypotheses = _rank_bottleneck_hypotheses(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
trial_profiles,
|
||||||
|
)
|
||||||
|
experiment_plan = _experiment_plan(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
state,
|
||||||
|
recent_diagnostics,
|
||||||
|
trial_profiles,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"paper_alignment": {
|
"paper_alignment": {
|
||||||
"goal": "Use workload-feature-to-knob harnesses to reduce wasted trials and avoid regressing after a good configuration is found.",
|
"goal": "Use workload-feature-to-knob harnesses to reduce wasted trials and avoid regressing after a good configuration is found.",
|
||||||
"feature_model": "L-C-A: request lengths, inter-request KV-cache reuse, and arrival dynamics.",
|
"feature_model": "L-C-A: request lengths, inter-request KV-cache reuse, and arrival dynamics.",
|
||||||
"trial_policy": "Map the active bottleneck to one knob family, apply guard conditions, and stop when the incumbent has converged.",
|
"trial_policy": "Profile measured trials, rank bottleneck hypotheses, score generic candidate actions, and stop only when no useful measured hypothesis remains.",
|
||||||
},
|
},
|
||||||
"workload_lca_profile": _workload_lca_profile(window_summary),
|
"workload_lca_profile": _workload_lca_profile(window_summary),
|
||||||
"recent_trial_diagnostics": recent_diagnostics,
|
"recent_trial_diagnostics": recent_diagnostics,
|
||||||
|
"trial_profiles": trial_profiles,
|
||||||
|
"bottleneck_hypotheses": bottleneck_hypotheses,
|
||||||
|
"candidate_actions": experiment_plan["candidate_actions"],
|
||||||
|
"experiment_plan": experiment_plan,
|
||||||
"convergence_guard": _convergence_guard(state, recent_diagnostics),
|
"convergence_guard": _convergence_guard(state, recent_diagnostics),
|
||||||
"harness_stop": _harness_stop_decision(study, state, recent_diagnostics),
|
"harness_stop": _harness_stop_decision(
|
||||||
|
study,
|
||||||
|
state,
|
||||||
|
recent_diagnostics,
|
||||||
|
experiment_plan=experiment_plan,
|
||||||
|
),
|
||||||
"harness_proposal": _harness_proposal_decision(
|
"harness_proposal": _harness_proposal_decision(
|
||||||
study,
|
study,
|
||||||
window_summary,
|
window_summary,
|
||||||
state,
|
state,
|
||||||
recent_diagnostics,
|
recent_diagnostics,
|
||||||
|
experiment_plan=experiment_plan,
|
||||||
),
|
),
|
||||||
"knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
|
"knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
|
||||||
"proposal_rules": _proposal_rules(),
|
"proposal_rules": _proposal_rules(),
|
||||||
@@ -348,6 +372,164 @@ def _recent_trial_diagnostics(state: StudyState) -> list[dict[str, Any]]:
|
|||||||
return diagnostics
|
return diagnostics
|
||||||
|
|
||||||
|
|
||||||
|
def _trial_profiles(
|
||||||
|
study: StudySpec,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
profiles: list[dict[str, Any]] = []
|
||||||
|
for item in recent_diagnostics:
|
||||||
|
flags = _effective_flags_for_item(study, item)
|
||||||
|
probe_summary = item.get("probe_summary")
|
||||||
|
best_probe = None
|
||||||
|
last_probe = None
|
||||||
|
all_infeasible = None
|
||||||
|
if isinstance(probe_summary, dict):
|
||||||
|
best_probe = probe_summary.get("best_feasible_probe")
|
||||||
|
last_probe = probe_summary.get("last_probe")
|
||||||
|
all_infeasible = probe_summary.get("all_infeasible")
|
||||||
|
limiting_probe = (
|
||||||
|
last_probe
|
||||||
|
if isinstance(last_probe, dict)
|
||||||
|
else all_infeasible
|
||||||
|
if isinstance(all_infeasible, dict)
|
||||||
|
else best_probe
|
||||||
|
if isinstance(best_probe, dict)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
latency = limiting_probe.get("latency_summary") if isinstance(limiting_probe, dict) else {}
|
||||||
|
if not isinstance(latency, dict):
|
||||||
|
latency = {}
|
||||||
|
failed_counts = latency.get("failed_reason_counts")
|
||||||
|
if not isinstance(failed_counts, dict):
|
||||||
|
failed_counts = {}
|
||||||
|
profile = {
|
||||||
|
"trial_id": item.get("trial_id"),
|
||||||
|
"status": item.get("status"),
|
||||||
|
"config_patch": item.get("config_patch") if isinstance(item.get("config_patch"), dict) else {},
|
||||||
|
"topology": {
|
||||||
|
"tensor_parallel_size": _parse_int_like(
|
||||||
|
flags.get("tensor-parallel-size"),
|
||||||
|
default=1,
|
||||||
|
),
|
||||||
|
"data_parallel_size": _parse_int_like(
|
||||||
|
flags.get("data-parallel-size"),
|
||||||
|
default=1,
|
||||||
|
),
|
||||||
|
"expert_parallel_size": _parse_int_like(
|
||||||
|
flags.get("expert-parallel-size"),
|
||||||
|
default=1,
|
||||||
|
),
|
||||||
|
"enable_expert_parallel": bool(flags.get("enable-expert-parallel", False)),
|
||||||
|
},
|
||||||
|
"performance": {
|
||||||
|
"best_request_rate": item.get("best_request_rate"),
|
||||||
|
"best_request_rate_per_gpu": item.get("best_request_rate_per_gpu"),
|
||||||
|
"best_pass_rate": item.get("best_pass_rate"),
|
||||||
|
},
|
||||||
|
"probe_profile": {
|
||||||
|
"best_feasible_probe": best_probe,
|
||||||
|
"limiting_probe": limiting_probe,
|
||||||
|
"active_bottleneck": item.get("active_bottleneck"),
|
||||||
|
"failed_reason_counts": failed_counts,
|
||||||
|
"latency_quantiles": {
|
||||||
|
"ttft_ms": latency.get("ttft_ms") if isinstance(latency.get("ttft_ms"), dict) else {},
|
||||||
|
"tpot_ms": latency.get("tpot_ms") if isinstance(latency.get("tpot_ms"), dict) else {},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"failure_profile": {
|
||||||
|
"failure_stage": item.get("failure_stage"),
|
||||||
|
"failure_reason": item.get("failure_reason"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
profiles.append(profile)
|
||||||
|
return profiles
|
||||||
|
|
||||||
|
|
||||||
|
def _rank_bottleneck_hypotheses(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
scores = {
|
||||||
|
"ttft_prefill": 0.0,
|
||||||
|
"decode_tpot": 0.0,
|
||||||
|
"admission_or_queueing": 0.0,
|
||||||
|
"launch_or_memory": 0.0,
|
||||||
|
}
|
||||||
|
evidence: dict[str, list[str]] = {name: [] for name in scores}
|
||||||
|
|
||||||
|
default = _workload_default_bottleneck(study, window_summary)
|
||||||
|
if default in scores:
|
||||||
|
scores[default] += 0.18
|
||||||
|
evidence[default].append(f"workload default bottleneck is {default}")
|
||||||
|
|
||||||
|
if study.trace.request_mode == "decode_only" and study.slo.tpot_rule is not None:
|
||||||
|
scores["decode_tpot"] += 0.22
|
||||||
|
evidence["decode_tpot"].append("decode_only study with configured TPOT SLO")
|
||||||
|
if study.slo.ttft_rule is not None:
|
||||||
|
prompt_p95 = _as_float(window_summary.get("prompt_tokens_p95"))
|
||||||
|
if prompt_p95 >= 4096:
|
||||||
|
scores["ttft_prefill"] += 0.14
|
||||||
|
evidence["ttft_prefill"].append("long prompt p95 makes TTFT/prefill plausible")
|
||||||
|
|
||||||
|
for profile in trial_profiles[-stateful_history_limit() :]:
|
||||||
|
active = str(profile.get("probe_profile", {}).get("active_bottleneck") or "")
|
||||||
|
if active in scores:
|
||||||
|
weight = 0.34 if profile is trial_profiles[-1] else 0.18
|
||||||
|
scores[active] += weight
|
||||||
|
evidence[active].append(
|
||||||
|
f"{profile.get('trial_id')} probe diagnosis is {active}"
|
||||||
|
)
|
||||||
|
failed = profile.get("probe_profile", {}).get("failed_reason_counts")
|
||||||
|
if not isinstance(failed, dict):
|
||||||
|
failed = {}
|
||||||
|
ttft_count = sum(int(v) for k, v in failed.items() if str(k).startswith("ttft"))
|
||||||
|
tpot_count = sum(int(v) for k, v in failed.items() if str(k).startswith("tpot"))
|
||||||
|
elapsed_count = sum(
|
||||||
|
int(v)
|
||||||
|
for k, v in failed.items()
|
||||||
|
if str(k).startswith("probe_elapsed_s>")
|
||||||
|
or str(k).startswith("arrival_lag_s>")
|
||||||
|
)
|
||||||
|
total = max(ttft_count + tpot_count + elapsed_count, 1)
|
||||||
|
if ttft_count:
|
||||||
|
scores["ttft_prefill"] += min(0.24, 0.24 * ttft_count / total)
|
||||||
|
evidence["ttft_prefill"].append(
|
||||||
|
f"{profile.get('trial_id')} TTFT failures={ttft_count}"
|
||||||
|
)
|
||||||
|
if tpot_count:
|
||||||
|
scores["decode_tpot"] += min(0.24, 0.24 * tpot_count / total)
|
||||||
|
evidence["decode_tpot"].append(
|
||||||
|
f"{profile.get('trial_id')} TPOT failures={tpot_count}"
|
||||||
|
)
|
||||||
|
if elapsed_count:
|
||||||
|
scores["admission_or_queueing"] += min(0.18, 0.18 * elapsed_count / total)
|
||||||
|
evidence["admission_or_queueing"].append(
|
||||||
|
f"{profile.get('trial_id')} queue/elapsed failures={elapsed_count}"
|
||||||
|
)
|
||||||
|
failure_stage = str(profile.get("failure_profile", {}).get("failure_stage") or "")
|
||||||
|
failure_reason = str(profile.get("failure_profile", {}).get("failure_reason") or "")
|
||||||
|
if failure_stage == "engine_launch" or "out of memory" in failure_reason.lower():
|
||||||
|
scores["launch_or_memory"] += 0.4
|
||||||
|
evidence["launch_or_memory"].append(
|
||||||
|
f"{profile.get('trial_id')} launch or memory failure"
|
||||||
|
)
|
||||||
|
|
||||||
|
ranked = []
|
||||||
|
for name, score in scores.items():
|
||||||
|
if score <= 0:
|
||||||
|
continue
|
||||||
|
ranked.append(
|
||||||
|
{
|
||||||
|
"name": name,
|
||||||
|
"confidence": min(0.99, round(score, 4)),
|
||||||
|
"evidence": evidence[name][:6],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
ranked.sort(key=lambda item: item["confidence"], reverse=True)
|
||||||
|
return ranked
|
||||||
|
|
||||||
|
|
||||||
def stateful_history_limit() -> int:
|
def stateful_history_limit() -> int:
|
||||||
return 8
|
return 8
|
||||||
|
|
||||||
@@ -513,6 +695,8 @@ def _harness_stop_decision(
|
|||||||
study: StudySpec,
|
study: StudySpec,
|
||||||
state: StudyState,
|
state: StudyState,
|
||||||
recent_diagnostics: list[dict[str, Any]],
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
*,
|
||||||
|
experiment_plan: dict[str, Any] | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
high_saturation = _search_high_saturation_guard(study, state, recent_diagnostics)
|
high_saturation = _search_high_saturation_guard(study, state, recent_diagnostics)
|
||||||
if high_saturation["saturated"]:
|
if high_saturation["saturated"]:
|
||||||
@@ -528,6 +712,17 @@ def _harness_stop_decision(
|
|||||||
"reason": "topology_frontier_requires_probe",
|
"reason": "topology_frontier_requires_probe",
|
||||||
"evidence": topology_frontier,
|
"evidence": topology_frontier,
|
||||||
}
|
}
|
||||||
|
if experiment_plan is not None and experiment_plan.get("next_action"):
|
||||||
|
action = experiment_plan["next_action"]
|
||||||
|
if isinstance(action, dict) and _as_float(action.get("score")) >= 0.35:
|
||||||
|
return {
|
||||||
|
"should_stop": False,
|
||||||
|
"reason": "experiment_plan_has_high_value_candidate",
|
||||||
|
"evidence": {
|
||||||
|
"summary": "The profile-driven planner still has a useful measured hypothesis to test.",
|
||||||
|
"next_action": action,
|
||||||
|
},
|
||||||
|
}
|
||||||
guard = _convergence_guard(state, recent_diagnostics)
|
guard = _convergence_guard(state, recent_diagnostics)
|
||||||
if guard["deterministic_stop"]:
|
if guard["deterministic_stop"]:
|
||||||
return {
|
return {
|
||||||
@@ -562,6 +757,8 @@ def _harness_proposal_decision(
|
|||||||
window_summary: dict[str, Any],
|
window_summary: dict[str, Any],
|
||||||
state: StudyState,
|
state: StudyState,
|
||||||
recent_diagnostics: list[dict[str, Any]],
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
*,
|
||||||
|
experiment_plan: dict[str, Any] | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
default = {
|
default = {
|
||||||
"should_propose": False,
|
"should_propose": False,
|
||||||
@@ -575,6 +772,26 @@ def _harness_proposal_decision(
|
|||||||
for item in recent_diagnostics
|
for item in recent_diagnostics
|
||||||
}
|
}
|
||||||
tested_signatures.update(_state_tested_signatures(state))
|
tested_signatures.update(_state_tested_signatures(state))
|
||||||
|
if experiment_plan is not None:
|
||||||
|
next_action = experiment_plan.get("next_action")
|
||||||
|
if isinstance(next_action, dict) and _as_float(next_action.get("score")) >= 0.35:
|
||||||
|
patch = next_action.get("config_patch")
|
||||||
|
if isinstance(patch, dict):
|
||||||
|
signature = _config_signature(patch)
|
||||||
|
if signature not in tested_signatures:
|
||||||
|
return {
|
||||||
|
"should_propose": True,
|
||||||
|
"reason": str(next_action.get("action_id") or "profile_driven_candidate"),
|
||||||
|
"diagnosis": str(next_action.get("hypothesis") or "Profile-driven harness candidate."),
|
||||||
|
"config_patch": patch,
|
||||||
|
"expected_effects": [
|
||||||
|
str(item)
|
||||||
|
for item in next_action.get("expected_effects", [])
|
||||||
|
if isinstance(item, str)
|
||||||
|
],
|
||||||
|
"candidate_score": next_action.get("score"),
|
||||||
|
"bottleneck_hypotheses": experiment_plan.get("bottleneck_hypotheses", []),
|
||||||
|
}
|
||||||
baseline = recent_diagnostics[0] if recent_diagnostics else {}
|
baseline = recent_diagnostics[0] if recent_diagnostics else {}
|
||||||
topology_frontier = _topology_frontier_proposal(
|
topology_frontier = _topology_frontier_proposal(
|
||||||
study,
|
study,
|
||||||
@@ -691,6 +908,526 @@ def _topology_frontier_proposal(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _experiment_plan(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
state: StudyState,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
tested_signatures = {
|
||||||
|
_config_signature(item.get("config_patch") if isinstance(item, dict) else None)
|
||||||
|
for item in recent_diagnostics
|
||||||
|
}
|
||||||
|
tested_signatures.update(_state_tested_signatures(state))
|
||||||
|
candidates = _candidate_actions(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
state,
|
||||||
|
recent_diagnostics,
|
||||||
|
trial_profiles,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
tested_signatures=tested_signatures,
|
||||||
|
)
|
||||||
|
candidates.sort(key=lambda item: _as_float(item.get("score")), reverse=True)
|
||||||
|
next_action = candidates[0] if candidates else None
|
||||||
|
return {
|
||||||
|
"planner_version": "profile-driven-v1",
|
||||||
|
"bottleneck_hypotheses": bottleneck_hypotheses,
|
||||||
|
"candidate_actions": candidates[:8],
|
||||||
|
"next_action": next_action,
|
||||||
|
"stop_ready": next_action is None or _as_float(next_action.get("score")) < 0.35,
|
||||||
|
"stop_rationale": (
|
||||||
|
"no untested high-value candidate remains"
|
||||||
|
if not candidates or _as_float(candidates[0].get("score")) < 0.35
|
||||||
|
else "continue with the highest-scoring measured hypothesis"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _candidate_actions(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
state: StudyState,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
*,
|
||||||
|
tested_signatures: set[str],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
if not recent_diagnostics:
|
||||||
|
return []
|
||||||
|
anchor = _anchor_profile(study, state, recent_diagnostics, trial_profiles)
|
||||||
|
if anchor is None:
|
||||||
|
return []
|
||||||
|
top_bottleneck = (
|
||||||
|
str(bottleneck_hypotheses[0]["name"])
|
||||||
|
if bottleneck_hypotheses
|
||||||
|
else str(anchor.get("probe_profile", {}).get("active_bottleneck") or "")
|
||||||
|
)
|
||||||
|
candidates: list[dict[str, Any]] = []
|
||||||
|
candidates.extend(
|
||||||
|
_topology_candidate_actions(
|
||||||
|
study,
|
||||||
|
anchor,
|
||||||
|
top_bottleneck,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
tested_signatures,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
candidates.extend(
|
||||||
|
_runtime_candidate_actions(
|
||||||
|
study,
|
||||||
|
window_summary,
|
||||||
|
anchor,
|
||||||
|
top_bottleneck,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
tested_signatures,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return candidates
|
||||||
|
|
||||||
|
|
||||||
|
def _anchor_profile(
|
||||||
|
study: StudySpec,
|
||||||
|
state: StudyState,
|
||||||
|
recent_diagnostics: list[dict[str, Any]],
|
||||||
|
trial_profiles: list[dict[str, Any]],
|
||||||
|
) -> dict[str, Any] | None:
|
||||||
|
if state.best_trial_id:
|
||||||
|
for profile in reversed(trial_profiles):
|
||||||
|
if profile.get("trial_id") == state.best_trial_id:
|
||||||
|
return profile
|
||||||
|
for profile in reversed(trial_profiles):
|
||||||
|
if profile.get("status") == "completed":
|
||||||
|
return profile
|
||||||
|
return trial_profiles[-1] if trial_profiles else None
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_candidate_actions(
|
||||||
|
study: StudySpec,
|
||||||
|
anchor: dict[str, Any],
|
||||||
|
top_bottleneck: str,
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
tested_signatures: set[str],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
if not ({"tensor-parallel-size", "data-parallel-size"} & set(study.engine.tunable_flags)):
|
||||||
|
return []
|
||||||
|
anchor_flags = _effective_flags_for_item(study, anchor)
|
||||||
|
current_tp = _parse_int_like(anchor_flags.get("tensor-parallel-size"), default=1)
|
||||||
|
current_dp = _parse_int_like(anchor_flags.get("data-parallel-size"), default=1)
|
||||||
|
current_ep = _parse_int_like(anchor_flags.get("expert-parallel-size"), default=1)
|
||||||
|
current_enable_ep = bool(anchor_flags.get("enable-expert-parallel", False))
|
||||||
|
legal = _legal_topology_points(
|
||||||
|
study,
|
||||||
|
current_tp=current_tp,
|
||||||
|
current_dp=current_dp,
|
||||||
|
current_ep=current_ep,
|
||||||
|
current_enable_ep=current_enable_ep,
|
||||||
|
)
|
||||||
|
actions: list[dict[str, Any]] = []
|
||||||
|
for point in legal:
|
||||||
|
if point["tensor-parallel-size"] == current_tp and point["data-parallel-size"] == current_dp:
|
||||||
|
continue
|
||||||
|
patch = _topology_patch(study, point)
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature in tested_signatures:
|
||||||
|
continue
|
||||||
|
score, factors = _score_topology_candidate(
|
||||||
|
top_bottleneck,
|
||||||
|
bottleneck_hypotheses,
|
||||||
|
current_tp=current_tp,
|
||||||
|
current_dp=current_dp,
|
||||||
|
candidate_tp=point["tensor-parallel-size"],
|
||||||
|
candidate_dp=point["data-parallel-size"],
|
||||||
|
)
|
||||||
|
if score <= 0:
|
||||||
|
continue
|
||||||
|
action_id = _topology_action_id(current_tp, current_dp, point)
|
||||||
|
actions.append(
|
||||||
|
{
|
||||||
|
"action_id": action_id,
|
||||||
|
"knob_family": "topology",
|
||||||
|
"score": round(score, 4),
|
||||||
|
"score_factors": factors,
|
||||||
|
"config_patch": {"env_patch": {}, "flag_patch": patch},
|
||||||
|
"hypothesis": _topology_hypothesis(
|
||||||
|
top_bottleneck,
|
||||||
|
current_tp=current_tp,
|
||||||
|
current_dp=current_dp,
|
||||||
|
candidate_tp=point["tensor-parallel-size"],
|
||||||
|
candidate_dp=point["data-parallel-size"],
|
||||||
|
),
|
||||||
|
"expected_effects": [
|
||||||
|
"measure whether topology changes relieve the ranked bottleneck",
|
||||||
|
"compare request_rate_per_gpu under the configured SLO, not raw throughput alone",
|
||||||
|
"reject this hypothesis if latency improves but per-GPU throughput regresses materially",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return actions
|
||||||
|
|
||||||
|
|
||||||
|
def _runtime_candidate_actions(
|
||||||
|
study: StudySpec,
|
||||||
|
window_summary: dict[str, Any],
|
||||||
|
anchor: dict[str, Any],
|
||||||
|
top_bottleneck: str,
|
||||||
|
bottleneck_hypotheses: list[dict[str, Any]],
|
||||||
|
tested_signatures: set[str],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
tunable = set(study.engine.tunable_flags)
|
||||||
|
anchor_flags = _effective_flags_for_item(study, anchor)
|
||||||
|
topology_patch = _preserve_topology_patch(study, anchor_flags)
|
||||||
|
actions: list[dict[str, Any]] = []
|
||||||
|
|
||||||
|
if "max-num-batched-tokens" in tunable:
|
||||||
|
current_mbt = _parse_int_like(anchor_flags.get("max-num-batched-tokens"), default=0)
|
||||||
|
mbt_targets: list[tuple[str, int]] = []
|
||||||
|
if top_bottleneck == "ttft_prefill":
|
||||||
|
target = (
|
||||||
|
_initial_mbt_from_window(window_summary)
|
||||||
|
if current_mbt <= 0
|
||||||
|
else _next_mbt_step(current_mbt)
|
||||||
|
)
|
||||||
|
if target is not None:
|
||||||
|
mbt_targets.append(("raise_mbt", target))
|
||||||
|
elif top_bottleneck == "decode_tpot" and current_mbt > 8192:
|
||||||
|
mbt_targets.append(("lower_mbt", max(8192, current_mbt // 2)))
|
||||||
|
for action_id, target in mbt_targets:
|
||||||
|
patch = {**topology_patch, "max-num-batched-tokens": target}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature in tested_signatures:
|
||||||
|
continue
|
||||||
|
relief = 0.24 if top_bottleneck == "ttft_prefill" else 0.14
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id=action_id,
|
||||||
|
knob_family="max-num-batched-tokens",
|
||||||
|
score=relief + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Adjust max-num-batched-tokens to test whether batching, not topology, "
|
||||||
|
"is limiting the active latency objective."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"change prefill/decode batching pressure on the incumbent topology",
|
||||||
|
"confirm if the latency knee moves without requiring another topology change",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if "max-num-seqs" in tunable:
|
||||||
|
current_mns = _parse_int_like(anchor_flags.get("max-num-seqs"), default=0)
|
||||||
|
mns_targets: list[tuple[str, int]] = []
|
||||||
|
if top_bottleneck == "admission_or_queueing":
|
||||||
|
target = max(8, int(current_mns * 1.5)) if current_mns > 0 else 64
|
||||||
|
mns_targets.append(("raise_max_num_seqs", _round_up_to_multiple(target, 8)))
|
||||||
|
elif top_bottleneck == "decode_tpot" and current_mns > 8:
|
||||||
|
mns_targets.append(("lower_max_num_seqs", max(8, current_mns // 2)))
|
||||||
|
for action_id, target in mns_targets:
|
||||||
|
patch = {**topology_patch, "max-num-seqs": target}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature in tested_signatures:
|
||||||
|
continue
|
||||||
|
relief = 0.25 if top_bottleneck in {"decode_tpot", "admission_or_queueing"} else 0.08
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id=action_id,
|
||||||
|
knob_family="max-num-seqs",
|
||||||
|
score=relief + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Adjust max-num-seqs to test whether concurrency pressure is the "
|
||||||
|
"limiting factor under the configured SLO."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"change decode/admission concurrency on the incumbent topology",
|
||||||
|
"confirm if TPOT or queueing pressure is caused by sequence concurrency",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if "enable-chunked-prefill" in tunable and top_bottleneck == "ttft_prefill":
|
||||||
|
current = bool(anchor_flags.get("enable-chunked-prefill", False))
|
||||||
|
if not current:
|
||||||
|
patch = {**topology_patch, "enable-chunked-prefill": True}
|
||||||
|
signature = _config_signature({"env_patch": {}, "flag_patch": patch})
|
||||||
|
if signature not in tested_signatures:
|
||||||
|
actions.append(
|
||||||
|
_runtime_action(
|
||||||
|
action_id="enable_chunked_prefill",
|
||||||
|
knob_family="enable-chunked-prefill",
|
||||||
|
score=0.2 + _information_gain(bottleneck_hypotheses, "runtime"),
|
||||||
|
patch=patch,
|
||||||
|
hypothesis=(
|
||||||
|
"Enable chunked prefill to test whether long-prefill head-of-line "
|
||||||
|
"blocking is driving TTFT failures."
|
||||||
|
),
|
||||||
|
expected_effects=[
|
||||||
|
"reduce long-prefill interference for mixed-length chat windows",
|
||||||
|
"reject if chunking overhead worsens request_rate_per_gpu",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return actions
|
||||||
|
|
||||||
|
|
||||||
|
def _runtime_action(
|
||||||
|
*,
|
||||||
|
action_id: str,
|
||||||
|
knob_family: str,
|
||||||
|
score: float,
|
||||||
|
patch: dict[str, Any],
|
||||||
|
hypothesis: str,
|
||||||
|
expected_effects: list[str],
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"action_id": action_id,
|
||||||
|
"knob_family": knob_family,
|
||||||
|
"score": round(score, 4),
|
||||||
|
"score_factors": {
|
||||||
|
"expected_bottleneck_relief": round(max(score - 0.1, 0.0), 4),
|
||||||
|
"information_gain": 0.1,
|
||||||
|
"launch_safety": 0.05,
|
||||||
|
"regression_risk": 0.05,
|
||||||
|
},
|
||||||
|
"config_patch": {"env_patch": {}, "flag_patch": patch},
|
||||||
|
"hypothesis": hypothesis,
|
||||||
|
"expected_effects": expected_effects,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _legal_topology_points(
|
||||||
|
study: StudySpec,
|
||||||
|
*,
|
||||||
|
current_tp: int,
|
||||||
|
current_dp: int,
|
||||||
|
current_ep: int,
|
||||||
|
current_enable_ep: bool,
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
constraints = study.engine.topology_constraints
|
||||||
|
tunable = set(study.engine.tunable_flags)
|
||||||
|
if constraints is not None and constraints.allowed_tensor_parallel_sizes:
|
||||||
|
tp_values = sorted(set(constraints.allowed_tensor_parallel_sizes))
|
||||||
|
elif "tensor-parallel-size" in tunable:
|
||||||
|
tp_values = [value for value in [1, 2, 4, 8] if value <= study.hardware.gpu_count]
|
||||||
|
else:
|
||||||
|
tp_values = [current_tp]
|
||||||
|
|
||||||
|
if constraints is not None and constraints.allowed_data_parallel_sizes:
|
||||||
|
dp_values = sorted(set(constraints.allowed_data_parallel_sizes))
|
||||||
|
elif "data-parallel-size" in tunable:
|
||||||
|
dp_values = [value for value in [1, 2, 4, 8] if value <= study.hardware.gpu_count]
|
||||||
|
else:
|
||||||
|
dp_values = [current_dp]
|
||||||
|
|
||||||
|
if constraints is not None and constraints.allowed_expert_parallel_sizes:
|
||||||
|
ep_values = sorted(set(constraints.allowed_expert_parallel_sizes))
|
||||||
|
elif "expert-parallel-size" in tunable:
|
||||||
|
ep_values = sorted({1, current_ep})
|
||||||
|
else:
|
||||||
|
ep_values = [current_ep]
|
||||||
|
|
||||||
|
points: list[dict[str, Any]] = []
|
||||||
|
for tp in tp_values:
|
||||||
|
for dp in dp_values:
|
||||||
|
tp_dp_product = tp * dp
|
||||||
|
if constraints is not None:
|
||||||
|
if (
|
||||||
|
constraints.allowed_tp_dp_products
|
||||||
|
and tp_dp_product not in constraints.allowed_tp_dp_products
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
constraints.require_tp_dp_product_equals_gpu_count
|
||||||
|
and tp_dp_product != study.hardware.gpu_count
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
elif tp_dp_product > study.hardware.gpu_count:
|
||||||
|
continue
|
||||||
|
if constraints is not None and not constraints.allowed_tp_dp_products:
|
||||||
|
if tp_dp_product > study.hardware.gpu_count:
|
||||||
|
continue
|
||||||
|
for ep in ep_values:
|
||||||
|
enable_ep = current_enable_ep or ep > 1
|
||||||
|
if constraints is not None:
|
||||||
|
if constraints.allowed_expert_parallel_sizes and ep not in constraints.allowed_expert_parallel_sizes:
|
||||||
|
continue
|
||||||
|
if constraints.require_ep_size_leq_tp_dp_product and ep > tp_dp_product:
|
||||||
|
continue
|
||||||
|
if constraints.require_ep_size_divides_tp_dp_product and tp_dp_product % ep != 0:
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
constraints.require_enable_expert_parallel_when_ep_gt_one
|
||||||
|
and ep > 1
|
||||||
|
and not enable_ep
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
points.append(
|
||||||
|
{
|
||||||
|
"tensor-parallel-size": tp,
|
||||||
|
"data-parallel-size": dp,
|
||||||
|
"expert-parallel-size": ep,
|
||||||
|
"enable-expert-parallel": enable_ep,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return points
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_patch(study: StudySpec, point: dict[str, Any]) -> dict[str, Any]:
    """Build a flag patch describing how *point* diverges from the base topology.

    Only keys listed in the study's tunable flags are eligible; a key is
    emitted when the candidate point's value differs from the normalized
    base-flag value, so an empty patch means "same topology as the base".
    """
    tunable_keys = set(study.engine.tunable_flags)
    base_topology = _normalized_topology_flags(study.engine.base_flags)
    topology_keys = (
        "tensor-parallel-size",
        "data-parallel-size",
        "expert-parallel-size",
        "enable-expert-parallel",
    )
    return {
        key: point[key]
        for key in topology_keys
        if key in tunable_keys and key in point and point[key] != base_topology.get(key)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _preserve_topology_patch(study: StudySpec, flags: dict[str, Any]) -> dict[str, Any]:
    """Carry forward non-default topology flags from an existing trial config.

    Normalizes *flags*, then keeps every tunable topology key whose normalized
    value differs from the normalized base flags, so applying the patch on top
    of the study baseline reproduces the trial's topology.
    """
    tunable_keys = set(study.engine.tunable_flags)
    base_topology = _normalized_topology_flags(study.engine.base_flags)
    trial_topology = _normalized_topology_flags(flags)
    topology_keys = (
        "tensor-parallel-size",
        "data-parallel-size",
        "expert-parallel-size",
        "enable-expert-parallel",
    )
    return {
        key: trial_topology[key]
        for key in topology_keys
        if key in tunable_keys
        and key in trial_topology
        and trial_topology.get(key) != base_topology.get(key)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _normalized_topology_flags(flags: dict[str, Any]) -> dict[str, Any]:
    """Return topology flags coerced to canonical types.

    The three parallel-size flags are parsed as ints (absent or unparsable
    values default to 1 via ``_parse_int_like``); the enable flag is coerced
    to a plain bool, defaulting to False when absent.
    """
    normalized: dict[str, Any] = {
        size_key: _parse_int_like(flags.get(size_key), default=1)
        for size_key in (
            "tensor-parallel-size",
            "data-parallel-size",
            "expert-parallel-size",
        )
    }
    normalized["enable-expert-parallel"] = bool(flags.get("enable-expert-parallel", False))
    return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def _score_topology_candidate(
    top_bottleneck: str,
    bottleneck_hypotheses: list[dict[str, Any]],
    *,
    current_tp: int,
    current_dp: int,
    candidate_tp: int,
    candidate_dp: int,
) -> tuple[float, dict[str, float]]:
    """Score a topology candidate against the ranked bottleneck hypotheses.

    The score combines expected bottleneck relief (weighted by the hypothesis
    confidence, floored at 0.35 so low-evidence states still explore), a flat
    information-gain credit for any topology movement, a launch-safety credit
    that favors not growing the GPU footprint, and a regression-risk penalty
    that grows with the power-of-two distance moved.

    Returns ``(score, components)`` where each component is rounded to 4
    decimal places for reporting.
    """
    tp_step = candidate_tp - current_tp
    dp_step = candidate_dp - current_dp
    confidence = _hypothesis_confidence(bottleneck_hypotheses, top_bottleneck)

    # Relief heuristic: each named bottleneck pairs the delta that helps it
    # with (relief-when-helped, relief-otherwise); unknown labels get ~0.
    relief_table = {
        "ttft_prefill": (tp_step > 0, 0.42, 0.05),
        "decode_tpot": (tp_step > 0, 0.34, 0.02),
        "admission_or_queueing": (dp_step > 0, 0.34, 0.08),
    }
    if top_bottleneck in relief_table:
        helped, relief_hit, relief_miss = relief_table[top_bottleneck]
        relief = relief_hit if helped else relief_miss
    else:
        relief = 0.04

    moved = abs(tp_step) + abs(dp_step) > 0
    info_gain = 0.2 if moved else 0.0
    footprint_not_growing = candidate_tp * candidate_dp <= max(current_tp * current_dp, 1)
    launch_safety = 0.08 if footprint_not_growing else 0.04
    step_distance = abs(_log2_ratio(candidate_tp, current_tp)) + abs(_log2_ratio(candidate_dp, current_dp))
    regression_risk = min(0.28, 0.06 * step_distance)

    score = relief * max(confidence, 0.35) + info_gain + launch_safety - regression_risk
    components = {
        "expected_bottleneck_relief": round(relief, 4),
        "bottleneck_confidence": round(confidence, 4),
        "information_gain": round(info_gain, 4),
        "launch_safety": round(launch_safety, 4),
        "regression_risk": round(regression_risk, 4),
    }
    return score, components
|
||||||
|
|
||||||
|
|
||||||
|
def _information_gain(bottleneck_hypotheses: list[dict[str, Any]], family: str) -> float:
    """Return a baseline information-gain credit, larger when evidence is weak.

    With no hypotheses the floor of 0.08 is returned. Otherwise up to 0.12 is
    added in proportion to the uncertainty (1 - confidence) of the top-ranked
    hypothesis. *family* is currently unused — presumably reserved for
    family-specific weighting; kept to preserve the call signature.
    """
    if not bottleneck_hypotheses:
        return 0.08
    leading_confidence = _as_float(bottleneck_hypotheses[0].get("confidence"))
    residual_uncertainty = max(0.0, 1.0 - leading_confidence)
    return 0.08 + min(0.12, residual_uncertainty * 0.12)
|
||||||
|
|
||||||
|
|
||||||
|
def _hypothesis_confidence(
    bottleneck_hypotheses: list[dict[str, Any]],
    name: str,
) -> float:
    """Look up the confidence for hypothesis *name*; 0.0 when not present."""
    matches = (entry for entry in bottleneck_hypotheses if entry.get("name") == name)
    hit = next(matches, None)
    if hit is None:
        return 0.0
    return _as_float(hit.get("confidence"))
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_action_id(
|
||||||
|
current_tp: int,
|
||||||
|
current_dp: int,
|
||||||
|
point: dict[str, Any],
|
||||||
|
) -> str:
|
||||||
|
candidate_tp = int(point["tensor-parallel-size"])
|
||||||
|
candidate_dp = int(point["data-parallel-size"])
|
||||||
|
if candidate_tp > current_tp:
|
||||||
|
return "topology_frontier_probe_for_slo_pressure"
|
||||||
|
if candidate_dp > current_dp:
|
||||||
|
return "increase_data_parallel_probe"
|
||||||
|
if candidate_tp < current_tp:
|
||||||
|
return "decrease_tensor_parallel_probe"
|
||||||
|
return "redistribute_topology_probe"
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_hypothesis(
|
||||||
|
top_bottleneck: str,
|
||||||
|
*,
|
||||||
|
current_tp: int,
|
||||||
|
current_dp: int,
|
||||||
|
candidate_tp: int,
|
||||||
|
candidate_dp: int,
|
||||||
|
) -> str:
|
||||||
|
return (
|
||||||
|
f"Ranked bottleneck is {top_bottleneck}. Test topology "
|
||||||
|
f"TP={candidate_tp}, DP={candidate_dp} against incumbent TP={current_tp}, "
|
||||||
|
f"DP={current_dp}; this distinguishes compute-latency relief from "
|
||||||
|
"replica/admission effects under the configured SLO."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _log2_ratio(new: int, old: int) -> float:
|
||||||
|
if new <= 0 or old <= 0:
|
||||||
|
return 0.0
|
||||||
|
ratio = new / old
|
||||||
|
steps = 0.0
|
||||||
|
while ratio >= 2.0:
|
||||||
|
steps += 1.0
|
||||||
|
ratio /= 2.0
|
||||||
|
while ratio <= 0.5:
|
||||||
|
steps += 1.0
|
||||||
|
ratio *= 2.0
|
||||||
|
return steps
|
||||||
|
|
||||||
|
|
||||||
def _topology_frontier_status(
|
def _topology_frontier_status(
|
||||||
study: StudySpec,
|
study: StudySpec,
|
||||||
state: StudyState,
|
state: StudyState,
|
||||||
|
|||||||
@@ -920,6 +920,170 @@ class CoreFlowTests(unittest.TestCase):
|
|||||||
"topology_frontier_probe_for_slo_pressure",
|
"topology_frontier_probe_for_slo_pressure",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_profile_driven_planner_scores_unmeasured_tp_frontier(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
tmp_path = Path(tmp)
|
||||||
|
study_path = _write_study_assets(
|
||||||
|
tmp_path,
|
||||||
|
engine_overrides={
|
||||||
|
"tunable_flags": [
|
||||||
|
"tensor-parallel-size",
|
||||||
|
"max-num-batched-tokens",
|
||||||
|
"enable-chunked-prefill",
|
||||||
|
],
|
||||||
|
"topology_constraints": {
|
||||||
|
"allowed_tensor_parallel_sizes": [1, 2, 4],
|
||||||
|
"allowed_tp_dp_products": [1, 2, 4],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
result_path = tmp_path / "trial-0002.json"
|
||||||
|
result_path.write_text(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"status": "completed",
|
||||||
|
"best_sampling_u": 0.5,
|
||||||
|
"best_request_rate": 2.0,
|
||||||
|
"best_pass_rate": 0.96,
|
||||||
|
"probes": [
|
||||||
|
{
|
||||||
|
"threshold": 0.75,
|
||||||
|
"feasible": False,
|
||||||
|
"payload": {
|
||||||
|
"request_count": 100,
|
||||||
|
"pass_rate": 0.6,
|
||||||
|
"request_rate": 3.0,
|
||||||
|
"early_stop_reason": "slo_pass_rate_unrecoverable",
|
||||||
|
"latency_summary": {
|
||||||
|
"failed_reason_counts": {"ttft_ms>4000.0": 35}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
study = load_study_spec(study_path)
|
||||||
|
context = build_harness_context(
|
||||||
|
study=study,
|
||||||
|
window_summary={"prompt_tokens_p95": 7628, "prompt_tail_ratio_p95_p50": 3.8},
|
||||||
|
state=StudyState(
|
||||||
|
study_id=study.study_id,
|
||||||
|
best_trial_id="trial-0002",
|
||||||
|
best_request_rate=2.0,
|
||||||
|
best_request_rate_per_gpu=1.0,
|
||||||
|
trials=[
|
||||||
|
TrialSummary(
|
||||||
|
trial_id="trial-0001",
|
||||||
|
status="completed",
|
||||||
|
best_request_rate=0.5,
|
||||||
|
best_request_rate_per_gpu=0.5,
|
||||||
|
config_patch={"env_patch": {}, "flag_patch": {}},
|
||||||
|
),
|
||||||
|
TrialSummary(
|
||||||
|
trial_id="trial-0002",
|
||||||
|
status="completed",
|
||||||
|
best_request_rate=2.0,
|
||||||
|
best_request_rate_per_gpu=1.0,
|
||||||
|
result_path=str(result_path),
|
||||||
|
config_patch={
|
||||||
|
"env_patch": {},
|
||||||
|
"flag_patch": {"tensor-parallel-size": 2},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
plan = context["experiment_plan"]
|
||||||
|
self.assertEqual(plan["planner_version"], "profile-driven-v1")
|
||||||
|
self.assertEqual(plan["next_action"]["knob_family"], "topology")
|
||||||
|
self.assertEqual(
|
||||||
|
plan["next_action"]["config_patch"]["flag_patch"],
|
||||||
|
{"tensor-parallel-size": 4},
|
||||||
|
)
|
||||||
|
self.assertIn("ttft_prefill", context["bottleneck_hypotheses"][0]["name"])
|
||||||
|
self.assertFalse(context["harness_stop"]["should_stop"])
|
||||||
|
|
||||||
|
def test_profile_driven_planner_prefers_decode_concurrency_relief(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
tmp_path = Path(tmp)
|
||||||
|
study_path = _write_study_assets(
|
||||||
|
tmp_path,
|
||||||
|
trace_overrides={"request_mode": "decode_only"},
|
||||||
|
slo_overrides={
|
||||||
|
"ttft_rule": None,
|
||||||
|
"tpot_rule": {"kind": "fixed_ms", "threshold_ms": 20},
|
||||||
|
},
|
||||||
|
engine_overrides={
|
||||||
|
"base_flags": {
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
"port": 8000,
|
||||||
|
"tensor-parallel-size": 4,
|
||||||
|
"max-num-seqs": 64,
|
||||||
|
},
|
||||||
|
"tunable_flags": ["tensor-parallel-size", "max-num-seqs"],
|
||||||
|
"topology_constraints": {
|
||||||
|
"allowed_tensor_parallel_sizes": [1, 2, 4],
|
||||||
|
"allowed_tp_dp_products": [1, 2, 4],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
result_path = tmp_path / "trial-0001.json"
|
||||||
|
result_path.write_text(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"status": "completed",
|
||||||
|
"best_sampling_u": 0.25,
|
||||||
|
"best_request_rate": 1.0,
|
||||||
|
"best_pass_rate": 0.97,
|
||||||
|
"probes": [
|
||||||
|
{
|
||||||
|
"threshold": 0.5,
|
||||||
|
"feasible": False,
|
||||||
|
"payload": {
|
||||||
|
"request_count": 100,
|
||||||
|
"pass_rate": 0.5,
|
||||||
|
"request_rate": 2.0,
|
||||||
|
"early_stop_reason": "slo_pass_rate_unrecoverable",
|
||||||
|
"latency_summary": {
|
||||||
|
"failed_reason_counts": {"tpot_ms>20.0": 50}
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
study = load_study_spec(study_path)
|
||||||
|
context = build_harness_context(
|
||||||
|
study=study,
|
||||||
|
window_summary={},
|
||||||
|
state=StudyState(
|
||||||
|
study_id=study.study_id,
|
||||||
|
best_trial_id="trial-0001",
|
||||||
|
best_request_rate=1.0,
|
||||||
|
best_request_rate_per_gpu=0.25,
|
||||||
|
trials=[
|
||||||
|
TrialSummary(
|
||||||
|
trial_id="trial-0001",
|
||||||
|
status="completed",
|
||||||
|
best_request_rate=1.0,
|
||||||
|
best_request_rate_per_gpu=0.25,
|
||||||
|
result_path=str(result_path),
|
||||||
|
config_patch={"env_patch": {}, "flag_patch": {}},
|
||||||
|
)
|
||||||
|
],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
plan = context["experiment_plan"]
|
||||||
|
self.assertEqual(plan["next_action"]["knob_family"], "max-num-seqs")
|
||||||
|
self.assertEqual(
|
||||||
|
plan["next_action"]["config_patch"]["flag_patch"],
|
||||||
|
{"max-num-seqs": 32},
|
||||||
|
)
|
||||||
|
|
||||||
def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
|
def test_harness_stop_blocked_until_slo_driven_topology_frontier_is_measured(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
tmp_path = Path(tmp)
|
tmp_path = Path(tmp)
|
||||||
|
|||||||
Reference in New Issue
Block a user