Add harness early stop ablation
configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18230,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18230,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": true,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}
configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18231,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18231,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": false,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}
@@ -26,11 +26,11 @@ The harness turns each LLM proposal from open-ended config search into a bottleneck
 - `gpu-memory-utilization`: memory headroom after topology and batching are stable.
 - Each family has `use_when`, `procedure`, `guards`, and `active_now` fields.
 
-4. Proposal discipline
+4. Proposal discipline and early stop
 - The prompt requires the LLM to choose at most one primary knob family unless history proves a coupled change is needed.
 - It must use adjacent legal topology choices and stay inside topology constraints.
 - It receives tested config signatures, so it should not repeat already-tried configs.
-- It can return `should_stop=true` when no adjacent harness-guided probe is justified.
+- A deterministic harness stop can now emit `should_stop=true` before calling the LLM when completed validation evidence says another trial is not justified.
 
 5. Baseline-first loop
 - LLM-driven `study tune` now evaluates the initial engine config first unless `--skip-baseline` is passed.
@@ -44,10 +44,10 @@ The speedup comes from reducing wasted proposal families, not from changing the
 - For long-prompt, low-cache-reuse windows, the harness activates the TP harness before speculative runtime knobs.
 - Example: qwen27b 0-8k chat reached `TP=2, DP=1` at iter 2 under harness replay, while the original run spent iter 2 on `DP=2` and iter 3 on `DP=4`.
 
-2. Guarded stop after a strong incumbent
+2. Guarded stop after validation, not immediately after a strong incumbent
 - If the newest trial is the incumbent and improves per-GPU throughput by at least `1.8x` over baseline, the harness requires direct evidence before trying runtime-only tweaks.
-- Without that guard, the LLM still proposed weak MBT trials after finding the qwen27b best config.
-- With the guard, it emits `should_stop=true`.
+- It does not stop at the first large gain. It requires post-incumbent validation trials across nearby topology/runtime families, and stops only if those trials fail to produce a feasible per-GPU improvement.
+- With the guard, `study tune` can write a `harness-stop-XXXX` proposal and exit without spending another GPU trial.
 
 3. All-infeasible plateau detection
 - When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family.
@@ -78,6 +78,6 @@ Result:
 
 ## Current Risks
 
-- The harness is prompt-guided, not a hard verifier for every rule. If future LLM outputs ignore a fired guard, proposal validation should reject the blocked family explicitly.
-- Strong-incumbent stopping is intentionally biased toward fewer GPU trials once a large gain is already reached. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
+- The harness is still prompt-guided for choosing the next non-stop proposal. The deterministic stop path is hard-coded in `study tune`, but proposal-family blocking is not yet enforced by a separate validator.
+- Strong-incumbent stopping is intentionally biased toward fewer GPU trials after validation evidence accumulates. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
 - Full fresh reruns on large models are expensive. Strict replay is useful for measuring proposal-path improvements when the proposed configs already exist in prior measured runs, but publication-quality claims still need fresh no-relaunch runs when time allows.
@@ -0,0 +1,93 @@
# Qwen3-30B-A3B Community vLLM Harness Ablation, 2026-05-02

## Goal

Run a fresh dash0 experiment on the latest community vLLM release with the local copy of the community model:

`/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`

The comparison is:

| Variant | Spec | Harness |
| --- | --- | --- |
| no-harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json` | disabled via `llm.use_harness=false` |
| harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json` | enabled, including the deterministic stop proposal |

Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix caching, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model.
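
As a sanity check, the baseline launch command falls out of those fields directly. A minimal sketch, assuming the harness spec path from the table above (aituner's real launcher may assemble and quote the command differently):

```python
import json
import shlex

# Sketch: rebuild the baseline trial's launch command from the study spec.
# Only serving access fields are present, so this is the community-default launch.
spec_path = "configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json"
with open(spec_path, encoding="utf-8") as fh:
    engine = json.load(fh)["engine"]

argv = [engine["exec_path"], *engine["launch_args"]]
for flag, value in engine["base_flags"].items():
    argv += [f"--{flag}", str(value)]

# vllm serve /home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B \
#   --host 127.0.0.1 --port 18230 --served-model-name qwen3-30b-a3b-community
print(shlex.join(argv))
```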

## vLLM Install

PyPI reports `vllm==0.20.0` as the current community release, checked on 2026-05-02. The dash0 installation target is:

`/home/admin/cpfs/wjh/venvs/vllm-0.20.0`

Install log:

`/home/admin/cpfs/wjh/aituner/aituner/logs/install_vllm_0.20.0_20260502.log`

## Workload

The experiment reuses the 0-8k chat window that has already been used for the qwen27b harness work:

| Field | Value |
| --- | --- |
| window | `chat_w20260311_1000` |
| source rows | 32606 |
| input filter | 0 to 8192 tokens |
| max requests per probe | 2048 |
| target pass rate | 0.95 |
| TTFT SLO | 2s up to 4k input tokens, 4s up to 32k, 6s above |
| TPOT SLO | 50ms |
| search high | 0.125 sampling_u |
| max probes per trial | 6 |

The `max_requests_per_probe=2048` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and a binary-search threshold probe.
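
For reference, the probe's control flow is roughly the sketch below; `run_probe` is a hypothetical stand-in for the real trace replay that returns the measured SLO pass rate (scored against the step TTFT buckets and the 50ms TPOT rule), and the constants mirror the study's `search` block:

```python
# Sketch: binary search for the largest feasible sampling_u. Each probe replays
# up to max_requests_per_probe trace requests at threshold u and scores them
# against the SLO; feasible means pass rate >= target_pass_rate.
def max_feasible_sampling_u(run_probe, low=0.0, high=0.125,
                            tolerance=0.001, max_probes=6,
                            target_pass_rate=0.95):
    best = None
    for _ in range(max_probes):
        if high - low <= tolerance:
            break
        mid = (low + high) / 2.0
        if run_probe(mid) >= target_pass_rate:
            best = mid   # feasible: try a higher threshold
            low = mid
        else:
            high = mid   # infeasible: back off
    return best          # None if no probed threshold met the SLO
```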

## Harness Update Under Test

This run tests a stricter early-stop harness:

- The harness still injects L-C-A workload features, recent trial diagnostics, active bottleneck, legal topology candidates, tested signatures, and knob-family rules.
- A strong incumbent no longer means immediate stop. It means "validate nearby alternatives".
- Deterministic stop is allowed only after completed validation evidence says continuing is unlikely to be useful:
  - the incumbent beats baseline by a generic large-gain ratio,
  - at least two post-incumbent validation trials have run,
  - those validation trials did not produce a feasible per-GPU improvement,
  - the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts.
- If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal.

This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number.
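
Condensed into code, the stop predicate is roughly the sketch below. The authoritative version is `_validation_exhausted_guard` in this commit's `harness.py`; the flat arguments here are a simplification of the trial-history bookkeeping it does:

```python
# Sketch of the generic validation-exhausted stop rule. `gain` is incumbent
# request_rate_per_gpu over baseline; `validation_trials` are the completed or
# failed trials run after the incumbent; `families` are the knob families
# ("topology", "runtime", ...) those trials touched.
STRONG_INCUMBENT_MIN_GAIN = 1.8
MIN_VALIDATION_TRIALS = 2
TRIALS_WITHOUT_FAMILY_COVERAGE = 3

def validation_exhausted(gain: float, validation_trials: list[dict],
                         families: set[str]) -> bool:
    if gain < STRONG_INCUMBENT_MIN_GAIN:
        return False  # incumbent not strong enough for a validation-based stop
    if len(validation_trials) < MIN_VALIDATION_TRIALS:
        return False  # a strong incumbent alone never stops the study
    if any(t.get("best_request_rate_per_gpu") is not None for t in validation_trials):
        return False  # some validation probe was feasible; keep searching
    covered = "topology" in families and "runtime" in families
    return covered or len(validation_trials) >= TRIALS_WITHOUT_FAMILY_COVERAGE
```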

## Unit Tests

Local test command:

```bash
PYTHONPATH=src python3 -m unittest tests.test_core_flow -q
```

Result: passed, 74 tests.

The added coverage checks:

| Test | Purpose |
| --- | --- |
| `test_harness_does_not_stop_immediately_after_strong_incumbent` | strong incumbent requires validation first |
| `test_harness_stop_after_post_incumbent_validation_is_exhausted` | deterministic stop after validation exhaustion |
| `test_cli_tune_uses_harness_stop_before_llm` | `study tune` can stop without calling the LLM or launching another GPU trial |
| `test_prompt_can_disable_harness_for_ablation` | no-harness prompt removes structured harness context |

## Experiment Tracking

Pending dash0 runs:

| Variant | tmux session | Log | Study root |
| --- | --- | --- | --- |
| no-harness | `qwen30b_vllm020_noharness_20260502` | `logs/qwen30b_vllm020_noharness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness` |
| harness | `qwen30b_vllm020_harness_20260502` | `logs/qwen30b_vllm020_harness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness` |

The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point.
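
A minimal sketch of that judgment, assuming each study root yields its trial summaries in execution order with a `best_request_rate_per_gpu` field:

```python
# Sketch: best-so-far request_rate_per_gpu after each tuning iteration,
# the curve on which the harness and no-harness runs will be compared.
def best_so_far_curve(trials: list[dict]) -> list[float]:
    best, curve = 0.0, []
    for trial in trials:
        best = max(best, trial.get("best_request_rate_per_gpu") or 0.0)
        curve.append(best)
    return curve
```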

## Results

Pending. This section will be filled after the dash0 experiments finish.
src/aituner/cli.py
@@ -6,9 +6,10 @@ import sys
 from pathlib import Path
 
 from .compare import run_compare
+from .harness import build_harness_context, build_harness_stop_proposal
 from .job import append_job, build_trial_job
 from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text
-from .spec import Proposal, SpecError, load_study_spec
+from .spec import Proposal, SpecError, load_study_spec, to_jsonable
 from .store import StudyStore
 from .trace import load_trace_requests, summarize_window
 from .worker import run_trial
@@ -118,16 +119,23 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
         raise SpecError("max_trials must be positive")
     if proposal_files and max_trials > len(proposal_files):
         max_trials = len(proposal_files)
-    if not proposal_files and study.llm.endpoint is None:
-        raise SpecError("No proposal files provided and study.llm.endpoint is not configured")
 
     executed: list[dict[str, object]] = []
     for idx in range(max_trials):
         state = store.load_state(study.study_id)
         window, requests = load_trace_requests(study, study_spec_path=spec_path)
+        window_summary = summarize_window(requests, window)
+        harness_context = (
+            build_harness_context(
+                study=study,
+                window_summary=window_summary,
+                state=state,
+            )
+            if study.llm.use_harness
+            else None
+        )
         prompt = build_prompt(
             study=study,
-            window_summary=summarize_window(requests, window),
+            window_summary=window_summary,
             state=state,
             capability_profile=capability_profile,
         )
@@ -162,18 +170,36 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
             proposal_name = proposal_source.stem
         else:
             proposal_source = None
-            proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
-            proposal_name = f"proposal-{state.next_trial_index:04d}"
+            stop_proposal = (
+                build_harness_stop_proposal(harness_context)
+                if harness_context is not None
+                else None
+            )
+            if stop_proposal is not None:
+                proposal_text = json.dumps(to_jsonable(stop_proposal), ensure_ascii=False)
+                proposal_name = f"harness-stop-{state.next_trial_index:04d}"
+            else:
+                if study.llm.endpoint is None:
+                    raise SpecError(
+                        "No proposal files provided, study.llm.endpoint is not configured, "
+                        "and the harness stop guard did not fire."
+                    )
+                proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+                proposal_name = f"proposal-{state.next_trial_index:04d}"
         raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
         raw_proposal_path.write_text(proposal_text, encoding="utf-8")
         proposal = parse_proposal_text(proposal_text, study)
         store.write_proposal(study.study_id, proposal_name, proposal)
         if proposal.should_stop:
+            if proposal_name.startswith("harness-stop-"):
+                proposal_source_label = "harness"
+            else:
+                proposal_source_label = str(proposal_source) if proposal_source else "llm"
             executed.append(
                 {
                     "trial_id": None,
                     "proposal_name": proposal_name,
-                    "proposal_source": str(proposal_source) if proposal_source else "llm",
+                    "proposal_source": proposal_source_label,
                     "stopped": True,
                     "diagnosis": proposal.diagnosis,
                     "state_best_trial_id": state.best_trial_id,
src/aituner/harness.py
@@ -4,7 +4,25 @@ import json
 from pathlib import Path
 from typing import Any
 
-from .spec import StudySpec, StudyState, TrialSummary
+from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary
+
+
+_TOPOLOGY_KEYS = {
+    "tensor-parallel-size",
+    "data-parallel-size",
+    "expert-parallel-size",
+    "enable-expert-parallel",
+}
+_RUNTIME_KEYS = {
+    "max-num-seqs",
+    "max-num-batched-tokens",
+    "block-size",
+    "gpu-memory-utilization",
+    "enable-chunked-prefill",
+}
+_STRONG_INCUMBENT_MIN_GAIN = 1.8
+_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
+_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
 
 
 def build_harness_context(
@@ -23,11 +41,39 @@ def build_harness_context(
         "workload_lca_profile": _workload_lca_profile(window_summary),
         "recent_trial_diagnostics": recent_diagnostics,
         "convergence_guard": _convergence_guard(state, recent_diagnostics),
+        "harness_stop": _harness_stop_decision(state, recent_diagnostics),
         "knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
         "proposal_rules": _proposal_rules(),
     }
 
 
+def build_harness_stop_proposal(context: dict[str, Any]) -> Proposal | None:
+    stop = context.get("harness_stop")
+    if not isinstance(stop, dict) or not stop.get("should_stop"):
+        return None
+    reason = str(stop.get("reason") or "harness_converged")
+    evidence = stop.get("evidence") if isinstance(stop.get("evidence"), dict) else {}
+    observation = (
+        "Harness convergence guard triggered before requesting another proposal: "
+        f"{reason}."
+    )
+    diagnosis = str(evidence.get("summary") or reason)
+    return Proposal(
+        observation=observation,
+        diagnosis=diagnosis,
+        config_patch=ConfigPatch(env_patch={}, flag_patch={}),
+        expected_effects=[
+            "stop without spending another GPU trial",
+            "preserve the current best observed configuration",
+        ],
+        why_not_previous_failures=(
+            "The stop decision is based on completed validation evidence and does not "
+            "repeat any failed configuration."
+        ),
+        should_stop=True,
+    )
+
+
 def render_harness_context(context: dict[str, Any]) -> str:
     return json.dumps(context, ensure_ascii=False, indent=2)
@@ -423,6 +469,158 @@ def _convergence_guard(
     }
 
 
+def _harness_stop_decision(
+    state: StudyState,
+    recent_diagnostics: list[dict[str, Any]],
+) -> dict[str, Any]:
+    guard = _convergence_guard(state, recent_diagnostics)
+    if guard["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]:
+        return {
+            "should_stop": True,
+            "reason": guard["reason"],
+            "evidence": {
+                "summary": "The convergence guard fired and no further adjacent probe is required.",
+                "convergence_guard": guard,
+            },
+        }
+    validation = _validation_exhausted_guard(state, recent_diagnostics)
+    if validation["exhausted"]:
+        return {
+            "should_stop": True,
+            "reason": validation["reason"],
+            "evidence": validation,
+        }
+    return {
+        "should_stop": False,
+        "reason": "continue_harness_guided_search",
+        "evidence": {
+            "summary": "No deterministic harness stop condition is satisfied.",
+            "convergence_guard": guard,
+            "validation_exhausted": validation,
+        },
+    }
+
+
+def _validation_exhausted_guard(
+    state: StudyState,
+    recent_diagnostics: list[dict[str, Any]],
+) -> dict[str, Any]:
+    default = {
+        "exhausted": False,
+        "reason": "validation_not_exhausted",
+        "summary": "Validation probes are not sufficient to stop yet.",
+        "incumbent_trial_id": state.best_trial_id,
+        "incumbent_gain_vs_baseline": None,
+        "validation_trial_ids": [],
+        "validation_families": [],
+    }
+    if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
+        return default
+    completed = [
+        item
+        for item in recent_diagnostics
+        if item.get("status") == "completed"
+        and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
+    ]
+    if not completed:
+        return default
+    baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
+    incumbent_rate = _as_float(state.best_request_rate_per_gpu)
+    if baseline_rate <= 0 or incumbent_rate <= 0:
+        return default
+    gain = incumbent_rate / baseline_rate
+    if gain < _STRONG_INCUMBENT_MIN_GAIN:
+        return {
+            **default,
+            "reason": "incumbent_gain_not_large_enough_for_validation_stop",
+            "incumbent_gain_vs_baseline": gain,
+        }
+
+    best_index = next(
+        (
+            index
+            for index, item in enumerate(recent_diagnostics)
+            if item.get("trial_id") == state.best_trial_id
+        ),
+        None,
+    )
+    if best_index is None:
+        return {
+            **default,
+            "reason": "incumbent_not_in_recent_harness_history",
+            "incumbent_gain_vs_baseline": gain,
+        }
+    after_best = [
+        item
+        for item in recent_diagnostics[best_index + 1 :]
+        if item.get("status") in {"completed", "failed"}
+    ]
+    if len(after_best) < _MIN_POST_INCUMBENT_VALIDATION_TRIALS:
+        return {
+            **default,
+            "reason": "need_at_least_two_post_incumbent_validation_trials",
+            "incumbent_gain_vs_baseline": gain,
+            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+        }
+    if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best):
+        return {
+            **default,
+            "reason": "post_incumbent_validation_found_feasible_candidate",
+            "incumbent_gain_vs_baseline": gain,
+            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+        }
+
+    families: set[str] = set()
+    for item in after_best:
+        families.update(_validation_families(item))
+    has_topology = "topology" in families
+    has_runtime = bool(families & {"runtime", "max-num-seqs", "max-num-batched-tokens"})
+    enough_evidence = (
+        len(after_best) >= _VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE
+        or (has_topology and has_runtime)
+    )
+    if not enough_evidence:
+        return {
+            **default,
+            "reason": "post_incumbent_validation_has_not_covered_enough_families",
+            "incumbent_gain_vs_baseline": gain,
+            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+            "validation_families": sorted(families),
+        }
+
+    return {
+        "exhausted": True,
+        "reason": "post_incumbent_validation_exhausted",
+        "summary": (
+            "A strong incumbent was followed by validation probes across nearby "
+            "topology/runtime families, and none produced a feasible candidate."
+        ),
+        "incumbent_trial_id": state.best_trial_id,
+        "incumbent_gain_vs_baseline": gain,
+        "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+        "validation_families": sorted(families),
+    }
+
+
+def _validation_families(item: dict[str, Any]) -> set[str]:
+    config_patch = item.get("config_patch")
+    if not isinstance(config_patch, dict):
+        return set()
+    flag_patch = config_patch.get("flag_patch")
+    if not isinstance(flag_patch, dict):
+        return set()
+    families: set[str] = set()
+    if any(key in flag_patch for key in _TOPOLOGY_KEYS):
+        families.add("topology")
+    for key in _RUNTIME_KEYS:
+        if key in flag_patch:
+            families.add("runtime")
+            families.add(key)
+    if not families and flag_patch:
+        families.add("other")
+    return families
+
+
 def _strong_incumbent_guard(
     state: StudyState,
     recent_diagnostics: list[dict[str, Any]],
src/aituner/llm.py
@@ -312,23 +312,45 @@ def build_prompt(
         "",
         "Tested config signatures:",
         json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
-        "",
-        "Harnesses:",
-        render_harness_context(
-            build_harness_context(
-                study=study,
-                window_summary=window_summary,
-                state=state,
-            )
-        ),
-        "",
-        "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
-        "The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
-        "The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
-        "If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
-        "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
-        "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
-    ]
+    ]
+    if study.llm.use_harness:
+        sections.extend(
+            [
+                "",
+                "Harnesses:",
+                render_harness_context(
+                    build_harness_context(
+                        study=study,
+                        window_summary=window_summary,
+                        state=state,
+                    )
+                ),
+                "",
+            ]
+        )
+    else:
+        sections.extend(
+            [
+                "",
+                "Harnesses:",
+                "Disabled by llm.use_harness=false for ablation.",
+                "",
+            ]
+        )
+    sections.extend(
+        [
+            "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
+            "The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
+            "The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
+            "If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
+            "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
+            (
+                "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
+                if study.llm.use_harness
+                else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
+            ),
+        ]
+    )
     return "\n".join(sections)
src/aituner/spec.py
@@ -576,6 +576,7 @@ class LLMPolicySpec:
     endpoint: LLMEndpointSpec | None
     system_prompt: str
    max_history_trials: int
+    use_harness: bool = True
 
     @classmethod
     def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec":
@@ -593,6 +594,11 @@ class LLMPolicySpec:
             max_history_trials=_require_int(
                 payload.get("max_history_trials", 8), context="llm.max_history_trials"
             ),
+            use_harness=(
+                _require_bool(payload.get("use_harness"), context="llm.use_harness")
+                if payload.get("use_harness") is not None
+                else True
+            ),
         )
tests/test_core_flow.py
@@ -13,7 +13,7 @@ from aituner.compare import load_compare_spec, run_compare
 from aituner.engine import build_launch_recipe
 from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
 from aituner.job import append_job, build_trial_job
-from aituner.harness import build_harness_context
+from aituner.harness import build_harness_context, build_harness_stop_proposal
 from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
 from aituner.search import ThresholdProbe, binary_search_max_feasible
 from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
@@ -422,6 +422,119 @@ class CoreFlowTests(unittest.TestCase):
         )
         self.assertIn("validate", guard["recommended_next_action"])
 
+    def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0002",
+                best_parallel_size=8,
+                best_sampling_u=0.02,
+                best_request_rate=2.4,
+                best_request_rate_per_gpu=0.3,
+                trials=[
+                    TrialSummary(
+                        trial_id="trial-0001",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=0.8,
+                        best_request_rate_per_gpu=0.1,
+                        config_patch={"env_patch": {}, "flag_patch": {}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0002",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=2.4,
+                        best_request_rate_per_gpu=0.3,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {
+                                "tensor-parallel-size": 2,
+                                "data-parallel-size": 4,
+                            },
+                        },
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0003",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {
+                                "tensor-parallel-size": 1,
+                                "data-parallel-size": 8,
+                            },
+                        },
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0004",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {"max-num-seqs": 160},
+                        },
+                    ),
+                ],
+            )
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 2048},
+                state=state,
+            )
+            self.assertTrue(context["harness_stop"]["should_stop"])
+            self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
+            proposal = build_harness_stop_proposal(context)
+            self.assertIsNotNone(proposal)
+            self.assertTrue(proposal.should_stop)
+
+    def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0002",
+                best_parallel_size=8,
+                best_request_rate=2.4,
+                best_request_rate_per_gpu=0.3,
+                trials=[
+                    TrialSummary(
+                        trial_id="trial-0001",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=0.8,
+                        best_request_rate_per_gpu=0.1,
+                        config_patch={"env_patch": {}, "flag_patch": {}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0002",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=2.4,
+                        best_request_rate_per_gpu=0.3,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {
+                                "tensor-parallel-size": 2,
+                                "data-parallel-size": 4,
+                            },
+                        },
+                    ),
+                ],
+            )
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 2048},
+                state=state,
+            )
+            self.assertFalse(context["harness_stop"]["should_stop"])
+            self.assertIsNone(build_harness_stop_proposal(context))
+
     def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -564,6 +677,26 @@ class CoreFlowTests(unittest.TestCase):
             "\n".join(context["proposal_rules"]),
         )
 
+    def test_prompt_can_disable_harness_for_ablation(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["use_harness"] = False
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summarize_window(requests, window),
+                state=StudyState(study_id=study.study_id),
+                capability_profile=None,
+            )
+            self.assertFalse(study.llm.use_harness)
+            self.assertIn("Disabled by llm.use_harness=false", prompt)
+            self.assertNotIn('"paper_alignment"', prompt)
+            self.assertIn("without harness hints", prompt)
+
     def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -2299,6 +2432,98 @@ class CoreFlowTests(unittest.TestCase):
         state = store.load_state("study-1")
         self.assertEqual(state.next_trial_index, 1)
 
+    def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store_root = tmp_path / "store"
+            store = StudyStore(store_root)
+            store.init_study(spec_path=study_path, study=study)
+            store.save_state(
+                StudyState(
+                    study_id=study.study_id,
+                    best_trial_id="trial-0002",
+                    best_parallel_size=8,
+                    best_sampling_u=0.02,
+                    best_request_rate=2.4,
+                    best_request_rate_per_gpu=0.3,
+                    next_trial_index=5,
+                    trials=[
+                        TrialSummary(
+                            trial_id="trial-0001",
+                            status="completed",
+                            parallel_size=8,
+                            best_request_rate=0.8,
+                            best_request_rate_per_gpu=0.1,
+                            config_patch={"env_patch": {}, "flag_patch": {}},
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0002",
+                            status="completed",
+                            parallel_size=8,
+                            best_request_rate=2.4,
+                            best_request_rate_per_gpu=0.3,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {
+                                    "tensor-parallel-size": 2,
+                                    "data-parallel-size": 4,
+                                },
+                            },
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0003",
+                            status="completed",
+                            parallel_size=8,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {
+                                    "tensor-parallel-size": 1,
+                                    "data-parallel-size": 8,
+                                },
+                            },
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0004",
+                            status="completed",
+                            parallel_size=8,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {"max-num-seqs": 160},
+                            },
+                        ),
+                    ],
+                )
+            )
+
+            with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock:
+                with mock.patch("aituner.cli.run_trial") as run_trial_mock:
+                    exit_code = cli_main(
+                        [
+                            "study",
+                            "tune",
+                            "--spec",
+                            str(study_path),
+                            "--store-root",
+                            str(store_root),
+                            "--max-trials",
+                            "1",
+                        ]
+                    )
+
+            self.assertEqual(exit_code, 0)
+            llm_mock.assert_not_called()
+            run_trial_mock.assert_not_called()
+            proposal_path = (
+                store.study_root(study.study_id)
+                / "proposals"
+                / "harness-stop-0005.json"
+            )
+            self.assertTrue(proposal_path.exists())
+            proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
+            self.assertTrue(proposal["should_stop"])
+
     def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)