Add harness early stop ablation

2026-05-02 08:08:14 +08:00
parent 6d3459c82d
commit 1a3d628268
9 changed files with 837 additions and 29 deletions

View File

@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18230,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18230,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": true,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}

View File

@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18231,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18231,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": false,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}

View File

@@ -26,11 +26,11 @@ The harness turns each LLM proposal from open-ended config search into a bottlen
- `gpu-memory-utilization`: memory headroom after topology and batching are stable.
- Each family has `use_when`, `procedure`, `guards`, and `active_now` fields.
-4. Proposal discipline
4. Proposal discipline and early stop
- The prompt requires the LLM to choose at most one primary knob family unless history proves a coupled change is needed.
- It must use adjacent legal topology choices and stay inside topology constraints.
- It receives tested config signatures, so it should not repeat already-tried configs.
- It can return `should_stop=true` when no adjacent harness-guided probe is justified.
- A deterministic harness stop can now emit `should_stop=true` before calling the LLM when completed validation evidence says another trial is not justified.
5. Baseline-first loop
- LLM-driven `study tune` now evaluates the initial engine config first unless `--skip-baseline` is passed.
@@ -44,10 +44,10 @@ The speedup comes from reducing wasted proposal families, not from changing the
- For long-prompt, low-cache-reuse windows, the harness activates the TP harness before speculative runtime knobs.
- Example: qwen27b 0-8k chat reached `TP=2, DP=1` at iter 2 under harness replay, while the original run spent iter 2 on `DP=2` and iter 3 on `DP=4`.
-2. Guarded stop after a strong incumbent
2. Guarded stop after validation, not immediately after a strong incumbent
- If the newest trial is the incumbent and improves per-GPU throughput by at least `1.8x` over baseline, the harness requires direct evidence before trying runtime-only tweaks.
- Without that guard, the LLM still proposed weak MBT trials after finding the qwen27b best config.
-- With the guard, it emits `should_stop=true`.
- It does not stop at the first large gain. It requires post-incumbent validation trials across nearby topology/runtime families, and stops only if those trials fail to produce a feasible per-GPU improvement.
- With the guard, `study tune` can write a `harness-stop-XXXX` proposal and exit without spending another GPU trial.
3. All-infeasible plateau detection
- When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family.
@@ -78,6 +78,6 @@ Result:
## Current Risks
-- The harness is prompt-guided, not a hard verifier for every rule. If future LLM outputs ignore a fired guard, proposal validation should reject the blocked family explicitly.
-- Strong-incumbent stopping is intentionally biased toward fewer GPU trials once a large gain is already reached. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
- The harness is still prompt-guided for choosing the next non-stop proposal. The deterministic stop path is hard-coded in `study tune`, but proposal-family blocking is not yet enforced by a separate validator.
- Strong-incumbent stopping is intentionally biased toward fewer GPU trials after validation evidence accumulates. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
- Full fresh reruns on large models are expensive. Strict replay is useful for measuring proposal-path improvements when the proposed configs already exist in prior measured runs, but publication-quality claims still need fresh no-relaunch runs when time allows.

View File

@@ -0,0 +1,93 @@
# Qwen3-30B-A3B Community vLLM Harness Ablation, 2026-05-02
## Goal
Run a fresh dash0 experiment on the latest community vLLM release with the local community model:
`/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`
The comparison is:
| Variant | Spec | Harness |
| --- | --- | --- |
| no-harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json` | disabled via `llm.use_harness=false` |
| harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json` | enabled, including deterministic stop proposal |
Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix cache, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model.
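For concreteness, here is a minimal sketch of how `launch_args` plus `base_flags` could expand into the serve invocation. This is not aituner's recipe builder (which is not shown in this commit); the flag-to-CLI conversion rule is an assumption for illustration.
```python
# Hypothetical helper, for illustration only: booleans become bare
# switches, everything else a "--flag value" pair.
def build_serve_command(exec_path: str, launch_args: list[str], base_flags: dict) -> list[str]:
    cmd = [exec_path, *launch_args]
    for flag, value in base_flags.items():
        if isinstance(value, bool):
            if value:
                cmd.append(f"--{flag}")
        else:
            cmd.extend([f"--{flag}", str(value)])
    return cmd

# Base config of the harness spec: serving access fields only, no
# performance flags, so trial 1 measures community vLLM defaults.
cmd = build_serve_command(
    "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    ["serve", "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"],
    {"host": "127.0.0.1", "port": 18230, "served-model-name": "qwen3-30b-a3b-community"},
)
```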
## vLLM Install
PyPI reports `vllm==0.20.0` as the current community release as of 2026-05-02. The dash0 installation target is:
`/home/admin/cpfs/wjh/venvs/vllm-0.20.0`
Install log:
`/home/admin/cpfs/wjh/aituner/aituner/logs/install_vllm_0.20.0_20260502.log`
## Workload
The experiment reuses the 0-8k chat window that has already been used for qwen27b harness work:
| Field | Value |
| --- | --- |
| window | `chat_w20260311_1000` |
| source rows | 32606 |
| input filter | 0 to 8192 tokens |
| max requests per probe | 2048 |
| target pass rate | 0.95 |
| TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above |
| TPOT SLO | 50ms |
| search high | 0.125 sampling_u |
| max probes per trial | 6 |
The `max_requests_per_probe=2048` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe.
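To make the SLO table concrete, here is a minimal sketch of the scoring it implies. The bucket semantics (first bucket whose `max_input_tokens` bound covers the request applies; the unbounded bucket is the catch-all) are inferred from the spec JSON, not taken from aituner's `slo.py`.
```python
# Step TTFT rule plus fixed TPOT rule from the spec above.
TTFT_BUCKETS = [
    {"max_input_tokens": 4096, "threshold_ms": 2000},
    {"max_input_tokens": 32768, "threshold_ms": 4000},
    {"threshold_ms": 6000},  # catch-all above 32k input tokens
]
TPOT_THRESHOLD_MS = 50

def request_passes(input_tokens: int, ttft_ms: float, tpot_ms: float) -> bool:
    for bucket in TTFT_BUCKETS:
        bound = bucket.get("max_input_tokens")
        if bound is None or input_tokens <= bound:
            return ttft_ms <= bucket["threshold_ms"] and tpot_ms <= TPOT_THRESHOLD_MS
    return False

# A probe is feasible when >= 95% of replayed requests pass; the binary
# search then walks sampling_u within [0.0, 0.125] to tolerance 0.001,
# using at most 6 probes per trial.
```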
## Harness Update Under Test
This run tests a stricter early-stop harness:
- The harness still injects L-C-A workload features, recent trial diagnostics, active bottleneck, legal topology candidates, tested signatures, and knob-family rules.
- A strong incumbent no longer means immediate stop. It means "validate nearby alternatives".
- Deterministic stop is allowed only after completed validation evidence says continuing is unlikely to be useful (condensed in the sketch after this section):
- the incumbent beats baseline by a generic large-gain ratio,
- at least two post-incumbent validation trials have run,
- those validation trials did not produce a feasible per-GPU improvement,
- the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts.
- If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal.
This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number.
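Condensed into code, the stop predicate reads roughly as follows. This is only a sketch of the rule list above; the real implementation, which also returns structured evidence, is `_validation_exhausted_guard` in the `harness.py` diff later in this commit, and the `families` field here is a simplified stand-in for the topology/runtime classification that harness.py derives from each trial's `flag_patch`.
```python
# trials: post-incumbent validation attempts, in tuning order.
def validation_exhausted(gain_vs_baseline: float, trials: list[dict]) -> bool:
    if gain_vs_baseline < 1.8:   # generic large-gain ratio
        return False
    if len(trials) < 2:          # need at least two validation trials
        return False
    if any(t.get("best_request_rate_per_gpu") is not None for t in trials):
        return False             # a validation probe was still feasible
    families = {f for t in trials for f in t.get("families", ())}
    covered = {"topology", "runtime"} <= families
    return covered or len(trials) >= 3  # family coverage, or 3+ attempts
```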
## Unit Tests
Local test command:
```bash
PYTHONPATH=src python3 -m unittest tests.test_core_flow -q
```
Result: all 74 tests passed.
The added coverage checks:
| Test | Purpose |
| --- | --- |
| `test_harness_does_not_stop_immediately_after_strong_incumbent` | strong incumbent requires validation first |
| `test_harness_stop_after_post_incumbent_validation_is_exhausted` | deterministic stop after validation exhaustion |
| `test_cli_tune_uses_harness_stop_before_llm` | `study tune` can stop without calling the LLM or launching another GPU trial |
| `test_prompt_can_disable_harness_for_ablation` | no-harness prompt removes structured harness context |
## Experiment Tracking
Pending dash0 runs:
| Variant | tmux session | Log | Study root |
| --- | --- | --- | --- |
| no-harness | `qwen30b_vllm020_noharness_20260502` | `logs/qwen30b_vllm020_noharness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness` |
| harness | `qwen30b_vllm020_harness_20260502` | `logs/qwen30b_vllm020_harness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness` |
The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point.
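A sketch of that readout follows; the aggregation helper is hypothetical, though the field name matches the trial summaries in this commit.
```python
# Best-so-far request_rate_per_gpu after each tuning iteration; compare
# the two variants' curves at equal trial budgets.
def best_so_far_curve(trials: list[dict]) -> list[float]:
    best, curve = 0.0, []
    for trial in trials:  # trials in tuning order
        best = max(best, trial.get("best_request_rate_per_gpu") or 0.0)
        curve.append(best)
    return curve
```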
## Results
Pending. This section will be filled after the dash0 experiments finish.

View File

@@ -6,9 +6,10 @@ import sys
from pathlib import Path
from .compare import run_compare
from .harness import build_harness_context, build_harness_stop_proposal
from .job import append_job, build_trial_job
from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text
-from .spec import Proposal, SpecError, load_study_spec
from .spec import Proposal, SpecError, load_study_spec, to_jsonable
from .store import StudyStore
from .trace import load_trace_requests, summarize_window
from .worker import run_trial
@@ -118,16 +119,23 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
raise SpecError("max_trials must be positive")
if proposal_files and max_trials > len(proposal_files):
max_trials = len(proposal_files)
if not proposal_files and study.llm.endpoint is None:
raise SpecError("No proposal files provided and study.llm.endpoint is not configured")
executed: list[dict[str, object]] = []
for idx in range(max_trials):
state = store.load_state(study.study_id)
window, requests = load_trace_requests(study, study_spec_path=spec_path)
window_summary = summarize_window(requests, window)
harness_context = (
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
if study.llm.use_harness
else None
)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
window_summary=window_summary,
state=state,
capability_profile=capability_profile,
)
@@ -162,18 +170,36 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
            proposal_name = proposal_source.stem
        else:
            proposal_source = None
-            proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
-            proposal_name = f"proposal-{state.next_trial_index:04d}"
            stop_proposal = (
                build_harness_stop_proposal(harness_context)
                if harness_context is not None
                else None
            )
            if stop_proposal is not None:
                proposal_text = json.dumps(to_jsonable(stop_proposal), ensure_ascii=False)
                proposal_name = f"harness-stop-{state.next_trial_index:04d}"
            else:
                if study.llm.endpoint is None:
                    raise SpecError(
                        "No proposal files provided, study.llm.endpoint is not configured, "
                        "and the harness stop guard did not fire."
                    )
                proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
                proposal_name = f"proposal-{state.next_trial_index:04d}"
        raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
        raw_proposal_path.write_text(proposal_text, encoding="utf-8")
        proposal = parse_proposal_text(proposal_text, study)
        store.write_proposal(study.study_id, proposal_name, proposal)
        if proposal.should_stop:
            if proposal_name.startswith("harness-stop-"):
                proposal_source_label = "harness"
            else:
                proposal_source_label = str(proposal_source) if proposal_source else "llm"
            executed.append(
                {
                    "trial_id": None,
                    "proposal_name": proposal_name,
-                    "proposal_source": str(proposal_source) if proposal_source else "llm",
                    "proposal_source": proposal_source_label,
                    "stopped": True,
                    "diagnosis": proposal.diagnosis,
                    "state_best_trial_id": state.best_trial_id,

View File

@@ -4,7 +4,25 @@ import json
from pathlib import Path
from typing import Any
-from .spec import StudySpec, StudyState, TrialSummary
from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary

_TOPOLOGY_KEYS = {
    "tensor-parallel-size",
    "data-parallel-size",
    "expert-parallel-size",
    "enable-expert-parallel",
}
_RUNTIME_KEYS = {
    "max-num-seqs",
    "max-num-batched-tokens",
    "block-size",
    "gpu-memory-utilization",
    "enable-chunked-prefill",
}

_STRONG_INCUMBENT_MIN_GAIN = 1.8
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
def build_harness_context(
@@ -23,11 +41,39 @@ def build_harness_context(
"workload_lca_profile": _workload_lca_profile(window_summary),
"recent_trial_diagnostics": recent_diagnostics,
"convergence_guard": _convergence_guard(state, recent_diagnostics),
"harness_stop": _harness_stop_decision(state, recent_diagnostics),
"knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
"proposal_rules": _proposal_rules(),
}
def build_harness_stop_proposal(context: dict[str, Any]) -> Proposal | None:
stop = context.get("harness_stop")
if not isinstance(stop, dict) or not stop.get("should_stop"):
return None
reason = str(stop.get("reason") or "harness_converged")
evidence = stop.get("evidence") if isinstance(stop.get("evidence"), dict) else {}
observation = (
"Harness convergence guard triggered before requesting another proposal: "
f"{reason}."
)
diagnosis = str(evidence.get("summary") or reason)
return Proposal(
observation=observation,
diagnosis=diagnosis,
config_patch=ConfigPatch(env_patch={}, flag_patch={}),
expected_effects=[
"stop without spending another GPU trial",
"preserve the current best observed configuration",
],
why_not_previous_failures=(
"The stop decision is based on completed validation evidence and does not "
"repeat any failed configuration."
),
should_stop=True,
)
def render_harness_context(context: dict[str, Any]) -> str:
return json.dumps(context, ensure_ascii=False, indent=2)
@@ -423,6 +469,158 @@ def _convergence_guard(
    }


def _harness_stop_decision(
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
    guard = _convergence_guard(state, recent_diagnostics)
    if guard["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]:
        return {
            "should_stop": True,
            "reason": guard["reason"],
            "evidence": {
                "summary": "The convergence guard fired and no further adjacent probe is required.",
                "convergence_guard": guard,
            },
        }
    validation = _validation_exhausted_guard(state, recent_diagnostics)
    if validation["exhausted"]:
        return {
            "should_stop": True,
            "reason": validation["reason"],
            "evidence": validation,
        }
    return {
        "should_stop": False,
        "reason": "continue_harness_guided_search",
        "evidence": {
            "summary": "No deterministic harness stop condition is satisfied.",
            "convergence_guard": guard,
            "validation_exhausted": validation,
        },
    }


def _validation_exhausted_guard(
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
    default = {
        "exhausted": False,
        "reason": "validation_not_exhausted",
        "summary": "Validation probes are not sufficient to stop yet.",
        "incumbent_trial_id": state.best_trial_id,
        "incumbent_gain_vs_baseline": None,
        "validation_trial_ids": [],
        "validation_families": [],
    }
    if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
        return default
    completed = [
        item
        for item in recent_diagnostics
        if item.get("status") == "completed"
        and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
    ]
    if not completed:
        return default
    baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
    incumbent_rate = _as_float(state.best_request_rate_per_gpu)
    if baseline_rate <= 0 or incumbent_rate <= 0:
        return default
    gain = incumbent_rate / baseline_rate
    if gain < _STRONG_INCUMBENT_MIN_GAIN:
        return {
            **default,
            "reason": "incumbent_gain_not_large_enough_for_validation_stop",
            "incumbent_gain_vs_baseline": gain,
        }
    best_index = next(
        (
            index
            for index, item in enumerate(recent_diagnostics)
            if item.get("trial_id") == state.best_trial_id
        ),
        None,
    )
    if best_index is None:
        return {
            **default,
            "reason": "incumbent_not_in_recent_harness_history",
            "incumbent_gain_vs_baseline": gain,
        }
    after_best = [
        item
        for item in recent_diagnostics[best_index + 1 :]
        if item.get("status") in {"completed", "failed"}
    ]
    if len(after_best) < _MIN_POST_INCUMBENT_VALIDATION_TRIALS:
        return {
            **default,
            "reason": "need_at_least_two_post_incumbent_validation_trials",
            "incumbent_gain_vs_baseline": gain,
            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
        }
    if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best):
        return {
            **default,
            "reason": "post_incumbent_validation_found_feasible_candidate",
            "incumbent_gain_vs_baseline": gain,
            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
        }
    families: set[str] = set()
    for item in after_best:
        families.update(_validation_families(item))
    has_topology = "topology" in families
    has_runtime = bool(families & {"runtime", "max-num-seqs", "max-num-batched-tokens"})
    enough_evidence = (
        len(after_best) >= _VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE
        or (has_topology and has_runtime)
    )
    if not enough_evidence:
        return {
            **default,
            "reason": "post_incumbent_validation_has_not_covered_enough_families",
            "incumbent_gain_vs_baseline": gain,
            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
            "validation_families": sorted(families),
        }
    return {
        "exhausted": True,
        "reason": "post_incumbent_validation_exhausted",
        "summary": (
            "A strong incumbent was followed by validation probes across nearby "
            "topology/runtime families, and none produced a feasible candidate."
        ),
        "incumbent_trial_id": state.best_trial_id,
        "incumbent_gain_vs_baseline": gain,
        "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
        "validation_families": sorted(families),
    }


def _validation_families(item: dict[str, Any]) -> set[str]:
    config_patch = item.get("config_patch")
    if not isinstance(config_patch, dict):
        return set()
    flag_patch = config_patch.get("flag_patch")
    if not isinstance(flag_patch, dict):
        return set()
    families: set[str] = set()
    if any(key in flag_patch for key in _TOPOLOGY_KEYS):
        families.add("topology")
    for key in _RUNTIME_KEYS:
        if key in flag_patch:
            families.add("runtime")
            families.add(key)
    if not families and flag_patch:
        families.add("other")
    return families


def _strong_incumbent_guard(
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],

View File

@@ -312,23 +312,45 @@ def build_prompt(
"",
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
"",
"Harnesses:",
render_harness_context(
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
),
"",
]
if study.llm.use_harness:
sections.extend(
[
"",
"Harnesses:",
render_harness_context(
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
),
"",
]
)
else:
sections.extend(
[
"",
"Harnesses:",
"Disabled by llm.use_harness=false for ablation.",
"",
]
)
sections.extend(
[
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
"The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
"The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
"If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
]
(
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
if study.llm.use_harness
else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
),
]
)
return "\n".join(sections)

View File

@@ -576,6 +576,7 @@ class LLMPolicySpec:
    endpoint: LLMEndpointSpec | None
    system_prompt: str
    max_history_trials: int
    use_harness: bool = True

    @classmethod
    def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec":
@@ -593,6 +594,11 @@ class LLMPolicySpec:
            max_history_trials=_require_int(
                payload.get("max_history_trials", 8), context="llm.max_history_trials"
            ),
            use_harness=(
                _require_bool(payload.get("use_harness"), context="llm.use_harness")
                if payload.get("use_harness") is not None
                else True
            ),
        )

View File

@@ -13,7 +13,7 @@ from aituner.compare import load_compare_spec, run_compare
from aituner.engine import build_launch_recipe
from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
from aituner.job import append_job, build_trial_job
-from aituner.harness import build_harness_context
from aituner.harness import build_harness_context, build_harness_stop_proposal
from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
@@ -422,6 +422,119 @@ class CoreFlowTests(unittest.TestCase):
        )
        self.assertIn("validate", guard["recommended_next_action"])

    def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(tmp_path)
            study = load_study_spec(study_path)
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_sampling_u=0.02,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=[
                    TrialSummary(
                        trial_id="trial-0001",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=0.8,
                        best_request_rate_per_gpu=0.1,
                        config_patch={"env_patch": {}, "flag_patch": {}},
                    ),
                    TrialSummary(
                        trial_id="trial-0002",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=2.4,
                        best_request_rate_per_gpu=0.3,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 2,
                                "data-parallel-size": 4,
                            },
                        },
                    ),
                    TrialSummary(
                        trial_id="trial-0003",
                        status="completed",
                        parallel_size=8,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 1,
                                "data-parallel-size": 8,
                            },
                        },
                    ),
                    TrialSummary(
                        trial_id="trial-0004",
                        status="completed",
                        parallel_size=8,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {"max-num-seqs": 160},
                        },
                    ),
                ],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            self.assertTrue(context["harness_stop"]["should_stop"])
            self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
            proposal = build_harness_stop_proposal(context)
            self.assertIsNotNone(proposal)
            self.assertTrue(proposal.should_stop)

    def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(tmp_path)
            study = load_study_spec(study_path)
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=[
                    TrialSummary(
                        trial_id="trial-0001",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=0.8,
                        best_request_rate_per_gpu=0.1,
                        config_patch={"env_patch": {}, "flag_patch": {}},
                    ),
                    TrialSummary(
                        trial_id="trial-0002",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=2.4,
                        best_request_rate_per_gpu=0.3,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 2,
                                "data-parallel-size": 4,
                            },
                        },
                    ),
                ],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            self.assertFalse(context["harness_stop"]["should_stop"])
            self.assertIsNone(build_harness_stop_proposal(context))

    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
@@ -564,6 +677,26 @@ class CoreFlowTests(unittest.TestCase):
"\n".join(context["proposal_rules"]),
)
def test_prompt_can_disable_harness_for_ablation(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
payload = json.loads(study_path.read_text(encoding="utf-8"))
payload["llm"]["use_harness"] = False
study_path.write_text(json.dumps(payload), encoding="utf-8")
study = load_study_spec(study_path)
window, requests = load_trace_requests(study, study_spec_path=study_path)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
state=StudyState(study_id=study.study_id),
capability_profile=None,
)
self.assertFalse(study.llm.use_harness)
self.assertIn("Disabled by llm.use_harness=false", prompt)
self.assertNotIn('"paper_alignment"', prompt)
self.assertIn("without harness hints", prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -2299,6 +2432,98 @@ class CoreFlowTests(unittest.TestCase):
            state = store.load_state("study-1")
            self.assertEqual(state.next_trial_index, 1)

    def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(tmp_path)
            study = load_study_spec(study_path)
            store_root = tmp_path / "store"
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)
            store.save_state(
                StudyState(
                    study_id=study.study_id,
                    best_trial_id="trial-0002",
                    best_parallel_size=8,
                    best_sampling_u=0.02,
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                    next_trial_index=5,
                    trials=[
                        TrialSummary(
                            trial_id="trial-0001",
                            status="completed",
                            parallel_size=8,
                            best_request_rate=0.8,
                            best_request_rate_per_gpu=0.1,
                            config_patch={"env_patch": {}, "flag_patch": {}},
                        ),
                        TrialSummary(
                            trial_id="trial-0002",
                            status="completed",
                            parallel_size=8,
                            best_request_rate=2.4,
                            best_request_rate_per_gpu=0.3,
                            config_patch={
                                "env_patch": {},
                                "flag_patch": {
                                    "tensor-parallel-size": 2,
                                    "data-parallel-size": 4,
                                },
                            },
                        ),
                        TrialSummary(
                            trial_id="trial-0003",
                            status="completed",
                            parallel_size=8,
                            config_patch={
                                "env_patch": {},
                                "flag_patch": {
                                    "tensor-parallel-size": 1,
                                    "data-parallel-size": 8,
                                },
                            },
                        ),
                        TrialSummary(
                            trial_id="trial-0004",
                            status="completed",
                            parallel_size=8,
                            config_patch={
                                "env_patch": {},
                                "flag_patch": {"max-num-seqs": 160},
                            },
                        ),
                    ],
                )
            )
            with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock:
                with mock.patch("aituner.cli.run_trial") as run_trial_mock:
                    exit_code = cli_main(
                        [
                            "study",
                            "tune",
                            "--spec",
                            str(study_path),
                            "--store-root",
                            str(store_root),
                            "--max-trials",
                            "1",
                        ]
                    )
            self.assertEqual(exit_code, 0)
            llm_mock.assert_not_called()
            run_trial_mock.assert_not_called()
            proposal_path = (
                store.study_root(study.study_id)
                / "proposals"
                / "harness-stop-0005.json"
            )
            self.assertTrue(proposal_path.exists())
            proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
            self.assertTrue(proposal["should_stop"])

    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)