Add harness early stop ablation

2026-05-02 08:08:14 +08:00
parent 6d3459c82d
commit 1a3d628268
9 changed files with 837 additions and 29 deletions


@@ -0,0 +1,119 @@
{
"study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "Qwen/Qwen3-30B-A3B",
"served_model_name": "qwen3-30b-a3b-community"
},
"engine": {
"engine_name": "vllm",
"engine_version": "0.20.0",
"exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18230,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18230,
"served-model-name": "qwen3-30b-a3b-community"
},
"tunable_envs": [],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"enable-expert-parallel",
"expert-parallel-size",
"gpu-memory-utilization",
"max-num-batched-tokens",
"max-num-seqs",
"block-size",
"enable-prefix-caching",
"enable-chunked-prefill"
],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [1, 2, 4, 8],
"allowed_tensor_parallel_sizes": [1, 2, 4, 8],
"allowed_data_parallel_sizes": [1, 2, 4, 8],
"allowed_expert_parallel_sizes": [1, 2, 4, 8]
},
"python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
},
"max_requests_per_probe": 2048,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.125,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
"max_history_trials": 8,
"use_harness": true,
"endpoint": {
"provider": "codex",
"model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 240
}
}
}


@@ -0,0 +1,119 @@
{
"study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness",
"hardware": {
"gpu_count": 8,
"gpu_model": "H20",
"host_candidates": [
"dash0"
]
},
"model": {
"model_id": "Qwen/Qwen3-30B-A3B",
"served_model_name": "qwen3-30b-a3b-community"
},
"engine": {
"engine_name": "vllm",
"engine_version": "0.20.0",
"exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
"cwd": "/home/admin/cpfs/wjh/aituner/aituner",
"host": "127.0.0.1",
"port": 18231,
"healthcheck_path": "/v1/models",
"ready_timeout_s": 900,
"request_timeout_s": 900,
"launch_args": [
"serve",
"/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
],
"base_envs": {
"CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
},
"base_flags": {
"host": "127.0.0.1",
"port": 18231,
"served-model-name": "qwen3-30b-a3b-community"
},
"tunable_envs": [],
"tunable_flags": [
"tensor-parallel-size",
"data-parallel-size",
"enable-expert-parallel",
"expert-parallel-size",
"gpu-memory-utilization",
"max-num-batched-tokens",
"max-num-seqs",
"block-size",
"enable-prefix-caching",
"enable-chunked-prefill"
],
"topology_constraints": {
"require_tp_dp_product_equals_gpu_count": false,
"require_ep_size_leq_tp_dp_product": true,
"require_ep_size_divides_tp_dp_product": true,
"require_enable_expert_parallel_when_ep_gt_one": true,
"validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
"allowed_tp_dp_products": [1, 2, 4, 8],
"allowed_tensor_parallel_sizes": [1, 2, 4, 8],
"allowed_data_parallel_sizes": [1, 2, 4, 8],
"allowed_expert_parallel_sizes": [1, 2, 4, 8]
},
"python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
},
"trace": {
"windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
"window_id": "chat_w20260311_1000",
"u_field": "sampling_u",
"timestamp_field": "timestamp",
"max_concurrency": 64,
"input_length_filter": {
"min_input_tokens": 0,
"max_input_tokens": 8192
},
"max_requests_per_probe": 2048,
"replay_time_scale": 1.0,
"early_stop_max_lag_s": 120.0,
"early_stop_max_elapsed_s": 900.0
},
"slo": {
"target_pass_rate": 0.95,
"ttft_rule": {
"kind": "step_ms",
"buckets": [
{
"max_input_tokens": 4096,
"threshold_ms": 2000
},
{
"max_input_tokens": 32768,
"threshold_ms": 4000
},
{
"threshold_ms": 6000
}
]
},
"tpot_rule": {
"kind": "fixed_ms",
"threshold_ms": 50
}
},
"search": {
"low": 0.0,
"high": 0.125,
"tolerance": 0.001,
"max_probes": 6,
"sample_seed": 20260325
},
"llm": {
"system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
"max_history_trials": 8,
"use_harness": false,
"endpoint": {
"provider": "codex",
"model": "gpt-5.4",
"stream": true,
"api_key_env": "OPENAI_API_KEY",
"timeout_s": 240
}
}
}


@@ -26,11 +26,11 @@ The harness turns each LLM proposal from open-ended config search into a bottlen
- `gpu-memory-utilization`: memory headroom after topology and batching are stable.
- Each family has `use_when`, `procedure`, `guards`, and `active_now` fields.
4. Proposal discipline and early stop
- The prompt requires the LLM to choose at most one primary knob family unless history proves a coupled change is needed.
- It must use adjacent legal topology choices and stay inside topology constraints.
- It receives tested config signatures, so it should not repeat already-tried configs.
- A deterministic harness stop can now emit `should_stop=true` before calling the LLM when completed validation evidence says another trial is not justified.
5. Baseline-first loop
- LLM-driven `study tune` now evaluates the initial engine config first unless `--skip-baseline` is passed.
@@ -44,10 +44,10 @@ The speedup comes from reducing wasted proposal families, not from changing the
- For long-prompt, low-cache-reuse windows, the harness activates the TP harness before speculative runtime knobs.
- Example: qwen27b 0-8k chat reached `TP=2, DP=1` at iter 2 under harness replay, while the original run spent iter 2 on `DP=2` and iter 3 on `DP=4`.
2. Guarded stop after validation, not immediately after a strong incumbent
- If the newest trial is the incumbent and improves per-GPU throughput by at least `1.8x` over baseline, the harness requires direct evidence before trying runtime-only tweaks.
- It does not stop at the first large gain. It requires post-incumbent validation trials across nearby topology/runtime families, and stops only if those trials fail to produce a feasible per-GPU improvement.
- With the guard, `study tune` can write a `harness-stop-XXXX` proposal and exit without spending another GPU trial.
3. All-infeasible plateau detection
- When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family.
@@ -78,6 +78,6 @@ Result:
## Current Risks
- The harness is still prompt-guided for choosing the next non-stop proposal. The deterministic stop path is hard-coded in `study tune`, but proposal-family blocking is not yet enforced by a separate validator.
- Strong-incumbent stopping is intentionally biased toward fewer GPU trials after validation evidence accumulates. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
- Full fresh reruns on large models are expensive. Strict replay is useful for measuring proposal-path improvements when the proposed configs already exist in prior measured runs, but publication-quality claims still need fresh no-relaunch runs when time allows.


@@ -0,0 +1,93 @@
# Qwen3-30B-A3B Community vLLM Harness Ablation, 2026-05-02
## Goal
Run a fresh dash0 experiment on the latest community vLLM release with the local community model:
`/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`
The comparison is:
| Variant | Spec | Harness |
| --- | --- | --- |
| no-harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json` | disabled via `llm.use_harness=false` |
| harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json` | enabled, including deterministic stop proposal |
Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix cache, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model.
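As a concrete illustration, the sketch below assembles the baseline launch command from the engine block in the specs above, assuming `base_flags` pass through verbatim as `--<name> <value>` pairs; the real launch recipe is built inside aituner and may normalize arguments differently.
```python
# Sketch only: assemble the baseline launch command from the engine block above.
# Assumes base_flags map one-to-one to "--<name> <value>" CLI flags.
engine = {
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "launch_args": ["serve", "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"],
    "base_flags": {
        "host": "127.0.0.1",
        "port": 18230,
        "served-model-name": "qwen3-30b-a3b-community",
    },
}

cmd = [engine["exec_path"], *engine["launch_args"]]
for name, value in engine["base_flags"].items():
    cmd.extend([f"--{name}", str(value)])

# vllm serve .../Qwen3-30B-A3B --host 127.0.0.1 --port 18230 --served-model-name qwen3-30b-a3b-community
print(" ".join(cmd))
```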
## vLLM Install
PyPI reports `vllm==0.20.0` as the current community release as of 2026-05-02. The dash0 installation target is:
`/home/admin/cpfs/wjh/venvs/vllm-0.20.0`
Install log:
`/home/admin/cpfs/wjh/aituner/aituner/logs/install_vllm_0.20.0_20260502.log`
## Workload
The experiment reuses the 0-8k chat window that has already been used for qwen27b harness work:
| Field | Value |
| --- | --- |
| window | `chat_w20260311_1000` |
| source rows | 32606 |
| input filter | 0 to 8192 tokens |
| max requests per probe | 2048 |
| target pass rate | 0.95 |
| TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above |
| TPOT SLO | 50ms |
| search high | 0.125 sampling_u |
| max probes per trial | 6 |
The `max_requests_per_probe=2048` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe.
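For reference, a small sketch of how the `slo.ttft_rule.buckets` entries in the specs above map input length to a TTFT threshold; the real scoring lives in `aituner.slo`, and treating the bucket boundary as inclusive is an assumption here.
```python
# Sketch of the step TTFT rule from slo.ttft_rule.buckets in the specs above.
# With the 0-8k input filter, only the 2000 ms and 4000 ms buckets apply in practice.
def ttft_threshold_ms(input_tokens: int) -> int:
    for max_input_tokens, threshold_ms in ((4096, 2000), (32768, 4000)):
        if input_tokens <= max_input_tokens:
            return threshold_ms
    return 6000  # catch-all bucket

assert ttft_threshold_ms(1000) == 2000
assert ttft_threshold_ms(8000) == 4000
```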
## Harness Update Under Test
This run tests a stricter early-stop harness:
- The harness still injects L-C-A workload features, recent trial diagnostics, active bottleneck, legal topology candidates, tested signatures, and knob-family rules.
- A strong incumbent no longer means immediate stop. It means "validate nearby alternatives".
- Deterministic stop is allowed only after completed validation evidence says continuing is unlikely to be useful:
- the incumbent beats baseline by a generic large-gain ratio,
- at least two post-incumbent validation trials have run,
- those validation trials did not produce a feasible per-GPU improvement,
- the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts.
- If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal.
This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number.
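Condensed into pseudologic, the stop decision looks roughly like the sketch below; the authoritative version is `_validation_exhausted_guard` in `aituner.harness` (see the diff further down), and the trial records here are simplified stand-ins.
```python
# Condensed sketch of the deterministic stop guard; thresholds mirror the module constants.
STRONG_INCUMBENT_MIN_GAIN = 1.8
MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3

def should_stop(baseline_rate: float, incumbent_rate: float, post_incumbent: list[dict]) -> bool:
    # post_incumbent: completed/failed trials after the incumbent, each with
    # "feasible" (found a per-GPU rate) and "families" (knob families touched).
    if baseline_rate <= 0 or incumbent_rate / baseline_rate < STRONG_INCUMBENT_MIN_GAIN:
        return False  # incumbent gain is not large enough to consider stopping
    if len(post_incumbent) < MIN_POST_INCUMBENT_VALIDATION_TRIALS:
        return False  # not enough post-incumbent validation trials yet
    if any(trial["feasible"] for trial in post_incumbent):
        return False  # validation found a feasible candidate, keep refining
    families: set[str] = set()
    for trial in post_incumbent:
        families |= trial["families"]
    covered = "topology" in families and "runtime" in families
    return covered or len(post_incumbent) >= VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE
```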
## Unit Tests
Local test command:
```bash
PYTHONPATH=src python3 -m unittest tests.test_core_flow -q
```
Result: passed, 74 tests.
The added coverage checks:
| Test | Purpose |
| --- | --- |
| `test_harness_does_not_stop_immediately_after_strong_incumbent` | strong incumbent requires validation first |
| `test_harness_stop_after_post_incumbent_validation_is_exhausted` | deterministic stop after validation exhaustion |
| `test_cli_tune_uses_harness_stop_before_llm` | `study tune` can stop without calling the LLM or launching another GPU trial |
| `test_prompt_can_disable_harness_for_ablation` | no-harness prompt removes structured harness context |
## Experiment Tracking
Pending dash0 runs:
| Variant | tmux session | Log | Study root |
| --- | --- | --- | --- |
| no-harness | `qwen30b_vllm020_noharness_20260502` | `logs/qwen30b_vllm020_noharness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness` |
| harness | `qwen30b_vllm020_harness_20260502` | `logs/qwen30b_vllm020_harness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness` |
The harness run should be judged on best-so-far `request_rate_per_gpu` per tuning iteration, and on whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation shows whether the early-stop harness saves iterations without hiding a better configuration that would have been found later.
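One plausible way to drive a variant from the repo root is sketched below; `--spec`, `--store-root`, and `--max-trials` match the CLI exercised by the unit tests, while the import path and the budget of 8 trials are assumptions.
```python
# Hypothetical driver for one ablation variant; adjust spec and budget per the table above.
from aituner.cli import cli_main  # entry point name taken from the tests; module path assumed

exit_code = cli_main([
    "study", "tune",
    "--spec", "configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json",
    "--store-root", ".aituner-community-vllm020",
    "--max-trials", "8",  # assumed budget; both variants should use the same value
])
```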
## Results
Pending. This section will be filled after the dash0 experiments finish.


@@ -6,9 +6,10 @@ import sys
from pathlib import Path
from .compare import run_compare
from .harness import build_harness_context, build_harness_stop_proposal
from .job import append_job, build_trial_job
from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text
from .spec import Proposal, SpecError, load_study_spec, to_jsonable
from .store import StudyStore
from .trace import load_trace_requests, summarize_window
from .worker import run_trial
@@ -118,16 +119,23 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
raise SpecError("max_trials must be positive")
if proposal_files and max_trials > len(proposal_files):
max_trials = len(proposal_files)
if not proposal_files and study.llm.endpoint is None:
raise SpecError("No proposal files provided and study.llm.endpoint is not configured")
executed: list[dict[str, object]] = []
for idx in range(max_trials):
state = store.load_state(study.study_id)
window, requests = load_trace_requests(study, study_spec_path=spec_path)
window_summary = summarize_window(requests, window)
harness_context = (
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
if study.llm.use_harness
else None
)
prompt = build_prompt(
study=study,
window_summary=window_summary,
state=state,
capability_profile=capability_profile,
)
@@ -162,6 +170,20 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
proposal_name = proposal_source.stem
else:
proposal_source = None
stop_proposal = (
build_harness_stop_proposal(harness_context)
if harness_context is not None
else None
)
if stop_proposal is not None:
proposal_text = json.dumps(to_jsonable(stop_proposal), ensure_ascii=False)
proposal_name = f"harness-stop-{state.next_trial_index:04d}"
else:
if study.llm.endpoint is None:
raise SpecError(
"No proposal files provided, study.llm.endpoint is not configured, "
"and the harness stop guard did not fire."
)
proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
proposal_name = f"proposal-{state.next_trial_index:04d}"
raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
@@ -169,11 +191,15 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
proposal = parse_proposal_text(proposal_text, study)
store.write_proposal(study.study_id, proposal_name, proposal)
if proposal.should_stop:
if proposal_name.startswith("harness-stop-"):
proposal_source_label = "harness"
else:
proposal_source_label = str(proposal_source) if proposal_source else "llm"
executed.append(
{
"trial_id": None,
"proposal_name": proposal_name,
"proposal_source": proposal_source_label,
"stopped": True,
"diagnosis": proposal.diagnosis,
"state_best_trial_id": state.best_trial_id,


@@ -4,7 +4,25 @@ import json
from pathlib import Path
from typing import Any
from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary
_TOPOLOGY_KEYS = {
"tensor-parallel-size",
"data-parallel-size",
"expert-parallel-size",
"enable-expert-parallel",
}
_RUNTIME_KEYS = {
"max-num-seqs",
"max-num-batched-tokens",
"block-size",
"gpu-memory-utilization",
"enable-chunked-prefill",
}
_STRONG_INCUMBENT_MIN_GAIN = 1.8
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
def build_harness_context(
@@ -23,11 +41,39 @@ def build_harness_context(
"workload_lca_profile": _workload_lca_profile(window_summary), "workload_lca_profile": _workload_lca_profile(window_summary),
"recent_trial_diagnostics": recent_diagnostics, "recent_trial_diagnostics": recent_diagnostics,
"convergence_guard": _convergence_guard(state, recent_diagnostics), "convergence_guard": _convergence_guard(state, recent_diagnostics),
"harness_stop": _harness_stop_decision(state, recent_diagnostics),
"knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics), "knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
"proposal_rules": _proposal_rules(), "proposal_rules": _proposal_rules(),
} }
def build_harness_stop_proposal(context: dict[str, Any]) -> Proposal | None:
stop = context.get("harness_stop")
if not isinstance(stop, dict) or not stop.get("should_stop"):
return None
reason = str(stop.get("reason") or "harness_converged")
evidence = stop.get("evidence") if isinstance(stop.get("evidence"), dict) else {}
observation = (
"Harness convergence guard triggered before requesting another proposal: "
f"{reason}."
)
diagnosis = str(evidence.get("summary") or reason)
return Proposal(
observation=observation,
diagnosis=diagnosis,
config_patch=ConfigPatch(env_patch={}, flag_patch={}),
expected_effects=[
"stop without spending another GPU trial",
"preserve the current best observed configuration",
],
why_not_previous_failures=(
"The stop decision is based on completed validation evidence and does not "
"repeat any failed configuration."
),
should_stop=True,
)
def render_harness_context(context: dict[str, Any]) -> str:
return json.dumps(context, ensure_ascii=False, indent=2)
@@ -423,6 +469,158 @@ def _convergence_guard(
}
def _harness_stop_decision(
state: StudyState,
recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
guard = _convergence_guard(state, recent_diagnostics)
if guard["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]:
return {
"should_stop": True,
"reason": guard["reason"],
"evidence": {
"summary": "The convergence guard fired and no further adjacent probe is required.",
"convergence_guard": guard,
},
}
validation = _validation_exhausted_guard(state, recent_diagnostics)
if validation["exhausted"]:
return {
"should_stop": True,
"reason": validation["reason"],
"evidence": validation,
}
return {
"should_stop": False,
"reason": "continue_harness_guided_search",
"evidence": {
"summary": "No deterministic harness stop condition is satisfied.",
"convergence_guard": guard,
"validation_exhausted": validation,
},
}
def _validation_exhausted_guard(
state: StudyState,
recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
default = {
"exhausted": False,
"reason": "validation_not_exhausted",
"summary": "Validation probes are not sufficient to stop yet.",
"incumbent_trial_id": state.best_trial_id,
"incumbent_gain_vs_baseline": None,
"validation_trial_ids": [],
"validation_families": [],
}
if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
return default
completed = [
item
for item in recent_diagnostics
if item.get("status") == "completed"
and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
]
if not completed:
return default
baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
incumbent_rate = _as_float(state.best_request_rate_per_gpu)
if baseline_rate <= 0 or incumbent_rate <= 0:
return default
gain = incumbent_rate / baseline_rate
if gain < _STRONG_INCUMBENT_MIN_GAIN:
return {
**default,
"reason": "incumbent_gain_not_large_enough_for_validation_stop",
"incumbent_gain_vs_baseline": gain,
}
best_index = next(
(
index
for index, item in enumerate(recent_diagnostics)
if item.get("trial_id") == state.best_trial_id
),
None,
)
if best_index is None:
return {
**default,
"reason": "incumbent_not_in_recent_harness_history",
"incumbent_gain_vs_baseline": gain,
}
after_best = [
item
for item in recent_diagnostics[best_index + 1 :]
if item.get("status") in {"completed", "failed"}
]
if len(after_best) < _MIN_POST_INCUMBENT_VALIDATION_TRIALS:
return {
**default,
"reason": "need_at_least_two_post_incumbent_validation_trials",
"incumbent_gain_vs_baseline": gain,
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
}
if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best):
return {
**default,
"reason": "post_incumbent_validation_found_feasible_candidate",
"incumbent_gain_vs_baseline": gain,
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
}
families: set[str] = set()
for item in after_best:
families.update(_validation_families(item))
has_topology = "topology" in families
has_runtime = bool(families & {"runtime", "max-num-seqs", "max-num-batched-tokens"})
enough_evidence = (
len(after_best) >= _VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE
or (has_topology and has_runtime)
)
if not enough_evidence:
return {
**default,
"reason": "post_incumbent_validation_has_not_covered_enough_families",
"incumbent_gain_vs_baseline": gain,
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
"validation_families": sorted(families),
}
return {
"exhausted": True,
"reason": "post_incumbent_validation_exhausted",
"summary": (
"A strong incumbent was followed by validation probes across nearby "
"topology/runtime families, and none produced a feasible candidate."
),
"incumbent_trial_id": state.best_trial_id,
"incumbent_gain_vs_baseline": gain,
"validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
"validation_families": sorted(families),
}
def _validation_families(item: dict[str, Any]) -> set[str]:
config_patch = item.get("config_patch")
if not isinstance(config_patch, dict):
return set()
flag_patch = config_patch.get("flag_patch")
if not isinstance(flag_patch, dict):
return set()
families: set[str] = set()
if any(key in flag_patch for key in _TOPOLOGY_KEYS):
families.add("topology")
for key in _RUNTIME_KEYS:
if key in flag_patch:
families.add("runtime")
families.add(key)
if not families and flag_patch:
families.add("other")
return families
def _strong_incumbent_guard(
state: StudyState,
recent_diagnostics: list[dict[str, Any]],


@@ -312,6 +312,10 @@ def build_prompt(
"", "",
"Tested config signatures:", "Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2), json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
]
if study.llm.use_harness:
sections.extend(
[
"", "",
"Harnesses:", "Harnesses:",
render_harness_context( render_harness_context(
@@ -322,13 +326,31 @@ def build_prompt(
)
),
"",
]
)
else:
sections.extend(
[
"",
"Harnesses:",
"Disabled by llm.use_harness=false for ablation.",
"",
]
)
sections.extend(
[
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.", "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
"The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.", "The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
"The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.", "The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
"If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.", "If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.", "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.", (
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
if study.llm.use_harness
else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
),
]
)
return "\n".join(sections) return "\n".join(sections)


@@ -576,6 +576,7 @@ class LLMPolicySpec:
endpoint: LLMEndpointSpec | None
system_prompt: str
max_history_trials: int
use_harness: bool = True
@classmethod
def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec":
@@ -593,6 +594,11 @@ class LLMPolicySpec:
max_history_trials=_require_int(
payload.get("max_history_trials", 8), context="llm.max_history_trials"
),
use_harness=(
_require_bool(payload.get("use_harness"), context="llm.use_harness")
if payload.get("use_harness") is not None
else True
),
)


@@ -13,7 +13,7 @@ from aituner.compare import load_compare_spec, run_compare
from aituner.engine import build_launch_recipe
from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
from aituner.job import append_job, build_trial_job
from aituner.harness import build_harness_context, build_harness_stop_proposal
from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
@@ -422,6 +422,119 @@ class CoreFlowTests(unittest.TestCase):
)
self.assertIn("validate", guard["recommended_next_action"])
def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
state = StudyState(
study_id=study.study_id,
best_trial_id="trial-0002",
best_parallel_size=8,
best_sampling_u=0.02,
best_request_rate=2.4,
best_request_rate_per_gpu=0.3,
trials=[
TrialSummary(
trial_id="trial-0001",
status="completed",
parallel_size=8,
best_request_rate=0.8,
best_request_rate_per_gpu=0.1,
config_patch={"env_patch": {}, "flag_patch": {}},
),
TrialSummary(
trial_id="trial-0002",
status="completed",
parallel_size=8,
best_request_rate=2.4,
best_request_rate_per_gpu=0.3,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 2,
"data-parallel-size": 4,
},
},
),
TrialSummary(
trial_id="trial-0003",
status="completed",
parallel_size=8,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 1,
"data-parallel-size": 8,
},
},
),
TrialSummary(
trial_id="trial-0004",
status="completed",
parallel_size=8,
config_patch={
"env_patch": {},
"flag_patch": {"max-num-seqs": 160},
},
),
],
)
context = build_harness_context(
study=study,
window_summary={"prompt_tokens_p95": 2048},
state=state,
)
self.assertTrue(context["harness_stop"]["should_stop"])
self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
proposal = build_harness_stop_proposal(context)
self.assertIsNotNone(proposal)
self.assertTrue(proposal.should_stop)
def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
state = StudyState(
study_id=study.study_id,
best_trial_id="trial-0002",
best_parallel_size=8,
best_request_rate=2.4,
best_request_rate_per_gpu=0.3,
trials=[
TrialSummary(
trial_id="trial-0001",
status="completed",
parallel_size=8,
best_request_rate=0.8,
best_request_rate_per_gpu=0.1,
config_patch={"env_patch": {}, "flag_patch": {}},
),
TrialSummary(
trial_id="trial-0002",
status="completed",
parallel_size=8,
best_request_rate=2.4,
best_request_rate_per_gpu=0.3,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 2,
"data-parallel-size": 4,
},
},
),
],
)
context = build_harness_context(
study=study,
window_summary={"prompt_tokens_p95": 2048},
state=state,
)
self.assertFalse(context["harness_stop"]["should_stop"])
self.assertIsNone(build_harness_stop_proposal(context))
def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -564,6 +677,26 @@ class CoreFlowTests(unittest.TestCase):
"\n".join(context["proposal_rules"]), "\n".join(context["proposal_rules"]),
) )
def test_prompt_can_disable_harness_for_ablation(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
payload = json.loads(study_path.read_text(encoding="utf-8"))
payload["llm"]["use_harness"] = False
study_path.write_text(json.dumps(payload), encoding="utf-8")
study = load_study_spec(study_path)
window, requests = load_trace_requests(study, study_spec_path=study_path)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
state=StudyState(study_id=study.study_id),
capability_profile=None,
)
self.assertFalse(study.llm.use_harness)
self.assertIn("Disabled by llm.use_harness=false", prompt)
self.assertNotIn('"paper_alignment"', prompt)
self.assertIn("without harness hints", prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -2299,6 +2432,98 @@ class CoreFlowTests(unittest.TestCase):
state = store.load_state("study-1")
self.assertEqual(state.next_trial_index, 1)
def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
study = load_study_spec(study_path)
store_root = tmp_path / "store"
store = StudyStore(store_root)
store.init_study(spec_path=study_path, study=study)
store.save_state(
StudyState(
study_id=study.study_id,
best_trial_id="trial-0002",
best_parallel_size=8,
best_sampling_u=0.02,
best_request_rate=2.4,
best_request_rate_per_gpu=0.3,
next_trial_index=5,
trials=[
TrialSummary(
trial_id="trial-0001",
status="completed",
parallel_size=8,
best_request_rate=0.8,
best_request_rate_per_gpu=0.1,
config_patch={"env_patch": {}, "flag_patch": {}},
),
TrialSummary(
trial_id="trial-0002",
status="completed",
parallel_size=8,
best_request_rate=2.4,
best_request_rate_per_gpu=0.3,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 2,
"data-parallel-size": 4,
},
},
),
TrialSummary(
trial_id="trial-0003",
status="completed",
parallel_size=8,
config_patch={
"env_patch": {},
"flag_patch": {
"tensor-parallel-size": 1,
"data-parallel-size": 8,
},
},
),
TrialSummary(
trial_id="trial-0004",
status="completed",
parallel_size=8,
config_patch={
"env_patch": {},
"flag_patch": {"max-num-seqs": 160},
},
),
],
)
)
with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock:
with mock.patch("aituner.cli.run_trial") as run_trial_mock:
exit_code = cli_main(
[
"study",
"tune",
"--spec",
str(study_path),
"--store-root",
str(store_root),
"--max-trials",
"1",
]
)
self.assertEqual(exit_code, 0)
llm_mock.assert_not_called()
run_trial_mock.assert_not_called()
proposal_path = (
store.study_root(study.study_id)
/ "proposals"
/ "harness-stop-0005.json"
)
self.assertTrue(proposal_path.exists())
proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
self.assertTrue(proposal["should_stop"])
def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)