Add harness early stop ablation

2026-05-02 08:08:14 +08:00
parent 6d3459c82d
commit 1a3d628268
9 changed files with 837 additions and 29 deletions

View File

@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18230,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18230,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": true,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}

View File

@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18231,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18231,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": false,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}

View File

@@ -26,11 +26,11 @@ The harness turns each LLM proposal from open-ended config search into a bottlen
- `gpu-memory-utilization`: memory headroom after topology and batching are stable.
- Each family has `use_when`, `procedure`, `guards`, and `active_now` fields.
-4. Proposal discipline
4. Proposal discipline and early stop
- The prompt requires the LLM to choose at most one primary knob family unless history proves a coupled change is needed.
- It must use adjacent legal topology choices and stay inside topology constraints.
- It receives tested config signatures, so it should not repeat already-tried configs.
- It can return `should_stop=true` when no adjacent harness-guided probe is justified.
- A deterministic harness stop can now emit `should_stop=true` before calling the LLM when completed validation evidence says another trial is not justified.
5. Baseline-first loop
- LLM-driven `study tune` now evaluates the initial engine config first unless `--skip-baseline` is passed.
@@ -44,10 +44,10 @@ The speedup comes from reducing wasted proposal families, not from changing the
- For long-prompt, low-cache-reuse windows, the harness activates the TP harness before speculative runtime knobs.
- Example: qwen27b 0-8k chat reached `TP=2, DP=1` at iter 2 under harness replay, while the original run spent iter 2 on `DP=2` and iter 3 on `DP=4`.
-2. Guarded stop after a strong incumbent
2. Guarded stop after validation, not immediately after a strong incumbent
- If the newest trial is the incumbent and improves per-GPU throughput by at least `1.8x` over baseline, the harness requires direct evidence before trying runtime-only tweaks.
- Without that guard, the LLM still proposed weak MBT trials after finding the qwen27b best config.
-- With the guard, it emits `should_stop=true`.
- It does not stop at the first large gain. It requires post-incumbent validation trials across nearby topology/runtime families, and stops only if those trials fail to produce a feasible per-GPU improvement.
- With the guard, `study tune` can write a `harness-stop-XXXX` proposal and exit without spending another GPU trial.
3. All-infeasible plateau detection
- When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family.
@@ -78,6 +78,6 @@ Result:
## Current Risks
-- The harness is prompt-guided, not a hard verifier for every rule. If future LLM outputs ignore a fired guard, proposal validation should reject the blocked family explicitly.
-- Strong-incumbent stopping is intentionally biased toward fewer GPU trials once a large gain is already reached. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
- The harness is still prompt-guided for choosing the next non-stop proposal. The deterministic stop path is hard-coded in `study tune`, but proposal-family blocking is not yet enforced by a separate validator.
- Strong-incumbent stopping is intentionally biased toward fewer GPU trials after validation evidence accumulates. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
- Full fresh reruns on large models are expensive. Strict replay is useful for measuring proposal-path improvements when the proposed configs already exist in prior measured runs, but publication-quality claims still need fresh no-relaunch runs when time allows.

View File

@@ -0,0 +1,93 @@
# Qwen3-30B-A3B Community vLLM Harness Ablation, 2026-05-02
## Goal
Run a fresh dash0 experiment on the latest community vLLM release with the local community model:
`/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`
The comparison is:
| Variant | Spec | Harness |
| --- | --- | --- |
| no-harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json` | disabled via `llm.use_harness=false` |
| harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json` | enabled, including deterministic stop proposal |
Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix cache, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model.
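For concreteness, here is a minimal sketch of how `launch_args` plus `base_flags` could expand into the serve invocation. This is not aituner's recipe builder (which is not shown in this commit); the flag-to-CLI conversion rule is an assumption for illustration.
```python
# Hypothetical helper, for illustration only: booleans become bare
# switches, everything else a "--flag value" pair.
def build_serve_command(exec_path: str, launch_args: list[str], base_flags: dict) -> list[str]:
    cmd = [exec_path, *launch_args]
    for flag, value in base_flags.items():
        if isinstance(value, bool):
            if value:
                cmd.append(f"--{flag}")
        else:
            cmd.extend([f"--{flag}", str(value)])
    return cmd

# Base config of the harness spec: serving access fields only, no
# performance flags, so trial 1 measures community vLLM defaults.
cmd = build_serve_command(
    "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    ["serve", "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"],
    {"host": "127.0.0.1", "port": 18230, "served-model-name": "qwen3-30b-a3b-community"},
)
```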
## vLLM Install
PyPI reports `vllm==0.20.0` as the current community release as of 2026-05-02. The dash0 installation target is:
`/home/admin/cpfs/wjh/venvs/vllm-0.20.0`
Install log:
`/home/admin/cpfs/wjh/aituner/aituner/logs/install_vllm_0.20.0_20260502.log`
## Workload
The experiment reuses the 0-8k chat window that has already been used for qwen27b harness work:
| Field | Value |
| --- | --- |
| window | `chat_w20260311_1000` |
| source rows | 32606 |
| input filter | 0 to 8192 tokens |
| max requests per probe | 2048 |
| target pass rate | 0.95 |
| TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above |
| TPOT SLO | 50ms |
| search high | 0.125 sampling_u |
| max probes per trial | 6 |
The `max_requests_per_probe=2048` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe.
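To make the SLO table concrete, here is a minimal sketch of the scoring it implies. The bucket semantics (first bucket whose `max_input_tokens` bound covers the request applies; the unbounded bucket is the catch-all) are inferred from the spec JSON, not taken from aituner's `slo.py`.
```python
# Step TTFT rule plus fixed TPOT rule from the spec above.
TTFT_BUCKETS = [
    {"max_input_tokens": 4096, "threshold_ms": 2000},
    {"max_input_tokens": 32768, "threshold_ms": 4000},
    {"threshold_ms": 6000},  # catch-all above 32k input tokens
]
TPOT_THRESHOLD_MS = 50

def request_passes(input_tokens: int, ttft_ms: float, tpot_ms: float) -> bool:
    for bucket in TTFT_BUCKETS:
        bound = bucket.get("max_input_tokens")
        if bound is None or input_tokens <= bound:
            return ttft_ms <= bucket["threshold_ms"] and tpot_ms <= TPOT_THRESHOLD_MS
    return False

# A probe is feasible when >= 95% of replayed requests pass; the binary
# search then walks sampling_u within [0.0, 0.125] to tolerance 0.001,
# using at most 6 probes per trial.
```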
## Harness Update Under Test
This run tests a stricter early-stop harness:
- The harness still injects L-C-A workload features, recent trial diagnostics, active bottleneck, legal topology candidates, tested signatures, and knob-family rules.
- A strong incumbent no longer means immediate stop. It means "validate nearby alternatives".
- Deterministic stop is allowed only after completed validation evidence says continuing is unlikely to be useful (condensed in the sketch after this section):
- the incumbent beats baseline by a generic large-gain ratio,
- at least two post-incumbent validation trials have run,
- those validation trials did not produce a feasible per-GPU improvement,
- the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts.
- If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal.
This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number.
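Condensed into code, the stop predicate reads roughly as follows. This is only a sketch of the rule list above; the real implementation, which also returns structured evidence, is `_validation_exhausted_guard` in the `harness.py` diff later in this commit, and the `families` field here is a simplified stand-in for the topology/runtime classification that harness.py derives from each trial's `flag_patch`.
```python
# trials: post-incumbent validation attempts, in tuning order.
def validation_exhausted(gain_vs_baseline: float, trials: list[dict]) -> bool:
    if gain_vs_baseline < 1.8:   # generic large-gain ratio
        return False
    if len(trials) < 2:          # need at least two validation trials
        return False
    if any(t.get("best_request_rate_per_gpu") is not None for t in trials):
        return False             # a validation probe was still feasible
    families = {f for t in trials for f in t.get("families", ())}
    covered = {"topology", "runtime"} <= families
    return covered or len(trials) >= 3  # family coverage, or 3+ attempts
```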
## Unit Tests
Local test command:
```bash
PYTHONPATH=src python3 -m unittest tests.test_core_flow -q
```
Result: all 74 tests passed.
The added coverage checks:
| Test | Purpose |
| --- | --- |
| `test_harness_does_not_stop_immediately_after_strong_incumbent` | strong incumbent requires validation first |
| `test_harness_stop_after_post_incumbent_validation_is_exhausted` | deterministic stop after validation exhaustion |
| `test_cli_tune_uses_harness_stop_before_llm` | `study tune` can stop without calling the LLM or launching another GPU trial |
| `test_prompt_can_disable_harness_for_ablation` | no-harness prompt removes structured harness context |
## Experiment Tracking
Pending dash0 runs:
| Variant | tmux session | Log | Study root |
| --- | --- | --- | --- |
| no-harness | `qwen30b_vllm020_noharness_20260502` | `logs/qwen30b_vllm020_noharness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness` |
| harness | `qwen30b_vllm020_harness_20260502` | `logs/qwen30b_vllm020_harness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness` |
The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point.
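A sketch of that readout follows; the aggregation helper is hypothetical, though the field name matches the trial summaries in this commit.
```python
# Best-so-far request_rate_per_gpu after each tuning iteration; compare
# the two variants' curves at equal trial budgets.
def best_so_far_curve(trials: list[dict]) -> list[float]:
    best, curve = 0.0, []
    for trial in trials:  # trials in tuning order
        best = max(best, trial.get("best_request_rate_per_gpu") or 0.0)
        curve.append(best)
    return curve
```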
## Results
Pending. This section will be filled after the dash0 experiments finish.

View File

@@ -6,9 +6,10 @@ import sys
from pathlib import Path
from .compare import run_compare
from .harness import build_harness_context, build_harness_stop_proposal
from .job import append_job, build_trial_job
from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text
-from .spec import Proposal, SpecError, load_study_spec
from .spec import Proposal, SpecError, load_study_spec, to_jsonable
from .store import StudyStore
from .trace import load_trace_requests, summarize_window
from .worker import run_trial
@@ -118,16 +119,23 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
raise SpecError("max_trials must be positive")
if proposal_files and max_trials > len(proposal_files):
max_trials = len(proposal_files)
if not proposal_files and study.llm.endpoint is None:
raise SpecError("No proposal files provided and study.llm.endpoint is not configured")
executed: list[dict[str, object]] = []
for idx in range(max_trials):
state = store.load_state(study.study_id)
window, requests = load_trace_requests(study, study_spec_path=spec_path)
window_summary = summarize_window(requests, window)
harness_context = (
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
if study.llm.use_harness
else None
)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
window_summary=window_summary,
state=state,
capability_profile=capability_profile,
)
@@ -162,18 +170,36 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
            proposal_name = proposal_source.stem
        else:
            proposal_source = None
-            proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
-            proposal_name = f"proposal-{state.next_trial_index:04d}"
            stop_proposal = (
                build_harness_stop_proposal(harness_context)
                if harness_context is not None
                else None
            )
            if stop_proposal is not None:
                proposal_text = json.dumps(to_jsonable(stop_proposal), ensure_ascii=False)
                proposal_name = f"harness-stop-{state.next_trial_index:04d}"
            else:
                if study.llm.endpoint is None:
                    raise SpecError(
                        "No proposal files provided, study.llm.endpoint is not configured, "
                        "and the harness stop guard did not fire."
                    )
                proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
                proposal_name = f"proposal-{state.next_trial_index:04d}"
        raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
        raw_proposal_path.write_text(proposal_text, encoding="utf-8")
        proposal = parse_proposal_text(proposal_text, study)
        store.write_proposal(study.study_id, proposal_name, proposal)
        if proposal.should_stop:
            if proposal_name.startswith("harness-stop-"):
                proposal_source_label = "harness"
            else:
                proposal_source_label = str(proposal_source) if proposal_source else "llm"
            executed.append(
                {
                    "trial_id": None,
                    "proposal_name": proposal_name,
-                    "proposal_source": str(proposal_source) if proposal_source else "llm",
                    "proposal_source": proposal_source_label,
                    "stopped": True,
                    "diagnosis": proposal.diagnosis,
                    "state_best_trial_id": state.best_trial_id,

View File

@@ -4,7 +4,25 @@ import json
from pathlib import Path
from typing import Any
-from .spec import StudySpec, StudyState, TrialSummary
from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary

_TOPOLOGY_KEYS = {
    "tensor-parallel-size",
    "data-parallel-size",
    "expert-parallel-size",
    "enable-expert-parallel",
}
_RUNTIME_KEYS = {
    "max-num-seqs",
    "max-num-batched-tokens",
    "block-size",
    "gpu-memory-utilization",
    "enable-chunked-prefill",
}

_STRONG_INCUMBENT_MIN_GAIN = 1.8
_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
def build_harness_context(
@@ -23,11 +41,39 @@ def build_harness_context(
"workload_lca_profile": _workload_lca_profile(window_summary),
"recent_trial_diagnostics": recent_diagnostics,
"convergence_guard": _convergence_guard(state, recent_diagnostics),
"harness_stop": _harness_stop_decision(state, recent_diagnostics),
"knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
"proposal_rules": _proposal_rules(),
}
def build_harness_stop_proposal(context: dict[str, Any]) -> Proposal | None:
stop = context.get("harness_stop")
if not isinstance(stop, dict) or not stop.get("should_stop"):
return None
reason = str(stop.get("reason") or "harness_converged")
evidence = stop.get("evidence") if isinstance(stop.get("evidence"), dict) else {}
observation = (
"Harness convergence guard triggered before requesting another proposal: "
f"{reason}."
)
diagnosis = str(evidence.get("summary") or reason)
return Proposal(
observation=observation,
diagnosis=diagnosis,
config_patch=ConfigPatch(env_patch={}, flag_patch={}),
expected_effects=[
"stop without spending another GPU trial",
"preserve the current best observed configuration",
],
why_not_previous_failures=(
"The stop decision is based on completed validation evidence and does not "
"repeat any failed configuration."
),
should_stop=True,
)
def render_harness_context(context: dict[str, Any]) -> str:
return json.dumps(context, ensure_ascii=False, indent=2)
@@ -423,6 +469,158 @@ def _convergence_guard(
    }


def _harness_stop_decision(
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
    guard = _convergence_guard(state, recent_diagnostics)
    if guard["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]:
        return {
            "should_stop": True,
            "reason": guard["reason"],
            "evidence": {
                "summary": "The convergence guard fired and no further adjacent probe is required.",
                "convergence_guard": guard,
            },
        }
    validation = _validation_exhausted_guard(state, recent_diagnostics)
    if validation["exhausted"]:
        return {
            "should_stop": True,
            "reason": validation["reason"],
            "evidence": validation,
        }
    return {
        "should_stop": False,
        "reason": "continue_harness_guided_search",
        "evidence": {
            "summary": "No deterministic harness stop condition is satisfied.",
            "convergence_guard": guard,
            "validation_exhausted": validation,
        },
    }


def _validation_exhausted_guard(
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],
) -> dict[str, Any]:
    default = {
        "exhausted": False,
        "reason": "validation_not_exhausted",
        "summary": "Validation probes are not sufficient to stop yet.",
        "incumbent_trial_id": state.best_trial_id,
        "incumbent_gain_vs_baseline": None,
        "validation_trial_ids": [],
        "validation_families": [],
    }
    if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
        return default
    completed = [
        item
        for item in recent_diagnostics
        if item.get("status") == "completed"
        and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
    ]
    if not completed:
        return default
    baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
    incumbent_rate = _as_float(state.best_request_rate_per_gpu)
    if baseline_rate <= 0 or incumbent_rate <= 0:
        return default
    gain = incumbent_rate / baseline_rate
    if gain < _STRONG_INCUMBENT_MIN_GAIN:
        return {
            **default,
            "reason": "incumbent_gain_not_large_enough_for_validation_stop",
            "incumbent_gain_vs_baseline": gain,
        }
    best_index = next(
        (
            index
            for index, item in enumerate(recent_diagnostics)
            if item.get("trial_id") == state.best_trial_id
        ),
        None,
    )
    if best_index is None:
        return {
            **default,
            "reason": "incumbent_not_in_recent_harness_history",
            "incumbent_gain_vs_baseline": gain,
        }
    after_best = [
        item
        for item in recent_diagnostics[best_index + 1 :]
        if item.get("status") in {"completed", "failed"}
    ]
    if len(after_best) < _MIN_POST_INCUMBENT_VALIDATION_TRIALS:
        return {
            **default,
            "reason": "need_at_least_two_post_incumbent_validation_trials",
            "incumbent_gain_vs_baseline": gain,
            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
        }
    if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best):
        return {
            **default,
            "reason": "post_incumbent_validation_found_feasible_candidate",
            "incumbent_gain_vs_baseline": gain,
            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
        }
    families: set[str] = set()
    for item in after_best:
        families.update(_validation_families(item))
    has_topology = "topology" in families
    has_runtime = bool(families & {"runtime", "max-num-seqs", "max-num-batched-tokens"})
    enough_evidence = (
        len(after_best) >= _VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE
        or (has_topology and has_runtime)
    )
    if not enough_evidence:
        return {
            **default,
            "reason": "post_incumbent_validation_has_not_covered_enough_families",
            "incumbent_gain_vs_baseline": gain,
            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
            "validation_families": sorted(families),
        }
    return {
        "exhausted": True,
        "reason": "post_incumbent_validation_exhausted",
        "summary": (
            "A strong incumbent was followed by validation probes across nearby "
            "topology/runtime families, and none produced a feasible candidate."
        ),
        "incumbent_trial_id": state.best_trial_id,
        "incumbent_gain_vs_baseline": gain,
        "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
        "validation_families": sorted(families),
    }


def _validation_families(item: dict[str, Any]) -> set[str]:
    config_patch = item.get("config_patch")
    if not isinstance(config_patch, dict):
        return set()
    flag_patch = config_patch.get("flag_patch")
    if not isinstance(flag_patch, dict):
        return set()
    families: set[str] = set()
    if any(key in flag_patch for key in _TOPOLOGY_KEYS):
        families.add("topology")
    for key in _RUNTIME_KEYS:
        if key in flag_patch:
            families.add("runtime")
            families.add(key)
    if not families and flag_patch:
        families.add("other")
    return families


def _strong_incumbent_guard(
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],

View File

@@ -312,23 +312,45 @@ def build_prompt(
"",
"Tested config signatures:",
json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
"",
"Harnesses:",
render_harness_context(
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
),
"",
]
if study.llm.use_harness:
sections.extend(
[
"",
"Harnesses:",
render_harness_context(
build_harness_context(
study=study,
window_summary=window_summary,
state=state,
)
),
"",
]
)
else:
sections.extend(
[
"",
"Harnesses:",
"Disabled by llm.use_harness=false for ablation.",
"",
]
)
sections.extend(
[
"The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
"The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
"The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
"If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
"Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
]
(
"Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
if study.llm.use_harness
else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
),
]
)
return "\n".join(sections)

View File

@@ -576,6 +576,7 @@ class LLMPolicySpec:
    endpoint: LLMEndpointSpec | None
    system_prompt: str
    max_history_trials: int
    use_harness: bool = True

    @classmethod
    def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec":
@@ -593,6 +594,11 @@ class LLMPolicySpec:
            max_history_trials=_require_int(
                payload.get("max_history_trials", 8), context="llm.max_history_trials"
            ),
            use_harness=(
                _require_bool(payload.get("use_harness"), context="llm.use_harness")
                if payload.get("use_harness") is not None
                else True
            ),
        )

View File

@@ -13,7 +13,7 @@ from aituner.compare import load_compare_spec, run_compare
from aituner.engine import build_launch_recipe
from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
from aituner.job import append_job, build_trial_job
-from aituner.harness import build_harness_context
from aituner.harness import build_harness_context, build_harness_stop_proposal
from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
from aituner.search import ThresholdProbe, binary_search_max_feasible
from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
@@ -422,6 +422,119 @@ class CoreFlowTests(unittest.TestCase):
        )
        self.assertIn("validate", guard["recommended_next_action"])

    def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(tmp_path)
            study = load_study_spec(study_path)
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_sampling_u=0.02,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=[
                    TrialSummary(
                        trial_id="trial-0001",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=0.8,
                        best_request_rate_per_gpu=0.1,
                        config_patch={"env_patch": {}, "flag_patch": {}},
                    ),
                    TrialSummary(
                        trial_id="trial-0002",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=2.4,
                        best_request_rate_per_gpu=0.3,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 2,
                                "data-parallel-size": 4,
                            },
                        },
                    ),
                    TrialSummary(
                        trial_id="trial-0003",
                        status="completed",
                        parallel_size=8,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 1,
                                "data-parallel-size": 8,
                            },
                        },
                    ),
                    TrialSummary(
                        trial_id="trial-0004",
                        status="completed",
                        parallel_size=8,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {"max-num-seqs": 160},
                        },
                    ),
                ],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            self.assertTrue(context["harness_stop"]["should_stop"])
            self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
            proposal = build_harness_stop_proposal(context)
            self.assertIsNotNone(proposal)
            self.assertTrue(proposal.should_stop)

    def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(tmp_path)
            study = load_study_spec(study_path)
            state = StudyState(
                study_id=study.study_id,
                best_trial_id="trial-0002",
                best_parallel_size=8,
                best_request_rate=2.4,
                best_request_rate_per_gpu=0.3,
                trials=[
                    TrialSummary(
                        trial_id="trial-0001",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=0.8,
                        best_request_rate_per_gpu=0.1,
                        config_patch={"env_patch": {}, "flag_patch": {}},
                    ),
                    TrialSummary(
                        trial_id="trial-0002",
                        status="completed",
                        parallel_size=8,
                        best_request_rate=2.4,
                        best_request_rate_per_gpu=0.3,
                        config_patch={
                            "env_patch": {},
                            "flag_patch": {
                                "tensor-parallel-size": 2,
                                "data-parallel-size": 4,
                            },
                        },
                    ),
                ],
            )
            context = build_harness_context(
                study=study,
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            self.assertFalse(context["harness_stop"]["should_stop"])
            self.assertIsNone(build_harness_stop_proposal(context))

    def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
@@ -564,6 +677,26 @@ class CoreFlowTests(unittest.TestCase):
"\n".join(context["proposal_rules"]),
)
def test_prompt_can_disable_harness_for_ablation(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
study_path = _write_study_assets(tmp_path)
payload = json.loads(study_path.read_text(encoding="utf-8"))
payload["llm"]["use_harness"] = False
study_path.write_text(json.dumps(payload), encoding="utf-8")
study = load_study_spec(study_path)
window, requests = load_trace_requests(study, study_spec_path=study_path)
prompt = build_prompt(
study=study,
window_summary=summarize_window(requests, window),
state=StudyState(study_id=study.study_id),
capability_profile=None,
)
self.assertFalse(study.llm.use_harness)
self.assertIn("Disabled by llm.use_harness=false", prompt)
self.assertNotIn('"paper_alignment"', prompt)
self.assertIn("without harness hints", prompt)
def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
@@ -2299,6 +2432,98 @@ class CoreFlowTests(unittest.TestCase):
            state = store.load_state("study-1")
            self.assertEqual(state.next_trial_index, 1)

    def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            study_path = _write_study_assets(tmp_path)
            study = load_study_spec(study_path)
            store_root = tmp_path / "store"
            store = StudyStore(store_root)
            store.init_study(spec_path=study_path, study=study)
            store.save_state(
                StudyState(
                    study_id=study.study_id,
                    best_trial_id="trial-0002",
                    best_parallel_size=8,
                    best_sampling_u=0.02,
                    best_request_rate=2.4,
                    best_request_rate_per_gpu=0.3,
                    next_trial_index=5,
                    trials=[
                        TrialSummary(
                            trial_id="trial-0001",
                            status="completed",
                            parallel_size=8,
                            best_request_rate=0.8,
                            best_request_rate_per_gpu=0.1,
                            config_patch={"env_patch": {}, "flag_patch": {}},
                        ),
                        TrialSummary(
                            trial_id="trial-0002",
                            status="completed",
                            parallel_size=8,
                            best_request_rate=2.4,
                            best_request_rate_per_gpu=0.3,
                            config_patch={
                                "env_patch": {},
                                "flag_patch": {
                                    "tensor-parallel-size": 2,
                                    "data-parallel-size": 4,
                                },
                            },
                        ),
                        TrialSummary(
                            trial_id="trial-0003",
                            status="completed",
                            parallel_size=8,
                            config_patch={
                                "env_patch": {},
                                "flag_patch": {
                                    "tensor-parallel-size": 1,
                                    "data-parallel-size": 8,
                                },
                            },
                        ),
                        TrialSummary(
                            trial_id="trial-0004",
                            status="completed",
                            parallel_size=8,
                            config_patch={
                                "env_patch": {},
                                "flag_patch": {"max-num-seqs": 160},
                            },
                        ),
                    ],
                )
            )
            with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock:
                with mock.patch("aituner.cli.run_trial") as run_trial_mock:
                    exit_code = cli_main(
                        [
                            "study",
                            "tune",
                            "--spec",
                            str(study_path),
                            "--store-root",
                            str(store_root),
                            "--max-trials",
                            "1",
                        ]
                    )
            self.assertEqual(exit_code, 0)
            llm_mock.assert_not_called()
            run_trial_mock.assert_not_called()
            proposal_path = (
                store.study_root(study.study_id)
                / "proposals"
                / "harness-stop-0005.json"
            )
            self.assertTrue(proposal_path.exists())
            proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
            self.assertTrue(proposal["should_stop"])

    def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)