diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json new file mode 100644 index 0000000..8d71310 --- /dev/null +++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json @@ -0,0 +1,119 @@ +{ + "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness", + "hardware": { + "gpu_count": 8, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "Qwen/Qwen3-30B-A3B", + "served_model_name": "qwen3-30b-a3b-community" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "0.20.0", + "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18230, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 900, + "request_timeout_s": 900, + "launch_args": [ + "serve", + "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B" + ], + "base_envs": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18230, + "served-model-name": "qwen3-30b-a3b-community" + }, + "tunable_envs": [], + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "enable-expert-parallel", + "expert-parallel-size", + "gpu-memory-utilization", + "max-num-batched-tokens", + "max-num-seqs", + "block-size", + "enable-prefix-caching", + "enable-chunked-prefill" + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [1, 2, 4, 8], + "allowed_tensor_parallel_sizes": [1, 2, 4, 8], + "allowed_data_parallel_sizes": [1, 2, 4, 8], + "allowed_expert_parallel_sizes": [1, 2, 4, 8] + }, + "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 64, + "input_length_filter": { + "min_input_tokens": 0, + "max_input_tokens": 8192 + }, + "max_requests_per_probe": 2048, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.125, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. 
Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.", + "max_history_trials": 8, + "use_harness": true, + "endpoint": { + "provider": "codex", + "model": "gpt-5.4", + "stream": true, + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 240 + } + } +} diff --git a/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json new file mode 100644 index 0000000..469c9df --- /dev/null +++ b/configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json @@ -0,0 +1,119 @@ +{ + "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness", + "hardware": { + "gpu_count": 8, + "gpu_model": "H20", + "host_candidates": [ + "dash0" + ] + }, + "model": { + "model_id": "Qwen/Qwen3-30B-A3B", + "served_model_name": "qwen3-30b-a3b-community" + }, + "engine": { + "engine_name": "vllm", + "engine_version": "0.20.0", + "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm", + "cwd": "/home/admin/cpfs/wjh/aituner/aituner", + "host": "127.0.0.1", + "port": 18231, + "healthcheck_path": "/v1/models", + "ready_timeout_s": 900, + "request_timeout_s": 900, + "launch_args": [ + "serve", + "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B" + ], + "base_envs": { + "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7" + }, + "base_flags": { + "host": "127.0.0.1", + "port": 18231, + "served-model-name": "qwen3-30b-a3b-community" + }, + "tunable_envs": [], + "tunable_flags": [ + "tensor-parallel-size", + "data-parallel-size", + "enable-expert-parallel", + "expert-parallel-size", + "gpu-memory-utilization", + "max-num-batched-tokens", + "max-num-seqs", + "block-size", + "enable-prefix-caching", + "enable-chunked-prefill" + ], + "topology_constraints": { + "require_tp_dp_product_equals_gpu_count": false, + "require_ep_size_leq_tp_dp_product": true, + "require_ep_size_divides_tp_dp_product": true, + "require_enable_expert_parallel_when_ep_gt_one": true, + "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true, + "allowed_tp_dp_products": [1, 2, 4, 8], + "allowed_tensor_parallel_sizes": [1, 2, 4, 8], + "allowed_data_parallel_sizes": [1, 2, 4, 8], + "allowed_expert_parallel_sizes": [1, 2, 4, 8] + }, + "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python" + }, + "trace": { + "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json", + "window_id": "chat_w20260311_1000", + "u_field": "sampling_u", + "timestamp_field": "timestamp", + "max_concurrency": 64, + "input_length_filter": { + "min_input_tokens": 0, + "max_input_tokens": 8192 + }, + "max_requests_per_probe": 2048, + "replay_time_scale": 1.0, + "early_stop_max_lag_s": 120.0, + "early_stop_max_elapsed_s": 900.0 + }, + "slo": { + "target_pass_rate": 0.95, + "ttft_rule": { + "kind": "step_ms", + "buckets": [ + { + "max_input_tokens": 4096, + "threshold_ms": 2000 + }, + { + "max_input_tokens": 32768, + "threshold_ms": 4000 + }, + { + "threshold_ms": 6000 + } + ] + }, + "tpot_rule": { + "kind": "fixed_ms", + "threshold_ms": 50 + } + }, + "search": { + "low": 0.0, + "high": 0.125, + "tolerance": 0.001, + "max_probes": 6, + "sample_seed": 20260325 + }, + "llm": { + "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. 
Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.", + "max_history_trials": 8, + "use_harness": false, + "endpoint": { + "provider": "codex", + "model": "gpt-5.4", + "stream": true, + "api_key_env": "OPENAI_API_KEY", + "timeout_s": 240 + } + } +} diff --git a/docs/aituner-harness-summary.md b/docs/aituner-harness-summary.md index 1c0e8c1..9b12707 100644 --- a/docs/aituner-harness-summary.md +++ b/docs/aituner-harness-summary.md @@ -26,11 +26,11 @@ The harness turns each LLM proposal from open-ended config search into a bottlen - `gpu-memory-utilization`: memory headroom after topology and batching are stable. - Each family has `use_when`, `procedure`, `guards`, and `active_now` fields. -4. Proposal discipline +4. Proposal discipline and early stop - The prompt requires the LLM to choose at most one primary knob family unless history proves a coupled change is needed. - It must use adjacent legal topology choices and stay inside topology constraints. - It receives tested config signatures, so it should not repeat already-tried configs. - - It can return `should_stop=true` when no adjacent harness-guided probe is justified. + - A deterministic harness stop can now emit `should_stop=true` before calling the LLM when completed validation evidence says another trial is not justified. 5. Baseline-first loop - LLM-driven `study tune` now evaluates the initial engine config first unless `--skip-baseline` is passed. @@ -44,10 +44,10 @@ The speedup comes from reducing wasted proposal families, not from changing the - For long-prompt, low-cache-reuse windows, the harness activates the TP harness before speculative runtime knobs. - Example: qwen27b 0-8k chat reached `TP=2, DP=1` at iter 2 under harness replay, while the original run spent iter 2 on `DP=2` and iter 3 on `DP=4`. -2. Guarded stop after a strong incumbent +2. Guarded stop after validation, not immediately after a strong incumbent - If the newest trial is the incumbent and improves per-GPU throughput by at least `1.8x` over baseline, the harness requires direct evidence before trying runtime-only tweaks. - - Without that guard, the LLM still proposed weak MBT trials after finding the qwen27b best config. - - With the guard, it emits `should_stop=true`. + - It does not stop at the first large gain. It requires post-incumbent validation trials across nearby topology/runtime families, and stops only if those trials fail to produce a feasible per-GPU improvement. + - With the guard, `study tune` can write a `harness-stop-XXXX` proposal and exit without spending another GPU trial. 3. All-infeasible plateau detection - When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family. @@ -78,6 +78,6 @@ Result: ## Current Risks -- The harness is prompt-guided, not a hard verifier for every rule. If future LLM outputs ignore a fired guard, proposal validation should reject the blocked family explicitly. -- Strong-incumbent stopping is intentionally biased toward fewer GPU trials once a large gain is already reached. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config. +- The harness is still prompt-guided for choosing the next non-stop proposal. 
The deterministic stop path is hard-coded in `study tune`, but proposal-family blocking is not yet enforced by a separate validator. +- Strong-incumbent stopping is intentionally biased toward fewer GPU trials after validation evidence accumulates. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config. - Full fresh reruns on large models are expensive. Strict replay is useful for measuring proposal-path improvements when the proposed configs already exist in prior measured runs, but publication-quality claims still need fresh no-relaunch runs when time allows. diff --git a/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md new file mode 100644 index 0000000..6852f7b --- /dev/null +++ b/docs/qwen30b-community-vllm020/harness-early-stop-ablation-20260502.md @@ -0,0 +1,93 @@ +# Qwen3-30B-A3B Community vLLM Harness Ablation, 2026-05-02 + +## Goal + +Run a fresh dash0 experiment on the community vLLM latest release with the local community model: + +`/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B` + +The comparison is: + +| Variant | Spec | Harness | +| --- | --- | --- | +| no-harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json` | disabled via `llm.use_harness=false` | +| harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json` | enabled, including deterministic stop proposal | + +Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix cache, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model. + +## vLLM Install + +PyPI reports `vllm==0.20.0` as the current community release checked on 2026-05-02. The dash0 installation target is: + +`/home/admin/cpfs/wjh/venvs/vllm-0.20.0` + +Install log: + +`/home/admin/cpfs/wjh/aituner/aituner/logs/install_vllm_0.20.0_20260502.log` + +## Workload + +The experiment reuses the 0-8k chat window that has already been used for qwen27b harness work: + +| Field | Value | +| --- | --- | +| window | `chat_w20260311_1000` | +| source rows | 32606 | +| input filter | 0 to 8192 tokens | +| max requests per probe | 2048 | +| target pass rate | 0.95 | +| TTFT SLO | 2s up to 4k, 4s up to 32k, 6s above | +| TPOT SLO | 50ms | +| search high | 0.125 sampling_u | +| max probes per trial | 6 | + +The `max_requests_per_probe=2048` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and binary-search threshold probe. + +## Harness Update Under Test + +This run tests a stricter early-stop harness: + +- The harness still injects L-C-A workload features, recent trial diagnostics, active bottleneck, legal topology candidates, tested signatures, and knob-family rules. +- A strong incumbent no longer means immediate stop. It means "validate nearby alternatives". 
+- Deterministic stop is allowed only after completed validation evidence says continuing is unlikely to be useful: + - the incumbent beats baseline by a generic large-gain ratio, + - at least two post-incumbent validation trials have run, + - those validation trials did not produce a feasible per-GPU improvement, + - the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts. +- If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal. + +This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number. + +## Unit Tests + +Local test command: + +```bash +PYTHONPATH=src python3 -m unittest tests.test_core_flow -q +``` + +Result: passed, 74 tests. + +The added coverage checks: + +| Test | Purpose | +| --- | --- | +| `test_harness_does_not_stop_immediately_after_strong_incumbent` | strong incumbent requires validation first | +| `test_harness_stop_after_post_incumbent_validation_is_exhausted` | deterministic stop after validation exhaustion | +| `test_cli_tune_uses_harness_stop_before_llm` | `study tune` can stop without calling the LLM or launching another GPU trial | +| `test_prompt_can_disable_harness_for_ablation` | no-harness prompt removes structured harness context | + +## Experiment Tracking + +Pending dash0 runs: + +| Variant | tmux session | Log | Study root | +| --- | --- | --- | --- | +| no-harness | `qwen30b_vllm020_noharness_20260502` | `logs/qwen30b_vllm020_noharness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness` | +| harness | `qwen30b_vllm020_harness_20260502` | `logs/qwen30b_vllm020_harness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness` | + +The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point. + +## Results + +Pending. This section will be filled after the dash0 experiments finish. 
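+## Appendix: Illustrative Stop-Rule Walkthrough
+
+The sketch below restates the deterministic stop rule from "Harness Update Under Test" on hypothetical numbers borrowed from the unit-test fixture (baseline 0.10 requests/s per GPU, incumbent 0.30, then two infeasible validation probes covering one topology change and one runtime change). The `should_stop` helper and its inputs are illustrative only; the authoritative logic is `_validation_exhausted_guard` in `src/aituner/harness.py`.
+
+```python
+STRONG_INCUMBENT_MIN_GAIN = 1.8
+MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
+VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
+
+
+def should_stop(baseline_rate: float, incumbent_rate: float, probes: list[dict]) -> bool:
+    """probes: post-incumbent validation trials, each with 'feasible_rate'
+    (float or None) and 'families' (a set such as {'topology'} or {'runtime'})."""
+    if baseline_rate <= 0 or incumbent_rate / baseline_rate < STRONG_INCUMBENT_MIN_GAIN:
+        return False  # incumbent gain is not large enough to consider stopping
+    if len(probes) < MIN_POST_INCUMBENT_VALIDATION_TRIALS:
+        return False  # need at least two validation probes after the incumbent
+    if any(p["feasible_rate"] is not None for p in probes):
+        return False  # a validation probe produced a feasible candidate, keep searching
+    families = set().union(*(p["families"] for p in probes))
+    covered = "topology" in families and "runtime" in families
+    return covered or len(probes) >= VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE
+
+
+# Baseline 0.10 req/s per GPU, incumbent 0.30 (3.0x >= 1.8x), two infeasible probes:
+# one topology change (TP/DP) and one runtime change (max-num-seqs).
+probes = [
+    {"feasible_rate": None, "families": {"topology"}},
+    {"feasible_rate": None, "families": {"runtime"}},
+]
+print(should_stop(0.10, 0.30, probes))  # True: study tune writes harness-stop-XXXX and exits
+```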
diff --git a/src/aituner/cli.py b/src/aituner/cli.py index ccd08bb..05878b1 100644 --- a/src/aituner/cli.py +++ b/src/aituner/cli.py @@ -6,9 +6,10 @@ import sys from pathlib import Path from .compare import run_compare +from .harness import build_harness_context, build_harness_stop_proposal from .job import append_job, build_trial_job from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text -from .spec import Proposal, SpecError, load_study_spec +from .spec import Proposal, SpecError, load_study_spec, to_jsonable from .store import StudyStore from .trace import load_trace_requests, summarize_window from .worker import run_trial @@ -118,16 +119,23 @@ def cmd_study_tune(args: argparse.Namespace) -> int: raise SpecError("max_trials must be positive") if proposal_files and max_trials > len(proposal_files): max_trials = len(proposal_files) - if not proposal_files and study.llm.endpoint is None: - raise SpecError("No proposal files provided and study.llm.endpoint is not configured") - executed: list[dict[str, object]] = [] for idx in range(max_trials): state = store.load_state(study.study_id) window, requests = load_trace_requests(study, study_spec_path=spec_path) + window_summary = summarize_window(requests, window) + harness_context = ( + build_harness_context( + study=study, + window_summary=window_summary, + state=state, + ) + if study.llm.use_harness + else None + ) prompt = build_prompt( study=study, - window_summary=summarize_window(requests, window), + window_summary=window_summary, state=state, capability_profile=capability_profile, ) @@ -162,18 +170,36 @@ def cmd_study_tune(args: argparse.Namespace) -> int: proposal_name = proposal_source.stem else: proposal_source = None - proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt) - proposal_name = f"proposal-{state.next_trial_index:04d}" + stop_proposal = ( + build_harness_stop_proposal(harness_context) + if harness_context is not None + else None + ) + if stop_proposal is not None: + proposal_text = json.dumps(to_jsonable(stop_proposal), ensure_ascii=False) + proposal_name = f"harness-stop-{state.next_trial_index:04d}" + else: + if study.llm.endpoint is None: + raise SpecError( + "No proposal files provided, study.llm.endpoint is not configured, " + "and the harness stop guard did not fire." 
+ ) + proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt) + proposal_name = f"proposal-{state.next_trial_index:04d}" raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt" raw_proposal_path.write_text(proposal_text, encoding="utf-8") proposal = parse_proposal_text(proposal_text, study) store.write_proposal(study.study_id, proposal_name, proposal) if proposal.should_stop: + if proposal_name.startswith("harness-stop-"): + proposal_source_label = "harness" + else: + proposal_source_label = str(proposal_source) if proposal_source else "llm" executed.append( { "trial_id": None, "proposal_name": proposal_name, - "proposal_source": str(proposal_source) if proposal_source else "llm", + "proposal_source": proposal_source_label, "stopped": True, "diagnosis": proposal.diagnosis, "state_best_trial_id": state.best_trial_id, diff --git a/src/aituner/harness.py b/src/aituner/harness.py index 10059f9..d1bd40d 100644 --- a/src/aituner/harness.py +++ b/src/aituner/harness.py @@ -4,7 +4,25 @@ import json from pathlib import Path from typing import Any -from .spec import StudySpec, StudyState, TrialSummary +from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary + + +_TOPOLOGY_KEYS = { + "tensor-parallel-size", + "data-parallel-size", + "expert-parallel-size", + "enable-expert-parallel", +} +_RUNTIME_KEYS = { + "max-num-seqs", + "max-num-batched-tokens", + "block-size", + "gpu-memory-utilization", + "enable-chunked-prefill", +} +_STRONG_INCUMBENT_MIN_GAIN = 1.8 +_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2 +_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3 def build_harness_context( @@ -23,11 +41,39 @@ def build_harness_context( "workload_lca_profile": _workload_lca_profile(window_summary), "recent_trial_diagnostics": recent_diagnostics, "convergence_guard": _convergence_guard(state, recent_diagnostics), + "harness_stop": _harness_stop_decision(state, recent_diagnostics), "knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics), "proposal_rules": _proposal_rules(), } +def build_harness_stop_proposal(context: dict[str, Any]) -> Proposal | None: + stop = context.get("harness_stop") + if not isinstance(stop, dict) or not stop.get("should_stop"): + return None + reason = str(stop.get("reason") or "harness_converged") + evidence = stop.get("evidence") if isinstance(stop.get("evidence"), dict) else {} + observation = ( + "Harness convergence guard triggered before requesting another proposal: " + f"{reason}." + ) + diagnosis = str(evidence.get("summary") or reason) + return Proposal( + observation=observation, + diagnosis=diagnosis, + config_patch=ConfigPatch(env_patch={}, flag_patch={}), + expected_effects=[ + "stop without spending another GPU trial", + "preserve the current best observed configuration", + ], + why_not_previous_failures=( + "The stop decision is based on completed validation evidence and does not " + "repeat any failed configuration." 
+ ), + should_stop=True, + ) + + def render_harness_context(context: dict[str, Any]) -> str: return json.dumps(context, ensure_ascii=False, indent=2) @@ -423,6 +469,158 @@ def _convergence_guard( } +def _harness_stop_decision( + state: StudyState, + recent_diagnostics: list[dict[str, Any]], +) -> dict[str, Any]: + guard = _convergence_guard(state, recent_diagnostics) + if guard["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]: + return { + "should_stop": True, + "reason": guard["reason"], + "evidence": { + "summary": "The convergence guard fired and no further adjacent probe is required.", + "convergence_guard": guard, + }, + } + validation = _validation_exhausted_guard(state, recent_diagnostics) + if validation["exhausted"]: + return { + "should_stop": True, + "reason": validation["reason"], + "evidence": validation, + } + return { + "should_stop": False, + "reason": "continue_harness_guided_search", + "evidence": { + "summary": "No deterministic harness stop condition is satisfied.", + "convergence_guard": guard, + "validation_exhausted": validation, + }, + } + + +def _validation_exhausted_guard( + state: StudyState, + recent_diagnostics: list[dict[str, Any]], +) -> dict[str, Any]: + default = { + "exhausted": False, + "reason": "validation_not_exhausted", + "summary": "Validation probes are not sufficient to stop yet.", + "incumbent_trial_id": state.best_trial_id, + "incumbent_gain_vs_baseline": None, + "validation_trial_ids": [], + "validation_families": [], + } + if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)): + return default + completed = [ + item + for item in recent_diagnostics + if item.get("status") == "completed" + and isinstance(item.get("best_request_rate_per_gpu"), (int, float)) + ] + if not completed: + return default + baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu")) + incumbent_rate = _as_float(state.best_request_rate_per_gpu) + if baseline_rate <= 0 or incumbent_rate <= 0: + return default + gain = incumbent_rate / baseline_rate + if gain < _STRONG_INCUMBENT_MIN_GAIN: + return { + **default, + "reason": "incumbent_gain_not_large_enough_for_validation_stop", + "incumbent_gain_vs_baseline": gain, + } + + best_index = next( + ( + index + for index, item in enumerate(recent_diagnostics) + if item.get("trial_id") == state.best_trial_id + ), + None, + ) + if best_index is None: + return { + **default, + "reason": "incumbent_not_in_recent_harness_history", + "incumbent_gain_vs_baseline": gain, + } + after_best = [ + item + for item in recent_diagnostics[best_index + 1 :] + if item.get("status") in {"completed", "failed"} + ] + if len(after_best) < _MIN_POST_INCUMBENT_VALIDATION_TRIALS: + return { + **default, + "reason": "need_at_least_two_post_incumbent_validation_trials", + "incumbent_gain_vs_baseline": gain, + "validation_trial_ids": [str(item.get("trial_id")) for item in after_best], + } + if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best): + return { + **default, + "reason": "post_incumbent_validation_found_feasible_candidate", + "incumbent_gain_vs_baseline": gain, + "validation_trial_ids": [str(item.get("trial_id")) for item in after_best], + } + + families: set[str] = set() + for item in after_best: + families.update(_validation_families(item)) + has_topology = "topology" in families + has_runtime = bool(families & {"runtime", "max-num-seqs", "max-num-batched-tokens"}) + enough_evidence = ( + len(after_best) >= 
_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE + or (has_topology and has_runtime) + ) + if not enough_evidence: + return { + **default, + "reason": "post_incumbent_validation_has_not_covered_enough_families", + "incumbent_gain_vs_baseline": gain, + "validation_trial_ids": [str(item.get("trial_id")) for item in after_best], + "validation_families": sorted(families), + } + + return { + "exhausted": True, + "reason": "post_incumbent_validation_exhausted", + "summary": ( + "A strong incumbent was followed by validation probes across nearby " + "topology/runtime families, and none produced a feasible candidate." + ), + "incumbent_trial_id": state.best_trial_id, + "incumbent_gain_vs_baseline": gain, + "validation_trial_ids": [str(item.get("trial_id")) for item in after_best], + "validation_families": sorted(families), + } + + +def _validation_families(item: dict[str, Any]) -> set[str]: + config_patch = item.get("config_patch") + if not isinstance(config_patch, dict): + return set() + flag_patch = config_patch.get("flag_patch") + if not isinstance(flag_patch, dict): + return set() + families: set[str] = set() + if any(key in flag_patch for key in _TOPOLOGY_KEYS): + families.add("topology") + for key in _RUNTIME_KEYS: + if key in flag_patch: + families.add("runtime") + families.add(key) + if not families and flag_patch: + families.add("other") + return families + + def _strong_incumbent_guard( state: StudyState, recent_diagnostics: list[dict[str, Any]], diff --git a/src/aituner/llm.py b/src/aituner/llm.py index 49b6b12..d27d9bf 100644 --- a/src/aituner/llm.py +++ b/src/aituner/llm.py @@ -312,23 +312,45 @@ def build_prompt( "", "Tested config signatures:", json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2), - "", - "Harnesses:", - render_harness_context( - build_harness_context( - study=study, - window_summary=window_summary, - state=state, - ) - ), - "", + ] + if study.llm.use_harness: + sections.extend( + [ + "", + "Harnesses:", + render_harness_context( + build_harness_context( + study=study, + window_summary=window_summary, + state=state, + ) + ), + "", + ] + ) + else: + sections.extend( + [ + "", + "Harnesses:", + "Disabled by llm.use_harness=false for ablation.", + "", + ] + ) + sections.extend( + [ "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.", "The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.", "The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.", "If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.", "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.", - "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.", - ] + ( + "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged." + if study.llm.use_harness + else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints." 
+ ), + ] + ) return "\n".join(sections) diff --git a/src/aituner/spec.py b/src/aituner/spec.py index f4d0944..98f7573 100644 --- a/src/aituner/spec.py +++ b/src/aituner/spec.py @@ -576,6 +576,7 @@ class LLMPolicySpec: endpoint: LLMEndpointSpec | None system_prompt: str max_history_trials: int + use_harness: bool = True @classmethod def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec": @@ -593,6 +594,11 @@ class LLMPolicySpec: max_history_trials=_require_int( payload.get("max_history_trials", 8), context="llm.max_history_trials" ), + use_harness=( + _require_bool(payload.get("use_harness"), context="llm.use_harness") + if payload.get("use_harness") is not None + else True + ), ) diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py index af33269..f33125b 100644 --- a/tests/test_core_flow.py +++ b/tests/test_core_flow.py @@ -13,7 +13,7 @@ from aituner.compare import load_compare_spec, run_compare from aituner.engine import build_launch_recipe from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy from aituner.job import append_job, build_trial_job -from aituner.harness import build_harness_context +from aituner.harness import build_harness_context, build_harness_stop_proposal from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal from aituner.search import ThresholdProbe, binary_search_max_feasible from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations @@ -422,6 +422,119 @@ class CoreFlowTests(unittest.TestCase): ) self.assertIn("validate", guard["recommended_next_action"]) + def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets(tmp_path) + study = load_study_spec(study_path) + state = StudyState( + study_id=study.study_id, + best_trial_id="trial-0002", + best_parallel_size=8, + best_sampling_u=0.02, + best_request_rate=2.4, + best_request_rate_per_gpu=0.3, + trials=[ + TrialSummary( + trial_id="trial-0001", + status="completed", + parallel_size=8, + best_request_rate=0.8, + best_request_rate_per_gpu=0.1, + config_patch={"env_patch": {}, "flag_patch": {}}, + ), + TrialSummary( + trial_id="trial-0002", + status="completed", + parallel_size=8, + best_request_rate=2.4, + best_request_rate_per_gpu=0.3, + config_patch={ + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 2, + "data-parallel-size": 4, + }, + }, + ), + TrialSummary( + trial_id="trial-0003", + status="completed", + parallel_size=8, + config_patch={ + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 1, + "data-parallel-size": 8, + }, + }, + ), + TrialSummary( + trial_id="trial-0004", + status="completed", + parallel_size=8, + config_patch={ + "env_patch": {}, + "flag_patch": {"max-num-seqs": 160}, + }, + ), + ], + ) + context = build_harness_context( + study=study, + window_summary={"prompt_tokens_p95": 2048}, + state=state, + ) + self.assertTrue(context["harness_stop"]["should_stop"]) + self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted") + proposal = build_harness_stop_proposal(context) + self.assertIsNotNone(proposal) + self.assertTrue(proposal.should_stop) + + def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets(tmp_path) + study = load_study_spec(study_path) + state = StudyState( + 
study_id=study.study_id, + best_trial_id="trial-0002", + best_parallel_size=8, + best_request_rate=2.4, + best_request_rate_per_gpu=0.3, + trials=[ + TrialSummary( + trial_id="trial-0001", + status="completed", + parallel_size=8, + best_request_rate=0.8, + best_request_rate_per_gpu=0.1, + config_patch={"env_patch": {}, "flag_patch": {}}, + ), + TrialSummary( + trial_id="trial-0002", + status="completed", + parallel_size=8, + best_request_rate=2.4, + best_request_rate_per_gpu=0.3, + config_patch={ + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 2, + "data-parallel-size": 4, + }, + }, + ), + ], + ) + context = build_harness_context( + study=study, + window_summary={"prompt_tokens_p95": 2048}, + state=state, + ) + self.assertFalse(context["harness_stop"]["should_stop"]) + self.assertIsNone(build_harness_stop_proposal(context)) + def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) @@ -564,6 +677,26 @@ class CoreFlowTests(unittest.TestCase): "\n".join(context["proposal_rules"]), ) + def test_prompt_can_disable_harness_for_ablation(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets(tmp_path) + payload = json.loads(study_path.read_text(encoding="utf-8")) + payload["llm"]["use_harness"] = False + study_path.write_text(json.dumps(payload), encoding="utf-8") + study = load_study_spec(study_path) + window, requests = load_trace_requests(study, study_spec_path=study_path) + prompt = build_prompt( + study=study, + window_summary=summarize_window(requests, window), + state=StudyState(study_id=study.study_id), + capability_profile=None, + ) + self.assertFalse(study.llm.use_harness) + self.assertIn("Disabled by llm.use_harness=false", prompt) + self.assertNotIn('"paper_alignment"', prompt) + self.assertIn("without harness hints", prompt) + def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp) @@ -2299,6 +2432,98 @@ class CoreFlowTests(unittest.TestCase): state = store.load_state("study-1") self.assertEqual(state.next_trial_index, 1) + def test_cli_tune_uses_harness_stop_before_llm(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + study_path = _write_study_assets(tmp_path) + study = load_study_spec(study_path) + store_root = tmp_path / "store" + store = StudyStore(store_root) + store.init_study(spec_path=study_path, study=study) + store.save_state( + StudyState( + study_id=study.study_id, + best_trial_id="trial-0002", + best_parallel_size=8, + best_sampling_u=0.02, + best_request_rate=2.4, + best_request_rate_per_gpu=0.3, + next_trial_index=5, + trials=[ + TrialSummary( + trial_id="trial-0001", + status="completed", + parallel_size=8, + best_request_rate=0.8, + best_request_rate_per_gpu=0.1, + config_patch={"env_patch": {}, "flag_patch": {}}, + ), + TrialSummary( + trial_id="trial-0002", + status="completed", + parallel_size=8, + best_request_rate=2.4, + best_request_rate_per_gpu=0.3, + config_patch={ + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 2, + "data-parallel-size": 4, + }, + }, + ), + TrialSummary( + trial_id="trial-0003", + status="completed", + parallel_size=8, + config_patch={ + "env_patch": {}, + "flag_patch": { + "tensor-parallel-size": 1, + "data-parallel-size": 8, + }, + }, + ), + TrialSummary( + trial_id="trial-0004", + status="completed", + parallel_size=8, + config_patch={ + 
"env_patch": {}, + "flag_patch": {"max-num-seqs": 160}, + }, + ), + ], + ) + ) + + with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock: + with mock.patch("aituner.cli.run_trial") as run_trial_mock: + exit_code = cli_main( + [ + "study", + "tune", + "--spec", + str(study_path), + "--store-root", + str(store_root), + "--max-trials", + "1", + ] + ) + + self.assertEqual(exit_code, 0) + llm_mock.assert_not_called() + run_trial_mock.assert_not_called() + proposal_path = ( + store.study_root(study.study_id) + / "proposals" + / "harness-stop-0005.json" + ) + self.assertTrue(proposal_path.exists()) + proposal = json.loads(proposal_path.read_text(encoding="utf-8")) + self.assertTrue(proposal["should_stop"]) + def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None: with tempfile.TemporaryDirectory() as tmp: tmp_path = Path(tmp)