Add harness early stop ablation
configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json
@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18230,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18230,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": true,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}
configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json
@@ -0,0 +1,119 @@
{
  "study_id": "dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness",
  "hardware": {
    "gpu_count": 8,
    "gpu_model": "H20",
    "host_candidates": [
      "dash0"
    ]
  },
  "model": {
    "model_id": "Qwen/Qwen3-30B-A3B",
    "served_model_name": "qwen3-30b-a3b-community"
  },
  "engine": {
    "engine_name": "vllm",
    "engine_version": "0.20.0",
    "exec_path": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/vllm",
    "cwd": "/home/admin/cpfs/wjh/aituner/aituner",
    "host": "127.0.0.1",
    "port": 18231,
    "healthcheck_path": "/v1/models",
    "ready_timeout_s": 900,
    "request_timeout_s": 900,
    "launch_args": [
      "serve",
      "/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B"
    ],
    "base_envs": {
      "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7"
    },
    "base_flags": {
      "host": "127.0.0.1",
      "port": 18231,
      "served-model-name": "qwen3-30b-a3b-community"
    },
    "tunable_envs": [],
    "tunable_flags": [
      "tensor-parallel-size",
      "data-parallel-size",
      "enable-expert-parallel",
      "expert-parallel-size",
      "gpu-memory-utilization",
      "max-num-batched-tokens",
      "max-num-seqs",
      "block-size",
      "enable-prefix-caching",
      "enable-chunked-prefill"
    ],
    "topology_constraints": {
      "require_tp_dp_product_equals_gpu_count": false,
      "require_ep_size_leq_tp_dp_product": true,
      "require_ep_size_divides_tp_dp_product": true,
      "require_enable_expert_parallel_when_ep_gt_one": true,
      "validate_cuda_graph_sizes_divisible_by_tp_when_tp_ep_reduce_scatter": true,
      "allowed_tp_dp_products": [1, 2, 4, 8],
      "allowed_tensor_parallel_sizes": [1, 2, 4, 8],
      "allowed_data_parallel_sizes": [1, 2, 4, 8],
      "allowed_expert_parallel_sizes": [1, 2, 4, 8]
    },
    "python_executable": "/home/admin/cpfs/wjh/venvs/vllm-0.20.0/bin/python"
  },
  "trace": {
    "windows_path": "/home/admin/cpfs/wjh/aituner/aituner/trace_windows/windows.json",
    "window_id": "chat_w20260311_1000",
    "u_field": "sampling_u",
    "timestamp_field": "timestamp",
    "max_concurrency": 64,
    "input_length_filter": {
      "min_input_tokens": 0,
      "max_input_tokens": 8192
    },
    "max_requests_per_probe": 2048,
    "replay_time_scale": 1.0,
    "early_stop_max_lag_s": 120.0,
    "early_stop_max_elapsed_s": 900.0
  },
  "slo": {
    "target_pass_rate": 0.95,
    "ttft_rule": {
      "kind": "step_ms",
      "buckets": [
        {
          "max_input_tokens": 4096,
          "threshold_ms": 2000
        },
        {
          "max_input_tokens": 32768,
          "threshold_ms": 4000
        },
        {
          "threshold_ms": 6000
        }
      ]
    },
    "tpot_rule": {
      "kind": "fixed_ms",
      "threshold_ms": 50
    }
  },
  "search": {
    "low": 0.0,
    "high": 0.125,
    "tolerance": 0.001,
    "max_probes": 6,
    "sample_seed": 20260325
  },
  "llm": {
    "system_prompt": "Tune community vLLM 0.20.0 serving for Qwen3-30B-A3B. Start from the default vLLM engine configuration, use only launch-safe patches, and optimize request_rate_per_gpu under the configured SLO.",
    "max_history_trials": 8,
    "use_harness": false,
    "endpoint": {
      "provider": "codex",
      "model": "gpt-5.4",
      "stream": true,
      "api_key_env": "OPENAI_API_KEY",
      "timeout_s": 240
    }
  }
}
@@ -26,11 +26,11 @@ The harness turns each LLM proposal from open-ended config search into a bottleneck
 - `gpu-memory-utilization`: memory headroom after topology and batching are stable.
 - Each family has `use_when`, `procedure`, `guards`, and `active_now` fields.
 
-4. Proposal discipline
+4. Proposal discipline and early stop
 - The prompt requires the LLM to choose at most one primary knob family unless history proves a coupled change is needed.
 - It must use adjacent legal topology choices and stay inside topology constraints.
 - It receives tested config signatures, so it should not repeat already-tried configs.
-- It can return `should_stop=true` when no adjacent harness-guided probe is justified.
+- A deterministic harness stop can now emit `should_stop=true` before calling the LLM when completed validation evidence says another trial is not justified.
 
 5. Baseline-first loop
 - LLM-driven `study tune` now evaluates the initial engine config first unless `--skip-baseline` is passed.
@@ -44,10 +44,10 @@ The speedup comes from reducing wasted proposal families, not from changing the
 - For long-prompt, low-cache-reuse windows, the harness activates the TP harness before speculative runtime knobs.
 - Example: qwen27b 0-8k chat reached `TP=2, DP=1` at iter 2 under harness replay, while the original run spent iter 2 on `DP=2` and iter 3 on `DP=4`.
 
-2. Guarded stop after a strong incumbent
+2. Guarded stop after validation, not immediately after a strong incumbent
 - If the newest trial is the incumbent and improves per-GPU throughput by at least `1.8x` over baseline, the harness requires direct evidence before trying runtime-only tweaks.
-- Without that guard, the LLM still proposed weak MBT trials after finding the qwen27b best config.
-- With the guard, it emits `should_stop=true`.
+- It does not stop at the first large gain. It requires post-incumbent validation trials across nearby topology/runtime families, and stops only if those trials fail to produce a feasible per-GPU improvement.
+- With the guard, `study tune` can write a `harness-stop-XXXX` proposal and exit without spending another GPU trial.
 
 3. All-infeasible plateau detection
 - When recent all-infeasible trials at the same sampling threshold stop improving pass rate and p95 TTFT, the harness blocks repeating the same primary knob family.
@@ -78,6 +78,6 @@ Result:
 
 ## Current Risks
 
-- The harness is prompt-guided, not a hard verifier for every rule. If future LLM outputs ignore a fired guard, proposal validation should reject the blocked family explicitly.
-- Strong-incumbent stopping is intentionally biased toward fewer GPU trials once a large gain is already reached. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
+- The harness is still prompt-guided for choosing the next non-stop proposal. The deterministic stop path is hard-coded in `study tune`, but proposal-family blocking is not yet enforced by a separate validator.
+- Strong-incumbent stopping is intentionally biased toward fewer GPU trials after validation evidence accumulates. Workloads with very narrow runtime sweet spots may still need a "continue local refinement" exception when the user wants absolute best throughput rather than fastest convergence to a good config.
 - Full fresh reruns on large models are expensive. Strict replay is useful for measuring proposal-path improvements when the proposed configs already exist in prior measured runs, but publication-quality claims still need fresh no-relaunch runs when time allows.
@@ -0,0 +1,93 @@
# Qwen3-30B-A3B Community vLLM Harness Ablation, 2026-05-02

## Goal

Run a fresh dash0 experiment on the latest community vLLM release with the local copy of the community model:

`/home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B`

The comparison is:

| Variant | Spec | Harness |
| --- | --- | --- |
| no-harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_noharness.json` | disabled via `llm.use_harness=false` |
| harness | `configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json` | enabled, including the deterministic stop proposal |

Both specs start from the same base vLLM configuration. The base contains only serving access fields: `host`, `port`, and `served-model-name`. It does not set performance flags such as TP, DP, EP, max model length, prefix caching, chunked prefill, max-num-seqs, max-num-batched-tokens, or gpu-memory-utilization. The first trial therefore measures community vLLM defaults for this model.
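
As a sanity check, the baseline launch command falls out of those fields directly. A minimal sketch, assuming the harness spec path from the table above (aituner's real launcher may assemble and quote the command differently):

```python
import json
import shlex

# Sketch: rebuild the baseline trial's launch command from the study spec.
# Only serving access fields are present, so this is the community-default launch.
spec_path = "configs/examples/dash0_qwen30b_a3b_community_vllm020_harness.json"
with open(spec_path, encoding="utf-8") as fh:
    engine = json.load(fh)["engine"]

argv = [engine["exec_path"], *engine["launch_args"]]
for flag, value in engine["base_flags"].items():
    argv += [f"--{flag}", str(value)]

# vllm serve /home/admin/cpfs/wjh/models/Qwen/Qwen3-30B-A3B \
#   --host 127.0.0.1 --port 18230 --served-model-name qwen3-30b-a3b-community
print(shlex.join(argv))
```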

## vLLM Install

PyPI reports `vllm==0.20.0` as the current community release, checked on 2026-05-02. The dash0 installation target is:

`/home/admin/cpfs/wjh/venvs/vllm-0.20.0`

Install log:

`/home/admin/cpfs/wjh/aituner/aituner/logs/install_vllm_0.20.0_20260502.log`

## Workload

The experiment reuses the 0-8k chat window that has already been used for the qwen27b harness work:

| Field | Value |
| --- | --- |
| window | `chat_w20260311_1000` |
| source rows | 32606 |
| input filter | 0 to 8192 tokens |
| max requests per probe | 2048 |
| target pass rate | 0.95 |
| TTFT SLO | 2s up to 4k input tokens, 4s up to 32k, 6s above |
| TPOT SLO | 50ms |
| search high | 0.125 sampling_u |
| max probes per trial | 6 |

The `max_requests_per_probe=2048` cap keeps the fresh community-vLLM ablation practical while preserving a real trace-shaped replay, SLO scoring, and a binary-search threshold probe.
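
For reference, the probe's control flow is roughly the sketch below; `run_probe` is a hypothetical stand-in for the real trace replay that returns the measured SLO pass rate (scored against the step TTFT buckets and the 50ms TPOT rule), and the constants mirror the study's `search` block:

```python
# Sketch: binary search for the largest feasible sampling_u. Each probe replays
# up to max_requests_per_probe trace requests at threshold u and scores them
# against the SLO; feasible means pass rate >= target_pass_rate.
def max_feasible_sampling_u(run_probe, low=0.0, high=0.125,
                            tolerance=0.001, max_probes=6,
                            target_pass_rate=0.95):
    best = None
    for _ in range(max_probes):
        if high - low <= tolerance:
            break
        mid = (low + high) / 2.0
        if run_probe(mid) >= target_pass_rate:
            best = mid   # feasible: try a higher threshold
            low = mid
        else:
            high = mid   # infeasible: back off
    return best          # None if no probed threshold met the SLO
```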

## Harness Update Under Test

This run tests a stricter early-stop harness:

- The harness still injects L-C-A workload features, recent trial diagnostics, active bottleneck, legal topology candidates, tested signatures, and knob-family rules.
- A strong incumbent no longer means immediate stop. It means "validate nearby alternatives".
- Deterministic stop is allowed only after completed validation evidence says continuing is unlikely to be useful:
  - the incumbent beats baseline by a generic large-gain ratio,
  - at least two post-incumbent validation trials have run,
  - those validation trials did not produce a feasible per-GPU improvement,
  - the validation covered topology and runtime families, or accumulated at least three post-incumbent validation attempts.
- If the stop guard fires, `study tune` writes `harness-stop-XXXX` and exits without spending another GPU trial or asking the LLM for another proposal.

This is a generic harness rule, not a testcase-specific threshold. It does not depend on qwen27b, qwen235b, qwen30b, a fixed TP/DP value, or a hardcoded SLO number.
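
Condensed into code, the stop predicate is roughly the sketch below. The authoritative version is `_validation_exhausted_guard` in this commit's `harness.py`; the flat arguments here are a simplification of the trial-history bookkeeping it does:

```python
# Sketch of the generic validation-exhausted stop rule. `gain` is incumbent
# request_rate_per_gpu over baseline; `validation_trials` are the completed or
# failed trials run after the incumbent; `families` are the knob families
# ("topology", "runtime", ...) those trials touched.
STRONG_INCUMBENT_MIN_GAIN = 1.8
MIN_VALIDATION_TRIALS = 2
TRIALS_WITHOUT_FAMILY_COVERAGE = 3

def validation_exhausted(gain: float, validation_trials: list[dict],
                         families: set[str]) -> bool:
    if gain < STRONG_INCUMBENT_MIN_GAIN:
        return False  # incumbent not strong enough for a validation-based stop
    if len(validation_trials) < MIN_VALIDATION_TRIALS:
        return False  # a strong incumbent alone never stops the study
    if any(t.get("best_request_rate_per_gpu") is not None for t in validation_trials):
        return False  # some validation probe was feasible; keep searching
    covered = "topology" in families and "runtime" in families
    return covered or len(validation_trials) >= TRIALS_WITHOUT_FAMILY_COVERAGE
```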

## Unit Tests

Local test command:

```bash
PYTHONPATH=src python3 -m unittest tests.test_core_flow -q
```

Result: passed, 74 tests.

The added coverage checks:

| Test | Purpose |
| --- | --- |
| `test_harness_does_not_stop_immediately_after_strong_incumbent` | strong incumbent requires validation first |
| `test_harness_stop_after_post_incumbent_validation_is_exhausted` | deterministic stop after validation exhaustion |
| `test_cli_tune_uses_harness_stop_before_llm` | `study tune` can stop without calling the LLM or launching another GPU trial |
| `test_prompt_can_disable_harness_for_ablation` | no-harness prompt removes structured harness context |

## Experiment Tracking

Pending dash0 runs:

| Variant | tmux session | Log | Study root |
| --- | --- | --- | --- |
| no-harness | `qwen30b_vllm020_noharness_20260502` | `logs/qwen30b_vllm020_noharness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-noharness` |
| harness | `qwen30b_vllm020_harness_20260502` | `logs/qwen30b_vllm020_harness_20260502.log` | `.aituner-community-vllm020/studies/dash0-qwen30b-a3b-community-vllm020-chat-0-8k-harness` |

The harness run should be judged by best-so-far `request_rate_per_gpu` per tuning iteration, plus whether it stops only after validation evidence. The no-harness run should use the same trial budget so the ablation exposes whether the early-stop harness saves iterations without hiding a later better point.
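
A minimal sketch of that judgment, assuming each study root yields its trial summaries in execution order with a `best_request_rate_per_gpu` field:

```python
# Sketch: best-so-far request_rate_per_gpu after each tuning iteration,
# the curve on which the harness and no-harness runs will be compared.
def best_so_far_curve(trials: list[dict]) -> list[float]:
    best, curve = 0.0, []
    for trial in trials:
        best = max(best, trial.get("best_request_rate_per_gpu") or 0.0)
        curve.append(best)
    return curve
```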

## Results

Pending. This section will be filled after the dash0 experiments finish.
src/aituner/cli.py
@@ -6,9 +6,10 @@ import sys
 from pathlib import Path
 
 from .compare import run_compare
+from .harness import build_harness_context, build_harness_stop_proposal
 from .job import append_job, build_trial_job
 from .llm import build_prompt, call_llm_for_proposal, load_capability_profile, parse_proposal_text
-from .spec import Proposal, SpecError, load_study_spec
+from .spec import Proposal, SpecError, load_study_spec, to_jsonable
 from .store import StudyStore
 from .trace import load_trace_requests, summarize_window
 from .worker import run_trial
@@ -118,16 +119,23 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
         raise SpecError("max_trials must be positive")
     if proposal_files and max_trials > len(proposal_files):
         max_trials = len(proposal_files)
-    if not proposal_files and study.llm.endpoint is None:
-        raise SpecError("No proposal files provided and study.llm.endpoint is not configured")
 
     executed: list[dict[str, object]] = []
     for idx in range(max_trials):
         state = store.load_state(study.study_id)
         window, requests = load_trace_requests(study, study_spec_path=spec_path)
+        window_summary = summarize_window(requests, window)
+        harness_context = (
+            build_harness_context(
+                study=study,
+                window_summary=window_summary,
+                state=state,
+            )
+            if study.llm.use_harness
+            else None
+        )
         prompt = build_prompt(
             study=study,
-            window_summary=summarize_window(requests, window),
+            window_summary=window_summary,
             state=state,
             capability_profile=capability_profile,
         )
@@ -162,18 +170,36 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
             proposal_name = proposal_source.stem
         else:
             proposal_source = None
-            proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
-            proposal_name = f"proposal-{state.next_trial_index:04d}"
+            stop_proposal = (
+                build_harness_stop_proposal(harness_context)
+                if harness_context is not None
+                else None
+            )
+            if stop_proposal is not None:
+                proposal_text = json.dumps(to_jsonable(stop_proposal), ensure_ascii=False)
+                proposal_name = f"harness-stop-{state.next_trial_index:04d}"
+            else:
+                if study.llm.endpoint is None:
+                    raise SpecError(
+                        "No proposal files provided, study.llm.endpoint is not configured, "
+                        "and the harness stop guard did not fire."
+                    )
+                proposal_text = call_llm_for_proposal(policy=study.llm, prompt=prompt)
+                proposal_name = f"proposal-{state.next_trial_index:04d}"
         raw_proposal_path = store.study_root(study.study_id) / "proposals" / f"{proposal_name}.raw.txt"
         raw_proposal_path.write_text(proposal_text, encoding="utf-8")
         proposal = parse_proposal_text(proposal_text, study)
         store.write_proposal(study.study_id, proposal_name, proposal)
         if proposal.should_stop:
+            if proposal_name.startswith("harness-stop-"):
+                proposal_source_label = "harness"
+            else:
+                proposal_source_label = str(proposal_source) if proposal_source else "llm"
             executed.append(
                 {
                     "trial_id": None,
                     "proposal_name": proposal_name,
-                    "proposal_source": str(proposal_source) if proposal_source else "llm",
+                    "proposal_source": proposal_source_label,
                     "stopped": True,
                     "diagnosis": proposal.diagnosis,
                     "state_best_trial_id": state.best_trial_id,
src/aituner/harness.py
@@ -4,7 +4,25 @@ import json
 from pathlib import Path
 from typing import Any
 
-from .spec import StudySpec, StudyState, TrialSummary
+from .spec import ConfigPatch, Proposal, StudySpec, StudyState, TrialSummary
+
+
+_TOPOLOGY_KEYS = {
+    "tensor-parallel-size",
+    "data-parallel-size",
+    "expert-parallel-size",
+    "enable-expert-parallel",
+}
+_RUNTIME_KEYS = {
+    "max-num-seqs",
+    "max-num-batched-tokens",
+    "block-size",
+    "gpu-memory-utilization",
+    "enable-chunked-prefill",
+}
+_STRONG_INCUMBENT_MIN_GAIN = 1.8
+_MIN_POST_INCUMBENT_VALIDATION_TRIALS = 2
+_VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE = 3
 
 
 def build_harness_context(
@@ -23,11 +41,39 @@ def build_harness_context(
         "workload_lca_profile": _workload_lca_profile(window_summary),
         "recent_trial_diagnostics": recent_diagnostics,
         "convergence_guard": _convergence_guard(state, recent_diagnostics),
+        "harness_stop": _harness_stop_decision(state, recent_diagnostics),
         "knob_harnesses": _knob_harnesses(study, window_summary, recent_diagnostics),
         "proposal_rules": _proposal_rules(),
     }
 
 
+def build_harness_stop_proposal(context: dict[str, Any]) -> Proposal | None:
+    stop = context.get("harness_stop")
+    if not isinstance(stop, dict) or not stop.get("should_stop"):
+        return None
+    reason = str(stop.get("reason") or "harness_converged")
+    evidence = stop.get("evidence") if isinstance(stop.get("evidence"), dict) else {}
+    observation = (
+        "Harness convergence guard triggered before requesting another proposal: "
+        f"{reason}."
+    )
+    diagnosis = str(evidence.get("summary") or reason)
+    return Proposal(
+        observation=observation,
+        diagnosis=diagnosis,
+        config_patch=ConfigPatch(env_patch={}, flag_patch={}),
+        expected_effects=[
+            "stop without spending another GPU trial",
+            "preserve the current best observed configuration",
+        ],
+        why_not_previous_failures=(
+            "The stop decision is based on completed validation evidence and does not "
+            "repeat any failed configuration."
+        ),
+        should_stop=True,
+    )
+
+
 def render_harness_context(context: dict[str, Any]) -> str:
     return json.dumps(context, ensure_ascii=False, indent=2)
@@ -423,6 +469,158 @@ def _convergence_guard(
     }
 
 
+def _harness_stop_decision(
+    state: StudyState,
+    recent_diagnostics: list[dict[str, Any]],
+) -> dict[str, Any]:
+    guard = _convergence_guard(state, recent_diagnostics)
+    if guard["should_stop_if_no_harness_can_justify_a_new_adjacent_probe"]:
+        return {
+            "should_stop": True,
+            "reason": guard["reason"],
+            "evidence": {
+                "summary": "The convergence guard fired and no further adjacent probe is required.",
+                "convergence_guard": guard,
+            },
+        }
+    validation = _validation_exhausted_guard(state, recent_diagnostics)
+    if validation["exhausted"]:
+        return {
+            "should_stop": True,
+            "reason": validation["reason"],
+            "evidence": validation,
+        }
+    return {
+        "should_stop": False,
+        "reason": "continue_harness_guided_search",
+        "evidence": {
+            "summary": "No deterministic harness stop condition is satisfied.",
+            "convergence_guard": guard,
+            "validation_exhausted": validation,
+        },
+    }
+
+
+def _validation_exhausted_guard(
+    state: StudyState,
+    recent_diagnostics: list[dict[str, Any]],
+) -> dict[str, Any]:
+    default = {
+        "exhausted": False,
+        "reason": "validation_not_exhausted",
+        "summary": "Validation probes are not sufficient to stop yet.",
+        "incumbent_trial_id": state.best_trial_id,
+        "incumbent_gain_vs_baseline": None,
+        "validation_trial_ids": [],
+        "validation_families": [],
+    }
+    if not state.best_trial_id or not isinstance(state.best_request_rate_per_gpu, (int, float)):
+        return default
+    completed = [
+        item
+        for item in recent_diagnostics
+        if item.get("status") == "completed"
+        and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
+    ]
+    if not completed:
+        return default
+    baseline_rate = _as_float(completed[0].get("best_request_rate_per_gpu"))
+    incumbent_rate = _as_float(state.best_request_rate_per_gpu)
+    if baseline_rate <= 0 or incumbent_rate <= 0:
+        return default
+    gain = incumbent_rate / baseline_rate
+    if gain < _STRONG_INCUMBENT_MIN_GAIN:
+        return {
+            **default,
+            "reason": "incumbent_gain_not_large_enough_for_validation_stop",
+            "incumbent_gain_vs_baseline": gain,
+        }
+
+    best_index = next(
+        (
+            index
+            for index, item in enumerate(recent_diagnostics)
+            if item.get("trial_id") == state.best_trial_id
+        ),
+        None,
+    )
+    if best_index is None:
+        return {
+            **default,
+            "reason": "incumbent_not_in_recent_harness_history",
+            "incumbent_gain_vs_baseline": gain,
+        }
+    after_best = [
+        item
+        for item in recent_diagnostics[best_index + 1 :]
+        if item.get("status") in {"completed", "failed"}
+    ]
+    if len(after_best) < _MIN_POST_INCUMBENT_VALIDATION_TRIALS:
+        return {
+            **default,
+            "reason": "need_at_least_two_post_incumbent_validation_trials",
+            "incumbent_gain_vs_baseline": gain,
+            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+        }
+    if any(isinstance(item.get("best_request_rate_per_gpu"), (int, float)) for item in after_best):
+        return {
+            **default,
+            "reason": "post_incumbent_validation_found_feasible_candidate",
+            "incumbent_gain_vs_baseline": gain,
+            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+        }
+
+    families: set[str] = set()
+    for item in after_best:
+        families.update(_validation_families(item))
+    has_topology = "topology" in families
+    has_runtime = bool(families & {"runtime", "max-num-seqs", "max-num-batched-tokens"})
+    enough_evidence = (
+        len(after_best) >= _VALIDATION_TRIALS_WITHOUT_FAMILY_COVERAGE
+        or (has_topology and has_runtime)
+    )
+    if not enough_evidence:
+        return {
+            **default,
+            "reason": "post_incumbent_validation_has_not_covered_enough_families",
+            "incumbent_gain_vs_baseline": gain,
+            "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+            "validation_families": sorted(families),
+        }
+
+    return {
+        "exhausted": True,
+        "reason": "post_incumbent_validation_exhausted",
+        "summary": (
+            "A strong incumbent was followed by validation probes across nearby "
+            "topology/runtime families, and none produced a feasible candidate."
+        ),
+        "incumbent_trial_id": state.best_trial_id,
+        "incumbent_gain_vs_baseline": gain,
+        "validation_trial_ids": [str(item.get("trial_id")) for item in after_best],
+        "validation_families": sorted(families),
+    }
+
+
+def _validation_families(item: dict[str, Any]) -> set[str]:
+    config_patch = item.get("config_patch")
+    if not isinstance(config_patch, dict):
+        return set()
+    flag_patch = config_patch.get("flag_patch")
+    if not isinstance(flag_patch, dict):
+        return set()
+    families: set[str] = set()
+    if any(key in flag_patch for key in _TOPOLOGY_KEYS):
+        families.add("topology")
+    for key in _RUNTIME_KEYS:
+        if key in flag_patch:
+            families.add("runtime")
+            families.add(key)
+    if not families and flag_patch:
+        families.add("other")
+    return families
+
+
 def _strong_incumbent_guard(
     state: StudyState,
     recent_diagnostics: list[dict[str, Any]],
src/aituner/llm.py
@@ -312,23 +312,45 @@ def build_prompt(
         "",
         "Tested config signatures:",
         json.dumps(_tested_config_signatures(state), ensure_ascii=False, indent=2),
-        "",
-        "Harnesses:",
-        render_harness_context(
-            build_harness_context(
-                study=study,
-                window_summary=window_summary,
-                state=state,
-            )
-        ),
-        "",
-        "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
-        "The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
-        "The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
-        "If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
-        "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
-        "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged.",
-    ]
+    ]
+    if study.llm.use_harness:
+        sections.extend(
+            [
+                "",
+                "Harnesses:",
+                render_harness_context(
+                    build_harness_context(
+                        study=study,
+                        window_summary=window_summary,
+                        state=state,
+                    )
+                ),
+                "",
+            ]
+        )
+    else:
+        sections.extend(
+            [
+                "",
+                "Harnesses:",
+                "Disabled by llm.use_harness=false for ablation.",
+                "",
+            ]
+        )
+    sections.extend(
+        [
+            "The primary cross-topology comparison metric is request_rate_per_gpu, not raw request_rate.",
+            "The proposal should beat the incumbent on request_rate_per_gpu under the 95%+ SLO target.",
+            "The evaluator uses the best feasible sampling_u from the same tp_dp_product group when it exists.",
+            "If a tp_dp_product group has no history yet, the evaluator starts from the study's original search.low and runs a full binary search for that group.",
+            "Do not assume a configuration with fewer GPUs should inherit the global incumbent sampling_u.",
+            (
+                "Follow the active harness. Prefer stop over a weak exploratory proposal once a good incumbent has converged."
+                if study.llm.use_harness
+                else "For this ablation, reason from the raw study stack, trial history, launch failures, and tested config signatures without harness hints."
+            ),
+        ]
+    )
     return "\n".join(sections)
src/aituner/spec.py
@@ -576,6 +576,7 @@ class LLMPolicySpec:
     endpoint: LLMEndpointSpec | None
     system_prompt: str
    max_history_trials: int
+    use_harness: bool = True
 
     @classmethod
     def from_dict(cls, data: Mapping[str, Any] | None) -> "LLMPolicySpec":
@@ -593,6 +594,11 @@ class LLMPolicySpec:
             max_history_trials=_require_int(
                 payload.get("max_history_trials", 8), context="llm.max_history_trials"
             ),
+            use_harness=(
+                _require_bool(payload.get("use_harness"), context="llm.use_harness")
+                if payload.get("use_harness") is not None
+                else True
+            ),
         )
tests/test_core_flow.py
@@ -13,7 +13,7 @@ from aituner.compare import load_compare_spec, run_compare
 from aituner.engine import build_launch_recipe
 from aituner.http_client import _auth_headers, _openai_url, _should_bypass_proxy
 from aituner.job import append_job, build_trial_job
-from aituner.harness import build_harness_context
+from aituner.harness import build_harness_context, build_harness_stop_proposal
 from aituner.llm import _extract_response_text, build_prompt, parse_proposal_text, validate_proposal
 from aituner.search import ThresholdProbe, binary_search_max_feasible
 from aituner.slo import RequestOutcome, evaluate_request, summarize_evaluations
@@ -422,6 +422,119 @@ class CoreFlowTests(unittest.TestCase):
         )
         self.assertIn("validate", guard["recommended_next_action"])
 
+    def test_harness_stop_after_post_incumbent_validation_is_exhausted(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0002",
+                best_parallel_size=8,
+                best_sampling_u=0.02,
+                best_request_rate=2.4,
+                best_request_rate_per_gpu=0.3,
+                trials=[
+                    TrialSummary(
+                        trial_id="trial-0001",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=0.8,
+                        best_request_rate_per_gpu=0.1,
+                        config_patch={"env_patch": {}, "flag_patch": {}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0002",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=2.4,
+                        best_request_rate_per_gpu=0.3,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {
+                                "tensor-parallel-size": 2,
+                                "data-parallel-size": 4,
+                            },
+                        },
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0003",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {
+                                "tensor-parallel-size": 1,
+                                "data-parallel-size": 8,
+                            },
+                        },
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0004",
+                        status="completed",
+                        parallel_size=8,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {"max-num-seqs": 160},
+                        },
+                    ),
+                ],
+            )
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 2048},
+                state=state,
+            )
+            self.assertTrue(context["harness_stop"]["should_stop"])
+            self.assertEqual(context["harness_stop"]["reason"], "post_incumbent_validation_exhausted")
+            proposal = build_harness_stop_proposal(context)
+            self.assertIsNotNone(proposal)
+            self.assertTrue(proposal.should_stop)
+
+    def test_harness_does_not_stop_immediately_after_strong_incumbent(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            state = StudyState(
+                study_id=study.study_id,
+                best_trial_id="trial-0002",
+                best_parallel_size=8,
+                best_request_rate=2.4,
+                best_request_rate_per_gpu=0.3,
+                trials=[
+                    TrialSummary(
+                        trial_id="trial-0001",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=0.8,
+                        best_request_rate_per_gpu=0.1,
+                        config_patch={"env_patch": {}, "flag_patch": {}},
+                    ),
+                    TrialSummary(
+                        trial_id="trial-0002",
+                        status="completed",
+                        parallel_size=8,
+                        best_request_rate=2.4,
+                        best_request_rate_per_gpu=0.3,
+                        config_patch={
+                            "env_patch": {},
+                            "flag_patch": {
+                                "tensor-parallel-size": 2,
+                                "data-parallel-size": 4,
+                            },
+                        },
+                    ),
+                ],
+            )
+            context = build_harness_context(
+                study=study,
+                window_summary={"prompt_tokens_p95": 2048},
+                state=state,
+            )
+            self.assertFalse(context["harness_stop"]["should_stop"])
+            self.assertIsNone(build_harness_stop_proposal(context))
+
     def test_trace_input_length_filter_keeps_only_matching_rows(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -564,6 +677,26 @@ class CoreFlowTests(unittest.TestCase):
             "\n".join(context["proposal_rules"]),
         )
 
+    def test_prompt_can_disable_harness_for_ablation(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            payload = json.loads(study_path.read_text(encoding="utf-8"))
+            payload["llm"]["use_harness"] = False
+            study_path.write_text(json.dumps(payload), encoding="utf-8")
+            study = load_study_spec(study_path)
+            window, requests = load_trace_requests(study, study_spec_path=study_path)
+            prompt = build_prompt(
+                study=study,
+                window_summary=summarize_window(requests, window),
+                state=StudyState(study_id=study.study_id),
+                capability_profile=None,
+            )
+            self.assertFalse(study.llm.use_harness)
+            self.assertIn("Disabled by llm.use_harness=false", prompt)
+            self.assertNotIn('"paper_alignment"', prompt)
+            self.assertIn("without harness hints", prompt)
+
     def test_harness_uses_prior_infeasible_probe_for_active_bottleneck(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)
@@ -2299,6 +2432,98 @@ class CoreFlowTests(unittest.TestCase):
         state = store.load_state("study-1")
         self.assertEqual(state.next_trial_index, 1)
 
+    def test_cli_tune_uses_harness_stop_before_llm(self) -> None:
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            study_path = _write_study_assets(tmp_path)
+            study = load_study_spec(study_path)
+            store_root = tmp_path / "store"
+            store = StudyStore(store_root)
+            store.init_study(spec_path=study_path, study=study)
+            store.save_state(
+                StudyState(
+                    study_id=study.study_id,
+                    best_trial_id="trial-0002",
+                    best_parallel_size=8,
+                    best_sampling_u=0.02,
+                    best_request_rate=2.4,
+                    best_request_rate_per_gpu=0.3,
+                    next_trial_index=5,
+                    trials=[
+                        TrialSummary(
+                            trial_id="trial-0001",
+                            status="completed",
+                            parallel_size=8,
+                            best_request_rate=0.8,
+                            best_request_rate_per_gpu=0.1,
+                            config_patch={"env_patch": {}, "flag_patch": {}},
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0002",
+                            status="completed",
+                            parallel_size=8,
+                            best_request_rate=2.4,
+                            best_request_rate_per_gpu=0.3,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {
+                                    "tensor-parallel-size": 2,
+                                    "data-parallel-size": 4,
+                                },
+                            },
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0003",
+                            status="completed",
+                            parallel_size=8,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {
+                                    "tensor-parallel-size": 1,
+                                    "data-parallel-size": 8,
+                                },
+                            },
+                        ),
+                        TrialSummary(
+                            trial_id="trial-0004",
+                            status="completed",
+                            parallel_size=8,
+                            config_patch={
+                                "env_patch": {},
+                                "flag_patch": {"max-num-seqs": 160},
+                            },
+                        ),
+                    ],
+                )
+            )
+
+            with mock.patch("aituner.cli.call_llm_for_proposal") as llm_mock:
+                with mock.patch("aituner.cli.run_trial") as run_trial_mock:
+                    exit_code = cli_main(
+                        [
+                            "study",
+                            "tune",
+                            "--spec",
+                            str(study_path),
+                            "--store-root",
+                            str(store_root),
+                            "--max-trials",
+                            "1",
+                        ]
+                    )
+
+            self.assertEqual(exit_code, 0)
+            llm_mock.assert_not_called()
+            run_trial_mock.assert_not_called()
+            proposal_path = (
+                store.study_root(study.study_id)
+                / "proposals"
+                / "harness-stop-0005.json"
+            )
+            self.assertTrue(proposal_path.exists())
+            proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
+            self.assertTrue(proposal["should_stop"])
+
     def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
         with tempfile.TemporaryDirectory() as tmp:
             tmp_path = Path(tmp)