Add harness candidate set audit

2026-06-26 22:02:09 +08:00
parent 42f75553a6
commit 825d3e03e9
3 changed files with 245 additions and 1 deletions
--- a/docs/aituner-roadmap.md
+++ b/docs/aituner-roadmap.md
@@ -95,6 +95,10 @@ declarative intervention grammar + coverage-relative validator。
 最低验收：
 - CandidateSet 完整枚举并持久化 snapshot；
 - CandidateSet v1 先限定为当前 harness generator 实际构造出的 concrete candidates，
  不 claim 全 Cartesian knob space 枚举；`candidate_set_hash`、eligible/blocked
  records 和 blocked reason summary 已在 harness context 中实现，独立 sidecar JSON
  persistence 是下一片；
 - `harness_priority` 与 backend ranking 分离；
 - CoverageUnit 结构化，stop 不能只依赖 exact signature；
 - `search_high_saturated_by_incumbent` 不能绕过 CandidateSet coverage；对 `req/s/GPU`
--- a/src/aituner/harness.py
+++ b/src/aituner/harness.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import json
 import hashlib
 from pathlib import Path
 from typing import Any
@@ -1061,7 +1062,7 @@ def _experiment_plan(
        for item in recent_diagnostics
    }
    tested_signatures.update(_state_tested_signatures(study, state))
-    candidates = _candidate_actions(
+    candidate_set = _candidate_action_set(
        study,
        window_summary,
        state,
@@ -1070,12 +1071,18 @@ def _experiment_plan(
        bottleneck_hypotheses,
        tested_signatures=tested_signatures,
    )
    candidates = list(candidate_set["eligible_candidates"])
    candidates.sort(key=lambda item: _as_float(item.get("score")), reverse=True)
    next_action = candidates[0] if candidates else None
    return {
        "planner_version": "profile-driven-v1",
        "bottleneck_hypotheses": bottleneck_hypotheses,
        "candidate_actions": candidates[:8],
        "candidate_set": {
            **candidate_set,
            "eligible_candidates": candidates[:8],
            "blocked_candidates": candidate_set["blocked_candidates"][:16],
        },
        "next_action": next_action,
        "stop_ready": next_action is None or _as_float(next_action.get("score")) < 0.35,
        "stop_rationale": (
@@ -1086,6 +1093,46 @@ def _experiment_plan(
    }
 def _candidate_action_set(
    study: StudySpec,
    window_summary: dict[str, Any],
    state: StudyState,
    recent_diagnostics: list[dict[str, Any]],
    trial_profiles: list[dict[str, Any]],
    bottleneck_hypotheses: list[dict[str, Any]],
    *,
    tested_signatures: set[str],
 ) -> dict[str, Any]:
    """Assemble the ``candidate-set-v1`` record for one planning pass.

    Candidate generation is delegated to ``_candidate_actions``, which is
    handed a fresh list in which blocked candidates are collected. The
    candidates it returns (the eligible ones) are annotated in place and
    ordered by descending ``score``; they are then packaged together with
    the blocked ones, both counts, a per-reason tally of blocked entries
    and a hash computed over the two lists.
    """
    rejected: list[dict[str, Any]] = []
    accepted = _candidate_actions(
        study,
        window_summary,
        state,
        recent_diagnostics,
        trial_profiles,
        bottleneck_hypotheses,
        tested_signatures=tested_signatures,
        blocked_candidates=rejected,
    )
    _annotate_candidate_signatures(study, accepted)

    def _score_of(candidate):
        return _as_float(candidate.get("score"))

    # Highest score first; the ordering is settled before the hash is taken.
    accepted.sort(key=_score_of, reverse=True)

    reason_tally = _blocked_reason_summary(rejected)
    set_digest = _candidate_set_hash(accepted, rejected)

    return {
        "version": "candidate-set-v1",
        "signature_semantics": (
            "effective_config_fingerprint = sha256(normalized effective full-config JSON)"
        ),
        "candidate_set_hash": set_digest,
        "tested_signature_count": len(tested_signatures),
        "eligible_count": len(accepted),
        "blocked_count": len(rejected),
        "blocked_reason_summary": reason_tally,
        "eligible_candidates": accepted,
        "blocked_candidates": rejected,
    }
 def _candidate_actions(
    study: StudySpec,
    window_summary: dict[str, Any],
@@ -1095,6 +1142,7 @@ def _candidate_actions(
    bottleneck_hypotheses: list[dict[str, Any]],
    *,
    tested_signatures: set[str],
    blocked_candidates: list[dict[str, Any]],
 ) -> list[dict[str, Any]]:
    if not recent_diagnostics:
        return []
@@ -1114,6 +1162,7 @@ def _candidate_actions(
            top_bottleneck,
            bottleneck_hypotheses,
            tested_signatures,
            blocked_candidates,
        )
    )
    candidates.extend(
@@ -1125,6 +1174,7 @@ def _candidate_actions(
            bottleneck_hypotheses,
            recent_diagnostics,
            tested_signatures,
            blocked_candidates,
        )
    )
    return candidates
@@ -1152,6 +1202,7 @@ def _topology_candidate_actions(
    top_bottleneck: str,
    bottleneck_hypotheses: list[dict[str, Any]],
    tested_signatures: set[str],
    blocked_candidates: list[dict[str, Any]],
 ) -> list[dict[str, Any]]:
    if not ({"tensor-parallel-size", "data-parallel-size"} & set(study.engine.tunable_flags)):
        return []
@@ -1191,6 +1242,15 @@ def _topology_candidate_actions(
        patch = _topology_patch(study, point)
        signature = _effective_config_signature(study, {"env_patch": {}, "flag_patch": patch})
        if signature in tested_signatures:
            blocked_candidates.append(
                _blocked_candidate(
                    action_id=_topology_action_id(current_tp, current_dp, point),
                    knob_family="topology",
                    config_patch={"env_patch": {}, "flag_patch": patch},
                    blocked_reason="blocked_noop_or_repeat_effective_full_config",
                    effective_config_signature=signature,
                )
            )
            continue
        score, factors = _score_topology_candidate(
            top_bottleneck,
@@ -1210,6 +1270,17 @@ def _topology_candidate_actions(
            score = max(score, 0.74)
            factors["bad_start_topology_bracket"] = 0.74
        if score <= 0:
            blocked_candidates.append(
                _blocked_candidate(
                    action_id=_topology_action_id(current_tp, current_dp, point),
                    knob_family="topology",
                    config_patch={"env_patch": {}, "flag_patch": patch},
                    blocked_reason="blocked_non_positive_candidate_score",
                    effective_config_signature=signature,
                    score=round(score, 4),
                    score_factors=factors,
                )
            )
            continue
        action_id = _topology_action_id(current_tp, current_dp, point)
        actions.append(
@@ -1244,6 +1315,7 @@ def _runtime_candidate_actions(
    bottleneck_hypotheses: list[dict[str, Any]],
    recent_diagnostics: list[dict[str, Any]],
    tested_signatures: set[str],
    blocked_candidates: list[dict[str, Any]],
 ) -> list[dict[str, Any]]:
    tunable = set(study.engine.tunable_flags)
    anchor_flags = _effective_flags_for_item(study, anchor)
@@ -1285,6 +1357,15 @@ def _runtime_candidate_actions(
            patch = {**runtime_base_patch, "max-num-batched-tokens": target}
            signature = _effective_config_signature(study, {"env_patch": {}, "flag_patch": patch})
            if signature in tested_signatures:
                blocked_candidates.append(
                    _blocked_candidate(
                        action_id=action_id,
                        knob_family="max-num-batched-tokens",
                        config_patch={"env_patch": {}, "flag_patch": patch},
                        blocked_reason="blocked_noop_or_repeat_effective_full_config",
                        effective_config_signature=signature,
                    )
                )
                continue
            relief = 0.24 if top_bottleneck == "ttft_prefill" else 0.14
            actions.append(
@@ -1341,6 +1422,15 @@ def _runtime_candidate_actions(
            patch = {**runtime_base_patch, "max-num-seqs": target}
            signature = _effective_config_signature(study, {"env_patch": {}, "flag_patch": patch})
            if signature in tested_signatures:
                blocked_candidates.append(
                    _blocked_candidate(
                        action_id=action_id,
                        knob_family="max-num-seqs",
                        config_patch={"env_patch": {}, "flag_patch": patch},
                        blocked_reason="blocked_noop_or_repeat_effective_full_config",
                        effective_config_signature=signature,
                    )
                )
                continue
            if top_bottleneck in {"decode_tpot", "admission_or_queueing"}:
                relief = 0.25
@@ -1415,6 +1505,16 @@ def _runtime_candidate_actions(
                        ],
                    )
                )
            else:
                blocked_candidates.append(
                    _blocked_candidate(
                        action_id="raise_mbt_and_max_num_seqs",
                        knob_family="prefill-runtime-interaction",
                        config_patch={"env_patch": {}, "flag_patch": patch},
                        blocked_reason="blocked_noop_or_repeat_effective_full_config",
                        effective_config_signature=signature,
                    )
                )
    if "enable-chunked-prefill" in tunable and top_bottleneck == "ttft_prefill":
        current = bool(anchor_flags.get("enable-chunked-prefill", False))
@@ -1438,6 +1538,16 @@ def _runtime_candidate_actions(
                        ],
                    )
                )
            else:
                blocked_candidates.append(
                    _blocked_candidate(
                        action_id="enable_chunked_prefill",
                        knob_family="enable-chunked-prefill",
                        config_patch={"env_patch": {}, "flag_patch": patch},
                        blocked_reason="blocked_noop_or_repeat_effective_full_config",
                        effective_config_signature=signature,
                    )
                )
    if (
        "gpu-memory-utilization" in tunable
@@ -1469,6 +1579,16 @@ def _runtime_candidate_actions(
                        ],
                    )
                )
            else:
                blocked_candidates.append(
                    _blocked_candidate(
                        action_id="raise_gpu_memory_utilization",
                        knob_family="gpu-memory-utilization",
                        config_patch={"env_patch": {}, "flag_patch": patch},
                        blocked_reason="blocked_noop_or_repeat_effective_full_config",
                        effective_config_signature=signature,
                    )
                )
    return actions
@@ -1540,6 +1660,98 @@ def _runtime_action(
    }
 def _annotate_candidate_signatures(
    study: StudySpec,
    candidates: list[dict[str, Any]],
 ) -> None:
    """Mark every dict entry of ``candidates`` as eligible, in place.

    Each dict entry receives ``status``, ``candidate_id`` and
    ``effective_config_fingerprint`` keys. Entries that are not dicts are
    skipped. When an entry's ``config_patch`` is missing or not a dict, an
    empty env/flag patch is used to derive the signature instead.
    """
    for entry in (c for c in candidates if isinstance(c, dict)):
        raw_patch = entry.get("config_patch")
        if isinstance(raw_patch, dict):
            effective_patch = raw_patch
        else:
            effective_patch = {"env_patch": {}, "flag_patch": {}}
        digest = _effective_config_fingerprint(
            _effective_config_signature(study, effective_patch)
        )
        entry.update(
            {
                "status": "eligible",
                "candidate_id": _candidate_id(entry.get("action_id"), digest),
                "effective_config_fingerprint": digest,
            }
        )
 def _blocked_candidate(
    *,
    action_id: str,
    knob_family: str,
    config_patch: dict[str, Any],
    blocked_reason: str,
    effective_config_signature: str,
    score: float | None = None,
    score_factors: dict[str, Any] | None = None,
 ) -> dict[str, Any]:
    """Build the record describing a candidate that was filtered out.

    The record always carries the candidate id, action id, knob family,
    a ``"blocked"`` status, the reason (both as a plain string and as a
    one-element ``blocked_reasons`` list), the normalized config patch and
    the fingerprint derived from ``effective_config_signature``.
    ``score`` and ``score_factors`` are included only when they are not
    ``None``.
    """
    digest = _effective_config_fingerprint(effective_config_signature)
    record: dict[str, Any] = {
        "candidate_id": _candidate_id(action_id, digest),
        "action_id": action_id,
        "knob_family": knob_family,
        "status": "blocked",
        "blocked_reason": blocked_reason,
        "blocked_reasons": [{"code": blocked_reason}],
        "config_patch": _normalized_config_patch(config_patch),
        "effective_config_fingerprint": digest,
    }
    # Optional scoring details are attached only when the caller supplied them.
    optional_fields = (("score", score), ("score_factors", score_factors))
    record.update({key: value for key, value in optional_fields if value is not None})
    return record
 def _candidate_id(action_id: Any, fingerprint: str) -> str:
    """Return ``"<action>:<first 12 characters of fingerprint>"``.

    A falsy ``action_id``, or one whose string form is empty after
    stripping whitespace, is replaced by the literal ``"candidate"``.
    """
    label = str(action_id).strip() if action_id else ""
    if not label:
        label = "candidate"
    return "{}:{}".format(label, fingerprint[:12])
 def _effective_config_fingerprint(effective_config_signature: str) -> str:
    """Return the SHA-256 hex digest of the UTF-8 encoded signature string."""
    hasher = hashlib.sha256()
    hasher.update(effective_config_signature.encode("utf-8"))
    return hasher.hexdigest()
 def _blocked_reason_summary(blocked_candidates: list[dict[str, Any]]) -> dict[str, int]:
    """Count blocked candidates per ``blocked_reason``.

    A missing or falsy reason is tallied under ``"unknown"``. Keys appear
    in the order in which each reason is first encountered.
    """
    tally: dict[str, int] = {}
    for entry in blocked_candidates:
        label = str(entry.get("blocked_reason") or "unknown")
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1
    return tally
 def _candidate_set_hash(
    eligible_candidates: list[dict[str, Any]],
    blocked_candidates: list[dict[str, Any]],
 ) -> str:
    """Return a SHA-256 hex digest over the eligible and blocked candidates.

    Each candidate contributes one record holding a fixed subset of its
    fields plus a ``status`` taken from the list it came from (not from the
    candidate itself). Eligible records precede blocked ones and list order
    is preserved, so the digest depends on ordering. Records are serialized
    as compact JSON with sorted keys before hashing.
    """
    hashed_fields = (
        "candidate_id",
        "action_id",
        "knob_family",
        "effective_config_fingerprint",
        "blocked_reason",
        "score",
    )
    groups = (
        ("eligible", eligible_candidates),
        ("blocked", blocked_candidates),
    )
    records = [
        {"status": label, **{field: entry.get(field) for field in hashed_fields}}
        for label, group in groups
        for entry in group
    ]
    serialized = json.dumps(
        records,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()
 def _anchor_has_topology_patch(anchor: dict[str, Any]) -> bool:
    patch = anchor.get("config_patch")
    if not isinstance(patch, dict):
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import json
 import hashlib
 import contextlib
 import io
 import math
@@ -1803,6 +1804,33 @@ class CoreFlowTests(unittest.TestCase):
                window_summary={"prompt_tokens_p95": 2048},
                state=state,
            )
            candidate_set = context["experiment_plan"]["candidate_set"]
            self.assertEqual(candidate_set["version"], "candidate-set-v1")
            self.assertIn("candidate_set_hash", candidate_set)
            self.assertGreaterEqual(
                candidate_set["blocked_reason_summary"].get(
                    "blocked_noop_or_repeat_effective_full_config",
                    0,
                ),
                1,
            )
            baseline_fingerprint = hashlib.sha256(
                _effective_config_signature(
                    study,
                    {"env_patch": {}, "flag_patch": {}},
                ).encode("utf-8")
            ).hexdigest()
            blocked_baseline_equivalent = [
                item
                for item in candidate_set["blocked_candidates"]
                if item.get("effective_config_fingerprint") == baseline_fingerprint
            ]
            self.assertTrue(blocked_baseline_equivalent)
            self.assertEqual(
                blocked_baseline_equivalent[0]["blocked_reason"],
                "blocked_noop_or_repeat_effective_full_config",
            )
            self.assertIn("effective_config_fingerprint", blocked_baseline_equivalent[0])
            actions = context["experiment_plan"]["candidate_actions"]
            self.assertFalse(
                any(