Add tuning progress report for harness evaluation

2026-06-21 00:48:21 +08:00
parent 426151bc9f
commit 488fae7e63
4 changed files with 752 additions and 0 deletions
--- a/configs/examples/tuning_report.example.json
+++ b/configs/examples/tuning_report.example.json
@@ -0,0 +1,26 @@
 {
  "report_id": "qwen27b-abl12-harness-vs-naive",
  "output_root": "../../.aituner-reports/qwen27b-abl12-harness-vs-naive",
  "target_fraction": 0.95,
  "min_final_ratio": 0.98,
  "cases": [
    {
      "case_id": "qwen27b-chat-0-8k-real-output",
      "description": "12-trial harness-vs-naive ablation on the 0-8k chat window with real output lengths.",
      "tags": ["qwen27b", "chat", "0-8k", "h20", "real-output"],
      "budgets": [1, 2, 3, 4, 6, 8, 12],
      "arms": [
        {
          "name": "harness",
          "kind": "harness",
          "study_root": "../../.aituner/abl12-harness/dash0-qwen27b-ablation-harness-on"
        },
        {
          "name": "naive",
          "kind": "naive",
          "study_root": "../../.aituner/abl12-naive/dash0-qwen27b-ablation-naive-off"
        }
      ]
    }
  ]
 }
--- a/scripts/tuning_report.py
+++ b/scripts/tuning_report.py
@@ -0,0 +1,36 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
 from aituner.tuning_report import run_tuning_report
 def main() -> int:
    """Run the tuning report CLI and print a compact JSON digest.

    Parses ``--spec``, hands it to ``run_tuning_report``, and echoes the
    report identity plus the headline aggregate counters to stdout.

    Returns:
        ``0`` once the digest has been printed.
    """
    cli = argparse.ArgumentParser(
        description="Summarize anytime tuning progress across harness/naive study stores."
    )
    cli.add_argument("--spec", required=True, help="Path to a tuning report JSON spec.")
    options = cli.parse_args()
    summary = run_tuning_report(Path(options.spec))

    # Identity fields first, then the aggregate counters, so the printed key
    # order stays stable.
    digest = {field: summary[field] for field in ("report_id", "report_root")}
    aggregate = summary["aggregate"]
    for field in (
        "case_count",
        "harness_vs_naive_pass_count",
        "harness_vs_naive_check_count",
        "winner_counts",
    ):
        digest[field] = aggregate[field]
    print(json.dumps(digest, ensure_ascii=False, indent=2))
    return 0
 # Run the CLI only when executed as a script; main()'s return value becomes
 # the process exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/src/aituner/tuning_report.py
+++ b/src/aituner/tuning_report.py
@@ -0,0 +1,581 @@
 from __future__ import annotations
 import json
 from pathlib import Path
 from typing import Any
 from .spec import SpecError, load_structured_file
 from .store import StudyStore
 # Trial-count checkpoints used when a case does not list its own "budgets".
 DEFAULT_BUDGETS = [1, 2, 3, 4, 6, 8, 12]
 # Fallback for the spec-level "target_fraction"; the target rate is the
 # case's reference best multiplied by this fraction.
 DEFAULT_TARGET_FRACTION = 0.95
 # Fallback for the spec-level "min_final_ratio"; a harness passes the final
 # check when its final best divided by the best naive final reaches this ratio.
 DEFAULT_MIN_FINAL_RATIO = 0.98
 def run_tuning_report(spec_path: Path) -> dict[str, Any]:
    """Build the tuning report described by ``spec_path`` and write it to disk.

    ``summary.json`` and ``report.md`` are written under the resolved report
    root, which is created when missing.

    Args:
        spec_path: Path to the report spec file.

    Returns:
        The summary dictionary that was written to ``summary.json``.
    """
    spec_path = spec_path.resolve()
    spec = _load_report_spec(spec_path)
    report_root = _resolve_output_root(spec, spec_path=spec_path)
    report_root.mkdir(parents=True, exist_ok=True)

    case_summaries = []
    for case_spec in spec["cases"]:
        case_summaries.append(_summarize_case(case_spec, spec_path=spec_path))

    summary: dict[str, Any] = {
        "report_id": spec["report_id"],
        "report_root": str(report_root),
    }
    for field in ("target_fraction", "min_final_ratio"):
        summary[field] = spec[field]
    summary["cases"] = case_summaries
    summary["aggregate"] = _aggregate_cases(case_summaries)

    StudyStore.write_json(report_root / "summary.json", summary)
    markdown = _render_report(summary)
    (report_root / "report.md").write_text(markdown, encoding="utf-8")
    return summary
 def _load_report_spec(path: Path) -> dict[str, Any]:
    """Load a report spec file and normalize it into a validated dictionary.

    Args:
        path: Location of the structured spec file.

    Returns:
        Dictionary with ``report_id``, ``output_root`` (``None`` when blank or
        absent), ``target_fraction``, ``min_final_ratio`` and the normalized
        ``cases`` list.

    Raises:
        SpecError: If ``report_id`` is blank, ``cases`` is not a non-empty
            list, or either threshold is not a positive number.
    """
    payload = dict(load_structured_file(path))

    report_id = str(payload.get("report_id") or "").strip()
    if report_id == "":
        raise SpecError("report_id must be a non-empty string.")

    raw_cases = payload.get("cases")
    if not (isinstance(raw_cases, list) and raw_cases):
        raise SpecError("cases must be a non-empty list.")

    target_fraction = _as_float(payload.get("target_fraction"), default=DEFAULT_TARGET_FRACTION)
    if target_fraction <= 0:
        raise SpecError("target_fraction must be positive.")

    min_final_ratio = _as_float(payload.get("min_final_ratio"), default=DEFAULT_MIN_FINAL_RATIO)
    if min_final_ratio <= 0:
        raise SpecError("min_final_ratio must be positive.")

    # Spec-level thresholds act as defaults for every case.
    cases = []
    for position, raw_case in enumerate(raw_cases):
        cases.append(
            _load_case(
                raw_case,
                idx=position,
                default_target_fraction=target_fraction,
                default_min_final_ratio=min_final_ratio,
            )
        )

    output_root = str(payload.get("output_root") or "").strip()
    return {
        "report_id": report_id,
        "output_root": output_root or None,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "cases": cases,
    }
 def _load_case(
    raw: Any,
    *,
    idx: int,
    default_target_fraction: float,
    default_min_final_ratio: float,
 ) -> dict[str, Any]:
    """Validate one entry of the spec's ``cases`` list and normalize it.

    Args:
        raw: The raw case entry as loaded from the spec.
        idx: Position of the case in ``cases``; used in error messages.
        default_target_fraction: Value used when the case has no
            ``target_fraction`` of its own.
        default_min_final_ratio: Value used when the case has no
            ``min_final_ratio`` of its own.

    Returns:
        Dictionary with ``case_id``, ``description``, ``tags``, the sorted and
        de-duplicated ``budgets``, both thresholds and the normalized ``arms``.

    Raises:
        SpecError: If the case is not an object, ``case_id`` is blank, ``arms``
            or ``budgets`` is not a non-empty list, arm names repeat, a budget
            is not a positive integer, or a per-case threshold override is not
            a positive number.
    """
    if not isinstance(raw, dict):
        raise SpecError(f"cases[{idx}] must be an object.")
    case_id = str(raw.get("case_id") or "").strip()
    if not case_id:
        raise SpecError(f"cases[{idx}].case_id must be a non-empty string.")
    raw_arms = raw.get("arms")
    if not isinstance(raw_arms, list) or not raw_arms:
        raise SpecError(f"cases[{idx}].arms must be a non-empty list.")
    arms = [_load_arm(item, context=f"cases[{idx}].arms[{arm_idx}]") for arm_idx, item in enumerate(raw_arms)]
    names = [item["name"] for item in arms]
    if len(names) != len(set(names)):
        raise SpecError(f"cases[{idx}].arms names must be unique.")
    raw_budgets = raw.get("budgets", DEFAULT_BUDGETS)
    if not isinstance(raw_budgets, list) or not raw_budgets:
        raise SpecError(f"cases[{idx}].budgets must be a non-empty list.")
    budgets = sorted({_positive_int(item, context=f"cases[{idx}].budgets") for item in raw_budgets})
    # Per-case overrides get the same positivity check as the spec-level
    # values; a non-positive override would otherwise produce a meaningless
    # target or pass threshold.
    target_fraction = _as_float(raw.get("target_fraction"), default=default_target_fraction)
    if target_fraction <= 0:
        raise SpecError(f"cases[{idx}].target_fraction must be positive.")
    min_final_ratio = _as_float(raw.get("min_final_ratio"), default=default_min_final_ratio)
    if min_final_ratio <= 0:
        raise SpecError(f"cases[{idx}].min_final_ratio must be positive.")
    # Tags are optional; anything that is not a list is treated as "no tags".
    raw_tags = raw.get("tags", [])
    tags = (
        [str(item).strip() for item in raw_tags if str(item).strip()]
        if isinstance(raw_tags, list)
        else []
    )
    return {
        "case_id": case_id,
        "description": str(raw.get("description") or "").strip(),
        "tags": tags,
        "budgets": budgets,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "arms": arms,
    }
 def _load_arm(raw: Any, *, context: str) -> dict[str, Any]:
    if not isinstance(raw, dict):
        raise SpecError(f"{context} must be an object.")
    name = str(raw.get("name") or "").strip()
    if not name:
        raise SpecError(f"{context}.name must be a non-empty string.")
    kind = str(raw.get("kind") or name).strip()
    study_root = str(raw.get("study_root") or "").strip()
    if not study_root:
        raise SpecError(f"{context}.study_root must be a non-empty string.")
    return {
        "name": name,
        "kind": kind,
        "study_root": study_root,
        "label": str(raw.get("label") or "").strip() or name,
    }
 def _resolve_output_root(spec: dict[str, Any], *, spec_path: Path) -> Path:
    raw = spec.get("output_root")
    if raw:
        return _resolve_path(str(raw), base_dir=spec_path.parent)
    return (Path(".aituner-reports") / str(spec["report_id"])).resolve()
 def _summarize_case(case: dict[str, Any], *, spec_path: Path) -> dict[str, Any]:
    """Summarize every arm of a case and compare them against each other.

    Args:
        case: Normalized case spec.
        spec_path: Path of the report spec, used to resolve study roots.

    Returns:
        The case's spec fields plus the reference best, the largest budget
        considered, per-arm summaries, winners, harness-vs-naive rows and
        warnings.
    """
    arm_summaries = []
    for arm_spec in case["arms"]:
        arm_summaries.append(
            _summarize_arm(arm_spec, budgets=case["budgets"], spec_path=spec_path)
        )

    reference = _reference_best(arm_summaries)
    # The horizon covers both the requested budgets and the longest study.
    max_budget = max([*case["budgets"], *(item["trial_count"] for item in arm_summaries)])
    for item in arm_summaries:
        _add_reference_metrics(
            item,
            reference=reference,
            max_budget=max_budget,
            target_fraction=case["target_fraction"],
        )

    winners = _case_winners(arm_summaries)
    comparison = _harness_vs_naive(arm_summaries, min_final_ratio=case["min_final_ratio"])

    result: dict[str, Any] = {
        field: case[field]
        for field in (
            "case_id",
            "description",
            "tags",
            "budgets",
            "target_fraction",
            "min_final_ratio",
        )
    }
    result["reference_best_per_gpu"] = reference
    result["max_budget"] = max_budget
    result["arms"] = arm_summaries
    result["winners"] = winners
    result["harness_vs_naive"] = comparison
    result["warnings"] = _case_warnings(case, arm_summaries, comparison)
    return result
 def _summarize_arm(arm: dict[str, Any], *, budgets: list[int], spec_path: Path) -> dict[str, Any]:
    """Summarize one arm from its study's ``state.json``.

    Args:
        arm: Normalized arm spec (``name``, ``kind``, ``label``, ``study_root``).
        budgets: Trial counts at which the running-best curve is sampled.
        spec_path: Path of the report spec; relative study roots resolve
            against its parent directory.

    Returns:
        Arm summary with trial counters, the running-best curve, the best
        value at each budget, and the stop reason/diagnosis copied from the
        state file.

    Raises:
        SpecError: If the study root cannot be resolved or ``state.json`` does
            not hold a JSON object.
    """
    study_root = _resolve_study_root(arm["study_root"], base_dir=spec_path.parent)
    state = json.loads((study_root / "state.json").read_text(encoding="utf-8"))
    if not isinstance(state, dict):
        raise SpecError(f"state.json must contain a JSON object: {study_root}")
    raw_trials = state.get("trials")
    trials = raw_trials if isinstance(raw_trials, list) else []
    # The curve builder tolerates non-object trial entries (they just carry
    # the incumbent forward), so the counters must tolerate them too instead
    # of crashing on ``.get``.
    records = [item for item in trials if isinstance(item, dict)]
    feasible_count = sum(
        1 for item in records if isinstance(item.get("best_request_rate_per_gpu"), (int, float))
    )
    curve = _running_best_curve(trials)
    final_best = curve[-1] if curve else None
    best_trial_index = _first_index_at_value(curve, final_best)
    return {
        "name": arm["name"],
        "kind": arm["kind"],
        "label": arm["label"],
        "study_root": str(study_root),
        "study_id": state.get("study_id"),
        "trial_count": len(trials),
        "completed_count": sum(1 for item in records if item.get("status") == "completed"),
        "failed_count": sum(1 for item in records if item.get("status") == "failed"),
        # Every trial without a numeric rate counts here, malformed entries included.
        "no_feasible_count": len(trials) - feasible_count,
        "best_trial_id": state.get("best_trial_id"),
        "best_trial_index": best_trial_index,
        "final_best_per_gpu": final_best,
        "state_best_per_gpu": state.get("best_request_rate_per_gpu"),
        "best_at_budget": {str(budget): _value_at_budget(curve, budget) for budget in budgets},
        "running_best_per_gpu": curve,
        "stop_reason": str(state.get("tuning_stop_reason") or ""),
        "stop_diagnosis": str(state.get("tuning_stop_diagnosis") or ""),
    }
 def _add_reference_metrics(
    arm: dict[str, Any],
    *,
    reference: float | None,
    max_budget: int,
    target_fraction: float,
 ) -> None:
    """Attach reference-relative metrics to an arm summary in place.

    Sets ``final_ratio_to_reference``, ``target_per_gpu``, ``trials_to_target``
    and ``normalized_auc`` on ``arm``. Ratio and target are ``None`` when the
    reference is missing or zero.
    """
    final_best = arm.get("final_best_per_gpu")

    ratio = None
    if reference and isinstance(final_best, (int, float)):
        ratio = float(final_best) / reference
    arm["final_ratio_to_reference"] = ratio

    if reference:
        target = reference * target_fraction
    else:
        target = None
    arm["target_per_gpu"] = target

    arm["trials_to_target"] = _trials_to_target(arm["running_best_per_gpu"], target)
    arm["normalized_auc"] = _normalized_auc(
        arm["running_best_per_gpu"],
        reference=reference,
        max_budget=max_budget,
    )
 def _harness_vs_naive(arms: list[dict[str, Any]], *, min_final_ratio: float) -> list[dict[str, Any]]:
    naive = [arm for arm in arms if arm["kind"] == "naive"]
    harnesses = [arm for arm in arms if arm["kind"] == "harness"]
    if not naive or not harnesses:
        return []
    best_naive_final = _max_optional(arm.get("final_best_per_gpu") for arm in naive)
    best_naive_ttt = _min_optional(arm.get("trials_to_target") for arm in naive)
    best_naive_auc = _max_optional(arm.get("normalized_auc") for arm in naive)
    rows = []
    for harness in harnesses:
        final = harness.get("final_best_per_gpu")
        ttt = harness.get("trials_to_target")
        auc = harness.get("normalized_auc")
        final_ratio = (
            float(final) / best_naive_final
            if best_naive_final and isinstance(final, (int, float))
            else None
        )
        auc_ratio = (
            float(auc) / best_naive_auc
            if best_naive_auc and isinstance(auc, (int, float))
            else None
        )
        speedup = _speedup(best_naive_ttt, ttt)
        pass_final = final_ratio is not None and final_ratio >= min_final_ratio
        pass_speed = speedup is None or speedup >= 1.0
        rows.append(
            {
                "harness": harness["name"],
                "best_naive_final_per_gpu": best_naive_final,
                "best_naive_trials_to_target": best_naive_ttt,
                "best_naive_normalized_auc": best_naive_auc,
                "final_ratio_vs_best_naive": final_ratio,
                "target_trial_speedup_vs_best_naive": speedup,
                "auc_ratio_vs_best_naive": auc_ratio,
                "passes_min_final_ratio": pass_final,
                "passes_speed": pass_speed,
                "passes": pass_final and pass_speed,
            }
        )
    return rows
 def _case_winners(arms: list[dict[str, Any]]) -> dict[str, str | None]:
    """Name the winning arm for each headline metric (``None`` when no arm has a value)."""
    highest_final = _argmax(arms, "final_best_per_gpu")
    quickest = _argmin(arms, "trials_to_target")
    highest_auc = _argmax(arms, "normalized_auc")
    return {
        "final_best": highest_final,
        "fastest_to_target": quickest,
        "normalized_auc": highest_auc,
    }
 def _aggregate_cases(cases: list[dict[str, Any]]) -> dict[str, Any]:
    by_kind: dict[str, dict[str, Any]] = {}
    final_wins: dict[str, int] = {}
    speed_wins: dict[str, int] = {}
    auc_wins: dict[str, int] = {}
    harness_passes = 0
    harness_checks = 0
    for case in cases:
        for winner_key, target in (
            ("final_best", final_wins),
            ("fastest_to_target", speed_wins),
            ("normalized_auc", auc_wins),
        ):
            winner = case["winners"].get(winner_key)
            if winner:
                target[winner] = target.get(winner, 0) + 1
        for row in case["harness_vs_naive"]:
            harness_checks += 1
            if row["passes"]:
                harness_passes += 1
        for arm in case["arms"]:
            bucket = by_kind.setdefault(
                arm["kind"],
                {
                    "arm_count": 0,
                    "mean_final_ratio_to_reference": None,
                    "mean_normalized_auc": None,
                    "target_reached_count": 0,
                    "_final_ratios": [],
                    "_aucs": [],
                },
            )
            bucket["arm_count"] += 1
            if isinstance(arm.get("final_ratio_to_reference"), (int, float)):
                bucket["_final_ratios"].append(float(arm["final_ratio_to_reference"]))
            if isinstance(arm.get("normalized_auc"), (int, float)):
                bucket["_aucs"].append(float(arm["normalized_auc"]))
            if isinstance(arm.get("trials_to_target"), int):
                bucket["target_reached_count"] += 1
    for bucket in by_kind.values():
        ratios = bucket.pop("_final_ratios")
        aucs = bucket.pop("_aucs")
        bucket["mean_final_ratio_to_reference"] = _mean(ratios)
        bucket["mean_normalized_auc"] = _mean(aucs)
    return {
        "case_count": len(cases),
        "by_kind": by_kind,
        "winner_counts": {
            "final_best": final_wins,
            "fastest_to_target": speed_wins,
            "normalized_auc": auc_wins,
        },
        "harness_vs_naive_pass_count": harness_passes,
        "harness_vs_naive_check_count": harness_checks,
    }
 def _case_warnings(
    case: dict[str, Any],
    arms: list[dict[str, Any]],
    comparison: list[dict[str, Any]],
 ) -> list[str]:
    warnings = []
    kinds = {arm["kind"] for arm in arms}
    if "harness" not in kinds or "naive" not in kinds:
        warnings.append("case does not include both harness and naive arms")
    if len(case["tags"]) < 2:
        warnings.append("case has few tags; add workload/model/SLO tags to support generalization claims")
    if not comparison:
        return warnings
    for row in comparison:
        if not row["passes_min_final_ratio"]:
            warnings.append(
                f"{row['harness']} final best is below min_final_ratio versus best naive"
            )
        if not row["passes_speed"]:
            warnings.append(
                f"{row['harness']} reaches target later than best naive"
            )
    return warnings
 def _running_best_curve(trials: list[Any]) -> list[float | None]:
    curve: list[float | None] = []
    incumbent: float | None = None
    for trial in trials:
        rate = trial.get("best_request_rate_per_gpu") if isinstance(trial, dict) else None
        if isinstance(rate, (int, float)) and (incumbent is None or float(rate) > incumbent):
            incumbent = float(rate)
        curve.append(incumbent)
    return curve
 def _value_at_budget(curve: list[float | None], budget: int) -> float | None:
    if not curve:
        return None
    index = min(max(budget, 1), len(curve)) - 1
    return curve[index]
 def _trials_to_target(curve: list[float | None], target: float | None) -> int | None:
    if target is None:
        return None
    for idx, value in enumerate(curve, start=1):
        if isinstance(value, (int, float)) and value >= target:
            return idx
    return None
 def _normalized_auc(
    curve: list[float | None],
    *,
    reference: float | None,
    max_budget: int,
 ) -> float | None:
    if not reference or max_budget <= 0:
        return None
    total = 0.0
    for budget in range(1, max_budget + 1):
        value = _value_at_budget(curve, budget)
        total += float(value) if isinstance(value, (int, float)) else 0.0
    return total / (reference * max_budget)
 def _reference_best(arms: list[dict[str, Any]]) -> float | None:
    """Return the highest final best across all arms, or ``None`` if no arm has one."""
    finals = [arm.get("final_best_per_gpu") for arm in arms]
    return _max_optional(finals)
 def _resolve_study_root(raw_path: str, *, base_dir: Path) -> Path:
    """Locate the directory that holds a study's ``state.json``.

    The resolved path is used directly when it contains ``state.json``.
    Otherwise it must have exactly one immediate subdirectory containing
    ``state.json``, and that subdirectory is returned.

    Raises:
        SpecError: If no ``state.json`` is found, or more than one
            subdirectory contains one.
    """
    root = _resolve_path(raw_path, base_dir=base_dir)
    if (root / "state.json").exists():
        return root
    candidates = sorted(root.glob("*/state.json"))
    if not candidates:
        raise SpecError(f"study_root does not contain state.json: {root}")
    if len(candidates) > 1:
        raise SpecError(f"study_root is ambiguous; point to a specific study directory: {root}")
    return candidates[0].parent
 def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
    path = Path(raw_path)
    if not path.is_absolute():
        path = (base_dir / path).resolve()
    return path
 def _as_float(value: Any, *, default: float) -> float:
    if value is None:
        return default
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        raise SpecError(f"Expected numeric value, got {value!r}.")
    return float(value)
 def _positive_int(value: Any, *, context: str) -> int:
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise SpecError(f"{context} must contain positive integers.")
    return value
 def _first_index_at_value(curve: list[float | None], value: float | None) -> int | None:
    if value is None:
        return None
    for idx, item in enumerate(curve, start=1):
        if item == value:
            return idx
    return None
 def _argmax(rows: list[dict[str, Any]], key: str) -> str | None:
    scored = [
        (str(row["name"]), float(row[key]))
        for row in rows
        if isinstance(row.get(key), (int, float))
    ]
    if not scored:
        return None
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored[0][0]
 def _argmin(rows: list[dict[str, Any]], key: str) -> str | None:
    scored = [
        (str(row["name"]), int(row[key]))
        for row in rows
        if isinstance(row.get(key), int)
    ]
    if not scored:
        return None
    scored.sort(key=lambda item: item[1])
    return scored[0][0]
 def _max_optional(values: Any) -> float | None:
    scored = [float(item) for item in values if isinstance(item, (int, float))]
    return max(scored) if scored else None
 def _min_optional(values: Any) -> int | None:
    scored = [int(item) for item in values if isinstance(item, int)]
    return min(scored) if scored else None
 def _mean(values: list[float]) -> float | None:
    return sum(values) / len(values) if values else None
 def _speedup(naive_trials: int | None, harness_trials: int | None) -> float | None:
    if harness_trials is None:
        return 0.0 if naive_trials is not None else None
    if naive_trials is None:
        return None
    if harness_trials <= 0:
        return None
    return float(naive_trials) / float(harness_trials)
 def _fmt(value: Any) -> str:
    if isinstance(value, float):
        return f"{value:.4f}"
    if value is None:
        return "-"
    return str(value)
 def _render_report(summary: dict[str, Any]) -> str:
    lines = [
        f"# {summary['report_id']}",
        "",
        "## Aggregate",
        "",
        f"- Cases: `{summary['aggregate']['case_count']}`",
        f"- Harness-vs-naive pass/checks: `{summary['aggregate']['harness_vs_naive_pass_count']}`/`{summary['aggregate']['harness_vs_naive_check_count']}`",
        f"- Winner counts: `{json.dumps(summary['aggregate']['winner_counts'], ensure_ascii=False)}`",
        "",
        "## By Kind",
        "",
        "| Kind | Arms | Mean final/ref | Mean AUC | Target reached |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]
    for kind, payload in sorted(summary["aggregate"]["by_kind"].items()):
        lines.append(
            "| "
            + " | ".join(
                [
                    f"`{kind}`",
                    str(payload["arm_count"]),
                    _fmt(payload["mean_final_ratio_to_reference"]),
                    _fmt(payload["mean_normalized_auc"]),
                    str(payload["target_reached_count"]),
                ]
            )
            + " |"
        )
    lines.extend(["", "## Cases", ""])
    for case in summary["cases"]:
        lines.extend(
            [
                f"### {case['case_id']}",
                "",
                f"- Reference best req/s/GPU: `{_fmt(case['reference_best_per_gpu'])}`",
                f"- Target fraction: `{case['target_fraction']}`",
                f"- Winners: `{json.dumps(case['winners'], ensure_ascii=False)}`",
            ]
        )
        if case["warnings"]:
            lines.append(f"- Warnings: `{json.dumps(case['warnings'], ensure_ascii=False)}`")
        lines.extend(
            [
                "",
                "| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |",
                "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
            ]
        )
        for arm in case["arms"]:
            lines.append(
                "| "
                + " | ".join(
                    [
                        f"`{arm['name']}`",
                        f"`{arm['kind']}`",
                        str(arm["trial_count"]),
                        _fmt(arm["final_best_per_gpu"]),
                        _fmt(arm["final_ratio_to_reference"]),
                        _fmt(arm["trials_to_target"]),
                        _fmt(arm["normalized_auc"]),
                        str(arm["failed_count"]),
                        str(arm["no_feasible_count"]),
                    ]
                )
                + " |"
            )
        if case["harness_vs_naive"]:
            lines.extend(["", "| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |", "| --- | ---: | ---: | ---: | --- |"])
            for row in case["harness_vs_naive"]:
                lines.append(
                    "| "
                    + " | ".join(
                        [
                            f"`{row['harness']}`",
                            _fmt(row["final_ratio_vs_best_naive"]),
                            _fmt(row["target_trial_speedup_vs_best_naive"]),
                            _fmt(row["auc_ratio_vs_best_naive"]),
                            f"`{row['passes']}`",
                        ]
                    )
                    + " |"
                )
        lines.append("")
    return "\n".join(lines)
--- a/tests/test_tuning_report.py
+++ b/tests/test_tuning_report.py
@@ -0,0 +1,109 @@
 from __future__ import annotations
 import json
 import tempfile
 import unittest
 from pathlib import Path
 from aituner.tuning_report import run_tuning_report
 def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
    root.mkdir(parents=True)
    trials = []
    best_rate = None
    best_trial_id = None
    for idx, rate in enumerate(rates, start=1):
        trial_id = f"trial-{idx:04d}"
        trials.append(
            {
                "trial_id": trial_id,
                "status": "completed" if rate is not None else "failed",
                "parallel_size": 1,
                "best_request_rate": rate,
                "best_request_rate_per_gpu": rate,
                "config_patch": {"env_patch": {}, "flag_patch": {}},
            }
        )
        if rate is not None and (best_rate is None or rate > best_rate):
            best_rate = rate
            best_trial_id = trial_id
    payload = {
        "study_id": study_id,
        "best_trial_id": best_trial_id,
        "best_request_rate": best_rate,
        "best_request_rate_per_gpu": best_rate,
        "next_trial_index": len(rates) + 1,
        "trials": trials,
    }
    (root / "state.json").write_text(json.dumps(payload), encoding="utf-8")
 class TuningReportTests(unittest.TestCase):
    """End-to-end checks for ``run_tuning_report`` against synthetic study stores."""

    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
        """A harness that reaches the target in fewer trials wins and passes."""
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)
            out_dir = workdir / "out"

            # The harness arm points at the parent directory, so the study
            # directory has to be found one level below it.
            _write_state(
                workdir / "studies" / "harness-study",
                study_id="harness-study",
                rates=[0.4, 0.9],
            )
            _write_state(
                workdir / "naive-study",
                study_id="naive-study",
                rates=[0.4, None, 0.7, 0.9],
            )

            arm_specs = [
                {
                    "name": "harness",
                    "kind": "harness",
                    "study_root": str(workdir / "studies"),
                },
                {
                    "name": "naive",
                    "kind": "naive",
                    "study_root": str(workdir / "naive-study"),
                },
            ]
            spec = {
                "report_id": "report-1",
                "output_root": str(out_dir),
                "target_fraction": 0.8,
                "cases": [
                    {
                        "case_id": "case-1",
                        "tags": ["model-a", "chat"],
                        "budgets": [1, 2, 4],
                        "arms": arm_specs,
                    }
                ],
            }
            spec_path = workdir / "report.json"
            spec_path.write_text(json.dumps(spec), encoding="utf-8")

            summary = run_tuning_report(spec_path)

            case = summary["cases"][0]
            self.assertEqual(case["reference_best_per_gpu"], 0.9)
            self.assertEqual(case["winners"]["final_best"], "harness")
            self.assertEqual(case["winners"]["fastest_to_target"], "harness")

            harness, naive = case["arms"][0], case["arms"][1]
            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
            self.assertEqual(case["target_fraction"], 0.8)
            self.assertEqual(harness["trials_to_target"], 2)
            self.assertEqual(naive["trials_to_target"], 4)
            self.assertEqual(naive["failed_count"], 1)

            comparison = case["harness_vs_naive"][0]
            self.assertTrue(comparison["passes"])
            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)

            for artifact in ("summary.json", "report.md"):
                self.assertTrue((out_dir / artifact).exists())
 # Allow running this test module directly through the unittest runner.
 if __name__ == "__main__":
    unittest.main()