From 488fae7e633dab8db3eb50dde9b167124b33117f Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Sun, 21 Jun 2026 00:48:21 +0800
Subject: [PATCH] Add tuning progress report for harness evaluation

---
Review note: the Python files below were amended after this patch was
generated, so the blob ids on their index lines are stale.

 configs/examples/tuning_report.example.json |  26 +
 scripts/tuning_report.py                    |  37 ++
 src/aituner/tuning_report.py                | 638 ++++++++++++++++++++
 tests/test_tuning_report.py                 | 146 +++++
 4 files changed, 847 insertions(+)
 create mode 100644 configs/examples/tuning_report.example.json
 create mode 100644 scripts/tuning_report.py
 create mode 100644 src/aituner/tuning_report.py
 create mode 100644 tests/test_tuning_report.py

diff --git a/configs/examples/tuning_report.example.json b/configs/examples/tuning_report.example.json
new file mode 100644
index 0000000..58cecc7
--- /dev/null
+++ b/configs/examples/tuning_report.example.json
@@ -0,0 +1,26 @@
+{
+  "report_id": "qwen27b-abl12-harness-vs-naive",
+  "output_root": "../../.aituner-reports/qwen27b-abl12-harness-vs-naive",
+  "target_fraction": 0.95,
+  "min_final_ratio": 0.98,
+  "cases": [
+    {
+      "case_id": "qwen27b-chat-0-8k-real-output",
+      "description": "12-trial harness-vs-naive ablation on the 0-8k chat window with real output lengths.",
+      "tags": ["qwen27b", "chat", "0-8k", "h20", "real-output"],
+      "budgets": [1, 2, 3, 4, 6, 8, 12],
+      "arms": [
+        {
+          "name": "harness",
+          "kind": "harness",
+          "study_root": "../../.aituner/abl12-harness/dash0-qwen27b-ablation-harness-on"
+        },
+        {
+          "name": "naive",
+          "kind": "naive",
+          "study_root": "../../.aituner/abl12-naive/dash0-qwen27b-ablation-naive-off"
+        }
+      ]
+    }
+  ]
+}
diff --git a/scripts/tuning_report.py b/scripts/tuning_report.py
new file mode 100644
index 0000000..95f0dc4
--- /dev/null
+++ b/scripts/tuning_report.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from aituner.tuning_report import run_tuning_report
+
+
+def main() -> int:
+    """Run the report named by ``--spec`` and print a short JSON digest."""
+    parser = argparse.ArgumentParser(
+        description="Summarize anytime tuning progress across harness/naive study stores."
+    )
+    parser.add_argument("--spec", required=True, help="Path to a tuning report JSON spec.")
+    args = parser.parse_args()
+    summary = run_tuning_report(Path(args.spec))
+    print(
+        json.dumps(
+            {
+                "report_id": summary["report_id"],
+                "report_root": summary["report_root"],
+                "case_count": summary["aggregate"]["case_count"],
+                "harness_vs_naive_pass_count": summary["aggregate"]["harness_vs_naive_pass_count"],
+                "harness_vs_naive_check_count": summary["aggregate"]["harness_vs_naive_check_count"],
+                "winner_counts": summary["aggregate"]["winner_counts"],
+            },
+            ensure_ascii=False,
+            indent=2,
+        )
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/aituner/tuning_report.py b/src/aituner/tuning_report.py
new file mode 100644
index 0000000..4ff7e99
--- /dev/null
+++ b/src/aituner/tuning_report.py
@@ -0,0 +1,638 @@
+"""Score anytime tuning progress of harness and naive study stores."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from .spec import SpecError, load_structured_file
+from .store import StudyStore
+
+
+DEFAULT_BUDGETS = [1, 2, 3, 4, 6, 8, 12]
+DEFAULT_TARGET_FRACTION = 0.95
+DEFAULT_MIN_FINAL_RATIO = 0.98
+
+
+def run_tuning_report(spec_path: Path) -> dict[str, Any]:
+    """Build the report for ``spec_path`` and write it under the output root.
+
+    Writes ``summary.json`` and ``report.md`` into the resolved output root
+    (created if missing) and returns the summary dictionary.
+    """
+    spec_path = spec_path.resolve()
+    spec = _load_report_spec(spec_path)
+    report_root = _resolve_output_root(spec, spec_path=spec_path)
+    report_root.mkdir(parents=True, exist_ok=True)
+
+    cases = [
+        _summarize_case(case, spec_path=spec_path)
+        for case in spec["cases"]
+    ]
+    summary = {
+        "report_id": spec["report_id"],
+        "report_root": str(report_root),
+        "target_fraction": spec["target_fraction"],
+        "min_final_ratio": spec["min_final_ratio"],
+        "cases": cases,
+        "aggregate": _aggregate_cases(cases),
+    }
+    StudyStore.write_json(report_root / "summary.json", summary)
+    (report_root / "report.md").write_text(_render_report(summary), encoding="utf-8")
+    return summary
+
+
+def _load_report_spec(path: Path) -> dict[str, Any]:
+    """Load and validate the report spec, applying report-level defaults."""
+    payload = dict(load_structured_file(path))
+    report_id = str(payload.get("report_id") or "").strip()
+    if not report_id:
+        raise SpecError("report_id must be a non-empty string.")
+    raw_cases = payload.get("cases")
+    if not isinstance(raw_cases, list) or not raw_cases:
+        raise SpecError("cases must be a non-empty list.")
+    target_fraction = _as_float(payload.get("target_fraction"), default=DEFAULT_TARGET_FRACTION)
+    if target_fraction <= 0:
+        raise SpecError("target_fraction must be positive.")
+    min_final_ratio = _as_float(payload.get("min_final_ratio"), default=DEFAULT_MIN_FINAL_RATIO)
+    if min_final_ratio <= 0:
+        raise SpecError("min_final_ratio must be positive.")
+    cases = [
+        _load_case(
+            item,
+            idx=idx,
+            default_target_fraction=target_fraction,
+            default_min_final_ratio=min_final_ratio,
+        )
+        for idx, item in enumerate(raw_cases)
+    ]
+    return {
+        "report_id": report_id,
+        "output_root": str(payload.get("output_root") or "").strip() or None,
+        "target_fraction": target_fraction,
+        "min_final_ratio": min_final_ratio,
+        "cases": cases,
+    }
+
+
+def _load_case(
+    raw: Any,
+    *,
+    idx: int,
+    default_target_fraction: float,
+    default_min_final_ratio: float,
+) -> dict[str, Any]:
+    """Validate one case entry; per-case fractions fall back to report defaults."""
+    if not isinstance(raw, dict):
+        raise SpecError(f"cases[{idx}] must be an object.")
+    case_id = str(raw.get("case_id") or "").strip()
+    if not case_id:
+        raise SpecError(f"cases[{idx}].case_id must be a non-empty string.")
+    raw_arms = raw.get("arms")
+    if not isinstance(raw_arms, list) or not raw_arms:
+        raise SpecError(f"cases[{idx}].arms must be a non-empty list.")
+    arms = [_load_arm(item, context=f"cases[{idx}].arms[{arm_idx}]") for arm_idx, item in enumerate(raw_arms)]
+    names = [item["name"] for item in arms]
+    if len(names) != len(set(names)):
+        raise SpecError(f"cases[{idx}].arms names must be unique.")
+    raw_budgets = raw.get("budgets", DEFAULT_BUDGETS)
+    if not isinstance(raw_budgets, list) or not raw_budgets:
+        raise SpecError(f"cases[{idx}].budgets must be a non-empty list.")
+    budgets = sorted({_positive_int(item, context=f"cases[{idx}].budgets") for item in raw_budgets})
+    target_fraction = _as_float(raw.get("target_fraction"), default=default_target_fraction)
+    min_final_ratio = _as_float(raw.get("min_final_ratio"), default=default_min_final_ratio)
+    # Case-level overrides obey the same positivity rule as the report-level values.
+    if target_fraction <= 0:
+        raise SpecError(f"cases[{idx}].target_fraction must be positive.")
+    if min_final_ratio <= 0:
+        raise SpecError(f"cases[{idx}].min_final_ratio must be positive.")
+    return {
+        "case_id": case_id,
+        "description": str(raw.get("description") or "").strip(),
+        "tags": [str(item).strip() for item in raw.get("tags", []) if str(item).strip()]
+        if isinstance(raw.get("tags", []), list)
+        else [],
+        "budgets": budgets,
+        "target_fraction": target_fraction,
+        "min_final_ratio": min_final_ratio,
+        "arms": arms,
+    }
+
+
+def _load_arm(raw: Any, *, context: str) -> dict[str, Any]:
+    """Validate one arm entry; ``kind`` and ``label`` default to the arm name."""
+    if not isinstance(raw, dict):
+        raise SpecError(f"{context} must be an object.")
+    name = str(raw.get("name") or "").strip()
+    if not name:
+        raise SpecError(f"{context}.name must be a non-empty string.")
+    kind = str(raw.get("kind") or name).strip()
+    study_root = str(raw.get("study_root") or "").strip()
+    if not study_root:
+        raise SpecError(f"{context}.study_root must be a non-empty string.")
+    return {
+        "name": name,
+        "kind": kind,
+        "study_root": study_root,
+        "label": str(raw.get("label") or "").strip() or name,
+    }
+
+
+def _resolve_output_root(spec: dict[str, Any], *, spec_path: Path) -> Path:
+    """Return the report directory; relative ``output_root`` is spec-relative."""
+    raw = spec.get("output_root")
+    if raw:
+        return _resolve_path(str(raw), base_dir=spec_path.parent)
+    return (Path(".aituner-reports") / str(spec["report_id"])).resolve()
+
+
+def _summarize_case(case: dict[str, Any], *, spec_path: Path) -> dict[str, Any]:
+    """Summarize every arm of a case and score them against the case reference."""
+    arms = [
+        _summarize_arm(arm, budgets=case["budgets"], spec_path=spec_path)
+        for arm in case["arms"]
+    ]
+    reference = _reference_best(arms)
+    max_budget = max(case["budgets"] + [arm["trial_count"] for arm in arms])
+    for arm in arms:
+        _add_reference_metrics(
+            arm,
+            reference=reference,
+            max_budget=max_budget,
+            target_fraction=case["target_fraction"],
+        )
+    winners = _case_winners(arms)
+    comparison = _harness_vs_naive(
+        arms,
+        min_final_ratio=case["min_final_ratio"],
+    )
+    return {
+        "case_id": case["case_id"],
+        "description": case["description"],
+        "tags": case["tags"],
+        "budgets": case["budgets"],
+        "target_fraction": case["target_fraction"],
+        "min_final_ratio": case["min_final_ratio"],
+        "reference_best_per_gpu": reference,
+        "max_budget": max_budget,
+        "arms": arms,
+        "winners": winners,
+        "harness_vs_naive": comparison,
+        "warnings": _case_warnings(case, arms, comparison),
+    }
+
+
+def _summarize_arm(arm: dict[str, Any], *, budgets: list[int], spec_path: Path) -> dict[str, Any]:
+    """Read an arm's ``state.json`` and derive its counts and running-best curve."""
+    study_root = _resolve_study_root(arm["study_root"], base_dir=spec_path.parent)
+    state = json.loads((study_root / "state.json").read_text(encoding="utf-8"))
+    if not isinstance(state, dict):
+        raise SpecError(f"state.json must contain a JSON object: {study_root}")
+    raw_trials = state.get("trials") if isinstance(state.get("trials"), list) else []
+    # Non-object entries still occupy a trial slot but carry no status or rate,
+    # matching how _running_best_curve treats them.
+    trials = [item if isinstance(item, dict) else {} for item in raw_trials]
+    curve = _running_best_curve(trials)
+    final_best = curve[-1] if curve else None
+    best_trial_index = _first_index_at_value(curve, final_best)
+    return {
+        "name": arm["name"],
+        "kind": arm["kind"],
+        "label": arm["label"],
+        "study_root": str(study_root),
+        "study_id": state.get("study_id"),
+        "trial_count": len(trials),
+        "completed_count": sum(1 for item in trials if item.get("status") == "completed"),
+        "failed_count": sum(1 for item in trials if item.get("status") == "failed"),
+        "no_feasible_count": sum(
+            1 for item in trials if not isinstance(item.get("best_request_rate_per_gpu"), (int, float))
+        ),
+        "best_trial_id": state.get("best_trial_id"),
+        "best_trial_index": best_trial_index,
+        "final_best_per_gpu": final_best,
+        "state_best_per_gpu": state.get("best_request_rate_per_gpu"),
+        "best_at_budget": {str(budget): _value_at_budget(curve, budget) for budget in budgets},
+        "running_best_per_gpu": curve,
+        "stop_reason": str(state.get("tuning_stop_reason") or ""),
+        "stop_diagnosis": str(state.get("tuning_stop_diagnosis") or ""),
+    }
+
+
+def _add_reference_metrics(
+    arm: dict[str, Any],
+    *,
+    reference: float | None,
+    max_budget: int,
+    target_fraction: float,
+) -> None:
+    """Attach ratio-to-reference, target, trials-to-target and AUC to ``arm`` in place."""
+    final_best = arm.get("final_best_per_gpu")
+    arm["final_ratio_to_reference"] = (
+        float(final_best) / reference
+        if reference and isinstance(final_best, (int, float))
+        else None
+    )
+    target = reference * target_fraction if reference else None
+    arm["target_per_gpu"] = target
+    arm["trials_to_target"] = _trials_to_target(arm["running_best_per_gpu"], target)
+    arm["normalized_auc"] = _normalized_auc(
+        arm["running_best_per_gpu"],
+        reference=reference,
+        max_budget=max_budget,
+    )
+
+
+def _harness_vs_naive(arms: list[dict[str, Any]], *, min_final_ratio: float) -> list[dict[str, Any]]:
+    """Compare each harness arm with the best naive arm; empty if either kind is absent."""
+    naive = [arm for arm in arms if arm["kind"] == "naive"]
+    harnesses = [arm for arm in arms if arm["kind"] == "harness"]
+    if not naive or not harnesses:
+        return []
+    best_naive_final = _max_optional(arm.get("final_best_per_gpu") for arm in naive)
+    best_naive_ttt = _min_optional(arm.get("trials_to_target") for arm in naive)
+    best_naive_auc = _max_optional(arm.get("normalized_auc") for arm in naive)
+    rows = []
+    for harness in harnesses:
+        final = harness.get("final_best_per_gpu")
+        ttt = harness.get("trials_to_target")
+        auc = harness.get("normalized_auc")
+        final_ratio = (
+            float(final) / best_naive_final
+            if best_naive_final and isinstance(final, (int, float))
+            else None
+        )
+        auc_ratio = (
+            float(auc) / best_naive_auc
+            if best_naive_auc and isinstance(auc, (int, float))
+            else None
+        )
+        speedup = _speedup(best_naive_ttt, ttt)
+        if final_ratio is not None:
+            pass_final = final_ratio >= min_final_ratio
+        else:
+            # With no positive naive result the ratio is undefined; a harness that
+            # found any positive rate is not below the naive baseline.
+            pass_final = not best_naive_final and isinstance(final, (int, float)) and final > 0
+        pass_speed = speedup is None or speedup >= 1.0
+        rows.append(
+            {
+                "harness": harness["name"],
+                "best_naive_final_per_gpu": best_naive_final,
+                "best_naive_trials_to_target": best_naive_ttt,
+                "best_naive_normalized_auc": best_naive_auc,
+                "final_ratio_vs_best_naive": final_ratio,
+                "target_trial_speedup_vs_best_naive": speedup,
+                "auc_ratio_vs_best_naive": auc_ratio,
+                "passes_min_final_ratio": pass_final,
+                "passes_speed": pass_speed,
+                "passes": pass_final and pass_speed,
+            }
+        )
+    return rows
+
+
+def _case_winners(arms: list[dict[str, Any]]) -> dict[str, str | None]:
+    """Name the winning arm per metric; ``None`` where no arm has a value."""
+    return {
+        "final_best": _argmax(arms, "final_best_per_gpu"),
+        "fastest_to_target": _argmin(arms, "trials_to_target"),
+        "normalized_auc": _argmax(arms, "normalized_auc"),
+    }
+
+
+def _aggregate_cases(cases: list[dict[str, Any]]) -> dict[str, Any]:
+    """Roll case summaries up into win counts, pass counts and per-kind means."""
+    by_kind: dict[str, dict[str, Any]] = {}
+    final_wins: dict[str, int] = {}
+    speed_wins: dict[str, int] = {}
+    auc_wins: dict[str, int] = {}
+    harness_passes = 0
+    harness_checks = 0
+    for case in cases:
+        for winner_key, target in (
+            ("final_best", final_wins),
+            ("fastest_to_target", speed_wins),
+            ("normalized_auc", auc_wins),
+        ):
+            winner = case["winners"].get(winner_key)
+            if winner:
+                target[winner] = target.get(winner, 0) + 1
+        for row in case["harness_vs_naive"]:
+            harness_checks += 1
+            if row["passes"]:
+                harness_passes += 1
+        for arm in case["arms"]:
+            bucket = by_kind.setdefault(
+                arm["kind"],
+                {
+                    "arm_count": 0,
+                    "mean_final_ratio_to_reference": None,
+                    "mean_normalized_auc": None,
+                    "target_reached_count": 0,
+                    "_final_ratios": [],
+                    "_aucs": [],
+                },
+            )
+            bucket["arm_count"] += 1
+            if isinstance(arm.get("final_ratio_to_reference"), (int, float)):
+                bucket["_final_ratios"].append(float(arm["final_ratio_to_reference"]))
+            if isinstance(arm.get("normalized_auc"), (int, float)):
+                bucket["_aucs"].append(float(arm["normalized_auc"]))
+            if isinstance(arm.get("trials_to_target"), int):
+                bucket["target_reached_count"] += 1
+    for bucket in by_kind.values():
+        ratios = bucket.pop("_final_ratios")
+        aucs = bucket.pop("_aucs")
+        bucket["mean_final_ratio_to_reference"] = _mean(ratios)
+        bucket["mean_normalized_auc"] = _mean(aucs)
+    return {
+        "case_count": len(cases),
+        "by_kind": by_kind,
+        "winner_counts": {
+            "final_best": final_wins,
+            "fastest_to_target": speed_wins,
+            "normalized_auc": auc_wins,
+        },
+        "harness_vs_naive_pass_count": harness_passes,
+        "harness_vs_naive_check_count": harness_checks,
+    }
+
+
+def _case_warnings(
+    case: dict[str, Any],
+    arms: list[dict[str, Any]],
+    comparison: list[dict[str, Any]],
+) -> list[str]:
+    """List human-readable caveats about a case's coverage and comparison results."""
+    warnings = []
+    kinds = {arm["kind"] for arm in arms}
+    if "harness" not in kinds or "naive" not in kinds:
+        warnings.append("case does not include both harness and naive arms")
+    if len(case["tags"]) < 2:
+        warnings.append("case has few tags; add workload/model/SLO tags to support generalization claims")
+    if not comparison:
+        return warnings
+    for row in comparison:
+        if not row["passes_min_final_ratio"]:
+            warnings.append(
+                f"{row['harness']} final best is below min_final_ratio versus best naive"
+            )
+        if not row["passes_speed"]:
+            warnings.append(
+                f"{row['harness']} reaches target later than best naive"
+            )
+    return warnings
+
+
+def _running_best_curve(trials: list[Any]) -> list[float | None]:
+    """Return the best rate seen up to each trial (``None`` until one is feasible)."""
+    curve: list[float | None] = []
+    incumbent: float | None = None
+    for trial in trials:
+        rate = trial.get("best_request_rate_per_gpu") if isinstance(trial, dict) else None
+        if isinstance(rate, (int, float)) and (incumbent is None or float(rate) > incumbent):
+            incumbent = float(rate)
+        curve.append(incumbent)
+    return curve
+
+
+def _value_at_budget(curve: list[float | None], budget: int) -> float | None:
+    """Return the curve value after ``budget`` trials, clamped to the curve length."""
+    if not curve:
+        return None
+    index = min(max(budget, 1), len(curve)) - 1
+    return curve[index]
+
+
+def _trials_to_target(curve: list[float | None], target: float | None) -> int | None:
+    """Return the 1-based trial at which the curve first reaches ``target``."""
+    if target is None:
+        return None
+    for idx, value in enumerate(curve, start=1):
+        if isinstance(value, (int, float)) and value >= target:
+            return idx
+    return None
+
+
+def _normalized_auc(
+    curve: list[float | None],
+    *,
+    reference: float | None,
+    max_budget: int,
+) -> float | None:
+    """Average the curve over budgets 1..max_budget as a fraction of ``reference``."""
+    if not reference or max_budget <= 0:
+        return None
+    total = 0.0
+    for budget in range(1, max_budget + 1):
+        value = _value_at_budget(curve, budget)
+        total += float(value) if isinstance(value, (int, float)) else 0.0
+    return total / (reference * max_budget)
+
+
+def _reference_best(arms: list[dict[str, Any]]) -> float | None:
+    """Return the highest final best across arms, or ``None`` if none is numeric."""
+    return _max_optional(arm.get("final_best_per_gpu") for arm in arms)
+
+
+def _resolve_study_root(raw_path: str, *, base_dir: Path) -> Path:
+    """Locate the study directory: the path itself or its single child with state.json."""
+    path = _resolve_path(raw_path, base_dir=base_dir)
+    if (path / "state.json").exists():
+        return path
+    matches = sorted(path.glob("*/state.json"))
+    if len(matches) == 1:
+        return matches[0].parent
+    if not matches:
+        raise SpecError(f"study_root does not contain state.json: {path}")
+    raise SpecError(f"study_root is ambiguous; point to a specific study directory: {path}")
+
+
+def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
+    """Resolve a relative path against ``base_dir``; absolute paths pass through."""
+    path = Path(raw_path)
+    if not path.is_absolute():
+        path = (base_dir / path).resolve()
+    return path
+
+
+def _as_float(value: Any, *, default: float) -> float:
+    """Coerce a spec number to float; ``None`` yields ``default``, bools are rejected."""
+    if value is None:
+        return default
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        raise SpecError(f"Expected numeric value, got {value!r}.")
+    return float(value)
+
+
+def _positive_int(value: Any, *, context: str) -> int:
+    """Return ``value`` if it is a positive non-bool integer, else raise SpecError."""
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise SpecError(f"{context} must contain positive integers.")
+    return value
+
+
+def _first_index_at_value(curve: list[float | None], value: float | None) -> int | None:
+    """Return the 1-based position of the first curve entry equal to ``value``."""
+    if value is None:
+        return None
+    for idx, item in enumerate(curve, start=1):
+        if item == value:
+            return idx
+    return None
+
+
+def _argmax(rows: list[dict[str, Any]], key: str) -> str | None:
+    """Return the name of the row with the largest numeric ``key`` (first on ties)."""
+    scored = [
+        (str(row["name"]), float(row[key]))
+        for row in rows
+        if isinstance(row.get(key), (int, float))
+    ]
+    if not scored:
+        return None
+    scored.sort(key=lambda item: item[1], reverse=True)
+    return scored[0][0]
+
+
+def _argmin(rows: list[dict[str, Any]], key: str) -> str | None:
+    """Return the name of the row with the smallest integer ``key`` (first on ties)."""
+    scored = [
+        (str(row["name"]), int(row[key]))
+        for row in rows
+        if isinstance(row.get(key), int)
+    ]
+    if not scored:
+        return None
+    scored.sort(key=lambda item: item[1])
+    return scored[0][0]
+
+
+def _max_optional(values: Any) -> float | None:
+    """Return the max of the numeric items as a float, or ``None`` if there are none."""
+    scored = [float(item) for item in values if isinstance(item, (int, float))]
+    return max(scored) if scored else None
+
+
+def _min_optional(values: Any) -> int | None:
+    """Return the min of the integer items, or ``None`` if there are none."""
+    scored = [int(item) for item in values if isinstance(item, int)]
+    return min(scored) if scored else None
+
+
+def _mean(values: list[float]) -> float | None:
+    """Return the arithmetic mean, or ``None`` for an empty list."""
+    return sum(values) / len(values) if values else None
+
+
+def _speedup(naive_trials: int | None, harness_trials: int | None) -> float | None:
+    """Return naive/harness trials-to-target.
+
+    ``0.0`` means only the naive arm reached the target; ``None`` means the
+    ratio is undefined (naive never reached it, or harness trials is not positive).
+    """
+    if harness_trials is None:
+        return 0.0 if naive_trials is not None else None
+    if naive_trials is None:
+        return None
+    if harness_trials <= 0:
+        return None
+    return float(naive_trials) / float(harness_trials)
+
+
+def _fmt(value: Any) -> str:
+    """Format a table cell: floats to 4 decimals, ``None`` as ``-``, else ``str``."""
+    if isinstance(value, float):
+        return f"{value:.4f}"
+    if value is None:
+        return "-"
+    return str(value)
+
+
+def _render_report(summary: dict[str, Any]) -> str:
+    """Render the summary as Markdown: aggregate, per-kind table, then each case."""
+    lines = [
+        f"# {summary['report_id']}",
+        "",
+        "## Aggregate",
+        "",
+        f"- Cases: `{summary['aggregate']['case_count']}`",
+        f"- Harness-vs-naive pass/checks: `{summary['aggregate']['harness_vs_naive_pass_count']}`/`{summary['aggregate']['harness_vs_naive_check_count']}`",
+        f"- Winner counts: `{json.dumps(summary['aggregate']['winner_counts'], ensure_ascii=False)}`",
+        "",
+        "## By Kind",
+        "",
+        "| Kind | Arms | Mean final/ref | Mean AUC | Target reached |",
+        "| --- | ---: | ---: | ---: | ---: |",
+    ]
+    for kind, payload in sorted(summary["aggregate"]["by_kind"].items()):
+        lines.append(
+            "| "
+            + " | ".join(
+                [
+                    f"`{kind}`",
+                    str(payload["arm_count"]),
+                    _fmt(payload["mean_final_ratio_to_reference"]),
+                    _fmt(payload["mean_normalized_auc"]),
+                    str(payload["target_reached_count"]),
+                ]
+            )
+            + " |"
+        )
+    lines.extend(["", "## Cases", ""])
+    for case in summary["cases"]:
+        lines.extend(
+            [
+                f"### {case['case_id']}",
+                "",
+                f"- Reference best req/s/GPU: `{_fmt(case['reference_best_per_gpu'])}`",
+                f"- Target fraction: `{case['target_fraction']}`",
+                f"- Winners: `{json.dumps(case['winners'], ensure_ascii=False)}`",
+            ]
+        )
+        if case["warnings"]:
+            lines.append(f"- Warnings: `{json.dumps(case['warnings'], ensure_ascii=False)}`")
+        lines.extend(
+            [
+                "",
+                "| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |",
+                "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
+            ]
+        )
+        for arm in case["arms"]:
+            lines.append(
+                "| "
+                + " | ".join(
+                    [
+                        f"`{arm['name']}`",
+                        f"`{arm['kind']}`",
+                        str(arm["trial_count"]),
+                        _fmt(arm["final_best_per_gpu"]),
+                        _fmt(arm["final_ratio_to_reference"]),
+                        _fmt(arm["trials_to_target"]),
+                        _fmt(arm["normalized_auc"]),
+                        str(arm["failed_count"]),
+                        str(arm["no_feasible_count"]),
+                    ]
+                )
+                + " |"
+            )
+        if case["harness_vs_naive"]:
+            lines.extend(["", "| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |", "| --- | ---: | ---: | ---: | --- |"])
+            for row in case["harness_vs_naive"]:
+                lines.append(
+                    "| "
+                    + " | ".join(
+                        [
+                            f"`{row['harness']}`",
+                            _fmt(row["final_ratio_vs_best_naive"]),
+                            _fmt(row["target_trial_speedup_vs_best_naive"]),
+                            _fmt(row["auc_ratio_vs_best_naive"]),
+                            f"`{row['passes']}`",
+                        ]
+                    )
+                    + " |"
+                )
+        lines.append("")
+    return "\n".join(lines)
diff --git a/tests/test_tuning_report.py b/tests/test_tuning_report.py
new file mode 100644
index 0000000..beac547
--- /dev/null
+++ b/tests/test_tuning_report.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+from aituner.tuning_report import run_tuning_report
+
+
+def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
+    """Write a minimal ``state.json`` with one trial per rate (``None`` = failed)."""
+    root.mkdir(parents=True)
+    trials = []
+    best_rate = None
+    best_trial_id = None
+    for idx, rate in enumerate(rates, start=1):
+        trial_id = f"trial-{idx:04d}"
+        trials.append(
+            {
+                "trial_id": trial_id,
+                "status": "completed" if rate is not None else "failed",
+                "parallel_size": 1,
+                "best_request_rate": rate,
+                "best_request_rate_per_gpu": rate,
+                "config_patch": {"env_patch": {}, "flag_patch": {}},
+            }
+        )
+        if rate is not None and (best_rate is None or rate > best_rate):
+            best_rate = rate
+            best_trial_id = trial_id
+    payload = {
+        "study_id": study_id,
+        "best_trial_id": best_trial_id,
+        "best_request_rate": best_rate,
+        "best_request_rate_per_gpu": best_rate,
+        "next_trial_index": len(rates) + 1,
+        "trials": trials,
+    }
+    (root / "state.json").write_text(json.dumps(payload), encoding="utf-8")
+
+
+class TuningReportTests(unittest.TestCase):
+    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
+        """Harness reaching the same best in fewer trials wins and passes."""
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            _write_state(
+                tmp_path / "studies" / "harness-study",
+                study_id="harness-study",
+                rates=[0.4, 0.9],
+            )
+            _write_state(
+                tmp_path / "naive-study",
+                study_id="naive-study",
+                rates=[0.4, None, 0.7, 0.9],
+            )
+            spec_path = tmp_path / "report.json"
+            spec_path.write_text(
+                json.dumps(
+                    {
+                        "report_id": "report-1",
+                        "output_root": str(tmp_path / "out"),
+                        "target_fraction": 0.8,
+                        "cases": [
+                            {
+                                "case_id": "case-1",
+                                "tags": ["model-a", "chat"],
+                                "budgets": [1, 2, 4],
+                                "arms": [
+                                    {
+                                        "name": "harness",
+                                        "kind": "harness",
+                                        "study_root": str(tmp_path / "studies"),
+                                    },
+                                    {
+                                        "name": "naive",
+                                        "kind": "naive",
+                                        "study_root": str(tmp_path / "naive-study"),
+                                    },
+                                ],
+                            }
+                        ],
+                    }
+                ),
+                encoding="utf-8",
+            )
+
+            summary = run_tuning_report(spec_path)
+
+            case = summary["cases"][0]
+            self.assertEqual(case["reference_best_per_gpu"], 0.9)
+            self.assertEqual(case["winners"]["final_best"], "harness")
+            self.assertEqual(case["winners"]["fastest_to_target"], "harness")
+            harness = case["arms"][0]
+            naive = case["arms"][1]
+            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
+            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
+            self.assertEqual(case["target_fraction"], 0.8)
+            self.assertEqual(harness["trials_to_target"], 2)
+            self.assertEqual(naive["trials_to_target"], 4)
+            self.assertEqual(naive["failed_count"], 1)
+            comparison = case["harness_vs_naive"][0]
+            self.assertTrue(comparison["passes"])
+            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)
+            self.assertTrue((tmp_path / "out" / "summary.json").exists())
+            self.assertTrue((tmp_path / "out" / "report.md").exists())
+
+    def test_harness_passes_when_naive_has_no_feasible_trial(self) -> None:
+        """A harness with a feasible result passes against an all-failed naive arm."""
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            _write_state(tmp_path / "harness", study_id="harness", rates=[0.5])
+            _write_state(tmp_path / "naive", study_id="naive", rates=[None, None])
+            spec_path = tmp_path / "report.json"
+            spec_path.write_text(
+                json.dumps(
+                    {
+                        "report_id": "report-2",
+                        "output_root": str(tmp_path / "out"),
+                        "cases": [
+                            {
+                                "case_id": "case-1",
+                                "tags": ["model-a", "chat"],
+                                "arms": [
+                                    {"name": "harness", "study_root": str(tmp_path / "harness")},
+                                    {"name": "naive", "study_root": str(tmp_path / "naive")},
+                                ],
+                            }
+                        ],
+                    }
+                ),
+                encoding="utf-8",
+            )
+
+            summary = run_tuning_report(spec_path)
+
+            comparison = summary["cases"][0]["harness_vs_naive"][0]
+            self.assertIsNone(comparison["final_ratio_vs_best_naive"])
+            self.assertTrue(comparison["passes_min_final_ratio"])
+            self.assertTrue(comparison["passes"])
+            self.assertEqual(summary["cases"][0]["warnings"], [])
+
+
+if __name__ == "__main__":
+    unittest.main()