From 488fae7e633dab8db3eb50dde9b167124b33117f Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Sun, 21 Jun 2026 00:48:21 +0800
Subject: [PATCH] Add tuning progress report for harness evaluation

---
 configs/examples/tuning_report.example.json |  26 +
 scripts/tuning_report.py                    |  36 ++
 src/aituner/tuning_report.py                | 581 ++++++++++++++++++++
 tests/test_tuning_report.py                 | 109 ++++
 4 files changed, 752 insertions(+)
 create mode 100644 configs/examples/tuning_report.example.json
 create mode 100644 scripts/tuning_report.py
 create mode 100644 src/aituner/tuning_report.py
 create mode 100644 tests/test_tuning_report.py

diff --git a/configs/examples/tuning_report.example.json b/configs/examples/tuning_report.example.json
new file mode 100644
index 0000000..58cecc7
--- /dev/null
+++ b/configs/examples/tuning_report.example.json
@@ -0,0 +1,26 @@
+{
+  "report_id": "qwen27b-abl12-harness-vs-naive",
+  "output_root": "../../.aituner-reports/qwen27b-abl12-harness-vs-naive",
+  "target_fraction": 0.95,
+  "min_final_ratio": 0.98,
+  "cases": [
+    {
+      "case_id": "qwen27b-chat-0-8k-real-output",
+      "description": "12-trial harness-vs-naive ablation on the 0-8k chat window with real output lengths.",
+      "tags": ["qwen27b", "chat", "0-8k", "h20", "real-output"],
+      "budgets": [1, 2, 3, 4, 6, 8, 12],
+      "arms": [
+        {
+          "name": "harness",
+          "kind": "harness",
+          "study_root": "../../.aituner/abl12-harness/dash0-qwen27b-ablation-harness-on"
+        },
+        {
+          "name": "naive",
+          "kind": "naive",
+          "study_root": "../../.aituner/abl12-naive/dash0-qwen27b-ablation-naive-off"
+        }
+      ]
+    }
+  ]
+}
diff --git a/scripts/tuning_report.py b/scripts/tuning_report.py
new file mode 100644
index 0000000..95f0dc4
--- /dev/null
+++ b/scripts/tuning_report.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from aituner.tuning_report import run_tuning_report
+
+
+def main() -> int:
+    """Parse ``--spec``, run the tuning report, and print a JSON digest to stdout.
+
+    Returns 0 so the result can be handed straight to ``SystemExit``.
+    """
+    cli = argparse.ArgumentParser(
+        description="Summarize anytime tuning progress across harness/naive study stores."
+    )
+    cli.add_argument("--spec", required=True, help="Path to a tuning report JSON spec.")
+    summary = run_tuning_report(Path(cli.parse_args().spec))
+    aggregate = summary["aggregate"]
+    # Only the headline numbers are echoed; ``summary`` itself holds the full result.
+    digest = {
+        "report_id": summary["report_id"],
+        "report_root": summary["report_root"],
+        "case_count": aggregate["case_count"],
+        "harness_vs_naive_pass_count": aggregate["harness_vs_naive_pass_count"],
+        "harness_vs_naive_check_count": aggregate["harness_vs_naive_check_count"],
+        "winner_counts": aggregate["winner_counts"],
+    }
+    print(json.dumps(digest, ensure_ascii=False, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/src/aituner/tuning_report.py b/src/aituner/tuning_report.py
new file mode 100644
index 0000000..4ff7e99
--- /dev/null
+++ b/src/aituner/tuning_report.py
@@ -0,0 +1,581 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from .spec import SpecError, load_structured_file
+from .store import StudyStore
+
+
+DEFAULT_BUDGETS = [1, 2, 3, 4, 6, 8, 12]  # used when a case omits "budgets"
+DEFAULT_TARGET_FRACTION = 0.95  # used when the spec omits "target_fraction"
+DEFAULT_MIN_FINAL_RATIO = 0.98  # used when the spec omits "min_final_ratio"
+
+
+def run_tuning_report(spec_path: Path) -> dict[str, Any]:
+    """Build the tuning report described by ``spec_path`` and return its summary.
+
+    Side effect: writes ``summary.json`` and ``report.md`` into the output root.
+    """
+    resolved_spec = spec_path.resolve()
+    spec = _load_report_spec(resolved_spec)
+    out_dir = _resolve_output_root(spec, spec_path=resolved_spec)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    case_summaries = [_summarize_case(entry, spec_path=resolved_spec) for entry in spec["cases"]]
+    summary = {
+        "report_id": spec["report_id"],
+        "report_root": str(out_dir),
+        "target_fraction": spec["target_fraction"],
+        "min_final_ratio": spec["min_final_ratio"],
+        "cases": case_summaries,
+        "aggregate": _aggregate_cases(case_summaries),
+    }
+    StudyStore.write_json(out_dir / "summary.json", summary)
+    (out_dir / "report.md").write_text(_render_report(summary), encoding="utf-8")
+    return summary
+
+
+def _load_report_spec(path: Path) -> dict[str, Any]:
+    """Read the report spec at ``path`` and return it validated and normalized.
+
+    Raises ``SpecError`` for a blank ``report_id``, a missing or empty ``cases`` list,
+    or a non-positive threshold; the helpers it calls may raise it as well.
+    """
+    document = dict(load_structured_file(path))
+    report_id = str(document.get("report_id") or "").strip()
+    if not report_id:
+        raise SpecError("report_id must be a non-empty string.")
+    case_entries = document.get("cases")
+    if not (isinstance(case_entries, list) and case_entries):
+        raise SpecError("cases must be a non-empty list.")
+    target_fraction = _as_float(document.get("target_fraction"), default=DEFAULT_TARGET_FRACTION)
+    if target_fraction <= 0:
+        raise SpecError("target_fraction must be positive.")
+    min_final_ratio = _as_float(document.get("min_final_ratio"), default=DEFAULT_MIN_FINAL_RATIO)
+    if min_final_ratio <= 0:
+        raise SpecError("min_final_ratio must be positive.")
+    # The report-level thresholds are handed to every case as its defaults.
+    shared = {"default_target_fraction": target_fraction, "default_min_final_ratio": min_final_ratio}
+    cases = [_load_case(entry, idx=position, **shared) for position, entry in enumerate(case_entries)]
+    return {
+        "report_id": report_id,
+        # A blank output_root is stored as None.
+        "output_root": str(document.get("output_root") or "").strip() or None,
+        "target_fraction": target_fraction,
+        "min_final_ratio": min_final_ratio,
+        "cases": cases,
+    }
+
+
+def _load_case(
+    raw: Any, *, idx: int, default_target_fraction: float, default_min_final_ratio: float
+) -> dict[str, Any]:
+    """Validate ``cases[idx]`` and return it normalized; raises ``SpecError`` on malformed or non-positive values."""
+    if not isinstance(raw, dict):
+        raise SpecError(f"cases[{idx}] must be an object.")
+    case_id = str(raw.get("case_id") or "").strip()
+    if not case_id:
+        raise SpecError(f"cases[{idx}].case_id must be a non-empty string.")
+    raw_arms = raw.get("arms")
+    if not isinstance(raw_arms, list) or not raw_arms:
+        raise SpecError(f"cases[{idx}].arms must be a non-empty list.")
+    arms = [_load_arm(item, context=f"cases[{idx}].arms[{arm_idx}]") for arm_idx, item in enumerate(raw_arms)]
+    if len({item["name"] for item in arms}) != len(arms):
+        raise SpecError(f"cases[{idx}].arms names must be unique.")
+    raw_budgets = raw.get("budgets", DEFAULT_BUDGETS)
+    if not isinstance(raw_budgets, list) or not raw_budgets:
+        raise SpecError(f"cases[{idx}].budgets must be a non-empty list.")
+    budgets = sorted({_positive_int(item, context=f"cases[{idx}].budgets") for item in raw_budgets})
+    thresholds = {
+        "target_fraction": _as_float(raw.get("target_fraction"), default=default_target_fraction),
+        "min_final_ratio": _as_float(raw.get("min_final_ratio"), default=default_min_final_ratio),
+    }
+    for key, value in thresholds.items():
+        if value <= 0:  # a per-case override must obey the same rule as the report-level value
+            raise SpecError(f"cases[{idx}].{key} must be positive.")
+    return {
+        "case_id": case_id,
+        "description": str(raw.get("description") or "").strip(),
+        "tags": [str(item).strip() for item in raw.get("tags", []) if str(item).strip()] if isinstance(raw.get("tags", []), list) else [],
+        "budgets": budgets,
+        **thresholds,
+        "arms": arms,
+    }
+
+
+def _load_arm(raw: Any, *, context: str) -> dict[str, Any]:
+    """Normalize one arm entry; ``context`` prefixes every ``SpecError`` message."""
+    if not isinstance(raw, dict):
+        raise SpecError(f"{context} must be an object.")
+    text = {key: str(raw.get(key) or "").strip() for key in ("name", "study_root", "label")}
+    if not text["name"]:
+        raise SpecError(f"{context}.name must be a non-empty string.")
+    if not text["study_root"]:
+        raise SpecError(f"{context}.study_root must be a non-empty string.")
+    # A falsy ``kind`` and a blank ``label`` both fall back to the arm name.
+    return {
+        "name": text["name"],
+        "kind": str(raw.get("kind") or text["name"]).strip(),
+        "study_root": text["study_root"],
+        "label": text["label"] or text["name"],
+    }
+
+
+def _resolve_output_root(spec: dict[str, Any], *, spec_path: Path) -> Path:
+    """Return the spec's ``output_root`` resolved via the spec's directory, else ``.aituner-reports/<report_id>``."""
+    if configured := spec.get("output_root"):
+        return _resolve_path(str(configured), base_dir=spec_path.parent)
+    return (Path(".aituner-reports") / str(spec["report_id"])).resolve()
+
+
+def _summarize_case(case: dict[str, Any], *, spec_path: Path) -> dict[str, Any]:
+    """Summarize every arm of ``case`` and score the arms against each other.
+
+    The reference is taken from ``_reference_best`` over all arms; every arm summary is
+    then annotated in place with metrics relative to that reference. The returned case
+    summary also carries the winners, the harness-vs-naive rows and the warnings.
+    """
+    budgets = case["budgets"]
+    arms = [_summarize_arm(arm, budgets=budgets, spec_path=spec_path) for arm in case["arms"]]
+    reference = _reference_best(arms)
+    # The horizon spans the largest requested budget and the longest trial history.
+    max_budget = max(budgets + [arm["trial_count"] for arm in arms])
+    for arm in arms:
+        _add_reference_metrics(
+            arm, reference=reference, max_budget=max_budget, target_fraction=case["target_fraction"]
+        )
+    # Winners and the comparison read the metrics that were just attached to each arm.
+    winners = _case_winners(arms)
+    comparison = _harness_vs_naive(arms, min_final_ratio=case["min_final_ratio"])
+    return {
+        "case_id": case["case_id"],
+        "description": case["description"],
+        "tags": case["tags"],
+        "budgets": budgets,
+        "target_fraction": case["target_fraction"],
+        "min_final_ratio": case["min_final_ratio"],
+        "reference_best_per_gpu": reference,
+        "max_budget": max_budget,
+        "arms": arms,
+        "winners": winners,
+        "harness_vs_naive": comparison,
+        "warnings": _case_warnings(case, arms, comparison),
+    }
+
+
+def _summarize_arm(arm: dict[str, Any], *, budgets: list[int], spec_path: Path) -> dict[str, Any]:
+    """Load an arm's ``state.json`` and summarize its trials; non-dict trial entries count as infeasible."""
+    study_root = _resolve_study_root(arm["study_root"], base_dir=spec_path.parent)
+    state = json.loads((study_root / "state.json").read_text(encoding="utf-8"))
+    trials = state.get("trials") if isinstance(state.get("trials"), list) else []
+    # Malformed (non-dict) entries are tolerated here, just as _running_best_curve tolerates them.
+    records = [item if isinstance(item, dict) else {} for item in trials]
+    curve = _running_best_curve(trials)
+    final_best = curve[-1] if curve else None
+    return {
+        "name": arm["name"],
+        "kind": arm["kind"],
+        "label": arm["label"],
+        "study_root": str(study_root),
+        "study_id": state.get("study_id"),
+        "trial_count": len(trials),
+        "completed_count": sum(1 for item in records if item.get("status") == "completed"),
+        "failed_count": sum(1 for item in records if item.get("status") == "failed"),
+        "no_feasible_count": sum(1 for item in records if not isinstance(item.get("best_request_rate_per_gpu"), (int, float))),
+        "best_trial_id": state.get("best_trial_id"),
+        "best_trial_index": _first_index_at_value(curve, final_best),
+        "final_best_per_gpu": final_best,
+        "state_best_per_gpu": state.get("best_request_rate_per_gpu"),
+        "best_at_budget": {str(budget): _value_at_budget(curve, budget) for budget in budgets},
+        "running_best_per_gpu": curve,
+        "stop_reason": str(state.get("tuning_stop_reason") or ""),
+        "stop_diagnosis": str(state.get("tuning_stop_diagnosis") or ""),
+    }
+
+
+def _add_reference_metrics(
+    arm: dict[str, Any],
+    *,
+    reference: float | None,
+    max_budget: int,
+    target_fraction: float,
+) -> None:
+    """Annotate ``arm`` in place with metrics measured against ``reference``.
+
+    Sets ``final_ratio_to_reference``, ``target_per_gpu``, ``trials_to_target`` and ``normalized_auc``.
+    """
+    final_best = arm.get("final_best_per_gpu")
+    # A missing or zero reference leaves both the ratio and the target undefined.
+    if reference and isinstance(final_best, (int, float)):
+        arm["final_ratio_to_reference"] = float(final_best) / reference
+    else:
+        arm["final_ratio_to_reference"] = None
+    arm["target_per_gpu"] = reference * target_fraction if reference else None
+    curve = arm["running_best_per_gpu"]
+    arm["trials_to_target"] = _trials_to_target(curve, arm["target_per_gpu"])
+    arm["normalized_auc"] = _normalized_auc(curve, reference=reference, max_budget=max_budget)
+
+
+def _harness_vs_naive(arms: list[dict[str, Any]], *, min_final_ratio: float) -> list[dict[str, Any]]:
+    """Compare each ``harness`` arm against the best result among the ``naive`` arms.
+
+    Returns one row per harness arm, or ``[]`` when either kind is absent. A row
+    passes when its final ratio meets ``min_final_ratio`` and its target speedup
+    is either undefined or at least 1.0.
+    """
+    by_kind = {kind: [arm for arm in arms if arm["kind"] == kind] for kind in ("naive", "harness")}
+    if not (by_kind["naive"] and by_kind["harness"]):
+        return []
+    naive_final = _max_optional(arm.get("final_best_per_gpu") for arm in by_kind["naive"])
+    naive_ttt = _min_optional(arm.get("trials_to_target") for arm in by_kind["naive"])
+    naive_auc = _max_optional(arm.get("normalized_auc") for arm in by_kind["naive"])
+
+    def ratio(value: Any, baseline: float | None) -> float | None:
+        # Undefined when the baseline is missing/zero or the harness value is not numeric.
+        if baseline and isinstance(value, (int, float)):
+            return float(value) / baseline
+        return None
+
+    rows = []
+    for harness in by_kind["harness"]:
+        final_ratio = ratio(harness.get("final_best_per_gpu"), naive_final)
+        speedup = _speedup(naive_ttt, harness.get("trials_to_target"))
+        pass_final = final_ratio is not None and final_ratio >= min_final_ratio
+        pass_speed = speedup is None or speedup >= 1.0
+        rows.append(
+            {
+                "harness": harness["name"],
+                "best_naive_final_per_gpu": naive_final,
+                "best_naive_trials_to_target": naive_ttt,
+                "best_naive_normalized_auc": naive_auc,
+                "final_ratio_vs_best_naive": final_ratio,
+                "target_trial_speedup_vs_best_naive": speedup,
+                "auc_ratio_vs_best_naive": ratio(harness.get("normalized_auc"), naive_auc),
+                "passes_min_final_ratio": pass_final,
+                "passes_speed": pass_speed,
+                "passes": pass_final and pass_speed,
+            }
+        )
+    return rows
+
+
+def _case_winners(arms: list[dict[str, Any]]) -> dict[str, str | None]:
+    """Name the winning arm per metric: highest final best, fewest trials to target, highest AUC."""
+    best_final = _argmax(arms, "final_best_per_gpu")
+    quickest = _argmin(arms, "trials_to_target")
+    best_auc = _argmax(arms, "normalized_auc")
+    return {"final_best": best_final, "fastest_to_target": quickest, "normalized_auc": best_auc}
+
+
+def _aggregate_cases(cases: list[dict[str, Any]]) -> dict[str, Any]:
+    """Roll per-case results up into report-level totals.
+
+    Counts winners per metric, tallies harness-vs-naive passes and checks, and
+    averages the final ratio and normalized AUC per arm kind.
+    """
+    winner_counts: dict[str, dict[str, int]] = {
+        "final_best": {},
+        "fastest_to_target": {},
+        "normalized_auc": {},
+    }
+    ratios_by_kind: dict[str, list[float]] = {}
+    aucs_by_kind: dict[str, list[float]] = {}
+    by_kind: dict[str, dict[str, Any]] = {}
+    harness_passes = 0
+    harness_checks = 0
+    for case in cases:
+        for metric, tally in winner_counts.items():
+            winner = case["winners"].get(metric)
+            if winner:
+                tally[winner] = tally.get(winner, 0) + 1
+        for row in case["harness_vs_naive"]:
+            harness_checks += 1
+            harness_passes += 1 if row["passes"] else 0
+        for arm in case["arms"]:
+            kind = arm["kind"]
+            if kind not in by_kind:
+                by_kind[kind] = {
+                    "arm_count": 0,
+                    "mean_final_ratio_to_reference": None,
+                    "mean_normalized_auc": None,
+                    "target_reached_count": 0,
+                }
+                ratios_by_kind[kind] = []
+                aucs_by_kind[kind] = []
+            bucket = by_kind[kind]
+            bucket["arm_count"] += 1
+            ratio = arm.get("final_ratio_to_reference")
+            if isinstance(ratio, (int, float)):
+                ratios_by_kind[kind].append(float(ratio))
+            auc = arm.get("normalized_auc")
+            if isinstance(auc, (int, float)):
+                aucs_by_kind[kind].append(float(auc))
+            if isinstance(arm.get("trials_to_target"), int):
+                bucket["target_reached_count"] += 1
+    for kind, bucket in by_kind.items():
+        bucket["mean_final_ratio_to_reference"] = _mean(ratios_by_kind[kind])
+        bucket["mean_normalized_auc"] = _mean(aucs_by_kind[kind])
+    return {
+        "case_count": len(cases),
+        "by_kind": by_kind,
+        "winner_counts": winner_counts,
+        "harness_vs_naive_pass_count": harness_passes,
+        "harness_vs_naive_check_count": harness_checks,
+    }
+
+
+def _case_warnings(
+    case: dict[str, Any],
+    arms: list[dict[str, Any]],
+    comparison: list[dict[str, Any]],
+) -> list[str]:
+    """Collect human-readable caveats for a case.
+
+    Covers a missing arm kind, sparse tags, and every failed harness-vs-naive check.
+    """
+    notes: list[str] = []
+    if not {"harness", "naive"} <= {arm["kind"] for arm in arms}:
+        notes.append("case does not include both harness and naive arms")
+    if len(case["tags"]) < 2:
+        notes.append("case has few tags; add workload/model/SLO tags to support generalization claims")
+    failure_messages = (
+        ("passes_min_final_ratio", "final best is below min_final_ratio versus best naive"),
+        ("passes_speed", "reaches target later than best naive"),
+    )
+    for row in comparison or ():
+        for flag, message in failure_messages:
+            if not row[flag]:
+                notes.append(f"{row['harness']} {message}")
+    return notes
+
+
+def _running_best_curve(trials: list[Any]) -> list[float | None]:
+    """Return, per trial, the best numeric ``best_request_rate_per_gpu`` seen so far (``None`` before the first)."""
+    best_so_far, history = None, []
+    for entry in trials:
+        observed = entry.get("best_request_rate_per_gpu") if isinstance(entry, dict) else None
+        if isinstance(observed, (int, float)) and (best_so_far is None or float(observed) > best_so_far):
+            best_so_far = float(observed)
+        history.append(best_so_far)
+    return history
+
+
+def _value_at_budget(curve: list[float | None], budget: int) -> float | None:
+    """Return the curve entry for ``budget`` trials, clamping ``budget`` into ``1..len(curve)``."""
+    if curve:
+        return curve[min(max(budget, 1), len(curve)) - 1]
+    return None
+
+
+def _trials_to_target(curve: list[float | None], target: float | None) -> int | None:
+    """Return the 1-based index of the first curve entry that is numeric and ``>= target``, else ``None``."""
+    if target is None:
+        return None
+    # The generator scans the curve in order, so the first hit is the earliest trial.
+    hits = (pos for pos, best in enumerate(curve, start=1) if isinstance(best, (int, float)) and best >= target)
+    return next(hits, None)
+
+
+def _normalized_auc(curve: list[float | None], *, reference: float | None, max_budget: int) -> float | None:
+    """Return the running best summed over budgets ``1..max_budget``, divided by ``reference * max_budget``.
+
+    Budgets whose value is not numeric add zero. ``None`` is returned when ``reference``
+    is missing or zero, or when ``max_budget`` is not positive.
+    """
+    if not reference or max_budget <= 0:
+        return None
+    area = 0.0
+    # Plain left-to-right accumulation over every budget in the horizon.
+    for sample in (_value_at_budget(curve, budget) for budget in range(1, max_budget + 1)):
+        area += float(sample) if isinstance(sample, (int, float)) else 0.0
+    return area / (reference * max_budget)
+
+
+def _reference_best(arms: list[dict[str, Any]]) -> float | None:  # largest numeric final_best_per_gpu over all arms
+    return _max_optional(arm.get("final_best_per_gpu") for arm in arms)
+
+
+def _resolve_study_root(raw_path: str, *, base_dir: Path) -> Path:
+    """Return the directory holding ``state.json``: the resolved path itself or its single matching child."""
+    root = _resolve_path(raw_path, base_dir=base_dir)
+    if (root / "state.json").exists():
+        return root
+    candidates = sorted(root.glob("*/state.json"))
+    if len(candidates) == 1:
+        return candidates[0].parent
+    problem = "is ambiguous; point to a specific study directory" if candidates else "does not contain state.json"
+    raise SpecError(f"study_root {problem}: {root}")
+
+
+def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
+    """Return ``raw_path`` as a ``Path``, resolving it against ``base_dir`` when it is relative."""
+    candidate = Path(raw_path)
+    # Absolute paths are returned as given, without calling resolve().
+    return candidate if candidate.is_absolute() else (base_dir / candidate).resolve()
+
+
+def _as_float(value: Any, *, default: float) -> float:
+    """Coerce a spec number to ``float``: ``None`` gives ``default``; bools and non-numbers raise ``SpecError``."""
+    numeric = isinstance(value, (int, float)) and not isinstance(value, bool)
+    if value is not None and not numeric:
+        raise SpecError(f"Expected numeric value, got {value!r}.")
+    return default if value is None else float(value)
+
+
+def _positive_int(value: Any, *, context: str) -> int:  # accepts only positive non-bool ints, else SpecError
+    if isinstance(value, int) and not isinstance(value, bool) and value > 0:
+        return value
+    raise SpecError(f"{context} must contain positive integers.")
+
+
+def _first_index_at_value(curve: list[float | None], value: float | None) -> int | None:
+    """Return the 1-based position of the first curve entry equal to ``value``, or ``None``."""
+    if value is None:
+        return None
+    # Entries are matched with ``==``; a ``None`` value never matches anything.
+    matches = (position for position, item in enumerate(curve, start=1) if item == value)
+    return next(matches, None)
+
+
+def _argmax(rows: list[dict[str, Any]], key: str) -> str | None:
+    """Return the ``name`` of the row with the largest numeric ``row[key]``, or ``None`` if no row has one."""
+    candidates = [
+        (str(row["name"]), float(row[key])) for row in rows if isinstance(row.get(key), (int, float))
+    ]
+    if not candidates:
+        return None
+    # sorted() stays stable with reverse=True, so the earliest of several equal maxima wins.
+    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
+    return ranked[0][0]
+
+
+def _argmin(rows: list[dict[str, Any]], key: str) -> str | None:
+    """Return the ``name`` of the row with the smallest ``int`` at ``row[key]``, or ``None`` if no row has one."""
+    candidates = [
+        (str(row["name"]), int(row[key])) for row in rows if isinstance(row.get(key), int)
+    ]
+    if not candidates:
+        return None
+    # min() returns the first minimal element, so the earliest row wins a tie,
+    # exactly as a stable ascending sort followed by taking element 0 would.
+    return min(candidates, key=lambda pair: pair[1])[0]
+
+
+def _max_optional(values: Any) -> float | None:
+    """Return the largest numeric item of ``values`` as a ``float``, or ``None`` when there is none."""
+    return max((float(item) for item in values if isinstance(item, (int, float))), default=None)
+
+
+def _min_optional(values: Any) -> int | None:
+    """Return the smallest ``int`` item of ``values``, or ``None`` when there is none."""
+    return min((int(item) for item in values if isinstance(item, int)), default=None)
+
+
+def _mean(values: list[float]) -> float | None:  # arithmetic mean, or None for an empty list
+    return sum(values) / len(values) if values else None
+
+
+def _speedup(naive_trials: int | None, harness_trials: int | None) -> float | None:
+    """Return ``naive_trials / harness_trials``; ``0.0`` if only ``harness_trials`` is ``None``; else ``None``."""
+    if naive_trials is None:
+        return None
+    if harness_trials is None:
+        return 0.0  # the naive side has a count but the harness side does not
+    # A non-positive harness count yields no ratio.
+    return float(naive_trials) / float(harness_trials) if harness_trials > 0 else None
+
+
+def _fmt(value: Any) -> str:
+    """Format a report cell: floats with four decimals, ``None`` as ``-``, anything else via ``str``."""
+    if value is None:
+        return "-"
+    # Only real floats get fixed precision; ints and bools fall through to str().
+    return f"{value:.4f}" if isinstance(value, float) else str(value)
+
+
+def _render_report(summary: dict[str, Any]) -> str:
+    """Render ``summary`` as a Markdown document.
+
+    Sections: aggregate counts, a per-kind table, then one section per case with
+    an arm table and, when the case has comparison rows, a harness-vs-naive table.
+    """
+
+    def table_row(cells: list[str]) -> str:
+        # One Markdown table row: cells joined by " | " between the outer pipes.
+        return "| " + " | ".join(cells) + " |"
+
+    aggregate = summary["aggregate"]
+    pass_count = aggregate["harness_vs_naive_pass_count"]
+    check_count = aggregate["harness_vs_naive_check_count"]
+    lines = [
+        f"# {summary['report_id']}",
+        "",
+        "## Aggregate",
+        "",
+        f"- Cases: `{aggregate['case_count']}`",
+        f"- Harness-vs-naive pass/checks: `{pass_count}`/`{check_count}`",
+        f"- Winner counts: `{json.dumps(aggregate['winner_counts'], ensure_ascii=False)}`",
+        "",
+        "## By Kind",
+        "",
+        "| Kind | Arms | Mean final/ref | Mean AUC | Target reached |",
+        "| --- | ---: | ---: | ---: | ---: |",
+    ]
+    # Kinds are emitted in sorted order so the table is stable between runs.
+    for kind, payload in sorted(aggregate["by_kind"].items()):
+        cells = [
+            f"`{kind}`",
+            str(payload["arm_count"]),
+            _fmt(payload["mean_final_ratio_to_reference"]),
+            _fmt(payload["mean_normalized_auc"]),
+            str(payload["target_reached_count"]),
+        ]
+        lines.append(table_row(cells))
+    lines += ["", "## Cases", ""]
+    for case in summary["cases"]:
+        lines += [
+            f"### {case['case_id']}",
+            "",
+            f"- Reference best req/s/GPU: `{_fmt(case['reference_best_per_gpu'])}`",
+            f"- Target fraction: `{case['target_fraction']}`",
+            f"- Winners: `{json.dumps(case['winners'], ensure_ascii=False)}`",
+        ]
+        # The warnings bullet is left out entirely when a case has none.
+        if case["warnings"]:
+            lines.append(f"- Warnings: `{json.dumps(case['warnings'], ensure_ascii=False)}`")
+        lines += [
+            "",
+            "| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |",
+            "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
+        ]
+        for arm in case["arms"]:
+            cells = [
+                f"`{arm['name']}`",
+                f"`{arm['kind']}`",
+                str(arm["trial_count"]),
+                _fmt(arm["final_best_per_gpu"]),
+                _fmt(arm["final_ratio_to_reference"]),
+                _fmt(arm["trials_to_target"]),
+                _fmt(arm["normalized_auc"]),
+                str(arm["failed_count"]),
+                str(arm["no_feasible_count"]),
+            ]
+            lines.append(table_row(cells))
+        if case["harness_vs_naive"]:
+            lines += [
+                "",
+                "| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |",
+                "| --- | ---: | ---: | ---: | --- |",
+            ]
+            for row in case["harness_vs_naive"]:
+                cells = [
+                    f"`{row['harness']}`",
+                    _fmt(row["final_ratio_vs_best_naive"]),
+                    _fmt(row["target_trial_speedup_vs_best_naive"]),
+                    _fmt(row["auc_ratio_vs_best_naive"]),
+                    f"`{row['passes']}`",
+                ]
+                lines.append(table_row(cells))
+        lines.append("")
+    return "\n".join(lines)
diff --git a/tests/test_tuning_report.py b/tests/test_tuning_report.py
new file mode 100644
index 0000000..beac547
--- /dev/null
+++ b/tests/test_tuning_report.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+import json
+import tempfile
+import unittest
+from pathlib import Path
+
+from aituner.tuning_report import run_tuning_report
+
+
+def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
+    """Write a minimal ``state.json`` under ``root`` with one trial per entry of ``rates``."""
+    root.mkdir(parents=True)
+    trials = [
+        {
+            "trial_id": f"trial-{position:04d}",
+            "status": "completed" if rate is not None else "failed",
+            "parallel_size": 1,
+            "best_request_rate": rate,
+            "best_request_rate_per_gpu": rate,
+            "config_patch": {"env_patch": {}, "flag_patch": {}},
+        }
+        for position, rate in enumerate(rates, start=1)
+    ]
+    best_rate, best_trial_id = None, None
+    # The strictly-greater comparison keeps the earliest trial among equal best rates.
+    for trial in trials:
+        rate = trial["best_request_rate_per_gpu"]
+        if rate is not None and (best_rate is None or rate > best_rate):
+            best_rate, best_trial_id = rate, trial["trial_id"]
+    payload = {
+        "study_id": study_id,
+        "best_trial_id": best_trial_id,
+        "best_request_rate": best_rate,
+        "best_request_rate_per_gpu": best_rate,
+        "next_trial_index": len(rates) + 1,
+        "trials": trials,
+    }
+    (root / "state.json").write_text(json.dumps(payload), encoding="utf-8")
+
+
+class TuningReportTests(unittest.TestCase):
+    """End-to-end check of ``run_tuning_report`` against two synthetic study stores."""
+
+    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
+        """Both arms peak at 0.9; the harness meets the 0.8 target fraction after 2 trials, the naive arm after 4."""
+        with tempfile.TemporaryDirectory() as tmp:
+            workdir = Path(tmp)
+            # The harness study sits one level below its study_root; the naive one is pointed at directly.
+            _write_state(
+                workdir / "studies" / "harness-study",
+                study_id="harness-study",
+                rates=[0.4, 0.9],
+            )
+            _write_state(
+                workdir / "naive-study",
+                study_id="naive-study",
+                rates=[0.4, None, 0.7, 0.9],
+            )
+            arms = [
+                {
+                    "name": "harness",
+                    "kind": "harness",
+                    "study_root": str(workdir / "studies"),
+                },
+                {
+                    "name": "naive",
+                    "kind": "naive",
+                    "study_root": str(workdir / "naive-study"),
+                },
+            ]
+            spec = {
+                "report_id": "report-1",
+                "output_root": str(workdir / "out"),
+                "target_fraction": 0.8,
+                "cases": [
+                    {
+                        "case_id": "case-1",
+                        "tags": ["model-a", "chat"],
+                        "budgets": [1, 2, 4],
+                        "arms": arms,
+                    }
+                ],
+            }
+            spec_path = workdir / "report.json"
+            spec_path.write_text(json.dumps(spec), encoding="utf-8")
+
+            summary = run_tuning_report(spec_path)
+
+            case = summary["cases"][0]
+            harness, naive = case["arms"][0], case["arms"][1]
+            comparison = case["harness_vs_naive"][0]
+            self.assertEqual(case["reference_best_per_gpu"], 0.9)
+            self.assertEqual(case["target_fraction"], 0.8)
+            self.assertEqual(case["winners"]["final_best"], "harness")
+            self.assertEqual(case["winners"]["fastest_to_target"], "harness")
+            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
+            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
+            self.assertEqual(harness["trials_to_target"], 2)
+            self.assertEqual(naive["trials_to_target"], 4)
+            self.assertEqual(naive["failed_count"], 1)
+            self.assertTrue(comparison["passes"])
+            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)
+            for artifact in ("summary.json", "report.md"):
+                self.assertTrue((workdir / "out" / artifact).exists())
+
+
+if __name__ == "__main__":
+    unittest.main()  # lets the test module be run directly as a script