Add tuning progress report for harness evaluation

This commit is contained in:
2026-06-21 00:48:21 +08:00
parent 426151bc9f
commit 488fae7e63
4 changed files with 752 additions and 0 deletions

109
tests/test_tuning_report.py Normal file
View File

@@ -0,0 +1,109 @@
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from aituner.tuning_report import run_tuning_report
def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
    """Write a minimal study ``state.json`` under *root* for report tests.

    One trial record is created per entry in *rates*, with ids
    ``trial-0001``, ``trial-0002``, ... in input order. A ``None`` rate
    produces a trial with status ``"failed"``; any other rate produces
    ``"completed"``. The study-level best fields hold the highest
    non-``None`` rate and the id of the first trial that reached it; both
    are ``None`` when no trial has a rate.

    Args:
        root: Directory to create (missing parents are created too).
        study_id: Value stored under the ``study_id`` key.
        rates: Per-trial request rate, stored as both ``best_request_rate``
            and ``best_request_rate_per_gpu`` (every trial is written with
            ``parallel_size`` 1).

    Raises:
        FileExistsError: If *root* already exists.
    """
    root.mkdir(parents=True)
    trials = []
    best_rate = None
    best_trial_id = None
    for idx, rate in enumerate(rates, start=1):
        trial_id = f"trial-{idx:04d}"
        trials.append(
            {
                "trial_id": trial_id,
                "status": "completed" if rate is not None else "failed",
                "parallel_size": 1,
                "best_request_rate": rate,
                "best_request_rate_per_gpu": rate,
                "config_patch": {"env_patch": {}, "flag_patch": {}},
            }
        )
        # Strict ">" keeps the earliest trial on ties.
        if rate is not None and (best_rate is None or rate > best_rate):
            best_rate = rate
            best_trial_id = trial_id
    payload = {
        "study_id": study_id,
        "best_trial_id": best_trial_id,
        "best_request_rate": best_rate,
        "best_request_rate_per_gpu": best_rate,
        "next_trial_index": len(rates) + 1,
        "trials": trials,
    }
    (root / "state.json").write_text(json.dumps(payload), encoding="utf-8")
class TuningReportTests(unittest.TestCase):
    """Checks ``run_tuning_report`` against synthetic study state files."""

    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
        """Compare a two-trial harness study with a four-trial naive study.

        Both studies peak at a rate of 0.9; the harness fixture reaches it
        on its second trial, the naive fixture on its fourth (after one
        failed trial). The test asserts the per-case summary, the per-arm
        progress figures, the harness-vs-naive comparison, and that the
        report files are written under ``output_root``.
        """
        # Everything runs inside the temporary directory context because the
        # final assertions inspect files written beneath it.
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            _write_state(
                tmp_path / "studies" / "harness-study",
                study_id="harness-study",
                rates=[0.4, 0.9],
            )
            _write_state(
                tmp_path / "naive-study",
                study_id="naive-study",
                rates=[0.4, None, 0.7, 0.9],
            )

            # NOTE(review): the harness arm's study_root is the parent
            # "studies" directory, while the naive arm's is the study
            # directory itself -- presumably the two kinds are resolved
            # differently by the report code; confirm in aituner.tuning_report.
            spec = {
                "report_id": "report-1",
                "output_root": str(tmp_path / "out"),
                "target_fraction": 0.8,
                "cases": [
                    {
                        "case_id": "case-1",
                        "tags": ["model-a", "chat"],
                        "budgets": [1, 2, 4],
                        "arms": [
                            {
                                "name": "harness",
                                "kind": "harness",
                                "study_root": str(tmp_path / "studies"),
                            },
                            {
                                "name": "naive",
                                "kind": "naive",
                                "study_root": str(tmp_path / "naive-study"),
                            },
                        ],
                    }
                ],
            }
            spec_path = tmp_path / "report.json"
            spec_path.write_text(json.dumps(spec), encoding="utf-8")

            summary = run_tuning_report(spec_path)

            # Case-level results.
            case = summary["cases"][0]
            self.assertEqual(case["reference_best_per_gpu"], 0.9)
            self.assertEqual(case["winners"]["final_best"], "harness")
            self.assertEqual(case["winners"]["fastest_to_target"], "harness")

            # Per-arm results, in the order the arms appear in the spec.
            harness = case["arms"][0]
            naive = case["arms"][1]
            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
            self.assertEqual(case["target_fraction"], 0.8)
            # Presumably the target is target_fraction * reference best
            # (0.72 here), so the naive arm's 0.7 on trial 3 falls short and
            # it only qualifies on trial 4 -- TODO confirm against the
            # report implementation.
            self.assertEqual(harness["trials_to_target"], 2)
            self.assertEqual(naive["trials_to_target"], 4)
            self.assertEqual(naive["failed_count"], 1)

            # Harness-vs-naive comparison.
            comparison = case["harness_vs_naive"][0]
            self.assertTrue(comparison["passes"])
            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)

            # Report artifacts on disk.
            self.assertTrue((tmp_path / "out" / "summary.json").exists())
            self.assertTrue((tmp_path / "out" / "report.md").exists())
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    unittest.main()