Add tuning progress report for harness evaluation
This commit is contained in:
109
tests/test_tuning_report.py
Normal file
109
tests/test_tuning_report.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from aituner.tuning_report import run_tuning_report
|
||||
|
||||
|
||||
def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
    """Create ``root`` and write a synthetic study ``state.json`` into it.

    One trial record is produced per entry of ``rates``, with ids
    ``trial-0001``, ``trial-0002``, ... in input order. A ``None`` rate yields
    a ``"failed"`` trial; any other value yields a ``"completed"`` trial whose
    total and per-GPU best request rates both equal that value.

    The study-level best fields refer to the earliest trial holding the
    highest non-``None`` rate, and are all ``None`` when no trial has a rate.

    Args:
        root: Directory to create, including missing parents.
        study_id: Value stored under the ``"study_id"`` key.
        rates: Per-trial request rates; ``None`` marks a failed trial.

    Raises:
        FileExistsError: If ``root`` already exists (``mkdir`` is called
            without ``exist_ok``).
    """
    root.mkdir(parents=True)

    labelled = [
        (f"trial-{number:04d}", value)
        for number, value in enumerate(rates, start=1)
    ]
    records = [
        {
            "trial_id": label,
            "status": "failed" if value is None else "completed",
            "parallel_size": 1,
            "best_request_rate": value,
            "best_request_rate_per_gpu": value,
            "config_patch": {"env_patch": {}, "flag_patch": {}},
        }
        for label, value in labelled
    ]

    # max() keeps the first of several equal maxima, so ties resolve to the
    # earliest trial; the default covers the "no successful trial" case.
    scored = [pair for pair in labelled if pair[1] is not None]
    top_label, top_value = max(
        scored, key=lambda pair: pair[1], default=(None, None)
    )

    document = {
        "study_id": study_id,
        "best_trial_id": top_label,
        "best_request_rate": top_value,
        "best_request_rate_per_gpu": top_value,
        "next_trial_index": len(rates) + 1,
        "trials": records,
    }
    state_file = root / "state.json"
    state_file.write_text(json.dumps(document), encoding="utf-8")
|
||||
|
||||
|
||||
class TuningReportTests(unittest.TestCase):
    """Checks ``run_tuning_report`` against synthetic study state files."""

    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
        """Run a one-case report with a harness arm and a naive arm.

        The harness study has rates ``[0.4, 0.9]``; the naive study has
        ``[0.4, None, 0.7, 0.9]`` (one failed trial). The harness arm's
        ``study_root`` is the parent ``studies`` directory, while the naive
        arm's ``study_root`` is the study directory itself.

        The assertions pin the values read back from the returned summary and
        check that ``summary.json`` and ``report.md`` exist under the
        configured output root.
        """
        with tempfile.TemporaryDirectory() as scratch:
            base = Path(scratch)
            harness_parent = base / "studies"
            naive_root = base / "naive-study"
            out_dir = base / "out"

            _write_state(
                harness_parent / "harness-study",
                study_id="harness-study",
                rates=[0.4, 0.9],
            )
            _write_state(
                naive_root,
                study_id="naive-study",
                rates=[0.4, None, 0.7, 0.9],
            )

            # Assemble the report spec from named pieces; key order matches
            # the serialized layout the report is fed.
            harness_arm_spec = {
                "name": "harness",
                "kind": "harness",
                "study_root": str(harness_parent),
            }
            naive_arm_spec = {
                "name": "naive",
                "kind": "naive",
                "study_root": str(naive_root),
            }
            case_spec = {
                "case_id": "case-1",
                "tags": ["model-a", "chat"],
                "budgets": [1, 2, 4],
                "arms": [harness_arm_spec, naive_arm_spec],
            }
            report_spec = {
                "report_id": "report-1",
                "output_root": str(out_dir),
                "target_fraction": 0.8,
                "cases": [case_spec],
            }
            spec_path = base / "report.json"
            spec_path.write_text(json.dumps(report_spec), encoding="utf-8")

            summary = run_tuning_report(spec_path)

            case = summary["cases"][0]
            winners = case["winners"]
            self.assertEqual(case["reference_best_per_gpu"], 0.9)
            self.assertEqual(winners["final_best"], "harness")
            self.assertEqual(winners["fastest_to_target"], "harness")

            harness_result = case["arms"][0]
            naive_result = case["arms"][1]
            self.assertEqual(harness_result["best_at_budget"]["2"], 0.9)
            self.assertEqual(naive_result["best_at_budget"]["2"], 0.4)
            self.assertEqual(case["target_fraction"], 0.8)
            self.assertEqual(harness_result["trials_to_target"], 2)
            self.assertEqual(naive_result["trials_to_target"], 4)
            self.assertEqual(naive_result["failed_count"], 1)

            head_to_head = case["harness_vs_naive"][0]
            self.assertTrue(head_to_head["passes"])
            self.assertEqual(
                head_to_head["target_trial_speedup_vs_best_naive"], 2.0
            )

            # Both report artifacts must have been written to the output root.
            self.assertTrue((out_dir / "summary.json").exists())
            self.assertTrue((out_dir / "report.md").exists())
|
||||
|
||||
|
||||
# Script entry point: run this module's tests when the file is executed
# directly rather than imported.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user