Add tuning progress report for harness evaluation

This commit is contained in:
2026-06-21 00:48:21 +08:00
parent 426151bc9f
commit 488fae7e63
4 changed files with 752 additions and 0 deletions

View File

@@ -0,0 +1,26 @@
{
"report_id": "qwen27b-abl12-harness-vs-naive",
"output_root": "../../.aituner-reports/qwen27b-abl12-harness-vs-naive",
"target_fraction": 0.95,
"min_final_ratio": 0.98,
"cases": [
{
"case_id": "qwen27b-chat-0-8k-real-output",
"description": "12-trial harness-vs-naive ablation on the 0-8k chat window with real output lengths.",
"tags": ["qwen27b", "chat", "0-8k", "h20", "real-output"],
"budgets": [1, 2, 3, 4, 6, 8, 12],
"arms": [
{
"name": "harness",
"kind": "harness",
"study_root": "../../.aituner/abl12-harness/dash0-qwen27b-ablation-harness-on"
},
{
"name": "naive",
"kind": "naive",
"study_root": "../../.aituner/abl12-naive/dash0-qwen27b-ablation-naive-off"
}
]
}
]
}

36
scripts/tuning_report.py Normal file
View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
from pathlib import Path
from aituner.tuning_report import run_tuning_report
def main() -> int:
    """Run the tuning report named by ``--spec`` and print a short JSON digest.

    The digest contains the report id, the report root, and the headline
    aggregate counters taken from the summary returned by
    ``run_tuning_report``.

    Returns:
        ``0`` once the digest has been printed.
    """
    cli = argparse.ArgumentParser(
        description="Summarize anytime tuning progress across harness/naive study stores."
    )
    cli.add_argument("--spec", required=True, help="Path to a tuning report JSON spec.")
    options = cli.parse_args()

    report = run_tuning_report(Path(options.spec))

    # Built in two steps so the key order of the printed digest stays fixed.
    digest = {
        "report_id": report["report_id"],
        "report_root": report["report_root"],
    }
    totals = report["aggregate"]
    digest.update(
        {
            "case_count": totals["case_count"],
            "harness_vs_naive_pass_count": totals["harness_vs_naive_pass_count"],
            "harness_vs_naive_check_count": totals["harness_vs_naive_check_count"],
            "winner_counts": totals["winner_counts"],
        }
    )
    print(json.dumps(digest, ensure_ascii=False, indent=2))
    return 0
# Script entry point: main()'s return value is passed to SystemExit as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,581 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from .spec import SpecError, load_structured_file
from .store import StudyStore
# Trial-count checkpoints used when a case does not list its own "budgets".
DEFAULT_BUDGETS = [1, 2, 3, 4, 6, 8, 12]
# Fallback for "target_fraction": the target is reference best * this fraction.
DEFAULT_TARGET_FRACTION = 0.95
# Fallback for "min_final_ratio": minimum harness/best-naive final ratio that counts as a pass.
DEFAULT_MIN_FINAL_RATIO = 0.98
def run_tuning_report(spec_path: Path) -> dict[str, Any]:
    """Build the tuning report described by *spec_path* and persist it.

    The spec is loaded and validated, every case is summarized, and the
    result is written to ``summary.json`` and ``report.md`` under the
    resolved report root (which is created when missing).

    Args:
        spec_path: Location of the report spec file.

    Returns:
        The summary mapping that was written to ``summary.json``.
    """
    resolved_spec = spec_path.resolve()
    spec = _load_report_spec(resolved_spec)

    out_dir = _resolve_output_root(spec, spec_path=resolved_spec)
    out_dir.mkdir(parents=True, exist_ok=True)

    case_summaries = []
    for case_spec in spec["cases"]:
        case_summaries.append(_summarize_case(case_spec, spec_path=resolved_spec))

    summary: dict[str, Any] = {
        "report_id": spec["report_id"],
        "report_root": str(out_dir),
        "target_fraction": spec["target_fraction"],
        "min_final_ratio": spec["min_final_ratio"],
        "cases": case_summaries,
        "aggregate": _aggregate_cases(case_summaries),
    }

    StudyStore.write_json(out_dir / "summary.json", summary)
    markdown = _render_report(summary)
    (out_dir / "report.md").write_text(markdown, encoding="utf-8")
    return summary
def _load_report_spec(path: Path) -> dict[str, Any]:
    """Load the report spec at *path* and return it in normalized form.

    Args:
        path: Spec file handed to ``load_structured_file``.

    Returns:
        A mapping with ``report_id``, ``output_root`` (``None`` when blank or
        absent), ``target_fraction``, ``min_final_ratio`` and the normalized
        ``cases`` list.

    Raises:
        SpecError: If ``report_id`` is blank, ``cases`` is not a non-empty
            list, or either spec-level threshold is not positive.
    """
    document = dict(load_structured_file(path))

    report_id = str(document.get("report_id") or "").strip()
    if report_id == "":
        raise SpecError("report_id must be a non-empty string.")

    case_entries = document.get("cases")
    if not (isinstance(case_entries, list) and case_entries):
        raise SpecError("cases must be a non-empty list.")

    target_fraction = _as_float(
        document.get("target_fraction"), default=DEFAULT_TARGET_FRACTION
    )
    if target_fraction <= 0:
        raise SpecError("target_fraction must be positive.")

    min_final_ratio = _as_float(
        document.get("min_final_ratio"), default=DEFAULT_MIN_FINAL_RATIO
    )
    if min_final_ratio <= 0:
        raise SpecError("min_final_ratio must be positive.")

    # Spec-level thresholds act as defaults for cases that do not override them.
    cases = []
    for position, entry in enumerate(case_entries):
        cases.append(
            _load_case(
                entry,
                idx=position,
                default_target_fraction=target_fraction,
                default_min_final_ratio=min_final_ratio,
            )
        )

    output_root = str(document.get("output_root") or "").strip()
    return {
        "report_id": report_id,
        "output_root": output_root or None,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "cases": cases,
    }
def _load_case(
    raw: Any,
    *,
    idx: int,
    default_target_fraction: float,
    default_min_final_ratio: float,
) -> dict[str, Any]:
    """Validate one entry of the spec's ``cases`` list and normalize it.

    Args:
        raw: The raw case entry; must be a dict.
        idx: Position of the entry in ``cases``, used in error messages.
        default_target_fraction: Value used when the case has no
            ``target_fraction`` of its own.
        default_min_final_ratio: Value used when the case has no
            ``min_final_ratio`` of its own.

    Returns:
        A mapping with ``case_id``, ``description``, ``tags``, ``budgets``
        (sorted, de-duplicated), ``target_fraction``, ``min_final_ratio`` and
        the normalized ``arms``.

    Raises:
        SpecError: If the entry is not an object, ``case_id`` is blank,
            ``arms`` or ``budgets`` is not a non-empty list, arm names repeat,
            or the effective ``target_fraction`` / ``min_final_ratio`` is not
            positive.
    """
    context = f"cases[{idx}]"
    if not isinstance(raw, dict):
        raise SpecError(f"{context} must be an object.")

    case_id = str(raw.get("case_id") or "").strip()
    if not case_id:
        raise SpecError(f"{context}.case_id must be a non-empty string.")

    raw_arms = raw.get("arms")
    if not isinstance(raw_arms, list) or not raw_arms:
        raise SpecError(f"{context}.arms must be a non-empty list.")
    arms = [
        _load_arm(item, context=f"{context}.arms[{arm_idx}]")
        for arm_idx, item in enumerate(raw_arms)
    ]
    names = [item["name"] for item in arms]
    if len(names) != len(set(names)):
        raise SpecError(f"{context}.arms names must be unique.")

    raw_budgets = raw.get("budgets", DEFAULT_BUDGETS)
    if not isinstance(raw_budgets, list) or not raw_budgets:
        raise SpecError(f"{context}.budgets must be a non-empty list.")
    budgets = sorted(
        {_positive_int(item, context=f"{context}.budgets") for item in raw_budgets}
    )

    # Per-case overrides get the same positivity check as the spec-level
    # values; a non-positive fraction would make the derived target meaningless.
    target_fraction = _as_float(raw.get("target_fraction"), default=default_target_fraction)
    if target_fraction <= 0:
        raise SpecError(f"{context}.target_fraction must be positive.")
    min_final_ratio = _as_float(raw.get("min_final_ratio"), default=default_min_final_ratio)
    if min_final_ratio <= 0:
        raise SpecError(f"{context}.min_final_ratio must be positive.")

    # A non-list "tags" value is ignored rather than rejected.
    raw_tags = raw.get("tags", [])
    if isinstance(raw_tags, list):
        tags = [str(item).strip() for item in raw_tags if str(item).strip()]
    else:
        tags = []

    return {
        "case_id": case_id,
        "description": str(raw.get("description") or "").strip(),
        "tags": tags,
        "budgets": budgets,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "arms": arms,
    }
def _load_arm(raw: Any, *, context: str) -> dict[str, Any]:
    """Validate one arm entry and return its normalized form.

    Args:
        raw: The raw arm entry; must be a dict.
        context: Dotted location of the entry, used as the error prefix.

    Returns:
        A mapping with ``name``, ``kind`` (falls back to the name),
        ``study_root`` and ``label`` (falls back to the name).

    Raises:
        SpecError: If the entry is not an object, or ``name`` or
            ``study_root`` is blank.
    """
    if not isinstance(raw, dict):
        raise SpecError(f"{context} must be an object.")

    def text_field(key: str) -> str:
        # Missing and falsy values collapse to an empty string.
        return str(raw.get(key) or "").strip()

    name = text_field("name")
    if not name:
        raise SpecError(f"{context}.name must be a non-empty string.")

    kind = str(raw.get("kind") or name).strip()

    study_root = text_field("study_root")
    if not study_root:
        raise SpecError(f"{context}.study_root must be a non-empty string.")

    label = text_field("label") or name
    return {"name": name, "kind": kind, "study_root": study_root, "label": label}
def _resolve_output_root(spec: dict[str, Any], *, spec_path: Path) -> Path:
    """Return the directory the report files should be written to.

    A configured ``output_root`` is resolved against the spec file's
    directory; otherwise the root is ``.aituner-reports/<report_id>``
    resolved against the current working directory.
    """
    configured = spec.get("output_root")
    if not configured:
        fallback = Path(".aituner-reports") / str(spec["report_id"])
        return fallback.resolve()
    return _resolve_path(str(configured), base_dir=spec_path.parent)
def _summarize_case(case: dict[str, Any], *, spec_path: Path) -> dict[str, Any]:
    """Summarize every arm of *case* and compare the arms with each other.

    Args:
        case: A normalized case as produced by ``_load_case``.
        spec_path: Path of the spec file; arm study roots are resolved
            relative to its directory.

    Returns:
        The case summary: the case settings, the reference best, the widest
        budget, the per-arm summaries, the winners, the harness-vs-naive rows
        and any warnings.
    """
    budgets = case["budgets"]
    arm_rows = [
        _summarize_arm(arm_spec, budgets=budgets, spec_path=spec_path)
        for arm_spec in case["arms"]
    ]

    reference = _reference_best(arm_rows)
    # The AUC window spans the widest budget or the longest study, whichever is larger.
    trial_counts = [row["trial_count"] for row in arm_rows]
    max_budget = max(budgets + trial_counts)

    for row in arm_rows:
        _add_reference_metrics(
            row,
            reference=reference,
            max_budget=max_budget,
            target_fraction=case["target_fraction"],
        )

    winners = _case_winners(arm_rows)
    comparison = _harness_vs_naive(arm_rows, min_final_ratio=case["min_final_ratio"])
    warnings = _case_warnings(case, arm_rows, comparison)

    return {
        "case_id": case["case_id"],
        "description": case["description"],
        "tags": case["tags"],
        "budgets": budgets,
        "target_fraction": case["target_fraction"],
        "min_final_ratio": case["min_final_ratio"],
        "reference_best_per_gpu": reference,
        "max_budget": max_budget,
        "arms": arm_rows,
        "winners": winners,
        "harness_vs_naive": comparison,
        "warnings": warnings,
    }
def _summarize_arm(arm: dict[str, Any], *, budgets: list[int], spec_path: Path) -> dict[str, Any]:
    """Read an arm's ``state.json`` and summarize its tuning progress.

    Args:
        arm: A normalized arm as produced by ``_load_arm``.
        budgets: Trial counts at which the running best is sampled.
        spec_path: Path of the spec file; the arm's study root is resolved
            relative to its directory.

    Returns:
        The arm summary: identity fields, trial counters, the running-best
        curve, its value at each budget, and the stop reason/diagnosis
        recorded in the state.

    Raises:
        SpecError: If the study root cannot be resolved or ``state.json``
            does not contain a JSON object.
    """
    study_root = _resolve_study_root(arm["study_root"], base_dir=spec_path.parent)
    state = json.loads((study_root / "state.json").read_text(encoding="utf-8"))
    if not isinstance(state, dict):
        raise SpecError(f"state.json must contain a JSON object: {study_root}")

    raw_trials = state.get("trials")
    trials = raw_trials if isinstance(raw_trials, list) else []

    curve = _running_best_curve(trials)
    final_best = curve[-1] if curve else None
    best_trial_index = _first_index_at_value(curve, final_best)

    # Malformed (non-dict) trial entries still count as trials, matching the
    # running-best curve, but carry no status and no feasible rate.
    statuses = [item.get("status") if isinstance(item, dict) else None for item in trials]
    rates = [
        item.get("best_request_rate_per_gpu") if isinstance(item, dict) else None
        for item in trials
    ]

    return {
        "name": arm["name"],
        "kind": arm["kind"],
        "label": arm["label"],
        "study_root": str(study_root),
        "study_id": state.get("study_id"),
        "trial_count": len(trials),
        "completed_count": sum(1 for status in statuses if status == "completed"),
        "failed_count": sum(1 for status in statuses if status == "failed"),
        "no_feasible_count": sum(1 for rate in rates if not isinstance(rate, (int, float))),
        "best_trial_id": state.get("best_trial_id"),
        "best_trial_index": best_trial_index,
        "final_best_per_gpu": final_best,
        "state_best_per_gpu": state.get("best_request_rate_per_gpu"),
        "best_at_budget": {str(budget): _value_at_budget(curve, budget) for budget in budgets},
        "running_best_per_gpu": curve,
        "stop_reason": str(state.get("tuning_stop_reason") or ""),
        "stop_diagnosis": str(state.get("tuning_stop_diagnosis") or ""),
    }
def _add_reference_metrics(
    arm: dict[str, Any],
    *,
    reference: float | None,
    max_budget: int,
    target_fraction: float,
) -> None:
    """Attach reference-relative metrics to *arm* in place.

    Sets ``final_ratio_to_reference``, ``target_per_gpu``,
    ``trials_to_target`` and ``normalized_auc``. The ratio and the target are
    ``None`` when *reference* is ``None`` or zero.

    Args:
        arm: Arm summary to extend; must hold ``running_best_per_gpu``.
        reference: Best final value across the case's arms, if any.
        max_budget: Number of trials the AUC is normalized over.
        target_fraction: Fraction of *reference* that defines the target.
    """
    final_best = arm.get("final_best_per_gpu")
    if reference and isinstance(final_best, (int, float)):
        ratio = float(final_best) / reference
    else:
        ratio = None
    arm["final_ratio_to_reference"] = ratio

    target = None
    if reference:
        target = reference * target_fraction
    arm["target_per_gpu"] = target

    curve = arm["running_best_per_gpu"]
    arm["trials_to_target"] = _trials_to_target(curve, target)
    arm["normalized_auc"] = _normalized_auc(curve, reference=reference, max_budget=max_budget)
def _harness_vs_naive(arms: list[dict[str, Any]], *, min_final_ratio: float) -> list[dict[str, Any]]:
    """Compare every harness arm with the best naive arm of the same case.

    Args:
        arms: Arm summaries that already carry the reference metrics.
        min_final_ratio: Minimum harness/best-naive final ratio for a pass.

    Returns:
        One row per harness arm, or an empty list when the case lacks either
        a naive or a harness arm. A row passes when the final ratio meets
        *min_final_ratio* and the target speedup is unknown or at least 1.0.
    """
    naive_arms = [arm for arm in arms if arm["kind"] == "naive"]
    harness_arms = [arm for arm in arms if arm["kind"] == "harness"]
    if not (naive_arms and harness_arms):
        return []

    best_naive_final = _max_optional(arm.get("final_best_per_gpu") for arm in naive_arms)
    best_naive_ttt = _min_optional(arm.get("trials_to_target") for arm in naive_arms)
    best_naive_auc = _max_optional(arm.get("normalized_auc") for arm in naive_arms)

    def ratio_to(value: Any, baseline: float | None) -> float | None:
        # Undefined when the baseline is missing/zero or the value is not numeric.
        if baseline and isinstance(value, (int, float)):
            return float(value) / baseline
        return None

    rows = []
    for candidate in harness_arms:
        final_ratio = ratio_to(candidate.get("final_best_per_gpu"), best_naive_final)
        auc_ratio = ratio_to(candidate.get("normalized_auc"), best_naive_auc)
        speedup = _speedup(best_naive_ttt, candidate.get("trials_to_target"))

        pass_final = final_ratio is not None and final_ratio >= min_final_ratio
        pass_speed = speedup is None or speedup >= 1.0

        rows.append(
            {
                "harness": candidate["name"],
                "best_naive_final_per_gpu": best_naive_final,
                "best_naive_trials_to_target": best_naive_ttt,
                "best_naive_normalized_auc": best_naive_auc,
                "final_ratio_vs_best_naive": final_ratio,
                "target_trial_speedup_vs_best_naive": speedup,
                "auc_ratio_vs_best_naive": auc_ratio,
                "passes_min_final_ratio": pass_final,
                "passes_speed": pass_speed,
                "passes": pass_final and pass_speed,
            }
        )
    return rows
def _case_winners(arms: list[dict[str, Any]]) -> dict[str, str | None]:
    """Name the winning arm per metric; ``None`` when no arm has a value.

    Final best and normalized AUC are won by the highest value, fastest to
    target by the lowest trial count.
    """
    winners: dict[str, str | None] = {}
    winners["final_best"] = _argmax(arms, "final_best_per_gpu")
    winners["fastest_to_target"] = _argmin(arms, "trials_to_target")
    winners["normalized_auc"] = _argmax(arms, "normalized_auc")
    return winners
def _aggregate_cases(cases: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll the per-case summaries up into report-wide totals.

    Args:
        cases: Case summaries as produced by ``_summarize_case``.

    Returns:
        A mapping with ``case_count``, per-kind statistics under ``by_kind``,
        per-metric win tallies under ``winner_counts``, and the number of
        harness-vs-naive rows that passed versus were checked.
    """
    winner_counts: dict[str, dict[str, int]] = {
        "final_best": {},
        "fastest_to_target": {},
        "normalized_auc": {},
    }
    # Working state per arm kind, kept in first-seen order.
    kind_stats: dict[str, dict[str, Any]] = {}
    pass_count = 0
    check_count = 0

    for case in cases:
        for metric, tally in winner_counts.items():
            winner = case["winners"].get(metric)
            if winner:
                tally[winner] = tally.get(winner, 0) + 1

        comparison_rows = case["harness_vs_naive"]
        check_count += len(comparison_rows)
        pass_count += sum(1 for row in comparison_rows if row["passes"])

        for arm in case["arms"]:
            stats = kind_stats.get(arm["kind"])
            if stats is None:
                stats = {"arms": 0, "reached": 0, "ratios": [], "aucs": []}
                kind_stats[arm["kind"]] = stats
            stats["arms"] += 1

            ratio = arm.get("final_ratio_to_reference")
            if isinstance(ratio, (int, float)):
                stats["ratios"].append(float(ratio))
            auc = arm.get("normalized_auc")
            if isinstance(auc, (int, float)):
                stats["aucs"].append(float(auc))
            if isinstance(arm.get("trials_to_target"), int):
                stats["reached"] += 1

    by_kind = {
        kind: {
            "arm_count": stats["arms"],
            "mean_final_ratio_to_reference": _mean(stats["ratios"]),
            "mean_normalized_auc": _mean(stats["aucs"]),
            "target_reached_count": stats["reached"],
        }
        for kind, stats in kind_stats.items()
    }

    return {
        "case_count": len(cases),
        "by_kind": by_kind,
        "winner_counts": winner_counts,
        "harness_vs_naive_pass_count": pass_count,
        "harness_vs_naive_check_count": check_count,
    }
def _case_warnings(
    case: dict[str, Any],
    arms: list[dict[str, Any]],
    comparison: list[dict[str, Any]],
) -> list[str]:
    """Collect human-readable caveats for one case.

    Warns when the case lacks a harness or a naive arm, when it has fewer
    than two tags, and once per failed check on each comparison row.
    """
    notes: list[str] = []

    kinds_present = {arm["kind"] for arm in arms}
    if not {"harness", "naive"} <= kinds_present:
        notes.append("case does not include both harness and naive arms")

    if len(case["tags"]) < 2:
        notes.append("case has few tags; add workload/model/SLO tags to support generalization claims")

    for row in comparison or []:
        if not row["passes_min_final_ratio"]:
            notes.append(f"{row['harness']} final best is below min_final_ratio versus best naive")
        if not row["passes_speed"]:
            notes.append(f"{row['harness']} reaches target later than best naive")
    return notes
def _running_best_curve(trials: list[Any]) -> list[float | None]:
    """Return the best-so-far ``best_request_rate_per_gpu`` after each trial.

    The result has exactly one entry per element of *trials*. An entry is
    ``None`` until the first usable rate has been seen. Entries that are not
    dicts, and rates that are missing, non-numeric, boolean, or non-finite
    (NaN / infinity), leave the incumbent unchanged.

    Args:
        trials: Trial records in execution order.

    Returns:
        The running maximum, as floats, aligned with *trials*.
    """
    import math  # local import keeps the block self-contained

    curve: list[float | None] = []
    incumbent: float | None = None
    for trial in trials:
        rate = trial.get("best_request_rate_per_gpu") if isinstance(trial, dict) else None
        # bool is an int subclass, and a NaN incumbent could never be beaten
        # because every comparison with NaN is False; skip both.
        usable = (
            isinstance(rate, (int, float))
            and not isinstance(rate, bool)
            and math.isfinite(rate)
        )
        if usable and (incumbent is None or float(rate) > incumbent):
            incumbent = float(rate)
        curve.append(incumbent)
    return curve
def _value_at_budget(curve: list[float | None], budget: int) -> float | None:
    """Return the running best after *budget* trials.

    The budget is clamped to ``[1, len(curve)]``, so a budget past the end of
    the study yields the final value. An empty curve yields ``None``.
    """
    if not curve:
        return None
    trial_number = budget if budget >= 1 else 1
    if trial_number > len(curve):
        trial_number = len(curve)
    return curve[trial_number - 1]
def _trials_to_target(curve: list[float | None], target: float | None) -> int | None:
    """Return the 1-based trial at which the running best first reaches *target*.

    Returns ``None`` when there is no target or the curve never reaches it.
    """
    if target is None:
        return None
    reached = (
        position
        for position, best in enumerate(curve, start=1)
        if isinstance(best, (int, float)) and best >= target
    )
    return next(reached, None)
def _normalized_auc(
    curve: list[float | None],
    *,
    reference: float | None,
    max_budget: int,
) -> float | None:
    """Return the area under the running-best curve, scaled by the reference.

    The running best is sampled at budgets ``1..max_budget`` (``None`` counts
    as zero) and the sum is divided by ``reference * max_budget``.

    Returns:
        The normalized area, or ``None`` when *reference* is ``None`` or
        zero, or *max_budget* is not positive.
    """
    if not reference or max_budget <= 0:
        return None

    area = 0.0
    for budget in range(1, max_budget + 1):
        best = _value_at_budget(curve, budget)
        if isinstance(best, (int, float)):
            area += float(best)
    return area / (reference * max_budget)
def _reference_best(arms: list[dict[str, Any]]) -> float | None:
    """Return the highest ``final_best_per_gpu`` across *arms*, if any arm has one."""
    finals = (arm.get("final_best_per_gpu") for arm in arms)
    return _max_optional(finals)
def _resolve_study_root(raw_path: str, *, base_dir: Path) -> Path:
    """Locate the study directory that holds ``state.json``.

    *raw_path* may point at the study directory itself, or at a parent with
    exactly one immediate child directory containing ``state.json``.

    Raises:
        SpecError: If no ``state.json`` is found, or several child
            directories contain one.
    """
    candidate = _resolve_path(raw_path, base_dir=base_dir)
    if (candidate / "state.json").exists():
        return candidate

    nested = sorted(candidate.glob("*/state.json"))
    if not nested:
        raise SpecError(f"study_root does not contain state.json: {candidate}")
    if len(nested) > 1:
        raise SpecError(f"study_root is ambiguous; point to a specific study directory: {candidate}")
    return nested[0].parent
def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
    """Turn *raw_path* into a path, anchoring relative paths at *base_dir*.

    Absolute paths are returned as given; relative paths are joined onto
    *base_dir* and resolved.
    """
    candidate = Path(raw_path)
    if candidate.is_absolute():
        return candidate
    return (base_dir / candidate).resolve()
def _as_float(value: Any, *, default: float) -> float:
    """Coerce a spec value to ``float``, using *default* when it is ``None``.

    Raises:
        SpecError: If *value* is a bool or is not an int/float.
    """
    if value is None:
        return default
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if not is_number:
        raise SpecError(f"Expected numeric value, got {value!r}.")
    return float(value)
def _positive_int(value: Any, *, context: str) -> int:
    """Return *value* unchanged when it is a positive, non-bool integer.

    Raises:
        SpecError: Otherwise, with *context* as the message prefix.
    """
    acceptable = isinstance(value, int) and not isinstance(value, bool) and value > 0
    if acceptable:
        return value
    raise SpecError(f"{context} must contain positive integers.")
def _first_index_at_value(curve: list[float | None], value: float | None) -> int | None:
    """Return the 1-based position of the first curve entry equal to *value*.

    Returns ``None`` when *value* is ``None`` or does not occur in *curve*.
    """
    if value is None:
        return None
    matches = (position for position, item in enumerate(curve, start=1) if item == value)
    return next(matches, None)
def _argmax(rows: list[dict[str, Any]], key: str) -> str | None:
    """Return the ``name`` of the row with the largest numeric ``row[key]``.

    Rows whose value is missing or not an int/float are ignored. On a tie the
    earliest row wins. Returns ``None`` when no row qualifies.
    """
    candidates = [
        (str(row["name"]), float(row[key]))
        for row in rows
        if isinstance(row.get(key), (int, float))
    ]
    if not candidates:
        return None
    # max() keeps the first of several equal maxima.
    winner_name, _ = max(candidates, key=lambda pair: pair[1])
    return winner_name
def _argmin(rows: list[dict[str, Any]], key: str) -> str | None:
    """Return the ``name`` of the row with the smallest integer ``row[key]``.

    Rows whose value is missing or not an int are ignored. On a tie the
    earliest row wins. Returns ``None`` when no row qualifies.
    """
    candidates = [
        (str(row["name"]), int(row[key]))
        for row in rows
        if isinstance(row.get(key), int)
    ]
    if not candidates:
        return None
    # min() keeps the first of several equal minima.
    winner_name, _ = min(candidates, key=lambda pair: pair[1])
    return winner_name
def _max_optional(values: Any) -> float | None:
    """Return the largest int/float in *values* as a float, or ``None`` if there is none."""
    numeric = (float(item) for item in values if isinstance(item, (int, float)))
    return max(numeric, default=None)
def _min_optional(values: Any) -> int | None:
    """Return the smallest int in *values*, or ``None`` if there is none."""
    integers = (int(item) for item in values if isinstance(item, int))
    return min(integers, default=None)
def _mean(values: list[float]) -> float | None:
    """Return the arithmetic mean of *values*, or ``None`` for an empty list."""
    if not values:
        return None
    return sum(values) / len(values)
def _speedup(naive_trials: int | None, harness_trials: int | None) -> float | None:
    """Return how many times faster the harness reached the target.

    Returns:
        ``naive_trials / harness_trials`` when both are known; ``0.0`` when
        only the naive arm reached the target; ``None`` when the naive arm
        never reached it or *harness_trials* is not positive.
    """
    if naive_trials is None:
        # No naive baseline means there is nothing to compare against.
        return None
    if harness_trials is None:
        return 0.0
    if harness_trials <= 0:
        return None
    return float(naive_trials) / float(harness_trials)
def _fmt(value: Any) -> str:
    """Format a report cell: ``-`` for ``None``, four decimals for floats, ``str()`` otherwise."""
    if value is None:
        return "-"
    if isinstance(value, float):
        return format(value, ".4f")
    return str(value)
def _render_report(summary: dict[str, Any]) -> str:
    """Render the report summary as a Markdown document.

    The document has an aggregate section, a per-kind table, and one section
    per case holding the arm table and, when the case has comparison rows,
    the harness-vs-naive table.

    Args:
        summary: The mapping built by ``run_tuning_report``.

    Returns:
        The Markdown text, with lines joined by ``"\\n"``.
    """

    def table_row(cells: list[str]) -> str:
        # One Markdown table row with a leading and a trailing pipe.
        return "| " + " | ".join(cells) + " |"

    def as_json(payload: Any) -> str:
        return json.dumps(payload, ensure_ascii=False)

    aggregate = summary["aggregate"]
    out = [
        f"# {summary['report_id']}",
        "",
        "## Aggregate",
        "",
        f"- Cases: `{aggregate['case_count']}`",
        f"- Harness-vs-naive pass/checks: `{aggregate['harness_vs_naive_pass_count']}`/`{aggregate['harness_vs_naive_check_count']}`",
        f"- Winner counts: `{as_json(aggregate['winner_counts'])}`",
        "",
        "## By Kind",
        "",
        "| Kind | Arms | Mean final/ref | Mean AUC | Target reached |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]

    for kind, stats in sorted(aggregate["by_kind"].items()):
        out.append(
            table_row(
                [
                    f"`{kind}`",
                    str(stats["arm_count"]),
                    _fmt(stats["mean_final_ratio_to_reference"]),
                    _fmt(stats["mean_normalized_auc"]),
                    str(stats["target_reached_count"]),
                ]
            )
        )

    out += ["", "## Cases", ""]

    for case in summary["cases"]:
        out.append(f"### {case['case_id']}")
        out.append("")
        out.append(f"- Reference best req/s/GPU: `{_fmt(case['reference_best_per_gpu'])}`")
        out.append(f"- Target fraction: `{case['target_fraction']}`")
        out.append(f"- Winners: `{as_json(case['winners'])}`")
        if case["warnings"]:
            out.append(f"- Warnings: `{as_json(case['warnings'])}`")

        out.append("")
        out.append("| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |")
        out.append("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |")
        for arm in case["arms"]:
            out.append(
                table_row(
                    [
                        f"`{arm['name']}`",
                        f"`{arm['kind']}`",
                        str(arm["trial_count"]),
                        _fmt(arm["final_best_per_gpu"]),
                        _fmt(arm["final_ratio_to_reference"]),
                        _fmt(arm["trials_to_target"]),
                        _fmt(arm["normalized_auc"]),
                        str(arm["failed_count"]),
                        str(arm["no_feasible_count"]),
                    ]
                )
            )

        comparison_rows = case["harness_vs_naive"]
        if comparison_rows:
            out.append("")
            out.append("| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |")
            out.append("| --- | ---: | ---: | ---: | --- |")
            for row in comparison_rows:
                out.append(
                    table_row(
                        [
                            f"`{row['harness']}`",
                            _fmt(row["final_ratio_vs_best_naive"]),
                            _fmt(row["target_trial_speedup_vs_best_naive"]),
                            _fmt(row["auc_ratio_vs_best_naive"]),
                            f"`{row['passes']}`",
                        ]
                    )
                )

        # A blank line closes each case section.
        out.append("")

    return "\n".join(out)

109
tests/test_tuning_report.py Normal file
View File

@@ -0,0 +1,109 @@
from __future__ import annotations
import json
import tempfile
import unittest
from pathlib import Path
from aituner.tuning_report import run_tuning_report
def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
    """Create *root* and write a synthetic ``state.json`` with one trial per rate.

    A ``None`` rate produces a ``failed`` trial; any other rate produces a
    ``completed`` trial. The recorded best is the first trial holding the
    highest rate.
    """
    root.mkdir(parents=True)

    trials = [
        {
            "trial_id": f"trial-{number:04d}",
            "status": "failed" if rate is None else "completed",
            "parallel_size": 1,
            "best_request_rate": rate,
            "best_request_rate_per_gpu": rate,
            "config_patch": {"env_patch": {}, "flag_patch": {}},
        }
        for number, rate in enumerate(rates, start=1)
    ]

    # max() keeps the earliest trial among equal rates.
    feasible = [trial for trial in trials if trial["best_request_rate"] is not None]
    best = max(feasible, key=lambda trial: trial["best_request_rate"], default=None)
    best_trial_id = None if best is None else best["trial_id"]
    best_rate = None if best is None else best["best_request_rate"]

    state = {
        "study_id": study_id,
        "best_trial_id": best_trial_id,
        "best_request_rate": best_rate,
        "best_request_rate_per_gpu": best_rate,
        "next_trial_index": len(rates) + 1,
        "trials": trials,
    }
    (root / "state.json").write_text(json.dumps(state), encoding="utf-8")
class TuningReportTests(unittest.TestCase):
    """End-to-end checks for ``run_tuning_report`` on synthetic study stores."""

    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
        """A harness that hits the target in 2 trials beats a naive arm needing 4."""
        with tempfile.TemporaryDirectory() as tmp:
            workdir = Path(tmp)

            # The harness arm's study_root below names the parent directory,
            # while the naive arm's names the study directory itself.
            _write_state(
                workdir / "studies" / "harness-study",
                study_id="harness-study",
                rates=[0.4, 0.9],
            )
            _write_state(
                workdir / "naive-study",
                study_id="naive-study",
                rates=[0.4, None, 0.7, 0.9],
            )

            harness_arm = {
                "name": "harness",
                "kind": "harness",
                "study_root": str(workdir / "studies"),
            }
            naive_arm = {
                "name": "naive",
                "kind": "naive",
                "study_root": str(workdir / "naive-study"),
            }
            spec = {
                "report_id": "report-1",
                "output_root": str(workdir / "out"),
                "target_fraction": 0.8,
                "cases": [
                    {
                        "case_id": "case-1",
                        "tags": ["model-a", "chat"],
                        "budgets": [1, 2, 4],
                        "arms": [harness_arm, naive_arm],
                    }
                ],
            }
            spec_path = workdir / "report.json"
            spec_path.write_text(json.dumps(spec), encoding="utf-8")

            summary = run_tuning_report(spec_path)

            case = summary["cases"][0]
            self.assertEqual(case["reference_best_per_gpu"], 0.9)
            self.assertEqual(case["winners"]["final_best"], "harness")
            self.assertEqual(case["winners"]["fastest_to_target"], "harness")

            harness, naive = case["arms"][0], case["arms"][1]
            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
            self.assertEqual(case["target_fraction"], 0.8)
            self.assertEqual(harness["trials_to_target"], 2)
            self.assertEqual(naive["trials_to_target"], 4)
            self.assertEqual(naive["failed_count"], 1)

            comparison = case["harness_vs_naive"][0]
            self.assertTrue(comparison["passes"])
            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)

            # Both report artifacts must exist under the configured output root.
            out_dir = workdir / "out"
            self.assertTrue((out_dir / "summary.json").exists())
            self.assertTrue((out_dir / "report.md").exists())
# Run the unittest runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()