Add tuning progress report for harness evaluation
This commit is contained in:
26
configs/examples/tuning_report.example.json
Normal file
26
configs/examples/tuning_report.example.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"report_id": "qwen27b-abl12-harness-vs-naive",
|
||||
"output_root": "../../.aituner-reports/qwen27b-abl12-harness-vs-naive",
|
||||
"target_fraction": 0.95,
|
||||
"min_final_ratio": 0.98,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "qwen27b-chat-0-8k-real-output",
|
||||
"description": "12-trial harness-vs-naive ablation on the 0-8k chat window with real output lengths.",
|
||||
"tags": ["qwen27b", "chat", "0-8k", "h20", "real-output"],
|
||||
"budgets": [1, 2, 3, 4, 6, 8, 12],
|
||||
"arms": [
|
||||
{
|
||||
"name": "harness",
|
||||
"kind": "harness",
|
||||
"study_root": "../../.aituner/abl12-harness/dash0-qwen27b-ablation-harness-on"
|
||||
},
|
||||
{
|
||||
"name": "naive",
|
||||
"kind": "naive",
|
||||
"study_root": "../../.aituner/abl12-naive/dash0-qwen27b-ablation-naive-off"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
36
scripts/tuning_report.py
Normal file
36
scripts/tuning_report.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from aituner.tuning_report import run_tuning_report
|
||||
|
||||
|
||||
def main() -> int:
    """Run the tuning report named by ``--spec`` and print a JSON digest.

    The digest printed to stdout contains the report id, the report root and
    the aggregate counters taken from the summary returned by
    ``run_tuning_report``.

    Returns:
        ``0`` after the digest has been printed.
    """
    cli = argparse.ArgumentParser(
        description="Summarize anytime tuning progress across harness/naive study stores."
    )
    cli.add_argument("--spec", required=True, help="Path to a tuning report JSON spec.")
    options = cli.parse_args()

    summary = run_tuning_report(Path(options.spec))

    digest = {
        "report_id": summary["report_id"],
        "report_root": summary["report_root"],
    }
    aggregate = summary["aggregate"]
    for field in (
        "case_count",
        "harness_vs_naive_pass_count",
        "harness_vs_naive_check_count",
        "winner_counts",
    ):
        digest[field] = aggregate[field]

    print(json.dumps(digest, ensure_ascii=False, indent=2))
    return 0
|
||||
|
||||
|
||||
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
581
src/aituner/tuning_report.py
Normal file
581
src/aituner/tuning_report.py
Normal file
@@ -0,0 +1,581 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .spec import SpecError, load_structured_file
|
||||
from .store import StudyStore
|
||||
|
||||
|
||||
# Trial budgets used for a case whose spec entry has no "budgets" key.
# Kept as a list because the case loader validates budgets with isinstance(..., list).
DEFAULT_BUDGETS = [1, 2, 3, 4, 6, 8, 12]
# Report-level fallback when the spec omits "target_fraction".
DEFAULT_TARGET_FRACTION = 0.95
# Report-level fallback when the spec omits "min_final_ratio".
DEFAULT_MIN_FINAL_RATIO = 0.98
|
||||
|
||||
|
||||
def run_tuning_report(spec_path: Path) -> dict[str, Any]:
    """Build the tuning report described by ``spec_path`` and persist it.

    The spec is loaded and validated, each case is summarized, and the
    resulting summary is passed to ``StudyStore.write_json`` for
    ``summary.json`` and rendered to ``report.md``. Both files live under the
    report root, which is created when missing.

    Args:
        spec_path: Location of the report spec file.

    Returns:
        The summary dictionary (report metadata, per-case summaries and the
        aggregate section).
    """
    resolved_spec = spec_path.resolve()
    spec = _load_report_spec(resolved_spec)

    out_dir = _resolve_output_root(spec, spec_path=resolved_spec)
    out_dir.mkdir(parents=True, exist_ok=True)

    case_summaries = []
    for case in spec["cases"]:
        case_summaries.append(_summarize_case(case, spec_path=resolved_spec))

    summary = {
        "report_id": spec["report_id"],
        "report_root": str(out_dir),
        "target_fraction": spec["target_fraction"],
        "min_final_ratio": spec["min_final_ratio"],
        "cases": case_summaries,
        "aggregate": _aggregate_cases(case_summaries),
    }

    StudyStore.write_json(out_dir / "summary.json", summary)
    markdown = _render_report(summary)
    (out_dir / "report.md").write_text(markdown, encoding="utf-8")
    return summary
|
||||
|
||||
|
||||
def _load_report_spec(path: Path) -> dict[str, Any]:
    """Load the report spec at ``path`` and return its validated form.

    Args:
        path: Spec file handed to ``load_structured_file``.

    Returns:
        A dict with ``report_id``, ``output_root`` (``None`` when blank or
        absent), ``target_fraction``, ``min_final_ratio`` and the list of
        validated ``cases``.

    Raises:
        SpecError: If ``report_id`` is blank, ``cases`` is not a non-empty
            list, a threshold is non-numeric or not positive, or any case
            fails validation.
    """
    payload = dict(load_structured_file(path))

    report_id = str(payload.get("report_id") or "").strip()
    if report_id == "":
        raise SpecError("report_id must be a non-empty string.")

    raw_cases = payload.get("cases")
    if not (isinstance(raw_cases, list) and raw_cases):
        raise SpecError("cases must be a non-empty list.")

    target_fraction = _as_float(payload.get("target_fraction"), default=DEFAULT_TARGET_FRACTION)
    if target_fraction <= 0:
        raise SpecError("target_fraction must be positive.")

    min_final_ratio = _as_float(payload.get("min_final_ratio"), default=DEFAULT_MIN_FINAL_RATIO)
    if min_final_ratio <= 0:
        raise SpecError("min_final_ratio must be positive.")

    # Report-level thresholds become the defaults for every case.
    loaded_cases = []
    for position, entry in enumerate(raw_cases):
        loaded_cases.append(
            _load_case(
                entry,
                idx=position,
                default_target_fraction=target_fraction,
                default_min_final_ratio=min_final_ratio,
            )
        )

    output_root = str(payload.get("output_root") or "").strip()
    return {
        "report_id": report_id,
        "output_root": output_root or None,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "cases": loaded_cases,
    }
|
||||
|
||||
|
||||
def _load_case(
    raw: Any,
    *,
    idx: int,
    default_target_fraction: float,
    default_min_final_ratio: float,
) -> dict[str, Any]:
    """Validate one entry of the spec's ``cases`` list.

    Args:
        raw: The raw case entry; must be a dict.
        idx: Position of the entry in ``cases``, used in error messages.
        default_target_fraction: Value used when the case has no
            ``target_fraction`` of its own.
        default_min_final_ratio: Value used when the case has no
            ``min_final_ratio`` of its own.

    Returns:
        A dict with ``case_id``, ``description``, ``tags``, ``budgets``
        (sorted, de-duplicated), ``target_fraction``, ``min_final_ratio`` and
        the validated ``arms``.

    Raises:
        SpecError: If the entry is not an object, ``case_id`` is blank,
            ``arms`` or ``budgets`` is not a non-empty list, arm names repeat,
            a budget is not a positive integer, or a per-case threshold is
            non-numeric or not positive.
    """
    if not isinstance(raw, dict):
        raise SpecError(f"cases[{idx}] must be an object.")

    case_id = str(raw.get("case_id") or "").strip()
    if not case_id:
        raise SpecError(f"cases[{idx}].case_id must be a non-empty string.")

    raw_arms = raw.get("arms")
    if not isinstance(raw_arms, list) or not raw_arms:
        raise SpecError(f"cases[{idx}].arms must be a non-empty list.")
    arms = [
        _load_arm(item, context=f"cases[{idx}].arms[{arm_idx}]")
        for arm_idx, item in enumerate(raw_arms)
    ]
    names = [item["name"] for item in arms]
    if len(names) != len(set(names)):
        raise SpecError(f"cases[{idx}].arms names must be unique.")

    raw_budgets = raw.get("budgets", DEFAULT_BUDGETS)
    if not isinstance(raw_budgets, list) or not raw_budgets:
        raise SpecError(f"cases[{idx}].budgets must be a non-empty list.")
    budgets = sorted(
        {_positive_int(item, context=f"cases[{idx}].budgets") for item in raw_budgets}
    )

    # Per-case overrides get the same positivity check as the report-level
    # values, so a zero or negative override cannot slip through.
    target_fraction = _as_float(raw.get("target_fraction"), default=default_target_fraction)
    if target_fraction <= 0:
        raise SpecError(f"cases[{idx}].target_fraction must be positive.")
    min_final_ratio = _as_float(raw.get("min_final_ratio"), default=default_min_final_ratio)
    if min_final_ratio <= 0:
        raise SpecError(f"cases[{idx}].min_final_ratio must be positive.")

    # A non-list "tags" value is treated as no tags; blank tags are dropped.
    raw_tags = raw.get("tags", [])
    if isinstance(raw_tags, list):
        tags = [str(item).strip() for item in raw_tags if str(item).strip()]
    else:
        tags = []

    return {
        "case_id": case_id,
        "description": str(raw.get("description") or "").strip(),
        "tags": tags,
        "budgets": budgets,
        "target_fraction": target_fraction,
        "min_final_ratio": min_final_ratio,
        "arms": arms,
    }
|
||||
|
||||
|
||||
def _load_arm(raw: Any, *, context: str) -> dict[str, Any]:
|
||||
if not isinstance(raw, dict):
|
||||
raise SpecError(f"{context} must be an object.")
|
||||
name = str(raw.get("name") or "").strip()
|
||||
if not name:
|
||||
raise SpecError(f"{context}.name must be a non-empty string.")
|
||||
kind = str(raw.get("kind") or name).strip()
|
||||
study_root = str(raw.get("study_root") or "").strip()
|
||||
if not study_root:
|
||||
raise SpecError(f"{context}.study_root must be a non-empty string.")
|
||||
return {
|
||||
"name": name,
|
||||
"kind": kind,
|
||||
"study_root": study_root,
|
||||
"label": str(raw.get("label") or "").strip() or name,
|
||||
}
|
||||
|
||||
|
||||
def _resolve_output_root(spec: dict[str, Any], *, spec_path: Path) -> Path:
    """Return the directory the report files are written to.

    A configured ``output_root`` is resolved against the directory holding the
    spec file. Without one, ``.aituner-reports/<report_id>`` is resolved
    against the current working directory.
    """
    configured = spec.get("output_root")
    if not configured:
        fallback = Path(".aituner-reports") / str(spec["report_id"])
        return fallback.resolve()
    return _resolve_path(str(configured), base_dir=spec_path.parent)
|
||||
|
||||
|
||||
def _summarize_case(case: dict[str, Any], *, spec_path: Path) -> dict[str, Any]:
    """Summarize every arm of ``case`` and compare the arms with each other.

    Args:
        case: A validated case as produced by ``_load_case``.
        spec_path: Path of the spec file; arm study roots are resolved
            relative to its directory.

    Returns:
        The case summary: case metadata, the reference best rate, the largest
        budget considered, per-arm summaries (with reference metrics added),
        winners, harness-vs-naive rows and warnings.
    """
    budgets = case["budgets"]
    arm_summaries = []
    for arm in case["arms"]:
        arm_summaries.append(_summarize_arm(arm, budgets=budgets, spec_path=spec_path))

    reference = _reference_best(arm_summaries)
    # The horizon covers both the requested budgets and the longest study.
    trial_counts = [entry["trial_count"] for entry in arm_summaries]
    max_budget = max(budgets + trial_counts)

    for entry in arm_summaries:
        _add_reference_metrics(
            entry,
            reference=reference,
            max_budget=max_budget,
            target_fraction=case["target_fraction"],
        )

    winners = _case_winners(arm_summaries)
    comparison = _harness_vs_naive(arm_summaries, min_final_ratio=case["min_final_ratio"])

    result = {
        key: case[key]
        for key in (
            "case_id",
            "description",
            "tags",
            "budgets",
            "target_fraction",
            "min_final_ratio",
        )
    }
    result["reference_best_per_gpu"] = reference
    result["max_budget"] = max_budget
    result["arms"] = arm_summaries
    result["winners"] = winners
    result["harness_vs_naive"] = comparison
    result["warnings"] = _case_warnings(case, arm_summaries, comparison)
    return result
|
||||
|
||||
|
||||
def _summarize_arm(arm: dict[str, Any], *, budgets: list[int], spec_path: Path) -> dict[str, Any]:
    """Read an arm's ``state.json`` and summarize its tuning progress.

    Args:
        arm: A validated arm as produced by ``_load_arm``.
        budgets: Trial budgets at which the running best is reported.
        spec_path: Path of the spec file; the arm's study root is resolved
            relative to its directory.

    Returns:
        A dict with the arm identity, trial counters, the running-best curve,
        the best value at each budget and the stop reason/diagnosis strings.

    Raises:
        SpecError: If the study root cannot be resolved, or ``state.json``
            does not contain a JSON object.
    """
    study_root = _resolve_study_root(arm["study_root"], base_dir=spec_path.parent)
    state_path = study_root / "state.json"
    state = json.loads(state_path.read_text(encoding="utf-8"))
    if not isinstance(state, dict):
        raise SpecError(f"state.json must contain a JSON object: {state_path}")

    raw_trials = state.get("trials")
    trials = raw_trials if isinstance(raw_trials, list) else []
    curve = _running_best_curve(trials)
    final_best = curve[-1] if curve else None
    best_trial_index = _first_index_at_value(curve, final_best)

    def count_status(status: str) -> int:
        # Non-dict trial entries carry no status; skip them instead of
        # crashing on ``.get``.
        return sum(
            1 for item in trials if isinstance(item, dict) and item.get("status") == status
        )

    # A trial without a numeric per-GPU rate, including a malformed non-dict
    # entry, counts as having found no feasible rate.
    no_feasible_count = sum(
        1
        for item in trials
        if not (
            isinstance(item, dict)
            and isinstance(item.get("best_request_rate_per_gpu"), (int, float))
        )
    )

    return {
        "name": arm["name"],
        "kind": arm["kind"],
        "label": arm["label"],
        "study_root": str(study_root),
        "study_id": state.get("study_id"),
        # Counts every entry so the value stays aligned with the curve length.
        "trial_count": len(trials),
        "completed_count": count_status("completed"),
        "failed_count": count_status("failed"),
        "no_feasible_count": no_feasible_count,
        "best_trial_id": state.get("best_trial_id"),
        "best_trial_index": best_trial_index,
        "final_best_per_gpu": final_best,
        "state_best_per_gpu": state.get("best_request_rate_per_gpu"),
        "best_at_budget": {str(budget): _value_at_budget(curve, budget) for budget in budgets},
        "running_best_per_gpu": curve,
        "stop_reason": str(state.get("tuning_stop_reason") or ""),
        "stop_diagnosis": str(state.get("tuning_stop_diagnosis") or ""),
    }
|
||||
|
||||
|
||||
def _add_reference_metrics(
    arm: dict[str, Any],
    *,
    reference: float | None,
    max_budget: int,
    target_fraction: float,
) -> None:
    """Add reference-relative metrics to an arm summary in place.

    Sets ``final_ratio_to_reference``, ``target_per_gpu``,
    ``trials_to_target`` and ``normalized_auc`` on ``arm``.

    Args:
        arm: Arm summary to extend; mutated in place.
        reference: Reference best rate; a ``None`` or zero reference yields
            ``None`` for the ratio and the target.
        max_budget: Number of trial steps the AUC is computed over.
        target_fraction: Fraction of the reference that defines the target.
    """
    final_best = arm.get("final_best_per_gpu")
    if reference and isinstance(final_best, (int, float)):
        ratio = float(final_best) / reference
    else:
        ratio = None
    arm["final_ratio_to_reference"] = ratio

    if reference:
        target = reference * target_fraction
    else:
        target = None
    arm["target_per_gpu"] = target

    curve = arm["running_best_per_gpu"]
    arm["trials_to_target"] = _trials_to_target(curve, target)
    arm["normalized_auc"] = _normalized_auc(
        arm["running_best_per_gpu"],
        reference=reference,
        max_budget=max_budget,
    )
|
||||
|
||||
|
||||
def _harness_vs_naive(arms: list[dict[str, Any]], *, min_final_ratio: float) -> list[dict[str, Any]]:
    """Compare every harness arm against the best of the naive arms.

    Args:
        arms: Arm summaries that already carry the reference metrics.
        min_final_ratio: Minimum ratio of the harness final best to the best
            naive final best for the final-quality check to pass.

    Returns:
        One row per harness arm, or an empty list when either kind is missing.
        A row passes when the final-quality check holds and the harness is not
        slower to the target; an undefined speedup counts as passing.
    """
    naive_arms = [entry for entry in arms if entry["kind"] == "naive"]
    harness_arms = [entry for entry in arms if entry["kind"] == "harness"]
    if not (naive_arms and harness_arms):
        return []

    naive_final = _max_optional(entry.get("final_best_per_gpu") for entry in naive_arms)
    naive_ttt = _min_optional(entry.get("trials_to_target") for entry in naive_arms)
    naive_auc = _max_optional(entry.get("normalized_auc") for entry in naive_arms)

    def ratio_to(value: Any, baseline: float | None) -> float | None:
        # Undefined when the baseline is missing/zero or the value is not numeric.
        if baseline and isinstance(value, (int, float)):
            return float(value) / baseline
        return None

    results = []
    for candidate in harness_arms:
        final_ratio = ratio_to(candidate.get("final_best_per_gpu"), naive_final)
        auc_ratio = ratio_to(candidate.get("normalized_auc"), naive_auc)
        speedup = _speedup(naive_ttt, candidate.get("trials_to_target"))

        final_ok = final_ratio is not None and final_ratio >= min_final_ratio
        speed_ok = speedup is None or speedup >= 1.0

        row = {
            "harness": candidate["name"],
            "best_naive_final_per_gpu": naive_final,
            "best_naive_trials_to_target": naive_ttt,
            "best_naive_normalized_auc": naive_auc,
            "final_ratio_vs_best_naive": final_ratio,
            "target_trial_speedup_vs_best_naive": speedup,
            "auc_ratio_vs_best_naive": auc_ratio,
            "passes_min_final_ratio": final_ok,
            "passes_speed": speed_ok,
            "passes": final_ok and speed_ok,
        }
        results.append(row)
    return results
|
||||
|
||||
|
||||
def _case_winners(arms: list[dict[str, Any]]) -> dict[str, str | None]:
    """Name the winning arm for each headline metric of a case.

    Returns:
        A dict with the arm names for highest final best, fewest trials to
        target and highest normalized AUC; a value is ``None`` when no arm has
        that metric.
    """
    winners: dict[str, str | None] = {}
    winners["final_best"] = _argmax(arms, "final_best_per_gpu")
    winners["fastest_to_target"] = _argmin(arms, "trials_to_target")
    winners["normalized_auc"] = _argmax(arms, "normalized_auc")
    return winners
|
||||
|
||||
|
||||
def _aggregate_cases(cases: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll per-case summaries up into report-wide counters.

    Args:
        cases: Case summaries as produced by ``_summarize_case``.

    Returns:
        A dict with ``case_count``, per-kind statistics (``by_kind``), winner
        tallies per metric (``winner_counts``) and the number of passing and
        total harness-vs-naive rows.
    """
    tallies: dict[str, dict[str, int]] = {
        "final_best": {},
        "fastest_to_target": {},
        "normalized_auc": {},
    }
    by_kind: dict[str, dict[str, Any]] = {}
    # Raw samples are kept beside the buckets and only their means are published.
    ratio_samples: dict[str, list[float]] = {}
    auc_samples: dict[str, list[float]] = {}
    pass_total = 0
    check_total = 0

    for case in cases:
        for metric, counts in tallies.items():
            winner = case["winners"].get(metric)
            if winner:
                counts[winner] = counts.get(winner, 0) + 1

        for row in case["harness_vs_naive"]:
            check_total += 1
            if row["passes"]:
                pass_total += 1

        for arm in case["arms"]:
            kind = arm["kind"]
            if kind not in by_kind:
                by_kind[kind] = {
                    "arm_count": 0,
                    "mean_final_ratio_to_reference": None,
                    "mean_normalized_auc": None,
                    "target_reached_count": 0,
                }
                ratio_samples[kind] = []
                auc_samples[kind] = []
            bucket = by_kind[kind]
            bucket["arm_count"] += 1

            ratio = arm.get("final_ratio_to_reference")
            if isinstance(ratio, (int, float)):
                ratio_samples[kind].append(float(arm["final_ratio_to_reference"]))

            auc = arm.get("normalized_auc")
            if isinstance(auc, (int, float)):
                auc_samples[kind].append(float(arm["normalized_auc"]))

            if isinstance(arm.get("trials_to_target"), int):
                bucket["target_reached_count"] += 1

    for kind, bucket in by_kind.items():
        bucket["mean_final_ratio_to_reference"] = _mean(ratio_samples[kind])
        bucket["mean_normalized_auc"] = _mean(auc_samples[kind])

    return {
        "case_count": len(cases),
        "by_kind": by_kind,
        "winner_counts": tallies,
        "harness_vs_naive_pass_count": pass_total,
        "harness_vs_naive_check_count": check_total,
    }
|
||||
|
||||
|
||||
def _case_warnings(
|
||||
case: dict[str, Any],
|
||||
arms: list[dict[str, Any]],
|
||||
comparison: list[dict[str, Any]],
|
||||
) -> list[str]:
|
||||
warnings = []
|
||||
kinds = {arm["kind"] for arm in arms}
|
||||
if "harness" not in kinds or "naive" not in kinds:
|
||||
warnings.append("case does not include both harness and naive arms")
|
||||
if len(case["tags"]) < 2:
|
||||
warnings.append("case has few tags; add workload/model/SLO tags to support generalization claims")
|
||||
if not comparison:
|
||||
return warnings
|
||||
for row in comparison:
|
||||
if not row["passes_min_final_ratio"]:
|
||||
warnings.append(
|
||||
f"{row['harness']} final best is below min_final_ratio versus best naive"
|
||||
)
|
||||
if not row["passes_speed"]:
|
||||
warnings.append(
|
||||
f"{row['harness']} reaches target later than best naive"
|
||||
)
|
||||
return warnings
|
||||
|
||||
|
||||
def _running_best_curve(trials: list[Any]) -> list[float | None]:
|
||||
curve: list[float | None] = []
|
||||
incumbent: float | None = None
|
||||
for trial in trials:
|
||||
rate = trial.get("best_request_rate_per_gpu") if isinstance(trial, dict) else None
|
||||
if isinstance(rate, (int, float)) and (incumbent is None or float(rate) > incumbent):
|
||||
incumbent = float(rate)
|
||||
curve.append(incumbent)
|
||||
return curve
|
||||
|
||||
|
||||
def _value_at_budget(curve: list[float | None], budget: int) -> float | None:
|
||||
if not curve:
|
||||
return None
|
||||
index = min(max(budget, 1), len(curve)) - 1
|
||||
return curve[index]
|
||||
|
||||
|
||||
def _trials_to_target(curve: list[float | None], target: float | None) -> int | None:
|
||||
if target is None:
|
||||
return None
|
||||
for idx, value in enumerate(curve, start=1):
|
||||
if isinstance(value, (int, float)) and value >= target:
|
||||
return idx
|
||||
return None
|
||||
|
||||
|
||||
def _normalized_auc(
    curve: list[float | None],
    *,
    reference: float | None,
    max_budget: int,
) -> float | None:
    """Return the area under the running-best curve, normalized by the reference.

    The running best at budgets ``1..max_budget`` is accumulated (missing
    values count as 0) and divided by ``reference * max_budget``.

    Returns:
        The normalized area, or ``None`` when the reference is missing/zero or
        ``max_budget`` is not positive.
    """
    if max_budget <= 0 or not reference:
        return None
    area = 0.0
    budget = 1
    # Explicit left-to-right accumulation keeps the float rounding order fixed.
    while budget <= max_budget:
        best = _value_at_budget(curve, budget)
        if isinstance(best, (int, float)):
            area += float(best)
        else:
            area += 0.0
        budget += 1
    return area / (reference * max_budget)
|
||||
|
||||
|
||||
def _reference_best(arms: list[dict[str, Any]]) -> float | None:
    """Return the highest ``final_best_per_gpu`` across ``arms``, or ``None`` if none is numeric."""
    finals = (entry.get("final_best_per_gpu") for entry in arms)
    return _max_optional(finals)
|
||||
|
||||
|
||||
def _resolve_study_root(raw_path: str, *, base_dir: Path) -> Path:
    """Locate the directory that holds an arm's ``state.json``.

    ``raw_path`` may name the study directory itself, or a parent directory
    with exactly one child that contains ``state.json``.

    Raises:
        SpecError: If no ``state.json`` is found, or more than one child
            directory contains one.
    """
    root = _resolve_path(raw_path, base_dir=base_dir)
    if (root / "state.json").exists():
        return root

    candidates = sorted(root.glob("*/state.json"))
    if not candidates:
        raise SpecError(f"study_root does not contain state.json: {root}")
    if len(candidates) > 1:
        raise SpecError(f"study_root is ambiguous; point to a specific study directory: {root}")
    return candidates[0].parent
|
||||
|
||||
|
||||
def _resolve_path(raw_path: str, *, base_dir: Path) -> Path:
|
||||
path = Path(raw_path)
|
||||
if not path.is_absolute():
|
||||
path = (base_dir / path).resolve()
|
||||
return path
|
||||
|
||||
|
||||
def _as_float(value: Any, *, default: float) -> float:
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, bool) or not isinstance(value, (int, float)):
|
||||
raise SpecError(f"Expected numeric value, got {value!r}.")
|
||||
return float(value)
|
||||
|
||||
|
||||
def _positive_int(value: Any, *, context: str) -> int:
|
||||
if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
|
||||
raise SpecError(f"{context} must contain positive integers.")
|
||||
return value
|
||||
|
||||
|
||||
def _first_index_at_value(curve: list[float | None], value: float | None) -> int | None:
|
||||
if value is None:
|
||||
return None
|
||||
for idx, item in enumerate(curve, start=1):
|
||||
if item == value:
|
||||
return idx
|
||||
return None
|
||||
|
||||
|
||||
def _argmax(rows: list[dict[str, Any]], key: str) -> str | None:
|
||||
scored = [
|
||||
(str(row["name"]), float(row[key]))
|
||||
for row in rows
|
||||
if isinstance(row.get(key), (int, float))
|
||||
]
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[1], reverse=True)
|
||||
return scored[0][0]
|
||||
|
||||
|
||||
def _argmin(rows: list[dict[str, Any]], key: str) -> str | None:
|
||||
scored = [
|
||||
(str(row["name"]), int(row[key]))
|
||||
for row in rows
|
||||
if isinstance(row.get(key), int)
|
||||
]
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[1])
|
||||
return scored[0][0]
|
||||
|
||||
|
||||
def _max_optional(values: Any) -> float | None:
|
||||
scored = [float(item) for item in values if isinstance(item, (int, float))]
|
||||
return max(scored) if scored else None
|
||||
|
||||
|
||||
def _min_optional(values: Any) -> int | None:
|
||||
scored = [int(item) for item in values if isinstance(item, int)]
|
||||
return min(scored) if scored else None
|
||||
|
||||
|
||||
def _mean(values: list[float]) -> float | None:
|
||||
return sum(values) / len(values) if values else None
|
||||
|
||||
|
||||
def _speedup(naive_trials: int | None, harness_trials: int | None) -> float | None:
|
||||
if harness_trials is None:
|
||||
return 0.0 if naive_trials is not None else None
|
||||
if naive_trials is None:
|
||||
return None
|
||||
if harness_trials <= 0:
|
||||
return None
|
||||
return float(naive_trials) / float(harness_trials)
|
||||
|
||||
|
||||
def _fmt(value: Any) -> str:
|
||||
if isinstance(value, float):
|
||||
return f"{value:.4f}"
|
||||
if value is None:
|
||||
return "-"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _render_report(summary: dict[str, Any]) -> str:
    """Render the report summary as a Markdown document.

    The document has an aggregate section, a per-kind table, and one section
    per case with an arm table and, when rows exist, a harness-vs-naive table.

    Args:
        summary: The summary dictionary built by ``run_tuning_report``.

    Returns:
        The Markdown text, with lines joined by ``"\\n"``.
    """

    def table_row(cells: list[str]) -> str:
        # Wrap the cells in the leading/trailing pipes of a Markdown table row.
        return "| " + " | ".join(cells) + " |"

    out = [f"# {summary['report_id']}", "", "## Aggregate", ""]

    aggregate = summary["aggregate"]
    out.append(f"- Cases: `{aggregate['case_count']}`")
    out.append(
        f"- Harness-vs-naive pass/checks: `{aggregate['harness_vs_naive_pass_count']}`/`{aggregate['harness_vs_naive_check_count']}`"
    )
    out.append(
        f"- Winner counts: `{json.dumps(aggregate['winner_counts'], ensure_ascii=False)}`"
    )
    out += [
        "",
        "## By Kind",
        "",
        "| Kind | Arms | Mean final/ref | Mean AUC | Target reached |",
        "| --- | ---: | ---: | ---: | ---: |",
    ]
    for kind, stats in sorted(aggregate["by_kind"].items()):
        out.append(
            table_row(
                [
                    f"`{kind}`",
                    str(stats["arm_count"]),
                    _fmt(stats["mean_final_ratio_to_reference"]),
                    _fmt(stats["mean_normalized_auc"]),
                    str(stats["target_reached_count"]),
                ]
            )
        )

    out += ["", "## Cases", ""]
    for case in summary["cases"]:
        out.append(f"### {case['case_id']}")
        out.append("")
        out.append(f"- Reference best req/s/GPU: `{_fmt(case['reference_best_per_gpu'])}`")
        out.append(f"- Target fraction: `{case['target_fraction']}`")
        out.append(f"- Winners: `{json.dumps(case['winners'], ensure_ascii=False)}`")
        if case["warnings"]:
            out.append(f"- Warnings: `{json.dumps(case['warnings'], ensure_ascii=False)}`")

        out.append("")
        out.append(
            "| Arm | Kind | Trials | Final/GPU | Final/ref | TTT | AUC | Failed | No feasible |"
        )
        out.append("| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |")
        for arm in case["arms"]:
            out.append(
                table_row(
                    [
                        f"`{arm['name']}`",
                        f"`{arm['kind']}`",
                        str(arm["trial_count"]),
                        _fmt(arm["final_best_per_gpu"]),
                        _fmt(arm["final_ratio_to_reference"]),
                        _fmt(arm["trials_to_target"]),
                        _fmt(arm["normalized_auc"]),
                        str(arm["failed_count"]),
                        str(arm["no_feasible_count"]),
                    ]
                )
            )

        comparison_rows = case["harness_vs_naive"]
        if comparison_rows:
            out.append("")
            out.append(
                "| Harness | Final vs best naive | Target speedup | AUC vs best naive | Pass |"
            )
            out.append("| --- | ---: | ---: | ---: | --- |")
            for row in comparison_rows:
                out.append(
                    table_row(
                        [
                            f"`{row['harness']}`",
                            _fmt(row["final_ratio_vs_best_naive"]),
                            _fmt(row["target_trial_speedup_vs_best_naive"]),
                            _fmt(row["auc_ratio_vs_best_naive"]),
                            f"`{row['passes']}`",
                        ]
                    )
                )
        # A blank line closes every case section.
        out.append("")
    return "\n".join(out)
|
||||
109
tests/test_tuning_report.py
Normal file
109
tests/test_tuning_report.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from aituner.tuning_report import run_tuning_report
|
||||
|
||||
|
||||
def _write_state(root: Path, *, study_id: str, rates: list[float | None]) -> None:
|
||||
root.mkdir(parents=True)
|
||||
trials = []
|
||||
best_rate = None
|
||||
best_trial_id = None
|
||||
for idx, rate in enumerate(rates, start=1):
|
||||
trial_id = f"trial-{idx:04d}"
|
||||
trials.append(
|
||||
{
|
||||
"trial_id": trial_id,
|
||||
"status": "completed" if rate is not None else "failed",
|
||||
"parallel_size": 1,
|
||||
"best_request_rate": rate,
|
||||
"best_request_rate_per_gpu": rate,
|
||||
"config_patch": {"env_patch": {}, "flag_patch": {}},
|
||||
}
|
||||
)
|
||||
if rate is not None and (best_rate is None or rate > best_rate):
|
||||
best_rate = rate
|
||||
best_trial_id = trial_id
|
||||
payload = {
|
||||
"study_id": study_id,
|
||||
"best_trial_id": best_trial_id,
|
||||
"best_request_rate": best_rate,
|
||||
"best_request_rate_per_gpu": best_rate,
|
||||
"next_trial_index": len(rates) + 1,
|
||||
"trials": trials,
|
||||
}
|
||||
(root / "state.json").write_text(json.dumps(payload), encoding="utf-8")
|
||||
|
||||
|
||||
class TuningReportTests(unittest.TestCase):
    """End-to-end check of ``run_tuning_report`` against two synthetic study stores."""

    def test_tuning_report_scores_harness_vs_naive_anytime_progress(self) -> None:
        """The harness arm reaches the target in fewer trials, so it wins and passes."""
        with tempfile.TemporaryDirectory() as scratch:
            workdir = Path(scratch)
            harness_parent = workdir / "studies"
            naive_root = workdir / "naive-study"
            out_root = workdir / "out"

            _write_state(
                harness_parent / "harness-study",
                study_id="harness-study",
                rates=[0.4, 0.9],
            )
            _write_state(naive_root, study_id="naive-study", rates=[0.4, None, 0.7, 0.9])

            # The harness arm names the parent directory, which holds a single
            # study sub-directory; the naive arm names its study directory.
            arms = [
                {"name": "harness", "kind": "harness", "study_root": str(harness_parent)},
                {"name": "naive", "kind": "naive", "study_root": str(naive_root)},
            ]
            spec = {
                "report_id": "report-1",
                "output_root": str(out_root),
                "target_fraction": 0.8,
                "cases": [
                    {
                        "case_id": "case-1",
                        "tags": ["model-a", "chat"],
                        "budgets": [1, 2, 4],
                        "arms": arms,
                    }
                ],
            }
            spec_path = workdir / "report.json"
            spec_path.write_text(json.dumps(spec), encoding="utf-8")

            summary = run_tuning_report(spec_path)

            case = summary["cases"][0]
            self.assertEqual(case["reference_best_per_gpu"], 0.9)
            winners = case["winners"]
            self.assertEqual(winners["final_best"], "harness")
            self.assertEqual(winners["fastest_to_target"], "harness")

            harness = case["arms"][0]
            naive = case["arms"][1]
            self.assertEqual(harness["best_at_budget"]["2"], 0.9)
            self.assertEqual(naive["best_at_budget"]["2"], 0.4)
            self.assertEqual(case["target_fraction"], 0.8)
            self.assertEqual(harness["trials_to_target"], 2)
            self.assertEqual(naive["trials_to_target"], 4)
            self.assertEqual(naive["failed_count"], 1)

            comparison = case["harness_vs_naive"][0]
            self.assertTrue(comparison["passes"])
            self.assertEqual(comparison["target_trial_speedup_vs_best_naive"], 2.0)

            for artifact in ("summary.json", "report.md"):
                self.assertTrue((out_root / artifact).exists())
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user