diff --git a/scripts/run_clean_pair_from_specs.sh b/scripts/run_clean_pair_from_specs.sh
new file mode 100755
index 0000000..538cc23
--- /dev/null
+++ b/scripts/run_clean_pair_from_specs.sh
@@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+# Run a clean same-policy harness-vs-naive pair from one or two base specs.
+#
+# Run from the repository root: src/, scripts/tuning_report.py and the
+# .aituner* output directories are all resolved against the current directory.
+#
+# Required env:
+#   RUN_LABEL
+#   CASE_ID
+#   HARNESS_BASE_SPEC
+#
+# Optional env:
+#   NAIVE_BASE_SPEC          defaults to HARNESS_BASE_SPEC
+#   MAX_TRIALS               defaults to 12
+#   CASE_DESCRIPTION
+#   CASE_TAGS_JSON           JSON list, defaults to []
+#   BUDGETS_JSON             JSON list, defaults to [1,2,3,4,6,8,MAX_TRIALS]
+#   COMMON_SPEC_PATCH_FILE   JSON deep-merged into both generated specs
+#   HARNESS_SPEC_PATCH_FILE  JSON deep-merged into harness generated spec
+#   NAIVE_SPEC_PATCH_FILE    JSON deep-merged into naive generated spec
+#   OPENAI_API_KEY           defaults to the key in ~/.codex/auth.json
+set -euo pipefail
+
+RUN_LABEL="${RUN_LABEL:?RUN_LABEL is required}"
+CASE_ID="${CASE_ID:?CASE_ID is required}"
+HARNESS_BASE_SPEC="${HARNESS_BASE_SPEC:?HARNESS_BASE_SPEC is required}"
+NAIVE_BASE_SPEC="${NAIVE_BASE_SPEC:-${HARNESS_BASE_SPEC}}"
+MAX_TRIALS="${MAX_TRIALS:-12}"
+CASE_DESCRIPTION="${CASE_DESCRIPTION:-Clean same-policy harness-vs-naive pair.}"
+CASE_TAGS_JSON="${CASE_TAGS_JSON:-[]}"
+BUDGETS_JSON="${BUDGETS_JSON:-}"
+
+ROOT="$(pwd)"
+RUN_CONFIG_ROOT=".aituner-run-configs/${RUN_LABEL}"
+HARNESS_SPEC="${RUN_CONFIG_ROOT}/harness.json"
+NAIVE_SPEC="${RUN_CONFIG_ROOT}/naive.json"
+HARNESS_STORE=".aituner/${RUN_LABEL}-harness"
+NAIVE_STORE=".aituner/${RUN_LABEL}-naive"
+REPORT_ROOT=".aituner-reports/${RUN_LABEL}"
+REPORT_SPEC=".aituner-reports/${RUN_LABEL}.spec.json"
+DONE_MARKER=".aituner/${RUN_LABEL}.DONE"
+# Exported so the embedded Python snippets can read them from os.environ.
+export RUN_LABEL CASE_ID HARNESS_BASE_SPEC NAIVE_BASE_SPEC MAX_TRIALS CASE_DESCRIPTION
+export CASE_TAGS_JSON BUDGETS_JSON ROOT RUN_CONFIG_ROOT HARNESS_SPEC NAIVE_SPEC
+export HARNESS_STORE NAIVE_STORE REPORT_ROOT REPORT_SPEC
+
+# Load OPENAI_API_KEY from ~/.codex/auth.json and export it when it is unset
+# or empty; a value that is already present is left untouched.
+read_key() {
+  if [ -z "${OPENAI_API_KEY:-}" ]; then
+    export OPENAI_API_KEY
+    OPENAI_API_KEY="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')"
+  fi
+}
+
+# Blank out every proxy variable and set no_proxy to '*' for child processes.
+export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
+mkdir -p "${RUN_CONFIG_ROOT}" .aituner .aituner-reports
+# Start clean: remove the stores, report and completion marker left by an
+# earlier run with the same label, so a stale marker cannot signal completion
+# while this run is still in progress.
+rm -rf "${HARNESS_STORE}" "${NAIVE_STORE}" "${REPORT_ROOT}" "${REPORT_SPEC}" "${DONE_MARKER}"
+
+# Validate the env inputs and write the per-arm specs into RUN_CONFIG_ROOT.
+python3 - <<'PY'
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+
+def check_env_inputs() -> None:
+    """Reject malformed MAX_TRIALS, CASE_TAGS_JSON or BUDGETS_JSON up front.
+
+    The report-spec step at the end of the script parses these values again;
+    checking them here reports a bad value before the tuning runs start.
+    """
+    try:
+        int(os.environ["MAX_TRIALS"])
+    except ValueError as err:
+        raise SystemExit(f"MAX_TRIALS must be an integer: {err}") from err
+    try:
+        json.loads(os.environ.get("CASE_TAGS_JSON") or "[]")
+    except json.JSONDecodeError as err:
+        raise SystemExit(f"CASE_TAGS_JSON must be valid JSON: {err}") from err
+    budgets_text = os.environ.get("BUDGETS_JSON") or ""
+    if not budgets_text:
+        return
+    try:
+        for item in json.loads(budgets_text):
+            int(item)
+    except (TypeError, ValueError) as err:
+        raise SystemExit(f"BUDGETS_JSON must be a JSON list of integers: {err}") from err
+
+
+def deep_merge(base: dict[str, Any], patch: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy of base with patch merged in, recursing into nested dicts.
+
+    A patch value replaces the base value unless both values are dicts.
+    """
+    merged = dict(base)
+    for key, value in patch.items():
+        if isinstance(value, dict) and isinstance(merged.get(key), dict):
+            merged[key] = deep_merge(merged[key], value)
+        else:
+            merged[key] = value
+    return merged
+
+
+def load_patch(env_name: str) -> dict[str, Any]:
+    """Load the JSON object at the path in env var env_name; {} if unset or empty."""
+    path = os.environ.get(env_name)
+    if not path:
+        return {}
+    payload = json.loads(Path(path).read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise SystemExit(f"{env_name} must point to a JSON object")
+    return payload
+
+
+def generated_spec(base_path: str, *, use_harness: bool, suffix: str, arm_patch: dict[str, Any]) -> dict[str, Any]:
+    """Build one arm's spec: base spec, then common patch, then arm_patch.
+
+    The study id (CASE_ID when the spec has none) gets "-<suffix>" appended
+    and llm.use_harness is set to use_harness.
+    """
+    base = json.loads(Path(base_path).read_text(encoding="utf-8"))
+    if not isinstance(base, dict):
+        raise SystemExit(f"{base_path} must contain a JSON object")
+    common = load_patch("COMMON_SPEC_PATCH_FILE")
+    spec = deep_merge(base, common)
+    spec = deep_merge(spec, arm_patch)
+    spec["study_id"] = str(spec.get("study_id") or os.environ["CASE_ID"]) + f"-{suffix}"
+    llm = dict(spec.get("llm") or {})
+    llm["use_harness"] = use_harness
+    spec["llm"] = llm
+    return spec
+
+
+check_env_inputs()
+run_config_root = Path(os.environ["RUN_CONFIG_ROOT"])
+harness = generated_spec(
+    os.environ["HARNESS_BASE_SPEC"],
+    use_harness=True,
+    suffix="harness",
+    arm_patch=load_patch("HARNESS_SPEC_PATCH_FILE"),
+)
+naive = generated_spec(
+    os.environ["NAIVE_BASE_SPEC"],
+    use_harness=False,
+    suffix="naive",
+    arm_patch=load_patch("NAIVE_SPEC_PATCH_FILE"),
+)
+(run_config_root / "harness.json").write_text(json.dumps(harness, indent=2) + "\n", encoding="utf-8")
+(run_config_root / "naive.json").write_text(json.dumps(naive, indent=2) + "\n", encoding="utf-8")
+print(json.dumps({"harness_study_id": harness["study_id"], "naive_study_id": naive["study_id"]}, ensure_ascii=False))
+PY
+
+# Tune the harness arm; all tuner output goes to the per-arm log file.
+read_key
+echo "=== harness clean pair start $(date -Is) label=${RUN_LABEL} ==="
+PYTHONPATH=src python3 -m aituner.cli study tune \
+  --spec "${HARNESS_SPEC}" \
+  --store-root "${HARNESS_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
+  > ".aituner/${RUN_LABEL}-harness.log" 2>&1
+echo "=== harness clean pair done $(date -Is) ==="
+
+# Tune the naive arm with the same trial budget.
+read_key
+echo "=== naive clean pair start $(date -Is) label=${RUN_LABEL} ==="
+PYTHONPATH=src python3 -m aituner.cli study tune \
+  --spec "${NAIVE_SPEC}" \
+  --store-root "${NAIVE_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
+  > ".aituner/${RUN_LABEL}-naive.log" 2>&1
+echo "=== naive clean pair done $(date -Is) ==="
+
+# Write the report spec that points at both study roots.
+python3 - <<'PY'
+import json
+import os
+from pathlib import Path
+
+root = Path(os.environ["ROOT"])
+run_label = os.environ["RUN_LABEL"]
+harness = json.loads(Path(os.environ["HARNESS_SPEC"]).read_text(encoding="utf-8"))
+naive = json.loads(Path(os.environ["NAIVE_SPEC"]).read_text(encoding="utf-8"))
+max_trials = int(os.environ["MAX_TRIALS"])
+budgets_text = os.environ.get("BUDGETS_JSON") or ""
+if budgets_text:
+    budgets = json.loads(budgets_text)
+else:
+    budgets = [1, 2, 3, 4, 6, 8, max_trials]
+# De-duplicate, drop non-positive entries and sort ascending.
+budgets = sorted({int(item) for item in budgets if int(item) > 0})
+tags = json.loads(os.environ.get("CASE_TAGS_JSON") or "[]")
+spec = {
+    "report_id": run_label,
+    "output_root": str(root / os.environ["REPORT_ROOT"]),
+    "target_fraction": 0.95,
+    "min_final_ratio": 0.98,
+    "cases": [
+        {
+            "case_id": os.environ["CASE_ID"],
+            "description": os.environ["CASE_DESCRIPTION"],
+            "tags": tags,
+            "budgets": budgets,
+            "arms": [
+                {
+                    "name": "harness",
+                    "kind": "harness",
+                    "study_root": str(
+                        root / os.environ["HARNESS_STORE"] / harness["study_id"]
+                    ),
+                },
+                {
+                    "name": "naive",
+                    "kind": "naive",
+                    "study_root": str(root / os.environ["NAIVE_STORE"] / naive["study_id"]),
+                },
+            ],
+        }
+    ],
+}
+Path(os.environ["REPORT_SPEC"]).write_text(json.dumps(spec, indent=2) + "\n", encoding="utf-8")
+PY
+
+PYTHONPATH=src python3 scripts/tuning_report.py --spec "${REPORT_SPEC}"
+# Completion marker: created only after the report command has succeeded.
+touch "${DONE_MARKER}"
+echo "=== clean pair report ready ${REPORT_ROOT} $(date -Is) ==="
diff --git a/src/aituner/cli.py b/src/aituner/cli.py
index 2c1d4a0..6dec945 100644
--- a/src/aituner/cli.py
+++ b/src/aituner/cli.py
@@ -367,20 +367,41 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
                     proposal_source_label = "harness"
                 else:
                     proposal_source_label = str(proposal_source) if proposal_source else "llm"
+                stop_authorized_by = (
+                    "validator"
+                    if (is_harness_stop or authorized)
+                    else "file_proposal"
+                    if proposal_source is not None
+                    else "llm_after_veto_budget"
+                )
+                stop_reason = (
+                    "harness_stop"
+                    if is_harness_stop
+                    else "proposal_file_stop"
+                    if proposal_source is not None
+                    else "llm_stop"
+                )
+                stop_details = {
+                    "proposal_name": proposal_name,
+                    "proposal_source": proposal_source_label,
+                    "stop_authorized_by": stop_authorized_by,
+                }
+                if stop_authority:
+                    stop_details["validator_reason"] = stop_authority.get("reason")
+                state.tuning_stop_reason = stop_reason
+                state.tuning_stop_diagnosis = proposal.diagnosis
+                state.tuning_stop_details = stop_details
+                store.save_state(state)
                 executed.append(
                     {
                         "trial_id": None,
                         "proposal_name": proposal_name,
                         "proposal_source": proposal_source_label,
                         "stopped": True,
-                        "stop_authorized_by": (
-                            "validator"
-                            if (is_harness_stop or authorized)
-                            else "file_proposal"
-                            if proposal_source is not None
-                            else "llm_after_veto_budget"
-                        ),
+                        "reason": state.tuning_stop_reason,
+                        "stop_authorized_by": stop_authorized_by,
                         "diagnosis": proposal.diagnosis,
+                        "details": stop_details,
                         "state_best_trial_id": state.best_trial_id,
                         "state_best_request_rate": state.best_request_rate,
                     }
diff --git a/tests/test_core_flow.py b/tests/test_core_flow.py
index 6c90bd8..d0b9d82 100644
--- a/tests/test_core_flow.py
+++ b/tests/test_core_flow.py
@@ -4845,6 +4845,18 @@ class CoreFlowTests(unittest.TestCase):
             self.assertTrue(proposal_path.exists())
             proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
             self.assertTrue(proposal["should_stop"])
+            state = store.load_state(study.study_id)
+            self.assertEqual(state.tuning_stop_reason, "harness_stop")
+            self.assertEqual(
+                state.tuning_stop_details["proposal_name"],
+                "harness-stop-0005",
+            )
+            self.assertEqual(state.tuning_stop_details["proposal_source"], "harness")
+            self.assertEqual(
+                state.tuning_stop_details["stop_authorized_by"],
+                "validator",
+            )
+            self.assertTrue(state.tuning_stop_diagnosis)
 
     def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
         with tempfile.TemporaryDirectory() as tmp: