Add reusable clean pair runner
This commit is contained in:
177
scripts/run_clean_pair_from_specs.sh
Executable file
177
scripts/run_clean_pair_from_specs.sh
Executable file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env bash
# Run a clean same-policy harness-vs-naive pair from one or two base specs.
#
# Required env:
#   RUN_LABEL
#   CASE_ID
#   HARNESS_BASE_SPEC
#
# Optional env:
#   NAIVE_BASE_SPEC          defaults to HARNESS_BASE_SPEC
#   MAX_TRIALS               defaults to 12
#   CASE_DESCRIPTION
#   CASE_TAGS_JSON           JSON list, defaults to []
#   BUDGETS_JSON             JSON list, defaults to [1,2,3,4,6,8,MAX_TRIALS]
#   COMMON_SPEC_PATCH_FILE   JSON deep-merged into both generated specs
#   HARNESS_SPEC_PATCH_FILE  JSON deep-merged into harness generated spec
#   NAIVE_SPEC_PATCH_FILE    JSON deep-merged into naive generated spec
set -euo pipefail

# Required inputs: abort with a clear message if any is unset or empty.
RUN_LABEL="${RUN_LABEL:?RUN_LABEL is required}"
CASE_ID="${CASE_ID:?CASE_ID is required}"
HARNESS_BASE_SPEC="${HARNESS_BASE_SPEC:?HARNESS_BASE_SPEC is required}"

# Optional inputs with their documented defaults.
NAIVE_BASE_SPEC="${NAIVE_BASE_SPEC:-${HARNESS_BASE_SPEC}}"
MAX_TRIALS="${MAX_TRIALS:-12}"
CASE_DESCRIPTION="${CASE_DESCRIPTION:-Clean same-policy harness-vs-naive pair.}"
CASE_TAGS_JSON="${CASE_TAGS_JSON:-[]}"
# Stays empty when unset; the report step substitutes its own default list.
BUDGETS_JSON="${BUDGETS_JSON:-}"

# Fail fast on inputs that would otherwise only be rejected after previous
# results have been deleted or after the tuning runs have already finished.
if [[ ! "${MAX_TRIALS}" =~ ^[0-9]+$ ]]; then
  printf 'MAX_TRIALS must be a non-negative integer, got: %s\n' "${MAX_TRIALS}" >&2
  exit 1
fi
for spec_path in "${HARNESS_BASE_SPEC}" "${NAIVE_BASE_SPEC}"; do
  if [[ ! -r "${spec_path}" ]]; then
    printf 'base spec is not readable: %s\n' "${spec_path}" >&2
    exit 1
  fi
done
unset spec_path

# All generated artefacts live under the current directory, keyed by RUN_LABEL.
ROOT="$(pwd)"
RUN_CONFIG_ROOT=".aituner-run-configs/${RUN_LABEL}"
HARNESS_SPEC="${RUN_CONFIG_ROOT}/harness.json"
NAIVE_SPEC="${RUN_CONFIG_ROOT}/naive.json"
HARNESS_STORE=".aituner/${RUN_LABEL}-harness"
NAIVE_STORE=".aituner/${RUN_LABEL}-naive"
REPORT_ROOT=".aituner-reports/${RUN_LABEL}"
REPORT_SPEC=".aituner-reports/${RUN_LABEL}.spec.json"

# The embedded Python snippets read their inputs from the environment.
export RUN_LABEL CASE_ID HARNESS_BASE_SPEC NAIVE_BASE_SPEC MAX_TRIALS CASE_DESCRIPTION
export CASE_TAGS_JSON BUDGETS_JSON ROOT RUN_CONFIG_ROOT HARNESS_SPEC NAIVE_SPEC
export HARNESS_STORE NAIVE_STORE REPORT_ROOT REPORT_SPEC
|
||||
|
||||
#######################################
# Make sure OPENAI_API_KEY is set and exported.
# Does nothing when the variable is already non-empty. Otherwise the key is
# read from the "OPENAI_API_KEY" entry of ~/.codex/auth.json.
# Globals:   OPENAI_API_KEY (read; written and exported when it was empty)
# Arguments: none
# Outputs:   a diagnostic on stderr when no usable key can be obtained
# Returns:   0 on success, 1 when the file is unreadable, is not a JSON
#            object, or holds no non-empty string key
#######################################
read_key() {
  if [[ -n "${OPENAI_API_KEY:-}" ]]; then
    return 0
  fi

  local key
  # A missing or non-string entry is printed as an empty line, so that a JSON
  # null can never end up exported as the literal text "None".
  if ! key="$(python3 -c '
import json, pathlib
with open(pathlib.Path.home() / ".codex" / "auth.json", "rb") as fh:
    value = json.load(fh).get("OPENAI_API_KEY")
print(value if isinstance(value, str) else "")
')"; then
    printf 'read_key: cannot read OPENAI_API_KEY from ~/.codex/auth.json\n' >&2
    return 1
  fi
  if [[ -z "${key}" ]]; then
    printf 'read_key: OPENAI_API_KEY in ~/.codex/auth.json is missing or empty\n' >&2
    return 1
  fi

  OPENAI_API_KEY="${key}"
  export OPENAI_API_KEY
}
|
||||
|
||||
# Blank every proxy variable and exempt all hosts via no_proxy for the
# commands started below.
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'

# RUN_LABEL is interpolated into the paths removed below. A ".." component
# would let the recursive delete escape the .aituner* directories (for example
# RUN_LABEL=../src makes REPORT_ROOT resolve to ./src), so refuse it.
case "${RUN_LABEL:?RUN_LABEL is required}" in
  .. | ../* | */.. | */../*)
    printf 'RUN_LABEL must not contain a ".." path component, got: %s\n' "${RUN_LABEL}" >&2
    exit 1
    ;;
esac

mkdir -p -- "${RUN_CONFIG_ROOT:?}" .aituner .aituner-reports

# Start from a clean slate: drop stores and reports left by an earlier run with
# the same label. The :? guards abort instead of deleting an empty path.
rm -rf -- "${HARNESS_STORE:?}" "${NAIVE_STORE:?}" "${REPORT_ROOT:?}" "${REPORT_SPEC:?}"
|
||||
|
||||
python3 - <<'PY'
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def deep_merge(base: dict[str, Any], patch: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict consisting of ``base`` with ``patch`` overlaid recursively.

    Where both sides hold a dict under the same key, the two dicts are merged
    key by key. In every other case the value from ``patch`` replaces the one
    from ``base`` (lists are replaced, not concatenated).

    Neither argument is modified, and the result shares no nested containers
    with them, so editing the merged spec cannot leak back into its inputs.
    """
    import copy  # local import keeps this function self-contained

    merged = copy.deepcopy(base)
    for key, value in patch.items():
        current = merged.get(key)
        if isinstance(value, dict) and isinstance(current, dict):
            merged[key] = deep_merge(current, value)
        else:
            merged[key] = copy.deepcopy(value)
    return merged
|
||||
|
||||
|
||||
def load_patch(env_name: str) -> dict[str, Any]:
    """Load the JSON object patch whose file path is stored in ``env_name``.

    Returns an empty dict when the variable is unset or empty, meaning that no
    patch was requested.

    Raises:
        SystemExit: when the file cannot be read, does not contain valid JSON,
            or its top-level value is not an object. The message names the
            environment variable so the offending setting is easy to find.
    """
    path = os.environ.get(env_name)
    if not path:
        return {}
    try:
        payload = json.loads(Path(path).read_text(encoding="utf-8"))
    except OSError as exc:
        raise SystemExit(f"{env_name}: cannot read {path}: {exc}") from exc
    except ValueError as exc:
        # Covers json.JSONDecodeError as well as undecodable (non-UTF-8) bytes.
        raise SystemExit(f"{env_name}: {path} is not valid JSON: {exc}") from exc
    if not isinstance(payload, dict):
        raise SystemExit(f"{env_name} must point to a JSON object")
    return payload
|
||||
|
||||
|
||||
def generated_spec(base_path: str, *, use_harness: bool, suffix: str, arm_patch: dict[str, Any]) -> dict[str, Any]:
    """Build the spec for one arm of the harness-vs-naive pair.

    The JSON object at ``base_path`` is overlaid first with the patch named by
    COMMON_SPEC_PATCH_FILE and then with ``arm_patch``, so arm-specific values
    win over common ones. Afterwards two fields are forced:

    * ``study_id`` becomes the spec's own ``study_id`` (or CASE_ID from the
      environment when that is missing or empty) followed by ``-<suffix>``.
    * ``llm.use_harness`` is set to ``use_harness``; other ``llm`` keys are kept.

    Raises:
        SystemExit: when the base file does not hold a JSON object, or when
            the merged ``llm`` value is present but is not an object.
    """
    base = json.loads(Path(base_path).read_text(encoding="utf-8"))
    if not isinstance(base, dict):
        raise SystemExit(f"{base_path} must contain a JSON object")

    common = load_patch("COMMON_SPEC_PATCH_FILE")
    spec = deep_merge(deep_merge(base, common), arm_patch)
    spec["study_id"] = str(spec.get("study_id") or os.environ["CASE_ID"]) + f"-{suffix}"

    # A missing, null or empty "llm" section is treated as an empty object.
    # Any other non-object value is a spec error and is reported as such,
    # instead of being coerced by dict() or failing with a bare ValueError.
    llm_section = spec.get("llm") or {}
    if not isinstance(llm_section, dict):
        raise SystemExit(f'{base_path}: "llm" must be a JSON object')
    spec["llm"] = {**llm_section, "use_harness": use_harness}
    return spec
|
||||
|
||||
|
||||
# Build both arm specs before writing anything, so that a failure while
# generating the second arm leaves no half-written pair on disk.
out_dir = Path(os.environ["RUN_CONFIG_ROOT"])
arm_settings = (
    # (arm name, env var holding the base spec, env var holding the arm patch, use_harness)
    ("harness", "HARNESS_BASE_SPEC", "HARNESS_SPEC_PATCH_FILE", True),
    ("naive", "NAIVE_BASE_SPEC", "NAIVE_SPEC_PATCH_FILE", False),
)
arm_specs = {}
for arm, base_env, patch_env, harness_flag in arm_settings:
    arm_specs[arm] = generated_spec(
        os.environ[base_env],
        use_harness=harness_flag,
        suffix=arm,
        arm_patch=load_patch(patch_env),
    )

for arm, payload in arm_specs.items():
    (out_dir / f"{arm}.json").write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

# Report the resulting study ids as one JSON line on stdout.
study_ids = {f"{arm}_study_id": payload["study_id"] for arm, payload in arm_specs.items()}
print(json.dumps(study_ids, ensure_ascii=False))
|
||||
PY
|
||||
|
||||
#######################################
# Run one tuning arm with `aituner.cli study tune --skip-baseline`.
# Globals:   RUN_LABEL, MAX_TRIALS (read)
# Arguments: $1 - arm name, used in the banners and the log file name
#            $2 - path of the generated spec for this arm
#            $3 - store root for this arm
# Outputs:   start/done banners on stdout; the tuner's stdout and stderr go
#            to .aituner/${RUN_LABEL}-<arm>.log; on failure a pointer to
#            that log is written to stderr
# Returns:   0 on success, otherwise the tuner's exit status
#######################################
run_arm() {
  local arm="$1"
  local spec="$2"
  local store="$3"
  local log=".aituner/${RUN_LABEL}-${arm}.log"
  local status=0

  read_key
  echo "=== ${arm} clean pair start $(date -Is) label=${RUN_LABEL} ==="
  PYTHONPATH=src python3 -m aituner.cli study tune \
    --spec "${spec}" \
    --store-root "${store}" --max-trials "${MAX_TRIALS}" --skip-baseline \
    > "${log}" 2>&1 || status=$?
  if (( status != 0 )); then
    # All tuner output is in the log, so say where to look before bailing out.
    printf '%s tuning failed with exit status %d; see %s\n' "${arm}" "${status}" "${log}" >&2
    return "${status}"
  fi
  echo "=== ${arm} clean pair done $(date -Is) ==="
}

run_arm harness "${HARNESS_SPEC}" "${HARNESS_STORE}"
run_arm naive "${NAIVE_SPEC}" "${NAIVE_STORE}"
|
||||
|
||||
# Write the report spec that points the report tool at both study stores.
python3 - <<'PY'
import json
import os
from pathlib import Path


def env_json_list(name, fallback):
    """Parse environment variable ``name`` as a JSON list.

    Returns ``fallback`` when the variable is unset or empty. Exits with a
    message naming the variable when the text is not JSON or is not a list.
    """
    text = os.environ.get(name) or ""
    if not text:
        return fallback
    try:
        value = json.loads(text)
    except ValueError as exc:
        raise SystemExit(f"{name} is not valid JSON: {exc}") from exc
    # JSON strings and objects are iterable too; without this check "12" would
    # be read as the budgets 1 and 2, and an object would be read by its keys.
    if not isinstance(value, list):
        raise SystemExit(f"{name} must be a JSON list")
    return value


root = Path(os.environ["ROOT"])
run_label = os.environ["RUN_LABEL"]
harness = json.loads(Path(os.environ["HARNESS_SPEC"]).read_text(encoding="utf-8"))
naive = json.loads(Path(os.environ["NAIVE_SPEC"]).read_text(encoding="utf-8"))
max_trials = int(os.environ["MAX_TRIALS"])

raw_budgets = env_json_list("BUDGETS_JSON", [1, 2, 3, 4, 6, 8, max_trials])
try:
    # Keep positive budgets only, de-duplicated and in ascending order.
    budgets = sorted({int(item) for item in raw_budgets if int(item) > 0})
except (TypeError, ValueError) as exc:
    raise SystemExit(f"BUDGETS_JSON entries must be integers: {exc}") from exc
tags = env_json_list("CASE_TAGS_JSON", [])

spec = {
    "report_id": run_label,
    "output_root": str(root / os.environ["REPORT_ROOT"]),
    "target_fraction": 0.95,
    "min_final_ratio": 0.98,
    "cases": [
        {
            "case_id": os.environ["CASE_ID"],
            "description": os.environ["CASE_DESCRIPTION"],
            "tags": tags,
            "budgets": budgets,
            "arms": [
                {
                    "name": "harness",
                    "kind": "harness",
                    "study_root": str(
                        root / os.environ["HARNESS_STORE"] / harness["study_id"]
                    ),
                },
                {
                    "name": "naive",
                    "kind": "naive",
                    "study_root": str(root / os.environ["NAIVE_STORE"] / naive["study_id"]),
                },
            ],
        }
    ],
}
Path(os.environ["REPORT_SPEC"]).write_text(json.dumps(spec, indent=2) + "\n", encoding="utf-8")
PY
|
||||
|
||||
# Run the report script against the report spec written above.
PYTHONPATH=src python3 scripts/tuning_report.py --spec "${REPORT_SPEC}"

# Drop the completion marker; under `set -e` this line is only reached when
# every earlier step has succeeded.
touch ".aituner/${RUN_LABEL}.DONE"

finished_at="$(date -Is)"
printf '%s\n' "=== clean pair report ready ${REPORT_ROOT} ${finished_at} ==="
|
||||
@@ -367,20 +367,41 @@ def cmd_study_tune(args: argparse.Namespace) -> int:
|
||||
proposal_source_label = "harness"
|
||||
else:
|
||||
proposal_source_label = str(proposal_source) if proposal_source else "llm"
|
||||
stop_authorized_by = (
|
||||
"validator"
|
||||
if (is_harness_stop or authorized)
|
||||
else "file_proposal"
|
||||
if proposal_source is not None
|
||||
else "llm_after_veto_budget"
|
||||
)
|
||||
stop_reason = (
|
||||
"harness_stop"
|
||||
if is_harness_stop
|
||||
else "proposal_file_stop"
|
||||
if proposal_source is not None
|
||||
else "llm_stop"
|
||||
)
|
||||
stop_details = {
|
||||
"proposal_name": proposal_name,
|
||||
"proposal_source": proposal_source_label,
|
||||
"stop_authorized_by": stop_authorized_by,
|
||||
}
|
||||
if stop_authority:
|
||||
stop_details["validator_reason"] = stop_authority.get("reason")
|
||||
state.tuning_stop_reason = stop_reason
|
||||
state.tuning_stop_diagnosis = proposal.diagnosis
|
||||
state.tuning_stop_details = stop_details
|
||||
store.save_state(state)
|
||||
executed.append(
|
||||
{
|
||||
"trial_id": None,
|
||||
"proposal_name": proposal_name,
|
||||
"proposal_source": proposal_source_label,
|
||||
"stopped": True,
|
||||
"stop_authorized_by": (
|
||||
"validator"
|
||||
if (is_harness_stop or authorized)
|
||||
else "file_proposal"
|
||||
if proposal_source is not None
|
||||
else "llm_after_veto_budget"
|
||||
),
|
||||
"reason": state.tuning_stop_reason,
|
||||
"stop_authorized_by": stop_authorized_by,
|
||||
"diagnosis": proposal.diagnosis,
|
||||
"details": stop_details,
|
||||
"state_best_trial_id": state.best_trial_id,
|
||||
"state_best_request_rate": state.best_request_rate,
|
||||
}
|
||||
|
||||
@@ -4845,6 +4845,18 @@ class CoreFlowTests(unittest.TestCase):
|
||||
self.assertTrue(proposal_path.exists())
|
||||
proposal = json.loads(proposal_path.read_text(encoding="utf-8"))
|
||||
self.assertTrue(proposal["should_stop"])
|
||||
state = store.load_state(study.study_id)
|
||||
self.assertEqual(state.tuning_stop_reason, "harness_stop")
|
||||
self.assertEqual(
|
||||
state.tuning_stop_details["proposal_name"],
|
||||
"harness-stop-0005",
|
||||
)
|
||||
self.assertEqual(state.tuning_stop_details["proposal_source"], "harness")
|
||||
self.assertEqual(
|
||||
state.tuning_stop_details["stop_authorized_by"],
|
||||
"validator",
|
||||
)
|
||||
self.assertTrue(state.tuning_stop_diagnosis)
|
||||
|
||||
def test_cli_tune_evaluates_baseline_before_llm_proposal(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
|
||||
Reference in New Issue
Block a user