Add clean dash1 harness ablation runner

2026-06-21 00:51:08 +08:00
parent 488fae7e63
commit d23b69219b
1 changed files with 81 additions and 0 deletions
--- a/scripts/run_clean_ablation_pair_d1.sh
+++ b/scripts/run_clean_ablation_pair_d1.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# Same-policy harness-vs-naive ablation pair, run clean on dash1.
+#
+# Meant to be the first robustness gate for harness evaluation. Both arms share the
+# study substrate and the configured LLM endpoint, so llm.use_harness should be the
+# only thing that differs between them.
+set -euo pipefail
+
+RUN_LABEL="${AITUNER_RUN_ID:-qwen27b-clean-pair-$(date -u +%Y%m%dT%H%M%SZ)}"  # env override, else UTC-stamped
+: "${MAX_TRIALS:=12}"  # trial budget handed to each arm; the environment may override it
+ROOT="$(pwd)"  # launch directory; the report spec anchors its paths here
+HARNESS_STORE=".aituner/${RUN_LABEL}-harness"  # store root for the harness arm
+NAIVE_STORE=".aituner/${RUN_LABEL}-naive"  # store root for the naive arm
+REPORT_ROOT=".aituner-reports/${RUN_LABEL}"  # becomes output_root in the report spec
+SPEC_PATH=".aituner-reports/${RUN_LABEL}.spec.json"  # where the generated report spec is written
+
+read_key() {
+  if [ -z "${OPENAI_API_KEY:-}" ]; then
+    export OPENAI_API_KEY
+    OPENAI_API_KEY="$(python3 -c 'import json,pathlib;print(json.load(open(pathlib.Path.home()/".codex/auth.json"))["OPENAI_API_KEY"])')"
+  fi
+}
+
+export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'
+mkdir -p .aituner .aituner-reports
+rm -rf "${HARNESS_STORE}" "${NAIVE_STORE}" "${REPORT_ROOT}" "${SPEC_PATH}"
+
+read_key
+echo "=== harness ON clean pair start $(date -Is) label=${RUN_LABEL} ==="
+PYTHONPATH=src python3 -m aituner.cli study tune \
+  --spec configs/examples/dash0_qwen27b_ablation_harness_on.json \
+  --store-root "${HARNESS_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
+  > ".aituner/${RUN_LABEL}-harness.log" 2>&1
+echo "=== harness ON clean pair done  $(date -Is) ==="
+
+read_key
+echo "=== naive OFF clean pair start $(date -Is) label=${RUN_LABEL} ==="
+PYTHONPATH=src python3 -m aituner.cli study tune \
+  --spec configs/examples/dash0_qwen27b_ablation_naive_off.json \
+  --store-root "${NAIVE_STORE}" --max-trials "${MAX_TRIALS}" --skip-baseline \
+  > ".aituner/${RUN_LABEL}-naive.log" 2>&1
+echo "=== naive OFF clean pair done  $(date -Is) ==="
+
+# Build the report spec for scripts/tuning_report.py. Shell values reach Python through
+# the environment and the heredoc is quoted, so a quote or backslash in the working
+# directory or run label can no longer corrupt the generated Python source.
+ROOT="${ROOT}" RUN_LABEL="${RUN_LABEL}" REPORT_ROOT="${REPORT_ROOT}" SPEC_PATH="${SPEC_PATH}" \
+HARNESS_STORE="${HARNESS_STORE}" NAIVE_STORE="${NAIVE_STORE}" python3 - <<'PY'
+import json
+import os
+from pathlib import Path
+
+env = os.environ
+root = Path(env["ROOT"])
+# Study roots are <store root>/<study name>; names presumably match the two tune specs - confirm.
+harness_root = root / env["HARNESS_STORE"] / "dash0-qwen27b-ablation-harness-on"
+naive_root = root / env["NAIVE_STORE"] / "dash0-qwen27b-ablation-naive-off"
+spec = {
+    "report_id": env["RUN_LABEL"],
+    "output_root": str(root / env["REPORT_ROOT"]),
+    "target_fraction": 0.95,
+    "min_final_ratio": 0.98,
+    "cases": [
+        {
+            "case_id": "qwen27b-chat-0-8k-clean-gpt55",
+            "description": "Clean same-policy gpt-5.5 harness-vs-naive pair on dash1.",
+            "tags": ["qwen27b", "chat", "0-8k", "h20", "clean-pair", "gpt-5.5"],
+            "budgets": [1, 2, 3, 4, 6, 8, 12],
+            "arms": [
+                {"name": "harness", "kind": "harness", "study_root": str(harness_root)},
+                {"name": "naive", "kind": "naive", "study_root": str(naive_root)},
+            ],
+        }
+    ],
+}
+Path(env["SPEC_PATH"]).write_text(json.dumps(spec, indent=2) + "\n", encoding="utf-8")
+PY
+
+PYTHONPATH=src python3 scripts/tuning_report.py --spec "${SPEC_PATH}"  # run the report script on the generated spec
+touch ".aituner/${RUN_LABEL}.DONE"  # completion marker; set -e means it is only reached if nothing above failed
+printf '%s\n' "=== clean pair report ready ${REPORT_ROOT} $(date -Is) ==="