#!/usr/bin/env bash
# Clean same-policy harness-vs-naive ablation on dash1.
#
# This is intended as the first robustness gate for harness evaluation:
# both arms use the same study substrate and the same configured LLM endpoint;
# the only intended difference is llm.use_harness.
set -euo pipefail

#######################################
# Reject run labels that are unsafe to embed in store/report paths.
# The label becomes part of paths that are later removed with `rm -rf`,
# so it must stay a single path component.
# Arguments: $1 - candidate run label
# Outputs:   diagnostic on stderr when the label is rejected
# Returns:   0 if the label is acceptable, 1 otherwise
#######################################
validate_run_label() {
  local label="$1"
  if [[ -z "${label}" || "${label}" == */* ]]; then
    printf 'invalid run label %q: must be non-empty and must not contain "/"\n' \
      "${label}" >&2
    return 1
  fi
}

# Run label: AITUNER_RUN_ID when set and non-empty, otherwise a UTC-timestamped
# default.
RUN_LABEL="${AITUNER_RUN_ID:-qwen27b-clean-pair-$(date -u +%Y%m%dT%H%M%SZ)}"
validate_run_label "${RUN_LABEL}"

# Trial budget handed to each arm via --max-trials (default 12).
MAX_TRIALS="${MAX_TRIALS:-12}"

# Directory the script was launched from; every other path below is relative
# to it.
ROOT="$(pwd)"

# Per-arm store roots, the report output directory and the generated report
# spec, all keyed by the run label.
HARNESS_STORE=".aituner/${RUN_LABEL}-harness"
NAIVE_STORE=".aituner/${RUN_LABEL}-naive"
REPORT_ROOT=".aituner-reports/${RUN_LABEL}"
SPEC_PATH=".aituner-reports/${RUN_LABEL}.spec.json"

#######################################
# Ensure OPENAI_API_KEY is exported for child processes.
# When the variable is already set and non-empty it is left untouched;
# otherwise the key is read from ~/.codex/auth.json (field "OPENAI_API_KEY").
# Globals:   OPENAI_API_KEY (read, possibly written and exported)
# Arguments: none
# Outputs:   diagnostic on stderr when the key cannot be obtained
# Returns:   0 on success, 1 if the auth file is unreadable or holds no
#            non-empty string key
#######################################
read_key() {
  if [[ -n "${OPENAI_API_KEY:-}" ]]; then
    return 0
  fi

  local key
  # The Python helper resolves the home directory itself and exits non-zero
  # with a message instead of printing "None" or a traceback.
  if ! key="$(python3 -c '
import json, pathlib, sys

auth_path = pathlib.Path.home() / ".codex" / "auth.json"
try:
    with auth_path.open(encoding="utf-8") as fh:
        value = json.load(fh).get("OPENAI_API_KEY")
except (OSError, ValueError, AttributeError) as exc:
    sys.exit(f"read_key: cannot load {auth_path}: {exc}")
if not isinstance(value, str) or not value:
    sys.exit(f"read_key: no OPENAI_API_KEY string in {auth_path}")
print(value)
')"; then
    return 1
  fi

  export OPENAI_API_KEY="${key}"
}

# Blank out every proxy variable and set no_proxy='*' for all child processes.
export http_proxy= https_proxy= all_proxy= HTTP_PROXY= HTTPS_PROXY= ALL_PROXY= no_proxy='*'

# Make sure the store and report parents exist, then drop every artifact a
# previous run with the same label may have left behind. The logs and the
# .DONE marker are removed as well, so a failed re-run cannot be mistaken
# for a completed one.
mkdir -p .aituner .aituner-reports
rm -rf -- \
  "${HARNESS_STORE}" "${NAIVE_STORE}" "${REPORT_ROOT}" "${SPEC_PATH}" \
  ".aituner/${RUN_LABEL}-harness.log" ".aituner/${RUN_LABEL}-naive.log" \
  ".aituner/${RUN_LABEL}.DONE"

#######################################
# Run one arm of the clean pair through `aituner.cli study tune`.
# All tuner output (stdout and stderr) goes to the arm's log file; only the
# start/done/failed banners are printed by this script.
# Globals:   RUN_LABEL (read), MAX_TRIALS (read)
# Arguments: $1 - banner text for the arm (e.g. "harness ON")
#            $2 - study spec JSON passed to --spec
#            $3 - store root passed to --store-root
#            $4 - log file receiving the tuner's output
# Outputs:   start/done banners on stdout; failure banner on stderr
# Returns:   0 on success, otherwise the tuner's exit status
#######################################
run_arm() {
  local banner="$1" spec_file="$2" store_root="$3" log_file="$4"
  local status=0

  read_key
  echo "=== ${banner} clean pair start $(date -Is) label=${RUN_LABEL} ==="
  PYTHONPATH=src python3 -m aituner.cli study tune \
    --spec "${spec_file}" \
    --store-root "${store_root}" --max-trials "${MAX_TRIALS}" --skip-baseline \
    > "${log_file}" 2>&1 || status=$?
  if ((status != 0)); then
    # The tuner's output is hidden in the log, so point the operator at it
    # instead of exiting silently.
    echo "=== ${banner} clean pair FAILED (exit ${status}) $(date -Is); see ${log_file} ===" >&2
    return "${status}"
  fi
  echo "=== ${banner} clean pair done $(date -Is) ==="
}

run_arm "harness ON" configs/examples/dash0_qwen27b_ablation_harness_on.json \
  "${HARNESS_STORE}" ".aituner/${RUN_LABEL}-harness.log"

run_arm "naive OFF" configs/examples/dash0_qwen27b_ablation_naive_off.json \
  "${NAIVE_STORE}" ".aituner/${RUN_LABEL}-naive.log"

# Write the report spec JSON describing both arms of this run.
# Values are passed as arguments and the heredoc delimiter is quoted, so the
# shell never splices paths or labels into Python source text.
python3 - "${ROOT}" "${RUN_LABEL}" "${HARNESS_STORE}" "${NAIVE_STORE}" \
  "${REPORT_ROOT}" "${SPEC_PATH}" <<'PY'
import json
import sys
from pathlib import Path

(root_arg, run_label, harness_store,
 naive_store, report_root, spec_path) = sys.argv[1:7]

root = Path(root_arg)

spec = {
    "report_id": run_label,
    "output_root": str(root / report_root),
    "target_fraction": 0.95,
    "min_final_ratio": 0.98,
    "cases": [
        {
            "case_id": "qwen27b-chat-0-8k-clean-gpt55",
            "description": "Clean same-policy gpt-5.5 harness-vs-naive pair on dash1.",
            "tags": ["qwen27b", "chat", "0-8k", "h20", "clean-pair", "gpt-5.5"],
            "budgets": [1, 2, 3, 4, 6, 8, 12],
            "arms": [
                {
                    "name": "harness",
                    "kind": "harness",
                    # Study directory inside the harness store root.
                    "study_root": str(root / harness_store / "dash0-qwen27b-ablation-harness-on"),
                },
                {
                    "name": "naive",
                    "kind": "naive",
                    # Study directory inside the naive store root.
                    "study_root": str(root / naive_store / "dash0-qwen27b-ablation-naive-off"),
                },
            ],
        }
    ],
}

Path(spec_path).write_text(json.dumps(spec, indent=2) + "\n", encoding="utf-8")
PY

# Render the report from the generated spec; with errexit active, the marker
# and the closing banner are only reached when the report script succeeds.
PYTHONPATH=src python3 scripts/tuning_report.py \
  --spec "${SPEC_PATH}"

# Leave a completion marker next to the stores, then announce the report path.
done_marker=".aituner/${RUN_LABEL}.DONE"
touch "${done_marker}"
printf '=== clean pair report ready %s %s ===\n' "${REPORT_ROOT}" "$(date -Is)"