Scale ablation early-stop caps to the compressed window (scale=0.2)

At replay_time_scale=0.2 the 600s arrival window compresses to 120s, so the inherited 900s wall-clock elapsed cap let overloaded TP1 probes burn ~15min each (the tractability hazard the brief flagged). Scale the caps proportionately to the time axis: early_stop_max_elapsed_s 900->180, early_stop_max_lag_s 120->30. Feasible probes (~120s arrival + drain) finish well inside 180s; overloaded probes die in ~3min. Both configs still differ only in use_harness + study_id. Adds the ablation doc skeleton and a read-only trajectory-extraction helper. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 19:49:57 +08:00
parent a16016a876
commit d975e57bb5
4 changed files with 137 additions and 4 deletions
--- a/scripts/ablation_trajectory.py
+++ b/scripts/ablation_trajectory.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""Extract a per-iteration trajectory table from an ablation study store.
+
+Usage: python3 ablation_trajectory.py <study_store_dir>
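+Prints a study summary (best trial, stop reason/diagnosis/details), then one
+row per trial: iter, trial id, status, per_gpu, running incumbent per_gpu,
+and a config_patch summary. If <study_store_dir>/proposals exists, each
+proposal file's should_stop flag and config_patch summary are listed as well.
+Read-only.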
+"""
+import json
+import sys
+from pathlib import Path
+
+
+def topo(patch):
+    """Summarise a config patch as a single display string.
+
+    ``patch`` may be ``None`` or a mapping with optional ``flag_patch`` and
+    ``env_patch`` entries. The parallelism flags are rendered first as
+    ``TP<n>+DP<n>+EP<n>`` (or ``baseline-topo`` when none are present); any
+    remaining flags and ``env:``-prefixed environment entries follow after
+    `` | `` as comma-separated ``key=value`` pairs.
+    """
+    source = patch or {}
+    flags = source.get("flag_patch", {}) or {}
+    env = source.get("env_patch", {}) or {}
+
+    # Insertion order of this mapping fixes the TP/DP/EP display order.
+    labels = {
+        "tensor-parallel-size": "TP",
+        "data-parallel-size": "DP",
+        "expert-parallel-size": "EP",
+    }
+    topology = [f"{label}{flags[key]}" for key, label in labels.items() if key in flags]
+
+    # Merge into one dict so an env entry replaces a same-named flag entry
+    # while keeping that entry's original position.
+    extras = {
+        **{key: value for key, value in flags.items() if key not in labels},
+        **{f"env:{key}": value for key, value in env.items()},
+    }
+
+    summary = "+".join(topology) if topology else "baseline-topo"
+    if not extras:
+        return summary
+    details = ", ".join(f"{key}={value}" for key, value in extras.items())
+    return summary + " | " + details
+
+
+def main():
+    """Print a trajectory report for the study store named on the command line.
+
+    Reads ``state.json`` from the directory given as ``sys.argv[1]`` and prints
+    the study header fields followed by one table row per entry in ``trials``.
+    The ``incumbent`` column is the largest ``best_request_rate_per_gpu`` seen
+    so far. If a ``proposals`` subdirectory exists, every ``*.json`` file in it
+    is summarised in sorted-filename order. Nothing is written to the store.
+
+    Exits with status 2 and a usage message on stderr when no directory
+    argument is supplied.
+    """
+    if len(sys.argv) < 2:
+        print("usage: ablation_trajectory.py <study_store_dir>", file=sys.stderr)
+        sys.exit(2)
+    store = Path(sys.argv[1])
+    # Context manager + explicit UTF-8: the handle is closed promptly and
+    # decoding does not depend on the platform's locale encoding.
+    with open(store / "state.json", encoding="utf-8") as fh:
+        state = json.load(fh)
+    print(f"study_id: {state.get('study_id')}")
+    print(
+        f"best_trial: {state.get('best_trial_id')}  "
+        f"best_per_gpu: {state.get('best_request_rate_per_gpu')}"
+    )
+    print(f"stop_reason: {state.get('tuning_stop_reason')!r}")
+    print(f"stop_diagnosis: {state.get('tuning_stop_diagnosis')!r}")
+    print(f"stop_details: {json.dumps(state.get('tuning_stop_details'), ensure_ascii=False)}")
+    print()
+    incumbent = None
+    hdr = f"{'iter':<5}{'trial':<11}{'status':<14}{'per_gpu':<10}{'incumbent':<11}config"
+    print(hdr)
+    print("-" * len(hdr))
+    # `or []` tolerates an explicit null "trials" value in state.json.
+    for i, t in enumerate(state.get("trials") or [], 1):
+        pg = t.get("best_request_rate_per_gpu")
+        if pg is not None and (incumbent is None or pg > incumbent):
+            incumbent = pg
+        pgs = f"{pg:.4f}" if isinstance(pg, (int, float)) else str(pg)
+        incs = f"{incumbent:.4f}" if isinstance(incumbent, (int, float)) else str(incumbent)
+        # str() first: a null trial_id would otherwise raise TypeError when
+        # the "<11" width spec is applied to None.
+        trial_id = str(t.get("trial_id", ""))
+        status = str(t.get("status", ""))
+        print(
+            f"{i:<5}{trial_id:<11}{status:<14}{pgs:<10}{incs:<11}{topo(t.get('config_patch'))}"
+        )
+    # also dump proposals dir to see what was *proposed* (incl. vetoed/failed)
+    pdir = store / "proposals"
+    if pdir.exists():
+        # NOTE(review): files are ordered by name; this is chronological only
+        # if the proposal filenames sort by creation time - confirm.
+        print("\n-- proposal files (chronological) --")
+        for p in sorted(pdir.glob("*.json")):
+            try:
+                with open(p, encoding="utf-8") as fh:
+                    pr = json.load(fh)
+            except (OSError, ValueError) as exc:
+                # Best-effort listing: keep going, but say what was skipped
+                # (json.JSONDecodeError and UnicodeDecodeError are ValueErrors).
+                print(f"  {p.stem}: skipped ({exc})", file=sys.stderr)
+                continue
+            if not isinstance(pr, dict):
+                print(f"  {p.stem}: skipped (not a JSON object)", file=sys.stderr)
+                continue
+            print(f"  {p.stem}: should_stop={pr.get('should_stop')} | {topo(pr.get('config_patch'))}")
+
+
+if __name__ == "__main__":  # run the report only when executed as a script, not on import
+    main()