Harden trial measurement accounting

2026-05-06 21:18:09 +08:00
parent 871c4cfc02
commit c1ff64381d
8 changed files with 366 additions and 16 deletions
--- a/scripts/run_multi_compare.py
+++ b/scripts/run_multi_compare.py
@@ -372,6 +372,7 @@ def _aggregate(rows: list[dict[str, Any]], candidates: list[MultiCompareCandidat
            if rates_per_gpu
            else None,
            "mean_pass_rate": (sum(pass_rates) / len(pass_rates)) if pass_rates else None,
+            **_candidate_result_counts(rows, name),
        }
    for row in rows:
        wins[row["winner"]] = wins.get(row["winner"], 0) + 1
@@ -382,6 +383,26 @@ def _aggregate(rows: list[dict[str, Any]], candidates: list[MultiCompareCandidat
    }


+def _candidate_result_counts(rows: list[dict[str, Any]], name: str) -> dict[str, int]:
+    counts = {
+        "completed_window_count": 0,
+        "failed_window_count": 0,
+        "no_feasible_window_count": 0,
+    }
+    for row in rows:
+        result = row.get("candidates", {}).get(name)
+        if not isinstance(result, dict):
+            continue
+        status = str(result.get("status") or "")
+        if status == "completed":
+            counts["completed_window_count"] += 1
+        elif status == "failed":
+            counts["failed_window_count"] += 1
+        if not isinstance(result.get("best_request_rate_per_gpu"), (int, float)):
+            counts["no_feasible_window_count"] += 1
+    return counts
+
+
 def _render_report(summary: dict[str, Any], candidates: list[MultiCompareCandidate]) -> str:
    candidate_names = [item.name for item in candidates]
    lines = [
@@ -413,6 +434,9 @@ def _render_report(summary: dict[str, Any], candidates: list[MultiCompareCandida
        lines.append(
            f"- `{name}` mean req/s=`{aggregate['mean_request_rate']}`, mean req/s/gpu=`{aggregate['mean_request_rate_per_gpu']}`, mean pass_rate=`{aggregate['mean_pass_rate']}`"
        )
+        lines.append(
+            f"  completed/failed/no-feasible windows=`{aggregate['completed_window_count']}`/`{aggregate['failed_window_count']}`/`{aggregate['no_feasible_window_count']}`"
+        )
    header = ["Window", "Date"]
    for name in candidate_names:
        header.extend([f"{name} req/s", f"{name} req/s/gpu"])