#!/usr/bin/env python3
"""V4 results analysis: errors, execution modes, latency by mode."""
import json
from collections import Counter
from pathlib import Path

import numpy as np

BASE = Path(__file__).parent

# (label, metrics file) pairs analysed when the file is run as a script.
EXPERIMENTS = [
    ("Exp1 1P7D cap=16", BASE / "exp1_1p7d_kvc_cap16_metrics.jsonl"),
    ("Exp2 2P6D cap=16", BASE / "exp2_2p6d_kvc_cap16_metrics.jsonl"),
]


def load_rows(jsonl_path):
    """Return the JSON objects stored one per line in *jsonl_path*.

    Blank and whitespace-only lines are skipped so that a trailing newline
    or an accidental empty line does not abort the whole analysis.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(jsonl_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _print_error_summary(err):
    """Print the five most frequent failure reasons among *err* rows."""
    finish_reasons = Counter()
    for r in err:
        # Prefer the finish reason, fall back to the raw error text.
        fr = str(r.get("finish_reason") or r.get("error") or "?")
        # Truncate long messages so near-identical errors stay readable.
        finish_reasons[fr[:120]] += 1
    print("\nError finish_reasons (top 5):")
    for fr, cnt in finish_reasons.most_common(5):
        print(f" {cnt}x: {fr}")


def _print_mode_latency(ok):
    """Print latency / TTFT percentiles for the 8 most common execution modes.

    Every row in *ok* must carry ``execution_mode``, ``latency_s`` and
    ``ttft_s`` keys; a missing key raises ``KeyError``.
    """
    # Group once instead of re-scanning ``ok`` for every mode.
    rows_by_mode = {}
    for r in ok:
        rows_by_mode.setdefault(r["execution_mode"], []).append(r)
    # Insertion order is preserved, so ties rank exactly as when counting
    # the rows directly.
    modes = Counter({mode: len(rs) for mode, rs in rows_by_mode.items()})

    print("\nTop execution modes by latency:")
    print(f"{'mode':<55}{'n':<8}{'%':<8}{'P50 lat':<10}{'P90 lat':<10}{'TTFT P50':<10}")
    for mode, count in modes.most_common(8):
        mode_rows = rows_by_mode[mode]
        lats = [r["latency_s"] for r in mode_rows]
        ttfts = [r["ttft_s"] for r in mode_rows]
        print(f" {mode:<53}{count:<8}{count/len(ok)*100:>5.1f}% {np.percentile(lats,50):>7.3f}s {np.percentile(lats,90):>7.3f}s {np.percentile(ttfts,50):>7.3f}s")


def _print_decode_load(ok):
    """Print how evenly successful requests were spread over decode nodes."""
    per_d = Counter(r.get("assigned_decode_node", "?") for r in ok)
    if not per_d:
        # max()/min() of an empty sequence raise ValueError; report instead.
        print("\nPer-D load: n/a (no successful requests)")
        return
    print(f"\nPer-D load: max/min ratio = {max(per_d.values())/max(min(per_d.values()),1):.2f}x")
    print(f" {dict(per_d.most_common())}")


def analyze_rows(name, rows):
    """Print the full report for one experiment.

    Args:
        name: label printed in the section banner.
        rows: list of per-request dicts as returned by :func:`load_rows`.
            A row counts as successful when its ``error`` value is absent
            or ``None``.
    """
    print(f"\n========== {name} ==========")
    ok = [r for r in rows if r.get("error") is None]
    err = [r for r in rows if r.get("error") is not None]
    print(f"Total: {len(rows)}, OK: {len(ok)}, Errors: {len(err)}")

    if err:
        _print_error_summary(err)
    _print_mode_latency(ok)
    _print_decode_load(ok)


def main():
    """Analyse every experiment listed in ``EXPERIMENTS``."""
    for name, path in EXPERIMENTS:
        analyze_rows(name, load_rows(path))


if __name__ == "__main__":
    main()