#!/usr/bin/env python3
"""Deep dive into v4 errors: which path, which D, which session, which turn."""
import json
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np  # noqa: F401  (not referenced in this script; import kept)

BASE = Path(__file__).parent

# (label, metrics JSONL path) pairs that are reported in order: v3 vs v4.
RUNS = [
    ("v3 1P7D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp1_1p7d_kvc_kvaware_metrics.jsonl"),
    ("v4 1P7D", BASE / "exp1_1p7d_kvc_cap16_metrics.jsonl"),
    ("v3 2P6D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp2_2p6d_kvc_kvaware_metrics.jsonl"),
    ("v4 2P6D", BASE / "exp2_2p6d_kvc_cap16_metrics.jsonl"),
]


def load_rows(jsonl_path):
    """Parse a JSON-lines file into a list with one decoded object per line.

    Whitespace-only lines (for example a trailing blank line) are skipped
    instead of being handed to ``json.loads``, which would raise on them.

    Args:
        jsonl_path: Path of the ``.jsonl`` file to read.

    Returns:
        The decoded JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _value_or(row, key, fallback):
    """Return ``row[key]``, or *fallback* when the key is absent or None."""
    value = row.get(key)
    return fallback if value is None else value


def _print_counts(title, counter):
    """Print *title*, then one right-aligned ``count x key`` line per entry."""
    print(title)
    for key, cnt in counter.most_common():
        print(f" {cnt:>4}x {key}")


def report_errors(label, rows):
    """Print the error breakdown for one run.

    A row counts as an error when its ``"error"`` field is present and not
    None. The report covers: finish reason, execution mode, assigned decode
    node, turn bucket, the five sessions with the most errors, and where in
    the run (as a percentage of its duration) the errors occurred.

    Args:
        label: Human-readable run name used in the header line.
        rows: List of metric dicts, e.g. as returned by ``load_rows``.
    """
    err = [r for r in rows if r.get("error") is not None]
    # An empty metrics file would otherwise divide by zero here.
    err_pct = len(err) / len(rows) * 100 if rows else 0.0
    print(f"\n========== {label} ({len(err)} errors / {len(rows)} total = {err_pct:.1f}%) ==========")

    # Error finish_reason distribution; falls back to the error text and is
    # truncated to 80 characters so long messages group and print compactly.
    fr_counter = Counter(
        str(r.get("finish_reason") or r.get("error") or "?")[:80] for r in err
    )
    _print_counts("finish_reason distribution:", fr_counter)

    # Errors by execution mode (these are aborted before mode assignment usually)
    _print_counts(
        "\nerror by execution_mode:",
        Counter(r.get("execution_mode", "?") for r in err),
    )

    # Errors per D worker
    _print_counts(
        "\nerror per assigned_decode_node:",
        Counter(r.get("assigned_decode_node", "?") for r in err),
    )

    # Errors by turn distribution. A missing or null turn_id is treated as -1,
    # which lands in the "early" bucket.
    early = mid = late = 0
    for r in err:
        turn = _value_or(r, "turn_id", -1)
        if turn <= 5:
            early += 1
        elif turn <= 30:
            mid += 1
        else:
            late += 1
    print(f"\nerror by turn: early(0-5)={early} mid(6-30)={mid} late(31+)={late}")

    # Per-session error rate. Rows without a session_id are grouped under "?"
    # (same placeholder the other breakdowns use) instead of raising KeyError.
    per_sess_total = defaultdict(int)
    per_sess_err = defaultdict(int)
    for r in rows:
        sid = r.get("session_id", "?")
        per_sess_total[sid] += 1
        if r.get("error") is not None:
            per_sess_err[sid] += 1
    # Stable sort: sessions with equal error counts keep first-seen order.
    ranked = sorted(per_sess_err.items(), key=lambda item: -item[1])
    print("\ntop 5 sessions by error count:")
    for sid, n_err in ranked[:5]:
        n_total = per_sess_total[sid]
        print(f" session {sid}: {n_err}/{n_total} errors ({n_err/n_total*100:.0f}%)")

    # Errors timeline: are they bursty? Missing or null timestamps count as 0.
    err_ts = [_value_or(r, "trace_timestamp_s", 0) for r in err]
    if err_ts:
        all_ts = [_value_or(r, "trace_timestamp_s", 0) for r in rows]
        run_start = min(all_ts)
        run_duration = max(all_ts) - run_start
        if run_duration > 0:
            err_first_pct = (min(err_ts) - run_start) / run_duration * 100
            err_last_pct = (max(err_ts) - run_start) / run_duration * 100
        else:
            # All rows share one timestamp; report 0% rather than divide by 0.
            err_first_pct = err_last_pct = 0
        print(f"\nerror time range (% of run): {err_first_pct:.1f}% - {err_last_pct:.1f}%")


def main():
    """Report errors for every run in ``RUNS``, skipping missing files."""
    # Compare v3 and v4 errors
    for label, path in RUNS:
        if not path.exists():
            print(f"\nSKIP {label}: {path} not found")
            continue
        report_errors(label, load_rows(path))


if __name__ == "__main__":
    main()