# agentic-pd-hybrid/scripts/analysis/analyze_errors.py

#!/usr/bin/env python3
"""Deep dive into v4 errors: which path, which D, which session, which turn."""
import json
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict

# Directory containing this script; the metrics file paths used further down
# are built relative to it (and to its parent directory).
BASE = Path(__file__).parent

def load_rows(jsonl_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        jsonl_path: Path (str or path-like) of the file to read. Every
            non-blank line must contain exactly one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding so the result does not depend on the platform's
    # locale default.
    with open(jsonl_path, encoding="utf-8") as f:
        # Whitespace-only lines (e.g. a stray blank line at the end of the
        # file) carry no record and would make json.loads raise, so skip them.
        return [json.loads(line) for line in f if line.strip()]

# Compare v3 and v4 errors.
# For every (label, metrics file) pair below, print a breakdown of the rows
# whose "error" field is not None: by finish reason, execution mode, assigned
# decode node, turn bucket, session, and position within the run's time span.
for label, path in [
    ("v3 1P7D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp1_1p7d_kvc_kvaware_metrics.jsonl"),
    ("v4 1P7D", BASE / "exp1_1p7d_kvc_cap16_metrics.jsonl"),
    ("v3 2P6D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp2_2p6d_kvc_kvaware_metrics.jsonl"),
    ("v4 2P6D", BASE / "exp2_2p6d_kvc_cap16_metrics.jsonl"),
]:
    if not path.exists():
        print(f"\nSKIP {label}: {path} not found")
        continue
    rows = load_rows(path)
    if not rows:
        # A metrics file with no records would make the error-rate ratio
        # below divide by zero; there is nothing to report for it anyway.
        print(f"\nSKIP {label}: {path} has no rows")
        continue
    err = [r for r in rows if r.get("error") is not None]
    print(f"\n========== {label} ({len(err)} errors / {len(rows)} total = {len(err)/len(rows)*100:.1f}%) ==========")

    # Error finish_reason distribution. Falls back to the error value, then
    # "?", when finish_reason is missing/falsy; keys are truncated to 80
    # characters so long messages still group and print compactly.
    fr_counter = Counter()
    for r in err:
        fr = str(r.get("finish_reason") or r.get("error") or "?")
        fr_counter[fr[:80]] += 1
    print("finish_reason distribution:")
    for fr, cnt in fr_counter.most_common():
        print(f"  {cnt:>4}x  {fr}")

    # Errors by execution mode (these are aborted before mode assignment usually)
    mode_counter = Counter(r.get("execution_mode", "?") for r in err)
    print("\nerror by execution_mode:")
    for mode, cnt in mode_counter.most_common():
        print(f"  {cnt:>4}x  {mode}")

    # Errors per D worker
    dw_counter = Counter(r.get("assigned_decode_node", "?") for r in err)
    print("\nerror per assigned_decode_node:")
    for dw, cnt in dw_counter.most_common():
        print(f"  {cnt:>4}x  {dw}")

    # Errors by turn distribution. Rows without a turn_id default to -1 and
    # therefore land in the "early" bucket.
    turn_counter = Counter(r.get("turn_id", -1) for r in err)
    early = sum(c for t, c in turn_counter.items() if t <= 5)
    mid = sum(c for t, c in turn_counter.items() if 5 < t <= 30)
    late = sum(c for t, c in turn_counter.items() if t > 30)
    print(f"\nerror by turn: early(0-5)={early} mid(6-30)={mid} late(31+)={late}")

    # Per-session error rate: count total rows and error rows per session_id,
    # then list the sessions with the most errors first.
    per_sess_err = defaultdict(int)
    per_sess_total = defaultdict(int)
    for r in rows:
        per_sess_total[r["session_id"]] += 1
        if r.get("error") is not None:
            per_sess_err[r["session_id"]] += 1
    sess_with_err = [(sid, per_sess_err[sid], per_sess_total[sid]) for sid in per_sess_err]
    sess_with_err.sort(key=lambda x: -x[1])
    print("\ntop 5 sessions by error count:")
    for sid, e, t in sess_with_err[:5]:
        print(f"  session {sid}: {e}/{t} errors ({e/t*100:.0f}%)")

    # Errors timeline: where in the run (as a percentage of the span between
    # the first and last trace timestamp of all rows) do the first and last
    # errors fall? Only the extremes are needed, so min/max replace sorting.
    err_ts = [r.get("trace_timestamp_s", 0) for r in err]
    if err_ts:
        all_ts = [r.get("trace_timestamp_s", 0) for r in rows]
        first_all = min(all_ts)
        run_duration = max(all_ts) - first_all
        if run_duration > 0:
            err_first_pct = (min(err_ts) - first_all) / run_duration * 100
            err_last_pct = (max(err_ts) - first_all) / run_duration * 100
        else:
            # All rows share one timestamp; report 0% rather than divide by zero.
            err_first_pct = err_last_pct = 0
        print(f"\nerror time range (% of run): {err_first_pct:.1f}% - {err_last_pct:.1f}%")