Files
agentic-pd-hybrid/scripts/analysis/analyze_errors.py
kzlin 74194e660a docs: v4 final results, error analysis, and updated journey
Add v4 sweep results and post-mortem analysis showing:

- direct-to-D path: 54.3% (1P7D) / 58.0% (2P6D) of requests now use
  KVC cleanly. P50=0.5s and TTFT P50=0.043s; this path beats baseline
  8DP across the board (P50 -24%, TTFT P50 -54%, TTFT P90 -79%).

- Overall vs baseline (errors+truncated excluded):
  v4 2P6D P50=0.85s vs baseline 0.66s (28% slower).
  Reason is not errors -- 35% of requests still hit
  fallback-large-append-session-cap, where capacity-based
  cap = usable_tokens / target_tokens evaluates to 1-2 (not 16)
  for large agentic inputs.

- 9-10% errors on KVC variants are mooncake TCP transfer timeouts,
  not SGLang logic bugs. Prefill log shows
  "Failed to send kv chunk ... 32s timeout ... session not alive".
  Errors concentrate in turn>=31 (large inputs) and only appear after 44.8% of the run has elapsed.

Track:
- docs/KVC_DEBUG_JOURNEY_V1_TO_V4.md: append v4 results table,
  per-mode breakdown, and error root cause.
- scripts/analysis/{analyze_v3,analyze_v4,analyze_errors,compare_no_error}.py
- outputs/qwen3-30b-tp1-v{3,4}*/exp*_summary.json (force-added,
  small JSON; metrics.jsonl excluded due to size).
- outputs/qwen3-30b-tp1-v{3,4}*/sweep_results.txt

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 23:34:01 +08:00

84 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""Deep dive into v4 errors: which path, which D, which session, which turn."""
import json
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict
BASE = Path(__file__).parent
def load_rows(jsonl_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        jsonl_path: Path (str or path-like) of the file to read. Every
            non-blank line must contain exactly one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty (or whitespace-only) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, so the same file decodes identically on every machine.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line at EOF) carry no record
            # and would make json.loads raise, so they are skipped.
            if line.strip():
                rows.append(json.loads(line))
    return rows
# Compare v3 and v4 errors.
# Each entry is (display label, metrics JSONL path); paths are built from BASE,
# the directory containing this script. Missing files are skipped, not fatal.
for label, path in [
    ("v3 1P7D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp1_1p7d_kvc_kvaware_metrics.jsonl"),
    ("v4 1P7D", BASE / "exp1_1p7d_kvc_cap16_metrics.jsonl"),
    ("v3 2P6D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp2_2p6d_kvc_kvaware_metrics.jsonl"),
    ("v4 2P6D", BASE / "exp2_2p6d_kvc_cap16_metrics.jsonl"),
]:
    if not path.exists():
        print(f"\nSKIP {label}: {path} not found")
        continue

    rows = load_rows(path)
    if not rows:
        # A file with no records would make the error-rate division in the
        # header below raise ZeroDivisionError; report it and move on.
        print(f"\nSKIP {label}: {path} has no rows")
        continue

    # A request counts as an error when its "error" field is present and not None.
    err = [r for r in rows if r.get("error") is not None]
    print(f"\n========== {label} ({len(err)} errors / {len(rows)} total = {len(err)/len(rows)*100:.1f}%) ==========")

    # Error finish_reason distribution. Falls back to the error text (then "?")
    # when finish_reason is empty; keys are cut to 80 chars so long messages
    # group together and print compactly.
    fr_counter = Counter(
        str(r.get("finish_reason") or r.get("error") or "?")[:80] for r in err
    )
    print("finish_reason distribution:")
    for fr, cnt in fr_counter.most_common():
        print(f" {cnt:>4}x {fr}")

    # Errors by execution mode (these are aborted before mode assignment usually)
    mode_counter = Counter(r.get("execution_mode", "?") for r in err)
    print("\nerror by execution_mode:")
    for mode, cnt in mode_counter.most_common():
        print(f" {cnt:>4}x {mode}")

    # Errors per D worker
    dw_counter = Counter(r.get("assigned_decode_node", "?") for r in err)
    print("\nerror per assigned_decode_node:")
    for dw, cnt in dw_counter.most_common():
        print(f" {cnt:>4}x {dw}")

    # Errors by turn distribution. Records without a turn_id use -1 and
    # therefore land in the "early" bucket.
    turn_counter = Counter(r.get("turn_id", -1) for r in err)
    early = sum(c for t, c in turn_counter.items() if t <= 5)
    mid = sum(c for t, c in turn_counter.items() if 5 < t <= 30)
    late = sum(c for t, c in turn_counter.items() if t > 30)
    print(f"\nerror by turn: early(0-5)={early} mid(6-30)={mid} late(31+)={late}")

    # Per-session error rate. Only sessions with at least one error are listed;
    # the stable sort keeps first-seen order among sessions with equal counts.
    per_sess_total = Counter(r["session_id"] for r in rows)
    per_sess_err = Counter(r["session_id"] for r in err)
    sess_with_err = [(sid, per_sess_err[sid], per_sess_total[sid]) for sid in per_sess_err]
    sess_with_err.sort(key=lambda x: -x[1])
    print("\ntop 5 sessions by error count:")
    for sid, e, t in sess_with_err[:5]:
        print(f" session {sid}: {e}/{t} errors ({e/t*100:.0f}%)")

    # Errors timeline: are they bursty?
    # Locate the first and last error as a percentage of the whole run, where
    # the run spans the smallest to the largest trace_timestamp_s of all rows.
    err_ts = [r.get("trace_timestamp_s", 0) for r in err]
    if err_ts:
        all_ts = [r.get("trace_timestamp_s", 0) for r in rows]
        first_all = min(all_ts)
        run_duration = max(all_ts) - first_all
        # A zero-length run (e.g. a single timestamp) reports 0% for both ends.
        err_first_pct = (min(err_ts) - first_all) / run_duration * 100 if run_duration > 0 else 0
        err_last_pct = (max(err_ts) - first_all) / run_duration * 100 if run_duration > 0 else 0
        print(f"\nerror time range (% of run): {err_first_pct:.1f}% - {err_last_pct:.1f}%")