Add v4 sweep results and post-mortem analysis showing:
- direct-to-D path: 54.3% (1P7D) / 58.0% (2P6D) of requests now use
KVC cleanly. P50=0.5s and TTFT P50=0.043s; this path beats baseline
8DP across the board (P50 -24%, TTFT P50 -54%, TTFT P90 -79%).
- Overall vs baseline (errors+truncated excluded):
v4 2P6D P50=0.85s vs baseline 0.66s (28% slower).
Reason is not errors -- 35% of requests still hit
fallback-large-append-session-cap, where capacity-based
cap = usable_tokens / target_tokens evaluates to 1-2 (not 16)
for large agentic inputs.
- 9-10% errors on KVC variants are mooncake TCP transfer timeouts,
not SGLang logic bugs. Prefill log shows
"Failed to send kv chunk ... 32s timeout ... session not alive".
Errors concentrate in turn>=31 (large inputs) and appear only after 44.8% of the run has elapsed.
Track:
- docs/KVC_DEBUG_JOURNEY_V1_TO_V4.md: append v4 results table,
per-mode breakdown, and error root cause.
- scripts/analysis/{analyze_v3,analyze_v4,analyze_errors,compare_no_error}.py
- outputs/qwen3-30b-tp1-v{3,4}*/exp*_summary.json (force-added,
small JSON; metrics.jsonl excluded due to size).
- outputs/qwen3-30b-tp1-v{3,4}*/sweep_results.txt
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
84 lines
3.3 KiB
Python
84 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Deep dive into v4 errors: which path, which D, which session, which turn."""
|
|
import json
|
|
import numpy as np
|
|
from pathlib import Path
|
|
from collections import Counter, defaultdict
|
|
|
|
# Directory that contains this script; the metrics files analysed below are
# resolved relative to it (or to its parent directory).
BASE = Path(__file__).parent
|
|
|
|
def load_rows(jsonl_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        jsonl_path: Path (``str`` or ``os.PathLike``) of the file to read.
            Every non-blank line must hold exactly one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding so parsing does not depend on the platform locale.
    with open(jsonl_path, encoding="utf-8") as f:
        # Blank lines (typically a stray trailing newline) are not records;
        # json.loads would raise on them, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]
|
|
|
|
# Compare v3 and v4 errors.
#
# For every labelled metrics file that exists, print: the overall error rate,
# breakdowns of the failed rows by finish_reason / execution_mode /
# assigned_decode_node, a coarse bucketing by turn, the five sessions with the
# most errors, and where in the run (as % of the trace time span) the errors
# fall.
for label, path in [
    ("v3 1P7D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp1_1p7d_kvc_kvaware_metrics.jsonl"),
    ("v4 1P7D", BASE / "exp1_1p7d_kvc_cap16_metrics.jsonl"),
    ("v3 2P6D", BASE.parent / "qwen3-30b-tp1-v3-kvaware/exp2_2p6d_kvc_kvaware_metrics.jsonl"),
    ("v4 2P6D", BASE / "exp2_2p6d_kvc_cap16_metrics.jsonl"),
]:
    if not path.exists():
        print(f"\nSKIP {label}: {path} not found")
        continue

    rows = load_rows(path)
    # A row counts as failed when its "error" field is present and non-null.
    err = [r for r in rows if r.get("error") is not None]
    # Guard the ratio: an existing-but-empty metrics file would otherwise
    # abort the whole comparison with ZeroDivisionError.
    err_pct = len(err) / len(rows) * 100 if rows else 0.0
    print(f"\n========== {label} ({len(err)} errors / {len(rows)} total = {err_pct:.1f}%) ==========")

    # Error finish_reason distribution. Falls back to the error value, then
    # "?", and truncates to 80 characters so long messages group together.
    fr_counter = Counter(
        str(r.get("finish_reason") or r.get("error") or "?")[:80] for r in err
    )
    print("finish_reason distribution:")
    for fr, cnt in fr_counter.most_common():
        print(f" {cnt:>4}x {fr}")

    # Errors by execution mode (these are aborted before mode assignment usually)
    mode_counter = Counter(r.get("execution_mode", "?") for r in err)
    print("\nerror by execution_mode:")
    for mode, cnt in mode_counter.most_common():
        print(f" {cnt:>4}x {mode}")

    # Errors per D worker
    dw_counter = Counter(r.get("assigned_decode_node", "?") for r in err)
    print("\nerror per assigned_decode_node:")
    for dw, cnt in dw_counter.most_common():
        print(f" {cnt:>4}x {dw}")

    # Errors by turn distribution.
    # NOTE: rows without a "turn_id" key default to -1 and therefore land in
    # the "early" bucket.
    turn_counter = Counter(r.get("turn_id", -1) for r in err)
    early = sum(c for t, c in turn_counter.items() if t <= 5)
    mid = sum(c for t, c in turn_counter.items() if 5 < t <= 30)
    late = sum(c for t, c in turn_counter.items() if t > 30)
    print(f"\nerror by turn: early(0-5)={early} mid(6-30)={mid} late(31+)={late}")

    # Per-session error rate. Counter.most_common keeps first-seen order for
    # ties, matching a stable descending sort on the error count.
    per_sess_total = Counter(r["session_id"] for r in rows)
    per_sess_err = Counter(r["session_id"] for r in err)
    print("\ntop 5 sessions by error count:")
    for sid, e in per_sess_err.most_common(5):
        t = per_sess_total[sid]
        print(f" session {sid}: {e}/{t} errors ({e/t*100:.0f}%)")

    # Errors timeline: are they bursty?
    # Only the extremes are needed, so use min/max instead of sorting.
    if err:
        err_ts = [r.get("trace_timestamp_s", 0) for r in err]
        all_ts = [r.get("trace_timestamp_s", 0) for r in rows]
        first_all = min(all_ts)
        run_duration = max(all_ts) - first_all
        if run_duration > 0:
            err_first_pct = (min(err_ts) - first_all) / run_duration * 100
            err_last_pct = (max(err_ts) - first_all) / run_duration * 100
        else:
            # Every row has the same timestamp: no meaningful position.
            err_first_pct = err_last_pct = 0
        print(f"\nerror time range (% of run): {err_first_pct:.1f}% - {err_last_pct:.1f}%")
|