Files
agentic-pd-hybrid/scripts/analysis/analyze_v3.py
kzlin 74194e660a docs: v4 final results, error analysis, and updated journey
Add v4 sweep results and post-mortem analysis showing:

- direct-to-D path: 54.3% (1P7D) / 58.0% (2P6D) of requests now use
  KVC cleanly. P50=0.5s and TTFT P50=0.043s; this path beats baseline
  8DP across the board (P50 -24%, TTFT P50 -54%, TTFT P90 -79%).

- Overall vs baseline (errors+truncated excluded):
  v4 2P6D P50=0.85s vs baseline 0.66s (28% slower).
  The slowdown is not caused by errors -- 35% of requests still hit
  fallback-large-append-session-cap, where capacity-based
  cap = usable_tokens / target_tokens evaluates to 1-2 (not 16)
  for large agentic inputs.

- 9-10% errors on KVC variants are mooncake TCP transfer timeouts,
  not SGLang logic bugs. Prefill log shows
  "Failed to send kv chunk ... 32s timeout ... session not alive".
  Errors concentrate in turns >= 31 (large inputs), after the run passes 44.8%.

Track:
- docs/KVC_DEBUG_JOURNEY_V1_TO_V4.md: append v4 results table,
  per-mode breakdown, and error root cause.
- scripts/analysis/{analyze_v3,analyze_v4,analyze_errors,compare_no_error}.py
- outputs/qwen3-30b-tp1-v{3,4}*/exp*_summary.json (force-added,
  small JSON; metrics.jsonl excluded due to size).
- outputs/qwen3-30b-tp1-v{3,4}*/sweep_results.txt

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 23:34:01 +08:00

90 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""Analyze v3 (kv-aware) results — find why fallback-large-append-session-cap dominates."""
import json
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict
# Directory containing this script; the metrics JSONL files loaded below are
# resolved relative to it.
BASE = Path(__file__).parent
def load_rows(jsonl_path):
    """Load every record from a JSON Lines file.

    Args:
        jsonl_path: Path (``str`` or path-like) of a file holding one JSON
            document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the locale,
    # and skip whitespace-only lines (e.g. a trailing blank line), which
    # json.loads() would otherwise reject.
    with open(jsonl_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
# Per-request metrics for the two kv-aware experiments (read at import time).
exp1 = load_rows(BASE / "exp1_1p7d_kvc_kvaware_metrics.jsonl")
exp2 = load_rows(BASE / "exp2_2p6d_kvc_kvaware_metrics.jsonl")

# Execution-mode labels that get a dedicated section in the report below.
_SESSION_CAP_MODE = "pd-router-fallback-large-append-session-cap"
_DIRECT_TO_D_MODE = "kvcache-direct-to-d-session"


def _decode_node(row):
    """Return the row's decode-node label, using "?" when missing or null.

    A plain ``row.get("assigned_decode_node", "?")`` lets an explicit JSON
    ``null`` through as ``None``; sorting such a key together with string
    labels raises ``TypeError``.
    """
    node = row.get("assigned_decode_node")
    return "?" if node is None else node


for name, rows in [("Exp1 1P7D", exp1), ("Exp2 2P6D", exp2)]:
    print(f"\n========== {name} ==========")
    # Only rows whose "error" field is absent or null are analyzed.
    ok = [r for r in rows if r.get("error") is None]

    # Group the rows by execution mode once, so the sections below do not
    # each rescan the full list.
    rows_by_mode = defaultdict(list)
    for r in ok:
        rows_by_mode[r["execution_mode"]].append(r)

    # Execution mode breakdown by latency
    modes = Counter(r["execution_mode"] for r in ok)
    print(f"\nExecution modes (n={len(ok)}):")
    for mode, count in modes.most_common():
        mode_rows = rows_by_mode[mode]
        lat_p50, lat_p90 = np.percentile(
            [r["latency_s"] for r in mode_rows], [50, 90])
        ttft_p50, ttft_p90 = np.percentile(
            [r["ttft_s"] for r in mode_rows], [50, 90])
        print(f" {mode}: n={count} ({count/len(ok)*100:.1f}%) "
              f"lat P50={lat_p50:.3f}s P90={lat_p90:.3f}s | "
              f"ttft P50={ttft_p50:.3f}s P90={ttft_p90:.3f}s")

    # Per-D session distribution: distinct session ids seen on each decode node
    per_d_sessions = defaultdict(set)
    for r in ok:
        per_d_sessions[_decode_node(r)].add(r["session_id"])
    print("\nSessions per D worker:")
    for node in sorted(per_d_sessions):
        print(f" {node}: {len(per_d_sessions[node])} unique sessions")

    # session-cap fallback analysis
    # .get() avoids inserting an empty bucket into the defaultdict.
    sc_rows = rows_by_mode.get(_SESSION_CAP_MODE, [])
    if sc_rows:
        print(f"\nSession-cap fallback details (n={len(sc_rows)}):")
        # Which sessions hit this most?
        sc_per_sess = Counter(r["session_id"] for r in sc_rows)
        print(" Sessions hitting session-cap (top 5):")
        for sid, cnt in sc_per_sess.most_common(5):
            print(f" session {sid}: {cnt} times")
        # Per-D distribution (same node labelling as the section above)
        sc_per_d = Counter(_decode_node(r) for r in sc_rows)
        print(f" Per-D distribution: {dict(sc_per_d.most_common())}")
        # Input length distribution; rows without the field count as 0
        inp = [r.get("input_length", 0) for r in sc_rows]
        inp_p50, inp_p90 = np.percentile(inp, [50, 90])
        print(f" Input length: P50={inp_p50:.0f} P90={inp_p90:.0f}")
        # Turn distribution; rows without the field are counted under -1
        turns = Counter(r.get("turn_id", -1) for r in sc_rows)
        print(f" Turn distribution (top 5): {dict(turns.most_common(5))}")

    # Direct-to-D analysis (ideal path)
    dd_rows = rows_by_mode.get(_DIRECT_TO_D_MODE, [])
    if dd_rows:
        lat_p50, lat_p90, lat_p99 = np.percentile(
            [r["latency_s"] for r in dd_rows], [50, 90, 99])
        ttft_p50, ttft_p90 = np.percentile(
            [r["ttft_s"] for r in dd_rows], [50, 90])
        kv_blocks = [r.get("actual_kv_transfer_blocks", 0) for r in dd_rows]
        cached = [r.get("cached_tokens", 0) for r in dd_rows]
        print(f"\nDirect-to-D details (n={len(dd_rows)}):")
        print(f" lat P50={lat_p50:.3f}s P90={lat_p90:.3f}s P99={lat_p99:.3f}s")
        print(f" ttft P50={ttft_p50:.3f}s P90={ttft_p90:.3f}s")
        print(f" KV transfer: P50={np.percentile(kv_blocks,50):.0f} (should be 0 — no P involved)")
        print(f" cached_tokens P50={np.percentile(cached,50):.0f}")

    # Sessions: how many turns each, how many used direct-to-d
    print("\nPer-session direct-to-D rate (top 10 by total turns):")
    per_sess = defaultdict(list)
    for r in ok:
        per_sess[r["session_id"]].append(r)
    sess_stats = []
    for sid, sreqs in per_sess.items():
        total = len(sreqs)
        dd = sum(1 for r in sreqs if r["execution_mode"] == _DIRECT_TO_D_MODE)
        # Substring match on purpose: counts every mode whose name contains
        # "session-cap", not only the exact fallback label used above.
        sc = sum(1 for r in sreqs if "session-cap" in r["execution_mode"])
        sess_stats.append((sid, total, dd, sc))
    # Most turns first; the stable sort keeps first-seen order among ties.
    sess_stats.sort(key=lambda x: -x[1])
    for sid, total, dd, sc in sess_stats[:10]:
        print(f" session {sid}: {total} turns, {dd} direct-to-D ({dd/total*100:.0f}%), {sc} session-cap fallback ({sc/total*100:.0f}%)")