#!/usr/bin/env python3
"""Analyze v3 (kv-aware) results — find why fallback-large-append-session-cap dominates."""
import json
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np

BASE = Path(__file__).parent

# Execution-mode labels this report singles out for a detailed breakdown.
SESSION_CAP_MODE = "pd-router-fallback-large-append-session-cap"
DIRECT_TO_D_MODE = "kvcache-direct-to-d-session"


def load_rows(jsonl_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        jsonl_path: Path (str or Path) of a file holding one JSON document
            per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Blank lines (e.g. a trailing newline at EOF) are skipped instead of
    # being handed to json.loads, which would raise on an empty string.
    with open(jsonl_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _pct(values, q):
    """Return the q-th percentile of *values*, ignoring ``None`` entries.

    Returns NaN when no usable value remains, so the report line still
    prints instead of aborting the whole analysis.
    """
    usable = [v for v in values if v is not None]
    if not usable:
        return float("nan")
    return np.percentile(usable, q)


def analyze(name, rows):
    """Print the execution-mode report for one experiment.

    Only rows whose ``"error"`` field is missing or ``None`` are analyzed.
    Each such row must carry ``execution_mode``, ``latency_s``, ``ttft_s``
    and ``session_id``; the remaining fields are read with defaults.

    Args:
        name: Human-readable experiment label used in the section header.
        rows: Iterable of per-request metric dicts, as returned by
            ``load_rows``.
    """
    print(f"\n========== {name} ==========")
    ok = [r for r in rows if r.get("error") is None]

    # Group once so each mode's rows are not re-scanned from the full list.
    by_mode = defaultdict(list)
    for r in ok:
        by_mode[r["execution_mode"]].append(r)

    # Execution mode breakdown by latency
    modes = Counter(r["execution_mode"] for r in ok)
    print(f"\nExecution modes (n={len(ok)}):")
    for mode, count in modes.most_common():
        mode_rows = by_mode[mode]
        lats = [r["latency_s"] for r in mode_rows]
        ttfts = [r["ttft_s"] for r in mode_rows]
        print(f" {mode}: n={count} ({count/len(ok)*100:.1f}%) "
              f"lat P50={_pct(lats, 50):.3f}s P90={_pct(lats, 90):.3f}s | "
              f"ttft P50={_pct(ttfts, 50):.3f}s P90={_pct(ttfts, 90):.3f}s")

    # Per-D session distribution
    per_d_sessions = defaultdict(set)
    for r in ok:
        d = r.get("assigned_decode_node", "?")
        per_d_sessions[d].add(r["session_id"])
    print(f"\nSessions per D worker:")
    for d in sorted(per_d_sessions):
        print(f" {d}: {len(per_d_sessions[d])} unique sessions")

    # session-cap fallback analysis
    sc_rows = by_mode.get(SESSION_CAP_MODE, [])
    if sc_rows:
        print(f"\nSession-cap fallback details (n={len(sc_rows)}):")
        # Which sessions hit this most?
        sc_per_sess = Counter(r["session_id"] for r in sc_rows)
        print(f" Sessions hitting session-cap (top 5):")
        for sid, cnt in sc_per_sess.most_common(5):
            print(f" session {sid}: {cnt} times")
        # Per-D distribution
        sc_per_d = Counter(r.get("assigned_decode_node", "?") for r in sc_rows)
        print(f" Per-D distribution: {dict(sc_per_d.most_common())}")
        # Input length distribution
        inp = [r.get("input_length", 0) for r in sc_rows]
        print(f" Input length: P50={_pct(inp, 50):.0f} P90={_pct(inp, 90):.0f}")
        # Turn distribution
        turns = Counter(r.get("turn_id", -1) for r in sc_rows)
        print(f" Turn distribution (top 5): {dict(turns.most_common(5))}")

    # Direct-to-D analysis (ideal path)
    dd_rows = by_mode.get(DIRECT_TO_D_MODE, [])
    if dd_rows:
        lats = [r["latency_s"] for r in dd_rows]
        ttfts = [r["ttft_s"] for r in dd_rows]
        kv_blocks = [r.get("actual_kv_transfer_blocks", 0) for r in dd_rows]
        cached = [r.get("cached_tokens", 0) for r in dd_rows]
        print(f"\nDirect-to-D details (n={len(dd_rows)}):")
        print(f" lat P50={_pct(lats, 50):.3f}s P90={_pct(lats, 90):.3f}s P99={_pct(lats, 99):.3f}s")
        print(f" ttft P50={_pct(ttfts, 50):.3f}s P90={_pct(ttfts, 90):.3f}s")
        print(f" KV transfer: P50={_pct(kv_blocks, 50):.0f} (should be 0 — no P involved)")
        print(f" cached_tokens P50={_pct(cached, 50):.0f}")

    # Sessions: how many turns each, how many used direct-to-d
    print(f"\nPer-session direct-to-D rate (top 10 by total turns):")
    per_sess = defaultdict(list)
    for r in ok:
        per_sess[r["session_id"]].append(r)
    sess_stats = []
    for sid, sreqs in per_sess.items():
        total = len(sreqs)
        dd = sum(1 for r in sreqs if r["execution_mode"] == DIRECT_TO_D_MODE)
        # Substring match on purpose: counts every mode containing "session-cap".
        sc = sum(1 for r in sreqs if "session-cap" in r["execution_mode"])
        sess_stats.append((sid, total, dd, sc))
    # Stable sort: sessions with equal turn counts keep first-seen order.
    sess_stats.sort(key=lambda x: -x[1])
    for sid, total, dd, sc in sess_stats[:10]:
        print(f" session {sid}: {total} turns, {dd} direct-to-D "
              f"({dd/total*100:.0f}%), {sc} session-cap fallback ({sc/total*100:.0f}%)")


def main():
    """Load both experiment metric files next to this script and report on each."""
    # Both files are loaded before anything is printed, so a missing file
    # fails fast rather than after a partial report.
    exp1 = load_rows(BASE / "exp1_1p7d_kvc_kvaware_metrics.jsonl")
    exp2 = load_rows(BASE / "exp2_2p6d_kvc_kvaware_metrics.jsonl")
    for name, rows in [("Exp1 1P7D", exp1), ("Exp2 2P6D", exp2)]:
        analyze(name, rows)


if __name__ == "__main__":
    main()