agentic-pd-hybrid/scripts/analysis/analyze_ts1_validation.py

#!/usr/bin/env python3
"""TS=1 validation analysis: KVC 1P3D × N=3 + 4DP × 1.

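Reads per-run metrics from
outputs/qwen3-30b-tp1-ts1-validation/{kvc_1p3d_run{1,2,3},dp4}_metrics.jsonl; a run is
skipped unless its matching *_summary.json is also present. Decode worker logs are read
from kvcache-centric-*/logs/decode-*.log under the same root. Results are reported against
the structural claims in docs/AGENTIC_FIT_ANALYSIS_ZH.md and TEAM_REPORT.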

Sections:
  1. Headline summary table (errors, latency p50/p90/p99, TTFT p50)
  2. §1 (session pinning): distinct-D-per-session distribution + direct-to-D bimodal
  3. §1 (cross-run consistency): sessions consistently starved across all 3 runs + size ratio
  4. §2 (LRU): KVTransferError counts per D + peak token_usage from worker logs
  5. §7 (ts=1 vs ts=10): direct-to-D rate, fallback rate, per-D load balance
  6. KVC vs DP same-scale comparison

Usage: python scripts/analysis/analyze_ts1_validation.py [--root PATH]
"""
import argparse
import json
import re
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np


def load_metrics(path):
    """Load a JSON-Lines metrics file into a list of parsed records.

    Args:
        path: Location of the JSONL file (anything accepted by ``open``).

    Returns:
        A list holding one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
    # default encoding.
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Blank / whitespace-only lines are tolerated and skipped.
        return [json.loads(line) for line in stripped_lines if line]


def load_summary(path):
    """Load a single JSON document from ``path`` and return the parsed value.

    Args:
        path: Location of the JSON file (anything accepted by ``open``).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
    # default encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def pct(arr, p):
    """Return the ``p``-th percentile of ``arr`` as a plain ``float``.

    Args:
        arr: A sized collection of numbers (list, tuple, numpy array) or None.
        p: Percentile to compute, passed through to ``np.percentile``.

    Returns:
        The percentile, or NaN when ``arr`` is None or has no elements.
    """
    # Use an explicit length check rather than truthiness: ``not arr`` raises
    # for numpy arrays with more than one element.
    if arr is None or len(arr) == 0:
        return float("nan")
    return float(np.percentile(arr, p))


def summarize_run(label, rows, summary):
    """Build the headline statistics dict for one run.

    A row counts as successful when its ``error`` field is missing or None.
    Latency and TTFT statistics are computed over successful rows only, and
    rows whose ``latency_s`` / ``ttft_s`` is missing or None are left out of
    the respective statistic. Means are NaN when no values are available.

    Args:
        label: Name of the run, copied into the result.
        rows: List of per-request metric dicts.
        summary: Arbitrary summary object, stored unchanged under ``summary``.

    Returns:
        Dict with keys label, n, ok, err, lat_mean, lat_p50, lat_p90,
        lat_p99, ttft_mean, ttft_p50 and summary.
    """
    succeeded, failed = [], []
    for row in rows:
        bucket = succeeded if row.get("error") is None else failed
        bucket.append(row)

    def _present(field):
        # Values of ``field`` across successful rows, skipping missing/None.
        return [row[field] for row in succeeded if row.get(field) is not None]

    def _mean_or_nan(values):
        return float(np.mean(values)) if values else float("nan")

    latencies = _present("latency_s")
    first_token_times = _present("ttft_s")

    result = {
        "label": label,
        "n": len(rows),
        "ok": len(succeeded),
        "err": len(failed),
        "lat_mean": _mean_or_nan(latencies),
    }
    for quantile in (50, 90, 99):
        result[f"lat_p{quantile}"] = pct(latencies, quantile)
    result["ttft_mean"] = _mean_or_nan(first_token_times)
    result["ttft_p50"] = pct(first_token_times, 50)
    result["summary"] = summary
    return result


def headline_table(stats):
    """Print the headline comparison table, one line per run.

    Args:
        stats: Iterable of dicts providing the keys label, ok, n, err,
            lat_mean, lat_p50, lat_p90, lat_p99, ttft_mean and ttft_p50.
    """
    rule = "=" * 110
    print("\n" + rule)
    print("HEADLINE: same trace, same scale, same ts=1")
    print(rule)

    timing_keys = ("lat_mean", "lat_p50", "lat_p90", "lat_p99", "ttft_mean", "ttft_p50")

    # Header: left-aligned label column followed by right-aligned columns.
    header_cells = [f"{'label':<22}", f"{'ok/n':>12}", f"{'err':>6}"]
    header_cells.extend(f"{key:>10}" for key in timing_keys)
    print("".join(header_cells))

    for entry in stats:
        ratio = "{}/{}".format(entry["ok"], entry["n"])
        # Each timing takes 9 characters plus a trailing "s" (10 in total),
        # matching the header column width.
        timings = "".join(f"{entry[key]:>9.3f}s" for key in timing_keys)
        print(f"{entry['label']:<22}{ratio:>12}{entry['err']:>6}{timings}")


def session_pinning(rows, label):
    """§1: count how many distinct decode nodes each session touched.

    The node of a row is ``assigned_decode_node`` when truthy, otherwise
    ``decode_node``. Rows without a session id or without a node are ignored.

    Args:
        rows: Iterable of per-request metric dicts.
        label: Name of the run, copied into the result.

    Returns:
        None when no row carries both a session id and a node; otherwise a
        dict with label, n_sessions, avg_distinct_D, max_distinct_D and
        sess_d (session id -> sorted list of its nodes).
    """
    nodes_by_session = defaultdict(set)
    for row in rows:
        session = row.get("session_id")
        if session is None:
            continue
        node = row.get("assigned_decode_node") or row.get("decode_node")
        if node is None:
            continue
        nodes_by_session[session].add(node)

    if len(nodes_by_session) == 0:
        return None

    fanout = list(map(len, nodes_by_session.values()))
    sorted_nodes = {}
    for session, nodes in nodes_by_session.items():
        sorted_nodes[session] = sorted(nodes)

    return {
        "label": label,
        "n_sessions": len(nodes_by_session),
        "avg_distinct_D": float(np.mean(fanout)),
        "max_distinct_D": max(fanout),
        "sess_d": sorted_nodes,
    }


def direct_to_d_distribution(rows, label):
    """§1: per-session direct-to-D rate, printed as a 5-bucket histogram.

    A row counts as "direct" when its ``execution_mode`` equals
    ``kvcache-direct-to-d-session``. Rows without a session id are ignored.

    Args:
        rows: Iterable of per-request metric dicts.
        label: Name of the run, used in the printed heading.

    Returns:
        List of ``(session_id, rate, n_requests)`` tuples in order of first
        appearance of each session.
    """
    requests_per_session = Counter()
    direct_per_session = Counter()
    for row in rows:
        session = row.get("session_id")
        if session is None:
            continue
        requests_per_session[session] += 1
        if row.get("execution_mode", "") == "kvcache-direct-to-d-session":
            direct_per_session[session] += 1

    rates = [
        (session, direct_per_session[session] / total, total)
        for session, total in requests_per_session.items()
    ]

    # The last edge is 1.01 so that a rate of exactly 1.0 lands in the top bucket.
    edges = [0, 0.2, 0.4, 0.6, 0.8, 1.01]
    names = ["0-20%", "20-40%", "40-60%", "60-80%", "80-100%"]
    histogram = [0] * 5
    for _, rate, _ in rates:
        slot = next(
            (i for i, (low, high) in enumerate(zip(edges, edges[1:])) if low <= rate < high),
            None,
        )
        if slot is not None:
            histogram[slot] += 1

    print(f"\n  [{label}] direct-to-D rate distribution (n={len(rates)} sessions):")
    for name, tally in zip(names, histogram):
        print(f"    {name:<10}: {tally:>3}  {'█' * tally}")
    return rates


def starved_cross_run(per_run_rates, threshold=0.20, lucky_threshold=0.80):
    """§1: find sessions that are starved (or lucky) in ALL runs.

    Args:
        per_run_rates: One entry per run; each entry is an iterable of
            ``(session_id, rate, n_requests)`` tuples as returned by
            ``direct_to_d_distribution``.
        threshold: A session is "starved" in a run when its rate is strictly
            below this value.
        lucky_threshold: A session is "lucky" in a run when its rate is
            strictly above this value (and it is not starved). Defaults to
            0.80.

    Returns:
        None when fewer than two runs are given; otherwise a dict with
        n_runs, consistently_starved and consistently_lucky (lists of session
        ids, in order of first qualifying appearance).
    """
    n_runs = len(per_run_rates)
    if n_runs < 2:
        return None

    starved_hits = Counter()
    lucky_hits = Counter()
    for rates in per_run_rates:
        for sid, rate, _ in rates:
            if rate < threshold:
                starved_hits[sid] += 1
            elif rate > lucky_threshold:
                lucky_hits[sid] += 1

    # A session qualifies only if it was flagged in every single run; a
    # session absent from any run can therefore never qualify.
    return {
        "n_runs": n_runs,
        "consistently_starved": [sid for sid, hits in starved_hits.items() if hits == n_runs],
        "consistently_lucky": [sid for sid, hits in lucky_hits.items() if hits == n_runs],
    }


def session_size_comparison(rows, sids_a, sids_b, label_a="A", label_b="B"):
    """Print the mean peak ``input_length`` of two session groups and their ratio.

    The peak of a session is the largest ``input_length`` over its rows,
    floored at 0; a missing or falsy ``input_length`` counts as 0. Session ids
    that never occur in ``rows`` are dropped from their group. Nothing is
    printed when either group ends up empty.

    Args:
        rows: Iterable of per-request metric dicts.
        sids_a: Session ids forming the first group (the ratio numerator).
        sids_b: Session ids forming the second group (the ratio denominator).
        label_a: Display name of the first group.
        label_b: Display name of the second group.
    """
    peak_by_session = {}
    for row in rows:
        session = row.get("session_id")
        if session is None:
            continue
        length = row.get("input_length") or 0
        # Keep the running peak first so ties retain the stored value.
        peak_by_session[session] = max(peak_by_session.get(session, 0), length)

    group_a = [peak_by_session[s] for s in sids_a if s in peak_by_session]
    group_b = [peak_by_session[s] for s in sids_b if s in peak_by_session]
    if not (group_a and group_b):
        return

    mean_a = np.mean(group_a)
    mean_b = np.mean(group_b)
    print("\n  Cross-run starvation correlates with session size?")
    print(f"    consistently {label_a} (n={len(group_a)}): peak_input mean = {mean_a:.0f}")
    print(f"    consistently {label_b} (n={len(group_b)}): peak_input mean = {mean_b:.0f}")
    print(f"    {label_a}/{label_b} ratio = {mean_a / mean_b:.2f}x (ts=10 baseline was 1.98x)")


def per_d_balance(rows, label):
    """§7: print the request count per decode node and the relative spread.

    The node of a row is ``assigned_decode_node`` when truthy, otherwise
    ``decode_node``; rows with no truthy node are ignored. Nothing is printed
    when no row has a node.

    Args:
        rows: Iterable of per-request metric dicts.
        label: Name of the run, used in the printed heading.
    """
    candidates = (
        row.get("assigned_decode_node") or row.get("decode_node") for row in rows
    )
    load = Counter()
    for node in candidates:
        if node:
            load[node] += 1
    if len(load) == 0:
        return

    sizes = list(load.values())
    # The denominator is clamped to at least 1.
    imbalance = (max(sizes) - min(sizes)) / max(np.mean(sizes), 1)
    ordered = dict(sorted(load.items()))
    print(f"\n  [{label}] per-D load: {ordered}")
    print(
        f"    spread (max-min)/mean = {imbalance*100:.1f}%   "
        "(ts=10 KVC 2P6D = ±26%, 8DP CA = ±10%)"
    )


def execution_modes_table(rows, label):
    """Print the (up to) eight most frequent execution modes of successful rows.

    A row is successful when its ``error`` field is missing or None. For each
    mode the count, its share of successful rows, latency p50/p90 and TTFT
    p50 are printed. Modes with no usable ``latency_s`` value are not printed.
    Rows whose ``execution_mode`` is missing or falsy are grouped under
    ``<unknown>`` instead of raising ``KeyError``.

    Args:
        rows: Iterable of per-request metric dicts.
        label: Name of the run, used in the printed heading.
    """
    ok = [r for r in rows if r.get("error") is None]
    if not ok:
        return

    # Group once so each mode's rows are not re-scanned from the full list.
    rows_by_mode = defaultdict(list)
    for r in ok:
        rows_by_mode[r.get("execution_mode") or "<unknown>"].append(r)
    mode_counts = Counter({mode: len(group) for mode, group in rows_by_mode.items()})

    print(f"\n  [{label}] execution modes (n_ok={len(ok)}):")
    for mode, cnt in mode_counts.most_common(8):
        mode_rows = rows_by_mode[mode]
        lats = [r["latency_s"] for r in mode_rows if r.get("latency_s") is not None]
        ttfts = [r["ttft_s"] for r in mode_rows if r.get("ttft_s") is not None]
        if lats:
            # ``!s`` keeps the alignment spec valid even for non-string modes.
            print(f"    {mode!s:<55} {cnt:>5}  ({cnt/len(ok)*100:>4.1f}%)  "
                  f"lat p50={pct(lats,50):.3f}s p90={pct(lats,90):.3f}s  ttft p50={pct(ttfts,50):.3f}s")


def lru_vs_errors(run_dir, label):
    """§2: per decode worker, print trim events, KVTransferError hits and peak usage.

    Scans every ``decode-*.log`` file in ``run_dir / "logs"`` and prints one
    table row per file containing:
      * occurrences of "Trimmed decode session cache",
      * occurrences of "KVTransferError",
      * the largest number following "token usage: " (0.0 if none).
    Nothing is printed when the ``logs`` directory does not exist.

    Args:
        run_dir: ``pathlib.Path`` of the raw run directory.
        label: Name of the run, used in the printed heading.
    """
    log_dir = run_dir / "logs"
    if not log_dir.exists():
        return

    # Match a well-formed decimal only. A looser ``[\d.]+`` would also capture
    # tokens such as "0.85." (number followed by a full stop) or "1.2.3",
    # which make float() raise and abort the report.
    usage_pattern = re.compile(r"token usage: (\d*\.?\d+)")

    print(f"\n  [{label}] D-side LRU vs errors (from worker logs):")
    print(f"    {'worker':<14}{'trim':>8}{'KVTransferError':>20}{'peak_token_usage':>20}")
    for log_file in sorted(log_dir.glob("decode-*.log")):
        trim_count = 0
        err_count = 0
        peak = 0.0
        # Stream line by line instead of loading the whole file into memory;
        # none of the searched patterns can span a line break.
        with open(log_file, encoding="utf-8", errors="ignore") as fh:
            for line in fh:
                trim_count += line.count("Trimmed decode session cache")
                err_count += line.count("KVTransferError")
                for usage in usage_pattern.findall(line):
                    peak = max(peak, float(usage))
        print(f"    {log_file.stem:<14}{trim_count:>8}{err_count:>20}{peak:>20.3f}")


def _print_banner(title):
    """Print a section title framed by two 110-character rules."""
    rule = "=" * 110
    print("\n" + rule)
    print(title)
    print(rule)


def main():
    """Command-line entry point: load the available runs and print the report.

    Parses ``--root`` (the sweep output directory). For each expected run
    label it loads ``<label>_metrics.jsonl`` and ``<label>_summary.json`` when
    both exist, then prints the headline table, the per-run pinning /
    load-balance sections, the cross-run starvation section, the worker-log
    section and the 4DP section.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default="outputs/qwen3-30b-tp1-ts1-validation",
                        help="Sweep output root (relative paths are resolved against the "
                             "repository root)")
    args = parser.parse_args()

    root = Path(args.root)
    if not root.is_absolute():
        # Anchor relative roots at the repository root instead of a
        # machine-specific absolute path. Per the usage line this script lives
        # in <repo>/scripts/analysis/, so the repo root is two levels above
        # the script's directory.
        script_path = Path(__file__).resolve()
        try:
            base = script_path.parents[2]
        except IndexError:
            # Script was copied somewhere too shallow; fall back to the CWD.
            base = Path.cwd()
        root = base / root

    # Load all available runs
    stats = []
    rows_by_run = {}
    for label in ("kvc_1p3d_run1", "kvc_1p3d_run2", "kvc_1p3d_run3", "dp4"):
        metrics_path = root / f"{label}_metrics.jsonl"
        summary_path = root / f"{label}_summary.json"
        if not metrics_path.exists() or not summary_path.exists():
            print(f"  [{label}] not yet available ({metrics_path.name})")
            continue
        rows = load_metrics(metrics_path)
        summary = load_summary(summary_path)
        rows_by_run[label] = rows
        stats.append(summarize_run(label, rows, summary))

    if not stats:
        print("No runs available yet.")
        return

    kvc_runs = {label: rows for label, rows in rows_by_run.items()
                if label.startswith("kvc_")}

    # 1. Headline table
    headline_table(stats)

    # 2. §1 session pinning per KVC run + per-D balance + execution modes
    _print_banner("§1 / §7: SESSION PINNING + LOAD BALANCE")
    per_run_rates = []
    for label, rows in kvc_runs.items():
        pin = session_pinning(rows, label)
        if pin:
            print(f"\n  [{label}] sessions={pin['n_sessions']}  "
                  f"avg_distinct_D={pin['avg_distinct_D']:.2f}  "
                  f"max_distinct_D={pin['max_distinct_D']}  "
                  f"(ts=10 baseline avg=1.00 → 100% pin)")
        per_run_rates.append(direct_to_d_distribution(rows, label))
        per_d_balance(rows, label)
        execution_modes_table(rows, label)

    # 3. §1 cross-run starvation
    if len(per_run_rates) >= 2:
        _print_banner(f"§1 CROSS-RUN STARVATION (across {len(per_run_rates)} KVC runs)")
        cross = starved_cross_run(per_run_rates)
        if cross:
            n_starved = len(cross["consistently_starved"])
            n_lucky = len(cross["consistently_lucky"])
            print(f"\n  Sessions starved (<20% direct-to-D) in all {cross['n_runs']} runs: {n_starved}")
            print(f"  Sessions lucky (>80% direct-to-D) in all {cross['n_runs']} runs: {n_lucky}")
            print("  (ts=10 baseline: 13/52 starved, 14/52 lucky — extreme bimodal)")
            # session size comparison from run 1
            if "kvc_1p3d_run1" in rows_by_run and n_starved and n_lucky:
                session_size_comparison(rows_by_run["kvc_1p3d_run1"],
                                        cross["consistently_starved"],
                                        cross["consistently_lucky"],
                                        "starved", "lucky")

    # 4. §2 D-side LRU vs errors from raw logs
    _print_banner("§2: D-SIDE LRU TRIM vs KVTransferError (from worker logs)")
    # List the raw run directories once (the result does not depend on the
    # label). The explicit is_dir() filter keeps the result identical across
    # Python versions, which differ in how a trailing "/" in a glob pattern
    # is treated.
    run_dirs = sorted(p for p in root.glob("kvcache-centric-*") if p.is_dir())
    for label in kvc_runs:
        # naive: index matches run order; could be wrong if dirs got reordered
        idx = int(label.split("run")[-1]) - 1
        if 0 <= idx < len(run_dirs):
            lru_vs_errors(run_dirs[idx], label)

    # 5. DP-only inspection
    if "dp4" in rows_by_run:
        _print_banner("4DP CA SANITY")
        per_d_balance(rows_by_run["dp4"], "dp4")
        execution_modes_table(rows_by_run["dp4"], "dp4")


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()