agentic-pd-hybrid/scripts/analysis/analyze_pool_timeseries.py

#!/usr/bin/env python3
"""Analyze d-pool-timeseries.jsonl produced by --pool-poll-interval-s.

Answers v6's main question: where is D's KV pool actually spent?

For each decode worker, decomposes capacity over the run wall-clock into:
  - resident_held_active   = held - idle_evictable      (sessions in active use)
  - resident_held_idle     = idle_evictable             (sessions kept around but evictable)
  - prefill_backup_or_other = capacity - held - available (everything else: backup blocks,
                                                          in-flight transfers, fragmentation)
  - free_available         = available

Also reports session residency churn (how many distinct sessions ever resided per D, and
how often a session bounced between workers — a strong starvation signal).

Usage:
  python scripts/analysis/analyze_pool_timeseries.py <run_dir>
or
  python scripts/analysis/analyze_pool_timeseries.py <pool_timeseries.jsonl>

Output: human-readable text. Add --json to also print a machine-readable summary.
"""
from __future__ import annotations

import argparse
import json
import statistics
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any


def _load_jsonl(path: Path) -> list[dict[str, Any]]:
    rows: list[dict[str, Any]] = []
    with path.open() as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows


def _resolve_input(path: Path) -> Path:
    if path.is_file():
        return path
    if path.is_dir():
        candidate = path / "d-pool-timeseries.jsonl"
        if candidate.is_file():
            return candidate
        raise FileNotFoundError(
            f"{candidate} not found; pass the file directly or a run dir containing it."
        )
    raise FileNotFoundError(path)


def _percentile(values: list[float], p: float) -> float:
    if not values:
        return 0.0
    s = sorted(values)
    idx = min(len(s) - 1, max(0, int(round((len(s) - 1) * p))))
    return s[idx]


def _fmt_tokens(n: float) -> str:
    if n >= 1_000_000:
        return f"{n / 1_000_000:.2f}M"
    if n >= 1_000:
        return f"{n / 1_000:.1f}K"
    return f"{int(n)}"


def _fmt_pct(n: float, total: float) -> str:
    if total <= 0:
        return "  -  "
    return f"{100 * n / total:5.1f}%"


def analyze(timeseries_path: Path) -> dict[str, Any]:
    rows = _load_jsonl(timeseries_path)
    if not rows:
        raise ValueError(f"empty timeseries: {timeseries_path}")

    by_worker: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for row in rows:
        if row.get("error") and "session_cache_enabled" not in row:
            # poller failed at this tick — skip
            continue
        wid = row.get("worker_id") or "?"
        by_worker[wid].append(row)

    summary: dict[str, Any] = {
        "timeseries_path": str(timeseries_path),
        "total_rows": len(rows),
        "tick_count": len(by_worker[next(iter(by_worker))]) if by_worker else 0,
        "wall_s_span": (
            max(r.get("wall_s", 0.0) for r in rows)
            - min(r.get("wall_s", 0.0) for r in rows)
        ),
        "workers": {},
    }

    print(f"\n=== Pool timeseries: {timeseries_path}")
    print(
        f"  rows={summary['total_rows']}  workers={len(by_worker)}  "
        f"span={summary['wall_s_span']:.1f}s"
    )

    # Print per-worker decomposition table
    header = (
        f"{'worker':<12} {'role':<8} {'cap':>8} | "
        f"{'avg_active':>10} {'avg_idle':>10} {'avg_other':>10} {'avg_free':>10} | "
        f"{'p90_held':>10} {'max_held':>10} {'p90_avail':>10}"
    )
    print(header)
    print("-" * len(header))

    for wid in sorted(by_worker.keys()):
        ws = by_worker[wid]
        role = ws[0].get("worker_role", "?")
        cap_vals = [int(r.get("capacity_tokens") or 0) for r in ws]
        held_vals = [int(r.get("held_tokens") or 0) for r in ws]
        avail_vals = [int(r.get("available_tokens") or 0) for r in ws]
        idle_vals = [int(r.get("idle_evictable_tokens") or 0) for r in ws]
        # active = held - idle (sessions in active use)
        active_vals = [max(0, h - i) for h, i in zip(held_vals, idle_vals)]
        # other = capacity - held - available (prefill backup blocks, in-flight, fragmentation)
        other_vals = [
            max(0, c - h - a) for c, h, a in zip(cap_vals, held_vals, avail_vals)
        ]
        cap = max(cap_vals) if cap_vals else 0

        avg_active = statistics.fmean(active_vals) if active_vals else 0.0
        avg_idle = statistics.fmean(idle_vals) if idle_vals else 0.0
        avg_other = statistics.fmean(other_vals) if other_vals else 0.0
        avg_avail = statistics.fmean(avail_vals) if avail_vals else 0.0

        p90_held = _percentile([float(v) for v in held_vals], 0.90)
        max_held = max(held_vals) if held_vals else 0
        p90_avail = _percentile([float(v) for v in avail_vals], 0.90)

        sess_counts = [int(r.get("session_count") or 0) for r in ws]
        resident_counts = [int(r.get("resident_session_count") or 0) for r in ws]

        print(
            f"{wid:<12} {role:<8} {_fmt_tokens(cap):>8} | "
            f"{_fmt_tokens(avg_active):>4} {_fmt_pct(avg_active, cap):>5} "
            f"{_fmt_tokens(avg_idle):>4} {_fmt_pct(avg_idle, cap):>5} "
            f"{_fmt_tokens(avg_other):>4} {_fmt_pct(avg_other, cap):>5} "
            f"{_fmt_tokens(avg_avail):>4} {_fmt_pct(avg_avail, cap):>5} | "
            f"{_fmt_tokens(p90_held):>10} {_fmt_tokens(max_held):>10} "
            f"{_fmt_tokens(p90_avail):>10}"
        )

        summary["workers"][wid] = {
            "role": role,
            "capacity_tokens": cap,
            "avg_active_held_tokens": avg_active,
            "avg_idle_evictable_tokens": avg_idle,
            "avg_other_tokens": avg_other,
            "avg_available_tokens": avg_avail,
            "p90_held_tokens": p90_held,
            "max_held_tokens": max_held,
            "p90_available_tokens": p90_avail,
            "max_session_count": max(sess_counts) if sess_counts else 0,
            "max_resident_session_count": (
                max(resident_counts) if resident_counts else 0
            ),
            "ticks": len(ws),
        }

    print(
        "\nLegend: active=held-idle  idle=idle_evictable  "
        "other=cap-held-avail (radix-protected + running-batch + in-flight + frag)"
    )

    # P1: decomposition of "other" using pool_breakdown fields (zeros if instrument absent)
    has_breakdown = any(
        any(r.get(k) for k in (
            "radix_evictable_tokens",
            "radix_protected_tokens",
            "running_batch_kv_tokens",
            "transfer_queue_tokens",
            "prealloc_queue_tokens",
            "retracted_queue_tokens",
        ))
        for r in rows
    )

    if has_breakdown:
        print("\n=== P1 'other' decomposition (per worker, mean over run) ===")
        print(
            f"{'worker':<12} {'role':<8} | "
            f"{'r_evictable':>11} {'r_protected':>11} {'slot_private':>12} | "
            f"{'run_batch':>10} {'transfer':>9} {'prealloc':>9} {'retracted':>10} | "
            f"{'unaccounted':>11}"
        )
        for wid in sorted(by_worker.keys()):
            ws = by_worker[wid]
            role = ws[0].get("worker_role", "?")
            cap = max(int(r.get("capacity_tokens") or 0) for r in ws)

            def m(field: str) -> float:
                vals = [int(r.get(field) or 0) for r in ws]
                return statistics.fmean(vals) if vals else 0.0

            r_ev = m("radix_evictable_tokens")
            r_pr = m("radix_protected_tokens")
            slot = m("slot_private_held_tokens")
            rb = m("running_batch_kv_tokens")
            tq = m("transfer_queue_tokens")
            pq = m("prealloc_queue_tokens")
            rq = m("retracted_queue_tokens")
            avail = m("available_tokens")
            # `running_batch_kv_tokens` overlaps with radix_protected for tree-tracked
            # reqs — do NOT subtract it again. Decomposition assumes:
            # capacity ≈ avail + r_evictable + r_protected + slot_private
            #           + transfer_queue + prealloc_queue + retracted_queue + unaccounted
            unacc = max(
                0,
                cap - avail - r_ev - r_pr - slot - tq - pq - rq,
            )
            print(
                f"{wid:<12} {role:<8} | "
                f"{_fmt_tokens(r_ev):>11} {_fmt_tokens(r_pr):>11} {_fmt_tokens(slot):>12} | "
                f"{_fmt_tokens(rb):>10} {_fmt_tokens(tq):>9} {_fmt_tokens(pq):>9} {_fmt_tokens(rq):>10} | "
                f"{_fmt_tokens(unacc):>11}"
            )

            summary["workers"][wid]["pool_breakdown_avg"] = {
                "radix_evictable": r_ev,
                "radix_protected": r_pr,
                "slot_private_held": slot,
                "running_batch_kv": rb,
                "transfer_queue": tq,
                "prealloc_queue": pq,
                "retracted_queue": rq,
                "available": avail,
                "unaccounted": unacc,
            }
        print(
            "\nNote: running_batch_kv_tokens overlaps with radix_protected_tokens "
            "(tree-tracked decode reqs are also in protected); not summed."
        )
    else:
        print("\n(P1 instrument absent: pool_breakdown fields are all zero)")

    # Session residency churn: how many distinct sessions ever sat on each worker,
    # and how many sessions hopped across workers (= starvation indicator).
    print("\n=== Session residency churn ===")
    sessions_per_worker: dict[str, set[str]] = defaultdict(set)
    workers_per_session: dict[str, set[str]] = defaultdict(set)
    resident_ticks_per_session: Counter[str] = Counter()
    resident_ticks_per_worker: Counter[str] = Counter()

    for row in rows:
        wid = row.get("worker_id")
        if wid is None or row.get("worker_role") != "decode":
            continue
        sessions = row.get("sessions") or []
        if not isinstance(sessions, list):
            continue
        for entry in sessions:
            if not isinstance(entry, dict):
                continue
            sid = entry.get("session_id")
            if sid is None:
                continue
            if entry.get("resident"):
                sessions_per_worker[wid].add(sid)
                workers_per_session[sid].add(wid)
                resident_ticks_per_session[(wid, sid)] += 1
                resident_ticks_per_worker[wid] += 1

    # Per-decode worker: distinct session count
    print(f"  {'worker':<12} {'distinct_sess':>14} {'resident_ticks':>16}")
    for wid in sorted(sessions_per_worker.keys()):
        print(
            f"  {wid:<12} {len(sessions_per_worker[wid]):>14} "
            f"{resident_ticks_per_worker[wid]:>16}"
        )

    # Per session: how many workers it hopped across
    hops = Counter(len(ws) for ws in workers_per_session.values())
    print(f"\n  Sessions seen on N workers (decode side):")
    for n, count in sorted(hops.items()):
        print(f"    on {n} worker(s): {count} sessions")

    starvation = [sid for sid, ws in workers_per_session.items() if len(ws) == 0]
    multi_hopper = sorted(
        ((sid, ws) for sid, ws in workers_per_session.items() if len(ws) >= 2),
        key=lambda x: -len(x[1]),
    )[:10]
    if multi_hopper:
        print(
            "\n  Top sessions seen resident on multiple workers (potential thrashing):"
        )
        for sid, ws in multi_hopper:
            print(f"    {sid}: {len(ws)} workers ({sorted(ws)})")

    summary["session_residency"] = {
        "distinct_sessions_per_worker": {
            wid: len(s) for wid, s in sessions_per_worker.items()
        },
        "session_hop_count_distribution": dict(hops),
        "starvation_session_count": len(starvation),
    }

    # If a request-metrics file is co-located, also bucket fallback reasons
    # against contemporaneous pool state (rough — uses tick nearest to median tick).
    metrics_path = timeseries_path.with_name("request-metrics.jsonl")
    if metrics_path.exists():
        print(f"\n=== Request-metrics summary ({metrics_path.name}) ===")
        mrows = _load_jsonl(metrics_path)
        modes = Counter(r.get("execution_mode") or "?" for r in mrows)
        total = sum(modes.values())
        for mode, count in modes.most_common():
            print(f"  {count:>6} ({100 * count / total:5.1f}%)  {mode}")
        summary["execution_modes"] = dict(modes)

    return summary


def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "path",
        type=Path,
        help="Path to d-pool-timeseries.jsonl OR a run dir containing it",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Also print a machine-readable JSON summary",
    )
    args = parser.parse_args()

    resolved = _resolve_input(args.path)
    summary = analyze(resolved)
    if args.json:
        print("\n=== JSON summary ===")
        print(json.dumps(summary, indent=2, sort_keys=True, default=str))


if __name__ == "__main__":
    main()