agentic-kvc/scripts/plot_session_skew_cdf.py

#!/usr/bin/env python3
"""Plot a CDF of cumulative input-token mass by session rank.

Primary curve is the *production* trace
(``/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl``
on dash0), which has 1.3 M sessions across 2.1 M records over a 7200 s
window. Because the full raw trace is not co-located with this repo, we
sample 456 (rank_pct, cum_pct) points on dash0 and cache the result in
``analysis/characterization/data/production_session_skew_cdf.json``. Any
top-K% mass figure can be read off the resulting curve.

The replay-trace CDF (``traces/w600_r0.0015_st30.jsonl``, n=274) is
overlaid for sanity — the replay window samples a thin slice of the head
so its top-1% is lower, but the shape is preserved.
"""
from __future__ import annotations

import argparse
import json
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np


def load_replay_cdf(trace_path: Path) -> tuple[np.ndarray, np.ndarray, int]:
    """Build the session-rank CDF of input-token mass from a JSONL trace.

    Every non-blank line of ``trace_path`` is parsed as a JSON object and its
    ``input_length`` is added to the running total of its ``session_id``.
    Sessions are then ordered from the largest total to the smallest.

    Args:
        trace_path: JSONL file whose records carry ``session_id`` and
            ``input_length`` keys.

    Returns:
        ``(rank_pct, cum_pct, n)`` where ``rank_pct[i]`` is
        ``100 * (i + 1) / n``, ``cum_pct[i]`` is the percentage of the summed
        ``input_length`` carried by the ``i + 1`` largest sessions, and ``n``
        is the number of distinct ``session_id`` values.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``session_id`` or ``input_length``.
    """
    totals: dict[str, int] = defaultdict(int)
    # JSON text is UTF-8; do not depend on the process locale.
    with trace_path.open(encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at EOF) carry no
            # record and would otherwise make json.loads raise.
            if not line.strip():
                continue
            row = json.loads(line)
            totals[row["session_id"]] += int(row["input_length"])
    n = len(totals)
    # Ascending sort reversed -> largest session first.
    sorted_vals = np.sort(np.array(list(totals.values())))[::-1]
    cum = np.cumsum(sorted_vals) / sorted_vals.sum()
    rank_pct = np.arange(1, n + 1) / n * 100
    return rank_pct, cum * 100, n


def load_production_cdf(
    cache_path: Path,
) -> tuple[np.ndarray, np.ndarray, int, dict[str, float]]:
    """Read the cached production CDF samples from a JSON file.

    Args:
        cache_path: JSON file with a ``samples`` list (each entry holding
            ``rank_pct`` and ``cum_pct``) plus ``n_sessions`` and
            ``anchors_check`` entries.

    Returns:
        ``(rank_pct, cum_pct, n_sessions, anchors_check)``: the two sample
        columns as arrays, followed by the ``n_sessions`` and
        ``anchors_check`` values exactly as stored in the file.
    """
    payload = json.loads(cache_path.read_text())
    points = payload["samples"]

    def column(key: str) -> np.ndarray:
        # Pull one field out of every sample, preserving sample order.
        return np.array([point[key] for point in points])

    return (
        column("rank_pct"),
        column("cum_pct"),
        payload["n_sessions"],
        payload["anchors_check"],
    )


# Session-rank percentiles (x values, in %) that get a labelled marker on the
# production curve.
ANNOTATE_PTS = [1.0, 5.0, 10.0, 25.0, 50.0]


def _annotate_anchors(ax, prod_x, prod_y) -> None:
    """Mark and label the production curve at every ``ANNOTATE_PTS`` value.

    The curve height at each percentile is linearly interpolated from the
    sampled ``(prod_x, prod_y)`` points with ``np.interp``.

    Args:
        ax: Axes-like object providing ``scatter`` and ``annotate``.
        prod_x: Sampled rank percentiles.
            NOTE(review): ``np.interp`` expects these to be increasing —
            confirm the cache is written in rank order.
        prod_y: Cumulative percentages matching ``prod_x``.
    """
    marker_style = {"color": "#c44e52", "s": 55, "zorder": 5}
    label_style = {"fontsize": 10, "color": "#7a1d1d"}
    for pct in ANNOTATE_PTS:
        mass = float(np.interp(pct, prod_x, prod_y))
        ax.scatter([pct], [mass], **marker_style)
        # The label is shifted by (+2.5, -6) relative to the marker position.
        ax.annotate(
            f"top {pct:g}% → {mass:.1f}%",
            xy=(pct, mass),
            xytext=(pct + 2.5, mass - 6),
            **label_style,
        )


def plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n,
                  out_path: Path) -> None:
    """Draw the production and replay CDFs on one axes and save the figure.

    The production curve is annotated via ``_annotate_anchors`` and a dashed
    ``y = x`` line is added as a uniform reference.

    Args:
        prod_x: Sampled rank percentiles of the production curve.
        prod_y: Cumulative percentages matching ``prod_x``.
        prod_n: Production session count shown in the legend.
        replay_rank_pct: Rank percentiles of the replay curve.
        replay_cum_pct: Cumulative percentages matching ``replay_rank_pct``.
        replay_n: Replay session count shown in the legend.
        out_path: Destination image path; missing parent directories are
            created.
    """
    fig, ax = plt.subplots(figsize=(9, 5.5))
    try:
        # Derive the sample count from the data so the legend stays truthful
        # if the cache is regenerated with a different number of points.
        ax.plot(prod_x, prod_y, color="#c44e52", lw=2.4,
                label=f"production trace (n={prod_n:,} sessions, "
                      f"{len(prod_x)}-pt sampled)")
        _annotate_anchors(ax, prod_x, prod_y)
        ax.plot(replay_rank_pct, replay_cum_pct, color="#2f6fab", lw=1.6, alpha=0.85,
                label=f"replay window (n={replay_n} sessions, raw CDF)")
        ax.plot([0, 100], [0, 100], color="#888", ls="--", lw=1,
                label="uniform reference (y = x)")
        ax.set_xlim(0, 100)
        ax.set_ylim(0, 102)
        ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
        ax.set_ylabel("Cumulative % of input-token mass")
        # NOTE(review): the title says "Qwen3" while the trace path in the
        # module docstring mentions "glm5.1" — confirm which model is meant.
        ax.set_title("Session input-token mass CDF — Qwen3 production trace")
        ax.grid(True, alpha=0.3)
        ax.legend(loc="lower right", framealpha=0.92, fontsize=9)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=150, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    print(f"wrote {out_path}")


def plot_production_solo(prod_x, prod_y, prod_n, out_path: Path) -> None:
    """Save a figure containing only the annotated production CDF.

    Args:
        prod_x: Sampled rank percentiles of the production curve.
        prod_y: Cumulative percentages matching ``prod_x``.
        prod_n: Production session count shown in the title.
        out_path: Destination image path; missing parent directories are
            created.
    """
    figure, axes = plt.subplots(figsize=(9, 5.5))
    axes.plot(prod_x, prod_y, color="#c44e52", lw=2.6)
    _annotate_anchors(axes, prod_x, prod_y)

    # NOTE(review): the title says "Qwen3" while the trace path in the module
    # docstring mentions "glm5.1" — confirm which model is meant.
    heading = (
        f"Session input-token mass CDF — Qwen3 production trace (n = {prod_n:,} sessions)"
    )
    axes.set(
        xlim=(0, 100),
        ylim=(0, 102),
        xlabel="Session rank percentile (top → bottom by input-token mass)",
        ylabel="Cumulative % of input-token mass",
        title=heading,
    )
    axes.grid(True, alpha=0.3)

    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    figure.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close(figure)
    print(f"wrote {out_path}")


def main() -> None:
    """Parse the CLI options, load both CDFs, and write the two figures."""
    cli = argparse.ArgumentParser()
    option_defaults = (
        ("--replay-trace", "traces/w600_r0.0015_st30.jsonl"),
        ("--prod-cache", "analysis/characterization/data/production_session_skew_cdf.json"),
        ("--out-combined", "figs/f2b_session_skew.png"),
        ("--out-solo", "figs/f2b_session_skew_prod.png"),
    )
    for flag, default in option_defaults:
        cli.add_argument(flag, default=default)
    opts = cli.parse_args()

    # Keep (x, y, n); the fourth element (anchors_check) is not used here.
    production = load_production_cdf(Path(opts.prod_cache))[:3]
    replay = load_replay_cdf(Path(opts.replay_trace))

    plot_combined(*production, *replay, Path(opts.out_combined))
    plot_production_solo(*production, Path(opts.out_solo))


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()