#!/usr/bin/env python3 """Plot a CDF of cumulative input-token mass by session rank. Primary curve is the *production* trace (``/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl`` on dash0), which has 1.3 M sessions across 2.1 M records over a 7200 s window. Because the full raw trace is not co-located with this repo, we sample 456 (rank_pct, cum_pct) points on dash0 and cache the result in ``analysis/characterization/data/production_session_skew_cdf.json``. Any top-K%% mass figure can be read off the resulting curve. The replay-trace CDF (``traces/w600_r0.0015_st30.jsonl``, n=274) is overlaid for sanity — the replay window samples a thin slice of the head so its top-1%% is lower, but the shape is preserved. """ from __future__ import annotations import argparse import json from collections import defaultdict from pathlib import Path import matplotlib.pyplot as plt import numpy as np def load_replay_cdf(trace_path: Path) -> tuple[np.ndarray, np.ndarray, int]: totals: dict[str, int] = defaultdict(int) with trace_path.open() as f: for line in f: row = json.loads(line) totals[row["session_id"]] += int(row["input_length"]) n = len(totals) sorted_vals = np.sort(np.array(list(totals.values())))[::-1] cum = np.cumsum(sorted_vals) / sorted_vals.sum() rank_pct = np.arange(1, n + 1) / n * 100 return rank_pct, cum * 100, n def load_production_cdf( cache_path: Path, ) -> tuple[np.ndarray, np.ndarray, int, dict[str, float]]: d = json.loads(cache_path.read_text()) samples = d["samples"] xs = np.array([s["rank_pct"] for s in samples]) ys = np.array([s["cum_pct"] for s in samples]) return xs, ys, d["n_sessions"], d["anchors_check"] ANNOTATE_PTS = [1.0, 5.0, 10.0, 25.0, 50.0] def _annotate_anchors(ax, prod_x, prod_y) -> None: for p in ANNOTATE_PTS: y = float(np.interp(p, prod_x, prod_y)) ax.scatter([p], [y], color="#c44e52", s=55, zorder=5) ax.annotate( f"top {p:g}% → {y:.1f}%", xy=(p, y), xytext=(p + 2.5, y - 6), fontsize=10, color="#7a1d1d", ) def plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n, out_path: Path) -> None: fig, ax = plt.subplots(figsize=(9, 5.5)) ax.plot(prod_x, prod_y, color="#c44e52", lw=2.4, label=f"production trace (n={prod_n:,} sessions, 456-pt sampled)") _annotate_anchors(ax, prod_x, prod_y) ax.plot(replay_rank_pct, replay_cum_pct, color="#2f6fab", lw=1.6, alpha=0.85, label=f"replay window (n={replay_n} sessions, raw CDF)") ax.plot([0, 100], [0, 100], color="#888", ls="--", lw=1, label="uniform reference (y = x)") ax.set_xlim(0, 100); ax.set_ylim(0, 102) ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)") ax.set_ylabel("Cumulative % of input-token mass") ax.set_title("Session input-token mass CDF — Qwen3 production trace") ax.grid(True, alpha=0.3) ax.legend(loc="lower right", framealpha=0.92, fontsize=9) out_path.parent.mkdir(parents=True, exist_ok=True) fig.savefig(out_path, dpi=150, bbox_inches="tight") plt.close(fig) print(f"wrote {out_path}") def plot_production_solo(prod_x, prod_y, prod_n, out_path: Path) -> None: fig, ax = plt.subplots(figsize=(9, 5.5)) ax.plot(prod_x, prod_y, color="#c44e52", lw=2.6) _annotate_anchors(ax, prod_x, prod_y) ax.set_xlim(0, 100); ax.set_ylim(0, 102) ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)") ax.set_ylabel("Cumulative % of input-token mass") ax.set_title( f"Session input-token mass CDF — Qwen3 production trace (n = {prod_n:,} sessions)" ) ax.grid(True, alpha=0.3) out_path.parent.mkdir(parents=True, exist_ok=True) fig.savefig(out_path, dpi=150, bbox_inches="tight") plt.close(fig) print(f"wrote {out_path}") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--replay-trace", default="traces/w600_r0.0015_st30.jsonl") parser.add_argument( "--prod-cache", default="analysis/characterization/data/production_session_skew_cdf.json", ) parser.add_argument("--out-combined", default="figs/f2b_session_skew.png") parser.add_argument("--out-solo", default="figs/f2b_session_skew_prod.png") args = parser.parse_args() prod_x, prod_y, prod_n, _ = load_production_cdf(Path(args.prod_cache)) replay_rank_pct, replay_cum_pct, replay_n = load_replay_cdf(Path(args.replay_trace)) plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n, Path(args.out_combined)) plot_production_solo(prod_x, prod_y, prod_n, Path(args.out_solo)) if __name__ == "__main__": main()