Files
agentic-kvc/scripts/plot_session_skew_cdf.py
Gahow Wang 74e0c2157a Add solo production-trace CDF figure (f2b_session_skew_prod.png)
Single-curve variant of f2b — production trace only, no replay overlay
and no uniform reference. Cleaner for boss-meeting/talk slides where the
extra context is noise. The combined three-curve figure is unchanged.

scripts/plot_session_skew_cdf.py: split into plot_combined +
plot_production_solo helpers; one run emits both PNGs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 10:53:30 +08:00

126 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""Plot a CDF of cumulative input-token mass by session rank.
Primary curve is the *production* trace
(``/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl``
on dash0), which has 1.3 M sessions across 2.1 M records over a 7200 s
window. Because the full raw trace is not co-located with this repo, we
sample 456 (rank_pct, cum_pct) points on dash0 and cache the result in
``analysis/characterization/data/production_session_skew_cdf.json``. Any
top-K% mass figure can be read off the resulting curve.
The replay-trace CDF (``traces/w600_r0.0015_st30.jsonl``, n=274) is
overlaid for sanity — the replay window samples a thin slice of the head
so its top-1% mass is lower, but the shape is preserved.
"""
from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
def load_replay_cdf(trace_path: Path) -> tuple[np.ndarray, np.ndarray, int]:
    """Build the session input-token mass CDF from a JSONL replay trace.

    Every non-blank line of *trace_path* is parsed as a JSON object carrying
    ``session_id`` and ``input_length`` keys. Input lengths are summed per
    session, sessions are ranked from heaviest to lightest, and the running
    share of the total is accumulated along that ranking.

    Args:
        trace_path: Path to the JSONL trace file.

    Returns:
        ``(rank_pct, cum_pct, n)`` where ``n`` is the number of distinct
        sessions, ``rank_pct[i]`` equals ``(i + 1) / n * 100`` and
        ``cum_pct[i]`` is the percentage of the summed input length held by
        the ``i + 1`` heaviest sessions.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``session_id`` or ``input_length``.
    """
    totals: dict[str, int] = defaultdict(int)
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with trace_path.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank / whitespace-only lines (e.g. a trailing empty
            # line), which json.loads would otherwise reject.
            if not line.strip():
                continue
            row = json.loads(line)
            totals[row["session_id"]] += int(row["input_length"])
    n = len(totals)
    # Ascending sort reversed -> heaviest session first.
    sorted_vals = np.sort(np.array(list(totals.values())))[::-1]
    cum = np.cumsum(sorted_vals) / sorted_vals.sum()
    rank_pct = np.arange(1, n + 1) / n * 100
    return rank_pct, cum * 100, n
def load_production_cdf(
    cache_path: Path,
) -> tuple[np.ndarray, np.ndarray, int, dict[str, float]]:
    """Load the cached production-trace CDF samples.

    The cache is a JSON object with a ``samples`` list whose entries carry
    ``rank_pct`` and ``cum_pct`` keys, plus top-level ``n_sessions`` and
    ``anchors_check`` fields.

    Args:
        cache_path: Path to the JSON cache file.

    Returns:
        ``(xs, ys, n_sessions, anchors_check)`` where ``xs`` / ``ys`` hold the
        sampled ``rank_pct`` / ``cum_pct`` values ordered by ascending
        ``rank_pct``; the last two items are passed through unchanged from
        the cache.

    Raises:
        KeyError: If a required field is missing from the cache.
    """
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    d = json.loads(cache_path.read_text(encoding="utf-8"))
    samples = d["samples"]
    xs = np.array([s["rank_pct"] for s in samples])
    ys = np.array([s["cum_pct"] for s in samples])
    # np.interp (used to read anchor values off this curve) needs increasing
    # x values; a stable sort is a no-op for an already ordered cache.
    order = np.argsort(xs, kind="stable")
    return xs[order], ys[order], d["n_sessions"], d["anchors_check"]
# Session-rank percentiles (x positions) that get a marker and a text label.
ANNOTATE_PTS = [1.0, 5.0, 10.0, 25.0, 50.0]


def _annotate_anchors(ax, prod_x, prod_y) -> None:
    """Mark and label the production curve at each ``ANNOTATE_PTS`` percentile.

    The curve value at every anchor is obtained by linear interpolation of
    ``(prod_x, prod_y)``; a dot is drawn there and a "top P% → Y%" label is
    placed at ``(P + 2.5, Y - 6)``.

    Args:
        ax: Axes-like object providing ``scatter`` and ``annotate``.
        prod_x: Rank percentiles of the sampled production curve.
        prod_y: Cumulative mass percentages matching ``prod_x``.
    """
    # Interpolate all anchors in one call, then draw them one by one.
    anchor_values = np.interp(ANNOTATE_PTS, prod_x, prod_y)
    for pct, raw_value in zip(ANNOTATE_PTS, anchor_values):
        mass = float(raw_value)
        ax.scatter([pct], [mass], color="#c44e52", s=55, zorder=5)
        label = f"top {pct:g}% → {mass:.1f}%"
        ax.annotate(
            label,
            xy=(pct, mass),
            xytext=(pct + 2.5, mass - 6),
            fontsize=10,
            color="#7a1d1d",
        )
def plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n,
                  out_path: Path) -> None:
    """Draw the three-curve CDF figure and save it to *out_path*.

    The figure overlays the sampled production curve (with anchor
    annotations), the raw replay-window CDF, and a dashed ``y = x`` uniform
    reference line.

    Args:
        prod_x: Rank percentiles of the sampled production curve.
        prod_y: Cumulative mass percentages matching ``prod_x``.
        prod_n: Number of sessions in the production trace (legend only).
        replay_rank_pct: Rank percentiles of the replay-window CDF.
        replay_cum_pct: Cumulative mass percentages matching
            ``replay_rank_pct``.
        replay_n: Number of sessions in the replay window (legend only).
        out_path: Destination image path; missing parent directories are
            created.
    """
    fig, ax = plt.subplots(figsize=(9, 5.5))
    # Report the number of samples actually plotted rather than a hard-coded
    # count, so the legend stays correct if the cache is regenerated with a
    # different sampling density.
    ax.plot(prod_x, prod_y, color="#c44e52", lw=2.4,
            label=f"production trace (n={prod_n:,} sessions, {len(prod_x)}-pt sampled)")
    _annotate_anchors(ax, prod_x, prod_y)
    ax.plot(replay_rank_pct, replay_cum_pct, color="#2f6fab", lw=1.6, alpha=0.85,
            label=f"replay window (n={replay_n} sessions, raw CDF)")
    ax.plot([0, 100], [0, 100], color="#888", ls="--", lw=1,
            label="uniform reference (y = x)")
    ax.set_xlim(0, 100)
    # Upper limit slightly above 100 so the curve's end is not on the frame.
    ax.set_ylim(0, 102)
    ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
    ax.set_ylabel("Cumulative % of input-token mass")
    # NOTE(review): the module docstring points at a "glm5.1" trace path while
    # this title says Qwen3 — confirm which model label is correct.
    ax.set_title("Session input-token mass CDF — Qwen3 production trace")
    ax.grid(True, alpha=0.3)
    ax.legend(loc="lower right", framealpha=0.92, fontsize=9)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f"wrote {out_path}")
def plot_production_solo(prod_x, prod_y, prod_n, out_path: Path) -> None:
    """Draw the production-only CDF figure and save it to *out_path*.

    Only the sampled production curve and its anchor annotations are drawn;
    there is no replay overlay, no uniform reference line and no legend.

    Args:
        prod_x: Rank percentiles of the sampled production curve.
        prod_y: Cumulative mass percentages matching ``prod_x``.
        prod_n: Number of sessions in the production trace (shown in the
            title).
        out_path: Destination image path; missing parent directories are
            created.
    """
    figure, axes = plt.subplots(figsize=(9, 5.5))

    # Curve plus the annotated anchor points.
    axes.plot(prod_x, prod_y, color="#c44e52", lw=2.6)
    _annotate_anchors(axes, prod_x, prod_y)

    # Axis ranges, labels, title and grid.
    axes.set_xlim(0, 100)
    axes.set_ylim(0, 102)
    axes.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
    axes.set_ylabel("Cumulative % of input-token mass")
    heading = (
        f"Session input-token mass CDF — Qwen3 production trace (n = {prod_n:,} sessions)"
    )
    axes.set_title(heading)
    axes.grid(True, alpha=0.3)

    # Write the image, then release the figure.
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    figure.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close(figure)
    print(f"wrote {out_path}")
def main() -> None:
    """Parse the command-line options and emit both CDF figures.

    The production cache is loaded first, then the replay trace; the combined
    figure is written before the production-only one.
    """
    # Flag -> default value, registered in this order.
    option_defaults = {
        "--replay-trace": "traces/w600_r0.0015_st30.jsonl",
        "--prod-cache": "analysis/characterization/data/production_session_skew_cdf.json",
        "--out-combined": "figs/f2b_session_skew.png",
        "--out-solo": "figs/f2b_session_skew_prod.png",
    }
    cli = argparse.ArgumentParser()
    for flag, fallback in option_defaults.items():
        cli.add_argument(flag, default=fallback)
    opts = cli.parse_args()

    # The fourth item (anchors_check) is not needed for plotting.
    prod_x, prod_y, prod_n, _unused_anchors = load_production_cdf(Path(opts.prod_cache))
    replay_x, replay_y, replay_n = load_replay_cdf(Path(opts.replay_trace))

    combined_target = Path(opts.out_combined)
    plot_combined(prod_x, prod_y, prod_n, replay_x, replay_y, replay_n, combined_target)

    solo_target = Path(opts.out_solo)
    plot_production_solo(prod_x, prod_y, prod_n, solo_target)


if __name__ == "__main__":
    main()