Single-curve variant of f2b — production trace only, no replay overlay and no uniform reference. Cleaner for boss-meeting/talk slides where the extra context is noise. The combined three-curve figure is unchanged. scripts/plot_session_skew_cdf.py: split into plot_combined + plot_production_solo helpers; one run emits both PNGs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
126 lines
4.8 KiB
Python
126 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Plot a CDF of cumulative input-token mass by session rank.

Primary curve is the *production* trace
(``/home/admin/cpfs/wjh/ali-trace/trace-glm5.1-formatted/051315-051317.jsonl``
on dash0), which has 1.3 M sessions across 2.1 M records over a 7200 s
window. Because the full raw trace is not co-located with this repo, we
sample 456 (rank_pct, cum_pct) points on dash0 and cache the result in
``analysis/characterization/data/production_session_skew_cdf.json``. Any
top-K% mass figure can be read off the resulting curve.

The replay-trace CDF (``traces/w600_r0.0015_st30.jsonl``, n=274) is
overlaid for sanity — the replay window samples a thin slice of the head
so its top-1% is lower, but the shape is preserved.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
|
|
|
|
def load_replay_cdf(trace_path: Path) -> tuple[np.ndarray, np.ndarray, int]:
    """Build the per-session input-token mass CDF from a JSON-lines trace.

    Every non-blank line is parsed as a JSON object carrying ``session_id``
    and ``input_length``. Input lengths are summed per session, sessions are
    ranked from heaviest to lightest, and the running share of the total is
    accumulated along that ranking.

    Args:
        trace_path: Path to the JSONL replay trace.

    Returns:
        ``(rank_pct, cum_pct, n)`` where ``n`` is the number of distinct
        sessions, ``rank_pct[i] == (i + 1) / n * 100`` and ``cum_pct[i]`` is
        the percentage of all input tokens held by the ``i + 1`` heaviest
        sessions. An empty trace yields two empty arrays and ``n == 0``.

    Raises:
        ValueError: If sessions exist but their total input length is zero,
            in which case the CDF is undefined.
    """
    totals: dict[str, int] = defaultdict(int)
    with trace_path.open(encoding="utf-8") as f:
        for line in f:
            # Blank lines (commonly a trailing newline at EOF) are not valid
            # JSON documents; skip them instead of crashing in json.loads.
            if not line.strip():
                continue
            row = json.loads(line)
            totals[row["session_id"]] += int(row["input_length"])

    n = len(totals)
    if n == 0:
        # Nothing to rank: return empty curves rather than dividing by zero.
        return np.array([], dtype=float), np.array([], dtype=float), 0

    # int64 keeps the cumulative sum from overflowing on platforms whose
    # default NumPy integer is 32-bit.
    sorted_vals = np.sort(np.array(list(totals.values()), dtype=np.int64))[::-1]
    total_mass = sorted_vals.sum()
    if total_mass == 0:
        raise ValueError(
            f"{trace_path}: total input_length is zero; cannot build a CDF"
        )

    cum = np.cumsum(sorted_vals) / total_mass
    rank_pct = np.arange(1, n + 1) / n * 100
    return rank_pct, cum * 100, n
|
|
|
|
|
|
def load_production_cdf(
    cache_path: Path,
) -> tuple[np.ndarray, np.ndarray, int, dict[str, float]]:
    """Load the cached production-trace CDF samples.

    The cache is a JSON object with a ``samples`` list (each entry holding
    ``rank_pct`` and ``cum_pct``) plus ``n_sessions`` and ``anchors_check``
    entries, which are returned as stored.

    Args:
        cache_path: Path to the JSON cache file.

    Returns:
        ``(xs, ys, n_sessions, anchors_check)`` where ``xs`` / ``ys`` are the
        sampled rank and cumulative percentages as float arrays, ordered by
        ascending ``rank_pct``.

    Raises:
        KeyError: If the cache lacks one of the expected keys.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    payload = json.loads(cache_path.read_text(encoding="utf-8"))

    # The curve is later fed to np.interp as the x-coordinates, which must be
    # increasing (np.interp does not check this). sorted() is stable, so an
    # already-ordered cache comes back unchanged.
    samples = sorted(payload["samples"], key=lambda s: s["rank_pct"])

    xs = np.array([s["rank_pct"] for s in samples], dtype=float)
    ys = np.array([s["cum_pct"] for s in samples], dtype=float)
    return xs, ys, payload["n_sessions"], payload["anchors_check"]
|
|
|
|
|
|
# Session-rank percentiles (top-K %) that receive a marker and a text label.
ANNOTATE_PTS = [1.0, 5.0, 10.0, 25.0, 50.0]


def _annotate_anchors(ax, prod_x, prod_y) -> None:
    """Mark every ``ANNOTATE_PTS`` percentile on the production curve.

    For each anchor percentile the cumulative share is linearly interpolated
    from ``(prod_x, prod_y)``; a dot is drawn at that point and a
    ``"top K% → Y%"`` label is placed at an offset of ``(+2.5, -6)`` from it.

    Args:
        ax: Axes-like object providing ``scatter`` and ``annotate``.
        prod_x: Rank percentiles of the production curve.
        prod_y: Cumulative percentages matching ``prod_x``.
    """
    marker_style = {"color": "#c44e52", "s": 55, "zorder": 5}
    label_style = {"fontsize": 10, "color": "#7a1d1d"}

    for pct in ANNOTATE_PTS:
        share = float(np.interp(pct, prod_x, prod_y))
        ax.scatter([pct], [share], **marker_style)
        ax.annotate(
            f"top {pct:g}% → {share:.1f}%",
            xy=(pct, share),
            xytext=(pct + 2.5, share - 6),
            **label_style,
        )
|
|
|
|
|
|
def plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n,
                  out_path: Path) -> None:
    """Write the three-curve figure: production, replay, and y = x reference.

    Args:
        prod_x: Rank percentiles of the production curve.
        prod_y: Cumulative percentages matching ``prod_x``.
        prod_n: Number of production sessions (shown in the legend).
        replay_rank_pct: Rank percentiles of the replay curve.
        replay_cum_pct: Cumulative percentages matching ``replay_rank_pct``.
        replay_n: Number of replay sessions (shown in the legend).
        out_path: Destination image path; parent directories are created.
    """
    fig, ax = plt.subplots(figsize=(9, 5.5))
    try:
        # Report the real number of sampled points instead of a hard-coded
        # count, so the legend stays truthful if the cache is regenerated.
        ax.plot(
            prod_x, prod_y, color="#c44e52", lw=2.4,
            label=(
                f"production trace (n={prod_n:,} sessions, "
                f"{len(prod_x)}-pt sampled)"
            ),
        )
        _annotate_anchors(ax, prod_x, prod_y)
        ax.plot(
            replay_rank_pct, replay_cum_pct, color="#2f6fab", lw=1.6, alpha=0.85,
            label=f"replay window (n={replay_n} sessions, raw CDF)",
        )
        ax.plot(
            [0, 100], [0, 100], color="#888", ls="--", lw=1,
            label="uniform reference (y = x)",
        )

        ax.set_xlim(0, 100)
        # Upper limit slightly above 100, as in the solo figure.
        ax.set_ylim(0, 102)
        ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
        ax.set_ylabel("Cumulative % of input-token mass")
        ax.set_title("Session input-token mass CDF — Qwen3 production trace")
        ax.grid(True, alpha=0.3)
        ax.legend(loc="lower right", framealpha=0.92, fontsize=9)

        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=150, bbox_inches="tight")
    finally:
        # pyplot keeps figures alive until closed; release this one even when
        # drawing or saving fails.
        plt.close(fig)
    print(f"wrote {out_path}")
|
|
|
|
|
|
def plot_production_solo(prod_x, prod_y, prod_n, out_path: Path) -> None:
    """Write the single-curve figure showing only the production trace.

    Unlike the combined figure there is no replay overlay, no uniform
    reference line, and no legend; the session count goes into the title.

    Args:
        prod_x: Rank percentiles of the production curve.
        prod_y: Cumulative percentages matching ``prod_x``.
        prod_n: Number of production sessions (shown in the title).
        out_path: Destination image path; parent directories are created.
    """
    fig, ax = plt.subplots(figsize=(9, 5.5))
    try:
        ax.plot(prod_x, prod_y, color="#c44e52", lw=2.6)
        _annotate_anchors(ax, prod_x, prod_y)

        ax.set_xlim(0, 100)
        # Upper limit slightly above 100, as in the combined figure.
        ax.set_ylim(0, 102)
        ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
        ax.set_ylabel("Cumulative % of input-token mass")
        ax.set_title(
            f"Session input-token mass CDF — Qwen3 production trace (n = {prod_n:,} sessions)"
        )
        ax.grid(True, alpha=0.3)

        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=150, bbox_inches="tight")
    finally:
        # pyplot keeps figures alive until closed; release this one even when
        # drawing or saving fails.
        plt.close(fig)
    print(f"wrote {out_path}")
|
|
|
|
|
|
def main() -> None:
    """Parse the CLI options, load both CDFs, and emit both figures.

    The production cache is loaded first, then the replay trace; the combined
    figure is written before the production-only one.
    """
    # (flag, default) pairs; every option is a plain string path.
    path_options = (
        ("--replay-trace", "traces/w600_r0.0015_st30.jsonl"),
        ("--prod-cache", "analysis/characterization/data/production_session_skew_cdf.json"),
        ("--out-combined", "figs/f2b_session_skew.png"),
        ("--out-solo", "figs/f2b_session_skew_prod.png"),
    )
    cli = argparse.ArgumentParser()
    for flag, default in path_options:
        cli.add_argument(flag, default=default)
    opts = cli.parse_args()

    # The fourth element (the cache's anchors_check entry) is not used here.
    prod_x, prod_y, prod_n, _unused_anchors = load_production_cdf(Path(opts.prod_cache))
    replay_x, replay_y, replay_n = load_replay_cdf(Path(opts.replay_trace))

    combined_target = Path(opts.out_combined)
    plot_combined(prod_x, prod_y, prod_n, replay_x, replay_y, replay_n, combined_target)

    solo_target = Path(opts.out_solo)
    plot_production_solo(prod_x, prod_y, prod_n, solo_target)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|