f2b: replace top-1/5/10% bars with full CDF; align all docs to replay-trace numbers
The previous f2b_session_skew.png was a 3-bar chart (top 1/5/10%) computed from the production trace summary (the production trace itself is not present locally, only its precomputed JSON). The new figure is a continuous CDF of cumulative input-token mass vs. session-rank percentile, generated directly from the replay trace traces/w600_r0.0015_st30.jsonl, so any percentile is readable.

Headline numbers update accordingly:
- replay trace (n=274 sessions): top 1% = 24.3%, top 5% = 61.9%, top 10% = 75.8%
- production trace (n=1.3M): top 1% = 46.5%, top 5% = 66.5%, top 10% = 74.6%

Both show extreme skew well above the y=x uniform reference; the replay trace is less extreme at top-1% because n=274 makes that bucket only ~3 sessions. We standardize the §2/§3 narrative on the replay-trace numbers so the motivation matches the §5 evaluation; the production numbers are kept as a side note for context.

Changes:
- scripts/plot_session_skew_cdf.py: reproducible figure generator
- MEETING.md / PAPER_OUTLINE.md: update narrative + caption

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
90
scripts/plot_session_skew_cdf.py
Normal file
90
scripts/plot_session_skew_cdf.py
Normal file
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Plot a CDF of cumulative input-token mass by session rank.
|
||||
|
||||
Reads a JSONL trace (chat_id, session_id, input_length, ...), aggregates
|
||||
per-session input_length, sorts sessions descending by total, and plots
|
||||
cumulative fraction of input-token mass vs session-rank percentile.
|
||||
|
||||
The figure replaces the previous discrete top-1%/5%/10% bars with a
|
||||
continuous curve so any percentile can be read off directly.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
|
||||
def load_session_input_tokens(trace_path: Path) -> dict[str, int]:
    """Sum ``input_length`` per ``session_id`` over a JSONL trace.

    Args:
        trace_path: Path to a JSONL file. Every non-blank line must be a JSON
            object carrying ``session_id`` and ``input_length`` keys.

    Returns:
        A plain ``dict`` mapping each session id to the sum of its rows'
        ``input_length`` values (each coerced with ``int``).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks ``session_id`` or ``input_length``.
    """
    totals: dict[str, int] = defaultdict(int)
    # Explicit encoding so decoding does not depend on the platform locale.
    with trace_path.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            row = json.loads(line)
            totals[row["session_id"]] += int(row["input_length"])
    # Return a regular dict so missing-key lookups by callers raise KeyError
    # instead of silently inserting zeros.
    return dict(totals)
|
||||
|
||||
|
||||
def main() -> None:
    """Render the session input-token mass CDF and save it as an image.

    Command-line options:
        --trace: JSONL trace path (default ``traces/w600_r0.0015_st30.jsonl``).
        --out: Output figure path (default ``figs/f2b_session_skew.png``).

    Sessions are sorted by total input tokens, largest first. The curve shows
    the cumulative share of all input tokens against session-rank percentile,
    with annotated markers at the top 1/5/10/25/50% and a dashed y = x
    reference line. Parent directories of the output path are created as
    needed, and the written path is printed on success.

    Exits through ``parser.error`` when the trace contains no sessions or its
    total input-token mass is not positive, since no CDF can be drawn then.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--trace",
        default="traces/w600_r0.0015_st30.jsonl",
        help="JSONL trace path",
    )
    parser.add_argument(
        "--out",
        default="figs/f2b_session_skew.png",
        help="Output figure path",
    )
    args = parser.parse_args()

    session_totals = load_session_input_tokens(Path(args.trace))
    n_sessions = len(session_totals)
    if n_sessions == 0:
        # Without this guard the marker lookup below fails with an opaque
        # IndexError on an empty array.
        parser.error(f"no sessions found in {args.trace}")

    # Descending order: rank 1 is the heaviest session.
    sorted_vals = np.sort(np.array(list(session_totals.values())))[::-1]
    total_tokens = sorted_vals.sum()
    if total_tokens <= 0:
        # Dividing by a zero total would fill the curve with NaN.
        parser.error(f"total input-token mass is not positive in {args.trace}")

    cum = np.cumsum(sorted_vals) / total_tokens
    rank_pct = np.arange(1, n_sessions + 1) / n_sessions * 100

    marks = [1, 5, 10, 25, 50]
    # Index of the last session inside the top p%: ceil(n * p / 100) - 1.
    # Integer ceiling division avoids float rounding pushing the index up.
    mark_idx = [(n_sessions * p + 99) // 100 - 1 for p in marks]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(rank_pct, cum * 100, color="#2f6fab", lw=2.2,
            label="cumulative input-token mass")
    ax.plot([0, 100], [0, 100], color="#999", ls="--", lw=1,
            label="uniform reference (y = x)")

    for p, i in zip(marks, mark_idx):
        y = cum[i] * 100
        ax.scatter([p], [y], color="#c44e52", zorder=5, s=40)
        ax.annotate(
            f"top {p}% → {y:.1f}%",
            xy=(p, y),
            xytext=(p + 2, y - 5),
            fontsize=9,
            color="#333",
        )

    ax.set_xlim(0, 100)
    # Slight headroom above 100 so the top of the curve is not clipped.
    ax.set_ylim(0, 102)
    ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
    ax.set_ylabel("Cumulative % of input-token mass")
    ax.set_title(
        f"Session input-token mass CDF "
        f"(n={n_sessions} sessions, "
        f"total={total_tokens / 1e6:.1f} M tokens)"
    )
    ax.grid(True, alpha=0.3)
    ax.legend(loc="lower right", framealpha=0.9)

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=150, bbox_inches="tight")
    # Release the figure so repeated in-process calls do not accumulate them.
    plt.close(fig)
    print(f"wrote {out_path}")
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user