diff --git a/figs/f2b_session_skew_prod.png b/figs/f2b_session_skew_prod.png new file mode 100644 index 0000000..61e9e1d Binary files /dev/null and b/figs/f2b_session_skew_prod.png differ diff --git a/scripts/plot_session_skew_cdf.py b/scripts/plot_session_skew_cdf.py index 4fe1477..e840c4d 100644 --- a/scripts/plot_session_skew_cdf.py +++ b/scripts/plot_session_skew_cdf.py @@ -47,32 +47,11 @@ def load_production_cdf( return xs, ys, d["n_sessions"], d["anchors_check"] -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument( - "--replay-trace", - default="traces/w600_r0.0015_st30.jsonl", - ) - parser.add_argument( - "--prod-cache", - default="analysis/characterization/data/production_session_skew_cdf.json", - ) - parser.add_argument("--out", default="figs/f2b_session_skew.png") - args = parser.parse_args() +ANNOTATE_PTS = [1.0, 5.0, 10.0, 25.0, 50.0] - prod_x, prod_y, prod_n, prod_anchors = load_production_cdf(Path(args.prod_cache)) - replay_rank_pct, replay_cum_pct, replay_n = load_replay_cdf(Path(args.replay_trace)) - fig, ax = plt.subplots(figsize=(9, 5.5)) - - ax.plot( - prod_x, prod_y, - color="#c44e52", lw=2.4, - label=f"production trace (n={prod_n:,} sessions, 456-pt sampled)", - ) - - annotate_pts = [1.0, 5.0, 10.0, 25.0, 50.0] - for p in annotate_pts: +def _annotate_anchors(ax, prod_x, prod_y) -> None: + for p in ANNOTATE_PTS: y = float(np.interp(p, prod_x, prod_y)) ax.scatter([p], [y], color="#c44e52", s=55, zorder=5) ax.annotate( @@ -83,32 +62,64 @@ def main() -> None: color="#7a1d1d", ) - ax.plot( - replay_rank_pct, replay_cum_pct, - color="#2f6fab", lw=1.6, - alpha=0.85, - label=f"replay window (n={replay_n} sessions, raw CDF)", - ) - ax.plot( - [0, 100], [0, 100], - color="#888", ls="--", lw=1, - label="uniform reference (y = x)", - ) - - ax.set_xlim(0, 100) - ax.set_ylim(0, 102) +def plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n, + out_path: Path) -> None: + fig, ax = plt.subplots(figsize=(9, 5.5)) + ax.plot(prod_x, prod_y, color="#c44e52", lw=2.4, + label=f"production trace (n={prod_n:,} sessions, 456-pt sampled)") + _annotate_anchors(ax, prod_x, prod_y) + ax.plot(replay_rank_pct, replay_cum_pct, color="#2f6fab", lw=1.6, alpha=0.85, + label=f"replay window (n={replay_n} sessions, raw CDF)") + ax.plot([0, 100], [0, 100], color="#888", ls="--", lw=1, + label="uniform reference (y = x)") + ax.set_xlim(0, 100); ax.set_ylim(0, 102) ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)") ax.set_ylabel("Cumulative % of input-token mass") ax.set_title("Session input-token mass CDF — Qwen3 production trace") ax.grid(True, alpha=0.3) ax.legend(loc="lower right", framealpha=0.92, fontsize=9) - - out_path = Path(args.out) out_path.parent.mkdir(parents=True, exist_ok=True) fig.savefig(out_path, dpi=150, bbox_inches="tight") + plt.close(fig) print(f"wrote {out_path}") +def plot_production_solo(prod_x, prod_y, prod_n, out_path: Path) -> None: + fig, ax = plt.subplots(figsize=(9, 5.5)) + ax.plot(prod_x, prod_y, color="#c44e52", lw=2.6) + _annotate_anchors(ax, prod_x, prod_y) + ax.set_xlim(0, 100); ax.set_ylim(0, 102) + ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)") + ax.set_ylabel("Cumulative % of input-token mass") + ax.set_title( + f"Session input-token mass CDF — Qwen3 production trace (n = {prod_n:,} sessions)" + ) + ax.grid(True, alpha=0.3) + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f"wrote {out_path}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--replay-trace", default="traces/w600_r0.0015_st30.jsonl") + parser.add_argument( + "--prod-cache", + default="analysis/characterization/data/production_session_skew_cdf.json", + ) + parser.add_argument("--out-combined", default="figs/f2b_session_skew.png") + parser.add_argument("--out-solo", default="figs/f2b_session_skew_prod.png") + args = parser.parse_args() + + prod_x, prod_y, prod_n, _ = load_production_cdf(Path(args.prod_cache)) + replay_rank_pct, replay_cum_pct, replay_n = load_replay_cdf(Path(args.replay_trace)) + + plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n, + Path(args.out_combined)) + plot_production_solo(prod_x, prod_y, prod_n, Path(args.out_solo)) + + if __name__ == "__main__": main()