Add solo production-trace CDF figure (f2b_session_skew_prod.png)

Single-curve variant of f2b — production trace only, no replay overlay
and no uniform reference. Cleaner for boss-meeting/talk slides where the
extra context is noise. The combined three-curve figure is unchanged.

scripts/plot_session_skew_cdf.py: split into plot_combined +
plot_production_solo helpers; one run emits both PNGs. The single --out
flag is replaced by --out-combined and --out-solo.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 10:53:30 +08:00
parent 1220da249c
commit 74e0c2157a
2 changed files with 51 additions and 40 deletions

View File

@@ -47,32 +47,11 @@ def load_production_cdf(
return xs, ys, d["n_sessions"], d["anchors_check"]
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"--replay-trace",
default="traces/w600_r0.0015_st30.jsonl",
)
parser.add_argument(
"--prod-cache",
default="analysis/characterization/data/production_session_skew_cdf.json",
)
parser.add_argument("--out", default="figs/f2b_session_skew.png")
args = parser.parse_args()
ANNOTATE_PTS = [1.0, 5.0, 10.0, 25.0, 50.0]
prod_x, prod_y, prod_n, prod_anchors = load_production_cdf(Path(args.prod_cache))
replay_rank_pct, replay_cum_pct, replay_n = load_replay_cdf(Path(args.replay_trace))
fig, ax = plt.subplots(figsize=(9, 5.5))
ax.plot(
prod_x, prod_y,
color="#c44e52", lw=2.4,
label=f"production trace (n={prod_n:,} sessions, 456-pt sampled)",
)
annotate_pts = [1.0, 5.0, 10.0, 25.0, 50.0]
for p in annotate_pts:
def _annotate_anchors(ax, prod_x, prod_y) -> None:
for p in ANNOTATE_PTS:
y = float(np.interp(p, prod_x, prod_y))
ax.scatter([p], [y], color="#c44e52", s=55, zorder=5)
ax.annotate(
@@ -83,32 +62,64 @@ def main() -> None:
color="#7a1d1d",
)
ax.plot(
replay_rank_pct, replay_cum_pct,
color="#2f6fab", lw=1.6,
alpha=0.85,
label=f"replay window (n={replay_n} sessions, raw CDF)",
)
ax.plot(
[0, 100], [0, 100],
color="#888", ls="--", lw=1,
label="uniform reference (y = x)",
)
ax.set_xlim(0, 100)
ax.set_ylim(0, 102)
def plot_combined(prod_x, prod_y, prod_n, replay_rank_pct, replay_cum_pct, replay_n,
                  out_path: Path) -> None:
    """Draw the three-curve session-skew CDF figure and save it to ``out_path``.

    The figure overlays the production curve (with anchor annotations), the
    replay-window curve, and a dashed ``y = x`` reference line.

    Args:
        prod_x: X coordinates of the production curve.
        prod_y: Y coordinates of the production curve.
        prod_n: Production session count; shown in the legend with thousands
            separators.
        replay_rank_pct: X coordinates of the replay curve.
        replay_cum_pct: Y coordinates of the replay curve.
        replay_n: Replay session count; shown in the legend.
        out_path: Destination image path. Missing parent directories are
            created before saving.
    """
    fig, ax = plt.subplots(figsize=(9, 5.5))

    ax.plot(prod_x, prod_y, color="#c44e52", lw=2.4,
            label=f"production trace (n={prod_n:,} sessions, 456-pt sampled)")
    _annotate_anchors(ax, prod_x, prod_y)
    ax.plot(replay_rank_pct, replay_cum_pct, color="#2f6fab", lw=1.6, alpha=0.85,
            label=f"replay window (n={replay_n} sessions, raw CDF)")
    ax.plot([0, 100], [0, 100], color="#888", ls="--", lw=1,
            label="uniform reference (y = x)")

    ax.set_xlim(0, 100)
    # Upper y-limit is 102 rather than 100.
    ax.set_ylim(0, 102)
    ax.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
    ax.set_ylabel("Cumulative % of input-token mass")
    ax.set_title("Session input-token mass CDF — Qwen3 production trace")
    ax.grid(True, alpha=0.3)
    ax.legend(loc="lower right", framealpha=0.92, fontsize=9)

    # Use the caller-supplied out_path as-is: this function has no `args`
    # in scope, so the destination must come from the parameter.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f"wrote {out_path}")
def plot_production_solo(prod_x, prod_y, prod_n, out_path: Path) -> None:
    """Draw the production-only CDF figure and save it to ``out_path``.

    Only the production curve and its anchor annotations are drawn; there is
    no legend, and the session count is placed in the title instead.

    Args:
        prod_x: X coordinates of the production curve.
        prod_y: Y coordinates of the production curve.
        prod_n: Production session count; shown in the title with thousands
            separators.
        out_path: Destination image path. Missing parent directories are
            created before saving.
    """
    figure, axes = plt.subplots(figsize=(9, 5.5))

    # Curve first, then the anchor markers drawn on top of it.
    axes.plot(prod_x, prod_y, color="#c44e52", lw=2.6)
    _annotate_anchors(axes, prod_x, prod_y)

    # Axis ranges.
    axes.set_xlim(0, 100)
    axes.set_ylim(0, 102)

    # Text decorations.
    heading = (
        f"Session input-token mass CDF — Qwen3 production trace (n = {prod_n:,} sessions)"
    )
    axes.set_xlabel("Session rank percentile (top → bottom by input-token mass)")
    axes.set_ylabel("Cumulative % of input-token mass")
    axes.set_title(heading)
    axes.grid(True, alpha=0.3)

    # Persist the figure and release it.
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    figure.savefig(out_path, dpi=150, bbox_inches="tight")
    plt.close(figure)
    print(f"wrote {out_path}")
def main() -> None:
    """Parse CLI options, load both CDFs, and write the combined and solo figures."""
    # (flag, default) pairs; every option is a plain string with a default.
    option_defaults = (
        ("--replay-trace", "traces/w600_r0.0015_st30.jsonl"),
        ("--prod-cache", "analysis/characterization/data/production_session_skew_cdf.json"),
        ("--out-combined", "figs/f2b_session_skew.png"),
        ("--out-solo", "figs/f2b_session_skew_prod.png"),
    )
    cli = argparse.ArgumentParser()
    for flag, default_value in option_defaults:
        cli.add_argument(flag, default=default_value)
    opts = cli.parse_args()

    # The fourth value returned by load_production_cdf is not needed here.
    prod_x, prod_y, prod_n, _unused = load_production_cdf(Path(opts.prod_cache))
    replay_rank_pct, replay_cum_pct, replay_n = load_replay_cdf(Path(opts.replay_trace))

    plot_combined(
        prod_x,
        prod_y,
        prod_n,
        replay_rank_pct,
        replay_cum_pct,
        replay_n,
        out_path=Path(opts.out_combined),
    )
    plot_production_solo(prod_x, prod_y, prod_n, out_path=Path(opts.out_solo))


if __name__ == "__main__":
    main()