#!/usr/bin/env python3
"""Render the 4 per-policy comparison figures from a fresh b3 replay run.

Replicates the f4a / f4c per-worker / f6 headline / f6 full-grid figures from
analysis/characterization/render_window1_figures.py but reads its inputs
directly from a run directory (one subdir per policy, each with
metrics.summary.json + metrics.jsonl), rather than from the older
window_1_results derived JSONs.

Usage:
    python scripts/render_b3_figures_v2.py \\
        --run-dir outputs/b3_replay_20260527_0114 \\
        --apc-upper-json analysis/characterization/window_1_results/apc_upper_w600.json \\
        --out-dir figs/v2
"""
from __future__ import annotations

import argparse
import json
import re
import statistics
from pathlib import Path

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np

POLICY_ORDER = ["lmetric", "load_only", "sticky", "unified", "unified_v2"]
POLICY_COLOR = {
    "lmetric": "#1f77b4",
    "load_only": "#ff7f0e",
    "sticky": "#d62728",
    "unified": "#2ca02c",
    "unified_v2": "#17becf",
}

# Worker bars are labelled "e<k>" where k = port - WORKER_BASE_PORT.
WORKER_BASE_PORT = 8000

# First ":<digits>" that is followed by a path/query/fragment or the end of the
# string. Matches both "127.0.0.1:8001" and "http://127.0.0.1:8001/v1/...".
_PORT_RE = re.compile(r":(\d+)(?:[/?#]|$)")


def _resolve_run_label(run_dir: Path, run_label: str | None) -> str:
    """Return the label shown in figure titles.

    An explicit ``run_label`` wins; otherwise the run directory's name is used
    so that titles always describe the run that was actually rendered.
    """
    if run_label is not None:
        return run_label
    return run_dir.name or str(run_dir)


def _worker_port(url: str) -> int | None:
    """Extract the TCP port from a worker address, or None if there is none."""
    match = _PORT_RE.search(str(url))
    return int(match.group(1)) if match else None


def _annotated_bars(ax, labels, vals, color, fontsize, fmt="{:.1f}") -> None:
    """Draw a bar chart on ``ax`` and write each bar's value above it."""
    ax.bar(labels, vals, color=color, edgecolor="black", linewidth=0.5)
    for i, v in enumerate(vals):
        ax.text(i, v, fmt.format(v), ha="center", va="bottom", fontsize=fontsize)


def _policy_colors(pols: list[str]) -> list[str]:
    """Return one bar colour per policy, grey for policies without a colour."""
    return [POLICY_COLOR.get(p, "gray") for p in pols]


def load_policy_summary(run_dir: Path, pol: str) -> dict:
    """Load ``<run_dir>/<pol>/metrics.summary.json`` as a dict."""
    summary_path = run_dir / pol / "metrics.summary.json"
    return json.loads(summary_path.read_text(encoding="utf-8"))


def _load_routing(breakdown_path: Path) -> dict[str, str]:
    """Map request_id -> routed_to from breakdown.json; empty if the file is absent."""
    if not breakdown_path.exists():
        return {}
    routed_to: dict[str, str] = {}
    for item in json.loads(breakdown_path.read_text(encoding="utf-8")):
        rid = item.get("request_id")
        url = item.get("routed_to")
        if rid is not None and url is not None:
            routed_to[rid] = url
    return routed_to


def per_worker_ttft_p90(run_dir: Path, pol: str) -> dict[str, float]:
    """Group successful requests by routed backend (from breakdown.json), return TTFT p90.

    In the b3_replay_20260527_0114 setup the replayer's endpoint_url is the proxy
    (127.0.0.1:9300), not the actual backend worker. The proxy emits a per-request
    breakdown.json with `routed_to` (e.g. 127.0.0.1:8001), which we join on
    request_id to recover the per-worker assignment.

    Records with an ``error`` or without ``ttft_s`` are skipped. A record whose
    ids are not in breakdown.json falls back to its own ``endpoint_url``; if
    that is missing too the record is dropped.

    Returns:
        Mapping of worker address -> 90th percentile of its TTFT samples (s).
    """
    policy_dir = run_dir / pol
    routed_to = _load_routing(policy_dir / "breakdown.json")

    by_url: dict[str, list[float]] = {}
    with (policy_dir / "metrics.jsonl").open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            rec = json.loads(line)
            if rec.get("error"):
                continue
            ttft = rec.get("ttft_s")
            if ttft is None:
                continue
            # Try both id fields against the routing table, not just the first
            # one that happens to be set on the record.
            url = None
            for id_key in ("request_id", "proxy_request_id"):
                rid = rec.get(id_key)
                if rid and routed_to.get(rid):
                    url = routed_to[rid]
                    break
            url = url or rec.get("endpoint_url")
            if not url:
                continue
            by_url.setdefault(url, []).append(float(ttft))

    return {url: float(np.percentile(np.array(vals), 90)) for url, vals in by_url.items()}


def fig_apc_loss(
    run_dir: Path,
    apc_upper: dict,
    pols: list[str],
    out: Path,
    run_label: str | None = None,
) -> None:
    """Bar chart of achieved APC ratio per policy against the two ceilings.

    Args:
        run_dir: Run directory with one sub-directory per policy.
        apc_upper: Dict providing ``apc_upper_intra_session`` and
            ``apc_upper_any_session`` as fractions.
        pols: Policies to plot, in display order.
        out: Destination image path.
        run_label: Text for the title; defaults to the run directory name.
    """
    label = _resolve_run_label(run_dir, run_label)
    vals = []
    for pol in pols:
        s = load_policy_summary(run_dir, pol)
        total = s["total_input_tokens"]
        # A run with zero input tokens has no meaningful ratio; plot it as 0.
        ratio = s["total_cached_tokens"] / total if total else 0.0
        vals.append(ratio * 100)

    intra = apc_upper["apc_upper_intra_session"] * 100
    any_session = apc_upper["apc_upper_any_session"] * 100

    fig, ax = plt.subplots(figsize=(7, 4.2))
    _annotated_bars(ax, pols, vals, _policy_colors(pols), fontsize=9, fmt="{:.1f}%")
    ax.axhline(intra, linestyle="--", color="#444", alpha=0.7,
               label=f"intra-session ceiling {intra:.1f}%")
    ax.axhline(any_session, linestyle=":", color="#888", alpha=0.7,
               label=f"any-session ceiling {any_session:.1f}%")
    ax.set_ylim(0, 100)
    ax.set_ylabel("APC ratio (%)")
    ax.set_title(f"APC achieved vs theoretical ceiling ({label})")
    ax.legend(loc="lower right", fontsize=9)
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)


def fig_per_worker_ttft(
    run_dir: Path,
    pols: list[str],
    out: Path,
    run_label: str | None = None,
) -> None:
    """One panel per policy showing each worker's TTFT p90.

    Workers are ordered by port and labelled ``e<port - WORKER_BASE_PORT>``;
    addresses without a parseable port are listed last under their raw address.
    A policy with no successful requests gets an empty panel instead of
    aborting the whole figure.

    Raises:
        ValueError: if ``pols`` is empty.
    """
    if not pols:
        raise ValueError("fig_per_worker_ttft needs at least one policy")
    label = _resolve_run_label(run_dir, run_label)

    # squeeze=False keeps a 2-D axes array even for a single policy.
    fig, axes = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4.2),
                             sharey=True, squeeze=False)
    axes = axes[0]
    for ax, pol in zip(axes, pols):
        per = per_worker_ttft_p90(run_dir, pol)
        workers = sorted(
            ((_worker_port(url), url, p90) for url, p90 in per.items()),
            key=lambda w: (w[0] is None, w[0] or 0, w[1]),
        )
        labels = [
            f"e{port - WORKER_BASE_PORT}" if port is not None else url
            for port, url, _ in workers
        ]
        vals = [p90 for _, _, p90 in workers]
        if vals:
            _annotated_bars(ax, labels, vals, POLICY_COLOR.get(pol, "gray"), fontsize=8)
            median_v = statistics.median(vals)
            max_v = max(vals)
            ax.set_title(f"{pol}\nmedian {median_v:.1f}s · max {max_v:.1f}s", fontsize=10)
        else:
            ax.set_title(f"{pol}\nno successful requests", fontsize=10)
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(alpha=0.3, axis="y")
    axes[0].set_ylabel("worker TTFT p90 (s)")
    fig.suptitle(f"Per-worker TTFT p90 distribution ({label})")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)


def fig_latency_bars(
    run_dir: Path,
    pols: list[str],
    out: Path,
    run_label: str | None = None,
) -> None:
    """Three side-by-side bar charts: TTFT, TPOT and E2E p90 per policy."""
    label = _resolve_run_label(run_dir, run_label)
    # (panel title, summary key, aggregate, display scale); TPOT is shown in ms.
    metrics = [
        ("TTFT p90 (s)", "ttft_stats_s", "p90", 1.0),
        ("TPOT p90 (ms)", "tpot_stats_s", "p90", 1000.0),
        ("E2E p90 (s)", "latency_stats_s", "p90", 1.0),
    ]
    summaries = {p: load_policy_summary(run_dir, p) for p in pols}
    colors = _policy_colors(pols)

    fig, axes = plt.subplots(1, 3, figsize=(13, 4.2))
    for ax, (title, key, agg, scale) in zip(axes, metrics):
        vals = [summaries[p][key][agg] * scale for p in pols]
        ax.bar(pols, vals, color=colors, edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=20)
        for i, v in enumerate(vals):
            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=9)
        ax.grid(alpha=0.3, axis="y")
    fig.suptitle(f"Headline latencies per policy ({label})")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)


def fig_latency_full_grid(
    run_dir: Path,
    pols: list[str],
    out: Path,
    run_label: str | None = None,
) -> None:
    """Grid of bar charts: rows mean/p50/p90/p99, columns TTFT/TPOT/E2E."""
    label = _resolve_run_label(run_dir, run_label)
    rows = [("mean", "mean"), ("p50", "p50"), ("p90", "p90"), ("p99", "p99")]
    cols = [
        ("TTFT (s)", "ttft_stats_s", 1.0),
        ("TPOT (ms)", "tpot_stats_s", 1000.0),
        ("E2E (s)", "latency_stats_s", 1.0),
    ]
    summaries = {p: load_policy_summary(run_dir, p) for p in pols}
    colors = _policy_colors(pols)

    fig, axes = plt.subplots(len(rows), len(cols), figsize=(12.5, 11.5), sharex=True)
    for i, (row_label, agg) in enumerate(rows):
        for j, (col_label, key, scale) in enumerate(cols):
            ax = axes[i][j]
            vals = [summaries[p][key][agg] * scale for p in pols]
            _annotated_bars(ax, pols, vals, colors, fontsize=8)
            if j == 0:
                ax.set_ylabel(row_label, fontsize=11)
            if i == 0:
                ax.set_title(col_label, fontsize=11)
            ax.grid(alpha=0.3, axis="y")
            ax.tick_params(axis="x", rotation=20, labelsize=9)
            # Headroom so the value labels above the tallest bars are not clipped.
            ax.margins(y=0.18)
    fig.suptitle(f"Latencies per policy — mean / p50 / p90 / p99 ({label})")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)


def main() -> None:
    """Parse CLI arguments and render all four figures into ``--out-dir``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--run-dir", type=Path, required=True)
    parser.add_argument("--apc-upper-json", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--exclude-policies", default="",
                        help="Comma-separated policies to drop")
    parser.add_argument("--run-label", default=None,
                        help="Label shown in figure titles (default: run dir name)")
    args = parser.parse_args()

    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
    pols = [
        pol for pol in POLICY_ORDER
        if pol not in excluded and (args.run_dir / pol).is_dir()
    ]
    if not pols:
        # Fail early with a clear message instead of a matplotlib traceback.
        parser.error(f"no policy subdirectories to render under {args.run_dir}")
    print(f"policies: {pols}")

    args.out_dir.mkdir(parents=True, exist_ok=True)
    apc_upper = json.loads(args.apc_upper_json.read_text(encoding="utf-8"))
    label = args.run_label

    figures = [
        ("f4a_apc_loss.png",
         lambda out: fig_apc_loss(args.run_dir, apc_upper, pols, out, label)),
        ("f4c_per_worker_ttft.png",
         lambda out: fig_per_worker_ttft(args.run_dir, pols, out, label)),
        ("f6_e2e_latency_bars.png",
         lambda out: fig_latency_bars(args.run_dir, pols, out, label)),
        ("f6_e2e_latency_full_grid.png",
         lambda out: fig_latency_full_grid(args.run_dir, pols, out, label)),
    ]
    for filename, render in figures:
        out_path = args.out_dir / filename
        render(out_path)
        print(f"wrote {out_path}")


if __name__ == "__main__":
    main()