Render 4 per-policy figures on b3_replay_20260527_0114 into figs/v2/

User-provided fresh run with five policies (lmetric, load_only, sticky,
unified, plus a new unified_v2 variant). Reproduces the v1 figure set under
figs/v2/ so we can A/B the same panels:

  f4a_apc_loss.png              — APC bars per policy
  f4c_per_worker_ttft.png       — per-worker TTFT p90 panel per policy
  f6_e2e_latency_bars.png       — TTFT/TPOT/E2E p90 bars per policy
  f6_e2e_latency_full_grid.png  — mean/p50/p90/p99 × TTFT/TPOT/E2E grid

scripts/render_b3_figures_v2.py is a standalone driver that reads each
policy's metrics.summary.json, metrics.jsonl and breakdown.json directly from
the run directory. The breakdown.json `routed_to` field is required to recover
the per-worker assignment: the new setup routes every request through a proxy
(127.0.0.1:9300), so metrics.jsonl's endpoint_url no longer identifies the
backend.

Headline numbers, new vs v1:

  APC
    v2: lmetric 57.2% / load_only 53.9% / sticky 77.7% / unified 78.7% / unified_v2 78.4%
    v1: lmetric 56.9% / load_only 54.1% / sticky 77.2% / unified 79.4%

  TTFT p90 (s)
    v2: lmetric 14.8 / load_only 20.1 / sticky 14.8 / unified 8.8 / unified_v2 10.1
    v1: lmetric 15.7 / load_only 20.2 / sticky 18.0 / unified 7.3

  E2E p90 (s)
    v2: lmetric 25.4 / load_only 33.9 / sticky 30.3 / unified 20.0 / unified_v2 24.1
    v1: lmetric 24.8 / load_only 33.5 / sticky 34.6 / unified 18.0

  Worker p90 (s, median / max)
    v2: lmetric 13.3/30.4 · load_only 21.3/29.2 · sticky 13.5/33.0 · unified 10.0/35.1 · unified_v2 8.6/34.2
    v1: lmetric 13.9/31.3 · load_only 19.4/25.1 · sticky 20.3/55.4 · unified 10.3/37.7

Story is unchanged: unified dominates at p90 across TTFT/E2E and on
median-worker latency; unified_v2 is competitive at p50 but slightly worse
than unified at p90.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 13:52:17 +08:00
parent 41232f49d3
commit 03d8c5d0d1
5 changed files with 217 additions and 0 deletions
--- a/figs/v2/f4a_apc_loss.png
+++ b/figs/v2/f4a_apc_loss.png
--- a/figs/v2/f4c_per_worker_ttft.png
+++ b/figs/v2/f4c_per_worker_ttft.png
--- a/figs/v2/f6_e2e_latency_bars.png
+++ b/figs/v2/f6_e2e_latency_bars.png
--- a/figs/v2/f6_e2e_latency_full_grid.png
+++ b/figs/v2/f6_e2e_latency_full_grid.png
--- a/scripts/render_b3_figures_v2.py
+++ b/scripts/render_b3_figures_v2.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""Render the 4 per-policy comparison figures from a fresh b3 replay run.
+
+Replicates the f4a / f4c per-worker / f6 headline / f6 full-grid figures
+from analysis/characterization/render_window1_figures.py but reads its
+inputs directly from a run directory (one subdir per policy, each with
+metrics.summary.json, metrics.jsonl and, for proxied runs, breakdown.json),
+rather than from the older window_1_results derived JSONs.
+
+Usage:
+    python scripts/render_b3_figures_v2.py \\
+        --run-dir outputs/b3_replay_20260527_0114 \\
+        --apc-upper-json analysis/characterization/window_1_results/apc_upper_w600.json \\
+        --out-dir figs/v2
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+from pathlib import Path
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+
+POLICY_ORDER = ["lmetric", "load_only", "sticky", "unified", "unified_v2"]
+POLICY_COLOR = {
+    "lmetric":    "#1f77b4",
+    "load_only":  "#ff7f0e",
+    "sticky":     "#d62728",
+    "unified":    "#2ca02c",
+    "unified_v2": "#17becf",
+}
+
+
+def load_policy_summary(run_dir: Path, pol: str) -> dict:
+    p = run_dir / pol / "metrics.summary.json"
+    return json.loads(p.read_text())
+
+
+def per_worker_ttft_p90(run_dir: Path, pol: str) -> dict[str, float]:
+    """Group successful requests by routed backend (from breakdown.json), return TTFT p90.
+
+    In the b3_replay_20260527_0114 setup the replayer's endpoint_url is the
+    proxy (127.0.0.1:9300), not the actual backend worker. The proxy emits
+    a per-request breakdown.json with `routed_to` (e.g. 127.0.0.1:8001),
+    which we join on request_id to recover the per-worker assignment.
+    """
+    breakdown_path = run_dir / pol / "breakdown.json"
+    metrics_path = run_dir / pol / "metrics.jsonl"
+
+    routed_to: dict[str, str] = {}
+    if breakdown_path.exists():
+        for item in json.loads(breakdown_path.read_text()):
+            rid = item.get("request_id")
+            url = item.get("routed_to")
+            if rid is not None and url is not None:
+                routed_to[rid] = url
+
+    by_url: dict[str, list[float]] = {}
+    with metrics_path.open() as f:
+        for line in f:
+            r = json.loads(line)
+            if r.get("error"):
+                continue
+            t = r.get("ttft_s")
+            if t is None:
+                continue
+            rid = r.get("request_id") or r.get("proxy_request_id")
+            url = routed_to.get(rid) or r.get("endpoint_url")
+            if url is None:
+                continue
+            by_url.setdefault(url, []).append(float(t))
+
+    out: dict[str, float] = {}
+    for url, vals in by_url.items():
+        arr = np.array(vals)
+        out[url] = float(np.percentile(arr, 90))
+    return out
+
+
+def fig_apc_loss(run_dir: Path, apc_upper: dict, pols: list[str], out: Path) -> None:
+    apc_by_pol = {}
+    for pol in pols:
+        s = load_policy_summary(run_dir, pol)
+        apc_by_pol[pol] = s["total_cached_tokens"] / s["total_input_tokens"]
+    fig, ax = plt.subplots(figsize=(7, 4.2))
+    vals = [apc_by_pol[p] * 100 for p in pols]
+    ax.bar(pols, vals,
+           color=[POLICY_COLOR.get(p, "gray") for p in pols],
+           edgecolor="black", linewidth=0.5)
+    for i, v in enumerate(vals):
+        ax.text(i, v, f"{v:.1f}%", ha="center", va="bottom", fontsize=9)
+    ax.axhline(apc_upper["apc_upper_intra_session"] * 100,
+               linestyle="--", color="#444", alpha=0.7,
+               label=f"intra-session ceiling {apc_upper['apc_upper_intra_session']*100:.1f}%")
+    ax.axhline(apc_upper["apc_upper_any_session"] * 100,
+               linestyle=":", color="#888", alpha=0.7,
+               label=f"any-session ceiling {apc_upper['apc_upper_any_session']*100:.1f}%")
+    ax.set_ylim(0, 100)
+    ax.set_ylabel("APC ratio (%)")
+    ax.set_title("APC achieved vs theoretical ceiling (b3 replay 20260527_0114)")
+    ax.legend(loc="lower right", fontsize=9)
+    ax.grid(alpha=0.3, axis="y")
+    fig.tight_layout()
+    fig.savefig(out, dpi=120)
+    plt.close(fig)
+
+
+def fig_per_worker_ttft(run_dir: Path, pols: list[str], out: Path) -> None:
+    fig, axes = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4.2), sharey=True)
+    if len(pols) == 1:
+        axes = [axes]
+    for ax, pol in zip(axes, pols):
+        per = per_worker_ttft_p90(run_dir, pol)
+        items = sorted(per.items(), key=lambda kv: int(kv[0].rsplit(":", 1)[1]))
+        labels = [f"e{int(k.rsplit(':', 1)[1]) - 8000}" for k, _ in items]
+        vals = [v for _, v in items]
+        ax.bar(labels, vals,
+               color=POLICY_COLOR.get(pol, "gray"),
+               edgecolor="black", linewidth=0.5)
+        for i, v in enumerate(vals):
+            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
+        median_v = statistics.median(vals)
+        max_v = max(vals)
+        ax.set_title(f"{pol}\nmedian {median_v:.1f}s · max {max_v:.1f}s", fontsize=10)
+        ax.tick_params(axis="x", labelsize=8)
+        ax.grid(alpha=0.3, axis="y")
+    axes[0].set_ylabel("worker TTFT p90 (s)")
+    fig.suptitle("Per-worker TTFT p90 distribution (b3 replay 20260527_0114)")
+    fig.tight_layout()
+    fig.savefig(out, dpi=120)
+    plt.close(fig)
+
+
+def fig_latency_bars(run_dir: Path, pols: list[str], out: Path) -> None:
+    metrics = [("TTFT p90 (s)",  "ttft_stats_s",    "p90", 1.0),
+               ("TPOT p90 (ms)", "tpot_stats_s",    "p90", 1000.0),
+               ("E2E p90 (s)",   "latency_stats_s", "p90", 1.0)]
+    summaries = {p: load_policy_summary(run_dir, p) for p in pols}
+    fig, axes = plt.subplots(1, 3, figsize=(13, 4.2))
+    for ax, (label, key, agg, scale) in zip(axes, metrics):
+        vals = [summaries[p][key][agg] * scale for p in pols]
+        ax.bar(pols, vals,
+               color=[POLICY_COLOR.get(p, "gray") for p in pols],
+               edgecolor="black", linewidth=0.5)
+        ax.set_title(label)
+        ax.tick_params(axis="x", rotation=20)
+        for i, v in enumerate(vals):
+            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=9)
+        ax.grid(alpha=0.3, axis="y")
+    fig.suptitle("Headline latencies per policy (b3 replay 20260527_0114)")
+    fig.tight_layout()
+    fig.savefig(out, dpi=120)
+    plt.close(fig)
+
+
+def fig_latency_full_grid(run_dir: Path, pols: list[str], out: Path) -> None:
+    rows = [("mean", "mean"), ("p50", "p50"), ("p90", "p90"), ("p99", "p99")]
+    cols = [("TTFT (s)",  "ttft_stats_s",    1.0),
+            ("TPOT (ms)", "tpot_stats_s",    1000.0),
+            ("E2E (s)",   "latency_stats_s", 1.0)]
+    summaries = {p: load_policy_summary(run_dir, p) for p in pols}
+    fig, axes = plt.subplots(len(rows), len(cols), figsize=(12.5, 11.5), sharex=True)
+    for i, (row_label, agg) in enumerate(rows):
+        for j, (col_label, key, scale) in enumerate(cols):
+            ax = axes[i][j]
+            vals = [summaries[p][key][agg] * scale for p in pols]
+            ax.bar(pols, vals,
+                   color=[POLICY_COLOR.get(p, "gray") for p in pols],
+                   edgecolor="black", linewidth=0.5)
+            for k, v in enumerate(vals):
+                ax.text(k, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
+            if j == 0:
+                ax.set_ylabel(row_label, fontsize=11)
+            if i == 0:
+                ax.set_title(col_label, fontsize=11)
+            ax.grid(alpha=0.3, axis="y")
+            ax.tick_params(axis="x", rotation=20, labelsize=9)
+            ax.margins(y=0.18)
+    fig.suptitle("Latencies per policy — mean / p50 / p90 / p99 (b3 replay 20260527_0114)")
+    fig.tight_layout()
+    fig.savefig(out, dpi=120)
+    plt.close(fig)
+
+
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--run-dir", type=Path, required=True)
+    p.add_argument("--apc-upper-json", type=Path, required=True)
+    p.add_argument("--out-dir", type=Path, required=True)
+    p.add_argument("--exclude-policies", default="",
+                   help="Comma-separated policies to drop")
+    args = p.parse_args()
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+
+    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
+    pols = [pol for pol in POLICY_ORDER
+            if pol not in excluded and (args.run_dir / pol).is_dir()]
+    print(f"policies: {pols}")
+
+    apc_upper = json.loads(args.apc_upper_json.read_text())
+
+    fig_apc_loss(args.run_dir, apc_upper, pols, args.out_dir / "f4a_apc_loss.png")
+    print(f"wrote {args.out_dir / 'f4a_apc_loss.png'}")
+    fig_per_worker_ttft(args.run_dir, pols, args.out_dir / "f4c_per_worker_ttft.png")
+    print(f"wrote {args.out_dir / 'f4c_per_worker_ttft.png'}")
+    fig_latency_bars(args.run_dir, pols, args.out_dir / "f6_e2e_latency_bars.png")
+    print(f"wrote {args.out_dir / 'f6_e2e_latency_bars.png'}")
+    fig_latency_full_grid(args.run_dir, pols, args.out_dir / "f6_e2e_latency_full_grid.png")
+    print(f"wrote {args.out_dir / 'f6_e2e_latency_full_grid.png'}")
+
+
+if __name__ == "__main__":
+    main()