Render 4 per-policy figures on b3_replay_20260527_0114 into figs/v2/
User-provided fresh run with five policies (lmetric, load_only, sticky,
unified, plus a new unified_v2 variant). Reproduces the v1 set under
figs/v2/ so we can A/B the same panels:
f4a_apc_loss.png — APC bars per policy
f4c_per_worker_ttft.png — per-worker TTFT p90 panel per policy
f6_e2e_latency_bars.png — TTFT/TPOT/E2E p90 bars per policy
f6_e2e_latency_full_grid.png — mean/p50/p90/p99 × TTFT/TPOT/E2E grid
scripts/render_b3_figures_v2.py is a standalone driver that reads each
policy's metrics.summary.json and breakdown.json directly from the run
directory — the breakdown.json `routed_to` field is required to recover
per-worker assignment because the new setup routes every request
through a proxy (127.0.0.1:9300), so metrics.jsonl's endpoint_url no
longer identifies the backend.
Headline numbers, new vs v1:
APC v2: lmetric 57.2% / load_only 53.9% / sticky 77.7%
unified 78.7% / unified_v2 78.4%
v1: lmetric 56.9% / load_only 54.1% / sticky 77.2% / unified 79.4%
TTFT p90 (s) v2: lmetric 14.8 / load_only 20.1 / sticky 14.8 /
unified 8.8 / unified_v2 10.1
v1: lmetric 15.7 / load_only 20.2 / sticky 18.0 / unified 7.3
E2E p90 (s) v2: lmetric 25.4 / load_only 33.9 / sticky 30.3 /
unified 20.0 / unified_v2 24.1
v1: lmetric 24.8 / load_only 33.5 / sticky 34.6 / unified 18.0
Worker p90 (s, median / max)
v2: lmetric 13.3/30.4 · load_only 21.3/29.2 · sticky 13.5/33.0
unified 10.0/35.1 · unified_v2 8.6/34.2
v1: lmetric 13.9/31.3 · load_only 19.4/25.1 · sticky 20.3/55.4
unified 10.3/37.7
Story is unchanged: unified dominates at p90 across TTFT/E2E and on
median-worker latency; unified_v2 is competitive at p50 but slightly
worse than unified at p90.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
BIN
figs/v2/f4a_apc_loss.png
Normal file
BIN
figs/v2/f4a_apc_loss.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 42 KiB |
BIN
figs/v2/f4c_per_worker_ttft.png
Normal file
BIN
figs/v2/f4c_per_worker_ttft.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 65 KiB |
BIN
figs/v2/f6_e2e_latency_bars.png
Normal file
BIN
figs/v2/f6_e2e_latency_bars.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 64 KiB |
BIN
figs/v2/f6_e2e_latency_full_grid.png
Normal file
BIN
figs/v2/f6_e2e_latency_full_grid.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 132 KiB |
217
scripts/render_b3_figures_v2.py
Normal file
217
scripts/render_b3_figures_v2.py
Normal file
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Render the 4 per-policy comparison figures from a fresh b3 replay run.
|
||||
|
||||
Replicates the f4a / f4c per-worker / f6 headline / f6 full-grid figures
|
||||
from analysis/characterization/render_window1_figures.py but reads its
|
||||
inputs directly from a run directory (one subdir per policy, each with
|
||||
metrics.summary.json + metrics.jsonl, plus an optional breakdown.json), rather than from the older
|
||||
window_1_results derived JSONs.
|
||||
|
||||
Usage:
|
||||
python scripts/render_b3_figures_v2.py \\
|
||||
--run-dir outputs/b3_replay_20260527_0114 \\
|
||||
--apc-upper-json analysis/characterization/window_1_results/apc_upper_w600.json \\
|
||||
--out-dir figs/v2
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# Canonical left-to-right ordering of the policies in every figure.
POLICY_ORDER = [
    "lmetric",
    "load_only",
    "sticky",
    "unified",
    "unified_v2",
]

# Bar colour (hex) for each policy, listed in the same order as POLICY_ORDER.
POLICY_COLOR = dict(
    zip(
        POLICY_ORDER,
        ("#1f77b4", "#ff7f0e", "#d62728", "#2ca02c", "#17becf"),
    )
)
|
||||
|
||||
|
||||
def load_policy_summary(run_dir: Path, pol: str) -> dict:
    """Load the parsed ``metrics.summary.json`` of one policy.

    Args:
        run_dir: Run directory holding one sub-directory per policy.
        pol: Policy name, i.e. the sub-directory to read from.

    Returns:
        The decoded JSON content of ``<run_dir>/<pol>/metrics.summary.json``.
    """
    summary_file = run_dir.joinpath(pol, "metrics.summary.json")
    with summary_file.open() as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def per_worker_ttft_p90(run_dir: Path, pol: str) -> dict[str, float]:
    """Return the TTFT p90 of successful requests, grouped by routed backend.

    In the b3_replay_20260527_0114 setup the replayer's endpoint_url is the
    proxy (127.0.0.1:9300), not the actual backend worker. The proxy emits
    a per-request breakdown.json with `routed_to` (e.g. 127.0.0.1:8001),
    which we join on request_id to recover the per-worker assignment.

    Args:
        run_dir: Run directory holding one sub-directory per policy.
        pol: Policy name; ``<run_dir>/<pol>/metrics.jsonl`` must exist and
            ``<run_dir>/<pol>/breakdown.json`` is used when present.

    Returns:
        Mapping from backend identifier to the 90th percentile of ``ttft_s``
        over the records assigned to it. Records with a truthy ``error`` or
        without ``ttft_s`` are ignored.

    Raises:
        FileNotFoundError: If ``metrics.jsonl`` is missing.
    """
    policy_dir = run_dir / pol
    breakdown_path = policy_dir / "breakdown.json"
    metrics_path = policy_dir / "metrics.jsonl"

    # request_id -> backend the proxy actually routed the request to.
    routed_to: dict[str, str] = {}
    if breakdown_path.exists():
        for item in json.loads(breakdown_path.read_text()):
            rid = item.get("request_id")
            url = item.get("routed_to")
            if rid is not None and url is not None:
                routed_to[rid] = url

    by_url: dict[str, list[float]] = {}
    with metrics_path.open() as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the
                # file) are not JSON documents; skip instead of crashing.
                continue
            r = json.loads(line)
            if r.get("error"):
                continue
            t = r.get("ttft_s")
            if t is None:
                continue
            rid = r.get("request_id") or r.get("proxy_request_id")
            # Fall back to the replayer-side endpoint when the proxy has no
            # routing record for this request (or no breakdown.json exists).
            url = routed_to.get(rid) or r.get("endpoint_url")
            if url is None:
                continue
            by_url.setdefault(url, []).append(float(t))

    return {
        url: float(np.percentile(np.array(vals), 90))
        for url, vals in by_url.items()
    }
|
||||
|
||||
|
||||
def fig_apc_loss(run_dir: Path, apc_upper: dict, pols: list[str], out: Path) -> None:
    """Plot the achieved APC ratio per policy against the two ceilings.

    The APC ratio of a policy is ``total_cached_tokens / total_input_tokens``
    from its summary, shown as a percentage bar. Two horizontal reference
    lines mark ``apc_upper["apc_upper_intra_session"]`` and
    ``apc_upper["apc_upper_any_session"]``.

    Args:
        run_dir: Run directory holding one sub-directory per policy.
        apc_upper: Mapping providing the two ceiling ratios named above.
        pols: Policies to plot, in bar order.
        out: Destination path of the saved figure.
    """
    percentages = []
    for name in pols:
        summary = load_policy_summary(run_dir, name)
        ratio = summary["total_cached_tokens"] / summary["total_input_tokens"]
        percentages.append(ratio * 100)

    fig, ax = plt.subplots(figsize=(7, 4.2))
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in pols]
    ax.bar(pols, percentages, color=bar_colors, edgecolor="black", linewidth=0.5)

    # Value label on top of each bar.
    for idx, pct in enumerate(percentages):
        ax.text(idx, pct, f"{pct:.1f}%", ha="center", va="bottom", fontsize=9)

    intra_pct = apc_upper["apc_upper_intra_session"] * 100
    ax.axhline(
        intra_pct,
        linestyle="--",
        color="#444",
        alpha=0.7,
        label=f"intra-session ceiling {intra_pct:.1f}%",
    )
    any_pct = apc_upper["apc_upper_any_session"] * 100
    ax.axhline(
        any_pct,
        linestyle=":",
        color="#888",
        alpha=0.7,
        label=f"any-session ceiling {any_pct:.1f}%",
    )

    ax.set_ylim(0, 100)
    ax.set_ylabel("APC ratio (%)")
    ax.set_title("APC achieved vs theoretical ceiling (b3 replay 20260527_0114)")
    ax.legend(loc="lower right", fontsize=9)
    ax.grid(alpha=0.3, axis="y")

    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||||
|
||||
|
||||
def fig_per_worker_ttft(run_dir: Path, pols: list[str], out: Path) -> None:
    """Plot one panel per policy with the TTFT p90 of each backend worker.

    Workers are ordered by the port parsed from their ``host:port``
    identifier and labelled ``e<port - 8000>``. Each panel title reports the
    median and maximum of the per-worker p90 values.

    A policy with no usable per-worker data gets an empty, explicitly titled
    panel; previously ``statistics.median`` / ``max`` raised on the empty
    list and aborted the whole figure.

    Args:
        run_dir: Run directory holding one sub-directory per policy.
        pols: Policies to plot, one subplot each, left to right.
        out: Destination path of the saved figure.
    """
    fig, axes = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4.2), sharey=True)
    if len(pols) == 1:
        # plt.subplots returns a bare Axes (not an array) for a 1x1 grid.
        axes = [axes]

    for ax, pol in zip(axes, pols):
        per = per_worker_ttft_p90(run_dir, pol)
        if per:
            # Parse each port once; it drives both the ordering and the label.
            by_port = sorted(
                (int(url.rsplit(":", 1)[1]), p90) for url, p90 in per.items()
            )
            labels = [f"e{port - 8000}" for port, _ in by_port]
            vals = [p90 for _, p90 in by_port]
            ax.bar(labels, vals,
                   color=POLICY_COLOR.get(pol, "gray"),
                   edgecolor="black", linewidth=0.5)
            for i, v in enumerate(vals):
                ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
            median_v = statistics.median(vals)
            max_v = max(vals)
            ax.set_title(f"{pol}\nmedian {median_v:.1f}s · max {max_v:.1f}s", fontsize=10)
        else:
            # Nothing to aggregate for this policy: keep the panel, say why.
            ax.set_title(f"{pol}\n(no successful requests)", fontsize=10)
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(alpha=0.3, axis="y")

    axes[0].set_ylabel("worker TTFT p90 (s)")
    fig.suptitle("Per-worker TTFT p90 distribution (b3 replay 20260527_0114)")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||||
|
||||
|
||||
def fig_latency_bars(run_dir: Path, pols: list[str], out: Path) -> None:
    """Plot the three headline p90 latency panels (TTFT, TPOT, E2E) per policy.

    Values come from each policy's summary under ``ttft_stats_s``,
    ``tpot_stats_s`` and ``latency_stats_s``; the TPOT value is multiplied by
    1000 and labelled in ms.

    Args:
        run_dir: Run directory holding one sub-directory per policy.
        pols: Policies to plot, in bar order.
        out: Destination path of the saved figure.
    """
    # (panel title, summary key, aggregate key, display scale factor)
    panels = (
        ("TTFT p90 (s)", "ttft_stats_s", "p90", 1.0),
        ("TPOT p90 (ms)", "tpot_stats_s", "p90", 1000.0),
        ("E2E p90 (s)", "latency_stats_s", "p90", 1.0),
    )

    summaries = {}
    for name in pols:
        summaries[name] = load_policy_summary(run_dir, name)

    fig, axes = plt.subplots(1, 3, figsize=(13, 4.2))
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in pols]

    for ax, (title, stats_key, aggregate, factor) in zip(axes, panels):
        heights = [summaries[name][stats_key][aggregate] * factor for name in pols]
        ax.bar(pols, heights, color=bar_colors, edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=20)
        # Value label on top of each bar.
        for idx, height in enumerate(heights):
            ax.text(idx, height, f"{height:.1f}", ha="center", va="bottom", fontsize=9)
        ax.grid(alpha=0.3, axis="y")

    fig.suptitle("Headline latencies per policy (b3 replay 20260527_0114)")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||||
|
||||
|
||||
def fig_latency_full_grid(run_dir: Path, pols: list[str], out: Path) -> None:
    """Plot the full latency grid: mean/p50/p90/p99 rows by TTFT/TPOT/E2E columns.

    Each cell is a per-policy bar chart of one aggregate of one metric, read
    from the policy summaries. The TPOT column is multiplied by 1000 and
    labelled in ms.

    Args:
        run_dir: Run directory holding one sub-directory per policy.
        pols: Policies to plot, in bar order.
        out: Destination path of the saved figure.
    """
    # (row label, aggregate key inside each stats dict)
    rows = [("mean", "mean"), ("p50", "p50"), ("p90", "p90"), ("p99", "p99")]
    # (column title, summary key, display scale factor)
    cols = [
        ("TTFT (s)", "ttft_stats_s", 1.0),
        ("TPOT (ms)", "tpot_stats_s", 1000.0),
        ("E2E (s)", "latency_stats_s", 1.0),
    ]

    summaries = {}
    for name in pols:
        summaries[name] = load_policy_summary(run_dir, name)

    fig, axes = plt.subplots(len(rows), len(cols), figsize=(12.5, 11.5), sharex=True)
    bar_colors = [POLICY_COLOR.get(name, "gray") for name in pols]

    for row_idx, (axes_row, (row_label, aggregate)) in enumerate(zip(axes, rows)):
        for col_idx, (ax, (col_label, stats_key, factor)) in enumerate(zip(axes_row, cols)):
            heights = [summaries[name][stats_key][aggregate] * factor for name in pols]
            ax.bar(pols, heights, color=bar_colors, edgecolor="black", linewidth=0.5)
            for idx, height in enumerate(heights):
                ax.text(idx, height, f"{height:.1f}", ha="center", va="bottom", fontsize=8)
            # Row labels only on the first column, column titles only on top.
            if col_idx == 0:
                ax.set_ylabel(row_label, fontsize=11)
            if row_idx == 0:
                ax.set_title(col_label, fontsize=11)
            ax.grid(alpha=0.3, axis="y")
            ax.tick_params(axis="x", rotation=20, labelsize=9)
            # Extra headroom on the y axis for the bar value labels.
            ax.margins(y=0.18)

    fig.suptitle("Latencies per policy — mean / p50 / p90 / p99 (b3 replay 20260527_0114)")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and render the four figures into ``--out-dir``.

    Policies are taken from ``POLICY_ORDER``, minus ``--exclude-policies``,
    keeping only those with a sub-directory under ``--run-dir``. If none
    remain the script exits through ``parser.error`` instead of writing an
    empty chart and then failing inside matplotlib on a zero-column grid.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--run-dir", type=Path, required=True)
    parser.add_argument("--apc-upper-json", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--exclude-policies", default="",
                        help="Comma-separated policies to drop")
    args = parser.parse_args()

    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
    pols = [pol for pol in POLICY_ORDER
            if pol not in excluded and (args.run_dir / pol).is_dir()]
    if not pols:
        parser.error(
            f"no policy directories found under {args.run_dir} "
            f"(known: {', '.join(POLICY_ORDER)}; "
            f"excluded: {', '.join(sorted(excluded)) or 'none'})"
        )
    print(f"policies: {pols}")

    args.out_dir.mkdir(parents=True, exist_ok=True)
    apc_upper = json.loads(args.apc_upper_json.read_text())

    # Output file name -> renderer; executed in this order.
    renderers = (
        ("f4a_apc_loss.png",
         lambda dst: fig_apc_loss(args.run_dir, apc_upper, pols, dst)),
        ("f4c_per_worker_ttft.png",
         lambda dst: fig_per_worker_ttft(args.run_dir, pols, dst)),
        ("f6_e2e_latency_bars.png",
         lambda dst: fig_latency_bars(args.run_dir, pols, dst)),
        ("f6_e2e_latency_full_grid.png",
         lambda dst: fig_latency_full_grid(args.run_dir, pols, dst)),
    )
    for filename, render in renderers:
        dst = args.out_dir / filename
        render(dst)
        print(f"wrote {dst}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user