Files
agentic-kvc/scripts/render_b3_figures_v2.py
Gahow Wang 03d8c5d0d1 Render 4 per-policy figures on b3_replay_20260527_0114 into figs/v2/
User-provided fresh run with five policies (lmetric, load_only, sticky,
unified, plus a new unified_v2 variant). Reproduces the v1 set under
figs/v2/ so we can A/B the same panels:
  f4a_apc_loss.png         — APC bars per policy
  f4c_per_worker_ttft.png  — per-worker TTFT p90 panel per policy
  f6_e2e_latency_bars.png  — TTFT/TPOT/E2E p90 bars per policy
  f6_e2e_latency_full_grid.png — mean/p50/p90/p99 × TTFT/TPOT/E2E grid

scripts/render_b3_figures_v2.py is a standalone driver that reads each
policy's metrics.summary.json and breakdown.json directly from the run
directory — the breakdown.json `routed_to` field is required to recover
per-worker assignment because the new setup routes every request
through a proxy (127.0.0.1:9300), so metrics.jsonl's endpoint_url no
longer identifies the backend.

Headline numbers, new vs v1:
  APC          v2: lmetric 57.2% / load_only 53.9% / sticky 77.7%
                   unified 78.7% / unified_v2 78.4%
              v1: lmetric 56.9% / load_only 54.1% / sticky 77.2% / unified 79.4%
  TTFT p90 (s) v2: lmetric 14.8 / load_only 20.1 / sticky 14.8 /
                   unified  8.8 / unified_v2 10.1
              v1: lmetric 15.7 / load_only 20.2 / sticky 18.0 / unified 7.3
  E2E p90 (s)  v2: lmetric 25.4 / load_only 33.9 / sticky 30.3 /
                   unified 20.0 / unified_v2 24.1
              v1: lmetric 24.8 / load_only 33.5 / sticky 34.6 / unified 18.0
  Worker p90 (s, median / max)
              v2: lmetric 13.3/30.4 · load_only 21.3/29.2 · sticky 13.5/33.0
                  unified 10.0/35.1 · unified_v2 8.6/34.2
              v1: lmetric 13.9/31.3 · load_only 19.4/25.1 · sticky 20.3/55.4
                  unified 10.3/37.7

Story is unchanged: unified dominates at p90 across TTFT/E2E and on
median-worker latency; unified_v2 is competitive at p50 but slightly
worse than unified at p90.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 13:52:17 +08:00

218 lines
8.8 KiB
Python

#!/usr/bin/env python3
"""Render the 4 per-policy comparison figures from a fresh b3 replay run.
Replicates the f4a / f4c per-worker / f6 headline / f6 full-grid figures
from analysis/characterization/render_window1_figures.py but reads its
inputs directly from a run directory (one subdir per policy, each with
metrics.summary.json + metrics.jsonl, plus the proxy's breakdown.json for
per-worker routing), rather than from the older window_1_results derived
JSONs.
Usage:
python scripts/render_b3_figures_v2.py \\
--run-dir outputs/b3_replay_20260527_0114 \\
--apc-upper-json analysis/characterization/window_1_results/apc_upper_w600.json \\
--out-dir figs/v2
"""
from __future__ import annotations
import argparse
import json
import statistics
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
# Left-to-right plotting order of the known policies. main() keeps only the
# entries that have a subdirectory under the run directory.
POLICY_ORDER = [
    "lmetric",
    "load_only",
    "sticky",
    "unified",
    "unified_v2",
]

# Bar colour (hex RGB) for each policy. The figure functions look colours up
# with a "gray" fallback, so a policy missing here still renders.
POLICY_COLOR = {
    "lmetric": "#1f77b4",
    "load_only": "#ff7f0e",
    "sticky": "#d62728",
    "unified": "#2ca02c",
    "unified_v2": "#17becf",
}
def load_policy_summary(run_dir: Path, pol: str) -> dict:
    """Load one policy's ``metrics.summary.json`` from the run directory.

    Args:
        run_dir: Root directory of the replay run (one subdirectory per policy).
        pol: Policy name, i.e. the subdirectory of ``run_dir`` to read from.

    Returns:
        The parsed JSON document (callers index it like a dict).

    Raises:
        FileNotFoundError: If ``<run_dir>/<pol>/metrics.summary.json`` is missing.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    summary_path = run_dir / pol / "metrics.summary.json"
    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default encoding.
    return json.loads(summary_path.read_text(encoding="utf-8"))
def per_worker_ttft_p90(run_dir: Path, pol: str) -> dict[str, float]:
    """Group successful requests by routed backend and return each group's TTFT p90.

    In the b3_replay_20260527_0114 setup the replayer's endpoint_url is the
    proxy (127.0.0.1:9300), not the actual backend worker. The proxy emits
    a per-request breakdown.json with `routed_to` (e.g. 127.0.0.1:8001),
    which we join on request_id to recover the per-worker assignment.

    A ``metrics.jsonl`` record is skipped when it has a truthy ``error`` or no
    ``ttft_s``. A record with no matching ``routed_to`` entry falls back to its
    own ``endpoint_url``; if that is missing too, the record is dropped.

    Args:
        run_dir: Root directory of the replay run.
        pol: Policy name, i.e. the subdirectory of ``run_dir`` to read from.

    Returns:
        Mapping from backend URL to the 90th percentile of its TTFT samples.

    Raises:
        FileNotFoundError: If ``metrics.jsonl`` is missing (``breakdown.json``
            is optional).
        json.JSONDecodeError: If either file contains malformed JSON.
    """
    policy_dir = run_dir / pol
    breakdown_path = policy_dir / "breakdown.json"
    metrics_path = policy_dir / "metrics.jsonl"

    # request id -> backend URL, as recorded by the proxy.
    routed_to: dict[str, str] = {}
    if breakdown_path.exists():
        for item in json.loads(breakdown_path.read_text(encoding="utf-8")):
            rid = item.get("request_id")
            url = item.get("routed_to")
            if rid is not None and url is not None:
                routed_to[rid] = url

    by_url: dict[str, list[float]] = {}
    with metrics_path.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline or a hand-edited
            # file) instead of crashing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            if record.get("error"):
                continue
            ttft = record.get("ttft_s")
            if ttft is None:
                continue
            rid = record.get("request_id") or record.get("proxy_request_id")
            # Best-effort fallback: without a routing entry, attribute the
            # request to whatever endpoint the replayer recorded.
            url = routed_to.get(rid) or record.get("endpoint_url")
            if url is None:
                continue
            by_url.setdefault(url, []).append(float(ttft))

    return {
        url: float(np.percentile(np.asarray(samples), 90))
        for url, samples in by_url.items()
    }
def fig_apc_loss(run_dir: Path, apc_upper: dict, pols: list[str], out: Path) -> None:
    """Plot the achieved APC ratio per policy against the two ceiling lines.

    The ratio for a policy is ``total_cached_tokens / total_input_tokens``
    from its ``metrics.summary.json``, drawn as a percentage bar.

    Args:
        run_dir: Root directory of the replay run.
        apc_upper: Mapping with ``apc_upper_intra_session`` and
            ``apc_upper_any_session``; both are multiplied by 100 and drawn
            as horizontal percentage lines.
        pols: Policies to plot, in bar order.
        out: Destination image path.

    Raises:
        KeyError: If a summary or ``apc_upper`` lacks one of the keys above.
    """
    apc_pct: list[float] = []
    for pol in pols:
        summary = load_policy_summary(run_dir, pol)
        input_tokens = summary["total_input_tokens"]
        # A policy whose run produced no input tokens (e.g. every request
        # failed) would otherwise abort the figure with ZeroDivisionError;
        # report it as 0% instead.
        ratio = summary["total_cached_tokens"] / input_tokens if input_tokens else 0.0
        apc_pct.append(ratio * 100)

    fig, ax = plt.subplots(figsize=(7, 4.2))
    ax.bar(pols, apc_pct,
           color=[POLICY_COLOR.get(p, "gray") for p in pols],
           edgecolor="black", linewidth=0.5)
    # Value label just above each bar.
    for i, v in enumerate(apc_pct):
        ax.text(i, v, f"{v:.1f}%", ha="center", va="bottom", fontsize=9)

    intra_pct = apc_upper["apc_upper_intra_session"] * 100
    any_pct = apc_upper["apc_upper_any_session"] * 100
    ax.axhline(intra_pct,
               linestyle="--", color="#444", alpha=0.7,
               label=f"intra-session ceiling {intra_pct:.1f}%")
    ax.axhline(any_pct,
               linestyle=":", color="#888", alpha=0.7,
               label=f"any-session ceiling {any_pct:.1f}%")

    ax.set_ylim(0, 100)
    ax.set_ylabel("APC ratio (%)")
    ax.set_title("APC achieved vs theoretical ceiling (b3 replay 20260527_0114)")
    ax.legend(loc="lower right", fontsize=9)
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    # Close explicitly so repeated figure calls do not accumulate open figures.
    plt.close(fig)
def _worker_port(url: str) -> int:
    """Return the port of a ``host:port`` URL, ignoring any trailing ``/path``."""
    return int(url.rsplit(":", 1)[1].split("/", 1)[0])


def fig_per_worker_ttft(run_dir: Path, pols: list[str], out: Path) -> None:
    """Plot one panel per policy with a TTFT p90 bar for every backend worker.

    Workers are ordered by port and labelled ``e<port - 8000>`` (so
    127.0.0.1:8001 becomes ``e1``). Each panel title carries the median and
    maximum of that policy's per-worker p90 values. A policy with no usable
    requests gets an empty panel titled "no data" instead of aborting the
    whole figure.

    Args:
        run_dir: Root directory of the replay run.
        pols: Policies to plot, one panel each, left to right.
        out: Destination image path.

    Raises:
        IndexError, ValueError: If a worker URL has no parsable ``:port``.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single policy.
    fig, axes = plt.subplots(1, len(pols), figsize=(3 * len(pols), 4.2),
                             sharey=True, squeeze=False)
    panel_axes = axes[0]
    for ax, pol in zip(panel_axes, pols):
        ax.tick_params(axis="x", labelsize=8)
        ax.grid(alpha=0.3, axis="y")

        per_worker = per_worker_ttft_p90(run_dir, pol)
        if not per_worker:
            # statistics.median() and max() both raise on an empty sequence.
            ax.set_title(f"{pol}\nno data", fontsize=10)
            continue

        items = sorted(per_worker.items(), key=lambda kv: _worker_port(kv[0]))
        labels = [f"e{_worker_port(url) - 8000}" for url, _ in items]
        vals = [p90 for _, p90 in items]
        ax.bar(labels, vals,
               color=POLICY_COLOR.get(pol, "gray"),
               edgecolor="black", linewidth=0.5)
        # Value label just above each bar.
        for i, v in enumerate(vals):
            ax.text(i, v, f"{v:.1f}", ha="center", va="bottom", fontsize=8)
        median_v = statistics.median(vals)
        max_v = max(vals)
        ax.set_title(f"{pol}\nmedian {median_v:.1f}s · max {max_v:.1f}s", fontsize=10)

    # The y axis is shared, so only the leftmost panel needs the label.
    panel_axes[0].set_ylabel("worker TTFT p90 (s)")
    fig.suptitle("Per-worker TTFT p90 distribution (b3 replay 20260527_0114)")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_latency_bars(run_dir: Path, pols: list[str], out: Path) -> None:
    """Draw the headline figure: TTFT, TPOT and E2E p90 bars per policy.

    Values come from each policy's ``metrics.summary.json``. TPOT is scaled
    by 1000 and labelled in milliseconds; TTFT and E2E are plotted unscaled
    and labelled in seconds.

    Args:
        run_dir: Root directory of the replay run.
        pols: Policies to plot, in bar order.
        out: Destination image path.
    """
    # (panel title, summary section, multiplier applied before plotting)
    panels = (
        ("TTFT p90 (s)", "ttft_stats_s", 1.0),
        ("TPOT p90 (ms)", "tpot_stats_s", 1000.0),
        ("E2E p90 (s)", "latency_stats_s", 1.0),
    )

    summary_by_policy = {}
    for pol in pols:
        summary_by_policy[pol] = load_policy_summary(run_dir, pol)
    bar_colors = [POLICY_COLOR.get(pol, "gray") for pol in pols]

    fig, axes = plt.subplots(1, 3, figsize=(13, 4.2))
    for ax, (title, section, multiplier) in zip(axes, panels):
        heights = [summary_by_policy[pol][section]["p90"] * multiplier
                   for pol in pols]
        ax.bar(pols, heights, color=bar_colors,
               edgecolor="black", linewidth=0.5)
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=20)
        # Value label just above each bar.
        for x_pos, height in enumerate(heights):
            ax.text(x_pos, height, f"{height:.1f}",
                    ha="center", va="bottom", fontsize=9)
        ax.grid(alpha=0.3, axis="y")

    fig.suptitle("Headline latencies per policy (b3 replay 20260527_0114)")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def fig_latency_full_grid(run_dir: Path, pols: list[str], out: Path) -> None:
    """Draw a 4x3 grid of per-policy bars: mean/p50/p90/p99 by TTFT/TPOT/E2E.

    Rows are aggregates and columns are metrics. TPOT is scaled by 1000 and
    labelled in milliseconds; TTFT and E2E are plotted unscaled and labelled
    in seconds.

    Args:
        run_dir: Root directory of the replay run.
        pols: Policies to plot, in bar order within every cell.
        out: Destination image path.
    """
    # Each aggregate name doubles as the row label and the summary key.
    aggregates = ("mean", "p50", "p90", "p99")
    # (column title, summary section, multiplier applied before plotting)
    columns = (
        ("TTFT (s)", "ttft_stats_s", 1.0),
        ("TPOT (ms)", "tpot_stats_s", 1000.0),
        ("E2E (s)", "latency_stats_s", 1.0),
    )

    summary_by_policy = {}
    for pol in pols:
        summary_by_policy[pol] = load_policy_summary(run_dir, pol)
    bar_colors = [POLICY_COLOR.get(pol, "gray") for pol in pols]

    fig, axes = plt.subplots(len(aggregates), len(columns),
                             figsize=(12.5, 11.5), sharex=True)
    for row_idx, aggregate in enumerate(aggregates):
        for col_idx, (col_title, section, multiplier) in enumerate(columns):
            ax = axes[row_idx][col_idx]
            heights = [summary_by_policy[pol][section][aggregate] * multiplier
                       for pol in pols]
            ax.bar(pols, heights, color=bar_colors,
                   edgecolor="black", linewidth=0.5)
            # Value label just above each bar.
            for x_pos, height in enumerate(heights):
                ax.text(x_pos, height, f"{height:.1f}",
                        ha="center", va="bottom", fontsize=8)
            # Row labels only on the first column, column titles only on the
            # first row.
            if col_idx == 0:
                ax.set_ylabel(aggregate, fontsize=11)
            if row_idx == 0:
                ax.set_title(col_title, fontsize=11)
            ax.grid(alpha=0.3, axis="y")
            ax.tick_params(axis="x", rotation=20, labelsize=9)
            # Extra vertical margin (the original layout's y=0.18).
            ax.margins(y=0.18)

    fig.suptitle("Latencies per policy — mean / p50 / p90 / p99 (b3 replay 20260527_0114)")
    fig.tight_layout()
    fig.savefig(out, dpi=120)
    plt.close(fig)
def main() -> None:
    """Parse the command line and render the four figures into ``--out-dir``.

    The policies plotted are the entries of ``POLICY_ORDER`` that have a
    subdirectory under ``--run-dir`` and are not named in
    ``--exclude-policies``. If that leaves nothing to plot, the script exits
    with a usage error rather than failing later inside matplotlib on a
    zero-column subplot grid.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--run-dir", type=Path, required=True)
    parser.add_argument("--apc-upper-json", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--exclude-policies", default="",
                        help="Comma-separated policies to drop")
    args = parser.parse_args()

    excluded = {s.strip() for s in args.exclude_policies.split(",") if s.strip()}
    pols = [pol for pol in POLICY_ORDER
            if pol not in excluded and (args.run_dir / pol).is_dir()]
    if not pols:
        # parser.error() prints the usage text plus this message and exits
        # with status 2.
        parser.error(
            f"no policy subdirectories to plot under {args.run_dir} "
            f"(known policies: {', '.join(POLICY_ORDER)})"
        )
    print(f"policies: {pols}")

    args.out_dir.mkdir(parents=True, exist_ok=True)
    apc_upper = json.loads(args.apc_upper_json.read_text(encoding="utf-8"))

    def _emit(filename: str, render, *render_args) -> None:
        """Render one figure to ``out_dir / filename`` and report the path."""
        dest = args.out_dir / filename
        render(*render_args, dest)
        print(f"wrote {dest}")

    _emit("f4a_apc_loss.png", fig_apc_loss, args.run_dir, apc_upper, pols)
    _emit("f4c_per_worker_ttft.png", fig_per_worker_ttft, args.run_dir, pols)
    _emit("f6_e2e_latency_bars.png", fig_latency_bars, args.run_dir, pols)
    _emit("f6_e2e_latency_full_grid.png", fig_latency_full_grid, args.run_dir, pols)


if __name__ == "__main__":
    main()