v2 exp(d): expand figure to 6 panels (TTFT/E2E mean+p90, TPS, per-worker GPU util)

Per request: TTFT mean+p90, E2E mean+p90, decode TPS (output goodput; total/prefill TPS omitted as cache-miss-inflated), and per-worker GPU-util boxplots (8 workers/arm, tracets vs thinktime) showing utilization level + balance.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 21:10:27 +08:00
parent 9b6091fe6e
commit 0b180c191e
3 changed files with 65 additions and 35 deletions
--- a/v2/exp_d_policy_dispatch/README.md
+++ b/v2/exp_d_policy_dispatch/README.md
@@ -27,6 +27,13 @@ Analyzer: `scripts/bench_report.py` (summaries in `results/`).

 ## Result (ms; `figs/exp_d_policy_dispatch.png`)

+The figure has 6 panels — TTFT mean/p90, E2E mean/p90, decode TPS (output
+goodput), and a per-worker GPU-util box plot (8 workers/arm). Decode TPS is the
+honest throughput metric (total/prefill TPS is inflated by cache-miss recompute,
+e.g. for LMetric); thinktime ≥ tracets on it everywhere (the system drains
+faster with real think-time). The GPU-util box plot shows LPWL also keeps the
+tightest worker balance.
+
 | policy | mode | TTFT p90 | E2E mean | E2E p90 | E2E p99 | TPOT p90 | APC | req-bal |
 |---|---|---:|---:|---:|---:|---:|---:|---:|
 | **LPWL** | tracets | 11099 | 9827 | 25366 | 93929 | 33 | 0.650 | **1.49×** |
--- a/v2/exp_d_policy_dispatch/plot.py
+++ b/v2/exp_d_policy_dispatch/plot.py
@@ -1,8 +1,9 @@
 """exp (d): 5-policy routing under tracets vs thinktime dispatch.

-Shows the ranking FLIP: under the faithful `thinktime` load the parameter-free
-LPWL (leastwork) is the clear winner, but under `tracets` (think-collapse bursts)
-its advantage disappears (it ties unified_ab on TTFT p90 and *loses* on E2E mean).
+Six panels: TTFT mean/p90, E2E mean/p90, decode-TPS (output goodput), and the
+per-worker GPU-util distribution. Shows the ranking FLIP — under faithful
+`thinktime` the parameter-free LPWL (leastwork) is the clear winner; under
+`tracets` (think-collapse bursts) its advantage disappears.

 Reads the two bench_report summaries; writes v2/figs/exp_d_policy_dispatch.png.
 Usage: python v2/exp_d_policy_dispatch/plot.py
@@ -13,56 +14,78 @@ import os
 import matplotlib
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt
+from matplotlib.patches import Patch

 HERE = os.path.dirname(__file__)
 TC = json.load(open(os.path.join(HERE, "results/tracets.json")))
 TT = json.load(open(os.path.join(HERE, "results/thinktime.json")))

-# canonical order: LPWL first; pretty labels
 ARMS = ["leastwork", "unified_ab", "unified_def", "lmetric", "sticky"]
 LABEL = {"leastwork": "LPWL\n(leastwork)", "unified_ab": "unified\n+A+B",
         "unified_def": "unified\ndefault", "lmetric": "LMetric", "sticky": "sticky"}
 C_TC, C_TT = "#d62728", "#2ca02c"  # tracets red / thinktime green (match exp_c)
+W = 0.38


-def panel(ax, key, sub, title, ylab):
-    tc = [TC[a][key][sub] / 1000.0 for a in ARMS]   # ms -> s
-    tt = [TT[a][key][sub] / 1000.0 for a in ARMS]
+def bar_panel(ax, tc, tt, title, ylab, fmt="{:.1f}", higher_better=False):
    x = range(len(ARMS))
-    w = 0.38
-    b1 = ax.bar([i - w / 2 for i in x], tc, w, label="tracets (burst)", color=C_TC)
-    b2 = ax.bar([i + w / 2 for i in x], tt, w, label="thinktime (faithful)", color=C_TT)
+    b1 = ax.bar([i - W / 2 for i in x], tc, W, color=C_TC)
+    b2 = ax.bar([i + W / 2 for i in x], tt, W, color=C_TT)
    for bars in (b1, b2):
        for r in bars:
            ax.text(r.get_x() + r.get_width() / 2, r.get_height(),
-                    f"{r.get_height():.1f}", ha="center", va="bottom", fontsize=8)
-    ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=9)
-    ax.set_ylabel(ylab); ax.set_title(title, fontsize=11)
+                    fmt.format(r.get_height()), ha="center", va="bottom", fontsize=7.5)
+    ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5)
+    arrow = "higher = better" if higher_better else "lower = better"
+    ax.set_ylabel(ylab); ax.set_title(f"{title}  ({arrow})", fontsize=10.5)
+    ax.grid(axis="y", alpha=.3); ax.set_ylim(0, max(tc + tt) * 1.18)
+
+
+def gpu_panel(ax):
+    """Per-worker gpu_util_mean distribution: tracets vs thinktime box per policy."""
+    def utils(D, a):
+        pw = D[a]["per_worker"]
+        return [pw[w]["gpu_util_mean"] for w in sorted(pw, key=int)
+                if pw[w].get("gpu_util_mean") is not None]
+    for i, a in enumerate(ARMS):
+        for D, off, c in [(TC, -W / 2, C_TC), (TT, +W / 2, C_TT)]:
+            bp = ax.boxplot([utils(D, a)], positions=[i + off], widths=0.30,
+                            patch_artist=True, showfliers=False,
+                            medianprops=dict(color="black"))
+            bp["boxes"][0].set(facecolor=c, alpha=.65)
+    ax.set_xticks(range(len(ARMS)))
+    ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5)
+    ax.set_ylabel("per-worker GPU util %"); ax.set_ylim(0, 100)
+    ax.set_title("Per-worker GPU util (box = 8 workers; tighter = balanced)", fontsize=10.5)
    ax.grid(axis="y", alpha=.3)
-    ax.set_ylim(0, max(tc + tt) * 1.18)
-    # mark LPWL-thinktime as the winner (lowest green) in each panel
-    ax.annotate("LPWL wins\nunder thinktime", xy=(0 + w / 2, tt[0]),
-                xytext=(0.9, max(tc + tt) * 0.86), fontsize=8.5, color=C_TT,
-                ha="left", arrowprops=dict(arrowstyle="->", color=C_TT, lw=1.3))
-    return b1, b2


-fig, (axL, axR) = plt.subplots(1, 2, figsize=(11.2, 4.6))
-panel(axL, "ttft_ms", "p90", "TTFT p90 (lower = better)", "TTFT p90 (s)")
-panel(axR, "e2e_ms", "mean", "E2E mean (lower = better)", "E2E mean (s)")
-axL.legend(loc="upper left", fontsize=9)
-fig.suptitle("5-policy routing: dispatch mode flips the ranking — "
-             "LPWL is best under faithful thinktime, only ties/loses under tracets bursts",
-             fontsize=11.5)
-fig.tight_layout(rect=(0, 0, 1, 0.95))
+def col(D, key, sub, scale=1.0):
+    return [D[a][key][sub] * scale for a in ARMS]
+
+
+fig, ax = plt.subplots(2, 3, figsize=(15.5, 8.6))
+bar_panel(ax[0, 0], col(TC, "ttft_ms", "mean", 1e-3), col(TT, "ttft_ms", "mean", 1e-3),
+          "TTFT mean", "s")
+bar_panel(ax[0, 1], col(TC, "ttft_ms", "p90", 1e-3), col(TT, "ttft_ms", "p90", 1e-3),
+          "TTFT p90", "s")
+bar_panel(ax[0, 2],
+          [TC[a]["throughput"]["decode_tps"] for a in ARMS],
+          [TT[a]["throughput"]["decode_tps"] for a in ARMS],
+          "Decode TPS (output goodput)", "tok/s", fmt="{:.0f}", higher_better=True)
+bar_panel(ax[1, 0], col(TC, "e2e_ms", "mean", 1e-3), col(TT, "e2e_ms", "mean", 1e-3),
+          "E2E mean", "s")
+bar_panel(ax[1, 1], col(TC, "e2e_ms", "p90", 1e-3), col(TT, "e2e_ms", "p90", 1e-3),
+          "E2E p90", "s")
+gpu_panel(ax[1, 2])
+
+fig.legend(handles=[Patch(facecolor=C_TC, label="tracets (burst artifact)"),
+                    Patch(facecolor=C_TT, label="thinktime (faithful load)")],
+           loc="lower center", ncol=2, fontsize=10.5, bbox_to_anchor=(0.5, 0.0))
+fig.suptitle("5-policy routing: tracets vs thinktime (807 reqs, dash0 8xH20) — "
+             "LPWL wins across the board under faithful thinktime",
+             fontsize=12.5)
+fig.tight_layout(rect=(0, 0.035, 1, 0.96))
 out = os.path.join(HERE, "..", "figs", "exp_d_policy_dispatch.png")
 fig.savefig(out, dpi=140)
 print("wrote", os.path.normpath(out))
-
-# also print the deltas the README cites
-print("\npolicy        TTFTp90 tc->tt    E2Emean tc->tt")
-for a in ARMS:
-    t1, t2 = TC[a]["ttft_ms"]["p90"], TT[a]["ttft_ms"]["p90"]
-    e1, e2 = TC[a]["e2e_ms"]["mean"], TT[a]["e2e_ms"]["mean"]
-    print(f"{a:<13} {t1/1000:5.1f}->{t2/1000:4.1f}s ({(t2-t1)/t1:+.0%})   "
-          f"{e1/1000:5.1f}->{e2/1000:4.1f}s ({(e2-e1)/e1:+.0%})")
--- a/v2/figs/exp_d_policy_dispatch.png
+++ b/v2/figs/exp_d_policy_dispatch.png