v2 exp(d): expand figure to 6 panels (TTFT/E2E mean+p90, TPS, per-worker GPU util)

Per request: TTFT mean+p90, E2E mean+p90, decode TPS (output goodput; total/
prefill TPS omitted as cache-miss-inflated), and per-worker GPU-util boxplots
(8 workers/arm, tracets vs thinktime) showing utilization level + balance.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-05-30 21:10:27 +08:00
parent 9b6091fe6e
commit 0b180c191e
3 changed files with 65 additions and 35 deletions

View File

@@ -27,6 +27,13 @@ Analyzer: `scripts/bench_report.py` (summaries in `results/`).
## Result (ms; `figs/exp_d_policy_dispatch.png`)
The figure has 6 panels: TTFT mean/p90, E2E mean/p90, decode TPS (output
goodput), and a per-worker GPU-util boxplot (8 workers/arm). Decode TPS is the
honest throughput metric, because total/prefill TPS is inflated by cache-miss
recompute (e.g. LMetric). On decode TPS, thinktime ≥ tracets everywhere (the
system drains faster with real think-time). The GPU-util boxplot shows that
LPWL also keeps the tightest worker balance.
| policy | mode | TTFT p90 | E2E mean | E2E p90 | E2E p99 | TPOT p90 | APC | req-bal |
|---|---|---:|---:|---:|---:|---:|---:|---:|
| **LPWL** | tracets | 11099 | 9827 | 25366 | 93929 | 33 | 0.650 | **1.49×** |

View File

@@ -1,8 +1,9 @@
"""exp (d): 5-policy routing under tracets vs thinktime dispatch.
Shows the ranking FLIP: under the faithful `thinktime` load the parameter-free
LPWL (leastwork) is the clear winner, but under `tracets` (think-collapse bursts)
its advantage disappears (it ties unified_ab on TTFT p90 and *loses* on E2E mean).
Six panels: TTFT mean/p90, E2E mean/p90, decode-TPS (output goodput), and the
per-worker GPU-util distribution. Shows the ranking FLIP — under faithful
`thinktime` the parameter-free LPWL (leastwork) is the clear winner; under
`tracets` (think-collapse bursts) its advantage disappears.
Reads the two bench_report summaries; writes v2/figs/exp_d_policy_dispatch.png.
Usage: python v2/exp_d_policy_dispatch/plot.py
@@ -13,56 +14,78 @@ import os
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
# Directory containing this script; the result summaries are resolved relative to it.
HERE = os.path.dirname(__file__)

# bench_report summaries, one per dispatch mode. The context managers close the
# file handles deterministically (the bare json.load(open(...)) form leaked them
# until garbage collection), and the encoding is pinned so parsing does not
# depend on the platform locale.
with open(os.path.join(HERE, "results/tracets.json"), encoding="utf-8") as _fh:
    TC = json.load(_fh)
with open(os.path.join(HERE, "results/thinktime.json"), encoding="utf-8") as _fh:
    TT = json.load(_fh)

# canonical order: LPWL first; pretty labels
ARMS = ["leastwork", "unified_ab", "unified_def", "lmetric", "sticky"]
LABEL = {
    "leastwork": "LPWL\n(leastwork)",
    "unified_ab": "unified\n+A+B",
    "unified_def": "unified\ndefault",
    "lmetric": "LMetric",
    "sticky": "sticky",
}
C_TC, C_TT = "#d62728", "#2ca02c"  # tracets red / thinktime green (match exp_c)
W = 0.38  # bar width; also used as the left/right offset between paired marks
def panel(ax, key, sub, title, ylab):
tc = [TC[a][key][sub] / 1000.0 for a in ARMS] # ms -> s
tt = [TT[a][key][sub] / 1000.0 for a in ARMS]
def bar_panel(ax, tc, tt, title, ylab, fmt="{:.1f}", higher_better=False):
x = range(len(ARMS))
w = 0.38
b1 = ax.bar([i - w / 2 for i in x], tc, w, label="tracets (burst)", color=C_TC)
b2 = ax.bar([i + w / 2 for i in x], tt, w, label="thinktime (faithful)", color=C_TT)
b1 = ax.bar([i - W / 2 for i in x], tc, W, color=C_TC)
b2 = ax.bar([i + W / 2 for i in x], tt, W, color=C_TT)
for bars in (b1, b2):
for r in bars:
ax.text(r.get_x() + r.get_width() / 2, r.get_height(),
f"{r.get_height():.1f}", ha="center", va="bottom", fontsize=8)
ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=9)
ax.set_ylabel(ylab); ax.set_title(title, fontsize=11)
fmt.format(r.get_height()), ha="center", va="bottom", fontsize=7.5)
ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5)
arrow = "higher = better" if higher_better else "lower = better"
ax.set_ylabel(ylab); ax.set_title(f"{title} ({arrow})", fontsize=10.5)
ax.grid(axis="y", alpha=.3); ax.set_ylim(0, max(tc + tt) * 1.18)
def gpu_panel(ax):
    """Draw paired boxplots of per-worker ``gpu_util_mean`` for every policy.

    For each policy in ARMS, one box is built from the tracets summary (TC) and
    one from the thinktime summary (TT); the two boxes sit left and right of
    the policy's tick position. Workers whose ``gpu_util_mean`` is missing or
    None are left out of the box.

    Args:
        ax: Matplotlib Axes to draw on; it is modified in place.
    """
    def worker_utils(summary, arm):
        # Worker ids are sorted numerically, so they must be int-convertible.
        per_worker = summary[arm]["per_worker"]
        values = []
        for wid in sorted(per_worker, key=int):
            if per_worker[wid].get("gpu_util_mean") is not None:
                values.append(per_worker[wid]["gpu_util_mean"])
        return values

    # (summary, horizontal offset from the tick, fill colour) for each dispatch mode.
    sides = [(TC, -W / 2, C_TC), (TT, +W / 2, C_TT)]
    for idx, arm in enumerate(ARMS):
        for summary, shift, colour in sides:
            drawn = ax.boxplot(
                [worker_utils(summary, arm)],
                positions=[idx + shift],
                widths=0.30,
                patch_artist=True,
                showfliers=False,
                medianprops=dict(color="black"),
            )
            drawn["boxes"][0].set(facecolor=colour, alpha=.65)

    ax.set_xticks(range(len(ARMS)))
    ax.set_xticklabels([LABEL[arm] for arm in ARMS], fontsize=8.5)
    ax.set_ylabel("per-worker GPU util %")
    ax.set_ylim(0, 100)
    ax.set_title("Per-worker GPU util (box = 8 workers; tighter = balanced)", fontsize=10.5)
    ax.grid(axis="y", alpha=.3)
ax.set_ylim(0, max(tc + tt) * 1.18)
# mark LPWL-thinktime as the winner (lowest green) in each panel
ax.annotate("LPWL wins\nunder thinktime", xy=(0 + w / 2, tt[0]),
xytext=(0.9, max(tc + tt) * 0.86), fontsize=8.5, color=C_TT,
ha="left", arrowprops=dict(arrowstyle="->", color=C_TT, lw=1.3))
return b1, b2
fig, (axL, axR) = plt.subplots(1, 2, figsize=(11.2, 4.6))
panel(axL, "ttft_ms", "p90", "TTFT p90 (lower = better)", "TTFT p90 (s)")
panel(axR, "e2e_ms", "mean", "E2E mean (lower = better)", "E2E mean (s)")
axL.legend(loc="upper left", fontsize=9)
fig.suptitle("5-policy routing: dispatch mode flips the ranking — "
"LPWL is best under faithful thinktime, only ties/loses under tracets bursts",
fontsize=11.5)
fig.tight_layout(rect=(0, 0, 1, 0.95))
def col(D, key, sub, scale=1.0):
    """Return ``D[arm][key][sub] * scale`` for each arm in ARMS, in ARMS order.

    Args:
        D: Summary mapping indexed as ``D[arm][key][sub]``.
        key: Metric group name (the callers pass e.g. ``"ttft_ms"``).
        sub: Statistic name inside the group (e.g. ``"mean"``, ``"p90"``).
        scale: Multiplier applied to every value.
    """
    groups = (D[arm][key] for arm in ARMS)
    return [group[sub] * scale for group in groups]
# Build the 2x3 figure. Top row: TTFT mean, TTFT p90, decode TPS.
# Bottom row: E2E mean, E2E p90, per-worker GPU-util boxes.
fig, ax = plt.subplots(2, 3, figsize=(15.5, 8.6))
# The latency values are read from the *_ms keys and scaled by 1e-3, so the
# panels are labelled in seconds.
bar_panel(ax[0, 0], col(TC, "ttft_ms", "mean", 1e-3), col(TT, "ttft_ms", "mean", 1e-3),
"TTFT mean", "s")
bar_panel(ax[0, 1], col(TC, "ttft_ms", "p90", 1e-3), col(TT, "ttft_ms", "p90", 1e-3),
"TTFT p90", "s")
# Decode TPS is the only bar panel flagged higher_better; its bar labels are
# formatted as integers.
bar_panel(ax[0, 2],
[TC[a]["throughput"]["decode_tps"] for a in ARMS],
[TT[a]["throughput"]["decode_tps"] for a in ARMS],
"Decode TPS (output goodput)", "tok/s", fmt="{:.0f}", higher_better=True)
bar_panel(ax[1, 0], col(TC, "e2e_ms", "mean", 1e-3), col(TT, "e2e_ms", "mean", 1e-3),
"E2E mean", "s")
bar_panel(ax[1, 1], col(TC, "e2e_ms", "p90", 1e-3), col(TT, "e2e_ms", "p90", 1e-3),
"E2E p90", "s")
gpu_panel(ax[1, 2])
# One figure-level legend built from explicit colour patches, shared by all panels.
fig.legend(handles=[Patch(facecolor=C_TC, label="tracets (burst artifact)"),
Patch(facecolor=C_TT, label="thinktime (faithful load)")],
loc="lower center", ncol=2, fontsize=10.5, bbox_to_anchor=(0.5, 0.0))
fig.suptitle("5-policy routing: tracets vs thinktime (807 reqs, dash0 8xH20) — "
"LPWL wins across the board under faithful thinktime",
fontsize=12.5)
# Lay the panels out inside a rect that leaves a strip free at the bottom and
# top — presumably for the legend and the suptitle.
fig.tight_layout(rect=(0, 0.035, 1, 0.96))
# The image is written to the figs/ directory next to this script's directory.
out = os.path.join(HERE, "..", "figs", "exp_d_policy_dispatch.png")
fig.savefig(out, dpi=140)
print("wrote", os.path.normpath(out))
# Also print the tracets -> thinktime deltas that the README cites: TTFT p90 and
# E2E mean per policy, shown in seconds with the relative change in percent.
print("\npolicy TTFTp90 tc->tt E2Emean tc->tt")
for arm in ARMS:
    ttft_tc = TC[arm]["ttft_ms"]["p90"]
    ttft_tt = TT[arm]["ttft_ms"]["p90"]
    e2e_tc = TC[arm]["e2e_ms"]["mean"]
    e2e_tt = TT[arm]["e2e_ms"]["mean"]
    ttft_part = f"{ttft_tc/1000:5.1f}->{ttft_tt/1000:4.1f}s ({(ttft_tt-ttft_tc)/ttft_tc:+.0%}) "
    e2e_part = f"{e2e_tc/1000:5.1f}->{e2e_tt/1000:4.1f}s ({(e2e_tt-e2e_tc)/e2e_tc:+.0%})"
    print(f"{arm:<13} " + ttft_part + e2e_part)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 81 KiB

After

Width:  |  Height:  |  Size: 152 KiB