v2 exp(d): expand figure to 6 panels (TTFT/E2E mean+p90, TPS, per-worker GPU util)
Per request: TTFT mean+p90, E2E mean+p90, decode TPS (output goodput; total/prefill TPS omitted as cache-miss-inflated), and per-worker GPU-util boxplots (8 workers/arm, tracets vs thinktime) showing utilization level + balance. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -27,6 +27,13 @@ Analyzer: `scripts/bench_report.py` (summaries in `results/`).
|
||||
|
||||
## Result (ms; `figs/exp_d_policy_dispatch.png`)
|
||||
|
||||
The figure has 6 panels — TTFT mean/p90, E2E mean/p90, decode TPS (output
|
||||
goodput), and the per-worker GPU-util box plot (8 workers/arm). Decode TPS is the
|
||||
honest throughput metric (total/prefill TPS is inflated by cache-miss recompute,
|
||||
e.g. LMetric); on decode TPS, thinktime ≥ tracets for every policy (the system drains faster
|
||||
with real think-time). The GPU-util box shows LPWL also keeps the tightest
|
||||
worker balance.
|
||||
|
||||
| policy | mode | TTFT p90 | E2E mean | E2E p90 | E2E p99 | TPOT p90 | APC | req-bal |
|
||||
|---|---|---:|---:|---:|---:|---:|---:|---:|
|
||||
| **LPWL** | tracets | 11099 | 9827 | 25366 | 93929 | 33 | 0.650 | **1.49×** |
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
"""exp (d): 5-policy routing under tracets vs thinktime dispatch.
|
||||
|
||||
Shows the ranking FLIP: under the faithful `thinktime` load the parameter-free
|
||||
LPWL (leastwork) is the clear winner, but under `tracets` (think-collapse bursts)
|
||||
its advantage disappears (it ties unified_ab on TTFT p90 and *loses* on E2E mean).
|
||||
Six panels: TTFT mean/p90, E2E mean/p90, decode-TPS (output goodput), and the
|
||||
per-worker GPU-util distribution. Shows the ranking FLIP — under faithful
|
||||
`thinktime` the parameter-free LPWL (leastwork) is the clear winner; under
|
||||
`tracets` (think-collapse bursts) its advantage disappears.
|
||||
|
||||
Reads the two bench_report summaries; writes v2/figs/exp_d_policy_dispatch.png.
|
||||
Usage: python v2/exp_d_policy_dispatch/plot.py
|
||||
@@ -13,56 +14,78 @@ import os
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.patches import Patch
|
||||
|
||||
HERE = os.path.dirname(__file__)
|
||||
TC = json.load(open(os.path.join(HERE, "results/tracets.json")))
|
||||
TT = json.load(open(os.path.join(HERE, "results/thinktime.json")))
|
||||
|
||||
# canonical order: LPWL first; pretty labels
|
||||
ARMS = ["leastwork", "unified_ab", "unified_def", "lmetric", "sticky"]
|
||||
LABEL = {"leastwork": "LPWL\n(leastwork)", "unified_ab": "unified\n+A+B",
|
||||
"unified_def": "unified\ndefault", "lmetric": "LMetric", "sticky": "sticky"}
|
||||
C_TC, C_TT = "#d62728", "#2ca02c" # tracets red / thinktime green (match exp_c)
|
||||
W = 0.38
|
||||
|
||||
|
||||
def panel(ax, key, sub, title, ylab):
|
||||
tc = [TC[a][key][sub] / 1000.0 for a in ARMS] # ms -> s
|
||||
tt = [TT[a][key][sub] / 1000.0 for a in ARMS]
|
||||
def bar_panel(ax, tc, tt, title, ylab, fmt="{:.1f}", higher_better=False):
|
||||
x = range(len(ARMS))
|
||||
w = 0.38
|
||||
b1 = ax.bar([i - w / 2 for i in x], tc, w, label="tracets (burst)", color=C_TC)
|
||||
b2 = ax.bar([i + w / 2 for i in x], tt, w, label="thinktime (faithful)", color=C_TT)
|
||||
b1 = ax.bar([i - W / 2 for i in x], tc, W, color=C_TC)
|
||||
b2 = ax.bar([i + W / 2 for i in x], tt, W, color=C_TT)
|
||||
for bars in (b1, b2):
|
||||
for r in bars:
|
||||
ax.text(r.get_x() + r.get_width() / 2, r.get_height(),
|
||||
f"{r.get_height():.1f}", ha="center", va="bottom", fontsize=8)
|
||||
ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=9)
|
||||
ax.set_ylabel(ylab); ax.set_title(title, fontsize=11)
|
||||
fmt.format(r.get_height()), ha="center", va="bottom", fontsize=7.5)
|
||||
ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5)
|
||||
arrow = "higher = better" if higher_better else "lower = better"
|
||||
ax.set_ylabel(ylab); ax.set_title(f"{title} ({arrow})", fontsize=10.5)
|
||||
ax.grid(axis="y", alpha=.3); ax.set_ylim(0, max(tc + tt) * 1.18)
|
||||
|
||||
|
||||
def gpu_panel(ax):
|
||||
"""Per-worker gpu_util_mean distribution: tracets vs thinktime box per policy."""
|
||||
def utils(D, a):
|
||||
pw = D[a]["per_worker"]
|
||||
return [pw[w]["gpu_util_mean"] for w in sorted(pw, key=int)
|
||||
if pw[w].get("gpu_util_mean") is not None]
|
||||
for i, a in enumerate(ARMS):
|
||||
for D, off, c in [(TC, -W / 2, C_TC), (TT, +W / 2, C_TT)]:
|
||||
bp = ax.boxplot([utils(D, a)], positions=[i + off], widths=0.30,
|
||||
patch_artist=True, showfliers=False,
|
||||
medianprops=dict(color="black"))
|
||||
bp["boxes"][0].set(facecolor=c, alpha=.65)
|
||||
ax.set_xticks(range(len(ARMS)))
|
||||
ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5)
|
||||
ax.set_ylabel("per-worker GPU util %"); ax.set_ylim(0, 100)
|
||||
ax.set_title("Per-worker GPU util (box = 8 workers; tighter = balanced)", fontsize=10.5)
|
||||
ax.grid(axis="y", alpha=.3)
|
||||
ax.set_ylim(0, max(tc + tt) * 1.18)
|
||||
# mark LPWL-thinktime as the winner (lowest green) in each panel
|
||||
ax.annotate("LPWL wins\nunder thinktime", xy=(0 + w / 2, tt[0]),
|
||||
xytext=(0.9, max(tc + tt) * 0.86), fontsize=8.5, color=C_TT,
|
||||
ha="left", arrowprops=dict(arrowstyle="->", color=C_TT, lw=1.3))
|
||||
return b1, b2
|
||||
|
||||
|
||||
fig, (axL, axR) = plt.subplots(1, 2, figsize=(11.2, 4.6))
|
||||
panel(axL, "ttft_ms", "p90", "TTFT p90 (lower = better)", "TTFT p90 (s)")
|
||||
panel(axR, "e2e_ms", "mean", "E2E mean (lower = better)", "E2E mean (s)")
|
||||
axL.legend(loc="upper left", fontsize=9)
|
||||
fig.suptitle("5-policy routing: dispatch mode flips the ranking — "
|
||||
"LPWL is best under faithful thinktime, only ties/loses under tracets bursts",
|
||||
fontsize=11.5)
|
||||
fig.tight_layout(rect=(0, 0, 1, 0.95))
|
||||
def col(D, key, sub, scale=1.0):
|
||||
return [D[a][key][sub] * scale for a in ARMS]
|
||||
|
||||
|
||||
fig, ax = plt.subplots(2, 3, figsize=(15.5, 8.6))
|
||||
bar_panel(ax[0, 0], col(TC, "ttft_ms", "mean", 1e-3), col(TT, "ttft_ms", "mean", 1e-3),
|
||||
"TTFT mean", "s")
|
||||
bar_panel(ax[0, 1], col(TC, "ttft_ms", "p90", 1e-3), col(TT, "ttft_ms", "p90", 1e-3),
|
||||
"TTFT p90", "s")
|
||||
bar_panel(ax[0, 2],
|
||||
[TC[a]["throughput"]["decode_tps"] for a in ARMS],
|
||||
[TT[a]["throughput"]["decode_tps"] for a in ARMS],
|
||||
"Decode TPS (output goodput)", "tok/s", fmt="{:.0f}", higher_better=True)
|
||||
bar_panel(ax[1, 0], col(TC, "e2e_ms", "mean", 1e-3), col(TT, "e2e_ms", "mean", 1e-3),
|
||||
"E2E mean", "s")
|
||||
bar_panel(ax[1, 1], col(TC, "e2e_ms", "p90", 1e-3), col(TT, "e2e_ms", "p90", 1e-3),
|
||||
"E2E p90", "s")
|
||||
gpu_panel(ax[1, 2])
|
||||
|
||||
fig.legend(handles=[Patch(facecolor=C_TC, label="tracets (burst artifact)"),
|
||||
Patch(facecolor=C_TT, label="thinktime (faithful load)")],
|
||||
loc="lower center", ncol=2, fontsize=10.5, bbox_to_anchor=(0.5, 0.0))
|
||||
fig.suptitle("5-policy routing: tracets vs thinktime (807 reqs, dash0 8xH20) — "
|
||||
"LPWL wins across the board under faithful thinktime",
|
||||
fontsize=12.5)
|
||||
fig.tight_layout(rect=(0, 0.035, 1, 0.96))
|
||||
out = os.path.join(HERE, "..", "figs", "exp_d_policy_dispatch.png")
|
||||
fig.savefig(out, dpi=140)
|
||||
print("wrote", os.path.normpath(out))
|
||||
|
||||
# also print the deltas the README cites
|
||||
print("\npolicy TTFTp90 tc->tt E2Emean tc->tt")
|
||||
for a in ARMS:
|
||||
t1, t2 = TC[a]["ttft_ms"]["p90"], TT[a]["ttft_ms"]["p90"]
|
||||
e1, e2 = TC[a]["e2e_ms"]["mean"], TT[a]["e2e_ms"]["mean"]
|
||||
print(f"{a:<13} {t1/1000:5.1f}->{t2/1000:4.1f}s ({(t2-t1)/t1:+.0%}) "
|
||||
f"{e1/1000:5.1f}->{e2/1000:4.1f}s ({(e2-e1)/e1:+.0%})")
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 81 KiB After Width: | Height: | Size: 152 KiB |
Reference in New Issue
Block a user