v2 exp(d): expand figure to 6 panels (TTFT/E2E mean+p90, TPS, per-worker GPU util)
Per request: TTFT mean+p90, E2E mean+p90, decode TPS (output goodput; total/prefill TPS omitted as cache-miss-inflated), and per-worker GPU-util boxplots (8 workers/arm, tracets vs thinktime) showing utilization level + balance. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -27,6 +27,13 @@ Analyzer: `scripts/bench_report.py` (summaries in `results/`).
|
|||||||
|
|
||||||
## Result (ms; `figs/exp_d_policy_dispatch.png`)
|
## Result (ms; `figs/exp_d_policy_dispatch.png`)
|
||||||
|
|
||||||
|
The figure has 6 panels — TTFT mean/p90, E2E mean/p90, decode TPS (output
|
||||||
|
goodput), and the per-worker GPU-util box (8 workers/arm). Decode TPS is the
|
||||||
|
honest throughput metric (total/prefill TPS is inflated by cache-miss recompute,
|
||||||
|
e.g. for LMetric); thinktime ≥ tracets on it everywhere (the system drains faster
|
||||||
|
with real think-time). The GPU-util box shows LPWL also keeps the tightest
|
||||||
|
worker balance.
|
||||||
|
|
||||||
| policy | mode | TTFT p90 | E2E mean | E2E p90 | E2E p99 | TPOT p90 | APC | req-bal |
|
| policy | mode | TTFT p90 | E2E mean | E2E p90 | E2E p99 | TPOT p90 | APC | req-bal |
|
||||||
|---|---|---:|---:|---:|---:|---:|---:|---:|
|
|---|---|---:|---:|---:|---:|---:|---:|---:|
|
||||||
| **LPWL** | tracets | 11099 | 9827 | 25366 | 93929 | 33 | 0.650 | **1.49×** |
|
| **LPWL** | tracets | 11099 | 9827 | 25366 | 93929 | 33 | 0.650 | **1.49×** |
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
"""exp (d): 5-policy routing under tracets vs thinktime dispatch.
|
"""exp (d): 5-policy routing under tracets vs thinktime dispatch.
|
||||||
|
|
||||||
Shows the ranking FLIP: under the faithful `thinktime` load the parameter-free
|
Six panels: TTFT mean/p90, E2E mean/p90, decode-TPS (output goodput), and the
|
||||||
LPWL (leastwork) is the clear winner, but under `tracets` (think-collapse bursts)
|
per-worker GPU-util distribution. Shows the ranking FLIP — under faithful
|
||||||
its advantage disappears (it ties unified_ab on TTFT p90 and *loses* on E2E mean).
|
`thinktime` the parameter-free LPWL (leastwork) is the clear winner; under
|
||||||
|
`tracets` (think-collapse bursts) its advantage disappears.
|
||||||
|
|
||||||
Reads the two bench_report summaries; writes v2/figs/exp_d_policy_dispatch.png.
|
Reads the two bench_report summaries; writes v2/figs/exp_d_policy_dispatch.png.
|
||||||
Usage: python v2/exp_d_policy_dispatch/plot.py
|
Usage: python v2/exp_d_policy_dispatch/plot.py
|
||||||
@@ -13,56 +14,78 @@ import os
|
|||||||
import matplotlib
|
import matplotlib
|
||||||
matplotlib.use("Agg")
|
matplotlib.use("Agg")
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
from matplotlib.patches import Patch
|
||||||
|
|
||||||
HERE = os.path.dirname(__file__)
|
HERE = os.path.dirname(__file__)
|
||||||
TC = json.load(open(os.path.join(HERE, "results/tracets.json")))
|
TC = json.load(open(os.path.join(HERE, "results/tracets.json")))
|
||||||
TT = json.load(open(os.path.join(HERE, "results/thinktime.json")))
|
TT = json.load(open(os.path.join(HERE, "results/thinktime.json")))
|
||||||
|
|
||||||
# canonical order: LPWL first; pretty labels
|
|
||||||
ARMS = ["leastwork", "unified_ab", "unified_def", "lmetric", "sticky"]
|
ARMS = ["leastwork", "unified_ab", "unified_def", "lmetric", "sticky"]
|
||||||
LABEL = {"leastwork": "LPWL\n(leastwork)", "unified_ab": "unified\n+A+B",
|
LABEL = {"leastwork": "LPWL\n(leastwork)", "unified_ab": "unified\n+A+B",
|
||||||
"unified_def": "unified\ndefault", "lmetric": "LMetric", "sticky": "sticky"}
|
"unified_def": "unified\ndefault", "lmetric": "LMetric", "sticky": "sticky"}
|
||||||
C_TC, C_TT = "#d62728", "#2ca02c" # tracets red / thinktime green (match exp_c)
|
C_TC, C_TT = "#d62728", "#2ca02c" # tracets red / thinktime green (match exp_c)
|
||||||
|
W = 0.38
|
||||||
|
|
||||||
|
|
||||||
def panel(ax, key, sub, title, ylab):
|
def bar_panel(ax, tc, tt, title, ylab, fmt="{:.1f}", higher_better=False):
|
||||||
tc = [TC[a][key][sub] / 1000.0 for a in ARMS] # ms -> s
|
|
||||||
tt = [TT[a][key][sub] / 1000.0 for a in ARMS]
|
|
||||||
x = range(len(ARMS))
|
x = range(len(ARMS))
|
||||||
w = 0.38
|
b1 = ax.bar([i - W / 2 for i in x], tc, W, color=C_TC)
|
||||||
b1 = ax.bar([i - w / 2 for i in x], tc, w, label="tracets (burst)", color=C_TC)
|
b2 = ax.bar([i + W / 2 for i in x], tt, W, color=C_TT)
|
||||||
b2 = ax.bar([i + w / 2 for i in x], tt, w, label="thinktime (faithful)", color=C_TT)
|
|
||||||
for bars in (b1, b2):
|
for bars in (b1, b2):
|
||||||
for r in bars:
|
for r in bars:
|
||||||
ax.text(r.get_x() + r.get_width() / 2, r.get_height(),
|
ax.text(r.get_x() + r.get_width() / 2, r.get_height(),
|
||||||
f"{r.get_height():.1f}", ha="center", va="bottom", fontsize=8)
|
fmt.format(r.get_height()), ha="center", va="bottom", fontsize=7.5)
|
||||||
ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=9)
|
ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5)
|
||||||
ax.set_ylabel(ylab); ax.set_title(title, fontsize=11)
|
arrow = "higher = better" if higher_better else "lower = better"
|
||||||
|
ax.set_ylabel(ylab); ax.set_title(f"{title} ({arrow})", fontsize=10.5)
|
||||||
|
ax.grid(axis="y", alpha=.3); ax.set_ylim(0, max(tc + tt) * 1.18)
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_panel(ax):
|
||||||
|
"""Per-worker gpu_util_mean distribution: tracets vs thinktime box per policy."""
|
||||||
|
def utils(D, a):
|
||||||
|
pw = D[a]["per_worker"]
|
||||||
|
return [pw[w]["gpu_util_mean"] for w in sorted(pw, key=int)
|
||||||
|
if pw[w].get("gpu_util_mean") is not None]
|
||||||
|
for i, a in enumerate(ARMS):
|
||||||
|
for D, off, c in [(TC, -W / 2, C_TC), (TT, +W / 2, C_TT)]:
|
||||||
|
bp = ax.boxplot([utils(D, a)], positions=[i + off], widths=0.30,
|
||||||
|
patch_artist=True, showfliers=False,
|
||||||
|
medianprops=dict(color="black"))
|
||||||
|
bp["boxes"][0].set(facecolor=c, alpha=.65)
|
||||||
|
ax.set_xticks(range(len(ARMS)))
|
||||||
|
ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5)
|
||||||
|
ax.set_ylabel("per-worker GPU util %"); ax.set_ylim(0, 100)
|
||||||
|
ax.set_title("Per-worker GPU util (box = 8 workers; tighter = balanced)", fontsize=10.5)
|
||||||
ax.grid(axis="y", alpha=.3)
|
ax.grid(axis="y", alpha=.3)
|
||||||
ax.set_ylim(0, max(tc + tt) * 1.18)
|
|
||||||
# mark LPWL-thinktime as the winner (lowest green) in each panel
|
|
||||||
ax.annotate("LPWL wins\nunder thinktime", xy=(0 + w / 2, tt[0]),
|
|
||||||
xytext=(0.9, max(tc + tt) * 0.86), fontsize=8.5, color=C_TT,
|
|
||||||
ha="left", arrowprops=dict(arrowstyle="->", color=C_TT, lw=1.3))
|
|
||||||
return b1, b2
|
|
||||||
|
|
||||||
|
|
||||||
fig, (axL, axR) = plt.subplots(1, 2, figsize=(11.2, 4.6))
|
def col(D, key, sub, scale=1.0):
|
||||||
panel(axL, "ttft_ms", "p90", "TTFT p90 (lower = better)", "TTFT p90 (s)")
|
return [D[a][key][sub] * scale for a in ARMS]
|
||||||
panel(axR, "e2e_ms", "mean", "E2E mean (lower = better)", "E2E mean (s)")
|
|
||||||
axL.legend(loc="upper left", fontsize=9)
|
|
||||||
fig.suptitle("5-policy routing: dispatch mode flips the ranking — "
|
fig, ax = plt.subplots(2, 3, figsize=(15.5, 8.6))
|
||||||
"LPWL is best under faithful thinktime, only ties/loses under tracets bursts",
|
bar_panel(ax[0, 0], col(TC, "ttft_ms", "mean", 1e-3), col(TT, "ttft_ms", "mean", 1e-3),
|
||||||
fontsize=11.5)
|
"TTFT mean", "s")
|
||||||
fig.tight_layout(rect=(0, 0, 1, 0.95))
|
bar_panel(ax[0, 1], col(TC, "ttft_ms", "p90", 1e-3), col(TT, "ttft_ms", "p90", 1e-3),
|
||||||
|
"TTFT p90", "s")
|
||||||
|
bar_panel(ax[0, 2],
|
||||||
|
[TC[a]["throughput"]["decode_tps"] for a in ARMS],
|
||||||
|
[TT[a]["throughput"]["decode_tps"] for a in ARMS],
|
||||||
|
"Decode TPS (output goodput)", "tok/s", fmt="{:.0f}", higher_better=True)
|
||||||
|
bar_panel(ax[1, 0], col(TC, "e2e_ms", "mean", 1e-3), col(TT, "e2e_ms", "mean", 1e-3),
|
||||||
|
"E2E mean", "s")
|
||||||
|
bar_panel(ax[1, 1], col(TC, "e2e_ms", "p90", 1e-3), col(TT, "e2e_ms", "p90", 1e-3),
|
||||||
|
"E2E p90", "s")
|
||||||
|
gpu_panel(ax[1, 2])
|
||||||
|
|
||||||
|
fig.legend(handles=[Patch(facecolor=C_TC, label="tracets (burst artifact)"),
|
||||||
|
Patch(facecolor=C_TT, label="thinktime (faithful load)")],
|
||||||
|
loc="lower center", ncol=2, fontsize=10.5, bbox_to_anchor=(0.5, 0.0))
|
||||||
|
fig.suptitle("5-policy routing: tracets vs thinktime (807 reqs, dash0 8xH20) — "
|
||||||
|
"LPWL wins across the board under faithful thinktime",
|
||||||
|
fontsize=12.5)
|
||||||
|
fig.tight_layout(rect=(0, 0.035, 1, 0.96))
|
||||||
out = os.path.join(HERE, "..", "figs", "exp_d_policy_dispatch.png")
|
out = os.path.join(HERE, "..", "figs", "exp_d_policy_dispatch.png")
|
||||||
fig.savefig(out, dpi=140)
|
fig.savefig(out, dpi=140)
|
||||||
print("wrote", os.path.normpath(out))
|
print("wrote", os.path.normpath(out))
|
||||||
|
|
||||||
# also print the deltas the README cites
|
|
||||||
print("\npolicy TTFTp90 tc->tt E2Emean tc->tt")
|
|
||||||
for a in ARMS:
|
|
||||||
t1, t2 = TC[a]["ttft_ms"]["p90"], TT[a]["ttft_ms"]["p90"]
|
|
||||||
e1, e2 = TC[a]["e2e_ms"]["mean"], TT[a]["e2e_ms"]["mean"]
|
|
||||||
print(f"{a:<13} {t1/1000:5.1f}->{t2/1000:4.1f}s ({(t2-t1)/t1:+.0%}) "
|
|
||||||
f"{e1/1000:5.1f}->{e2/1000:4.1f}s ({(e2-e1)/e1:+.0%})")
|
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 81 KiB After Width: | Height: | Size: 152 KiB |
Reference in New Issue
Block a user