diff --git a/v2/exp_d_policy_dispatch/README.md b/v2/exp_d_policy_dispatch/README.md index c74de29..9729b10 100644 --- a/v2/exp_d_policy_dispatch/README.md +++ b/v2/exp_d_policy_dispatch/README.md @@ -27,6 +27,13 @@ Analyzer: `scripts/bench_report.py` (summaries in `results/`). ## Result (ms; `figs/exp_d_policy_dispatch.png`) +The figure has 6 panels — TTFT mean/p90, E2E mean/p90, decode TPS (output +goodput), and the per-worker GPU-util box (8 workers/arm). Decode TPS is the +honest throughput metric (total/prefill TPS is inflated by cache-miss recompute, +e.g. LMetric); thinktime ≥ tracets on it everywhere (the system drains faster +with real think-time). The GPU-util box shows LPWL also keeps the tightest +worker balance. + | policy | mode | TTFT p90 | E2E mean | E2E p90 | E2E p99 | TPOT p90 | APC | req-bal | |---|---|---:|---:|---:|---:|---:|---:|---:| | **LPWL** | tracets | 11099 | 9827 | 25366 | 93929 | 33 | 0.650 | **1.49×** | diff --git a/v2/exp_d_policy_dispatch/plot.py b/v2/exp_d_policy_dispatch/plot.py index cfb501b..a4d968e 100644 --- a/v2/exp_d_policy_dispatch/plot.py +++ b/v2/exp_d_policy_dispatch/plot.py @@ -1,8 +1,9 @@ """exp (d): 5-policy routing under tracets vs thinktime dispatch. -Shows the ranking FLIP: under the faithful `thinktime` load the parameter-free -LPWL (leastwork) is the clear winner, but under `tracets` (think-collapse bursts) -its advantage disappears (it ties unified_ab on TTFT p90 and *loses* on E2E mean). +Six panels: TTFT mean/p90, E2E mean/p90, decode-TPS (output goodput), and the +per-worker GPU-util distribution. Shows the ranking FLIP — under faithful +`thinktime` the parameter-free LPWL (leastwork) is the clear winner; under +`tracets` (think-collapse bursts) its advantage disappears. Reads the two bench_report summaries; writes v2/figs/exp_d_policy_dispatch.png. Usage: python v2/exp_d_policy_dispatch/plot.py @@ -13,56 +14,78 @@ import os import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt +from matplotlib.patches import Patch HERE = os.path.dirname(__file__) TC = json.load(open(os.path.join(HERE, "results/tracets.json"))) TT = json.load(open(os.path.join(HERE, "results/thinktime.json"))) -# canonical order: LPWL first; pretty labels ARMS = ["leastwork", "unified_ab", "unified_def", "lmetric", "sticky"] LABEL = {"leastwork": "LPWL\n(leastwork)", "unified_ab": "unified\n+A+B", "unified_def": "unified\ndefault", "lmetric": "LMetric", "sticky": "sticky"} C_TC, C_TT = "#d62728", "#2ca02c" # tracets red / thinktime green (match exp_c) +W = 0.38 -def panel(ax, key, sub, title, ylab): - tc = [TC[a][key][sub] / 1000.0 for a in ARMS] # ms -> s - tt = [TT[a][key][sub] / 1000.0 for a in ARMS] +def bar_panel(ax, tc, tt, title, ylab, fmt="{:.1f}", higher_better=False): x = range(len(ARMS)) - w = 0.38 - b1 = ax.bar([i - w / 2 for i in x], tc, w, label="tracets (burst)", color=C_TC) - b2 = ax.bar([i + w / 2 for i in x], tt, w, label="thinktime (faithful)", color=C_TT) + b1 = ax.bar([i - W / 2 for i in x], tc, W, color=C_TC) + b2 = ax.bar([i + W / 2 for i in x], tt, W, color=C_TT) for bars in (b1, b2): for r in bars: ax.text(r.get_x() + r.get_width() / 2, r.get_height(), - f"{r.get_height():.1f}", ha="center", va="bottom", fontsize=8) - ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=9) - ax.set_ylabel(ylab); ax.set_title(title, fontsize=11) + fmt.format(r.get_height()), ha="center", va="bottom", fontsize=7.5) + ax.set_xticks(list(x)); ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5) + arrow = "higher = better" if higher_better else "lower = better" + ax.set_ylabel(ylab); ax.set_title(f"{title} ({arrow})", fontsize=10.5) + ax.grid(axis="y", alpha=.3); ax.set_ylim(0, max(tc + tt) * 1.18) + + +def gpu_panel(ax): + """Per-worker gpu_util_mean distribution: tracets vs thinktime box per policy.""" + def utils(D, a): + pw = D[a]["per_worker"] + return [pw[w]["gpu_util_mean"] for w in sorted(pw, key=int) + if pw[w].get("gpu_util_mean") is not None] + for i, a in enumerate(ARMS): + for D, off, c in [(TC, -W / 2, C_TC), (TT, +W / 2, C_TT)]: + bp = ax.boxplot([utils(D, a)], positions=[i + off], widths=0.30, + patch_artist=True, showfliers=False, + medianprops=dict(color="black")) + bp["boxes"][0].set(facecolor=c, alpha=.65) + ax.set_xticks(range(len(ARMS))) + ax.set_xticklabels([LABEL[a] for a in ARMS], fontsize=8.5) + ax.set_ylabel("per-worker GPU util %"); ax.set_ylim(0, 100) + ax.set_title("Per-worker GPU util (box = 8 workers; tighter = balanced)", fontsize=10.5) ax.grid(axis="y", alpha=.3) - ax.set_ylim(0, max(tc + tt) * 1.18) - # mark LPWL-thinktime as the winner (lowest green) in each panel - ax.annotate("LPWL wins\nunder thinktime", xy=(0 + w / 2, tt[0]), - xytext=(0.9, max(tc + tt) * 0.86), fontsize=8.5, color=C_TT, - ha="left", arrowprops=dict(arrowstyle="->", color=C_TT, lw=1.3)) - return b1, b2 -fig, (axL, axR) = plt.subplots(1, 2, figsize=(11.2, 4.6)) -panel(axL, "ttft_ms", "p90", "TTFT p90 (lower = better)", "TTFT p90 (s)") -panel(axR, "e2e_ms", "mean", "E2E mean (lower = better)", "E2E mean (s)") -axL.legend(loc="upper left", fontsize=9) -fig.suptitle("5-policy routing: dispatch mode flips the ranking — " - "LPWL is best under faithful thinktime, only ties/loses under tracets bursts", - fontsize=11.5) -fig.tight_layout(rect=(0, 0, 1, 0.95)) +def col(D, key, sub, scale=1.0): + return [D[a][key][sub] * scale for a in ARMS] + + +fig, ax = plt.subplots(2, 3, figsize=(15.5, 8.6)) +bar_panel(ax[0, 0], col(TC, "ttft_ms", "mean", 1e-3), col(TT, "ttft_ms", "mean", 1e-3), + "TTFT mean", "s") +bar_panel(ax[0, 1], col(TC, "ttft_ms", "p90", 1e-3), col(TT, "ttft_ms", "p90", 1e-3), + "TTFT p90", "s") +bar_panel(ax[0, 2], + [TC[a]["throughput"]["decode_tps"] for a in ARMS], + [TT[a]["throughput"]["decode_tps"] for a in ARMS], + "Decode TPS (output goodput)", "tok/s", fmt="{:.0f}", higher_better=True) +bar_panel(ax[1, 0], col(TC, "e2e_ms", "mean", 1e-3), col(TT, "e2e_ms", "mean", 1e-3), + "E2E mean", "s") +bar_panel(ax[1, 1], col(TC, "e2e_ms", "p90", 1e-3), col(TT, "e2e_ms", "p90", 1e-3), + "E2E p90", "s") +gpu_panel(ax[1, 2]) + +fig.legend(handles=[Patch(facecolor=C_TC, label="tracets (burst artifact)"), + Patch(facecolor=C_TT, label="thinktime (faithful load)")], + loc="lower center", ncol=2, fontsize=10.5, bbox_to_anchor=(0.5, 0.0)) +fig.suptitle("5-policy routing: tracets vs thinktime (807 reqs, dash0 8xH20) — " + "LPWL wins across the board under faithful thinktime", + fontsize=12.5) +fig.tight_layout(rect=(0, 0.035, 1, 0.96)) out = os.path.join(HERE, "..", "figs", "exp_d_policy_dispatch.png") fig.savefig(out, dpi=140) print("wrote", os.path.normpath(out)) - -# also print the deltas the README cites -print("\npolicy TTFTp90 tc->tt E2Emean tc->tt") -for a in ARMS: - t1, t2 = TC[a]["ttft_ms"]["p90"], TT[a]["ttft_ms"]["p90"] - e1, e2 = TC[a]["e2e_ms"]["mean"], TT[a]["e2e_ms"]["mean"] - print(f"{a:<13} {t1/1000:5.1f}->{t2/1000:4.1f}s ({(t2-t1)/t1:+.0%}) " - f"{e1/1000:5.1f}->{e2/1000:4.1f}s ({(e2-e1)/e1:+.0%})") diff --git a/v2/figs/exp_d_policy_dispatch.png b/v2/figs/exp_d_policy_dispatch.png index 77ad94f..a583f03 100644 Binary files a/v2/figs/exp_d_policy_dispatch.png and b/v2/figs/exp_d_policy_dispatch.png differ