MB5 PD ablation v2 tooling: conc completion-panel plot + gpu_monitor dep
- plot_pd_crossover.py fig_conc: lead with request-completion % (the honest collapse signal; latency percentiles count successes only), then mean-E2E / TPS; note PD-capped/colo-uncapped in the title.
- add microbench/fresh_setup/gpu_monitor.sh (referenced by the committed mb5_run_gpu.sh:73 for per-GPU util collection).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
18
microbench/fresh_setup/gpu_monitor.sh
Executable file
18
microbench/fresh_setup/gpu_monitor.sh
Executable file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Sample per-GPU utilization, memory, and power draw every INTERVAL seconds (default 5) and write the samples as CSV
|
||||||
|
# Usage: bash gpu_monitor.sh [output_file] [interval_s]   (defaults: /tmp/gpu_util.csv, 5)
|
||||||
|
# Runs until killed (Ctrl+C or kill)
|
||||||
|
|
||||||
|
OUT="${1:-/tmp/gpu_util.csv}"
|
||||||
|
INTERVAL="${2:-5}"
|
||||||
|
|
||||||
|
echo "timestamp,gpu,util_pct,mem_used_mb,mem_total_mb,power_w" > "$OUT"
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
TS=$(date +%s.%N)
|
||||||
|
nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total,power.draw \
|
||||||
|
--format=csv,noheader,nounits 2>/dev/null | while IFS=', ' read -r idx util mem_used mem_total power; do
|
||||||
|
echo "$TS,$idx,$util,$mem_used,$mem_total,$power"
|
||||||
|
done >> "$OUT"
|
||||||
|
sleep "$INTERVAL"
|
||||||
|
done
|
||||||
@@ -154,28 +154,33 @@ def fig_conc():
|
|||||||
g = by_axis(load("fig3_conc32k.json"),
|
g = by_axis(load("fig3_conc32k.json"),
|
||||||
lambda n: (int(m.group(1)) if (m := re.search(r"_N(\d+)_", n)) else None))
|
lambda n: (int(m.group(1)) if (m := re.search(r"_N(\d+)_", n)) else None))
|
||||||
xs = sorted(g)
|
xs = sorted(g)
|
||||||
fig, axes = plt.subplots(1, 3, figsize=(15, 4.2))
|
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4.2))
|
||||||
ax1, ax2, ax3 = axes
|
|
||||||
|
# (a) request completion % — the headline (latency percentiles count successes
|
||||||
|
# only, so they understate PD's degradation; completion is the honest collapse signal).
|
||||||
for arm in ["colo", *PD_ARMS]:
|
for arm in ["colo", *PD_ARMS]:
|
||||||
ax1.plot(xs, series(g, xs, arm, "e2e_mean"), **STYLE[arm])
|
comp = [(g[x][arm]["n"] / g[x][arm]["req"] * 100) if arm in g[x] else None for x in xs]
|
||||||
ax1.axhline(10.0, color="red", ls=":", lw=1, label="SLO (mean E2E 10s)")
|
ax1.plot(xs, comp, **STYLE[arm])
|
||||||
ax1.set_yscale("log"); ax1.set_xticks(xs); ax1.set_xticklabels(xs, fontsize=7)
|
ax1.axhline(100, color="grey", ls=":", lw=1)
|
||||||
ax1.set_xlabel("concurrent sessions N"); ax1.set_ylabel("E2E latency mean (s, log)")
|
ax1.set_xticks(xs); ax1.set_xticklabels(xs, fontsize=7)
|
||||||
ax1.set_title("(a) mean-E2E vs concurrency"); ax1.legend(fontsize=8); ax1.grid(alpha=.3, which="both")
|
ax1.set_xlabel("concurrent sessions N"); ax1.set_ylabel("request completion (%)")
|
||||||
|
ax1.set_title("(a) completion: colo 100%, PD collapses"); ax1.legend(fontsize=8); ax1.grid(alpha=.3)
|
||||||
|
|
||||||
for arm in ["colo", *PD_ARMS]:
|
for arm in ["colo", *PD_ARMS]:
|
||||||
ax2.plot(xs, series(g, xs, arm, "tps"), **STYLE[arm])
|
ax2.plot(xs, series(g, xs, arm, "e2e_mean"), **STYLE[arm])
|
||||||
ax2.set_xticks(xs); ax2.set_xticklabels(xs, fontsize=7)
|
ax2.axhline(10.0, color="red", ls=":", lw=1, label="SLO 10s")
|
||||||
ax2.set_xlabel("concurrent sessions N"); ax2.set_ylabel("throughput (tok/s)")
|
ax2.set_yscale("log"); ax2.set_xticks(xs); ax2.set_xticklabels(xs, fontsize=7)
|
||||||
ax2.set_title("(b) TPS: colo scales, PD plateaus/drops"); ax2.legend(fontsize=8); ax2.grid(alpha=.3)
|
ax2.set_xlabel("concurrent sessions N"); ax2.set_ylabel("E2E latency mean (s, log)")
|
||||||
|
ax2.set_title("(b) mean-E2E (successes only)"); ax2.legend(fontsize=8); ax2.grid(alpha=.3, which="both")
|
||||||
|
|
||||||
for arm in ["colo", *PD_ARMS]:
|
for arm in ["colo", *PD_ARMS]:
|
||||||
ax3.plot(xs, [v * 100 if v is not None else None for v in series(g, xs, arm, "apc")], **STYLE[arm])
|
ax3.plot(xs, series(g, xs, arm, "tps"), **STYLE[arm])
|
||||||
ax3.set_xticks(xs); ax3.set_xticklabels(xs, fontsize=7)
|
ax3.set_xticks(xs); ax3.set_xticklabels(xs, fontsize=7)
|
||||||
ax3.set_xlabel("concurrent sessions N"); ax3.set_ylabel("producer prefix-cache hit-rate (%)")
|
ax3.set_xlabel("concurrent sessions N"); ax3.set_ylabel("throughput (tok/s)")
|
||||||
ax3.set_title("(c) APC vs concurrency"); ax3.legend(fontsize=8); ax3.grid(alpha=.3)
|
ax3.set_title("(c) TPS"); ax3.legend(fontsize=8); ax3.grid(alpha=.3)
|
||||||
fig.suptitle("Fig 3 — Concurrency axis (in32768/out128, reuse~0.984): sweep N by 8 to the 10s-SLO ceiling",
|
fig.suptitle("Fig 3 — Concurrency axis (in32768/out128, reuse~0.984, PD capped 600s / colo uncapped): "
|
||||||
fontsize=11, y=1.02)
|
"colo degrades gracefully (100% completion), PD collapses earlier as N rises",
|
||||||
|
fontsize=10, y=1.02)
|
||||||
fig.tight_layout(); p = OUT / "fig3_concurrency_axis.png"; fig.savefig(p, dpi=130, bbox_inches="tight")
|
fig.tight_layout(); p = OUT / "fig3_concurrency_axis.png"; fig.savefig(p, dpi=130, bbox_inches="tight")
|
||||||
print("wrote", p)
|
print("wrote", p)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user