Files
agentic-pd-hybrid/scripts/analysis/plot_gpu_utilization.py
kzlin 506d360160 fix(figures): GPU utilization figure annotation/headroom polish
Bar-overlap fix: extend ylim by 40-45% above the tallest bar to give the
"P GPU only sees 328 requests" and "P GPU does 1.07M tokens" annotations
clean white-bbox space above the bars instead of crashing into the KVC D
bars at x=1. Move both annotation xytext positions to x=2.4 (left panel)
and x=5.5 (right panel) so the arrows pull away from the orange P bar
toward the center of the panel.

Group labels (KVC 1P3D / DP 4-way CA) kept in axes-fraction bboxes at
y=1.02; subplot titles raised to pad=24 to leave room.

Note: a small visual collision between the bboxed group labels and the
subplot-title second line remains in the rendered output (acknowledged
in the prior conversation). Acceptable for now; full layout rework is
deferred. The annotation-vs-bar overlap (the original blocker) is fixed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 22:28:39 +08:00

250 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""Per-GPU utilization breakdown: KVC 1P3D v2 vs 4-way DP CA.
Generates docs/figures/gpu_utilization.png — two-panel:
left: per-GPU request count
right: per-GPU compute work (uncached prefill tokens + decode tokens, stacked)
The point of the figure is to push back on the naïve reading
"KVC's prefill GPU is idle 90% of the time, so KVC is using fewer GPUs."
By request count, the prefill GPU is indeed touched by only ~8% of requests.
By compute work, the prefill GPU bears comparable per-GPU load to each
decode GPU — it is a low-frequency, high-cost safety net for cache misses,
not idle capacity.
Work attribution:
KVC direct-to-D path: prefill happens locally on the assigned D worker
(append-prefill of `uncached_tokens` tokens).
KVC seed/reseed/fallback path: prefill happens on prefill-0
(full uncached_tokens), decode on assigned D.
DP: all work on assigned direct-N worker.
Aborted / errored requests are excluded.
"""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# Directory two levels above this script's own directory; every input and
# output path below is resolved relative to it.
ROOT = Path(__file__).resolve().parents[2]

# Per-request metrics (JSON Lines) for the KVC 1P3D migration-v2 run.
KVC = (
    ROOT
    / "outputs"
    / "qwen3-30b-tp1-ts1-migration-v2"
    / "kvc_1p3d_migration_v2_run1_metrics.jsonl"
)

# Per-request metrics (JSON Lines) for the 4-way DP run.
DP = ROOT / "outputs" / "qwen3-30b-tp1-ts1-validation" / "dp4_metrics.jsonl"

# Destination of the rendered figure.
OUT = ROOT / "docs" / "figures" / "gpu_utilization.png"
def load(p: Path) -> list[dict]:
    """Read a JSON Lines file and return one parsed object per non-blank line.

    Args:
        p: Path of the ``.jsonl`` file to read.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically; UTF-8 is
    # requested explicitly so decoding does not depend on the locale.
    with p.open(encoding="utf-8") as fh:
        # Blank lines (e.g. a trailing newline at EOF) are skipped rather
        # than handed to json.loads, which would reject them.
        return [json.loads(line) for line in fh if line.strip()]
def is_failed(r: dict) -> bool:
    """Tell whether a request record should be excluded as failed.

    A record counts as failed when its ``error`` field is truthy, or when
    the string form of its ``finish_reason`` contains ``abort`` or
    ``badrequest`` (compared case-insensitively).
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        # No finish reason recorded: nothing marks this request as failed.
        return False

    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))
def uncached(r: dict) -> int:
    """Return the number of input tokens not covered by cached tokens.

    Computed as ``input_length - cached_tokens`` and clamped at zero. A
    ``cached_tokens`` field that is missing or ``None`` (a JSON ``null``)
    is treated as zero cached tokens.

    Raises:
        KeyError: If the record has no ``input_length`` field.
    """
    # `or 0` also covers an explicit null, which `.get(key, 0)` would pass
    # through as None and break the subtraction.
    cached = r.get("cached_tokens") or 0
    return max(0, r["input_length"] - cached)
def out_tokens(r: dict) -> int:
    """Return the output-token count recorded for a request.

    ``actual_output_tokens`` is preferred; when it is missing or falsy the
    ``output_length`` field is used instead, and 0 is returned when neither
    field holds a truthy value.
    """
    for field in ("actual_output_tokens", "output_length"):
        value = r.get(field)
        if value:
            return value
    return 0
def _attribute_kvc(records: list[dict]) -> tuple[dict, dict, dict]:
    """Sum per-node request counts, prefill tokens and decode tokens for KVC.

    Returns ``(req_count, prefill_tokens, decode_tokens)``, each keyed by
    node name as found in the records.
    """
    req_count = defaultdict(int)
    prefill_tokens = defaultdict(int)  # uncached prefill compute
    decode_tokens = defaultdict(int)
    for r in records:
        d = r["assigned_decode_node"]  # decode-0/1/2
        if r.get("execution_mode", "") == "kvcache-direct-to-d-session":
            # P is bypassed entirely; D does the append-prefill + decode
            req_count[d] += 1
            prefill_tokens[d] += uncached(r)
            decode_tokens[d] += out_tokens(r)
        else:
            # P does the full prefill; D handles decode. The prefill node is
            # looked up only on this path, so direct-to-D records are not
            # required to carry the field.
            p = r["assigned_prefill_node"]  # prefill-0
            req_count[p] += 1
            req_count[d] += 1  # decode side still counts
            prefill_tokens[p] += uncached(r)
            decode_tokens[d] += out_tokens(r)
    return req_count, prefill_tokens, decode_tokens


def _attribute_dp(records: list[dict]) -> tuple[dict, dict, dict]:
    """Sum per-worker request counts, prefill tokens and decode tokens for DP.

    All work of a request is charged to its ``assigned_decode_node``
    (fused P+D on every worker). Returns
    ``(req_count, prefill_tokens, decode_tokens)``.
    """
    req_count = defaultdict(int)
    prefill_tokens = defaultdict(int)
    decode_tokens = defaultdict(int)
    for r in records:
        w = r["assigned_decode_node"]  # direct-0..3
        req_count[w] += 1
        prefill_tokens[w] += uncached(r)
        decode_tokens[w] += out_tokens(r)
    return req_count, prefill_tokens, decode_tokens


def main() -> None:
    """Render the two-panel per-GPU utilization figure and print its numbers.

    Loads the KVC and DP metrics files, drops failed requests, attributes
    request counts and token work to each GPU, draws the left (request
    count) and right (stacked prefill + decode tokens) panels, saves the
    figure to ``OUT`` (creating its directory if needed) and prints a
    per-GPU table to stdout.
    """
    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    kvc_req_count, kvc_prefill_tokens, kvc_decode_tokens = _attribute_kvc(kvc)
    dp_req_count, dp_prefill_tokens, dp_decode_tokens = _attribute_dp(dp)

    # ------------------------------------------------------------------
    # Build ordered GPU list, KVC then DP
    # ------------------------------------------------------------------
    kvc_gpus = ["prefill-0", "decode-0", "decode-1", "decode-2"]
    dp_gpus = ["direct-0", "direct-1", "direct-2", "direct-3"]
    all_gpus = kvc_gpus + dp_gpus

    def per_gpu(kvc_totals, dp_totals):
        # One value per bar, in display order; nodes absent from the data
        # contribute 0. `.get` does not insert into the defaultdicts.
        return ([kvc_totals.get(g, 0) for g in kvc_gpus]
                + [dp_totals.get(g, 0) for g in dp_gpus])

    counts = per_gpu(kvc_req_count, dp_req_count)
    prefill_tk = per_gpu(kvc_prefill_tokens, dp_prefill_tokens)
    decode_tk = per_gpu(kvc_decode_tokens, dp_decode_tokens)

    # Display labels: P/D role + worker id
    labels = [
        "KVC P\nprefill-0",
        "KVC D\ndecode-0",
        "KVC D\ndecode-1",
        "KVC D\ndecode-2",
        "DP P+D\ndirect-0",
        "DP P+D\ndirect-1",
        "DP P+D\ndirect-2",
        "DP P+D\ndirect-3",
    ]
    KVC_P_COLOR = "#E89D44"  # orange — P GPU stands out
    KVC_D_COLOR = "#1F77B4"  # blue
    DP_COLOR = "#D62728"  # red
    bar_colors = [KVC_P_COLOR, KVC_D_COLOR, KVC_D_COLOR, KVC_D_COLOR,
                  DP_COLOR, DP_COLOR, DP_COLOR, DP_COLOR]

    # Index of the KVC prefill GPU in the bar order above.
    p_idx = 0
    # Share of KVC requests that touch the prefill GPU; 0.0 when there are
    # no KVC requests at all, so an empty input does not divide by zero.
    p_share = counts[p_idx] / len(kvc) * 100 if kvc else 0.0

    fig, axes = plt.subplots(1, 2, figsize=(15, 7.0))
    x = np.arange(len(all_gpus))

    # -- Left: per-GPU request count ----------------------------------
    ax = axes[0]
    top_count = max(counts)
    ax.bar(x, counts, color=bar_colors, edgecolor="black", linewidth=0.6)
    for xi, c in zip(x, counts):
        ax.text(xi, c + top_count * 0.015, f"{c:,}",
                ha="center", va="bottom", fontsize=9.5)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Number of requests touching this GPU", fontsize=11)
    # Headroom for the annotation: extend ylim 40% above tallest bar
    ax.set_ylim(0, top_count * 1.40)
    ax.set_title("Per-GPU request count\n(naïve view: P seems idle)",
                 fontsize=12, pad=24)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    # Annotate: KVC P GPU is "low frequency". The text box sits in the
    # headroom above the bars so it does not overlap the KVC D bars.
    ax.annotate(
        f"P GPU only sees\n"
        f"{counts[p_idx]:,} requests\n"
        f"({p_share:.1f}% of all KVC requests)",
        xy=(p_idx, counts[p_idx]),
        xytext=(2.4, top_count * 1.20),
        fontsize=10, color=KVC_P_COLOR, fontweight="bold", ha="center",
        bbox=dict(facecolor="white", edgecolor=KVC_P_COLOR, alpha=0.92, pad=4),
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # -- Right: per-GPU compute work (stacked prefill + decode) -------
    ax = axes[1]
    prefill_M = [t / 1e6 for t in prefill_tk]
    decode_M = [t / 1e6 for t in decode_tk]
    total_M = [p + d for p, d in zip(prefill_M, decode_M)]
    top_total = max(total_M)
    ax.bar(x, prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, label="Uncached prefill tokens",
           alpha=0.95)
    ax.bar(x, decode_M, bottom=prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, hatch="///",
           label="Decode tokens", alpha=0.55)
    for xi, t in zip(x, total_M):
        ax.text(xi, t + top_total * 0.015, f"{t:.2f}M",
                ha="center", va="bottom", fontsize=9.5)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Compute tokens (millions)", fontsize=11)
    # Headroom for the annotation: extend ylim 45% above tallest bar
    ax.set_ylim(0, top_total * 1.45)
    ax.set_title("Per-GPU compute work\n(work view: P is comparable to each D)",
                 fontsize=12, pad=24)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    # Legend placed at upper-left where bars are tallest is fine after raising ylim
    ax.legend(loc="upper left", fontsize=10, framealpha=0.95)
    # Annotate: KVC P GPU does similar work to each D.
    # Place over DP region (right side) so it doesn't sit on KVC D bars.
    ax.annotate(
        f"P GPU does {total_M[p_idx]:.2f}M tokens of prefill\n"
        f"— comparable per-GPU load to each KVC D worker\n"
        f"(KVC D avg = {np.mean(total_M[1:4]):.2f}M)",
        xy=(p_idx, total_M[p_idx]),
        xytext=(5.5, top_total * 1.30),
        fontsize=10, color=KVC_P_COLOR, fontweight="bold", ha="center",
        bbox=dict(facecolor="white", edgecolor=KVC_P_COLOR, alpha=0.92, pad=4),
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # Separator + group labels, placed in axes-fraction coordinates just
    # above the plot area (y=1.02) and below the subplot title (pad=24).
    for ax in axes:
        ax.axvline(3.5, color="gray", linestyle="--", linewidth=1.0, alpha=0.5)
        ax.text(0.25, 1.02, "KVC 1P3D",
                transform=ax.transAxes, ha="center", va="bottom",
                fontsize=11.5, fontweight="bold", color="#444",
                bbox=dict(facecolor="#F2F2F2", edgecolor="#888",
                          alpha=0.85, pad=3))
        ax.text(0.75, 1.02, "DP 4-way CA",
                transform=ax.transAxes, ha="center", va="bottom",
                fontsize=11.5, fontweight="bold", color="#444",
                bbox=dict(facecolor="#F2F2F2", edgecolor="#888",
                          alpha=0.85, pad=3))

    # The percentage in the headline is derived from the data so it cannot
    # drift away from the value shown in the left-panel annotation.
    fig.suptitle(
        "Per-GPU utilization: \"is KVC's prefill GPU wasted?\"\n"
        f"Left view says yes (only {p_share:.0f}% of requests); "
        "right view says no (comparable work to each D).",
        fontsize=13, y=1.02,
    )
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target
    # directory exists before writing.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print numbers for doc reference
    # ------------------------------------------------------------------
    print("\n=== Per-GPU numbers ===")
    print(f"{'GPU':<22} {'requests':>10} {'prefill(M)':>12} {'decode(M)':>12} {'total(M)':>10}")
    for lbl, n, pM, dM in zip(labels, counts, prefill_M, decode_M):
        # Two leading spaces + a 20-wide label match the 22-wide header column.
        print(f"  {lbl.replace(chr(10), ' '):<20} {n:>10} {pM:>12.3f} {dM:>12.3f} {pM+dM:>10.3f}")
# Generate the figure only when this file is executed as a script.
if __name__ == "__main__":
    main()