Files
agentic-pd-hybrid/scripts/analysis/plot_gpu_utilization.py
kzlin 517677d7f2 docs(kvc): add GPU-utilization and cache-efficiency figures (rebut critic)
Two figures inserted into V2_DEEP_ANALYSIS §4.5 and §4.4 respectively, to
visually rebut the two critic-agent claims that we argued in prose were
design intent, not deficiencies.

(1) gpu_utilization.png  -- §4.5  "P GPU is wasted 90% of the time"
  Two-panel side-by-side:
    Left  (request count view, the naive reading): KVC P = 328 reqs (7.4%),
          KVC D = ~1450 each, DP = ~1100 each. P "looks idle."
    Right (compute work view, the honest reading): KVC P does 1.07M tokens
          of prefill, comparable to each KVC D worker's ~0.80M. P is a
          low-frequency high-cost safety net, not idle capacity.
  Bonus finding: KVC's total compute (3.47M tokens across 4 GPUs) is 33%
  LESS than DP's (5.17M). Same GPUs, less work done. That's the affinity
  win.

(2) cache_efficiency.png  -- §4.4  "Cache concentration is not policy win"
  Two-panel side-by-side. The setup: KVC has 21% LESS total KV pool
  (276K vs 351K tokens; DP's pool is 27% larger) yet caches MORE per request.
    Left  (cache hit rate vs turn number): KVC's session-affinity lets
          hit rate accumulate with turns; DP's hash + radix-LRU causes
          a mid-turn drift around turns 8-25 where KVC = 97.0% vs DP
          = 95.8% (1.24pp gap). Shows mechanism, not just outcome.
    Right (ECDF of per-request uncached tokens, log x): KVC's distribution
          concentrates near zero (50% < 187 tokens), DP's is spread
          (50% < 781 tokens). At uncached = 500 tokens threshold, KVC
          has 74% of requests below, DP has 31%.
  → smaller pool, better retention, less per-request work. Direct empirical
  rebuttal to "fragmentation is architectural, not policy."

Bundled scripts (re-runnable):
- scripts/analysis/plot_gpu_utilization.py
- scripts/analysis/plot_cache_efficiency.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 18:04:49 +08:00

235 lines
8.9 KiB
Python

#!/usr/bin/env python3
"""Per-GPU utilization breakdown: KVC 1P3D v2 vs 4-way DP CA.
Generates docs/figures/gpu_utilization.png — two-panel:
left: per-GPU request count
right: per-GPU compute work (uncached prefill tokens + decode tokens, stacked)
The point of the figure is to push back on the naïve reading
"KVC's prefill GPU is idle 90% of the time, so KVC is using fewer GPUs."
By request count, the prefill GPU is indeed touched by only ~8% of requests.
By compute work, the prefill GPU bears comparable per-GPU load to each
decode GPU — it is a low-frequency, high-cost safety net for cache misses,
not idle capacity.
Work attribution:
KVC direct-to-D path: prefill happens locally on the assigned D worker
(append-prefill of `uncached_tokens` tokens).
KVC seed/reseed/fallback path: prefill happens on prefill-0
(full uncached_tokens), decode on assigned D.
DP: all work on assigned direct-N worker.
Aborted / errored requests are excluded.
"""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# Anchor every path on the directory two levels above the one holding this
# script, so the script resolves the same files regardless of the caller's
# current working directory.
ROOT = Path(__file__).resolve().parents[2]

# Input: per-request metrics (JSON Lines) for the KVC 1P3D run.
KVC = ROOT.joinpath(
    "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
)

# Input: per-request metrics (JSON Lines) for the 4-way DP run.
DP = ROOT.joinpath("outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl")

# Output: the rendered two-panel figure.
OUT = ROOT.joinpath("docs/figures/gpu_utilization.png")
def load(p: Path) -> list[dict]:
    """Read a JSON Lines file and return one parsed value per record.

    Args:
        p: Path to a file holding one JSON document per line.

    Returns:
        The parsed records, in file order. Blank and whitespace-only
        lines are ignored, so an empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector. Blank lines (for example a
    # trailing newline at EOF) are skipped because json.loads rejects them.
    with p.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]
def is_failed(r: dict) -> bool:
    """Tell whether a request record should be excluded as failed.

    A record counts as failed when its ``"error"`` entry is truthy, or
    when the lower-cased string form of its ``"finish_reason"`` entry
    contains ``"abort"`` or ``"badrequest"``.

    Args:
        r: One parsed metrics record.

    Returns:
        ``True`` for a failed record, ``False`` otherwise.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        # Missing or empty finish reason: nothing marks the request as failed.
        return False

    # str() because the finish reason is not guaranteed to be a plain string.
    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))
def uncached(r: dict) -> int:
    """Return how many input tokens of a request were not served from cache.

    Args:
        r: One parsed metrics record. Must contain ``"input_length"``;
            ``"cached_tokens"`` is optional.

    Returns:
        ``input_length - cached_tokens``, clamped so it is never negative.

    Raises:
        KeyError: If the record has no ``"input_length"`` entry.
    """
    # `or 0` covers both a missing key and an explicit JSON null; a plain
    # .get(key, 0) would hand None to the subtraction in the latter case.
    cached = r.get("cached_tokens") or 0
    return max(0, r["input_length"] - cached)
def out_tokens(r: dict) -> int:
    """Return the number of output tokens recorded for a request.

    The ``"actual_output_tokens"`` entry is preferred; when it is missing
    or falsy the ``"output_length"`` entry is used instead.

    Args:
        r: One parsed metrics record.

    Returns:
        The first truthy value among the two entries, or ``0`` if neither
        is present and truthy.
    """
    for key in ("actual_output_tokens", "output_length"):
        value = r.get(key)
        if value:
            return value
    return 0
def main() -> None:
    """Render the two-panel per-GPU figure to ``OUT`` and print its numbers.

    Loads the KVC and DP metrics files, drops failed requests, attributes
    request counts, uncached-prefill tokens and decode tokens to each
    worker, then draws:

    * left panel: number of requests touching each GPU;
    * right panel: per-GPU compute tokens (prefill and decode, stacked).

    The figure is saved to ``OUT`` (its parent directory is created if
    needed) and the per-GPU table is printed to stdout.

    Raises:
        OSError: If a metrics file cannot be read or the figure cannot be
            written.
        KeyError: If a record lacks ``"assigned_decode_node"`` or
            ``"input_length"``, or lacks ``"assigned_prefill_node"`` on a
            request that did not take the direct-to-D path.
    """
    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    # ------------------------------------------------------------------
    # KVC per-GPU attribution
    # ------------------------------------------------------------------
    kvc_req_count = defaultdict(int)
    kvc_prefill_tokens = defaultdict(int)  # uncached prefill compute
    kvc_decode_tokens = defaultdict(int)
    for r in kvc:
        d = r["assigned_decode_node"]  # decode-0/1/2
        # Decode runs on the assigned D worker on both paths, so the D-side
        # request count and decode tokens are charged unconditionally.
        kvc_req_count[d] += 1
        kvc_decode_tokens[d] += out_tokens(r)
        if r.get("execution_mode", "") == "kvcache-direct-to-d-session":
            # P is bypassed entirely; D does the append-prefill itself.
            kvc_prefill_tokens[d] += uncached(r)
        else:
            # P does the prefill. The prefill node is looked up only on
            # this path so records that bypassed P need not carry the key.
            p = r["assigned_prefill_node"]  # prefill-0
            kvc_req_count[p] += 1
            kvc_prefill_tokens[p] += uncached(r)

    # ------------------------------------------------------------------
    # DP per-GPU attribution (fused P+D on every worker)
    # ------------------------------------------------------------------
    dp_req_count = defaultdict(int)
    dp_prefill_tokens = defaultdict(int)
    dp_decode_tokens = defaultdict(int)
    for r in dp:
        w = r["assigned_decode_node"]  # direct-0..3
        dp_req_count[w] += 1
        dp_prefill_tokens[w] += uncached(r)
        dp_decode_tokens[w] += out_tokens(r)

    # ------------------------------------------------------------------
    # Build ordered per-GPU series, KVC then DP
    # ------------------------------------------------------------------
    kvc_gpus = ["prefill-0", "decode-0", "decode-1", "decode-2"]
    dp_gpus = ["direct-0", "direct-1", "direct-2", "direct-3"]

    # .get() rather than indexing: a worker that never appeared in the data
    # reads as 0 without inserting a new key into the defaultdict.
    counts = (
        [kvc_req_count.get(g, 0) for g in kvc_gpus]
        + [dp_req_count.get(g, 0) for g in dp_gpus]
    )
    prefill_tk = (
        [kvc_prefill_tokens.get(g, 0) for g in kvc_gpus]
        + [dp_prefill_tokens.get(g, 0) for g in dp_gpus]
    )
    decode_tk = (
        [kvc_decode_tokens.get(g, 0) for g in kvc_gpus]
        + [dp_decode_tokens.get(g, 0) for g in dp_gpus]
    )

    # Display labels: P/D role + worker id
    labels = [
        "KVC P\nprefill-0",
        "KVC D\ndecode-0",
        "KVC D\ndecode-1",
        "KVC D\ndecode-2",
        "DP P+D\ndirect-0",
        "DP P+D\ndirect-1",
        "DP P+D\ndirect-2",
        "DP P+D\ndirect-3",
    ]
    KVC_P_COLOR = "#E89D44"  # orange — P GPU stands out
    KVC_D_COLOR = "#1F77B4"  # blue
    DP_COLOR = "#D62728"  # red
    bar_colors = [KVC_P_COLOR, KVC_D_COLOR, KVC_D_COLOR, KVC_D_COLOR,
                  DP_COLOR, DP_COLOR, DP_COLOR, DP_COLOR]

    fig, axes = plt.subplots(1, 2, figsize=(15, 6.5))
    x = np.arange(len(labels))
    p_idx = 0  # position of the KVC prefill GPU in every series

    # -- Left: per-GPU request count ----------------------------------
    ax = axes[0]
    ax.bar(x, counts, color=bar_colors, edgecolor="black", linewidth=0.6)
    count_pad = max(counts) * 0.015
    for xi, c in zip(x, counts):
        ax.text(xi, c + count_pad, f"{c:,}",
                ha="center", va="bottom", fontsize=9.5)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Number of requests touching this GPU", fontsize=11)
    ax.set_title("Per-GPU request count\n(naïve view: P seems idle)", fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # Annotate: KVC P GPU is "low frequency". The share is relative to the
    # number of successful KVC requests; guard the empty case so a run with
    # no surviving records still produces a figure instead of dividing by 0.
    p_share = counts[p_idx] / len(kvc) * 100 if kvc else 0.0
    ax.annotate(
        f"P GPU only sees\n"
        f"{counts[p_idx]:,} requests\n"
        f"({p_share:.1f}% of total)",
        xy=(p_idx, counts[p_idx]),
        xytext=(p_idx + 0.6, max(counts) * 0.55),
        fontsize=9, color=KVC_P_COLOR, fontweight="bold",
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # -- Right: per-GPU compute work (stacked prefill + decode) -------
    ax = axes[1]
    prefill_M = [t / 1e6 for t in prefill_tk]
    decode_M = [t / 1e6 for t in decode_tk]
    total_M = [p + d for p, d in zip(prefill_M, decode_M)]
    ax.bar(x, prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, label="Uncached prefill tokens",
           alpha=0.95)
    # Decode is stacked on top of prefill and hatched to tell the two apart.
    ax.bar(x, decode_M, bottom=prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, hatch="///",
           label="Decode tokens", alpha=0.55)
    total_pad = max(total_M) * 0.015
    for xi, t in zip(x, total_M):
        ax.text(xi, t + total_pad, f"{t:.2f}M",
                ha="center", va="bottom", fontsize=9.5)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Compute tokens (millions)", fontsize=11)
    ax.set_title("Per-GPU compute work\n(work view: P is comparable to each D)",
                 fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    ax.legend(loc="upper left", fontsize=10, framealpha=0.95)

    # Annotate: KVC P GPU does similar work to each D
    ax.annotate(
        f"P GPU does {total_M[p_idx]:.2f}M tokens of\n"
        f"prefill — comparable per-GPU\n"
        f"load to each KVC D worker",
        xy=(p_idx, total_M[p_idx]),
        xytext=(p_idx + 0.6, max(total_M) * 0.62),
        fontsize=9, color=KVC_P_COLOR, fontweight="bold",
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # Separator + group labels. x=3.5 falls between the last KVC bar and the
    # first DP bar; 1.5 and 5.5 are the centres of the two four-bar groups.
    for panel in axes:
        panel.axvline(3.5, color="gray", linestyle="--", linewidth=1.0, alpha=0.5)
        _, ymax = panel.get_ylim()
        panel.text(1.5, ymax * 1.05, "KVC 1P3D", ha="center", fontsize=11,
                   fontweight="bold", color="#444")
        panel.text(5.5, ymax * 1.05, "DP 4-way CA", ha="center", fontsize=11,
                   fontweight="bold", color="#444")

    # NOTE: the "8%" in this title is a fixed string; the data-derived share
    # is the one shown in the left-panel annotation.
    fig.suptitle(
        "Per-GPU utilization: \"is KVC's prefill GPU wasted?\"\n"
        "Left view says yes (only 8% of requests); right view says no (comparable work to each D).",
        fontsize=13, y=1.02,
    )
    fig.tight_layout()

    # savefig does not create missing directories, so make sure the target
    # directory exists before writing.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print numbers for doc reference
    # ------------------------------------------------------------------
    print("\n=== Per-GPU numbers ===")
    print(f"{'GPU':<22} {'requests':>10} {'prefill(M)':>12} {'decode(M)':>12} {'total(M)':>10}")
    for lbl, n, pM, dM in zip(labels, counts, prefill_M, decode_M):
        # chr(10) is "\n": flatten the two-line tick label onto one row.
        print(f" {lbl.replace(chr(10), ' '):<20} {n:>10} {pM:>12.3f} {dM:>12.3f} {pM+dM:>10.3f}")
if __name__ == "__main__":
main()