Two figures inserted into V2_DEEP_ANALYSIS §4.5 and §4.4 respectively, to
visually rebut the two critic-agent claims that we argued in prose were
design intent, not deficiencies.
(1) gpu_utilization.png -- §4.5 "P GPU is wasted 90% of the time"
Two-panel side-by-side:
Left (request count view, the naive reading): KVC P = 328 reqs (7.4%),
KVC D = ~1450 each, DP = ~1100 each. P "looks idle."
Right (compute work view, the honest reading): KVC P does 1.07M tokens
of prefill, comparable to each KVC D worker's ~0.80M. P is a
low-frequency high-cost safety net, not idle capacity.
Bonus finding: KVC's total compute (3.47M tokens across 4 GPUs) is 33%
LESS than DP's (5.17M). Same GPUs, less work done. That's the affinity
win.
(2) cache_efficiency.png -- §4.4 "Cache concentration is not policy win"
Two-panel side-by-side. The setup: KVC has 21% LESS total KV pool
(276K vs 351K tokens; DP's pool is 27% larger) yet caches MORE per request.
Left (cache hit rate vs turn number): KVC's session-affinity lets
hit rate accumulate with turns; DP's hash + radix-LRU causes
a mid-turn drift around turns 8-25 where KVC = 97.0% vs DP
= 95.8% (1.24pp gap). Shows mechanism, not just outcome.
Right (ECDF of per-request uncached tokens, log x): KVC's distribution
concentrates near zero (50% < 187 tokens), DP's is spread
(50% < 781 tokens). At uncached = 500 tokens threshold, KVC
has 74% of requests below, DP has 31%.
→ smaller pool, better retention, less per-request work. Direct empirical
rebuttal to "fragmentation is architectural, not policy."
Bundled scripts (rerunnable):
- scripts/analysis/plot_gpu_utilization.py
- scripts/analysis/plot_cache_efficiency.py
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
235 lines
8.9 KiB
Python
235 lines
8.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Per-GPU utilization breakdown: KVC 1P3D v2 vs 4-way DP CA.
|
|
|
|
Generates docs/figures/gpu_utilization.png — two-panel:
|
|
left: per-GPU request count
|
|
right: per-GPU compute work (uncached prefill tokens + decode tokens, stacked)
|
|
|
|
The point of the figure is to push back on the naïve reading
|
|
"KVC's prefill GPU is idle 90% of the time, so KVC is using fewer GPUs."
|
|
|
|
By request count, the prefill GPU is indeed touched by only ~8% of requests.
|
|
By compute work, the prefill GPU bears comparable per-GPU load to each
|
|
decode GPU — it is a low-frequency, high-cost safety net for cache misses,
|
|
not idle capacity.
|
|
|
|
Work attribution:
|
|
KVC direct-to-D path: prefill happens locally on the assigned D worker
|
|
(append-prefill of `uncached_tokens` tokens).
|
|
KVC seed/reseed/fallback path: prefill happens on prefill-0
|
|
(full uncached_tokens), decode on assigned D.
|
|
DP: all work on assigned direct-N worker.
|
|
|
|
Aborted / errored requests are excluded.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
|
|
# Base directory for every path below: the third ancestor of this file
# (index 2 of the resolved path's parents).
ROOT = Path(__file__).resolve().parents[2]

# Input: per-request metrics (JSON lines) for the KVC 1P3D v2 run.
KVC = ROOT.joinpath(
    "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
)

# Input: per-request metrics (JSON lines) for the 4-way DP run.
DP = ROOT.joinpath("outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl")

# Output: where the rendered two-panel figure is written.
OUT = ROOT.joinpath("docs/figures/gpu_utilization.png")
|
|
|
|
|
|
def load(p: Path) -> list[dict]:
    """Read a JSON-lines file and return one parsed object per record.

    Args:
        p: Path to a file holding one JSON document per line.

    Returns:
        The parsed records, in file order. Blank / whitespace-only lines
        (e.g. a trailing empty line) are skipped instead of being parsed.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager guarantees the handle is closed even when a line
    # fails to parse; UTF-8 is pinned so results do not depend on the locale.
    with p.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]
|
|
|
|
|
|
def is_failed(r: dict) -> bool:
    """Tell whether a metrics record describes a failed request.

    A record counts as failed when its ``"error"`` entry is truthy, or when
    its ``"finish_reason"`` entry (compared case-insensitively, after
    ``str()`` conversion) contains ``"abort"`` or ``"badrequest"``.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        # Missing or empty finish reason: nothing marks the request as failed.
        return False

    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))
|
|
|
|
|
|
def uncached(r: dict) -> int:
    """Return the number of input tokens not served from cache.

    Computed as ``input_length - cached_tokens``, clamped at zero so a
    record reporting more cached tokens than input tokens never yields a
    negative amount.

    Args:
        r: A metrics record. ``"input_length"`` is required;
            ``"cached_tokens"`` may be absent or ``None`` (JSON ``null``),
            both of which are treated as 0.

    Raises:
        KeyError: if ``"input_length"`` is missing from the record.
    """
    # `or 0` (rather than a .get default) also covers an explicit null value,
    # which would otherwise make the subtraction raise TypeError.
    cached = r.get("cached_tokens") or 0
    return max(0, r["input_length"] - cached)
|
|
|
|
|
|
def out_tokens(r: dict) -> int:
    """Return the output-token count recorded for a request.

    Uses the first truthy value among ``"actual_output_tokens"`` and
    ``"output_length"`` (in that order); returns 0 when neither entry is
    present or both are falsy.
    """
    for key in ("actual_output_tokens", "output_length"):
        value = r.get(key)
        if value:
            return value
    return 0
|
|
|
|
|
|
def main() -> None:
    """Render the per-GPU utilization figure and print the numbers behind it.

    Steps:
      1. Load the KVC and DP metrics files and drop failed requests.
      2. Attribute request counts, uncached-prefill tokens and decode tokens
         to each worker.
      3. Draw a two-panel bar chart (request count / stacked compute tokens)
         and save it to ``OUT``.
      4. Print a per-GPU table of the plotted values.

    Raises:
        KeyError: if a record lacks ``"assigned_decode_node"``, or lacks
            ``"assigned_prefill_node"`` on a path that attributes work to
            the prefill worker.
    """
    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    # ------------------------------------------------------------------
    # KVC per-GPU attribution
    # ------------------------------------------------------------------
    kvc_req_count = defaultdict(int)
    kvc_prefill_tokens = defaultdict(int)  # uncached prefill compute
    kvc_decode_tokens = defaultdict(int)

    for r in kvc:
        d = r["assigned_decode_node"]  # decode-0/1/2
        # Both paths count the request on, and decode on, the assigned D worker.
        kvc_req_count[d] += 1
        kvc_decode_tokens[d] += out_tokens(r)
        if r.get("execution_mode", "") == "kvcache-direct-to-d-session":
            # P is bypassed entirely; D does the append-prefill + decode
            kvc_prefill_tokens[d] += uncached(r)
        else:
            # P does the full prefill; D handles decode. The prefill node is
            # only looked up here, where it is actually needed.
            p = r["assigned_prefill_node"]  # prefill-0
            kvc_req_count[p] += 1
            kvc_prefill_tokens[p] += uncached(r)

    # ------------------------------------------------------------------
    # DP per-GPU attribution (fused P+D on every worker)
    # ------------------------------------------------------------------
    dp_req_count = defaultdict(int)
    dp_prefill_tokens = defaultdict(int)
    dp_decode_tokens = defaultdict(int)

    for r in dp:
        w = r["assigned_decode_node"]  # direct-0..3
        dp_req_count[w] += 1
        dp_prefill_tokens[w] += uncached(r)
        dp_decode_tokens[w] += out_tokens(r)

    # ------------------------------------------------------------------
    # Build ordered GPU list, KVC then DP
    # ------------------------------------------------------------------
    kvc_gpus = ["prefill-0", "decode-0", "decode-1", "decode-2"]
    dp_gpus = ["direct-0", "direct-1", "direct-2", "direct-3"]
    all_gpus = kvc_gpus + dp_gpus

    # .get() (not indexing) so workers with no traffic read as 0 without
    # inserting new keys into the defaultdicts.
    counts = (
        [kvc_req_count.get(g, 0) for g in kvc_gpus]
        + [dp_req_count.get(g, 0) for g in dp_gpus]
    )
    prefill_tk = (
        [kvc_prefill_tokens.get(g, 0) for g in kvc_gpus]
        + [dp_prefill_tokens.get(g, 0) for g in dp_gpus]
    )
    decode_tk = (
        [kvc_decode_tokens.get(g, 0) for g in kvc_gpus]
        + [dp_decode_tokens.get(g, 0) for g in dp_gpus]
    )

    # Display labels: P/D role + worker id
    labels = [
        "KVC P\nprefill-0",
        "KVC D\ndecode-0",
        "KVC D\ndecode-1",
        "KVC D\ndecode-2",
        "DP P+D\ndirect-0",
        "DP P+D\ndirect-1",
        "DP P+D\ndirect-2",
        "DP P+D\ndirect-3",
    ]

    KVC_P_COLOR = "#E89D44"  # orange — P GPU stands out
    KVC_D_COLOR = "#1F77B4"  # blue
    DP_COLOR = "#D62728"  # red

    bar_colors = [KVC_P_COLOR] + [KVC_D_COLOR] * 3 + [DP_COLOR] * 4

    fig, axes = plt.subplots(1, 2, figsize=(15, 6.5))
    x = np.arange(len(all_gpus))

    # -- Left: per-GPU request count ----------------------------------
    ax = axes[0]
    ax.bar(x, counts, color=bar_colors, edgecolor="black", linewidth=0.6)
    for xi, c in zip(x, counts):
        ax.text(xi, c + max(counts) * 0.015, f"{c:,}",
                ha="center", va="bottom", fontsize=9.5)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Number of requests touching this GPU", fontsize=11)
    ax.set_title("Per-GPU request count\n(naïve view: P seems idle)", fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # Annotate: KVC P GPU is "low frequency"
    p_idx = 0
    # Share of successful KVC requests that touched the P GPU; guarded so an
    # empty KVC run does not raise ZeroDivisionError.
    p_share = counts[p_idx] / len(kvc) * 100 if kvc else 0.0
    ax.annotate(
        f"P GPU only sees\n"
        f"{counts[p_idx]:,} requests\n"
        f"({p_share:.1f}% of total)",
        xy=(p_idx, counts[p_idx]),
        xytext=(p_idx + 0.6, max(counts) * 0.55),
        fontsize=9, color=KVC_P_COLOR, fontweight="bold",
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # -- Right: per-GPU compute work (stacked prefill + decode) -------
    ax = axes[1]
    prefill_M = [t / 1e6 for t in prefill_tk]
    decode_M = [t / 1e6 for t in decode_tk]
    total_M = [p + d for p, d in zip(prefill_M, decode_M)]

    ax.bar(x, prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, label="Uncached prefill tokens",
           alpha=0.95)
    ax.bar(x, decode_M, bottom=prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, hatch="///",
           label="Decode tokens", alpha=0.55)

    for xi, t in zip(x, total_M):
        ax.text(xi, t + max(total_M) * 0.015, f"{t:.2f}M",
                ha="center", va="bottom", fontsize=9.5)

    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Compute tokens (millions)", fontsize=11)
    ax.set_title("Per-GPU compute work\n(work view: P is comparable to each D)",
                 fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    ax.legend(loc="upper left", fontsize=10, framealpha=0.95)

    # Annotate: KVC P GPU does similar work to each D
    ax.annotate(
        f"P GPU does {total_M[p_idx]:.2f}M tokens of\n"
        f"prefill — comparable per-GPU\n"
        f"load to each KVC D worker",
        xy=(p_idx, total_M[p_idx]),
        xytext=(p_idx + 0.6, max(total_M) * 0.62),
        fontsize=9, color=KVC_P_COLOR, fontweight="bold",
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # Separator + group labels
    for panel in axes:
        # x = 3.5 sits between the last KVC bar (index 3) and the first DP bar.
        panel.axvline(3.5, color="gray", linestyle="--", linewidth=1.0, alpha=0.5)
        _, ymax = panel.get_ylim()
        panel.text(1.5, ymax * 1.05, "KVC 1P3D", ha="center", fontsize=11,
                   fontweight="bold", color="#444")
        panel.text(5.5, ymax * 1.05, "DP 4-way CA", ha="center", fontsize=11,
                   fontweight="bold", color="#444")

    # NOTE(review): the "8%" in this title is hard-coded text, not derived
    # from the loaded data — confirm it still matches when inputs change.
    fig.suptitle(
        "Per-GPU utilization: \"is KVC's prefill GPU wasted?\"\n"
        "Left view says yes (only 8% of requests); right view says no (comparable work to each D).",
        fontsize=13, y=1.02,
    )
    fig.tight_layout()
    # Make sure the target directory exists so savefig cannot fail on a
    # fresh checkout.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print numbers for doc reference
    # ------------------------------------------------------------------
    print("\n=== Per-GPU numbers ===")
    print(f"{'GPU':<22} {'requests':>10} {'prefill(M)':>12} {'decode(M)':>12} {'total(M)':>10}")
    for lbl, n, pM, dM in zip(labels, counts, prefill_M, decode_M):
        # chr(10) is "\n": labels are two-line for the plot, one-line here.
        print(f" {lbl.replace(chr(10), ' '):<20} {n:>10} {pM:>12.3f} {dM:>12.3f} {pM+dM:>10.3f}")
|
|
|
|
|
|
# Script entry point: build the figure only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|