#!/usr/bin/env python3
"""System compute economy: KVC 1P3D v2 vs 4-way DP CA.

Generates docs/figures/gpu_utilization.png -- two-panel:
  left:  total system compute (stacked by work type)
  right: per-GPU compute distribution (specialized vs fused)

The punchline is the TOTAL system compute reduction:
  KVC v2 system: 3.47 M tokens of compute (1.07 P-prefill + 1.39 D-append + 1.01 decode)
  DP 4-way:      5.17 M tokens of compute (4.17 full-prefill + 1.00 decode)
  → KVC does 33% LESS compute for the SAME workload (same 4449 requests).

This is the non-trivial finding: session affinity converts to reduced
system-wide work, not just locality. The per-GPU panel then explains the
architectural shape: KVC concentrates heavy prefill on a specialized P
worker, leaves D workers with light append + decode; DP forces every worker
to absorb the full prefill load mixed with decode.

The earlier version of this figure showed per-GPU request count + per-GPU
compute and was confusing to external reviewers ("P doing prefill is
trivial"). This version leads with the system-total comparison, which IS the
non-trivial result.

Aborted / errored requests are excluded.
"""
from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path

import numpy as np

ROOT = Path(__file__).resolve().parents[2]
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
OUT = ROOT / "docs/figures/gpu_utilization.png"

# execution_mode value for requests that bypass the P worker entirely.
_DIRECT_TO_D_MODE = "kvcache-direct-to-d-session"


def load(p: Path) -> list[dict]:
    """Read a JSON-lines file and return one decoded object per non-blank line.

    The file is opened in a ``with`` block so the handle is always closed,
    and whitespace-only lines (e.g. a trailing newline at EOF) are skipped
    instead of crashing ``json.loads``.
    """
    with p.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def is_failed(r: dict) -> bool:
    """Return True when a metrics record describes an errored/aborted request.

    A record counts as failed if it has a truthy ``error`` field, or if its
    ``finish_reason`` contains "abort" or "badrequest" (case-insensitive).
    """
    if r.get("error"):
        return True
    fr = r.get("finish_reason")
    if not fr:
        return False
    reason = str(fr).lower()
    return "abort" in reason or "badrequest" in reason


def uncached(r: dict) -> int:
    """Return the number of input tokens not served from cache (never negative).

    A missing *or null* ``cached_tokens`` is treated as 0, mirroring the
    ``or`` fallback used by :func:`out_tokens`. ``input_length`` is required;
    a record without it raises ``KeyError``.
    """
    return max(0, r["input_length"] - (r.get("cached_tokens") or 0))


def out_tokens(r: dict) -> int:
    """Return the output token count of a record.

    Prefers ``actual_output_tokens``, falls back to ``output_length``, and
    finally to 0 when both are missing or falsy.
    """
    return r.get("actual_output_tokens") or r.get("output_length") or 0


def _attribute_kvc(records: list[dict]) -> tuple[dict, dict]:
    """Sum KVC prefill and decode tokens per node name.

    Direct-to-D requests charge their uncached prefill to the decode node;
    every other request charges it to the prefill node. Decode tokens always
    go to the decode node. Returns ``(prefill_by_node, decode_by_node)``.
    """
    prefill_by_node = defaultdict(int)
    decode_by_node = defaultdict(int)
    for r in records:
        decode_node = r["assigned_decode_node"]
        if r.get("execution_mode", "") == _DIRECT_TO_D_MODE:
            # P bypassed; D does small append-prefill + decode
            prefill_node = decode_node
        else:
            # P does heavy prefill; D handles decode. The prefill node is
            # only looked up here so direct-to-D records need not carry it.
            prefill_node = r["assigned_prefill_node"]
        prefill_by_node[prefill_node] += uncached(r)
        decode_by_node[decode_node] += out_tokens(r)
    return prefill_by_node, decode_by_node


def _attribute_dp(records: list[dict]) -> tuple[dict, dict]:
    """Sum DP prefill and decode tokens per worker (fused P+D on every worker).

    Both kinds of work are charged to ``assigned_decode_node``.
    Returns ``(prefill_by_node, decode_by_node)``.
    """
    prefill_by_node = defaultdict(int)
    decode_by_node = defaultdict(int)
    for r in records:
        worker = r["assigned_decode_node"]
        prefill_by_node[worker] += uncached(r)
        decode_by_node[worker] += out_tokens(r)
    return prefill_by_node, decode_by_node


def main() -> None:
    """Build the two-panel compute figure, save it to ``OUT`` and print totals.

    Raises:
        ValueError: if the DP run contributes no compute at all, since the
            relative saving would be undefined.
    """
    # Imported lazily so the pure data helpers above stay importable on
    # machines that have no plotting stack installed.
    import matplotlib.pyplot as plt

    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    # ------------------------------------------------------------------
    # Per-GPU + per-work-type attribution
    # ------------------------------------------------------------------
    kvc_prefill_tokens, kvc_decode_tokens = _attribute_kvc(kvc)
    dp_prefill_tokens, dp_decode_tokens = _attribute_dp(dp)

    # ------------------------------------------------------------------
    # Aggregate work by category for the left panel
    # ------------------------------------------------------------------
    kvc_p_prefill = kvc_prefill_tokens.get("prefill-0", 0)
    kvc_d_prefill = sum(
        v for k, v in kvc_prefill_tokens.items() if k.startswith("decode-")
    )
    kvc_d_decode = sum(kvc_decode_tokens.values())
    kvc_total = kvc_p_prefill + kvc_d_prefill + kvc_d_decode

    dp_prefill_total = sum(dp_prefill_tokens.values())
    dp_decode_total = sum(dp_decode_tokens.values())
    dp_total = dp_prefill_total + dp_decode_total

    if dp_total == 0:
        # Fail with an actionable message instead of a bare ZeroDivisionError.
        raise ValueError(
            f"no DP compute found in {DP}; cannot compute the relative saving"
        )

    M = 1e6
    saving_pct = (1 - kvc_total / dp_total) * 100

    # ------------------------------------------------------------------
    # Colors
    # ------------------------------------------------------------------
    KVC_P_COLOR = "#E89D44"  # orange — P GPU
    KVC_D_PREF_COLOR = "#7AB6D9"  # light blue — D-side small append-prefill
    KVC_D_DEC_COLOR = "#1F77B4"  # dark blue — D-side decode
    DP_PREF_COLOR = "#E07474"  # light red — DP full prefill
    DP_DEC_COLOR = "#D62728"  # dark red — DP decode

    fig, axes = plt.subplots(1, 2, figsize=(15, 7.0))

    # ==================================================================
    # Left panel: System-wide compute, stacked by work type
    # ==================================================================
    ax = axes[0]
    x = np.array([0, 1])
    bar_w = 0.55

    # KVC stack: P-prefill (bottom orange) + D-prefill (light blue) + D-decode (dark blue)
    ax.bar(0, kvc_p_prefill / M, bar_w, color=KVC_P_COLOR,
           edgecolor="black", linewidth=0.6,
           label="KVC: P-side heavy prefill (reseed / seed)")
    ax.bar(0, kvc_d_prefill / M, bar_w, bottom=kvc_p_prefill / M,
           color=KVC_D_PREF_COLOR, edgecolor="black", linewidth=0.6,
           label="KVC: D-side append-prefill (direct-to-D, small)")
    ax.bar(0, kvc_d_decode / M, bar_w,
           bottom=(kvc_p_prefill + kvc_d_prefill) / M,
           color=KVC_D_DEC_COLOR, edgecolor="black", linewidth=0.6,
           label="Decode (both)")

    # DP stack: full prefill (light red) + decode (dark red)
    ax.bar(1, dp_prefill_total / M, bar_w, color=DP_PREF_COLOR,
           edgecolor="black", linewidth=0.6,
           label="DP: fused worker prefill (full uncached)")
    ax.bar(1, dp_decode_total / M, bar_w, bottom=dp_prefill_total / M,
           color=DP_DEC_COLOR, edgecolor="black", linewidth=0.6,
           label="_nolegend_")

    # Inline labels for stack segments. `ax` is bound as a default so the
    # helper keeps targeting the left panel even after `ax` is rebound below.
    def stack_label(xpos, ypos, text, color="white", fontsize=10, ax=ax):
        ax.text(xpos, ypos, text, ha="center", va="center",
                fontsize=fontsize, color=color, fontweight="bold")

    stack_label(0, kvc_p_prefill / M / 2,
                f"P heavy prefill\n{kvc_p_prefill/M:.2f}M")
    stack_label(0, (kvc_p_prefill + kvc_d_prefill / 2) / M,
                f"D append-prefill\n{kvc_d_prefill/M:.2f}M", color="black")
    stack_label(0, (kvc_p_prefill + kvc_d_prefill + kvc_d_decode / 2) / M,
                f"D decode\n{kvc_d_decode/M:.2f}M")
    stack_label(1, dp_prefill_total / M / 2,
                f"Full prefill\n(every worker)\n{dp_prefill_total/M:.2f}M",
                color="black")
    stack_label(1, (dp_prefill_total + dp_decode_total / 2) / M,
                f"Decode\n{dp_decode_total/M:.2f}M")

    # Totals on top
    ax.text(0, kvc_total / M + 0.15, f"{kvc_total/M:.2f}M tokens",
            ha="center", va="bottom", fontsize=12, fontweight="bold",
            color="#1F77B4")
    ax.text(1, dp_total / M + 0.15, f"{dp_total/M:.2f}M tokens",
            ha="center", va="bottom", fontsize=12, fontweight="bold",
            color="#D62728")

    # Big savings annotation — placed centrally inside the panel,
    # bracketed by a horizontal arrow connecting the bar tops.
    tallest = max(kvc_total, dp_total) / M
    headroom_top = tallest * 1.42
    arrow_y = tallest * 1.08
    text_y = tallest * 1.22
    ax.annotate("", xy=(0.78, arrow_y), xytext=(0.22, arrow_y),
                arrowprops=dict(arrowstyle="<->", color="#2C8C2C", lw=1.8))
    ax.text(
        0.5, text_y,
        f"−{saving_pct:.0f}%\ntotal compute",
        ha="center", va="center", fontsize=13, fontweight="bold",
        color="#2C8C2C",
        bbox=dict(facecolor="#E8F5E8", edgecolor="#2C8C2C", alpha=0.95, pad=5),
    )

    ax.set_xticks(x)
    ax.set_xlim(-0.5, 1.5)
    ax.set_xticklabels(["KVC 1P3D v2", "DP 4-way CA"],
                       fontsize=12, fontweight="bold")
    ax.set_ylabel("Total system compute (millions of token-equivalents)",
                  fontsize=11)
    ax.set_ylim(0, headroom_top)
    ax.set_title("System-wide compute economy | same 4449-request workload",
                 fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    ax.legend(loc="upper left", fontsize=8.5, framealpha=0.95)

    # ==================================================================
    # Right panel: per-GPU breakdown showing the architectural shape
    # ==================================================================
    ax = axes[1]
    kvc_gpus = ["prefill-0", "decode-0", "decode-1", "decode-2"]
    dp_gpus = ["direct-0", "direct-1", "direct-2", "direct-3"]
    all_gpus = kvc_gpus + dp_gpus
    labels = [
        "KVC\nP-only", "KVC\nD-0", "KVC\nD-1", "KVC\nD-2",
        "DP\nP+D-0", "DP\nP+D-1", "DP\nP+D-2", "DP\nP+D-3",
    ]
    x = np.arange(len(all_gpus))

    prefill_M = ([kvc_prefill_tokens.get(g, 0) / M for g in kvc_gpus]
                 + [dp_prefill_tokens.get(g, 0) / M for g in dp_gpus])
    decode_M = ([kvc_decode_tokens.get(g, 0) / M for g in kvc_gpus]
                + [dp_decode_tokens.get(g, 0) / M for g in dp_gpus])

    # Color by group: orange for KVC P, blue for KVC D, red for DP
    bar_colors_prefill = [KVC_P_COLOR,
                          KVC_D_PREF_COLOR, KVC_D_PREF_COLOR, KVC_D_PREF_COLOR,
                          DP_PREF_COLOR, DP_PREF_COLOR, DP_PREF_COLOR, DP_PREF_COLOR]
    bar_colors_decode = [KVC_D_DEC_COLOR,
                         KVC_D_DEC_COLOR, KVC_D_DEC_COLOR, KVC_D_DEC_COLOR,
                         DP_DEC_COLOR, DP_DEC_COLOR, DP_DEC_COLOR, DP_DEC_COLOR]

    ax.bar(x, prefill_M, color=bar_colors_prefill,
           edgecolor="black", linewidth=0.5, label="Prefill compute")
    ax.bar(x, decode_M, bottom=prefill_M, color=bar_colors_decode,
           edgecolor="black", linewidth=0.5, hatch="///", alpha=0.75,
           label="Decode compute")

    total_M = [p + d for p, d in zip(prefill_M, decode_M)]
    for xi, t in zip(x, total_M):
        ax.text(xi, t + max(total_M) * 0.015, f"{t:.2f}M",
                ha="center", va="bottom", fontsize=9.5)

    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Compute (millions of token-equivalents)", fontsize=11)
    ax.set_ylim(0, max(total_M) * 1.30)
    ax.set_title(
        "Where the work lives | specialized P + light D vs uniform fused workers",
        fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # Separator + headline takeaways under the GROUP labels (in axes
    # fraction coords so they don't shift if ylim changes).
    ax.axvline(3.5, color="gray", linestyle="--", linewidth=1.0, alpha=0.5)
    ax.text(
        0.22, 0.97,
        f"KVC: P specialized for heavy prefill\nD workers ~{np.mean(total_M[1:4]):.2f}M each (light)",
        transform=ax.transAxes, ha="center", va="top", fontsize=9.5,
        bbox=dict(facecolor="#FFFAE6", edgecolor="#888", alpha=0.92, pad=4),
    )
    ax.text(
        0.78, 0.97,
        f"DP: every worker {np.mean(total_M[4:]):.2f}M (fused)\nfull prefill interleaved with decode",
        transform=ax.transAxes, ha="center", va="top", fontsize=9.5,
        bbox=dict(facecolor="#FFE8E8", edgecolor="#888", alpha=0.92, pad=4),
    )

    # No second legend on the right panel — the colours are already
    # introduced in the left panel and the in-panel annotation boxes
    # explain what each group means. Decode being hatched is signalled
    # in the right-panel bar style itself.

    # The headline percentage is taken from saving_pct so it can never
    # disagree with the left-panel annotation.
    # NOTE(review): the 4449 / 91.6% / 341 figures below (and 4449 in the
    # left-panel title) are still hard-coded — presumably from the reference
    # run; confirm they match the input files when regenerating.
    fig.suptitle(
        f"KVC v2 reduces system-wide compute by {saving_pct:.0f}% vs DP 4-way CA, same workload (4449 requests).\n"
        "Mechanism: 91.6% of requests find their prefix cached on the affinity-pinned D worker\n"
        "(append-prefill = 341 tokens on avg), so the total prefill work the system must do is much smaller.",
        fontsize=12, y=1.05,
    )

    plt.tight_layout()
    # Create docs/figures/ on a fresh checkout instead of failing in savefig.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print numbers for doc reference
    # ------------------------------------------------------------------
    print("\n=== System totals ===")
    print(f"KVC v2 total: {kvc_total/M:.3f}M tokens")
    print(f" P heavy prefill: {kvc_p_prefill/M:.3f}M")
    print(f" D append-prefill: {kvc_d_prefill/M:.3f}M")
    print(f" D decode: {kvc_d_decode/M:.3f}M")
    print(f"DP 4w total: {dp_total/M:.3f}M tokens")
    print(f" Full prefill: {dp_prefill_total/M:.3f}M")
    print(f" Decode: {dp_decode_total/M:.3f}M")
    print(f"\nKVC vs DP: -{saving_pct:.1f}% total compute saved")

    print("\n=== Per-GPU breakdown ===")
    for lbl, p, d in zip(labels, prefill_M, decode_M):
        print(f" {lbl.replace(chr(10), ' '):<14} prefill={p:.3f}M decode={d:.3f}M total={p+d:.3f}M")


if __name__ == "__main__":
    main()