agentic-pd-hybrid/scripts/analysis/plot_gpu_utilization.py

#!/usr/bin/env python3
"""System compute economy: KVC 1P3D v2 vs 4-way DP CA.

Generates docs/figures/gpu_utilization.png -- two-panel:
  left:  total system compute (stacked by work type)
  right: per-GPU compute distribution (specialized vs fused)

The punchline is the TOTAL system compute reduction:
  KVC v2 system: 3.47 M tokens of compute (1.07 P-prefill + 1.39 D-append + 1.01 decode)
  DP 4-way:      5.17 M tokens of compute (4.17 full-prefill + 1.00 decode)
  → KVC does 33% LESS compute for the SAME workload (same 4449 requests).

This is the non-trivial finding: session affinity converts to reduced
system-wide work, not just locality. The per-GPU panel then explains
the architectural shape: KVC concentrates heavy prefill on a specialized
P worker, leaves D workers with light append + decode; DP forces every
worker to absorb the full prefill load mixed with decode.

The earlier version of this figure showed per-GPU request count + per-GPU
compute and was confusing to external reviewers ("P doing prefill is
trivial"). This version leads with the system-total comparison, which IS
the non-trivial result.

Aborted / errored requests are excluded.
"""

from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

# Base directory for all inputs/outputs: parents[2] of this file's resolved
# path, i.e. two levels above the directory that contains the script.
ROOT = Path(__file__).resolve().parents[2]
# JSONL metrics for the KVC 1P3D migration-v2 run (parsed one JSON object per line).
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
# JSONL metrics for the 4-way DP baseline run.
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
# Destination of the rendered two-panel figure.
OUT = ROOT / "docs/figures/gpu_utilization.png"


def load(p: Path) -> list[dict]:
    """Parse a JSONL file into a list of records.

    Args:
        p: Path to a file holding one JSON document per line.

    Returns:
        The decoded documents, in file order. Blank / whitespace-only
        lines (e.g. a trailing newline at EOF) are skipped instead of
        being handed to ``json.loads``.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Context manager so the handle is closed deterministically rather than
    # whenever the garbage collector gets to it.
    with p.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def is_failed(r: dict) -> bool:
    """Return True when a request record should be excluded as failed.

    A record counts as failed if its ``error`` field is truthy, or if the
    lower-cased string form of a truthy ``finish_reason`` contains
    ``"abort"`` or ``"badrequest"``.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        return False

    # str() first: the substring match is applied to whatever the field
    # holds, not only to plain strings.
    text = str(reason).lower()
    return any(marker in text for marker in ("abort", "badrequest"))


def uncached(r: dict) -> int:
    """Return how many input tokens of a request were not served from cache.

    Computed as ``input_length - cached_tokens``, clamped at 0 so a record
    reporting more cached tokens than input tokens never contributes
    negative work.

    A missing *or null* ``cached_tokens`` is treated as 0 (nothing cached),
    mirroring how ``out_tokens`` tolerates null fields.

    Raises:
        KeyError: if the record has no ``input_length``.
    """
    cached = r.get("cached_tokens") or 0
    return max(0, r["input_length"] - cached)


def out_tokens(r: dict) -> int:
    """Return the output-token count recorded for a request.

    Prefers ``actual_output_tokens`` and falls back to ``output_length``;
    a field that is missing, null or 0 is skipped. Returns 0 when neither
    field holds a truthy value.
    """
    for field in ("actual_output_tokens", "output_length"):
        count = r.get(field)
        if count:
            return count
    return 0


# Fill colours shared by both panels.
_KVC_P_COLOR = "#E89D44"       # orange — P GPU
_KVC_D_PREF_COLOR = "#7AB6D9"  # light blue — D-side small append-prefill
_KVC_D_DEC_COLOR = "#1F77B4"   # dark blue — D-side decode
_DP_PREF_COLOR = "#E07474"     # light red — DP full prefill
_DP_DEC_COLOR = "#D62728"      # dark red — DP decode

# execution_mode value marking requests that skipped the P worker.
_DIRECT_TO_D_MODE = "kvcache-direct-to-d-session"

# Token counts are plotted and printed in millions.
_M = 1e6


def _attribute_kvc(records: list[dict]) -> tuple[defaultdict, defaultdict]:
    """Sum uncached-prefill and decode tokens per node for the KVC run.

    Returns ``(prefill_by_node, decode_by_node)``. Decode tokens always go
    to the record's ``assigned_decode_node``; prefill tokens go to that
    same node for direct-to-D requests and to ``assigned_prefill_node``
    otherwise.
    """
    prefill = defaultdict(int)
    decode = defaultdict(int)
    for r in records:
        d_node = r["assigned_decode_node"]
        if r.get("execution_mode", "") == _DIRECT_TO_D_MODE:
            # P bypassed; D does small append-prefill + decode
            prefill_node = d_node
        else:
            # P does heavy prefill; D handles decode. The prefill node is
            # only looked up here, so records that bypassed P do not need
            # to carry that key at all.
            prefill_node = r["assigned_prefill_node"]
        prefill[prefill_node] += uncached(r)
        decode[d_node] += out_tokens(r)
    return prefill, decode


def _attribute_dp(records: list[dict]) -> tuple[defaultdict, defaultdict]:
    """Sum uncached-prefill and decode tokens per worker for the DP run.

    Both kinds of work are charged to ``assigned_decode_node`` (fused P+D
    on every worker). Returns ``(prefill_by_worker, decode_by_worker)``.
    """
    prefill = defaultdict(int)
    decode = defaultdict(int)
    for r in records:
        worker = r["assigned_decode_node"]
        prefill[worker] += uncached(r)
        decode[worker] += out_tokens(r)
    return prefill, decode


def _draw_system_panel(ax, kvc_p_prefill, kvc_d_prefill, kvc_d_decode,
                       dp_prefill_total, dp_decode_total, saving_pct) -> None:
    """Left panel: one stacked bar per system, split by work type.

    All token arguments are raw token counts; they are scaled to millions
    for plotting. ``saving_pct`` is the KVC-vs-DP reduction in percent.
    """
    kvc_total = kvc_p_prefill + kvc_d_prefill + kvc_d_decode
    dp_total = dp_prefill_total + dp_decode_total
    x = np.array([0, 1])
    bar_w = 0.55
    edge = dict(edgecolor="black", linewidth=0.6)

    # KVC stack: P-prefill (bottom orange) + D-prefill (light blue) + D-decode (dark blue)
    ax.bar(0, kvc_p_prefill / _M, bar_w, color=_KVC_P_COLOR,
           label="KVC: P-side heavy prefill  (reseed / seed)", **edge)
    ax.bar(0, kvc_d_prefill / _M, bar_w, bottom=kvc_p_prefill / _M,
           color=_KVC_D_PREF_COLOR,
           label="KVC: D-side append-prefill  (direct-to-D, small)", **edge)
    ax.bar(0, kvc_d_decode / _M, bar_w,
           bottom=(kvc_p_prefill + kvc_d_prefill) / _M,
           color=_KVC_D_DEC_COLOR,
           label="Decode  (both)", **edge)

    # DP stack: full prefill (light red) + decode (dark red). The decode
    # segment is kept out of the legend; "Decode  (both)" above covers it.
    ax.bar(1, dp_prefill_total / _M, bar_w,
           color=_DP_PREF_COLOR,
           label="DP: fused worker prefill  (full uncached)", **edge)
    ax.bar(1, dp_decode_total / _M, bar_w, bottom=dp_prefill_total / _M,
           color=_DP_DEC_COLOR,
           label="_nolegend_", **edge)

    # Inline labels, each centred vertically inside its stack segment.
    def stack_label(xpos, ypos, text, color="white", fontsize=10):
        ax.text(xpos, ypos, text, ha="center", va="center",
                fontsize=fontsize, color=color, fontweight="bold")

    stack_label(0, kvc_p_prefill / _M / 2,
                f"P heavy prefill\n{kvc_p_prefill/_M:.2f}M")
    stack_label(0, (kvc_p_prefill + kvc_d_prefill / 2) / _M,
                f"D append-prefill\n{kvc_d_prefill/_M:.2f}M",
                color="black")
    stack_label(0, (kvc_p_prefill + kvc_d_prefill + kvc_d_decode / 2) / _M,
                f"D decode\n{kvc_d_decode/_M:.2f}M")
    stack_label(1, dp_prefill_total / _M / 2,
                f"Full prefill\n(every worker)\n{dp_prefill_total/_M:.2f}M",
                color="black")
    stack_label(1, (dp_prefill_total + dp_decode_total / 2) / _M,
                f"Decode\n{dp_decode_total/_M:.2f}M")

    # Totals on top
    ax.text(0, kvc_total / _M + 0.15, f"{kvc_total/_M:.2f}M tokens",
            ha="center", va="bottom", fontsize=12, fontweight="bold",
            color="#1F77B4")
    ax.text(1, dp_total / _M + 0.15, f"{dp_total/_M:.2f}M tokens",
            ha="center", va="bottom", fontsize=12, fontweight="bold",
            color="#D62728")

    # Big savings annotation: a double-headed arrow between the two bars
    # with the percentage boxed above it. Heights are multiples of the
    # taller bar so the annotation always clears both stacks.
    tallest = max(kvc_total, dp_total) / _M
    headroom_top = tallest * 1.42
    arrow_y = tallest * 1.08
    text_y = tallest * 1.22

    ax.annotate("", xy=(0.78, arrow_y), xytext=(0.22, arrow_y),
                arrowprops=dict(arrowstyle="<->", color="#2C8C2C", lw=1.8))
    ax.text(
        0.5, text_y, f"−{saving_pct:.0f}%\ntotal compute",
        ha="center", va="center",
        fontsize=13, fontweight="bold", color="#2C8C2C",
        bbox=dict(facecolor="#E8F5E8", edgecolor="#2C8C2C", alpha=0.95, pad=5),
    )

    ax.set_xticks(x)
    ax.set_xlim(-0.5, 1.5)
    ax.set_xticklabels(["KVC 1P3D v2", "DP 4-way CA"], fontsize=12, fontweight="bold")
    ax.set_ylabel("Total system compute  (millions of token-equivalents)", fontsize=11)
    ax.set_ylim(0, headroom_top)
    # NOTE(review): the request count in this title is a literal, not
    # derived from the loaded records — confirm it when inputs change.
    ax.set_title("System-wide compute economy   |   same 4449-request workload",
                 fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    ax.legend(loc="upper left", fontsize=8.5, framealpha=0.95)


def _draw_per_gpu_panel(ax, labels, prefill_M, decode_M) -> None:
    """Right panel: per-GPU stacked bars (prefill below, hatched decode above).

    ``prefill_M`` / ``decode_M`` are in millions of tokens and ordered as
    four KVC GPUs (P first, then three D) followed by four DP workers;
    the colour lists and the slice-based means below rely on that layout.
    """
    x = np.arange(len(labels))

    # Color by group: orange for KVC P, blue for KVC D, red for DP
    bar_colors_prefill = ([_KVC_P_COLOR]
                          + [_KVC_D_PREF_COLOR] * 3
                          + [_DP_PREF_COLOR] * 4)
    bar_colors_decode = [_KVC_D_DEC_COLOR] * 4 + [_DP_DEC_COLOR] * 4

    ax.bar(x, prefill_M, color=bar_colors_prefill,
           edgecolor="black", linewidth=0.5, label="Prefill compute")
    ax.bar(x, decode_M, bottom=prefill_M, color=bar_colors_decode,
           edgecolor="black", linewidth=0.5, hatch="///",
           alpha=0.75, label="Decode compute")

    # Per-bar total printed just above each stack.
    total_M = [p + d for p, d in zip(prefill_M, decode_M)]
    for xi, t in zip(x, total_M):
        ax.text(xi, t + max(total_M) * 0.015, f"{t:.2f}M",
                ha="center", va="bottom", fontsize=9.5)

    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Compute  (millions of token-equivalents)", fontsize=11)
    ax.set_ylim(0, max(total_M) * 1.30)
    ax.set_title("Where the work lives   |   specialized P + light D vs uniform fused workers",
                 fontsize=12, pad=10)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # Dashed separator between the KVC and DP groups, plus one takeaway
    # box per group. The boxes are placed in axes-fraction coordinates so
    # they do not move when the y-limits change.
    ax.axvline(3.5, color="gray", linestyle="--", linewidth=1.0, alpha=0.5)
    ax.text(
        0.22, 0.97,
        f"KVC: P specialized for heavy prefill\nD workers ~{np.mean(total_M[1:4]):.2f}M each (light)",
        transform=ax.transAxes, ha="center", va="top", fontsize=9.5,
        bbox=dict(facecolor="#FFFAE6", edgecolor="#888", alpha=0.92, pad=4),
    )
    ax.text(
        0.78, 0.97,
        f"DP: every worker {np.mean(total_M[4:]):.2f}M (fused)\nfull prefill interleaved with decode",
        transform=ax.transAxes, ha="center", va="top", fontsize=9.5,
        bbox=dict(facecolor="#FFE8E8", edgecolor="#888", alpha=0.92, pad=4),
    )

    # No second legend on the right panel — the colours are already
    # introduced in the left panel and the in-panel annotation boxes
    # explain what each group means. Decode being hatched is signalled
    # in the right-panel bar style itself.


def main() -> None:
    """Build the two-panel compute-economy figure and print its numbers.

    Loads the KVC and DP metrics files, drops failed requests, attributes
    uncached-prefill and decode tokens to nodes, renders the figure to
    ``OUT`` (creating its directory if needed) and prints the system totals
    and per-GPU breakdown to stdout.

    Raises:
        ValueError: if the DP baseline contributes no compute, since the
            relative saving is undefined in that case.
    """
    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    # Per-node attribution for both systems.
    kvc_prefill_tokens, kvc_decode_tokens = _attribute_kvc(kvc)
    dp_prefill_tokens, dp_decode_tokens = _attribute_dp(dp)

    # ------------------------------------------------------------------
    # Aggregate work by category for the left panel
    # ------------------------------------------------------------------
    kvc_p_prefill = kvc_prefill_tokens.get("prefill-0", 0)
    kvc_d_prefill = sum(v for k, v in kvc_prefill_tokens.items() if k.startswith("decode-"))
    kvc_d_decode = sum(kvc_decode_tokens.values())
    kvc_total = kvc_p_prefill + kvc_d_prefill + kvc_d_decode

    dp_prefill_total = sum(dp_prefill_tokens.values())
    dp_decode_total = sum(dp_decode_tokens.values())
    dp_total = dp_prefill_total + dp_decode_total

    if dp_total <= 0:
        # Fail with an actionable message instead of a bare ZeroDivisionError.
        raise ValueError(
            f"DP baseline has no compute to compare against "
            f"(no usable records in {DP})"
        )
    saving_pct = (1 - kvc_total / dp_total) * 100

    fig, axes = plt.subplots(1, 2, figsize=(15, 7.0))

    # ==================================================================
    # Left panel: System-wide compute, stacked by work type
    # ==================================================================
    _draw_system_panel(axes[0], kvc_p_prefill, kvc_d_prefill, kvc_d_decode,
                       dp_prefill_total, dp_decode_total, saving_pct)

    # ==================================================================
    # Right panel: per-GPU breakdown showing the architectural shape
    # ==================================================================
    kvc_gpus = ["prefill-0", "decode-0", "decode-1", "decode-2"]
    dp_gpus = ["direct-0", "direct-1", "direct-2", "direct-3"]
    labels = [
        "KVC\nP-only", "KVC\nD-0", "KVC\nD-1", "KVC\nD-2",
        "DP\nP+D-0", "DP\nP+D-1", "DP\nP+D-2", "DP\nP+D-3",
    ]

    prefill_M = ([kvc_prefill_tokens.get(g, 0) / _M for g in kvc_gpus]
                 + [dp_prefill_tokens.get(g, 0) / _M for g in dp_gpus])
    decode_M = ([kvc_decode_tokens.get(g, 0) / _M for g in kvc_gpus]
                + [dp_decode_tokens.get(g, 0) / _M for g in dp_gpus])

    _draw_per_gpu_panel(axes[1], labels, prefill_M, decode_M)

    # The headline percentage comes from the data so it cannot drift from
    # the number annotated in the left panel.
    # NOTE(review): the request count, cache-hit share and average
    # append-prefill size below are still literals — confirm them against
    # the run whenever the input files change.
    fig.suptitle(
        f"KVC v2 reduces system-wide compute by {saving_pct:.0f}% vs DP 4-way CA, same workload (4449 requests).\n"
        "Mechanism: 91.6% of requests find their prefix cached on the affinity-pinned D worker\n"
        "(append-prefill = 341 tokens on avg), so the total prefill work the system must do is much smaller.",
        fontsize=12, y=1.05,
    )
    plt.tight_layout()
    # savefig does not create missing directories.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print numbers for doc reference
    # ------------------------------------------------------------------
    print("\n=== System totals ===")
    print(f"KVC v2 total: {kvc_total/_M:.3f}M tokens")
    print(f"  P heavy prefill:     {kvc_p_prefill/_M:.3f}M")
    print(f"  D append-prefill:    {kvc_d_prefill/_M:.3f}M")
    print(f"  D decode:            {kvc_d_decode/_M:.3f}M")
    print(f"DP 4w total: {dp_total/_M:.3f}M tokens")
    print(f"  Full prefill:        {dp_prefill_total/_M:.3f}M")
    print(f"  Decode:              {dp_decode_total/_M:.3f}M")
    print(f"\nKVC vs DP: -{saving_pct:.1f}%  total compute saved")

    print("\n=== Per-GPU breakdown ===")
    for lbl, p, d in zip(labels, prefill_M, decode_M):
        print(f"  {lbl.replace(chr(10), ' '):<14}  prefill={p:.3f}M  decode={d:.3f}M  total={p+d:.3f}M")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()