agentic-pd-hybrid/scripts/analysis/plot_gpu_utilization.py

#!/usr/bin/env python3
"""Per-GPU utilization breakdown: KVC 1P3D v2 vs 4-way DP CA.

Generates docs/figures/gpu_utilization.png — two-panel:
  left:  per-GPU request count
  right: per-GPU compute work (uncached prefill tokens + decode tokens, stacked)

The point of the figure is to push back on the naïve reading
"KVC's prefill GPU is idle 90% of the time, so KVC is using fewer GPUs."

By request count, the prefill GPU is indeed touched by only ~8% of requests.
By compute work, the prefill GPU bears comparable per-GPU load to each
decode GPU — it is a low-frequency, high-cost safety net for cache misses,
not idle capacity.

Work attribution:
  KVC direct-to-D path: prefill happens locally on the assigned D worker
                        (append-prefill of `uncached_tokens` tokens).
  KVC seed/reseed/fallback path: prefill happens on prefill-0
                        (full uncached_tokens), decode on assigned D.
  DP: all work on assigned direct-N worker.

Aborted / errored requests are excluded.
"""

from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

# Directory two levels above the folder that contains this script; every
# path below is resolved relative to it.
ROOT = Path(__file__).resolve().parents[2]
# Per-request metrics (one JSON document per line) for the KVC 1P3D run.
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
# Per-request metrics (one JSON document per line) for the 4-way DP run.
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
# Destination of the rendered figure.
OUT = ROOT / "docs/figures/gpu_utilization.png"


def load(p: Path) -> list[dict]:
    """Read a JSON-Lines file into a list of decoded records.

    Args:
        p: Path to a file holding one JSON document per line.

    Returns:
        The decoded documents, in file order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it open until the garbage collector gets to it.
    with p.open(encoding="utf-8") as fh:
        # Whitespace-only lines (e.g. a trailing newline at end of file)
        # carry no record and would otherwise fail to parse.
        return [json.loads(line) for line in fh if line.strip()]


def is_failed(r: dict) -> bool:
    """Tell whether a request record should be treated as aborted/errored.

    A record counts as failed when its ``error`` field is truthy, or when the
    lower-cased string form of its ``finish_reason`` contains ``abort`` or
    ``badrequest``.

    Args:
        r: One request record.

    Returns:
        True for a failed record, False otherwise.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        # Missing or empty finish reason: nothing marks the request as failed.
        return False

    # finish_reason is stringified first, so non-string values are matched on
    # their textual representation.
    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))


def uncached(r: dict) -> int:
    """Return the number of input tokens that were not served from cache.

    Args:
        r: One request record. Must contain ``input_length``; ``cached_tokens``
            is optional.

    Returns:
        ``input_length - cached_tokens``, clamped so it is never negative.

    Raises:
        KeyError: If ``input_length`` is absent from the record.
    """
    # `or 0` covers both a missing key and an explicit JSON null, matching how
    # out_tokens() tolerates null fields; a bare default only covers the former.
    cached = r.get("cached_tokens") or 0
    return max(0, r["input_length"] - cached)


def out_tokens(r: dict) -> int:
    """Return the output-token count recorded for a request.

    ``actual_output_tokens`` is preferred; when it is missing or falsy the
    value of ``output_length`` is used instead, and 0 when neither is truthy.

    Args:
        r: One request record.

    Returns:
        The first truthy value among the two fields, else 0.
    """
    for field in ("actual_output_tokens", "output_length"):
        value = r.get(field)
        if value:
            return value
    return 0


def _tally_kvc(records: list[dict]) -> tuple[dict, dict, dict]:
    """Attribute each KVC request's work to the worker(s) that performed it.

    Returns three mappings keyed by worker id: requests touching the worker,
    uncached prefill tokens, and decode tokens.
    """
    req_count = defaultdict(int)
    prefill_tokens = defaultdict(int)   # uncached prefill compute
    decode_tokens = defaultdict(int)

    for r in records:
        decode_gpu = r["assigned_decode_node"]          # decode-0/1/2
        if r.get("execution_mode", "") == "kvcache-direct-to-d-session":
            # P is bypassed entirely; D does the append-prefill + decode.
            # The prefill-node field is deliberately not read on this path.
            prefill_gpu = decode_gpu
        else:
            # P does the full prefill; D handles decode.
            prefill_gpu = r["assigned_prefill_node"]    # prefill-0
            req_count[prefill_gpu] += 1
        req_count[decode_gpu] += 1                      # decode side always counts
        prefill_tokens[prefill_gpu] += uncached(r)
        decode_tokens[decode_gpu] += out_tokens(r)

    return req_count, prefill_tokens, decode_tokens


def _tally_dp(records: list[dict]) -> tuple[dict, dict, dict]:
    """Attribute each DP request's prefill and decode work to its one worker.

    Returns three mappings keyed by worker id: request count, uncached prefill
    tokens, and decode tokens.
    """
    req_count = defaultdict(int)
    prefill_tokens = defaultdict(int)
    decode_tokens = defaultdict(int)

    for r in records:
        worker = r["assigned_decode_node"]  # direct-0..3
        req_count[worker] += 1
        prefill_tokens[worker] += uncached(r)
        decode_tokens[worker] += out_tokens(r)

    return req_count, prefill_tokens, decode_tokens


def main() -> None:
    """Render the two-panel per-GPU utilization figure and print its numbers.

    Loads the KVC and DP metrics files, drops failed requests, tallies request
    counts and token work per worker, writes the figure to ``OUT`` and prints
    a per-GPU table to stdout.

    Raises:
        OSError: If a metrics file cannot be read or the figure cannot be
            written.
        KeyError: If a record lacks a field required for attribution.
    """
    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    kvc_req_count, kvc_prefill_tokens, kvc_decode_tokens = _tally_kvc(kvc)
    dp_req_count, dp_prefill_tokens, dp_decode_tokens = _tally_dp(dp)

    # ------------------------------------------------------------------
    # Build ordered GPU list, KVC then DP
    # ------------------------------------------------------------------
    kvc_gpus = ["prefill-0", "decode-0", "decode-1", "decode-2"]
    dp_gpus = ["direct-0", "direct-1", "direct-2", "direct-3"]

    # .get() (rather than indexing) keeps absent workers at 0 without
    # inserting them into the defaultdicts.
    counts = [kvc_req_count.get(g, 0) for g in kvc_gpus] + \
             [dp_req_count.get(g, 0) for g in dp_gpus]
    prefill_tk = [kvc_prefill_tokens.get(g, 0) for g in kvc_gpus] + \
                 [dp_prefill_tokens.get(g, 0) for g in dp_gpus]
    decode_tk = [kvc_decode_tokens.get(g, 0) for g in kvc_gpus] + \
                [dp_decode_tokens.get(g, 0) for g in dp_gpus]

    # Display labels: P/D role + worker id
    labels = [
        "KVC P\nprefill-0",
        "KVC D\ndecode-0",
        "KVC D\ndecode-1",
        "KVC D\ndecode-2",
        "DP P+D\ndirect-0",
        "DP P+D\ndirect-1",
        "DP P+D\ndirect-2",
        "DP P+D\ndirect-3",
    ]

    KVC_P_COLOR = "#E89D44"     # orange — P GPU stands out
    KVC_D_COLOR = "#1F77B4"     # blue
    DP_COLOR    = "#D62728"     # red

    bar_colors = [KVC_P_COLOR, KVC_D_COLOR, KVC_D_COLOR, KVC_D_COLOR,
                  DP_COLOR, DP_COLOR, DP_COLOR, DP_COLOR]

    fig, axes = plt.subplots(1, 2, figsize=(15, 7.0))
    x = np.arange(len(labels))

    p_idx = 0  # position of the KVC prefill GPU in every per-GPU list
    # Share of KVC requests that touched the prefill GPU; guarded so an empty
    # (or fully failed) KVC run does not divide by zero.
    p_share = counts[p_idx] / len(kvc) * 100 if kvc else 0.0

    # -- Left: per-GPU request count ----------------------------------
    ax = axes[0]
    # Fall back to 1 so an all-zero panel still gets a non-degenerate y-range.
    count_peak = max(counts) or 1
    ax.bar(x, counts, color=bar_colors, edgecolor="black", linewidth=0.6)
    for xi, c in zip(x, counts):
        ax.text(xi, c + count_peak * 0.015, f"{c:,}",
                ha="center", va="bottom", fontsize=9.5)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Number of requests touching this GPU", fontsize=11)
    # Headroom for the annotation: extend ylim 40% above tallest bar
    ax.set_ylim(0, count_peak * 1.40)
    ax.set_title("Per-GPU request count\n(naïve view: P seems idle)",
                 fontsize=12, pad=24)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # Annotate: KVC P GPU is "low frequency"
    # Place in upper-right area (over DP group) so it doesn't sit on KVC D bars
    ax.annotate(
        f"P GPU only sees\n"
        f"{counts[p_idx]:,} requests\n"
        f"({p_share:.1f}% of all KVC requests)",
        xy=(p_idx, counts[p_idx]),
        xytext=(2.4, count_peak * 1.20),
        fontsize=10, color=KVC_P_COLOR, fontweight="bold", ha="center",
        bbox=dict(facecolor="white", edgecolor=KVC_P_COLOR, alpha=0.92, pad=4),
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # -- Right: per-GPU compute work (stacked prefill + decode) -------
    ax = axes[1]
    prefill_M = [t / 1e6 for t in prefill_tk]
    decode_M = [t / 1e6 for t in decode_tk]
    total_M = [p + d for p, d in zip(prefill_M, decode_M)]
    work_peak = max(total_M) or 1.0

    ax.bar(x, prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, label="Uncached prefill tokens",
           alpha=0.95)
    ax.bar(x, decode_M, bottom=prefill_M, color=bar_colors,
           edgecolor="black", linewidth=0.6, hatch="///",
           label="Decode tokens", alpha=0.55)

    for xi, t in zip(x, total_M):
        ax.text(xi, t + work_peak * 0.015, f"{t:.2f}M",
                ha="center", va="bottom", fontsize=9.5)

    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=9.5)
    ax.set_ylabel("Compute tokens (millions)", fontsize=11)
    # Headroom for the annotation
    ax.set_ylim(0, work_peak * 1.45)
    ax.set_title("Per-GPU compute work\n(work view: P is comparable to each D)",
                 fontsize=12, pad=24)
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)
    # Legend placed at upper-left where bars are tallest is fine after raising ylim
    ax.legend(loc="upper left", fontsize=10, framealpha=0.95)

    # Annotate: KVC P GPU does similar work to each D.
    # Place over DP region (right side) so it doesn't sit on KVC D bars.
    ax.annotate(
        f"P GPU does {total_M[p_idx]:.2f}M tokens of prefill\n"
        f"— comparable per-GPU load to each KVC D worker\n"
        f"(KVC D avg = {np.mean(total_M[1:4]):.2f}M)",
        xy=(p_idx, total_M[p_idx]),
        xytext=(5.5, work_peak * 1.30),
        fontsize=10, color=KVC_P_COLOR, fontweight="bold", ha="center",
        bbox=dict(facecolor="white", edgecolor=KVC_P_COLOR, alpha=0.92, pad=4),
        arrowprops=dict(arrowstyle="->", color=KVC_P_COLOR, lw=1.0),
    )

    # Separator + group labels (placed in axes-fraction coords, below subplot
    # title at pad=24 we now have safe room for these at y_axes_frac ≈ 1.02)
    for panel in axes:
        panel.axvline(3.5, color="gray", linestyle="--", linewidth=1.0, alpha=0.5)
        panel.text(0.25, 1.02, "KVC 1P3D",
                   transform=panel.transAxes, ha="center", va="bottom",
                   fontsize=11.5, fontweight="bold", color="#444",
                   bbox=dict(facecolor="#F2F2F2", edgecolor="#888",
                             alpha=0.85, pad=3))
        panel.text(0.75, 1.02, "DP 4-way CA",
                   transform=panel.transAxes, ha="center", va="bottom",
                   fontsize=11.5, fontweight="bold", color="#444",
                   bbox=dict(facecolor="#F2F2F2", edgecolor="#888",
                             alpha=0.85, pad=3))

    # The percentage in the headline is derived from the data so it cannot
    # drift away from the figure when the input run changes.
    fig.suptitle(
        "Per-GPU utilization: \"is KVC's prefill GPU wasted?\"\n"
        f"Left view says yes (only {p_share:.0f}% of requests); "
        "right view says no (comparable work to each D).",
        fontsize=13, y=1.02,
    )
    fig.tight_layout()
    # Create the figures directory on first run instead of failing in savefig.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print numbers for doc reference
    # ------------------------------------------------------------------
    print("\n=== Per-GPU numbers ===")
    print(f"{'GPU':<22}  {'requests':>10}  {'prefill(M)':>12}  {'decode(M)':>12}  {'total(M)':>10}")
    for lbl, n, pM, dM in zip(labels, counts, prefill_M, decode_M):
        print(f"  {lbl.replace(chr(10), ' '):<20}  {n:>10}  {pM:>12.3f}  {dM:>12.3f}  {pM+dM:>10.3f}")


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()