agentic-pd-hybrid/scripts/analysis/plot_cache_efficiency.py

#!/usr/bin/env python3
"""Cache efficiency comparison: KVC 1P3D v2 vs 4-way DP CA.

Generates docs/figures/cache_efficiency.png — two-panel:
  left:  cache hit rate vs turn number   (mechanism: affinity vs LRU)
  right: ECDF of per-request uncached tokens  (per-request impact)

Resolves the apparent paradox: KVC has 27% less total KV pool capacity
(3 × 92K = 276K  vs  DP 4 × 87K = 351K) yet achieves higher cache hit rate
(98.1% vs 96.8%) and lower mean uncached tokens per request (560 vs 952).

The left panel shows the mechanism: KVC's session affinity makes cache hit
rate grow with turn count (more cache accumulates on the pinned D), while
DP's hash + radix-LRU causes cache hit rate to decay through the middle
turns (other sessions' KV competes via LRU eviction).

The right panel quantifies the impact: KVC's uncached tokens are
concentrated near 0 (mean 560), DP's are spread (mean 952).

Aborted / errored requests are excluded.
"""

from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

ROOT = Path(__file__).resolve().parents[2]
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
OUT = ROOT / "docs/figures/cache_efficiency.png"


def load(p: Path) -> list[dict]:
    return [json.loads(line) for line in p.open()]


def is_failed(r: dict) -> bool:
    if r.get("error"):
        return True
    fr = r.get("finish_reason")
    if fr and ("abort" in str(fr).lower() or "badrequest" in str(fr).lower()):
        return True
    return False


def main() -> None:
    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    KVC_COLOR = "#1F77B4"
    DP_COLOR = "#D62728"

    fig, axes = plt.subplots(1, 2, figsize=(15, 6.5))

    # ------------------------------------------------------------------
    # Left panel: cache hit rate per turn
    # Bin requests by turn_id, plot mean hit rate per bin with shaded band
    # ------------------------------------------------------------------
    def bin_by_turn(rows: list[dict]) -> tuple[list[int], list[float], list[float], list[float]]:
        per_turn: defaultdict[int, list[float]] = defaultdict(list)
        for r in rows:
            if r["input_length"] == 0:
                continue
            hit = r.get("cached_tokens", 0) / r["input_length"]
            per_turn[r["turn_id"]].append(hit)
        turns = sorted(per_turn.keys())
        means, p25s, p75s = [], [], []
        for t in turns:
            arr = np.array(per_turn[t])
            means.append(float(np.mean(arr)))
            p25s.append(float(np.quantile(arr, 0.25)))
            p75s.append(float(np.quantile(arr, 0.75)))
        return turns, means, p25s, p75s

    kvc_t, kvc_m, kvc_lo, kvc_hi = bin_by_turn(kvc)
    dp_t, dp_m, dp_lo, dp_hi = bin_by_turn(dp)

    # Cap x-axis: tails get noisy below ~5 samples per bin
    max_turn = 100

    ax = axes[0]
    ax.plot(kvc_t, kvc_m, color=KVC_COLOR, lw=2.5,
            label=f"KVC 1P3D v2  (overall hit 98.1%)")
    ax.fill_between(kvc_t, kvc_lo, kvc_hi, color=KVC_COLOR, alpha=0.18,
                    label="KVC IQR (p25-p75)")
    ax.plot(dp_t, dp_m, color=DP_COLOR, lw=2.5,
            label=f"4-way DP CA  (overall hit 96.8%)")
    ax.fill_between(dp_t, dp_lo, dp_hi, color=DP_COLOR, alpha=0.18,
                    label="DP IQR (p25-p75)")

    # Annotate the mid-turn drift gap
    drift_turns = list(range(8, 25))
    drift_kvc = np.mean([m for t, m in zip(kvc_t, kvc_m) if t in drift_turns])
    drift_dp = np.mean([m for t, m in zip(dp_t, dp_m) if t in drift_turns])
    ax.axvspan(8, 25, color="#999", alpha=0.08, label="_nolegend_")
    ax.text(16, 0.65,
            f"Mid-turn region\n(turns 8-25):\nKVC {drift_kvc*100:.1f}%  |  DP {drift_dp*100:.1f}%\nGap {(drift_kvc-drift_dp)*100:+.1f} pp",
            ha="center", va="center", fontsize=9.5,
            bbox=dict(facecolor="white", edgecolor="gray", alpha=0.92, pad=4))

    ax.set_xlim(1, max_turn)
    ax.set_ylim(0.4, 1.02)
    ax.set_xlabel("Turn number within session", fontsize=11)
    ax.set_ylabel("Per-request cache hit rate (cached / input_length)", fontsize=11)
    ax.set_title("Cache hit rate vs turn number\n(mechanism: session affinity vs hash-LRU)",
                 fontsize=12, pad=10)
    ax.legend(loc="lower right", fontsize=9.5, framealpha=0.95)
    ax.grid(True, linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # ------------------------------------------------------------------
    # Right panel: ECDF of per-request uncached tokens (log x)
    # ------------------------------------------------------------------
    def ecdf(rows: list[dict]) -> tuple[np.ndarray, np.ndarray]:
        vals = np.array([
            max(1, r["input_length"] - r.get("cached_tokens", 0))
            for r in rows
        ])
        vals = np.sort(vals)
        return vals, np.arange(1, len(vals) + 1) / len(vals)

    kvc_x, kvc_y = ecdf(kvc)
    dp_x, dp_y = ecdf(dp)

    ax = axes[1]
    ax.plot(kvc_x, kvc_y, color=KVC_COLOR, lw=2.5,
            label=f"KVC 1P3D v2  (mean {int(np.mean(kvc_x))} tokens)")
    ax.plot(dp_x, dp_y, color=DP_COLOR, lw=2.5,
            label=f"4-way DP CA  (mean {int(np.mean(dp_x))} tokens)")

    # Median markers
    kvc_p50 = np.quantile(kvc_x, 0.50)
    dp_p50 = np.quantile(dp_x, 0.50)
    ax.axhline(0.5, color="gray", linestyle=":", alpha=0.5)
    ax.text(1.2, 0.52, "median (50% of requests below this)",
            fontsize=8.5, color="gray", style="italic")
    ax.axvline(kvc_p50, color=KVC_COLOR, ls="--", alpha=0.5, lw=1.0)
    ax.axvline(dp_p50, color=DP_COLOR, ls="--", alpha=0.5, lw=1.0)
    ax.text(kvc_p50, 0.06, f"KVC\nmedian\n{int(kvc_p50)}",
            color=KVC_COLOR, fontsize=9, ha="center", va="bottom",
            bbox=dict(facecolor="white", edgecolor="none", alpha=0.75, pad=1))
    ax.text(dp_p50, 0.06, f"DP\nmedian\n{int(dp_p50)}",
            color=DP_COLOR, fontsize=9, ha="center", va="bottom",
            bbox=dict(facecolor="white", edgecolor="none", alpha=0.75, pad=1))

    # Annotate the separation: at uncached = 500 tokens, what fraction below?
    sep_x = 500
    kvc_at_sep = (kvc_x <= sep_x).mean()
    dp_at_sep = (dp_x <= sep_x).mean()
    ax.axvline(sep_x, color="#666", linestyle=":", alpha=0.6, lw=1.0)
    ax.annotate(
        f"At uncached = {sep_x} tokens:\n"
        f"KVC {kvc_at_sep*100:.0f}% of requests below\n"
        f"DP  {dp_at_sep*100:.0f}% of requests below",
        xy=(sep_x, dp_at_sep),
        xytext=(2500, 0.35),
        fontsize=9.5,
        bbox=dict(facecolor="white", edgecolor="gray", alpha=0.92, pad=4),
        arrowprops=dict(arrowstyle="->", color="#666", lw=0.8),
    )

    ax.set_xscale("log")
    ax.set_xlim(1, 1e5)
    ax.set_xticks([1, 10, 100, 1000, 10000, 100000])
    ax.set_xticklabels(["1", "10", "100", "1K", "10K", "100K"])
    ax.set_ylim(0, 1.02)
    ax.set_xlabel("Uncached tokens per request  (log scale)", fontsize=11)
    ax.set_ylabel("Cumulative fraction of requests", fontsize=11)
    ax.set_title("ECDF of uncached tokens per request\n(impact: KVC concentrates near zero)",
                 fontsize=12, pad=10)
    ax.legend(loc="lower right", fontsize=10, framealpha=0.95)
    ax.grid(True, which="both", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    fig.suptitle(
        "Cache efficiency paradox:  KVC has 27% LESS total KV pool (276K vs 351K tokens) yet caches MORE per request.\n"
        "Left: session-affinity lets KVC's cache accumulate with turns; DP's hash-LRU loses cache to cross-session competition.\n"
        "Right: net effect — KVC's uncached compute is concentrated near zero, DP's is spread over 100-10K tokens.",
        fontsize=11.5, y=1.05,
    )
    plt.tight_layout()
    plt.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print summary for doc reference
    # ------------------------------------------------------------------
    print("\n=== Cache efficiency stats ===")
    print(f"KVC v2:  total_input={sum(r['input_length'] for r in kvc)/1e6:.1f}M tokens")
    print(f"         total_cached={sum(r.get('cached_tokens',0) for r in kvc)/1e6:.1f}M tokens")
    print(f"         hit rate {sum(r.get('cached_tokens',0) for r in kvc)/sum(r['input_length'] for r in kvc)*100:.2f}%")
    print(f"         mean uncached {np.mean(kvc_x):.0f}  p50 {kvc_p50:.0f}  p90 {np.quantile(kvc_x, 0.9):.0f}")

    print(f"\nDP 4w:   total_input={sum(r['input_length'] for r in dp)/1e6:.1f}M tokens")
    print(f"         total_cached={sum(r.get('cached_tokens',0) for r in dp)/1e6:.1f}M tokens")
    print(f"         hit rate {sum(r.get('cached_tokens',0) for r in dp)/sum(r['input_length'] for r in dp)*100:.2f}%")
    print(f"         mean uncached {np.mean(dp_x):.0f}  p50 {dp_p50:.0f}  p90 {np.quantile(dp_x, 0.9):.0f}")

    print(f"\nMid-turn region (8-25): KVC {drift_kvc*100:.2f}%  DP {drift_dp*100:.2f}%  (gap {(drift_kvc-drift_dp)*100:+.2f}pp)")


if __name__ == "__main__":
    main()