Files
agentic-pd-hybrid/scripts/analysis/plot_cache_efficiency.py
kzlin 517677d7f2 docs(kvc): add GPU-utilization and cache-efficiency figures (rebut critic)
Two figures inserted into V2_DEEP_ANALYSIS §4.5 and §4.4 respectively, to
visually rebut the two critic-agent claims that we argued in prose were
design intent, not deficiencies.

(1) gpu_utilization.png  -- §4.5  "P GPU is wasted 90% of the time"
  Two-panel side-by-side:
    Left  (request count view, the naive reading): KVC P = 328 reqs (7.4%),
          KVC D = ~1450 each, DP = ~1100 each. P "looks idle."
    Right (compute work view, the honest reading): KVC P does 1.07M tokens
          of prefill, comparable to each KVC D worker's ~0.80M. P is a
          low-frequency high-cost safety net, not idle capacity.
  Bonus finding: KVC's total compute (3.47M tokens across 4 GPUs) is 33%
  LESS than DP's (5.17M). Same GPUs, less work done. That's the affinity
  win.

(2) cache_efficiency.png  -- §4.4  "Cache concentration is not policy win"
  Two-panel side-by-side. The setup: KVC has 27% LESS total KV pool
  (276K vs 351K tokens) yet caches MORE per request.
    Left  (cache hit rate vs turn number): KVC's session-affinity lets
          hit rate accumulate with turns; DP's hash + radix-LRU causes
          a mid-turn drift around turns 8-25 where KVC = 97.0% vs DP
          = 95.8% (1.24pp gap). Shows mechanism, not just outcome.
    Right (ECDF of per-request uncached tokens, log x): KVC's distribution
          concentrates near zero (50% < 187 tokens), DP's is spread
          (50% < 781 tokens). At a threshold of 500 uncached tokens, 74% of
          KVC requests fall below it, versus 31% of DP requests.
  → smaller pool, better retention, less per-request work. Direct empirical
  rebuttal to "fragmentation is architectural, not policy."

Bundled scripts (re-runnable):
- scripts/analysis/plot_gpu_utilization.py
- scripts/analysis/plot_cache_efficiency.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 18:04:49 +08:00

210 lines
9.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Cache efficiency comparison: KVC 1P3D v2 vs 4-way DP CA.
Generates docs/figures/cache_efficiency.png — two-panel:
left: cache hit rate vs turn number (mechanism: affinity vs LRU)
right: ECDF of per-request uncached tokens (per-request impact)
Resolves the apparent paradox: KVC has 27% less total KV pool capacity
(3 × 92K = 276K vs DP 4 × 87K = 351K) yet achieves higher cache hit rate
(98.1% vs 96.8%) and lower mean uncached tokens per request (560 vs 952).
The left panel shows the mechanism: KVC's session affinity makes cache hit
rate grow with turn count (more cache accumulates on the pinned D), while
DP's hash + radix-LRU causes cache hit rate to decay through the middle
turns (other sessions' KV competes via LRU eviction).
The right panel quantifies the impact: KVC's uncached tokens are
concentrated near 0 (mean 560), DP's are spread (mean 952).
Aborted / errored requests are excluded.
"""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# Directory two levels above the directory that contains this script;
# every other path below is resolved relative to it.
ROOT = Path(__file__).resolve().parents[2]
# Per-request metrics (one JSON object per line) for the KVC 1P3D v2 run.
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
# Per-request metrics (one JSON object per line) for the 4-way DP run.
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
# Where the rendered two-panel figure is written.
OUT = ROOT / "docs/figures/cache_efficiency.png"
def load(p: Path) -> list[dict]:
    """Read a JSON Lines file and return one decoded value per non-blank line.

    Args:
        p: Path of the ``.jsonl`` file to read.

    Returns:
        The decoded JSON values, in file order. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector; UTF-8 is pinned so decoding does
    # not depend on the machine's locale.
    with p.open(encoding="utf-8") as fh:
        # Whitespace-only lines (e.g. a stray blank line at EOF) carry no
        # record and would otherwise make json.loads raise.
        return [json.loads(line) for line in fh if line.strip()]
def is_failed(r: dict) -> bool:
    """Tell whether a metrics record describes an errored or aborted request.

    A record counts as failed when its ``"error"`` value is truthy, or when
    the string form of its ``"finish_reason"`` contains ``"abort"`` or
    ``"badrequest"`` (compared case-insensitively).

    Args:
        r: One decoded metrics record.

    Returns:
        ``True`` for a failed request, ``False`` otherwise.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        # Missing / empty finish reason: nothing marks the request as failed.
        return False

    # finish_reason is stringified first, so non-string values are matched
    # against their textual representation.
    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))
def main() -> None:
    """Render the two-panel cache-efficiency figure and print summary stats.

    Loads the KVC and DP per-request metrics, drops failed requests, then
    draws:

    * left panel: mean cache hit rate (``cached_tokens / input_length``) per
      ``turn_id`` with a p25-p75 band for each system;
    * right panel: ECDF of per-request uncached tokens on a log x-axis.

    The figure is saved to ``OUT`` and aggregate statistics are printed to
    stdout.

    Raises:
        ValueError: If either dataset has no successful requests.
        ZeroDivisionError: If a dataset's total ``input_length`` is zero.
    """
    kvc = [r for r in load(KVC) if not is_failed(r)]
    dp = [r for r in load(DP) if not is_failed(r)]

    # Fail early with a readable message. Without this, an empty dataset only
    # surfaces later as "cannot convert float NaN to integer" while building
    # the legend labels.
    for name, rows, src in (("KVC", kvc, KVC), ("DP", dp, DP)):
        if not rows:
            raise ValueError(f"no successful requests for {name} in {src}")

    # Aggregate totals, computed once and shared by the legend and the
    # printed summary so the two can never disagree.
    kvc_input = sum(r["input_length"] for r in kvc)
    kvc_cached = sum(r.get("cached_tokens", 0) for r in kvc)
    dp_input = sum(r["input_length"] for r in dp)
    dp_cached = sum(r.get("cached_tokens", 0) for r in dp)
    kvc_hit = kvc_cached / kvc_input
    dp_hit = dp_cached / dp_input

    KVC_COLOR = "#1F77B4"
    DP_COLOR = "#D62728"

    fig, axes = plt.subplots(1, 2, figsize=(15, 6.5))

    # ------------------------------------------------------------------
    # Left panel: cache hit rate per turn
    # Bin requests by turn_id, plot mean hit rate per bin with shaded band
    # ------------------------------------------------------------------
    def bin_by_turn(rows: list[dict]) -> tuple[list[int], list[float], list[float], list[float]]:
        """Group per-request hit rates by ``turn_id``.

        Returns the sorted turn ids and, aligned with them, the mean, 25th
        percentile and 75th percentile of the hit rate in each turn.
        Requests with ``input_length == 0`` are skipped (hit rate undefined).
        """
        per_turn: defaultdict[int, list[float]] = defaultdict(list)
        for r in rows:
            if r["input_length"] == 0:
                continue
            hit = r.get("cached_tokens", 0) / r["input_length"]
            per_turn[r["turn_id"]].append(hit)
        turns = sorted(per_turn)
        means, p25s, p75s = [], [], []
        for t in turns:
            arr = np.array(per_turn[t])
            means.append(float(np.mean(arr)))
            p25s.append(float(np.quantile(arr, 0.25)))
            p75s.append(float(np.quantile(arr, 0.75)))
        return turns, means, p25s, p75s

    kvc_t, kvc_m, kvc_lo, kvc_hi = bin_by_turn(kvc)
    dp_t, dp_m, dp_lo, dp_hi = bin_by_turn(dp)

    # Cap x-axis: tails get noisy below ~5 samples per bin
    max_turn = 100

    ax = axes[0]
    ax.plot(kvc_t, kvc_m, color=KVC_COLOR, lw=2.5,
            label=f"KVC 1P3D v2 (overall hit {kvc_hit*100:.1f}%)")
    ax.fill_between(kvc_t, kvc_lo, kvc_hi, color=KVC_COLOR, alpha=0.18,
                    label="KVC IQR (p25-p75)")
    ax.plot(dp_t, dp_m, color=DP_COLOR, lw=2.5,
            label=f"4-way DP CA (overall hit {dp_hit*100:.1f}%)")
    ax.fill_between(dp_t, dp_lo, dp_hi, color=DP_COLOR, alpha=0.18,
                    label="DP IQR (p25-p75)")

    # Annotate the mid-turn drift gap. Both bounds are inclusive so the
    # averaged turns match the shaded span and the "8-25" wording below
    # (the upper bound was previously dropped by range(8, 25)).
    drift_lo, drift_hi = 8, 25
    drift_turns = range(drift_lo, drift_hi + 1)
    drift_kvc = np.mean([m for t, m in zip(kvc_t, kvc_m) if t in drift_turns])
    drift_dp = np.mean([m for t, m in zip(dp_t, dp_m) if t in drift_turns])
    ax.axvspan(drift_lo, drift_hi, color="#999", alpha=0.08, label="_nolegend_")
    ax.text(16, 0.65,
            f"Mid-turn region\n(turns {drift_lo}-{drift_hi}):\nKVC {drift_kvc*100:.1f}% | DP {drift_dp*100:.1f}%\nGap {(drift_kvc-drift_dp)*100:+.1f} pp",
            ha="center", va="center", fontsize=9.5,
            bbox=dict(facecolor="white", edgecolor="gray", alpha=0.92, pad=4))

    ax.set_xlim(1, max_turn)
    ax.set_ylim(0.4, 1.02)
    ax.set_xlabel("Turn number within session", fontsize=11)
    ax.set_ylabel("Per-request cache hit rate (cached / input_length)", fontsize=11)
    ax.set_title("Cache hit rate vs turn number\n(mechanism: session affinity vs hash-LRU)",
                 fontsize=12, pad=10)
    ax.legend(loc="lower right", fontsize=9.5, framealpha=0.95)
    ax.grid(True, linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # ------------------------------------------------------------------
    # Right panel: ECDF of per-request uncached tokens (log x)
    # ------------------------------------------------------------------
    def ecdf(rows: list[dict]) -> tuple[np.ndarray, np.ndarray]:
        """Return sorted uncached-token counts and their cumulative fractions.

        Values are clamped to a minimum of 1 so fully cached requests stay
        drawable on the log-scaled x-axis. Note that the means and quantiles
        derived from the returned array therefore use the clamped values.
        """
        vals = np.array([
            max(1, r["input_length"] - r.get("cached_tokens", 0))
            for r in rows
        ])
        vals = np.sort(vals)
        return vals, np.arange(1, len(vals) + 1) / len(vals)

    kvc_x, kvc_y = ecdf(kvc)
    dp_x, dp_y = ecdf(dp)

    ax = axes[1]
    ax.plot(kvc_x, kvc_y, color=KVC_COLOR, lw=2.5,
            label=f"KVC 1P3D v2 (mean {int(np.mean(kvc_x))} tokens)")
    ax.plot(dp_x, dp_y, color=DP_COLOR, lw=2.5,
            label=f"4-way DP CA (mean {int(np.mean(dp_x))} tokens)")

    # Median markers
    kvc_p50 = np.quantile(kvc_x, 0.50)
    dp_p50 = np.quantile(dp_x, 0.50)
    ax.axhline(0.5, color="gray", linestyle=":", alpha=0.5)
    ax.text(1.2, 0.52, "median (50% of requests below this)",
            fontsize=8.5, color="gray", style="italic")
    ax.axvline(kvc_p50, color=KVC_COLOR, ls="--", alpha=0.5, lw=1.0)
    ax.axvline(dp_p50, color=DP_COLOR, ls="--", alpha=0.5, lw=1.0)
    ax.text(kvc_p50, 0.06, f"KVC\nmedian\n{int(kvc_p50)}",
            color=KVC_COLOR, fontsize=9, ha="center", va="bottom",
            bbox=dict(facecolor="white", edgecolor="none", alpha=0.75, pad=1))
    ax.text(dp_p50, 0.06, f"DP\nmedian\n{int(dp_p50)}",
            color=DP_COLOR, fontsize=9, ha="center", va="bottom",
            bbox=dict(facecolor="white", edgecolor="none", alpha=0.75, pad=1))

    # Annotate the separation: at uncached = 500 tokens, what fraction below?
    sep_x = 500
    kvc_at_sep = (kvc_x <= sep_x).mean()
    dp_at_sep = (dp_x <= sep_x).mean()
    ax.axvline(sep_x, color="#666", linestyle=":", alpha=0.6, lw=1.0)
    ax.annotate(
        f"At uncached = {sep_x} tokens:\n"
        f"KVC {kvc_at_sep*100:.0f}% of requests below\n"
        f"DP {dp_at_sep*100:.0f}% of requests below",
        xy=(sep_x, dp_at_sep),
        xytext=(2500, 0.35),
        fontsize=9.5,
        bbox=dict(facecolor="white", edgecolor="gray", alpha=0.92, pad=4),
        arrowprops=dict(arrowstyle="->", color="#666", lw=0.8),
    )

    ax.set_xscale("log")
    ax.set_xlim(1, 1e5)
    ax.set_xticks([1, 10, 100, 1000, 10000, 100000])
    ax.set_xticklabels(["1", "10", "100", "1K", "10K", "100K"])
    ax.set_ylim(0, 1.02)
    ax.set_xlabel("Uncached tokens per request (log scale)", fontsize=11)
    ax.set_ylabel("Cumulative fraction of requests", fontsize=11)
    ax.set_title("ECDF of uncached tokens per request\n(impact: KVC concentrates near zero)",
                 fontsize=12, pad=10)
    ax.legend(loc="lower right", fontsize=10, framealpha=0.95)
    ax.grid(True, which="both", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    fig.suptitle(
        "Cache efficiency paradox: KVC has 27% LESS total KV pool (276K vs 351K tokens) yet caches MORE per request.\n"
        "Left: session-affinity lets KVC's cache accumulate with turns; DP's hash-LRU loses cache to cross-session competition.\n"
        "Right: net effect — KVC's uncached compute is concentrated near zero, DP's is spread over 100-10K tokens.",
        fontsize=11.5, y=1.05,
    )
    fig.tight_layout()
    # A fresh checkout may not have the figures directory yet.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    # bbox_inches="tight" keeps the suptitle, which sits above the axes
    # area (y=1.05), inside the saved image.
    fig.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print summary for doc reference
    # ------------------------------------------------------------------
    print("\n=== Cache efficiency stats ===")
    print(f"KVC v2: total_input={kvc_input/1e6:.1f}M tokens")
    print(f" total_cached={kvc_cached/1e6:.1f}M tokens")
    print(f" hit rate {kvc_hit*100:.2f}%")
    print(f" mean uncached {np.mean(kvc_x):.0f} p50 {kvc_p50:.0f} p90 {np.quantile(kvc_x, 0.9):.0f}")
    print(f"\nDP 4w: total_input={dp_input/1e6:.1f}M tokens")
    print(f" total_cached={dp_cached/1e6:.1f}M tokens")
    print(f" hit rate {dp_hit*100:.2f}%")
    print(f" mean uncached {np.mean(dp_x):.0f} p50 {dp_p50:.0f} p90 {np.quantile(dp_x, 0.9):.0f}")
    print(f"\nMid-turn region ({drift_lo}-{drift_hi}): KVC {drift_kvc*100:.2f}% DP {drift_dp*100:.2f}% (gap {(drift_kvc-drift_dp)*100:+.2f}pp)")
# Run the plotting pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()