Two figures inserted into V2_DEEP_ANALYSIS §4.5 and §4.4 respectively, to
visually rebut the two critic-agent claims that we argued in prose were
design intent, not deficiencies.
(1) gpu_utilization.png -- §4.5 "P GPU is wasted 90% of the time"
Two-panel side-by-side:
Left (request count view, the naive reading): KVC P = 328 reqs (7.4%),
KVC D = ~1450 each, DP = ~1100 each. P "looks idle."
Right (compute work view, the honest reading): KVC P does 1.07M tokens
of prefill, comparable to each KVC D worker's ~0.80M. P is a
low-frequency high-cost safety net, not idle capacity.
Bonus finding: KVC's total compute (3.47M tokens across 4 GPUs) is 33%
LESS than DP's (5.17M). Same GPUs, less work done. That's the affinity
win.
(2) cache_efficiency.png -- §4.4 "Cache concentration is not policy win"
Two-panel side-by-side. The setup: KVC has 27% LESS total KV pool
(276K vs 351K tokens) yet caches MORE per request.
Left (cache hit rate vs turn number): KVC's session-affinity lets
hit rate accumulate with turns; DP's hash + radix-LRU causes
a mid-turn drift around turns 8-25 where KVC = 97.0% vs DP
= 95.8% (1.24pp gap). Shows mechanism, not just outcome.
Right (ECDF of per-request uncached tokens, log x): KVC's distribution
concentrates near zero (50% < 187 tokens), DP's is spread
(50% < 781 tokens). At uncached = 500 tokens threshold, KVC
has 74% of requests below, DP has 31%.
→ smaller pool, better retention, less per-request work. Direct empirical
rebuttal to "fragmentation is architectural, not policy."
Bundled scripts (rerunable):
- scripts/analysis/plot_gpu_utilization.py
- scripts/analysis/plot_cache_efficiency.py
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
210 lines
9.0 KiB
Python
210 lines
9.0 KiB
Python
#!/usr/bin/env python3
|
||
"""Cache efficiency comparison: KVC 1P3D v2 vs 4-way DP CA.
|
||
|
||
Generates docs/figures/cache_efficiency.png — two-panel:
|
||
left: cache hit rate vs turn number (mechanism: affinity vs LRU)
|
||
right: ECDF of per-request uncached tokens (per-request impact)
|
||
|
||
Resolves the apparent paradox: KVC has 27% less total KV pool capacity
|
||
(3 × 92K = 276K vs DP 4 × 87K = 351K) yet achieves higher cache hit rate
|
||
(98.1% vs 96.8%) and lower mean uncached tokens per request (560 vs 952).
|
||
|
||
The left panel shows the mechanism: KVC's session affinity makes cache hit
|
||
rate grow with turn count (more cache accumulates on the pinned D), while
|
||
DP's hash + radix-LRU causes cache hit rate to decay through the middle
|
||
turns (other sessions' KV competes via LRU eviction).
|
||
|
||
The right panel quantifies the impact: KVC's uncached tokens are
|
||
concentrated near 0 (mean 560), DP's are spread (mean 952).
|
||
|
||
Aborted / errored requests are excluded.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
|
||
ROOT = Path(__file__).resolve().parents[2]
|
||
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
|
||
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
|
||
OUT = ROOT / "docs/figures/cache_efficiency.png"
|
||
|
||
|
||
def load(p: Path) -> list[dict]:
|
||
return [json.loads(line) for line in p.open()]
|
||
|
||
|
||
def is_failed(r: dict) -> bool:
|
||
if r.get("error"):
|
||
return True
|
||
fr = r.get("finish_reason")
|
||
if fr and ("abort" in str(fr).lower() or "badrequest" in str(fr).lower()):
|
||
return True
|
||
return False
|
||
|
||
|
||
def main() -> None:
|
||
kvc = [r for r in load(KVC) if not is_failed(r)]
|
||
dp = [r for r in load(DP) if not is_failed(r)]
|
||
|
||
KVC_COLOR = "#1F77B4"
|
||
DP_COLOR = "#D62728"
|
||
|
||
fig, axes = plt.subplots(1, 2, figsize=(15, 6.5))
|
||
|
||
# ------------------------------------------------------------------
|
||
# Left panel: cache hit rate per turn
|
||
# Bin requests by turn_id, plot mean hit rate per bin with shaded band
|
||
# ------------------------------------------------------------------
|
||
def bin_by_turn(rows: list[dict]) -> tuple[list[int], list[float], list[float], list[float]]:
|
||
per_turn: defaultdict[int, list[float]] = defaultdict(list)
|
||
for r in rows:
|
||
if r["input_length"] == 0:
|
||
continue
|
||
hit = r.get("cached_tokens", 0) / r["input_length"]
|
||
per_turn[r["turn_id"]].append(hit)
|
||
turns = sorted(per_turn.keys())
|
||
means, p25s, p75s = [], [], []
|
||
for t in turns:
|
||
arr = np.array(per_turn[t])
|
||
means.append(float(np.mean(arr)))
|
||
p25s.append(float(np.quantile(arr, 0.25)))
|
||
p75s.append(float(np.quantile(arr, 0.75)))
|
||
return turns, means, p25s, p75s
|
||
|
||
kvc_t, kvc_m, kvc_lo, kvc_hi = bin_by_turn(kvc)
|
||
dp_t, dp_m, dp_lo, dp_hi = bin_by_turn(dp)
|
||
|
||
# Cap x-axis: tails get noisy below ~5 samples per bin
|
||
max_turn = 100
|
||
|
||
ax = axes[0]
|
||
ax.plot(kvc_t, kvc_m, color=KVC_COLOR, lw=2.5,
|
||
label=f"KVC 1P3D v2 (overall hit 98.1%)")
|
||
ax.fill_between(kvc_t, kvc_lo, kvc_hi, color=KVC_COLOR, alpha=0.18,
|
||
label="KVC IQR (p25-p75)")
|
||
ax.plot(dp_t, dp_m, color=DP_COLOR, lw=2.5,
|
||
label=f"4-way DP CA (overall hit 96.8%)")
|
||
ax.fill_between(dp_t, dp_lo, dp_hi, color=DP_COLOR, alpha=0.18,
|
||
label="DP IQR (p25-p75)")
|
||
|
||
# Annotate the mid-turn drift gap
|
||
drift_turns = list(range(8, 25))
|
||
drift_kvc = np.mean([m for t, m in zip(kvc_t, kvc_m) if t in drift_turns])
|
||
drift_dp = np.mean([m for t, m in zip(dp_t, dp_m) if t in drift_turns])
|
||
ax.axvspan(8, 25, color="#999", alpha=0.08, label="_nolegend_")
|
||
ax.text(16, 0.65,
|
||
f"Mid-turn region\n(turns 8-25):\nKVC {drift_kvc*100:.1f}% | DP {drift_dp*100:.1f}%\nGap {(drift_kvc-drift_dp)*100:+.1f} pp",
|
||
ha="center", va="center", fontsize=9.5,
|
||
bbox=dict(facecolor="white", edgecolor="gray", alpha=0.92, pad=4))
|
||
|
||
ax.set_xlim(1, max_turn)
|
||
ax.set_ylim(0.4, 1.02)
|
||
ax.set_xlabel("Turn number within session", fontsize=11)
|
||
ax.set_ylabel("Per-request cache hit rate (cached / input_length)", fontsize=11)
|
||
ax.set_title("Cache hit rate vs turn number\n(mechanism: session affinity vs hash-LRU)",
|
||
fontsize=12, pad=10)
|
||
ax.legend(loc="lower right", fontsize=9.5, framealpha=0.95)
|
||
ax.grid(True, linestyle=":", alpha=0.4)
|
||
ax.set_axisbelow(True)
|
||
|
||
# ------------------------------------------------------------------
|
||
# Right panel: ECDF of per-request uncached tokens (log x)
|
||
# ------------------------------------------------------------------
|
||
def ecdf(rows: list[dict]) -> tuple[np.ndarray, np.ndarray]:
|
||
vals = np.array([
|
||
max(1, r["input_length"] - r.get("cached_tokens", 0))
|
||
for r in rows
|
||
])
|
||
vals = np.sort(vals)
|
||
return vals, np.arange(1, len(vals) + 1) / len(vals)
|
||
|
||
kvc_x, kvc_y = ecdf(kvc)
|
||
dp_x, dp_y = ecdf(dp)
|
||
|
||
ax = axes[1]
|
||
ax.plot(kvc_x, kvc_y, color=KVC_COLOR, lw=2.5,
|
||
label=f"KVC 1P3D v2 (mean {int(np.mean(kvc_x))} tokens)")
|
||
ax.plot(dp_x, dp_y, color=DP_COLOR, lw=2.5,
|
||
label=f"4-way DP CA (mean {int(np.mean(dp_x))} tokens)")
|
||
|
||
# Median markers
|
||
kvc_p50 = np.quantile(kvc_x, 0.50)
|
||
dp_p50 = np.quantile(dp_x, 0.50)
|
||
ax.axhline(0.5, color="gray", linestyle=":", alpha=0.5)
|
||
ax.text(1.2, 0.52, "median (50% of requests below this)",
|
||
fontsize=8.5, color="gray", style="italic")
|
||
ax.axvline(kvc_p50, color=KVC_COLOR, ls="--", alpha=0.5, lw=1.0)
|
||
ax.axvline(dp_p50, color=DP_COLOR, ls="--", alpha=0.5, lw=1.0)
|
||
ax.text(kvc_p50, 0.06, f"KVC\nmedian\n{int(kvc_p50)}",
|
||
color=KVC_COLOR, fontsize=9, ha="center", va="bottom",
|
||
bbox=dict(facecolor="white", edgecolor="none", alpha=0.75, pad=1))
|
||
ax.text(dp_p50, 0.06, f"DP\nmedian\n{int(dp_p50)}",
|
||
color=DP_COLOR, fontsize=9, ha="center", va="bottom",
|
||
bbox=dict(facecolor="white", edgecolor="none", alpha=0.75, pad=1))
|
||
|
||
# Annotate the separation: at uncached = 500 tokens, what fraction below?
|
||
sep_x = 500
|
||
kvc_at_sep = (kvc_x <= sep_x).mean()
|
||
dp_at_sep = (dp_x <= sep_x).mean()
|
||
ax.axvline(sep_x, color="#666", linestyle=":", alpha=0.6, lw=1.0)
|
||
ax.annotate(
|
||
f"At uncached = {sep_x} tokens:\n"
|
||
f"KVC {kvc_at_sep*100:.0f}% of requests below\n"
|
||
f"DP {dp_at_sep*100:.0f}% of requests below",
|
||
xy=(sep_x, dp_at_sep),
|
||
xytext=(2500, 0.35),
|
||
fontsize=9.5,
|
||
bbox=dict(facecolor="white", edgecolor="gray", alpha=0.92, pad=4),
|
||
arrowprops=dict(arrowstyle="->", color="#666", lw=0.8),
|
||
)
|
||
|
||
ax.set_xscale("log")
|
||
ax.set_xlim(1, 1e5)
|
||
ax.set_xticks([1, 10, 100, 1000, 10000, 100000])
|
||
ax.set_xticklabels(["1", "10", "100", "1K", "10K", "100K"])
|
||
ax.set_ylim(0, 1.02)
|
||
ax.set_xlabel("Uncached tokens per request (log scale)", fontsize=11)
|
||
ax.set_ylabel("Cumulative fraction of requests", fontsize=11)
|
||
ax.set_title("ECDF of uncached tokens per request\n(impact: KVC concentrates near zero)",
|
||
fontsize=12, pad=10)
|
||
ax.legend(loc="lower right", fontsize=10, framealpha=0.95)
|
||
ax.grid(True, which="both", linestyle=":", alpha=0.4)
|
||
ax.set_axisbelow(True)
|
||
|
||
fig.suptitle(
|
||
"Cache efficiency paradox: KVC has 27% LESS total KV pool (276K vs 351K tokens) yet caches MORE per request.\n"
|
||
"Left: session-affinity lets KVC's cache accumulate with turns; DP's hash-LRU loses cache to cross-session competition.\n"
|
||
"Right: net effect — KVC's uncached compute is concentrated near zero, DP's is spread over 100-10K tokens.",
|
||
fontsize=11.5, y=1.05,
|
||
)
|
||
plt.tight_layout()
|
||
plt.savefig(OUT, dpi=150, bbox_inches="tight")
|
||
print(f"wrote {OUT}")
|
||
plt.close(fig)
|
||
|
||
# ------------------------------------------------------------------
|
||
# Print summary for doc reference
|
||
# ------------------------------------------------------------------
|
||
print("\n=== Cache efficiency stats ===")
|
||
print(f"KVC v2: total_input={sum(r['input_length'] for r in kvc)/1e6:.1f}M tokens")
|
||
print(f" total_cached={sum(r.get('cached_tokens',0) for r in kvc)/1e6:.1f}M tokens")
|
||
print(f" hit rate {sum(r.get('cached_tokens',0) for r in kvc)/sum(r['input_length'] for r in kvc)*100:.2f}%")
|
||
print(f" mean uncached {np.mean(kvc_x):.0f} p50 {kvc_p50:.0f} p90 {np.quantile(kvc_x, 0.9):.0f}")
|
||
|
||
print(f"\nDP 4w: total_input={sum(r['input_length'] for r in dp)/1e6:.1f}M tokens")
|
||
print(f" total_cached={sum(r.get('cached_tokens',0) for r in dp)/1e6:.1f}M tokens")
|
||
print(f" hit rate {sum(r.get('cached_tokens',0) for r in dp)/sum(r['input_length'] for r in dp)*100:.2f}%")
|
||
print(f" mean uncached {np.mean(dp_x):.0f} p50 {dp_p50:.0f} p90 {np.quantile(dp_x, 0.9):.0f}")
|
||
|
||
print(f"\nMid-turn region (8-25): KVC {drift_kvc*100:.2f}% DP {drift_dp*100:.2f}% (gap {(drift_kvc-drift_dp)*100:+.2f}pp)")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|