Files
agentic-pd-hybrid/scripts/analysis/plot_e1_vs_e4.py
Claude Code Agent e9ad1c4bc7 feat(experiments): E4 vs E1 results + p99 attribution figures
Headline: KVC v2 + load-floor + RDMA beats naive PD-disagg on
mean/p50/p90 by 30-65% (TTFT p50 31s vs 88s, lat p50 37s vs 93s,
wall-clock 64 min vs 88 min). Loses p99 by ~8% (TTFT 224 vs 207).

Wrote 4 figures (docs/figures/):
  e1_vs_e4_ttft_pdf.png         — bimodal E4 fast-path peak vs E1 single peak
  e1_vs_e4_latency_cdf.png      — CDF + log-survival showing tail crossover
  e4_path_latency.png           — per-execution-mode latency breakdown
  e1_vs_e4_p99_attribution.png  — what makes up E4's p99 tail

P99 tail attribution (this is the key finding):
  E4 p99 tail (n=65, TTFT ≥ 179.9s):
    fast-path direct-to-d        0 % (0/65)
    reseed paths                 5 % (3/65)
    fallback paths              88 % (57/65)
      large-append-session-cap  43 %  ← biggest culprit
      no-d-capacity             17 %
      large-append              14 %

Implication: the D→P snapshot (designed to optimize the reseed slow
path) would touch ≤5% of the p99 tail even if it worked perfectly.
The real bottleneck is the *fallback chain* (admission retry +
seeded-router cold start), not reseed. Improving p99 requires work
on the fallback paths, not more D→P plumbing.

Full analysis: docs/E4_VS_E1_RESULTS_ZH.md
2026-05-13 12:23:11 +08:00

335 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Generate E1 (naive PD-disagg) vs E4 (KVC + load-floor + RDMA) comparison figures.
Outputs (under docs/figures/):
e1_vs_e4_ttft_pdf.png - TTFT distribution body + log-tail
e1_vs_e4_latency_cdf.png - E2E latency CDF
e4_path_latency.png - E4 per-execution-mode TTFT breakdown (mean / p50 / p99)
e1_vs_e4_p99_attribution.png - which execution modes make up E4's TTFT tail (TTFT >= E4 p95)
"""
from __future__ import annotations
import argparse
import json
from collections import Counter, defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# Directory two levels above the one containing this script (presumably the
# project root, given the script lives under scripts/analysis/ - confirm).
ROOT = Path(__file__).resolve().parents[2]
# Output directory for every figure written by this script.
FIG = ROOT / "docs/figures"
# NOTE: executed at import time, so merely importing this module creates the
# directory (and any missing parents) if it does not exist yet.
FIG.mkdir(parents=True, exist_ok=True)
# Colours used for the E1 and E4 series / markers in the comparison plots.
E1_COLOR = "#D62728" # red
E4_COLOR = "#1F77B4" # blue
def load(p: Path) -> list[dict]:
    """Read a JSON-Lines file and return its decoded records in file order.

    Args:
        p: Path to a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically, and the explicit
    # encoding keeps decoding independent of the platform's locale default.
    with p.open(encoding="utf-8") as fh:
        # Whitespace-only lines (e.g. a stray empty line at the end of the
        # file) are skipped rather than handed to json.loads, which rejects them.
        return [json.loads(line) for line in fh if line.strip()]
def is_failed(r: dict) -> bool:
    """Tell whether a request record should be excluded as failed.

    A record counts as failed when its ``error`` field is truthy, or when its
    ``finish_reason`` is truthy and its lower-cased string form contains
    ``abort`` or ``badrequest``.
    """
    if r.get("error"):
        return True
    reason = r.get("finish_reason")
    if not reason:
        return False
    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))
def pct(values, q):
    """Return the ``q``-quantile of ``values`` as a built-in ``float``.

    ``q`` is forwarded unchanged to ``numpy.quantile`` (so 0.5 is the median,
    0.99 the 99th percentile); the NumPy scalar result is converted to a
    plain Python float.
    """
    quantile = np.quantile(values, q)
    return float(quantile)
def _metric_array(records, key):
    """Collect the non-None values stored under ``key`` into a NumPy array."""
    return np.array([r[key] for r in records if r.get(key) is not None])


def _print_summary(metric, series_by_run):
    """Print mean / p50 / p90 / p99 / max of one metric per run, then a blank line."""
    for name, arr in series_by_run:
        print(f" {name} {metric} mean={arr.mean():.3f} p50={pct(arr,0.5):.3f} "
              f"p90={pct(arr,0.9):.3f} p99={pct(arr,0.99):.3f} max={arr.max():.3f}")
    print()


def main():
    """Load both metrics files, print summary statistics and write the four figures.

    Command-line arguments (both required):
        --e1-metrics: path to the E1 JSON-Lines metrics file.
        --e4-metrics: path to the E4 JSON-Lines metrics file.

    Records flagged by ``is_failed`` are dropped before any statistic is
    computed.

    Raises:
        SystemExit: If any TTFT or latency series is empty after filtering,
            because the statistics and density plots need at least one sample.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--e1-metrics", required=True)
    ap.add_argument("--e4-metrics", required=True)
    args = ap.parse_args()

    e1 = [r for r in load(Path(args.e1_metrics)) if not is_failed(r)]
    e4 = [r for r in load(Path(args.e4_metrics)) if not is_failed(r)]

    e1_ttft = _metric_array(e1, "ttft_s")
    e4_ttft = _metric_array(e4, "ttft_s")
    e1_lat = _metric_array(e1, "latency_s")
    e4_lat = _metric_array(e4, "latency_s")

    # TTFT values at or below 1e-4 s are dropped; the TTFT density plot takes
    # log10 of these arrays, which requires strictly positive values.
    e1_ttft = e1_ttft[e1_ttft > 1e-4]
    e4_ttft = e4_ttft[e4_ttft > 1e-4]

    print(f"E1 reqs={len(e1)} (after failed-filter) TTFT n={len(e1_ttft)} lat n={len(e1_lat)}")
    print(f"E4 reqs={len(e4)} (after failed-filter) TTFT n={len(e4_ttft)} lat n={len(e4_lat)}")
    print()

    # Fail with a readable message instead of NumPy's zero-size reduction
    # error further down; the counts printed above show which input is empty.
    series = (("E1 TTFT", e1_ttft), ("E4 TTFT", e4_ttft),
              ("E1 latency", e1_lat), ("E4 latency", e4_lat))
    empty = [label for label, arr in series if arr.size == 0]
    if empty:
        raise SystemExit("no usable samples after filtering for: " + ", ".join(empty))

    _print_summary("TTFT", [("E1", e1_ttft), ("E4", e4_ttft)])
    _print_summary("Lat", [("E1", e1_lat), ("E4", e4_lat)])

    # ----- Plot 1: TTFT distribution (body + log tail) ---------------------
    _plot_ttft_pdf(e1_ttft, e4_ttft)
    # ----- Plot 2: Latency CDF --------------------------------------------
    _plot_latency_cdf(e1_lat, e4_lat)
    # ----- Plot 3: E4 path-level breakdown ---------------------------------
    _plot_path_latency(e4)
    # ----- Plot 4: p99 attribution -----------------------------------------
    _plot_p99_attribution(e4, e1_ttft, e4_ttft)
def _plot_ttft_pdf(e1_ttft, e4_ttft):
    """Draw the two-panel TTFT density figure and save it as e1_vs_e4_ttft_pdf.png.

    Left panel: Gaussian KDE of TTFT on a linear axis limited to 0-60 s, with
    p50/p90 marker lines and p50 text labels.
    Right panel: Gaussian KDE of log10(TTFT) drawn on a log x axis from
    0.05 s to 500 s, with p50/p90/p99 marker lines and p99 callouts.

    Both inputs must be strictly positive because log10 is applied to them.
    """
    from scipy.stats import gaussian_kde

    fig, (body_ax, tail_ax) = plt.subplots(1, 2, figsize=(16, 6.5))

    # ---- Left panel: body of the distribution on a linear axis ------------
    grid = np.linspace(0, 60, 800)
    body_kde_e4 = gaussian_kde(e4_ttft, bw_method=0.15)
    body_kde_e1 = gaussian_kde(e1_ttft, bw_method=0.15)
    body_curves = (
        (body_kde_e4(grid), E4_COLOR, f"E4 KVC + load-floor + RDMA (n={len(e4_ttft)})"),
        (body_kde_e1(grid), E1_COLOR, f"E1 naive PD-disagg (n={len(e1_ttft)})"),
    )
    for density, color, label in body_curves:
        body_ax.plot(grid, density, color=color, lw=2.5, label=label)
        body_ax.fill_between(grid, density, alpha=0.2, color=color)

    # Percentiles are reused by the markers, labels and callouts below.
    levels = (0.5, 0.9, 0.99)
    e4_q = {q: pct(e4_ttft, q) for q in levels}
    e1_q = {q: pct(e1_ttft, q) for q in levels}

    for q, ls in ((0.5, "-"), (0.9, "--")):
        body_ax.axvline(e4_q[q], color=E4_COLOR, ls=ls, alpha=0.55, lw=1.1)
        body_ax.axvline(e1_q[q], color=E1_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # The two p50 labels sit at different heights so they do not collide.
    body_top = body_ax.get_ylim()[1]
    for run, median, height, color in (
        ("E4", e4_q[0.5], 0.95, E4_COLOR),
        ("E1", e1_q[0.5], 0.55, E1_COLOR),
    ):
        body_ax.text(median, body_top * height, f"{run} p50\n{median:.1f}s",
                     color=color, fontsize=9, va="top", ha="left",
                     bbox=dict(facecolor="white", edgecolor="none", alpha=0.8, pad=2))

    body_ax.set_xlim(0, 60)
    body_ax.set_xlabel("TTFT (seconds, linear)", fontsize=11)
    body_ax.set_ylabel("Probability density", fontsize=11)
    body_ax.set_title("Body of distribution (TTFT ≤ 60s)", fontsize=12, pad=10)
    body_ax.legend(loc="upper right", fontsize=10, framealpha=0.95)
    body_ax.grid(True, linestyle=":", alpha=0.4)

    # ---- Right panel: full range, density estimated in log10 space --------
    tail_kde_e4 = gaussian_kde(np.log10(e4_ttft), bw_method="scott")
    tail_kde_e1 = gaussian_kde(np.log10(e1_ttft), bw_method="scott")
    log_grid = np.linspace(np.log10(0.05), np.log10(500), 600)
    seconds = 10 ** log_grid
    y_e4 = tail_kde_e4(log_grid)
    y_e1 = tail_kde_e1(log_grid)
    tail_curves = (
        (y_e4, E4_COLOR, f"E4 KVC (n={len(e4_ttft)})"),
        (y_e1, E1_COLOR, f"E1 naive PD (n={len(e1_ttft)})"),
    )
    for density, color, label in tail_curves:
        tail_ax.plot(seconds, density, color=color, lw=2.5, label=label)
        tail_ax.fill_between(seconds, density, alpha=0.2, color=color)
    tail_ax.set_xscale("log")
    tail_ax.set_xlim(0.05, 500)

    for q, ls in ((0.5, "-"), (0.9, "--"), (0.99, ":")):
        tail_ax.axvline(e4_q[q], color=E4_COLOR, ls=ls, alpha=0.55, lw=1.1)
        tail_ax.axvline(e1_q[q], color=E1_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # Each callout arrow points at the density curve where it crosses p99.
    peak = max(y_e4.max(), y_e1.max())
    for run, p99, kde, color, height in (
        ("E4", e4_q[0.99], tail_kde_e4, E4_COLOR, 0.55),
        ("E1", e1_q[0.99], tail_kde_e1, E1_COLOR, 0.40),
    ):
        tail_ax.annotate(f"{run} p99 = {p99:.1f}s",
                         xy=(p99, kde(np.log10(p99))[0]),
                         xytext=(80, peak * height),
                         fontsize=10, color=color, fontweight="bold",
                         arrowprops=dict(arrowstyle="->", color=color, lw=1.0))

    tail_ax.set_xticks([0.1, 1, 10, 100])
    tail_ax.set_xticklabels(["100ms", "1s", "10s", "100s"])
    tail_ax.set_xlabel("TTFT (log scale)", fontsize=11)
    tail_ax.set_ylabel("Density (per log₁₀ s)", fontsize=11)
    tail_ax.set_title("Full range incl. p99 tail (log x)", fontsize=12, pad=10)
    tail_ax.legend(loc="upper left", fontsize=10, framealpha=0.95)
    tail_ax.grid(True, which="both", linestyle=":", alpha=0.4)

    fig.suptitle(
        "TTFT density: E4 KVC v2 + load-floor + RDMA vs E1 naive PD-disagg\n"
        "Inferact 50-session trace · ts=1 · 4× H200 · aborted requests excluded",
        fontsize=13, y=1.02,
    )
    fig.tight_layout()
    out = FIG / "e1_vs_e4_ttft_pdf.png"
    fig.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
def _plot_latency_cdf(e1_lat, e4_lat):
    """Draw the two-panel end-to-end latency figure and save it as e1_vs_e4_latency_cdf.png.

    Left panel: empirical CDF on a linear axis (0-300 s) with a text note for
    the p50 / p90 / p99 values of both runs.
    Right panel: empirical survival function, 1 - CDF, on log-log axes; samples
    are clipped to at least 0.01 s so they can be drawn on the log x axis.
    """
    fig, (cdf_ax, surv_ax) = plt.subplots(1, 2, figsize=(16, 6.5))

    # ---- Linear CDF --------------------------------------------------------
    cdf_runs = (
        (e4_lat, E4_COLOR, f"E4 KVC (n={len(e4_lat)})"),
        (e1_lat, E1_COLOR, f"E1 naive (n={len(e1_lat)})"),
    )
    for samples, color, label in cdf_runs:
        ordered = np.sort(samples)
        # Cumulative fraction k/n for the k-th sorted sample, starting at 0.
        fraction = np.linspace(0, 1, len(ordered), endpoint=False)
        cdf_ax.plot(ordered, fraction, color=color, lw=2.5, label=label)
    cdf_ax.set_xlim(0, 300)
    cdf_ax.set_xlabel("E2E latency (seconds)", fontsize=11)
    cdf_ax.set_ylabel("CDF", fontsize=11)
    cdf_ax.set_title("Full latency CDF (linear)", fontsize=12)
    cdf_ax.legend(loc="lower right", fontsize=10)
    cdf_ax.grid(True, linestyle=":", alpha=0.4)

    # ---- Percentile guide lines and value notes ---------------------------
    for q, mark in ((0.5, "p50"), (0.9, "p90"), (0.99, "p99")):
        e4v = pct(e4_lat, q)
        e1v = pct(e1_lat, q)
        cdf_ax.axhline(q, color="gray", ls=":", alpha=0.3)
        # Notes above the median go just below their guide line, the rest just above.
        if q > 0.5:
            note_y = q - 0.02
        else:
            note_y = q + 0.02
        cdf_ax.annotate(f"{mark}: E4 {e4v:.1f}s, E1 {e1v:.1f}s",
                        xy=(0, q), xytext=(220, note_y),
                        fontsize=9, color="black")

    # ---- Log-log survival function ----------------------------------------
    surv_runs = (
        (e4_lat, E4_COLOR, "E4 KVC"),
        (e1_lat, E1_COLOR, "E1 naive"),
    )
    for samples, color, label in surv_runs:
        ordered = np.sort(samples)
        clipped = np.maximum(ordered, 0.01)
        fraction = np.linspace(0, 1, len(ordered), endpoint=False)
        surv_ax.plot(clipped, 1 - fraction, color=color, lw=2.5, label=label)
    surv_ax.set_xscale("log")
    surv_ax.set_yscale("log")
    surv_ax.set_xlim(0.5, 500)
    surv_ax.set_ylim(1e-3, 1.1)
    surv_ax.set_xlabel("E2E latency (log s)", fontsize=11)
    surv_ax.set_ylabel("P(latency > x) (log)", fontsize=11)
    surv_ax.set_title("Survival function — log-log (highlights tail behavior)", fontsize=12)
    surv_ax.legend(loc="upper right", fontsize=10)
    surv_ax.grid(True, which="both", linestyle=":", alpha=0.4)

    fig.suptitle("E2E latency: E4 KVC vs E1 naive PD-disagg", fontsize=13, y=1.02)
    fig.tight_layout()
    out = FIG / "e1_vs_e4_latency_cdf.png"
    fig.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
def _plot_path_latency(e4, top_n=14):
    """Plot mean / p50 / p99 TTFT per ``execution_mode`` and save e4_path_latency.png.

    Args:
        e4: Iterable of E4 request records (dicts). Records whose ``ttft_s``
            is None are ignored; a missing or falsy ``execution_mode`` is
            grouped under "?". No minimum-TTFT filter is applied here.
        top_n: Number of most frequent execution modes to show (default 14).
    """
    # Group TTFT samples by execution mode. Only TTFT is plotted, so nothing
    # else is collected.
    ttft_by_mode = defaultdict(list)
    for r in e4:
        if r.get("ttft_s") is None:
            continue
        mode = r.get("execution_mode", "?") or "?"
        ttft_by_mode[mode].append(float(r["ttft_s"]))

    # Most frequent modes first; the sort is stable, so ties keep first-seen order.
    modes = sorted(ttft_by_mode, key=lambda m: -len(ttft_by_mode[m]))[:top_n]
    samples = [np.array(ttft_by_mode[m]) for m in modes]

    fig, ax = plt.subplots(1, 1, figsize=(14, 7))
    pos = np.arange(len(modes))
    means = [np.mean(s) for s in samples]
    p50 = [pct(s, 0.5) for s in samples]
    p99 = [pct(s, 0.99) for s in samples]
    counts = [len(s) for s in samples]

    # Three thin bars per mode, stacked vertically around the tick position.
    bar_h = 0.25
    ax.barh(pos - bar_h, means, bar_h, label="mean", color="#4a90e2", alpha=0.85)
    ax.barh(pos, p50, bar_h, label="p50", color="#66cc99", alpha=0.85)
    ax.barh(pos + bar_h, p99, bar_h, label="p99", color="#e74c3c", alpha=0.85)
    ax.set_yticks(pos)
    ax.set_yticklabels([f"{m} (n={c})" for m, c in zip(modes, counts)],
                       fontsize=9)
    ax.invert_yaxis()  # put the most frequent mode at the top
    ax.set_xlabel("TTFT (s)", fontsize=11)
    ax.set_title(f"E4 per execution_mode TTFT (sorted by count, top {top_n})",
                 fontsize=12, pad=10)
    ax.legend(loc="lower right", fontsize=10)
    ax.grid(True, linestyle=":", alpha=0.4)
    plt.tight_layout()
    out = FIG / "e4_path_latency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
def _plot_p99_attribution(e4, e1_ttft, e4_ttft):
    """Show which execution modes populate E4's TTFT tail; save e1_vs_e4_p99_attribution.png.

    The tail is every E4 record whose TTFT is at or above the 95th percentile
    of ``e4_ttft``. It therefore also contains the samples beyond p99, even
    though the panel titles call it the "p95-p99" region.

    Left panel: pie chart of tail request counts per execution mode (the ten
    most frequent modes, with the remainder grouped as "(other)").
    Right panel: mean tail TTFT per mode, with the E4 and E1 p99 values drawn
    as reference lines.

    Args:
        e4: Iterable of E4 request records (dicts with ``ttft_s`` and
            ``execution_mode``); records with ``ttft_s`` of None are ignored.
        e1_ttft: E1 TTFT samples, used only for the E1 p99 reference.
        e4_ttft: E4 TTFT samples, used for the p95 threshold and the E4 p99.
    """
    e4_p99 = pct(e4_ttft, 0.99)
    e1_p99 = pct(e1_ttft, 0.99)
    # Tail threshold: the 95th percentile of E4 TTFT.
    threshold = pct(e4_ttft, 0.95)

    # One pass collects the tail samples per mode. The pie counts and the bar
    # means are both derived from this mapping, so they always agree.
    tail_ttft_by_mode = defaultdict(list)
    for r in e4:
        ttft = r.get("ttft_s")
        # "not >=" (rather than "<") also keeps NaN values out of the tail.
        if ttft is None or not ttft >= threshold:
            continue
        mode = r.get("execution_mode", "?") or "?"
        tail_ttft_by_mode[mode].append(float(ttft))

    tail_total = sum(len(v) for v in tail_ttft_by_mode.values())
    # Most frequent tail modes first; the sort is stable, so ties keep first-seen order.
    top_modes = sorted(tail_ttft_by_mode, key=lambda m: -len(tail_ttft_by_mode[m]))[:10]
    counts = [len(tail_ttft_by_mode[m]) for m in top_modes]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6.5))

    # Pie of tail composition
    ax = axes[0]
    pie_labels = list(top_modes)
    pie_sizes = list(counts)
    rest = tail_total - sum(counts)
    if rest > 0:
        pie_labels.append("(other)")
        pie_sizes.append(rest)
    ax.pie(
        pie_sizes, labels=[f"{m}\n(n={c})" for m, c in zip(pie_labels, pie_sizes)],
        autopct="%1.0f%%", startangle=90, textprops={"fontsize": 9},
    )
    ax.set_title(f"E4 p95-p99 tail composition\n(TTFT ≥ {threshold:.1f}s, n={tail_total})",
                 fontsize=12, pad=12)

    # Bar of mean TTFT within tail per mode
    ax = axes[1]
    pos = np.arange(len(top_modes))
    means = [np.mean(tail_ttft_by_mode[m]) for m in top_modes]
    ax.barh(pos, means, color="#e74c3c", alpha=0.85)
    ax.set_yticks(pos)
    ax.set_yticklabels([f"{m} (n={c})" for m, c in zip(top_modes, counts)],
                       fontsize=9)
    ax.invert_yaxis()  # put the most frequent tail mode at the top
    ax.set_xlabel("Mean TTFT in p95-p99 region (s)", fontsize=11)
    ax.set_title("Per-mode mean TTFT among tail reqs", fontsize=12)
    ax.axvline(e4_p99, color=E4_COLOR, ls="--", alpha=0.6, label=f"E4 p99 = {e4_p99:.1f}s")
    ax.axvline(e1_p99, color=E1_COLOR, ls="--", alpha=0.6, label=f"E1 p99 = {e1_p99:.1f}s")
    ax.legend(loc="lower right", fontsize=10)
    ax.grid(True, linestyle=":", alpha=0.4)

    # Relative p99 difference of E4 versus E1. The sign comes from the format
    # spec and the verb follows it, so a run where E4 has the lower p99 is not
    # reported as a loss with a "+-" prefix.
    delta_pct = (e4_p99 / e1_p99 - 1) * 100
    verdict = "loses" if delta_pct >= 0 else "wins"
    fig.suptitle(
        f"E4 p99 tail attribution: which execution_modes produce the long tail?\n"
        f"E4 p99 = {e4_p99:.1f}s vs E1 p99 = {e1_p99:.1f}s "
        f"(KVC {verdict} tail by {delta_pct:+.1f}%)",
        fontsize=13, y=1.02,
    )
    plt.tight_layout()
    out = FIG / "e1_vs_e4_p99_attribution.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()