Headline: KVC v2 + load-floor + RDMA beats naive PD-disagg on
mean/p50/p90 by 30-65% (TTFT p50 31s vs 88s, lat p50 37s vs 93s,
wall-clock 64 min vs 88 min). Loses p99 by ~8% (TTFT 224 vs 207).
Wrote 4 figures (docs/figures/):
e1_vs_e4_ttft_pdf.png — bimodal E4 fast-path peak vs E1 single peak
e1_vs_e4_latency_cdf.png — CDF + log-survival showing tail crossover
e4_path_latency.png — per-execution-mode latency breakdown
e1_vs_e4_p99_attribution.png — what makes up E4's p99 tail
P99 tail attribution (this is the key finding):
E4 tail behind the p99 (n=65, TTFT ≥ 179.9s — the script cuts the tail at E4's p95, not at p99 itself):
fast-path direct-to-d 0 % (0/65)
reseed paths 5 % (3/65)
fallback paths 88 % (57/65)
large-append-session-cap 43 % ← biggest culprit
no-d-capacity 17 %
large-append 14 %
Implication: the D→P snapshot (designed to optimize the reseed slow path)
would touch ≤5% of the p99 tail even if it were fully working. The real
bottleneck is the *fallback chain* (admission retry + seeded-router
cold start), not reseed. Improving p99 requires work on the fallback
path, not more D→P plumbing.
Full analysis: docs/E4_VS_E1_RESULTS_ZH.md
335 lines
14 KiB
Python
335 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""Generate E1 (naive PD-disagg) vs E4 (KVC + load-floor + RDMA) comparison figures.
|
||
|
||
Outputs (under docs/figures/):
|
||
e1_vs_e4_ttft_pdf.png - TTFT distribution body + log-tail
|
||
e1_vs_e4_latency_cdf.png - E2E latency CDF
|
||
e4_path_latency.png - E4 per-execution-mode latency breakdown
|
||
e1_vs_e4_p99_attribution.png - which execution modes contribute to E4's p99 tail
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
import argparse
|
||
import json
|
||
from collections import Counter, defaultdict
|
||
from pathlib import Path
|
||
|
||
import matplotlib.pyplot as plt
|
||
import numpy as np
|
||
|
||
# Directory two levels above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[2]

# Every figure is saved here; the directory is created as soon as the
# module is imported so the plotting functions can write unconditionally.
FIG = ROOT.joinpath("docs/figures")
FIG.mkdir(parents=True, exist_ok=True)

# One fixed colour per experiment, shared by all figures.
E1_COLOR = "#D62728"  # red
E4_COLOR = "#1F77B4"  # blue
|
||
|
||
|
||
def load(p: Path) -> list[dict]:
    """Read a JSON-Lines file into a list of decoded records.

    Args:
        p: Path of a text file holding one JSON document per line.

    Returns:
        The decoded documents, in file order. Whitespace-only lines are
        skipped, so an empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically (the previous
    # bare ``p.open()`` left that to the garbage collector). Blank lines, such
    # as a trailing newline at EOF, are skipped instead of crashing the parser.
    with p.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]
|
||
|
||
|
||
def is_failed(r: dict) -> bool:
    """Tell whether a request record should be treated as failed.

    A record is failed when its ``"error"`` value is truthy, or when its
    ``"finish_reason"`` is truthy and its string form contains ``"abort"``
    or ``"badrequest"`` (case-insensitive).

    Args:
        r: One decoded metrics record.

    Returns:
        ``True`` for a failed record, ``False`` otherwise.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        return False

    # Lower-case once, then look for either failure marker as a substring.
    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))
|
||
|
||
|
||
def pct(values, q):
    """Return the ``q``-quantile of ``values`` as a built-in ``float``.

    Args:
        values: Array-like of numbers, passed straight to ``np.quantile``.
        q: Quantile as a fraction (``0.99`` means p99).

    Returns:
        The quantile converted from a NumPy scalar to a plain ``float``.
    """
    quantile = np.quantile(values, q)
    return float(quantile)
|
||
|
||
|
||
def _print_summary(metric, series):
    """Print mean/p50/p90/p99/max for each ``(name, samples)`` pair, then a blank line."""
    for name, arr in series:
        print(f" {name} {metric} mean={arr.mean():.3f} p50={pct(arr,0.5):.3f} "
              f"p90={pct(arr,0.9):.3f} p99={pct(arr,0.99):.3f} max={arr.max():.3f}")
    print()


def main():
    """Command-line entry point: load both metric files, print stats, draw all figures.

    Flags:
        --e1-metrics: path of the E1 JSON-Lines metrics file (required).
        --e4-metrics: path of the E4 JSON-Lines metrics file (required).

    Records rejected by ``is_failed`` are dropped first. TTFT and latency are
    then read from the ``"ttft_s"`` and ``"latency_s"`` keys, ignoring records
    where the key is missing or ``None``.

    Raises:
        SystemExit: Via ``argparse`` on bad arguments, or when one of the four
            sample sets is empty after filtering.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--e1-metrics", required=True)
    ap.add_argument("--e4-metrics", required=True)
    args = ap.parse_args()

    e1 = [r for r in load(Path(args.e1_metrics)) if not is_failed(r)]
    e4 = [r for r in load(Path(args.e4_metrics)) if not is_failed(r)]

    def column(records, key):
        # Collect one numeric field, skipping records that lack it.
        return np.array([r[key] for r in records if r.get(key) is not None])

    e1_ttft = column(e1, "ttft_s")
    e4_ttft = column(e4, "ttft_s")
    e1_lat = column(e1, "latency_s")
    e4_lat = column(e4, "latency_s")

    # Keep only TTFT values above 1e-4: the TTFT figure takes log10 of these
    # samples, so they must be strictly positive.
    e1_ttft = e1_ttft[e1_ttft > 1e-4]
    e4_ttft = e4_ttft[e4_ttft > 1e-4]

    print(f"E1 reqs={len(e1)} (after failed-filter) TTFT n={len(e1_ttft)} lat n={len(e1_lat)}")
    print(f"E4 reqs={len(e4)} (after failed-filter) TTFT n={len(e4_ttft)} lat n={len(e4_lat)}")
    print()

    # Fail with a readable message instead of the bare ValueError that
    # ``arr.max()`` raises on an empty array further down.
    for label, arr in [("E1 TTFT", e1_ttft), ("E4 TTFT", e4_ttft),
                       ("E1 latency", e1_lat), ("E4 latency", e4_lat)]:
        if arr.size == 0:
            ap.error(f"no usable {label} samples after filtering; nothing to plot")

    _print_summary("TTFT", [("E1", e1_ttft), ("E4", e4_ttft)])
    _print_summary("Lat", [("E1", e1_lat), ("E4", e4_lat)])

    # ----- Plot 1: TTFT distribution (body + log tail) ---------------------
    _plot_ttft_pdf(e1_ttft, e4_ttft)

    # ----- Plot 2: Latency CDF --------------------------------------------
    _plot_latency_cdf(e1_lat, e4_lat)

    # ----- Plot 3: E4 path-level breakdown ---------------------------------
    _plot_path_latency(e4)

    # ----- Plot 4: p99 attribution -----------------------------------------
    _plot_p99_attribution(e4, e1_ttft, e4_ttft)
|
||
|
||
|
||
def _plot_ttft_pdf(e1_ttft, e4_ttft):
    """Draw the two-panel TTFT density figure and save it under ``FIG``.

    Left panel: Gaussian KDE on a linear axis over 0-60 with p50/p90 marker
    lines and p50 labels. Right panel: KDE of ``log10(TTFT)`` drawn on a log
    x-axis (0.05-500) with p50/p90/p99 marker lines and p99 call-outs.

    Args:
        e1_ttft: E1 TTFT samples; must be strictly positive because
            ``log10`` is applied.
        e4_ttft: E4 TTFT samples, same requirement.
    """
    # Function-scope import: scipy is only loaded when this figure is drawn.
    from scipy.stats import gaussian_kde

    fig, (body_ax, tail_ax) = plt.subplots(1, 2, figsize=(16, 6.5))

    # ---- Left panel: body of the distribution, linear x in [0, 60] --------
    grid = np.linspace(0, 60, 800)
    dens_e4 = gaussian_kde(e4_ttft, bw_method=0.15)(grid)
    dens_e1 = gaussian_kde(e1_ttft, bw_method=0.15)(grid)
    body_series = (
        (dens_e4, E4_COLOR, f"E4 KVC + load-floor + RDMA (n={len(e4_ttft)})"),
        (dens_e1, E1_COLOR, f"E1 naive PD-disagg (n={len(e1_ttft)})"),
    )
    for dens, color, label in body_series:
        body_ax.plot(grid, dens, color=color, lw=2.5, label=label)
        body_ax.fill_between(grid, dens, alpha=0.2, color=color)

    for q, ls in ((0.5, "-"), (0.9, "--")):
        body_ax.axvline(pct(e4_ttft, q), color=E4_COLOR, ls=ls, alpha=0.55, lw=1.1)
        body_ax.axvline(pct(e1_ttft, q), color=E1_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # Read the y-limit only after the curves exist so label heights scale
    # with the drawn densities.
    top = body_ax.get_ylim()[1]
    for tag, samples, color, height in (("E4", e4_ttft, E4_COLOR, 0.95),
                                        ("E1", e1_ttft, E1_COLOR, 0.55)):
        median = pct(samples, 0.5)
        body_ax.text(median, top * height, f"{tag} p50\n{median:.1f}s",
                     color=color, fontsize=9, va="top", ha="left",
                     bbox=dict(facecolor="white", edgecolor="none", alpha=0.8, pad=2))

    body_ax.set_xlim(0, 60)
    body_ax.set_xlabel("TTFT (seconds, linear)", fontsize=11)
    body_ax.set_ylabel("Probability density", fontsize=11)
    body_ax.set_title("Body of distribution (TTFT ≤ 60s)", fontsize=12, pad=10)
    body_ax.legend(loc="upper right", fontsize=10, framealpha=0.95)
    body_ax.grid(True, linestyle=":", alpha=0.4)

    # ---- Right panel: density of log10(TTFT), log x-axis ------------------
    log_kde_e4 = gaussian_kde(np.log10(e4_ttft), bw_method="scott")
    log_kde_e1 = gaussian_kde(np.log10(e1_ttft), bw_method="scott")
    log_grid = np.linspace(np.log10(0.05), np.log10(500), 600)
    x_full = 10 ** log_grid
    y_e4 = log_kde_e4(log_grid)
    y_e1 = log_kde_e1(log_grid)
    tail_series = (
        (y_e4, E4_COLOR, f"E4 KVC (n={len(e4_ttft)})"),
        (y_e1, E1_COLOR, f"E1 naive PD (n={len(e1_ttft)})"),
    )
    for dens, color, label in tail_series:
        tail_ax.plot(x_full, dens, color=color, lw=2.5, label=label)
        tail_ax.fill_between(x_full, dens, alpha=0.2, color=color)
    tail_ax.set_xscale("log")
    tail_ax.set_xlim(0.05, 500)

    for q, ls in ((0.5, "-"), (0.9, "--"), (0.99, ":")):
        tail_ax.axvline(pct(e4_ttft, q), color=E4_COLOR, ls=ls, alpha=0.55, lw=1.1)
        tail_ax.axvline(pct(e1_ttft, q), color=E1_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # Call-outs point at each curve's height at its own p99; the text sits at
    # a fixed x with a height relative to the taller of the two curves.
    peak = max(y_e4.max(), y_e1.max())
    callouts = (
        ("E4", e4_ttft, log_kde_e4, E4_COLOR, 0.55),
        ("E1", e1_ttft, log_kde_e1, E1_COLOR, 0.40),
    )
    for tag, samples, log_kde, color, height in callouts:
        p99 = pct(samples, 0.99)
        tail_ax.annotate(f"{tag} p99 = {p99:.1f}s",
                         xy=(p99, log_kde(np.log10(p99))[0]),
                         xytext=(80, peak * height),
                         fontsize=10, color=color, fontweight="bold",
                         arrowprops=dict(arrowstyle="->", color=color, lw=1.0))

    tail_ax.set_xticks([0.1, 1, 10, 100])
    tail_ax.set_xticklabels(["100ms", "1s", "10s", "100s"])
    tail_ax.set_xlabel("TTFT (log scale)", fontsize=11)
    tail_ax.set_ylabel("Density (per log₁₀ s)", fontsize=11)
    tail_ax.set_title("Full range incl. p99 tail (log x)", fontsize=12, pad=10)
    tail_ax.legend(loc="upper left", fontsize=10, framealpha=0.95)
    tail_ax.grid(True, which="both", linestyle=":", alpha=0.4)

    fig.suptitle(
        "TTFT density: E4 KVC v2 + load-floor + RDMA vs E1 naive PD-disagg\n"
        "Inferact 50-session trace · ts=1 · 4× H200 · aborted requests excluded",
        fontsize=13, y=1.02,
    )
    plt.tight_layout()
    out = FIG / "e1_vs_e4_ttft_pdf.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
|
||
|
||
|
||
def _plot_latency_cdf(e1_lat, e4_lat):
    """Draw the two-panel end-to-end latency figure and save it under ``FIG``.

    Left panel: empirical CDF on linear axes (x limited to 0-300) with the
    p50/p90/p99 values of both experiments written next to guide lines.
    Right panel: empirical survival function ``1 - CDF`` on log-log axes.

    Args:
        e1_lat: E1 latency samples.
        e4_lat: E4 latency samples.
    """
    fig, (cdf_ax, surv_ax) = plt.subplots(1, 2, figsize=(16, 6.5))

    # ---- Left panel: linear CDF -------------------------------------------
    cdf_series = (
        (e4_lat, E4_COLOR, f"E4 KVC (n={len(e4_lat)})"),
        (e1_lat, E1_COLOR, f"E1 naive (n={len(e1_lat)})"),
    )
    for samples, color, label in cdf_series:
        ordered = np.sort(samples)
        # The i-th sorted sample (0-based) is plotted at height i / n.
        frac_below = np.linspace(0, 1, len(ordered), endpoint=False)
        cdf_ax.plot(ordered, frac_below, color=color, lw=2.5, label=label)
    cdf_ax.set_xlim(0, 300)
    cdf_ax.set_xlabel("E2E latency (seconds)", fontsize=11)
    cdf_ax.set_ylabel("CDF", fontsize=11)
    cdf_ax.set_title("Full latency CDF (linear)", fontsize=12)
    cdf_ax.legend(loc="lower right", fontsize=10)
    cdf_ax.grid(True, linestyle=":", alpha=0.4)

    # Percentile read-outs: one faint guide line plus a text label each.
    for q, mark in ((0.5, "p50"), (0.9, "p90"), (0.99, "p99")):
        e4v = pct(e4_lat, q)
        e1v = pct(e1_lat, q)
        cdf_ax.axhline(q, color="gray", ls=":", alpha=0.3)
        # Labels above the median sit just under their line, the median's
        # label just over it.
        if q > 0.5:
            text_y = q - 0.02
        else:
            text_y = q + 0.02
        cdf_ax.annotate(f"{mark}: E4 {e4v:.1f}s, E1 {e1v:.1f}s",
                        xy=(0, q), xytext=(220, text_y),
                        fontsize=9, color="black")

    # ---- Right panel: survival function, log-log ---------------------------
    for samples, color, label in ((e4_lat, E4_COLOR, "E4 KVC"),
                                  (e1_lat, E1_COLOR, "E1 naive")):
        ordered = np.sort(samples)
        # Values below 0.01 are raised to 0.01 before plotting on the log axis.
        floored = np.maximum(ordered, 0.01)
        frac_below = np.linspace(0, 1, len(ordered), endpoint=False)
        surv_ax.plot(floored, 1 - frac_below, color=color, lw=2.5, label=label)
    surv_ax.set_xscale("log")
    surv_ax.set_yscale("log")
    surv_ax.set_xlim(0.5, 500)
    surv_ax.set_ylim(1e-3, 1.1)
    surv_ax.set_xlabel("E2E latency (log s)", fontsize=11)
    surv_ax.set_ylabel("P(latency > x) (log)", fontsize=11)
    surv_ax.set_title("Survival function — log-log (highlights tail behavior)", fontsize=12)
    surv_ax.legend(loc="upper right", fontsize=10)
    surv_ax.grid(True, which="both", linestyle=":", alpha=0.4)

    fig.suptitle("E2E latency: E4 KVC vs E1 naive PD-disagg", fontsize=13, y=1.02)
    plt.tight_layout()
    out = FIG / "e1_vs_e4_latency_cdf.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
|
||
|
||
|
||
def _plot_path_latency(e4, top_n=14):
    """Draw mean/p50/p99 TTFT per ``execution_mode`` and save it under ``FIG``.

    Despite the function and file name, the plotted metric is TTFT
    (``"ttft_s"``), not end-to-end latency.

    Args:
        e4: E4 request records. Records whose ``"ttft_s"`` is missing or
            ``None`` are ignored; a missing or falsy ``"execution_mode"``
            is grouped under ``"?"``.
        top_n: Number of most frequent modes to show (default 14, the value
            that used to be hard-coded).
    """
    # Group TTFT samples by execution mode. The former per-mode latency
    # accumulator was never read and has been removed.
    ttft_by_mode = defaultdict(list)
    for rec in e4:
        if rec.get("ttft_s") is None:
            continue
        mode = rec.get("execution_mode", "?") or "?"
        ttft_by_mode[mode].append(float(rec["ttft_s"]))

    # Most frequent modes first; the stable sort keeps first-seen order on ties.
    modes = sorted(ttft_by_mode, key=lambda m: len(ttft_by_mode[m]), reverse=True)
    modes = modes[:top_n]

    samples = [np.array(ttft_by_mode[m]) for m in modes]
    means = [np.mean(s) for s in samples]
    p50 = [pct(s, 0.5) for s in samples]
    p99 = [pct(s, 0.99) for s in samples]
    counts = [len(s) for s in samples]

    fig, ax = plt.subplots(1, 1, figsize=(14, 7))
    pos = np.arange(len(modes))
    bar_h = 0.25
    # Three bars per mode, stacked around the tick position.
    ax.barh(pos - bar_h, means, bar_h, label="mean", color="#4a90e2", alpha=0.85)
    ax.barh(pos, p50, bar_h, label="p50", color="#66cc99", alpha=0.85)
    ax.barh(pos + bar_h, p99, bar_h, label="p99", color="#e74c3c", alpha=0.85)
    ax.set_yticks(pos)
    ax.set_yticklabels([f"{m} (n={c})" for m, c in zip(modes, counts)],
                       fontsize=9)
    # Put the most frequent mode at the top of the chart.
    ax.invert_yaxis()
    ax.set_xlabel("TTFT (s)", fontsize=11)
    ax.set_title(f"E4 per execution_mode TTFT (sorted by count, top {top_n})",
                 fontsize=12, pad=10)
    ax.legend(loc="lower right", fontsize=10)
    ax.grid(True, linestyle=":", alpha=0.4)
    plt.tight_layout()
    out = FIG / "e4_path_latency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
|
||
|
||
|
||
def _plot_p99_attribution(e4, e1_ttft, e4_ttft):
    """Show which execution modes make up E4's TTFT tail and save the figure.

    The "tail" is every E4 record whose TTFT is at or above E4's p95.
    Left panel: pie of tail request counts per mode (ten largest modes, the
    remainder merged into "(other)"). Right panel: mean tail TTFT per mode
    with the E4 and E1 p99 drawn as reference lines.

    Args:
        e4: E4 request records; ``"ttft_s"`` and ``"execution_mode"`` are read.
        e1_ttft: E1 TTFT samples, used only for the E1 p99 reference.
        e4_ttft: E4 TTFT samples, used for the p95 threshold and the p99.
    """
    e4_p99 = pct(e4_ttft, 0.99)
    e1_p99 = pct(e1_ttft, 0.99)
    # Tail membership is decided at p95 (not p99), so the region also
    # contains everything beyond p99.
    # NOTE(review): the "p95-p99" wording in the labels below therefore
    # understates the region; the strings are left as they were.
    threshold = pct(e4_ttft, 0.95)

    # One pass collects the tail TTFTs per mode; counts are the list lengths.
    # Written as ``not >=`` so a NaN TTFT is excluded from both panels (the
    # old two-loop version counted it in one panel but not the other).
    tail_ttft = defaultdict(list)
    for rec in e4:
        ttft = rec.get("ttft_s")
        if ttft is None or not ttft >= threshold:
            continue
        mode = rec.get("execution_mode", "?") or "?"
        tail_ttft[mode].append(float(ttft))

    tail_total = sum(len(v) for v in tail_ttft.values())
    # Ten largest contributors; the stable sort keeps first-seen order on ties.
    all_modes = sorted(tail_ttft, key=lambda m: len(tail_ttft[m]), reverse=True)[:10]
    counts = [len(tail_ttft[m]) for m in all_modes]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6.5))

    # Pie of tail composition
    ax = axes[0]
    pie_labels = list(all_modes)
    pie_sizes = list(counts)
    rest = tail_total - sum(counts)
    if rest > 0:
        pie_labels.append("(other)")
        pie_sizes.append(rest)
    ax.pie(
        pie_sizes, labels=[f"{m}\n(n={c})" for m, c in zip(pie_labels, pie_sizes)],
        autopct="%1.0f%%", startangle=90, textprops={"fontsize": 9},
    )
    ax.set_title(f"E4 p95-p99 tail composition\n(TTFT ≥ {threshold:.1f}s, n={tail_total})",
                 fontsize=12, pad=12)

    # Bar of mean TTFT within tail per mode
    ax = axes[1]
    pos = np.arange(len(all_modes))
    means = [np.mean(tail_ttft[m]) for m in all_modes]
    ax.barh(pos, means, color="#e74c3c", alpha=0.85)
    ax.set_yticks(pos)
    ax.set_yticklabels([f"{m} (n={c})" for m, c in zip(all_modes, counts)],
                       fontsize=9)
    ax.invert_yaxis()
    ax.set_xlabel("Mean TTFT in p95-p99 region (s)", fontsize=11)
    ax.set_title("Per-mode mean TTFT among tail reqs", fontsize=12)
    ax.axvline(e4_p99, color=E4_COLOR, ls="--", alpha=0.6, label=f"E4 p99 = {e4_p99:.1f}s")
    ax.axvline(e1_p99, color=E1_COLOR, ls="--", alpha=0.6, label=f"E1 p99 = {e1_p99:.1f}s")
    ax.legend(loc="lower right", fontsize=10)
    ax.grid(True, linestyle=":", alpha=0.4)

    # Relative p99 gap. The verb follows the sign, so a run where E4 has the
    # lower p99 no longer renders as "loses tail by +-x%".
    delta_pct = (e4_p99 / e1_p99 - 1) * 100
    verdict = "loses" if delta_pct >= 0 else "wins"
    fig.suptitle(
        f"E4 p99 tail attribution: which execution_modes produce the long tail?\n"
        f"E4 p99 = {e4_p99:.1f}s vs E1 p99 = {e1_p99:.1f}s "
        f"(KVC {verdict} tail by {delta_pct:+.1f}%)",
        fontsize=13, y=1.02,
    )
    plt.tight_layout()
    out = FIG / "e1_vs_e4_p99_attribution.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)
|
||
|
||
|
||
# Run the figure pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|