agentic-pd-hybrid/scripts/analysis/plot_e1_vs_e4.py

#!/usr/bin/env python3
"""Generate E1 (naive PD-disagg) vs E4 (KVC + load-floor + RDMA) comparison figures.

Outputs (under docs/figures/):
  e1_vs_e4_ttft_pdf.png         - TTFT distribution body + log-tail
  e1_vs_e4_latency_cdf.png      - E2E latency CDF
  e4_path_latency.png           - E4 per-execution-mode latency breakdown
  e1_vs_e4_p99_attribution.png  - which execution modes contribute to E4's p99 tail
"""

from __future__ import annotations
import argparse
import json
from collections import Counter, defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

# Directory two levels above the one containing this script (presumably the
# repository root - confirm if the script is ever moved).
ROOT = Path(__file__).resolve().parents[2]
# Every figure produced by this script is written into this directory.
FIG = ROOT / "docs/figures"
# Created at import time, so the directory exists before any savefig call.
FIG.mkdir(parents=True, exist_ok=True)

# Line/fill colours for the two experiments, shared by all plotting functions.
E1_COLOR = "#D62728"   # red
E4_COLOR = "#1F77B4"   # blue


def load(p: Path) -> list[dict]:
    """Read a JSON-lines file and return its records in file order.

    Args:
        p: Path of the file to read; every non-blank line must contain one
            JSON document.

    Returns:
        The decoded value of each non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically; UTF-8 is
    # requested explicitly so decoding does not depend on the locale.
    with p.open(encoding="utf-8") as fh:
        # Whitespace-only lines (e.g. a trailing empty line) carry no record
        # and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in fh if line.strip()]


def is_failed(r: dict) -> bool:
    """Tell whether a request record should be treated as failed.

    A record counts as failed when its ``"error"`` entry is truthy, or when
    the string form of a truthy ``"finish_reason"`` contains ``"abort"`` or
    ``"badrequest"`` (compared case-insensitively).
    """
    if r.get("error"):
        return True
    reason = r.get("finish_reason")
    if not reason:
        return False
    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))


def pct(values, q):
    """Return the ``q``-quantile of ``values`` as a built-in ``float``.

    ``q`` is passed straight to ``np.quantile``, i.e. it is a fraction such
    as 0.99 rather than a percentage.
    """
    quantile_value = np.quantile(values, q)
    return float(quantile_value)


def _metric_array(records, key):
    """Return the non-``None`` values stored under ``key`` as a NumPy array."""
    return np.array([r[key] for r in records if r.get(key) is not None])


def _print_stats(label, named_arrays):
    """Print mean / p50 / p90 / p99 / max for each ``(name, array)`` pair."""
    for name, arr in named_arrays:
        # ``label`` is padded to four characters so the columns line up.
        print(f"  {name} {label:<4}  mean={arr.mean():.3f}  p50={pct(arr,0.5):.3f}  "
              f"p90={pct(arr,0.9):.3f}  p99={pct(arr,0.99):.3f}  max={arr.max():.3f}")
    print()


def main():
    """Command-line entry point: load both runs, print stats, write figures.

    Reads the JSON-lines files given by ``--e1-metrics`` and ``--e4-metrics``,
    drops records flagged by ``is_failed``, prints summary statistics for
    TTFT and end-to-end latency, then renders the four comparison figures.

    Raises:
        SystemExit: If argument parsing fails, or if any of the four sample
            sets is empty once failed records have been filtered out.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--e1-metrics", required=True,
                    help="JSON-lines request metrics of the E1 run")
    ap.add_argument("--e4-metrics", required=True,
                    help="JSON-lines request metrics of the E4 run")
    args = ap.parse_args()

    e1 = [r for r in load(Path(args.e1_metrics)) if not is_failed(r)]
    e4 = [r for r in load(Path(args.e4_metrics)) if not is_failed(r)]
    e1_ttft = _metric_array(e1, "ttft_s")
    e4_ttft = _metric_array(e4, "ttft_s")
    e1_lat = _metric_array(e1, "latency_s")
    e4_lat = _metric_array(e4, "latency_s")
    # Keep only TTFT samples above 0.1 ms; the TTFT plot later takes log10 of
    # these values, so zero / near-zero entries must not reach it.
    e1_ttft = e1_ttft[e1_ttft > 1e-4]
    e4_ttft = e4_ttft[e4_ttft > 1e-4]

    print(f"E1  reqs={len(e1)} (after failed-filter)  TTFT n={len(e1_ttft)}  lat n={len(e1_lat)}")
    print(f"E4  reqs={len(e4)} (after failed-filter)  TTFT n={len(e4_ttft)}  lat n={len(e4_lat)}")
    print()

    # Without this check an empty sample set only surfaces further down as an
    # opaque "zero-size array" ValueError from numpy.
    empty = [name for name, arr in (("E1 TTFT", e1_ttft), ("E4 TTFT", e4_ttft),
                                    ("E1 latency", e1_lat), ("E4 latency", e4_lat))
             if arr.size == 0]
    if empty:
        raise SystemExit("no usable samples for: " + ", ".join(empty))

    _print_stats("TTFT", [("E1", e1_ttft), ("E4", e4_ttft)])
    _print_stats("Lat", [("E1", e1_lat), ("E4", e4_lat)])

    # ----- Plot 1: TTFT distribution (body + log tail) ---------------------
    _plot_ttft_pdf(e1_ttft, e4_ttft)

    # ----- Plot 2: Latency CDF --------------------------------------------
    _plot_latency_cdf(e1_lat, e4_lat)

    # ----- Plot 3: E4 path-level breakdown ---------------------------------
    _plot_path_latency(e4)

    # ----- Plot 4: p99 attribution -----------------------------------------
    _plot_p99_attribution(e4, e1_ttft, e4_ttft)


def _plot_ttft_pdf(e1_ttft, e4_ttft):
    """Render the two-panel TTFT density figure and save it under ``FIG``.

    Left panel: Gaussian KDE over linear seconds, shown for 0-60 s with
    p50/p90 markers. Right panel: Gaussian KDE over log10(seconds), shown for
    0.05-500 s with p50/p90/p99 markers and p99 call-outs. Both sample arrays
    go through ``np.log10``, so they need to be strictly positive.
    """
    from scipy.stats import gaussian_kde

    fig, (body_ax, tail_ax) = plt.subplots(1, 2, figsize=(16, 6.5))
    # (samples, colour) in draw order: E4 is always drawn before E1.
    series = ((e4_ttft, E4_COLOR), (e1_ttft, E1_COLOR))

    # ---- Left panel: body of the distribution, linear x in [0, 60 s] ------
    grid = np.linspace(0, 60, 800)
    body_kdes = [gaussian_kde(samples, bw_method=0.15) for samples, _ in series]
    body_labels = (
        f"E4 KVC + load-floor + RDMA  (n={len(e4_ttft)})",
        f"E1 naive PD-disagg  (n={len(e1_ttft)})",
    )
    for kde, (_, color), label in zip(body_kdes, series, body_labels):
        density = kde(grid)
        body_ax.plot(grid, density, color=color, lw=2.5, label=label)
        body_ax.fill_between(grid, density, alpha=0.2, color=color)
    for q, ls in ((0.5, "-"), (0.9, "--")):
        for samples, color in series:
            body_ax.axvline(pct(samples, q), color=color, ls=ls, alpha=0.55, lw=1.1)
    # Read the y-limit after plotting so the labels sit relative to the curves.
    top = body_ax.get_ylim()[1]
    text_style = dict(
        fontsize=9, va="top", ha="left",
        bbox=dict(facecolor="white", edgecolor="none", alpha=0.8, pad=2),
    )
    e4_p50 = pct(e4_ttft, 0.5)
    e1_p50 = pct(e1_ttft, 0.5)
    body_ax.text(e4_p50, top * 0.95, f"E4 p50\n{e4_p50:.1f}s",
                 color=E4_COLOR, **text_style)
    body_ax.text(e1_p50, top * 0.55, f"E1 p50\n{e1_p50:.1f}s",
                 color=E1_COLOR, **text_style)
    body_ax.set_xlim(0, 60)
    body_ax.set_xlabel("TTFT (seconds, linear)", fontsize=11)
    body_ax.set_ylabel("Probability density", fontsize=11)
    body_ax.set_title("Body of distribution (TTFT ≤ 60s)", fontsize=12, pad=10)
    body_ax.legend(loc="upper right", fontsize=10, framealpha=0.95)
    body_ax.grid(True, linestyle=":", alpha=0.4)

    # ---- Right panel: full range on a log x-axis --------------------------
    log_kde_e4 = gaussian_kde(np.log10(e4_ttft), bw_method="scott")
    log_kde_e1 = gaussian_kde(np.log10(e1_ttft), bw_method="scott")
    log_grid = np.linspace(np.log10(0.05), np.log10(500), 600)
    seconds = 10 ** log_grid
    dens_e4 = log_kde_e4(log_grid)
    dens_e1 = log_kde_e1(log_grid)
    tail_curves = (
        (dens_e4, E4_COLOR, f"E4 KVC  (n={len(e4_ttft)})"),
        (dens_e1, E1_COLOR, f"E1 naive PD  (n={len(e1_ttft)})"),
    )
    for density, color, label in tail_curves:
        tail_ax.plot(seconds, density, color=color, lw=2.5, label=label)
        tail_ax.fill_between(seconds, density, alpha=0.2, color=color)
    tail_ax.set_xscale("log")
    tail_ax.set_xlim(0.05, 500)
    for q, ls in ((0.5, "-"), (0.9, "--"), (0.99, ":")):
        for samples, color in series:
            tail_ax.axvline(pct(samples, q), color=color, ls=ls, alpha=0.55, lw=1.1)
    peak = max(dens_e4.max(), dens_e1.max())
    e4_p99 = pct(e4_ttft, 0.99)
    e1_p99 = pct(e1_ttft, 0.99)
    # Each arrow points at the curve's own density at that run's p99.
    tail_ax.annotate(f"E4 p99 = {e4_p99:.1f}s",
                     xy=(e4_p99, log_kde_e4(np.log10(e4_p99))[0]),
                     xytext=(80, peak * 0.55),
                     fontsize=10, color=E4_COLOR, fontweight="bold",
                     arrowprops=dict(arrowstyle="->", color=E4_COLOR, lw=1.0))
    tail_ax.annotate(f"E1 p99 = {e1_p99:.1f}s",
                     xy=(e1_p99, log_kde_e1(np.log10(e1_p99))[0]),
                     xytext=(80, peak * 0.40),
                     fontsize=10, color=E1_COLOR, fontweight="bold",
                     arrowprops=dict(arrowstyle="->", color=E1_COLOR, lw=1.0))
    tail_ax.set_xticks([0.1, 1, 10, 100])
    tail_ax.set_xticklabels(["100ms", "1s", "10s", "100s"])
    tail_ax.set_xlabel("TTFT (log scale)", fontsize=11)
    tail_ax.set_ylabel("Density (per log₁₀ s)", fontsize=11)
    tail_ax.set_title("Full range incl. p99 tail (log x)", fontsize=12, pad=10)
    tail_ax.legend(loc="upper left", fontsize=10, framealpha=0.95)
    tail_ax.grid(True, which="both", linestyle=":", alpha=0.4)

    fig.suptitle(
        "TTFT density: E4 KVC v2 + load-floor + RDMA vs E1 naive PD-disagg\n"
        "Inferact 50-session trace · ts=1 · 4× H200 · aborted requests excluded",
        fontsize=13, y=1.02,
    )
    plt.tight_layout()
    out = FIG / "e1_vs_e4_ttft_pdf.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)


def _plot_latency_cdf(e1_lat, e4_lat):
    """Render the end-to-end latency figure and save it under ``FIG``.

    Left panel: empirical CDF on linear axes (x limited to 0-300 s) with the
    p50/p90/p99 values of both runs written next to the matching horizontal
    guide lines. Right panel: the survival function 1 - CDF on log-log axes.
    """
    fig, (cdf_ax, surv_ax) = plt.subplots(1, 2, figsize=(16, 6.5))

    # ---- Left panel: linear CDF -------------------------------------------
    cdf_series = (
        (e4_lat, E4_COLOR, f"E4 KVC (n={len(e4_lat)})"),
        (e1_lat, E1_COLOR, f"E1 naive (n={len(e1_lat)})"),
    )
    for samples, color, label in cdf_series:
        ordered = np.sort(samples)
        # The i-th sorted sample is plotted at height i / n (endpoint excluded).
        frac = np.linspace(0, 1, len(ordered), endpoint=False)
        cdf_ax.plot(ordered, frac, color=color, lw=2.5, label=label)
    cdf_ax.set_xlim(0, 300)
    cdf_ax.set_xlabel("E2E latency (seconds)", fontsize=11)
    cdf_ax.set_ylabel("CDF", fontsize=11)
    cdf_ax.set_title("Full latency CDF (linear)", fontsize=12)
    cdf_ax.legend(loc="lower right", fontsize=10)
    cdf_ax.grid(True, linestyle=":", alpha=0.4)
    # Percentile guide lines with the numeric values of both runs.
    for q, mark in ((0.5, "p50"), (0.9, "p90"), (0.99, "p99")):
        e4v = pct(e4_lat, q)
        e1v = pct(e1_lat, q)
        cdf_ax.axhline(q, color="gray", ls=":", alpha=0.3)
        # Text goes just below the line for q > 0.5, just above it otherwise.
        if q > 0.5:
            text_y = q - 0.02
        else:
            text_y = q + 0.02
        cdf_ax.annotate(f"{mark}: E4 {e4v:.1f}s, E1 {e1v:.1f}s",
                        xy=(0, q), xytext=(220, text_y),
                        fontsize=9, color="black")

    # ---- Right panel: survival function on log-log axes -------------------
    surv_series = (
        (e4_lat, E4_COLOR, "E4 KVC"),
        (e1_lat, E1_COLOR, "E1 naive"),
    )
    for samples, color, label in surv_series:
        ordered = np.sort(samples)
        # Clamp to 0.01 s so every x value is positive on the log axis.
        clipped = np.maximum(ordered, 0.01)
        frac = np.linspace(0, 1, len(ordered), endpoint=False)
        surv_ax.plot(clipped, 1 - frac, color=color, lw=2.5, label=label)
    surv_ax.set_xscale("log")
    surv_ax.set_yscale("log")
    surv_ax.set_xlim(0.5, 500)
    surv_ax.set_ylim(1e-3, 1.1)
    surv_ax.set_xlabel("E2E latency (log s)", fontsize=11)
    surv_ax.set_ylabel("P(latency > x)  (log)", fontsize=11)
    surv_ax.set_title("Survival function — log-log (highlights tail behavior)", fontsize=12)
    surv_ax.legend(loc="upper right", fontsize=10)
    surv_ax.grid(True, which="both", linestyle=":", alpha=0.4)

    fig.suptitle("E2E latency: E4 KVC vs E1 naive PD-disagg", fontsize=13, y=1.02)
    plt.tight_layout()
    out = FIG / "e1_vs_e4_latency_cdf.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)


def _plot_path_latency(e4, top_n=14):
    """Plot mean / p50 / p99 TTFT per ``execution_mode`` and save it under ``FIG``.

    Args:
        e4: Iterable of request-record dicts. Records whose ``"ttft_s"`` is
            missing or ``None`` are ignored; a missing or falsy
            ``"execution_mode"`` is grouped under ``"?"``.
        top_n: Number of modes to show, most frequent first (default 14).
    """
    # Group TTFT samples by execution mode.
    by_mode = defaultdict(list)
    for r in e4:
        ttft = r.get("ttft_s")
        if ttft is None:
            continue
        by_mode[r.get("execution_mode", "?") or "?"].append(float(ttft))

    # Most frequent modes first; the stable sort keeps first-seen order on ties.
    modes = sorted(by_mode, key=lambda m: -len(by_mode[m]))[:top_n]
    samples = [np.array(by_mode[m]) for m in modes]

    fig, ax = plt.subplots(1, 1, figsize=(14, 7))
    pos = np.arange(len(modes))
    means = [np.mean(s) for s in samples]
    p50 = [pct(s, 0.5) for s in samples]
    p99 = [pct(s, 0.99) for s in samples]
    counts = [len(s) for s in samples]
    # Three bars per mode, stacked around the tick position.
    bar_h = 0.25
    ax.barh(pos - bar_h, means, bar_h, label="mean", color="#4a90e2", alpha=0.85)
    ax.barh(pos, p50, bar_h, label="p50", color="#66cc99", alpha=0.85)
    ax.barh(pos + bar_h, p99, bar_h, label="p99", color="#e74c3c", alpha=0.85)
    ax.set_yticks(pos)
    ax.set_yticklabels([f"{m} (n={c})" for m, c in zip(modes, counts)],
                       fontsize=9)
    # Inverted so the most frequent mode ends up at the top.
    ax.invert_yaxis()
    ax.set_xlabel("TTFT (s)", fontsize=11)
    ax.set_title(f"E4 per execution_mode TTFT (sorted by count, top {top_n})",
                 fontsize=12, pad=10)
    ax.legend(loc="lower right", fontsize=10)
    ax.grid(True, linestyle=":", alpha=0.4)
    plt.tight_layout()
    out = FIG / "e4_path_latency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)


def _plot_p99_attribution(e4, e1_ttft, e4_ttft):
    """Show which execution modes make up E4's TTFT tail and save under ``FIG``.

    The "tail" is every E4 record whose TTFT is at or above the 95th
    percentile of ``e4_ttft`` (so it also contains the samples beyond p99).
    Left panel: pie chart of tail membership per mode (ten largest, the rest
    merged into "(other)"). Right panel: mean tail TTFT per mode, with the
    p99 of both runs drawn as reference lines.

    Args:
        e4: Iterable of E4 request-record dicts; records without a
            ``"ttft_s"`` value are ignored.
        e1_ttft: E1 TTFT samples, used only for the E1 p99 reference.
        e4_ttft: E4 TTFT samples, used for the tail threshold and E4 p99.
    """
    e4_p99 = pct(e4_ttft, 0.99)
    e1_p99 = pct(e1_ttft, 0.99)
    threshold = pct(e4_ttft, 0.95)

    # One pass: collect the tail TTFTs per mode. Counts for the pie and means
    # for the bar chart both derive from this mapping.
    tail_by_mode = defaultdict(list)
    for r in e4:
        ttft = r.get("ttft_s")
        if ttft is None or ttft < threshold:
            continue
        tail_by_mode[r.get("execution_mode", "?") or "?"].append(float(ttft))
    # Largest contributors first; the stable sort keeps first-seen order on ties.
    all_modes = sorted(tail_by_mode, key=lambda m: -len(tail_by_mode[m]))[:10]
    counts = [len(tail_by_mode[m]) for m in all_modes]
    tail_total = sum(len(v) for v in tail_by_mode.values())

    fig, axes = plt.subplots(1, 2, figsize=(16, 6.5))

    # Pie of tail composition
    ax = axes[0]
    pie_labels = list(all_modes)
    sizes = list(counts)
    rest = tail_total - sum(sizes)
    if rest > 0:
        # Modes outside the top ten are merged into one wedge.
        pie_labels.append("(other)")
        sizes.append(rest)
    ax.pie(
        sizes, labels=[f"{m}\n(n={c})" for m, c in zip(pie_labels, sizes)],
        autopct="%1.0f%%", startangle=90, textprops={"fontsize": 9},
    )
    ax.set_title(f"E4 p95-p99 tail composition\n(TTFT ≥ {threshold:.1f}s, n={tail_total})",
                 fontsize=12, pad=12)

    # Bar of mean TTFT within tail per mode
    ax = axes[1]
    pos = np.arange(len(all_modes))
    # Every listed mode has at least one tail sample, so the mean is defined.
    means = [np.mean(tail_by_mode[m]) for m in all_modes]
    ax.barh(pos, means, color="#e74c3c", alpha=0.85)
    ax.set_yticks(pos)
    ax.set_yticklabels([f"{m} (n={c})" for m, c in zip(all_modes, counts)],
                       fontsize=9)
    ax.invert_yaxis()
    ax.set_xlabel("Mean TTFT in p95-p99 region (s)", fontsize=11)
    ax.set_title("Per-mode mean TTFT among tail reqs", fontsize=12)
    ax.axvline(e4_p99, color=E4_COLOR, ls="--", alpha=0.6, label=f"E4 p99 = {e4_p99:.1f}s")
    ax.axvline(e1_p99, color=E1_COLOR, ls="--", alpha=0.6, label=f"E1 p99 = {e1_p99:.1f}s")
    ax.legend(loc="lower right", fontsize=10)
    ax.grid(True, linestyle=":", alpha=0.4)

    # Signed relative change of E4's p99 versus E1's. The verb follows the
    # sign, so a run where E4 has the lower p99 is not reported as a loss
    # (a hard-coded "+" would render as "+-12.3%").
    delta_pct = (e4_p99 / e1_p99 - 1) * 100
    verdict = "loses" if delta_pct >= 0 else "wins"
    fig.suptitle(
        f"E4 p99 tail attribution: which execution_modes produce the long tail?\n"
        f"E4 p99 = {e4_p99:.1f}s  vs  E1 p99 = {e1_p99:.1f}s  "
        f"(KVC {verdict} tail by {delta_pct:+.1f}%)",
        fontsize=13, y=1.02,
    )
    plt.tight_layout()
    out = FIG / "e1_vs_e4_p99_attribution.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    print(f"wrote {out}")
    plt.close(fig)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()