agentic-pd-hybrid/scripts/analysis/plot_ttft_pdf.py

#!/usr/bin/env python3
"""Generate TTFT probability density curves: KVC 1P3D v2 vs 4-way DP CA.

Inputs:
  outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl
  outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl

Outputs:
  docs/figures/ttft_pdf_comparison.png  -- two-panel figure:
      left panel: linear x in [0, 0.6] s, zoomed on the body
      right panel: log x covering full range (0.01 -- 10 s)
  Each KDE curve uses scipy.stats.gaussian_kde: the left panel with a fixed
  bandwidth factor of 0.15 on raw TTFT, the right panel with Scott's rule on
  log10(TTFT).
  Aborted requests are excluded (same filter as metrics.py:_is_failed_request).
"""

from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

# Directory two levels above the folder containing this script; all input and
# output paths below are resolved relative to it.
ROOT = Path(__file__).resolve().parents[2]
# Per-request metrics (JSON Lines) for the KVC 1P3D migration-v2 run.
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
# Per-request metrics (JSON Lines) for the 4-way DP run.
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
# Destination of the rendered two-panel figure.
OUT = ROOT / "docs/figures/ttft_pdf_comparison.png"


def load(p: Path) -> list[dict]:
    """Read a JSON Lines file and return one decoded object per record.

    Args:
        p: Path to a file holding one JSON document per line.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager closes the handle deterministically; JSON text is
    # UTF-8 by specification, so do not depend on the locale's default codec.
    with p.open(encoding="utf-8") as fh:
        # Blank lines (e.g. a trailing newline at EOF) carry no record.
        return [json.loads(line) for line in fh if line.strip()]


def is_failed(r: dict) -> bool:
    """Tell whether a metrics record describes a failed request.

    A record counts as failed when its ``error`` field is truthy, or when the
    lower-cased string form of its ``finish_reason`` contains ``abort`` or
    ``badrequest``.

    Args:
        r: One decoded metrics record.

    Returns:
        True for a failed record, False otherwise.
    """
    if r.get("error"):
        return True

    reason = r.get("finish_reason")
    if not reason:
        return False

    lowered = str(reason).lower()
    return any(marker in lowered for marker in ("abort", "badrequest"))


def pct(vals: np.ndarray, q: float) -> float:
    """Return the ``q``-quantile of ``vals`` as a built-in float.

    Args:
        vals: Sample values.
        q: Quantile level, passed straight to ``np.quantile``.

    Returns:
        The quantile converted from a NumPy scalar to a Python ``float``.
    """
    quantile = np.quantile(vals, q)
    return float(quantile)


def _successful_ttfts(path: Path) -> np.ndarray:
    """Return the usable TTFT samples (seconds) from the metrics file at *path*.

    Records flagged by ``is_failed`` or without a ``ttft_s`` value are dropped,
    as are samples at or below 0.1 ms.

    Raises:
        ValueError: if fewer than two samples remain, since a KDE cannot be
            fitted to them.
    """
    samples = np.array([
        r["ttft_s"]
        for r in load(path)
        if not is_failed(r) and r.get("ttft_s") is not None
    ])
    # Trim absurdly small zeros (rare measurement artifacts) so log KDE behaves.
    samples = samples[samples > 1e-4]
    if samples.size < 2:
        raise ValueError(
            f"{path}: need at least 2 usable ttft_s samples for a KDE, "
            f"got {samples.size}"
        )
    return samples


def main() -> None:
    """Render the two-panel TTFT density comparison and print summary stats.

    Reads the ``KVC`` and ``DP`` metrics files, draws a linear-x panel of the
    distribution body and a log-x panel of the full range, writes the figure
    to ``OUT`` (creating its directory if needed), and prints min / p10 / p50 /
    p90 / p99 / max for both datasets.

    Raises:
        ValueError: if either input has fewer than two usable TTFT samples.
    """
    kvc_ttft = _successful_ttfts(KVC)
    dp_ttft = _successful_ttfts(DP)

    KVC_COLOR = "#1F77B4"  # blue
    DP_COLOR = "#D62728"   # red

    fig, axes = plt.subplots(1, 2, figsize=(16, 6.5))

    # ------------------------------------------------------------------
    # Left panel: linear x ∈ [0, 0.6]s -- body of the distribution
    # ------------------------------------------------------------------
    ax = axes[0]
    x_body = np.linspace(0.0, 0.6, 600)

    # The KDE is fitted on ALL samples (fixed bandwidth factor 0.15); only the
    # evaluation grid is restricted to the body, so the visible curves need
    # not integrate to 1 over this panel.
    kde_kvc_lin = gaussian_kde(kvc_ttft, bw_method=0.15)
    kde_dp_lin = gaussian_kde(dp_ttft, bw_method=0.15)
    # Evaluate each KDE once and reuse the result for the line and the fill.
    y_kvc_body = kde_kvc_lin(x_body)
    y_dp_body = kde_dp_lin(x_body)

    ax.plot(x_body, y_kvc_body,
            color=KVC_COLOR, lw=2.5, label=f"KVC 1P3D v2  (n={len(kvc_ttft)})")
    ax.fill_between(x_body, y_kvc_body, alpha=0.20, color=KVC_COLOR)
    ax.plot(x_body, y_dp_body,
            color=DP_COLOR, lw=2.5, label=f"4-way DP CA  (n={len(dp_ttft)})")
    ax.fill_between(x_body, y_dp_body, alpha=0.20, color=DP_COLOR)

    # Vertical lines for p50, p90
    for q, ls in [(0.50, "-"), (0.90, "--")]:
        ax.axvline(pct(kvc_ttft, q), color=KVC_COLOR, ls=ls, alpha=0.55, lw=1.1)
        ax.axvline(pct(dp_ttft, q), color=DP_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # Text labels for the lines above; each sits at a different fraction of
    # the y-axis height so neighbouring labels do not overlap.
    ymax = ax.get_ylim()[1]
    body_labels = [
        (kvc_ttft, 0.50, 0.97, "KVC p50", KVC_COLOR),
        (dp_ttft, 0.50, 0.50, "DP p50", DP_COLOR),
        (kvc_ttft, 0.90, 0.30, "KVC p90", KVC_COLOR),
        (dp_ttft, 0.90, 0.18, "DP p90", DP_COLOR),
    ]
    for arr, q, height, tag, color in body_labels:
        value = pct(arr, q)
        ax.text(value, ymax * height,
                f"{tag}\n{value*1000:.0f}ms",
                color=color, fontsize=9, va="top", ha="left",
                bbox=dict(facecolor="white", edgecolor="none", alpha=0.7, pad=2))

    ax.set_xlim(0, 0.6)
    ax.set_xlabel("TTFT (seconds, linear)", fontsize=11)
    ax.set_ylabel("Probability density", fontsize=11)
    ax.set_title("Body of distribution  (TTFT ≤ 0.6 s)", fontsize=12, pad=10)
    ax.legend(loc="upper right", fontsize=10, framealpha=0.95)
    ax.grid(True, linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    # ------------------------------------------------------------------
    # Right panel: log x ∈ [0.01, 10]s -- full range incl. tail
    # PDF on log-x: we plot density vs log10(t) so the curve integrates
    # to 1 over log space (standard "log-density" presentation).
    # ------------------------------------------------------------------
    ax = axes[1]
    # KDE on log10(ttft) so the resulting curve integrates to 1 over log10 t
    kde_kvc_log = gaussian_kde(np.log10(kvc_ttft), bw_method="scott")
    kde_dp_log = gaussian_kde(np.log10(dp_ttft), bw_method="scott")
    log_x = np.linspace(np.log10(0.01), np.log10(10.0), 600)
    x_full = 10 ** log_x

    y_kvc = kde_kvc_log(log_x)
    y_dp = kde_dp_log(log_x)

    ax.plot(x_full, y_kvc, color=KVC_COLOR, lw=2.5, label=f"KVC 1P3D v2  (n={len(kvc_ttft)})")
    ax.fill_between(x_full, y_kvc, alpha=0.20, color=KVC_COLOR)
    ax.plot(x_full, y_dp, color=DP_COLOR, lw=2.5, label=f"4-way DP CA  (n={len(dp_ttft)})")
    ax.fill_between(x_full, y_dp, alpha=0.20, color=DP_COLOR)

    ax.set_xscale("log")
    ax.set_xlim(0.01, 10.0)

    # Percentile markers
    for q, ls in [(0.50, "-"), (0.90, "--"), (0.99, ":")]:
        ax.axvline(pct(kvc_ttft, q), color=KVC_COLOR, ls=ls, alpha=0.55, lw=1.1)
        ax.axvline(pct(dp_ttft, q), color=DP_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # Annotate p99 specifically since this is the key reviewer-targeted callout
    ymax = max(y_kvc.max(), y_dp.max())
    kvc_p99 = pct(kvc_ttft, 0.99)
    dp_p99 = pct(dp_ttft, 0.99)
    ax.annotate(f"KVC p99 = {kvc_p99:.2f}s\n(slow-path reseed tail)",
                xy=(kvc_p99, kde_kvc_log(np.log10(kvc_p99))[0]),
                xytext=(2.0, ymax * 0.65),
                fontsize=10, color=KVC_COLOR, fontweight="bold",
                arrowprops=dict(arrowstyle="->", color=KVC_COLOR, lw=1.0))
    ax.annotate(f"DP p99 = {dp_p99*1000:.0f}ms",
                xy=(dp_p99, kde_dp_log(np.log10(dp_p99))[0]),
                xytext=(0.025, ymax * 0.80),
                fontsize=10, color=DP_COLOR, fontweight="bold",
                arrowprops=dict(arrowstyle="->", color=DP_COLOR, lw=1.0))
    # Highlight the KVC bimodal structure
    # NOTE(review): the percentages and the 0.05 s / 2.5 s anchor points below
    # are hard-coded, not derived from the loaded data -- re-verify them
    # whenever the input metrics files change.
    ax.annotate("KVC fast path\n(direct-to-D, 91.6%)",
                xy=(0.05, y_kvc[np.argmin(np.abs(x_full - 0.05))]),
                xytext=(0.012, ymax * 0.45),
                fontsize=9, color=KVC_COLOR, style="italic",
                arrowprops=dict(arrowstyle="->", color=KVC_COLOR, lw=0.7, alpha=0.6))
    ax.annotate("KVC slow path\n(reseed, ~3.4%)",
                xy=(2.5, y_kvc[np.argmin(np.abs(x_full - 2.5))]),
                xytext=(3.0, ymax * 0.30),
                fontsize=9, color=KVC_COLOR, style="italic",
                arrowprops=dict(arrowstyle="->", color=KVC_COLOR, lw=0.7, alpha=0.6))

    # Custom tick labels in seconds (instead of 10^-2, 10^-1, 10^0, 10^1)
    ax.set_xticks([0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0])
    ax.set_xticklabels(["10ms", "50ms", "100ms", "500ms", "1s", "5s", "10s"])

    ax.set_xlabel("TTFT (log scale)", fontsize=11)
    ax.set_ylabel("Density  (per log₁₀ s)", fontsize=11)
    ax.set_title("Full range  (TTFT 10 ms – 10 s, log x)", fontsize=12, pad=10)
    ax.legend(loc="upper left", fontsize=10, framealpha=0.95)
    ax.grid(True, which="both", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)

    fig.suptitle(
        "TTFT probability density: KVC 1P3D v2 vs 4-way DP CA\n"
        "SWE-Bench 50sess trace · ts=1 · 4× H100 80GB · aborted/error requests excluded",
        fontsize=13, y=1.02,
    )
    fig.tight_layout()
    # savefig does not create missing directories; make sure the target exists.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    # ------------------------------------------------------------------
    # Print summary stats for doc cross-reference
    # ------------------------------------------------------------------
    print("\n=== TTFT distribution summary ===")
    for name, arr in [("KVC v2", kvc_ttft), ("DP 4w", dp_ttft)]:
        print(f"  {name}  (n={len(arr)})")
        print(f"    min={arr.min()*1000:.1f}ms  p10={pct(arr,0.10)*1000:.1f}ms  "
              f"p50={pct(arr,0.50)*1000:.1f}ms  p90={pct(arr,0.90)*1000:.1f}ms  "
              f"p99={pct(arr,0.99)*1000:.1f}ms  max={arr.max()*1000:.1f}ms")


# Run the plot generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()