#!/usr/bin/env python3
"""Generate TTFT probability density curves: KVC 1P3D v2 vs 4-way DP CA.

Inputs:
  outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl
  outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl

Outputs:
  docs/figures/ttft_pdf_comparison.png -- two-panel figure:
    left panel:  linear x in [0, 0.6] s, zoomed on the body
    right panel: log x covering the full range (0.01 -- 10 s)

Both panels use scipy.stats.gaussian_kde. The left panel fits the raw TTFT
values with a fixed bandwidth factor of 0.15; the right panel fits
log10(TTFT) with Scott's rule.
Failed/aborted requests are excluded (same filter as
metrics.py:_is_failed_request).
"""
from __future__ import annotations

import json
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
from scipy.stats import gaussian_kde

if TYPE_CHECKING:
    from matplotlib.axes import Axes

ROOT = Path(__file__).resolve().parents[2]
KVC = ROOT / "outputs/qwen3-30b-tp1-ts1-migration-v2/kvc_1p3d_migration_v2_run1_metrics.jsonl"
DP = ROOT / "outputs/qwen3-30b-tp1-ts1-validation/dp4_metrics.jsonl"
OUT = ROOT / "docs/figures/ttft_pdf_comparison.png"

KVC_COLOR = "#1F77B4"  # blue
DP_COLOR = "#D62728"  # red

# Samples at or below this value (seconds) are dropped before fitting.
MIN_TTFT_S = 1e-4
# Upper x limit (seconds) of the linear "body" panel.
BODY_XMAX = 0.6
# Bandwidth factor passed to gaussian_kde for the linear panel.
BODY_KDE_BW = 0.15


def load(p: Path) -> list[dict]:
    """Parse a JSON-lines file into a list of records.

    Args:
        p: Path to a file holding one JSON document per line.

    Returns:
        The decoded records in file order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle even when a line fails to parse.
    with p.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]


def is_failed(r: dict) -> bool:
    """Return True when a request record should be excluded as failed.

    A record counts as failed when its "error" field is truthy, or when its
    "finish_reason" contains "abort" or "badrequest" (case-insensitive).
    """
    if r.get("error"):
        return True
    fr = r.get("finish_reason")
    if not fr:
        return False
    reason = str(fr).lower()
    return "abort" in reason or "badrequest" in reason


def pct(vals: np.ndarray, q: float) -> float:
    """Return the quantile ``q`` (a fraction in [0, 1]) of ``vals`` as a float."""
    return float(np.quantile(vals, q))


def _load_ttft(path: Path) -> np.ndarray:
    """Load the usable TTFT samples (seconds) from one metrics file.

    Failed records and records without "ttft_s" are dropped, then values at
    or below MIN_TTFT_S are trimmed (rare measurement artifacts) so that the
    log-space KDE behaves.

    Raises:
        ValueError: If fewer than two samples remain, since a KDE cannot be
            fitted to them.
    """
    ttft = np.array(
        [
            r["ttft_s"]
            for r in load(path)
            if not is_failed(r) and r.get("ttft_s") is not None
        ],
        dtype=float,
    )
    ttft = ttft[ttft > MIN_TTFT_S]
    if ttft.size < 2:
        raise ValueError(
            f"need at least 2 usable TTFT samples to fit a KDE, got {ttft.size} from {path}"
        )
    return ttft


def _plot_body_panel(ax: Axes, kvc_ttft: np.ndarray, dp_ttft: np.ndarray) -> None:
    """Draw the linear-x panel covering the body of both distributions."""
    x_body = np.linspace(0.0, BODY_XMAX, 600)

    # The KDE is fitted on *all* samples; only the evaluation grid is limited
    # to the body, so each curve integrates to the share of mass inside the
    # window rather than to 1.
    for values, color, label in [
        (kvc_ttft, KVC_COLOR, "KVC 1P3D v2"),
        (dp_ttft, DP_COLOR, "4-way DP CA"),
    ]:
        density = gaussian_kde(values, bw_method=BODY_KDE_BW)(x_body)
        ax.plot(x_body, density, color=color, lw=2.5,
                label=f"{label} (n={len(values)})")
        ax.fill_between(x_body, density, alpha=0.20, color=color)

    # Vertical lines for p50, p90
    for q, ls in [(0.50, "-"), (0.90, "--")]:
        ax.axvline(pct(kvc_ttft, q), color=KVC_COLOR, ls=ls, alpha=0.55, lw=1.1)
        ax.axvline(pct(dp_ttft, q), color=DP_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # Labels sit at staggered heights so neighbouring percentiles do not overlap.
    ymax = ax.get_ylim()[1]
    for values, q, height, tag, color in [
        (kvc_ttft, 0.50, 0.97, "KVC p50", KVC_COLOR),
        (dp_ttft, 0.50, 0.50, "DP p50", DP_COLOR),
        (kvc_ttft, 0.90, 0.30, "KVC p90", KVC_COLOR),
        (dp_ttft, 0.90, 0.18, "DP p90", DP_COLOR),
    ]:
        x = pct(values, q)
        ax.text(x, ymax * height, f"{tag}\n{x * 1000:.0f}ms",
                color=color, fontsize=9, va="top", ha="left",
                bbox=dict(facecolor="white", edgecolor="none", alpha=0.7, pad=2))

    ax.set_xlim(0, BODY_XMAX)
    ax.set_xlabel("TTFT (seconds, linear)", fontsize=11)
    ax.set_ylabel("Probability density", fontsize=11)
    ax.set_title(f"Body of distribution (TTFT ≤ {BODY_XMAX:g} s)", fontsize=12, pad=10)
    ax.legend(loc="upper right", fontsize=10, framealpha=0.95)
    ax.grid(True, linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)


def _plot_full_range_panel(ax: Axes, kvc_ttft: np.ndarray, dp_ttft: np.ndarray) -> None:
    """Draw the log-x panel covering 10 ms to 10 s, including the tail.

    The KDE is fitted on log10(ttft), so each curve is a density per unit of
    log10(seconds) and integrates to 1 over log space.
    """
    kde_kvc_log = gaussian_kde(np.log10(kvc_ttft), bw_method="scott")
    kde_dp_log = gaussian_kde(np.log10(dp_ttft), bw_method="scott")

    log_x = np.linspace(np.log10(0.01), np.log10(10.0), 600)
    x_full = 10 ** log_x
    y_kvc = kde_kvc_log(log_x)
    y_dp = kde_dp_log(log_x)

    ax.plot(x_full, y_kvc, color=KVC_COLOR, lw=2.5,
            label=f"KVC 1P3D v2 (n={len(kvc_ttft)})")
    ax.fill_between(x_full, y_kvc, alpha=0.20, color=KVC_COLOR)
    ax.plot(x_full, y_dp, color=DP_COLOR, lw=2.5,
            label=f"4-way DP CA (n={len(dp_ttft)})")
    ax.fill_between(x_full, y_dp, alpha=0.20, color=DP_COLOR)

    ax.set_xscale("log")
    ax.set_xlim(0.01, 10.0)

    # Percentile markers
    for q, ls in [(0.50, "-"), (0.90, "--"), (0.99, ":")]:
        ax.axvline(pct(kvc_ttft, q), color=KVC_COLOR, ls=ls, alpha=0.55, lw=1.1)
        ax.axvline(pct(dp_ttft, q), color=DP_COLOR, ls=ls, alpha=0.55, lw=1.1)

    # Annotate p99 specifically since this is the key reviewer-targeted callout
    ymax = max(y_kvc.max(), y_dp.max())
    kvc_p99 = pct(kvc_ttft, 0.99)
    dp_p99 = pct(dp_ttft, 0.99)
    ax.annotate(f"KVC p99 = {kvc_p99:.2f}s\n(slow-path reseed tail)",
                xy=(kvc_p99, kde_kvc_log(np.log10(kvc_p99))[0]),
                xytext=(2.0, ymax * 0.65),
                fontsize=10, color=KVC_COLOR, fontweight="bold",
                arrowprops=dict(arrowstyle="->", color=KVC_COLOR, lw=1.0))
    ax.annotate(f"DP p99 = {dp_p99*1000:.0f}ms",
                xy=(dp_p99, kde_dp_log(np.log10(dp_p99))[0]),
                xytext=(0.025, ymax * 0.80),
                fontsize=10, color=DP_COLOR, fontweight="bold",
                arrowprops=dict(arrowstyle="->", color=DP_COLOR, lw=1.0))

    # Highlight the KVC bimodal structure.
    # NOTE(review): the anchor positions (0.05 s, 2.5 s) and the percentages in
    # these labels are hard-coded, not derived from the loaded data -- confirm
    # them whenever the input run changes.
    ax.annotate("KVC fast path\n(direct-to-D, 91.6%)",
                xy=(0.05, y_kvc[np.argmin(np.abs(x_full - 0.05))]),
                xytext=(0.012, ymax * 0.45),
                fontsize=9, color=KVC_COLOR, style="italic",
                arrowprops=dict(arrowstyle="->", color=KVC_COLOR, lw=0.7, alpha=0.6))
    ax.annotate("KVC slow path\n(reseed, ~3.4%)",
                xy=(2.5, y_kvc[np.argmin(np.abs(x_full - 2.5))]),
                xytext=(3.0, ymax * 0.30),
                fontsize=9, color=KVC_COLOR, style="italic",
                arrowprops=dict(arrowstyle="->", color=KVC_COLOR, lw=0.7, alpha=0.6))

    # Custom tick labels in seconds (instead of 10^-2, 10^-1, 10^0, 10^1)
    ax.set_xticks([0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0])
    ax.set_xticklabels(["10ms", "50ms", "100ms", "500ms", "1s", "5s", "10s"])
    ax.set_xlabel("TTFT (log scale)", fontsize=11)
    ax.set_ylabel("Density (per log₁₀ s)", fontsize=11)
    ax.set_title("Full range (TTFT 10 ms – 10 s, log x)", fontsize=12, pad=10)
    ax.legend(loc="upper left", fontsize=10, framealpha=0.95)
    ax.grid(True, which="both", linestyle=":", alpha=0.4)
    ax.set_axisbelow(True)


def _print_summary(kvc_ttft: np.ndarray, dp_ttft: np.ndarray) -> None:
    """Print min / p10 / p50 / p90 / p99 / max per series for doc cross-reference."""
    print("\n=== TTFT distribution summary ===")
    for name, arr in [("KVC v2", kvc_ttft), ("DP 4w", dp_ttft)]:
        print(f" {name} (n={len(arr)})")
        print(f" min={arr.min()*1000:.1f}ms p10={pct(arr,0.10)*1000:.1f}ms "
              f"p50={pct(arr,0.50)*1000:.1f}ms p90={pct(arr,0.90)*1000:.1f}ms "
              f"p99={pct(arr,0.99)*1000:.1f}ms max={arr.max()*1000:.1f}ms")


def main() -> None:
    """Render the two-panel TTFT density figure to OUT and print summary stats.

    Raises:
        ValueError: If either input file yields fewer than two usable samples.
    """
    # Imported here because main() is the only user; the helpers above stay
    # importable on machines without matplotlib.
    import matplotlib.pyplot as plt

    kvc_ttft = _load_ttft(KVC)
    dp_ttft = _load_ttft(DP)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6.5))
    _plot_body_panel(axes[0], kvc_ttft, dp_ttft)
    _plot_full_range_panel(axes[1], kvc_ttft, dp_ttft)

    fig.suptitle(
        "TTFT probability density: KVC 1P3D v2 vs 4-way DP CA\n"
        "SWE-Bench 50sess trace · ts=1 · 4× H100 80GB · aborted/error requests excluded",
        fontsize=13, y=1.02,
    )
    fig.tight_layout()
    # savefig does not create missing directories.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(OUT, dpi=150, bbox_inches="tight")
    print(f"wrote {OUT}")
    plt.close(fig)

    _print_summary(kvc_ttft, dp_ttft)


if __name__ == "__main__":
    main()