agentic-kvc/analysis/characterization/elastic_migration_v2/render_figures.py

"""Render PNG figures for the elastic_migration_v2 section.

Inputs in ./data/ :
- b3_policy_comparison.json
- breakdown_unified.json, breakdown_unified_kv_both.json,
  breakdown_unified_v2.json, breakdown_unified_v2_strict.json
- per_worker_<policy>.json for each of the four

Outputs in ./figures/ :
- fig_kv_both_overhead.png      — four-way latency bars (plain vs kv_both vs nixl_both vs v2)
- fig_v2_trigger_funnel.png     — request count per fall-through reason
- fig_v2_predicted_vs_actual.png — cost-model migrate prediction vs realized TTFT
- fig_three_way_hotspot.png      — per-worker TTFT p90 grouped bars
- fig_connector_substrate_attribution.png — stacked per-metric overhead attribution
"""

from __future__ import annotations

import json
from collections import Counter
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# All paths are resolved relative to the directory containing this script.
ROOT = Path(__file__).parent
# Directory the input JSON files are read from.
DATA = ROOT / "data"
# Directory the rendered PNG files are written to.
OUT = ROOT / "figures"
# Created at import time so every savefig() call can assume it exists.
OUT.mkdir(parents=True, exist_ok=True)


def _load(name: str):
    """Parse and return the JSON document stored at ``DATA / name``.

    The file is decoded explicitly as UTF-8 (the default JSON interchange
    encoding) instead of the platform's locale encoding, so the result does
    not depend on the machine the script runs on.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    return json.loads((DATA / name).read_text(encoding="utf-8"))


# Bar colour (hex RGB) for each policy name, so that a given policy is drawn
# in the same colour in every figure that looks it up here.
POLICY_COLORS = {
    "unified":             "#2ca02c",
    "unified_kv_both":     "#9467bd",
    "unified_nixl_both":   "#1f77b4",
    "unified_v2":          "#d62728",
    "unified_v2_strict":   "#ff7f0e",
}


def fig_kv_both_overhead():
    """Render ``fig_kv_both_overhead.png``: one bar panel per headline metric.

    Reads ``b3_policy_comparison.json`` and draws, for each of four metrics,
    one bar per policy. Every bar is annotated with its value; every bar
    except the first (plain ``unified``, the baseline) is additionally
    annotated with its percentage change relative to that baseline.
    """
    rows_by_policy = {
        row["policy"]: row for row in _load("b3_policy_comparison.json")["rows"]
    }
    policies = ["unified", "unified_kv_both", "unified_nixl_both", "unified_v2"]
    # The baseline is shown as "plain"; the others lose their "unified_" prefix.
    tick_names = ["plain"] + [name.replace("unified_", "") for name in policies[1:]]
    bar_colors = [POLICY_COLORS[name] for name in policies]
    # (panel title, key in the comparison row, convert seconds -> milliseconds?)
    panels = [
        ("TTFT p90 (s)", "ttft_p90_s", False),
        ("TPOT p90 (ms)", "tpot_p90_s", True),
        ("E2E p90 (s)", "e2e_p90_s", False),
        ("hotspot index", "hotspot_index_ttft_p90", False),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(15, 4.2))
    for ax, (title, key, to_ms) in zip(axes, panels):
        heights = []
        for name in policies:
            raw = rows_by_policy[name][key]
            heights.append(raw * 1000 if to_ms else raw)

        rects = ax.bar(
            tick_names,
            heights,
            color=bar_colors,
            edgecolor="black",
            linewidth=0.5,
        )
        ax.set_title(title)
        ax.tick_params(axis="x", rotation=15, labelsize=9)

        # Value label sitting on top of each bar.
        for rect, height in zip(rects, heights):
            spec = ".2f" if height < 100 else ".0f"
            ax.text(
                rect.get_x() + rect.get_width() / 2,
                height,
                format(height, spec),
                ha="center",
                va="bottom",
                fontsize=9,
            )
        ax.grid(alpha=0.3, axis="y")

        # Relative change vs. the baseline, drawn at half the bar's height.
        reference = heights[0]
        for slot, height in enumerate(heights[1:], start=1):
            delta_pct = (height - reference) / reference * 100
            tint = "darkred" if delta_pct > 0 else "darkgreen"
            ax.text(
                slot,
                height * 0.5,
                f"{delta_pct:+.0f}%",
                ha="center",
                fontsize=10,
                fontweight="bold",
                color=tint,
            )

    fig.suptitle(
        "Mooncake substrate adds 19-45% across metrics; NIXL is 5-19pp better but\n"
        "still 16-38% above plain. v2's 5 PD-sep events don't recover the substrate tax."
    )
    fig.tight_layout()
    fig.savefig(OUT / "fig_kv_both_overhead.png", dpi=120)
    plt.close(fig)


def _bucket_reasons(data):
    """Count records per funnel bucket.

    Each record in ``data`` is mapped to exactly one bucket name:

    * ``"PD-sep TRIGGERED"`` when its ``v2_pd_sep`` field is literally ``True``;
    * otherwise its ``v2_reason`` with everything from the first ``" ("``
      onwards removed (``"no_v2_reason"`` when the field is missing or falsy);
    * any such reason starting with ``"local_cost"`` is folded into
      ``"cost_benefit not enough margin"``.

    Returns a ``Counter`` mapping bucket name to number of records.
    """

    def bucket_of(record):
        if record.get("v2_pd_sep") is True:
            return "PD-sep TRIGGERED"
        raw = record.get("v2_reason") or "no_v2_reason"
        # Drop the parenthesised detail suffix, keeping only the reason tag.
        tag, _, _ = raw.partition(" (")
        if tag.startswith("local_cost"):
            return "cost_benefit not enough margin"
        return tag

    return Counter(bucket_of(record) for record in data)


def fig_v2_trigger_funnel():
    """Render ``fig_v2_trigger_funnel.png``: request count per funnel bucket.

    Loads the strict and relaxed v2 breakdowns, buckets every record with
    ``_bucket_reasons`` and draws the two runs as side-by-side bars on a log
    y-axis. The legend entry of each run reports how many of its requests
    triggered PD-sep, as a count and as a percentage.

    Differences from a naive rendering:

    * An empty breakdown yields a 0.00% legend entry instead of raising
      ``ZeroDivisionError``.
    * Buckets that are not part of the known fall-through order (for example
      ``"no_v2_reason"``) are still drawn, in sorted order just before the
      triggered bucket, so the bars always account for every request counted
      in the legend's denominator.
    """
    triggered_key = "PD-sep TRIGGERED"
    strict = _bucket_reasons(_load("breakdown_unified_v2_strict.json"))
    relaxed = _bucket_reasons(_load("breakdown_unified_v2.json"))

    # Known fall-through reasons, in the order the figure lists them.
    fall_through = [
        "new_local_below_threshold",
        "chosen_no_active_decode",
        "chosen_few_decodes",
        "src_cache_below_threshold",
        "src_not_meaningfully_more_cache",
        "cost_benefit not enough margin",
    ]
    seen = set(strict) | set(relaxed)
    unexpected = sorted(seen - set(fall_through) - {triggered_key})
    labels = [k for k in (*fall_through, *unexpected, triggered_key) if k in seen]
    strict_vals = [strict.get(k, 0) for k in labels]
    relaxed_vals = [relaxed.get(k, 0) for k in labels]

    def legend_label(tag, buckets):
        # Share of requests that triggered PD-sep; 0% when there are none at all.
        total = sum(buckets.values())
        hits = buckets.get(triggered_key, 0)
        share = hits * 100 / total if total else 0.0
        return f"{tag} (PD-sep={hits}/{total} = {share:.2f}%)"

    positions = list(range(len(labels)))
    width = 0.4
    fig, ax = plt.subplots(figsize=(11, 5))
    ax.bar(
        [i - width / 2 for i in positions],
        strict_vals,
        width,
        label=legend_label("v2.0 strict", strict),
        color="#ff7f0e",
        edgecolor="black",
        linewidth=0.5,
    )
    ax.bar(
        [i + width / 2 for i in positions],
        relaxed_vals,
        width,
        label=legend_label("v2.1 relaxed", relaxed),
        color="#d62728",
        edgecolor="black",
        linewidth=0.5,
    )
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=20, ha="right", fontsize=9)
    ax.set_ylabel("request count")
    ax.set_yscale("log")
    # NOTE: the percentages in this headline are hard-coded for the recorded run.
    ax.set_title(
        "Why v2 rarely PD-seps: 88-76% of requests have new_local < threshold\n"
        "(intra-session cache already hot). Relaxing thresholds barely helps."
    )
    ax.legend()
    ax.grid(alpha=0.3, axis="y", which="both")
    # Count labels just above each non-empty bar (zero cannot be placed on a log axis).
    for i, (s, r) in enumerate(zip(strict_vals, relaxed_vals)):
        if s > 0:
            ax.text(i - width / 2, s * 1.05, str(s), ha="center", fontsize=8)
        if r > 0:
            ax.text(i + width / 2, r * 1.05, str(r), ha="center", fontsize=8)
    fig.tight_layout()
    fig.savefig(OUT / "fig_v2_trigger_funnel.png", dpi=120)
    plt.close(fig)


def fig_v2_predicted_vs_actual():
    """Render ``fig_v2_predicted_vs_actual.png``: predicted vs realized cost.

    For each PD-sep'd request in ``breakdown_unified_v2.json``, plots the
    model-predicted migrate cost (``v2_cost_migrate_s``) against the realized
    TTFT (``t_first_token - t_proxy_recv``) on log-log axes. Points should sit
    near y=x if the model is calibrated; they sit far above it if the
    mechanism is more expensive than modeled.

    Records missing any of the three fields are skipped. If no plottable
    record remains, the function returns without writing a file (previously
    this case raised ``ValueError`` from ``max()`` on an empty list).
    """
    points = []  # (predicted cost, realized TTFT, input length)
    for rec in _load("breakdown_unified_v2.json"):
        if rec.get("v2_pd_sep") is not True:
            continue
        cost = rec.get("v2_cost_migrate_s")
        t_recv = rec.get("t_proxy_recv")
        t_first = rec.get("t_first_token")
        if cost is None or t_recv is None or t_first is None:
            continue
        points.append((cost, t_first - t_recv, rec.get("input_length", 0)))
    if not points:
        return

    predicted, actual, sizes = zip(*points)

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.scatter(
        predicted,
        actual,
        s=[max(100, sz / 100) for sz in sizes],
        color="#d62728",
        edgecolors="black",
        alpha=0.75,
    )
    for p, a, sz in points:
        ax.annotate(
            f"input={sz}",
            (p, a),
            xytext=(8, 6),
            textcoords="offset points",
            fontsize=9,
        )

    # Square axis window. Defaults are 0.5..50; widen it when any point would
    # otherwise fall outside, so that no request is silently clipped. Only
    # positive values can be placed on a log axis, so only those move `lo`.
    positive = [v for v in (*predicted, *actual) if v > 0]
    lo = min(0.5, min(positive) * 0.8) if positive else 0.5
    hi = max(50, max(actual) * 1.2, max(predicted) * 1.2)

    # y=x reference + 10x line + 20x line
    ax.plot([lo, hi], [lo, hi], "k--", alpha=0.5, label="y = x (calibrated)")
    ax.plot([lo, hi], [lo * 10, hi * 10], color="gray", linestyle=":",
            alpha=0.4, label="10x")
    ax.plot([lo, hi], [lo * 20, hi * 20], color="lightgray", linestyle=":",
            alpha=0.4, label="20x")
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    ax.set_xlabel("Cost model: predicted migrate cost (s)")
    ax.set_ylabel("Realized TTFT (s)")
    # NOTE: the request count and ratio in this headline are hard-coded for
    # the recorded run.
    ax.set_title(
        "All 5 PD-sep triggered requests in v2.1 sit far above y=x.\n"
        "Real transfer cost ~10-20x what the calibrated model predicted."
    )
    ax.grid(alpha=0.3, which="both")
    ax.legend(loc="lower right")
    fig.tight_layout()
    fig.savefig(OUT / "fig_v2_predicted_vs_actual.png", dpi=120)
    plt.close(fig)


def fig_three_way_hotspot():
    """Render ``fig_three_way_hotspot.png``: grouped per-worker TTFT p90 bars.

    Loads ``per_worker_<policy>.json`` for four policies and draws one group
    of bars per worker, one bar per policy within each group. The set of
    workers is taken from the plain ``unified`` file; each legend entry
    carries that policy's ``hotspot_index_ttft_p90``.
    """
    policies = ["unified", "unified_kv_both", "unified_nixl_both", "unified_v2"]
    stats = {}
    for policy in policies:
        stats[policy] = _load(f"per_worker_{policy}.json")
    worker_ids = sorted(stats["unified"]["per_worker_ttft_p90_s"])

    slots = list(range(len(worker_ids)))
    group_size = len(policies)
    bar_width = 0.85 / group_size

    fig, ax = plt.subplots(figsize=(12, 5))
    for rank, policy in enumerate(policies):
        p90_by_worker = stats[policy]["per_worker_ttft_p90_s"]
        heights = [p90_by_worker[worker] for worker in worker_ids]
        # Centre the group of bars on the worker's integer tick position.
        shift = (rank - (group_size - 1) / 2) * bar_width
        display = "plain" if policy == "unified" else policy.replace("unified_", "")
        hotspot = stats[policy]["hotspot_index_ttft_p90"]
        ax.bar(
            [slot + shift for slot in slots],
            heights,
            bar_width,
            label=f"{display} (hotspot={hotspot:.2f})",
            color=POLICY_COLORS[policy],
            edgecolor="black",
            linewidth=0.4,
        )

    # Shorten loopback worker URLs down to just ":<port>".
    tick_text = [worker.replace("http://127.0.0.1:", ":") for worker in worker_ids]
    ax.set_xticks(slots)
    ax.set_xticklabels(tick_text, rotation=0, fontsize=9)
    ax.set_ylabel("worker TTFT p90 (s)")
    ax.set_title(
        "Per-worker TTFT p90 distribution across substrates. Mooncake (kv_both)\n"
        "amplifies the hot worker (hotspot 4.36); NIXL keeps it close to plain (3.67)."
    )
    ax.legend(loc="upper left", fontsize=9)
    ax.grid(alpha=0.3, axis="y")
    fig.tight_layout()
    fig.savefig(OUT / "fig_three_way_hotspot.png", dpi=120)
    plt.close(fig)


def fig_connector_substrate_attribution():
    """Render ``fig_connector_substrate_attribution.png``: stacked attribution.

    Decomposes overhead into v1-framework cost (shared by all connectors,
    proxied by NIXL since it's the leanest) and Mooncake-specific cost.

    For each of four metrics from ``b3_policy_comparison.json`` a single
    stacked bar is drawn, built from these segments in order:

    * plain ``unified`` value,
    * ``unified_nixl_both - unified``          (v1 framework),
    * ``unified_kv_both - unified_nixl_both``  (Mooncake extra),
    * ``unified_v2 - unified_kv_both``         (v2 PD-sep branch).

    Every delta is shown in the legend with an explicit sign, so a negative
    delta reads ``-0.12`` rather than ``+-0.12``.
    """
    comp = _load("b3_policy_comparison.json")
    by = {r["policy"]: r for r in comp["rows"]}
    # (panel title, row key, convert seconds -> milliseconds?)
    metrics = [
        ("TTFT p90 (s)", "ttft_p90_s", False),
        ("TPOT p90 (ms)", "tpot_p90_s", True),
        ("E2E p90 (s)", "e2e_p90_s", False),
        ("hotspot index", "hotspot_index_ttft_p90", False),
    ]
    fig, axes = plt.subplots(1, 4, figsize=(15, 4))
    for ax, (label, key, scale_ms) in zip(axes, metrics):
        factor = 1000 if scale_ms else 1
        plain = by["unified"][key] * factor
        nixl = by["unified_nixl_both"][key] * factor
        moon = by["unified_kv_both"][key] * factor
        v2 = by["unified_v2"][key] * factor

        framework_cost = nixl - plain   # what NIXL adds = v1 framework cost
        mooncake_extra = moon - nixl    # extra on top from Mooncake
        v2_branch_extra = v2 - moon     # extra from PD-sep branch (Mooncake + 5 events)

        # (segment height, colour, legend text), stacked bottom-to-top.
        segments = [
            (plain, "#cccccc", f"plain unified ({plain:.2f})"),
            (framework_cost, "#1f77b4", f"v1 framework ({framework_cost:+.2f})"),
            (mooncake_extra, "#9467bd", f"Mooncake extra ({mooncake_extra:+.2f})"),
            (v2_branch_extra, "#d62728", f"v2 PD-sep branch ({v2_branch_extra:+.2f})"),
        ]
        bottom = 0
        for height, color, legend_text in segments:
            ax.bar(["overhead"], [height], bottom=[bottom],
                   color=color, edgecolor="black", linewidth=0.4,
                   label=legend_text)
            bottom += height

        ax.set_title(label)
        ax.legend(fontsize=8, loc="upper right")
        ax.grid(alpha=0.3, axis="y")
        # The single category name carries no information; hide it.
        ax.tick_params(axis="x", labelbottom=False)
    fig.suptitle(
        "Attribution: plain unified vs NIXL substrate vs Mooncake substrate vs v2.\n"
        "Blue: cost shared by any v1 connector. Purple: cost specific to Mooncake."
    )
    fig.tight_layout()
    fig.savefig(OUT / "fig_connector_substrate_attribution.png", dpi=120)
    plt.close(fig)


def main():
    """Run every figure renderer in order, then print the output directory."""
    renderers = (
        fig_kv_both_overhead,
        fig_v2_trigger_funnel,
        fig_v2_predicted_vs_actual,
        fig_three_way_hotspot,
        fig_connector_substrate_attribution,
    )
    for render in renderers:
        render()
    print(f"wrote 5 figures to {OUT}")


# Render all figures only when executed as a script; importing this module
# does not call main().
if __name__ == "__main__":
    main()