# agentic-kvc/microbench/fresh_setup/plot_mb1.py

#!/usr/bin/env python3
"""Plot MB1 phase-interference data.

Single output: figs/mb1_interference.png — effective per-stream TPOT
during a prefill burst, vs prefill size, one line per concurrent decode
batch size D.

Earlier versions of this script also produced figs/pd_cost_vs_benefit.png
which composed a "max PD-disagg benefit = decode duration (50–200 ms)
band" against the MB2 transfer-cost curve. That accounting was wrong
(see RESULTS_SUMMARY.md §4 correction): phase-isolation benefit is
per-prefill-event, equal to D × T_prefill across stalled streams, not
capped by a single request's decode duration. That figure has been
removed; the math it implied was structurally backwards. The dominant
reason static PD-disagg fails in agentic workloads is **D-side KV capacity**
(see figs/f4b_pdsep_kv_wall.png), not cost-vs-benefit on phase isolation.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt


def _line_colors(batch_sizes: list[int]) -> dict[int, str]:
    """Map every decode batch size to its own line colour."""
    # Colours already used for the batch sizes the figure was first drawn with.
    colors = {1: "#1f77b4", 4: "#ff7f0e", 8: "#d62728"}
    # Remaining tab10 hues: any other batch size gets one of these in turn, so
    # unexpected values stay distinguishable instead of all collapsing to grey.
    spare = ["#2ca02c", "#9467bd", "#8c564b", "#e377c2",
             "#7f7f7f", "#bcbd22", "#17becf"]
    unknown = [d for d in batch_sizes if d not in colors]
    for idx, d in enumerate(unknown):
        colors[d] = spare[idx % len(spare)]
    return colors


def main() -> None:
    """Render the MB1 interference figure from a benchmark JSON file.

    Command-line arguments:
        --mb1  Path to the MB1 result JSON (required). Its top-level
               "summary" entry is iterated as a sequence of rows, each
               providing "decode_batch_size", "new_prefill_tokens",
               "effective_tpot_during_ms" and "baseline_tpot_ms".
        --out  Destination image path (default figs/mb1_interference.png);
               missing parent directories are created.

    One line is drawn per distinct "decode_batch_size", with rows ordered
    by "new_prefill_tokens"; both axes are logarithmic.

    Raises:
        SystemExit: if "summary" holds no rows, because the resulting
            figure would be blank.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mb1", type=Path, required=True)
    parser.add_argument("--out", type=Path, default=Path("figs/mb1_interference.png"))
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the locale default.
    summary = json.loads(args.mb1.read_text(encoding="utf-8"))["summary"]
    if not summary:
        # Fail loudly rather than silently writing an empty plot.
        raise SystemExit(f"{args.mb1}: 'summary' has no rows, nothing to plot")

    # Bucket the rows per decode batch size in a single pass.
    by_batch = {}
    for row in summary:
        by_batch.setdefault(row["decode_batch_size"], []).append(row)
    batch_sizes = sorted(by_batch)
    colors = _line_colors(batch_sizes)

    fig, ax = plt.subplots(figsize=(9, 5.5))
    for D in batch_sizes:
        rows = sorted(by_batch[D], key=lambda s: s["new_prefill_tokens"])
        xs = [s["new_prefill_tokens"] for s in rows]
        ys = [s["effective_tpot_during_ms"] for s in rows]
        # The legend quotes the baseline TPOT of the smallest prefill row.
        ax.plot(xs, ys, "o-", lw=2, markersize=7,
                color=colors[D],
                label=f"D={D} (baseline TPOT {rows[0]['baseline_tpot_ms']:.1f} ms)")

    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlabel("Prefill burst size (tokens, log)")
    ax.set_ylabel("Per-stream effective TPOT during prefill burst (ms, log)")
    ax.set_title("MB1: each ongoing decode is essentially halted while prefill runs\n"
                 "(chunked-prefill ON, vLLM 0.18.1 default, single H20). "
                 "Per-prefill aggregate stall = D × T_prefill.")
    ax.grid(True, which="both", alpha=0.3)
    ax.legend(loc="upper left", fontsize=9)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    fig.savefig(args.out, dpi=150)
    plt.close(fig)
    print(f"wrote {args.out}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()