# agentic-kvc/microbench/fresh_setup/analyze_mb1.py

#!/usr/bin/env python3
"""Aggregate MB1 results: per-(D, P) baseline vs during-prefill effective TPOT.

The driver's `tpot_during_prefill_p50_ms` is computed per-token and can be
misleading: chunked-prefill schedules decode alongside each prefill chunk,
so most decode-token intervals during the prefill burst look "normal" — but
each chunk completion creates a long-stall token. p50 hides this, p90
exposes it, but the BEST single-number summary of "how much was decode
slowed by prefill" is the *effective TPOT during the prefill burst*:

    effective_TPOT_during = prefill_ttft_ms / (num_tokens_during_prefill / D)

i.e. wall-clock time divided by per-stream tokens emitted in that window.
This captures the true average throughput of each decode stream while a
prefill burst is underway. Compared to baseline_TPOT it gives the
"phase-interference penalty" PD-disagg could in principle recover.
"""
from __future__ import annotations

import argparse
import csv
import json
import statistics
from collections import defaultdict
from pathlib import Path


def _positive_floats(rows: list[dict], column: str) -> list[float]:
    """Return ``float(row[column])`` for each row, keeping only values > 0.

    Non-positive entries are dropped; presumably the driver writes 0 when it
    has no measurement for that column -- TODO confirm against the driver.
    """
    parsed = (float(row[column]) for row in rows)
    return [value for value in parsed if value > 0]


def _rounded_mean(values: list[float], ndigits: int) -> "float | None":
    """Return the mean of *values* rounded to *ndigits*, or None if empty."""
    return round(statistics.mean(values), ndigits) if values else None


def _summarize_group(batch_size: int, prefill_tokens: int,
                     rows: list[dict]) -> "dict | None":
    """Aggregate all CSV rows sharing one (decode batch size, prefill size).

    Args:
        batch_size: value of the ``decode_batch_size`` column (D).
        prefill_tokens: value of the ``new_prefill_tokens`` column (P).
        rows: the CSV rows (as dicts of strings) belonging to this group.

    Returns:
        The summary record for the group, or None when the group has no row
        with a positive ``num_tokens_during_prefill`` or when D is 0 (the
        per-stream token count would be undefined).

    Raises:
        KeyError: if an expected column is missing from a row.
        ValueError: if a cell cannot be parsed as a float.
    """
    base = statistics.mean(float(row["tpot_baseline_p50_ms"]) for row in rows)
    during_p50 = _positive_floats(rows, "tpot_during_prefill_p50_ms")
    during_p90 = _positive_floats(rows, "tpot_during_prefill_p90_ms")
    ttft_samples = [float(row["prefill_ttft_ms"]) for row in rows]
    token_counts = _positive_floats(rows, "num_tokens_during_prefill")

    if not token_counts or batch_size == 0:
        return None

    # NOTE(review): ttft is averaged over ALL rows while the token count is
    # averaged only over rows with > 0 tokens. If a group mixes both kinds of
    # rows the two means describe different samples; confirm whether
    # zero-token rows should be excluded from (or included in) both.
    ttft = statistics.mean(ttft_samples)
    tokens_total = statistics.mean(token_counts)
    tokens_per_stream = tokens_total / batch_size

    # Effective TPOT = wall-clock length of the prefill window divided by the
    # tokens each decode stream emitted in that window.
    if tokens_per_stream > 0:
        effective_tpot = ttft / tokens_per_stream
    else:
        effective_tpot = 0
    penalty = effective_tpot / base if base > 0 else 0

    # PD-disagg potential benefit (per stream, ms): at the baseline rate the
    # tokens actually emitted would have taken tokens_per_stream * base ms,
    # so the remainder of the prefill window is time lost to interference.
    time_saved_per_stream = ttft - tokens_per_stream * base

    return {
        "decode_batch_size": batch_size,
        "new_prefill_tokens": prefill_tokens,
        "baseline_tpot_ms": round(base, 2),
        "during_tpot_p50_ms_raw": _rounded_mean(during_p50, 2),
        "during_tpot_p90_ms_raw": _rounded_mean(during_p90, 2),
        "prefill_ttft_ms": round(ttft, 1),
        "num_tokens_during_prefill_total": round(tokens_total, 1),
        "per_stream_tokens_during": round(tokens_per_stream, 2),
        "effective_tpot_during_ms": round(effective_tpot, 1),
        "interference_penalty_x": round(penalty, 1),
        "max_pd_disagg_benefit_ms_per_stream": round(time_saved_per_stream, 1),
    }


def main() -> None:
    """Read the MB1 summary CSV, aggregate per (D, P), and emit the results.

    Command-line arguments:
        --summary: path to the input CSV (one row per measurement).
        --out: path of the JSON file to write; parent directories are
            created if missing.

    Writes ``{"summary": [...]}`` to ``--out`` with one record per (D, P)
    group in sorted order, then prints the same records as a table on stdout.
    Groups without any positive ``num_tokens_during_prefill`` (or with D == 0)
    are omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--summary", type=Path, required=True)
    parser.add_argument("--out", type=Path, required=True)
    args = parser.parse_args()

    # Group rows by (decode batch size, prefill size). The file is opened in
    # a context manager so the handle is closed deterministically, and with
    # newline="" as the csv module requires for correct quoted-field parsing.
    by_dp: dict[tuple[int, int], list[dict]] = defaultdict(list)
    with args.summary.open(newline="") as handle:
        for row in csv.DictReader(handle):
            key = (int(row["decode_batch_size"]),
                   int(row["new_prefill_tokens"]))
            by_dp[key].append(row)

    summary = []
    for batch_size, prefill_tokens in sorted(by_dp):
        record = _summarize_group(batch_size, prefill_tokens,
                                  by_dp[(batch_size, prefill_tokens)])
        if record is not None:
            summary.append(record)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text(json.dumps({"summary": summary}, indent=2))

    print(f"{'D':>3} {'P':>7} {'base_ms':>9} {'eff_during_ms':>15} "
          f"{'penalty':>10} {'pd_benefit_ms':>15}")
    for s in summary:
        print(f"{s['decode_batch_size']:>3} {s['new_prefill_tokens']:>7} "
              f"{s['baseline_tpot_ms']:>9.2f} "
              f"{s['effective_tpot_during_ms']:>15.1f} "
              f"{s['interference_penalty_x']:>9.1f}x "
              f"{s['max_pd_disagg_benefit_ms_per_stream']:>15.0f}")
    print(f"\nwrote {args.out}")


# Run the aggregation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()