#!/usr/bin/env python3
"""Build Frontier-vs-vLLM alignment tables and plots for the current H20 runs."""

from __future__ import annotations

import csv
import json
import math
import shlex
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import matplotlib

# Select the non-interactive backend before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt  # noqa: E402

ROOT = Path(__file__).resolve().parents[1]
OUT_DIR = ROOT / "docs" / "assets" / "frontier_vllm_alignment"
DASH1_VLLM_ROOT = Path("/home/admin/cpfs/wjh/replayserve/runs/vllm_gpu_smoke_20260625_dash1")


@dataclass(frozen=True)
class RunSpec:
    """Static description of one Frontier/vLLM run pair to be compared."""

    run_id: str  # stable identifier written to the CSV/JSON outputs
    label: str  # human-readable tick label used in the plots
    tp: int  # tensor-parallel size
    request_count: int
    scale_label: str  # scale as shown in labels and used for grouping
    scale_value: float  # numeric form of the scale
    fixture: str
    frontier_summary: str  # postprocess summary path, relative to ROOT
    vllm_summary: str  # summary path: relative to ROOT, or remote path if vllm_remote
    vllm_preemptions: int  # recorded by hand here; not read from the vLLM summary
    kv_blocks: int
    notes: str = ""
    vllm_remote: bool = False  # True when vllm_summary lives on the "dash1" host


RUNS: list[RunSpec] = [
    RunSpec(
        run_id="tp1_n100_scale1",
        label="TP1 N100 raw",
        tp=1,
        request_count=100,
        scale_label="raw",
        scale_value=1.0,
        fixture="coder_100",
        frontier_summary=(
            "runs/rs6_frontier_h20_tp1_profile_full32k_20260624/"
            "frontier_h20_tp1_profile_full32k/coder_100/"
            "vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260624/tp1_coder100_uncapped/summary.json",
        vllm_preemptions=8,
        kv_blocks=15281,
        notes="Frontier incomplete before lifecycle fix; included as TP1 100-request baseline.",
    ),
    RunSpec(
        run_id="tp1_n500_scale1",
        label="TP1 N500 raw",
        tp=1,
        request_count=500,
        scale_label="raw",
        scale_value=1.0,
        fixture="coder_500",
        frontier_summary=(
            "runs/rs8_frontier_h20_tp1_profile_full32k_coder500_20260625/"
            "frontier_h20_tp1_profile_full32k/coder_500/"
            "vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder500_uncapped/summary.json",
        vllm_preemptions=63,
        kv_blocks=15281,
        notes="Frontier incomplete; useful as high-pressure stress signal.",
    ),
    RunSpec(
        run_id="tp1_n200_scale0667",
        label="TP1 N200 scale 0.667",
        tp=1,
        request_count=200,
        scale_label="0.667",
        scale_value=2 / 3,
        fixture="coder_200_ts0667",
        frontier_summary=(
            "runs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667/"
            "frontier_h20_tp1_profile_full32k/coder_200_ts0667/"
            "vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder200_ts0667_uncapped/summary.json",
        vllm_preemptions=26,
        kv_blocks=15281,
        notes="Dense-arrival run; Frontier incomplete before lifecycle fix.",
    ),
    RunSpec(
        run_id="tp1_n200_scale2",
        label="TP1 N200 scale 2",
        tp=1,
        request_count=200,
        scale_label="2",
        scale_value=2.0,
        fixture="coder_200_ts2",
        frontier_summary=(
            "runs/rs10_preemption_replay_fix_ts2/frontier_h20_tp1_profile_full32k/"
            "coder_200_ts2/vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts2_uncapped/summary.json",
        vllm_preemptions=43,
        kv_blocks=15281,
        notes="After Frontier decode-preemption lifecycle fix.",
    ),
    RunSpec(
        run_id="tp1_n200_scale3",
        label="TP1 N200 scale 3",
        tp=1,
        request_count=200,
        scale_label="3",
        scale_value=3.0,
        fixture="coder_200_ts3",
        frontier_summary=(
            "runs/rs10_preemption_replay_fix_ts3/frontier_h20_tp1_profile_full32k/"
            "coder_200_ts3/vllm_kv_15281_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts3_uncapped/summary.json",
        vllm_preemptions=16,
        kv_blocks=15281,
        notes="After Frontier decode-preemption lifecycle fix.",
    ),
    RunSpec(
        run_id="tp2_n200_scale2",
        label="TP2 N200 scale 2",
        tp=2,
        request_count=200,
        scale_label="2",
        scale_value=2.0,
        fixture="coder_200_ts2",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
            "tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts2_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=69055,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
    RunSpec(
        run_id="tp2_n200_scale3",
        label="TP2 N200 scale 3",
        tp=2,
        request_count=200,
        scale_label="3",
        scale_value=3.0,
        fixture="coder_200_ts3",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
            "tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts3_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=69055,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
    RunSpec(
        run_id="tp4_n200_scale2",
        label="TP4 N200 scale 2",
        tp=4,
        request_count=200,
        scale_label="2",
        scale_value=2.0,
        fixture="coder_200_ts2",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
            "tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts2_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=177077,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
    RunSpec(
        run_id="tp4_n200_scale3",
        label="TP4 N200 scale 3",
        tp=4,
        request_count=200,
        scale_label="3",
        scale_value=3.0,
        fixture="coder_200_ts3",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
            "tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts3_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=177077,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    ),
]

# Column order of the CSV output; every key is produced by summarize().
FIELDNAMES = [
    "run_id",
    "label",
    "tp",
    "request_count",
    "scale_label",
    "scale_value",
    "fixture",
    "kv_blocks",
    "frontier_completed",
    "frontier_total",
    "frontier_complete",
    "vllm_completed",
    "vllm_total",
    "frontier_preemptions",
    "vllm_preemptions",
    "frontier_prefix_hit",
    "vllm_prefix_hit",
    "prefix_hit_delta",
    "frontier_rps",
    "vllm_rps",
    "rps_ratio",
    "frontier_total_tps",
    "vllm_total_tps",
    "total_tps_ratio",
    "frontier_decode_tps",
    "vllm_decode_tps",
    "decode_tps_ratio",
    "frontier_ttft_p50_s",
    "vllm_ttft_p50_s",
    "ttft_p50_ratio",
    "frontier_ttft_p95_s",
    "vllm_ttft_p95_s",
    "ttft_p95_ratio",
    "frontier_tpot_p50_s",
    "vllm_tpot_p50_s",
    "tpot_p50_ratio",
    "frontier_tpot_p95_s",
    "vllm_tpot_p95_s",
    "tpot_p95_ratio",
    "frontier_e2e_p50_s",
    "vllm_e2e_p50_s",
    "e2e_p50_ratio",
    "frontier_e2e_p95_s",
    "vllm_e2e_p95_s",
    "e2e_p95_ratio",
    "notes",
]


def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return it.

    Raises:
        ValueError: if the top-level JSON value is not an object.
    """
    with path.open("r", encoding="utf-8") as handle:
        data = json.load(handle)
    if not isinstance(data, dict):
        raise ValueError(f"{path}: expected JSON object")
    return data


def load_vllm_summary(spec: RunSpec) -> dict[str, Any]:
    """Load the vLLM summary for *spec*.

    Non-remote specs are read relative to ROOT. Remote specs first try a local
    mirror under ``ROOT/runs/<DASH1_VLLM_ROOT.name>/`` and otherwise fetch the
    file from the ``dash1`` host with ``ssh ... cat``.

    Raises:
        ValueError: if the fetched JSON is not an object.
        subprocess.CalledProcessError: if the ssh command fails.
    """
    path = Path(spec.vllm_summary)
    if not spec.vllm_remote:
        return load_json(ROOT / path)

    local_candidate = ROOT / "runs" / DASH1_VLLM_ROOT.name / path.parent.name / path.name
    if local_candidate.exists():
        return load_json(local_candidate)

    # ssh hands the command to the remote shell as one string, so the path has
    # to be quoted to survive spaces or shell metacharacters.
    remote_command = f"cat {shlex.quote(spec.vllm_summary)}"
    raw = subprocess.check_output(["ssh", "dash1", remote_command], text=True)
    data = json.loads(raw)
    if not isinstance(data, dict):
        raise ValueError(f"{spec.vllm_summary}: expected JSON object")
    return data


def load_frontier_summary(spec: RunSpec) -> tuple[dict[str, Any], dict[str, Any]]:
    """Return ``(postprocess_summary, system_metrics)`` for *spec*.

    The system-metrics file is located through the ``"system_metrics"`` entry
    of the postprocess summary; a relative entry is resolved against ROOT.
    """
    post = load_json(ROOT / spec.frontier_summary)
    system_path = Path(post["system_metrics"])
    if not system_path.is_absolute():
        system_path = ROOT / system_path
    return post, load_json(system_path)


def ratio(numerator: float | int | None, denominator: float | int | None) -> float | None:
    """Return ``numerator / denominator``, or None if it cannot be computed.

    None is returned when either operand is None or the denominator is zero.
    """
    if numerator is None or denominator in (None, 0):
        return None
    return float(numerator) / float(denominator)


def nested(data: dict[str, Any], *keys: str) -> Any:
    """Walk *keys* through nested dicts; return None as soon as a level is missing."""
    value: Any = data
    for key in keys:
        if not isinstance(value, dict):
            return None
        value = value.get(key)
    return value


def _ms_to_s(value: float | int | None) -> float | None:
    """Divide *value* by 1000, passing None through instead of raising."""
    if value is None:
        return None
    return float(value) / 1000


def _plot_value(value: float | int | None) -> float:
    """Convert a possibly-missing metric to a float, using NaN for None."""
    return float("nan") if value is None else float(value)


def summarize(spec: RunSpec) -> dict[str, Any]:
    """Build one comparison row (keys as in FIELDNAMES) for *spec*.

    Metrics missing from either summary become None rather than raising, so a
    single incomplete run does not abort the whole report.
    """
    post, system = load_frontier_summary(spec)
    vllm = load_vllm_summary(spec)

    # "or {}" also covers sections that are present but null in the JSON.
    completion = post.get("completion") or {}
    preemption = post.get("preemption_statistics") or {}
    prefix = post.get("prefix_cache_postprocess") or {}
    token_weighted = prefix.get("replayserve_token_weighted") or {}
    throughput = system.get("throughput_metrics") or {}

    frontier_total_tps = throughput.get("tokens_per_second")
    prompt_tps = vllm.get("prompt_tokens_per_second")
    generated_tps = vllm.get("generated_tokens_per_second")
    vllm_total_tps = (
        prompt_tps + generated_tps
        if prompt_tps is not None and generated_tps is not None
        else None
    )

    frontier_prefix_hit = token_weighted.get("hit_ratio")
    vllm_prefix_hit = nested(vllm, "estimated_prefix_reuse", "token_hit_ratio")
    prefix_hit_delta = (
        float(frontier_prefix_hit) - float(vllm_prefix_hit)
        if frontier_prefix_hit is not None and vllm_prefix_hit is not None
        else None
    )

    row: dict[str, Any] = {
        "run_id": spec.run_id,
        "label": spec.label,
        "tp": spec.tp,
        "request_count": spec.request_count,
        "scale_label": spec.scale_label,
        "scale_value": spec.scale_value,
        "fixture": spec.fixture,
        "kv_blocks": spec.kv_blocks,
        "frontier_completed": completion.get("completed_requests"),
        "frontier_total": completion.get("total_requests"),
        "frontier_complete": completion.get("is_complete"),
        # NOTE(review): both vLLM counts come from the same "rows" field, so
        # they are always equal — confirm that is the intended meaning.
        "vllm_completed": vllm.get("rows"),
        "vllm_total": vllm.get("rows"),
        "frontier_preemptions": preemption.get("total_preemption_events"),
        "vllm_preemptions": spec.vllm_preemptions,
        "frontier_prefix_hit": frontier_prefix_hit,
        "vllm_prefix_hit": vllm_prefix_hit,
        "prefix_hit_delta": prefix_hit_delta,
        "frontier_rps": throughput.get("requests_per_second"),
        "vllm_rps": vllm.get("requests_per_second"),
        "frontier_total_tps": frontier_total_tps,
        "vllm_total_tps": vllm_total_tps,
        "frontier_decode_tps": throughput.get("decode_tokens_per_second"),
        "vllm_decode_tps": vllm.get("generated_tokens_per_second"),
    }

    # Frontier statistics are divided by 1000 to land in the *_s columns
    # (presumably milliseconds -> seconds; confirm against the Frontier
    # output schema). vLLM values are read from its *_s sections unchanged.
    latency_sources = [
        ("ttft", "ttft_statistics", "ttft_s"),
        ("tpot", "tpot_statistics", "tpot_s"),
        ("e2e", "request_e2e_time_statistics", "e2e_s"),
    ]
    for metric, frontier_key, vllm_key in latency_sources:
        for pct in ("p50", "p95"):
            row[f"frontier_{metric}_{pct}_s"] = _ms_to_s(nested(system, frontier_key, pct))
            row[f"vllm_{metric}_{pct}_s"] = nested(vllm, vllm_key, pct)

    row["notes"] = spec.notes

    for name in [
        "rps",
        "total_tps",
        "decode_tps",
        "ttft_p50_s",
        "ttft_p95_s",
        "tpot_p50_s",
        "tpot_p95_s",
        "e2e_p50_s",
        "e2e_p95_s",
    ]:
        row[f"{name.removesuffix('_s')}_ratio"] = ratio(
            row.get(f"frontier_{name}"), row.get(f"vllm_{name}")
        )
    return row


def fmt(value: Any) -> str:
    """Render *value* for CSV: "" for None, true/false for bools, 10 significant digits for floats."""
    if value is None:
        return ""
    # bool is tested before float/str so it is rendered in lowercase.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, float):
        return f"{value:.10g}"
    return str(value)


def write_csv(rows: list[dict[str, Any]]) -> None:
    """Write *rows* to ``frontier_vllm_alignment.csv`` in OUT_DIR, columns per FIELDNAMES."""
    path = OUT_DIR / "frontier_vllm_alignment.csv"
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=FIELDNAMES)
        writer.writeheader()
        for row in rows:
            writer.writerow({key: fmt(row.get(key)) for key in FIELDNAMES})


def write_json(rows: list[dict[str, Any]]) -> None:
    """Write *rows* to ``frontier_vllm_alignment.json`` in OUT_DIR (indented, sorted keys)."""
    path = OUT_DIR / "frontier_vllm_alignment.json"
    with path.open("w", encoding="utf-8") as handle:
        json.dump(rows, handle, indent=2, sort_keys=True)
        handle.write("\n")


def setup_axis(ax: plt.Axes, title: str, ylabel: str) -> None:
    """Apply the shared title, y-label, grid and spine styling to *ax*."""
    ax.set_title(title, fontsize=12, pad=10)
    ax.set_ylabel(ylabel)
    ax.grid(axis="y", alpha=0.25)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)


def annotate_bars(ax: plt.Axes, bars: Any, fmt_text: str = "{:.2f}") -> None:
    """Label each bar in *bars* with its height; bars with NaN height are skipped."""
    for bar in bars:
        height = bar.get_height()
        if math.isnan(height):
            continue
        ax.annotate(
            fmt_text.format(height),
            xy=(bar.get_x() + bar.get_width() / 2, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=7,
            # Tall bars get vertical labels.
            rotation=90 if height > 2.5 else 0,
        )


def savefig(name: str) -> None:
    """Tighten the layout, save the current pyplot figure as OUT_DIR/name, then close it."""
    plt.tight_layout()
    plt.savefig(OUT_DIR / name, dpi=180)
    plt.close()


def plot_throughput_ratio(rows: list[dict[str, Any]]) -> None:
    """Bar chart of Frontier/vLLM total tok/s; incomplete Frontier runs are hatched."""
    labels = [row["label"] for row in rows]
    x = list(range(len(rows)))
    colors = {1: "#4C78A8", 2: "#F58518", 4: "#54A24B"}
    fallback_color = "#9D9D9D"  # for a TP size without a dedicated colour

    _, ax = plt.subplots(figsize=(12, 4.8))
    bars = ax.bar(
        x,
        # Missing ratios become NaN so matplotlib draws no bar for them.
        [_plot_value(row["total_tps_ratio"]) for row in rows],
        color=[colors.get(row["tp"], fallback_color) for row in rows],
        alpha=0.9,
    )
    for bar, row in zip(bars, rows, strict=True):
        if not row["frontier_complete"]:
            bar.set_hatch("//")
            bar.set_alpha(0.65)
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=35, ha="right")
    setup_axis(ax, "Frontier Throughput Relative to vLLM", "Frontier / vLLM total tok/s")
    annotate_bars(ax, bars)
    savefig("throughput_ratio.png")


def plot_latency_ratios(rows: list[dict[str, Any]]) -> None:
    """Grouped bar chart of the TTFT p95, TPOT p50 and E2E p95 Frontier/vLLM ratios."""
    labels = [row["label"] for row in rows]
    x = list(range(len(rows)))
    width = 0.26

    _, ax = plt.subplots(figsize=(13, 5.2))
    series = [
        (-width, "ttft_p95_ratio", "TTFT p95"),
        (0.0, "tpot_p50_ratio", "TPOT p50"),
        (width, "e2e_p95_ratio", "E2E p95"),
    ]
    bar_groups = [
        ax.bar(
            [i + offset for i in x],
            [_plot_value(row[key]) for row in rows],
            width,
            label=legend_label,
        )
        for offset, key, legend_label in series
    ]
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=35, ha="right")
    ax.legend(frameon=False, ncols=3, loc="upper left")
    setup_axis(ax, "Latency Ratios", "Frontier / vLLM")
    for group in bar_groups:
        annotate_bars(ax, group)
    savefig("latency_ratios.png")


def plot_tp_scaling(rows: list[dict[str, Any]]) -> None:
    """Plot total tok/s against TP size for the N=200 runs at scales "2" and "3"."""
    groups: dict[str, dict[int, dict[str, Any]]] = {}
    for row in rows:
        if row["request_count"] == 200 and row["scale_label"] in {"2", "3"}:
            groups.setdefault(row["scale_label"], {})[row["tp"]] = row

    _, axes = plt.subplots(1, 2, figsize=(11, 4.2), sharey=False)
    for ax, scale in zip(axes, ["2", "3"], strict=True):
        # A scale with no rows yields an empty panel instead of a KeyError.
        group = groups.get(scale, {})
        tps = sorted(group)
        ax.plot(
            tps,
            [_plot_value(group[tp]["frontier_total_tps"]) for tp in tps],
            marker="o",
            label="Frontier",
        )
        ax.plot(
            tps,
            [_plot_value(group[tp]["vllm_total_tps"]) for tp in tps],
            marker="o",
            label="vLLM",
        )
        ax.set_xticks(tps)
        ax.set_xlabel("Tensor parallel size")
        setup_axis(ax, f"N=200, timestamp scale {scale}", "total tok/s")
        ax.legend(frameon=False)
    savefig("tp_scaling_total_tps.png")


def plot_completion_prefix(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier completion fraction (bars) with both prefix-hit ratios (lines)."""
    labels = [row["label"] for row in rows]
    x = list(range(len(rows)))

    _, ax1 = plt.subplots(figsize=(12, 4.8))
    # ratio() returns None for missing counts or a zero total; shown as NaN.
    completion = [
        _plot_value(ratio(row["frontier_completed"], row["frontier_total"])) for row in rows
    ]
    bars = ax1.bar(x, completion, color="#72B7B2", alpha=0.8, label="Frontier completion")
    ax1.set_ylim(0, 1.08)
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=35, ha="right")
    setup_axis(ax1, "Completion and Prefix Reuse", "Frontier completed / total")

    ax2 = ax1.twinx()
    ax2.plot(
        x,
        [_plot_value(row["frontier_prefix_hit"]) for row in rows],
        color="#E45756",
        marker="o",
        label="Frontier prefix hit",
    )
    ax2.plot(
        x,
        [_plot_value(row["vllm_prefix_hit"]) for row in rows],
        color="#4C78A8",
        marker="x",
        linestyle="--",
        label="vLLM trace-side prefix hit",
    )
    ax2.set_ylabel("prefix token hit ratio")
    ax2.set_ylim(0, 0.45)

    # One combined legend on the primary axis for both axes' artists.
    lines, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(
        [bars, *lines],
        ["Frontier completion", *labels2],
        frameon=False,
        loc="upper left",
        ncols=2,
    )
    savefig("completion_prefix.png")


def main() -> None:
    """Summarize every run in RUNS, then write the CSV, JSON and all plots to OUT_DIR."""
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    rows = [summarize(spec) for spec in RUNS]
    write_csv(rows)
    write_json(rows)
    plot_throughput_ratio(rows)
    plot_latency_ratios(rows)
    plot_tp_scaling(rows)
    plot_completion_prefix(rows)
    print(f"Wrote {len(rows)} rows to {OUT_DIR}")


if __name__ == "__main__":
    main()