Add ReplayServe Frontier vLLM alignment report
This commit is contained in:
532
tools/build_frontier_vllm_alignment_report.py
Normal file
532
tools/build_frontier_vllm_alignment_report.py
Normal file
@@ -0,0 +1,532 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build Frontier-vs-vLLM alignment tables and plots for the current H20 runs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import json
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
# Repository root: two levels up from this file (the script sits in a
# sub-directory of the repository).
ROOT = Path(__file__).resolve().parents[1]

# Directory that receives the generated CSV/JSON tables and PNG plots.
OUT_DIR = ROOT.joinpath("docs", "assets", "frontier_vllm_alignment")

# Absolute location of the vLLM smoke-run outputs; used for the runs whose
# summaries are fetched from the "dash1" host when no local copy exists.
DASH1_VLLM_ROOT = Path("/home/admin/cpfs/wjh/replayserve/runs/vllm_gpu_smoke_20260625_dash1")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RunSpec:
    """Immutable description of one Frontier-vs-vLLM comparison run.

    Each instance names the two summary files to compare and carries the
    hand-entered metadata that is copied verbatim into the report row.
    """

    # Stable identifier and human-readable label (the label is used on plot axes).
    run_id: str
    label: str
    # Tensor parallel size.
    tp: int
    # Number of requests in the run.
    request_count: int
    # Timestamp scale, as display text and as a number.
    scale_label: str
    scale_value: float
    # Name of the request fixture used for the run.
    fixture: str
    # Path (relative to ROOT) of Frontier's postprocess_summary.json.
    frontier_summary: str
    # Path of vLLM's summary.json: relative to ROOT for local runs, otherwise
    # the path on the remote host.
    vllm_summary: str
    # vLLM preemption count; entered by hand, not read from the summary file.
    vllm_preemptions: int
    # KV block count for the run (presumably KV-cache blocks -- TODO confirm).
    kv_blocks: int
    # Free-form remark copied into the "notes" column.
    notes: str = ""
    # True when the vLLM summary may have to be fetched over ssh.
    vllm_remote: bool = False
|
||||
|
||||
|
||||
# Values shared by every TP1 entry below; the resulting paths are unchanged.
_TP1_SUMMARY_TAIL = "vllm_kv_15281_profile_full32k/postprocess_summary.json"
_TP1_KV_BLOCKS = 15281
_LIFECYCLE_FIX_NOTE = "After Frontier decode-preemption lifecycle fix."


def _mixed_profile_run(tp: int, scale: int, kv_blocks: int) -> RunSpec:
    """Build one of the TP2/TP4 N=200 entries, which all follow one template."""
    return RunSpec(
        run_id=f"tp{tp}_n200_scale{scale}",
        label=f"TP{tp} N200 scale {scale}",
        tp=tp,
        request_count=200,
        scale_label=str(scale),
        scale_value=float(scale),
        fixture=f"coder_200_ts{scale}",
        frontier_summary=(
            "runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
            "frontier_h20_tp2_tp4_profile_full32k/"
            f"coder_200_ts{scale}/"
            f"tp{tp}_vllm_kv_{kv_blocks}_profile_full32k/postprocess_summary.json"
        ),
        vllm_summary=str(DASH1_VLLM_ROOT / f"tp{tp}_coder_200_ts{scale}_uncapped" / "summary.json"),
        vllm_preemptions=0,
        kv_blocks=kv_blocks,
        notes="Uses true-mixed TP2/TP4 attention profile.",
        vllm_remote=True,
    )


# Every run that appears in the report, in plotting order.
RUNS: list[RunSpec] = [
    # --- TP1 runs: summaries are read from the local checkout. ---
    RunSpec(
        run_id="tp1_n100_scale1",
        label="TP1 N100 raw",
        tp=1,
        request_count=100,
        scale_label="raw",
        scale_value=1.0,
        fixture="coder_100",
        frontier_summary=(
            "runs/rs6_frontier_h20_tp1_profile_full32k_20260624/"
            "frontier_h20_tp1_profile_full32k/coder_100/" + _TP1_SUMMARY_TAIL
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260624/tp1_coder100_uncapped/summary.json",
        vllm_preemptions=8,
        kv_blocks=_TP1_KV_BLOCKS,
        notes="Frontier incomplete before lifecycle fix; included as TP1 100-request baseline.",
    ),
    RunSpec(
        run_id="tp1_n500_scale1",
        label="TP1 N500 raw",
        tp=1,
        request_count=500,
        scale_label="raw",
        scale_value=1.0,
        fixture="coder_500",
        frontier_summary=(
            "runs/rs8_frontier_h20_tp1_profile_full32k_coder500_20260625/"
            "frontier_h20_tp1_profile_full32k/coder_500/" + _TP1_SUMMARY_TAIL
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder500_uncapped/summary.json",
        vllm_preemptions=63,
        kv_blocks=_TP1_KV_BLOCKS,
        notes="Frontier incomplete; useful as high-pressure stress signal.",
    ),
    RunSpec(
        run_id="tp1_n200_scale0667",
        label="TP1 N200 scale 0.667",
        tp=1,
        request_count=200,
        scale_label="0.667",
        scale_value=2 / 3,
        fixture="coder_200_ts0667",
        frontier_summary=(
            "runs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667/"
            "frontier_h20_tp1_profile_full32k/coder_200_ts0667/" + _TP1_SUMMARY_TAIL
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder200_ts0667_uncapped/summary.json",
        vllm_preemptions=26,
        kv_blocks=_TP1_KV_BLOCKS,
        notes="Dense-arrival run; Frontier incomplete before lifecycle fix.",
    ),
    RunSpec(
        run_id="tp1_n200_scale2",
        label="TP1 N200 scale 2",
        tp=1,
        request_count=200,
        scale_label="2",
        scale_value=2.0,
        fixture="coder_200_ts2",
        frontier_summary=(
            "runs/rs10_preemption_replay_fix_ts2/frontier_h20_tp1_profile_full32k/"
            "coder_200_ts2/" + _TP1_SUMMARY_TAIL
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts2_uncapped/summary.json",
        vllm_preemptions=43,
        kv_blocks=_TP1_KV_BLOCKS,
        notes=_LIFECYCLE_FIX_NOTE,
    ),
    RunSpec(
        run_id="tp1_n200_scale3",
        label="TP1 N200 scale 3",
        tp=1,
        request_count=200,
        scale_label="3",
        scale_value=3.0,
        fixture="coder_200_ts3",
        frontier_summary=(
            "runs/rs10_preemption_replay_fix_ts3/frontier_h20_tp1_profile_full32k/"
            "coder_200_ts3/" + _TP1_SUMMARY_TAIL
        ),
        vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts3_uncapped/summary.json",
        vllm_preemptions=16,
        kv_blocks=_TP1_KV_BLOCKS,
        notes=_LIFECYCLE_FIX_NOTE,
    ),
    # --- TP2/TP4 runs: vLLM summaries may be fetched from the remote host. ---
    _mixed_profile_run(tp=2, scale=2, kv_blocks=69055),
    _mixed_profile_run(tp=2, scale=3, kv_blocks=69055),
    _mixed_profile_run(tp=4, scale=2, kv_blocks=177077),
    _mixed_profile_run(tp=4, scale=3, kv_blocks=177077),
]
|
||||
|
||||
|
||||
# Metrics that get a Frontier column, a vLLM column and a Frontier/vLLM ratio
# column. The ratio column drops a trailing "_s" unit suffix from the name.
_COMPARED_METRICS = (
    "rps",
    "total_tps",
    "decode_tps",
    "ttft_p50_s",
    "ttft_p95_s",
    "tpot_p50_s",
    "tpot_p95_s",
    "e2e_p50_s",
    "e2e_p95_s",
)

# Column order of the CSV report.
FIELDNAMES = [
    # Run identity, copied from the RunSpec.
    "run_id",
    "label",
    "tp",
    "request_count",
    "scale_label",
    "scale_value",
    "fixture",
    "kv_blocks",
    # Completion, preemption and prefix-cache figures.
    "frontier_completed",
    "frontier_total",
    "frontier_complete",
    "vllm_completed",
    "vllm_total",
    "frontier_preemptions",
    "vllm_preemptions",
    "frontier_prefix_hit",
    "vllm_prefix_hit",
    "prefix_hit_delta",
    # frontier_<m>, vllm_<m>, <m>_ratio for every compared metric.
    *(
        column
        for metric in _COMPARED_METRICS
        for column in (
            f"frontier_{metric}",
            f"vllm_{metric}",
            f"{metric.removesuffix('_s')}_ratio",
        )
    ),
    "notes",
]
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file whose top-level value must be an object.

    Args:
        path: File to read.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the document's top-level value is not an object.
    """
    with path.open("r", encoding="utf-8") as stream:
        payload = json.load(stream)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: expected JSON object")
|
||||
|
||||
|
||||
def load_vllm_summary(spec: RunSpec) -> dict[str, Any]:
    """Load the vLLM summary JSON for *spec*.

    Local runs are read from ``ROOT / spec.vllm_summary``. Remote runs first
    look for a local copy under ``ROOT/runs/vllm_gpu_smoke_20260625_dash1/``
    and otherwise read the file over ssh from the ``dash1`` host.

    Args:
        spec: Run whose vLLM summary should be loaded.

    Returns:
        The decoded summary object.

    Raises:
        ValueError: If the summary's top-level JSON value is not an object.
        subprocess.CalledProcessError: If the remote ``cat`` fails.
    """
    import shlex  # local import: only the remote fallback needs it

    summary_path = Path(spec.vllm_summary)
    if not spec.vllm_remote:
        return load_json(ROOT / summary_path)

    # A remote summary may already have been copied next to the local runs;
    # the copy keeps only the run directory name and the file name.
    local_candidate = (
        ROOT
        / "runs"
        / "vllm_gpu_smoke_20260625_dash1"
        / summary_path.parent.name
        / summary_path.name
    )
    if local_candidate.exists():
        return load_json(local_candidate)

    # ssh hands the command to the remote shell as a single string, so the
    # path must be quoted to survive spaces or shell metacharacters.
    remote_command = f"cat {shlex.quote(spec.vllm_summary)}"
    raw = subprocess.check_output(["ssh", "dash1", remote_command], text=True)
    data = json.loads(raw)
    if not isinstance(data, dict):
        raise ValueError(f"{spec.vllm_summary}: expected JSON object")
    return data
|
||||
|
||||
|
||||
def load_frontier_summary(spec: RunSpec) -> tuple[dict[str, Any], dict[str, Any]]:
    """Load Frontier's post-process summary and the system metrics it points to.

    The post-process summary names the system-metrics file in its
    ``"system_metrics"`` entry; a relative entry is resolved against ROOT.

    Args:
        spec: Run whose Frontier outputs should be loaded.

    Returns:
        A ``(postprocess_summary, system_metrics)`` pair of JSON objects.

    Raises:
        KeyError: If the summary has no ``"system_metrics"`` entry.
    """
    postprocess = load_json(ROOT / spec.frontier_summary)
    metrics_path = Path(postprocess["system_metrics"])
    metrics_path = metrics_path if metrics_path.is_absolute() else ROOT / metrics_path
    system_metrics = load_json(metrics_path)
    return postprocess, system_metrics
|
||||
|
||||
|
||||
def ratio(numerator: float | int | None, denominator: float | int | None) -> float | None:
    """Return ``numerator / denominator`` as a float, or None when undefined.

    The result is None when either operand is None or the denominator is zero.
    """
    if numerator is None:
        return None
    if denominator is None or denominator == 0:
        return None
    return float(numerator) / float(denominator)
|
||||
|
||||
|
||||
def nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow *keys* through nested dicts and return the value reached.

    Returns None as soon as a key is missing or an intermediate value is not
    a dict. With no keys, *data* itself is returned.
    """
    cursor: Any = data
    for key in keys:
        if isinstance(cursor, dict):
            cursor = cursor.get(key)
        else:
            return None
    return cursor
|
||||
|
||||
|
||||
def _ms_to_seconds(value: float | int | None) -> float | None:
    """Divide a reading by 1000, passing a missing (None) reading through."""
    if value is None:
        return None
    return value / 1000


def summarize(spec: RunSpec) -> dict[str, Any]:
    """Build one report row comparing Frontier and vLLM for *spec*.

    Metrics missing from either summary are reported as None rather than
    raising, so one incomplete run does not abort the whole report. The
    vLLM prompt/generated token rates are the exception and are required.

    Args:
        spec: The run to summarise.

    Returns:
        A dict holding every FIELDNAMES column: run metadata, per-system
        metrics and Frontier/vLLM ratios (None where a ratio is undefined).

    Raises:
        KeyError: If the vLLM summary lacks ``prompt_tokens_per_second`` or
            ``generated_tokens_per_second``.
    """
    post, system = load_frontier_summary(spec)
    vllm = load_vllm_summary(spec)

    # vLLM reports prompt and generated token rates separately; the total is
    # their sum.
    vllm_total_tps = vllm["prompt_tokens_per_second"] + vllm["generated_tokens_per_second"]
    frontier_prefix_hit = nested(
        post, "prefix_cache_postprocess", "replayserve_token_weighted", "hit_ratio"
    )
    vllm_prefix_hit = nested(vllm, "estimated_prefix_reuse", "token_hit_ratio")
    if frontier_prefix_hit is None or vllm_prefix_hit is None:
        prefix_hit_delta = None
    else:
        prefix_hit_delta = float(frontier_prefix_hit) - float(vllm_prefix_hit)

    row: dict[str, Any] = {
        "run_id": spec.run_id,
        "label": spec.label,
        "tp": spec.tp,
        "request_count": spec.request_count,
        "scale_label": spec.scale_label,
        "scale_value": spec.scale_value,
        "fixture": spec.fixture,
        "kv_blocks": spec.kv_blocks,
        "frontier_completed": nested(post, "completion", "completed_requests"),
        "frontier_total": nested(post, "completion", "total_requests"),
        "frontier_complete": nested(post, "completion", "is_complete"),
        # The vLLM summary only exposes a row count, which is used for both
        # the completed and the total column.
        "vllm_completed": vllm.get("rows"),
        "vllm_total": vllm.get("rows"),
        "frontier_preemptions": nested(post, "preemption_statistics", "total_preemption_events"),
        "vllm_preemptions": spec.vllm_preemptions,
        "frontier_prefix_hit": frontier_prefix_hit,
        "vllm_prefix_hit": vllm_prefix_hit,
        "prefix_hit_delta": prefix_hit_delta,
        "frontier_rps": nested(system, "throughput_metrics", "requests_per_second"),
        "vllm_rps": vllm.get("requests_per_second"),
        "frontier_total_tps": nested(system, "throughput_metrics", "tokens_per_second"),
        "vllm_total_tps": vllm_total_tps,
        "frontier_decode_tps": nested(system, "throughput_metrics", "decode_tokens_per_second"),
        "vllm_decode_tps": vllm.get("generated_tokens_per_second"),
    }

    # Latency percentiles. Frontier values are divided by 1000 to fill the
    # "*_s" columns (presumably they are milliseconds -- TODO confirm);
    # vLLM values are taken as-is from its "*_s" sections.
    frontier_sections = {
        "ttft": "ttft_statistics",
        "tpot": "tpot_statistics",
        "e2e": "request_e2e_time_statistics",
    }
    for metric, section in frontier_sections.items():
        for percentile in ("p50", "p95"):
            row[f"frontier_{metric}_{percentile}_s"] = _ms_to_seconds(
                nested(system, section, percentile)
            )
            row[f"vllm_{metric}_{percentile}_s"] = nested(vllm, f"{metric}_s", percentile)

    row["notes"] = spec.notes

    # Frontier/vLLM ratio per metric; the ratio column drops the "_s" suffix.
    for name in [
        "rps",
        "total_tps",
        "decode_tps",
        "ttft_p50_s",
        "ttft_p95_s",
        "tpot_p50_s",
        "tpot_p95_s",
        "e2e_p50_s",
        "e2e_p95_s",
    ]:
        row[f"{name.removesuffix('_s')}_ratio"] = ratio(
            row.get(f"frontier_{name}"), row.get(f"vllm_{name}")
        )

    return row
|
||||
|
||||
|
||||
def fmt(value: Any) -> str:
|
||||
if value is None:
|
||||
return ""
|
||||
if isinstance(value, bool):
|
||||
return "true" if value else "false"
|
||||
if isinstance(value, float):
|
||||
return f"{value:.10g}"
|
||||
return str(value)
|
||||
|
||||
|
||||
def write_csv(rows: list[dict[str, Any]]) -> None:
    """Write *rows* to ``frontier_vllm_alignment.csv`` in OUT_DIR.

    Columns follow FIELDNAMES; every cell is rendered with ``fmt`` and a key
    missing from a row yields an empty cell.
    """
    target = OUT_DIR / "frontier_vllm_alignment.csv"
    with target.open("w", encoding="utf-8", newline="") as stream:
        table = csv.DictWriter(stream, fieldnames=FIELDNAMES)
        table.writeheader()
        table.writerows(
            {column: fmt(row.get(column)) for column in FIELDNAMES} for row in rows
        )
|
||||
|
||||
|
||||
def write_json(rows: list[dict[str, Any]]) -> None:
    """Write *rows* to ``frontier_vllm_alignment.json`` in OUT_DIR.

    The output is indented by two spaces with sorted keys and ends with a
    newline.
    """
    target = OUT_DIR / "frontier_vllm_alignment.json"
    with target.open("w", encoding="utf-8") as stream:
        json.dump(rows, stream, indent=2, sort_keys=True)
        stream.write("\n")
|
||||
|
||||
|
||||
def setup_axis(ax: plt.Axes, title: str, ylabel: str) -> None:
    """Apply the report's common styling to *ax*.

    Sets the title and y-axis label, enables faint horizontal grid lines and
    hides the top and right spines.
    """
    ax.set_title(title, fontsize=12, pad=10)
    ax.set_ylabel(ylabel)
    ax.grid(axis="y", alpha=0.25)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
|
||||
|
||||
|
||||
def annotate_bars(ax: plt.Axes, bars: Any, fmt_text: str = "{:.2f}") -> None:
    """Label each bar in *bars* with its height, just above the bar top.

    Bars whose height is NaN are skipped. Labels for bars taller than 2.5
    are rotated by 90 degrees.

    Args:
        ax: Axes that owns the bars.
        bars: Iterable of bar artists exposing ``get_height``, ``get_x`` and
            ``get_width``.
        fmt_text: ``str.format`` template applied to the height.
    """
    for bar in bars:
        top = bar.get_height()
        # NaN is the only value that does not compare equal to itself.
        is_nan = top != top
        if is_nan:
            continue
        text = fmt_text.format(top)
        center_x = bar.get_x() + bar.get_width() / 2
        angle = 90 if top > 2.5 else 0
        ax.annotate(
            text,
            xy=(center_x, top),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=7,
            rotation=angle,
        )
|
||||
|
||||
|
||||
def savefig(name: str) -> None:
    """Save the current pyplot figure as ``OUT_DIR / name`` and close it.

    The layout is tightened first and the image is written at 180 dpi.
    """
    destination = OUT_DIR / name
    plt.tight_layout()
    plt.savefig(destination, dpi=180)
    plt.close()
|
||||
|
||||
|
||||
def plot_throughput_ratio(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier/vLLM total-throughput ratios as one bar per run.

    Bars are coloured by tensor parallel size. Runs where Frontier did not
    complete are hatched and drawn lighter. The plot is written to
    ``throughput_ratio.png``.

    Args:
        rows: Report rows as produced by ``summarize``.
    """
    tp_colors = {1: "#4C78A8", 2: "#F58518", 4: "#54A24B"}
    fallback_color = "#9D9D9D"  # TP sizes without a dedicated colour
    nan = float("nan")

    positions = list(range(len(rows)))
    # An undefined ratio is stored as None, which a bar chart cannot draw;
    # NaN leaves a gap and is skipped by annotate_bars.
    heights = [nan if row["total_tps_ratio"] is None else row["total_tps_ratio"] for row in rows]

    _, ax = plt.subplots(figsize=(12, 4.8))
    bars = ax.bar(
        positions,
        heights,
        color=[tp_colors.get(row["tp"], fallback_color) for row in rows],
        alpha=0.9,
    )
    for bar, row in zip(bars, rows, strict=True):
        if not row["frontier_complete"]:
            bar.set_hatch("//")
            bar.set_alpha(0.65)

    # Reference line at parity (Frontier == vLLM).
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(positions)
    ax.set_xticklabels([row["label"] for row in rows], rotation=35, ha="right")
    setup_axis(ax, "Frontier Throughput Relative to vLLM", "Frontier / vLLM total tok/s")
    annotate_bars(ax, bars)
    savefig("throughput_ratio.png")
|
||||
|
||||
|
||||
def plot_latency_ratios(rows: list[dict[str, Any]]) -> None:
    """Plot grouped Frontier/vLLM latency ratios for every run.

    Each run gets three bars: TTFT p95, TPOT p50 and E2E p95. The plot is
    written to ``latency_ratios.png``.

    Args:
        rows: Report rows as produced by ``summarize``.
    """
    nan = float("nan")

    def series(column: str) -> list[float]:
        # An undefined ratio is stored as None, which a bar chart cannot
        # draw; NaN leaves a gap and is skipped by annotate_bars.
        return [nan if row[column] is None else row[column] for row in rows]

    positions = list(range(len(rows)))
    width = 0.26

    _, ax = plt.subplots(figsize=(13, 5.2))
    ttft_bars = ax.bar(
        [i - width for i in positions], series("ttft_p95_ratio"), width, label="TTFT p95"
    )
    tpot_bars = ax.bar(positions, series("tpot_p50_ratio"), width, label="TPOT p50")
    e2e_bars = ax.bar(
        [i + width for i in positions], series("e2e_p95_ratio"), width, label="E2E p95"
    )

    # Reference line at parity (Frontier == vLLM).
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(positions)
    ax.set_xticklabels([row["label"] for row in rows], rotation=35, ha="right")
    ax.legend(frameon=False, ncols=3, loc="upper left")
    setup_axis(ax, "Latency Ratios", "Frontier / vLLM")
    for group in (ttft_bars, tpot_bars, e2e_bars):
        annotate_bars(ax, group)
    savefig("latency_ratios.png")
|
||||
|
||||
|
||||
def plot_tp_scaling(rows: list[dict[str, Any]]) -> None:
    """Plot total tok/s against tensor parallel size for the N=200 runs.

    One panel is drawn per timestamp scale ("2" and "3"), each with a
    Frontier line and a vLLM line. The plot is written to
    ``tp_scaling_total_tps.png``.

    Args:
        rows: Report rows as produced by ``summarize``.

    Raises:
        KeyError: If *rows* contains no N=200 run for one of the two scales.
    """
    scales = ["2", "3"]

    # scale label -> tensor parallel size -> row
    by_scale: dict[str, dict[int, dict[str, Any]]] = {}
    for row in rows:
        if row["request_count"] != 200 or row["scale_label"] not in {"2", "3"}:
            continue
        by_scale.setdefault(row["scale_label"], {})[row["tp"]] = row

    fig, axes = plt.subplots(1, 2, figsize=(11, 4.2), sharey=False)
    for ax, scale in zip(axes, scales, strict=True):
        per_tp = by_scale[scale]
        tp_sizes = sorted(per_tp)
        for system_label, column in (
            ("Frontier", "frontier_total_tps"),
            ("vLLM", "vllm_total_tps"),
        ):
            ax.plot(
                tp_sizes,
                [per_tp[tp][column] for tp in tp_sizes],
                marker="o",
                label=system_label,
            )
        ax.set_xticks(tp_sizes)
        ax.set_xlabel("Tensor parallel size")
        setup_axis(ax, f"N=200, timestamp scale {scale}", "total tok/s")
        ax.legend(frameon=False)
    savefig("tp_scaling_total_tps.png")
|
||||
|
||||
|
||||
def plot_completion_prefix(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier completion fraction (bars) with prefix-hit ratios (lines).

    The left axis shows completed/total requests for Frontier. The right
    axis shows the prefix token hit ratio of both systems. The plot is
    written to ``completion_prefix.png``.

    Args:
        rows: Report rows as produced by ``summarize``.
    """
    nan = float("nan")

    def or_nan(value: Any) -> Any:
        # Missing values are stored as None; NaN draws as a gap instead of
        # failing inside matplotlib.
        return nan if value is None else value

    positions = list(range(len(rows)))
    # ratio() yields None for a missing or zero total instead of raising.
    completion = [
        or_nan(ratio(row["frontier_completed"], row["frontier_total"])) for row in rows
    ]

    _, ax1 = plt.subplots(figsize=(12, 4.8))
    bars = ax1.bar(positions, completion, color="#72B7B2", alpha=0.8, label="Frontier completion")
    ax1.set_ylim(0, 1.08)
    ax1.set_xticks(positions)
    ax1.set_xticklabels([row["label"] for row in rows], rotation=35, ha="right")
    setup_axis(ax1, "Completion and Prefix Reuse", "Frontier completed / total")

    # Second y-axis for the prefix hit ratios.
    ax2 = ax1.twinx()
    ax2.plot(
        positions,
        [or_nan(row["frontier_prefix_hit"]) for row in rows],
        color="#E45756",
        marker="o",
        label="Frontier prefix hit",
    )
    ax2.plot(
        positions,
        [or_nan(row["vllm_prefix_hit"]) for row in rows],
        color="#4C78A8",
        marker="x",
        linestyle="--",
        label="vLLM trace-side prefix hit",
    )
    ax2.set_ylabel("prefix token hit ratio")
    # Fixed range; hit ratios above 0.45 would be clipped.
    ax2.set_ylim(0, 0.45)

    # One combined legend for the bars and both lines.
    lines, line_labels = ax2.get_legend_handles_labels()
    ax1.legend(
        [bars, *lines],
        ["Frontier completion", *line_labels],
        frameon=False,
        loc="upper left",
        ncols=2,
    )
    savefig("completion_prefix.png")
|
||||
|
||||
|
||||
def main() -> None:
    """Summarise every run in RUNS and write the tables and plots to OUT_DIR."""
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    rows = list(map(summarize, RUNS))

    # Tables first, then the four figures.
    emitters = (
        write_csv,
        write_json,
        plot_throughput_ratio,
        plot_latency_ratios,
        plot_tp_scaling,
        plot_completion_prefix,
    )
    for emit in emitters:
        emit(rows)

    print(f"Wrote {len(rows)} rows to {OUT_DIR}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user