Add ReplayServe Frontier vLLM alignment report

This commit is contained in:
2026-06-25 17:10:30 +08:00
commit a99bd00782
63 changed files with 17033 additions and 0 deletions

255
tools/aggregate_runs.py Normal file
View File

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""Aggregate ReplayServe Frontier run directories into CSV and Markdown."""
from __future__ import annotations
import argparse
import csv
import json
from pathlib import Path
from typing import Any
# Grandparent directory of this script (``parents[1]`` of the resolved file
# path); presumably the ReplayServe checkout root — confirm against the layout.
REPLAYSERVE_ROOT = Path(__file__).resolve().parents[1]
FIELDNAMES = [
"suite_id",
"sim",
"fixture",
"config_id",
"status",
"exit_code",
"runtime_seconds",
"frontier_mode",
"frontier_head",
"frontier_dirty",
"attn_tp",
"attn_dp",
"moe_tp",
"moe_ep",
"batch_size_cap",
"max_tokens_in_batch",
"block_size",
"enable_prefix_caching",
"enable_chunked_prefill",
"long_prefill_token_threshold",
"frontier_block_hit_ratio",
"replayserve_token_hit_ratio",
"cache_metrics_available",
"cache_metrics_unavailable_reason",
"cache_metric_rows_complete",
"cache_metric_rows_total",
"cache_metric_rows_missing",
"completion_is_complete",
"missing_latency_request_ids",
"preemption_events",
"preempted_requests",
"ttft_mean_ms",
"ttft_p50_ms",
"ttft_p95_ms",
"tpot_mean_ms",
"tpot_p50_ms",
"tpot_p95_ms",
"e2e_mean_ms",
"e2e_p50_ms",
"e2e_p95_ms",
"requests_per_second",
"tokens_per_second",
"decode_tokens_per_second",
"completed_requests",
"total_requests",
"run_dir",
]
def parse_args() -> argparse.Namespace:
    """Parse the aggregator's command line.

    Returns:
        Namespace carrying ``suite_dir`` (required positional) plus the
        optional ``output_csv`` and ``output_md`` paths, which are ``None``
        when the corresponding flag is omitted.
    """
    cli = argparse.ArgumentParser(description="Aggregate ReplayServe run outputs.")
    cli.add_argument("suite_dir", type=Path, help="Run suite directory.")
    # Both output flags share the same shape, so register them from a table.
    optional_outputs = (
        ("--output-csv", "Output CSV path. Defaults to <suite_dir>/summary.csv."),
        ("--output-md", "Output Markdown path. Defaults to <suite_dir>/summary.md."),
    )
    for flag, help_text in optional_outputs:
        cli.add_argument(flag, type=Path, help=help_text)
    return cli.parse_args()
def load_json(path: Path) -> dict[str, Any]:
    """Read a JSON object from ``path``, tolerating absent files.

    Returns:
        The decoded object, or an empty dict when the file does not exist or
        the top-level JSON value is not an object. Malformed JSON still raises.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as stream:
            payload = json.load(stream)
        if isinstance(payload, dict):
            return payload
    return {}
def read_int(path: Path) -> int | None:
    """Parse the whole text file at ``path`` as one integer.

    Returns:
        The integer (surrounding whitespace ignored), or ``None`` when the
        file is missing or its content is not a valid integer literal.
    """
    try:
        raw_text = path.read_text(encoding="utf-8")
        return int(raw_text.strip())
    except FileNotFoundError:
        return None
    except ValueError:
        return None
def nested(data: dict[str, Any], *keys: str) -> Any:
    """Walk ``keys`` through nested dicts and return the value reached.

    Returns ``None`` as soon as a non-dict is met while keys remain, or when
    a key is absent. With no keys, ``data`` itself is returned.
    """
    current: Any = data
    for name in keys:
        if isinstance(current, dict):
            current = current.get(name)
        else:
            return None
    return current
def fmt(value: Any) -> str:
    """Render a cell value as text for the CSV and Markdown outputs.

    ``None`` becomes the empty string, booleans become ``true``/``false``,
    floats are shown with up to 8 significant digits, and everything else
    goes through ``str``.
    """
    if value is None:
        text = ""
    elif isinstance(value, bool):
        # Checked before the generic fallback so True/False never print as
        # Python-style "True"/"False".
        text = "true" if value else "false"
    elif isinstance(value, float):
        text = format(value, ".8g")
    else:
        text = str(value)
    return text
def _dict_field(container: dict[str, Any], key: str) -> dict[str, Any]:
    """Return ``container[key]`` if it is a dict, otherwise an empty dict."""
    candidate = container.get(key)
    return candidate if isinstance(candidate, dict) else {}


def _list_field(container: dict[str, Any], key: str) -> list[Any]:
    """Return ``container[key]`` if it is a list, otherwise an empty list."""
    candidate = container.get(key)
    return candidate if isinstance(candidate, list) else []


def _read_number(path: Path) -> int | float | None:
    """Read a single int (preferred) or float from a text file, else ``None``."""
    try:
        text = path.read_text(encoding="utf-8").strip()
    except (FileNotFoundError, ValueError):
        return None
    for parse in (int, float):
        try:
            return parse(text)
        except ValueError:
            continue
    return None


def _load_system_metrics(post: dict[str, Any]) -> dict[str, Any]:
    """Load the system-metrics JSON referenced by a postprocess summary."""
    raw_path = post.get("system_metrics")
    if not isinstance(raw_path, str) or not raw_path:
        return {}
    metrics_path = Path(raw_path)
    # A relative path is first tried as given (relative to the working
    # directory); if that does not exist, fall back to the checkout root so the
    # tool also works when launched from another directory.
    if not metrics_path.is_absolute() and not metrics_path.exists():
        metrics_path = REPLAYSERVE_ROOT / metrics_path
    return load_json(metrics_path)


def summarize_run(run_dir: Path) -> dict[str, Any]:
    """Collect one summary row for a single run directory.

    Reads ``run_manifest.json``, ``run_status.json`` and
    ``postprocess_summary.json`` from ``run_dir`` (each optional), plus the
    system-metrics file the postprocess summary points at. Missing files or
    sections yield ``None``/empty values rather than errors.

    Args:
        run_dir: Directory holding the run's JSON artefacts.

    Returns:
        A dict with one entry for every name in ``FIELDNAMES``.
    """
    manifest = load_json(run_dir / "run_manifest.json")
    status_json = load_json(run_dir / "run_status.json")
    post = load_json(run_dir / "postprocess_summary.json")
    system_metrics = _load_system_metrics(post)

    knobs = _dict_field(manifest, "knobs")
    frontier = _dict_field(manifest, "frontier")
    prefix = _dict_field(post, "prefix_cache_postprocess")
    frontier_block = _dict_field(prefix, "frontier_block_level")
    token_weighted = _dict_field(prefix, "replayserve_token_weighted")
    missing_rows = _list_field(prefix, "rows_with_missing_cache_metrics")
    preemption = _dict_field(post, "preemption_statistics")
    completion = _dict_field(post, "completion")
    simulation = _dict_field(system_metrics, "simulation_metadata")
    throughput = _dict_field(system_metrics, "throughput_metrics")

    # run_status.json wins; the plain-text files are the fallback.
    exit_code = status_json.get("exit_code")
    if exit_code is None:
        exit_code = read_int(run_dir / "exit_code.txt")
    runtime = status_json.get("runtime_seconds")
    if runtime is None:
        # Runtimes may be fractional, so accept floats as well as ints here.
        runtime = _read_number(run_dir / "runtime_seconds.txt")

    status = status_json.get("status") or ("pass" if exit_code == 0 else "fail")
    # An explicitly incomplete run overrides whatever status was recorded.
    if completion and not completion.get("is_complete", True):
        status = "incomplete"
    missing_latency_ids = _list_field(completion, "missing_latency_request_ids")

    return {
        "suite_id": manifest.get("suite_id"),
        "sim": manifest.get("sim"),
        "fixture": manifest.get("fixture"),
        "config_id": manifest.get("config_id"),
        "status": status,
        "exit_code": exit_code,
        "runtime_seconds": runtime,
        "frontier_mode": frontier.get("mode"),
        "frontier_head": frontier.get("head"),
        # Any non-blank ``status_short`` text marks the Frontier tree as dirty.
        "frontier_dirty": bool((frontier.get("status_short") or "").strip()),
        "attn_tp": knobs.get("attn_tensor_parallel_size"),
        "attn_dp": knobs.get("attn_data_parallel_size"),
        "moe_tp": knobs.get("moe_tensor_parallel_size"),
        "moe_ep": knobs.get("moe_expert_parallel_size"),
        "batch_size_cap": knobs.get("batch_size_cap"),
        "max_tokens_in_batch": knobs.get("max_tokens_in_batch"),
        "block_size": knobs.get("block_size"),
        "enable_prefix_caching": knobs.get("enable_prefix_caching"),
        "enable_chunked_prefill": knobs.get("enable_chunked_prefill"),
        "long_prefill_token_threshold": knobs.get("long_prefill_token_threshold"),
        "frontier_block_hit_ratio": frontier_block.get("hit_ratio"),
        "replayserve_token_hit_ratio": token_weighted.get("hit_ratio"),
        "cache_metrics_available": prefix.get("available"),
        "cache_metrics_unavailable_reason": prefix.get("reason"),
        "cache_metric_rows_complete": prefix.get("completed_request_rows"),
        "cache_metric_rows_total": prefix.get("total_request_metric_rows"),
        "cache_metric_rows_missing": len(missing_rows),
        "completion_is_complete": completion.get("is_complete"),
        "missing_latency_request_ids": ",".join(str(value) for value in missing_latency_ids),
        "preemption_events": preemption.get("total_preemption_events"),
        "preempted_requests": preemption.get("total_preempted_requests"),
        "ttft_mean_ms": nested(system_metrics, "ttft_statistics", "mean"),
        "ttft_p50_ms": nested(system_metrics, "ttft_statistics", "p50"),
        "ttft_p95_ms": nested(system_metrics, "ttft_statistics", "p95"),
        "tpot_mean_ms": nested(system_metrics, "tpot_statistics", "mean"),
        "tpot_p50_ms": nested(system_metrics, "tpot_statistics", "p50"),
        "tpot_p95_ms": nested(system_metrics, "tpot_statistics", "p95"),
        "e2e_mean_ms": nested(system_metrics, "request_e2e_time_statistics", "mean"),
        "e2e_p50_ms": nested(system_metrics, "request_e2e_time_statistics", "p50"),
        "e2e_p95_ms": nested(system_metrics, "request_e2e_time_statistics", "p95"),
        "requests_per_second": throughput.get("requests_per_second"),
        "tokens_per_second": throughput.get("tokens_per_second"),
        "decode_tokens_per_second": throughput.get("decode_tokens_per_second"),
        "completed_requests": simulation.get("completed_requests"),
        "total_requests": simulation.get("total_requests"),
        "run_dir": str(run_dir),
    }
def write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as CSV with the ``FIELDNAMES`` columns.

    Every cell is passed through ``fmt``; keys absent from a row are written
    as empty cells.
    """
    with path.open("w", encoding="utf-8", newline="") as sink:
        table = csv.DictWriter(sink, fieldnames=FIELDNAMES)
        table.writeheader()
        table.writerows(
            {column: fmt(record.get(column)) for column in FIELDNAMES}
            for record in rows
        )
def write_markdown(path: Path, rows: list[dict[str, Any]], suite_dir: Path) -> None:
    """Write a Markdown overview of ``rows`` to ``path``.

    The document has a heading naming the suite, a short bullet list, one
    table over a fixed subset of the summary columns, and a closing caveat
    about how to read the latency/throughput numbers.
    """
    columns = (
        "config_id",
        "fixture",
        "status",
        "runtime_seconds",
        "enable_prefix_caching",
        "enable_chunked_prefill",
        "frontier_block_hit_ratio",
        "replayserve_token_hit_ratio",
        "cache_metric_rows_missing",
        "completion_is_complete",
        "preemption_events",
        "ttft_mean_ms",
        "tpot_mean_ms",
        "e2e_mean_ms",
        "tokens_per_second",
    )
    # Assemble the whole document first, then emit it in a single write.
    pieces = [
        f"# Sweep Summary: {suite_dir.name}\n\n",
        f"- Suite dir: `{suite_dir}`\n",
        f"- Runs: `{len(rows)}`\n\n",
        "| " + " | ".join(columns) + " |\n",
        "|" + "|".join(["---"] * len(columns)) + "|\n",
    ]
    for record in rows:
        cells = [fmt(record.get(column)) for column in columns]
        pieces.append("| " + " | ".join(cells) + " |\n")
    pieces.append("\n")
    pieces.append(
        "Latency and throughput values are Frontier smoke outputs from the "
        "configured predictor/profile mode. RS3 tiny smoke uses dummy execution "
        "time, so these are harness plumbing checks, not performance claims.\n"
    )
    with path.open("w", encoding="utf-8") as sink:
        sink.write("".join(pieces))
def main() -> int:
    """Aggregate every run under the suite directory and write both reports.

    A run directory is any directory (at any depth) that contains a
    ``run_manifest.json``. Returns the process exit status ``0``.
    """
    options = parse_args()
    suite_root = options.suite_dir.resolve()

    manifests = suite_root.glob("**/run_manifest.json")
    summaries = [
        summarize_run(run_dir)
        for run_dir in sorted(manifest.parent for manifest in manifests)
    ]

    csv_target = options.output_csv or (suite_root / "summary.csv")
    md_target = options.output_md or (suite_root / "summary.md")
    write_csv(csv_target, summaries)
    write_markdown(md_target, summaries, suite_root)

    for written in (csv_target, md_target):
        print(f"wrote {written}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

188
tools/analyze_trace_window.py Executable file
View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""Analyze Qwen/ReplayServe sidecar rows around a request id."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Any
def parse_args() -> argparse.Namespace:
    """Parse the trace-window analyzer's command line.

    Returns:
        Namespace with ``fixture_dir``, ``request_id`` and ``output_dir``
        (all required) plus ``window`` (default 10) and ``top_k`` (default 15).
    """
    cli = argparse.ArgumentParser(description="Analyze sidecar prefix overlap.")
    option_specs = (
        ("--fixture-dir", {"required": True, "type": Path}),
        ("--request-id", {"required": True, "type": int}),
        ("--window", {"type": int, "default": 10}),
        ("--top-k", {"type": int, "default": 15}),
        ("--output-dir", {"required": True, "type": Path}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file where every non-blank line is a JSON object.

    Raises:
        ValueError: if a line decodes to something other than an object
            (the message names the file and the 1-based line number), or if a
            line is not valid JSON.
    """
    parsed: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as stream:
        for lineno, raw in enumerate(stream, start=1):
            text = raw.strip()
            if text:
                record = json.loads(text)
                if isinstance(record, dict):
                    parsed.append(record)
                else:
                    raise ValueError(f"{path}: line {lineno}: expected object")
    return parsed
def common_prefix_len(left: list[int], right: list[int]) -> int:
    """Return how many leading positions of ``left`` and ``right`` are equal.

    Comparison stops at the first mismatch or at the end of the shorter input.
    """
    matched = 0
    for pair in zip(left, right):
        if pair[0] != pair[1]:
            return matched
        matched += 1
    return matched
def summarize_row(row: dict[str, Any], block_size: int = 16) -> dict[str, Any]:
    """Condense one sidecar row into the fields used by the reports.

    Args:
        row: Sidecar record; numeric fields are coerced with ``int``/``float``.
        block_size: Token count of a full block, used only to decide whether
            ``input_length`` ends on a partial block.

    Returns:
        A flat dict with ids, lengths, the first 12 hash ids, the last hash id
        (``None`` when there are none) and the final block's token count
        (``0`` when ``block_token_counts`` is empty).
    """
    prompt_tokens = int(row["input_length"])
    generated_tokens = int(row["output_length"])
    hashes = [int(item) for item in row["hash_ids"]]
    per_block_tokens = [int(item) for item in row["block_token_counts"]]

    summary: dict[str, Any] = {}
    summary["request_id"] = int(row["request_id"])
    summary["chat_id"] = int(row["chat_id"])
    summary["parent_chat_id"] = int(row["parent_chat_id"])
    summary["turn"] = int(row["turn"])
    summary["type"] = row["type"]
    summary["timestamp"] = float(row["timestamp"])
    summary["input_length"] = prompt_tokens
    summary["output_length"] = generated_tokens
    summary["total_tokens"] = prompt_tokens + generated_tokens
    summary["hash_count"] = len(hashes)
    summary["first_hash_ids"] = hashes[:12]
    summary["last_hash_id"] = hashes[-1] if hashes else None
    summary["partial_final_block"] = prompt_tokens % block_size != 0
    summary["final_block_token_count"] = per_block_tokens[-1] if per_block_tokens else 0
    return summary
def _markdown_report(
    fixture_dir: Path,
    request_id: int,
    target_summary: dict[str, Any],
    overlaps: list[dict[str, Any]],
    top_k: int,
    local_window: list[dict[str, Any]],
) -> str:
    """Render the per-request Markdown report as one string."""
    lines = [
        f"# Request {request_id} Trace Analysis\n\n",
        f"- Fixture: `{fixture_dir}`\n",
        f"- Timestamp: `{target_summary['timestamp']}`\n",
        f"- Chat: `{target_summary['chat_id']}` parent `{target_summary['parent_chat_id']}` turn `{target_summary['turn']}`\n",
        f"- Input/output/total tokens: `{target_summary['input_length']}` / `{target_summary['output_length']}` / `{target_summary['total_tokens']}`\n",
        f"- Hash blocks: `{target_summary['hash_count']}`\n",
        f"- Partial final block: `{target_summary['partial_final_block']}` final count `{target_summary['final_block_token_count']}`\n",
        "\n## Top Prior Prefix Overlaps\n\n",
    ]
    if not overlaps:
        lines.append("No prior request shares a first block with the target.\n")
    else:
        lines.append("| prior request | timestamp | input | output | lcp blocks | lcp tokens | partial final |\n")
        lines.append("|---:|---:|---:|---:|---:|---:|---|\n")
        for item in overlaps[:top_k]:
            lines.append(
                f"| {item['request_id']} | {item['timestamp']} | "
                f"{item['input_length']} | {item['output_length']} | "
                f"{item['common_prefix_blocks_with_target']} | "
                f"{item['common_prefix_tokens_with_target']} | "
                f"{item['partial_final_block']} |\n"
            )
    lines.append("\n## Local Window\n\n")
    lines.append("| request | timestamp | input | output | blocks | partial final | first hashes |\n")
    lines.append("|---:|---:|---:|---:|---:|---|---|\n")
    for item in local_window:
        lines.append(
            f"| {item['request_id']} | {item['timestamp']} | "
            f"{item['input_length']} | {item['output_length']} | "
            f"{item['hash_count']} | {item['partial_final_block']} | "
            f"`{item['first_hash_ids']}` |\n"
        )
    return "".join(lines)


def main() -> int:
    """Analyze one request of a fixture's ``sidecar.jsonl`` and write reports.

    For the target request this collects: every request with a smaller id that
    shares at least one leading hash block with it (sorted by shared blocks,
    then id, descending), the rows within ``--window`` positions of the target
    in the sidecar, and rows whose ``chat_id`` or ``request_id`` equals the
    target's ``parent_chat_id``. Results are written to
    ``request_<id>_analysis.json`` and ``.md`` in ``--output-dir`` and both
    paths are printed.

    Returns:
        Process exit status ``0``.

    Raises:
        SystemExit: if the requested id is not present in the sidecar.
    """
    args = parse_args()
    sidecar_path = args.fixture_dir / "sidecar.jsonl"
    rows = load_jsonl(sidecar_path)

    # Request id -> position in the sidecar. On duplicate ids the last row
    # wins, which matches a plain dict built from the rows in order.
    position_by_id = {int(row["request_id"]): index for index, row in enumerate(rows)}
    if args.request_id not in position_by_id:
        raise SystemExit(f"request_id {args.request_id} not found in {sidecar_path}")
    target_position = position_by_id[args.request_id]
    target = rows[target_position]
    target_summary = summarize_row(target)
    target_hashes = [int(value) for value in target["hash_ids"]]
    target_counts = [int(value) for value in target["block_token_counts"]]
    target_input_length = int(target["input_length"])

    overlaps: list[dict[str, Any]] = []
    for row in rows:
        # Only requests with a smaller id count as "prior".
        if int(row["request_id"]) >= args.request_id:
            continue
        lcp_blocks = common_prefix_len(target_hashes, [int(value) for value in row["hash_ids"]])
        if lcp_blocks <= 0:
            continue
        # Token counts come from the target's own blocks so a partial final
        # block contributes its real size.
        lcp_tokens = sum(target_counts[:lcp_blocks])
        overlaps.append(
            {
                **summarize_row(row),
                "common_prefix_blocks_with_target": lcp_blocks,
                "common_prefix_tokens_with_target": lcp_tokens,
                "target_prefix_fraction_blocks": (
                    lcp_blocks / len(target_hashes) if target_hashes else 0.0
                ),
                "target_prefix_fraction_tokens": (
                    lcp_tokens / target_input_length if target_input_length > 0 else 0.0
                ),
            }
        )
    overlaps.sort(
        key=lambda item: (
            item["common_prefix_blocks_with_target"],
            item["request_id"],
        ),
        reverse=True,
    )

    # Centre the window on the target's position in the file rather than on
    # its id, so sidecars whose ids are sparse or do not start at 0 still get
    # the rows that actually surround the target.
    start = max(0, target_position - args.window)
    end = min(len(rows), target_position + args.window + 1)
    local_window = [summarize_row(row) for row in rows[start:end]]

    parent_chat_id = int(target["parent_chat_id"])
    parent_rows = [
        summarize_row(row)
        for row in rows
        if int(row["chat_id"]) == parent_chat_id or int(row["request_id"]) == parent_chat_id
    ]

    result = {
        "fixture_dir": str(args.fixture_dir),
        "sidecar": str(sidecar_path),
        "request_id": args.request_id,
        "target": target_summary,
        "local_window": local_window,
        "top_prior_prefix_overlaps": overlaps[: args.top_k],
        "prior_overlap_count": len(overlaps),
        "parent_candidates": parent_rows,
        "interpretation": {
            "prefix_overlap_semantics": (
                "Frontier prefix cache matches consecutive block_hash_ids from "
                "the start of the prompt. common_prefix_tokens_with_target uses "
                "the target sidecar block_token_counts, preserving partial final "
                "block token counts."
            ),
            # Reuse summarize_row's verdict so the block size is defined in
            # exactly one place.
            "partial_final_block_related": bool(target_summary["partial_final_block"]),
        },
    }

    args.output_dir.mkdir(parents=True, exist_ok=True)
    json_path = args.output_dir / f"request_{args.request_id}_analysis.json"
    md_path = args.output_dir / f"request_{args.request_id}_analysis.md"
    with json_path.open("w", encoding="utf-8") as handle:
        json.dump(result, handle, indent=2, sort_keys=True)
        handle.write("\n")
    with md_path.open("w", encoding="utf-8") as handle:
        handle.write(
            _markdown_report(
                args.fixture_dir,
                args.request_id,
                target_summary,
                overlaps,
                args.top_k,
                local_window,
            )
        )
    print(json_path)
    print(md_path)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""Summarize vLLM scheduler prefix-cache `computed:` log lines."""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
from typing import Any
# Matches a "Request <id> started running, prompt: <n>, computed: <m>" log
# fragment; the three groups are the request id, the prompt token count and
# the computed token count, each as a run of decimal digits.
START_RE = re.compile(r"Request (\d+) started running, prompt: (\d+), computed: (\d+)")
# Matches a "Request <id> preempted" log fragment; the single group is the id.
PREEMPT_RE = re.compile(r"Request (\d+) preempted")
def parse_args() -> argparse.Namespace:
    """Parse the log summarizer's command line.

    Returns:
        Namespace with ``stdout_log`` (required positional path) and
        ``summary_json`` (optional path, ``None`` when omitted).
    """
    about = (
        "Parse vLLM scheduler logs and report observed computed-token "
        "prefix-cache behavior. Repeated starts indicate preemption or "
        "re-admission, so all-start sums are not equivalent to per-request "
        "prefix hits."
    )
    cli = argparse.ArgumentParser(description=about)
    for name in ("stdout_log", "--summary-json"):
        cli.add_argument(name, type=Path)
    return cli.parse_args()
def load_estimated_hit_tokens(path: Path | None) -> int | None:
    """Read ``estimated_prefix_reuse.hit_tokens`` from a summary JSON file.

    Args:
        path: Summary file to read, or ``None`` when no summary was supplied.

    Returns:
        The hit-token count as an int, or ``None`` when ``path`` is ``None``
        or the value (or the section holding it) is absent or null.

    Raises:
        OSError: if the file cannot be read.
        ValueError: if the file is not valid JSON or ``hit_tokens`` is not
            convertible to an int.
    """
    if path is None:
        return None
    summary = json.loads(path.read_text(encoding="utf-8"))
    # The section may be missing, explicitly null, or the document may not be
    # an object at all; treat all of those as "no estimate available" instead
    # of failing with an AttributeError.
    reuse = summary.get("estimated_prefix_reuse") if isinstance(summary, dict) else None
    if not isinstance(reuse, dict):
        return None
    hit_tokens = reuse.get("hit_tokens")
    return int(hit_tokens) if hit_tokens is not None else None
def main() -> int:
    """Summarize start/preempt events in a scheduler log and print JSON.

    Every ``START_RE`` match is grouped by request id in log order. The
    computed-token totals are reported four ways (all starts, first start,
    last start and maximum per request). When ``--summary-json`` provides an
    estimate, each total is compared against it. Returns exit status ``0``.
    """
    options = parse_args()
    log_text = options.stdout_log.read_text(encoding="utf-8", errors="replace")

    starts_by_request: dict[int, list[dict[str, int]]] = {}
    for hit in START_RE.finditer(log_text):
        rid, prompt_tokens, computed_tokens = (int(group) for group in hit.groups())
        entry = {"prompt_tokens": prompt_tokens, "computed_tokens": computed_tokens}
        if rid in starts_by_request:
            starts_by_request[rid].append(entry)
        else:
            starts_by_request[rid] = [entry]

    preempted_ids = [int(hit.group(1)) for hit in PREEMPT_RE.finditer(log_text)]

    # Requests that started more than once, keyed by the id as a string and
    # inserted in ascending numeric id order.
    repeated_starts: dict[str, list[dict[str, int]]] = {}
    for rid in sorted(starts_by_request):
        if len(starts_by_request[rid]) > 1:
            repeated_starts[str(rid)] = starts_by_request[rid]

    # One list of computed-token counts per request, in start order.
    computed_per_request = [
        [start["computed_tokens"] for start in starts]
        for starts in starts_by_request.values()
    ]
    computed_totals = {
        "all_starts": sum(sum(counts) for counts in computed_per_request),
        "first_start_per_request": sum(counts[0] for counts in computed_per_request),
        "last_start_per_request": sum(counts[-1] for counts in computed_per_request),
        "max_per_request": sum(max(counts) for counts in computed_per_request),
    }

    estimate = load_estimated_hit_tokens(options.summary_json)
    report: dict[str, Any] = {
        "stdout_log": str(options.stdout_log),
        "starts_total": sum(len(starts) for starts in starts_by_request.values()),
        "unique_requests": len(starts_by_request),
        "preemptions": len(preempted_ids),
        "preempted_request_ids": preempted_ids,
        "repeated_request_ids": sorted(int(key) for key in repeated_starts),
        "computed_tokens": computed_totals,
        "repeated_starts": repeated_starts,
    }
    if estimate is not None:
        report["estimated_prefix_hit_tokens"] = estimate
        report["matches_estimate"] = {
            label: total == estimate for label, total in computed_totals.items()
        }
    print(json.dumps(report, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,532 @@
#!/usr/bin/env python3
"""Build Frontier-vs-vLLM alignment tables and plots for the current H20 runs."""
from __future__ import annotations
import csv
import json
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Grandparent directory of this script; relative run paths are joined onto it.
ROOT = Path(__file__).resolve().parents[1]
# Directory that receives the generated CSV, JSON and PNG files.
OUT_DIR = ROOT / "docs" / "assets" / "frontier_vllm_alignment"
# Absolute directory used to build the ``vllm_summary`` paths of the runs
# flagged ``vllm_remote``; those are read over ssh from host "dash1" when no
# local copy exists under ROOT.
DASH1_VLLM_ROOT = Path("/home/admin/cpfs/wjh/replayserve/runs/vllm_gpu_smoke_20260625_dash1")
@dataclass(frozen=True)
class RunSpec:
    """Immutable description of one Frontier run paired with one vLLM run."""

    # Identifier copied into the ``run_id`` output column.
    run_id: str
    # Human-readable name; used as the x tick label in the bar charts.
    label: str
    # Tensor-parallel size; selects the bar colour and is the x value in the
    # TP-scaling plot.
    tp: int
    # Request count; the TP-scaling plot keeps only runs where this is 200.
    request_count: int
    # Scale as text; the TP-scaling plot keeps only "2" and "3".
    scale_label: str
    # Numeric form of the scale; copied into the output row unchanged.
    scale_value: float
    # Fixture name; copied into the output row unchanged.
    fixture: str
    # Path to the Frontier postprocess_summary.json, joined onto ROOT.
    frontier_summary: str
    # Path to the vLLM summary.json: joined onto ROOT when ``vllm_remote`` is
    # false, otherwise the path passed to ``cat`` on the remote host.
    vllm_summary: str
    # vLLM preemption count; taken from this spec, not from the summary file.
    vllm_preemptions: int
    # KV block count; copied into the output row unchanged.
    kv_blocks: int
    # Free-form remark; copied into the ``notes`` column.
    notes: str = ""
    # When true, the vLLM summary is looked up in a local mirror directory
    # first and fetched over ssh if that copy does not exist.
    vllm_remote: bool = False
RUNS: list[RunSpec] = [
RunSpec(
run_id="tp1_n100_scale1",
label="TP1 N100 raw",
tp=1,
request_count=100,
scale_label="raw",
scale_value=1.0,
fixture="coder_100",
frontier_summary=(
"runs/rs6_frontier_h20_tp1_profile_full32k_20260624/"
"frontier_h20_tp1_profile_full32k/coder_100/"
"vllm_kv_15281_profile_full32k/postprocess_summary.json"
),
vllm_summary="runs/vllm_gpu_smoke_20260624/tp1_coder100_uncapped/summary.json",
vllm_preemptions=8,
kv_blocks=15281,
notes="Frontier incomplete before lifecycle fix; included as TP1 100-request baseline.",
),
RunSpec(
run_id="tp1_n500_scale1",
label="TP1 N500 raw",
tp=1,
request_count=500,
scale_label="raw",
scale_value=1.0,
fixture="coder_500",
frontier_summary=(
"runs/rs8_frontier_h20_tp1_profile_full32k_coder500_20260625/"
"frontier_h20_tp1_profile_full32k/coder_500/"
"vllm_kv_15281_profile_full32k/postprocess_summary.json"
),
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder500_uncapped/summary.json",
vllm_preemptions=63,
kv_blocks=15281,
notes="Frontier incomplete; useful as high-pressure stress signal.",
),
RunSpec(
run_id="tp1_n200_scale0667",
label="TP1 N200 scale 0.667",
tp=1,
request_count=200,
scale_label="0.667",
scale_value=2 / 3,
fixture="coder_200_ts0667",
frontier_summary=(
"runs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667/"
"frontier_h20_tp1_profile_full32k/coder_200_ts0667/"
"vllm_kv_15281_profile_full32k/postprocess_summary.json"
),
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder200_ts0667_uncapped/summary.json",
vllm_preemptions=26,
kv_blocks=15281,
notes="Dense-arrival run; Frontier incomplete before lifecycle fix.",
),
RunSpec(
run_id="tp1_n200_scale2",
label="TP1 N200 scale 2",
tp=1,
request_count=200,
scale_label="2",
scale_value=2.0,
fixture="coder_200_ts2",
frontier_summary=(
"runs/rs10_preemption_replay_fix_ts2/frontier_h20_tp1_profile_full32k/"
"coder_200_ts2/vllm_kv_15281_profile_full32k/postprocess_summary.json"
),
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts2_uncapped/summary.json",
vllm_preemptions=43,
kv_blocks=15281,
notes="After Frontier decode-preemption lifecycle fix.",
),
RunSpec(
run_id="tp1_n200_scale3",
label="TP1 N200 scale 3",
tp=1,
request_count=200,
scale_label="3",
scale_value=3.0,
fixture="coder_200_ts3",
frontier_summary=(
"runs/rs10_preemption_replay_fix_ts3/frontier_h20_tp1_profile_full32k/"
"coder_200_ts3/vllm_kv_15281_profile_full32k/postprocess_summary.json"
),
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts3_uncapped/summary.json",
vllm_preemptions=16,
kv_blocks=15281,
notes="After Frontier decode-preemption lifecycle fix.",
),
RunSpec(
run_id="tp2_n200_scale2",
label="TP2 N200 scale 2",
tp=2,
request_count=200,
scale_label="2",
scale_value=2.0,
fixture="coder_200_ts2",
frontier_summary=(
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
"tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
),
vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts2_uncapped" / "summary.json"),
vllm_preemptions=0,
kv_blocks=69055,
notes="Uses true-mixed TP2/TP4 attention profile.",
vllm_remote=True,
),
RunSpec(
run_id="tp2_n200_scale3",
label="TP2 N200 scale 3",
tp=2,
request_count=200,
scale_label="3",
scale_value=3.0,
fixture="coder_200_ts3",
frontier_summary=(
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
"tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
),
vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts3_uncapped" / "summary.json"),
vllm_preemptions=0,
kv_blocks=69055,
notes="Uses true-mixed TP2/TP4 attention profile.",
vllm_remote=True,
),
RunSpec(
run_id="tp4_n200_scale2",
label="TP4 N200 scale 2",
tp=4,
request_count=200,
scale_label="2",
scale_value=2.0,
fixture="coder_200_ts2",
frontier_summary=(
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
"tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
),
vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts2_uncapped" / "summary.json"),
vllm_preemptions=0,
kv_blocks=177077,
notes="Uses true-mixed TP2/TP4 attention profile.",
vllm_remote=True,
),
RunSpec(
run_id="tp4_n200_scale3",
label="TP4 N200 scale 3",
tp=4,
request_count=200,
scale_label="3",
scale_value=3.0,
fixture="coder_200_ts3",
frontier_summary=(
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
"tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
),
vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts3_uncapped" / "summary.json"),
vllm_preemptions=0,
kv_blocks=177077,
notes="Uses true-mixed TP2/TP4 attention profile.",
vllm_remote=True,
),
]
FIELDNAMES = [
"run_id",
"label",
"tp",
"request_count",
"scale_label",
"scale_value",
"fixture",
"kv_blocks",
"frontier_completed",
"frontier_total",
"frontier_complete",
"vllm_completed",
"vllm_total",
"frontier_preemptions",
"vllm_preemptions",
"frontier_prefix_hit",
"vllm_prefix_hit",
"prefix_hit_delta",
"frontier_rps",
"vllm_rps",
"rps_ratio",
"frontier_total_tps",
"vllm_total_tps",
"total_tps_ratio",
"frontier_decode_tps",
"vllm_decode_tps",
"decode_tps_ratio",
"frontier_ttft_p50_s",
"vllm_ttft_p50_s",
"ttft_p50_ratio",
"frontier_ttft_p95_s",
"vllm_ttft_p95_s",
"ttft_p95_ratio",
"frontier_tpot_p50_s",
"vllm_tpot_p50_s",
"tpot_p50_ratio",
"frontier_tpot_p95_s",
"vllm_tpot_p95_s",
"tpot_p95_ratio",
"frontier_e2e_p50_s",
"vllm_e2e_p50_s",
"e2e_p50_ratio",
"frontier_e2e_p95_s",
"vllm_e2e_p95_s",
"e2e_p95_ratio",
"notes",
]
def load_json(path: Path) -> dict[str, Any]:
    """Load ``path`` and insist that it holds a JSON object.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the content is not valid JSON or the top-level value
            is not an object.
    """
    with path.open("r", encoding="utf-8") as stream:
        payload = json.load(stream)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: expected JSON object")
def load_vllm_summary(spec: RunSpec) -> dict[str, Any]:
    """Load the vLLM summary JSON for ``spec``.

    Local specs are read from ``ROOT / spec.vllm_summary``. Remote specs are
    read from a local mirror under ``ROOT/runs/<remote root name>/`` when one
    exists; otherwise the file is fetched by running ``cat`` over ssh on host
    ``dash1``.

    Raises:
        OSError: if a local file cannot be read.
        subprocess.CalledProcessError: if the ssh command fails.
        ValueError: if the content is not a JSON object.
    """
    import shlex

    path = Path(spec.vllm_summary)
    if not spec.vllm_remote:
        return load_json(ROOT / path)

    # The mirror keeps the remote root's directory name, so derive it from
    # DASH1_VLLM_ROOT instead of repeating the literal.
    local_candidate = ROOT / "runs" / DASH1_VLLM_ROOT.name / path.parent.name / path.name
    if local_candidate.exists():
        return load_json(local_candidate)

    # ssh hands the command string to the remote shell, so quote the path to
    # keep spaces or shell metacharacters from being interpreted there.
    remote_command = f"cat {shlex.quote(spec.vllm_summary)}"
    raw = subprocess.check_output(["ssh", "dash1", remote_command], text=True)
    data = json.loads(raw)
    if not isinstance(data, dict):
        raise ValueError(f"{spec.vllm_summary}: expected JSON object")
    return data
def load_frontier_summary(spec: RunSpec) -> tuple[dict[str, Any], dict[str, Any]]:
    """Load a run's Frontier postprocess summary and its system metrics.

    The summary's ``system_metrics`` entry names the metrics file; a relative
    value is resolved against ``ROOT``.

    Returns:
        ``(postprocess_summary, system_metrics)``.

    Raises:
        KeyError: if the summary has no ``system_metrics`` entry.
    """
    postprocess = load_json(ROOT / spec.frontier_summary)
    metrics_path = Path(postprocess["system_metrics"])
    resolved = metrics_path if metrics_path.is_absolute() else ROOT / metrics_path
    return postprocess, load_json(resolved)
def ratio(numerator: float | int | None, denominator: float | int | None) -> float | None:
    """Return ``numerator / denominator`` as a float.

    Returns ``None`` when either operand is ``None`` or the denominator is zero.
    """
    if numerator is not None and denominator is not None and denominator != 0:
        return float(numerator) / float(denominator)
    return None
def nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow ``keys`` through nested dicts.

    Returns the value found at the end of the path, or ``None`` if a key is
    missing or a non-dict is reached before the keys run out.
    """
    node: Any = data
    for step in keys:
        if isinstance(node, dict):
            node = node.get(step)
            continue
        return None
    return node
def _as_dict(value: Any) -> dict[str, Any]:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _ms_to_s(value: float | int | None) -> float | None:
    """Convert milliseconds to seconds, passing ``None`` through."""
    return None if value is None else value / 1000


def summarize(spec: RunSpec) -> dict[str, Any]:
    """Build one comparison row for ``spec``.

    Combines the Frontier postprocess summary and system metrics with the
    vLLM summary. Frontier latency percentiles are divided by 1000 so they
    share units with the vLLM ``*_s`` values. Every ``*_ratio`` column is
    Frontier divided by vLLM, and is ``None`` when either side is missing or
    the vLLM value is zero.

    Raises:
        KeyError: if the vLLM summary lacks ``prompt_tokens_per_second`` or
            ``generated_tokens_per_second``.
    """
    post, system = load_frontier_summary(spec)
    vllm = load_vllm_summary(spec)

    # Sections may be absent or null (e.g. for incomplete Frontier runs);
    # normalise them to empty dicts so the lookups below simply yield None.
    completion = _as_dict(post.get("completion"))
    preemption = _as_dict(post.get("preemption_statistics"))
    prefix = _as_dict(post.get("prefix_cache_postprocess"))
    token_weighted = _as_dict(prefix.get("replayserve_token_weighted"))
    throughput = _as_dict(system.get("throughput_metrics"))

    frontier_total_tps = throughput.get("tokens_per_second")
    # Kept strict on purpose, as in the original: both figures are required
    # to form the vLLM total.
    vllm_total_tps = vllm["prompt_tokens_per_second"] + vllm["generated_tokens_per_second"]
    frontier_prefix_hit = token_weighted.get("hit_ratio")
    vllm_prefix_hit = nested(vllm, "estimated_prefix_reuse", "token_hit_ratio")

    row: dict[str, Any] = {
        "run_id": spec.run_id,
        "label": spec.label,
        "tp": spec.tp,
        "request_count": spec.request_count,
        "scale_label": spec.scale_label,
        "scale_value": spec.scale_value,
        "fixture": spec.fixture,
        "kv_blocks": spec.kv_blocks,
        "frontier_completed": completion.get("completed_requests"),
        "frontier_total": completion.get("total_requests"),
        "frontier_complete": completion.get("is_complete"),
        # NOTE(review): both vLLM columns read the same "rows" field, so they
        # are always equal — confirm whether the summary exposes a separate
        # completed count.
        "vllm_completed": vllm.get("rows"),
        "vllm_total": vllm.get("rows"),
        "frontier_preemptions": preemption.get("total_preemption_events"),
        "vllm_preemptions": spec.vllm_preemptions,
        "frontier_prefix_hit": frontier_prefix_hit,
        "vllm_prefix_hit": vllm_prefix_hit,
        "prefix_hit_delta": (
            float(frontier_prefix_hit) - float(vllm_prefix_hit)
            if frontier_prefix_hit is not None and vllm_prefix_hit is not None
            else None
        ),
        "frontier_rps": throughput.get("requests_per_second"),
        "vllm_rps": vllm.get("requests_per_second"),
        "frontier_total_tps": frontier_total_tps,
        "vllm_total_tps": vllm_total_tps,
        "frontier_decode_tps": throughput.get("decode_tokens_per_second"),
        "vllm_decode_tps": vllm.get("generated_tokens_per_second"),
        "frontier_ttft_p50_s": _ms_to_s(nested(system, "ttft_statistics", "p50")),
        "vllm_ttft_p50_s": nested(vllm, "ttft_s", "p50"),
        "frontier_ttft_p95_s": _ms_to_s(nested(system, "ttft_statistics", "p95")),
        "vllm_ttft_p95_s": nested(vllm, "ttft_s", "p95"),
        "frontier_tpot_p50_s": _ms_to_s(nested(system, "tpot_statistics", "p50")),
        "vllm_tpot_p50_s": nested(vllm, "tpot_s", "p50"),
        "frontier_tpot_p95_s": _ms_to_s(nested(system, "tpot_statistics", "p95")),
        "vllm_tpot_p95_s": nested(vllm, "tpot_s", "p95"),
        "frontier_e2e_p50_s": _ms_to_s(nested(system, "request_e2e_time_statistics", "p50")),
        "vllm_e2e_p50_s": nested(vllm, "e2e_s", "p50"),
        "frontier_e2e_p95_s": _ms_to_s(nested(system, "request_e2e_time_statistics", "p95")),
        "vllm_e2e_p95_s": nested(vllm, "e2e_s", "p95"),
        "notes": spec.notes,
    }

    # Ratio columns drop the "_s" unit suffix: ttft_p50_s -> ttft_p50_ratio.
    for name in [
        "rps",
        "total_tps",
        "decode_tps",
        "ttft_p50_s",
        "ttft_p95_s",
        "tpot_p50_s",
        "tpot_p95_s",
        "e2e_p50_s",
        "e2e_p95_s",
    ]:
        row[f"{name.removesuffix('_s')}_ratio"] = ratio(
            row.get(f"frontier_{name}"), row.get(f"vllm_{name}")
        )
    return row
def fmt(value: Any) -> str:
    """Render a value as CSV cell text.

    ``None`` gives an empty string, booleans give ``true``/``false``, floats
    are shown with up to 10 significant digits, anything else uses ``str``.
    """
    if value is None:
        rendered = ""
    elif isinstance(value, bool):
        # Handled before the fallback so booleans never print as True/False.
        rendered = "true" if value else "false"
    elif isinstance(value, float):
        rendered = format(value, ".10g")
    else:
        rendered = str(value)
    return rendered
def write_csv(rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``frontier_vllm_alignment.csv`` inside ``OUT_DIR``.

    Columns follow ``FIELDNAMES``; each cell is formatted with ``fmt`` and
    keys missing from a row become empty cells.
    """
    target = OUT_DIR / "frontier_vllm_alignment.csv"
    with target.open("w", encoding="utf-8", newline="") as sink:
        table = csv.DictWriter(sink, fieldnames=FIELDNAMES)
        table.writeheader()
        table.writerows(
            {column: fmt(record.get(column)) for column in FIELDNAMES}
            for record in rows
        )
def write_json(rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``frontier_vllm_alignment.json`` inside ``OUT_DIR``.

    The output is indented by two spaces with sorted keys and ends with a
    newline.
    """
    target = OUT_DIR / "frontier_vllm_alignment.json"
    document = json.dumps(rows, indent=2, sort_keys=True) + "\n"
    with target.open("w", encoding="utf-8") as sink:
        sink.write(document)
def setup_axis(ax: plt.Axes, title: str, ylabel: str) -> None:
    """Apply the shared look to ``ax``: title, y label, faint horizontal
    grid lines, and no top or right spine."""
    ax.set_title(title, fontsize=12, pad=10)
    ax.set_ylabel(ylabel)
    ax.grid(axis="y", alpha=0.25)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
def annotate_bars(ax: plt.Axes, bars: Any, fmt_text: str = "{:.2f}") -> None:
    """Label every bar in ``bars`` with its height.

    Bars whose height is NaN are left unlabelled. Labels sit 3 points above
    the bar top and are rotated 90 degrees when the height exceeds 2.5.

    Args:
        ax: Axes that owns the bars.
        bars: Iterable of bar patches, e.g. the return value of ``ax.bar``.
        fmt_text: ``str.format`` template applied to each height.
    """
    for rect in bars:
        value = rect.get_height()
        # NaN is the only value that differs from itself.
        is_nan = value != value
        if not is_nan:
            label = fmt_text.format(value)
            centre_x = rect.get_x() + rect.get_width() / 2
            ax.annotate(
                label,
                xy=(centre_x, value),
                xytext=(0, 3),
                textcoords="offset points",
                ha="center",
                va="bottom",
                fontsize=7,
                rotation=90 if value > 2.5 else 0,
            )
def savefig(name: str) -> None:
    """Tighten the layout of the current figure, save it as ``name`` in
    ``OUT_DIR`` at 180 dpi, then close it."""
    destination = OUT_DIR / name
    plt.tight_layout()
    plt.savefig(destination, dpi=180)
    plt.close()
def plot_throughput_ratio(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier/vLLM total-throughput ratios as one bar per run.

    Bars are coloured by tensor-parallel size. Runs whose Frontier side did
    not complete are hatched and drawn lighter. A dashed line marks parity at
    1.0. The figure is saved as ``throughput_ratio.png``.
    """
    labels = [row["label"] for row in rows]
    positions = list(range(len(rows)))
    colors = {1: "#4C78A8", 2: "#F58518", 4: "#54A24B"}
    fallback_color = "#9D9D9D"  # neutral grey for TP sizes without an entry above

    # ratio() yields None when a side is missing; plot those as NaN so the
    # bar call receives numbers only and annotate_bars skips the label.
    missing = float("nan")
    heights = [
        missing if row["total_tps_ratio"] is None else row["total_tps_ratio"]
        for row in rows
    ]

    _, ax = plt.subplots(figsize=(12, 4.8))
    bars = ax.bar(
        positions,
        heights,
        color=[colors.get(row["tp"], fallback_color) for row in rows],
        alpha=0.9,
    )
    for bar, row in zip(bars, rows, strict=True):
        if not row["frontier_complete"]:
            bar.set_hatch("//")
            bar.set_alpha(0.65)
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=35, ha="right")
    setup_axis(ax, "Frontier Throughput Relative to vLLM", "Frontier / vLLM total tok/s")
    annotate_bars(ax, bars)
    savefig("throughput_ratio.png")
def plot_latency_ratios(rows: list[dict[str, Any]]) -> None:
    """Plot grouped bars of three Frontier/vLLM latency ratios per run.

    The three series are TTFT p95, TPOT p50 and E2E p95. A dashed line marks
    parity at 1.0. The figure is saved as ``latency_ratios.png``.
    """
    labels = [row["label"] for row in rows]
    x = list(range(len(rows)))
    width = 0.26
    missing = float("nan")

    def series(column: str) -> list[float]:
        # ratio() yields None when a side is missing; plot those as NaN so
        # the bar call receives numbers only and annotate_bars skips them.
        return [missing if row[column] is None else row[column] for row in rows]

    _, ax = plt.subplots(figsize=(13, 5.2))
    b1 = ax.bar([i - width for i in x], series("ttft_p95_ratio"), width, label="TTFT p95")
    b2 = ax.bar(x, series("tpot_p50_ratio"), width, label="TPOT p50")
    b3 = ax.bar([i + width for i in x], series("e2e_p95_ratio"), width, label="E2E p95")
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=35, ha="right")
    ax.legend(frameon=False, ncols=3, loc="upper left")
    setup_axis(ax, "Latency Ratios", "Frontier / vLLM")
    for group in (b1, b2, b3):
        annotate_bars(ax, group)
    savefig("latency_ratios.png")
def plot_tp_scaling(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier and vLLM total tok/s against tensor-parallel size.

    Only rows with ``request_count == 200`` and ``scale_label`` "2" or "3" are
    used.  Each scale gets its own panel and the figure is saved as
    ``tp_scaling_total_tps.png``.

    A scale with no matching rows produces a panel carrying a
    "no matching runs" note; previously the lookup raised ``KeyError`` and
    aborted the rest of the report.
    """
    scales = ["2", "3"]
    selected = [
        row
        for row in rows
        if row["request_count"] == 200 and row["scale_label"] in {"2", "3"}
    ]
    groups: dict[Any, dict[Any, dict[str, Any]]] = {}
    for row in selected:
        # Later rows with the same (scale, tp) replace earlier ones.
        groups.setdefault(row["scale_label"], {})[row["tp"]] = row

    _, axes = plt.subplots(1, 2, figsize=(11, 4.2), sharey=False)
    for ax, scale in zip(axes, scales, strict=True):
        group = groups.get(scale, {})
        tps = sorted(group)
        if tps:
            ax.plot(
                tps,
                [group[tp]["frontier_total_tps"] for tp in tps],
                marker="o",
                label="Frontier",
            )
            ax.plot(
                tps,
                [group[tp]["vllm_total_tps"] for tp in tps],
                marker="o",
                label="vLLM",
            )
            ax.set_xticks(tps)
        else:
            # Keep the panel so the figure layout is stable, but say why it is empty.
            ax.text(
                0.5,
                0.5,
                "no matching runs",
                ha="center",
                va="center",
                transform=ax.transAxes,
            )
        ax.set_xlabel("Tensor parallel size")
        setup_axis(ax, f"N=200, timestamp scale {scale}", "total tok/s")
        if tps:
            ax.legend(frameon=False)
    savefig("tp_scaling_total_tps.png")
def plot_completion_prefix(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier completion ratio (bars) with prefix hit ratios (lines).

    The left axis shows ``frontier_completed / frontier_total`` per run; the
    right (twin) axis shows ``frontier_prefix_hit`` and ``vllm_prefix_hit``.
    The figure is saved as ``completion_prefix.png``.

    A run whose ``frontier_total`` is zero (or otherwise falsy) is drawn with a
    completion of 0.0 instead of raising ``ZeroDivisionError``.
    """
    labels = [row["label"] for row in rows]
    x = list(range(len(rows)))
    _, ax1 = plt.subplots(figsize=(12, 4.8))
    completion = [
        row["frontier_completed"] / row["frontier_total"] if row["frontier_total"] else 0.0
        for row in rows
    ]
    bars = ax1.bar(x, completion, color="#72B7B2", alpha=0.8, label="Frontier completion")
    ax1.set_ylim(0, 1.08)
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=35, ha="right")
    setup_axis(ax1, "Completion and Prefix Reuse", "Frontier completed / total")

    ax2 = ax1.twinx()
    ax2.plot(
        x,
        [row["frontier_prefix_hit"] for row in rows],
        color="#E45756",
        marker="o",
        label="Frontier prefix hit",
    )
    ax2.plot(
        x,
        [row["vllm_prefix_hit"] for row in rows],
        color="#4C78A8",
        marker="x",
        linestyle="--",
        label="vLLM trace-side prefix hit",
    )
    ax2.set_ylabel("prefix token hit ratio")
    ax2.set_ylim(0, 0.45)

    # Merge the bar handle and the twin-axis line handles into one legend.
    lines, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(
        [bars, *lines],
        ["Frontier completion", *labels2],
        frameon=False,
        loc="upper left",
        ncols=2,
    )
    savefig("completion_prefix.png")
def main() -> None:
    """Summarize every entry of ``RUNS``, then write the tables and all plots.

    Creates ``OUT_DIR`` if needed and prints the number of summarized rows
    when finished.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    rows = []
    for spec in RUNS:
        rows.append(summarize(spec))

    # Tables first, then the four figures, all fed the same row list.
    write_csv(rows)
    write_json(rows)
    for render in (
        plot_throughput_ratio,
        plot_latency_ratios,
        plot_tp_scaling,
        plot_completion_prefix,
    ):
        render(rows)
    print(f"Wrote {len(rows)} rows to {OUT_DIR}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,454 @@
#!/usr/bin/env python3
"""Summarize a Frontier RS1 smoke run."""
from __future__ import annotations
import argparse
import csv
import json
import math
import re
import sys
from pathlib import Path
from typing import Any
# Per-request prefix-cache columns that request_metrics.csv must contain before
# compute_token_weighted_cache() will aggregate cache statistics.
CACHE_COLUMNS = {
    "request_cached_prefill_tokens",
    "request_prefix_cache_query_blocks",
    "request_prefix_cache_hit_blocks",
}
def parse_args() -> argparse.Namespace:
    """Parse the two required path options, ``--run-dir`` and ``--fixture-dir``."""
    cli = argparse.ArgumentParser(description="Postprocess Frontier smoke output.")
    for flag in ("--run-dir", "--fixture-dir"):
        cli.add_argument(flag, required=True, type=Path)
    return cli.parse_args()
def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return the decoded object.

    Raises:
        ValueError: if the top-level JSON value is not an object.
    """
    with path.open("r", encoding="utf-8") as stream:
        payload = json.load(stream)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: JSON value must be an object")
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a UTF-8 JSON Lines file and return its objects in file order.

    Lines that are empty after stripping whitespace are skipped.

    Raises:
        ValueError: if a non-blank line decodes to something other than an
            object; the message carries the path and 1-based line number.
    """
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as stream:
        for line_number, raw in enumerate(stream, start=1):
            text = raw.strip()
            if text:
                record = json.loads(text)
                if not isinstance(record, dict):
                    raise ValueError(f"{path}: line {line_number}: expected object")
                records.append(record)
    return records
def load_csv(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    """Read a UTF-8 CSV file and return ``(header names, row dicts)``.

    The header list is empty when the file has no header row.
    """
    with path.open("r", encoding="utf-8", newline="") as stream:
        table = csv.DictReader(stream)
        # Touch fieldnames first so the header is consumed before the rows.
        header = list(table.fieldnames or [])
        body = list(table)
    return header, body
def find_metrics_dir(run_dir: Path) -> Path:
    """Return the directory holding the run's only ``system_metrics.json``.

    The file is searched for recursively under ``run_dir/frontier_metrics``.

    Raises:
        ValueError: if zero or more than one such file is found.
    """
    matches = sorted(run_dir.glob("frontier_metrics/**/system_metrics.json"))
    if len(matches) == 1:
        (only,) = matches
        return only.parent
    raise ValueError(
        f"{run_dir}: expected exactly one system_metrics.json under "
        f"frontier_metrics, found {len(matches)}"
    )
def read_text_if_exists(path: Path) -> str:
    """Return the UTF-8 text of *path*, or ``""`` when the path does not exist.

    Undecodable bytes are substituted rather than raising.
    """
    return path.read_text(encoding="utf-8", errors="replace") if path.exists() else ""
def parse_memory_state(log_text: str) -> dict[str, Any]:
    """Extract the values of the last ``[MEMORY_STATE]`` line in *log_text*.

    Returns ``{"available": False}`` when no such line is present.  Otherwise
    the block counts and batch size are returned as ints, while
    ``max_request_slots`` is kept as the raw matched text.
    """
    pattern = (
        r"\[MEMORY_STATE\]\s+total_blocks=(?P<total_blocks>\d+),\s+"
        r"max_blocks_per_sequence=(?P<max_blocks_per_sequence>\d+),\s+"
        r"max_request_slots=(?P<max_request_slots>[^,]+),\s+"
        r"max_batch_size=(?P<max_batch_size>\d+)"
    )
    hits = re.findall(pattern, log_text)
    if hits:
        # Each hit is a 4-tuple in group order; the most recent line wins.
        blocks, per_sequence, slots, batch = hits[-1]
        return {
            "available": True,
            "total_blocks": int(blocks),
            "max_blocks_per_sequence": int(per_sequence),
            "max_request_slots": slots,
            "max_batch_size": int(batch),
            "source": "last [MEMORY_STATE] log line",
        }
    return {"available": False}
def extract_scheduler_config(config: dict[str, Any]) -> dict[str, Any]:
    """Return ``config["cluster_config"]["replica_scheduler_config"]`` or ``{}``.

    An empty dict is returned when either level is absent or is not a dict.
    """
    cluster = config.get("cluster_config")
    scheduler = (
        cluster.get("replica_scheduler_config") if isinstance(cluster, dict) else None
    )
    if isinstance(scheduler, dict):
        return scheduler
    return {}
def extract_replica_config(config: dict[str, Any]) -> dict[str, Any]:
    """Return ``config["cluster_config"]["replica_config"]`` or ``{}``.

    An empty dict is returned when either level is absent or is not a dict.
    """
    cluster = config.get("cluster_config")
    if isinstance(cluster, dict):
        replica = cluster.get("replica_config")
        if isinstance(replica, dict):
            return replica
    return {}
def compute_token_weighted_cache(
    request_metrics_path: Path,
    sidecar_path: Path,
) -> dict[str, Any]:
    """Aggregate block-level and token-weighted prefix-cache hit ratios.

    Each row of ``request_metrics.csv`` is joined to the sidecar record with
    the same request id.  Rows whose cache columns are empty or absent are
    skipped and reported in ``rows_with_missing_cache_metrics``.

    Args:
        request_metrics_path: Frontier ``request_metrics.csv``.
        sidecar_path: ReplayServe sidecar JSONL carrying ``request_id``,
            ``input_length`` and ``block_token_counts`` per request.

    Returns:
        A dict with ``available: False`` and a ``reason`` when the cache
        columns are missing or no row has complete metrics; otherwise the
        aggregated totals and ratios.

    Raises:
        ValueError: if a metrics row has no sidecar record, if its query block
            count differs from the sidecar block count, or if it reports more
            hit blocks than query blocks.
    """
    fieldnames, rows = load_csv(request_metrics_path)
    missing = sorted(CACHE_COLUMNS - set(fieldnames))
    if missing:
        return {
            "available": False,
            "reason": f"request_metrics.csv missing cache columns: {missing}",
        }
    sidecar_by_id = {int(row["request_id"]): row for row in load_jsonl(sidecar_path)}
    total_query_blocks = 0
    total_hit_blocks = 0
    total_query_tokens = 0
    total_hit_tokens = 0
    total_frontier_cached_tokens = 0
    completed_rows = 0
    rows_with_missing_cache_metrics: list[int] = []
    for row in rows:
        # Ids may be written as floats (e.g. "3.0"), hence the float() hop.
        request_id = int(float(row["Request Id"]))
        sidecar = sidecar_by_id.get(request_id)
        if sidecar is None:
            raise ValueError(f"request_metrics.csv contains unknown request id {request_id}")
        cache_values = [
            row["request_prefix_cache_query_blocks"],
            row["request_prefix_cache_hit_blocks"],
            row["request_cached_prefill_tokens"],
        ]
        # csv.DictReader pads short rows with None, so a truncated row must be
        # treated as missing rather than passed to float().
        if any(value is None or value == "" for value in cache_values):
            rows_with_missing_cache_metrics.append(request_id)
            continue
        query_blocks = int(float(cache_values[0]))
        hit_blocks = int(float(cache_values[1]))
        cached_prefill_tokens = int(float(cache_values[2]))
        block_token_counts = [int(value) for value in sidecar["block_token_counts"]]
        input_length = int(sidecar["input_length"])
        if query_blocks != len(block_token_counts):
            raise ValueError(
                f"request {request_id}: query_blocks={query_blocks} does not match "
                f"sidecar blocks={len(block_token_counts)}"
            )
        if hit_blocks > query_blocks:
            raise ValueError(
                f"request {request_id}: hit_blocks={hit_blocks} > query_blocks={query_blocks}"
            )
        total_query_blocks += query_blocks
        total_hit_blocks += hit_blocks
        total_query_tokens += input_length
        # Weight hits by the true token count of the first hit_blocks blocks.
        total_hit_tokens += sum(block_token_counts[:hit_blocks])
        total_frontier_cached_tokens += cached_prefill_tokens
        completed_rows += 1
    if completed_rows == 0:
        return {
            "available": False,
            "reason": "no request rows had complete prefix-cache metrics",
            "rows_with_missing_cache_metrics": rows_with_missing_cache_metrics,
        }
    return {
        "available": True,
        "completed_request_rows": completed_rows,
        "total_request_metric_rows": len(rows),
        "rows_with_missing_cache_metrics": rows_with_missing_cache_metrics,
        "frontier_block_level": {
            "total_query_blocks": total_query_blocks,
            "total_hit_blocks": total_hit_blocks,
            "hit_ratio": (
                total_hit_blocks / total_query_blocks if total_query_blocks else 0.0
            ),
            "total_cached_prefill_tokens_frontier_whole_block": total_frontier_cached_tokens,
        },
        "replayserve_token_weighted": {
            "total_query_tokens": total_query_tokens,
            "total_hit_tokens": total_hit_tokens,
            "hit_ratio": (
                total_hit_tokens / total_query_tokens if total_query_tokens else 0.0
            ),
        },
        "semantics": (
            "Frontier reports whole-block hits; ReplayServe weights the first "
            "hit_blocks sidecar block_token_counts, so partial final blocks count "
            "by their true token length when they are hit."
        ),
    }
def compute_completion_summary(
    system_metrics: dict[str, Any],
    request_metrics_path: Path,
) -> dict[str, Any]:
    """Report whether every request of the run completed with a latency value.

    Args:
        system_metrics: Decoded ``system_metrics.json``; request counts are
            read from its ``simulation_metadata`` entry.
        request_metrics_path: Frontier ``request_metrics.csv``.

    Returns:
        A dict with ``is_complete``, the total/completed request counts, the
        number of CSV rows, and the ids of rows lacking ``request_e2e_time``.
        ``is_complete`` is true only when the total is positive, equals the
        completed count, and no row lacks a latency value.
    """
    fieldnames, rows = load_csv(request_metrics_path)
    missing_latency_rows: list[int] = []
    if "Request Id" in fieldnames and "request_e2e_time" in fieldnames:
        for row in rows:
            # csv.DictReader pads short rows with None; count that as missing
            # just like an empty cell.
            if row.get("request_e2e_time") in ("", None):
                missing_latency_rows.append(int(float(row["Request Id"])))
    # Tolerate an explicit JSON null as well as an absent key.
    metadata = system_metrics.get("simulation_metadata") or {}
    total_requests = int(metadata.get("total_requests") or len(rows))
    completed_requests = int(metadata.get("completed_requests") or 0)
    is_complete = (
        total_requests > 0
        and completed_requests == total_requests
        and not missing_latency_rows
    )
    return {
        "is_complete": is_complete,
        "total_requests": total_requests,
        "completed_requests": completed_requests,
        "request_metric_rows": len(rows),
        "missing_latency_request_ids": missing_latency_rows,
    }
def get_nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow *keys* through nested dicts and return the value reached.

    Returns ``None`` as soon as a non-dict is met before all keys are
    consumed, or when a key is absent.  With no keys, *data* itself is
    returned.
    """
    node: Any = data
    for name in keys:
        if isinstance(node, dict):
            node = node.get(name)
        else:
            return None
    return node
def estimate_memory_planner_blocks(
    *,
    config: dict[str, Any],
    scheduler_config: dict[str, Any],
    model_weight_memory: dict[str, Any] | None,
) -> dict[str, Any]:
    """Estimate the KV-cache block count from config when logs do not give it.

    Returns ``{"available": False, "reason": ...}`` when the model config,
    device config or weight-memory dict is missing, or when ``block_size`` is
    not positive.  Otherwise returns the derived block count together with
    every intermediate quantity used to compute it.

    Raises:
        KeyError: if a required key is absent from the model, device, replica
            or weight-memory dicts.
    """
    replica = extract_replica_config(config)
    model = replica.get("model_config")
    device = replica.get("device_config")
    inputs_present = (
        isinstance(model, dict)
        and isinstance(device, dict)
        and isinstance(model_weight_memory, dict)
    )
    if not inputs_present:
        return {"available": False, "reason": "missing model/device/weight config"}

    block_size = int(scheduler_config.get("block_size", 0))
    if block_size <= 0:
        return {"available": False, "reason": "missing positive block_size"}

    total_memory_gb = float(device["total_memory_gb"])
    utilization = scheduler_config.get("gpu_memory_utilization")
    if utilization is None:
        # Fall back to "everything except the replica's memory margin".
        utilization = 1.0 - float(replica.get("memory_margin_fraction", 0.1))
    utilization = float(utilization)

    weight_bytes = int(model_weight_memory["total_memory_bytes"])
    overhead_bytes = int(scheduler_config.get("non_kv_cache_overhead_bytes") or 0)
    budget_bytes = int(total_memory_gb * 1024**3 * utilization)
    kv_budget_bytes = budget_bytes - weight_bytes - overhead_bytes

    embedding_dim = int(model["embedding_dim"])
    num_q_heads = int(model["num_q_heads"])
    configured_head_dim = model.get("head_dim")
    head_dim = int(
        embedding_dim // num_q_heads
        if configured_head_dim is None
        else configured_head_dim
    )
    num_kv_heads = int(model["num_kv_heads"])
    attn_tp = int(replica["attn_tensor_parallel_size"])
    kv_heads_per_worker = math.ceil(num_kv_heads / attn_tp)
    num_layers = int(model["num_layers"])
    # NOTE(review): the two leading factors of 2 presumably stand for K+V and
    # 2-byte elements -- confirm against Frontier's MemoryPlanner.
    page_bytes = 2 * 2 * block_size * kv_heads_per_worker * head_dim

    if kv_budget_bytes > 0 and page_bytes > 0:
        total_blocks = int(kv_budget_bytes // page_bytes // num_layers)
    else:
        total_blocks = 0

    return {
        "available": True,
        "source": "ReplayServe fallback using Frontier MemoryPlanner.get_num_blocks formula",
        "total_blocks": total_blocks,
        "requested_memory_bytes": budget_bytes,
        "parameter_memory_per_device_bytes": weight_bytes,
        "non_kv_cache_overhead_bytes": overhead_bytes,
        "available_kv_cache_memory_bytes": kv_budget_bytes,
        "block_size": block_size,
        "num_layers": num_layers,
        "head_dim": head_dim,
        "num_kv_heads": num_kv_heads,
        "attn_tensor_parallel_size": attn_tp,
        "kv_heads_per_tensor_parallel_worker": kv_heads_per_worker,
        "page_size_bytes_per_layer_per_block": page_bytes,
        "gpu_memory_utilization": utilization,
        "total_memory_gb": total_memory_gb,
    }
def main() -> int:
    """Summarize one Frontier run directory into JSON and Markdown reports.

    Reads ``system_metrics.json``, ``request_metrics.csv`` and ``config.json``
    from the single metrics directory under ``--run-dir``, ``sidecar.jsonl``
    from ``--fixture-dir``, and the run's ``stdout.log`` / ``stderr.log`` when
    present.  Writes ``postprocess_summary.json`` and
    ``postprocess_summary.md`` into the run directory.

    Returns:
        0 on success; 1 after printing the error to stderr if any step raises.
    """
    args = parse_args()
    try:
        run_dir = args.run_dir
        fixture_dir = args.fixture_dir
        metrics_dir = find_metrics_dir(run_dir)
        system_metrics_path = metrics_dir / "system_metrics.json"
        request_metrics_path = metrics_dir / "request_metrics.csv"
        config_path = metrics_dir / "config.json"
        sidecar_path = fixture_dir / "sidecar.jsonl"
        system_metrics = load_json(system_metrics_path)
        config = load_json(config_path)
        scheduler_config = extract_scheduler_config(config)
        # Both logs are optional; a missing file contributes an empty string.
        log_text = (
            read_text_if_exists(run_dir / "stdout.log")
            + "\n"
            + read_text_if_exists(run_dir / "stderr.log")
        )
        memory_state = parse_memory_state(log_text)
        completion_summary = compute_completion_summary(
            system_metrics, request_metrics_path
        )
        cache_summary = compute_token_weighted_cache(request_metrics_path, sidecar_path)
        model_weight_memory = get_nested(system_metrics, "model_weight_memory", "MONOLITHIC")
        # No [MEMORY_STATE] line in the logs: estimate the block count from config.
        if not memory_state.get("available"):
            memory_state = estimate_memory_planner_blocks(
                config=config,
                scheduler_config=scheduler_config,
                model_weight_memory=model_weight_memory,
            )
        preemption_statistics = system_metrics.get("preemption_statistics", {})
        # Collect log lines that mention preemption or allocation trouble.
        allocation_pressure_lines = [
            line
            for line in log_text.splitlines()
            if re.search(
                r"preempt|insufficient|cannot allocate|allocation pressure|oom",
                line,
                flags=re.IGNORECASE,
            )
        ]
        summary = {
            "run_dir": str(run_dir),
            "fixture_dir": str(fixture_dir),
            "metrics_dir": str(metrics_dir),
            "system_metrics": str(system_metrics_path),
            "request_metrics": str(request_metrics_path),
            "config": str(config_path),
            "frontier_prefix_cache_statistics": system_metrics.get(
                "prefix_cache_statistics"
            ),
            "completion": completion_summary,
            "prefix_cache_postprocess": cache_summary,
            "memory_planner": {
                "mode": scheduler_config.get("num_blocks_mode"),
                "gpu_memory_utilization": scheduler_config.get(
                    "gpu_memory_utilization"
                ),
                "non_kv_cache_overhead_bytes": scheduler_config.get(
                    "non_kv_cache_overhead_bytes"
                ),
                "derived": memory_state,
                "model_weight_memory_monolithic": model_weight_memory,
                "assumption": (
                    "RS1 uses Frontier memory_planner with analytical parameter "
                    "memory and non_kv_cache_overhead_bytes=0 for plumbing smoke."
                ),
            },
            "preemption_statistics": preemption_statistics,
            "allocation_pressure_log_line_count": len(allocation_pressure_lines),
            # Only the first 20 matching lines are kept in the summary.
            "allocation_pressure_log_excerpt": allocation_pressure_lines[:20],
        }
        output_json = run_dir / "postprocess_summary.json"
        output_md = run_dir / "postprocess_summary.md"
        # Machine-readable summary.
        with output_json.open("w", encoding="utf-8") as handle:
            json.dump(summary, handle, indent=2, sort_keys=True)
            handle.write("\n")
        cache = summary["prefix_cache_postprocess"]
        mem = summary["memory_planner"]
        # Human-readable summary: one Markdown bullet per headline figure.
        with output_md.open("w", encoding="utf-8") as handle:
            handle.write(f"# RS1 Frontier Smoke: {fixture_dir.name}\n\n")
            handle.write(f"- Metrics dir: `{metrics_dir}`\n")
            handle.write(f"- Frontier system metrics: `{system_metrics_path}`\n")
            handle.write(f"- Frontier request metrics: `{request_metrics_path}`\n")
            handle.write(
                "- Completion: "
                f"`{completion_summary['completed_requests']}/"
                f"{completion_summary['total_requests']}`\n"
            )
            missing_latency_rows = completion_summary.get("missing_latency_request_ids") or []
            if missing_latency_rows:
                handle.write(
                    "- Missing latency request rows: "
                    f"`{missing_latency_rows}`\n"
                )
            if cache.get("available"):
                frontier_ratio = cache["frontier_block_level"]["hit_ratio"]
                token_ratio = cache["replayserve_token_weighted"]["hit_ratio"]
                handle.write(f"- Frontier block-level prefix hit ratio: {frontier_ratio:.8f}\n")
                handle.write(f"- ReplayServe token-weighted prefix hit ratio: {token_ratio:.8f}\n")
                missing_cache_rows = cache.get("rows_with_missing_cache_metrics") or []
                if missing_cache_rows:
                    handle.write(
                        "- Prefix-cache metric rows skipped: "
                        f"`{missing_cache_rows}`\n"
                    )
            else:
                handle.write(f"- Prefix cache postprocess unavailable: {cache.get('reason')}\n")
            derived = mem.get("derived", {})
            handle.write(f"- Memory planner mode: `{mem.get('mode')}`\n")
            handle.write(f"- GPU memory utilization: `{mem.get('gpu_memory_utilization')}`\n")
            handle.write(
                f"- Non-KV overhead bytes assumption: `{mem.get('non_kv_cache_overhead_bytes')}`\n"
            )
            if derived.get("available"):
                handle.write(f"- Derived KV blocks: `{derived.get('total_blocks')}`\n")
                # The config-based estimate has no max_batch_size, hence 'n/a'.
                handle.write(f"- Max batch size: `{derived.get('max_batch_size', 'n/a')}`\n")
            else:
                handle.write("- Derived KV blocks: not found in logs\n")
            preemptions = preemption_statistics.get("total_preemption_events")
            handle.write(f"- Total preemption events: `{preemptions}`\n")
            handle.write(
                f"- Allocation/preemption/OOM log lines: `{len(allocation_pressure_lines)}`\n"
            )
    except Exception as exc:
        # Top-level boundary: report any failure as a one-line error, exit code 1.
        print(f"postprocess_frontier_smoke.py: error: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

391
tools/qwen_to_frontier.py Executable file
View File

@@ -0,0 +1,391 @@
#!/usr/bin/env python3
"""Convert Qwen JSONL traces to Frontier trace-replay CSV fixtures."""
from __future__ import annotations
import argparse
import csv
import json
import math
import os
import sys
from pathlib import Path
from typing import Any
# Column order of the Frontier trace CSV; main() passes it to csv.DictWriter
# as the header and records it in the manifest under "csv_fields".
CSV_FIELDS = [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids",
]
# Keys of each sidecar JSONL record as built by convert_row(); recorded in the
# manifest under "sidecar_fields".
SIDECAR_FIELDS = [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts",
]
def positive_int(value: str) -> int:
    """Argparse ``type=`` converter that accepts only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if the parsed integer is zero or negative.
        ValueError: if *value* is not an integer literal.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
def positive_float(value: str) -> float:
    """Argparse ``type=`` converter that accepts only finite floats above zero.

    ``nan`` and ``inf`` are rejected explicitly: ``nan <= 0`` is false, so a
    plain sign check would let them through.

    Raises:
        argparse.ArgumentTypeError: if the parsed value is non-finite, zero or
            negative.
        ValueError: if *value* is not a float literal.
    """
    parsed = float(value)
    if not math.isfinite(parsed):
        raise argparse.ArgumentTypeError("must be a finite number")
    if parsed <= 0:
        raise argparse.ArgumentTypeError("must be positive")
    return parsed
def parse_args() -> argparse.Namespace:
    """Build the converter's command-line interface and parse ``sys.argv``.

    ``--input``, ``--frontier-csv`` and ``--sidecar-jsonl`` are required; the
    remaining options are optional, with ``--max-tokens`` defaulting to 32768,
    ``--block-size`` to 16 and ``--timestamp-scale`` to 1.0.
    """
    cli = argparse.ArgumentParser(
        description="Convert Qwen JSONL to Frontier CSV plus ReplayServe sidecar."
    )
    add = cli.add_argument

    # Required input and the two mandatory outputs.
    add("--input", required=True, type=Path, help="Qwen JSONL path.")
    add("--frontier-csv", required=True, type=Path, help="Output Frontier CSV path.")
    add(
        "--sidecar-jsonl",
        required=True,
        type=Path,
        help="Output ReplayServe sidecar JSONL path.",
    )

    # Optional extra outputs.
    add(
        "--source-jsonl",
        type=Path,
        help="Optional path for the original source JSONL slice.",
    )
    add("--manifest-json", type=Path, help="Optional path for fixture manifest JSON.")
    add("--fixture-name", help="Optional fixture name stored in the manifest.")

    # Conversion knobs.
    add("--limit", type=positive_int, help="Maximum number of rows to convert.")
    add("--max-tokens", type=positive_int, default=32768)
    add("--block-size", type=positive_int, default=16)
    add(
        "--timestamp-scale",
        type=positive_float,
        default=1.0,
        help="Multiply each source timestamp before writing fixture files.",
    )
    add(
        "--fail-on-overflow",
        action="store_true",
        help="Hard fail if input_length + output_length exceeds --max-tokens.",
    )
    return cli.parse_args()
def require_int(row: dict[str, Any], key: str, line_number: int) -> int:
    """Return ``row[key]``, insisting that it is an ``int`` and not a ``bool``.

    Raises:
        ValueError: if the key is absent or the value has another type; the
            message is prefixed with *line_number*.
    """
    try:
        candidate = row[key]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field {key!r}") from exc
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(candidate, int) and not isinstance(candidate, bool):
        return candidate
    raise ValueError(f"line {line_number}: field {key!r} must be an int")
def require_number(row: dict[str, Any], key: str, line_number: int) -> int | float:
    """Return ``row[key]``, insisting that it is an ``int`` or ``float`` (not ``bool``).

    Raises:
        ValueError: if the key is absent or the value has another type; the
            message is prefixed with *line_number*.
    """
    try:
        candidate = row[key]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field {key!r}") from exc
    is_numeric = isinstance(candidate, (int, float)) and not isinstance(candidate, bool)
    if is_numeric:
        return candidate
    raise ValueError(f"line {line_number}: field {key!r} must be numeric")
def require_hash_ids(row: dict[str, Any], line_number: int) -> list[int]:
    """Return a copy of ``row["hash_ids"]`` after checking it is a list of ints.

    Raises:
        ValueError: if the field is absent, is not a list, or contains an
            element that is not an ``int`` (``bool`` is rejected too); the
            message names the first offending index.
    """
    try:
        raw = row["hash_ids"]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field 'hash_ids'") from exc
    if not isinstance(raw, list):
        raise ValueError(f"line {line_number}: field 'hash_ids' must be a list")
    for position, element in enumerate(raw):
        if not isinstance(element, int) or isinstance(element, bool):
            raise ValueError(
                f"line {line_number}: hash_ids[{position}] must be an int"
            )
    return list(raw)
def block_token_counts(input_length: int, hash_count: int, block_size: int) -> list[int]:
    """Return the token count of each of the *hash_count* blocks.

    Every block but the last holds *block_size* tokens.  The last holds
    ``input_length % block_size``, or a full *block_size* when that remainder
    is zero.  An empty list is returned when *hash_count* is zero.
    """
    if not hash_count:
        return []
    tail = input_length % block_size or block_size
    full_blocks = [block_size] * (hash_count - 1)
    return full_blocks + [tail]
def convert_row(
    row: dict[str, Any],
    request_id: int,
    line_number: int,
    block_size: int,
    max_tokens: int,
    fail_on_overflow: bool,
    timestamp_scale: float,
) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
    """Validate one source record and derive its three output records.

    Returns:
        ``(frontier_row, sidecar_row, stats)``: the Frontier CSV row, the
        sidecar JSONL record, and per-row statistics for the manifest.

    Raises:
        ValueError: if a field is missing or mistyped, a length is not
            positive, the number of hash ids differs from
            ``ceil(input_length / block_size)``, or the row exceeds
            *max_tokens* while *fail_on_overflow* is set.
    """
    chat_id = require_int(row, "chat_id", line_number)
    parent_chat_id = require_int(row, "parent_chat_id", line_number)
    source_timestamp = require_number(row, "timestamp", line_number)
    timestamp = float(source_timestamp) * timestamp_scale
    input_length = require_int(row, "input_length", line_number)
    output_length = require_int(row, "output_length", line_number)
    turn = require_int(row, "turn", line_number)
    request_type = row.get("type")
    hash_ids = require_hash_ids(row, line_number)

    if input_length <= 0:
        raise ValueError(f"line {line_number}: input_length must be positive")
    if output_length <= 0:
        raise ValueError(f"line {line_number}: output_length must be positive")

    # One hash id is expected per (possibly partial) block of prompt tokens.
    hash_count = len(hash_ids)
    expected_hash_count = math.ceil(input_length / block_size)
    if hash_count != expected_hash_count:
        raise ValueError(
            f"line {line_number}: len(hash_ids)={len(hash_ids)} does not match "
            f"ceil(input_length / block_size)={expected_hash_count}"
        )

    total_tokens = input_length + output_length
    overflow = total_tokens > max_tokens
    if fail_on_overflow and overflow:
        raise ValueError(
            f"line {line_number}: total_tokens={total_tokens} exceeds "
            f"max_tokens={max_tokens}"
        )

    frontier_row = {
        "arrived_at": timestamp,
        "num_prefill_tokens": input_length,
        "num_decode_tokens": output_length,
        "session_id": chat_id,
        "block_hash_ids": "|".join(map(str, hash_ids)),
    }
    sidecar_row = {
        "request_id": request_id,
        "chat_id": chat_id,
        "parent_chat_id": parent_chat_id,
        "turn": turn,
        "type": request_type,
        "timestamp": timestamp,
        "input_length": input_length,
        "output_length": output_length,
        "hash_ids": hash_ids,
        "block_token_counts": block_token_counts(input_length, hash_count, block_size),
    }
    stats = {
        "total_tokens": total_tokens,
        "input_length": input_length,
        "output_length": output_length,
        "timestamp": timestamp,
        "partial_final_block": input_length % block_size != 0,
        "overflow": overflow,
    }
    return frontier_row, sidecar_row, stats
def tmp_path(path: Path) -> Path:
    """Return the sibling of *path* named ``.<name>.tmp``."""
    hidden_name = f".{path.name}.tmp"
    return path.with_name(hidden_name)
def ensure_parent(path: Path | None) -> None:
    """Create the parent directory of *path* (and ancestors); no-op for ``None``."""
    if path is None:
        return
    path.parent.mkdir(parents=True, exist_ok=True)
def publish_tmp_files(paths: list[tuple[Path, Path]]) -> None:
    """Move each temporary file onto its final path, in list order.

    Each element of *paths* is a ``(temporary, final)`` pair; an existing
    final file is replaced.
    """
    for pair in paths:
        staged, destination = pair
        os.replace(staged, destination)
def cleanup_tmp_files(paths: list[tuple[Path, Path]]) -> None:
    """Delete the temporary half of every ``(temporary, final)`` pair.

    Temporary files that are already gone are ignored; final paths are never
    touched.
    """
    for pair in paths:
        staged = pair[0]
        try:
            staged.unlink()
        except FileNotFoundError:
            continue
def main() -> int:
    """Convert a Qwen JSONL trace into Frontier fixture files.

    Writes the Frontier CSV and the sidecar JSONL, plus the optional source
    slice and manifest.  All outputs go to temporary siblings first and are
    only moved into place once every file was written successfully; on any
    error the temporaries are removed.

    Returns:
        0 on success; 1 after printing the error to stderr if conversion fails.
    """
    args = parse_args()
    for output_path in (
        args.frontier_csv,
        args.sidecar_jsonl,
        args.source_jsonl,
        args.manifest_json,
    ):
        ensure_parent(output_path)
    temporary_paths: list[tuple[Path, Path]] = [
        (tmp_path(args.frontier_csv), args.frontier_csv),
        (tmp_path(args.sidecar_jsonl), args.sidecar_jsonl),
    ]
    if args.source_jsonl is not None:
        temporary_paths.append((tmp_path(args.source_jsonl), args.source_jsonl))
    if args.manifest_json is not None:
        temporary_paths.append((tmp_path(args.manifest_json), args.manifest_json))
    row_count = 0
    overflow_count = 0
    max_total_tokens = 0
    max_input_length = 0
    max_output_length = 0
    first_timestamp: float | None = None
    last_timestamp: float | None = None
    timestamp_monotonic = True
    partial_final_block_rows = 0
    try:
        with (
            args.input.open("r", encoding="utf-8") as input_file,
            tmp_path(args.frontier_csv).open("w", encoding="utf-8", newline="") as csv_file,
            tmp_path(args.sidecar_jsonl).open("w", encoding="utf-8") as sidecar_file,
        ):
            csv_writer = csv.DictWriter(
                csv_file, fieldnames=CSV_FIELDS, lineterminator="\n"
            )
            csv_writer.writeheader()
            source_file = None
            if args.source_jsonl is not None:
                source_file = tmp_path(args.source_jsonl).open("w", encoding="utf-8")
            try:
                for line_number, raw_line in enumerate(input_file, start=1):
                    if args.limit is not None and row_count >= args.limit:
                        break
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    # Report malformed input with the file's line number; the
                    # decoder only knows positions within this one line.
                    try:
                        row = json.loads(stripped)
                    except json.JSONDecodeError as exc:
                        raise ValueError(
                            f"line {line_number}: invalid JSON: {exc}"
                        ) from exc
                    if not isinstance(row, dict):
                        raise ValueError(
                            f"line {line_number}: expected a JSON object"
                        )
                    frontier_row, sidecar_row, stats = convert_row(
                        row=row,
                        # Dense index of converted rows, so the sidecar id
                        # equals the row's position in the Frontier CSV even
                        # when blank input lines were skipped.  The CSV has no
                        # id column, so row order is the only join key.
                        request_id=row_count,
                        line_number=line_number,
                        block_size=args.block_size,
                        max_tokens=args.max_tokens,
                        fail_on_overflow=args.fail_on_overflow,
                        timestamp_scale=args.timestamp_scale,
                    )
                    csv_writer.writerow(frontier_row)
                    sidecar_file.write(
                        json.dumps(sidecar_row, sort_keys=True, separators=(",", ":"))
                        + "\n"
                    )
                    if source_file is not None:
                        if args.timestamp_scale == 1.0:
                            # Unscaled: keep the source line verbatim.
                            source_file.write(
                                raw_line if raw_line.endswith("\n") else raw_line + "\n"
                            )
                        else:
                            # Scaled: re-serialize with the scaled timestamp.
                            source_row = dict(row)
                            source_row["timestamp"] = stats["timestamp"]
                            source_file.write(
                                json.dumps(
                                    source_row, sort_keys=True, separators=(",", ":")
                                )
                                + "\n"
                            )
                    row_count += 1
                    overflow_count += int(stats["overflow"])
                    max_total_tokens = max(max_total_tokens, int(stats["total_tokens"]))
                    max_input_length = max(max_input_length, int(stats["input_length"]))
                    max_output_length = max(max_output_length, int(stats["output_length"]))
                    partial_final_block_rows += int(stats["partial_final_block"])
                    timestamp = float(stats["timestamp"])
                    if first_timestamp is None:
                        first_timestamp = timestamp
                    if last_timestamp is not None and timestamp < last_timestamp:
                        timestamp_monotonic = False
                    last_timestamp = timestamp
            finally:
                if source_file is not None:
                    source_file.close()
        if args.manifest_json is not None:
            manifest = {
                "fixture_name": args.fixture_name,
                "generated_by": "tools/qwen_to_frontier.py",
                "input_jsonl": str(args.input),
                "source_jsonl": str(args.source_jsonl) if args.source_jsonl else None,
                "frontier_csv": str(args.frontier_csv),
                "sidecar_jsonl": str(args.sidecar_jsonl),
                "csv_fields": CSV_FIELDS,
                "sidecar_fields": SIDECAR_FIELDS,
                "limit": args.limit,
                "row_count": row_count,
                "block_size": args.block_size,
                "max_tokens": args.max_tokens,
                "fail_on_overflow": args.fail_on_overflow,
                "timestamp_scale": args.timestamp_scale,
                "overflow_count": overflow_count,
                "max_total_tokens": max_total_tokens,
                "max_input_length": max_input_length,
                "max_output_length": max_output_length,
                "first_timestamp": first_timestamp,
                "last_timestamp": last_timestamp,
                "timestamp_monotonic": timestamp_monotonic,
                "partial_final_block_rows": partial_final_block_rows,
                "adapter_semantics": {
                    "timestamp": "arrived_at",
                    "input_length": "num_prefill_tokens",
                    "output_length": "num_decode_tokens",
                    "chat_id": "session_id",
                    "hash_ids": "block_hash_ids joined by |",
                    "block_token_counts": (
                        "full blocks use block_size tokens; final partial block "
                        "uses input_length % block_size, or block_size when zero"
                    ),
                },
            }
            with tmp_path(args.manifest_json).open("w", encoding="utf-8") as manifest_file:
                json.dump(manifest, manifest_file, indent=2, sort_keys=True)
                manifest_file.write("\n")
        # Every output is complete and closed: move them into place.
        publish_tmp_files(temporary_paths)
    except Exception as exc:
        # Top-level boundary: discard partial outputs and report one line.
        cleanup_tmp_files(temporary_paths)
        print(f"qwen_to_frontier.py: error: {exc}", file=sys.stderr)
        return 1
    if overflow_count and not args.fail_on_overflow:
        print(
            f"qwen_to_frontier.py: warning: {overflow_count} rows exceed "
            f"max_tokens={args.max_tokens}; no clipping was applied",
            file=sys.stderr,
        )
    print(
        f"converted rows={row_count} max_total_tokens={max_total_tokens} "
        f"overflows={overflow_count}",
        file=sys.stderr,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

534
tools/run_frontier_sweep.py Normal file
View File

@@ -0,0 +1,534 @@
#!/usr/bin/env python3
"""Run a small Frontier sweep from a ReplayServe JSON config."""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
from typing import Any
# Parent of the directory containing this script; parse_args() uses it to
# locate the default sweep config under configs/.
REPLAYSERVE_ROOT = Path(__file__).resolve().parents[1]
def parse_args() -> argparse.Namespace:
    """Build the sweep runner's command-line interface and parse ``sys.argv``.

    Every option is optional.  ``--config`` defaults to
    ``configs/rs3_tiny_sweep.json`` under ``REPLAYSERVE_ROOT``;
    ``--only-config`` and ``--only-fixture`` may be repeated and accumulate
    into lists.
    """
    cli = argparse.ArgumentParser(description="Run Frontier configs from JSON.")
    default_config = REPLAYSERVE_ROOT / "configs" / "rs3_tiny_sweep.json"
    cli.add_argument(
        "--config", type=Path, default=default_config, help="Sweep JSON config."
    )
    cli.add_argument("--suite-id", help="Override suite_id from the config.")
    cli.add_argument(
        "--run-root",
        type=Path,
        help="Override run root. Defaults to runs/<suite_id>.",
    )

    # Repeatable selection filters.
    for flag, text in (
        ("--only-config", "Run only a config id. Can be repeated."),
        ("--only-fixture", "Run only a fixture. Can be repeated."),
    ):
        cli.add_argument(flag, action="append", default=[], help=text)

    # Boolean switches.
    for flag, text in (
        ("--dry-run", "Write manifests and commands, but do not execute Frontier."),
        ("--force", "Replace existing run dirs selected by this invocation."),
    ):
        cli.add_argument(flag, action="store_true", help=text)
    return cli.parse_args()
def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return the decoded object.

    Raises:
        ValueError: if the top-level JSON value is not an object.
    """
    with path.open("r", encoding="utf-8") as stream:
        payload = json.load(stream)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: top-level JSON must be an object")
def git_head(path: Path) -> str | None:
    """Return the commit hash of ``HEAD`` for the repository at *path*.

    Returns ``None`` when ``git rev-parse HEAD`` exits non-zero (for example
    *path* is not a repository) or when the ``git`` executable cannot be
    launched at all, so provenance capture never aborts a sweep.
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(path), "rev-parse", "HEAD"],
            check=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable git binary.
        return None
    return result.stdout.strip()
def git_status(path: Path) -> str | None:
    """Return the raw ``git status --short`` output for the repository at *path*.

    The output is returned unstripped; an empty string means a clean tree.
    Returns ``None`` when the command exits non-zero or when the ``git``
    executable cannot be launched at all.
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(path), "status", "--short"],
            check=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable git binary.
        return None
    return result.stdout
def shell_join(argv: list[str]) -> str:
    """Render *argv* as one shell-quoted command line, arguments separated by spaces."""
    import shlex

    quoted = map(shlex.quote, argv)
    return " ".join(quoted)
def merge_config(defaults: dict[str, Any], item: dict[str, Any]) -> dict[str, Any]:
    """Return *defaults* overlaid with the ``overrides`` of one sweep item.

    A missing or ``None`` ``overrides`` entry means no overrides.  When the
    merged result has ``max_num_seqs`` and the overrides do not set
    ``batch_size_cap`` themselves, ``batch_size_cap`` is set to
    ``max_num_seqs``.

    Raises:
        ValueError: if ``overrides`` is present but is not a dict.
    """
    merged = dict(defaults)
    overrides = item.get("overrides", {})
    if overrides is None:
        overrides = {}
    elif not isinstance(overrides, dict):
        raise ValueError(f"config {item.get('id')}: overrides must be an object")
    merged.update(overrides)

    explicit_cap = "batch_size_cap" in overrides
    if not explicit_cap and "max_num_seqs" in merged:
        merged["batch_size_cap"] = merged["max_num_seqs"]
    return merged
def build_frontier_command(
    *,
    python_bin: str,
    trace_file: Path,
    metrics_root: Path,
    run_id: str,
    knobs: dict[str, Any],
) -> list[str]:
    """Assemble the argv list that launches ``frontier.main`` for one run.

    Args:
        python_bin: Interpreter placed at the front of the command.
        trace_file: Trace file handed to the trace-replay request generator.
        metrics_root: Directory passed as the metrics output directory.
        run_id: Value passed as the metrics run id.
        knobs: Merged configuration. Most keys are mandatory and raise
            ``KeyError`` when absent; the optional ones are read with
            ``.get`` below.

    Returns:
        The command as a list of strings, in a fixed flag order.
    """
    predictor_prefix = "--random_forrest_execution_time_predictor_config_"

    # Flag/value pairs, listed in the order they appear on the command line.
    valued_options = [
        ("--simulation_mode", str(knobs["simulation_mode"])),
        ("--sys_arch", str(knobs["sys_arch"])),
        ("--cc_backend_config_type", "analytical"),
        ("--cluster_config_num_replicas", str(knobs["num_replicas"])),
        ("--cluster_scheduler_config_type", str(knobs["cluster_scheduler"])),
        ("--replica_config_model_name", str(knobs["model_name"])),
        ("--replica_config_device", str(knobs["device"])),
        ("--replica_config_network_device", str(knobs["network_device"])),
        (
            "--replica_config_attn_tensor_parallel_size",
            str(knobs["attn_tensor_parallel_size"]),
        ),
        (
            "--replica_config_attn_data_parallel_size",
            str(knobs["attn_data_parallel_size"]),
        ),
        (
            "--replica_config_moe_tensor_parallel_size",
            str(knobs["moe_tensor_parallel_size"]),
        ),
        (
            "--replica_config_moe_expert_parallel_size",
            str(knobs["moe_expert_parallel_size"]),
        ),
        ("--replica_config_num_pipeline_stages", str(knobs["num_pipeline_stages"])),
        ("--replica_scheduler_config_type", str(knobs["replica_scheduler"])),
        (
            "--decode_cuda_graph_mode",
            str(knobs.get("decode_cuda_graph_mode", "full_decode_only")),
        ),
        ("--vllm_v1_scheduler_config_batch_size_cap", str(knobs["batch_size_cap"])),
        (
            "--vllm_v1_scheduler_config_max_tokens_in_batch",
            str(knobs["max_tokens_in_batch"]),
        ),
        (
            "--vllm_v1_scheduler_config_long_prefill_token_threshold",
            str(knobs["long_prefill_token_threshold"]),
        ),
        ("--vllm_v1_scheduler_config_block_size", str(knobs["block_size"])),
        ("--vllm_v1_scheduler_config_num_blocks_mode", str(knobs["num_blocks_mode"])),
        (
            "--vllm_v1_scheduler_config_gpu_memory_utilization",
            str(knobs["gpu_memory_utilization"]),
        ),
        (
            "--vllm_v1_scheduler_config_non_kv_cache_overhead_bytes",
            str(knobs["non_kv_cache_overhead_bytes"]),
        ),
        ("--request_generator_config_type", "trace_replay"),
        ("--trace_request_generator_config_trace_file", str(trace_file)),
        (
            "--trace_request_generator_config_max_tokens",
            str(knobs["trace_max_tokens"]),
        ),
        ("--metrics_config_output_dir", str(metrics_root)),
        ("--metrics_config_run_id", run_id),
    ]

    command = [python_bin, "-m", "frontier.main"]
    for flag, value in valued_options:
        command += [flag, value]

    # Metrics switches that are always passed, with no value.
    command += [
        "--metrics_config_write_metrics",
        "--metrics_config_store_request_metrics",
        "--metrics_config_store_batch_metrics",
        "--metrics_config_store_token_completion_metrics",
        "--metrics_config_store_utilization_metrics",
        "--no-metrics_config_store_plots",
        "--no-metrics_config_enable_chrome_trace",
        "--no-metrics_config_write_json_trace",
        "--no-metrics_config_store_frontier_stage_batch_ledger",
    ]

    # Dummy mode defaults to on and then requires "dummy_execution_time_ms".
    if bool(knobs.get("enable_dummy_mode", True)):
        command += [
            "--random_forrest_execution_time_predictor_config_enable_dummy_mode",
            "--random_forrest_execution_time_predictor_config_dummy_execution_time_ms",
            str(knobs["dummy_execution_time_ms"]),
        ]
    else:
        command.append(
            "--no-random_forrest_execution_time_predictor_config_enable_dummy_mode"
        )

    # Profile input files are forwarded only when set to a truthy value.
    for profile_knob in (
        "linear_op_input_file",
        "atten_input_file",
        "moe_input_file",
        "linear_op_kernel_only_input_file",
        "atten_kernel_only_input_file",
        "moe_kernel_only_input_file",
    ):
        profile_value = knobs.get(profile_knob)
        if profile_value:
            command += [f"{predictor_prefix}{profile_knob}", str(profile_value)]

    # Prediction limits are forwarded whenever present, including zero.
    for limit_knob in (
        "prediction_max_prefill_chunk_size",
        "prediction_max_batch_size",
        "prediction_max_tokens_per_request",
    ):
        limit_value = knobs.get(limit_knob)
        if limit_value is not None:
            command += [f"{predictor_prefix}{limit_knob}", str(limit_value)]

    if bool(knobs.get("no_cache", False)):
        command.append("--random_forrest_execution_time_predictor_config_no_cache")
    if bool(knobs.get("skip_cpu_overhead_modeling", True)):
        command.append(
            "--random_forrest_execution_time_predictor_config_skip_cpu_overhead_modeling"
        )
    if knobs.get("num_blocks") is not None:
        command += ["--vllm_v1_scheduler_config_num_blocks", str(knobs["num_blocks"])]
    if bool(knobs["enable_prefix_caching"]):
        command.append("--vllm_v1_scheduler_config_enable_prefix_caching")
    if bool(knobs["enable_chunked_prefill"]):
        command.append("--vllm_v1_scheduler_config_enable_chunked_prefill")
    return command
def write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, replacing any existing content."""
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)
def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` to ``path`` as indented, key-sorted JSON plus a newline."""
    with path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, sort_keys=True)
        handle.write("\n")


def run_one(
    *,
    suite_id: str,
    sim: str,
    frontier_info: dict[str, Any],
    frontier_root: Path,
    fixture: str,
    config_item: dict[str, Any],
    knobs: dict[str, Any],
    run_root: Path,
    python_bin: str,
    python_deps_dir: Path,
    dry_run: bool,
    force: bool,
) -> dict[str, Any]:
    """Execute (or dry-run) one fixture/config combination and record its artifacts.

    The run directory is ``<run_root>/<sim>/<fixture>/<config id>``. It always
    receives ``run_manifest.json``, ``command.txt``, ``env.txt``,
    ``exit_code.txt`` and ``run_status.json``. A real run additionally writes
    the Frontier stdout/stderr logs and timing files, and when Frontier exits
    with 0 the postprocess script is run and its logs are stored as well.

    Returns:
        A dict with ``status``, ``exit_code``, ``runtime_seconds`` and
        ``postprocess_exit_code``. ``status`` is one of ``dry_run``, ``pass``,
        ``fail``, ``postprocess_fail`` or ``incomplete``.

    Raises:
        FileNotFoundError: if the fixture's ``frontier.csv`` or
            ``sidecar.jsonl`` is missing.
        FileExistsError: if the run directory exists and ``force`` is false.
    """
    config_id = str(config_item["id"])
    fixture_dir = REPLAYSERVE_ROOT / "traces" / "fixtures" / fixture
    trace_file = fixture_dir / "frontier.csv"
    sidecar_file = fixture_dir / "sidecar.jsonl"
    if not trace_file.exists():
        raise FileNotFoundError(f"missing trace file: {trace_file}")
    if not sidecar_file.exists():
        raise FileNotFoundError(f"missing sidecar file: {sidecar_file}")

    run_dir = (run_root / sim / fixture / config_id).resolve()
    metrics_root = (run_dir / "frontier_metrics").resolve()
    if run_dir.exists():
        if not force:
            raise FileExistsError(f"run dir exists, use --force to replace: {run_dir}")
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True)
    metrics_root.mkdir(parents=True)

    run_id = f"{suite_id}_{fixture}_{config_id}"
    cmd = build_frontier_command(
        python_bin=python_bin,
        trace_file=trace_file,
        metrics_root=metrics_root,
        run_id=run_id,
        knobs=knobs,
    )

    # Search order: vendored deps (if the directory exists), the Frontier
    # checkout, then whatever PYTHONPATH the caller already had.
    existing_pythonpath = os.environ.get("PYTHONPATH")
    pythonpath_parts = []
    if python_deps_dir.is_dir():
        pythonpath_parts.append(str(python_deps_dir))
    pythonpath_parts.append(str(frontier_root))
    if existing_pythonpath:
        pythonpath_parts.append(existing_pythonpath)

    env = os.environ.copy()
    env.update(
        {
            # os.pathsep rather than a literal ":" so the joined value uses
            # the platform's search-path separator.
            "PYTHONPATH": os.pathsep.join(pythonpath_parts),
            "WANDB_DISABLED": "true",
            "VIDUR_DISABLE_WANDB": "1",
            "FRONTIER_LOG_LEVEL": env.get("FRONTIER_LOG_LEVEL", "info"),
            "PYTHONDONTWRITEBYTECODE": "1",
        }
    )

    frontier_head = git_head(frontier_root)
    frontier_status = git_status(frontier_root)
    manifest = {
        "suite_id": suite_id,
        "sim": sim,
        "fixture": fixture,
        "config_id": config_id,
        "description": config_item.get("description", ""),
        "run_dir": str(run_dir),
        "metrics_root": str(metrics_root),
        "run_id": run_id,
        "frontier": {
            **frontier_info,
            "root": str(frontier_root),
            "head": frontier_head,
            "status_short": frontier_status,
        },
        "fixture_dir": str(fixture_dir),
        "trace_file": str(trace_file),
        "sidecar_file": str(sidecar_file),
        "knobs": knobs,
        "command": cmd,
    }
    _write_json_file(run_dir / "run_manifest.json", manifest)

    write_text(
        run_dir / "command.txt",
        "\n".join(
            [
                f"cd {frontier_root}",
                f"export PYTHONPATH={env['PYTHONPATH']}",
                f"export WANDB_DISABLED={env['WANDB_DISABLED']}",
                f"export VIDUR_DISABLE_WANDB={env['VIDUR_DISABLE_WANDB']}",
                f"export FRONTIER_LOG_LEVEL={env['FRONTIER_LOG_LEVEL']}",
                f"export PYTHONDONTWRITEBYTECODE={env['PYTHONDONTWRITEBYTECODE']}",
                f"command={shell_join(cmd)}",
                "",
            ]
        ),
    )
    write_text(
        run_dir / "env.txt",
        "\n".join(
            [
                f"suite_id={suite_id}",
                f"sim={sim}",
                f"fixture={fixture}",
                f"config_id={config_id}",
                f"replayserve_root={REPLAYSERVE_ROOT}",
                f"frontier_root={frontier_root}",
                f"frontier_head={frontier_head}",
                f"python_deps_dir={python_deps_dir}",
                f"trace_file={trace_file}",
                f"sidecar_file={sidecar_file}",
                f"run_dir={run_dir}",
                f"metrics_root={metrics_root}",
                f"run_id={run_id}",
                "",
            ]
        ),
    )

    if dry_run:
        # Manifests and command files are on disk; Frontier is not launched.
        write_text(run_dir / "exit_code.txt", "0\n")
        status = {
            "status": "dry_run",
            "exit_code": 0,
            "runtime_seconds": 0,
            "postprocess_exit_code": None,
        }
        _write_json_file(run_dir / "run_status.json", status)
        return status

    start_epoch = int(time.time())
    write_text(run_dir / "start_epoch.txt", f"{start_epoch}\n")
    with (run_dir / "stdout.log").open("w", encoding="utf-8") as stdout, (
        run_dir / "stderr.log"
    ).open("w", encoding="utf-8") as stderr:
        proc = subprocess.run(cmd, cwd=frontier_root, env=env, stdout=stdout, stderr=stderr)
    end_epoch = int(time.time())
    runtime_seconds = end_epoch - start_epoch
    write_text(run_dir / "end_epoch.txt", f"{end_epoch}\n")
    write_text(run_dir / "exit_code.txt", f"{proc.returncode}\n")
    write_text(run_dir / "runtime_seconds.txt", f"{runtime_seconds}\n")

    # Postprocessing only runs after a successful Frontier exit.
    postprocess_exit_code: int | None = None
    if proc.returncode == 0:
        postprocess_cmd = [
            python_bin,
            str(REPLAYSERVE_ROOT / "tools" / "postprocess_frontier_smoke.py"),
            "--run-dir",
            str(run_dir),
            "--fixture-dir",
            str(fixture_dir),
        ]
        with (run_dir / "postprocess.stdout.log").open("w", encoding="utf-8") as stdout, (
            run_dir / "postprocess.stderr.log"
        ).open("w", encoding="utf-8") as stderr:
            post = subprocess.run(
                postprocess_cmd,
                cwd=REPLAYSERVE_ROOT,
                env=env,
                stdout=stdout,
                stderr=stderr,
            )
        postprocess_exit_code = post.returncode

    if proc.returncode != 0:
        status_name = "fail"
    elif postprocess_exit_code not in (0, None):
        status_name = "postprocess_fail"
    else:
        status_name = "pass"

    if status_name == "pass":
        # Downgrade a nominal pass when the postprocess summary reports an
        # incomplete run, or when the summary cannot be read.
        summary_path = run_dir / "postprocess_summary.json"
        if summary_path.exists():
            try:
                summary = load_json(summary_path)
                completion = summary.get("completion", {})
                if isinstance(completion, dict) and not completion.get("is_complete", True):
                    status_name = "incomplete"
            except Exception:
                # Best effort: an unreadable summary is recorded as a
                # postprocess failure instead of aborting the sweep.
                status_name = "postprocess_fail"

    status = {
        "status": status_name,
        "exit_code": proc.returncode,
        "runtime_seconds": runtime_seconds,
        "postprocess_exit_code": postprocess_exit_code,
    }
    _write_json_file(run_dir / "run_status.json", status)
    return status
def main() -> int:
    """Run every selected fixture/config combination of a sweep config.

    Returns:
        ``0`` when every run ends with status ``pass`` or ``dry_run``,
        otherwise ``1``.

    Raises:
        ValueError: if the config is malformed or a selection is empty.
        FileNotFoundError: if the Frontier root directory does not exist.
    """
    args = parse_args()
    config_path = args.config.resolve()
    config = load_json(config_path)

    suite_id = args.suite_id or str(config.get("suite_id") or "rs3_sweep")
    run_root = args.run_root or (REPLAYSERVE_ROOT / "runs" / suite_id)
    sim = str(config.get("sim") or "frontier")

    frontier_info = config.get("frontier", {})
    if not isinstance(frontier_info, dict):
        raise ValueError("frontier must be an object")
    frontier_root = Path(str(frontier_info.get("root") or "/tmp/toc-llm-sim-research/Frontier"))
    if not frontier_root.is_dir():
        raise FileNotFoundError(f"Frontier root does not exist: {frontier_root}")

    fixtures = [str(value) for value in config.get("fixtures", [])]
    if args.only_fixture:
        selected = set(args.only_fixture)
        fixtures = [value for value in fixtures if value in selected]
    if not fixtures:
        raise ValueError("no fixtures selected")

    defaults = config.get("defaults", {})
    if not isinstance(defaults, dict):
        raise ValueError("defaults must be an object")

    config_items = config.get("configs", [])
    if not isinstance(config_items, list) or not config_items:
        raise ValueError("configs must be a non-empty list")
    if args.only_config:
        selected_configs = set(args.only_config)
        config_items = [
            item
            for item in config_items
            if isinstance(item, dict) and str(item.get("id")) in selected_configs
        ]
    if not config_items:
        raise ValueError("no configs selected")

    # Validate and merge every selected entry before launching anything, so
    # a malformed entry fails the sweep up front rather than after earlier
    # combinations have already run.
    prepared: list[tuple[dict[str, Any], dict[str, Any]]] = []
    for item in config_items:
        if not isinstance(item, dict):
            raise ValueError("each configs entry must be an object")
        if "id" not in item:
            raise ValueError("each configs entry needs id")
        prepared.append((item, merge_config(defaults, item)))

    # Prefer the repo-local virtualenv interpreter when it exists.
    venv_python = REPLAYSERVE_ROOT / ".venv" / "bin" / "python"
    if venv_python.is_file():
        python_bin = str(venv_python)
    else:
        python_bin = os.environ.get("PYTHON_BIN", sys.executable or "python3")
    python_deps_dir = Path(
        os.environ.get("PYTHON_DEPS_DIR", str(REPLAYSERVE_ROOT / ".deps" / "python"))
    )

    results: list[dict[str, Any]] = []
    for fixture in fixtures:
        for item, knobs in prepared:
            status = run_one(
                suite_id=suite_id,
                sim=sim,
                frontier_info=frontier_info,
                frontier_root=frontier_root,
                fixture=fixture,
                config_item=item,
                # Each run gets its own copy of the merged knobs.
                knobs=dict(knobs),
                run_root=run_root,
                python_bin=python_bin,
                python_deps_dir=python_deps_dir,
                dry_run=args.dry_run,
                force=args.force,
            )
            results.append(
                {
                    "fixture": fixture,
                    "config_id": item["id"],
                    **status,
                }
            )
            print(
                f"{fixture}/{item['id']}: {status['status']} "
                f"exit={status['exit_code']} runtime={status['runtime_seconds']}s"
            )

    failures = [row for row in results if row["status"] not in {"pass", "dry_run"}]
    return 1 if failures else 0


if __name__ == "__main__":
    raise SystemExit(main())

240
tools/validate_fixtures.py Executable file
View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""Validate ReplayServe fixture directories."""
from __future__ import annotations
import argparse
import csv
import json
import math
import sys
from pathlib import Path
from typing import Any
def positive_int(value: str) -> int:
    """argparse ``type`` callback: parse ``value`` as an int greater than zero."""
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
def parse_args() -> argparse.Namespace:
    """Parse the command line: one or more fixture directories plus limits."""
    parser = argparse.ArgumentParser(description="Validate ReplayServe fixtures.")
    parser.add_argument("fixture_dirs", nargs="+", type=Path)
    # Both limits are positive integers with the defaults given here.
    for flag, default in (("--max-tokens", 32768), ("--block-size", 16)):
        parser.add_argument(flag, type=positive_int, default=default)
    return parser.parse_args()
def parse_block_hash_ids(value: str) -> list[int]:
    """Parse a ``|``-separated list of integer block hash ids.

    Surrounding whitespace is ignored and empty segments are skipped; a blank
    value yields an empty list.
    """
    text = value.strip()
    if text == "":
        return []
    pieces = text.split("|")
    return [int(piece) for piece in pieces if piece != ""]
def expected_block_counts(input_length: int, block_size: int) -> list[int]:
    """Return the per-block token counts for a prompt of ``input_length`` tokens.

    Every block holds ``block_size`` tokens except possibly the last, which
    holds the remainder. A length that rounds up to zero blocks yields ``[]``.
    """
    block_total = math.ceil(input_length / block_size)
    if block_total == 0:
        return []
    # A zero remainder means the final block is exactly full.
    tail = input_length % block_size or block_size
    counts = [block_size] * (block_total - 1)
    counts.append(tail)
    return counts
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file as a list of objects, skipping blank lines.

    Raises:
        ValueError: if a non-blank line is not valid JSON or does not decode
            to an object; the message names the file and 1-based line number.
    """
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        line_number = 0
        for raw_line in handle:
            line_number += 1
            text = raw_line.strip()
            if text == "":
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}: line {line_number}: invalid JSON") from exc
            if isinstance(record, dict):
                records.append(record)
            else:
                raise ValueError(f"{path}: line {line_number}: JSON value must be object")
    return records
def load_csv(path: Path) -> list[dict[str, str]]:
    """Read the trace CSV at ``path`` into a list of row dicts.

    Raises:
        ValueError: if any of the required columns is absent from the header.
    """
    required_columns = {
        "arrived_at",
        "num_prefill_tokens",
        "num_decode_tokens",
        "session_id",
        "block_hash_ids",
    }
    with path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        header = set(reader.fieldnames or [])
        absent = required_columns - header
        if absent:
            raise ValueError(f"{path}: missing CSV columns: {sorted(absent)}")
        rows = [row for row in reader]
    return rows
def require_paths(fixture_dir: Path) -> tuple[Path, Path, Path, Path]:
    """Return the source, CSV, sidecar and manifest paths of a fixture.

    Raises:
        ValueError: naming the first of the four files that does not exist.
    """
    file_names = ("source.jsonl", "frontier.csv", "sidecar.jsonl", "manifest.json")
    source_path, csv_path, sidecar_path, manifest_path = (
        fixture_dir / file_name for file_name in file_names
    )
    for candidate in (source_path, csv_path, sidecar_path, manifest_path):
        if candidate.exists():
            continue
        raise ValueError(f"{fixture_dir}: missing {candidate.name}")
    return source_path, csv_path, sidecar_path, manifest_path
def validate_fixture(fixture_dir: Path, block_size: int, max_tokens: int) -> str:
    """Cross-check a fixture's source, CSV, sidecar and manifest files.

    The files must have the same number of rows. Each CSV row is compared
    against the matching source and sidecar rows, and the manifest totals are
    compared against values recomputed from the CSV.

    Returns:
        A one-line summary: row count, largest prompt+decode token total and
        the number of rows whose final block is partial.

    Raises:
        ValueError: describing the first inconsistency found.
    """

    def fail_if(condition: bool, message: str) -> None:
        # Every consistency check below funnels through this helper.
        if condition:
            raise ValueError(message)

    source_path, csv_path, sidecar_path, manifest_path = require_paths(fixture_dir)
    source_rows = load_jsonl(source_path)
    csv_rows = load_csv(csv_path)
    sidecar_rows = load_jsonl(sidecar_path)
    with manifest_path.open("r", encoding="utf-8") as handle:
        manifest = json.load(handle)

    row_count = len(csv_rows)
    fail_if(
        len(source_rows) != row_count or len(sidecar_rows) != row_count,
        f"{fixture_dir}: row count mismatch source={len(source_rows)} "
        f"csv={row_count} sidecar={len(sidecar_rows)}",
    )
    for key, expected, description in (
        ("row_count", row_count, f"csv rows={row_count}"),
        ("block_size", block_size, f"expected {block_size}"),
        ("max_tokens", max_tokens, f"expected {max_tokens}"),
    ):
        fail_if(
            manifest.get(key) != expected,
            f"{fixture_dir}: manifest {key}={manifest.get(key)} "
            f"does not match {description}",
        )

    required_sidecar_keys = frozenset(
        (
            "request_id",
            "chat_id",
            "parent_chat_id",
            "turn",
            "type",
            "timestamp",
            "input_length",
            "output_length",
            "hash_ids",
            "block_token_counts",
        )
    )

    last_timestamp: float | None = None
    max_total_tokens = 0
    partial_final_block_rows = 0
    for index, (source, csv_row, sidecar) in enumerate(
        zip(source_rows, csv_rows, sidecar_rows)
    ):
        prefix = f"{fixture_dir}: row {index}"

        # Token budget.
        input_length = int(csv_row["num_prefill_tokens"])
        output_length = int(csv_row["num_decode_tokens"])
        total_tokens = input_length + output_length
        fail_if(
            total_tokens > max_tokens,
            f"{prefix}: total_tokens={total_tokens} exceeds max_tokens={max_tokens}",
        )
        if total_tokens > max_total_tokens:
            max_total_tokens = total_tokens

        # Arrival times may repeat but must never decrease.
        timestamp = float(csv_row["arrived_at"])
        fail_if(
            last_timestamp is not None and timestamp < last_timestamp,
            f"{prefix}: timestamp is not monotonic",
        )
        last_timestamp = timestamp

        # Block hashes: one per block, the final block possibly partial.
        hash_ids = parse_block_hash_ids(csv_row["block_hash_ids"])
        expected_hash_count = math.ceil(input_length / block_size)
        fail_if(
            len(hash_ids) != expected_hash_count,
            f"{prefix}: hash count {len(hash_ids)} != {expected_hash_count}",
        )
        counts = expected_block_counts(input_length, block_size)
        fail_if(
            sum(counts) != input_length,
            f"{prefix}: expected block counts do not sum to input",
        )
        if input_length % block_size != 0:
            partial_final_block_rows += 1

        # CSV row versus source row.
        fail_if(
            int(csv_row["session_id"]) != int(source["chat_id"]),
            f"{prefix}: session_id does not match source chat_id",
        )
        fail_if(
            timestamp != float(source["timestamp"]),
            f"{prefix}: arrived_at does not match source timestamp",
        )
        fail_if(
            input_length != int(source["input_length"]),
            f"{prefix}: num_prefill_tokens does not match source",
        )
        fail_if(
            output_length != int(source["output_length"]),
            f"{prefix}: num_decode_tokens does not match source",
        )
        fail_if(
            hash_ids != source["hash_ids"],
            f"{prefix}: block_hash_ids do not match source hash_ids",
        )

        # Sidecar row versus source and CSV rows.
        missing = required_sidecar_keys - set(sidecar)
        fail_if(bool(missing), f"{prefix}: missing sidecar keys {sorted(missing)}")
        fail_if(
            int(sidecar["request_id"]) != index,
            f"{prefix}: sidecar request_id mismatch",
        )
        for key in ("chat_id", "parent_chat_id", "turn"):
            fail_if(
                int(sidecar[key]) != int(source[key]),
                f"{prefix}: sidecar {key} mismatch",
            )
        fail_if(sidecar["type"] != source["type"], f"{prefix}: sidecar type mismatch")
        fail_if(
            float(sidecar["timestamp"]) != float(source["timestamp"]),
            f"{prefix}: sidecar timestamp mismatch",
        )
        fail_if(
            int(sidecar["input_length"]) != input_length,
            f"{prefix}: sidecar input_length mismatch",
        )
        fail_if(
            int(sidecar["output_length"]) != output_length,
            f"{prefix}: sidecar output_length mismatch",
        )
        fail_if(sidecar["hash_ids"] != hash_ids, f"{prefix}: sidecar hash_ids mismatch")
        fail_if(
            sidecar["block_token_counts"] != counts,
            f"{prefix}: sidecar block_token_counts mismatch",
        )

    # Manifest aggregates must agree with the values recomputed above.
    fail_if(
        manifest.get("max_total_tokens") != max_total_tokens,
        f"{fixture_dir}: manifest max_total_tokens="
        f"{manifest.get('max_total_tokens')} does not match {max_total_tokens}",
    )
    fail_if(
        manifest.get("partial_final_block_rows") != partial_final_block_rows,
        f"{fixture_dir}: manifest partial_final_block_rows="
        f"{manifest.get('partial_final_block_rows')} does not match "
        f"{partial_final_block_rows}",
    )
    fail_if(
        manifest.get("overflow_count") != 0,
        f"{fixture_dir}: manifest overflow_count is not zero",
    )
    fail_if(
        manifest.get("timestamp_monotonic") is not True,
        f"{fixture_dir}: manifest timestamp_monotonic is not true",
    )
    return (
        f"{fixture_dir.name}: rows={row_count} max_total_tokens={max_total_tokens} "
        f"partial_final_block_rows={partial_final_block_rows}"
    )
def main() -> int:
    """Validate each fixture directory given on the command line.

    Prints one summary line per valid fixture. On the first failure the
    error is written to stderr and ``1`` is returned; otherwise ``0``.
    """
    args = parse_args()
    for fixture_dir in args.fixture_dirs:
        try:
            report = validate_fixture(
                fixture_dir=fixture_dir,
                block_size=args.block_size,
                max_tokens=args.max_tokens,
            )
            print(report)
        except Exception as exc:
            print(f"validate_fixtures.py: error: {exc}", file=sys.stderr)
            return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,405 @@
#!/usr/bin/env python3
"""Replay a ReplayServe fixture on vLLM with synthetic prompt token blocks."""
from __future__ import annotations
import argparse
import asyncio
import csv
import hashlib
import json
import os
import random
import statistics
import sys
import time
from pathlib import Path
from typing import Any
def positive_int(value: str) -> int:
    """argparse ``type`` callback accepting only integers greater than zero."""
    candidate = int(value)
    if not candidate <= 0:
        return candidate
    raise argparse.ArgumentTypeError("must be positive")
def positive_float(value: str) -> float:
    """argparse ``type`` callback accepting only finite floats greater than zero.

    ``nan`` and ``inf`` are rejected explicitly: ``float()`` parses both, and
    ``nan <= 0`` is false, so a plain sign check would let them through.

    Raises:
        argparse.ArgumentTypeError: if the value is zero, negative or not finite.
    """
    import math

    parsed = float(value)
    if not math.isfinite(parsed) or parsed <= 0:
        raise argparse.ArgumentTypeError("must be positive")
    return parsed
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options of the synthetic replay."""
    parser = argparse.ArgumentParser(
        description=(
            "Run an online vLLM smoke/replay using synthetic prompt_token_ids "
            "derived from ReplayServe block hashes."
        )
    )
    add = parser.add_argument

    # Required inputs and outputs.
    add("--fixture-dir", required=True, type=Path)
    add("--model", required=True, type=str)
    add("--output-dir", required=True, type=Path)

    # Sizing options; all must be positive.
    add("--tensor-parallel-size", type=positive_int, default=1)
    add("--limit", type=positive_int)
    for flag, default in (
        ("--block-size", 16),
        ("--max-model-len", 32768),
        ("--max-num-seqs", 128),
        ("--max-num-batched-tokens", 32768),
    ):
        add(flag, type=positive_int, default=default)
    for flag, default in (("--gpu-memory-utilization", 0.9), ("--time-scale", 1.0)):
        add(flag, type=positive_float, default=default)
    add(
        "--max-output-tokens",
        type=positive_int,
        help="Cap each row's output_length for smoke tests.",
    )

    add("--seed", type=int, default=0)
    add("--dtype", default="auto")
    add("--enforce-eager", action="store_true")

    # Switches that default to on and gain a --no-<name> form.
    for flag in (
        "--trust-remote-code",
        "--enable-prefix-caching",
        "--enable-chunked-prefill",
    ):
        add(flag, action=argparse.BooleanOptionalAction, default=True)
    return parser.parse_args()
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file as a list of objects, skipping blank lines.

    Raises:
        ValueError: if a non-blank line is not valid JSON or does not decode
            to an object. The message names the file and the 1-based line
            number in both cases.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                # The decoder's own position refers to the stripped line, so
                # report the position within the file instead.
                raise ValueError(
                    f"{path}: line {line_number}: invalid JSON ({exc.msg})"
                ) from exc
            if not isinstance(row, dict):
                raise ValueError(f"{path}: line {line_number}: expected object")
            rows.append(row)
    return rows
def percentile(values: list[float], pct: float) -> float | None:
    """Return the element at rank ``int((n - 1) * pct)`` of the sorted values.

    The rank is clamped into the valid index range and no interpolation is
    done. An empty input yields ``None``.
    """
    if len(values) == 0:
        return None
    ranked = sorted(values)
    last = len(ranked) - 1
    position = int(last * pct)
    if position < 0:
        position = 0
    elif position > last:
        position = last
    return ranked[position]
def block_seed(hash_id: int, seed: int) -> int:
    """Derive a 64-bit integer from ``seed`` and ``hash_id`` via BLAKE2b.

    The hashed text is ``"<seed>:<hash_id>"``; the 8-byte digest is read as a
    big-endian integer.
    """
    material = f"{seed}:{hash_id}".encode("utf-8")
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(material)
    return int.from_bytes(hasher.digest(), "big")
def block_tokens(
    hash_id: int,
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
) -> list[int]:
    """Generate the ``block_size`` synthetic token ids for one block hash.

    Ids are drawn from a ``random.Random`` seeded by ``block_seed(hash_id,
    seed)``, so equal ``(hash_id, seed)`` pairs produce equal blocks. Draws
    come from ``[1000, max(1001, vocab_size - 1000))`` and ids found in
    ``special_ids`` are discarded and redrawn.

    Raises:
        ValueError: if ``vocab_size`` is too small for the lowest candidate
            id to be a valid token, or if every candidate id is special
            (rejection sampling would otherwise never terminate).
    """
    low = 1000
    high = max(low + 1, vocab_size - 1000)
    if vocab_size <= low:
        raise ValueError(
            f"vocab_size {vocab_size} is too small: synthetic token ids start at {low}"
        )
    # Scan the candidate range only when it is small enough that the special
    # ids could cover all of it; this keeps the check cheap for real vocabularies.
    if high - low <= len(special_ids) and all(
        candidate in special_ids for candidate in range(low, high)
    ):
        raise ValueError(
            f"no usable token ids in [{low}, {high}): all candidates are special ids"
        )

    rng = random.Random(block_seed(hash_id, seed))
    tokens: list[int] = []
    while len(tokens) < block_size:
        token_id = rng.randrange(low, high)
        if token_id not in special_ids:
            tokens.append(token_id)
    return tokens
def make_prompt_token_ids(
    row: dict[str, Any],
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
) -> list[int]:
    """Build the synthetic prompt for one sidecar row.

    Each block hash is expanded with ``block_tokens`` and truncated to that
    block's entry in ``block_token_counts``; the pieces are concatenated in
    order.

    Raises:
        ValueError: if ``hash_ids`` and ``block_token_counts`` differ in
            length, or the assembled prompt length differs from
            ``input_length``.
    """
    hash_ids = [int(value) for value in row["hash_ids"]]
    counts = [int(value) for value in row["block_token_counts"]]
    if len(hash_ids) != len(counts):
        raise ValueError(f"request {row.get('request_id')}: hash/count length mismatch")

    prompt: list[int] = []
    for hash_id, count in zip(hash_ids, counts):
        block = block_tokens(
            hash_id,
            seed=seed,
            block_size=block_size,
            vocab_size=vocab_size,
            special_ids=special_ids,
        )
        prompt += block[:count]

    expected = int(row["input_length"])
    if len(prompt) == expected:
        return prompt
    raise ValueError(
        f"request {row.get('request_id')}: synthetic prompt length "
        f"{len(prompt)} != input_length {expected}"
    )
def estimate_prefix_reuse(rows: list[dict[str, Any]]) -> dict[int, dict[str, int | float]]:
    """Estimate per-request prefix reuse from block hashes alone.

    Rows are processed in the given order against a trie of every hash
    sequence seen so far. A request's hit is the longest leading run of its
    hashes already in the trie; the full sequence is then inserted. Nothing
    is ever evicted from the trie.

    Returns:
        A mapping from request id to block and token query/hit counts and
        the two hit ratios (``0.0`` when the denominator is zero).
    """
    root: dict[int, dict[Any, Any]] = {}
    report: dict[int, dict[str, int | float]] = {}
    for row in rows:
        request_id = int(row["request_id"])
        hash_ids = [int(value) for value in row["hash_ids"]]
        counts = [int(value) for value in row["block_token_counts"]]

        # One walk does both jobs: count matches until the first miss, and
        # create the missing nodes from that point on.
        cursor = root
        hit_blocks = 0
        still_matching = True
        for hash_id in hash_ids:
            if still_matching and hash_id in cursor:
                hit_blocks += 1
            else:
                still_matching = False
            cursor = cursor.setdefault(hash_id, {})

        query_tokens = int(row["input_length"])
        hit_tokens = sum(counts[:hit_blocks])
        query_blocks = len(hash_ids)
        report[request_id] = {
            "query_blocks": query_blocks,
            "hit_blocks": hit_blocks,
            "query_tokens": query_tokens,
            "hit_tokens": hit_tokens,
            "block_hit_ratio": hit_blocks / query_blocks if hash_ids else 0.0,
            "token_hit_ratio": hit_tokens / query_tokens if query_tokens else 0.0,
        }
    return report
async def run_replay(args: argparse.Namespace) -> dict[str, Any]:
    """Replay the fixture's sidecar rows against a vLLM engine and record metrics.

    Synthetic prompts are built for every selected row, each request is
    submitted at its scaled trace arrival offset, and per-request latencies
    are measured with ``time.perf_counter``. ``request_metrics.csv`` and
    ``summary.json`` are written into ``args.output_dir``.

    Returns:
        The summary dict that is also written to ``summary.json``.

    Raises:
        RuntimeError: if the vLLM/transformers imports fail.
        ValueError: if no rows remain after applying ``args.limit``.
    """
    # Imported lazily so a missing runtime surfaces as one clear error.
    try:
        from transformers import AutoTokenizer
        from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
        from vllm.inputs import TokensPrompt
    except Exception as exc:  # pragma: no cover - exercised on GPU host.
        raise RuntimeError(f"failed to import vLLM runtime dependencies: {exc}") from exc
    sidecar_path = args.fixture_dir / "sidecar.jsonl"
    rows = load_jsonl(sidecar_path)
    if args.limit is not None:
        rows = rows[: args.limit]
    if not rows:
        raise ValueError("no rows selected")
    # The tokenizer is used here only for its special ids and vocabulary size.
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
    special_ids = {int(value) for value in tokenizer.all_special_ids}
    vocab_size = len(tokenizer)
    # Build every prompt up front, before the replay clock starts.
    synthetic_prompts = {
        int(row["request_id"]): make_prompt_token_ids(
            row,
            seed=args.seed,
            block_size=args.block_size,
            vocab_size=vocab_size,
            special_ids=special_ids,
        )
        for row in rows
    }
    # Trace-derived estimate; it is not read back from the engine.
    prefix_reuse = estimate_prefix_reuse(rows)
    engine_args = AsyncEngineArgs(
        model=args.model,
        tokenizer=args.model,
        trust_remote_code=args.trust_remote_code,
        tensor_parallel_size=args.tensor_parallel_size,
        dtype=args.dtype,
        max_model_len=args.max_model_len,
        block_size=args.block_size,
        enable_prefix_caching=args.enable_prefix_caching,
        enable_chunked_prefill=args.enable_chunked_prefill,
        max_num_seqs=args.max_num_seqs,
        max_num_batched_tokens=args.max_num_batched_tokens,
        gpu_memory_utilization=args.gpu_memory_utilization,
        enforce_eager=args.enforce_eager,
        disable_log_stats=True,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    output_rows: list[dict[str, Any]] = []
    # Arrival offsets are taken relative to the first selected row.
    first_timestamp = float(rows[0]["timestamp"])
    replay_start = time.perf_counter()

    async def run_one(row: dict[str, Any]) -> None:
        """Submit one request at its scheduled offset and append its metrics row."""
        request_id = int(row["request_id"])
        # The scaled offset is used as a number of seconds by the sleep below.
        scheduled_arrival_s = (float(row["timestamp"]) - first_timestamp) * args.time_scale
        await asyncio.sleep(max(0.0, replay_start + scheduled_arrival_s - time.perf_counter()))
        prompt_token_ids = synthetic_prompts[request_id]
        requested_output_tokens = int(row["output_length"])
        effective_output_tokens = requested_output_tokens
        if args.max_output_tokens is not None:
            effective_output_tokens = min(effective_output_tokens, args.max_output_tokens)
        # min_tokens == max_tokens together with ignore_eos requests exactly
        # the effective output length.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=effective_output_tokens,
            min_tokens=effective_output_tokens,
            ignore_eos=True,
            detokenize=False,
            seed=args.seed + request_id,
        )
        # Latencies are measured from here, i.e. after the arrival sleep.
        arrival_wall = time.perf_counter()
        first_token_wall: float | None = None
        last_output_tokens = 0
        final_output: Any = None
        generator = engine.generate(
            TokensPrompt(prompt_token_ids=prompt_token_ids),
            sampling_params,
            request_id=str(request_id),
        )
        # NOTE(review): this loop presumably relies on each yielded output
        # carrying the cumulative token_ids so far - confirm against the
        # installed vLLM version.
        async for output in generator:
            final_output = output
            if output.outputs:
                token_count = len(output.outputs[0].token_ids)
                if token_count > 0 and first_token_wall is None:
                    first_token_wall = time.perf_counter()
                last_output_tokens = token_count
        done_wall = time.perf_counter()
        finish_reason = ""
        if final_output is not None and final_output.outputs:
            finish_reason = str(final_output.outputs[0].finish_reason)
        ttft_s = None if first_token_wall is None else first_token_wall - arrival_wall
        e2e_s = done_wall - arrival_wall
        # TPOT is averaged over the tokens after the first; it stays None
        # when fewer than two tokens were produced.
        tpot_s = None
        if first_token_wall is not None and last_output_tokens > 1:
            tpot_s = (done_wall - first_token_wall) / (last_output_tokens - 1)
        reuse = prefix_reuse[request_id]
        output_rows.append(
            {
                "request_id": request_id,
                "scheduled_arrival_s": scheduled_arrival_s,
                "arrival_delay_s": arrival_wall - replay_start - scheduled_arrival_s,
                "input_length": int(row["input_length"]),
                "requested_output_length": requested_output_tokens,
                "effective_output_length": effective_output_tokens,
                "generated_output_tokens": last_output_tokens,
                "ttft_s": ttft_s,
                "tpot_s": tpot_s,
                "e2e_s": e2e_s,
                "finish_reason": finish_reason,
                "prefix_query_blocks_est": reuse["query_blocks"],
                "prefix_hit_blocks_est": reuse["hit_blocks"],
                "prefix_query_tokens_est": reuse["query_tokens"],
                "prefix_hit_tokens_est": reuse["hit_tokens"],
                "prefix_block_hit_ratio_est": reuse["block_hit_ratio"],
                "prefix_token_hit_ratio_est": reuse["token_hit_ratio"],
            }
        )

    # All requests are started together; each one sleeps until its own
    # arrival time. The engine is shut down even when a request fails.
    try:
        await asyncio.gather(*(run_one(row) for row in rows))
    finally:
        engine.shutdown()
    replay_end = time.perf_counter()
    # Rows were appended in completion order; restore request-id order.
    output_rows.sort(key=lambda item: int(item["request_id"]))
    args.output_dir.mkdir(parents=True, exist_ok=True)
    request_metrics_path = args.output_dir / "request_metrics.csv"
    fieldnames = list(output_rows[0].keys())
    with request_metrics_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(output_rows)
    # Requests without a measured TTFT/TPOT are left out of those aggregates.
    ttft_values = [float(row["ttft_s"]) for row in output_rows if row["ttft_s"] is not None]
    tpot_values = [float(row["tpot_s"]) for row in output_rows if row["tpot_s"] is not None]
    e2e_values = [float(row["e2e_s"]) for row in output_rows]
    generated_tokens = sum(int(row["generated_output_tokens"]) for row in output_rows)
    prompt_tokens = sum(int(row["input_length"]) for row in output_rows)
    # Wall time spans from just before the first arrival sleep to just after
    # engine shutdown.
    wall_s = replay_end - replay_start
    summary = {
        "status": "pass",
        "fixture_dir": str(args.fixture_dir),
        "model": args.model,
        "tensor_parallel_size": args.tensor_parallel_size,
        "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES", ""),
        "rows": len(output_rows),
        "block_size": args.block_size,
        "max_model_len": args.max_model_len,
        "max_num_seqs": args.max_num_seqs,
        "max_num_batched_tokens": args.max_num_batched_tokens,
        "gpu_memory_utilization": args.gpu_memory_utilization,
        "enable_prefix_caching": args.enable_prefix_caching,
        "enable_chunked_prefill": args.enable_chunked_prefill,
        "time_scale": args.time_scale,
        "max_output_tokens": args.max_output_tokens,
        "synthetic_replay": {
            "semantics": (
                "Each trace block hash is deterministically mapped to a stable "
                "block of prompt token ids; equal hashes reuse equal token blocks. "
                "This preserves arrival, length, and block-prefix sharing patterns, "
                "but it is not original text/token recovery."
            ),
            "seed": args.seed,
            "vocab_size": vocab_size,
            "special_token_ids_excluded": sorted(special_ids),
        },
        "wall_time_s": wall_s,
        "requests_per_second": len(output_rows) / wall_s if wall_s else 0.0,
        "prompt_tokens_per_second": prompt_tokens / wall_s if wall_s else 0.0,
        "generated_tokens_per_second": generated_tokens / wall_s if wall_s else 0.0,
        "total_prompt_tokens": prompt_tokens,
        "total_generated_tokens": generated_tokens,
        "ttft_s": {
            "mean": statistics.fmean(ttft_values) if ttft_values else None,
            "p50": percentile(ttft_values, 0.50),
            "p95": percentile(ttft_values, 0.95),
        },
        "tpot_s": {
            "mean": statistics.fmean(tpot_values) if tpot_values else None,
            "p50": percentile(tpot_values, 0.50),
            "p95": percentile(tpot_values, 0.95),
        },
        "e2e_s": {
            "mean": statistics.fmean(e2e_values) if e2e_values else None,
            "p50": percentile(e2e_values, 0.50),
            "p95": percentile(e2e_values, 0.95),
        },
        "estimated_prefix_reuse": {
            "query_blocks": sum(int(row["prefix_query_blocks_est"]) for row in output_rows),
            "hit_blocks": sum(int(row["prefix_hit_blocks_est"]) for row in output_rows),
            "query_tokens": sum(int(row["prefix_query_tokens_est"]) for row in output_rows),
            "hit_tokens": sum(int(row["prefix_hit_tokens_est"]) for row in output_rows),
        },
        "request_metrics_csv": str(request_metrics_path),
    }
    # Overall ratios come from the summed counts, not from averaging the
    # per-request ratios.
    reuse = summary["estimated_prefix_reuse"]
    summary["estimated_prefix_reuse"]["block_hit_ratio"] = (
        reuse["hit_blocks"] / reuse["query_blocks"] if reuse["query_blocks"] else 0.0
    )
    summary["estimated_prefix_reuse"]["token_hit_ratio"] = (
        reuse["hit_tokens"] / reuse["query_tokens"] if reuse["query_tokens"] else 0.0
    )
    with (args.output_dir / "summary.json").open("w", encoding="utf-8") as handle:
        json.dump(summary, handle, indent=2, sort_keys=True)
        handle.write("\n")
    return summary
def main() -> int:
    """Run the replay and report the outcome.

    On success the summary is printed as JSON and ``0`` is returned. On any
    exception a minimal failure ``summary.json`` is written to the output
    directory, the error is printed to stderr and ``1`` is returned.
    """
    args = parse_args()
    try:
        summary = asyncio.run(run_replay(args))
    except Exception as exc:
        failure = {"status": "fail", "error": str(exc)}
        args.output_dir.mkdir(parents=True, exist_ok=True)
        summary_path = args.output_dir / "summary.json"
        with summary_path.open("w", encoding="utf-8") as handle:
            handle.write(json.dumps(failure, indent=2))
            handle.write("\n")
        print(f"vllm_synthetic_replay.py: error: {exc}", file=sys.stderr)
        return 1
    rendered = json.dumps(summary, indent=2, sort_keys=True)
    print(rendered)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())