Add ReplayServe Frontier vLLM alignment report
This commit is contained in:
255
tools/aggregate_runs.py
Normal file
255
tools/aggregate_runs.py
Normal file
@@ -0,0 +1,255 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Aggregate ReplayServe Frontier run directories into CSV and Markdown."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
REPLAYSERVE_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
FIELDNAMES = [
|
||||
"suite_id",
|
||||
"sim",
|
||||
"fixture",
|
||||
"config_id",
|
||||
"status",
|
||||
"exit_code",
|
||||
"runtime_seconds",
|
||||
"frontier_mode",
|
||||
"frontier_head",
|
||||
"frontier_dirty",
|
||||
"attn_tp",
|
||||
"attn_dp",
|
||||
"moe_tp",
|
||||
"moe_ep",
|
||||
"batch_size_cap",
|
||||
"max_tokens_in_batch",
|
||||
"block_size",
|
||||
"enable_prefix_caching",
|
||||
"enable_chunked_prefill",
|
||||
"long_prefill_token_threshold",
|
||||
"frontier_block_hit_ratio",
|
||||
"replayserve_token_hit_ratio",
|
||||
"cache_metrics_available",
|
||||
"cache_metrics_unavailable_reason",
|
||||
"cache_metric_rows_complete",
|
||||
"cache_metric_rows_total",
|
||||
"cache_metric_rows_missing",
|
||||
"completion_is_complete",
|
||||
"missing_latency_request_ids",
|
||||
"preemption_events",
|
||||
"preempted_requests",
|
||||
"ttft_mean_ms",
|
||||
"ttft_p50_ms",
|
||||
"ttft_p95_ms",
|
||||
"tpot_mean_ms",
|
||||
"tpot_p50_ms",
|
||||
"tpot_p95_ms",
|
||||
"e2e_mean_ms",
|
||||
"e2e_p50_ms",
|
||||
"e2e_p95_ms",
|
||||
"requests_per_second",
|
||||
"tokens_per_second",
|
||||
"decode_tokens_per_second",
|
||||
"completed_requests",
|
||||
"total_requests",
|
||||
"run_dir",
|
||||
]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the run-aggregation tool.

    Returns:
        Namespace with the positional ``suite_dir`` and the optional
        ``output_csv`` / ``output_md`` paths (``None`` when not supplied).
    """
    cli = argparse.ArgumentParser(description="Aggregate ReplayServe run outputs.")
    cli.add_argument("suite_dir", type=Path, help="Run suite directory.")
    # The two output options have the same shape; only flag and help differ.
    output_options = (
        ("--output-csv", "Output CSV path. Defaults to <suite_dir>/summary.csv."),
        ("--output-md", "Output Markdown path. Defaults to <suite_dir>/summary.md."),
    )
    for flag, help_text in output_options:
        cli.add_argument(flag, type=Path, help=help_text)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict[str, Any]:
    """Load a JSON object from *path*, tolerating a missing file.

    Returns:
        The parsed object, or an empty dict when the file does not exist or
        its top-level JSON value is not an object.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if path.exists():
        parsed = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(parsed, dict):
            return parsed
    return {}
|
||||
|
||||
|
||||
def read_int(path: Path) -> int | None:
    """Read a file holding a single integer.

    Returns:
        The integer parsed from the whitespace-stripped file content, or
        ``None`` when the file is missing or the content is not an integer.
    """
    try:
        content = path.read_text(encoding="utf-8")
        number = int(content.strip())
    except (FileNotFoundError, ValueError):
        return None
    return number
|
||||
|
||||
|
||||
def nested(data: dict[str, Any], *keys: str) -> Any:
    """Look up a chain of keys through nested dicts.

    Returns:
        The value reached after following every key, or ``None`` as soon as a
        non-dict is encountered while keys remain. A missing key also yields
        ``None`` because ``dict.get`` is used at each step.
    """
    if not keys:
        return data
    if not isinstance(data, dict):
        return None
    first, *remaining = keys
    return nested(data.get(first), *remaining)
|
||||
|
||||
|
||||
def fmt(value: Any) -> str:
    """Render one summary value as text for the CSV / Markdown outputs.

    ``None`` becomes the empty string, booleans become ``true`` / ``false``,
    floats are shortened to 8 significant digits, and anything else goes
    through ``str``.
    """
    if value is None:
        return ""
    # bool must be handled before the generic path: str(True) would be "True".
    if isinstance(value, bool):
        return ("false", "true")[value]
    return f"{value:.8g}" if isinstance(value, float) else str(value)
|
||||
|
||||
|
||||
def summarize_run(run_dir: Path) -> dict[str, Any]:
    """Flatten one run directory into a single summary row.

    Reads ``run_manifest.json``, ``run_status.json`` and
    ``postprocess_summary.json`` from *run_dir*, plus the system-metrics JSON
    that the post-process summary points to. Every input is optional: missing
    files or malformed sections simply leave the affected fields as ``None``.

    Args:
        run_dir: Directory containing the run's JSON artifacts.

    Returns:
        A dict with one entry per summary column.
    """
    manifest = load_json(run_dir / "run_manifest.json")
    status_json = load_json(run_dir / "run_status.json")
    post = load_json(run_dir / "postprocess_summary.json")
    system_metrics = _load_system_metrics(run_dir, post)

    knobs = _dict_section(manifest, "knobs")
    frontier = _dict_section(manifest, "frontier")
    prefix = _dict_section(post, "prefix_cache_postprocess")
    frontier_block = _dict_section(prefix, "frontier_block_level")
    token_weighted = _dict_section(prefix, "replayserve_token_weighted")
    preemption = _dict_section(post, "preemption_statistics")
    completion = _dict_section(post, "completion")
    simulation = _dict_section(system_metrics, "simulation_metadata")
    throughput = _dict_section(system_metrics, "throughput_metrics")

    missing_rows = prefix.get("rows_with_missing_cache_metrics") or []
    if not isinstance(missing_rows, list):
        missing_rows = []
    missing_latency_ids = completion.get("missing_latency_request_ids") or []
    if not isinstance(missing_latency_ids, list):
        missing_latency_ids = []

    # run_status.json wins; the plain-text files are only a fallback.
    exit_code = status_json.get("exit_code")
    if exit_code is None:
        exit_code = read_int(run_dir / "exit_code.txt")
    runtime = status_json.get("runtime_seconds")
    if runtime is None:
        # NOTE(review): read_int only accepts integers, so a fractional
        # runtime_seconds.txt yields None here -- confirm that is intended.
        runtime = read_int(run_dir / "runtime_seconds.txt")

    status = status_json.get("status") or ("pass" if exit_code == 0 else "fail")
    # An explicitly incomplete post-process result overrides any other status.
    if completion and not completion.get("is_complete", True):
        status = "incomplete"

    latency_sections = (
        ("ttft", "ttft_statistics"),
        ("tpot", "tpot_statistics"),
        ("e2e", "request_e2e_time_statistics"),
    )
    latency = {
        f"{short}_{stat}_ms": nested(system_metrics, section, stat)
        for short, section in latency_sections
        for stat in ("mean", "p50", "p95")
    }

    return {
        "suite_id": manifest.get("suite_id"),
        "sim": manifest.get("sim"),
        "fixture": manifest.get("fixture"),
        "config_id": manifest.get("config_id"),
        "status": status,
        "exit_code": exit_code,
        "runtime_seconds": runtime,
        "frontier_mode": frontier.get("mode"),
        "frontier_head": frontier.get("head"),
        "frontier_dirty": bool((frontier.get("status_short") or "").strip()),
        "attn_tp": knobs.get("attn_tensor_parallel_size"),
        "attn_dp": knobs.get("attn_data_parallel_size"),
        "moe_tp": knobs.get("moe_tensor_parallel_size"),
        "moe_ep": knobs.get("moe_expert_parallel_size"),
        "batch_size_cap": knobs.get("batch_size_cap"),
        "max_tokens_in_batch": knobs.get("max_tokens_in_batch"),
        "block_size": knobs.get("block_size"),
        "enable_prefix_caching": knobs.get("enable_prefix_caching"),
        "enable_chunked_prefill": knobs.get("enable_chunked_prefill"),
        "long_prefill_token_threshold": knobs.get("long_prefill_token_threshold"),
        "frontier_block_hit_ratio": frontier_block.get("hit_ratio"),
        "replayserve_token_hit_ratio": token_weighted.get("hit_ratio"),
        "cache_metrics_available": prefix.get("available"),
        "cache_metrics_unavailable_reason": prefix.get("reason"),
        "cache_metric_rows_complete": prefix.get("completed_request_rows"),
        "cache_metric_rows_total": prefix.get("total_request_metric_rows"),
        "cache_metric_rows_missing": len(missing_rows),
        "completion_is_complete": completion.get("is_complete"),
        "missing_latency_request_ids": ",".join(str(value) for value in missing_latency_ids),
        "preemption_events": preemption.get("total_preemption_events"),
        "preempted_requests": preemption.get("total_preempted_requests"),
        **latency,
        "requests_per_second": throughput.get("requests_per_second"),
        "tokens_per_second": throughput.get("tokens_per_second"),
        "decode_tokens_per_second": throughput.get("decode_tokens_per_second"),
        "completed_requests": simulation.get("completed_requests"),
        "total_requests": simulation.get("total_requests"),
        "run_dir": str(run_dir),
    }


def _dict_section(container: dict[str, Any], key: str) -> dict[str, Any]:
    """Return ``container[key]`` when it is a dict, otherwise an empty dict."""
    section = container.get(key)
    return section if isinstance(section, dict) else {}


def _load_system_metrics(run_dir: Path, post: dict[str, Any]) -> dict[str, Any]:
    """Load the system-metrics JSON referenced by the post-process summary.

    The recorded path is tried as given first (absolute, or relative to the
    current working directory). A relative path is then retried against the
    repository root and against *run_dir*, so the result does not depend on
    where the tool is launched from. Returns an empty dict when no candidate
    exists or the recorded value is not a usable path string.
    """
    raw_path = post.get("system_metrics")
    if not raw_path or not isinstance(raw_path, str):
        return {}
    metrics_path = Path(raw_path)
    candidates = [metrics_path]
    if not metrics_path.is_absolute():
        candidates += [REPLAYSERVE_ROOT / metrics_path, run_dir / metrics_path]
    for candidate in candidates:
        if candidate.exists():
            return load_json(candidate)
    return {}
|
||||
|
||||
|
||||
def write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write the summary rows to *path* as CSV.

    The header and column order come from ``FIELDNAMES``; every cell is
    rendered with ``fmt``, and keys absent from a row produce empty cells.
    """
    formatted_rows = (
        {column: fmt(record.get(column)) for column in FIELDNAMES}
        for record in rows
    )
    # newline="" lets the csv module control line endings itself.
    with path.open("w", encoding="utf-8", newline="") as sink:
        table = csv.DictWriter(sink, fieldnames=FIELDNAMES)
        table.writeheader()
        table.writerows(formatted_rows)
|
||||
|
||||
|
||||
def write_markdown(path: Path, rows: list[dict[str, Any]], suite_dir: Path) -> None:
    """Write a Markdown overview of the summary rows to *path*.

    The document contains a title, the suite directory, the run count, a
    table with a fixed subset of the summary columns, and a closing caveat
    about how to read the latency/throughput numbers.

    Args:
        path: Destination Markdown file (overwritten).
        rows: Summary rows; keys missing from a row render as empty cells.
        suite_dir: Suite directory, used for the title and the header line.
    """
    columns = [
        "config_id",
        "fixture",
        "status",
        "runtime_seconds",
        "enable_prefix_caching",
        "enable_chunked_prefill",
        "frontier_block_hit_ratio",
        "replayserve_token_hit_ratio",
        "cache_metric_rows_missing",
        "completion_is_complete",
        "preemption_events",
        "ttft_mean_ms",
        "tpot_mean_ms",
        "e2e_mean_ms",
        "tokens_per_second",
    ]

    def cell(value: Any) -> str:
        # A raw "|" or newline inside a cell would split or end the table row.
        return fmt(value).replace("|", "\\|").replace("\n", " ")

    lines = [
        f"# Sweep Summary: {suite_dir.name}\n\n",
        f"- Suite dir: `{suite_dir}`\n",
        f"- Runs: `{len(rows)}`\n\n",
        "| " + " | ".join(columns) + " |\n",
        "|" + "|".join(["---"] * len(columns)) + "|\n",
    ]
    lines.extend(
        "| " + " | ".join(cell(row.get(col)) for col in columns) + " |\n"
        for row in rows
    )
    lines.append("\n")
    lines.append(
        "Latency and throughput values are Frontier smoke outputs from the "
        "configured predictor/profile mode. RS3 tiny smoke uses dummy execution "
        "time, so these are harness plumbing checks, not performance claims.\n"
    )
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(lines)
|
||||
|
||||
|
||||
def main() -> int:
    """Aggregate every run below the suite directory into CSV and Markdown.

    A run is any directory containing ``run_manifest.json``; runs are
    processed in sorted path order. Output paths default to ``summary.csv``
    and ``summary.md`` inside the suite directory.

    Returns:
        Process exit code (always 0 on success).

    Raises:
        SystemExit: If the suite directory does not exist.
    """
    args = parse_args()
    suite_dir = args.suite_dir.resolve()
    # Fail with a clear message instead of producing an empty report (or a
    # traceback from the writers) for a mistyped suite directory.
    if not suite_dir.is_dir():
        raise SystemExit(f"suite directory not found: {suite_dir}")

    run_dirs = sorted(path.parent for path in suite_dir.glob("**/run_manifest.json"))
    rows = [summarize_run(path) for path in run_dirs]

    output_csv = args.output_csv or (suite_dir / "summary.csv")
    output_md = args.output_md or (suite_dir / "summary.md")
    # Explicit output paths may point into directories that do not exist yet.
    for output in (output_csv, output_md):
        output.parent.mkdir(parents=True, exist_ok=True)

    write_csv(output_csv, rows)
    write_markdown(output_md, rows, suite_dir)
    print(f"wrote {output_csv}")
    print(f"wrote {output_md}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
188
tools/analyze_trace_window.py
Executable file
188
tools/analyze_trace_window.py
Executable file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Analyze Qwen/ReplayServe sidecar rows around a request id."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the trace-window analysis tool.

    Returns:
        Namespace with ``fixture_dir``, ``request_id`` and ``output_dir``
        (all required) plus ``window`` (default 10) and ``top_k`` (default 15).
    """
    cli = argparse.ArgumentParser(description="Analyze sidecar prefix overlap.")
    # Declared in the order the options should appear in --help.
    options = (
        ("--fixture-dir", {"required": True, "type": Path}),
        ("--request-id", {"required": True, "type": int}),
        ("--window", {"type": int, "default": 10}),
        ("--top-k", {"type": int, "default": 15}),
        ("--output-dir", {"required": True, "type": Path}),
    )
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file whose records are all JSON objects.

    Blank (whitespace-only) lines are skipped.

    Args:
        path: File to read as UTF-8.

    Returns:
        The parsed objects in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON; the message names
            the file and the 1-based line number.
        ValueError: If a line parses to something other than an object.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as err:
                # Re-raise the same exception type, but say which file and
                # line is broken -- the bare decoder message does not.
                raise json.JSONDecodeError(
                    f"{path}: line {line_number}: {err.msg}", err.doc, err.pos
                ) from err
            if not isinstance(row, dict):
                raise ValueError(f"{path}: line {line_number}: expected object")
            rows.append(row)
    return rows
|
||||
|
||||
|
||||
def common_prefix_len(left: list[int], right: list[int]) -> int:
    """Return how many leading positions of *left* and *right* hold equal items."""
    mismatch_positions = (
        index
        for index, (left_item, right_item) in enumerate(zip(left, right))
        if left_item != right_item
    )
    # With no mismatch in the overlapping span, the shorter list is a prefix
    # of the longer one, so its whole length is shared.
    return next(mismatch_positions, min(len(left), len(right)))
|
||||
|
||||
|
||||
def summarize_row(row: dict[str, Any], block_size: int = 16) -> dict[str, Any]:
    """Condense one sidecar row into the fields used by the reports.

    Args:
        row: Sidecar record; must provide every key read below.
        block_size: Block length in tokens used to decide whether the input
            ends in a partially filled block.

    Returns:
        A dict of normalised scalars plus the first 12 hash ids, the last
        hash id (``None`` without hashes) and the final block's token count
        (0 without block counts).
    """
    prompt_tokens = int(row["input_length"])
    generated_tokens = int(row["output_length"])
    hashes = list(map(int, row["hash_ids"]))
    tokens_per_block = list(map(int, row["block_token_counts"]))
    return {
        "request_id": int(row["request_id"]),
        "chat_id": int(row["chat_id"]),
        "parent_chat_id": int(row["parent_chat_id"]),
        "turn": int(row["turn"]),
        "type": row["type"],
        "timestamp": float(row["timestamp"]),
        "input_length": prompt_tokens,
        "output_length": generated_tokens,
        "total_tokens": prompt_tokens + generated_tokens,
        "hash_count": len(hashes),
        "first_hash_ids": hashes[:12],
        "last_hash_id": next(reversed(hashes), None),
        # A non-zero remainder means the last block is not completely filled.
        "partial_final_block": bool(prompt_tokens % block_size),
        "final_block_token_count": next(reversed(tokens_per_block), 0),
    }
|
||||
|
||||
|
||||
def main() -> int:
    """Analyze one request of a fixture's sidecar and write JSON + Markdown.

    Loads ``sidecar.jsonl`` from the fixture directory, finds the target
    request, ranks earlier requests by shared leading hash blocks, collects
    the neighbouring rows and the parent-chat candidates, and writes
    ``request_<id>_analysis.json`` / ``.md`` into the output directory.

    Returns:
        Process exit code (always 0 on success).

    Raises:
        SystemExit: If the requested id is not present in the sidecar.
    """
    args = parse_args()
    sidecar_path = args.fixture_dir / "sidecar.jsonl"
    rows = load_jsonl(sidecar_path)
    by_id = {int(row["request_id"]): row for row in rows}
    if args.request_id not in by_id:
        raise SystemExit(f"request_id {args.request_id} not found in {sidecar_path}")

    target = by_id[args.request_id]
    target_summary = summarize_row(target)
    overlaps = _prior_overlaps(rows, target, args.request_id)

    # Centre the window on the target's position in the file. Using the
    # request id itself as a list index is only right when ids happen to
    # equal 0-based file positions; in that case both give the same slice.
    target_index = next(index for index, row in enumerate(rows) if row is target)
    start = max(0, target_index - args.window)
    end = min(len(rows), target_index + args.window + 1)
    local_window = [summarize_row(row) for row in rows[start:end]]

    parent_chat_id = int(target["parent_chat_id"])
    parent_rows = [
        summarize_row(row)
        for row in rows
        if int(row["chat_id"]) == parent_chat_id or int(row["request_id"]) == parent_chat_id
    ]

    result = {
        "fixture_dir": str(args.fixture_dir),
        "sidecar": str(sidecar_path),
        "request_id": args.request_id,
        "target": target_summary,
        "local_window": local_window,
        "top_prior_prefix_overlaps": overlaps[: args.top_k],
        "prior_overlap_count": len(overlaps),
        "parent_candidates": parent_rows,
        "interpretation": {
            "prefix_overlap_semantics": (
                "Frontier prefix cache matches consecutive block_hash_ids from "
                "the start of the prompt. common_prefix_tokens_with_target uses "
                "the target sidecar block_token_counts, preserving partial final "
                "block token counts."
            ),
            # Reuse the summary's flag so the block size is defined in one place.
            "partial_final_block_related": target_summary["partial_final_block"],
        },
    }

    args.output_dir.mkdir(parents=True, exist_ok=True)
    json_path = args.output_dir / f"request_{args.request_id}_analysis.json"
    md_path = args.output_dir / f"request_{args.request_id}_analysis.md"
    with json_path.open("w", encoding="utf-8") as handle:
        json.dump(result, handle, indent=2, sort_keys=True)
        handle.write("\n")
    with md_path.open("w", encoding="utf-8") as handle:
        handle.writelines(
            _render_markdown(
                args.request_id,
                args.fixture_dir,
                target_summary,
                overlaps,
                args.top_k,
                local_window,
            )
        )

    print(json_path)
    print(md_path)
    return 0


def _prior_overlaps(
    rows: list[dict[str, Any]], target: dict[str, Any], request_id: int
) -> list[dict[str, Any]]:
    """Rank rows with a smaller request id by shared leading hash blocks.

    Rows sharing no leading block with the target are dropped. The result is
    sorted by shared block count, then request id, both descending.
    """
    target_hashes = [int(value) for value in target["hash_ids"]]
    target_counts = [int(value) for value in target["block_token_counts"]]
    target_input_length = int(target["input_length"])

    overlaps: list[dict[str, Any]] = []
    for row in rows:
        if int(row["request_id"]) >= request_id:
            continue
        lcp_blocks = common_prefix_len(target_hashes, [int(value) for value in row["hash_ids"]])
        if lcp_blocks <= 0:
            continue
        # Token counts come from the target's blocks, so a partial final
        # block contributes its real size.
        lcp_tokens = sum(target_counts[:lcp_blocks])
        overlaps.append(
            {
                **summarize_row(row),
                "common_prefix_blocks_with_target": lcp_blocks,
                "common_prefix_tokens_with_target": lcp_tokens,
                "target_prefix_fraction_blocks": (
                    lcp_blocks / len(target_hashes) if target_hashes else 0.0
                ),
                "target_prefix_fraction_tokens": (
                    lcp_tokens / target_input_length if target_input_length > 0 else 0.0
                ),
            }
        )
    overlaps.sort(
        key=lambda item: (item["common_prefix_blocks_with_target"], item["request_id"]),
        reverse=True,
    )
    return overlaps


def _render_markdown(
    request_id: int,
    fixture_dir: Path,
    target_summary: dict[str, Any],
    overlaps: list[dict[str, Any]],
    top_k: int,
    local_window: list[dict[str, Any]],
) -> list[str]:
    """Build the lines of the Markdown report for one analyzed request."""
    lines = [
        f"# Request {request_id} Trace Analysis\n\n",
        f"- Fixture: `{fixture_dir}`\n",
        f"- Timestamp: `{target_summary['timestamp']}`\n",
        f"- Chat: `{target_summary['chat_id']}` parent `{target_summary['parent_chat_id']}` turn `{target_summary['turn']}`\n",
        f"- Input/output/total tokens: `{target_summary['input_length']}` / `{target_summary['output_length']}` / `{target_summary['total_tokens']}`\n",
        f"- Hash blocks: `{target_summary['hash_count']}`\n",
        f"- Partial final block: `{target_summary['partial_final_block']}` final count `{target_summary['final_block_token_count']}`\n",
        "\n## Top Prior Prefix Overlaps\n\n",
    ]
    if not overlaps:
        lines.append("No prior request shares a first block with the target.\n")
    else:
        lines.append("| prior request | timestamp | input | output | lcp blocks | lcp tokens | partial final |\n")
        lines.append("|---:|---:|---:|---:|---:|---:|---|\n")
        lines.extend(
            f"| {item['request_id']} | {item['timestamp']} | "
            f"{item['input_length']} | {item['output_length']} | "
            f"{item['common_prefix_blocks_with_target']} | "
            f"{item['common_prefix_tokens_with_target']} | "
            f"{item['partial_final_block']} |\n"
            for item in overlaps[:top_k]
        )
    lines.append("\n## Local Window\n\n")
    lines.append("| request | timestamp | input | output | blocks | partial final | first hashes |\n")
    lines.append("|---:|---:|---:|---:|---:|---|---|\n")
    lines.extend(
        f"| {item['request_id']} | {item['timestamp']} | "
        f"{item['input_length']} | {item['output_length']} | "
        f"{item['hash_count']} | {item['partial_final_block']} | "
        f"`{item['first_hash_ids']}` |\n"
        for item in local_window
    )
    return lines
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
98
tools/analyze_vllm_prefix_log.py
Normal file
98
tools/analyze_vllm_prefix_log.py
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Summarize vLLM scheduler prefix-cache `computed:` log lines."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
START_RE = re.compile(r"Request (\d+) started running, prompt: (\d+), computed: (\d+)")
|
||||
PREEMPT_RE = re.compile(r"Request (\d+) preempted")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the vLLM prefix-log summarizer.

    Returns:
        Namespace with the positional ``stdout_log`` path and the optional
        ``summary_json`` path (``None`` when not supplied).
    """
    description = (
        "Parse vLLM scheduler logs and report observed computed-token prefix-cache behavior. "
        "Repeated starts indicate preemption or re-admission, so all-start sums "
        "are not equivalent to per-request prefix hits."
    )
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("stdout_log", type=Path)
    cli.add_argument("--summary-json", type=Path)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_estimated_hit_tokens(path: Path | None) -> int | None:
    """Read ``estimated_prefix_reuse.hit_tokens`` from a summary JSON file.

    Args:
        path: Summary file to read, or ``None`` when no summary was given.

    Returns:
        The hit-token count as an int, or ``None`` when *path* is ``None``,
        the summary is not a JSON object, the ``estimated_prefix_reuse``
        section is missing or not an object, or it has no ``hit_tokens``.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if path is None:
        return None
    summary = json.loads(path.read_text(encoding="utf-8"))
    # A null or otherwise non-object section means "no estimate"; it must not
    # abort the log analysis with an AttributeError.
    reuse = summary.get("estimated_prefix_reuse") if isinstance(summary, dict) else None
    if not isinstance(reuse, dict):
        return None
    hit_tokens = reuse.get("hit_tokens")
    return int(hit_tokens) if hit_tokens is not None else None
|
||||
|
||||
|
||||
def main() -> int:
    """Summarize request starts and preemptions found in a vLLM stdout log.

    Groups every ``started running`` line by request id, counts ``preempted``
    lines, totals the ``computed`` token values in four ways (all starts,
    first / last start per request, maximum per request) and prints the
    result as sorted, indented JSON. When a summary JSON supplies an
    estimated hit-token count, each total is also compared against it.

    Returns:
        Process exit code (always 0 on success).
    """
    args = parse_args()
    log_text = args.stdout_log.read_text(encoding="utf-8", errors="replace")

    starts_by_request: dict[int, list[dict[str, int]]] = {}
    for hit in START_RE.finditer(log_text):
        request_id, prompt_tokens, computed_tokens = (int(group) for group in hit.groups())
        entry = {"prompt_tokens": prompt_tokens, "computed_tokens": computed_tokens}
        starts_by_request.setdefault(request_id, []).append(entry)

    preempted_request_ids = [int(hit.group(1)) for hit in PREEMPT_RE.finditer(log_text)]

    # Requests that started more than once, in ascending id order.
    repeated_ids = sorted(
        request_id for request_id, starts in starts_by_request.items() if len(starts) > 1
    )
    repeated = {str(request_id): starts_by_request[request_id] for request_id in repeated_ids}

    # One list of computed-token values per request, in log order.
    computed_series = [
        [start["computed_tokens"] for start in starts]
        for starts in starts_by_request.values()
    ]
    computed_totals = {
        "all_starts": sum(map(sum, computed_series)),
        "first_start_per_request": sum(series[0] for series in computed_series),
        "last_start_per_request": sum(series[-1] for series in computed_series),
        "max_per_request": sum(map(max, computed_series)),
    }

    result: dict[str, Any] = {
        "stdout_log": str(args.stdout_log),
        "starts_total": sum(map(len, computed_series)),
        "unique_requests": len(starts_by_request),
        "preemptions": len(preempted_request_ids),
        "preempted_request_ids": preempted_request_ids,
        "repeated_request_ids": repeated_ids,
        "computed_tokens": computed_totals,
        "repeated_starts": repeated,
    }

    estimated_hit_tokens = load_estimated_hit_tokens(args.summary_json)
    if estimated_hit_tokens is not None:
        result["estimated_prefix_hit_tokens"] = estimated_hit_tokens
        result["matches_estimate"] = {
            name: total == estimated_hit_tokens for name, total in computed_totals.items()
        }

    print(json.dumps(result, indent=2, sort_keys=True))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
532
tools/build_frontier_vllm_alignment_report.py
Normal file
532
tools/build_frontier_vllm_alignment_report.py
Normal file
@@ -0,0 +1,532 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build Frontier-vs-vLLM alignment tables and plots for the current H20 runs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import json
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
OUT_DIR = ROOT / "docs" / "assets" / "frontier_vllm_alignment"
|
||||
DASH1_VLLM_ROOT = Path("/home/admin/cpfs/wjh/replayserve/runs/vllm_gpu_smoke_20260625_dash1")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RunSpec:
    """Immutable description of one Frontier run paired with a vLLM run.

    Each instance names where the two summaries live and carries the labels
    and hand-entered figures that are copied into the report row.
    """

    # Identifier and human-readable label copied into the report row.
    run_id: str
    label: str
    # Presumably the tensor-parallel degree (entries use 1, 2 and 4) -- TODO confirm.
    tp: int
    request_count: int
    # Text and numeric form of the run's scale setting; the exact meaning of
    # "scale" is not visible in this file.
    scale_label: str
    scale_value: float
    fixture: str
    # Path of the Frontier postprocess summary; joined onto ROOT when loaded.
    frontier_summary: str
    # Path of the vLLM summary JSON; how it is loaded depends on vllm_remote.
    vllm_summary: str
    # vLLM preemption count supplied by hand and copied into the report row.
    vllm_preemptions: int
    kv_blocks: int
    notes: str = ""
    # False: vllm_summary is read relative to ROOT. True: a local copy under
    # ROOT/runs is tried first, then the file is fetched over ssh.
    vllm_remote: bool = False
|
||||
|
||||
|
||||
RUNS: list[RunSpec] = [
|
||||
RunSpec(
|
||||
run_id="tp1_n100_scale1",
|
||||
label="TP1 N100 raw",
|
||||
tp=1,
|
||||
request_count=100,
|
||||
scale_label="raw",
|
||||
scale_value=1.0,
|
||||
fixture="coder_100",
|
||||
frontier_summary=(
|
||||
"runs/rs6_frontier_h20_tp1_profile_full32k_20260624/"
|
||||
"frontier_h20_tp1_profile_full32k/coder_100/"
|
||||
"vllm_kv_15281_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary="runs/vllm_gpu_smoke_20260624/tp1_coder100_uncapped/summary.json",
|
||||
vllm_preemptions=8,
|
||||
kv_blocks=15281,
|
||||
notes="Frontier incomplete before lifecycle fix; included as TP1 100-request baseline.",
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp1_n500_scale1",
|
||||
label="TP1 N500 raw",
|
||||
tp=1,
|
||||
request_count=500,
|
||||
scale_label="raw",
|
||||
scale_value=1.0,
|
||||
fixture="coder_500",
|
||||
frontier_summary=(
|
||||
"runs/rs8_frontier_h20_tp1_profile_full32k_coder500_20260625/"
|
||||
"frontier_h20_tp1_profile_full32k/coder_500/"
|
||||
"vllm_kv_15281_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder500_uncapped/summary.json",
|
||||
vllm_preemptions=63,
|
||||
kv_blocks=15281,
|
||||
notes="Frontier incomplete; useful as high-pressure stress signal.",
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp1_n200_scale0667",
|
||||
label="TP1 N200 scale 0.667",
|
||||
tp=1,
|
||||
request_count=200,
|
||||
scale_label="0.667",
|
||||
scale_value=2 / 3,
|
||||
fixture="coder_200_ts0667",
|
||||
frontier_summary=(
|
||||
"runs/rs9_frontier_h20_tp1_profile_full32k_coder200_ts0667/"
|
||||
"frontier_h20_tp1_profile_full32k/coder_200_ts0667/"
|
||||
"vllm_kv_15281_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder200_ts0667_uncapped/summary.json",
|
||||
vllm_preemptions=26,
|
||||
kv_blocks=15281,
|
||||
notes="Dense-arrival run; Frontier incomplete before lifecycle fix.",
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp1_n200_scale2",
|
||||
label="TP1 N200 scale 2",
|
||||
tp=1,
|
||||
request_count=200,
|
||||
scale_label="2",
|
||||
scale_value=2.0,
|
||||
fixture="coder_200_ts2",
|
||||
frontier_summary=(
|
||||
"runs/rs10_preemption_replay_fix_ts2/frontier_h20_tp1_profile_full32k/"
|
||||
"coder_200_ts2/vllm_kv_15281_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts2_uncapped/summary.json",
|
||||
vllm_preemptions=43,
|
||||
kv_blocks=15281,
|
||||
notes="After Frontier decode-preemption lifecycle fix.",
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp1_n200_scale3",
|
||||
label="TP1 N200 scale 3",
|
||||
tp=1,
|
||||
request_count=200,
|
||||
scale_label="3",
|
||||
scale_value=3.0,
|
||||
fixture="coder_200_ts3",
|
||||
frontier_summary=(
|
||||
"runs/rs10_preemption_replay_fix_ts3/frontier_h20_tp1_profile_full32k/"
|
||||
"coder_200_ts3/vllm_kv_15281_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary="runs/vllm_gpu_smoke_20260625_dash1/tp1_coder_200_ts3_uncapped/summary.json",
|
||||
vllm_preemptions=16,
|
||||
kv_blocks=15281,
|
||||
notes="After Frontier decode-preemption lifecycle fix.",
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp2_n200_scale2",
|
||||
label="TP2 N200 scale 2",
|
||||
tp=2,
|
||||
request_count=200,
|
||||
scale_label="2",
|
||||
scale_value=2.0,
|
||||
fixture="coder_200_ts2",
|
||||
frontier_summary=(
|
||||
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
|
||||
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
|
||||
"tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts2_uncapped" / "summary.json"),
|
||||
vllm_preemptions=0,
|
||||
kv_blocks=69055,
|
||||
notes="Uses true-mixed TP2/TP4 attention profile.",
|
||||
vllm_remote=True,
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp2_n200_scale3",
|
||||
label="TP2 N200 scale 3",
|
||||
tp=2,
|
||||
request_count=200,
|
||||
scale_label="3",
|
||||
scale_value=3.0,
|
||||
fixture="coder_200_ts3",
|
||||
frontier_summary=(
|
||||
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
|
||||
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
|
||||
"tp2_vllm_kv_69055_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary=str(DASH1_VLLM_ROOT / "tp2_coder_200_ts3_uncapped" / "summary.json"),
|
||||
vllm_preemptions=0,
|
||||
kv_blocks=69055,
|
||||
notes="Uses true-mixed TP2/TP4 attention profile.",
|
||||
vllm_remote=True,
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp4_n200_scale2",
|
||||
label="TP4 N200 scale 2",
|
||||
tp=4,
|
||||
request_count=200,
|
||||
scale_label="2",
|
||||
scale_value=2.0,
|
||||
fixture="coder_200_ts2",
|
||||
frontier_summary=(
|
||||
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
|
||||
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts2/"
|
||||
"tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts2_uncapped" / "summary.json"),
|
||||
vllm_preemptions=0,
|
||||
kv_blocks=177077,
|
||||
notes="Uses true-mixed TP2/TP4 attention profile.",
|
||||
vllm_remote=True,
|
||||
),
|
||||
RunSpec(
|
||||
run_id="tp4_n200_scale3",
|
||||
label="TP4 N200 scale 3",
|
||||
tp=4,
|
||||
request_count=200,
|
||||
scale_label="3",
|
||||
scale_value=3.0,
|
||||
fixture="coder_200_ts3",
|
||||
frontier_summary=(
|
||||
"runs/rs12_frontier_h20_tp2_tp4_profile_full32k_coder200_ts2_ts3/"
|
||||
"frontier_h20_tp2_tp4_profile_full32k/coder_200_ts3/"
|
||||
"tp4_vllm_kv_177077_profile_full32k/postprocess_summary.json"
|
||||
),
|
||||
vllm_summary=str(DASH1_VLLM_ROOT / "tp4_coder_200_ts3_uncapped" / "summary.json"),
|
||||
vllm_preemptions=0,
|
||||
kv_blocks=177077,
|
||||
notes="Uses true-mixed TP2/TP4 attention profile.",
|
||||
vllm_remote=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
FIELDNAMES = [
|
||||
"run_id",
|
||||
"label",
|
||||
"tp",
|
||||
"request_count",
|
||||
"scale_label",
|
||||
"scale_value",
|
||||
"fixture",
|
||||
"kv_blocks",
|
||||
"frontier_completed",
|
||||
"frontier_total",
|
||||
"frontier_complete",
|
||||
"vllm_completed",
|
||||
"vllm_total",
|
||||
"frontier_preemptions",
|
||||
"vllm_preemptions",
|
||||
"frontier_prefix_hit",
|
||||
"vllm_prefix_hit",
|
||||
"prefix_hit_delta",
|
||||
"frontier_rps",
|
||||
"vllm_rps",
|
||||
"rps_ratio",
|
||||
"frontier_total_tps",
|
||||
"vllm_total_tps",
|
||||
"total_tps_ratio",
|
||||
"frontier_decode_tps",
|
||||
"vllm_decode_tps",
|
||||
"decode_tps_ratio",
|
||||
"frontier_ttft_p50_s",
|
||||
"vllm_ttft_p50_s",
|
||||
"ttft_p50_ratio",
|
||||
"frontier_ttft_p95_s",
|
||||
"vllm_ttft_p95_s",
|
||||
"ttft_p95_ratio",
|
||||
"frontier_tpot_p50_s",
|
||||
"vllm_tpot_p50_s",
|
||||
"tpot_p50_ratio",
|
||||
"frontier_tpot_p95_s",
|
||||
"vllm_tpot_p95_s",
|
||||
"tpot_p95_ratio",
|
||||
"frontier_e2e_p50_s",
|
||||
"vllm_e2e_p50_s",
|
||||
"e2e_p50_ratio",
|
||||
"frontier_e2e_p95_s",
|
||||
"vllm_e2e_p95_s",
|
||||
"e2e_p95_ratio",
|
||||
"notes",
|
||||
]
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict[str, Any]:
    """Load *path* as JSON and require the top-level value to be an object.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
        ValueError: If the top-level JSON value is not an object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: expected JSON object")
|
||||
|
||||
|
||||
def load_vllm_summary(spec: RunSpec) -> dict[str, Any]:
    """Load the vLLM summary JSON for *spec*.

    Non-remote specs are read relative to ``ROOT``. Remote specs first try a
    local copy under ``ROOT/runs/<dash1 run root name>/<run>/<file>`` and
    only then fetch the file with ``ssh dash1 cat <path>``.

    Returns:
        The parsed summary object.

    Raises:
        ValueError: If the summary is not a JSON object.
        subprocess.CalledProcessError: If the ssh fetch fails.
    """
    import shlex

    path = Path(spec.vllm_summary)
    if not spec.vllm_remote:
        return load_json(ROOT / path)

    # Derive the local directory name from the remote root so the two cannot
    # drift apart.
    local_candidate = ROOT / "runs" / DASH1_VLLM_ROOT.name / path.parent.name / path.name
    if local_candidate.exists():
        return load_json(local_candidate)

    # ssh hands the command string to the remote shell, so quote the path;
    # quoting leaves plain paths unchanged.
    remote_command = f"cat {shlex.quote(spec.vllm_summary)}"
    raw = subprocess.check_output(["ssh", "dash1", remote_command], text=True)
    data = json.loads(raw)
    if not isinstance(data, dict):
        raise ValueError(f"{spec.vllm_summary}: expected JSON object")
    return data
|
||||
|
||||
|
||||
def load_frontier_summary(spec: RunSpec) -> tuple[dict[str, Any], dict[str, Any]]:
    """Load the Frontier post-process summary and its system metrics.

    The summary is read from ``ROOT / spec.frontier_summary``. Its
    ``system_metrics`` entry names the metrics file; a relative entry is
    resolved against ``ROOT``.

    Returns:
        ``(postprocess_summary, system_metrics)``.

    Raises:
        KeyError: If the summary has no ``system_metrics`` entry.
    """
    post = load_json(ROOT / spec.frontier_summary)
    recorded = Path(post["system_metrics"])
    metrics_path = recorded if recorded.is_absolute() else ROOT / recorded
    return post, load_json(metrics_path)
|
||||
|
||||
|
||||
def ratio(numerator: float | int | None, denominator: float | int | None) -> float | None:
    """Return ``numerator / denominator`` as a float.

    Returns ``None`` when either operand is ``None`` or the denominator is zero.
    """
    undefined = numerator is None or denominator is None or denominator == 0
    if undefined:
        return None
    return float(numerator) / float(denominator)
|
||||
|
||||
|
||||
def nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow ``keys`` through nested dicts and return the value reached.

    Returns ``None`` as soon as a non-dict is encountered before all keys are
    consumed, or when a key is absent (via ``dict.get``).
    """
    current: Any = data
    for key in keys:
        if isinstance(current, dict):
            current = current.get(key)
        else:
            return None
    return current
|
||||
|
||||
|
||||
def summarize(spec: RunSpec) -> dict[str, Any]:
    """Build one Frontier-vs-vLLM comparison row for ``spec``.

    Loads the Frontier postprocess summary, the Frontier system metrics and
    the vLLM summary, then flattens them into a dict. Missing metrics are
    recorded as ``None`` rather than raising, so one incomplete run does not
    abort the whole report. ``*_ratio`` entries are Frontier / vLLM and are
    ``None`` when either side is missing or the vLLM value is zero.
    """
    post, system = load_frontier_summary(spec)
    vllm = load_vllm_summary(spec)

    def _frontier_seconds(section: str, percentile: str) -> float | None:
        # Frontier statistics are divided by 1000 to fill the ``*_s`` columns
        # (presumably milliseconds -> seconds; confirm against Frontier output).
        value = nested(system, section, percentile)
        return None if value is None else value / 1000

    # ``or {}`` also covers keys that are present but null in the JSON.
    completion = post.get("completion") or {}
    preemption = post.get("preemption_statistics") or {}
    prefix = post.get("prefix_cache_postprocess") or {}
    token_weighted = prefix.get("replayserve_token_weighted") or {}
    throughput = system.get("throughput_metrics") or {}

    prompt_tps = vllm.get("prompt_tokens_per_second")
    generated_tps = vllm.get("generated_tokens_per_second")
    vllm_total_tps = (
        None if prompt_tps is None or generated_tps is None else prompt_tps + generated_tps
    )
    frontier_prefix_hit = token_weighted.get("hit_ratio")
    vllm_prefix_hit = nested(vllm, "estimated_prefix_reuse", "token_hit_ratio")

    row: dict[str, Any] = {
        "run_id": spec.run_id,
        "label": spec.label,
        "tp": spec.tp,
        "request_count": spec.request_count,
        "scale_label": spec.scale_label,
        "scale_value": spec.scale_value,
        "fixture": spec.fixture,
        "kv_blocks": spec.kv_blocks,
        "frontier_completed": completion.get("completed_requests"),
        "frontier_total": completion.get("total_requests"),
        "frontier_complete": completion.get("is_complete"),
        # Both vLLM columns are filled from "rows", as in the original report.
        "vllm_completed": vllm.get("rows"),
        "vllm_total": vllm.get("rows"),
        "frontier_preemptions": preemption.get("total_preemption_events"),
        "vllm_preemptions": spec.vllm_preemptions,
        "frontier_prefix_hit": frontier_prefix_hit,
        "vllm_prefix_hit": vllm_prefix_hit,
        "prefix_hit_delta": (
            float(frontier_prefix_hit) - float(vllm_prefix_hit)
            if frontier_prefix_hit is not None and vllm_prefix_hit is not None
            else None
        ),
        "frontier_rps": throughput.get("requests_per_second"),
        "vllm_rps": vllm.get("requests_per_second"),
        "frontier_total_tps": throughput.get("tokens_per_second"),
        "vllm_total_tps": vllm_total_tps,
        "frontier_decode_tps": throughput.get("decode_tokens_per_second"),
        "vllm_decode_tps": generated_tps,
        "notes": spec.notes,
    }

    # (column stem, Frontier statistics section, vLLM statistics section)
    latency_sources = (
        ("ttft", "ttft_statistics", "ttft_s"),
        ("tpot", "tpot_statistics", "tpot_s"),
        ("e2e", "request_e2e_time_statistics", "e2e_s"),
    )
    for stem, frontier_section, vllm_section in latency_sources:
        for percentile in ("p50", "p95"):
            row[f"frontier_{stem}_{percentile}_s"] = _frontier_seconds(
                frontier_section, percentile
            )
            row[f"vllm_{stem}_{percentile}_s"] = nested(vllm, vllm_section, percentile)

    for name in [
        "rps",
        "total_tps",
        "decode_tps",
        "ttft_p50_s",
        "ttft_p95_s",
        "tpot_p50_s",
        "tpot_p95_s",
        "e2e_p50_s",
        "e2e_p95_s",
    ]:
        # Ratio columns drop the unit suffix: ttft_p50_s -> ttft_p50_ratio.
        row[f"{name.removesuffix('_s')}_ratio"] = ratio(
            row.get(f"frontier_{name}"), row.get(f"vllm_{name}")
        )

    return row
|
||||
|
||||
|
||||
def fmt(value: Any) -> str:
    """Render a cell value as text for the CSV output.

    ``None`` becomes an empty string, booleans become ``"true"``/``"false"``,
    floats use up to 10 significant digits, anything else goes through ``str``.
    """
    if value is None:
        return ""
    # bool is tested before the generic fallback so True/False never render
    # as "True"/"False".
    if isinstance(value, bool):
        text = "true" if value else "false"
    elif isinstance(value, float):
        text = f"{value:.10g}"
    else:
        text = str(value)
    return text
|
||||
|
||||
|
||||
def write_csv(rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``OUT_DIR / "frontier_vllm_alignment.csv"``.

    Columns follow ``FIELDNAMES``; every cell is rendered with ``fmt`` and keys
    absent from a row are written as empty cells.
    """
    destination = OUT_DIR / "frontier_vllm_alignment.csv"
    with destination.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(
            {column: fmt(record.get(column)) for column in FIELDNAMES} for record in rows
        )
|
||||
|
||||
|
||||
def write_json(rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``OUT_DIR / "frontier_vllm_alignment.json"``.

    The output is indented JSON with sorted keys and a trailing newline.
    """
    destination = OUT_DIR / "frontier_vllm_alignment.json"
    serialized = json.dumps(rows, indent=2, sort_keys=True)
    with destination.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
        handle.write("\n")
|
||||
|
||||
|
||||
def setup_axis(ax: plt.Axes, title: str, ylabel: str) -> None:
    """Apply the report's shared axis styling to ``ax``.

    Sets the title and y-axis label, enables faint horizontal grid lines and
    hides the top and right spines.
    """
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.grid(axis="y", alpha=0.25)
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontsize=12, pad=10)
|
||||
|
||||
|
||||
def annotate_bars(ax: plt.Axes, bars: Any, fmt_text: str = "{:.2f}") -> None:
    """Label every bar in ``bars`` with its height, just above the bar.

    Bars whose height is NaN are left unlabelled. Labels for bars taller than
    2.5 are rotated by 90 degrees.
    """
    for bar in bars:
        value = bar.get_height()
        is_nan = value != value  # NaN is the only value unequal to itself
        if is_nan:
            continue
        center_x = bar.get_x() + bar.get_width() / 2
        angle = 90 if value > 2.5 else 0
        ax.annotate(
            fmt_text.format(value),
            xy=(center_x, value),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=7,
            rotation=angle,
        )
|
||||
|
||||
|
||||
def savefig(name: str) -> None:
    """Save the current pyplot figure to ``OUT_DIR / name`` and close it.

    The layout is tightened first and the image is written at 180 dpi.
    """
    destination = OUT_DIR / name
    plt.tight_layout()
    plt.savefig(destination, dpi=180)
    plt.close()
|
||||
|
||||
|
||||
def plot_throughput_ratio(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier / vLLM total-throughput ratio per run as a bar chart.

    Bars are coloured by tensor-parallel size; runs whose Frontier side did
    not complete are hatched and drawn lighter. A dashed line marks ratio 1.0.
    Written to ``throughput_ratio.png`` through ``savefig``.
    """
    nan = float("nan")
    labels = [row["label"] for row in rows]
    positions = list(range(len(rows)))
    colors = {1: "#4C78A8", 2: "#F58518", 4: "#54A24B"}
    fallback_color = "#9D9D9D"  # TP sizes without a dedicated colour
    # ratio() yields None for an undefined ratio; plot those as NaN so the bar
    # is simply absent and annotate_bars() (which skips NaN) leaves it blank.
    heights = [
        nan if row["total_tps_ratio"] is None else row["total_tps_ratio"] for row in rows
    ]

    _, ax = plt.subplots(figsize=(12, 4.8))
    bars = ax.bar(
        positions,
        heights,
        color=[colors.get(row["tp"], fallback_color) for row in rows],
        alpha=0.9,
    )
    for bar, row in zip(bars, rows, strict=True):
        if not row["frontier_complete"]:
            bar.set_hatch("//")
            bar.set_alpha(0.65)
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=35, ha="right")
    setup_axis(ax, "Frontier Throughput Relative to vLLM", "Frontier / vLLM total tok/s")
    annotate_bars(ax, bars)
    savefig("throughput_ratio.png")
|
||||
|
||||
|
||||
def plot_latency_ratios(rows: list[dict[str, Any]]) -> None:
    """Plot grouped bars of Frontier / vLLM latency ratios per run.

    Each run gets three bars: TTFT p95, TPOT p50 and E2E p95. A dashed line
    marks ratio 1.0. Written to ``latency_ratios.png`` through ``savefig``.
    """
    nan = float("nan")

    def _series(key: str) -> list[float]:
        # ratio() yields None for an undefined ratio; NaN leaves the bar blank
        # and is skipped by annotate_bars().
        return [nan if row[key] is None else row[key] for row in rows]

    labels = [row["label"] for row in rows]
    x = list(range(len(rows)))
    width = 0.26
    _, ax = plt.subplots(figsize=(13, 5.2))
    bar_groups = [
        ax.bar([i - width for i in x], _series("ttft_p95_ratio"), width, label="TTFT p95"),
        ax.bar(x, _series("tpot_p50_ratio"), width, label="TPOT p50"),
        ax.bar([i + width for i in x], _series("e2e_p95_ratio"), width, label="E2E p95"),
    ]
    ax.axhline(1.0, color="#222222", linewidth=1, linestyle="--")
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=35, ha="right")
    ax.legend(frameon=False, ncols=3, loc="upper left")
    setup_axis(ax, "Latency Ratios", "Frontier / vLLM")
    for bars in bar_groups:
        annotate_bars(ax, bars)
    savefig("latency_ratios.png")
|
||||
|
||||
|
||||
def plot_tp_scaling(
    rows: list[dict[str, Any]],
    request_count: int = 200,
    scales: tuple[str, ...] = ("2", "3"),
) -> None:
    """Plot total tok/s against tensor-parallel size, one panel per scale.

    Only rows with ``request_count`` requests and a ``scale_label`` in
    ``scales`` are used. Each panel shows a Frontier line and a vLLM line.
    A scale with no matching rows gets an empty panel instead of raising.
    Written to ``tp_scaling_total_tps.png`` through ``savefig``.

    Args:
        rows: Comparison rows as produced by ``summarize``.
        request_count: Request count selecting the runs to plot.
        scales: Timestamp-scale labels, one subplot each, left to right.
    """
    if not scales:
        return  # no panels requested, nothing to draw

    wanted = set(scales)
    groups: dict[str, dict[Any, dict[str, Any]]] = {}
    for row in rows:
        if row["request_count"] == request_count and row["scale_label"] in wanted:
            groups.setdefault(row["scale_label"], {})[row["tp"]] = row

    # squeeze=False keeps ``axes`` two-dimensional even for a single panel.
    _, axes = plt.subplots(
        1, len(scales), figsize=(5.5 * len(scales), 4.2), sharey=False, squeeze=False
    )
    for ax, scale in zip(axes[0], scales, strict=True):
        group = groups.get(scale, {})
        tps = sorted(group)
        ax.plot(tps, [group[tp]["frontier_total_tps"] for tp in tps], marker="o", label="Frontier")
        ax.plot(tps, [group[tp]["vllm_total_tps"] for tp in tps], marker="o", label="vLLM")
        ax.set_xticks(tps)
        ax.set_xlabel("Tensor parallel size")
        setup_axis(ax, f"N={request_count}, timestamp scale {scale}", "total tok/s")
        ax.legend(frameon=False)
    savefig("tp_scaling_total_tps.png")
|
||||
|
||||
|
||||
def plot_completion_prefix(rows: list[dict[str, Any]]) -> None:
    """Plot Frontier completion fraction (bars) with prefix hit ratios (lines).

    The left axis shows ``frontier_completed / frontier_total`` per run; the
    right axis shows the Frontier and vLLM prefix token hit ratios. Missing
    values are drawn as gaps. Written to ``completion_prefix.png`` through
    ``savefig``.
    """
    nan = float("nan")

    def _completion_fraction(row: dict[str, Any]) -> float:
        done, total = row["frontier_completed"], row["frontier_total"]
        # Missing counts or a zero total have no defined fraction.
        if done is None or not total:
            return nan
        return done / total

    def _hit_series(key: str) -> list[float]:
        return [nan if row[key] is None else row[key] for row in rows]

    labels = [row["label"] for row in rows]
    x = list(range(len(rows)))
    _, ax1 = plt.subplots(figsize=(12, 4.8))
    completion = [_completion_fraction(row) for row in rows]
    bars = ax1.bar(x, completion, color="#72B7B2", alpha=0.8, label="Frontier completion")
    ax1.set_ylim(0, 1.08)
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=35, ha="right")
    setup_axis(ax1, "Completion and Prefix Reuse", "Frontier completed / total")

    ax2 = ax1.twinx()
    frontier_hits = _hit_series("frontier_prefix_hit")
    vllm_hits = _hit_series("vllm_prefix_hit")
    ax2.plot(x, frontier_hits, color="#E45756", marker="o", label="Frontier prefix hit")
    ax2.plot(
        x,
        vllm_hits,
        color="#4C78A8",
        marker="x",
        linestyle="--",
        label="vLLM trace-side prefix hit",
    )
    ax2.set_ylabel("prefix token hit ratio")
    # Keep the original 0.45 ceiling, but grow it when data would be clipped.
    observed = [value for value in (*frontier_hits, *vllm_hits) if value == value]
    ax2.set_ylim(0, max([0.45, *(1.1 * value for value in observed)]))
    lines, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(
        [bars, *lines],
        ["Frontier completion", *labels2],
        frameon=False,
        loc="upper left",
        ncols=2,
    )
    savefig("completion_prefix.png")
|
||||
|
||||
|
||||
def main() -> None:
    """Summarize every run in ``RUNS`` and emit the CSV, JSON and plots.

    Creates ``OUT_DIR`` if needed, then writes the tabular outputs followed by
    the four figures, and prints the number of rows written.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    rows = []
    for spec in RUNS:
        rows.append(summarize(spec))

    emitters = (
        write_csv,
        write_json,
        plot_throughput_ratio,
        plot_latency_ratios,
        plot_tp_scaling,
        plot_completion_prefix,
    )
    for emit in emitters:
        emit(rows)
    print(f"Wrote {len(rows)} rows to {OUT_DIR}")
|
||||
|
||||
|
||||
# Generate the report only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
454
tools/postprocess_frontier_smoke.py
Executable file
454
tools/postprocess_frontier_smoke.py
Executable file
@@ -0,0 +1,454 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Summarize a Frontier RS1 smoke run."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Columns that request_metrics.csv must contain for the prefix-cache
# postprocess; compute_token_weighted_cache() reports the cache summary as
# unavailable when any of them is absent from the CSV header.
CACHE_COLUMNS = {
    "request_cached_prefill_tokens",
    "request_prefix_cache_query_blocks",
    "request_prefix_cache_hit_blocks",
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line: required ``--run-dir`` and ``--fixture-dir`` paths."""
    parser = argparse.ArgumentParser(description="Postprocess Frontier smoke output.")
    for flag in ("--run-dir", "--fixture-dir"):
        parser.add_argument(flag, required=True, type=Path)
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 JSON and return the decoded object.

    Raises:
        ValueError: if the top-level JSON value is not an object (dict).
    """
    with path.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: JSON value must be an object")
|
||||
|
||||
|
||||
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a UTF-8 JSON Lines file and return its objects in file order.

    Lines that are blank after stripping whitespace are ignored.

    Raises:
        ValueError: if a non-blank line decodes to something other than an
            object; the message carries the 1-based line number.
    """
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            text = raw_line.strip()
            if text:
                record = json.loads(text)
                if not isinstance(record, dict):
                    raise ValueError(f"{path}: line {line_number}: expected object")
                records.append(record)
    return records
|
||||
|
||||
|
||||
def load_csv(path: Path) -> tuple[list[str], list[dict[str, str]]]:
    """Read a UTF-8 CSV file and return ``(header, rows)``.

    ``header`` is the list of column names (empty for an empty file) and
    ``rows`` holds one dict per data row, as produced by ``csv.DictReader``.
    """
    with path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        # The header is read first, before the data rows are consumed.
        header = list(reader.fieldnames or [])
        records = list(reader)
    return header, records
|
||||
|
||||
|
||||
def find_metrics_dir(run_dir: Path) -> Path:
    """Return the directory holding the run's single ``system_metrics.json``.

    Searches recursively under ``run_dir / "frontier_metrics"``.

    Raises:
        ValueError: if zero or more than one ``system_metrics.json`` is found.
    """
    matches = sorted(run_dir.glob("frontier_metrics/**/system_metrics.json"))
    if len(matches) == 1:
        (only,) = matches
        return only.parent
    raise ValueError(
        f"{run_dir}: expected exactly one system_metrics.json under "
        f"frontier_metrics, found {len(matches)}"
    )
|
||||
|
||||
|
||||
def read_text_if_exists(path: Path) -> str:
    """Return the contents of ``path``, or an empty string if it does not exist.

    The file is decoded as UTF-8 with undecodable bytes replaced.
    """
    return path.read_text(encoding="utf-8", errors="replace") if path.exists() else ""
|
||||
|
||||
|
||||
def parse_memory_state(log_text: str) -> dict[str, Any]:
    """Extract the last ``[MEMORY_STATE]`` record from ``log_text``.

    Returns ``{"available": False}`` when the log has no such record.
    Otherwise the block and batch-size fields are returned as ints, while
    ``max_request_slots`` is kept as the raw matched text.
    """
    pattern = (
        r"\[MEMORY_STATE\]\s+total_blocks=(?P<total_blocks>\d+),\s+"
        r"max_blocks_per_sequence=(?P<max_blocks_per_sequence>\d+),\s+"
        r"max_request_slots=(?P<max_request_slots>[^,]+),\s+"
        r"max_batch_size=(?P<max_batch_size>\d+)"
    )
    latest = None
    for latest in re.finditer(pattern, log_text):
        pass  # keep only the final match
    if latest is None:
        return {"available": False}

    return {
        "available": True,
        "total_blocks": int(latest.group("total_blocks")),
        "max_blocks_per_sequence": int(latest.group("max_blocks_per_sequence")),
        "max_request_slots": latest.group("max_request_slots"),
        "max_batch_size": int(latest.group("max_batch_size")),
        "source": "last [MEMORY_STATE] log line",
    }
|
||||
|
||||
|
||||
def extract_scheduler_config(config: dict[str, Any]) -> dict[str, Any]:
    """Return ``config["cluster_config"]["replica_scheduler_config"]``.

    An empty dict is returned when either level is missing or is not a dict.
    """
    cluster = config.get("cluster_config")
    scheduler = (
        cluster.get("replica_scheduler_config") if isinstance(cluster, dict) else None
    )
    if isinstance(scheduler, dict):
        return scheduler
    return {}
|
||||
|
||||
|
||||
def extract_replica_config(config: dict[str, Any]) -> dict[str, Any]:
    """Return ``config["cluster_config"]["replica_config"]``.

    An empty dict is returned when either level is missing or is not a dict.
    """
    cluster = config.get("cluster_config")
    replica = cluster.get("replica_config") if isinstance(cluster, dict) else None
    if isinstance(replica, dict):
        return replica
    return {}
|
||||
|
||||
|
||||
def compute_token_weighted_cache(
    request_metrics_path: Path,
    sidecar_path: Path,
) -> dict[str, Any]:
    """Aggregate prefix-cache hits at block level and token-weighted level.

    Each row of ``request_metrics.csv`` is joined to the sidecar record with
    the same request id. Block-level totals come straight from the CSV; the
    token-weighted totals count the tokens of the first ``hit_blocks`` entries
    of the sidecar's ``block_token_counts`` against its ``input_length``.

    Rows whose cache columns are empty or absent are skipped and their ids are
    listed under ``rows_with_missing_cache_metrics``.

    Returns:
        ``{"available": False, "reason": ...}`` when a required column is
        missing or no row had complete metrics; otherwise the aggregated
        summary with ``"available": True``.

    Raises:
        ValueError: if a CSV row names a request id unknown to the sidecar, or
            its block counts are inconsistent with the sidecar.
    """
    fieldnames, rows = load_csv(request_metrics_path)
    missing = sorted(CACHE_COLUMNS - set(fieldnames))
    if missing:
        return {
            "available": False,
            "reason": f"request_metrics.csv missing cache columns: {missing}",
        }
    if "Request Id" not in fieldnames:
        # Without ids the rows cannot be joined to the sidecar; report this
        # the same way as missing cache columns instead of a bare KeyError.
        return {
            "available": False,
            "reason": "request_metrics.csv missing column: 'Request Id'",
        }

    sidecar_by_id = {int(row["request_id"]): row for row in load_jsonl(sidecar_path)}
    total_query_blocks = 0
    total_hit_blocks = 0
    total_query_tokens = 0
    total_hit_tokens = 0
    total_frontier_cached_tokens = 0
    completed_rows = 0
    rows_with_missing_cache_metrics: list[int] = []

    for row in rows:
        request_id = int(float(row["Request Id"]))
        sidecar = sidecar_by_id.get(request_id)
        if sidecar is None:
            raise ValueError(f"request_metrics.csv contains unknown request id {request_id}")

        cache_values = [
            row["request_prefix_cache_query_blocks"],
            row["request_prefix_cache_hit_blocks"],
            row["request_cached_prefill_tokens"],
        ]
        # csv.DictReader fills the cells of a short row with None, so None is
        # treated like an empty cell rather than crashing in float(None).
        if any(value is None or value == "" for value in cache_values):
            rows_with_missing_cache_metrics.append(request_id)
            continue

        query_blocks = int(float(cache_values[0]))
        hit_blocks = int(float(cache_values[1]))
        cached_prefill_tokens = int(float(cache_values[2]))
        block_token_counts = [int(value) for value in sidecar["block_token_counts"]]
        input_length = int(sidecar["input_length"])

        if query_blocks != len(block_token_counts):
            raise ValueError(
                f"request {request_id}: query_blocks={query_blocks} does not match "
                f"sidecar blocks={len(block_token_counts)}"
            )
        if hit_blocks > query_blocks:
            raise ValueError(
                f"request {request_id}: hit_blocks={hit_blocks} > query_blocks={query_blocks}"
            )
        if hit_blocks < 0:
            # A negative count would silently slice from the end of the list.
            raise ValueError(f"request {request_id}: hit_blocks={hit_blocks} is negative")

        total_query_blocks += query_blocks
        total_hit_blocks += hit_blocks
        total_query_tokens += input_length
        total_hit_tokens += sum(block_token_counts[:hit_blocks])
        total_frontier_cached_tokens += cached_prefill_tokens
        completed_rows += 1

    if completed_rows == 0:
        return {
            "available": False,
            "reason": "no request rows had complete prefix-cache metrics",
            "rows_with_missing_cache_metrics": rows_with_missing_cache_metrics,
        }

    return {
        "available": True,
        "completed_request_rows": completed_rows,
        "total_request_metric_rows": len(rows),
        "rows_with_missing_cache_metrics": rows_with_missing_cache_metrics,
        "frontier_block_level": {
            "total_query_blocks": total_query_blocks,
            "total_hit_blocks": total_hit_blocks,
            "hit_ratio": (
                total_hit_blocks / total_query_blocks if total_query_blocks else 0.0
            ),
            "total_cached_prefill_tokens_frontier_whole_block": total_frontier_cached_tokens,
        },
        "replayserve_token_weighted": {
            "total_query_tokens": total_query_tokens,
            "total_hit_tokens": total_hit_tokens,
            "hit_ratio": (
                total_hit_tokens / total_query_tokens if total_query_tokens else 0.0
            ),
        },
        "semantics": (
            "Frontier reports whole-block hits; ReplayServe weights the first "
            "hit_blocks sidecar block_token_counts, so partial final blocks count "
            "by their true token length when they are hit."
        ),
    }
|
||||
|
||||
|
||||
def compute_completion_summary(
    system_metrics: dict[str, Any],
    request_metrics_path: Path,
) -> dict[str, Any]:
    """Summarize how many requests completed and which lack a latency value.

    Request totals come from ``system_metrics["simulation_metadata"]``; the
    total falls back to the number of CSV rows when it is absent or zero. A
    run is complete only when it has at least one request, every request is
    reported completed, and no CSV row is missing ``request_e2e_time``.

    Returns:
        A dict with ``is_complete``, ``total_requests``,
        ``completed_requests``, ``request_metric_rows`` and
        ``missing_latency_request_ids``.
    """
    fieldnames, rows = load_csv(request_metrics_path)
    missing_latency_rows: list[int] = []
    if "Request Id" in fieldnames and "request_e2e_time" in fieldnames:
        # csv.DictReader fills the cells of a short row with None; a falsy
        # check treats both None and "" as a missing latency.
        missing_latency_rows = [
            int(float(row["Request Id"]))
            for row in rows
            if not row.get("request_e2e_time")
        ]

    # ``or {}`` also covers a metadata key that is present but null.
    metadata = system_metrics.get("simulation_metadata") or {}
    total_requests = int(metadata.get("total_requests") or len(rows))
    completed_requests = int(metadata.get("completed_requests") or 0)
    is_complete = (
        total_requests > 0
        and completed_requests == total_requests
        and not missing_latency_rows
    )

    return {
        "is_complete": is_complete,
        "total_requests": total_requests,
        "completed_requests": completed_requests,
        "request_metric_rows": len(rows),
        "missing_latency_request_ids": missing_latency_rows,
    }
|
||||
|
||||
|
||||
def get_nested(data: dict[str, Any], *keys: str) -> Any:
    """Follow ``keys`` through nested dicts and return the value reached.

    Returns ``None`` as soon as a non-dict is encountered before all keys are
    consumed, or when a key is absent (via ``dict.get``).
    """
    current: Any = data
    for key in keys:
        if isinstance(current, dict):
            current = current.get(key)
        else:
            return None
    return current
|
||||
|
||||
|
||||
def estimate_memory_planner_blocks(
    *,
    config: dict[str, Any],
    scheduler_config: dict[str, Any],
    model_weight_memory: dict[str, Any] | None,
) -> dict[str, Any]:
    """Estimate the KV-cache block count from configuration alone.

    This is the fallback used when the logs contain no ``[MEMORY_STATE]``
    line. Memory left after parameters and non-KV overhead is divided by the
    per-layer page size and the layer count. Per the ``source`` string this
    mirrors Frontier's ``MemoryPlanner.get_num_blocks`` (not verifiable from
    this file).

    Because it is a best-effort fallback, missing or non-positive inputs yield
    ``{"available": False, "reason": ...}`` instead of raising.

    Args:
        config: Full run configuration; the replica config is taken from it.
        scheduler_config: Replica scheduler configuration.
        model_weight_memory: Weight-memory record with ``total_memory_bytes``.

    Returns:
        ``{"available": True, "total_blocks": ..., ...}`` with the inputs and
        intermediate values of the estimate, or an unavailable record.
    """
    replica_config = extract_replica_config(config)
    model_config = replica_config.get("model_config")
    device_config = replica_config.get("device_config")
    if (
        not isinstance(model_config, dict)
        or not isinstance(device_config, dict)
        or not isinstance(model_weight_memory, dict)
    ):
        return {"available": False, "reason": "missing model/device/weight config"}

    # ``or 0`` also covers an explicit null block_size.
    block_size = int(scheduler_config.get("block_size") or 0)
    if block_size <= 0:
        return {"available": False, "reason": "missing positive block_size"}

    try:
        total_memory_gb = float(device_config["total_memory_gb"])
        parameter_memory_bytes = int(model_weight_memory["total_memory_bytes"])
        embedding_dim = int(model_config["embedding_dim"])
        num_q_heads = int(model_config["num_q_heads"])
        num_kv_heads = int(model_config["num_kv_heads"])
        num_layers = int(model_config["num_layers"])
        attn_tp = int(replica_config["attn_tensor_parallel_size"])
    except KeyError as exc:
        return {"available": False, "reason": f"missing config field {exc.args[0]!r}"}

    # All three are used as divisors below.
    if num_layers <= 0 or attn_tp <= 0 or num_q_heads <= 0:
        return {
            "available": False,
            "reason": "num_layers, attn_tensor_parallel_size and num_q_heads must be positive",
        }

    gpu_memory_utilization = scheduler_config.get("gpu_memory_utilization")
    if gpu_memory_utilization is None:
        gpu_memory_utilization = 1.0 - float(replica_config.get("memory_margin_fraction", 0.1))
    gpu_memory_utilization = float(gpu_memory_utilization)

    overhead_bytes = int(scheduler_config.get("non_kv_cache_overhead_bytes") or 0)
    requested_memory_bytes = int(total_memory_gb * 1024**3 * gpu_memory_utilization)
    available_kv_cache_memory_bytes = (
        requested_memory_bytes - parameter_memory_bytes - overhead_bytes
    )

    head_dim = model_config.get("head_dim")
    if head_dim is None:
        head_dim = embedding_dim // num_q_heads
    head_dim = int(head_dim)
    kv_heads_per_tensor_parallel_worker = math.ceil(num_kv_heads / attn_tp)
    # NOTE(review): the 2 * 2 factor presumably stands for K and V tensors at
    # 2 bytes per element; confirm against Frontier's MemoryPlanner.
    page_size_bytes_per_layer_per_block = (
        2 * 2 * block_size * kv_heads_per_tensor_parallel_worker * head_dim
    )
    if available_kv_cache_memory_bytes <= 0 or page_size_bytes_per_layer_per_block <= 0:
        derived_num_blocks = 0
    else:
        derived_num_blocks = int(
            available_kv_cache_memory_bytes
            // page_size_bytes_per_layer_per_block
            // num_layers
        )

    return {
        "available": True,
        "source": "ReplayServe fallback using Frontier MemoryPlanner.get_num_blocks formula",
        "total_blocks": derived_num_blocks,
        "requested_memory_bytes": requested_memory_bytes,
        "parameter_memory_per_device_bytes": parameter_memory_bytes,
        "non_kv_cache_overhead_bytes": overhead_bytes,
        "available_kv_cache_memory_bytes": available_kv_cache_memory_bytes,
        "block_size": block_size,
        "num_layers": num_layers,
        "head_dim": head_dim,
        "num_kv_heads": num_kv_heads,
        "attn_tensor_parallel_size": attn_tp,
        "kv_heads_per_tensor_parallel_worker": kv_heads_per_tensor_parallel_worker,
        "page_size_bytes_per_layer_per_block": page_size_bytes_per_layer_per_block,
        "gpu_memory_utilization": gpu_memory_utilization,
        "total_memory_gb": total_memory_gb,
    }
|
||||
|
||||
|
||||
def main() -> int:
    """Postprocess one Frontier smoke run and write its summary files.

    Reads the run's system metrics, request metrics, config and logs plus the
    fixture sidecar, then writes ``postprocess_summary.json`` and
    ``postprocess_summary.md`` into the run directory.

    Returns:
        0 on success; 1 if any step raised, after printing the error to stderr.
    """
    args = parse_args()
    try:
        run_dir = args.run_dir
        fixture_dir = args.fixture_dir
        metrics_dir = find_metrics_dir(run_dir)
        system_metrics_path = metrics_dir / "system_metrics.json"
        request_metrics_path = metrics_dir / "request_metrics.csv"
        config_path = metrics_dir / "config.json"
        sidecar_path = fixture_dir / "sidecar.jsonl"

        system_metrics = load_json(system_metrics_path)
        config = load_json(config_path)
        scheduler_config = extract_scheduler_config(config)
        log_text = "\n".join(
            read_text_if_exists(run_dir / log_name)
            for log_name in ("stdout.log", "stderr.log")
        )
        completion_summary = compute_completion_summary(
            system_metrics, request_metrics_path
        )
        cache_summary = compute_token_weighted_cache(request_metrics_path, sidecar_path)

        model_weight_memory = get_nested(system_metrics, "model_weight_memory", "MONOLITHIC")
        memory_state = parse_memory_state(log_text)
        if not memory_state.get("available"):
            # No [MEMORY_STATE] line in the logs: estimate from configuration.
            memory_state = estimate_memory_planner_blocks(
                config=config,
                scheduler_config=scheduler_config,
                model_weight_memory=model_weight_memory,
            )
        preemption_statistics = system_metrics.get("preemption_statistics", {})
        pressure_pattern = re.compile(
            r"preempt|insufficient|cannot allocate|allocation pressure|oom",
            flags=re.IGNORECASE,
        )
        allocation_pressure_lines = [
            line for line in log_text.splitlines() if pressure_pattern.search(line)
        ]

        memory_planner = {
            "mode": scheduler_config.get("num_blocks_mode"),
            "gpu_memory_utilization": scheduler_config.get("gpu_memory_utilization"),
            "non_kv_cache_overhead_bytes": scheduler_config.get(
                "non_kv_cache_overhead_bytes"
            ),
            "derived": memory_state,
            "model_weight_memory_monolithic": model_weight_memory,
            "assumption": (
                "RS1 uses Frontier memory_planner with analytical parameter "
                "memory and non_kv_cache_overhead_bytes=0 for plumbing smoke."
            ),
        }
        summary = {
            "run_dir": str(run_dir),
            "fixture_dir": str(fixture_dir),
            "metrics_dir": str(metrics_dir),
            "system_metrics": str(system_metrics_path),
            "request_metrics": str(request_metrics_path),
            "config": str(config_path),
            "frontier_prefix_cache_statistics": system_metrics.get(
                "prefix_cache_statistics"
            ),
            "completion": completion_summary,
            "prefix_cache_postprocess": cache_summary,
            "memory_planner": memory_planner,
            "preemption_statistics": preemption_statistics,
            "allocation_pressure_log_line_count": len(allocation_pressure_lines),
            "allocation_pressure_log_excerpt": allocation_pressure_lines[:20],
        }

        output_json = run_dir / "postprocess_summary.json"
        output_md = run_dir / "postprocess_summary.md"
        with output_json.open("w", encoding="utf-8") as handle:
            json.dump(summary, handle, indent=2, sort_keys=True)
            handle.write("\n")

        # Assemble the Markdown report as a list of lines, written in one go.
        report = [
            f"# RS1 Frontier Smoke: {fixture_dir.name}\n\n",
            f"- Metrics dir: `{metrics_dir}`\n",
            f"- Frontier system metrics: `{system_metrics_path}`\n",
            f"- Frontier request metrics: `{request_metrics_path}`\n",
            "- Completion: "
            f"`{completion_summary['completed_requests']}/"
            f"{completion_summary['total_requests']}`\n",
        ]
        missing_latency_rows = completion_summary.get("missing_latency_request_ids") or []
        if missing_latency_rows:
            report.append(
                "- Missing latency request rows: "
                f"`{missing_latency_rows}`\n"
            )
        if cache_summary.get("available"):
            frontier_ratio = cache_summary["frontier_block_level"]["hit_ratio"]
            token_ratio = cache_summary["replayserve_token_weighted"]["hit_ratio"]
            report.append(f"- Frontier block-level prefix hit ratio: {frontier_ratio:.8f}\n")
            report.append(f"- ReplayServe token-weighted prefix hit ratio: {token_ratio:.8f}\n")
            missing_cache_rows = cache_summary.get("rows_with_missing_cache_metrics") or []
            if missing_cache_rows:
                report.append(
                    "- Prefix-cache metric rows skipped: "
                    f"`{missing_cache_rows}`\n"
                )
        else:
            report.append(
                f"- Prefix cache postprocess unavailable: {cache_summary.get('reason')}\n"
            )
        derived = memory_planner.get("derived", {})
        report.append(f"- Memory planner mode: `{memory_planner.get('mode')}`\n")
        report.append(
            f"- GPU memory utilization: `{memory_planner.get('gpu_memory_utilization')}`\n"
        )
        report.append(
            "- Non-KV overhead bytes assumption: "
            f"`{memory_planner.get('non_kv_cache_overhead_bytes')}`\n"
        )
        if derived.get("available"):
            report.append(f"- Derived KV blocks: `{derived.get('total_blocks')}`\n")
            report.append(f"- Max batch size: `{derived.get('max_batch_size', 'n/a')}`\n")
        else:
            report.append("- Derived KV blocks: not found in logs\n")
        preemptions = preemption_statistics.get("total_preemption_events")
        report.append(f"- Total preemption events: `{preemptions}`\n")
        report.append(
            f"- Allocation/preemption/OOM log lines: `{len(allocation_pressure_lines)}`\n"
        )
        with output_md.open("w", encoding="utf-8") as handle:
            handle.writelines(report)

    except Exception as exc:
        # Top-level boundary: report any failure as a one-line error and a
        # non-zero exit code.
        print(f"postprocess_frontier_smoke.py: error: {exc}", file=sys.stderr)
        return 1
    return 0
|
||||
|
||||
|
||||
# Run as a script: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
391
tools/qwen_to_frontier.py
Executable file
391
tools/qwen_to_frontier.py
Executable file
@@ -0,0 +1,391 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Convert Qwen JSONL traces to Frontier trace-replay CSV fixtures."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Field names for the Frontier CSV output.
# NOTE(review): where this list is used is outside this view — presumably it
# is the CSV header/column order; confirm against the writer code.
CSV_FIELDS = [
    "arrived_at",
    "num_prefill_tokens",
    "num_decode_tokens",
    "session_id",
    "block_hash_ids",
]
|
||||
|
||||
# Field names for the ReplayServe sidecar JSONL records.
# NOTE(review): where this list is used is outside this view — presumably it
# lists the keys written per sidecar record; confirm against the writer code.
SIDECAR_FIELDS = [
    "request_id",
    "chat_id",
    "parent_chat_id",
    "turn",
    "type",
    "timestamp",
    "input_length",
    "output_length",
    "hash_ids",
    "block_token_counts",
]
|
||||
|
||||
|
||||
def positive_int(value: str) -> int:
    """argparse ``type=`` converter that accepts only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if the parsed integer is zero or negative.
        ValueError: if ``value`` is not an integer literal.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
|
||||
|
||||
|
||||
def positive_float(value: str) -> float:
    """argparse ``type=`` converter that accepts only finite floats above zero.

    ``float()`` also parses "nan" and "inf". NaN would slip past a plain
    ``<= 0`` comparison (every comparison with NaN is false), so non-finite
    values are rejected explicitly.

    Raises:
        argparse.ArgumentTypeError: if the value is non-finite, zero or negative.
        ValueError: if ``value`` is not a float literal.
    """
    parsed = float(value)
    if not math.isfinite(parsed):
        raise argparse.ArgumentTypeError("must be a finite number")
    if parsed <= 0:
        raise argparse.ArgumentTypeError("must be positive")
    return parsed
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the converter's command line.

    ``--input``, ``--frontier-csv`` and ``--sidecar-jsonl`` are required; the
    remaining options are optional, with defaults of 32768 for
    ``--max-tokens``, 16 for ``--block-size`` and 1.0 for ``--timestamp-scale``.
    """
    parser = argparse.ArgumentParser(
        description="Convert Qwen JSONL to Frontier CSV plus ReplayServe sidecar."
    )
    add = parser.add_argument

    # Required input and output paths.
    add("--input", required=True, type=Path, help="Qwen JSONL path.")
    add("--frontier-csv", required=True, type=Path, help="Output Frontier CSV path.")
    add(
        "--sidecar-jsonl",
        required=True,
        type=Path,
        help="Output ReplayServe sidecar JSONL path.",
    )

    # Optional extra outputs and manifest metadata.
    add(
        "--source-jsonl",
        type=Path,
        help="Optional path for the original source JSONL slice.",
    )
    add("--manifest-json", type=Path, help="Optional path for fixture manifest JSON.")
    add("--fixture-name", help="Optional fixture name stored in the manifest.")

    # Conversion parameters.
    add("--limit", type=positive_int, help="Maximum number of rows to convert.")
    add("--max-tokens", type=positive_int, default=32768)
    add("--block-size", type=positive_int, default=16)
    add(
        "--timestamp-scale",
        type=positive_float,
        default=1.0,
        help="Multiply each source timestamp before writing fixture files.",
    )
    add(
        "--fail-on-overflow",
        action="store_true",
        help="Hard fail if input_length + output_length exceeds --max-tokens.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def require_int(row: dict[str, Any], key: str, line_number: int) -> int:
    """Return ``row[key]``, requiring it to be present and a true ``int``.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.

    Raises:
        ValueError: if the field is missing or not an int; the message is
            prefixed with the source line number.
    """
    try:
        candidate = row[key]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field {key!r}") from exc
    if isinstance(candidate, int) and not isinstance(candidate, bool):
        return candidate
    raise ValueError(f"line {line_number}: field {key!r} must be an int")
|
||||
|
||||
|
||||
def require_number(row: dict[str, Any], key: str, line_number: int) -> int | float:
    """Return ``row[key]``, requiring it to be present and an int or float.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.

    Raises:
        ValueError: if the field is missing or not numeric; the message is
            prefixed with the source line number.
    """
    try:
        candidate = row[key]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field {key!r}") from exc
    if isinstance(candidate, (int, float)) and not isinstance(candidate, bool):
        return candidate
    raise ValueError(f"line {line_number}: field {key!r} must be numeric")
|
||||
|
||||
|
||||
def require_hash_ids(row: dict[str, Any], line_number: int) -> list[int]:
    """Return a copy of ``row["hash_ids"]``, requiring a list of true ints.

    Booleans are rejected as elements even though ``bool`` subclasses ``int``.

    Raises:
        ValueError: if the field is missing, is not a list, or holds a non-int
            element; the message names the line number and, for a bad element,
            its index.
    """
    try:
        raw_ids = row["hash_ids"]
    except KeyError as exc:
        raise ValueError(f"line {line_number}: missing field 'hash_ids'") from exc
    if not isinstance(raw_ids, list):
        raise ValueError(f"line {line_number}: field 'hash_ids' must be a list")

    validated: list[int] = []
    for position, element in enumerate(raw_ids):
        is_plain_int = isinstance(element, int) and not isinstance(element, bool)
        if not is_plain_int:
            raise ValueError(
                f"line {line_number}: hash_ids[{position}] must be an int"
            )
        validated.append(element)
    return validated
|
||||
|
||||
|
||||
def block_token_counts(input_length: int, hash_count: int, block_size: int) -> list[int]:
    """Return the number of tokens attributed to each hashed block.

    Every block except the last is counted as ``block_size`` tokens. The last
    block gets ``input_length % block_size``, or ``block_size`` when that
    remainder is zero. An empty list is returned when ``hash_count`` is zero.
    """
    if hash_count == 0:
        return []
    remainder = input_length % block_size
    # A zero remainder means the final block is completely full.
    tail = remainder or block_size
    counts = [block_size] * (hash_count - 1)
    counts.append(tail)
    return counts
|
||||
|
||||
|
||||
def convert_row(
    row: dict[str, Any],
    request_id: int,
    line_number: int,
    block_size: int,
    max_tokens: int,
    fail_on_overflow: bool,
    timestamp_scale: float,
) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
    """Validate one source row and derive its three output records.

    Args:
        row: Parsed JSON object for one input line.
        request_id: Identifier stored in the sidecar record.
        line_number: 1-based input line number, used only in error messages.
        block_size: Tokens per hashed block; drives the expected hash count.
        max_tokens: Limit for ``input_length + output_length``.
        fail_on_overflow: When true, exceeding ``max_tokens`` is an error
            instead of only being flagged in the stats.
        timestamp_scale: Factor applied to the source timestamp.

    Returns:
        ``(frontier_row, sidecar_row, stats)``: the CSV row, the sidecar JSON
        record, and per-row statistics consumed by the caller.

    Raises:
        ValueError: On a missing or mistyped field, a non-positive length, a
            hash count that differs from ``ceil(input_length / block_size)``,
            or an overflow when ``fail_on_overflow`` is set.
    """
    chat_id = require_int(row, "chat_id", line_number)
    parent_chat_id = require_int(row, "parent_chat_id", line_number)
    raw_timestamp = require_number(row, "timestamp", line_number)
    scaled_timestamp = float(raw_timestamp) * timestamp_scale
    prefill_tokens = require_int(row, "input_length", line_number)
    decode_tokens = require_int(row, "output_length", line_number)
    turn = require_int(row, "turn", line_number)
    request_type = row.get("type")
    block_hashes = require_hash_ids(row, line_number)

    if not prefill_tokens > 0:
        raise ValueError(f"line {line_number}: input_length must be positive")
    if not decode_tokens > 0:
        raise ValueError(f"line {line_number}: output_length must be positive")

    expected_hash_count = math.ceil(prefill_tokens / block_size)
    actual_hash_count = len(block_hashes)
    if actual_hash_count != expected_hash_count:
        raise ValueError(
            f"line {line_number}: len(hash_ids)={len(block_hashes)} does not match "
            f"ceil(input_length / block_size)={expected_hash_count}"
        )

    total_tokens = prefill_tokens + decode_tokens
    overflow = total_tokens > max_tokens
    if fail_on_overflow and overflow:
        raise ValueError(
            f"line {line_number}: total_tokens={total_tokens} exceeds "
            f"max_tokens={max_tokens}"
        )

    frontier_row: dict[str, Any] = {
        "arrived_at": scaled_timestamp,
        "num_prefill_tokens": prefill_tokens,
        "num_decode_tokens": decode_tokens,
        "session_id": chat_id,
        "block_hash_ids": "|".join(map(str, block_hashes)),
    }
    sidecar_row: dict[str, Any] = {
        "request_id": request_id,
        "chat_id": chat_id,
        "parent_chat_id": parent_chat_id,
        "turn": turn,
        "type": request_type,
        "timestamp": scaled_timestamp,
        "input_length": prefill_tokens,
        "output_length": decode_tokens,
        "hash_ids": block_hashes,
        "block_token_counts": block_token_counts(
            prefill_tokens, actual_hash_count, block_size
        ),
    }
    stats: dict[str, Any] = {
        "total_tokens": total_tokens,
        "input_length": prefill_tokens,
        "output_length": decode_tokens,
        "timestamp": scaled_timestamp,
        "partial_final_block": prefill_tokens % block_size != 0,
        "overflow": overflow,
    }
    return frontier_row, sidecar_row, stats
|
||||
|
||||
|
||||
def tmp_path(path: Path) -> Path:
    """Return the hidden sibling path used to stage ``path`` before publishing.

    The staged file lives in the same directory and is named
    ``.<original name>.tmp``.
    """
    staged_name = "." + path.name + ".tmp"
    return path.with_name(staged_name)
|
||||
|
||||
|
||||
def ensure_parent(path: Path | None) -> None:
    """Create the parent directory of ``path`` if needed; ignore ``None``."""
    if path is None:
        return
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def publish_tmp_files(paths: list[tuple[Path, Path]]) -> None:
    """Move each staged file onto its final path, in list order.

    Args:
        paths: ``(temporary, final)`` pairs. Each move uses ``os.replace``,
            so an existing final file is overwritten.
    """
    for staged, destination in paths:
        os.replace(src=staged, dst=destination)
|
||||
|
||||
|
||||
def cleanup_tmp_files(paths: list[tuple[Path, Path]]) -> None:
    """Delete every staged file in ``paths``, ignoring ones that do not exist.

    Args:
        paths: ``(temporary, final)`` pairs; only the temporary side is removed.
    """
    for pair in paths:
        staged = pair[0]
        # missing_ok mirrors "already gone is fine": a file may never have been created.
        staged.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def main() -> int:
    """Convert the input JSONL trace into Frontier fixture files.

    Each non-blank input line is parsed as JSON, converted with
    ``convert_row``, and written to the Frontier CSV and the sidecar JSONL.
    When requested, a copy of the source rows (with rescaled timestamps if
    ``--timestamp-scale`` is not 1.0) and a JSON manifest are written too.
    All outputs are staged in hidden ``.tmp`` siblings and only moved onto
    their final paths after everything has been written; on any error the
    staged files are removed.

    Returns:
        0 on success, 1 if an error occurred (the message goes to stderr).
    """
    args = parse_args()
    for output_path in (
        args.frontier_csv,
        args.sidecar_jsonl,
        args.source_jsonl,
        args.manifest_json,
    ):
        ensure_parent(output_path)

    temporary_paths: list[tuple[Path, Path]] = [
        (tmp_path(args.frontier_csv), args.frontier_csv),
        (tmp_path(args.sidecar_jsonl), args.sidecar_jsonl),
    ]
    if args.source_jsonl is not None:
        temporary_paths.append((tmp_path(args.source_jsonl), args.source_jsonl))
    if args.manifest_json is not None:
        temporary_paths.append((tmp_path(args.manifest_json), args.manifest_json))

    row_count = 0
    overflow_count = 0
    max_total_tokens = 0
    max_input_length = 0
    max_output_length = 0
    first_timestamp: float | None = None
    last_timestamp: float | None = None
    timestamp_monotonic = True
    partial_final_block_rows = 0

    try:
        with (
            args.input.open("r", encoding="utf-8") as input_file,
            tmp_path(args.frontier_csv).open("w", encoding="utf-8", newline="") as csv_file,
            tmp_path(args.sidecar_jsonl).open("w", encoding="utf-8") as sidecar_file,
        ):
            csv_writer = csv.DictWriter(
                csv_file, fieldnames=CSV_FIELDS, lineterminator="\n"
            )
            csv_writer.writeheader()

            source_file = None
            if args.source_jsonl is not None:
                source_file = tmp_path(args.source_jsonl).open("w", encoding="utf-8")

            try:
                for line_number, raw_line in enumerate(input_file, start=1):
                    if args.limit is not None and row_count >= args.limit:
                        break
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    # Attach the line number to parse failures so a bad row in a
                    # large trace can be located from the error message alone.
                    try:
                        row = json.loads(stripped)
                    except json.JSONDecodeError as exc:
                        raise ValueError(
                            f"line {line_number}: invalid JSON ({exc.msg})"
                        ) from exc
                    # The field helpers index the row by key; reject lists,
                    # scalars, etc. here instead of failing with an opaque error.
                    if not isinstance(row, dict):
                        raise ValueError(
                            f"line {line_number}: JSON value must be an object"
                        )
                    frontier_row, sidecar_row, stats = convert_row(
                        row=row,
                        request_id=line_number - 1,
                        line_number=line_number,
                        block_size=args.block_size,
                        max_tokens=args.max_tokens,
                        fail_on_overflow=args.fail_on_overflow,
                        timestamp_scale=args.timestamp_scale,
                    )
                    csv_writer.writerow(frontier_row)
                    sidecar_file.write(
                        json.dumps(sidecar_row, sort_keys=True, separators=(",", ":"))
                        + "\n"
                    )
                    if source_file is not None:
                        if args.timestamp_scale == 1.0:
                            # Unscaled: copy the original line verbatim.
                            source_file.write(
                                raw_line if raw_line.endswith("\n") else raw_line + "\n"
                            )
                        else:
                            # Scaled: re-serialize with the rescaled timestamp so the
                            # source copy agrees with the CSV and sidecar.
                            source_row = dict(row)
                            source_row["timestamp"] = stats["timestamp"]
                            source_file.write(
                                json.dumps(
                                    source_row, sort_keys=True, separators=(",", ":")
                                )
                                + "\n"
                            )

                    row_count += 1
                    overflow_count += int(stats["overflow"])
                    max_total_tokens = max(max_total_tokens, int(stats["total_tokens"]))
                    max_input_length = max(max_input_length, int(stats["input_length"]))
                    max_output_length = max(max_output_length, int(stats["output_length"]))
                    partial_final_block_rows += int(stats["partial_final_block"])
                    timestamp = float(stats["timestamp"])
                    if first_timestamp is None:
                        first_timestamp = timestamp
                    if last_timestamp is not None and timestamp < last_timestamp:
                        timestamp_monotonic = False
                    last_timestamp = timestamp
            finally:
                if source_file is not None:
                    source_file.close()

        if args.manifest_json is not None:
            manifest = {
                "fixture_name": args.fixture_name,
                "generated_by": "tools/qwen_to_frontier.py",
                "input_jsonl": str(args.input),
                "source_jsonl": str(args.source_jsonl) if args.source_jsonl else None,
                "frontier_csv": str(args.frontier_csv),
                "sidecar_jsonl": str(args.sidecar_jsonl),
                "csv_fields": CSV_FIELDS,
                "sidecar_fields": SIDECAR_FIELDS,
                "limit": args.limit,
                "row_count": row_count,
                "block_size": args.block_size,
                "max_tokens": args.max_tokens,
                "fail_on_overflow": args.fail_on_overflow,
                "timestamp_scale": args.timestamp_scale,
                "overflow_count": overflow_count,
                "max_total_tokens": max_total_tokens,
                "max_input_length": max_input_length,
                "max_output_length": max_output_length,
                "first_timestamp": first_timestamp,
                "last_timestamp": last_timestamp,
                "timestamp_monotonic": timestamp_monotonic,
                "partial_final_block_rows": partial_final_block_rows,
                "adapter_semantics": {
                    "timestamp": "arrived_at",
                    "input_length": "num_prefill_tokens",
                    "output_length": "num_decode_tokens",
                    "chat_id": "session_id",
                    "hash_ids": "block_hash_ids joined by |",
                    "block_token_counts": (
                        "full blocks use block_size tokens; final partial block "
                        "uses input_length % block_size, or block_size when zero"
                    ),
                },
            }
            with tmp_path(args.manifest_json).open("w", encoding="utf-8") as manifest_file:
                json.dump(manifest, manifest_file, indent=2, sort_keys=True)
                manifest_file.write("\n")

        # Only publish once every staged file has been fully written.
        publish_tmp_files(temporary_paths)
    except Exception as exc:
        # Top-level boundary: report the failure, drop staged files, exit non-zero.
        cleanup_tmp_files(temporary_paths)
        print(f"qwen_to_frontier.py: error: {exc}", file=sys.stderr)
        return 1

    if overflow_count and not args.fail_on_overflow:
        print(
            f"qwen_to_frontier.py: warning: {overflow_count} rows exceed "
            f"max_tokens={args.max_tokens}; no clipping was applied",
            file=sys.stderr,
        )
    print(
        f"converted rows={row_count} max_total_tokens={max_total_tokens} "
        f"overflows={overflow_count}",
        file=sys.stderr,
    )
    return 0
|
||||
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
534
tools/run_frontier_sweep.py
Normal file
534
tools/run_frontier_sweep.py
Normal file
@@ -0,0 +1,534 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Run a small Frontier sweep from a ReplayServe JSON config."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Directory two levels above this file's resolved path (the parent of the
# directory that contains this script).
REPLAYSERVE_ROOT = Path(__file__).resolve().parents[1]
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the sweep runner's command-line arguments.

    Options: ``--config`` (sweep JSON, defaulting to
    ``configs/rs3_tiny_sweep.json`` under ``REPLAYSERVE_ROOT``), ``--suite-id``,
    ``--run-root``, the repeatable filters ``--only-config`` and
    ``--only-fixture``, and the switches ``--dry-run`` and ``--force``.
    """
    cli = argparse.ArgumentParser(description="Run Frontier configs from JSON.")
    default_config = REPLAYSERVE_ROOT / "configs" / "rs3_tiny_sweep.json"
    cli.add_argument(
        "--config", type=Path, default=default_config, help="Sweep JSON config."
    )
    cli.add_argument("--suite-id", help="Override suite_id from the config.")
    cli.add_argument(
        "--run-root",
        type=Path,
        help="Override run root. Defaults to runs/<suite_id>.",
    )

    repeatable_filters = (
        ("--only-config", "Run only a config id. Can be repeated."),
        ("--only-fixture", "Run only a fixture. Can be repeated."),
    )
    for flag, description in repeatable_filters:
        # A fresh list per option keeps the two defaults independent.
        cli.add_argument(flag, action="append", default=[], help=description)

    switches = (
        ("--dry-run", "Write manifests and commands, but do not execute Frontier."),
        ("--force", "Replace existing run dirs selected by this invocation."),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON file and return its top-level object.

    Raises:
        ValueError: If the top-level JSON value is not an object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return payload
    raise ValueError(f"{path}: top-level JSON must be an object")
|
||||
|
||||
|
||||
def git_head(path: Path) -> str | None:
    """Return the commit hash of ``HEAD`` for the repository at ``path``.

    Returns:
        The stripped output of ``git -C <path> rev-parse HEAD``, or ``None``
        when the command fails or cannot be started.
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(path), "rev-parse", "HEAD"],
            check=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git ran but failed (e.g. not a repository).
        # OSError: git could not be executed at all (e.g. not installed).
        # Provenance is best-effort metadata, so neither case should abort a run.
        return None
    return result.stdout.strip()
|
||||
|
||||
|
||||
def git_status(path: Path) -> str | None:
    """Return the short working-tree status for the repository at ``path``.

    Returns:
        The unmodified output of ``git -C <path> status --short`` (empty when
        the tree is clean), or ``None`` when the command fails or cannot be
        started.
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(path), "status", "--short"],
            check=True,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git ran but failed (e.g. not a repository).
        # OSError: git could not be executed at all (e.g. not installed).
        # Provenance is best-effort metadata, so neither case should abort a run.
        return None
    return result.stdout
|
||||
|
||||
|
||||
def shell_join(argv: list[str]) -> str:
    """Render ``argv`` as one shell-quoted command line."""
    import shlex

    # shlex.join quotes each argument and joins them with single spaces.
    return shlex.join(argv)
|
||||
|
||||
|
||||
def merge_config(defaults: dict[str, Any], item: dict[str, Any]) -> dict[str, Any]:
    """Combine the sweep defaults with one config entry's overrides.

    Args:
        defaults: Knob values shared by every config in the sweep.
        item: One ``configs`` entry; its optional ``overrides`` object wins
            over ``defaults``. A missing or ``null`` ``overrides`` means none.

    Returns:
        A new dict of knobs. If the merged knobs contain ``max_num_seqs`` and
        the overrides do not set ``batch_size_cap`` themselves,
        ``batch_size_cap`` is set to the ``max_num_seqs`` value.

    Raises:
        ValueError: If ``overrides`` is present but not an object.
    """
    knobs = dict(defaults)
    raw_overrides = item.get("overrides", {})
    overrides = {} if raw_overrides is None else raw_overrides
    if not isinstance(overrides, dict):
        raise ValueError(f"config {item.get('id')}: overrides must be an object")
    knobs.update(overrides)

    cap_overridden = "batch_size_cap" in overrides
    if not cap_overridden and "max_num_seqs" in knobs:
        knobs["batch_size_cap"] = knobs["max_num_seqs"]
    return knobs
|
||||
|
||||
|
||||
def build_frontier_command(
    *,
    python_bin: str,
    trace_file: Path,
    metrics_root: Path,
    run_id: str,
    knobs: dict[str, Any],
) -> list[str]:
    """Assemble the ``python -m frontier.main`` argument vector for one run.

    Args:
        python_bin: Interpreter used as the first element of the command.
        trace_file: Trace CSV passed to the trace-replay request generator.
        metrics_root: Directory passed as the metrics output directory.
        run_id: Identifier passed as the metrics run id.
        knobs: Merged configuration values. Required knobs are read with
            ``knobs[...]`` (a missing one raises ``KeyError``); optional ones
            are read with ``knobs.get(...)`` and only emitted when set.

    Returns:
        The command as a list of strings, in a fixed argument order.
    """
    predictor = "--random_forrest_execution_time_predictor_config_"
    argv: list[str] = [python_bin, "-m", "frontier.main"]

    def option(flag: str, value: Any) -> None:
        # Every valued option is emitted as "<flag> <str(value)>".
        argv.extend((flag, str(value)))

    def required(pairs: tuple[tuple[str, str], ...]) -> None:
        for flag, knob_name in pairs:
            option(flag, knobs[knob_name])

    required(
        (
            ("--simulation_mode", "simulation_mode"),
            ("--sys_arch", "sys_arch"),
        )
    )
    option("--cc_backend_config_type", "analytical")
    required(
        (
            ("--cluster_config_num_replicas", "num_replicas"),
            ("--cluster_scheduler_config_type", "cluster_scheduler"),
            ("--replica_config_model_name", "model_name"),
            ("--replica_config_device", "device"),
            ("--replica_config_network_device", "network_device"),
            ("--replica_config_attn_tensor_parallel_size", "attn_tensor_parallel_size"),
            ("--replica_config_attn_data_parallel_size", "attn_data_parallel_size"),
            ("--replica_config_moe_tensor_parallel_size", "moe_tensor_parallel_size"),
            ("--replica_config_moe_expert_parallel_size", "moe_expert_parallel_size"),
            ("--replica_config_num_pipeline_stages", "num_pipeline_stages"),
            ("--replica_scheduler_config_type", "replica_scheduler"),
        )
    )
    option(
        "--decode_cuda_graph_mode",
        knobs.get("decode_cuda_graph_mode", "full_decode_only"),
    )
    required(
        (
            ("--vllm_v1_scheduler_config_batch_size_cap", "batch_size_cap"),
            ("--vllm_v1_scheduler_config_max_tokens_in_batch", "max_tokens_in_batch"),
            (
                "--vllm_v1_scheduler_config_long_prefill_token_threshold",
                "long_prefill_token_threshold",
            ),
            ("--vllm_v1_scheduler_config_block_size", "block_size"),
            ("--vllm_v1_scheduler_config_num_blocks_mode", "num_blocks_mode"),
            (
                "--vllm_v1_scheduler_config_gpu_memory_utilization",
                "gpu_memory_utilization",
            ),
            (
                "--vllm_v1_scheduler_config_non_kv_cache_overhead_bytes",
                "non_kv_cache_overhead_bytes",
            ),
        )
    )
    option("--request_generator_config_type", "trace_replay")
    option("--trace_request_generator_config_trace_file", trace_file)
    option("--trace_request_generator_config_max_tokens", knobs["trace_max_tokens"])
    option("--metrics_config_output_dir", metrics_root)
    argv.extend(("--metrics_config_run_id", run_id))
    argv.extend(
        (
            "--metrics_config_write_metrics",
            "--metrics_config_store_request_metrics",
            "--metrics_config_store_batch_metrics",
            "--metrics_config_store_token_completion_metrics",
            "--metrics_config_store_utilization_metrics",
            "--no-metrics_config_store_plots",
            "--no-metrics_config_enable_chrome_trace",
            "--no-metrics_config_write_json_trace",
            "--no-metrics_config_store_frontier_stage_batch_ledger",
        )
    )

    # Dummy mode is on unless the knob explicitly disables it.
    if bool(knobs.get("enable_dummy_mode", True)):
        dummy_ms = str(knobs["dummy_execution_time_ms"])
        argv.extend(
            (
                "--random_forrest_execution_time_predictor_config_enable_dummy_mode",
                "--random_forrest_execution_time_predictor_config_dummy_execution_time_ms",
                dummy_ms,
            )
        )
    else:
        argv.append("--no-random_forrest_execution_time_predictor_config_enable_dummy_mode")

    # Profile input files are forwarded only when set to a truthy value.
    for profile_knob in (
        "linear_op_input_file",
        "atten_input_file",
        "moe_input_file",
        "linear_op_kernel_only_input_file",
        "atten_kernel_only_input_file",
        "moe_kernel_only_input_file",
    ):
        profile_value = knobs.get(profile_knob)
        if profile_value:
            option(predictor + profile_knob, profile_value)

    # Prediction limits are forwarded whenever present, including zero.
    for limit_knob in (
        "prediction_max_prefill_chunk_size",
        "prediction_max_batch_size",
        "prediction_max_tokens_per_request",
    ):
        limit_value = knobs.get(limit_knob)
        if limit_value is not None:
            option(predictor + limit_knob, limit_value)

    if bool(knobs.get("no_cache", False)):
        argv.append("--random_forrest_execution_time_predictor_config_no_cache")
    if bool(knobs.get("skip_cpu_overhead_modeling", True)):
        argv.append(
            "--random_forrest_execution_time_predictor_config_skip_cpu_overhead_modeling"
        )
    if knobs.get("num_blocks") is not None:
        option("--vllm_v1_scheduler_config_num_blocks", knobs["num_blocks"])
    if bool(knobs["enable_prefix_caching"]):
        argv.append("--vllm_v1_scheduler_config_enable_prefix_caching")
    if bool(knobs["enable_chunked_prefill"]):
        argv.append("--vllm_v1_scheduler_config_enable_chunked_prefill")
    return argv
|
||||
|
||||
|
||||
def write_text(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, replacing any existing content."""
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)
|
||||
|
||||
|
||||
def run_one(
    *,
    suite_id: str,
    sim: str,
    frontier_info: dict[str, Any],
    frontier_root: Path,
    fixture: str,
    config_item: dict[str, Any],
    knobs: dict[str, Any],
    run_root: Path,
    python_bin: str,
    python_deps_dir: Path,
    dry_run: bool,
    force: bool,
) -> dict[str, Any]:
    """Run (or dry-run) one fixture/config combination and record the outcome.

    Creates ``<run_root>/<sim>/<fixture>/<config id>`` and writes
    ``run_manifest.json``, ``command.txt`` and ``env.txt`` into it. Unless
    ``dry_run`` is set, the Frontier command is then executed with stdout and
    stderr captured to log files, and on a zero exit code
    ``tools/postprocess_frontier_smoke.py`` is run on the result.

    Args:
        suite_id: Suite identifier, used in the manifest and the run id.
        sim: Simulator name, used as a path component and in the manifest.
        frontier_info: ``frontier`` object from the sweep config; copied into
            the manifest.
        frontier_root: Frontier checkout; used as the working directory and
            added to ``PYTHONPATH``.
        fixture: Fixture name under ``traces/fixtures``.
        config_item: Config entry; ``id`` and ``description`` are read.
        knobs: Merged knob values for this config.
        run_root: Root directory for run outputs.
        python_bin: Interpreter used for Frontier and for postprocessing.
        python_deps_dir: Extra dependency directory, prepended to
            ``PYTHONPATH`` when it exists.
        dry_run: Write the manifests only and skip execution.
        force: Delete an existing run directory instead of failing.

    Returns:
        The dict also written to ``run_status.json``: ``status`` (one of
        ``dry_run``, ``pass``, ``fail``, ``postprocess_fail``, ``incomplete``),
        ``exit_code``, ``runtime_seconds`` and ``postprocess_exit_code``.

    Raises:
        FileNotFoundError: If the fixture's trace or sidecar file is missing.
        FileExistsError: If the run directory exists and ``force`` is false.
    """
    config_id = str(config_item["id"])
    fixture_dir = REPLAYSERVE_ROOT / "traces" / "fixtures" / fixture
    trace_file = fixture_dir / "frontier.csv"
    sidecar_file = fixture_dir / "sidecar.jsonl"
    if not trace_file.exists():
        raise FileNotFoundError(f"missing trace file: {trace_file}")
    if not sidecar_file.exists():
        raise FileNotFoundError(f"missing sidecar file: {sidecar_file}")

    run_dir = (run_root / sim / fixture / config_id).resolve()
    metrics_root = (run_dir / "frontier_metrics").resolve()
    if run_dir.exists():
        if not force:
            raise FileExistsError(f"run dir exists, use --force to replace: {run_dir}")
        shutil.rmtree(run_dir)
    run_dir.mkdir(parents=True)
    metrics_root.mkdir(parents=True)

    run_id = f"{suite_id}_{fixture}_{config_id}"
    cmd = build_frontier_command(
        python_bin=python_bin,
        trace_file=trace_file,
        metrics_root=metrics_root,
        run_id=run_id,
        knobs=knobs,
    )

    # Search order: optional deps dir, then the Frontier checkout, then
    # whatever PYTHONPATH the caller already had.
    existing_pythonpath = os.environ.get("PYTHONPATH")
    pythonpath_parts = []
    if python_deps_dir.is_dir():
        pythonpath_parts.append(str(python_deps_dir))
    pythonpath_parts.append(str(frontier_root))
    if existing_pythonpath:
        pythonpath_parts.append(existing_pythonpath)
    env = os.environ.copy()
    env.update(
        {
            # os.pathsep is the platform's search-path separator; a literal
            # ":" is only correct on POSIX.
            "PYTHONPATH": os.pathsep.join(pythonpath_parts),
            "WANDB_DISABLED": "true",
            "VIDUR_DISABLE_WANDB": "1",
            "FRONTIER_LOG_LEVEL": env.get("FRONTIER_LOG_LEVEL", "info"),
            "PYTHONDONTWRITEBYTECODE": "1",
        }
    )

    frontier_head = git_head(frontier_root)
    frontier_status = git_status(frontier_root)
    manifest = {
        "suite_id": suite_id,
        "sim": sim,
        "fixture": fixture,
        "config_id": config_id,
        "description": config_item.get("description", ""),
        "run_dir": str(run_dir),
        "metrics_root": str(metrics_root),
        "run_id": run_id,
        "frontier": {
            **frontier_info,
            "root": str(frontier_root),
            "head": frontier_head,
            "status_short": frontier_status,
        },
        "fixture_dir": str(fixture_dir),
        "trace_file": str(trace_file),
        "sidecar_file": str(sidecar_file),
        "knobs": knobs,
        "command": cmd,
    }
    with (run_dir / "run_manifest.json").open("w", encoding="utf-8") as handle:
        json.dump(manifest, handle, indent=2, sort_keys=True)
        handle.write("\n")

    write_text(
        run_dir / "command.txt",
        "\n".join(
            [
                f"cd {frontier_root}",
                f"export PYTHONPATH={env['PYTHONPATH']}",
                f"export WANDB_DISABLED={env['WANDB_DISABLED']}",
                f"export VIDUR_DISABLE_WANDB={env['VIDUR_DISABLE_WANDB']}",
                f"export FRONTIER_LOG_LEVEL={env['FRONTIER_LOG_LEVEL']}",
                f"export PYTHONDONTWRITEBYTECODE={env['PYTHONDONTWRITEBYTECODE']}",
                f"command={shell_join(cmd)}",
                "",
            ]
        ),
    )
    write_text(
        run_dir / "env.txt",
        "\n".join(
            [
                f"suite_id={suite_id}",
                f"sim={sim}",
                f"fixture={fixture}",
                f"config_id={config_id}",
                f"replayserve_root={REPLAYSERVE_ROOT}",
                f"frontier_root={frontier_root}",
                f"frontier_head={frontier_head}",
                f"python_deps_dir={python_deps_dir}",
                f"trace_file={trace_file}",
                f"sidecar_file={sidecar_file}",
                f"run_dir={run_dir}",
                f"metrics_root={metrics_root}",
                f"run_id={run_id}",
                "",
            ]
        ),
    )

    if dry_run:
        # Nothing is executed; record a synthetic successful status.
        write_text(run_dir / "exit_code.txt", "0\n")
        status = {
            "status": "dry_run",
            "exit_code": 0,
            "runtime_seconds": 0,
            "postprocess_exit_code": None,
        }
        with (run_dir / "run_status.json").open("w", encoding="utf-8") as handle:
            json.dump(status, handle, indent=2, sort_keys=True)
            handle.write("\n")
        return status

    start_epoch = int(time.time())
    write_text(run_dir / "start_epoch.txt", f"{start_epoch}\n")
    with (run_dir / "stdout.log").open("w", encoding="utf-8") as stdout, (
        run_dir / "stderr.log"
    ).open("w", encoding="utf-8") as stderr:
        proc = subprocess.run(cmd, cwd=frontier_root, env=env, stdout=stdout, stderr=stderr)
    end_epoch = int(time.time())
    runtime_seconds = end_epoch - start_epoch
    write_text(run_dir / "end_epoch.txt", f"{end_epoch}\n")
    write_text(run_dir / "exit_code.txt", f"{proc.returncode}\n")
    write_text(run_dir / "runtime_seconds.txt", f"{runtime_seconds}\n")

    # Postprocessing only makes sense when the simulator itself succeeded.
    postprocess_exit_code: int | None = None
    if proc.returncode == 0:
        postprocess_cmd = [
            python_bin,
            str(REPLAYSERVE_ROOT / "tools" / "postprocess_frontier_smoke.py"),
            "--run-dir",
            str(run_dir),
            "--fixture-dir",
            str(fixture_dir),
        ]
        with (run_dir / "postprocess.stdout.log").open("w", encoding="utf-8") as stdout, (
            run_dir / "postprocess.stderr.log"
        ).open("w", encoding="utf-8") as stderr:
            post = subprocess.run(
                postprocess_cmd,
                cwd=REPLAYSERVE_ROOT,
                env=env,
                stdout=stdout,
                stderr=stderr,
            )
        postprocess_exit_code = post.returncode

    status_name = "pass" if proc.returncode == 0 and postprocess_exit_code in (0, None) else "fail"
    if proc.returncode == 0 and postprocess_exit_code not in (0, None):
        status_name = "postprocess_fail"
    if status_name == "pass":
        # A clean exit can still be an incomplete run; the postprocess summary
        # (when it exists) has the final say.
        summary_path = run_dir / "postprocess_summary.json"
        if summary_path.exists():
            try:
                summary = load_json(summary_path)
                completion = summary.get("completion", {})
                if isinstance(completion, dict) and not completion.get("is_complete", True):
                    status_name = "incomplete"
            except Exception:
                # An unreadable summary is reported as a postprocess failure
                # rather than aborting the remaining runs of the sweep.
                status_name = "postprocess_fail"
    status = {
        "status": status_name,
        "exit_code": proc.returncode,
        "runtime_seconds": runtime_seconds,
        "postprocess_exit_code": postprocess_exit_code,
    }
    with (run_dir / "run_status.json").open("w", encoding="utf-8") as handle:
        json.dump(status, handle, indent=2, sort_keys=True)
        handle.write("\n")
    return status
|
||||
|
||||
|
||||
def main() -> int:
    """Run every selected fixture/config combination from the sweep config.

    Loads the sweep JSON, applies the ``--only-fixture`` / ``--only-config``
    filters, picks the interpreter (the repository ``.venv`` if present,
    otherwise ``$PYTHON_BIN`` or the current interpreter), then calls
    ``run_one`` for each fixture/config pair and prints a one-line result.

    Returns:
        1 if any run ended with a status other than ``pass`` or ``dry_run``,
        otherwise 0.

    Raises:
        ValueError: If the config is malformed or the filters select nothing.
        FileNotFoundError: If the Frontier root directory does not exist.
    """
    args = parse_args()
    config = load_json(args.config.resolve())

    suite_id = args.suite_id or str(config.get("suite_id") or "rs3_sweep")
    run_root = args.run_root or (REPLAYSERVE_ROOT / "runs" / suite_id)
    sim = str(config.get("sim") or "frontier")

    frontier_info = config.get("frontier", {})
    if not isinstance(frontier_info, dict):
        raise ValueError("frontier must be an object")
    frontier_root = Path(str(frontier_info.get("root") or "/tmp/toc-llm-sim-research/Frontier"))
    if not frontier_root.is_dir():
        raise FileNotFoundError(f"Frontier root does not exist: {frontier_root}")

    fixtures = [str(name) for name in config.get("fixtures", [])]
    if args.only_fixture:
        wanted_fixtures = set(args.only_fixture)
        fixtures = [name for name in fixtures if name in wanted_fixtures]
    if not fixtures:
        raise ValueError("no fixtures selected")

    defaults = config.get("defaults", {})
    if not isinstance(defaults, dict):
        raise ValueError("defaults must be an object")

    config_items = config.get("configs", [])
    if not isinstance(config_items, list) or not config_items:
        raise ValueError("configs must be a non-empty list")
    if args.only_config:
        wanted_configs = set(args.only_config)
        config_items = [
            entry
            for entry in config_items
            if isinstance(entry, dict) and str(entry.get("id")) in wanted_configs
        ]
    if not config_items:
        raise ValueError("no configs selected")

    venv_python = REPLAYSERVE_ROOT / ".venv" / "bin" / "python"
    python_bin = (
        str(REPLAYSERVE_ROOT / ".venv" / "bin" / "python")
        if venv_python.is_file()
        else os.environ.get("PYTHON_BIN", sys.executable or "python3")
    )
    default_deps_dir = str(REPLAYSERVE_ROOT / ".deps" / "python")
    python_deps_dir = Path(os.environ.get("PYTHON_DEPS_DIR", default_deps_dir))

    results: list[dict[str, Any]] = []
    for fixture in fixtures:
        for entry in config_items:
            # Entries are validated lazily, right before they are run.
            if not isinstance(entry, dict):
                raise ValueError("each configs entry must be an object")
            if "id" not in entry:
                raise ValueError("each configs entry needs id")
            outcome = run_one(
                suite_id=suite_id,
                sim=sim,
                frontier_info=frontier_info,
                frontier_root=frontier_root,
                fixture=fixture,
                config_item=entry,
                knobs=merge_config(defaults, entry),
                run_root=run_root,
                python_bin=python_bin,
                python_deps_dir=python_deps_dir,
                dry_run=args.dry_run,
                force=args.force,
            )
            results.append({"fixture": fixture, "config_id": entry["id"], **outcome})
            print(
                f"{fixture}/{entry['id']}: {outcome['status']} "
                f"exit={outcome['exit_code']} runtime={outcome['runtime_seconds']}s"
            )

    acceptable = {"pass", "dry_run"}
    any_failed = any(record["status"] not in acceptable for record in results)
    return 1 if any_failed else 0
|
||||
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
240
tools/validate_fixtures.py
Executable file
240
tools/validate_fixtures.py
Executable file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate ReplayServe fixture directories."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def positive_int(value: str) -> int:
    """Argparse ``type=`` converter that accepts only integers above zero.

    Raises:
        argparse.ArgumentTypeError: If the parsed integer is zero or negative.
        ValueError: If ``value`` is not an integer literal.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the fixture validator's command-line arguments.

    Takes one or more fixture directories plus ``--max-tokens`` (default
    32768) and ``--block-size`` (default 16), both required to be positive.
    """
    cli = argparse.ArgumentParser(description="Validate ReplayServe fixtures.")
    cli.add_argument("fixture_dirs", nargs="+", type=Path)
    for flag, fallback in (("--max-tokens", 32768), ("--block-size", 16)):
        cli.add_argument(flag, type=positive_int, default=fallback)
    return cli.parse_args()
|
||||
|
||||
|
||||
def parse_block_hash_ids(value: str) -> list[int]:
    """Parse a ``|``-separated string of integers into a list.

    Surrounding whitespace is ignored and empty segments are skipped, so an
    empty or blank string yields an empty list.

    Raises:
        ValueError: If a non-empty segment is not an integer literal.
    """
    segments = value.strip().split("|")
    # filter(None, ...) drops the empty strings left by blank input or "||".
    return list(map(int, filter(None, segments)))
|
||||
|
||||
|
||||
def expected_block_counts(input_length: int, block_size: int) -> list[int]:
    """Return the per-block token counts expected for ``input_length`` tokens.

    The number of blocks is ``ceil(input_length / block_size)``. Every block
    except the last holds ``block_size`` tokens; the last holds the remainder,
    or ``block_size`` when the length divides evenly.
    """
    block_total = math.ceil(input_length / block_size)
    if block_total == 0:
        return []
    # A zero remainder means the final block is completely full.
    tail = (input_length % block_size) or block_size
    counts = [block_size] * (block_total - 1)
    counts.append(tail)
    return counts
|
||||
|
||||
|
||||
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file into a list of objects, skipping blank lines.

    Raises:
        ValueError: If a line is not valid JSON or is not a JSON object; the
            message names the file and the 1-based line number.
    """
    records: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as stream:
        for number, text in enumerate(stream, start=1):
            payload = text.strip()
            if payload == "":
                continue
            try:
                record = json.loads(payload)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}: line {number}: invalid JSON") from exc
            if isinstance(record, dict):
                records.append(record)
                continue
            raise ValueError(f"{path}: line {number}: JSON value must be object")
    return records
|
||||
|
||||
|
||||
def load_csv(path: Path) -> list[dict[str, str]]:
    """Load a Frontier trace CSV and return its rows as dicts.

    Raises:
        ValueError: If the header lacks any of the columns ``arrived_at``,
            ``num_prefill_tokens``, ``num_decode_tokens``, ``session_id`` or
            ``block_hash_ids``; the message lists the missing ones sorted.
    """
    required_columns = frozenset(
        (
            "arrived_at",
            "num_prefill_tokens",
            "num_decode_tokens",
            "session_id",
            "block_hash_ids",
        )
    )
    with path.open("r", encoding="utf-8", newline="") as stream:
        reader = csv.DictReader(stream)
        header = reader.fieldnames or []
        absent = sorted(required_columns.difference(header))
        if absent:
            raise ValueError(f"{path}: missing CSV columns: {absent}")
        rows = list(reader)
    return rows
|
||||
|
||||
|
||||
def require_paths(fixture_dir: Path) -> tuple[Path, Path, Path, Path]:
    """Return the four files that make up a fixture, checking that each exists.

    Returns:
        ``(source.jsonl, frontier.csv, sidecar.jsonl, manifest.json)`` paths
        inside ``fixture_dir``.

    Raises:
        ValueError: Naming the first of those files that does not exist.
    """
    file_names = ("source.jsonl", "frontier.csv", "sidecar.jsonl", "manifest.json")
    source_path, csv_path, sidecar_path, manifest_path = (
        fixture_dir / name for name in file_names
    )
    for candidate in (source_path, csv_path, sidecar_path, manifest_path):
        if candidate.exists():
            continue
        raise ValueError(f"{fixture_dir}: missing {candidate.name}")
    return source_path, csv_path, sidecar_path, manifest_path
|
||||
|
||||
|
||||
def validate_fixture(fixture_dir: Path, block_size: int, max_tokens: int) -> str:
    """Cross-check one fixture directory and return a one-line summary.

    The source JSONL, the Frontier CSV, the sidecar JSONL and the manifest
    must agree row by row and in aggregate. Checks run in a fixed order and
    the first failure is reported.

    Raises:
        ValueError: describing the first inconsistency found.
    """

    def reject(failed: bool, message: str) -> None:
        # Single raise site; callers pass the exact failure condition.
        if failed:
            raise ValueError(message)

    source_file, frontier_file, sidecar_file, manifest_file = require_paths(fixture_dir)
    sources = load_jsonl(source_file)
    frontier_rows = load_csv(frontier_file)
    sidecars = load_jsonl(sidecar_file)
    with manifest_file.open("r", encoding="utf-8") as stream:
        manifest = json.load(stream)

    # --- aggregate checks that do not need a row walk -----------------------
    total_rows = len(frontier_rows)
    reject(
        len(sources) != total_rows or len(sidecars) != total_rows,
        f"{fixture_dir}: row count mismatch source={len(sources)} "
        f"csv={total_rows} sidecar={len(sidecars)}",
    )
    reject(
        manifest.get("row_count") != total_rows,
        f"{fixture_dir}: manifest row_count={manifest.get('row_count')} "
        f"does not match csv rows={total_rows}",
    )
    reject(
        manifest.get("block_size") != block_size,
        f"{fixture_dir}: manifest block_size={manifest.get('block_size')} "
        f"does not match expected {block_size}",
    )
    reject(
        manifest.get("max_tokens") != max_tokens,
        f"{fixture_dir}: manifest max_tokens={manifest.get('max_tokens')} "
        f"does not match expected {max_tokens}",
    )

    sidecar_fields = {
        "request_id",
        "chat_id",
        "parent_chat_id",
        "turn",
        "type",
        "timestamp",
        "input_length",
        "output_length",
        "hash_ids",
        "block_token_counts",
    }

    last_arrival: float | None = None
    peak_tokens = 0
    ragged_rows = 0
    for index, (source, frontier, sidecar) in enumerate(
        zip(sources, frontier_rows, sidecars)
    ):
        prefix = f"{fixture_dir}: row {index}"

        # --- token budget ---------------------------------------------------
        input_length = int(frontier["num_prefill_tokens"])
        output_length = int(frontier["num_decode_tokens"])
        total_tokens = input_length + output_length
        reject(
            total_tokens > max_tokens,
            f"{prefix}: total_tokens={total_tokens} exceeds max_tokens={max_tokens}",
        )
        peak_tokens = max(peak_tokens, total_tokens)

        # --- arrival ordering -----------------------------------------------
        timestamp = float(frontier["arrived_at"])
        reject(
            last_arrival is not None and timestamp < last_arrival,
            f"{prefix}: timestamp is not monotonic",
        )
        last_arrival = timestamp

        # --- block layout ---------------------------------------------------
        hash_ids = parse_block_hash_ids(frontier["block_hash_ids"])
        wanted_blocks = math.ceil(input_length / block_size)
        reject(
            len(hash_ids) != wanted_blocks,
            f"{prefix}: hash count {len(hash_ids)} != {wanted_blocks}",
        )
        counts = expected_block_counts(input_length, block_size)
        reject(
            sum(counts) != input_length,
            f"{prefix}: expected block counts do not sum to input",
        )
        if input_length % block_size != 0:
            ragged_rows += 1

        # --- CSV row versus source row --------------------------------------
        reject(
            int(frontier["session_id"]) != int(source["chat_id"]),
            f"{prefix}: session_id does not match source chat_id",
        )
        reject(
            timestamp != float(source["timestamp"]),
            f"{prefix}: arrived_at does not match source timestamp",
        )
        reject(
            input_length != int(source["input_length"]),
            f"{prefix}: num_prefill_tokens does not match source",
        )
        reject(
            output_length != int(source["output_length"]),
            f"{prefix}: num_decode_tokens does not match source",
        )
        reject(
            hash_ids != source["hash_ids"],
            f"{prefix}: block_hash_ids do not match source hash_ids",
        )

        # --- sidecar row versus source / CSV --------------------------------
        absent = sidecar_fields - set(sidecar)
        reject(bool(absent), f"{prefix}: missing sidecar keys {sorted(absent)}")
        reject(
            int(sidecar["request_id"]) != index,
            f"{prefix}: sidecar request_id mismatch",
        )
        reject(
            int(sidecar["chat_id"]) != int(source["chat_id"]),
            f"{prefix}: sidecar chat_id mismatch",
        )
        reject(
            int(sidecar["parent_chat_id"]) != int(source["parent_chat_id"]),
            f"{prefix}: sidecar parent_chat_id mismatch",
        )
        reject(
            int(sidecar["turn"]) != int(source["turn"]),
            f"{prefix}: sidecar turn mismatch",
        )
        reject(
            sidecar["type"] != source["type"],
            f"{prefix}: sidecar type mismatch",
        )
        reject(
            float(sidecar["timestamp"]) != float(source["timestamp"]),
            f"{prefix}: sidecar timestamp mismatch",
        )
        reject(
            int(sidecar["input_length"]) != input_length,
            f"{prefix}: sidecar input_length mismatch",
        )
        reject(
            int(sidecar["output_length"]) != output_length,
            f"{prefix}: sidecar output_length mismatch",
        )
        reject(
            sidecar["hash_ids"] != hash_ids,
            f"{prefix}: sidecar hash_ids mismatch",
        )
        reject(
            sidecar["block_token_counts"] != counts,
            f"{prefix}: sidecar block_token_counts mismatch",
        )

    # --- aggregate checks that depend on the row walk -----------------------
    reject(
        manifest.get("max_total_tokens") != peak_tokens,
        f"{fixture_dir}: manifest max_total_tokens="
        f"{manifest.get('max_total_tokens')} does not match {peak_tokens}",
    )
    reject(
        manifest.get("partial_final_block_rows") != ragged_rows,
        f"{fixture_dir}: manifest partial_final_block_rows="
        f"{manifest.get('partial_final_block_rows')} does not match "
        f"{ragged_rows}",
    )
    reject(
        manifest.get("overflow_count") != 0,
        f"{fixture_dir}: manifest overflow_count is not zero",
    )
    reject(
        manifest.get("timestamp_monotonic") is not True,
        f"{fixture_dir}: manifest timestamp_monotonic is not true",
    )

    return (
        f"{fixture_dir.name}: rows={total_rows} max_total_tokens={peak_tokens} "
        f"partial_final_block_rows={ragged_rows}"
    )
|
||||
|
||||
|
||||
def main() -> int:
    """Validate every fixture directory given on the command line.

    Prints one summary line per fixture. On the first failure the error is
    written to stderr and 1 is returned; otherwise 0.
    """
    args = parse_args()
    try:
        for directory in args.fixture_dirs:
            report = validate_fixture(
                fixture_dir=directory,
                block_size=args.block_size,
                max_tokens=args.max_tokens,
            )
            print(report)
    except Exception as exc:
        # Top-level boundary: turn any failure into a message and exit status.
        print(f"validate_fixtures.py: error: {exc}", file=sys.stderr)
        return 1
    else:
        return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
405
tools/vllm_synthetic_replay.py
Normal file
405
tools/vllm_synthetic_replay.py
Normal file
@@ -0,0 +1,405 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Replay a ReplayServe fixture on vLLM with synthetic prompt token blocks."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import csv
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def positive_int(value: str) -> int:
    """argparse ``type=`` callable: parse an integer strictly greater than zero.

    Raises:
        argparse.ArgumentTypeError: if the parsed integer is zero or negative.
        ValueError: if ``value`` is not an integer literal.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")
|
||||
|
||||
|
||||
def positive_float(value: str) -> float:
    """argparse ``type=`` callable: parse a finite float greater than zero.

    Raises:
        argparse.ArgumentTypeError: if the parsed value is zero, negative,
            NaN, or infinite.
        ValueError: if ``value`` is not a float literal.
    """
    parsed = float(value)
    # Written as "not > 0" rather than "<= 0" so that NaN, which fails every
    # comparison, is rejected instead of slipping through.
    if not parsed > 0:
        raise argparse.ArgumentTypeError("must be positive")
    # An infinite time scale would make the replay sleep forever.
    if parsed == float("inf"):
        raise argparse.ArgumentTypeError("must be finite")
    return parsed
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build the replay command-line parser and parse ``sys.argv``.

    Options are registered in a fixed order so ``--help`` output is stable.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Run an online vLLM smoke/replay using synthetic prompt_token_ids "
            "derived from ReplayServe block hashes."
        )
    )

    # Mandatory inputs and outputs.
    for flag, kind in (
        ("--fixture-dir", Path),
        ("--model", str),
        ("--output-dir", Path),
    ):
        cli.add_argument(flag, required=True, type=kind)

    cli.add_argument("--tensor-parallel-size", type=positive_int, default=1)
    cli.add_argument("--limit", type=positive_int)

    # Engine sizing knobs, all positive integers.
    for flag, default in (
        ("--block-size", 16),
        ("--max-model-len", 32768),
        ("--max-num-seqs", 128),
        ("--max-num-batched-tokens", 32768),
    ):
        cli.add_argument(flag, type=positive_int, default=default)

    for flag, default in (
        ("--gpu-memory-utilization", 0.9),
        ("--time-scale", 1.0),
    ):
        cli.add_argument(flag, type=positive_float, default=default)

    cli.add_argument(
        "--max-output-tokens",
        type=positive_int,
        help="Cap each row's output_length for smoke tests.",
    )
    cli.add_argument("--seed", type=int, default=0)
    cli.add_argument("--dtype", default="auto")
    cli.add_argument("--enforce-eager", action="store_true")

    # On/off switches that default to on and gain a --no-<name> form.
    for flag in (
        "--trust-remote-code",
        "--enable-prefix-caching",
        "--enable-chunked-prefill",
    ):
        cli.add_argument(flag, action=argparse.BooleanOptionalAction, default=True)

    return cli.parse_args()
|
||||
|
||||
|
||||
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSON Lines file into a list of objects, skipping blank lines.

    Raises:
        ValueError: if a non-blank line is not valid JSON, or decodes to
            something other than a JSON object. The message names the file
            and the 1-based line number.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                # The decoder only sees one line, so its own position always
                # says "line 1"; report the file and the real line instead.
                raise ValueError(f"{path}: line {line_number}: invalid JSON") from exc
            if not isinstance(row, dict):
                raise ValueError(f"{path}: line {line_number}: expected object")
            rows.append(row)
    return rows
|
||||
|
||||
|
||||
def percentile(values: list[float], pct: float) -> float | None:
    """Return the element at fraction ``pct`` of the sorted ``values``.

    The index is ``int((len - 1) * pct)`` clamped into the valid range, so no
    interpolation happens. Returns ``None`` for an empty list.
    """
    if not values:
        return None
    ranked = sorted(values)
    last = len(ranked) - 1
    position = int(last * pct)
    if position < 0:
        position = 0
    elif position > last:
        position = last
    return ranked[position]
|
||||
|
||||
|
||||
def block_seed(hash_id: int, seed: int) -> int:
    """Derive a 64-bit integer from ``seed`` and ``hash_id``.

    The text ``"<seed>:<hash_id>"`` is hashed with an 8-byte BLAKE2b digest
    and the digest is read as a big-endian unsigned integer.
    """
    material = f"{seed}:{hash_id}".encode("utf-8")
    hasher = hashlib.blake2b(material, digest_size=8)
    # Hex digits are most-significant first, i.e. the big-endian reading.
    return int(hasher.hexdigest(), 16)
|
||||
|
||||
|
||||
def block_tokens(
    hash_id: int,
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
) -> list[int]:
    """Return ``block_size`` synthetic token ids for one block hash.

    A ``random.Random`` seeded from ``block_seed(hash_id, seed)`` draws ids
    from ``[1000, max(1001, vocab_size - 1000))`` and drops any id found in
    ``special_ids``, so the same ``(hash_id, seed)`` always yields the same
    block.

    Raises:
        ValueError: if every id in the candidate range is a special id, in
            which case no block could ever be produced.
    """
    rng = random.Random(block_seed(hash_id, seed))
    # NOTE(review): the 1000-id margin at each end presumably keeps clear of
    # reserved ids near the vocabulary edges; confirm against the tokenizers
    # in use.
    low = 1000
    high = max(low + 1, vocab_size - 1000)
    # The sampling loop below would spin forever if nothing in the range is
    # usable. Only scan when special_ids is large enough to cover the range,
    # which keeps the normal large-vocabulary case free of extra work.
    if (
        block_size > 0
        and high - low <= len(special_ids)
        and all(candidate in special_ids for candidate in range(low, high))
    ):
        raise ValueError(
            f"no usable token ids in [{low}, {high}): every candidate is a special token"
        )
    tokens: list[int] = []
    while len(tokens) < block_size:
        token_id = rng.randrange(low, high)
        if token_id not in special_ids:
            tokens.append(token_id)
    return tokens
|
||||
|
||||
|
||||
def make_prompt_token_ids(
    row: dict[str, Any],
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
) -> list[int]:
    """Build the synthetic prompt for one sidecar row.

    Each entry of ``row["hash_ids"]`` is expanded with ``block_tokens`` and
    cut to the matching entry of ``row["block_token_counts"]``; the pieces
    are concatenated in order.

    Raises:
        ValueError: if the two lists differ in length, or the assembled
            prompt is not ``row["input_length"]`` tokens long.
    """
    hash_ids = [int(item) for item in row["hash_ids"]]
    counts = [int(item) for item in row["block_token_counts"]]
    if len(hash_ids) != len(counts):
        raise ValueError(f"request {row.get('request_id')}: hash/count length mismatch")

    prompt: list[int] = []
    for hash_id, keep in zip(hash_ids, counts):
        block = block_tokens(
            hash_id,
            seed=seed,
            block_size=block_size,
            vocab_size=vocab_size,
            special_ids=special_ids,
        )
        # Only the leading ``keep`` tokens of the generated block are used.
        prompt += block[:keep]

    expected = int(row["input_length"])
    if len(prompt) == expected:
        return prompt
    raise ValueError(
        f"request {row.get('request_id')}: synthetic prompt length "
        f"{len(prompt)} != input_length {expected}"
    )
|
||||
|
||||
|
||||
def estimate_prefix_reuse(rows: list[dict[str, Any]]) -> dict[int, dict[str, int | float]]:
    """Estimate, per request, how much of its block prefix earlier rows share.

    Rows are processed in list order against a trie of block-hash sequences.
    A request's hit count is the length of its longest leading run of hashes
    already in the trie; its own sequence is then added. The result maps
    ``request_id`` to block/token query counts, hit counts and hit ratios
    (ratios are 0.0 when the denominator is zero).
    """
    trie: dict[int, dict[Any, Any]] = {}
    estimates: dict[int, dict[str, int | float]] = {}
    for row in rows:
        request_id = int(row["request_id"])
        hash_ids = [int(item) for item in row["hash_ids"]]
        counts = [int(item) for item in row["block_token_counts"]]

        # One walk both measures the shared prefix and records this request.
        # After the first miss every child is freshly created, so nothing
        # later on the path can count as a hit.
        cursor = trie
        hit_blocks = 0
        on_shared_prefix = True
        for hash_id in hash_ids:
            child = cursor.get(hash_id)
            if child is None:
                on_shared_prefix = False
                child = {}
                cursor[hash_id] = child
            elif on_shared_prefix:
                hit_blocks += 1
            cursor = child

        query_blocks = len(hash_ids)
        query_tokens = int(row["input_length"])
        hit_tokens = sum(counts[:hit_blocks])
        if hash_ids:
            block_ratio = hit_blocks / query_blocks
        else:
            block_ratio = 0.0
        if query_tokens:
            token_ratio = hit_tokens / query_tokens
        else:
            token_ratio = 0.0
        estimates[request_id] = {
            "query_blocks": query_blocks,
            "hit_blocks": hit_blocks,
            "query_tokens": query_tokens,
            "hit_tokens": hit_tokens,
            "block_hit_ratio": block_ratio,
            "token_hit_ratio": token_ratio,
        }
    return estimates
|
||||
|
||||
|
||||
async def run_replay(args: argparse.Namespace) -> dict[str, Any]:
    """Replay the fixture's sidecar rows on a vLLM engine and write metrics.

    Loads ``sidecar.jsonl`` from ``args.fixture_dir`` (optionally truncated to
    ``args.limit`` rows), builds one synthetic prompt per row, submits each
    request at its scaled trace offset, and records per-request timings.
    Writes ``request_metrics.csv`` and ``summary.json`` into
    ``args.output_dir`` and returns the summary dict.

    Raises:
        RuntimeError: if the transformers / vLLM imports fail.
        ValueError: if no rows are selected.
    """
    # Imported lazily so the module can be loaded without the GPU runtime.
    try:
        from transformers import AutoTokenizer
        from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
        from vllm.inputs import TokensPrompt
    except Exception as exc:  # pragma: no cover - exercised on GPU host.
        raise RuntimeError(f"failed to import vLLM runtime dependencies: {exc}") from exc

    sidecar_path = args.fixture_dir / "sidecar.jsonl"
    rows = load_jsonl(sidecar_path)
    if args.limit is not None:
        rows = rows[: args.limit]
    if not rows:
        raise ValueError("no rows selected")

    # The tokenizer is only used for its vocabulary size and special-token
    # ids; prompts are generated as raw token ids, not tokenized text.
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
    special_ids = {int(value) for value in tokenizer.all_special_ids}
    vocab_size = len(tokenizer)
    # All prompts are built up front, before the replay clock starts.
    synthetic_prompts = {
        int(row["request_id"]): make_prompt_token_ids(
            row,
            seed=args.seed,
            block_size=args.block_size,
            vocab_size=vocab_size,
            special_ids=special_ids,
        )
        for row in rows
    }
    # Trace-derived estimate only; it is not read back from the engine.
    prefix_reuse = estimate_prefix_reuse(rows)

    engine_args = AsyncEngineArgs(
        model=args.model,
        tokenizer=args.model,
        trust_remote_code=args.trust_remote_code,
        tensor_parallel_size=args.tensor_parallel_size,
        dtype=args.dtype,
        max_model_len=args.max_model_len,
        block_size=args.block_size,
        enable_prefix_caching=args.enable_prefix_caching,
        enable_chunked_prefill=args.enable_chunked_prefill,
        max_num_seqs=args.max_num_seqs,
        max_num_batched_tokens=args.max_num_batched_tokens,
        gpu_memory_utilization=args.gpu_memory_utilization,
        enforce_eager=args.enforce_eager,
        disable_log_stats=True,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    output_rows: list[dict[str, Any]] = []
    # Arrival offsets are measured relative to the first selected row.
    first_timestamp = float(rows[0]["timestamp"])
    # The replay clock starts after the engine has been constructed.
    replay_start = time.perf_counter()

    async def run_one(row: dict[str, Any]) -> None:
        """Wait for the row's scheduled arrival, run it, append its metrics."""
        request_id = int(row["request_id"])
        scheduled_arrival_s = (float(row["timestamp"]) - first_timestamp) * args.time_scale
        # Sleep until the scheduled offset; a request that is already late
        # is submitted immediately.
        await asyncio.sleep(max(0.0, replay_start + scheduled_arrival_s - time.perf_counter()))

        prompt_token_ids = synthetic_prompts[request_id]
        requested_output_tokens = int(row["output_length"])
        effective_output_tokens = requested_output_tokens
        if args.max_output_tokens is not None:
            effective_output_tokens = min(effective_output_tokens, args.max_output_tokens)

        # NOTE(review): max_tokens == min_tokens together with ignore_eos is
        # presumably meant to pin the generated length to the trace value;
        # confirm against the vLLM SamplingParams documentation.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=effective_output_tokens,
            min_tokens=effective_output_tokens,
            ignore_eos=True,
            detokenize=False,
            seed=args.seed + request_id,
        )
        # Latencies are measured from here, i.e. after the arrival sleep.
        arrival_wall = time.perf_counter()
        first_token_wall: float | None = None
        last_output_tokens = 0
        final_output: Any = None
        generator = engine.generate(
            TokensPrompt(prompt_token_ids=prompt_token_ids),
            sampling_params,
            request_id=str(request_id),
        )
        async for output in generator:
            final_output = output
            if output.outputs:
                token_count = len(output.outputs[0].token_ids)
                # First streamed update that carries at least one token.
                if token_count > 0 and first_token_wall is None:
                    first_token_wall = time.perf_counter()
                last_output_tokens = token_count
        done_wall = time.perf_counter()

        finish_reason = ""
        if final_output is not None and final_output.outputs:
            finish_reason = str(final_output.outputs[0].finish_reason)
        ttft_s = None if first_token_wall is None else first_token_wall - arrival_wall
        e2e_s = done_wall - arrival_wall
        # Time per output token excludes the first token, so it needs at
        # least two generated tokens to be defined.
        tpot_s = None
        if first_token_wall is not None and last_output_tokens > 1:
            tpot_s = (done_wall - first_token_wall) / (last_output_tokens - 1)
        reuse = prefix_reuse[request_id]
        output_rows.append(
            {
                "request_id": request_id,
                "scheduled_arrival_s": scheduled_arrival_s,
                "arrival_delay_s": arrival_wall - replay_start - scheduled_arrival_s,
                "input_length": int(row["input_length"]),
                "requested_output_length": requested_output_tokens,
                "effective_output_length": effective_output_tokens,
                "generated_output_tokens": last_output_tokens,
                "ttft_s": ttft_s,
                "tpot_s": tpot_s,
                "e2e_s": e2e_s,
                "finish_reason": finish_reason,
                "prefix_query_blocks_est": reuse["query_blocks"],
                "prefix_hit_blocks_est": reuse["hit_blocks"],
                "prefix_query_tokens_est": reuse["query_tokens"],
                "prefix_hit_tokens_est": reuse["hit_tokens"],
                "prefix_block_hit_ratio_est": reuse["block_hit_ratio"],
                "prefix_token_hit_ratio_est": reuse["token_hit_ratio"],
            }
        )

    # One coroutine per row; each one sleeps until its own arrival time.
    try:
        await asyncio.gather(*(run_one(row) for row in rows))
    finally:
        engine.shutdown()

    replay_end = time.perf_counter()
    # Rows were appended in completion order; restore request order.
    output_rows.sort(key=lambda item: int(item["request_id"]))
    args.output_dir.mkdir(parents=True, exist_ok=True)
    request_metrics_path = args.output_dir / "request_metrics.csv"
    # Every row dict has the same keys, so the first row defines the header.
    fieldnames = list(output_rows[0].keys())
    with request_metrics_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(output_rows)

    # Requests without a first token / second token have None here and are
    # left out of the corresponding aggregate.
    ttft_values = [float(row["ttft_s"]) for row in output_rows if row["ttft_s"] is not None]
    tpot_values = [float(row["tpot_s"]) for row in output_rows if row["tpot_s"] is not None]
    e2e_values = [float(row["e2e_s"]) for row in output_rows]
    generated_tokens = sum(int(row["generated_output_tokens"]) for row in output_rows)
    prompt_tokens = sum(int(row["input_length"]) for row in output_rows)
    wall_s = replay_end - replay_start
    summary = {
        "status": "pass",
        "fixture_dir": str(args.fixture_dir),
        "model": args.model,
        "tensor_parallel_size": args.tensor_parallel_size,
        "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES", ""),
        "rows": len(output_rows),
        "block_size": args.block_size,
        "max_model_len": args.max_model_len,
        "max_num_seqs": args.max_num_seqs,
        "max_num_batched_tokens": args.max_num_batched_tokens,
        "gpu_memory_utilization": args.gpu_memory_utilization,
        "enable_prefix_caching": args.enable_prefix_caching,
        "enable_chunked_prefill": args.enable_chunked_prefill,
        "time_scale": args.time_scale,
        "max_output_tokens": args.max_output_tokens,
        "synthetic_replay": {
            "semantics": (
                "Each trace block hash is deterministically mapped to a stable "
                "block of prompt token ids; equal hashes reuse equal token blocks. "
                "This preserves arrival, length, and block-prefix sharing patterns, "
                "but it is not original text/token recovery."
            ),
            "seed": args.seed,
            "vocab_size": vocab_size,
            "special_token_ids_excluded": sorted(special_ids),
        },
        "wall_time_s": wall_s,
        "requests_per_second": len(output_rows) / wall_s if wall_s else 0.0,
        "prompt_tokens_per_second": prompt_tokens / wall_s if wall_s else 0.0,
        "generated_tokens_per_second": generated_tokens / wall_s if wall_s else 0.0,
        "total_prompt_tokens": prompt_tokens,
        "total_generated_tokens": generated_tokens,
        "ttft_s": {
            "mean": statistics.fmean(ttft_values) if ttft_values else None,
            "p50": percentile(ttft_values, 0.50),
            "p95": percentile(ttft_values, 0.95),
        },
        "tpot_s": {
            "mean": statistics.fmean(tpot_values) if tpot_values else None,
            "p50": percentile(tpot_values, 0.50),
            "p95": percentile(tpot_values, 0.95),
        },
        "e2e_s": {
            "mean": statistics.fmean(e2e_values) if e2e_values else None,
            "p50": percentile(e2e_values, 0.50),
            "p95": percentile(e2e_values, 0.95),
        },
        "estimated_prefix_reuse": {
            "query_blocks": sum(int(row["prefix_query_blocks_est"]) for row in output_rows),
            "hit_blocks": sum(int(row["prefix_hit_blocks_est"]) for row in output_rows),
            "query_tokens": sum(int(row["prefix_query_tokens_est"]) for row in output_rows),
            "hit_tokens": sum(int(row["prefix_hit_tokens_est"]) for row in output_rows),
        },
        "request_metrics_csv": str(request_metrics_path),
    }
    # Aggregate ratios are computed from the summed counts, not by averaging
    # the per-request ratios.
    reuse = summary["estimated_prefix_reuse"]
    summary["estimated_prefix_reuse"]["block_hit_ratio"] = (
        reuse["hit_blocks"] / reuse["query_blocks"] if reuse["query_blocks"] else 0.0
    )
    summary["estimated_prefix_reuse"]["token_hit_ratio"] = (
        reuse["hit_tokens"] / reuse["query_tokens"] if reuse["query_tokens"] else 0.0
    )
    with (args.output_dir / "summary.json").open("w", encoding="utf-8") as handle:
        json.dump(summary, handle, indent=2, sort_keys=True)
        handle.write("\n")
    return summary
|
||||
|
||||
|
||||
def main() -> int:
    """Run the replay and report the outcome.

    On success the summary is printed as JSON and 0 is returned. On any
    failure a minimal ``summary.json`` with ``status: fail`` is written to
    the output directory, the error goes to stderr, and 1 is returned.
    """
    args = parse_args()
    try:
        summary = asyncio.run(run_replay(args))
    except Exception as exc:
        # Top-level boundary: leave a failure record next to where the
        # metrics would have been written.
        args.output_dir.mkdir(parents=True, exist_ok=True)
        summary_path = args.output_dir / "summary.json"
        with summary_path.open("w", encoding="utf-8") as stream:
            failure = {"status": "fail", "error": str(exc)}
            stream.write(json.dumps(failure, indent=2))
            stream.write("\n")
        print(f"vllm_synthetic_replay.py: error: {exc}", file=sys.stderr)
        return 1
    else:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user