# replaysim/tools/aggregate_runs.py

#!/usr/bin/env python3
"""Aggregate ReplayServe Frontier run directories into CSV and Markdown."""

from __future__ import annotations

import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Any


# parents[1] of this file's resolved path, i.e. the parent of the directory
# that contains this script.
REPLAYSERVE_ROOT = Path(__file__).resolve().parents[1]


# Column order of the CSV written by write_csv(). Every name is also a key of
# the row dict built by summarize_run(); the group comments below say where
# summarize_run() reads each group from.
FIELDNAMES = [
    # Run identity, from run_manifest.json.
    "suite_id",
    "sim",
    "fixture",
    "config_id",
    # Outcome, from run_status.json with exit_code.txt / runtime_seconds.txt
    # as fallbacks.
    "status",
    "exit_code",
    "runtime_seconds",
    # The "frontier" section of the manifest.
    "frontier_mode",
    "frontier_head",
    "frontier_dirty",
    # The "knobs" section of the manifest.
    "attn_tp",
    "attn_dp",
    "moe_tp",
    "moe_ep",
    "batch_size_cap",
    "max_tokens_in_batch",
    "block_size",
    "enable_prefix_caching",
    "enable_chunked_prefill",
    "long_prefill_token_threshold",
    # The "prefix_cache_postprocess" section of postprocess_summary.json.
    "frontier_block_hit_ratio",
    "replayserve_token_hit_ratio",
    "cache_metrics_available",
    "cache_metrics_unavailable_reason",
    "cache_metric_rows_complete",
    "cache_metric_rows_total",
    "cache_metric_rows_missing",
    # The "completion" section of postprocess_summary.json.
    "completion_is_complete",
    "missing_latency_request_ids",
    # The "preemption_statistics" section of postprocess_summary.json.
    "preemption_events",
    "preempted_requests",
    # Latency statistics from the system metrics file.
    "ttft_mean_ms",
    "ttft_p50_ms",
    "ttft_p95_ms",
    "tpot_mean_ms",
    "tpot_p50_ms",
    "tpot_p95_ms",
    "e2e_mean_ms",
    "e2e_p50_ms",
    "e2e_p95_ms",
    # "throughput_metrics" from the system metrics file.
    "requests_per_second",
    "tokens_per_second",
    "decode_tokens_per_second",
    # "simulation_metadata" from the system metrics file.
    "completed_requests",
    "total_requests",
    # The run directory the row was built from.
    "run_dir",
]


def parse_args() -> argparse.Namespace:
    """Parse the process command line for the aggregation script.

    Returns:
        A namespace with ``suite_dir`` (required positional ``Path``) plus
        ``output_csv`` and ``output_md``, each a ``Path`` or ``None`` when the
        corresponding flag was not given.
    """
    cli = argparse.ArgumentParser(description="Aggregate ReplayServe run outputs.")
    cli.add_argument("suite_dir", type=Path, help="Run suite directory.")

    # Both output flags share the same shape, so register them from a table.
    optional_outputs = (
        ("--output-csv", "Output CSV path. Defaults to <suite_dir>/summary.csv."),
        ("--output-md", "Output Markdown path. Defaults to <suite_dir>/summary.md."),
    )
    for flag, help_text in optional_outputs:
        cli.add_argument(flag, type=Path, help=help_text)

    return cli.parse_args()


def load_json(path: Path) -> dict[str, Any]:
    """Load a JSON object from *path*, treating unusable files as empty.

    A single truncated or corrupt file (for example one left behind by an
    interrupted run) must not abort aggregation of every other run, so
    undecodable content is reported on stderr and handled like a missing file.

    Args:
        path: File expected to contain a JSON object.

    Returns:
        The decoded object, or ``{}`` when the file does not exist, cannot be
        decoded as UTF-8 JSON, or decodes to something other than a dict.
    """
    try:
        with path.open("r", encoding="utf-8") as handle:
            data = json.load(handle)
    except (FileNotFoundError, NotADirectoryError):
        # Same outcome as the former ``path.exists()`` pre-check, without the
        # window between checking and opening.
        return {}
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"warning: ignoring unreadable JSON {path}: {err}", file=sys.stderr)
        return {}
    return data if isinstance(data, dict) else {}


def read_int(path: Path) -> int | None:
    """Read a text file holding a single integer.

    Args:
        path: File to read as UTF-8; surrounding whitespace is ignored.

    Returns:
        The parsed integer, or ``None`` when the file does not exist, is not
        valid UTF-8, or does not contain an integer literal.
    """
    try:
        raw = path.read_text(encoding="utf-8")
    except (FileNotFoundError, ValueError):
        # ValueError here covers UnicodeDecodeError from undecodable bytes.
        return None

    try:
        return int(raw.strip())
    except ValueError:
        return None


def nested(data: dict[str, Any], *keys: str) -> Any:
    """Look up a value through a chain of nested dict keys.

    Args:
        data: Outermost mapping.
        *keys: Keys to follow, outermost first.

    Returns:
        The value reached after following every key, ``data`` itself when no
        keys are given, or ``None`` as soon as a level is not a dict or a key
        is absent at the last level.
    """
    if not keys:
        return data
    if not isinstance(data, dict):
        return None
    first, *remaining = keys
    # Descend one level and resolve the rest of the chain from there.
    return nested(data.get(first), *remaining)


def fmt(value: Any) -> str:
    """Render a summary value as text for the CSV and Markdown outputs.

    Args:
        value: Any row value.

    Returns:
        ``""`` for ``None``, ``"true"``/``"false"`` for booleans, floats with
        at most eight significant digits, and ``str(value)`` for the rest.
    """
    # bool must be handled before anything numeric: it is a subclass of int
    # and would otherwise print as "True"/"False".
    if isinstance(value, bool):
        return ("false", "true")[value]
    if isinstance(value, float):
        return format(value, ".8g")
    return "" if value is None else str(value)


def _dict_field(data: dict[str, Any], key: str) -> dict[str, Any]:
    """Return ``data[key]`` when it is a dict, otherwise an empty dict."""
    value = data.get(key)
    return value if isinstance(value, dict) else {}


def _list_field(data: dict[str, Any], key: str) -> list[Any]:
    """Return ``data[key]`` when it is a list, otherwise an empty list."""
    value = data.get(key)
    return value if isinstance(value, list) else []


def _read_number(path: Path) -> int | float | None:
    """Read a numeric text file: an int when possible, else a finite float."""
    try:
        text = path.read_text(encoding="utf-8").strip()
    except (FileNotFoundError, ValueError):
        return None
    try:
        return int(text)
    except ValueError:
        pass
    try:
        number = float(text)
    except ValueError:
        return None
    # float() also accepts "nan" and "inf"; neither is a usable measurement.
    return number if abs(number) < float("inf") else None


def _load_system_metrics(run_dir: Path, post: dict[str, Any]) -> dict[str, Any]:
    """Load the system metrics file named by ``post["system_metrics"]``."""
    recorded = post.get("system_metrics")
    # Only a non-empty string can name a file; anything else would make
    # Path() raise TypeError and abort the whole aggregation.
    if not isinstance(recorded, str) or not recorded:
        return {}
    metrics_path = Path(recorded)
    if not metrics_path.is_absolute() and not metrics_path.exists():
        # A relative path that does not resolve from the current working
        # directory is retried relative to the run directory, so the result
        # does not depend on where this script is launched from.
        inside_run = run_dir / metrics_path
        if inside_run.exists():
            metrics_path = inside_run
    return load_json(metrics_path)


def summarize_run(run_dir: Path) -> dict[str, Any]:
    """Build one summary row for a single run directory.

    Reads ``run_manifest.json``, ``run_status.json`` and
    ``postprocess_summary.json`` from *run_dir*, plus the system metrics file
    named by the postprocess summary. Missing files, sections or keys yield
    ``None`` (or an empty value) for the affected columns instead of failing.

    Status is the ``status`` recorded in ``run_status.json`` when present and
    non-empty; otherwise ``"pass"`` if the exit code equals 0 and ``"fail"``
    if not (an unknown exit code therefore counts as ``"fail"``). It is
    overridden to ``"incomplete"`` when the postprocess summary has a
    non-empty ``completion`` section whose ``is_complete`` is falsy.

    Args:
        run_dir: Directory holding the run's output files.

    Returns:
        A dict with one entry per summary column, keyed by column name.
    """
    manifest = load_json(run_dir / "run_manifest.json")
    status_json = load_json(run_dir / "run_status.json")
    post = load_json(run_dir / "postprocess_summary.json")
    system_metrics = _load_system_metrics(run_dir, post)

    knobs = _dict_field(manifest, "knobs")
    frontier = _dict_field(manifest, "frontier")
    prefix = _dict_field(post, "prefix_cache_postprocess")
    frontier_block = _dict_field(prefix, "frontier_block_level")
    token_weighted = _dict_field(prefix, "replayserve_token_weighted")
    preemption = _dict_field(post, "preemption_statistics")
    completion = _dict_field(post, "completion")
    simulation = _dict_field(system_metrics, "simulation_metadata")
    throughput = _dict_field(system_metrics, "throughput_metrics")
    missing_rows = _list_field(prefix, "rows_with_missing_cache_metrics")
    missing_latency_ids = _list_field(completion, "missing_latency_request_ids")

    # run_status.json is preferred; the plain-text files are the fallback.
    exit_code = status_json.get("exit_code")
    if exit_code is None:
        exit_code = read_int(run_dir / "exit_code.txt")
    runtime = status_json.get("runtime_seconds")
    if runtime is None:
        # The runtime may be fractional, so it is not restricted to integers.
        runtime = _read_number(run_dir / "runtime_seconds.txt")

    status = status_json.get("status") or ("pass" if exit_code == 0 else "fail")
    if completion and not completion.get("is_complete", True):
        status = "incomplete"

    return {
        "suite_id": manifest.get("suite_id"),
        "sim": manifest.get("sim"),
        "fixture": manifest.get("fixture"),
        "config_id": manifest.get("config_id"),
        "status": status,
        "exit_code": exit_code,
        "runtime_seconds": runtime,
        "frontier_mode": frontier.get("mode"),
        "frontier_head": frontier.get("head"),
        # Dirty means the recorded short status has any non-blank content.
        "frontier_dirty": bool((frontier.get("status_short") or "").strip()),
        "attn_tp": knobs.get("attn_tensor_parallel_size"),
        "attn_dp": knobs.get("attn_data_parallel_size"),
        "moe_tp": knobs.get("moe_tensor_parallel_size"),
        "moe_ep": knobs.get("moe_expert_parallel_size"),
        "batch_size_cap": knobs.get("batch_size_cap"),
        "max_tokens_in_batch": knobs.get("max_tokens_in_batch"),
        "block_size": knobs.get("block_size"),
        "enable_prefix_caching": knobs.get("enable_prefix_caching"),
        "enable_chunked_prefill": knobs.get("enable_chunked_prefill"),
        "long_prefill_token_threshold": knobs.get("long_prefill_token_threshold"),
        "frontier_block_hit_ratio": frontier_block.get("hit_ratio"),
        "replayserve_token_hit_ratio": token_weighted.get("hit_ratio"),
        "cache_metrics_available": prefix.get("available"),
        "cache_metrics_unavailable_reason": prefix.get("reason"),
        "cache_metric_rows_complete": prefix.get("completed_request_rows"),
        "cache_metric_rows_total": prefix.get("total_request_metric_rows"),
        "cache_metric_rows_missing": len(missing_rows),
        "completion_is_complete": completion.get("is_complete"),
        "missing_latency_request_ids": ",".join(str(value) for value in missing_latency_ids),
        "preemption_events": preemption.get("total_preemption_events"),
        "preempted_requests": preemption.get("total_preempted_requests"),
        "ttft_mean_ms": nested(system_metrics, "ttft_statistics", "mean"),
        "ttft_p50_ms": nested(system_metrics, "ttft_statistics", "p50"),
        "ttft_p95_ms": nested(system_metrics, "ttft_statistics", "p95"),
        "tpot_mean_ms": nested(system_metrics, "tpot_statistics", "mean"),
        "tpot_p50_ms": nested(system_metrics, "tpot_statistics", "p50"),
        "tpot_p95_ms": nested(system_metrics, "tpot_statistics", "p95"),
        "e2e_mean_ms": nested(system_metrics, "request_e2e_time_statistics", "mean"),
        "e2e_p50_ms": nested(system_metrics, "request_e2e_time_statistics", "p50"),
        "e2e_p95_ms": nested(system_metrics, "request_e2e_time_statistics", "p95"),
        "requests_per_second": throughput.get("requests_per_second"),
        "tokens_per_second": throughput.get("tokens_per_second"),
        "decode_tokens_per_second": throughput.get("decode_tokens_per_second"),
        "completed_requests": simulation.get("completed_requests"),
        "total_requests": simulation.get("total_requests"),
        "run_dir": str(run_dir),
    }


def write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write the summary rows to *path* as CSV.

    The file is created or overwritten as UTF-8 with a header row followed by
    one line per row. Columns follow ``FIELDNAMES``; every value goes through
    ``fmt`` and keys missing from a row produce empty cells.

    Args:
        path: Destination file.
        rows: Summary rows keyed by column name.
    """
    # newline="" hands line-ending control to the csv module.
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(
            {column: fmt(record.get(column)) for column in FIELDNAMES}
            for record in rows
        )


def _md_cell(value: Any) -> str:
    """Format *value* so it is safe inside a single Markdown table cell."""
    text = fmt(value)
    # A raw "|" would start a new column and a line break would end the row,
    # so line breaks are flattened to spaces and pipes are backslash-escaped.
    return " ".join(text.splitlines()).replace("|", "\\|")


def write_markdown(path: Path, rows: list[dict[str, Any]], suite_dir: Path) -> None:
    """Write a Markdown report with a table of selected columns to *path*.

    The file is created or overwritten as UTF-8. It contains a title built
    from the suite directory name, the suite path and run count, one table
    row per run, and a closing caveat about how to read the numbers.

    Args:
        path: Destination file.
        rows: Summary rows keyed by column name.
        suite_dir: Suite directory, used only for the heading text.
    """
    # Subset of the summary columns shown in the report table, in order.
    columns = [
        "config_id",
        "fixture",
        "status",
        "runtime_seconds",
        "enable_prefix_caching",
        "enable_chunked_prefill",
        "frontier_block_hit_ratio",
        "replayserve_token_hit_ratio",
        "cache_metric_rows_missing",
        "completion_is_complete",
        "preemption_events",
        "ttft_mean_ms",
        "tpot_mean_ms",
        "e2e_mean_ms",
        "tokens_per_second",
    ]
    with path.open("w", encoding="utf-8") as handle:
        handle.write(f"# Sweep Summary: {suite_dir.name}\n\n")
        handle.write(f"- Suite dir: `{suite_dir}`\n")
        handle.write(f"- Runs: `{len(rows)}`\n\n")
        handle.write("| " + " | ".join(columns) + " |\n")
        handle.write("|" + "|".join(["---"] * len(columns)) + "|\n")
        for row in rows:
            cells = (_md_cell(row.get(col)) for col in columns)
            handle.write("| " + " | ".join(cells) + " |\n")
        handle.write("\n")
        handle.write(
            "Latency and throughput values are Frontier smoke outputs from the "
            "configured predictor/profile mode. RS3 tiny smoke uses dummy execution "
            "time, so these are harness plumbing checks, not performance claims.\n"
        )


def main() -> int:
    """Aggregate every run under the suite directory and write both reports.

    A run directory is any directory below the resolved suite directory that
    contains a ``run_manifest.json``; runs are processed in sorted path order.
    The CSV and Markdown outputs default to ``summary.csv`` and ``summary.md``
    inside the suite directory unless overridden on the command line.

    Returns:
        ``0`` once both files have been written.
    """
    options = parse_args()
    suite_root = options.suite_dir.resolve()

    manifests = suite_root.glob("**/run_manifest.json")
    summaries = [
        summarize_run(run_dir)
        for run_dir in sorted(manifest.parent for manifest in manifests)
    ]

    csv_target = options.output_csv if options.output_csv else suite_root / "summary.csv"
    md_target = options.output_md if options.output_md else suite_root / "summary.md"

    write_csv(csv_target, summaries)
    write_markdown(md_target, summaries, suite_root)
    for written in (csv_target, md_target):
        print(f"wrote {written}")
    return 0


# Run the aggregation only when executed as a script; the value returned by
# main() is passed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())