# replaysim/tools/vllm_synthetic_replay.py

#!/usr/bin/env python3
"""Replay a ReplayServe fixture on vLLM with synthetic prompt token blocks."""

from __future__ import annotations

import argparse
import asyncio
import csv
import hashlib
import json
import os
import random
import statistics
import sys
import time
from pathlib import Path
from typing import Any


def positive_int(value: str) -> int:
    """Argparse ``type=`` converter accepting only integers greater than zero.

    Args:
        value: Raw command-line token.

    Returns:
        The token converted with ``int``.

    Raises:
        argparse.ArgumentTypeError: If the converted value is zero or negative.
        ValueError: Propagated from ``int`` when the token is not an integer.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be positive")


def positive_float(value: str) -> float:
    """Argparse ``type=`` converter accepting only finite floats greater than zero.

    Args:
        value: Raw command-line token.

    Returns:
        The token converted with ``float``.

    Raises:
        argparse.ArgumentTypeError: If the converted value is zero, negative,
            NaN, or infinite.
        ValueError: Propagated from ``float`` when the token is not a number.
    """
    parsed = float(value)
    if parsed <= 0:
        raise argparse.ArgumentTypeError("must be positive")
    # NaN compares false against everything, so it slips past the check above;
    # "not less than infinity" rejects both NaN and +inf in one comparison.
    if not parsed < float("inf"):
        raise argparse.ArgumentTypeError("must be a finite number")
    return parsed


def parse_args() -> argparse.Namespace:
    """Define the replay tool's command-line options and parse the process arguments.

    Options are registered in a fixed order, grouped by converter, so the
    generated ``--help`` listing keeps the same sequence.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Run an online vLLM smoke/replay using synthetic prompt_token_ids "
            "derived from ReplayServe block hashes."
        )
    )

    # Mandatory inputs.
    for flag, converter in (
        ("--fixture-dir", Path),
        ("--model", str),
        ("--output-dir", Path),
    ):
        cli.add_argument(flag, required=True, type=converter)

    # Strictly positive integers; an empty mapping means "no default".
    for flag, extra in (
        ("--tensor-parallel-size", {"default": 1}),
        ("--limit", {}),
        ("--block-size", {"default": 16}),
        ("--max-model-len", {"default": 32768}),
        ("--max-num-seqs", {"default": 128}),
        ("--max-num-batched-tokens", {"default": 32768}),
    ):
        cli.add_argument(flag, type=positive_int, **extra)

    # Strictly positive floats.
    for flag, fallback in (
        ("--gpu-memory-utilization", 0.9),
        ("--time-scale", 1.0),
    ):
        cli.add_argument(flag, type=positive_float, default=fallback)

    cli.add_argument(
        "--max-output-tokens",
        type=positive_int,
        help="Cap each row's output_length for smoke tests.",
    )
    cli.add_argument("--seed", type=int, default=0)
    cli.add_argument("--dtype", default="auto")
    cli.add_argument("--enforce-eager", action="store_true")

    # On/off switches that default to on; each also gets a ``--no-...`` form.
    for flag in (
        "--trust-remote-code",
        "--enable-prefix-caching",
        "--enable-chunked-prefill",
    ):
        cli.add_argument(flag, action=argparse.BooleanOptionalAction, default=True)

    return cli.parse_args()


def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSON Lines file into a list of dictionaries.

    Lines that are empty after stripping whitespace are skipped.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        One dictionary per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with the file path and 1-based line number.
        ValueError: If a line parses to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue
            try:
                row = json.loads(stripped)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type with file/line context so a
                # bad fixture row can be located, matching the error below.
                raise json.JSONDecodeError(
                    f"{path}: line {line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            if not isinstance(row, dict):
                raise ValueError(f"{path}: line {line_number}: expected object")
            rows.append(row)
    return rows


def percentile(values: list[float], pct: float) -> float | None:
    if not values:
        return None
    ordered = sorted(values)
    index = min(len(ordered) - 1, max(0, int((len(ordered) - 1) * pct)))
    return ordered[index]


def block_seed(hash_id: int, seed: int) -> int:
    """Derive a 64-bit integer from the global seed and one block hash.

    Args:
        hash_id: Block hash identifier.
        seed: Global seed mixed into the derivation.

    Returns:
        The 8-byte BLAKE2b digest of ``"{seed}:{hash_id}"`` read as a
        big-endian unsigned integer.
    """
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(f"{seed}:{hash_id}".encode("utf-8"))
    # The hex digest parsed base-16 equals the big-endian integer of the bytes.
    return int(hasher.hexdigest(), 16)


def block_tokens(
    hash_id: int,
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
) -> list[int]:
    """Generate the deterministic token block for one block hash.

    Token ids are drawn from a ``random.Random`` seeded by
    ``block_seed(hash_id, seed)``, so equal ``(hash_id, seed)`` pairs always
    produce the same block. Ids listed in ``special_ids`` are skipped.

    Ids are sampled from ``[1000, vocab_size - 1000)``. When the vocabulary is
    too small for that window to hold more than one id, the whole vocabulary
    ``[0, vocab_size)`` is used instead, so blocks stay distinct and in range.

    Args:
        hash_id: Block hash identifier.
        seed: Global seed mixed into the per-block RNG seed.
        block_size: Number of token ids to generate.
        vocab_size: Size of the tokenizer vocabulary.
        special_ids: Token ids that must never be emitted.

    Returns:
        ``block_size`` token ids (an empty list when ``block_size <= 0``).

    Raises:
        ValueError: If ``vocab_size`` is not positive, or every id in the
            sampling range is a special id.
    """
    if vocab_size <= 0:
        raise ValueError(f"vocab_size must be positive, got {vocab_size}")

    low = 1000
    high = vocab_size - 1000
    if high <= low:
        # The margins would leave nothing to sample; use the full vocabulary.
        low, high = 0, vocab_size

    # Rejection sampling below would spin forever with no admissible id.
    excluded = sum(1 for token_id in special_ids if low <= token_id < high)
    if block_size > 0 and excluded >= high - low:
        raise ValueError(
            f"no non-special token ids available in [{low}, {high}) "
            f"for vocab_size {vocab_size}"
        )

    rng = random.Random(block_seed(hash_id, seed))
    tokens: list[int] = []
    while len(tokens) < block_size:
        token_id = rng.randrange(low, high)
        if token_id not in special_ids:
            tokens.append(token_id)
    return tokens


def make_prompt_token_ids(
    row: dict[str, Any],
    *,
    seed: int,
    block_size: int,
    vocab_size: int,
    special_ids: set[int],
) -> list[int]:
    """Assemble the synthetic prompt for one fixture row.

    Each entry of ``row["hash_ids"]`` is expanded with ``block_tokens`` and
    truncated to the matching entry of ``row["block_token_counts"]``. The
    pieces are concatenated in order.

    Args:
        row: Fixture row providing ``hash_ids``, ``block_token_counts`` and
            ``input_length`` (``request_id`` is used only in error messages).
        seed: Global seed forwarded to ``block_tokens``.
        block_size: Tokens generated per block before truncation.
        vocab_size: Tokenizer vocabulary size forwarded to ``block_tokens``.
        special_ids: Token ids that must not appear in the prompt.

    Returns:
        The concatenated prompt token ids.

    Raises:
        ValueError: If the hash and count lists differ in length, a count lies
            outside ``[0, block_size]``, or the assembled prompt length does
            not equal ``input_length``.
    """
    request_id = row.get("request_id")
    hash_ids = [int(value) for value in row["hash_ids"]]
    counts = [int(value) for value in row["block_token_counts"]]
    if len(hash_ids) != len(counts):
        raise ValueError(f"request {row.get('request_id')}: hash/count length mismatch")

    # Check every count before generating anything: a negative count would
    # slice tokens off the end of a block, and an oversized one would be
    # silently truncated to block_size.
    for index, count in enumerate(counts):
        if not 0 <= count <= block_size:
            raise ValueError(
                f"request {request_id}: block {index} token count {count} "
                f"is outside [0, {block_size}]"
            )

    token_ids: list[int] = []
    for hash_id, count in zip(hash_ids, counts):
        token_ids.extend(
            block_tokens(
                hash_id,
                seed=seed,
                block_size=block_size,
                vocab_size=vocab_size,
                special_ids=special_ids,
            )[:count]
        )
    expected = int(row["input_length"])
    if len(token_ids) != expected:
        raise ValueError(
            f"request {row.get('request_id')}: synthetic prompt length "
            f"{len(token_ids)} != input_length {expected}"
        )
    return token_ids


def estimate_prefix_reuse(rows: list[dict[str, Any]]) -> dict[int, dict[str, int | float]]:
    trie: dict[int, dict[Any, Any]] = {}
    estimates: dict[int, dict[str, int | float]] = {}
    for row in rows:
        request_id = int(row["request_id"])
        hash_ids = [int(value) for value in row["hash_ids"]]
        counts = [int(value) for value in row["block_token_counts"]]

        node = trie
        hit_blocks = 0
        for hash_id in hash_ids:
            if hash_id not in node:
                break
            hit_blocks += 1
            node = node[hash_id]

        node = trie
        for hash_id in hash_ids:
            node = node.setdefault(hash_id, {})

        query_tokens = int(row["input_length"])
        hit_tokens = sum(counts[:hit_blocks])
        estimates[request_id] = {
            "query_blocks": len(hash_ids),
            "hit_blocks": hit_blocks,
            "query_tokens": query_tokens,
            "hit_tokens": hit_tokens,
            "block_hit_ratio": hit_blocks / len(hash_ids) if hash_ids else 0.0,
            "token_hit_ratio": hit_tokens / query_tokens if query_tokens else 0.0,
        }
    return estimates


async def run_replay(args: argparse.Namespace) -> dict[str, Any]:
    """Replay the fixture's requests against an in-process vLLM async engine.

    Loads ``sidecar.jsonl`` from ``args.fixture_dir``, builds a synthetic
    prompt for every selected row, submits each row at its scaled arrival
    offset, and writes ``request_metrics.csv`` and ``summary.json`` into
    ``args.output_dir``.

    Args:
        args: Parsed options as produced by ``parse_args``.

    Returns:
        The summary dictionary that was written to ``summary.json``.

    Raises:
        RuntimeError: If transformers or vLLM cannot be imported.
        ValueError: If no rows are selected, or a row fails prompt synthesis.
    """
    # Heavy GPU-side dependencies are imported lazily so the module itself can
    # be imported on machines without them.
    try:
        from transformers import AutoTokenizer
        from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
        from vllm.inputs import TokensPrompt
    except Exception as exc:  # pragma: no cover - exercised on GPU host.
        raise RuntimeError(f"failed to import vLLM runtime dependencies: {exc}") from exc

    sidecar_path = args.fixture_dir / "sidecar.jsonl"
    rows = load_jsonl(sidecar_path)
    if args.limit is not None:
        rows = rows[: args.limit]
    if not rows:
        raise ValueError("no rows selected")

    # All prompts are synthesized before the engine starts, so malformed rows
    # fail before any engine is constructed.
    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
    special_ids = {int(value) for value in tokenizer.all_special_ids}
    vocab_size = len(tokenizer)
    synthetic_prompts = {
        int(row["request_id"]): make_prompt_token_ids(
            row,
            seed=args.seed,
            block_size=args.block_size,
            vocab_size=vocab_size,
            special_ids=special_ids,
        )
        for row in rows
    }
    prefix_reuse = estimate_prefix_reuse(rows)

    engine_args = AsyncEngineArgs(
        model=args.model,
        tokenizer=args.model,
        trust_remote_code=args.trust_remote_code,
        tensor_parallel_size=args.tensor_parallel_size,
        dtype=args.dtype,
        max_model_len=args.max_model_len,
        block_size=args.block_size,
        enable_prefix_caching=args.enable_prefix_caching,
        enable_chunked_prefill=args.enable_chunked_prefill,
        max_num_seqs=args.max_num_seqs,
        max_num_batched_tokens=args.max_num_batched_tokens,
        gpu_memory_utilization=args.gpu_memory_utilization,
        enforce_eager=args.enforce_eager,
        disable_log_stats=True,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    output_rows: list[dict[str, Any]] = []
    # Arrival offsets are measured relative to the first selected row.
    # NOTE(review): a row whose timestamp precedes rows[0] gets a negative
    # offset and is submitted immediately - confirm the sidecar is time-ordered.
    first_timestamp = float(rows[0]["timestamp"])
    replay_start = time.perf_counter()

    async def run_one(row: dict[str, Any]) -> None:
        """Submit one row at its scheduled offset and record its timing metrics."""
        request_id = int(row["request_id"])
        scheduled_arrival_s = (float(row["timestamp"]) - first_timestamp) * args.time_scale
        # Sleep until the scheduled offset; a request that is already late is
        # submitted without waiting.
        await asyncio.sleep(max(0.0, replay_start + scheduled_arrival_s - time.perf_counter()))

        prompt_token_ids = synthetic_prompts[request_id]
        requested_output_tokens = int(row["output_length"])
        effective_output_tokens = requested_output_tokens
        if args.max_output_tokens is not None:
            effective_output_tokens = min(effective_output_tokens, args.max_output_tokens)

        # min_tokens == max_tokens together with ignore_eos requests exactly
        # effective_output_tokens generated tokens.
        sampling_params = SamplingParams(
            temperature=0.0,
            max_tokens=effective_output_tokens,
            min_tokens=effective_output_tokens,
            ignore_eos=True,
            detokenize=False,
            seed=args.seed + request_id,
        )
        arrival_wall = time.perf_counter()
        first_token_wall: float | None = None
        last_output_tokens = 0
        final_output: Any = None
        generator = engine.generate(
            TokensPrompt(prompt_token_ids=prompt_token_ids),
            sampling_params,
            request_id=str(request_id),
        )
        async for output in generator:
            final_output = output
            if output.outputs:
                token_count = len(output.outputs[0].token_ids)
                # The first update that carries any token marks time-to-first-token.
                if token_count > 0 and first_token_wall is None:
                    first_token_wall = time.perf_counter()
                last_output_tokens = token_count
        done_wall = time.perf_counter()

        finish_reason = ""
        if final_output is not None and final_output.outputs:
            finish_reason = str(final_output.outputs[0].finish_reason)
        ttft_s = None if first_token_wall is None else first_token_wall - arrival_wall
        e2e_s = done_wall - arrival_wall
        # Time per output token averages over the tokens after the first one.
        tpot_s = None
        if first_token_wall is not None and last_output_tokens > 1:
            tpot_s = (done_wall - first_token_wall) / (last_output_tokens - 1)
        reuse = prefix_reuse[request_id]
        output_rows.append(
            {
                "request_id": request_id,
                "scheduled_arrival_s": scheduled_arrival_s,
                "arrival_delay_s": arrival_wall - replay_start - scheduled_arrival_s,
                "input_length": int(row["input_length"]),
                "requested_output_length": requested_output_tokens,
                "effective_output_length": effective_output_tokens,
                "generated_output_tokens": last_output_tokens,
                "ttft_s": ttft_s,
                "tpot_s": tpot_s,
                "e2e_s": e2e_s,
                "finish_reason": finish_reason,
                "prefix_query_blocks_est": reuse["query_blocks"],
                "prefix_hit_blocks_est": reuse["hit_blocks"],
                "prefix_query_tokens_est": reuse["query_tokens"],
                "prefix_hit_tokens_est": reuse["hit_tokens"],
                "prefix_block_hit_ratio_est": reuse["block_hit_ratio"],
                "prefix_token_hit_ratio_est": reuse["token_hit_ratio"],
            }
        )

    try:
        await asyncio.gather(*(run_one(row) for row in rows))
        # Stop the clock as soon as every request has finished so that engine
        # teardown is not counted in wall_time_s or the throughput figures.
        replay_end = time.perf_counter()
    finally:
        engine.shutdown()

    # Requests finish in arbitrary order; sort for a stable CSV.
    output_rows.sort(key=lambda item: int(item["request_id"]))
    args.output_dir.mkdir(parents=True, exist_ok=True)
    request_metrics_path = args.output_dir / "request_metrics.csv"
    fieldnames = list(output_rows[0].keys())
    with request_metrics_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(output_rows)

    ttft_values = [float(row["ttft_s"]) for row in output_rows if row["ttft_s"] is not None]
    tpot_values = [float(row["tpot_s"]) for row in output_rows if row["tpot_s"] is not None]
    e2e_values = [float(row["e2e_s"]) for row in output_rows]
    generated_tokens = sum(int(row["generated_output_tokens"]) for row in output_rows)
    prompt_tokens = sum(int(row["input_length"]) for row in output_rows)
    wall_s = replay_end - replay_start
    summary = {
        "status": "pass",
        "fixture_dir": str(args.fixture_dir),
        "model": args.model,
        "tensor_parallel_size": args.tensor_parallel_size,
        "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES", ""),
        "rows": len(output_rows),
        "block_size": args.block_size,
        "max_model_len": args.max_model_len,
        "max_num_seqs": args.max_num_seqs,
        "max_num_batched_tokens": args.max_num_batched_tokens,
        "gpu_memory_utilization": args.gpu_memory_utilization,
        "enable_prefix_caching": args.enable_prefix_caching,
        "enable_chunked_prefill": args.enable_chunked_prefill,
        "time_scale": args.time_scale,
        "max_output_tokens": args.max_output_tokens,
        "synthetic_replay": {
            "semantics": (
                "Each trace block hash is deterministically mapped to a stable "
                "block of prompt token ids; equal hashes reuse equal token blocks. "
                "This preserves arrival, length, and block-prefix sharing patterns, "
                "but it is not original text/token recovery."
            ),
            "seed": args.seed,
            "vocab_size": vocab_size,
            "special_token_ids_excluded": sorted(special_ids),
        },
        "wall_time_s": wall_s,
        "requests_per_second": len(output_rows) / wall_s if wall_s else 0.0,
        "prompt_tokens_per_second": prompt_tokens / wall_s if wall_s else 0.0,
        "generated_tokens_per_second": generated_tokens / wall_s if wall_s else 0.0,
        "total_prompt_tokens": prompt_tokens,
        "total_generated_tokens": generated_tokens,
        "ttft_s": {
            "mean": statistics.fmean(ttft_values) if ttft_values else None,
            "p50": percentile(ttft_values, 0.50),
            "p95": percentile(ttft_values, 0.95),
        },
        "tpot_s": {
            "mean": statistics.fmean(tpot_values) if tpot_values else None,
            "p50": percentile(tpot_values, 0.50),
            "p95": percentile(tpot_values, 0.95),
        },
        "e2e_s": {
            "mean": statistics.fmean(e2e_values) if e2e_values else None,
            "p50": percentile(e2e_values, 0.50),
            "p95": percentile(e2e_values, 0.95),
        },
        "estimated_prefix_reuse": {
            "query_blocks": sum(int(row["prefix_query_blocks_est"]) for row in output_rows),
            "hit_blocks": sum(int(row["prefix_hit_blocks_est"]) for row in output_rows),
            "query_tokens": sum(int(row["prefix_query_tokens_est"]) for row in output_rows),
            "hit_tokens": sum(int(row["prefix_hit_tokens_est"]) for row in output_rows),
        },
        "request_metrics_csv": str(request_metrics_path),
    }
    # Aggregate ratios come from the summed counts, not from averaging the
    # per-request ratios.
    reuse = summary["estimated_prefix_reuse"]
    summary["estimated_prefix_reuse"]["block_hit_ratio"] = (
        reuse["hit_blocks"] / reuse["query_blocks"] if reuse["query_blocks"] else 0.0
    )
    summary["estimated_prefix_reuse"]["token_hit_ratio"] = (
        reuse["hit_tokens"] / reuse["query_tokens"] if reuse["query_tokens"] else 0.0
    )
    with (args.output_dir / "summary.json").open("w", encoding="utf-8") as handle:
        json.dump(summary, handle, indent=2, sort_keys=True)
        handle.write("\n")
    return summary


def main() -> int:
    """Run the replay from the command line and report the outcome.

    On success the summary is printed to stdout as sorted, indented JSON. On
    any exception a ``{"status": "fail", "error": ...}`` document is written to
    ``summary.json`` in the output directory and the error is echoed to stderr.

    Returns:
        ``0`` on success, ``1`` on failure.
    """
    args = parse_args()
    try:
        summary = asyncio.run(run_replay(args))
    except Exception as exc:
        # Top-level boundary: persist the failure next to where results would go.
        args.output_dir.mkdir(parents=True, exist_ok=True)
        failure_path = args.output_dir / "summary.json"
        failure_doc = json.dumps({"status": "fail", "error": str(exc)}, indent=2)
        failure_path.write_text(failure_doc + "\n", encoding="utf-8")
        print(f"vllm_synthetic_replay.py: error: {exc}", file=sys.stderr)
        return 1
    else:
        print(json.dumps(summary, indent=2, sort_keys=True))
        return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())