# agentic-kvc/microbench/fresh_setup/gen_synthetic_trace.py

"""Generate synthetic traces for the PD-disagg crossover study.

Emits the same JSONL schema the replayer consumes (chat_id, parent_chat_id,
timestamp, input_length, output_length, type, turn, hash_ids, session_id),
so no replayer change is needed.

Phase A ("vanilla") workload — the textbook regime where PD-disagg is
expected to win:
  - Poisson arrivals at a fixed mean QPS.
  - Fixed input / output length.
  - Every request is its own single-turn session (parent_chat_id = -1).
  - hash_ids are globally unique, so there is ZERO prefix-cache reuse and the
    prefix-cache confound (PD round-robin loses cache, 8C keeps it) is removed
    from the comparison by construction.

Morph dimensions beyond the vanilla baseline: `--mode multiturn` adds
multi-turn sessions with intra-session prefix reuse and optional session skew
(`--heavy-frac`). Burst arrivals are intentionally NOT implemented here yet.

Usage:
  python gen_synthetic_trace.py --out traces/vanilla_q1.6_in1024_out256.jsonl \
      --qps 1.6 --duration-s 600 --input-len 1024 --output-len 256 --seed 42
"""

from __future__ import annotations

import argparse
import json
import random
from pathlib import Path

# Tokens per hash block; keep in sync with replayer.replay.BLOCK_SIZE.
BLOCK_SIZE: int = 512
# First synthetic block hash. Real-trace hashes reach roughly 1.2e7, so
# starting at 1e9 keeps a synthetic trace from ever sharing a block hash
# with anything else.
HASH_BASE: int = 10**9


def n_blocks_for(input_length: int) -> int:
    """Return the number of hash blocks assigned to a prompt.

    Only whole ``BLOCK_SIZE`` blocks are counted (floor division), but every
    request gets at least one block, even when it is shorter than a block.
    """
    whole_blocks = input_length // BLOCK_SIZE
    return whole_blocks if whole_blocks > 1 else 1


def gen_vanilla(
    *,
    qps: float,
    duration_s: float,
    input_len: int,
    output_len: int,
    seed: int,
) -> list[dict]:
    """Generate the vanilla (zero-reuse) single-turn trace.

    Arrivals form a Poisson process at rate `qps`; every request has the same
    input/output length, is its own single-turn session, and receives
    globally-unique block hashes, so no two requests share a prefix.

    Args:
        qps: Mean arrival rate in requests per second; must be > 0.
        duration_s: Arrivals are generated until the clock passes this value.
        input_len: Input length recorded on every row.
        output_len: Output length recorded on every row.
        seed: Seed for the private RNG, making the trace reproducible.

    Returns:
        Rows in arrival order, one dict per request, in the replayer's JSONL
        schema. Empty if no arrival falls within `duration_s`.

    Raises:
        ValueError: If `qps` is not a positive number.
    """
    # expovariate(0) divides by zero, and a negative (or NaN) rate produces
    # gaps that never push `t` past `duration_s`, so the loop below would
    # never terminate. `not qps > 0` also rejects NaN.
    if not qps > 0:
        raise ValueError(f"qps must be > 0, got {qps!r}")

    rng = random.Random(seed)
    rows: list[dict] = []
    t = 0.0
    next_hash = HASH_BASE
    chat_id = 1
    # Fixed input length => fixed block count; compute it once.
    nb = n_blocks_for(input_len)
    while True:
        # Exponential inter-arrival -> Poisson process at rate `qps`.
        t += rng.expovariate(qps)
        if t > duration_s:
            break
        # Fresh, never-reused hash ids for this request.
        hash_ids = list(range(next_hash, next_hash + nb))
        next_hash += nb
        rows.append({
            "chat_id": chat_id,
            "parent_chat_id": -1,
            "timestamp": round(t, 6),
            "input_length": input_len,
            "output_length": output_len,
            "type": "synthetic",
            "turn": 1,
            "hash_ids": hash_ids,
            "session_id": str(chat_id),
        })
        chat_id += 1
    return rows


def _sample_turns(rng: random.Random, turns_mean: float, turns_max: int,
                  heavy_frac: float) -> int:
    """Geometric-ish turn count, with a heavy-tailed minority (session skew)."""
    if heavy_frac > 0 and rng.random() < heavy_frac:
        return turns_max
    cont = max(0.0, 1.0 - 1.0 / max(turns_mean, 1.0))
    t = 1
    while t < turns_max and rng.random() < cont:
        t += 1
    return t


def gen_multiturn(
    *,
    session_qps: float,
    duration_s: float,
    turns_mean: float,
    turns_max: int,
    heavy_frac: float,
    first_input: int,
    new_user_tokens: int,
    output_len: int,
    inter_turn_gap_s: float,
    seed: int,
) -> list[dict]:
    """Multi-turn agentic-like sessions with intra-session prefix reuse.

    Session start times form a Poisson process at rate `session_qps`. Each
    session's turn k re-sends the whole conversation-so-far as its prompt
    (cumulative hash_ids prefix = prior turns' input+output blocks) and then
    appends fresh blocks for the newly added tokens, so a prefix cache can hit
    on the growing prefix. Context grows each turn; the inter-turn gap models
    think-time.

    Only session *starts* are bounded by `duration_s`; later turns of a
    session may carry timestamps beyond it.

    Args:
        session_qps: Mean session arrival rate (sessions/s); must be > 0.
        duration_s: Session starts are generated until the clock passes this.
        turns_mean: Mean-turns parameter forwarded to `_sample_turns`.
        turns_max: Cap on turns per session; heavy sessions get exactly this.
        heavy_frac: Probability that a session is heavy (session skew).
        first_input: Tokens added by turn 1.
        new_user_tokens: Tokens added by each subsequent turn.
        output_len: Output length recorded on every turn; also added to the
            context carried into the next turn.
        inter_turn_gap_s: Mean of the exponential gap between consecutive
            turns. Values <= 0 place all turns at the session start time.
        seed: Seed for the private RNG, making the trace reproducible.

    Returns:
        All turns of all sessions as dicts in the replayer's JSONL schema,
        sorted by timestamp (stable, so a turn never precedes its parent).

    Raises:
        ValueError: If `session_qps` is not a positive number.
    """
    # expovariate(0) divides by zero, and a negative (or NaN) rate produces
    # gaps that never push `t` past `duration_s`, so the start-time loop would
    # never terminate. `not session_qps > 0` also rejects NaN.
    if not session_qps > 0:
        raise ValueError(f"session_qps must be > 0, got {session_qps!r}")

    rng = random.Random(seed)
    rows: list[dict] = []
    next_hash = HASH_BASE
    chat_id = 1

    # Generate session start times (Poisson), then expand each into turns.
    starts: list[float] = []
    t = 0.0
    while True:
        t += rng.expovariate(session_qps)
        if t > duration_s:
            break
        starts.append(t)

    # Loop invariants: output block count per turn and think-time rate.
    n_out_blocks = max(1, output_len // BLOCK_SIZE)
    gap_rate = 1.0 / inter_turn_gap_s if inter_turn_gap_s > 0 else None

    for s_idx, start in enumerate(starts):
        session_id = f"s{s_idx}"
        n_turns = _sample_turns(rng, turns_mean, turns_max, heavy_frac)
        session_hashes: list[int] = []   # cumulative blocks of the conversation
        ctx_len = 0                       # cumulative prompt tokens (prior turns)
        prev_chat = -1                    # -1 marks the first turn (no parent)
        ts = start
        for turn in range(1, n_turns + 1):
            added = first_input if turn == 1 else new_user_tokens
            input_len = ctx_len + added
            # NOTE(review): every turn appends at least one prompt block and
            # one output block regardless of how many tokens were added, so
            # len(hash_ids) can exceed input_length // BLOCK_SIZE when the
            # increments are smaller than a block. Confirm the replayer
            # tolerates this before relying on exact block counts.
            n_new = max(1, added // BLOCK_SIZE)
            new_blocks = list(range(next_hash, next_hash + n_new))
            next_hash += n_new
            turn_hashes = session_hashes + new_blocks
            rows.append({
                "chat_id": chat_id,
                "parent_chat_id": prev_chat,
                "timestamp": round(ts, 6),
                "input_length": input_len,
                "output_length": output_len,
                "type": "synthetic_agentic",
                "turn": turn,
                "hash_ids": turn_hashes,
                "session_id": session_id,
            })
            # Conversation grows by the new user tokens AND this turn's output.
            session_hashes = turn_hashes + list(range(next_hash, next_hash + n_out_blocks))
            next_hash += n_out_blocks
            ctx_len = input_len + output_len
            prev_chat = chat_id
            chat_id += 1
            if gap_rate is not None:
                ts += rng.expovariate(gap_rate)

    # Sessions overlap in time, so interleave their turns chronologically.
    rows.sort(key=lambda r: r["timestamp"])
    return rows


def main() -> None:
    """CLI entry point: parse arguments, generate a trace, and write it out.

    Writes one JSON object per line to ``--out`` and a sidecar
    ``<out>.config.json`` recording the generation parameters, then prints a
    short summary to stdout. Invalid argument combinations are reported
    through argparse (usage message on stderr, exit status 2).
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--out", type=Path, required=True, help="output trace JSONL")
    parser.add_argument("--mode", choices=["vanilla", "multiturn"], default="vanilla")
    parser.add_argument("--qps", type=float, help="vanilla: mean Poisson request rate; "
                        "multiturn: mean Poisson SESSION rate")
    parser.add_argument("--duration-s", type=float, default=600.0, help="trace span (s)")
    parser.add_argument("--input-len", type=int, help="vanilla: fixed input length")
    parser.add_argument("--output-len", type=int, required=True)
    parser.add_argument("--seed", type=int, default=42)
    # multiturn knobs
    parser.add_argument("--turns-mean", type=float, default=4.0)
    parser.add_argument("--turns-max", type=int, default=40)
    parser.add_argument("--heavy-frac", type=float, default=0.0,
                        help="fraction of sessions that are heavy (turns_max) — session skew")
    parser.add_argument("--first-input", type=int, default=2048,
                        help="multiturn: turn-1 input length")
    parser.add_argument("--new-user-tokens", type=int, default=256,
                        help="multiturn: fresh user tokens added each subsequent turn")
    parser.add_argument("--inter-turn-gap-s", type=float, default=1.6,
                        help="multiturn: mean think-time between turns")
    args = parser.parse_args()

    # Validate with parser.error rather than assert: asserts are stripped
    # under `python -O`, and a non-positive rate would make the Poisson
    # arrival loop run forever instead of failing.
    qps_ok = args.qps is not None and args.qps > 0
    if args.mode == "vanilla":
        if not qps_ok or args.input_len is None or args.input_len <= 0:
            parser.error("vanilla needs --qps > 0 and --input-len > 0")
        rows = gen_vanilla(
            qps=args.qps, duration_s=args.duration_s,
            input_len=args.input_len, output_len=args.output_len, seed=args.seed,
        )
        cfg = {"mode": "vanilla", "qps": args.qps, "duration_s": args.duration_s,
               "input_len": args.input_len, "output_len": args.output_len,
               "seed": args.seed, "reuse": "none"}
    else:
        if not qps_ok:
            parser.error("multiturn needs --qps > 0 (session rate)")
        rows = gen_multiturn(
            session_qps=args.qps, duration_s=args.duration_s,
            turns_mean=args.turns_mean, turns_max=args.turns_max,
            heavy_frac=args.heavy_frac, first_input=args.first_input,
            new_user_tokens=args.new_user_tokens, output_len=args.output_len,
            inter_turn_gap_s=args.inter_turn_gap_s, seed=args.seed,
        )
        cfg = {"mode": "multiturn", "session_qps": args.qps,
               "duration_s": args.duration_s, "turns_mean": args.turns_mean,
               "turns_max": args.turns_max, "heavy_frac": args.heavy_frac,
               "first_input": args.first_input, "new_user_tokens": args.new_user_tokens,
               "output_len": args.output_len, "inter_turn_gap_s": args.inter_turn_gap_s,
               "seed": args.seed, "reuse": "intra-session"}

    # Trace: one JSON object per line.
    args.out.parent.mkdir(parents=True, exist_ok=True)
    with args.out.open("w", encoding="utf-8") as fh:
        for r in rows:
            fh.write(json.dumps(r) + "\n")

    # Sidecar config next to the trace, e.g. foo.jsonl -> foo.jsonl.config.json.
    cfg["n_requests"] = len(rows)
    cfg["block_size"] = BLOCK_SIZE
    cfg_path = args.out.with_suffix(args.out.suffix + ".config.json")
    cfg_path.write_text(json.dumps(cfg, indent=2), encoding="utf-8")

    # Summary: effective rate is measured over first-to-last timestamp.
    span = rows[-1]["timestamp"] - rows[0]["timestamp"] if rows else 0.0
    eff_qps = len(rows) / span if span > 0 else 0.0
    print(f"wrote {len(rows)} requests to {args.out}  (mode={args.mode})")
    print(f"  target qps={args.qps}  effective req qps={eff_qps:.3f}  span={span:.1f}s")
    if args.mode == "vanilla":
        print(f"  input_len={args.input_len} output_len={args.output_len} "
              f"(blocks/req={n_blocks_for(args.input_len)}, zero reuse)")
    else:
        n_sessions = len({r["session_id"] for r in rows})
        inputs = sorted(r["input_length"] for r in rows)

        def pct(values: list[int], q: float) -> int:
            """Pick the element at fraction `q` of an already-sorted list."""
            return values[min(int(q * len(values)), len(values) - 1)] if values else 0

        print(f"  sessions={n_sessions}  turns/session~{len(rows)/max(n_sessions,1):.1f}  "
              f"input p50={pct(inputs,.5)} p90={pct(inputs,.9)} p99={pct(inputs,.99)} "
              f"output_len={args.output_len} (intra-session reuse)")
    print(f"  config -> {cfg_path}")

if __name__ == "__main__":
    main()