"""Controlled multi-turn agentic workload for the capacity->APC knee. Each session grows its prefix cumulatively: turn k appends G fresh blocks and reuses all blocks of turns 1..k-1 (intra-session prefix reuse, the dominant mode per the trace, 93% intra-session). Block ids are namespaced per session so cross-session reuse is ~0. Intra-session APC ceiling = (T-1)/(T+1). timestamp=0 => the replayer fires closed-loop, gated only by max-inflight-sessions. """ import argparse import json BLOCK = 16 # tokens/block (vLLM default) def main(): ap = argparse.ArgumentParser() ap.add_argument("--sessions", type=int, default=40) ap.add_argument("--turns", type=int, default=8) ap.add_argument("--blocks-per-turn", type=int, default=192) # 3072 tok/turn ap.add_argument("--output-len", type=int, default=100) ap.add_argument("--out", required=True) a = ap.parse_args() rows = [] for s in range(a.sessions): base = s * 10_000_000 # unique block namespace per session cum = [] for k in range(1, a.turns + 1): for _ in range(a.blocks_per_turn): cum.append(base + len(cum)) rows.append({ "chat_id": s * 1000 + k, "parent_chat_id": (s * 1000 + k - 1) if k > 1 else 0, "timestamp": 0.0, "input_length": len(cum) * BLOCK, "output_length": a.output_len, "type": "coder", "turn": k, "hash_ids": list(cum), "session_id": f"s{s}", }) with open(a.out, "w") as o: for r in rows: o.write(json.dumps(r) + "\n") ws_blocks = a.turns * a.blocks_per_turn apc = (a.turns - 1) / (a.turns + 1) print(f"wrote {len(rows)} reqs ({a.sessions} sessions x {a.turns} turns) -> {a.out}") print(f"session working set = {ws_blocks} blocks ({ws_blocks*BLOCK} tok, " f"{ws_blocks*BLOCK*98304/1e9:.2f} GB); max req = {ws_blocks*BLOCK} tok") print(f"intra-session APC ceiling = {apc:.1%}") if __name__ == "__main__": main()