Sweeps GPU KV-cache capacity (--num-gpu-blocks-override) under a closed-loop replay (concurrency 4) of a controlled multi-turn workload (cumulative intra-session prefix, gen_synth_trace.py), measuring realized APC (prefix_cache hits/queries delta) and latency per capacity. Result: a sharp knee at 3.6 GB = exactly the active working set (4 sessions x 0.91 GB). APC rises 7->12->36->80% then saturates at the ~71% intra-session ceiling; TTFT p90 collapses 13.0 s -> 0.53 s at the same point; dead flat to 14.5 GB, 100% completion throughout. So only the active working set needs HBM; capacity beyond it -- and the CPU/storage tier built to chase the reuse tail -- buys ~0. Knee scales linearly with concurrency = cluster GPU count. README.md ties exp(a)+exp(b) into the section-2.2 GPU-hit-first argument with tables, conclusions, and caveats. Raw per-request dumps gitignored; summary/m0/m1 deltas kept. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
56 lines
2.1 KiB
Python
56 lines
2.1 KiB
Python
"""Controlled multi-turn agentic workload for the capacity->APC knee.

Each session grows its prefix cumulatively: turn k appends G fresh blocks and
reuses all blocks of turns 1..k-1 (intra-session prefix reuse, the dominant
mode per the trace: 93% intra-session). Block ids are namespaced per session,
so cross-session reuse is ~0. With T turns per session, turn k queries k*G
blocks of which (k-1)*G are reused, so summing over k = 1..T gives an
intra-session APC ceiling of (T-1)/(T+1).

timestamp=0 => the replayer fires closed-loop, gated only by max-inflight-sessions.
"""
|
|
import argparse
|
|
import json
|
|
|
|
BLOCK = 16  # tokens/block (vLLM default)

# KV-cache bytes per token used for the GB estimate in the summary (96 KiB).
# NOTE(review): this is model-specific; presumably it matches the model used in
# the experiment -- confirm, or override with --kv-bytes-per-token.
KV_BYTES_PER_TOKEN = 98304

# Id layout: chat ids are s * 1000 + turn, block ids are s * 10_000_000 + offset.
# The strides are kept fixed so that generated traces stay reproducible.
_CHAT_ID_STRIDE = 1000
_BLOCK_ID_STRIDE = 10_000_000


def _positive_int(text):
    """argparse ``type=`` callable: parse *text* as an int and require >= 1."""
    value = int(text)
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {text}")
    return value


def _session_requests(s, turns, blocks_per_turn, output_len):
    """Yield the request rows of session *s*, one per turn, in turn order.

    Turn k carries the whole cumulative prefix: k * blocks_per_turn consecutive
    block ids starting at the session's namespace base.
    """
    base = s * _BLOCK_ID_STRIDE  # unique block namespace per session
    for k in range(1, turns + 1):
        n_blocks = k * blocks_per_turn  # cumulative prefix length after turn k
        yield {
            "chat_id": s * _CHAT_ID_STRIDE + k,
            # 0 marks "no parent" (first turn); real chat ids are always >= 1.
            "parent_chat_id": (s * _CHAT_ID_STRIDE + k - 1) if k > 1 else 0,
            "timestamp": 0.0,
            "input_length": n_blocks * BLOCK,
            "output_length": output_len,
            "type": "coder",
            "turn": k,
            "hash_ids": list(range(base, base + n_blocks)),
            "session_id": f"s{s}",
        }


def main(argv=None):
    """Generate the synthetic multi-turn trace as JSON lines and print a summary.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, as before.

    Writes one JSON object per request to ``--out`` (sessions in order, turns in
    order within a session), then prints the request count, the per-session
    working set and the intra-session APC ceiling ``(T-1)/(T+1)``.

    Exits via ``ArgumentParser.error`` (SystemExit) when a count is not positive
    or when the configuration would make ids of different sessions collide.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--sessions", type=_positive_int, default=40)
    ap.add_argument("--turns", type=_positive_int, default=8)
    ap.add_argument("--blocks-per-turn", type=_positive_int, default=192)  # 3072 tok/turn
    ap.add_argument("--output-len", type=int, default=100)
    ap.add_argument("--kv-bytes-per-token", type=_positive_int, default=KV_BYTES_PER_TOKEN,
                    help="KV-cache bytes per token, used only for the GB estimate")
    ap.add_argument("--out", required=True)
    a = ap.parse_args(argv)

    ws_blocks = a.turns * a.blocks_per_turn

    # With fixed strides, an oversized session would spill into the next
    # session's id range and silently create cross-session sharing.
    if a.sessions > 1 and (a.turns > _CHAT_ID_STRIDE or ws_blocks > _BLOCK_ID_STRIDE):
        ap.error(
            f"ids would collide across sessions: need turns <= {_CHAT_ID_STRIDE} "
            f"and turns * blocks-per-turn <= {_BLOCK_ID_STRIDE}"
        )

    # Stream rows straight to disk instead of holding every cumulative
    # hash_ids list in memory at once.
    n_rows = 0
    with open(a.out, "w", encoding="utf-8") as o:
        for s in range(a.sessions):
            for r in _session_requests(s, a.turns, a.blocks_per_turn, a.output_len):
                o.write(json.dumps(r) + "\n")
                n_rows += 1

    # Turn k queries k*G blocks and hits (k-1)*G of them; summed over k = 1..T.
    apc = (a.turns - 1) / (a.turns + 1)
    print(f"wrote {n_rows} reqs ({a.sessions} sessions x {a.turns} turns) -> {a.out}")
    print(f"session working set = {ws_blocks} blocks ({ws_blocks*BLOCK} tok, "
          f"{ws_blocks*BLOCK*a.kv_bytes_per_token/1e9:.2f} GB); max req = {ws_blocks*BLOCK} tok")
    print(f"intra-session APC ceiling = {apc:.1%}")
|
|
|
|
|
|
# Script entry point: generate the trace only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()
|