Measures TTFT to serve a reused prefix of length L from each KV tier on a single H20 (Qwen3-Coder-30B-A3B, vLLM 0.18.1): miss (recompute), CPU-tier hit (native DRAM offload), GPU-tier hit (HBM prefix cache). Each measured request is bracketed by /metrics scrapes so the tier is verified (vllm:prefix_cache_hits vs external_prefix_cache_hits), not assumed. Result: GPU hit is ~flat (42->111 ms over 1k->64k tokens); CPU hit is transfer-bound (PCIe H2D ~54 GB/s, 57->272 ms); miss grows superlinearly (78 ms -> 15.2 s). GPU beats CPU 1.4-2.5x (gap grows with context); miss/CPU up to 56x, miss/GPU up to 137x. pcie_transfer.py is the independent CPU-hit floor backstop. Evidence for the GPU-hit-first principle (paper section 2.2). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
141 lines
4.8 KiB
Python
141 lines
4.8 KiB
Python
"""Exp (a): three-tier hit-latency microbench.
|
|
|
|
Measures TTFT of serving a prefix of length L from each tier:
  - miss : fresh unique prompt -> full prefill (recompute)
  - gpu  : re-request same prompt -> HBM prefix-cache hit
  - cpu  : warm -> evict to CPU offload tier -> re-request -> DRAM hit

Each measured request is bracketed by /metrics scrapes so the tier is *verified*
(gpu_hits delta vs external_prefix_cache_hits delta), not assumed.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import statistics
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
|
from common.util import make_token_prompt, scrape_prefix_cache, measure_ttft # noqa: E402
|
|
|
|
# Default sweep of prefix lengths L; main() uses this list unless --lengths
# supplies an override.
LENGTHS = [1024, 2048, 4096, 8192, 16384, 32768, 65536]
|
|
|
|
|
|
def delta(a: dict, b: dict) -> dict:
    """Return the per-key difference ``b[key] - a[key]`` for every key of ``a``.

    Keys that exist only in ``b`` are ignored. A key of ``a`` that is absent
    from ``b`` raises ``KeyError``.
    """
    diff = {}
    for key, before in a.items():
        diff[key] = b[key] - before
    return diff
|
|
|
|
|
|
def one_measurement(ep, model, prompt, expect):
    """Time one request and classify which tier served it.

    The prefix-cache counters are scraped immediately before and after the
    request, and the counter deltas (not ``expect``) decide the observed tier.

    Args:
        ep: endpoint passed to ``scrape_prefix_cache`` and ``measure_ttft``.
        model: model name forwarded to ``measure_ttft``.
        prompt: prompt forwarded to ``measure_ttft``.
        expect: tier label the caller intended to hit; stored verbatim so the
            row can later be compared against ``tier_observed``.

    Returns:
        A dict with ``ttft_s`` and ``e2e_s`` copied from the ``measure_ttft``
        result, the classified ``tier_observed``, the caller's ``expect``, and
        the raw counter deltas ``d_gpu_hits`` / ``d_ext_hits``.
    """
    before = scrape_prefix_cache(ep)
    res = measure_ttft(ep, model, prompt)
    after = scrape_prefix_cache(ep)
    d = delta(before, after)

    # Classify from the counter deltas. An external-cache hit is checked first,
    # so a request that moved both counters is reported as "cpu".
    # NOTE(review): the 0.5 threshold presumably tolerates float-valued
    # counters that move in whole steps -- confirm against scrape_prefix_cache.
    if d["ext_hits"] > 0.5:
        tier = "cpu"
    elif d["gpu_hits"] > 0.5:
        tier = "gpu"
    else:
        tier = "miss"

    return {
        "ttft_s": res["ttft_s"],
        "e2e_s": res["e2e_s"],
        "tier_observed": tier,
        "expect": expect,
        "d_gpu_hits": d["gpu_hits"],
        "d_ext_hits": d["ext_hits"],
    }
|
|
|
|
|
|
def run_miss(ep, model, L, reps, base):
    """Measure ``reps`` requests that are each expected to miss the cache.

    Every repetition builds a prompt of length ``L`` from its own seed
    (``base``, ``base + 1``, ...), so no prompt is requested twice within the
    run.

    Returns:
        A list with one ``one_measurement`` row per repetition, each tagged
        with ``expect="miss"``.
    """
    return [
        one_measurement(ep, model, make_token_prompt(L, seed=seed), "miss")
        for seed in range(base, base + reps)
    ]
|
|
|
|
|
|
def run_gpu(ep, model, L, reps, base):
    """Measure ``reps`` requests that are each expected to hit the GPU tier.

    For every seed (``base``, ``base + 1``, ...) the prompt is sent once
    unmeasured to warm the cache, then the identical prompt is sent again and
    that second request is the one recorded.

    Returns:
        A list with one ``one_measurement`` row per repetition, each tagged
        with ``expect="gpu"``.
    """
    results = []
    for seed in range(base, base + reps):
        prompt = make_token_prompt(L, seed=seed)
        # Warm-up request: its timing is intentionally discarded.
        measure_ttft(ep, model, prompt)
        # Immediate re-request of the same prompt is the measured hit.
        row = one_measurement(ep, model, prompt, "gpu")
        results.append(row)
    return results
|
|
|
|
|
|
def run_cpu(ep, model, L, reps, base, flood_tokens, flood_chunk):
    """Measure ``reps`` requests that are each expected to hit the CPU tier.

    Per repetition: send the prompt once to warm it, flood the server with
    distinct prompts of ``flood_chunk`` length until at least ``flood_tokens``
    have been sent (intended to push the warmed prompt out of the GPU pool into
    the CPU offload tier), then re-request the original prompt and record it.

    Args:
        ep: endpoint passed to the measurement helpers.
        model: model name forwarded to the measurement helpers.
        L: length passed to ``make_token_prompt`` for the measured prompt.
        reps: number of warm/flood/measure cycles.
        base: seed of the first measured prompt; repetition ``i`` uses
            ``base + i``.
        flood_tokens: total amount of flood content to send per repetition.
        flood_chunk: length of each individual flood prompt.

    Returns:
        A list with one ``one_measurement`` row per repetition, each tagged
        with ``expect="cpu"``.

    Raises:
        ValueError: if flooding is requested (``reps > 0`` and
            ``flood_tokens > 0``) with a non-positive ``flood_chunk``; the
            flood loop could otherwise never reach ``flood_tokens``.
    """
    # Fail fast instead of spinning forever: with flood_chunk <= 0 the running
    # total below would never reach flood_tokens.
    if reps > 0 and flood_tokens > 0 and flood_chunk <= 0:
        raise ValueError(
            f"flood_chunk must be positive when flood_tokens > 0, got {flood_chunk}"
        )

    rows = []
    for i in range(reps):
        p = make_token_prompt(L, seed=base + i)
        measure_ttft(ep, model, p)  # warm -> GPU (+offload)

        # Flood with distinct content to evict p from the GPU pool to the CPU
        # tier. Flood seeds start at a large offset and each repetition gets
        # its own block of 1000 so flood prompts differ from one another.
        sent = 0
        fseed = 10_000_000 + (base + i) * 1000
        while sent < flood_tokens:
            fp = make_token_prompt(flood_chunk, seed=fseed)
            measure_ttft(ep, model, fp)
            fseed += 1
            sent += flood_chunk

        rows.append(one_measurement(ep, model, p, "cpu"))  # should hit CPU tier
    return rows
|
|
|
|
|
|
def summarize(rows):
    """Aggregate measurement rows into summary statistics.

    Args:
        rows: list of dicts carrying ``ttft_s``, ``tier_observed`` and
            ``expect``.

    Returns:
        A dict with the row count ``n``, the median / mean / min / max of
        ``ttft_s``, the most common ``tier_observed``, and ``verified_frac``,
        the share of rows whose observed tier equals the expected one. For an
        empty ``rows`` every statistic is ``None`` and ``verified_frac`` is 0.
    """
    if not rows:
        # Nothing to aggregate: the mode of an empty list is None as well.
        return {
            "n": 0,
            "ttft_p50": None,
            "ttft_mean": None,
            "ttft_min": None,
            "ttft_max": None,
            "tier_observed": None,
            "verified_frac": 0,
        }

    ttfts = sorted(row["ttft_s"] for row in rows)
    observed = [row["tier_observed"] for row in rows]
    matches = sum(row["tier_observed"] == row["expect"] for row in rows)
    return {
        "n": len(rows),
        "ttft_p50": statistics.median(ttfts),
        "ttft_mean": statistics.fmean(ttfts),
        "ttft_min": ttfts[0],
        "ttft_max": ttfts[-1],
        "tier_observed": _modal(observed),
        "verified_frac": matches / len(rows),
    }
|
|
|
|
|
|
def _modal(xs):
|
|
from collections import Counter
|
|
return Counter(xs).most_common(1)[0][0] if xs else None
|
|
|
|
|
|
def main():
    """CLI entry point: sweep prefix lengths for one tier and write JSON results.

    Command-line options:
        --endpoint      endpoint handed to the measurement helpers (required)
        --model         model name handed to the measurement helpers (required)
        --mode          which tier to exercise: miss, gpu or cpu (required)
        --reps          repetitions per length (default 8)
        --out           path of the JSON results file (required)
        --lengths       comma-separated override of the default LENGTHS sweep
        --flood-tokens  cpu mode: distinct tokens to flush the GPU pool
        --flood-chunk   cpu mode: length of each flood prompt

    The output JSON holds ``mode``, ``reps``, a per-length summary under
    ``by_length`` and the raw rows under ``raw``, both keyed by ``str(L)``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--endpoint", required=True)
    ap.add_argument("--model", required=True)
    ap.add_argument("--mode", required=True, choices=["miss", "gpu", "cpu"])
    ap.add_argument("--reps", type=int, default=8)
    ap.add_argument("--out", required=True)
    ap.add_argument("--lengths", type=str, default=None,
                    help="comma list override, e.g. 1024,4096")
    ap.add_argument("--flood-tokens", type=int, default=120000,
                    help="cpu mode: distinct tokens to flush GPU pool")
    ap.add_argument("--flood-chunk", type=int, default=8192)
    args = ap.parse_args()

    lengths = ([int(x) for x in args.lengths.split(",")] if args.lengths else LENGTHS)
    out = {"mode": args.mode, "reps": args.reps, "by_length": {}, "raw": {}}
    # Each mode starts from its own seed base; the base is then advanced by
    # 100_000 per length, so every (mode, length) pair draws seeds from a
    # different range.
    base = {"miss": 1_000, "gpu": 2_000, "cpu": 3_000}[args.mode]

    for L in lengths:
        # Monotonic clock: the elapsed time shown below must not be affected
        # by wall-clock adjustments during a long run.
        t0 = time.monotonic()
        if args.mode == "miss":
            rows = run_miss(args.endpoint, args.model, L, args.reps, base)
        elif args.mode == "gpu":
            rows = run_gpu(args.endpoint, args.model, L, args.reps, base)
        else:
            rows = run_cpu(args.endpoint, args.model, L, args.reps, base,
                           args.flood_tokens, args.flood_chunk)
        base += 100_000

        s = summarize(rows)
        out["by_length"][str(L)] = s
        out["raw"][str(L)] = rows

        # summarize() reports None for the median when there are no rows
        # (e.g. --reps 0); formatting None with ':.4f' would raise TypeError.
        p50 = s["ttft_p50"]
        p50_text = f"{p50:.4f}s" if p50 is not None else "n/a"
        print(f"[{args.mode}] L={L:>6} ttft_p50={p50_text} "
              f"tier={s['tier_observed']} verified={s['verified_frac']:.0%} "
              f"({time.monotonic()-t0:.0f}s)", flush=True)

        # Rewrite the results file after every length so a crash on a long
        # context keeps the lengths already measured.
        # NOTE(review): the original nesting of this statement could not be
        # recovered from the captured source; the final file content is the
        # same whether it runs per length or once after the loop.
        Path(args.out).write_text(json.dumps(out, indent=2))

    print(f"wrote {args.out}")
|
|
|
|
|
|
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|