"""Exp (a) backstop: direct CPU(DRAM)->GPU(HBM) KV-transfer cost. Independent lower bound on a CPU-tier hit: fetching L tokens' KV over the host<->device link. CPU_hit(L) >= GPU_hit(L) + KV_bytes(L) / BW_h2d. Uses pinned host memory (best case for the offload tier, which pins buffers). """ from __future__ import annotations import argparse import json import time import torch KV_BYTES_PER_TOKEN = 98304 # Qwen3-Coder, bf16 LENGTHS = [1024, 2048, 4096, 8192, 16384, 32768, 65536] def time_h2d(nbytes: int, reps: int) -> float: n = nbytes // 2 # bf16 elements host = torch.empty(n, dtype=torch.float16, pin_memory=True) dev = torch.empty(n, dtype=torch.float16, device="cuda") # warmup for _ in range(3): dev.copy_(host, non_blocking=True) torch.cuda.synchronize() ts = [] for _ in range(reps): t0 = time.perf_counter() dev.copy_(host, non_blocking=True) torch.cuda.synchronize() ts.append(time.perf_counter() - t0) ts.sort() return ts[len(ts) // 2] def main(): ap = argparse.ArgumentParser() ap.add_argument("--reps", type=int, default=20) ap.add_argument("--out", required=True) args = ap.parse_args() assert torch.cuda.is_available(), "need a GPU" print("device:", torch.cuda.get_device_name(0)) out = {"device": torch.cuda.get_device_name(0), "by_length": {}} for L in LENGTHS: nbytes = L * KV_BYTES_PER_TOKEN sec = time_h2d(nbytes, args.reps) bw = nbytes / sec / 1e9 out["by_length"][str(L)] = { "kv_bytes": nbytes, "transfer_s": sec, "bw_GBps": bw, } print(f"L={L:>6} KV={nbytes/1e9:6.3f}GB t={sec*1000:7.2f}ms bw={bw:6.1f} GB/s", flush=True) json.dump(out, open(args.out, "w"), indent=2) print("wrote", args.out) if __name__ == "__main__": main()