# agentic-kvc/v2/exp_a_tier_latency/pcie_transfer.py

"""Exp (a) backstop: direct CPU(DRAM)->GPU(HBM) KV-transfer cost.

Independent lower bound on a CPU-tier hit: fetching L tokens' KV over the
host<->device link. CPU_hit(L) >= GPU_hit(L) + KV_bytes(L) / BW_h2d.
Uses pinned host memory (best case for the offload tier, which pins buffers).
"""
from __future__ import annotations

import argparse
import json
import time

import torch

# Bytes of KV cache held per token (Qwen3-Coder, bf16): 96 KiB.
KV_BYTES_PER_TOKEN = 96 * 1024
# Token counts swept by the benchmark: powers of two from 1024 through 65536.
LENGTHS = [1 << exponent for exponent in range(10, 17)]


def time_h2d(nbytes: int, reps: int) -> float:
    """Return the median wall-clock time of one pinned-host -> GPU copy.

    Args:
        nbytes: Payload size in bytes. The buffers hold 2-byte elements, so
            an odd value is rounded down by one byte.
        reps: Number of timed copies; must be at least 1.

    Returns:
        Median duration of a single copy, in seconds. With an even ``reps``
        this is the upper of the two middle samples.

    Raises:
        ValueError: If ``reps`` is less than 1.
    """
    if reps < 1:
        # Checked before any allocation: with no samples there is no median,
        # and the old code only failed later with a bare IndexError.
        raise ValueError(f"reps must be >= 1, got {reps}")

    # Only the element width matters for a raw copy; float16 is 2 bytes wide,
    # the same as the bf16 KV entries this stands in for.
    n = nbytes // 2
    host = torch.empty(n, dtype=torch.float16, pin_memory=True)
    dev = torch.empty(n, dtype=torch.float16, device="cuda")

    # Warm-up copies; their durations are not recorded.
    for _ in range(3):
        dev.copy_(host, non_blocking=True)
    torch.cuda.synchronize()

    ts = []
    for _ in range(reps):
        t0 = time.perf_counter()
        dev.copy_(host, non_blocking=True)
        # The copy is issued with non_blocking=True, so wait for it to finish
        # before stopping the clock.
        torch.cuda.synchronize()
        ts.append(time.perf_counter() - t0)
    ts.sort()
    return ts[len(ts) // 2]


def main() -> None:
    """Time the host->device copy for every length in LENGTHS and save JSON.

    Command-line arguments:
        --reps: timed copies per length (default 20, must be >= 1).
        --out:  path of the JSON file to write (required).

    For each length the KV payload size, the median transfer time and the
    resulting bandwidth in GB/s are printed and stored under "by_length",
    keyed by the length as a string.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--reps", type=int, default=20)
    ap.add_argument("--out", required=True)
    args = ap.parse_args()
    if args.reps < 1:
        # Reject up front instead of failing after buffers are allocated.
        ap.error("--reps must be >= 1")
    # Explicit raise rather than assert: asserts are stripped under `python -O`.
    if not torch.cuda.is_available():
        raise RuntimeError("need a GPU")
    device_name = torch.cuda.get_device_name(0)
    print("device:", device_name)

    out = {"device": device_name, "by_length": {}}
    for length in LENGTHS:
        nbytes = length * KV_BYTES_PER_TOKEN
        sec = time_h2d(nbytes, args.reps)
        bw = nbytes / sec / 1e9  # bytes/s -> GB/s
        out["by_length"][str(length)] = {
            "kv_bytes": nbytes, "transfer_s": sec, "bw_GBps": bw,
        }
        print(f"L={length:>6}  KV={nbytes/1e9:6.3f}GB  t={sec*1000:7.2f}ms  bw={bw:6.1f} GB/s", flush=True)
    # Context manager so the file is flushed and closed before reporting success.
    with open(args.out, "w", encoding="utf-8") as fh:
        json.dump(out, fh, indent=2)
    print("wrote", args.out)


if __name__ == "__main__":
    main()