Files
agentic-kvc/v2/exp_a_tier_latency/results/gpu.json
Gahow Wang 837df6bc9e v2 exp(a): three-tier KV-hit latency microbench (GPU >> CPU >> miss)
Measures TTFT to serve a reused prefix of length L from each KV tier on a
single H20 (Qwen3-Coder-30B-A3B, vLLM 0.18.1): miss (recompute), CPU-tier
hit (native DRAM offload), GPU-tier hit (HBM prefix cache). Each measured
request is bracketed by /metrics scrapes so the tier is verified
(vllm:prefix_cache_hits vs external_prefix_cache_hits), not assumed.

Result: GPU hit is ~flat (42->111 ms over 1k->64k tokens); CPU hit is
transfer-bound (PCIe H2D ~54 GB/s, 57->272 ms); miss grows superlinearly
(78 ms -> 15.2 s). GPU beats CPU 1.4-2.5x (gap grows with context);
miss/CPU up to 56x, miss/GPU up to 137x. pcie_transfer.py is the
independent CPU-hit floor backstop. Evidence for the GPU-hit-first
principle (paper section 2.2).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-30 11:23:04 +08:00

533 lines
13 KiB
JSON

{
"mode": "gpu",
"reps": 8,
"by_length": {
"1024": {
"n": 8,
"ttft_p50": 0.04180275100225117,
"ttft_mean": 0.05689269150025211,
"ttft_min": 0.041313502995762974,
"ttft_max": 0.1606091230059974,
"tier_observed": "gpu",
"verified_frac": 1.0
},
"2048": {
"n": 8,
"ttft_p50": 0.044922845510882325,
"ttft_mean": 0.04646045462868642,
"ttft_min": 0.04261300901998766,
"ttft_max": 0.06082483098725788,
"tier_observed": "gpu",
"verified_frac": 1.0
},
"4096": {
"n": 8,
"ttft_p50": 0.0462174300046172,
"ttft_mean": 0.04691218675361597,
"ttft_min": 0.044408742018276826,
"ttft_max": 0.05101387499598786,
"tier_observed": "gpu",
"verified_frac": 1.0
},
"8192": {
"n": 8,
"ttft_p50": 0.052487702007056214,
"ttft_mean": 0.05252782000388834,
"ttft_min": 0.050384567002765834,
"ttft_max": 0.055209266021847725,
"tier_observed": "gpu",
"verified_frac": 1.0
},
"16384": {
"n": 8,
"ttft_p50": 0.06340778700541705,
"ttft_mean": 0.06307360512437299,
"ttft_min": 0.059953891002805904,
"ttft_max": 0.06587072199909016,
"tier_observed": "gpu",
"verified_frac": 1.0
},
"32768": {
"n": 8,
"ttft_p50": 0.07986902150150854,
"ttft_mean": 0.08412684850554797,
"ttft_min": 0.07615292401169427,
"ttft_max": 0.11761908099288121,
"tier_observed": "gpu",
"verified_frac": 1.0
},
"65536": {
"n": 8,
"ttft_p50": 0.11140661900572013,
"ttft_mean": 0.10751268000240088,
"ttft_min": 0.07390080401091836,
"ttft_max": 0.1206158839922864,
"tier_observed": "gpu",
"verified_frac": 1.0
}
},
"raw": {
"1024": [
{
"ttft_s": 0.1606091230059974,
"e2e_s": 0.16078226300305687,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.043477901024743915,
"e2e_s": 0.0436010490229819,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04137404798530042,
"e2e_s": 0.04146770399529487,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04179324599681422,
"e2e_s": 0.041887808009050786,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04326947100344114,
"e2e_s": 0.04335355598595925,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04149198398226872,
"e2e_s": 0.04157822398701683,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04181225600768812,
"e2e_s": 0.04190706100780517,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.041313502995762974,
"e2e_s": 0.041313502995762974,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 1008.0,
"d_ext_hits": 0.0
}
],
"2048": [
{
"ttft_s": 0.04491939002764411,
"e2e_s": 0.045019031007541344,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.045014784001978114,
"e2e_s": 0.04511277299025096,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04492630099412054,
"e2e_s": 0.04502850098651834,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04566141500254162,
"e2e_s": 0.04576313399593346,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04261300901998766,
"e2e_s": 0.04271370900096372,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.06082483098725788,
"e2e_s": 0.06096197199076414,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04355804901570082,
"e2e_s": 0.04355804901570082,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.044165857980260625,
"e2e_s": 0.044268568977713585,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 2032.0,
"d_ext_hits": 0.0
}
],
"4096": [
{
"ttft_s": 0.05101387499598786,
"e2e_s": 0.051123478973750025,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.044408742018276826,
"e2e_s": 0.044408742018276826,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04538871700060554,
"e2e_s": 0.045498208986828104,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04754545699688606,
"e2e_s": 0.047664124984294176,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04840670898556709,
"e2e_s": 0.04840670898556709,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.0462190090038348,
"e2e_s": 0.04632823000429198,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.04609913402236998,
"e2e_s": 0.046204126003431156,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.0462158510053996,
"e2e_s": 0.0462158510053996,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 4080.0,
"d_ext_hits": 0.0
}
],
"8192": [
{
"ttft_s": 0.05042222701013088,
"e2e_s": 0.05053543800022453,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.05319672200130299,
"e2e_s": 0.053308423986891285,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.05063424099353142,
"e2e_s": 0.05073276098119095,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.050384567002765834,
"e2e_s": 0.05048462699051015,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.055105848994571716,
"e2e_s": 0.055215683998540044,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.05349100599414669,
"e2e_s": 0.053595816978486255,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.05177868201280944,
"e2e_s": 0.05188246400211938,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.055209266021847725,
"e2e_s": 0.05531894601881504,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 8176.0,
"d_ext_hits": 0.0
}
],
"16384": [
{
"ttft_s": 0.0633803239907138,
"e2e_s": 0.06349112599855289,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.06337857199832797,
"e2e_s": 0.06350608498905785,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.06098292299429886,
"e2e_s": 0.061115075019188225,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.06343525002012029,
"e2e_s": 0.06355450401315466,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.0636955969966948,
"e2e_s": 0.0636955969966948,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.06389156199293211,
"e2e_s": 0.06389156199293211,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.06587072199909016,
"e2e_s": 0.06587072199909016,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.059953891002805904,
"e2e_s": 0.060058912000386044,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 16368.0,
"d_ext_hits": 0.0
}
],
"32768": [
{
"ttft_s": 0.07615292401169427,
"e2e_s": 0.07625289200223051,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.07992992899380624,
"e2e_s": 0.0800386439950671,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.07980811400921084,
"e2e_s": 0.07995001602103002,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.11761908099288121,
"e2e_s": 0.11776423300034367,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.07834753501811065,
"e2e_s": 0.07834753501811065,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.0814115820103325,
"e2e_s": 0.0814115820103325,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.08212830501724966,
"e2e_s": 0.08224253499065526,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.07761731799109839,
"e2e_s": 0.07772363899857737,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 32752.0,
"d_ext_hits": 0.0
}
],
"65536": [
{
"ttft_s": 0.1206158839922864,
"e2e_s": 0.1206158839922864,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.10727833199780434,
"e2e_s": 0.10727833199780434,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.11010084400186315,
"e2e_s": 0.11023741000099108,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.11566799599677324,
"e2e_s": 0.1157765949901659,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.11422122400836088,
"e2e_s": 0.11422122400836088,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.07390080401091836,
"e2e_s": 0.07390080401091836,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.11271239400957711,
"e2e_s": 0.11271239400957711,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
},
{
"ttft_s": 0.10560396200162359,
"e2e_s": 0.10572021701955236,
"tier_observed": "gpu",
"expect": "gpu",
"d_gpu_hits": 65520.0,
"d_ext_hits": 0.0
}
]
}
}