Measures TTFT to serve a reused prefix of length L from each KV tier on a single H20 (Qwen3-Coder-30B-A3B, vLLM 0.18.1): miss (recompute), CPU-tier hit (native DRAM offload), GPU-tier hit (HBM prefix cache). Each measured request is bracketed by /metrics scrapes so the tier is verified (vllm:prefix_cache_hits vs external_prefix_cache_hits), not assumed. Result: GPU hit is ~flat (42->111 ms over 1k->64k tokens); CPU hit is transfer-bound (PCIe H2D ~54 GB/s, 57->272 ms); miss grows superlinearly (78 ms -> 15.2 s). GPU beats CPU 1.4-2.5x (gap grows with context); miss/CPU up to 56x, miss/GPU up to 137x. pcie_transfer.py is the independent CPU-hit floor backstop. Evidence for the GPU-hit-first principle (paper section 2.2). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
533 lines
13 KiB
JSON
533 lines
13 KiB
JSON
{
|
|
"mode": "gpu",
|
|
"reps": 8,
|
|
"by_length": {
|
|
"1024": {
|
|
"n": 8,
|
|
"ttft_p50": 0.04180275100225117,
|
|
"ttft_mean": 0.05689269150025211,
|
|
"ttft_min": 0.041313502995762974,
|
|
"ttft_max": 0.1606091230059974,
|
|
"tier_observed": "gpu",
|
|
"verified_frac": 1.0
|
|
},
|
|
"2048": {
|
|
"n": 8,
|
|
"ttft_p50": 0.044922845510882325,
|
|
"ttft_mean": 0.04646045462868642,
|
|
"ttft_min": 0.04261300901998766,
|
|
"ttft_max": 0.06082483098725788,
|
|
"tier_observed": "gpu",
|
|
"verified_frac": 1.0
|
|
},
|
|
"4096": {
|
|
"n": 8,
|
|
"ttft_p50": 0.0462174300046172,
|
|
"ttft_mean": 0.04691218675361597,
|
|
"ttft_min": 0.044408742018276826,
|
|
"ttft_max": 0.05101387499598786,
|
|
"tier_observed": "gpu",
|
|
"verified_frac": 1.0
|
|
},
|
|
"8192": {
|
|
"n": 8,
|
|
"ttft_p50": 0.052487702007056214,
|
|
"ttft_mean": 0.05252782000388834,
|
|
"ttft_min": 0.050384567002765834,
|
|
"ttft_max": 0.055209266021847725,
|
|
"tier_observed": "gpu",
|
|
"verified_frac": 1.0
|
|
},
|
|
"16384": {
|
|
"n": 8,
|
|
"ttft_p50": 0.06340778700541705,
|
|
"ttft_mean": 0.06307360512437299,
|
|
"ttft_min": 0.059953891002805904,
|
|
"ttft_max": 0.06587072199909016,
|
|
"tier_observed": "gpu",
|
|
"verified_frac": 1.0
|
|
},
|
|
"32768": {
|
|
"n": 8,
|
|
"ttft_p50": 0.07986902150150854,
|
|
"ttft_mean": 0.08412684850554797,
|
|
"ttft_min": 0.07615292401169427,
|
|
"ttft_max": 0.11761908099288121,
|
|
"tier_observed": "gpu",
|
|
"verified_frac": 1.0
|
|
},
|
|
"65536": {
|
|
"n": 8,
|
|
"ttft_p50": 0.11140661900572013,
|
|
"ttft_mean": 0.10751268000240088,
|
|
"ttft_min": 0.07390080401091836,
|
|
"ttft_max": 0.1206158839922864,
|
|
"tier_observed": "gpu",
|
|
"verified_frac": 1.0
|
|
}
|
|
},
|
|
"raw": {
|
|
"1024": [
|
|
{
|
|
"ttft_s": 0.1606091230059974,
|
|
"e2e_s": 0.16078226300305687,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.043477901024743915,
|
|
"e2e_s": 0.0436010490229819,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04137404798530042,
|
|
"e2e_s": 0.04146770399529487,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04179324599681422,
|
|
"e2e_s": 0.041887808009050786,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04326947100344114,
|
|
"e2e_s": 0.04335355598595925,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04149198398226872,
|
|
"e2e_s": 0.04157822398701683,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04181225600768812,
|
|
"e2e_s": 0.04190706100780517,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.041313502995762974,
|
|
"e2e_s": 0.041313502995762974,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 1008.0,
|
|
"d_ext_hits": 0.0
|
|
}
|
|
],
|
|
"2048": [
|
|
{
|
|
"ttft_s": 0.04491939002764411,
|
|
"e2e_s": 0.045019031007541344,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.045014784001978114,
|
|
"e2e_s": 0.04511277299025096,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04492630099412054,
|
|
"e2e_s": 0.04502850098651834,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04566141500254162,
|
|
"e2e_s": 0.04576313399593346,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04261300901998766,
|
|
"e2e_s": 0.04271370900096372,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.06082483098725788,
|
|
"e2e_s": 0.06096197199076414,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04355804901570082,
|
|
"e2e_s": 0.04355804901570082,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.044165857980260625,
|
|
"e2e_s": 0.044268568977713585,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 2032.0,
|
|
"d_ext_hits": 0.0
|
|
}
|
|
],
|
|
"4096": [
|
|
{
|
|
"ttft_s": 0.05101387499598786,
|
|
"e2e_s": 0.051123478973750025,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.044408742018276826,
|
|
"e2e_s": 0.044408742018276826,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04538871700060554,
|
|
"e2e_s": 0.045498208986828104,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04754545699688606,
|
|
"e2e_s": 0.047664124984294176,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04840670898556709,
|
|
"e2e_s": 0.04840670898556709,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.0462190090038348,
|
|
"e2e_s": 0.04632823000429198,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.04609913402236998,
|
|
"e2e_s": 0.046204126003431156,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.0462158510053996,
|
|
"e2e_s": 0.0462158510053996,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 4080.0,
|
|
"d_ext_hits": 0.0
|
|
}
|
|
],
|
|
"8192": [
|
|
{
|
|
"ttft_s": 0.05042222701013088,
|
|
"e2e_s": 0.05053543800022453,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.05319672200130299,
|
|
"e2e_s": 0.053308423986891285,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.05063424099353142,
|
|
"e2e_s": 0.05073276098119095,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.050384567002765834,
|
|
"e2e_s": 0.05048462699051015,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.055105848994571716,
|
|
"e2e_s": 0.055215683998540044,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.05349100599414669,
|
|
"e2e_s": 0.053595816978486255,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.05177868201280944,
|
|
"e2e_s": 0.05188246400211938,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.055209266021847725,
|
|
"e2e_s": 0.05531894601881504,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 8176.0,
|
|
"d_ext_hits": 0.0
|
|
}
|
|
],
|
|
"16384": [
|
|
{
|
|
"ttft_s": 0.0633803239907138,
|
|
"e2e_s": 0.06349112599855289,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.06337857199832797,
|
|
"e2e_s": 0.06350608498905785,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.06098292299429886,
|
|
"e2e_s": 0.061115075019188225,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.06343525002012029,
|
|
"e2e_s": 0.06355450401315466,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.0636955969966948,
|
|
"e2e_s": 0.0636955969966948,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.06389156199293211,
|
|
"e2e_s": 0.06389156199293211,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.06587072199909016,
|
|
"e2e_s": 0.06587072199909016,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.059953891002805904,
|
|
"e2e_s": 0.060058912000386044,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 16368.0,
|
|
"d_ext_hits": 0.0
|
|
}
|
|
],
|
|
"32768": [
|
|
{
|
|
"ttft_s": 0.07615292401169427,
|
|
"e2e_s": 0.07625289200223051,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.07992992899380624,
|
|
"e2e_s": 0.0800386439950671,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.07980811400921084,
|
|
"e2e_s": 0.07995001602103002,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.11761908099288121,
|
|
"e2e_s": 0.11776423300034367,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.07834753501811065,
|
|
"e2e_s": 0.07834753501811065,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.0814115820103325,
|
|
"e2e_s": 0.0814115820103325,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.08212830501724966,
|
|
"e2e_s": 0.08224253499065526,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.07761731799109839,
|
|
"e2e_s": 0.07772363899857737,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 32752.0,
|
|
"d_ext_hits": 0.0
|
|
}
|
|
],
|
|
"65536": [
|
|
{
|
|
"ttft_s": 0.1206158839922864,
|
|
"e2e_s": 0.1206158839922864,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.10727833199780434,
|
|
"e2e_s": 0.10727833199780434,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.11010084400186315,
|
|
"e2e_s": 0.11023741000099108,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.11566799599677324,
|
|
"e2e_s": 0.1157765949901659,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.11422122400836088,
|
|
"e2e_s": 0.11422122400836088,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.07390080401091836,
|
|
"e2e_s": 0.07390080401091836,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.11271239400957711,
|
|
"e2e_s": 0.11271239400957711,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
},
|
|
{
|
|
"ttft_s": 0.10560396200162359,
|
|
"e2e_s": 0.10572021701955236,
|
|
"tier_observed": "gpu",
|
|
"expect": "gpu",
|
|
"d_gpu_hits": 65520.0,
|
|
"d_ext_hits": 0.0
|
|
}
|
|
]
|
|
}
|
|
} |