Files
agentic-kvc/microbench/connector_tax/layerwise/results/mb7_baseline.json
Gahow Wang fec50fa45d Layerwise KV transfer on Mooncake: PoC + microbench (worktree exploration)
Implements per-layer KV push during prefill (write mode) on vLLM's
MooncakeConnector, env-gated by MOONCAKE_LAYERWISE=1. A two-instance
microbench (mb7) shows correctness (KV lands, cached==prompt) and that the
transfer is hidden behind prefill compute: critical-path overhead (total
time minus prefill-only time) drops from O(KV size) (123/202/529 ms for
8k/16k/32k) to a flat ~58 ms (a 2-9x reduction), with no prefill slowdown,
on idle instances. Caveats: measured on idle instances only, with chunked
prefill disabled and only a single transfer in flight at a time — see
DESIGN.md.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 15:34:43 +08:00

140 lines
3.4 KiB
JSON

{
"mode": "baseline",
"model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct",
"raw": [
{
"t_prefill_s": 0.5736213000018324,
"t_xfer_s": 0.36388630099827424,
"t_total_s": 0.9375749369974073,
"cached": 8176,
"mode": "baseline",
"size": 8192,
"rep": 0,
"t_prefill_only_s": 1.0551288530004967,
"kv_gib": 0.75,
"correct": true
},
{
"t_prefill_s": 0.5740011439993395,
"t_xfer_s": 0.12374231500143651,
"t_total_s": 0.6978207100000873,
"cached": 8176,
"mode": "baseline",
"size": 8192,
"rep": 1,
"t_prefill_only_s": 0.5743715360003989,
"kv_gib": 0.75,
"correct": true
},
{
"t_prefill_s": 0.5732713990000775,
"t_xfer_s": 0.10885842400239198,
"t_total_s": 0.6821924389987544,
"cached": 8176,
"mode": "baseline",
"size": 8192,
"rep": 2,
"t_prefill_only_s": 0.5745713680007611,
"kv_gib": 0.75,
"correct": true
},
{
"t_prefill_s": 1.4892208660021424,
"t_xfer_s": 0.2091717740004242,
"t_total_s": 1.6984740270017937,
"cached": 16368,
"mode": "baseline",
"size": 16384,
"rep": 0,
"t_prefill_only_s": 1.4990949730017746,
"kv_gib": 1.5,
"correct": true
},
{
"t_prefill_s": 1.4885207330007688,
"t_xfer_s": 0.2010940889995254,
"t_total_s": 1.6896768289989268,
"cached": 16368,
"mode": "baseline",
"size": 16384,
"rep": 1,
"t_prefill_only_s": 1.4898170189990196,
"kv_gib": 1.5,
"correct": true
},
{
"t_prefill_s": 1.4895933570005582,
"t_xfer_s": 0.2026357979993918,
"t_total_s": 1.6922962099997676,
"cached": 16368,
"mode": "baseline",
"size": 16384,
"rep": 2,
"t_prefill_only_s": 1.4907751430000644,
"kv_gib": 1.5,
"correct": true
},
{
"t_prefill_s": 4.438586502998078,
"t_xfer_s": 0.37847799000155646,
"t_total_s": 4.817142683001293,
"cached": 32752,
"mode": "baseline",
"size": 32768,
"rep": 0,
"t_prefill_only_s": 4.437922253000579,
"kv_gib": 3.0,
"correct": true
},
{
"t_prefill_s": 4.4350325649975275,
"t_xfer_s": 0.5313337980005599,
"t_total_s": 4.966431269000168,
"cached": 32752,
"mode": "baseline",
"size": 32768,
"rep": 1,
"t_prefill_only_s": 4.437473922000208,
"kv_gib": 3.0,
"correct": true
},
{
"t_prefill_s": 4.436279826000828,
"t_xfer_s": 0.6335160570015432,
"t_total_s": 5.069869226001174,
"cached": 32752,
"mode": "baseline",
"size": 32768,
"rep": 2,
"t_prefill_only_s": 4.440119222999783,
"kv_gib": 3.0,
"correct": true
}
],
"summary": [
{
"size": 8192,
"n": 3,
"pf_only_ms": 574.5713680007611,
"total_ms": 697.8207100000873,
"overhead_ms": 123.24934199932613,
"all_correct": true
},
{
"size": 16384,
"n": 3,
"pf_only_ms": 1490.7751430000644,
"total_ms": 1692.2962099997676,
"overhead_ms": 201.52106699970318,
"all_correct": true
},
{
"size": 32768,
"n": 3,
"pf_only_ms": 4437.922253000579,
"total_ms": 4966.431269000168,
"overhead_ms": 528.5090159995889,
"all_correct": true
}
]
}