Files
agentic-kvc/microbench/connector_tax/layerwise/results/mb7_layerwise.json
Gahow Wang fec50fa45d Layerwise KV transfer on Mooncake: PoC + microbench (worktree exploration)
Implements per-layer KV push during prefill (write mode) on vLLM's
MooncakeConnector, env-gated by MOONCAKE_LAYERWISE=1. 2-instance microbench
(mb7) shows correctness (KV lands, cached==prompt) and that the transfer is
hidden behind prefill compute: critical-path overhead drops from O(KV size)
(123/202/529ms for 8k/16k/32k) to a flat ~58ms (2-9x), with no prefill
slowdown, on idle instances. Caveats: idle-only, chunked-prefill disabled,
single concurrent transfer — see DESIGN.md.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 15:34:43 +08:00

140 lines
3.3 KiB
JSON

{
"mode": "layerwise",
"model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct",
"raw": [
{
"t_A_s": 0.5749198459998297,
"t_B_s": 0.6508419569981925,
"t_total_s": 0.6509377910006151,
"cached": 8176,
"mode": "layerwise",
"size": 8192,
"rep": 0,
"t_prefill_only_s": 1.0447357020020718,
"kv_gib": 0.75,
"correct": true
},
{
"t_A_s": 0.574626908000937,
"t_B_s": 0.6306310719992325,
"t_total_s": 0.6307087300010608,
"cached": 8176,
"mode": "layerwise",
"size": 8192,
"rep": 1,
"t_prefill_only_s": 0.5731983850018878,
"kv_gib": 0.75,
"correct": true
},
{
"t_A_s": 0.5756587910000235,
"t_B_s": 0.6316753270002664,
"t_total_s": 0.6317471290021786,
"cached": 8176,
"mode": "layerwise",
"size": 8192,
"rep": 2,
"t_prefill_only_s": 0.5737888650000968,
"kv_gib": 0.75,
"correct": true
},
{
"t_A_s": 1.4953326409995498,
"t_B_s": 1.5502465710014803,
"t_total_s": 1.5503262860001996,
"cached": 16368,
"mode": "layerwise",
"size": 16384,
"rep": 0,
"t_prefill_only_s": 1.5000705940001353,
"kv_gib": 1.5,
"correct": true
},
{
"t_A_s": 1.493850356000621,
"t_B_s": 1.5505031290012994,
"t_total_s": 1.5505791659998067,
"cached": 16368,
"mode": "layerwise",
"size": 16384,
"rep": 1,
"t_prefill_only_s": 1.4924546469992492,
"kv_gib": 1.5,
"correct": true
},
{
"t_A_s": 1.4979969070009247,
"t_B_s": 1.554968774002191,
"t_total_s": 1.5551903560008213,
"cached": 16368,
"mode": "layerwise",
"size": 16384,
"rep": 2,
"t_prefill_only_s": 1.4914496510027675,
"kv_gib": 1.5,
"correct": true
},
{
"t_A_s": 4.4403588690001925,
"t_B_s": 4.496483378999983,
"t_total_s": 4.4965666819989565,
"cached": 32752,
"mode": "layerwise",
"size": 32768,
"rep": 0,
"t_prefill_only_s": 4.440080869000667,
"kv_gib": 3.0,
"correct": true
},
{
"t_A_s": 4.44209005599987,
"t_B_s": 4.499940814999718,
"t_total_s": 4.500021006002498,
"cached": 32752,
"mode": "layerwise",
"size": 32768,
"rep": 1,
"t_prefill_only_s": 4.440225810998527,
"kv_gib": 3.0,
"correct": true
},
{
"t_A_s": 4.437084657998639,
"t_B_s": 4.496842522999941,
"t_total_s": 4.496926485000586,
"cached": 32752,
"mode": "layerwise",
"size": 32768,
"rep": 2,
"t_prefill_only_s": 4.439449855002749,
"kv_gib": 3.0,
"correct": true
}
],
"summary": [
{
"size": 8192,
"n": 3,
"pf_only_ms": 573.7888650000968,
"total_ms": 631.7471290021786,
"overhead_ms": 57.958264002081705,
"all_correct": true
},
{
"size": 16384,
"n": 3,
"pf_only_ms": 1492.4546469992492,
"total_ms": 1550.5791659998067,
"overhead_ms": 58.124519000557484,
"all_correct": true
},
{
"size": 32768,
"n": 3,
"pf_only_ms": 4440.080869000667,
"total_ms": 4496.926485000586,
"overhead_ms": 56.845615999918664,
"all_correct": true
}
]
}