Implements per-layer KV push during prefill (write mode) on vLLM's MooncakeConnector, env-gated by MOONCAKE_LAYERWISE=1. 2-instance microbench (mb7) shows correctness (KV lands, cached==prompt) and that the transfer is hidden behind prefill compute: critical-path overhead drops from O(KV size) (123/202/529ms for 8k/16k/32k) to a flat ~58ms (2-9x), with no prefill slowdown, on idle instances. Caveats: idle-only, chunked-prefill disabled, single concurrent transfer — see DESIGN.md. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
140 lines
3.3 KiB
JSON
140 lines
3.3 KiB
JSON
{
|
|
"mode": "layerwise",
|
|
"model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
|
"raw": [
|
|
{
|
|
"t_A_s": 0.5749198459998297,
|
|
"t_B_s": 0.6508419569981925,
|
|
"t_total_s": 0.6509377910006151,
|
|
"cached": 8176,
|
|
"mode": "layerwise",
|
|
"size": 8192,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 1.0447357020020718,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 0.574626908000937,
|
|
"t_B_s": 0.6306310719992325,
|
|
"t_total_s": 0.6307087300010608,
|
|
"cached": 8176,
|
|
"mode": "layerwise",
|
|
"size": 8192,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 0.5731983850018878,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 0.5756587910000235,
|
|
"t_B_s": 0.6316753270002664,
|
|
"t_total_s": 0.6317471290021786,
|
|
"cached": 8176,
|
|
"mode": "layerwise",
|
|
"size": 8192,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 0.5737888650000968,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 1.4953326409995498,
|
|
"t_B_s": 1.5502465710014803,
|
|
"t_total_s": 1.5503262860001996,
|
|
"cached": 16368,
|
|
"mode": "layerwise",
|
|
"size": 16384,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 1.5000705940001353,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 1.493850356000621,
|
|
"t_B_s": 1.5505031290012994,
|
|
"t_total_s": 1.5505791659998067,
|
|
"cached": 16368,
|
|
"mode": "layerwise",
|
|
"size": 16384,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 1.4924546469992492,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 1.4979969070009247,
|
|
"t_B_s": 1.554968774002191,
|
|
"t_total_s": 1.5551903560008213,
|
|
"cached": 16368,
|
|
"mode": "layerwise",
|
|
"size": 16384,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 1.4914496510027675,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 4.4403588690001925,
|
|
"t_B_s": 4.496483378999983,
|
|
"t_total_s": 4.4965666819989565,
|
|
"cached": 32752,
|
|
"mode": "layerwise",
|
|
"size": 32768,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 4.440080869000667,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 4.44209005599987,
|
|
"t_B_s": 4.499940814999718,
|
|
"t_total_s": 4.500021006002498,
|
|
"cached": 32752,
|
|
"mode": "layerwise",
|
|
"size": 32768,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 4.440225810998527,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 4.437084657998639,
|
|
"t_B_s": 4.496842522999941,
|
|
"t_total_s": 4.496926485000586,
|
|
"cached": 32752,
|
|
"mode": "layerwise",
|
|
"size": 32768,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 4.439449855002749,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
}
|
|
],
|
|
"summary": [
|
|
{
|
|
"size": 8192,
|
|
"n": 3,
|
|
"pf_only_ms": 573.7888650000968,
|
|
"total_ms": 631.7471290021786,
|
|
"overhead_ms": 57.958264002081705,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 16384,
|
|
"n": 3,
|
|
"pf_only_ms": 1492.4546469992492,
|
|
"total_ms": 1550.5791659998067,
|
|
"overhead_ms": 58.124519000557484,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 32768,
|
|
"n": 3,
|
|
"pf_only_ms": 4440.080869000667,
|
|
"total_ms": 4496.926485000586,
|
|
"overhead_ms": 56.845615999918664,
|
|
"all_correct": true
|
|
}
|
|
]
|
|
} |