Implements per-layer KV push during prefill (write mode) on vLLM's MooncakeConnector, env-gated by MOONCAKE_LAYERWISE=1. 2-instance microbench (mb7) shows correctness (KV lands, cached==prompt) and that the transfer is hidden behind prefill compute: critical-path overhead drops from O(KV size) (123/202/529ms for 8k/16k/32k) to a flat ~58ms (2-9x), with no prefill slowdown, on idle instances. Caveats: idle-only, chunked-prefill disabled, single concurrent transfer — see DESIGN.md. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
140 lines
3.4 KiB
JSON
140 lines
3.4 KiB
JSON
{
|
|
"mode": "baseline",
|
|
"model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
|
"raw": [
|
|
{
|
|
"t_prefill_s": 0.5736213000018324,
|
|
"t_xfer_s": 0.36388630099827424,
|
|
"t_total_s": 0.9375749369974073,
|
|
"cached": 8176,
|
|
"mode": "baseline",
|
|
"size": 8192,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 1.0551288530004967,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 0.5740011439993395,
|
|
"t_xfer_s": 0.12374231500143651,
|
|
"t_total_s": 0.6978207100000873,
|
|
"cached": 8176,
|
|
"mode": "baseline",
|
|
"size": 8192,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 0.5743715360003989,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 0.5732713990000775,
|
|
"t_xfer_s": 0.10885842400239198,
|
|
"t_total_s": 0.6821924389987544,
|
|
"cached": 8176,
|
|
"mode": "baseline",
|
|
"size": 8192,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 0.5745713680007611,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 1.4892208660021424,
|
|
"t_xfer_s": 0.2091717740004242,
|
|
"t_total_s": 1.6984740270017937,
|
|
"cached": 16368,
|
|
"mode": "baseline",
|
|
"size": 16384,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 1.4990949730017746,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 1.4885207330007688,
|
|
"t_xfer_s": 0.2010940889995254,
|
|
"t_total_s": 1.6896768289989268,
|
|
"cached": 16368,
|
|
"mode": "baseline",
|
|
"size": 16384,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 1.4898170189990196,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 1.4895933570005582,
|
|
"t_xfer_s": 0.2026357979993918,
|
|
"t_total_s": 1.6922962099997676,
|
|
"cached": 16368,
|
|
"mode": "baseline",
|
|
"size": 16384,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 1.4907751430000644,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 4.438586502998078,
|
|
"t_xfer_s": 0.37847799000155646,
|
|
"t_total_s": 4.817142683001293,
|
|
"cached": 32752,
|
|
"mode": "baseline",
|
|
"size": 32768,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 4.437922253000579,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 4.4350325649975275,
|
|
"t_xfer_s": 0.5313337980005599,
|
|
"t_total_s": 4.966431269000168,
|
|
"cached": 32752,
|
|
"mode": "baseline",
|
|
"size": 32768,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 4.437473922000208,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 4.436279826000828,
|
|
"t_xfer_s": 0.6335160570015432,
|
|
"t_total_s": 5.069869226001174,
|
|
"cached": 32752,
|
|
"mode": "baseline",
|
|
"size": 32768,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 4.440119222999783,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
}
|
|
],
|
|
"summary": [
|
|
{
|
|
"size": 8192,
|
|
"n": 3,
|
|
"pf_only_ms": 574.5713680007611,
|
|
"total_ms": 697.8207100000873,
|
|
"overhead_ms": 123.24934199932613,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 16384,
|
|
"n": 3,
|
|
"pf_only_ms": 1490.7751430000644,
|
|
"total_ms": 1692.2962099997676,
|
|
"overhead_ms": 201.52106699970318,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 32768,
|
|
"n": 3,
|
|
"pf_only_ms": 4437.922253000579,
|
|
"total_ms": 4966.431269000168,
|
|
"overhead_ms": 528.5090159995889,
|
|
"all_correct": true
|
|
}
|
|
]
|
|
} |