mb7 with background decode load (8/instance). Critical-path transfer overhead stays ~constant ~90ms for layerwise vs 158/239/749ms baseline (up to 7.9x at 32k), prefill not slowed, KV correct. Confirms the overlap holds on busy instances. DESIGN.md updated with idle-vs-load table + the two blockers (chunk-safety, concurrent-transfer safety) that the full 1200-req trace needs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
140 lines
3.3 KiB
JSON
140 lines
3.3 KiB
JSON
{
|
|
"mode": "layerwise",
|
|
"model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
|
"raw": [
|
|
{
|
|
"t_A_s": 0.5905098549992545,
|
|
"t_B_s": 0.6900827390018094,
|
|
"t_total_s": 0.6904724189989793,
|
|
"cached": 8176,
|
|
"mode": "layerwise",
|
|
"size": 8192,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 0.5852864849985053,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 0.5897548109969648,
|
|
"t_B_s": 0.6827381169969158,
|
|
"t_total_s": 0.6828304180016858,
|
|
"cached": 8176,
|
|
"mode": "layerwise",
|
|
"size": 8192,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 0.5890174580017629,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 0.5850713190011447,
|
|
"t_B_s": 0.6744917560026806,
|
|
"t_total_s": 0.6745770380002796,
|
|
"cached": 8176,
|
|
"mode": "layerwise",
|
|
"size": 8192,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 0.5943713950000529,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 1.5030149390004226,
|
|
"t_B_s": 1.596173029000056,
|
|
"t_total_s": 1.597060264000902,
|
|
"cached": 16368,
|
|
"mode": "layerwise",
|
|
"size": 16384,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 1.5130829510017065,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 1.499876754998695,
|
|
"t_B_s": 1.5940461120007967,
|
|
"t_total_s": 1.5948001770011615,
|
|
"cached": 16368,
|
|
"mode": "layerwise",
|
|
"size": 16384,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 1.5024838620010996,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 1.5068977490009274,
|
|
"t_B_s": 1.5950395179970656,
|
|
"t_total_s": 1.59571184500237,
|
|
"cached": 16368,
|
|
"mode": "layerwise",
|
|
"size": 16384,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 1.5303227439981129,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 4.4503932609986805,
|
|
"t_B_s": 4.538851200999488,
|
|
"t_total_s": 4.539281312001549,
|
|
"cached": 32752,
|
|
"mode": "layerwise",
|
|
"size": 32768,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 4.446753306998289,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 4.44226107799841,
|
|
"t_B_s": 4.551636377997056,
|
|
"t_total_s": 4.552389411001059,
|
|
"cached": 32752,
|
|
"mode": "layerwise",
|
|
"size": 32768,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 4.44538704000297,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_A_s": 4.440309538000292,
|
|
"t_B_s": 4.539836316998844,
|
|
"t_total_s": 4.540553365997766,
|
|
"cached": 32752,
|
|
"mode": "layerwise",
|
|
"size": 32768,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 4.443476915999781,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
}
|
|
],
|
|
"summary": [
|
|
{
|
|
"size": 8192,
|
|
"n": 3,
|
|
"pf_only_ms": 589.0174580017629,
|
|
"total_ms": 682.8304180016858,
|
|
"overhead_ms": 93.8129599999229,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 16384,
|
|
"n": 3,
|
|
"pf_only_ms": 1513.0829510017065,
|
|
"total_ms": 1595.71184500237,
|
|
"overhead_ms": 82.62889400066342,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 32768,
|
|
"n": 3,
|
|
"pf_only_ms": 4445.38704000297,
|
|
"total_ms": 4540.553365997766,
|
|
"overhead_ms": 95.16632599479635,
|
|
"all_correct": true
|
|
}
|
|
]
|
|
} |