mb7 with background decode load (8/instance). Critical-path transfer overhead stays ~constant ~90ms for layerwise vs 158/239/749ms baseline (up to 7.9x at 32k), prefill not slowed, KV correct. Confirms the overlap holds on busy instances. DESIGN.md updated with idle-vs-load table + the two blockers (chunk-safety, concurrent-transfer safety) that the full 1200-req trace needs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
140 lines
3.4 KiB
JSON
140 lines
3.4 KiB
JSON
{
|
|
"mode": "baseline",
|
|
"model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
|
"raw": [
|
|
{
|
|
"t_prefill_s": 0.5868483350022871,
|
|
"t_xfer_s": 0.19584889299949282,
|
|
"t_total_s": 0.7827702419999696,
|
|
"cached": 8176,
|
|
"mode": "baseline",
|
|
"size": 8192,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 0.5920699099988269,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 0.5875704979989678,
|
|
"t_xfer_s": 0.1554814909977722,
|
|
"t_total_s": 0.7431365060001554,
|
|
"cached": 8176,
|
|
"mode": "baseline",
|
|
"size": 8192,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 0.5814537600017502,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 0.5852241569991747,
|
|
"t_xfer_s": 0.15129724399957922,
|
|
"t_total_s": 0.7365909610016388,
|
|
"cached": 8176,
|
|
"mode": "baseline",
|
|
"size": 8192,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 0.5846994370003813,
|
|
"kv_gib": 0.75,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 1.498547145001794,
|
|
"t_xfer_s": 0.2475714690008317,
|
|
"t_total_s": 1.7462187470009667,
|
|
"cached": 16368,
|
|
"mode": "baseline",
|
|
"size": 16384,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 1.5670790190015396,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 1.5025789940009417,
|
|
"t_xfer_s": 0.24532966799961287,
|
|
"t_total_s": 1.7479741930001182,
|
|
"cached": 16368,
|
|
"mode": "baseline",
|
|
"size": 16384,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 1.5008903820016712,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 1.5021674179988622,
|
|
"t_xfer_s": 0.24640760400143336,
|
|
"t_total_s": 1.7486415580024186,
|
|
"cached": 16368,
|
|
"mode": "baseline",
|
|
"size": 16384,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 1.509417139001016,
|
|
"kv_gib": 1.5,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 4.444555983998725,
|
|
"t_xfer_s": 0.4227471090016479,
|
|
"t_total_s": 4.86737214599998,
|
|
"cached": 32752,
|
|
"mode": "baseline",
|
|
"size": 32768,
|
|
"rep": 0,
|
|
"t_prefill_only_s": 4.4467717689985875,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 4.442135782999685,
|
|
"t_xfer_s": 0.7519038230020669,
|
|
"t_total_s": 5.194113359000767,
|
|
"cached": 32752,
|
|
"mode": "baseline",
|
|
"size": 32768,
|
|
"rep": 1,
|
|
"t_prefill_only_s": 4.445541313998547,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
},
|
|
{
|
|
"t_prefill_s": 4.439772993999213,
|
|
"t_xfer_s": 0.7855456319994119,
|
|
"t_total_s": 5.225392060998274,
|
|
"cached": 32752,
|
|
"mode": "baseline",
|
|
"size": 32768,
|
|
"rep": 2,
|
|
"t_prefill_only_s": 4.442906365002273,
|
|
"kv_gib": 3.0,
|
|
"correct": true
|
|
}
|
|
],
|
|
"summary": [
|
|
{
|
|
"size": 8192,
|
|
"n": 3,
|
|
"pf_only_ms": 584.6994370003813,
|
|
"total_ms": 743.1365060001554,
|
|
"overhead_ms": 158.43706899977406,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 16384,
|
|
"n": 3,
|
|
"pf_only_ms": 1509.417139001016,
|
|
"total_ms": 1747.9741930001182,
|
|
"overhead_ms": 238.5570539991022,
|
|
"all_correct": true
|
|
},
|
|
{
|
|
"size": 32768,
|
|
"n": 3,
|
|
"pf_only_ms": 4445.541313998547,
|
|
"total_ms": 5194.113359000767,
|
|
"overhead_ms": 748.57204500222,
|
|
"all_correct": true
|
|
}
|
|
]
|
|
} |