From e77bdcac5a4be26a2167b42bc0b61c9669bcfe69 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Thu, 28 May 2026 16:30:14 +0800 Subject: [PATCH] Layerwise under load: overlap benefit survives (bg=16) mb7 with background decode load (8/instance). Critical-path transfer overhead stays ~constant ~90ms for layerwise vs 158/239/749ms baseline (up to 7.9x at 32k), prefill not slowed, KV correct. Confirms the overlap holds on busy instances. DESIGN.md updated with idle-vs-load table + the two blockers (chunk-safety, concurrent-transfer safety) that the full 1200-req trace needs. Co-Authored-By: Claude Opus 4.7 --- microbench/connector_tax/layerwise/DESIGN.md | 40 ++++- .../connector_tax/layerwise/mb7_layerwise.py | 72 ++++++++- .../layerwise/results/mb7_baseline_bg16.json | 140 ++++++++++++++++++ .../layerwise/results/mb7_layerwise_bg16.json | 140 ++++++++++++++++++ microbench/connector_tax/layerwise/run_mb7.sh | 4 +- 5 files changed, 387 insertions(+), 9 deletions(-) create mode 100644 microbench/connector_tax/layerwise/results/mb7_baseline_bg16.json create mode 100644 microbench/connector_tax/layerwise/results/mb7_layerwise_bg16.json diff --git a/microbench/connector_tax/layerwise/DESIGN.md b/microbench/connector_tax/layerwise/DESIGN.md index c5549a9..dd2e07c 100644 --- a/microbench/connector_tax/layerwise/DESIGN.md +++ b/microbench/connector_tax/layerwise/DESIGN.md @@ -114,11 +114,39 @@ Key signatures: (`_handle_combined_pd_sep_v2`) dispatches sequentially and would need the write-mode (concurrent) restructure. 
+## Results under LOAD (bg=16 background decode streams, 8 per instance) + +Critical-path transfer overhead (ms), `total − prefill_only`: + +| KV size | idle base | idle LW | **load base** | **load LW** | +|--------:|----------:|--------:|--------------:|------------:| +| 8k | 123 | 58 | 158 | **94** | +| 16k | 202 | 58 | 239 | **83** | +| 32k | 529 | 57 | **749** | **95** | + +The overlap **survives load**: layerwise overhead stays ~constant across KV +sizes (~90 ms under load vs ~58 ms idle) while baseline grows to 749 ms at 32k +(7.9× reduction). Prefill did not slow (load LW `t_A` ≈ load `prefill_only`); +the transfer (0.56/1.46/4.37 s, producer logs) ran inside the prefill window +even with 16 concurrent decodes. Correctness PASS under load. + ## Verdict -The mechanism **works and delivers the predicted benefit**: layer-wise push -turns migration's KV-transfer cost from O(KV size) on the critical path into a -near-constant tail, by overlapping it with prefill compute — exactly what -MoRIIO's write mode does on AMD, now demonstrated on NVIDIA/Mooncake. Whether -it flips agentic *migration* to net-positive still depends on the busy-instance -behavior (caveat 1) and is the next experiment. +The mechanism **works and the benefit holds under load**: layer-wise push turns +migration's KV-transfer cost from O(KV size) on the critical path into a +near-constant ~90 ms tail, by overlapping it with prefill compute — what +MoRIIO's write mode does on AMD, now demonstrated on NVIDIA/Mooncake. + +**BUT this is single-transfer, non-chunked.** Running the actual 1200-req trace +correctly needs two more pieces this PoC does NOT have: +1. **Chunk-safe tracking** — long agentic prompts force chunked prefill; + `save_kv_layer` then fires per-chunk and the monotonic counter would ship + uncomputed blocks. Needs slot-mapping-aware per-(request,chunk) tracking. +2. **Concurrent-transfer safety** — the global counter assumes one producer at + a time; the trace migrates from busy instances running other forwards. 
+ +Also: even with those fixed, layer-wise only removes the **transfer half** of +the measured migration overhead. The b3_v3_fullbreak profile showed dst-side +`T_kv_pull` = ~55% RDMA + ~45% control-plane GIL-dispatch stalls; layer-wise +hides the RDMA half but the control-plane half is orthogonal. So a trace +re-profile would show roughly the transfer half collapse, not the whole thing. diff --git a/microbench/connector_tax/layerwise/mb7_layerwise.py b/microbench/connector_tax/layerwise/mb7_layerwise.py index c36dfe5..de436af 100644 --- a/microbench/connector_tax/layerwise/mb7_layerwise.py +++ b/microbench/connector_tax/layerwise/mb7_layerwise.py @@ -72,6 +72,56 @@ def cached_of(resp) -> int: return det.get("cached_tokens", 0) or usage.get("cached_tokens", 0) or 0 +async def _stream_completion(client, host, port, prompt, max_tokens): + payload = {"model": MODEL, "prompt": prompt, "max_tokens": max_tokens, + "min_tokens": 1, "temperature": 0.0, "stream": True} + async with client.stream("POST", f"http://{host}:{port}/v1/completions", + json=payload, timeout=600.0) as r: + r.raise_for_status() + async for _ in r.aiter_bytes(): + pass + + +class BackgroundLoad: + """Hold N concurrent long-decode streams across endpoints to keep busy.""" + def __init__(self, client, endpoints, n, prompt_tokens=2000, out_tokens=6000): + self.client, self.endpoints, self.n = client, endpoints, n + self.pt, self.ot = prompt_tokens, out_tokens + self._stop = asyncio.Event() + self._tasks = [] + + async def _w(self, idx): + host, port = self.endpoints[idx % len(self.endpoints)] + seed = 800000 + idx + while not self._stop.is_set(): + try: + await _stream_completion(self.client, host, port, + synth_prompt(seed, self.pt), self.ot) + except Exception: + await asyncio.sleep(0.5) + seed += 1 + + def start(self): + self._tasks = [asyncio.create_task(self._w(i)) for i in range(self.n)] + + async def stop(self): + self._stop.set() + for t in self._tasks: + t.cancel() + await 
asyncio.gather(*self._tasks, return_exceptions=True) + + +async def num_running(client, host, port): + try: + r = await client.get(f"http://{host}:{port}/metrics", timeout=5.0) + for line in r.text.splitlines(): + if line.startswith("vllm:num_requests_running"): + return int(float(line.split()[-1])) + except Exception: + pass + return -1 + + async def prefill_only(client, host, port, prompt): """Reference: plain prefill cost on A, no transfer.""" dt, _ = await completion(client, host, port, prompt, max_tokens=1) @@ -128,7 +178,20 @@ async def main_async(a): async with httpx.AsyncClient(limits=limits, trust_env=False) as client: src_eid = await get_engine_id(client, a.src_host, a.src_bp) src_bp_addr = f"http://{a.src_host}:{a.src_bp}" - print(f"[mb7] mode={a.mode} src_eid={src_eid[:16]}...") + print(f"[mb7] mode={a.mode} bg_load={a.bg_load} src_eid={src_eid[:16]}...") + + loader = None + if a.bg_load > 0: + loader = BackgroundLoad(client, [A, B], a.bg_load) + loader.start() + print(f"[mb7] ramping background load ({a.bg_load}) ...") + for _ in range(40): + await asyncio.sleep(1.0) + na = await num_running(client, *A) + nb = await num_running(client, *B) + if na >= 1 and nb >= 1: + print(f"[mb7] busy: A_run={na} B_run={nb}") + break results = [] for sz in sizes: @@ -155,8 +218,11 @@ async def main_async(a): f"total={row['t_total_s']*1000:7.0f}ms {extra} " f"cached={row['cached']}/{sz} correct={row['correct']}") + if loader: + await loader.stop() + # summary - print(f"\n=== {a.mode} summary ===") + print(f"\n=== {a.mode} (bg={a.bg_load}) summary ===") print(f"{'size':>7} {'n':>2} {'pf_only_ms':>11} {'total_ms':>9} " f"{'overhead_ms':>12} {'correct':>8}") summary = [] @@ -191,6 +257,8 @@ def main(): p.add_argument("--dst-bp", type=int, default=8999) p.add_argument("--sizes", default="8192,32768,65536") p.add_argument("--repeats", type=int, default=3) + p.add_argument("--bg-load", type=int, default=0, + help="N concurrent background decode streams across A+B") 
p.add_argument("--out", default="mb7_result.json") args = p.parse_args() asyncio.run(main_async(args)) diff --git a/microbench/connector_tax/layerwise/results/mb7_baseline_bg16.json b/microbench/connector_tax/layerwise/results/mb7_baseline_bg16.json new file mode 100644 index 0000000..c8960d6 --- /dev/null +++ b/microbench/connector_tax/layerwise/results/mb7_baseline_bg16.json @@ -0,0 +1,140 @@ +{ + "mode": "baseline", + "model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct", + "raw": [ + { + "t_prefill_s": 0.5868483350022871, + "t_xfer_s": 0.19584889299949282, + "t_total_s": 0.7827702419999696, + "cached": 8176, + "mode": "baseline", + "size": 8192, + "rep": 0, + "t_prefill_only_s": 0.5920699099988269, + "kv_gib": 0.75, + "correct": true + }, + { + "t_prefill_s": 0.5875704979989678, + "t_xfer_s": 0.1554814909977722, + "t_total_s": 0.7431365060001554, + "cached": 8176, + "mode": "baseline", + "size": 8192, + "rep": 1, + "t_prefill_only_s": 0.5814537600017502, + "kv_gib": 0.75, + "correct": true + }, + { + "t_prefill_s": 0.5852241569991747, + "t_xfer_s": 0.15129724399957922, + "t_total_s": 0.7365909610016388, + "cached": 8176, + "mode": "baseline", + "size": 8192, + "rep": 2, + "t_prefill_only_s": 0.5846994370003813, + "kv_gib": 0.75, + "correct": true + }, + { + "t_prefill_s": 1.498547145001794, + "t_xfer_s": 0.2475714690008317, + "t_total_s": 1.7462187470009667, + "cached": 16368, + "mode": "baseline", + "size": 16384, + "rep": 0, + "t_prefill_only_s": 1.5670790190015396, + "kv_gib": 1.5, + "correct": true + }, + { + "t_prefill_s": 1.5025789940009417, + "t_xfer_s": 0.24532966799961287, + "t_total_s": 1.7479741930001182, + "cached": 16368, + "mode": "baseline", + "size": 16384, + "rep": 1, + "t_prefill_only_s": 1.5008903820016712, + "kv_gib": 1.5, + "correct": true + }, + { + "t_prefill_s": 1.5021674179988622, + "t_xfer_s": 0.24640760400143336, + "t_total_s": 1.7486415580024186, + "cached": 16368, + "mode": "baseline", + "size": 16384, + "rep": 2, 
+ "t_prefill_only_s": 1.509417139001016, + "kv_gib": 1.5, + "correct": true + }, + { + "t_prefill_s": 4.444555983998725, + "t_xfer_s": 0.4227471090016479, + "t_total_s": 4.86737214599998, + "cached": 32752, + "mode": "baseline", + "size": 32768, + "rep": 0, + "t_prefill_only_s": 4.4467717689985875, + "kv_gib": 3.0, + "correct": true + }, + { + "t_prefill_s": 4.442135782999685, + "t_xfer_s": 0.7519038230020669, + "t_total_s": 5.194113359000767, + "cached": 32752, + "mode": "baseline", + "size": 32768, + "rep": 1, + "t_prefill_only_s": 4.445541313998547, + "kv_gib": 3.0, + "correct": true + }, + { + "t_prefill_s": 4.439772993999213, + "t_xfer_s": 0.7855456319994119, + "t_total_s": 5.225392060998274, + "cached": 32752, + "mode": "baseline", + "size": 32768, + "rep": 2, + "t_prefill_only_s": 4.442906365002273, + "kv_gib": 3.0, + "correct": true + } + ], + "summary": [ + { + "size": 8192, + "n": 3, + "pf_only_ms": 584.6994370003813, + "total_ms": 743.1365060001554, + "overhead_ms": 158.43706899977406, + "all_correct": true + }, + { + "size": 16384, + "n": 3, + "pf_only_ms": 1509.417139001016, + "total_ms": 1747.9741930001182, + "overhead_ms": 238.5570539991022, + "all_correct": true + }, + { + "size": 32768, + "n": 3, + "pf_only_ms": 4445.541313998547, + "total_ms": 5194.113359000767, + "overhead_ms": 748.57204500222, + "all_correct": true + } + ] +} \ No newline at end of file diff --git a/microbench/connector_tax/layerwise/results/mb7_layerwise_bg16.json b/microbench/connector_tax/layerwise/results/mb7_layerwise_bg16.json new file mode 100644 index 0000000..40f9dc9 --- /dev/null +++ b/microbench/connector_tax/layerwise/results/mb7_layerwise_bg16.json @@ -0,0 +1,140 @@ +{ + "mode": "layerwise", + "model": "/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct", + "raw": [ + { + "t_A_s": 0.5905098549992545, + "t_B_s": 0.6900827390018094, + "t_total_s": 0.6904724189989793, + "cached": 8176, + "mode": "layerwise", + "size": 8192, + "rep": 0, + "t_prefill_only_s": 
0.5852864849985053, + "kv_gib": 0.75, + "correct": true + }, + { + "t_A_s": 0.5897548109969648, + "t_B_s": 0.6827381169969158, + "t_total_s": 0.6828304180016858, + "cached": 8176, + "mode": "layerwise", + "size": 8192, + "rep": 1, + "t_prefill_only_s": 0.5890174580017629, + "kv_gib": 0.75, + "correct": true + }, + { + "t_A_s": 0.5850713190011447, + "t_B_s": 0.6744917560026806, + "t_total_s": 0.6745770380002796, + "cached": 8176, + "mode": "layerwise", + "size": 8192, + "rep": 2, + "t_prefill_only_s": 0.5943713950000529, + "kv_gib": 0.75, + "correct": true + }, + { + "t_A_s": 1.5030149390004226, + "t_B_s": 1.596173029000056, + "t_total_s": 1.597060264000902, + "cached": 16368, + "mode": "layerwise", + "size": 16384, + "rep": 0, + "t_prefill_only_s": 1.5130829510017065, + "kv_gib": 1.5, + "correct": true + }, + { + "t_A_s": 1.499876754998695, + "t_B_s": 1.5940461120007967, + "t_total_s": 1.5948001770011615, + "cached": 16368, + "mode": "layerwise", + "size": 16384, + "rep": 1, + "t_prefill_only_s": 1.5024838620010996, + "kv_gib": 1.5, + "correct": true + }, + { + "t_A_s": 1.5068977490009274, + "t_B_s": 1.5950395179970656, + "t_total_s": 1.59571184500237, + "cached": 16368, + "mode": "layerwise", + "size": 16384, + "rep": 2, + "t_prefill_only_s": 1.5303227439981129, + "kv_gib": 1.5, + "correct": true + }, + { + "t_A_s": 4.4503932609986805, + "t_B_s": 4.538851200999488, + "t_total_s": 4.539281312001549, + "cached": 32752, + "mode": "layerwise", + "size": 32768, + "rep": 0, + "t_prefill_only_s": 4.446753306998289, + "kv_gib": 3.0, + "correct": true + }, + { + "t_A_s": 4.44226107799841, + "t_B_s": 4.551636377997056, + "t_total_s": 4.552389411001059, + "cached": 32752, + "mode": "layerwise", + "size": 32768, + "rep": 1, + "t_prefill_only_s": 4.44538704000297, + "kv_gib": 3.0, + "correct": true + }, + { + "t_A_s": 4.440309538000292, + "t_B_s": 4.539836316998844, + "t_total_s": 4.540553365997766, + "cached": 32752, + "mode": "layerwise", + "size": 32768, + "rep": 2, + 
"t_prefill_only_s": 4.443476915999781, + "kv_gib": 3.0, + "correct": true + } + ], + "summary": [ + { + "size": 8192, + "n": 3, + "pf_only_ms": 589.0174580017629, + "total_ms": 682.8304180016858, + "overhead_ms": 93.8129599999229, + "all_correct": true + }, + { + "size": 16384, + "n": 3, + "pf_only_ms": 1513.0829510017065, + "total_ms": 1595.71184500237, + "overhead_ms": 82.62889400066342, + "all_correct": true + }, + { + "size": 32768, + "n": 3, + "pf_only_ms": 4445.38704000297, + "total_ms": 4540.553365997766, + "overhead_ms": 95.16632599479635, + "all_correct": true + } + ] +} \ No newline at end of file diff --git a/microbench/connector_tax/layerwise/run_mb7.sh b/microbench/connector_tax/layerwise/run_mb7.sh index 6020231..f8328c6 100644 --- a/microbench/connector_tax/layerwise/run_mb7.sh +++ b/microbench/connector_tax/layerwise/run_mb7.sh @@ -22,6 +22,7 @@ MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}" GPUS=(${GPUS:-0 1}) SIZES="${SIZES:-8192,16384,32768}" REPEATS="${REPEATS:-3}" +BG_LOAD="${BG_LOAD:-0}" MAX_BATCHED="${MAX_BATCHED:-40960}" # >= max prompt => no chunked prefill DATE="$(date +%Y%m%d_%H%M)" OUTDIR="${OUTDIR:-$PROJ_DIR/outputs/mb7_${MODE}_${DATE}}" @@ -100,7 +101,8 @@ echo "[run] mb7 --mode $MODE" "$PYTHON" "$DRIVER" --mode "$MODE" \ --src-port "${PORTS[0]}" --dst-port "${PORTS[1]}" \ --src-bp "${BPS[0]}" --dst-bp "${BPS[1]}" \ - --sizes "$SIZES" --repeats "$REPEATS" --out "$OUTDIR/mb7_result.json" \ + --sizes "$SIZES" --repeats "$REPEATS" --bg-load "$BG_LOAD" \ + --out "$OUTDIR/mb7_result.json" \ 2>&1 | tee "$OUTDIR/mb7_run.txt" echo "[done] $OUTDIR"