MB2: parameterize vLLM roles (kv_producer + kv_consumer default)

start_vllm_pair.sh: add ROLE_A / ROLE_B env vars (defaults: kv_producer / kv_consumer, i.e. strict PD-disagg). Override them to kv_both for the kv_both control. The role is injected into --kv-transfer-config so that vLLM enforces the role restriction. mb2_kv_transfer.py: add a --skip-verify flag that drops step 3 (the plain-completion sanity check on the destination). This is required when the dst is kv_consumer-only, since a kv_consumer instance refuses to serve a request without do_remote_prefill. The transfer time itself is still measured from step 2 (do_remote_prefill on the consumer). Also: per-step client-side wall-clock timestamps (t_step1_client_unix, t_step2_client_unix, t_step2_end_unix) are now captured so that the post-hoc breakdown analyzer can join them with the per-instance JSONL logs on absolute time. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-27 18:17:42 +08:00
parent efdcf3c555
commit 622e0bc04c
2 changed files with 43 additions and 16 deletions
--- a/microbench/fresh_setup/mb2_kv_transfer.py
+++ b/microbench/fresh_setup/mb2_kv_transfer.py
@@ -76,10 +76,14 @@ async def measure_one(
    src_eid: str, dst_eid: str,
    input_tokens: int,
    rng_seed: int,
    skip_verify: bool = False,
 ) -> dict:
    """Three-step measurement (step 3 is sanity, optional for strict PD-disagg
    where the dst is a kv_consumer-only instance that cannot serve a plain
    request)."""
    prompt = synth_prompt(rng_seed, input_tokens)
    session = uuid.uuid4().hex
-    # Step 1: prefill on A. max_tokens=1 ensures KV is cached but no real decode work.
+    t_step1_client = time.time()
    t_prefill_s, prefill_resp = await completion(
        client, src_port, prompt, max_tokens=1,
        kv_transfer_params={
@@ -91,7 +95,7 @@ async def measure_one(
        },
    )
    src_kvp = prefill_resp.get("kv_transfer_params") or {}
-    # Step 2: pull from A to B (the transfer step we time)
+    t_step2_client = time.time()
    t_transfer_s, pull_resp = await completion(
        client, dst_port, prompt, max_tokens=1,
        kv_transfer_params={
@@ -102,22 +106,36 @@ async def measure_one(
            "remote_port": src_kvp.get("remote_port", src_port),
        },
    )
-    # Step 3: follow-up, no kv_transfer_params — should hit B's cache fully
+    t_step2_end_client = time.time()
-    t_followup_s, follow_resp = await completion(
+
-        client, dst_port, prompt, max_tokens=1,
+    cached_followup = None
-    )
+    t_followup_s = None
-    usage = (follow_resp.get("usage") or {})
+    if not skip_verify:
-    details = usage.get("prompt_tokens_details") or {}
+        t_followup_s, follow_resp = await completion(
-    cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
+            client, dst_port, prompt, max_tokens=1,
        )
        usage = (follow_resp.get("usage") or {})
        details = usage.get("prompt_tokens_details") or {}
        cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
    pull_usage = (pull_resp.get("usage") or {})
    pull_completion_tokens = pull_usage.get("completion_tokens", 0)
    ok = pull_completion_tokens >= 1
    if not skip_verify and cached_followup is not None:
        ok = ok and (cached_followup >= input_tokens * 0.9)
    return {
        "input_tokens": input_tokens,
        "session": session,
        "t_step1_client_unix": t_step1_client,
        "t_step2_client_unix": t_step2_client,
        "t_step2_end_unix": t_step2_end_client,
        "t_prefill_s": t_prefill_s,
        "t_transfer_s": t_transfer_s,
        "t_followup_s": t_followup_s,
        "cached_followup": cached_followup,
-        "ok": cached_followup >= input_tokens * 0.9,  # ≥90 % cached = transfer succeeded
+        "pull_completion_tokens": pull_completion_tokens,
        "ok": ok,
    }
@@ -139,10 +157,13 @@ async def main_async(args: argparse.Namespace) -> None:
                row = await measure_one(
                    client, src_port, dst_port, src_eid, dst_eid,
                    input_tokens=sz, rng_seed=sz * 1000 + r,
                    skip_verify=args.skip_verify,
                )
                cached = row.get("cached_followup")
                cached_str = f"{cached}/{sz}" if cached is not None else "skip"
                print(f"  size={sz:>6}  rep={r}  "
                      f"transfer={row['t_transfer_s']*1000:7.1f}ms  "
-                      f"followup_cached={row['cached_followup']}/{sz}  "
+                      f"followup_cached={cached_str}  "
                      f"ok={row['ok']}")
                results.append(row)
@@ -196,6 +217,9 @@ def main() -> None:
    p.add_argument("--label", default="intra-node",
                   help="Label written into the output (e.g. intra-node / inter-node)")
    p.add_argument("--out", default="mb2_result.json")
    p.add_argument("--skip-verify", action="store_true",
                   help="Skip the step-3 verify completion (required for "
                        "strict PD-disagg where dst is kv_consumer-only).")
    args = p.parse_args()
    asyncio.run(main_async(args))
--- a/microbench/fresh_setup/start_vllm_pair.sh
+++ b/microbench/fresh_setup/start_vllm_pair.sh
@@ -29,6 +29,8 @@ BP_A=8998
 BP_B=8999
 MASTER_A=29500
 MASTER_B=29501
 ROLE_A="${ROLE_A:-kv_producer}"  # or kv_both
 ROLE_B="${ROLE_B:-kv_consumer}"  # or kv_both
 MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -78,8 +80,9 @@ fi
 mkdir -p "${MB2_LOG_ROOT}/A" "${MB2_LOG_ROOT}/B"
 launch() {
-    local idx="$1" gpu="$2" port="$3" bp="$4" master="$5"
+    local idx="$1" gpu="$2" port="$3" bp="$4" master="$5" role="$6"
-    echo "[mb2] launching instance ${idx} on GPU ${gpu}, port ${port}, bp ${bp}"
+    local cfg="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"${role}\"}"
    echo "[mb2] launching ${idx}: gpu=${gpu} port=${port} bp=${bp} role=${role}"
    PYTHONHASHSEED=42 \
    VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
    CUDA_VISIBLE_DEVICES="${gpu}" \
@@ -91,15 +94,15 @@ launch() {
        --trust-remote-code --enable-prefix-caching \
        --dtype auto --gpu-memory-utilization 0.9 \
        --max-model-len 200000 \
-        --kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
+        --kv-transfer-config "${cfg}" \
        --enable-prompt-tokens-details \
        > "${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log" 2>&1 &
    disown
 }
-launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}"
+launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}" "${ROLE_A}"
 sleep 3
-launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}"
+launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}" "${ROLE_B}"
 echo "[mb2] waiting for both /health endpoints..."
 for port in "${PORT_A}" "${PORT_B}"; do