MB2: parameterize vLLM roles (kv_producer + kv_consumer default)

start_vllm_pair.sh
  ROLE_A / ROLE_B env vars (defaults: kv_producer / kv_consumer, i.e. strict
  PD-disagg). Override with kv_both for the kv_both control run. The role is
  injected into --kv-transfer-config so that vLLM enforces the role restriction.

mb2_kv_transfer.py
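  --skip-verify flag drops step 3 (the plain completion sanity-check on
  the destination). This is required when the dst is kv_consumer-only, since
  a kv_consumer instance refuses to serve a request without
  do_remote_prefill. The transfer time itself is still measured from
  step 2 (do_remote_prefill on the consumer). With step 3 skipped, `ok`
  only checks that the step-2 pull returned at least one completion token;
  otherwise it additionally requires >=90% of the prompt tokens to be
  reported as cached by the follow-up.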

Also: per-step client-side wall-clock timestamps (t_step1_client_unix,
t_step2_client_unix, t_step2_end_unix) are now captured so the
post-hoc breakdown analyzer can join with the per-instance JSONL logs
on absolute time.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 18:17:42 +08:00
parent efdcf3c555
commit 622e0bc04c
2 changed files with 43 additions and 16 deletions

View File

@@ -76,10 +76,14 @@ async def measure_one(
src_eid: str, dst_eid: str,
input_tokens: int,
rng_seed: int,
skip_verify: bool = False,
) -> dict:
"""Three-step measurement (step 3 is sanity, optional for strict PD-disagg
where the dst is a kv_consumer-only instance that cannot serve a plain
request)."""
prompt = synth_prompt(rng_seed, input_tokens)
session = uuid.uuid4().hex
# Step 1: prefill on A. max_tokens=1 ensures KV is cached but no real decode work.
t_step1_client = time.time()
t_prefill_s, prefill_resp = await completion(
client, src_port, prompt, max_tokens=1,
kv_transfer_params={
@@ -91,7 +95,7 @@ async def measure_one(
},
)
src_kvp = prefill_resp.get("kv_transfer_params") or {}
# Step 2: pull from A to B (the transfer step we time)
t_step2_client = time.time()
t_transfer_s, pull_resp = await completion(
client, dst_port, prompt, max_tokens=1,
kv_transfer_params={
@@ -102,22 +106,36 @@ async def measure_one(
"remote_port": src_kvp.get("remote_port", src_port),
},
)
# Step 3: follow-up, no kv_transfer_params — should hit B's cache fully
t_followup_s, follow_resp = await completion(
client, dst_port, prompt, max_tokens=1,
)
usage = (follow_resp.get("usage") or {})
details = usage.get("prompt_tokens_details") or {}
cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
t_step2_end_client = time.time()
cached_followup = None
t_followup_s = None
if not skip_verify:
t_followup_s, follow_resp = await completion(
client, dst_port, prompt, max_tokens=1,
)
usage = (follow_resp.get("usage") or {})
details = usage.get("prompt_tokens_details") or {}
cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
pull_usage = (pull_resp.get("usage") or {})
pull_completion_tokens = pull_usage.get("completion_tokens", 0)
ok = pull_completion_tokens >= 1
if not skip_verify and cached_followup is not None:
ok = ok and (cached_followup >= input_tokens * 0.9)
return {
"input_tokens": input_tokens,
"session": session,
"t_step1_client_unix": t_step1_client,
"t_step2_client_unix": t_step2_client,
"t_step2_end_unix": t_step2_end_client,
"t_prefill_s": t_prefill_s,
"t_transfer_s": t_transfer_s,
"t_followup_s": t_followup_s,
"cached_followup": cached_followup,
"ok": cached_followup >= input_tokens * 0.9, # ≥90 % cached = transfer succeeded
"pull_completion_tokens": pull_completion_tokens,
"ok": ok,
}
@@ -139,10 +157,13 @@ async def main_async(args: argparse.Namespace) -> None:
row = await measure_one(
client, src_port, dst_port, src_eid, dst_eid,
input_tokens=sz, rng_seed=sz * 1000 + r,
skip_verify=args.skip_verify,
)
cached = row.get("cached_followup")
cached_str = f"{cached}/{sz}" if cached is not None else "skip"
print(f" size={sz:>6} rep={r} "
f"transfer={row['t_transfer_s']*1000:7.1f}ms "
f"followup_cached={row['cached_followup']}/{sz} "
f"followup_cached={cached_str} "
f"ok={row['ok']}")
results.append(row)
@@ -196,6 +217,9 @@ def main() -> None:
p.add_argument("--label", default="intra-node",
help="Label written into the output (e.g. intra-node / inter-node)")
p.add_argument("--out", default="mb2_result.json")
p.add_argument("--skip-verify", action="store_true",
help="Skip the step-3 verify completion (required for "
"strict PD-disagg where dst is kv_consumer-only).")
args = p.parse_args()
asyncio.run(main_async(args))

View File

@@ -29,6 +29,8 @@ BP_A=8998
BP_B=8999
MASTER_A=29500
MASTER_B=29501
ROLE_A="${ROLE_A:-kv_producer}" # or kv_both
ROLE_B="${ROLE_B:-kv_consumer}" # or kv_both
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -78,8 +80,9 @@ fi
mkdir -p "${MB2_LOG_ROOT}/A" "${MB2_LOG_ROOT}/B"
launch() {
local idx="$1" gpu="$2" port="$3" bp="$4" master="$5"
echo "[mb2] launching instance ${idx} on GPU ${gpu}, port ${port}, bp ${bp}"
local idx="$1" gpu="$2" port="$3" bp="$4" master="$5" role="$6"
local cfg="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"${role}\"}"
echo "[mb2] launching ${idx}: gpu=${gpu} port=${port} bp=${bp} role=${role}"
PYTHONHASHSEED=42 \
VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
CUDA_VISIBLE_DEVICES="${gpu}" \
@@ -91,15 +94,15 @@ launch() {
--trust-remote-code --enable-prefix-caching \
--dtype auto --gpu-memory-utilization 0.9 \
--max-model-len 200000 \
--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
--kv-transfer-config "${cfg}" \
--enable-prompt-tokens-details \
> "${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log" 2>&1 &
disown
}
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}"
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}" "${ROLE_A}"
sleep 3
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}"
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}" "${ROLE_B}"
echo "[mb2] waiting for both /health endpoints..."
for port in "${PORT_A}" "${PORT_B}"; do