MB2: parameterize vLLM roles (kv_producer + kv_consumer default)
start_vllm_pair.sh: add ROLE_A / ROLE_B env vars (defaulting to kv_producer / kv_consumer for strict PD-disagg). Override them to kv_both for the kv_both control. The role is injected into --kv-transfer-config so that vLLM enforces the role restriction.

mb2_kv_transfer.py: add a --skip-verify flag that drops step 3 (the plain-completion sanity check on the destination). This is required when the dst is kv_consumer-only, since a kv_consumer instance refuses to serve a request without do_remote_prefill. The transfer time itself is still measured from step 2 (do_remote_prefill on the consumer).

Also: per-step client-side wall-clock timestamps (t_step1_client_unix, t_step2_client_unix, t_step2_end_unix) are now captured so that the post-hoc breakdown analyzer can join them with the per-instance JSONL logs on absolute time.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -76,10 +76,14 @@ async def measure_one(
|
||||
src_eid: str, dst_eid: str,
|
||||
input_tokens: int,
|
||||
rng_seed: int,
|
||||
skip_verify: bool = False,
|
||||
) -> dict:
|
||||
"""Three-step measurement (step 3 is sanity, optional for strict PD-disagg
|
||||
where the dst is a kv_consumer-only instance that cannot serve a plain
|
||||
request)."""
|
||||
prompt = synth_prompt(rng_seed, input_tokens)
|
||||
session = uuid.uuid4().hex
|
||||
# Step 1: prefill on A. max_tokens=1 ensures KV is cached but no real decode work.
|
||||
t_step1_client = time.time()
|
||||
t_prefill_s, prefill_resp = await completion(
|
||||
client, src_port, prompt, max_tokens=1,
|
||||
kv_transfer_params={
|
||||
@@ -91,7 +95,7 @@ async def measure_one(
|
||||
},
|
||||
)
|
||||
src_kvp = prefill_resp.get("kv_transfer_params") or {}
|
||||
# Step 2: pull from A to B (the transfer step we time)
|
||||
t_step2_client = time.time()
|
||||
t_transfer_s, pull_resp = await completion(
|
||||
client, dst_port, prompt, max_tokens=1,
|
||||
kv_transfer_params={
|
||||
@@ -102,22 +106,36 @@ async def measure_one(
|
||||
"remote_port": src_kvp.get("remote_port", src_port),
|
||||
},
|
||||
)
|
||||
# Step 3: follow-up, no kv_transfer_params — should hit B's cache fully
|
||||
t_followup_s, follow_resp = await completion(
|
||||
client, dst_port, prompt, max_tokens=1,
|
||||
)
|
||||
usage = (follow_resp.get("usage") or {})
|
||||
details = usage.get("prompt_tokens_details") or {}
|
||||
cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
|
||||
t_step2_end_client = time.time()
|
||||
|
||||
cached_followup = None
|
||||
t_followup_s = None
|
||||
if not skip_verify:
|
||||
t_followup_s, follow_resp = await completion(
|
||||
client, dst_port, prompt, max_tokens=1,
|
||||
)
|
||||
usage = (follow_resp.get("usage") or {})
|
||||
details = usage.get("prompt_tokens_details") or {}
|
||||
cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
|
||||
|
||||
pull_usage = (pull_resp.get("usage") or {})
|
||||
pull_completion_tokens = pull_usage.get("completion_tokens", 0)
|
||||
ok = pull_completion_tokens >= 1
|
||||
if not skip_verify and cached_followup is not None:
|
||||
ok = ok and (cached_followup >= input_tokens * 0.9)
|
||||
|
||||
return {
|
||||
"input_tokens": input_tokens,
|
||||
"session": session,
|
||||
"t_step1_client_unix": t_step1_client,
|
||||
"t_step2_client_unix": t_step2_client,
|
||||
"t_step2_end_unix": t_step2_end_client,
|
||||
"t_prefill_s": t_prefill_s,
|
||||
"t_transfer_s": t_transfer_s,
|
||||
"t_followup_s": t_followup_s,
|
||||
"cached_followup": cached_followup,
|
||||
"ok": cached_followup >= input_tokens * 0.9, # ≥90 % cached = transfer succeeded
|
||||
"pull_completion_tokens": pull_completion_tokens,
|
||||
"ok": ok,
|
||||
}
|
||||
|
||||
|
||||
@@ -139,10 +157,13 @@ async def main_async(args: argparse.Namespace) -> None:
|
||||
row = await measure_one(
|
||||
client, src_port, dst_port, src_eid, dst_eid,
|
||||
input_tokens=sz, rng_seed=sz * 1000 + r,
|
||||
skip_verify=args.skip_verify,
|
||||
)
|
||||
cached = row.get("cached_followup")
|
||||
cached_str = f"{cached}/{sz}" if cached is not None else "skip"
|
||||
print(f" size={sz:>6} rep={r} "
|
||||
f"transfer={row['t_transfer_s']*1000:7.1f}ms "
|
||||
f"followup_cached={row['cached_followup']}/{sz} "
|
||||
f"followup_cached={cached_str} "
|
||||
f"ok={row['ok']}")
|
||||
results.append(row)
|
||||
|
||||
@@ -196,6 +217,9 @@ def main() -> None:
|
||||
p.add_argument("--label", default="intra-node",
|
||||
help="Label written into the output (e.g. intra-node / inter-node)")
|
||||
p.add_argument("--out", default="mb2_result.json")
|
||||
p.add_argument("--skip-verify", action="store_true",
|
||||
help="Skip the step-3 verify completion (required for "
|
||||
"strict PD-disagg where dst is kv_consumer-only).")
|
||||
args = p.parse_args()
|
||||
asyncio.run(main_async(args))
|
||||
|
||||
|
||||
@@ -29,6 +29,8 @@ BP_A=8998
|
||||
BP_B=8999
|
||||
MASTER_A=29500
|
||||
MASTER_B=29501
|
||||
ROLE_A="${ROLE_A:-kv_producer}" # or kv_both
|
||||
ROLE_B="${ROLE_B:-kv_consumer}" # or kv_both
|
||||
|
||||
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
@@ -78,8 +80,9 @@ fi
|
||||
mkdir -p "${MB2_LOG_ROOT}/A" "${MB2_LOG_ROOT}/B"
|
||||
|
||||
launch() {
|
||||
local idx="$1" gpu="$2" port="$3" bp="$4" master="$5"
|
||||
echo "[mb2] launching instance ${idx} on GPU ${gpu}, port ${port}, bp ${bp}"
|
||||
local idx="$1" gpu="$2" port="$3" bp="$4" master="$5" role="$6"
|
||||
local cfg="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"${role}\"}"
|
||||
echo "[mb2] launching ${idx}: gpu=${gpu} port=${port} bp=${bp} role=${role}"
|
||||
PYTHONHASHSEED=42 \
|
||||
VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
|
||||
CUDA_VISIBLE_DEVICES="${gpu}" \
|
||||
@@ -91,15 +94,15 @@ launch() {
|
||||
--trust-remote-code --enable-prefix-caching \
|
||||
--dtype auto --gpu-memory-utilization 0.9 \
|
||||
--max-model-len 200000 \
|
||||
--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
|
||||
--kv-transfer-config "${cfg}" \
|
||||
--enable-prompt-tokens-details \
|
||||
> "${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log" 2>&1 &
|
||||
disown
|
||||
}
|
||||
|
||||
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}"
|
||||
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}" "${ROLE_A}"
|
||||
sleep 3
|
||||
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}"
|
||||
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}" "${ROLE_B}"
|
||||
|
||||
echo "[mb2] waiting for both /health endpoints..."
|
||||
for port in "${PORT_A}" "${PORT_B}"; do
|
||||
|
||||
Reference in New Issue
Block a user