MB2: parameterize vLLM roles (kv_producer + kv_consumer default)
start_vllm_pair.sh: add ROLE_A / ROLE_B env vars (defaulting to kv_producer / kv_consumer for strict PD-disagg); override them with kv_both for the kv_both control. The role is injected into --kv-transfer-config so that vLLM imposes the role restriction. mb2_kv_transfer.py: add a --skip-verify flag that drops step 3 (the plain completion sanity check on the destination). This is required when the dst is kv_consumer-only, since a kv_consumer instance refuses to serve a request without do_remote_prefill. The transfer time itself is still measured from step 2 (do_remote_prefill on the consumer). Also, per-step client-side wall-clock timestamps (t_step1_client_unix, t_step2_client_unix, t_step2_end_unix) are now captured so that the post-hoc breakdown analyzer can join them with the per-instance JSONL logs on absolute time. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -76,10 +76,14 @@ async def measure_one(
|
|||||||
src_eid: str, dst_eid: str,
|
src_eid: str, dst_eid: str,
|
||||||
input_tokens: int,
|
input_tokens: int,
|
||||||
rng_seed: int,
|
rng_seed: int,
|
||||||
|
skip_verify: bool = False,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
|
"""Three-step measurement (step 3 is sanity, optional for strict PD-disagg
|
||||||
|
where the dst is a kv_consumer-only instance that cannot serve a plain
|
||||||
|
request)."""
|
||||||
prompt = synth_prompt(rng_seed, input_tokens)
|
prompt = synth_prompt(rng_seed, input_tokens)
|
||||||
session = uuid.uuid4().hex
|
session = uuid.uuid4().hex
|
||||||
# Step 1: prefill on A. max_tokens=1 ensures KV is cached but no real decode work.
|
t_step1_client = time.time()
|
||||||
t_prefill_s, prefill_resp = await completion(
|
t_prefill_s, prefill_resp = await completion(
|
||||||
client, src_port, prompt, max_tokens=1,
|
client, src_port, prompt, max_tokens=1,
|
||||||
kv_transfer_params={
|
kv_transfer_params={
|
||||||
@@ -91,7 +95,7 @@ async def measure_one(
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
src_kvp = prefill_resp.get("kv_transfer_params") or {}
|
src_kvp = prefill_resp.get("kv_transfer_params") or {}
|
||||||
# Step 2: pull from A to B (the transfer step we time)
|
t_step2_client = time.time()
|
||||||
t_transfer_s, pull_resp = await completion(
|
t_transfer_s, pull_resp = await completion(
|
||||||
client, dst_port, prompt, max_tokens=1,
|
client, dst_port, prompt, max_tokens=1,
|
||||||
kv_transfer_params={
|
kv_transfer_params={
|
||||||
@@ -102,22 +106,36 @@ async def measure_one(
|
|||||||
"remote_port": src_kvp.get("remote_port", src_port),
|
"remote_port": src_kvp.get("remote_port", src_port),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
# Step 3: follow-up, no kv_transfer_params — should hit B's cache fully
|
t_step2_end_client = time.time()
|
||||||
t_followup_s, follow_resp = await completion(
|
|
||||||
client, dst_port, prompt, max_tokens=1,
|
cached_followup = None
|
||||||
)
|
t_followup_s = None
|
||||||
usage = (follow_resp.get("usage") or {})
|
if not skip_verify:
|
||||||
details = usage.get("prompt_tokens_details") or {}
|
t_followup_s, follow_resp = await completion(
|
||||||
cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
|
client, dst_port, prompt, max_tokens=1,
|
||||||
|
)
|
||||||
|
usage = (follow_resp.get("usage") or {})
|
||||||
|
details = usage.get("prompt_tokens_details") or {}
|
||||||
|
cached_followup = details.get("cached_tokens", 0) or usage.get("cached_tokens", 0)
|
||||||
|
|
||||||
|
pull_usage = (pull_resp.get("usage") or {})
|
||||||
|
pull_completion_tokens = pull_usage.get("completion_tokens", 0)
|
||||||
|
ok = pull_completion_tokens >= 1
|
||||||
|
if not skip_verify and cached_followup is not None:
|
||||||
|
ok = ok and (cached_followup >= input_tokens * 0.9)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"input_tokens": input_tokens,
|
"input_tokens": input_tokens,
|
||||||
"session": session,
|
"session": session,
|
||||||
|
"t_step1_client_unix": t_step1_client,
|
||||||
|
"t_step2_client_unix": t_step2_client,
|
||||||
|
"t_step2_end_unix": t_step2_end_client,
|
||||||
"t_prefill_s": t_prefill_s,
|
"t_prefill_s": t_prefill_s,
|
||||||
"t_transfer_s": t_transfer_s,
|
"t_transfer_s": t_transfer_s,
|
||||||
"t_followup_s": t_followup_s,
|
"t_followup_s": t_followup_s,
|
||||||
"cached_followup": cached_followup,
|
"cached_followup": cached_followup,
|
||||||
"ok": cached_followup >= input_tokens * 0.9, # ≥90 % cached = transfer succeeded
|
"pull_completion_tokens": pull_completion_tokens,
|
||||||
|
"ok": ok,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -139,10 +157,13 @@ async def main_async(args: argparse.Namespace) -> None:
|
|||||||
row = await measure_one(
|
row = await measure_one(
|
||||||
client, src_port, dst_port, src_eid, dst_eid,
|
client, src_port, dst_port, src_eid, dst_eid,
|
||||||
input_tokens=sz, rng_seed=sz * 1000 + r,
|
input_tokens=sz, rng_seed=sz * 1000 + r,
|
||||||
|
skip_verify=args.skip_verify,
|
||||||
)
|
)
|
||||||
|
cached = row.get("cached_followup")
|
||||||
|
cached_str = f"{cached}/{sz}" if cached is not None else "skip"
|
||||||
print(f" size={sz:>6} rep={r} "
|
print(f" size={sz:>6} rep={r} "
|
||||||
f"transfer={row['t_transfer_s']*1000:7.1f}ms "
|
f"transfer={row['t_transfer_s']*1000:7.1f}ms "
|
||||||
f"followup_cached={row['cached_followup']}/{sz} "
|
f"followup_cached={cached_str} "
|
||||||
f"ok={row['ok']}")
|
f"ok={row['ok']}")
|
||||||
results.append(row)
|
results.append(row)
|
||||||
|
|
||||||
@@ -196,6 +217,9 @@ def main() -> None:
|
|||||||
p.add_argument("--label", default="intra-node",
|
p.add_argument("--label", default="intra-node",
|
||||||
help="Label written into the output (e.g. intra-node / inter-node)")
|
help="Label written into the output (e.g. intra-node / inter-node)")
|
||||||
p.add_argument("--out", default="mb2_result.json")
|
p.add_argument("--out", default="mb2_result.json")
|
||||||
|
p.add_argument("--skip-verify", action="store_true",
|
||||||
|
help="Skip the step-3 verify completion (required for "
|
||||||
|
"strict PD-disagg where dst is kv_consumer-only).")
|
||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
asyncio.run(main_async(args))
|
asyncio.run(main_async(args))
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,8 @@ BP_A=8998
|
|||||||
BP_B=8999
|
BP_B=8999
|
||||||
MASTER_A=29500
|
MASTER_A=29500
|
||||||
MASTER_B=29501
|
MASTER_B=29501
|
||||||
|
ROLE_A="${ROLE_A:-kv_producer}" # or kv_both
|
||||||
|
ROLE_B="${ROLE_B:-kv_consumer}" # or kv_both
|
||||||
|
|
||||||
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"
|
MB2_LOG_ROOT="${FRESH_ROOT}/mb2_transfer_logs"
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
@@ -78,8 +80,9 @@ fi
|
|||||||
mkdir -p "${MB2_LOG_ROOT}/A" "${MB2_LOG_ROOT}/B"
|
mkdir -p "${MB2_LOG_ROOT}/A" "${MB2_LOG_ROOT}/B"
|
||||||
|
|
||||||
launch() {
|
launch() {
|
||||||
local idx="$1" gpu="$2" port="$3" bp="$4" master="$5"
|
local idx="$1" gpu="$2" port="$3" bp="$4" master="$5" role="$6"
|
||||||
echo "[mb2] launching instance ${idx} on GPU ${gpu}, port ${port}, bp ${bp}"
|
local cfg="{\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"${role}\"}"
|
||||||
|
echo "[mb2] launching ${idx}: gpu=${gpu} port=${port} bp=${bp} role=${role}"
|
||||||
PYTHONHASHSEED=42 \
|
PYTHONHASHSEED=42 \
|
||||||
VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
|
VLLM_MOONCAKE_BOOTSTRAP_PORT="${bp}" \
|
||||||
CUDA_VISIBLE_DEVICES="${gpu}" \
|
CUDA_VISIBLE_DEVICES="${gpu}" \
|
||||||
@@ -91,15 +94,15 @@ launch() {
|
|||||||
--trust-remote-code --enable-prefix-caching \
|
--trust-remote-code --enable-prefix-caching \
|
||||||
--dtype auto --gpu-memory-utilization 0.9 \
|
--dtype auto --gpu-memory-utilization 0.9 \
|
||||||
--max-model-len 200000 \
|
--max-model-len 200000 \
|
||||||
--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
|
--kv-transfer-config "${cfg}" \
|
||||||
--enable-prompt-tokens-details \
|
--enable-prompt-tokens-details \
|
||||||
> "${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log" 2>&1 &
|
> "${LOGS_DIR}/vllm_${idx}_gpu${gpu}.log" 2>&1 &
|
||||||
disown
|
disown
|
||||||
}
|
}
|
||||||
|
|
||||||
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}"
|
launch A "${GPU_A}" "${PORT_A}" "${BP_A}" "${MASTER_A}" "${ROLE_A}"
|
||||||
sleep 3
|
sleep 3
|
||||||
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}"
|
launch B "${GPU_B}" "${PORT_B}" "${BP_B}" "${MASTER_B}" "${ROLE_B}"
|
||||||
|
|
||||||
echo "[mb2] waiting for both /health endpoints..."
|
echo "[mb2] waiting for both /health endpoints..."
|
||||||
for port in "${PORT_A}" "${PORT_B}"; do
|
for port in "${PORT_A}" "${PORT_B}"; do
|
||||||
|
|||||||
Reference in New Issue
Block a user