Add unified_nixl_both policy: NIXL connector isolation control
Adds a NIXL-backed counterpart to unified_kv_both so we can attribute
the kv_both substrate overhead measured in the elastic_migration_v2
section to either Mooncake-specific code or a generic v1-connector
cost shared by all connectors.
- scripts/cache_aware_proxy.py: register --policy unified_nixl_both.
Picker is identical to unified (and unified_kv_both); routing
decisions never go through the PD-sep branch. Differs only at the
vLLM launch layer.
- scripts/b3_isolated_policy.sh: new KV_CONNECTOR env var
(Mooncake|Nixl), auto-set based on POLICY. NIXL launch path uses
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
with no VLLM_MOONCAKE_BOOTSTRAP_PORT (NIXL uses UCX side-channels).
- Health-check timeout: 90 iterations * 2s -> 180 iterations * 2s
(180s -> 360s). Empirically NIXL needs ~100-150s per instance to
initialize the UCX agent and register KV cache memory; 8
concurrent NIXL launches frequently overshoot the previous 180s
budget. Mooncake is unaffected (still finishes well inside the new
budget). The 8-vLLM unified_nixl_both first launch tripped the
old timeout despite 7/8 instances reaching startup-complete.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -31,8 +31,15 @@ TRACE="${2:?usage: $0 <policy> <trace> <rundir>}"
|
||||
RUNDIR="${3:?usage: $0 <policy> <trace> <rundir>}"
|
||||
|
||||
# Auto-enable kv_both when the policy requires it.
|
||||
# KV_CONNECTOR (Mooncake|Nixl) selects the underlying connector when ENABLE_KV_BOTH=1.
|
||||
KV_CONNECTOR="${KV_CONNECTOR:-Mooncake}"
|
||||
if [ "$POLICY" = "unified_v2" ] || [ "$POLICY" = "unified_kv_both" ]; then
|
||||
ENABLE_KV_BOTH=1
|
||||
KV_CONNECTOR="Mooncake"
|
||||
fi
|
||||
if [ "$POLICY" = "unified_nixl_both" ]; then
|
||||
ENABLE_KV_BOTH=1
|
||||
KV_CONNECTOR="Nixl"
|
||||
fi
|
||||
|
||||
mkdir -p "$RUNDIR/engine_state" "$RUNDIR/logs"
|
||||
@@ -49,17 +56,13 @@ trap cleanup EXIT
|
||||
# Hard reset first
|
||||
cleanup
|
||||
|
||||
echo "[isolated] launching $N_INSTANCES vLLM on GPUs $GPU_INDICES ENABLE_KV_BOTH=$ENABLE_KV_BOTH ..."
|
||||
echo "[isolated] launching $N_INSTANCES vLLM on GPUs $GPU_INDICES ENABLE_KV_BOTH=$ENABLE_KV_BOTH KV_CONNECTOR=$KV_CONNECTOR ..."
|
||||
i=0
|
||||
kv_both_extra=""
|
||||
if [ "$ENABLE_KV_BOTH" = "1" ]; then
|
||||
kv_both_extra="--kv-transfer-config {\"kv_connector\":\"MooncakeConnector\",\"kv_role\":\"kv_both\"}"
|
||||
fi
|
||||
for gpu in $GPU_INDICES; do
|
||||
port=$((BASE_PORT + i))
|
||||
master=$((29500 + i))
|
||||
bp=$((BOOTSTRAP_BASE_PORT + i))
|
||||
if [ "$ENABLE_KV_BOTH" = "1" ]; then
|
||||
if [ "$ENABLE_KV_BOTH" = "1" ] && [ "$KV_CONNECTOR" = "Mooncake" ]; then
|
||||
PYTHONHASHSEED=42 \
|
||||
VLLM_MOONCAKE_BOOTSTRAP_PORT=$bp \
|
||||
AGENTIC_STEP_LOG_PATH="$RUNDIR/engine_state/engine_${i}.jsonl" \
|
||||
@@ -75,6 +78,23 @@ for gpu in $GPU_INDICES; do
|
||||
--kv-transfer-config '{"kv_connector":"MooncakeConnector","kv_role":"kv_both"}' \
|
||||
$EXTRA_VLLM_ARGS \
|
||||
> "$RUNDIR/logs/vllm_inst_${i}_gpu${gpu}.log" 2>&1 &
|
||||
elif [ "$ENABLE_KV_BOTH" = "1" ] && [ "$KV_CONNECTOR" = "Nixl" ]; then
|
||||
# NixlConnector uses UCX side-channels for handshake (no bootstrap
|
||||
# port needed). Side-channel host defaults to NIC IP discovered
|
||||
# at register_kv_caches time.
|
||||
AGENTIC_STEP_LOG_PATH="$RUNDIR/engine_state/engine_${i}.jsonl" \
|
||||
AGENTIC_WORKER_ID="engine_${i}" \
|
||||
CUDA_VISIBLE_DEVICES=$gpu \
|
||||
MASTER_PORT=$master \
|
||||
nohup "$VENV/vllm" serve "$MODEL" \
|
||||
--host 0.0.0.0 --port "$port" \
|
||||
--tensor-parallel-size 1 \
|
||||
--trust-remote-code --enable-prefix-caching \
|
||||
--dtype auto --gpu-memory-utilization 0.9 \
|
||||
--max-model-len 200000 \
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
|
||||
$EXTRA_VLLM_ARGS \
|
||||
> "$RUNDIR/logs/vllm_inst_${i}_gpu${gpu}.log" 2>&1 &
|
||||
else
|
||||
AGENTIC_STEP_LOG_PATH="$RUNDIR/engine_state/engine_${i}.jsonl" \
|
||||
AGENTIC_WORKER_ID="engine_${i}" \
|
||||
@@ -95,13 +115,17 @@ for gpu in $GPU_INDICES; do
|
||||
done
|
||||
|
||||
echo "[isolated] waiting for vLLM health ..."
|
||||
# NIXL init takes ~100-150s per instance even with concurrent launches;
|
||||
# Mooncake is closer to ~30-60s. Use a generous 360s timeout to cover
|
||||
# both (180 tries x 2s sleep; the previous limit was 90 tries = 180s).
|
||||
HEALTH_MAX_TRIES=180
|
||||
for i in $(seq 0 $((N_INSTANCES - 1))); do
|
||||
port=$((BASE_PORT + i))
|
||||
tries=0
|
||||
while ! curl -sf "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
|
||||
tries=$((tries + 1))
|
||||
if [ $tries -gt 90 ]; then
|
||||
echo "[isolated] FATAL: inst_$i not healthy after 180s"
|
||||
if [ $tries -gt $HEALTH_MAX_TRIES ]; then
|
||||
echo "[isolated] FATAL: inst_$i not healthy after $((HEALTH_MAX_TRIES * 2))s"
|
||||
exit 1
|
||||
fi
|
||||
sleep 2
|
||||
|
||||
@@ -762,6 +762,9 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
# Bootstrap combined instances for offload (need engine_ids for KV transfer)
|
||||
policy = getattr(global_args, 'policy', 'linear')
|
||||
# Mooncake-based modes still need bootstrap discovery; NIXL uses
|
||||
# its own UCX side-channel and doesn't go through our proxy
|
||||
# bootstrap path (and unified_nixl_both never PD-seps anyway).
|
||||
needs_bootstrap = (
|
||||
global_args.offload
|
||||
or policy in ("unified_v2", "unified_kv_both")
|
||||
@@ -927,11 +930,16 @@ async def _handle_combined(api, req_data, token_ids, input_length, session_id, h
|
||||
chosen, best_idx = pick_instance_sticky(
|
||||
combined_instances, token_ids, session_id, input_length,
|
||||
session_affinity_combined)
|
||||
elif policy == "unified" or policy == "unified_kv_both":
|
||||
elif policy in ("unified", "unified_kv_both", "unified_nixl_both"):
|
||||
# unified_kv_both: same picker as `unified`, but the vLLMs are
|
||||
# launched in kv_role=kv_both. Use this as an isolation control
|
||||
# for `unified_v2` so the v2-vs-v1 gap reflects only the PD-sep
|
||||
# branch, not the kv_both always-on overhead.
|
||||
# launched in kv_role=kv_both with MooncakeConnector. Use this
|
||||
# as an isolation control for `unified_v2` so the v2-vs-v1 gap
|
||||
# reflects only the PD-sep branch, not the kv_both always-on
|
||||
# overhead.
|
||||
# unified_nixl_both: identical to unified_kv_both but with
|
||||
# NixlConnector at the vLLM layer. Used to attribute the
|
||||
# kv_both overhead to either Mooncake-specific code or a
|
||||
# generic v1-connector cost.
|
||||
chosen, best_idx, decision = pick_instance_unified_hybrid(
|
||||
combined_instances, token_ids, session_id, input_length,
|
||||
session_affinity_combined)
|
||||
@@ -1291,13 +1299,17 @@ def parse_args():
|
||||
help="Comma-separated bootstrap ports for combined instances (for offload mode)")
|
||||
p.add_argument("--policy", type=str, default="linear",
|
||||
choices=["linear", "lmetric", "load_only", "sticky",
|
||||
"unified", "unified_kv_both", "unified_v2"],
|
||||
"unified", "unified_kv_both",
|
||||
"unified_nixl_both", "unified_v2"],
|
||||
help="Routing policy: linear (cache-aware), lmetric (P_tokens × BS), "
|
||||
"load_only (B3 control: pure min-num_requests), "
|
||||
"sticky (B3 control: hard session affinity), "
|
||||
"unified (hybrid affinity + LMetric fallback), "
|
||||
"unified_kv_both (unified on kv_both vLLMs; isolation "
|
||||
"control for unified_v2; PD-sep never triggers), "
|
||||
"unified_kv_both (unified picker on kv_both Mooncake "
|
||||
"vLLMs; isolation control for unified_v2), "
|
||||
"unified_nixl_both (same as unified_kv_both but using "
|
||||
"NixlConnector instead of MooncakeConnector; isolates "
|
||||
"connector implementation from policy effect), "
|
||||
"or unified_v2 (unified + selective per-request PD-sep "
|
||||
"via Mooncake; requires --bootstrap-ports and "
|
||||
"kv_role=kv_both vLLM launch)")
|
||||
|
||||
Reference in New Issue
Block a user