diff --git a/scripts/b3_isolated_policy.sh b/scripts/b3_isolated_policy.sh
index ccf3203..cb4eb3e 100755
--- a/scripts/b3_isolated_policy.sh
+++ b/scripts/b3_isolated_policy.sh
@@ -79,9 +79,16 @@ for gpu in $GPU_INDICES; do
             $EXTRA_VLLM_ARGS \
             > "$RUNDIR/logs/vllm_inst_${i}_gpu${gpu}.log" 2>&1 &
     elif [ "$ENABLE_KV_BOTH" = "1" ] && [ "$KV_CONNECTOR" = "Nixl" ]; then
-        # NixlConnector uses UCX side-channels for handshake (no bootstrap
-        # port needed). Side-channel host defaults to NIC IP discovered
-        # at register_kv_caches time.
+        # NixlConnector's handshake listener binds to a fixed default
+        # port 5600 unless VLLM_NIXL_SIDE_CHANNEL_PORT is overridden.
+        # Multiple instances on the same host MUST use distinct ports
+        # or only one will start; the rest hit
+        # `zmq.error.ZMQError: Address already in use`.
+        # Instance i is given base + i. The base defaults to 5600 and can
+        # be moved with NIXL_SIDE_CHANNEL_BASE_PORT, e.g. when a second
+        # copy of this script runs on the same host.
+        nixl_port=$(( ${NIXL_SIDE_CHANNEL_BASE_PORT:-5600} + i ))
+        VLLM_NIXL_SIDE_CHANNEL_PORT=$nixl_port \
         AGENTIC_STEP_LOG_PATH="$RUNDIR/engine_state/engine_${i}.jsonl" \
         AGENTIC_WORKER_ID="engine_${i}" \
         CUDA_VISIBLE_DEVICES=$gpu \