diff --git a/patches/0001-fix-kv-transfer-abort-race.patch b/patches/0001-fix-kv-transfer-abort-race.patch new file mode 100644 index 0000000..09b7b30 --- /dev/null +++ b/patches/0001-fix-kv-transfer-abort-race.patch @@ -0,0 +1,24 @@ +--- a/vllm/v1/core/sched/scheduler.py ++++ b/vllm/v1/core/sched/scheduler.py +@@ -2097,7 +2097,9 @@ + # KV Connector:: update recv and send status from last step. + for req_id in kv_connector_output.finished_recving or (): + logger.debug("Finished recving KV transfer for request %s", req_id) +- assert req_id in self.requests ++ if req_id not in self.requests: ++ logger.warning("Skipping finished_recving for unknown request %s (already aborted?)", req_id) ++ continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) +@@ -2106,7 +2108,9 @@ + self._free_blocks(self.requests[req_id]) + for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) +- assert req_id in self.requests ++ if req_id not in self.requests: ++ logger.warning("Skipping finished_sending for unknown request %s (already aborted?)", req_id) ++ continue + self._free_blocks(self.requests[req_id]) + + def _update_requests_with_invalid_blocks( diff --git a/patches/README.md b/patches/README.md new file mode 100644 index 0000000..b54c5d0 --- /dev/null +++ b/patches/README.md @@ -0,0 +1,30 @@ +# vLLM Patches + +Patches against vLLM v0.18.1. Apply to either the source tree (`third_party/vllm/`) or the installed package. 
+ +## Applying + +```bash +# To source tree (for rebuilding) +cd third_party/vllm && git apply ../../patches/*.patch + +# To installed package (quick, no rebuild) +SITE=$(python -c "import vllm; print(vllm.__path__[0])") +for p in patches/*.patch; do + patch -p1 -d "$(dirname $SITE)" < "$p" +done +``` + +## Patches + +### 0001-fix-kv-transfer-abort-race.patch + +**File**: `vllm/v1/core/sched/scheduler.py` + +**Problem**: When a client disconnects (timeout/abort) during prefill/decode (PD) disaggregated serving, the Mooncake KV transfer callback can arrive after the request has already been removed from the scheduler. The `assert req_id in self.requests` check then fails and kills the engine process. + +**Fix**: Replace the fatal assert with a graceful skip and a warning log, in both the `finished_recving` and `finished_sending` loops. + +**Impact**: Without this patch, decode instances crash after ~200 requests under sustained load with concurrent KV transfers. + +**Upstream**: Not yet submitted. Could be upstreamed to vllm-project/vllm.