From e13391eeab4c85ec9360abf18a9b1b6acd8fa9fa Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Sun, 24 May 2026 16:56:34 +0800
Subject: [PATCH] Evict migrated blocks from prefix cache after KV send
 completes

After a session migrates from C to D via offload, C's blocks were freed
to the LRU tail (most-recently-used position), making them the last to
be evicted. Since the session won't return to C, these blocks are dead
weight occupying cache capacity.

Now capture the request's block IDs before calling _free_blocks, then
call evict_blocks afterwards to remove them from the prefix cache hash
table, so they can be reused sooner for active sessions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 third_party/vllm/vllm/v1/core/sched/scheduler.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/vllm/vllm/v1/core/sched/scheduler.py b/third_party/vllm/vllm/v1/core/sched/scheduler.py
index f817deb..528fc40 100644
--- a/third_party/vllm/vllm/v1/core/sched/scheduler.py
+++ b/third_party/vllm/vllm/v1/core/sched/scheduler.py
@@ -2116,7 +2116,12 @@ class Scheduler(SchedulerInterface):
             if req_id not in self.requests:
                 logger.warning("Skipping finished_sending for unknown request %s (already aborted?)", req_id)
                 continue
+            sent_block_ids: set[int] = set()
+            for group in self.kv_cache_manager.get_block_ids(req_id):
+                sent_block_ids.update(group)
             self._free_blocks(self.requests[req_id])
+            if sent_block_ids:
+                self.kv_cache_manager.evict_blocks(sent_block_ids)
 
     def _update_requests_with_invalid_blocks(
         self,