Fix A+C: real cache sync + cached-prefill-on-C architecture

A: Add an /estimate_hit endpoint to the bootstrap server for real-time cache probing. The proxy queries this endpoint before committing to PUSH, eliminating the 24% of PUSH requests that were zero-match (caused by shadow-cache divergence).

C: Add _handle_cached_prefill_offload: C (the cache source) runs a fast cached prefill and writes the KV to Mooncake; D then pulls the KV and decodes. This replaces the broken direct_read PUSH path, in which D waited for the RDMA transfer while occupying KV blocks without doing any compute.

Also: update the §3.9 baseline to plain vLLM, with full mean/p50/p90/p99.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-24 11:22:38 +08:00
parent 2b9eae0d54
commit cdf83493ab
3 changed files with 252 additions and 59 deletions
--- a/third_party/vllm/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_utils.py
+++ b/third_party/vllm/vllm/distributed/kv_transfer/kv_connector/v1/mooncake/mooncake_utils.py
@@ -61,6 +61,15 @@ class PushBlocksResponse(BaseModel):
    pushed: bool


+class EstimateHitRequest(BaseModel):
+    token_ids: list[int]
+    block_size: int = 512
+
+
+class EstimateHitResponse(BaseModel):
+    hit_tokens: int
+
+
 class UnpinBlocksRequest(BaseModel):
    pin_token: str

@@ -98,6 +107,7 @@ class MooncakeBootstrapServer:
        self.app.post("/query_blocks")(self.query_blocks)
        self.app.post("/unpin_blocks")(self.unpin_blocks)
        self.app.post("/push_blocks")(self.push_blocks)
+        self.app.post("/estimate_hit")(self.estimate_hit)

    def start(self):
        if self.server_thread:
@@ -286,6 +296,36 @@ class MooncakeBootstrapServer:
        self._pinned.pop(req.pin_token, None)
        return {"status": "ok"}

+    async def estimate_hit(self, req: EstimateHitRequest):
+        """Read-only probe: how many prefix-contiguous tokens are cached?"""
+        if self._kv_info is None:
+            raise HTTPException(503, "Worker KV info not registered yet")
+
+        block_size = req.block_size or self._kv_info.get("block_size", 512)
+        n_tokens = len(req.token_ids)
+        num_blocks = n_tokens // block_size
+        if num_blocks == 0 or not self._hash_table:
+            return EstimateHitResponse(hit_tokens=0)
+
+        import vllm.v1.core.kv_cache_utils as kv_utils
+        from vllm.utils.hashing import sha256
+
+        prev_hash = kv_utils.NONE_HASH
+        hit_blocks = 0
+        for i in range(num_blocks):
+            block_tokens = tuple(
+                req.token_ids[i * block_size:(i + 1) * block_size])
+            block_hash = kv_utils.hash_block_tokens(
+                sha256, prev_hash, block_tokens, None)
+            prev_hash = block_hash
+
+            if self._hash_table.get(block_hash.hex()) is not None:
+                hit_blocks += 1
+            else:
+                break
+
+        return EstimateHitResponse(hit_tokens=hit_blocks * block_size)
+
    async def push_blocks(self, req: PushBlocksRequest):
        """Query matching blocks by token_ids, then PUSH them to D via RDMA write."""
        if self._kv_info is None or self._transfer_engine is None: