Fix A+C: real cache sync + cached-prefill-on-C architecture

A: Add an /estimate_hit endpoint to the bootstrap server for real-time cache
   probing. The proxy queries it before committing to PUSH, eliminating the
   24% of PUSH requests that were zero-match (shadow cache divergence).

C: Add _handle_cached_prefill_offload: C (cache source) does fast
   cached prefill → KV to Mooncake → D pulls and decodes.
   Replaces broken direct_read PUSH where D waited for RDMA transfer
   while occupying KV blocks without doing compute.

Also: update §3.9 baseline to plain vLLM with full mean/p50/p90/p99.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-24 11:22:38 +08:00
parent 2b9eae0d54
commit cdf83493ab
3 changed files with 252 additions and 59 deletions

View File

@@ -61,6 +61,15 @@ class PushBlocksResponse(BaseModel):
pushed: bool
class EstimateHitRequest(BaseModel):
    """Request body for the ``/estimate_hit`` cache probe."""

    # Prompt token ids to probe; the handler hashes them block by block,
    # starting from the first token.
    token_ids: list[int]
    # Tokens per hashed block. The handler only falls back to the worker's
    # registered block size when this value is falsy (e.g. 0).
    block_size: int = 512
class EstimateHitResponse(BaseModel):
    """Response body for the ``/estimate_hit`` cache probe."""

    # Number of leading prompt tokens covered by consecutive cached blocks,
    # as computed by the ``estimate_hit`` handler (hit blocks * block size).
    hit_tokens: int
class UnpinBlocksRequest(BaseModel):
pin_token: str
@@ -98,6 +107,7 @@ class MooncakeBootstrapServer:
self.app.post("/query_blocks")(self.query_blocks)
self.app.post("/unpin_blocks")(self.unpin_blocks)
self.app.post("/push_blocks")(self.push_blocks)
self.app.post("/estimate_hit")(self.estimate_hit)
def start(self):
if self.server_thread:
@@ -286,6 +296,36 @@ class MooncakeBootstrapServer:
self._pinned.pop(req.pin_token, None)
return {"status": "ok"}
async def estimate_hit(self, req: EstimateHitRequest):
    """Report how many leading tokens of a prompt are already cached.

    Read-only probe: walks the prompt in ``block_size``-token blocks, feeds
    each block's hash into the hash of the next one, and counts blocks until
    the first one that is missing from the hash table.

    Args:
        req: Token ids to probe and the block size used for hashing. A falsy
            ``block_size`` falls back to the ``block_size`` entry of the
            worker's KV info (512 if that entry is absent).

    Returns:
        EstimateHitResponse whose ``hit_tokens`` is the cached prefix length
        in tokens (hit blocks * block size; 0 when nothing matches).

    Raises:
        HTTPException: 503 if the worker has not registered its KV info yet.
    """
    if self._kv_info is None:
        raise HTTPException(503, "Worker KV info not registered yet")

    block_size = req.block_size or self._kv_info.get("block_size", 512)
    # A zero/missing block size would make the division below fail, and a
    # negative one can never produce a block; report "no hit" instead of
    # surfacing an internal error from a best-effort probe.
    if not block_size or block_size < 0:
        return EstimateHitResponse(hit_tokens=0)

    # Only complete blocks are hashed; a trailing partial block is ignored.
    num_blocks = len(req.token_ids) // block_size
    if num_blocks == 0 or not self._hash_table:
        return EstimateHitResponse(hit_tokens=0)

    # vLLM is imported only once hashing is actually needed.
    import vllm.v1.core.kv_cache_utils as kv_utils
    from vllm.utils.hashing import sha256

    prev_hash = kv_utils.NONE_HASH
    hit_blocks = 0
    for start in range(0, num_blocks * block_size, block_size):
        block_tokens = tuple(req.token_ids[start:start + block_size])
        # Each block hash is computed from the previous block's hash, so a
        # match implies the whole prefix up to this block matches as well.
        prev_hash = kv_utils.hash_block_tokens(
            sha256, prev_hash, block_tokens, None)
        if self._hash_table.get(prev_hash.hex()) is None:
            # First miss ends the contiguous cached prefix.
            break
        hit_blocks += 1

    return EstimateHitResponse(hit_tokens=hit_blocks * block_size)
async def push_blocks(self, req: PushBlocksRequest):
"""Query matching blocks by token_ids, then PUSH them to D via RDMA write."""
if self._kv_info is None or self._transfer_engine is None: