Fix A+C: real cache sync + cached-prefill-on-C architecture
A: Add an /estimate_hit endpoint to the bootstrap server for real-time cache probing. The proxy queries it before committing to PUSH, eliminating the 24% of PUSH requests that were zero-match (shadow cache divergence).

C: Add _handle_cached_prefill_offload: C (the cache source) does a fast cached prefill → KV to Mooncake → D pulls and decodes. This replaces the broken direct_read PUSH, where D waited for the RDMA transfer while occupying KV blocks without doing any compute.

Also: update the §3.9 baseline to plain vLLM with full mean/p50/p90/p99.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -61,6 +61,15 @@ class PushBlocksResponse(BaseModel):
|
||||
pushed: bool
|
||||
|
||||
|
||||
class EstimateHitRequest(BaseModel):
    """Request body for the ``/estimate_hit`` cache probe."""

    # Token IDs of the prompt whose cached prefix length is being probed.
    token_ids: list[int]

    # Number of tokens per block used when splitting ``token_ids`` for
    # hashing. Defaults to 512 when the caller omits it.
    block_size: int = 512
|
||||
|
||||
|
||||
class EstimateHitResponse(BaseModel):
    """Response body for the ``/estimate_hit`` cache probe."""

    # Count of leading prompt tokens found in the cache; the handler always
    # reports a whole number of blocks (matched blocks * block size).
    hit_tokens: int
|
||||
|
||||
|
||||
class UnpinBlocksRequest(BaseModel):
|
||||
pin_token: str
|
||||
|
||||
@@ -98,6 +107,7 @@ class MooncakeBootstrapServer:
|
||||
self.app.post("/query_blocks")(self.query_blocks)
|
||||
self.app.post("/unpin_blocks")(self.unpin_blocks)
|
||||
self.app.post("/push_blocks")(self.push_blocks)
|
||||
self.app.post("/estimate_hit")(self.estimate_hit)
|
||||
|
||||
def start(self):
|
||||
if self.server_thread:
|
||||
@@ -286,6 +296,36 @@ class MooncakeBootstrapServer:
|
||||
self._pinned.pop(req.pin_token, None)
|
||||
return {"status": "ok"}
|
||||
|
||||
async def estimate_hit(self, req: EstimateHitRequest):
    """Read-only probe: how many prefix-contiguous tokens are cached?

    The prompt in ``req.token_ids`` is cut into full blocks of ``block_size``
    tokens (a trailing partial block is ignored). A chained hash is computed
    block by block, each hash seeded with the previous block's hash, and the
    hex form of every hash is looked up in ``self._hash_table``. Counting
    stops at the first block that is not present, so only a contiguous prefix
    is ever reported. Nothing on ``self`` is modified.

    Args:
        req: Probe request carrying the token IDs and the block size to use.

    Returns:
        ``EstimateHitResponse`` whose ``hit_tokens`` is the number of matched
        leading blocks multiplied by the block size (0 when nothing matches,
        when the prompt is shorter than one block, when the hash table is
        empty, or when no positive block size can be resolved).

    Raises:
        HTTPException: 503 if the worker KV info has not been registered.
    """
    if self._kv_info is None:
        raise HTTPException(503, "Worker KV info not registered yet")

    # A falsy request value (e.g. 0) falls back to the size registered in the
    # worker KV info, and to 512 if that key is absent.
    block_size = req.block_size or self._kv_info.get("block_size", 512)

    # Without a positive block size there is no full block to match. Bail out
    # here instead of dividing by zero / None below. Negative sizes already
    # produced zero hits before this guard, so their result is unchanged.
    if not block_size or block_size <= 0:
        return EstimateHitResponse(hit_tokens=0)

    token_ids = req.token_ids
    num_blocks = len(token_ids) // block_size
    if num_blocks == 0 or not self._hash_table:
        return EstimateHitResponse(hit_tokens=0)

    # Imported at call time rather than at module top, as in the original
    # implementation (presumably to keep vllm off the module import path;
    # confirm before hoisting).
    import vllm.v1.core.kv_cache_utils as kv_utils
    from vllm.utils.hashing import sha256

    prev_hash = kv_utils.NONE_HASH
    hit_blocks = 0
    for start in range(0, num_blocks * block_size, block_size):
        block_tokens = tuple(token_ids[start:start + block_size])
        # NOTE(review): the trailing ``None`` is assumed to be the "no extra
        # keys" argument; it must match how ``_hash_table`` keys were built.
        block_hash = kv_utils.hash_block_tokens(
            sha256, prev_hash, block_tokens, None)

        if self._hash_table.get(block_hash.hex()) is None:
            # First miss ends the contiguous cached prefix.
            break

        hit_blocks += 1
        prev_hash = block_hash

    return EstimateHitResponse(hit_tokens=hit_blocks * block_size)
|
||||
|
||||
async def push_blocks(self, req: PushBlocksRequest):
|
||||
"""Query matching blocks by token_ids, then PUSH them to D via RDMA write."""
|
||||
if self._kv_info is None or self._transfer_engine is None:
|
||||
|
||||
Reference in New Issue
Block a user