Validates the elastic_migration_v2 finding that kv_role=kv_both raises
TTFT p90 by 45% even when PD-sep never fires. Replicates the finding under
a single-instance, synthetic, open-loop workload to disambiguate the
mechanism's own cost from 8-instance feedback amplification.
Configurations (8):
plain, noop_connector, mooncake_{producer,consumer,both},
nixl_both, lmcache_only, multi_mooncake_lmcache.
Pre-flight verification gates the risky configs: kv_consumer (needs a dummy
bootstrap), multi-connector composition, and NoOp custom class loading.
Workload: two-phase sweep
Phase A: rate {0.5..32} req/s × shape (4096, 256), saturation criteria
Phase B: ref_safe rate × cartesian (input ∈ {512,4k,32k}, output ∈ {64,256,1024})
Step-timing patch enriches vLLM's existing AGENTIC_STEP_LOG_PATH emit
with step_duration_us and build_meta_us — directly measures per-step
substrate cost, not just user-visible TTFT/TPOT.
run_all.sh runs as a 5-stage barrier:
0 pre-flight + apply patch
1 Phase A all configs
2 pick ref_safe / ref_load
3 Phase B all configs
4 revert patch + analyze + plot
Outputs aggregate.{json,csv}, MANIFEST.tsv, and 5 figures.
Estimated runtime: 4-5.5 hours on idle dash0 H20.
85 lines
2.3 KiB
Python
85 lines
2.3 KiB
Python
"""Dummy Mooncake bootstrap server for kv_consumer pre-flight.

Exposes the same HTTP routes as MooncakeBootstrapServer but returns
empty / accepting responses. Allows a kv_consumer vLLM to start up
without a real prefiller behind it.

Usage:
    python dummy_bootstrap.py --port 8997
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import threading
|
|
|
|
from fastapi import FastAPI, Request
|
|
from fastapi.responses import JSONResponse
|
|
import uvicorn
|
|
|
|
# Module-wide logger for this script. The root logger is configured at INFO
# so the register_worker payload logged by the /register handler is emitted.
log = logging.getLogger("dummy_bootstrap")
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
def make_app() -> FastAPI:
    """Build the FastAPI app that stands in for a Mooncake bootstrap server.

    Every route answers with a fixed "accepting" or empty JSON payload. The
    only state kept is an in-memory map of registered workers, private to the
    returned app instance and lost when the process exits.

    Returns:
        A FastAPI application with the /register, /query, /query_blocks,
        /unpin_blocks, /push_blocks, /estimate_hit and /health routes attached.
    """
    app = FastAPI()

    # "hash_table" is allocated but never read or written by any route below.
    state = {"workers": {}, "hash_table": {}}
    registered = state["workers"]

    def _ok() -> JSONResponse:
        # Fresh response object per request; the payload is always the same.
        return JSONResponse({"status": "ok"})

    # NOTE: handlers are documented with comments rather than docstrings so
    # the generated OpenAPI route descriptions stay exactly as they were.

    @app.post("/register")
    async def register_worker(req: Request):
        # Record the worker under its dp_rank and always report success.
        payload = await req.json()
        log.info("register_worker: %s", payload)
        rank = int(payload.get("dp_rank", 0))
        registered[rank] = {
            "engine_id": payload.get("engine_id", "dummy-engine"),
            "worker_addr": payload.get("worker_addr", {}),
        }
        return _ok()

    @app.get("/query")
    async def query():
        # Hand back whatever has been registered so far. An empty {} is
        # acceptable for the consumer because no PD-sep request will
        # actually trigger a pull.
        return JSONResponse(registered)

    @app.post("/query_blocks")
    async def query_blocks(req: Request):
        # Request body is ignored: nothing ever matches.
        return JSONResponse({"matched_blocks": []})

    @app.post("/unpin_blocks")
    async def unpin_blocks(req: Request):
        # Request body is ignored: acknowledge unconditionally.
        return _ok()

    @app.post("/push_blocks")
    async def push_blocks(req: Request):
        # Request body is ignored: acknowledge unconditionally.
        return _ok()

    @app.post("/estimate_hit")
    async def estimate_hit(req: Request):
        # Request body is ignored: always report zero hit tokens.
        return JSONResponse({"hit_tokens": 0})

    @app.get("/health")
    async def health():
        return _ok()

    return app
|
|
|
|
|
|
def main():
    """Parse --host/--port and serve the dummy bootstrap app with uvicorn.

    Defaults to 127.0.0.1:8997. Blocks in ``server.run()`` until uvicorn
    returns.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8997)
    opts = parser.parse_args()

    server = uvicorn.Server(
        uvicorn.Config(
            app=make_app(),
            host=opts.host,
            port=opts.port,
            log_level="info",
        )
    )
    # Logged before uvicorn starts, so this announces the configured
    # address rather than confirming a successful bind.
    log.info("Dummy Mooncake bootstrap listening on %s:%d", opts.host, opts.port)
    server.run()


# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|