MB5 driver updates: PD-proxy + snapshot instrument + launcher tweaks

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-29 11:53:27 +08:00
parent bad512d3c5
commit ee5db0b321
4 changed files with 153 additions and 17 deletions
--- a/microbench/fresh_setup/instrument_kv_snapshot.py
+++ b/microbench/fresh_setup/instrument_kv_snapshot.py
@@ -194,24 +194,35 @@ MOONCAKE_PATCHES = [
 ]
 # ---------- Patch 4: vLLM 0.18.1 PD-consumer metrics counter underflow ------
-# In PromptTokenStats.update_from_output, local_cache_hit is computed as
+# On a kv_consumer, a KV-load failure (Mooncake transfer returns -1 when the
-# (num_cached_tokens + recomputed - num_external_computed_tokens). On a
+# D-pool is full) makes vLLM emit an iteration-stats "correction" with NEGATIVE
-# kv_consumer, a remote KV transfer can report more external-computed tokens
+# token deltas: PromptTokenStats fields (local_cache_hit, cached_tokens, ...)
-# than the scheduler's cached count (esp. on a KV-load failure for a large
+# AND iteration_stats.{num_prompt_tokens, num_generation_tokens} all go below
-# request), driving local_cache_hit negative. loggers.record() then calls
+# zero. Every Counter.inc() in loggers.record() then trips prometheus_client's
 # Counter.inc() with that negative value and prometheus_client raises
 # "Counters can only be incremented by non-negative amounts.", which kills the
 # EngineCore — turning one failed request into a total config collapse.
-# We clamp the per-source counts to >= 0 right before they are recorded.
+#
-LOGGERS_ANCHOR = "        pts = iteration_stats.prompt_token_stats\n"
+# Clamp every field that feeds a Counter.inc() in record() to >= 0. We anchor
 # right after the `if iteration_stats is None: return` guard so the clamp runs
 # before the first inc() (the corrupted-requests / preempted / prompt-token
 # counters at the top of the method).
 LOGGERS_ANCHOR = "        if iteration_stats is None:\n            return\n"
 LOGGERS_INSERT = (
    f"        {START_MARK}\n"
-    f"        if pts.local_cache_hit < 0:\n"
+    f"        _mb5_pts = iteration_stats.prompt_token_stats\n"
-    f"            pts.local_cache_hit = 0\n"
+    f"        for _mb5_o, _mb5_a in (\n"
-    f"        if pts.computed < 0:\n"
+    f"            (iteration_stats, 'num_prompt_tokens'),\n"
-    f"            pts.computed = 0\n"
+    f"            (iteration_stats, 'num_generation_tokens'),\n"
-    f"        if pts.external_kv_transfer < 0:\n"
+    f"            (iteration_stats, 'num_preempted_reqs'),\n"
-    f"            pts.external_kv_transfer = 0\n"
+    f"            (iteration_stats, 'num_corrupted_reqs'),\n"
    f"            (_mb5_pts, 'computed'),\n"
    f"            (_mb5_pts, 'local_cache_hit'),\n"
    f"            (_mb5_pts, 'external_kv_transfer'),\n"
    f"            (_mb5_pts, 'cached_tokens'),\n"
    f"            (_mb5_pts, 'recomputed_tokens'),\n"
    f"        ):\n"
    f"            if getattr(_mb5_o, _mb5_a, 0) < 0:\n"
    f"                setattr(_mb5_o, _mb5_a, 0)\n"
    f"        {END_MARK}\n"
 )
--- a/microbench/fresh_setup/mb5_launch.sh
+++ b/microbench/fresh_setup/mb5_launch.sh
@@ -24,7 +24,9 @@
 set -eo pipefail
 FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
-VENV="${FRESH_ROOT}/.venv"
+# MB5_VENV lets a second host use an isolated venv clone (e.g. .venv_dash0) so
 # two boxes can run in parallel without racing on the shared cpfs venv patch.
 VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
 MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 INSTRUMENT="${SCRIPT_DIR}/instrument_kv_snapshot.py"
@@ -86,14 +88,16 @@ case "${1:-start}" in
 esac
 # --- parse CONFIG into (prefill_gpus, decode_gpus) ----------------
 USE_COLO_PROXY=0
 case "${CONFIG}" in
    8C)        ROLES="combined"; P_GPUS=""; D_GPUS=""; COMBINED_GPUS="0,1,2,3,4,5,6,7" ;;
    8C-proxy)  ROLES="combined"; USE_COLO_PROXY=1; P_GPUS=""; D_GPUS=""; COMBINED_GPUS="0,1,2,3,4,5,6,7" ;;
    6P+2D)     ROLES="pd"; P_GPUS="0,1,2,3,4,5"; D_GPUS="6,7" ;;
    5P+3D)     ROLES="pd"; P_GPUS="0,1,2,3,4"; D_GPUS="5,6,7" ;;
    4P+4D)     ROLES="pd"; P_GPUS="0,1,2,3"; D_GPUS="4,5,6,7" ;;
    3P+5D)     ROLES="pd"; P_GPUS="0,1,2"; D_GPUS="3,4,5,6,7" ;;
    2P+6D)     ROLES="pd"; P_GPUS="0,1"; D_GPUS="2,3,4,5,6,7" ;;
-    *) echo "Unknown CONFIG=${CONFIG} (expected: 8C, 6P+2D, 5P+3D, 4P+4D, 3P+5D, 2P+6D)"; exit 1;;
+    *) echo "Unknown CONFIG=${CONFIG} (expected: 8C, 8C-proxy, 6P+2D, 5P+3D, 4P+4D, 3P+5D, 2P+6D)"; exit 1;;
 esac
 stop_all
@@ -137,6 +141,7 @@ launch_vllm() {
 idx=0
 proxy_args=()
 colo_args=()
 ENDPOINTS=""
 case "${ROLES}" in
@@ -147,6 +152,7 @@ case "${ROLES}" in
            bp=$((BASE_BP+idx))
            launch_vllm "${idx}" "${gpu}" "${port}" "kv_both" "${bp}"
            ENDPOINTS+="${ENDPOINTS:+,}http://127.0.0.1:${port}"
            colo_args+=( --colo "http://127.0.0.1:${port}" )
            idx=$((idx+1))
            sleep 1
        done
@@ -215,6 +221,25 @@ if [ "${ROLES}" = "pd" ]; then
    ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
 fi
 if [ "${USE_COLO_PROXY}" = "1" ]; then
    echo "[mb5] launching colo passthrough proxy on ${PROXY_PORT} (8 kv_both instances)"
    nohup python "${PROXY_SRC}" "${colo_args[@]}" --port "${PROXY_PORT}" --host 0.0.0.0 \
        > "${LOGS_DIR}/proxy.log" 2>&1 &
    disown
    tries=0
    while ! curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:${PROXY_PORT}/" 2>/dev/null | grep -qE "^[0-9]"; do
        tries=$((tries+1))
        if [ ${tries} -gt 60 ]; then
            echo "[mb5] FATAL colo proxy did not come up in 2 min"
            tail -40 "${LOGS_DIR}/proxy.log" || true
            exit 1
        fi
        sleep 2
    done
    echo "  colo proxy port=${PROXY_PORT} ready (HTTP responding)"
    ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
 fi
 echo "[mb5] CONFIG=${CONFIG}  RUN_LABEL=${RUN_LABEL}  UP"
 echo "ENDPOINTS=${ENDPOINTS}"
 echo "RUN_ROOT=${RUN_ROOT}"
--- a/microbench/fresh_setup/mb5_pd_proxy.py
+++ b/microbench/fresh_setup/mb5_pd_proxy.py
@@ -72,8 +72,32 @@ async def lifespan(app: FastAPI):
    # Startup: Initialize client pools for prefiller and decoder services
    app.state.prefill_clients = []
    app.state.decode_clients = []
    app.state.colo_clients = []
    app.state.ready = asyncio.Event()
    # Colo (PD-combined) passthrough mode: no bootstrap handshake needed.
    if global_args.colo:
        for url in global_args.colo:
            app.state.colo_clients.append({
                "client": httpx.AsyncClient(
                    timeout=None,
                    base_url=url,
                    trust_env=False,  # ignore http_proxy env: backends are localhost
                    limits=httpx.Limits(
                        max_connections=None,
                        max_keepalive_connections=None,
                    ),
                ),
                "url": url,
            })
        app.state.colo_iterator = itertools.cycle(range(len(app.state.colo_clients)))
        app.state.ready.set()
        print(f"Colo passthrough mode: {len(app.state.colo_clients)} kv_both clients.")
        yield
        for client_info in app.state.colo_clients:
            await client_info["client"].aclose()
        return
    # Create prefill clients
    for i, (url, bootstrap_port) in enumerate(global_args.prefill):
        parsed_url = urllib.parse.urlparse(url)
@@ -169,9 +193,25 @@ def parse_args():
        help="Decode server URL. Can be specified multiple times.",
    )
    # MB5: colocated (PD-combined) instances. When given, the proxy runs in
    # "colo" mode — it round-robins /v1/completions to these kv_both instances
    # with a plain streaming passthrough (no P->D split, no kv_transfer_params).
    # This exists so the 8C baseline pays the SAME proxy hop as PD configs,
    # removing the "8C bypasses the proxy" confound from the comparison.
    parser.add_argument(
        "--colo",
        nargs=1,
        action="append",
        dest="colo_raw",
        metavar=("URL",),
        help="Colocated (kv_both) server URL. Can be specified multiple times. "
             "Enables colo passthrough mode.",
    )
    args = parser.parse_args()
    args.prefill = _parse_prefill_urls(args.prefill_raw)
    args.decode = _parse_decode_urls(args.decode_raw)
    args.colo = [u[0] for u in args.colo_raw] if args.colo_raw else []
    return args
@@ -235,6 +275,14 @@ def _parse_decode_urls(decode_list):
 # Decode side stays round-robin (load balance) regardless.
 MB5_P_ROUTING = os.environ.get("MB5_P_ROUTING", "rr").lower()
 # MB5: routing mode for the COLO (kv_both) passthrough proxy.
 #   "rr"      — round-robin (loses session-local prefix cache)
 #   "session" — consistent hash on X-Session-Id, so all turns of a session land
 #               on the same kv_both instance and reuse its prefix cache. This is
 #               the cache-aware colo baseline (the fair strong baseline for the
 #               agentic reuse regime — D4).
 MB5_COLO_ROUTING = os.environ.get("MB5_COLO_ROUTING", "rr").lower()
 def get_prefill_by_session(app, session_id: str):
    """Pick a (prefill_client, dp_rank) deterministically from session_id.
@@ -340,7 +388,58 @@ async def stream_service_response(
            yield chunk
 async def stream_colo_response(
    colo_client_info: dict, endpoint: str, req_data: dict, headers: dict
 ):
    """Plain streaming passthrough to one colocated (kv_both) instance.
    The request body is forwarded unchanged (stream/min_tokens/stream_options
    all preserved) so the replayer's streaming + usage parsing works exactly
    as it does when it talks to a colo instance directly.
    """
    async with colo_client_info["client"].stream(
        "POST", endpoint, json=req_data, headers=headers
    ) as response:
        response.raise_for_status()
        async for chunk in response.aiter_bytes():
            yield chunk
 async def _handle_colo(api: str, request: Request):
    if not app.state.ready.is_set():
        raise HTTPException(status_code=503, detail="Service Unavailable")
    req_data = await request.json()
    request_id = request.headers.get("X-Request-Id") or str(uuid.uuid4())
    headers = {"X-Request-Id": request_id}
    session_id = request.headers.get("X-Session-Id")
    if session_id:
        headers["X-Session-Id"] = session_id
    key = os.environ.get("OPENAI_API_KEY")
    if key:
        headers["Authorization"] = f"Bearer {key}"
    if MB5_COLO_ROUTING == "session" and session_id:
        # consistent hash -> same kv_both instance reuses its prefix cache
        h = int(hashlib.md5(session_id.encode()).hexdigest()[:8], 16)
        idx = h % len(app.state.colo_clients)
    else:
        idx = next(app.state.colo_iterator)
    colo_client_info = app.state.colo_clients[idx]
    async def generate_stream():
        async for chunk in stream_colo_response(
            colo_client_info, api, req_data, headers
        ):
            yield chunk
    return StreamingResponse(generate_stream(), media_type="text/event-stream")
 async def _handle_completions(api: str, request: Request):
    if getattr(global_args, "colo", None):
        return await _handle_colo(api, request)
    if not app.state.ready.is_set():
        raise HTTPException(status_code=503, detail="Service Unavailable")
--- a/microbench/fresh_setup/mb5_run.sh
+++ b/microbench/fresh_setup/mb5_run.sh
@@ -17,7 +17,8 @@
 set -eo pipefail
 FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
-VENV="${FRESH_ROOT}/.venv"
+# MB5_VENV lets a second host use an isolated venv clone (see mb5_launch.sh).
 VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
 REPLAYER_DIR="${FRESH_ROOT}/replayer"