# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Proxy server for disaggregated prefill/decode (PD) serving.

The proxy runs in one of two modes:

* PD mode (``--prefill`` / ``--decode``): every request is first sent to a
  prefill instance with ``max_tokens=1`` and then streamed from a decode
  instance that is told, via ``kv_transfer_params``, which prefill engine to
  pull from.
* Colo mode (``--colo``): requests are forwarded unchanged to one of the
  colocated (kv_both) instances.
"""

import argparse
import asyncio
import hashlib
import ipaddress
import itertools
import os
import traceback
import urllib.parse
import uuid
from contextlib import asynccontextmanager
from typing import Any

import httpx
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse

# Parsed CLI arguments. Assigned by the ``__main__`` entry point (or by an
# embedding script) before the app starts serving; ``None`` until then.
global_args = None

# Strong references to fire-and-forget tasks. The event loop only keeps weak
# references to tasks, so an unreferenced task may be garbage-collected before
# it finishes.
_background_tasks: set[asyncio.Task] = set()


def _on_background_task_done(task: asyncio.Task) -> None:
    """Drop the bookkeeping reference and report a failed background task."""
    _background_tasks.discard(task)
    if task.cancelled():
        return
    exc = task.exception()
    if exc is not None:
        print(f"Background task {task.get_name()!r} failed: {exc!r}")


def _spawn_background(coro: Any, name: str) -> asyncio.Task:
    """Schedule *coro* as a task that is kept alive until it completes.

    Args:
        coro: The coroutine object to run.
        name: Task name, used when reporting a failure.

    Returns:
        The created task.
    """
    task = asyncio.create_task(coro, name=name)
    _background_tasks.add(task)
    task.add_done_callback(_on_background_task_done)
    return task


def _session_hash(session_id: str) -> int:
    """Return a stable (non-PYTHONHASHSEED-dependent) 32-bit hash of *session_id*."""
    # md5 is used purely for bucketing, not for security.
    digest = hashlib.md5(session_id.encode(), usedforsecurity=False).hexdigest()
    return int(digest[:8], 16)


def _backend_headers(request_id: str) -> dict:
    """Build the headers sent to a backend for *request_id*.

    The Authorization header is only added when OPENAI_API_KEY is set, so an
    unset key is never forwarded as the literal string "Bearer None".
    """
    headers = {"X-Request-Id": request_id}
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    return headers


def _new_async_client(url: str, *, trust_env: bool = True) -> httpx.AsyncClient:
    """Create an httpx client for *url* with no timeout and no connection cap."""
    return httpx.AsyncClient(
        timeout=None,
        base_url=url,
        trust_env=trust_env,
        limits=httpx.Limits(
            max_connections=None,
            max_keepalive_connections=None,
        ),
    )


def maybe_wrap_ipv6_address(address: str) -> str:
    """Return *address* wrapped in ``[...]`` if it is an IPv6 literal."""
    try:
        ipaddress.IPv6Address(address)
        return f"[{address}]"
    except ValueError:
        return address


def make_http_path(host: str, port: int) -> str:
    """Return the ``http://host:port`` base URL for *host* and *port*."""
    return f"http://{host}:{port}"


def prefiller_cycle(prefill_clients: list[Any]):
    """Yield ``(prefill_client, dp_rank)`` pairs round-robin, forever.

    ``dp_size`` is read on every pass, so values filled in after the generator
    was created are picked up.

    Raises:
        RuntimeError: If a full pass finds no slot at all (no clients, or every
            client has ``dp_size == 0``). Without this check the loop would
            spin forever without yielding and block the event loop.
    """
    while True:
        yielded = False
        for prefill_client in prefill_clients:
            for dp_rank in range(prefill_client["dp_size"]):
                yielded = True
                yield prefill_client, dp_rank
        if not yielded:
            raise RuntimeError("No prefill (client, dp_rank) slots available")


async def get_prefiller_info(prefill_clients: list, ready: asyncio.Event):
    """Wait for every prefiller, record its engine ids, then set *ready*.

    For each client, ``/health`` is polled once per second until it succeeds.
    The bootstrap server's ``/query`` endpoint is then read to fill in
    ``dp_engine_id`` (keyed by integer dp rank) and ``dp_size``.

    A failing ``/query`` is not retried: the exception propagates and *ready*
    is never set.
    """
    for prefill_client in prefill_clients:
        while True:
            try:
                # Wait for prefill service to be ready
                response = await prefill_client["client"].get("/health")
                response.raise_for_status()
            except Exception:
                await asyncio.sleep(1)
                continue

            response = await prefill_client["client"].get(
                prefill_client["bootstrap_addr"] + "/query"
            )
            response.raise_for_status()
            data = response.json()
            break

        for dp_rank, dp_entry in data.items():
            prefill_client["dp_engine_id"][int(dp_rank)] = dp_entry["engine_id"]
        dp_size = len(data)
        prefill_client["dp_size"] = dp_size
        print(f"Inited prefiller {prefill_client['url']} with dp_size={dp_size}")

    ready.set()
    print("All prefiller instances are ready.")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Lifespan context manager to handle startup and shutdown events.

    Raises:
        RuntimeError: If ``global_args`` has not been assigned.
        ValueError: If a prefill URL has no hostname.
    """
    if global_args is None:
        raise RuntimeError(
            "global_args is not set; run this module as a script or assign "
            "global_args (see parse_args) before starting the app"
        )

    # Startup: Initialize client pools for prefiller and decoder services
    app.state.prefill_clients = []
    app.state.decode_clients = []
    app.state.colo_clients = []
    app.state.ready = asyncio.Event()

    # Colo (PD-combined) passthrough mode: no bootstrap handshake needed.
    if global_args.colo:
        for url in global_args.colo:
            app.state.colo_clients.append(
                {
                    # ignore http_proxy env: backends are localhost
                    "client": _new_async_client(url, trust_env=False),
                    "url": url,
                }
            )
        app.state.colo_iterator = itertools.cycle(range(len(app.state.colo_clients)))
        app.state.ready.set()
        print(f"Colo passthrough mode: {len(app.state.colo_clients)} kv_both clients.")
        try:
            yield
        finally:
            for client_info in app.state.colo_clients:
                await client_info["client"].aclose()
        return

    # Create prefill clients
    for url, bootstrap_port in global_args.prefill:
        hostname = urllib.parse.urlparse(url).hostname
        if hostname is None:
            # e.g. "localhost:8000" without a scheme; the bootstrap address
            # would otherwise silently become "http://None:<port>".
            raise ValueError(
                f"Invalid prefill URL {url!r}: expected scheme://host[:port]"
            )
        app.state.prefill_clients.append(
            {
                "client": _new_async_client(url),
                "url": url,
                # A missing bootstrap port falls back to 8998.
                "bootstrap_addr": make_http_path(
                    maybe_wrap_ipv6_address(hostname), bootstrap_port or 8998
                ),
                "dp_engine_id": {},
            }
        )

    # Create decode clients
    for url in global_args.decode:
        app.state.decode_clients.append({"client": _new_async_client(url)})

    # Discover prefiller engine ids in the background; requests are rejected
    # with 503 until app.state.ready is set.
    discovery_task = _spawn_background(
        get_prefiller_info(app.state.prefill_clients, app.state.ready),
        name="prefiller-discovery",
    )

    # Initialize round-robin iterators
    app.state.prefill_iterator = prefiller_cycle(app.state.prefill_clients)
    app.state.decode_iterator = itertools.cycle(range(len(app.state.decode_clients)))

    print(
        f"Got {len(app.state.prefill_clients)} prefill clients "
        f"and {len(app.state.decode_clients)} decode clients."
    )

    try:
        yield
    finally:
        # Shutdown: stop discovery if it is still polling, then close all clients
        discovery_task.cancel()
        for client_info in app.state.prefill_clients:
            await client_info["client"].aclose()
        for client_info in app.state.decode_clients:
            await client_info["client"].aclose()


# Update FastAPI app initialization to use lifespan
app = FastAPI(lifespan=lifespan)


def parse_args():
    """Parse the command line.

    Returns:
        The argparse namespace, with three derived attributes added:
        ``prefill`` (list of ``(url, bootstrap_port_or_None)``), ``decode``
        (list of URLs) and ``colo`` (list of URLs, empty when not in colo mode).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--port", type=int, default=8000)
    # Always use 127.0.0.1 as localhost binds to IPv6 which is blocked on CI
    parser.add_argument("--host", type=str, default="127.0.0.1")

    # For prefiller instances
    parser.add_argument(
        "--prefill",
        nargs="+",
        action="append",
        dest="prefill_raw",
        metavar=("URL", "bootstrap_port"),
        help=(
            "Prefill server URL and optional bootstrap port. "
            "Can be specified multiple times. "
            "Format: --prefill URL [BOOTSTRAP_PORT]. "
            "BOOTSTRAP_PORT can be a port number, "
            "'none', or omitted (defaults to none)."
        ),
    )

    # For decoder instances
    parser.add_argument(
        "--decode",
        nargs=1,
        action="append",
        dest="decode_raw",
        metavar=("URL",),
        help="Decode server URL. Can be specified multiple times.",
    )

    # MB5: colocated (PD-combined) instances. When given, the proxy runs in
    # "colo" mode — it round-robins /v1/completions to these kv_both instances
    # with a plain streaming passthrough (no P->D split, no kv_transfer_params).
    # This exists so the 8C baseline pays the SAME proxy hop as PD configs,
    # removing the "8C bypasses the proxy" confound from the comparison.
    parser.add_argument(
        "--colo",
        nargs=1,
        action="append",
        dest="colo_raw",
        metavar=("URL",),
        help="Colocated (kv_both) server URL. Can be specified multiple times. "
        "Enables colo passthrough mode.",
    )

    args = parser.parse_args()
    args.prefill = _parse_prefill_urls(args.prefill_raw)
    args.decode = _parse_decode_urls(args.decode_raw)
    # colo_raw is a list of single-element lists due to nargs=1
    args.colo = [u[0] for u in args.colo_raw] if args.colo_raw else []

    return args


# From sglang router_args.py
def _parse_prefill_urls(prefill_list):
    """Parse prefill URLs from --prefill arguments.

    Format: --prefill URL [BOOTSTRAP_PORT]
    Example:
        --prefill http://prefill1:8080 9000  # With bootstrap port
        --prefill http://prefill2:8080 none  # Explicitly no bootstrap port
        --prefill http://prefill3:8080       # Defaults to no bootstrap port

    Returns:
        A list of ``(url, bootstrap_port)`` tuples, where ``bootstrap_port`` is
        an int or ``None``. Empty when *prefill_list* is empty or ``None``.

    Raises:
        ValueError: If a bootstrap port is neither an integer nor 'none'.
    """
    if not prefill_list:
        return []

    prefill_urls = []
    for prefill_args in prefill_list:
        url = prefill_args[0]

        # No bootstrap port specified, or an explicit 'none': default to None
        bootstrap_port = None
        if len(prefill_args) >= 2:
            bootstrap_port_str = prefill_args[1]
            if bootstrap_port_str.lower() != "none":
                try:
                    bootstrap_port = int(bootstrap_port_str)
                except ValueError as e:
                    raise ValueError(
                        f"Invalid bootstrap port: {bootstrap_port_str}. Must be a number or 'none'"  # noqa: E501
                    ) from e

        prefill_urls.append((url, bootstrap_port))

    return prefill_urls


def _parse_decode_urls(decode_list):
    """Parse decode URLs from --decode arguments.

    Format: --decode URL
    Example:
        --decode http://decode1:8081
        --decode http://decode2:8081

    Returns:
        A flat list of URLs; empty when *decode_list* is empty or ``None``.
    """
    if not decode_list:
        return []

    # decode_list is a list of single-element lists due to nargs=1
    return [url[0] for url in decode_list]


# MB5: routing mode for the prefill (producer) side.
#   "rr"      — round-robin (official upstream behavior)
#   "session" — consistent hash on X-Session-Id, so all turns of a session
#               land on the same producer and reuse its prefix cache.
# Decode side stays round-robin (load balance) regardless.
MB5_P_ROUTING = os.environ.get("MB5_P_ROUTING", "rr").lower()

# MB5: routing mode for the COLO (kv_both) passthrough proxy.
#   "rr"      — round-robin (loses session-local prefix cache)
#   "session" — consistent hash on X-Session-Id, so all turns of a session land
#               on the same kv_both instance and reuse its prefix cache. This is
#               the cache-aware colo baseline (the fair strong baseline for the
#               agentic reuse regime — D4).
MB5_COLO_ROUTING = os.environ.get("MB5_COLO_ROUTING", "rr").lower()


def get_prefill_by_session(app, session_id: str):
    """Pick a (prefill_client, dp_rank) deterministically from session_id.

    Uses a stable (non-PYTHONHASHSEED-dependent) hash so the mapping is
    reproducible across processes. dp_size is usually 1 here (TP=1, no DP),
    but we hash into the flat (client, dp_rank) slot space to stay correct
    if a producer ever reports dp_size > 1.

    Raises:
        RuntimeError: If there are no prefill clients to choose from.
    """
    clients = app.state.prefill_clients
    slots = [(c, r) for c in clients for r in range(max(1, c.get("dp_size", 1)))]
    if not slots:
        raise RuntimeError("No prefill clients available for session routing")
    return slots[_session_hash(session_id) % len(slots)]


def get_next_client(app, service_type: str):
    """
    Get the next client in round-robin fashion.

    Args:
        app: The FastAPI app instance
        service_type: Either 'prefill' or 'decode'

    Returns:
        For 'prefill', a ``(prefill_client, dp_rank)`` tuple; for 'decode',
        the decode client dict.

    Raises:
        ValueError: If *service_type* is neither 'prefill' nor 'decode'.
    """
    if service_type == "prefill":
        return next(app.state.prefill_iterator)
    elif service_type == "decode":
        client_idx = next(app.state.decode_iterator)
        return app.state.decode_clients[client_idx]
    else:
        raise ValueError(f"Unknown service type: {service_type}")


async def send_request_to_service(
    client_info: dict, dp_rank: int, endpoint: str, req_data: dict, request_id: str
):
    """
    Send the prefill leg of a request using a client from the pool.

    The request is sent on a copy of *req_data*, non-streaming and capped at
    one output token, with ``kv_transfer_params`` marking it for remote decode.

    Raises:
        httpx.HTTPStatusError: If the prefill service returns an error status.
    """
    req_data = req_data.copy()
    req_data["kv_transfer_params"] = {
        "do_remote_decode": True,
        "do_remote_prefill": False,
        "transfer_id": f"xfer-{request_id}",
    }
    req_data["stream"] = False
    req_data["max_tokens"] = 1
    # MB5 fix: clients (our replayer) may set min_tokens to enforce a fixed
    # output length. After the proxy caps max_tokens=1 on the prefill leg,
    # any min_tokens > 1 violates vLLM's `min_tokens <= max_tokens` check.
    if "min_tokens" in req_data:
        req_data["min_tokens"] = 1
    if "max_completion_tokens" in req_data:
        req_data["max_completion_tokens"] = 1
    # stream_options is dropped because the prefill leg is non-streaming.
    req_data.pop("stream_options", None)

    headers = _backend_headers(request_id)
    headers["X-data-parallel-rank"] = str(dp_rank)

    response = await client_info["client"].post(
        endpoint, json=req_data, headers=headers
    )
    try:
        response.raise_for_status()
    finally:
        # CRITICAL: Release connection back to pool
        await response.aclose()


async def stream_service_response(
    prefill_client_info: dict,
    prefill_dp_rank: int,
    decode_client_info: dict,
    endpoint: str,
    req_data: dict,
    request_id: str,
):
    """
    Asynchronously stream the decode leg of a request, chunk by chunk.

    The decode service is pointed at the chosen prefill engine through
    ``kv_transfer_params``. *req_data* itself is not modified.

    Raises:
        httpx.HTTPStatusError: If the decode service returns an error status.
    """
    decode_req = {
        **req_data,
        "kv_transfer_params": {
            "do_remote_decode": False,
            "do_remote_prefill": True,
            "remote_bootstrap_addr": prefill_client_info["bootstrap_addr"],
            "remote_engine_id": prefill_client_info["dp_engine_id"][prefill_dp_rank],
            "transfer_id": f"xfer-{request_id}",
        },
    }

    async with decode_client_info["client"].stream(
        "POST", endpoint, json=decode_req, headers=_backend_headers(request_id)
    ) as response:
        response.raise_for_status()
        async for chunk in response.aiter_bytes():
            yield chunk


async def stream_colo_response(
    colo_client_info: dict, endpoint: str, req_data: dict, headers: dict
):
    """Plain streaming passthrough to one colocated (kv_both) instance.

    The request body is forwarded unchanged (stream/min_tokens/stream_options
    all preserved) so the replayer's streaming + usage parsing works exactly
    as it does when it talks to a colo instance directly.

    Raises:
        httpx.HTTPStatusError: If the instance returns an error status.
    """
    async with colo_client_info["client"].stream(
        "POST", endpoint, json=req_data, headers=headers
    ) as response:
        response.raise_for_status()
        async for chunk in response.aiter_bytes():
            yield chunk


async def _handle_colo(api: str, request: Request):
    """Forward a request to one colocated instance and stream the reply back.

    The instance is chosen by session hash when MB5_COLO_ROUTING is "session"
    and the request carries X-Session-Id, otherwise round-robin.

    Raises:
        HTTPException: 503 if the proxy is not ready yet.
    """
    if not app.state.ready.is_set():
        raise HTTPException(status_code=503, detail="Service Unavailable")

    req_data = await request.json()
    request_id = request.headers.get("X-Request-Id") or str(uuid.uuid4())
    headers = _backend_headers(request_id)
    session_id = request.headers.get("X-Session-Id")
    if session_id:
        headers["X-Session-Id"] = session_id

    if MB5_COLO_ROUTING == "session" and session_id:
        # consistent hash -> same kv_both instance reuses its prefix cache
        idx = _session_hash(session_id) % len(app.state.colo_clients)
    else:
        idx = next(app.state.colo_iterator)
    colo_client_info = app.state.colo_clients[idx]

    async def generate_stream():
        async for chunk in stream_colo_response(
            colo_client_info, api, req_data, headers
        ):
            yield chunk

    return StreamingResponse(generate_stream(), media_type="text/event-stream")


async def _handle_completions(api: str, request: Request):
    """Handle one completion request for endpoint *api*.

    In colo mode the request is delegated to ``_handle_colo``. Otherwise the
    prefill leg is started in the background and the decode leg is streamed
    back to the caller.

    Raises:
        HTTPException: 503 if the proxy is not ready yet, or if no prefill or
            decode backends are configured.
    """
    if getattr(global_args, "colo", None):
        return await _handle_colo(api, request)

    if not app.state.ready.is_set():
        raise HTTPException(status_code=503, detail="Service Unavailable")

    if not app.state.prefill_clients or not app.state.decode_clients:
        # Selecting from an empty pool cannot succeed; reject the request
        # cleanly instead.
        raise HTTPException(
            status_code=503, detail="No prefill/decode backends configured"
        )

    try:
        req_data = await request.json()
        # Always a fresh id (an incoming X-Request-Id is not reused here); it
        # also names the KV transfer ("xfer-<request_id>").
        request_id = str(uuid.uuid4())

        # Select the prefill (producer) client.
        if MB5_P_ROUTING == "session":
            session_id = request.headers.get("X-Session-Id") or request_id
            prefill_client_info, prefill_dp_rank = get_prefill_by_session(
                request.app, session_id
            )
        else:
            # Round-robin (official upstream behavior).
            prefill_client_info, prefill_dp_rank = get_next_client(
                request.app, "prefill"
            )

        # Send request to prefill service (fire-and-forget; failures are
        # reported by the background-task callback).
        _spawn_background(
            send_request_to_service(
                prefill_client_info, prefill_dp_rank, api, req_data, request_id
            ),
            name=f"prefill-{request_id}",
        )

        decode_client_info = get_next_client(request.app, "decode")

        # Stream response from decode service
        async def generate_stream():
            async for chunk in stream_service_response(
                prefill_client_info,
                prefill_dp_rank,
                decode_client_info,
                api,
                req_data,
                request_id=request_id,
            ):
                yield chunk

        return StreamingResponse(generate_stream(), media_type="application/json")

    except Exception as e:
        print(f"Error occurred in disagg prefill proxy server - {api} endpoint")
        print(e)
        print(traceback.format_exc())
        raise


@app.post("/v1/completions")
async def handle_completions(request: Request):
    """Serve POST /v1/completions."""
    return await _handle_completions("/v1/completions", request)


@app.post("/v1/chat/completions")
async def handle_chat_completions(request: Request):
    """Serve POST /v1/chat/completions."""
    return await _handle_completions("/v1/chat/completions", request)


if __name__ == "__main__":
    global_args = parse_args()

    import uvicorn

    uvicorn.run(app, host=global_args.host, port=global_args.port)