Files
agentic-kvc/microbench/fresh_setup/mb5_pd_proxy.py

513 lines
17 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import hashlib
import ipaddress
import itertools
import os
import urllib
import urllib.parse
import uuid
from contextlib import asynccontextmanager
from typing import Any

import httpx
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
def maybe_wrap_ipv6_address(address: str) -> str:
    """Return *address* wrapped in brackets when it is an IPv6 literal.

    Anything ``ipaddress.IPv6Address`` rejects with ``ValueError`` (IPv4
    literals, host names, ...) is returned unchanged.
    """
    try:
        ipaddress.IPv6Address(address)
    except ValueError:
        # Not an IPv6 literal: hand it back untouched.
        return address
    else:
        return f"[{address}]"
def make_http_path(host: str, port: int) -> str:
    """Build the ``http://<host>:<port>`` base URL for a backend."""
    base_url = f"http://{host}:{port}"
    return base_url
def prefiller_cycle(prefill_clients: list[Any]):
    """Endlessly yield ``(prefill_client, dp_rank)`` pairs in round-robin order.

    Each pass walks the clients in list order and, for every client, emits
    one pair per rank in ``range(prefill_client["dp_size"])``. ``dp_size`` is
    re-read on every pass, so values filled in after the generator was
    created are picked up.

    Args:
        prefill_clients: client-info dicts, each carrying a ``"dp_size"`` key
            by the time the generator is first advanced.

    Raises:
        ValueError: when a whole pass produces no pair (no clients at all, or
            every client has ``dp_size == 0``). Without this guard the loop
            would spin forever without yielding and block the caller.
    """
    while True:
        yielded_any = False
        for prefill_client in prefill_clients:
            for dp_rank in range(prefill_client["dp_size"]):
                yielded_any = True
                yield prefill_client, dp_rank
        if not yielded_any:
            raise ValueError("No prefill client with a non-zero dp_size is available")
async def get_prefiller_info(prefill_clients: list, ready: asyncio.Event):
    """Wait for every prefill server and record its data-parallel layout.

    For each client, ``/health`` is polled once per second until it answers
    successfully. The client's bootstrap address is then queried at
    ``/query``; the JSON reply is read as a mapping of dp rank to an entry
    holding ``engine_id``. The results are stored on the client-info dict
    under ``"dp_engine_id"`` (keyed by integer rank) and ``"dp_size"``.

    A failure of the ``/query`` call itself is not retried and propagates.
    ``ready`` is set once all clients have been processed.
    """
    for entry in prefill_clients:
        healthy = False
        while not healthy:
            try:
                # Wait for prefill service to be ready
                probe = await entry["client"].get("/health")
                probe.raise_for_status()
            except Exception:
                await asyncio.sleep(1)
            else:
                healthy = True

        reply = await entry["client"].get(entry["bootstrap_addr"] + "/query")
        reply.raise_for_status()
        layout = reply.json()

        for rank_key, rank_info in layout.items():
            entry["dp_engine_id"][int(rank_key)] = rank_info["engine_id"]
        rank_count = len(layout)
        entry["dp_size"] = rank_count
        print(f"Inited prefiller {entry['url']} with dp_size={rank_count}")

    ready.set()
    print("All prefiller instances are ready.")
def _new_backend_client(base_url: str, trust_env: bool = True) -> "httpx.AsyncClient":
    """Create an httpx client for one backend: no timeout, unbounded pool."""
    return httpx.AsyncClient(
        timeout=None,
        base_url=base_url,
        trust_env=trust_env,
        limits=httpx.Limits(
            max_connections=None,
            max_keepalive_connections=None,
        ),
    )


async def _close_clients(client_infos: list) -> None:
    """Close the httpx client held by every client-info dict."""
    for client_info in client_infos:
        await client_info["client"].aclose()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Lifespan context manager to handle startup and shutdown events.

    Startup populates ``app.state`` with the backend client pools, the
    round-robin iterators and the ``ready`` event. Two modes exist, selected
    by ``global_args.colo``:

    * colo passthrough: one client per colocated URL, ``ready`` is set
      immediately;
    * prefill/decode: one client per prefill and decode URL, and a background
      task (``get_prefiller_info``) that sets ``ready`` once every prefiller
      has been queried.

    Shutdown cancels the background task if it is still running and closes
    every client, also when the application exits with an exception.
    """
    # Startup: Initialize client pools for prefiller and decoder services
    app.state.prefill_clients = []
    app.state.decode_clients = []
    app.state.colo_clients = []
    app.state.ready = asyncio.Event()

    # Colo (PD-combined) passthrough mode: no bootstrap handshake needed.
    if global_args.colo:
        for url in global_args.colo:
            app.state.colo_clients.append(
                {
                    # trust_env=False: ignore http_proxy env, backends are localhost
                    "client": _new_backend_client(url, trust_env=False),
                    "url": url,
                }
            )
        app.state.colo_iterator = itertools.cycle(range(len(app.state.colo_clients)))
        app.state.ready.set()
        print(f"Colo passthrough mode: {len(app.state.colo_clients)} kv_both clients.")
        try:
            yield
        finally:
            await _close_clients(app.state.colo_clients)
        return

    # Create prefill clients
    for url, bootstrap_port in global_args.prefill:
        hostname = maybe_wrap_ipv6_address(urllib.parse.urlparse(url).hostname)
        app.state.prefill_clients.append(
            {
                "client": _new_backend_client(url),
                "url": url,
                # A missing bootstrap port falls back to 8998.
                "bootstrap_addr": make_http_path(hostname, bootstrap_port or 8998),
                "dp_engine_id": {},
            }
        )

    # Create decode clients
    for url in global_args.decode:
        app.state.decode_clients.append({"client": _new_backend_client(url)})

    # Keep a strong reference: the event loop only holds weak references to
    # tasks, so an unreferenced task may be garbage-collected mid-flight.
    app.state.prefiller_info_task = asyncio.create_task(
        get_prefiller_info(app.state.prefill_clients, app.state.ready)
    )

    # Initialize round-robin iterators
    app.state.prefill_iterator = prefiller_cycle(app.state.prefill_clients)
    app.state.decode_iterator = itertools.cycle(range(len(app.state.decode_clients)))

    print(
        f"Got {len(app.state.prefill_clients)} prefill clients "
        f"and {len(app.state.decode_clients)} decode clients."
    )

    try:
        yield
    finally:
        # Shutdown: stop the discovery task (no-op if it already finished)
        # and swallow its outcome so a failed probe cannot abort the cleanup.
        discovery_task = app.state.prefiller_info_task
        discovery_task.cancel()
        await asyncio.gather(discovery_task, return_exceptions=True)
        # Close all clients
        await _close_clients(app.state.prefill_clients)
        await _close_clients(app.state.decode_clients)
# Module-level ASGI application. `lifespan` builds the backend client pools
# on startup and closes them on shutdown.
app = FastAPI(lifespan=lifespan)
def parse_args(argv=None):
    """Parse the proxy's command-line options.

    Args:
        argv: optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the previous behavior.

    Returns:
        The argparse namespace, extended with:
        ``prefill`` - list of ``(url, bootstrap_port_or_None)`` tuples,
        ``decode``  - list of decode URLs,
        ``colo``    - list of colocated URLs (empty when ``--colo`` is absent).

    The parser exits with a usage error when neither a colo backend nor a
    prefill+decode pair is supplied, because the proxy could not serve any
    request in that configuration.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=8000)
    # Always use 127.0.0.1 as localhost binds to IPv6 which is blocked on CI
    parser.add_argument("--host", type=str, default="127.0.0.1")

    # For prefiller instances
    parser.add_argument(
        "--prefill",
        nargs="+",
        action="append",
        dest="prefill_raw",
        metavar=("URL", "bootstrap_port"),
        help=(
            "Prefill server URL and optional bootstrap port. "
            "Can be specified multiple times. "
            "Format: --prefill URL [BOOTSTRAP_PORT]. "
            "BOOTSTRAP_PORT can be a port number, "
            "'none', or omitted (defaults to none)."
        ),
    )

    # For decoder instances
    parser.add_argument(
        "--decode",
        nargs=1,
        action="append",
        dest="decode_raw",
        metavar=("URL",),
        help="Decode server URL. Can be specified multiple times.",
    )

    # MB5: colocated (PD-combined) instances. When given, the proxy runs in
    # "colo" mode: it round-robins /v1/completions to these kv_both instances
    # with a plain streaming passthrough (no P->D split, no kv_transfer_params).
    # This exists so the 8C baseline pays the SAME proxy hop as PD configs,
    # removing the "8C bypasses the proxy" confound from the comparison.
    parser.add_argument(
        "--colo",
        nargs=1,
        action="append",
        dest="colo_raw",
        metavar=("URL",),
        help="Colocated (kv_both) server URL. Can be specified multiple times. "
        "Enables colo passthrough mode.",
    )

    args = parser.parse_args(argv)
    args.prefill = _parse_prefill_urls(args.prefill_raw)
    args.decode = _parse_decode_urls(args.decode_raw)
    # Each --colo occurrence is a one-element list because of nargs=1.
    args.colo = [u[0] for u in args.colo_raw] if args.colo_raw else []

    # Fail fast: without backends every request would error out or hang.
    if not args.colo and not (args.prefill and args.decode):
        parser.error(
            "provide at least one --colo URL, or at least one --prefill "
            "and one --decode URL"
        )
    return args
# From sglang router_args.py
def _parse_prefill_urls(prefill_list):
"""Parse prefill URLs from --prefill arguments.
Format: --prefill URL [BOOTSTRAP_PORT]
Example:
--prefill http://prefill1:8080 9000 # With bootstrap port
--prefill http://prefill2:8080 none # Explicitly no bootstrap port
--prefill http://prefill3:8080 # Defaults to no bootstrap port
"""
if not prefill_list:
return []
prefill_urls = []
for prefill_args in prefill_list:
url = prefill_args[0]
# Handle optional bootstrap port
if len(prefill_args) >= 2:
bootstrap_port_str = prefill_args[1]
# Handle 'none' as None
if bootstrap_port_str.lower() == "none":
bootstrap_port = None
else:
try:
bootstrap_port = int(bootstrap_port_str)
except ValueError as e:
raise ValueError(
f"Invalid bootstrap port: {bootstrap_port_str}. Must be a number or 'none'" # noqa: E501
) from e
else:
# No bootstrap port specified, default to None
bootstrap_port = None
prefill_urls.append((url, bootstrap_port))
return prefill_urls
def _parse_decode_urls(decode_list):
"""Parse decode URLs from --decode arguments.
Format: --decode URL
Example: --decode http://decode1:8081 --decode http://decode2:8081
"""
if not decode_list:
return []
# decode_list is a list of single-element lists due to nargs=1
return [url[0] for url in decode_list]
# MB5: routing mode for the prefill (producer) side, read once at import time
# from the MB5_P_ROUTING environment variable (case-insensitive, default "rr").
# "rr" — round-robin (official upstream behavior)
# "session" — consistent hash on X-Session-Id, so all turns of a session
# land on the same producer and reuse its prefix cache. A request
# without that header is hashed on its generated request id.
# Any value other than "session" is treated as round-robin.
# Decode side stays round-robin (load balance) regardless.
MB5_P_ROUTING = os.environ.get("MB5_P_ROUTING", "rr").lower()
# MB5: routing mode for the COLO (kv_both) passthrough proxy, read once at
# import time from the MB5_COLO_ROUTING environment variable (case-insensitive,
# default "rr").
# "rr" — round-robin (loses session-local prefix cache)
# "session" — consistent hash on X-Session-Id, so all turns of a session land
# on the same kv_both instance and reuse its prefix cache. This is
# the cache-aware colo baseline (the fair strong baseline for the
# agentic reuse regime — D4).
# Any other value, or a request without X-Session-Id, falls back to round-robin.
MB5_COLO_ROUTING = os.environ.get("MB5_COLO_ROUTING", "rr").lower()
def get_prefill_by_session(app, session_id: str):
    """Pick a (prefill_client, dp_rank) deterministically from session_id.

    Uses a stable (non-PYTHONHASHSEED-dependent) hash so the mapping is
    reproducible across processes. The hash indexes the flat list of
    ``(client, dp_rank)`` slots built from ``app.state.prefill_clients``;
    a client with a missing or zero ``dp_size`` contributes one slot.

    Args:
        app: object exposing ``state.prefill_clients`` (list of client dicts).
        session_id: the key to hash.

    Returns:
        The ``(prefill_client, dp_rank)`` tuple selected for this session.

    Raises:
        ValueError: if there are no prefill clients to choose from.
    """
    clients = app.state.prefill_clients
    slots = [(c, r) for c in clients for r in range(max(1, c.get("dp_size", 1)))]
    if not slots:
        raise ValueError("No prefill clients are configured")
    # md5 is used purely as a stable bucketing hash; flagging it as
    # non-security keeps it usable on FIPS-restricted Python builds.
    digest = hashlib.md5(session_id.encode(), usedforsecurity=False).hexdigest()
    return slots[int(digest[:8], 16) % len(slots)]
def get_next_client(app, service_type: str):
    """
    Advance the round-robin iterator of one service and return its pick.

    Args:
        app: The FastAPI app instance
        service_type: Either 'prefill' or 'decode'

    Returns:
        For 'prefill', whatever ``app.state.prefill_iterator`` yields next;
        for 'decode', the entry of ``app.state.decode_clients`` at the index
        yielded by ``app.state.decode_iterator``.

    Raises:
        ValueError: for any other service type.
    """
    state = app.state
    if service_type == "prefill":
        return next(state.prefill_iterator)
    if service_type == "decode":
        return state.decode_clients[next(state.decode_iterator)]
    raise ValueError(f"Unknown service type: {service_type}")
async def send_request_to_service(
    client_info: dict, dp_rank: int, endpoint: str, req_data: dict, request_id: str
):
    """
    Send the prefill leg of a request to a prefill service.

    A shallow copy of ``req_data`` is rewritten into a non-streaming,
    single-token request tagged with ``kv_transfer_params`` for remote
    decode, then POSTed to ``endpoint`` through ``client_info["client"]``.
    The caller's ``req_data`` is left untouched.

    Args:
        client_info: client-info dict holding the httpx client under "client".
        dp_rank: data-parallel rank, sent as the ``X-data-parallel-rank`` header.
        endpoint: request path to POST to.
        req_data: the original request body.
        request_id: id used for ``X-Request-Id`` and the transfer id.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response; the
        response is closed in either case.
    """
    req_data = req_data.copy()
    req_data["kv_transfer_params"] = {
        "do_remote_decode": True,
        "do_remote_prefill": False,
        "transfer_id": f"xfer-{request_id}",
    }
    req_data["stream"] = False
    req_data["max_tokens"] = 1
    # MB5 fix: clients (our replayer) may set min_tokens to enforce a fixed
    # output length. After the proxy caps max_tokens=1 on the prefill leg,
    # any min_tokens > 1 violates vLLM's `min_tokens <= max_tokens` check.
    if "min_tokens" in req_data:
        req_data["min_tokens"] = 1
    if "max_completion_tokens" in req_data:
        req_data["max_completion_tokens"] = 1
    req_data.pop("stream_options", None)

    headers = {
        "X-Request-Id": request_id,
        "X-data-parallel-rank": str(dp_rank),
    }
    # Only send credentials when a key is configured; an unset variable would
    # otherwise produce the literal header value "Bearer None".
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    response = await client_info["client"].post(
        endpoint, json=req_data, headers=headers
    )
    try:
        response.raise_for_status()
    finally:
        # CRITICAL: Release connection back to pool, even on an error status
        await response.aclose()
async def stream_service_response(
    prefill_client_info: dict,
    prefill_dp_rank: int,
    decode_client_info: dict,
    endpoint: str,
    req_data: dict,
    request_id: str,
):
    """
    Stream the decode leg of a request and yield the raw response bytes.

    A shallow copy of ``req_data`` gets ``kv_transfer_params`` pointing at
    the chosen prefiller (bootstrap address and the engine id of
    ``prefill_dp_rank``) and is POSTed to ``endpoint`` on the decode client.
    The caller's ``req_data`` is not modified.

    Yields:
        Byte chunks exactly as produced by the decode service.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response.
    """
    headers = {"X-Request-Id": request_id}
    # Only send credentials when a key is configured; an unset variable would
    # otherwise produce the literal header value "Bearer None".
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # Work on a copy so the dict shared with the prefill leg stays untouched.
    req_data = req_data.copy()
    req_data["kv_transfer_params"] = {
        "do_remote_decode": False,
        "do_remote_prefill": True,
        "remote_bootstrap_addr": prefill_client_info["bootstrap_addr"],
        "remote_engine_id": prefill_client_info["dp_engine_id"][prefill_dp_rank],
        "transfer_id": f"xfer-{request_id}",
    }

    async with decode_client_info["client"].stream(
        "POST", endpoint, json=req_data, headers=headers
    ) as response:
        response.raise_for_status()
        async for chunk in response.aiter_bytes():
            yield chunk
async def stream_colo_response(
    colo_client_info: dict, endpoint: str, req_data: dict, headers: dict
):
    """Relay one request to a colocated (kv_both) instance, chunk by chunk.

    ``req_data`` and ``headers`` are handed to the backend as received, so
    fields such as stream/min_tokens/stream_options reach it unmodified and
    the replayer's streaming + usage parsing sees the same bytes it would
    get from the instance directly.

    Yields the raw response bytes; an error status raises via
    ``raise_for_status()`` before anything is yielded.
    """
    backend = colo_client_info["client"]
    async with backend.stream(
        "POST", endpoint, json=req_data, headers=headers
    ) as upstream:
        upstream.raise_for_status()
        async for piece in upstream.aiter_bytes():
            yield piece
async def _handle_colo(api: str, request: Request):
    """Serve one request in colo passthrough mode.

    Forwards the JSON body unchanged to one colocated backend and streams
    the answer back as ``text/event-stream``. ``X-Request-Id`` is taken from
    the incoming request or generated; ``X-Session-Id`` is forwarded when
    present; ``Authorization`` is added only if ``OPENAI_API_KEY`` is set.

    Backend choice: when ``MB5_COLO_ROUTING`` is "session" and the request
    carries ``X-Session-Id``, a stable hash of the session id selects the
    backend; otherwise the round-robin iterator does.

    Raises:
        HTTPException: 503 while the proxy is not ready.
    """
    if not app.state.ready.is_set():
        raise HTTPException(status_code=503, detail="Service Unavailable")

    req_data = await request.json()
    request_id = request.headers.get("X-Request-Id") or str(uuid.uuid4())
    session_id = request.headers.get("X-Session-Id")

    headers = {"X-Request-Id": request_id}
    if session_id:
        headers["X-Session-Id"] = session_id
    key = os.environ.get("OPENAI_API_KEY")
    if key:
        headers["Authorization"] = f"Bearer {key}"

    colo_clients = app.state.colo_clients
    if MB5_COLO_ROUTING == "session" and session_id:
        # consistent hash -> same kv_both instance reuses its prefix cache.
        # md5 is only a bucketing hash here; flagging it as non-security
        # keeps it usable on FIPS-restricted Python builds.
        digest = hashlib.md5(session_id.encode(), usedforsecurity=False).hexdigest()
        idx = int(digest[:8], 16) % len(colo_clients)
    else:
        idx = next(app.state.colo_iterator)

    return StreamingResponse(
        stream_colo_response(colo_clients[idx], api, req_data, headers),
        media_type="text/event-stream",
    )
# Strong references to in-flight prefill tasks. The event loop keeps only weak
# references to tasks, so an unreferenced task may be garbage-collected before
# it finishes.
_BACKGROUND_TASKS: set = set()


def _on_background_task_done(task) -> None:
    """Drop a finished prefill task and report its failure, if any."""
    _BACKGROUND_TASKS.discard(task)
    if task.cancelled():
        return
    exc = task.exception()
    if exc is not None:
        print(f"Prefill request failed in disagg prefill proxy server: {exc!r}")


async def _handle_completions(api: str, request: Request):
    """Serve one completion request through the prefill/decode split.

    In colo mode the request is delegated to ``_handle_colo``. Otherwise a
    prefill client is chosen (session hash when ``MB5_P_ROUTING`` is
    "session", round-robin otherwise), the prefill leg is started as a
    background task, and the decode leg is streamed back to the caller.
    A fresh request id is generated for every request.

    Raises:
        HTTPException: 503 while the prefillers have not been discovered yet.
        Exception: anything raised while preparing the request is logged
            with its traceback and re-raised.
    """
    if getattr(global_args, "colo", None):
        return await _handle_colo(api, request)
    if not app.state.ready.is_set():
        raise HTTPException(status_code=503, detail="Service Unavailable")

    try:
        req_data = await request.json()
        request_id = str(uuid.uuid4())

        # Select the prefill (producer) client.
        if MB5_P_ROUTING == "session":
            session_id = request.headers.get("X-Session-Id") or request_id
            prefill_client_info, prefill_dp_rank = get_prefill_by_session(
                request.app, session_id
            )
        else:
            # Round-robin (official upstream behavior).
            prefill_client_info, prefill_dp_rank = get_next_client(
                request.app, "prefill"
            )

        # Send request to prefill service. The task is tracked so it cannot
        # be collected mid-flight and so its failure is reported.
        prefill_task = asyncio.create_task(
            send_request_to_service(
                prefill_client_info, prefill_dp_rank, api, req_data, request_id
            )
        )
        _BACKGROUND_TASKS.add(prefill_task)
        prefill_task.add_done_callback(_on_background_task_done)

        decode_client_info = get_next_client(request.app, "decode")

        # Stream response from decode service
        async def generate_stream():
            async for chunk in stream_service_response(
                prefill_client_info,
                prefill_dp_rank,
                decode_client_info,
                api,
                req_data,
                request_id=request_id,
            ):
                yield chunk

        return StreamingResponse(generate_stream(), media_type="application/json")
    except Exception as e:
        import traceback

        print(f"Error occurred in disagg prefill proxy server - {api} endpoint")
        print(e)
        print(traceback.format_exc())
        raise
@app.post("/v1/completions")
async def handle_completions(request: Request):
    """POST /v1/completions: hand the request to the shared handler."""
    response = await _handle_completions("/v1/completions", request)
    return response
@app.post("/v1/chat/completions")
async def handle_chat_completions(request: Request):
    """POST /v1/chat/completions: hand the request to the shared handler."""
    response = await _handle_completions("/v1/chat/completions", request)
    return response
if __name__ == "__main__":
    # Script entry point. `global_args` is bound at module scope because
    # `lifespan` and the request handlers read it as a module-level name.
    global_args = parse_args()

    import uvicorn

    uvicorn.run(app, port=global_args.port, host=global_args.host)