MB5 driver updates: PD-proxy + snapshot instrument + launcher tweaks
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -194,24 +194,35 @@ MOONCAKE_PATCHES = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
# ---------- Patch 4: vLLM 0.18.1 PD-consumer metrics counter underflow ------
|
# ---------- Patch 4: vLLM 0.18.1 PD-consumer metrics counter underflow ------
|
||||||
# In PromptTokenStats.update_from_output, local_cache_hit is computed as
|
# On a kv_consumer, a KV-load failure (Mooncake transfer returns -1 when the
|
||||||
# (num_cached_tokens + recomputed - num_external_computed_tokens). On a
|
# D-pool is full) makes vLLM emit an iteration-stats "correction" with NEGATIVE
|
||||||
# kv_consumer, a remote KV transfer can report more external-computed tokens
|
# token deltas: PromptTokenStats fields (local_cache_hit, cached_tokens, ...)
|
||||||
# than the scheduler's cached count (esp. on a KV-load failure for a large
|
# AND iteration_stats.{num_prompt_tokens, num_generation_tokens} all go below
|
||||||
# request), driving local_cache_hit negative. loggers.record() then calls
|
# zero. Every Counter.inc() in loggers.record() then trips prometheus_client's
|
||||||
# Counter.inc() with that negative value and prometheus_client raises
|
|
||||||
# "Counters can only be incremented by non-negative amounts.", which kills the
|
# "Counters can only be incremented by non-negative amounts.", which kills the
|
||||||
# EngineCore — turning one failed request into a total config collapse.
|
# EngineCore — turning one failed request into a total config collapse.
|
||||||
# We clamp the per-source counts to >= 0 right before they are recorded.
|
#
|
||||||
LOGGERS_ANCHOR = " pts = iteration_stats.prompt_token_stats\n"
|
# Clamp every field that feeds a Counter.inc() in record() to >= 0. We anchor
|
||||||
|
# right after the `if iteration_stats is None: return` guard so the clamp runs
|
||||||
|
# before the first inc() (the corrupted-requests / preempted / prompt-token
|
||||||
|
# counters at the top of the method).
|
||||||
|
LOGGERS_ANCHOR = " if iteration_stats is None:\n return\n"
|
||||||
LOGGERS_INSERT = (
|
LOGGERS_INSERT = (
|
||||||
f" {START_MARK}\n"
|
f" {START_MARK}\n"
|
||||||
f" if pts.local_cache_hit < 0:\n"
|
f" _mb5_pts = iteration_stats.prompt_token_stats\n"
|
||||||
f" pts.local_cache_hit = 0\n"
|
f" for _mb5_o, _mb5_a in (\n"
|
||||||
f" if pts.computed < 0:\n"
|
f" (iteration_stats, 'num_prompt_tokens'),\n"
|
||||||
f" pts.computed = 0\n"
|
f" (iteration_stats, 'num_generation_tokens'),\n"
|
||||||
f" if pts.external_kv_transfer < 0:\n"
|
f" (iteration_stats, 'num_preempted_reqs'),\n"
|
||||||
f" pts.external_kv_transfer = 0\n"
|
f" (iteration_stats, 'num_corrupted_reqs'),\n"
|
||||||
|
f" (_mb5_pts, 'computed'),\n"
|
||||||
|
f" (_mb5_pts, 'local_cache_hit'),\n"
|
||||||
|
f" (_mb5_pts, 'external_kv_transfer'),\n"
|
||||||
|
f" (_mb5_pts, 'cached_tokens'),\n"
|
||||||
|
f" (_mb5_pts, 'recomputed_tokens'),\n"
|
||||||
|
f" ):\n"
|
||||||
|
f" if getattr(_mb5_o, _mb5_a, 0) < 0:\n"
|
||||||
|
f" setattr(_mb5_o, _mb5_a, 0)\n"
|
||||||
f" {END_MARK}\n"
|
f" {END_MARK}\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,9 @@
|
|||||||
set -eo pipefail
|
set -eo pipefail
|
||||||
|
|
||||||
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
|
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
|
||||||
VENV="${FRESH_ROOT}/.venv"
|
# MB5_VENV lets a second host use an isolated venv clone (e.g. .venv_dash0) so
|
||||||
|
# two boxes can run in parallel without racing on the shared cpfs venv patch.
|
||||||
|
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
|
||||||
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
|
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
INSTRUMENT="${SCRIPT_DIR}/instrument_kv_snapshot.py"
|
INSTRUMENT="${SCRIPT_DIR}/instrument_kv_snapshot.py"
|
||||||
@@ -86,14 +88,16 @@ case "${1:-start}" in
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
# --- parse CONFIG into (prefill_gpus, decode_gpus) ----------------
|
# --- parse CONFIG into (prefill_gpus, decode_gpus) ----------------
|
||||||
|
USE_COLO_PROXY=0
|
||||||
case "${CONFIG}" in
|
case "${CONFIG}" in
|
||||||
8C) ROLES="combined"; P_GPUS=""; D_GPUS=""; COMBINED_GPUS="0,1,2,3,4,5,6,7" ;;
|
8C) ROLES="combined"; P_GPUS=""; D_GPUS=""; COMBINED_GPUS="0,1,2,3,4,5,6,7" ;;
|
||||||
|
8C-proxy) ROLES="combined"; USE_COLO_PROXY=1; P_GPUS=""; D_GPUS=""; COMBINED_GPUS="0,1,2,3,4,5,6,7" ;;
|
||||||
6P+2D) ROLES="pd"; P_GPUS="0,1,2,3,4,5"; D_GPUS="6,7" ;;
|
6P+2D) ROLES="pd"; P_GPUS="0,1,2,3,4,5"; D_GPUS="6,7" ;;
|
||||||
5P+3D) ROLES="pd"; P_GPUS="0,1,2,3,4"; D_GPUS="5,6,7" ;;
|
5P+3D) ROLES="pd"; P_GPUS="0,1,2,3,4"; D_GPUS="5,6,7" ;;
|
||||||
4P+4D) ROLES="pd"; P_GPUS="0,1,2,3"; D_GPUS="4,5,6,7" ;;
|
4P+4D) ROLES="pd"; P_GPUS="0,1,2,3"; D_GPUS="4,5,6,7" ;;
|
||||||
3P+5D) ROLES="pd"; P_GPUS="0,1,2"; D_GPUS="3,4,5,6,7" ;;
|
3P+5D) ROLES="pd"; P_GPUS="0,1,2"; D_GPUS="3,4,5,6,7" ;;
|
||||||
2P+6D) ROLES="pd"; P_GPUS="0,1"; D_GPUS="2,3,4,5,6,7" ;;
|
2P+6D) ROLES="pd"; P_GPUS="0,1"; D_GPUS="2,3,4,5,6,7" ;;
|
||||||
*) echo "Unknown CONFIG=${CONFIG} (expected: 8C, 6P+2D, 5P+3D, 4P+4D, 3P+5D, 2P+6D)"; exit 1;;
|
*) echo "Unknown CONFIG=${CONFIG} (expected: 8C, 8C-proxy, 6P+2D, 5P+3D, 4P+4D, 3P+5D, 2P+6D)"; exit 1;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
stop_all
|
stop_all
|
||||||
@@ -137,6 +141,7 @@ launch_vllm() {
|
|||||||
|
|
||||||
idx=0
|
idx=0
|
||||||
proxy_args=()
|
proxy_args=()
|
||||||
|
colo_args=()
|
||||||
ENDPOINTS=""
|
ENDPOINTS=""
|
||||||
|
|
||||||
case "${ROLES}" in
|
case "${ROLES}" in
|
||||||
@@ -147,6 +152,7 @@ case "${ROLES}" in
|
|||||||
bp=$((BASE_BP+idx))
|
bp=$((BASE_BP+idx))
|
||||||
launch_vllm "${idx}" "${gpu}" "${port}" "kv_both" "${bp}"
|
launch_vllm "${idx}" "${gpu}" "${port}" "kv_both" "${bp}"
|
||||||
ENDPOINTS+="${ENDPOINTS:+,}http://127.0.0.1:${port}"
|
ENDPOINTS+="${ENDPOINTS:+,}http://127.0.0.1:${port}"
|
||||||
|
colo_args+=( --colo "http://127.0.0.1:${port}" )
|
||||||
idx=$((idx+1))
|
idx=$((idx+1))
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
@@ -215,6 +221,25 @@ if [ "${ROLES}" = "pd" ]; then
|
|||||||
ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
|
ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "${USE_COLO_PROXY}" = "1" ]; then
|
||||||
|
echo "[mb5] launching colo passthrough proxy on ${PROXY_PORT} (8 kv_both instances)"
|
||||||
|
nohup python "${PROXY_SRC}" "${colo_args[@]}" --port "${PROXY_PORT}" --host 0.0.0.0 \
|
||||||
|
> "${LOGS_DIR}/proxy.log" 2>&1 &
|
||||||
|
disown
|
||||||
|
tries=0
|
||||||
|
while ! curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:${PROXY_PORT}/" 2>/dev/null | grep -qE "^[0-9]"; do
|
||||||
|
tries=$((tries+1))
|
||||||
|
if [ ${tries} -gt 60 ]; then
|
||||||
|
echo "[mb5] FATAL colo proxy did not come up in 2 min"
|
||||||
|
tail -40 "${LOGS_DIR}/proxy.log" || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
echo " colo proxy port=${PROXY_PORT} ready (HTTP responding)"
|
||||||
|
ENDPOINTS="http://127.0.0.1:${PROXY_PORT}"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "[mb5] CONFIG=${CONFIG} RUN_LABEL=${RUN_LABEL} UP"
|
echo "[mb5] CONFIG=${CONFIG} RUN_LABEL=${RUN_LABEL} UP"
|
||||||
echo "ENDPOINTS=${ENDPOINTS}"
|
echo "ENDPOINTS=${ENDPOINTS}"
|
||||||
echo "RUN_ROOT=${RUN_ROOT}"
|
echo "RUN_ROOT=${RUN_ROOT}"
|
||||||
|
|||||||
@@ -72,8 +72,32 @@ async def lifespan(app: FastAPI):
|
|||||||
# Startup: Initialize client pools for prefiller and decoder services
|
# Startup: Initialize client pools for prefiller and decoder services
|
||||||
app.state.prefill_clients = []
|
app.state.prefill_clients = []
|
||||||
app.state.decode_clients = []
|
app.state.decode_clients = []
|
||||||
|
app.state.colo_clients = []
|
||||||
app.state.ready = asyncio.Event()
|
app.state.ready = asyncio.Event()
|
||||||
|
|
||||||
|
# Colo (PD-combined) passthrough mode: no bootstrap handshake needed.
|
||||||
|
if global_args.colo:
|
||||||
|
for url in global_args.colo:
|
||||||
|
app.state.colo_clients.append({
|
||||||
|
"client": httpx.AsyncClient(
|
||||||
|
timeout=None,
|
||||||
|
base_url=url,
|
||||||
|
trust_env=False, # ignore http_proxy env: backends are localhost
|
||||||
|
limits=httpx.Limits(
|
||||||
|
max_connections=None,
|
||||||
|
max_keepalive_connections=None,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
"url": url,
|
||||||
|
})
|
||||||
|
app.state.colo_iterator = itertools.cycle(range(len(app.state.colo_clients)))
|
||||||
|
app.state.ready.set()
|
||||||
|
print(f"Colo passthrough mode: {len(app.state.colo_clients)} kv_both clients.")
|
||||||
|
yield
|
||||||
|
for client_info in app.state.colo_clients:
|
||||||
|
await client_info["client"].aclose()
|
||||||
|
return
|
||||||
|
|
||||||
# Create prefill clients
|
# Create prefill clients
|
||||||
for i, (url, bootstrap_port) in enumerate(global_args.prefill):
|
for i, (url, bootstrap_port) in enumerate(global_args.prefill):
|
||||||
parsed_url = urllib.parse.urlparse(url)
|
parsed_url = urllib.parse.urlparse(url)
|
||||||
@@ -169,9 +193,25 @@ def parse_args():
|
|||||||
help="Decode server URL. Can be specified multiple times.",
|
help="Decode server URL. Can be specified multiple times.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# MB5: colocated (PD-combined) instances. When given, the proxy runs in
|
||||||
|
# "colo" mode — it round-robins /v1/completions to these kv_both instances
|
||||||
|
# with a plain streaming passthrough (no P->D split, no kv_transfer_params).
|
||||||
|
# This exists so the 8C baseline pays the SAME proxy hop as PD configs,
|
||||||
|
# removing the "8C bypasses the proxy" confound from the comparison.
|
||||||
|
parser.add_argument(
|
||||||
|
"--colo",
|
||||||
|
nargs=1,
|
||||||
|
action="append",
|
||||||
|
dest="colo_raw",
|
||||||
|
metavar=("URL",),
|
||||||
|
help="Colocated (kv_both) server URL. Can be specified multiple times. "
|
||||||
|
"Enables colo passthrough mode.",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
args.prefill = _parse_prefill_urls(args.prefill_raw)
|
args.prefill = _parse_prefill_urls(args.prefill_raw)
|
||||||
args.decode = _parse_decode_urls(args.decode_raw)
|
args.decode = _parse_decode_urls(args.decode_raw)
|
||||||
|
args.colo = [u[0] for u in args.colo_raw] if args.colo_raw else []
|
||||||
|
|
||||||
return args
|
return args
|
||||||
|
|
||||||
@@ -235,6 +275,14 @@ def _parse_decode_urls(decode_list):
|
|||||||
# Decode side stays round-robin (load balance) regardless.
|
# Decode side stays round-robin (load balance) regardless.
|
||||||
MB5_P_ROUTING = os.environ.get("MB5_P_ROUTING", "rr").lower()
|
MB5_P_ROUTING = os.environ.get("MB5_P_ROUTING", "rr").lower()
|
||||||
|
|
||||||
|
# MB5: routing mode for the COLO (kv_both) passthrough proxy.
|
||||||
|
# "rr" — round-robin (loses session-local prefix cache)
|
||||||
|
# "session" — consistent hash on X-Session-Id, so all turns of a session land
|
||||||
|
# on the same kv_both instance and reuse its prefix cache. This is
|
||||||
|
# the cache-aware colo baseline (the fair strong baseline for the
|
||||||
|
# agentic reuse regime — D4).
|
||||||
|
MB5_COLO_ROUTING = os.environ.get("MB5_COLO_ROUTING", "rr").lower()
|
||||||
|
|
||||||
|
|
||||||
def get_prefill_by_session(app, session_id: str):
|
def get_prefill_by_session(app, session_id: str):
|
||||||
"""Pick a (prefill_client, dp_rank) deterministically from session_id.
|
"""Pick a (prefill_client, dp_rank) deterministically from session_id.
|
||||||
@@ -340,7 +388,58 @@ async def stream_service_response(
|
|||||||
yield chunk
|
yield chunk
|
||||||
|
|
||||||
|
|
||||||
|
async def stream_colo_response(
|
||||||
|
colo_client_info: dict, endpoint: str, req_data: dict, headers: dict
|
||||||
|
):
|
||||||
|
"""Plain streaming passthrough to one colocated (kv_both) instance.
|
||||||
|
|
||||||
|
The request body is forwarded unchanged (stream/min_tokens/stream_options
|
||||||
|
all preserved) so the replayer's streaming + usage parsing works exactly
|
||||||
|
as it does when it talks to a colo instance directly.
|
||||||
|
"""
|
||||||
|
async with colo_client_info["client"].stream(
|
||||||
|
"POST", endpoint, json=req_data, headers=headers
|
||||||
|
) as response:
|
||||||
|
response.raise_for_status()
|
||||||
|
async for chunk in response.aiter_bytes():
|
||||||
|
yield chunk
|
||||||
|
|
||||||
|
|
||||||
|
async def _handle_colo(api: str, request: Request):
|
||||||
|
if not app.state.ready.is_set():
|
||||||
|
raise HTTPException(status_code=503, detail="Service Unavailable")
|
||||||
|
|
||||||
|
req_data = await request.json()
|
||||||
|
request_id = request.headers.get("X-Request-Id") or str(uuid.uuid4())
|
||||||
|
headers = {"X-Request-Id": request_id}
|
||||||
|
session_id = request.headers.get("X-Session-Id")
|
||||||
|
if session_id:
|
||||||
|
headers["X-Session-Id"] = session_id
|
||||||
|
key = os.environ.get("OPENAI_API_KEY")
|
||||||
|
if key:
|
||||||
|
headers["Authorization"] = f"Bearer {key}"
|
||||||
|
|
||||||
|
if MB5_COLO_ROUTING == "session" and session_id:
|
||||||
|
# consistent hash -> same kv_both instance reuses its prefix cache
|
||||||
|
h = int(hashlib.md5(session_id.encode()).hexdigest()[:8], 16)
|
||||||
|
idx = h % len(app.state.colo_clients)
|
||||||
|
else:
|
||||||
|
idx = next(app.state.colo_iterator)
|
||||||
|
colo_client_info = app.state.colo_clients[idx]
|
||||||
|
|
||||||
|
async def generate_stream():
|
||||||
|
async for chunk in stream_colo_response(
|
||||||
|
colo_client_info, api, req_data, headers
|
||||||
|
):
|
||||||
|
yield chunk
|
||||||
|
|
||||||
|
return StreamingResponse(generate_stream(), media_type="text/event-stream")
|
||||||
|
|
||||||
|
|
||||||
async def _handle_completions(api: str, request: Request):
|
async def _handle_completions(api: str, request: Request):
|
||||||
|
if getattr(global_args, "colo", None):
|
||||||
|
return await _handle_colo(api, request)
|
||||||
|
|
||||||
if not app.state.ready.is_set():
|
if not app.state.ready.is_set():
|
||||||
raise HTTPException(status_code=503, detail="Service Unavailable")
|
raise HTTPException(status_code=503, detail="Service Unavailable")
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,8 @@
|
|||||||
set -eo pipefail
|
set -eo pipefail
|
||||||
|
|
||||||
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
|
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
|
||||||
VENV="${FRESH_ROOT}/.venv"
|
# MB5_VENV lets a second host use an isolated venv clone (see mb5_launch.sh).
|
||||||
|
VENV="${MB5_VENV:-${FRESH_ROOT}/.venv}"
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
|
LAUNCH="${SCRIPT_DIR}/mb5_launch.sh"
|
||||||
REPLAYER_DIR="${FRESH_ROOT}/replayer"
|
REPLAYER_DIR="${FRESH_ROOT}/replayer"
|
||||||
|
|||||||
Reference in New Issue
Block a user