From 645b067dd428e2813a20c66dc96fc2b9b4085912 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Tue, 26 May 2026 15:54:55 +0800 Subject: [PATCH] Fix review bugs: PD-sep counter leaks, hardcoded paths, missing deps Critical: - cache_aware_proxy: _handle_pd_sep leaked p_inst.num_requests (never decremented) and never managed d_inst.num_requests; fix media_type from application/json to text/event-stream for SSE stream High: - b3_sweep/b3_isolated_policy/b3_analyze: replace hardcoded /home/admin/cpfs/wjh/ ROOT with script-relative $(dirname "$0")/.. - b3_analyze: replace hardcoded 8-port WORKER_MAP with dynamic generation from BASE_PORT and N_INSTANCES Medium: - analyze_breakdown: warn on stderr when records are skipped (was silent) - deploy_vllm_patches: fail-fast on SSH/SCP errors instead of continuing with empty VENV_SITE - pyproject.toml: declare fastapi and uvicorn as runtime dependencies - launch_elastic_p2p: kill EngineCore and proxy in trap handler to prevent GPU memory leaks on exit --- pyproject.toml | 2 ++ scripts/analyze_breakdown.py | 5 +++++ scripts/b3_analyze.sh | 12 ++++++++++-- scripts/b3_isolated_policy.sh | 4 ++-- scripts/b3_sweep.sh | 4 ++-- scripts/cache_aware_proxy.py | 6 +++++- scripts/deploy_vllm_patches.sh | 11 +++++++++-- scripts/launch_elastic_p2p.sh | 2 +- 8 files changed, 36 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0f7c9ae..458463c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,8 @@ requires-python = ">=3.10" dependencies = [ "httpx>=0.27", "numpy>=1.24", + "fastapi>=0.110", + "uvicorn>=0.29", ] [project.optional-dependencies] diff --git a/scripts/analyze_breakdown.py b/scripts/analyze_breakdown.py index ee6f842..58ca84b 100644 --- a/scripts/analyze_breakdown.py +++ b/scripts/analyze_breakdown.py @@ -12,9 +12,11 @@ else: print("Total records: %d" % len(data)) results = [] +skipped = 0 for d in data: keys = ["t_proxy_recv", "t_prefill_sent", "t_prefill_done", "t_decode_sent", "t_first_token"] if not all(k in d for k in keys): + skipped += 1 continue results.append({ "input": d["input_length"], @@ -26,6 +28,9 @@ for d in data: results.sort(key=lambda x: x["input"]) print("Complete breakdown: %d" % len(results)) +if skipped: + print("WARNING: %d records skipped (missing breakdown timestamps)" % skipped, + file=sys.stderr) if not results: print("No complete records yet") diff --git a/scripts/b3_analyze.sh b/scripts/b3_analyze.sh index a136231..4ffdef2 100755 --- a/scripts/b3_analyze.sh +++ b/scripts/b3_analyze.sh @@ -9,11 +9,19 @@ set -euo pipefail -ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}" +ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}" VENV="$ROOT/.venv/bin" SWEEP_DIR="${1:?usage: $0 }" -WORKER_MAP="http://127.0.0.1:8000=engine_0,http://127.0.0.1:8001=engine_1,http://127.0.0.1:8002=engine_2,http://127.0.0.1:8003=engine_3,http://127.0.0.1:8004=engine_4,http://127.0.0.1:8005=engine_5,http://127.0.0.1:8006=engine_6,http://127.0.0.1:8007=engine_7" +BASE_PORT="${BASE_PORT:-8000}" +N_INSTANCES="${N_INSTANCES:-8}" + +# Build WORKER_MAP dynamically from BASE_PORT and N_INSTANCES. +_worker_map_parts=() +for ((i=0; i&2; exit 1; + } + if [ -z "$VENV_SITE" ]; then + echo "ERROR: empty site-packages path from $HOST" >&2; exit 1; + fi DST="$VENV_SITE/vllm" echo "Deploying to $HOST:$DST" for f in "${PATCHED_FILES[@]}"; do - scp "$VLLM_SRC/$f" "$HOST:$DST/$f" + scp "$VLLM_SRC/$f" "$HOST:$DST/$f" || { + echo "ERROR: failed to copy $f to $HOST" >&2; exit 1; + } done fi diff --git a/scripts/launch_elastic_p2p.sh b/scripts/launch_elastic_p2p.sh index 6a2e8e8..eac5cf3 100755 --- a/scripts/launch_elastic_p2p.sh +++ b/scripts/launch_elastic_p2p.sh @@ -27,7 +27,7 @@ PROXY_PORT=9090 HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}" MAX_OFFLOAD="${MAX_OFFLOAD:-4}" -trap 'echo "Cleaning up..."; kill $(jobs -p) 2>/dev/null; wait 2>/dev/null' EXIT INT TERM +trap 'echo "Cleaning up..."; kill $(jobs -p) 2>/dev/null; pkill -9 -f "vllm serve" 2>/dev/null; pkill -9 -f "EngineCore" 2>/dev/null; pkill -9 -f cache_aware_proxy 2>/dev/null; wait 2>/dev/null' EXIT INT TERM echo "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ===" echo " Model: $MODEL"