Fix review bugs: PD-sep counter leaks, hardcoded paths, missing deps
Critical:
- cache_aware_proxy: _handle_pd_sep leaked p_inst.num_requests (never decremented) and never managed d_inst.num_requests; fix media_type from application/json to text/event-stream for SSE stream

High:
- b3_sweep/b3_isolated_policy/b3_analyze: replace hardcoded /home/admin/cpfs/wjh/ ROOT with script-relative $(dirname "$0")/..
- b3_analyze: replace hardcoded 8-port WORKER_MAP with dynamic generation from BASE_PORT and N_INSTANCES

Medium:
- analyze_breakdown: warn on stderr when records are skipped (was silent)
- deploy_vllm_patches: fail-fast on SSH/SCP errors instead of continuing with empty VENV_SITE
- pyproject.toml: declare fastapi and uvicorn as runtime dependencies
- launch_elastic_p2p: kill EngineCore and proxy in trap handler to prevent GPU memory leaks on exit
This commit is contained in:
@@ -6,6 +6,8 @@ requires-python = ">=3.10"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"httpx>=0.27",
|
"httpx>=0.27",
|
||||||
"numpy>=1.24",
|
"numpy>=1.24",
|
||||||
|
"fastapi>=0.110",
|
||||||
|
"uvicorn>=0.29",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -12,9 +12,11 @@ else:
|
|||||||
print("Total records: %d" % len(data))
|
print("Total records: %d" % len(data))
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
skipped = 0
|
||||||
for d in data:
|
for d in data:
|
||||||
keys = ["t_proxy_recv", "t_prefill_sent", "t_prefill_done", "t_decode_sent", "t_first_token"]
|
keys = ["t_proxy_recv", "t_prefill_sent", "t_prefill_done", "t_decode_sent", "t_first_token"]
|
||||||
if not all(k in d for k in keys):
|
if not all(k in d for k in keys):
|
||||||
|
skipped += 1
|
||||||
continue
|
continue
|
||||||
results.append({
|
results.append({
|
||||||
"input": d["input_length"],
|
"input": d["input_length"],
|
||||||
@@ -26,6 +28,9 @@ for d in data:
|
|||||||
|
|
||||||
results.sort(key=lambda x: x["input"])
|
results.sort(key=lambda x: x["input"])
|
||||||
print("Complete breakdown: %d" % len(results))
|
print("Complete breakdown: %d" % len(results))
|
||||||
|
if skipped:
|
||||||
|
print("WARNING: %d records skipped (missing breakdown timestamps)" % skipped,
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
if not results:
|
if not results:
|
||||||
print("No complete records yet")
|
print("No complete records yet")
|
||||||
|
|||||||
@@ -9,11 +9,19 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"
|
ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
|
||||||
VENV="$ROOT/.venv/bin"
|
VENV="$ROOT/.venv/bin"
|
||||||
SWEEP_DIR="${1:?usage: $0 <sweep_dir>}"
|
SWEEP_DIR="${1:?usage: $0 <sweep_dir>}"
|
||||||
|
|
||||||
WORKER_MAP="http://127.0.0.1:8000=engine_0,http://127.0.0.1:8001=engine_1,http://127.0.0.1:8002=engine_2,http://127.0.0.1:8003=engine_3,http://127.0.0.1:8004=engine_4,http://127.0.0.1:8005=engine_5,http://127.0.0.1:8006=engine_6,http://127.0.0.1:8007=engine_7"
|
BASE_PORT="${BASE_PORT:-8000}"
|
||||||
|
N_INSTANCES="${N_INSTANCES:-8}"
|
||||||
|
|
||||||
|
# Build WORKER_MAP dynamically from BASE_PORT and N_INSTANCES.
|
||||||
|
_worker_map_parts=()
|
||||||
|
for ((i=0; i<N_INSTANCES; i++)); do
|
||||||
|
_worker_map_parts+=("http://127.0.0.1:$((BASE_PORT + i))=engine_$i")
|
||||||
|
done
|
||||||
|
WORKER_MAP=$(IFS=,; echo "${_worker_map_parts[*]}")
|
||||||
|
|
||||||
_has_engine_data() {
|
_has_engine_data() {
|
||||||
# Return 0 (true) if $1/*.jsonl contains any non-empty file.
|
# Return 0 (true) if $1/*.jsonl contains any non-empty file.
|
||||||
|
|||||||
@@ -11,9 +11,9 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"
|
ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
|
||||||
VENV="$ROOT/.venv/bin"
|
VENV="$ROOT/.venv/bin"
|
||||||
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
|
MODEL="${MODEL:-$ROOT/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
|
||||||
PROXY_PORT="${PROXY_PORT:-9300}"
|
PROXY_PORT="${PROXY_PORT:-9300}"
|
||||||
BASE_PORT="${BASE_PORT:-8000}"
|
BASE_PORT="${BASE_PORT:-8000}"
|
||||||
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
|
GPU_INDICES="${GPU_INDICES:-0 1 2 3 4 5 6 7}"
|
||||||
|
|||||||
@@ -14,9 +14,9 @@
|
|||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
ROOT="${ROOT:-/home/admin/cpfs/wjh/agentic-kv}"
|
ROOT="${ROOT:-$(cd "$(dirname "$0")/.." && pwd)}"
|
||||||
VENV="$ROOT/.venv/bin"
|
VENV="$ROOT/.venv/bin"
|
||||||
MODEL="${MODEL:-/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
|
MODEL="${MODEL:-$ROOT/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
|
||||||
TRACE="${TRACE:-$ROOT/traces/w600_r0.0015_st30.jsonl}"
|
TRACE="${TRACE:-$ROOT/traces/w600_r0.0015_st30.jsonl}"
|
||||||
OUTDIR="${OUTDIR:-$ROOT/outputs/b3_sweep_$(date +%Y%m%d_%H%M%S)}"
|
OUTDIR="${OUTDIR:-$ROOT/outputs/b3_sweep_$(date +%Y%m%d_%H%M%S)}"
|
||||||
PROXY_PORT="${PROXY_PORT:-9300}"
|
PROXY_PORT="${PROXY_PORT:-9300}"
|
||||||
|
|||||||
@@ -1176,6 +1176,7 @@ async def _handle_pd_sep(api, req_data, request_id, token_ids, input_length,
|
|||||||
p_headers = {**headers, "X-data-parallel-rank": "0"}
|
p_headers = {**headers, "X-data-parallel-rank": "0"}
|
||||||
|
|
||||||
p_inst.ongoing_tokens += input_length
|
p_inst.ongoing_tokens += input_length
|
||||||
|
p_inst.num_requests += 1
|
||||||
breakdown["t_prefill_sent"] = _time.monotonic()
|
breakdown["t_prefill_sent"] = _time.monotonic()
|
||||||
breakdown["t_prefill_sent_unix"] = _time.time()
|
breakdown["t_prefill_sent_unix"] = _time.time()
|
||||||
|
|
||||||
@@ -1194,9 +1195,11 @@ async def _handle_pd_sep(api, req_data, request_id, token_ids, input_length,
|
|||||||
raise HTTPException(status_code=502, detail=f"Prefill failed: {e}")
|
raise HTTPException(status_code=502, detail=f"Prefill failed: {e}")
|
||||||
finally:
|
finally:
|
||||||
p_inst.ongoing_tokens -= input_length
|
p_inst.ongoing_tokens -= input_length
|
||||||
|
p_inst.num_requests -= 1
|
||||||
|
|
||||||
# Send decode
|
# Send decode
|
||||||
d_inst.ongoing_tokens += input_length
|
d_inst.ongoing_tokens += input_length
|
||||||
|
d_inst.num_requests += 1
|
||||||
parsed = urllib.parse.urlparse(str(p_inst.client.base_url))
|
parsed = urllib.parse.urlparse(str(p_inst.client.base_url))
|
||||||
bootstrap_addr = f"http://{parsed.hostname}:{p_inst.bootstrap_port}"
|
bootstrap_addr = f"http://{parsed.hostname}:{p_inst.bootstrap_port}"
|
||||||
|
|
||||||
@@ -1232,9 +1235,10 @@ async def _handle_pd_sep(api, req_data, request_id, token_ids, input_length,
|
|||||||
breakdown["t_done"] = _time.monotonic()
|
breakdown["t_done"] = _time.monotonic()
|
||||||
breakdown["t_done_unix"] = _time.time()
|
breakdown["t_done_unix"] = _time.time()
|
||||||
d_inst.ongoing_tokens -= input_length
|
d_inst.ongoing_tokens -= input_length
|
||||||
|
d_inst.num_requests -= 1
|
||||||
_breakdown_log.append(breakdown)
|
_breakdown_log.append(breakdown)
|
||||||
|
|
||||||
return StreamingResponse(generate(), media_type="application/json")
|
return StreamingResponse(generate(), media_type="text/event-stream")
|
||||||
|
|
||||||
|
|
||||||
@app.get("/breakdown")
|
@app.get("/breakdown")
|
||||||
|
|||||||
@@ -29,11 +29,18 @@ if [ "$HOST" = "local" ]; then
|
|||||||
done
|
done
|
||||||
else
|
else
|
||||||
# Find site-packages on remote
|
# Find site-packages on remote
|
||||||
VENV_SITE=$(ssh "$HOST" "~/agentic-kv/.venv/bin/python -c \"import site; print(site.getsitepackages()[0])\"")
|
VENV_SITE=$(ssh "$HOST" "~/agentic-kv/.venv/bin/python -c \"import site; print(site.getsitepackages()[0])\"") || {
|
||||||
|
echo "ERROR: failed to resolve site-packages on $HOST" >&2; exit 1;
|
||||||
|
}
|
||||||
|
if [ -z "$VENV_SITE" ]; then
|
||||||
|
echo "ERROR: empty site-packages path from $HOST" >&2; exit 1;
|
||||||
|
fi
|
||||||
DST="$VENV_SITE/vllm"
|
DST="$VENV_SITE/vllm"
|
||||||
echo "Deploying to $HOST:$DST"
|
echo "Deploying to $HOST:$DST"
|
||||||
for f in "${PATCHED_FILES[@]}"; do
|
for f in "${PATCHED_FILES[@]}"; do
|
||||||
scp "$VLLM_SRC/$f" "$HOST:$DST/$f"
|
scp "$VLLM_SRC/$f" "$HOST:$DST/$f" || {
|
||||||
|
echo "ERROR: failed to copy $f to $HOST" >&2; exit 1;
|
||||||
|
}
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ PROXY_PORT=9090
|
|||||||
HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}"
|
HEAVY_THRESHOLD="${HEAVY_THRESHOLD:-20000}"
|
||||||
MAX_OFFLOAD="${MAX_OFFLOAD:-4}"
|
MAX_OFFLOAD="${MAX_OFFLOAD:-4}"
|
||||||
|
|
||||||
trap 'echo "Cleaning up..."; kill $(jobs -p) 2>/dev/null; wait 2>/dev/null' EXIT INT TERM
|
trap 'echo "Cleaning up..."; kill $(jobs -p) 2>/dev/null; pkill -9 -f "vllm serve" 2>/dev/null; pkill -9 -f "EngineCore" 2>/dev/null; pkill -9 -f cache_aware_proxy 2>/dev/null; wait 2>/dev/null' EXIT INT TERM
|
||||||
|
|
||||||
echo "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ==="
|
echo "=== Elastic P2P Offload (${N_INSTANCES}× TP=1 kv_both) ==="
|
||||||
echo " Model: $MODEL"
|
echo " Model: $MODEL"
|
||||||
|
|||||||
Reference in New Issue
Block a user