#!/bin/bash
# Exp (a) CPU-tier + PCIe only (miss/gpu already done). HMA fix applied.
#
# Steps:
#   1. Launch a vLLM OpenAI-compatible API server ("A2": overridden GPU block
#      count + native KV offloading, hybrid KV cache manager disabled).
#   2. Wait until the server reports healthy, then run the tier-latency driver
#      in --mode cpu.
#   3. Stop the server and run the standalone PCIe transfer benchmark.
#
# Environment:
#   GPU   value exported as CUDA_VISIBLE_DEVICES (default: 0)
#   PORT  TCP port the vLLM server listens on   (default: 8100)
#
# Outputs (under v2/exp_a_tier_latency/results):
#   vllm_a2.log  server stdout/stderr
#   cpu.json     driver results
#   pcie.json    PCIe benchmark results
#
# Exit status: 0 if every step succeeded, 1 otherwise.
#
# NOTE: -e is not enabled; benchmark failures are tracked explicitly in $rc so
# that the PCIe step still runs when the CPU-tier step fails.
set -uo pipefail

# Every path below is relative to the project root, so a failed cd must abort.
cd /home/admin/cpfs/wjh/agentic-kv || {
  echo "cannot cd to /home/admin/cpfs/wjh/agentic-kv" >&2
  exit 1
}

PY=.venv/bin/python
MODEL=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct
GPU=${GPU:-0}
PORT=${PORT:-8100}
EP=http://127.0.0.1:$PORT
OUT=v2/exp_a_tier_latency/results

mkdir -p "$OUT" || {
  echo "cannot create $OUT" >&2
  exit 1
}

# PID of the background vLLM server; empty whenever no server is running.
VLLM_PID=""

#######################################
# Stop the background vLLM server, if one is running.
# Sends SIGTERM, polls for up to 40 s, then escalates to SIGKILL so a hung
# server cannot outlive this script or overlap with the PCIe benchmark.
# Globals:   VLLM_PID (read, then reset to "")
# Arguments: none
# Returns:   0
#######################################
teardown() {
  # Nothing to stop; this is the normal case when the EXIT trap fires after
  # the explicit teardown call further down.
  [[ -n "$VLLM_PID" ]] || return 0

  local i
  kill -TERM "$VLLM_PID" 2>/dev/null
  for ((i = 0; i < 40; i++)); do
    # Signal 0 only probes whether the process still exists.
    kill -0 "$VLLM_PID" 2>/dev/null || break
    sleep 1
  done

  if kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "vLLM pid=$VLLM_PID still alive 40s after SIGTERM; sending SIGKILL" >&2
    kill -KILL "$VLLM_PID" 2>/dev/null
  fi

  # Short settle delay kept from the original script (presumably to let GPU
  # memory be released before the next step - TODO confirm).
  sleep 3
  VLLM_PID=""
  return 0
}
trap teardown EXIT

echo ">>> launch A2: small pool + CPU offload (HMA disabled)"
CUDA_VISIBLE_DEVICES="$GPU" VLLM_LOGGING_LEVEL=WARNING \
  "$PY" -m vllm.entrypoints.openai.api_server --model "$MODEL" \
  --host 0.0.0.0 --port "$PORT" --tensor-parallel-size 1 --trust-remote-code \
  --enable-prefix-caching --enforce-eager --dtype auto --max-model-len 200000 \
  --num-gpu-blocks-override 5000 --kv-offloading-size 40 --kv-offloading-backend native \
  --disable-hybrid-kv-cache-manager > "$OUT/vllm_a2.log" 2>&1 &
VLLM_PID=$!
echo " pid=$VLLM_PID waiting for health..."

# wait_healthy comes from v2/common/util.py; the script treats a truthy return
# as "server is up" and passes 900 as its second argument.
"$PY" -c "import sys; sys.path.insert(0,'v2'); from common.util import wait_healthy; sys.exit(0 if wait_healthy('$EP',900) else 1)" \
  || { echo "LAUNCH FAILED"; tail -25 "$OUT/vllm_a2.log"; exit 1; }
echo " healthy."

# Overall status; a failing step is recorded but does not skip later steps.
rc=0

"$PY" v2/exp_a_tier_latency/driver.py --endpoint "$EP" --model "$MODEL" --mode cpu --reps 4 \
  --flood-tokens 88000 --flood-chunk 16384 --out "$OUT/cpu.json" \
  || { echo "WARN: CPU-tier driver failed" >&2; rc=1; }

# Stop the server before the PCIe benchmark runs on the same GPU.
teardown

CUDA_VISIBLE_DEVICES="$GPU" "$PY" v2/exp_a_tier_latency/pcie_transfer.py --reps 20 --out "$OUT/pcie.json" \
  || { echo "WARN: PCIe transfer benchmark failed" >&2; rc=1; }

echo "=== exp (a) CPU+PCIe DONE ==="
exit "$rc"