agentic-kvc/microbench/fresh_setup/mb1_launch.sh

#!/usr/bin/env bash
# Launch a SINGLE vLLM instance on dash1 for MB1 (prefill-decode interference).
# No kv_connector — MB1 measures intra-GPU phase interference, not transfer.
# chunked_prefill is enabled by default in vLLM 0.18.1 (this is the regime
# we want to characterize: how much benefit can PD-disagg buy on top of
# the existing chunked-prefill colocated baseline?).
#
# Usage:
#   GPU=0 PORT=8000 CHUNK_TOKENS=8192 bash mb1_launch.sh start
#   bash mb1_launch.sh status
#   bash mb1_launch.sh stop

# Abort on the first failing command, and treat a pipeline as failed when
# any of its stages fails.
set -o errexit
set -o pipefail

# Fixed locations of the benchmark checkout and its virtualenv.
FRESH_ROOT="/home/admin/cpfs/wjh/agentic-kv-fresh"
VENV="${FRESH_ROOT}/.venv"

# Caller-tunable settings: a non-empty value already present in the
# environment is kept, otherwise the default below is assigned.
: "${MODEL:=/home/admin/cpfs/wjh/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
: "${LOGS_DIR:=${FRESH_ROOT}/mb1_logs}"
: "${GPU:=0}"        # exported to the server as CUDA_VISIBLE_DEVICES
: "${PORT:=8000}"    # HTTP port the server listens on
: "${MASTER:=29500}" # exported to the server as MASTER_PORT

# CHUNK_TOKENS is handed to vLLM as --max-num-batched-tokens, i.e. it sets
# the chunked-prefill chunk granularity. 8192 (the vLLM 0.18.1 default) is
# the headline configuration; 32768 can be used for an optional second run
# that exposes the chunk-size effect.
: "${CHUNK_TOKENS:=8192}"

# The server log is written under LOGS_DIR, so make sure it exists.
mkdir -p "${LOGS_DIR}"

#######################################
# Force-kill (SIGKILL) a previously launched server and related processes.
# Globals:   PORT (read)
# Arguments: none
# Returns:   exit status of the final `sleep`; a pkill that matches nothing
#            (or fails) is deliberately ignored.
#######################################
stop_local() {
    local pattern
    # Pattern 1 requires a space right after the port number, so a longer
    # port that merely starts with the same digits is not matched.
    # Pattern 2 is NOT port-scoped: it hits every process whose command line
    # contains "EngineCore" (presumably vLLM engine workers — verify before
    # running several instances on one host).
    for pattern in "vllm serve.*--port ${PORT} " "EngineCore"; do
        pkill -9 -f "${pattern}" 2>/dev/null || true
    done
    # Short pause before returning (presumably to let the killed processes
    # release the port and GPU — TODO confirm).
    sleep 2
}

# Sub-command dispatch. A missing (or empty) first argument means "start",
# which falls through to the launch code that follows this block; every
# other branch exits right here.
case "${1:-start}" in
    start)
        ;;
    stop)
        stop_local
        exit 0
        ;;
    status)
        # Probe the health endpoint over loopback. `-f` makes curl fail on
        # HTTP error responses, so only a successful reply counts as UP.
        health="DOWN"
        if curl -sf "http://127.0.0.1:${PORT}/health" >/dev/null 2>&1; then
            health="UP"
        fi
        echo "port ${PORT}: ${health}"
        # The exit status is 0 for both UP and DOWN; callers must read the
        # printed text to tell them apart.
        exit 0
        ;;
    *)
        echo "Unknown command: $1"
        exit 1
        ;;
esac

# ---- start path ------------------------------------------------------------
# Kill any earlier instance first, then enter the project virtualenv so that
# `vllm` is resolved from it.
stop_local
source "${VENV}/bin/activate"

# One log file per (GPU, chunk size) combination; truncated on every launch.
log_file="${LOGS_DIR}/vllm_gpu${GPU}_chunk${CHUNK_TOKENS}.log"
health_url="http://127.0.0.1:${PORT}/health"

echo "[mb1] launching: gpu=${GPU} port=${PORT} chunk_tokens=${CHUNK_TOKENS} (no kv_connector)"

# Start the server in the background, detached from this shell:
#   PYTHONHASHSEED        fixes Python's hash seed for the server process
#   CUDA_VISIBLE_DEVICES  restricts the server to the selected GPU
#   MASTER_PORT           exported from ${MASTER}
PYTHONHASHSEED=42 \
CUDA_VISIBLE_DEVICES="${GPU}" \
MASTER_PORT="${MASTER}" \
nohup vllm serve "${MODEL}" \
    --host 0.0.0.0 --port "${PORT}" \
    --tensor-parallel-size 1 \
    --trust-remote-code --enable-prefix-caching \
    --dtype auto --gpu-memory-utilization 0.9 \
    --max-model-len 200000 \
    --max-num-batched-tokens "${CHUNK_TOKENS}" \
    --enable-prompt-tokens-details \
    > "${log_file}" 2>&1 &
# Remember the server PID so the wait loop can notice an early crash.
vllm_pid=$!
disown

echo "[mb1] waiting for /health on port ${PORT}..."
tries=0
# --max-time bounds each probe so a socket that accepts but never answers
# cannot stall the loop forever.
while ! curl -sf --max-time 5 "${health_url}" >/dev/null 2>&1; do
    # Fail fast: if the server process is already gone (bad model path, OOM,
    # port in use, ...) there is no point in polling for the full timeout.
    if ! kill -0 "${vllm_pid}" 2>/dev/null; then
        echo "[mb1] FATAL vllm (pid ${vllm_pid}) exited before port ${PORT} came up"
        tail -40 "${log_file}" || true
        exit 1
    fi
    tries=$((tries+1))
    # 180 attempts x 2 s sleep = 6 minutes of waiting.
    if (( tries > 180 )); then
        echo "[mb1] FATAL port ${PORT} did not come up in 6 min"
        tail -40 "${log_file}" || true
        exit 1
    fi
    sleep 2
done
echo "[mb1] UP on $(hostname -s):${PORT} (GPU ${GPU}, chunk_tokens=${CHUNK_TOKENS})"