#!/bin/bash
# Launch vLLM 0.18.1 in PD-combined mode (TP=8, all GPUs).
#
# Usage: bash scripts/launch_vllm.sh
#
# Environment overrides (each falls back to its default when unset or empty):
#   MODEL_PATH  model path passed to `vllm serve`
#               (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   HOST        value passed to --host (default: 0.0.0.0)
#   PORT        value passed to --port (default: 8000)

set -euo pipefail

# Locate the vllm binary relative to this script (<project>/.venv/bin/vllm,
# where <project> is the parent of the directory holding this script), so the
# launcher does not depend on the caller's working directory or PATH.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VLLM="$PROJECT_DIR/.venv/bin/vllm"

MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"

# Fail early with an actionable message instead of the shell's generic
# "No such file or directory" when the virtualenv is missing or incomplete.
if [[ ! -x "$VLLM" ]]; then
  printf 'error: vllm executable not found or not executable: %s\n' "$VLLM" >&2
  exit 127
fi

echo "Starting vLLM 0.18.1 in PD-combined mode (TP=8) on port $PORT ..."

# "$VLLM" is quoted so a project path containing spaces or glob characters
# is passed to the shell as a single command word.
"$VLLM" serve "$MODEL_PATH" \
  --trust-remote-code \
  --enable-prefix-caching \
  --dtype auto \
  --tensor-parallel-size 8 \
  --host "$HOST" \
  --port "$PORT"