#!/bin/bash
# Launch a single vLLM instance on one GPU for the interference microbenchmark.
# Uses TP=1, --enable-chunked-prefill and --enable-prefix-caching.
#
# Usage: bash launch_microbench1.sh [chunk_size] [port]
#   chunk_size: max_num_batched_tokens (default: 8192)
#   port:       serving port (default: 8000)
#
# Environment:
#   MODEL            model path
#                    (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   GPU_ID           value exported as CUDA_VISIBLE_DEVICES (default: 0)
#   STARTUP_TIMEOUT  seconds to wait for the server to answer (default: 120)
#
# Outputs:
#   - server stdout/stderr is mirrored to vllm_microbench1_chunk<chunk_size>.log
#   - the server PID is written to .vllm_microbench1.pid
#
# Exit status:
#   0  the server answered on /v1/models (it keeps running in the background)
#   1  the server process exited early, or did not answer within the timeout
#   2  an argument or STARTUP_TIMEOUT is not a non-negative integer

set -euo pipefail

CHUNK_SIZE=${1:-8192}
PORT=${2:-8000}
MODEL="${MODEL:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
GPU_ID=${GPU_ID:-0}
STARTUP_TIMEOUT=${STARTUP_TIMEOUT:-120}
LOG_FILE="vllm_microbench1_chunk${CHUNK_SIZE}.log"
PID_FILE=".vllm_microbench1.pid"

# PORT is interpolated into a pkill regex and CHUNK_SIZE into a file name, so
# reject anything that is not a plain integer before using them.
for numeric_var in CHUNK_SIZE PORT STARTUP_TIMEOUT; do
  if [[ ! "${!numeric_var}" =~ ^[0-9]+$ ]]; then
    echo "ERROR: $numeric_var must be a non-negative integer, got '${!numeric_var}'" >&2
    exit 2
  fi
done

echo "=== Interference Microbench vLLM Instance ==="
echo "Model: $MODEL"
echo "GPU: $GPU_ID"
echo "Port: $PORT"
echo "Chunk size (max_num_batched_tokens): $CHUNK_SIZE"
echo "Log: $LOG_FILE"
echo ""

# Kill any existing vLLM on this port. The trailing "( |$)" anchors the port
# number so that e.g. port 8000 does not also match a server on port 80001.
# pkill exits non-zero when nothing matched, which is expected here.
pkill -f "vllm.*--port ${PORT}( |\$)" 2>/dev/null || true
sleep 2

# Output goes through process substitution rather than "| tee ... &": with a
# backgrounded pipeline $! would be the PID of tee, not of the server.
CUDA_VISIBLE_DEVICES="$GPU_ID" python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL" \
  --tensor-parallel-size 1 \
  --enable-chunked-prefill \
  --enable-prefix-caching \
  --dtype auto \
  --gpu-memory-utilization 0.9 \
  --max-model-len 200000 \
  --max-num-batched-tokens "$CHUNK_SIZE" \
  --port "$PORT" \
  --trust-remote-code \
  --disable-log-requests \
  > >(tee "$LOG_FILE") 2>&1 &
VLLM_PID=$!

echo "vLLM PID: $VLLM_PID"
echo "$VLLM_PID" > "$PID_FILE"

# Wait for server to be ready, polling once per second.
echo "Waiting for server to start..."
for ((i = 1; i <= STARTUP_TIMEOUT; i++)); do
  # Fail fast if the server process is already gone instead of polling a dead
  # port until the timeout expires.
  if ! kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "ERROR: vLLM process $VLLM_PID exited before becoming ready (see $LOG_FILE)" >&2
    # Do not leave a PID file behind that points at a dead process.
    rm -f -- "$PID_FILE"
    exit 1
  fi
  # -f makes curl fail on HTTP error statuses, so only a successful response
  # counts as "ready".
  if curl -sf "http://127.0.0.1:$PORT/v1/models" > /dev/null 2>&1; then
    echo "Server ready after ${i}s!"
    exit 0
  fi
  sleep 1
done

# The server is left running on timeout (it may still be loading); its PID
# stays in $PID_FILE so the caller can stop it.
echo "ERROR: Server did not start within ${STARTUP_TIMEOUT}s" >&2
exit 1