#!/bin/bash
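# Launch a single vLLM OpenAI-compatible API server for the interference
# microbenchmark, pinned to one GPU (GPU 0 unless GPU_ID is set).
# Uses tensor-parallel size 1 and --enable-prefix-caching. No
# --enable-chunked-prefill flag is passed explicitly, so chunked prefill
# follows the vLLM default; the chunk size is set via --max-num-batched-tokens.
#
# Usage: bash launch_microbench1.sh [chunk_size] [port]
#   chunk_size: max_num_batched_tokens (default: 8192)
#   port: serving port (default: 8000)
#
# Environment:
#   MODEL:  model path (default: $HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct)
#   GPU_ID: value exported as CUDA_VISIBLE_DEVICES for the server (default: 0)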

set -euo pipefail

CHUNK_SIZE=${1:-8192}
PORT=${2:-8000}
MODEL="${MODEL:-$HOME/models/Qwen/Qwen3-Coder-30B-A3B-Instruct}"
GPU_ID=${GPU_ID:-0}
LOG_FILE="vllm_microbench1_chunk${CHUNK_SIZE}.log"

echo "=== Interference Microbench vLLM Instance ==="
echo "Model: $MODEL"
echo "GPU: $GPU_ID"
echo "Port: $PORT"
echo "Chunk size (max_num_batched_tokens): $CHUNK_SIZE"
echo "Log: $LOG_FILE"
echo ""

# Kill any existing vLLM on this port so the new instance can bind it.
# The trailing "( |$)" anchors the port number: without it, "--port 8000"
# would also match (and kill) a server started with e.g. "--port 80001".
stale_pattern="vllm.*--port ${PORT}( |\$)"
if pkill -f "$stale_pattern" 2>/dev/null; then
    # Something was signalled: keep the original 2 s grace period, then poll
    # for up to 8 more seconds until no matching process remains.
    sleep 2
    for ((grace = 0; grace < 8; grace++)); do
        pgrep -f "$stale_pattern" > /dev/null 2>&1 || break
        sleep 1
    done
fi

# Start the API server in the background, restricted to the selected GPU.
# stdout and stderr are both copied to the terminal and to $LOG_FILE.
#
# The output goes through process substitution instead of `... | tee ... &`:
# for a backgrounded pipeline $! is the PID of the LAST stage (tee), so the
# recorded PID would not identify the server. With the form below the
# background job is the python command itself, so $! is the server's PID.
CUDA_VISIBLE_DEVICES=$GPU_ID python -m vllm.entrypoints.openai.api_server \
    --model "$MODEL" \
    --tensor-parallel-size 1 \
    --enable-prefix-caching \
    --dtype auto \
    --gpu-memory-utilization 0.9 \
    --max-model-len 200000 \
    --max-num-batched-tokens "$CHUNK_SIZE" \
    --port "$PORT" \
    --trust-remote-code \
    --disable-log-requests \
    > >(tee "$LOG_FILE") 2>&1 &

# Record the server PID on stdout and in a PID file in the working directory.
VLLM_PID=$!
echo "vLLM PID: $VLLM_PID"
echo "$VLLM_PID" > .vllm_microbench1.pid

# Wait for server to be ready: poll /v1/models about once per second until
# curl gets an HTTP response, the launched process disappears, or the startup
# timeout elapses.
#   STARTUP_TIMEOUT (env, optional): number of polls before giving up
#                                    (default: 120; invalid values fall back).
# Exit status: 0 once the server answers, 1 on early exit or timeout. On
# timeout the server process is not killed by this script.
startup_timeout=${STARTUP_TIMEOUT:-120}
[[ "$startup_timeout" =~ ^[1-9][0-9]*$ ]] || startup_timeout=120

echo "Waiting for server to start..."
for ((i = 1; i <= startup_timeout; i++)); do
    # --max-time bounds each probe so a stalled connection cannot hang the loop.
    if curl -s --max-time 5 "http://127.0.0.1:$PORT/v1/models" > /dev/null 2>&1; then
        echo "Server ready after ${i}s!"
        exit 0
    fi
    # Fail fast if the background job is already gone instead of polling a
    # dead server until the timeout expires.
    if ! kill -0 "$VLLM_PID" 2>/dev/null; then
        echo "ERROR: vLLM process (PID $VLLM_PID) exited before becoming ready; see $LOG_FILE" >&2
        exit 1
    fi
    sleep 1
done

echo "ERROR: Server did not start within ${startup_timeout}s" >&2
exit 1