xserv/tools/run_gpt_oss_bench.sh

#!/bin/bash
# run_gpt_oss_bench.sh - start llama-server and xserv-server for gpt-oss-20b,
# run the tools.bench.runner quality and speed suites against both, and stop
# the servers again.
#
# Usage: run_gpt_oss_bench.sh [QUALITY_LIMIT]
#   QUALITY_LIMIT  value passed to the runner's --quality-limit (default: 200)
#
# Strict mode:
#   -e           abort on an unhandled command failure
#   -u           abort on use of an unset variable
#   -o pipefail  a `cmd | tee log` pipeline fails when cmd fails, instead of
#                reporting tee's (successful) exit status
set -euo pipefail

cd /opt/wjh/projects/xserv || {
    echo 'ERROR: cannot cd to /opt/wjh/projects/xserv' >&2
    exit 1
}

# Stop both benchmark servers, matched by binary name and port on their
# command line. pkill exits non-zero when nothing matched, which is not an
# error here, hence the `|| true`.
stop_servers() {
    pkill -f 'xserv-server.*18080' 2>/dev/null || true
    pkill -f 'llama-server.*18090' 2>/dev/null || true
}

# Make sure the servers never outlive the script: the EXIT trap runs on normal
# completion, on an explicit `exit`, and when `set -e` aborts the script.
trap stop_servers EXIT

# Kill any existing servers
stop_servers
sleep 2

# Quality-suite problem limit: first positional argument, or 200 when the
# argument is missing or empty.
QUALITY_LIMIT=${1:-200}

# Print the run banner followed by the effective limit.
printf '%s\n' \
    '=== gpt-oss-20b Benchmark: xserv (TP=2) vs llama.cpp (BF16) ===' \
    "GSM8K limit: ${QUALITY_LIMIT} problems"

# Launch both servers in the background. Each one is restricted to its own
# pair of GPUs through CUDA_VISIBLE_DEVICES, is started under nohup, and has
# stdout and stderr redirected to a log file under /tmp.

# llama.cpp server: GPUs 2 and 3, listening on port 18090.
echo 'Starting llama-server on GPU 2,3...'
llama_cmd=(
    third_party/llama.cpp/build/bin/llama-server
    -m /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf
    --port 18090 -ngl 999 -c 4096
)
CUDA_VISIBLE_DEVICES=2,3 nohup "${llama_cmd[@]}" > /tmp/llama-gptoss.log 2>&1 &

# xserv server: GPUs 0 and 1 with --tp 2, listening on port 18080.
echo 'Starting xserv-server on GPU 0,1 (TP=2)...'
xserv_cmd=(
    ./target/release/xserv-server
    /opt/wjh/models/gpt-oss-20b-bf16
    --port 18080 --tp 2 --max-batch 1 --max-seq-len 4096
)
CUDA_VISIBLE_DEVICES=0,1 nohup "${xserv_cmd[@]}" > /tmp/xserv-gptoss.log 2>&1 &

# Wait for both servers to be ready.
#
# Polls the /health endpoint of each server every 2 seconds, for at most 60
# attempts. The script exits with status 1 (after printing the tail of both
# server logs to stderr) when the attempts run out or when one of the server
# processes has disappeared.

# Succeeds only when http://localhost:<port>/health answers with a non-error
# HTTP status.
#   -f          make curl fail on HTTP status >= 400, so an error response
#               with a body (llama-server documents a 503 while its model is
#               still loading) is not mistaken for "ready"
#   --max-time  bound a request to a server that accepted the connection but
#               never answers, so the polling loop cannot hang
# Arguments: $1 - TCP port of the server
health_ok() {
    curl -sf --connect-timeout 1 --max-time 2 -o /dev/null \
        "http://localhost:$1/health" 2>/dev/null
}

# Print the last 10 lines of each server log. Best effort: a missing log file
# must not prevent the other log from being shown.
dump_server_logs() {
    echo '--- xserv log ---'
    tail -10 /tmp/xserv-gptoss.log || true
    echo '--- llama log ---'
    tail -10 /tmp/llama-gptoss.log || true
}

echo 'Waiting for servers to start...'
servers_ready=0
for ((i = 1; i <= 60; i++)); do
    sleep 2
    if health_ok 18080 && health_ok 18090; then
        echo "Both servers ready! (${i}x2s)"
        servers_ready=1
        break
    fi
    # Fail fast instead of polling a server that has already died. The
    # patterns are the ones the script uses with pkill to find the servers.
    if ! pgrep -f 'xserv-server.*18080' > /dev/null; then
        echo 'ERROR: xserv-server exited before becoming ready' >&2
        dump_server_logs >&2
        exit 1
    fi
    if ! pgrep -f 'llama-server.*18090' > /dev/null; then
        echo 'ERROR: llama-server exited before becoming ready' >&2
        dump_server_logs >&2
        exit 1
    fi
done

if [[ "$servers_ready" -ne 1 ]]; then
    echo 'ERROR: Timeout waiting for servers' >&2
    dump_server_logs >&2
    exit 1
fi

# Run the quality and speed suites, stop the servers, and report the outcome.
#
# pipefail makes a `runner | tee` pipeline return the runner's exit status
# rather than tee's; without it a crashed suite would still be reported as
# complete. A failing suite does not stop the remaining suite or the cleanup;
# it is recorded and turns the final exit status into 1.
set -o pipefail

# Run one tools.bench.runner suite against both servers, mirroring its
# combined stdout/stderr to the terminal and to a log file.
# Arguments: $1 - suite name passed to --suite
#            $2 - log file path
#            $3.. - extra runner options, inserted after --suite
# Returns:   the pipeline status (non-zero when the runner or tee fails)
run_suite() {
    local suite=$1 log_file=$2
    shift 2
    python3 -m tools.bench.runner \
        --xserv-base-url http://localhost:18080 \
        --xserv-model-id gpt-oss-20b \
        --llama-base-url http://localhost:18090 \
        --suite "$suite" \
        "$@" \
        --max-seq-len 4096 2>&1 | tee "$log_file"
}

failed_suites=''

echo ''
echo '=== Running GSM8K quality benchmark ==='
run_suite quality /tmp/bench_gptoss_quality.log \
    --quality-limit "$QUALITY_LIMIT" || failed_suites="${failed_suites} quality"

echo ''
echo '=== Running speed benchmark ==='
run_suite speed /tmp/bench_gptoss_speed.log || failed_suites="${failed_suites} speed"

# Cleanup
echo ''
echo '=== Cleaning up ==='
# pkill exits non-zero when no process matched; that is not an error here.
pkill -f 'xserv-server.*18080' 2>/dev/null || true
pkill -f 'llama-server.*18090' 2>/dev/null || true

echo ''
if [[ -n "$failed_suites" ]]; then
    echo "=== BENCHMARK FAILED (suites:${failed_suites}) ===" >&2
    echo "Quality results: /tmp/bench_gptoss_quality.log" >&2
    echo "Speed results: /tmp/bench_gptoss_speed.log" >&2
    exit 1
fi
echo '=== BENCHMARK COMPLETE ==='
echo "Quality results: /tmp/bench_gptoss_quality.log"
echo "Speed results: /tmp/bench_gptoss_speed.log"