#!/bin/bash
#
# Benchmark gpt-oss-20b served by xserv (TP=2, GPUs 0,1) against
# llama.cpp's llama-server (BF16 GGUF, GPUs 2,3).
#
# Usage: <this-script> [QUALITY_LIMIT]
#   QUALITY_LIMIT  value forwarded to the runner's --quality-limit
#                  (default: 200)
#
# Files written:
#   /tmp/bench_gptoss_quality.log   output of the quality suite
#   /tmp/bench_gptoss_speed.log     output of the speed suite
#   /tmp/xserv-gptoss.log           xserv-server stdout/stderr
#   /tmp/llama-gptoss.log           llama-server stdout/stderr
#
# Exit status: 0 when the servers came up and both suites succeeded,
# non-zero otherwise.

# pipefail matters here: every benchmark run is piped through tee, and
# without it a failing runner would be reported as success.
set -euo pipefail

cd /opt/wjh/projects/xserv

readonly XSERV_URL='http://localhost:18080'
readonly LLAMA_URL='http://localhost:18090'
readonly XSERV_LOG='/tmp/xserv-gptoss.log'
readonly LLAMA_LOG='/tmp/llama-gptoss.log'
readonly QUALITY_LOG='/tmp/bench_gptoss_quality.log'
readonly SPEED_LOG='/tmp/bench_gptoss_speed.log'
readonly MAX_POLLS=60
readonly POLL_INTERVAL_SECS=2

#######################################
# Kill any xserv-server / llama-server started on the benchmark ports.
# Best effort by design: "no such process" is not an error, hence the
# silenced stderr and `|| true`.
#######################################
stop_servers() {
  pkill -f 'xserv-server.*18080' 2>/dev/null || true
  pkill -f 'llama-server.*18090' 2>/dev/null || true
}

#######################################
# Probe a server's /health endpoint.
# Arguments: $1 - base URL of the server
# Returns:   0 if the endpoint answered with an HTTP success status.
#######################################
is_healthy() {
  # -f turns HTTP error statuses into a non-zero exit. Testing only for a
  # non-empty body would also accept the body of an error response.
  curl -sf -o /dev/null "$1/health" 2>/dev/null
}

#######################################
# Print the tail of both server logs (for startup diagnostics).
#######################################
dump_server_logs() {
  echo '--- xserv log ---'
  tail -n 10 "$XSERV_LOG" || true
  echo '--- llama log ---'
  tail -n 10 "$LLAMA_LOG" || true
}

#######################################
# Poll until both servers report healthy.
# Arguments: $1 - PID of xserv-server, $2 - PID of llama-server
# Returns:   0 once both are healthy; 1 if either process exits first or
#            MAX_POLLS polls elapse.
#######################################
wait_for_servers() {
  local xserv_pid=$1
  local llama_pid=$2
  local attempt

  for ((attempt = 1; attempt <= MAX_POLLS; attempt++)); do
    sleep "$POLL_INTERVAL_SECS"

    # Fail fast instead of polling a dead process for the full timeout.
    if ! kill -0 "$xserv_pid" 2>/dev/null; then
      echo 'ERROR: xserv-server exited during startup' >&2
      return 1
    fi
    if ! kill -0 "$llama_pid" 2>/dev/null; then
      echo 'ERROR: llama-server exited during startup' >&2
      return 1
    fi

    if is_healthy "$XSERV_URL" && is_healthy "$LLAMA_URL"; then
      echo "Both servers ready! (${attempt}x${POLL_INTERVAL_SECS}s)"
      return 0
    fi
  done

  echo 'ERROR: Timeout waiting for servers' >&2
  return 1
}

#######################################
# Run one suite of tools.bench.runner against both servers.
# Arguments: $1 - suite name, $2 - log file to tee output into,
#            remaining - extra runner flags placed after --suite
# Returns:   the runner's exit status (via pipefail).
#######################################
run_suite() {
  local suite=$1
  local log_file=$2
  shift 2

  python3 -m tools.bench.runner \
    --xserv-base-url "$XSERV_URL" \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url "$LLAMA_URL" \
    --suite "$suite" \
    "$@" \
    --max-seq-len 4096 2>&1 | tee "$log_file"
}

main() {
  local quality_limit=${1:-200}
  local xserv_pid llama_pid
  local status=0

  # Kill any existing servers
  stop_servers
  sleep 2

  echo "=== gpt-oss-20b Benchmark: xserv (TP=2) vs llama.cpp (BF16) ==="
  echo "GSM8K limit: $quality_limit problems"

  # From here on, never leave the nohup'd servers holding the GPUs,
  # whichever way the script ends.
  trap stop_servers EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  # Start llama.cpp (GPU 2,3)
  echo 'Starting llama-server on GPU 2,3...'
  CUDA_VISIBLE_DEVICES=2,3 nohup third_party/llama.cpp/build/bin/llama-server \
    -m /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf \
    --port 18090 -ngl 999 -c 4096 \
    > "$LLAMA_LOG" 2>&1 &
  llama_pid=$!

  # Start xserv (GPU 0,1, TP=2)
  echo 'Starting xserv-server on GPU 0,1 (TP=2)...'
  CUDA_VISIBLE_DEVICES=0,1 nohup ./target/release/xserv-server \
    /opt/wjh/models/gpt-oss-20b-bf16 \
    --port 18080 --tp 2 --max-batch 1 --max-seq-len 4096 \
    > "$XSERV_LOG" 2>&1 &
  xserv_pid=$!

  # Wait for both to be ready
  echo 'Waiting for servers to start...'
  if ! wait_for_servers "$xserv_pid" "$llama_pid"; then
    dump_server_logs >&2
    exit 1
  fi

  # Both suites always run; a failure in either is remembered and
  # reflected in the final exit status.
  echo ''
  echo '=== Running GSM8K quality benchmark ==='
  run_suite quality "$QUALITY_LOG" --quality-limit "$quality_limit" || status=1

  echo ''
  echo '=== Running speed benchmark ==='
  run_suite speed "$SPEED_LOG" || status=1

  # Cleanup (the EXIT trap repeats this harmlessly)
  echo ''
  echo '=== Cleaning up ==='
  stop_servers

  echo ''
  echo '=== BENCHMARK COMPLETE ==='
  echo "Quality results: $QUALITY_LOG"
  echo "Speed results: $SPEED_LOG"
  if ((status != 0)); then
    echo 'ERROR: at least one benchmark suite failed' >&2
  fi
  return "$status"
}

main "$@"