Files
xserv/tools/run_gpt_oss_bench.sh
Gahow Wang 15c51f143e server: support GptOss in TP engine + benchmark script
- tp_engine.rs: TpModel enum dispatches between Qwen3 and GptOss based on
  config.is_moe(). Server auto-detects model type on startup.
- tools/run_gpt_oss_bench.sh: one-click benchmark comparing xserv (TP=2)
  vs llama.cpp (BF16 GGUF) on GSM8K quality + speed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-30 15:39:44 +08:00

77 lines
2.4 KiB
Bash
Executable File

#!/bin/bash
#
# Benchmark gpt-oss-20b on xserv (TP=2, port 18080) against llama.cpp
# (BF16 GGUF, port 18090).
#
# Strict mode:
#   -e           abort on the first unhandled command failure
#   -u           treat use of an unset variable as an error
#   -o pipefail  a pipeline fails if ANY stage fails, so a crashed command
#                on the left of "| tee" is no longer reported as success
set -euo pipefail
cd /opt/wjh/projects/xserv

#######################################
# Stop any xserv-server / llama-server started for this benchmark.
# Matches on the full command line (server name followed by its port), so
# unrelated instances on other ports are left alone. Best-effort: pkill
# returns non-zero when nothing matched, which is not an error here.
#######################################
stop_servers() {
  pkill -f 'xserv-server.*18080' 2>/dev/null || true
  pkill -f 'llama-server.*18090' 2>/dev/null || true
}

# The servers are launched in the background under nohup, so they outlive
# this script unless explicitly killed. Tear them down on every exit path
# (normal end, error abort, Ctrl-C, SIGTERM), not just the happy path.
trap stop_servers EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Kill any existing servers
stop_servers
# Give the old processes a moment to exit before new ones are started.
sleep 2
# GSM8K problem count for the quality suite: first CLI argument, default 200.
QUALITY_LIMIT="${1:-200}"

printf '%s\n' \
  "=== gpt-oss-20b Benchmark: xserv (TP=2) vs llama.cpp (BF16) ===" \
  "GSM8K limit: $QUALITY_LIMIT problems"

# llama.cpp server command line (listens on port 18090).
llama_argv=(
  third_party/llama.cpp/build/bin/llama-server
  -m /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf
  --port 18090 -ngl 999 -c 4096
)

# xserv server command line (listens on port 18080, tensor parallel = 2).
xserv_argv=(
  ./target/release/xserv-server
  /opt/wjh/models/gpt-oss-20b-bf16
  --port 18080 --tp 2 --max-batch 1 --max-seq-len 4096
)

# Launch llama.cpp in the background on GPUs 2 and 3; all of its output
# goes to /tmp/llama-gptoss.log.
printf '%s\n' 'Starting llama-server on GPU 2,3...'
CUDA_VISIBLE_DEVICES=2,3 nohup "${llama_argv[@]}" >/tmp/llama-gptoss.log 2>&1 &

# Launch xserv in the background on GPUs 0 and 1; all of its output goes
# to /tmp/xserv-gptoss.log.
printf '%s\n' 'Starting xserv-server on GPU 0,1 (TP=2)...'
CUDA_VISIBLE_DEVICES=0,1 nohup "${xserv_argv[@]}" >/tmp/xserv-gptoss.log 2>&1 &
# Wait for both to be ready
#
# Polls each server's /health endpoint every 2 seconds, at most 60 times.
# Readiness is decided by curl's exit status, not by "got some bytes back":
#   -f            non-2xx responses count as failure. llama-server answers
#                 /health with 503 plus a JSON error body while the model is
#                 still loading, which a non-empty-body check would mistake
#                 for "ready".
#   --max-time 5  a server that accepts the connection but never answers
#                 cannot stall the loop indefinitely.
# NOTE(review): assumes xserv-server also answers /health with a 2xx status
# once it is ready -- confirm against the server's routes.
echo 'Waiting for servers to start...'
servers_ready=0
for ((i = 1; i <= 60; i++)); do
  sleep 2
  if curl -sf --max-time 5 -o /dev/null http://localhost:18080/health \
    && curl -sf --max-time 5 -o /dev/null http://localhost:18090/health; then
    echo "Both servers ready! (${i}x2s)"
    servers_ready=1
    break
  fi
done

if ((!servers_ready)); then
  # Show the tail of each server log so the cause of the failed start is
  # visible. "|| true" keeps a missing log from pre-empting the exit below.
  {
    echo 'ERROR: Timeout waiting for servers'
    echo '--- xserv log ---'
    tail -10 /tmp/xserv-gptoss.log || true
    echo '--- llama log ---'
    tail -10 /tmp/llama-gptoss.log || true
  } >&2
  exit 1
fi
# Without pipefail the status of "runner | tee" is tee's status, so a runner
# crash would be reported as success. Enable it before running the suites.
set -o pipefail

#######################################
# Run one suite of tools.bench.runner against both servers, mirroring the
# combined stdout/stderr to a log file.
# Arguments:
#   $1   suite name passed to --suite
#   $2   log file that receives a copy of the output
#   $3+  extra runner options, inserted right after --suite
# Outputs: runner output on stdout (stderr merged in)
# Returns: non-zero if the runner (or tee) failed
#######################################
run_suite() {
  local suite=$1
  local log_file=$2
  shift 2
  python3 -m tools.bench.runner \
    --xserv-base-url http://localhost:18080 \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url http://localhost:18090 \
    --suite "$suite" \
    "$@" \
    --max-seq-len 4096 2>&1 | tee "$log_file"
}

# Set to 1 as soon as any suite fails. Both suites are still attempted and
# the servers are still shut down before the failure is reported.
bench_failed=0

echo ''
echo '=== Running GSM8K quality benchmark ==='
if ! run_suite quality /tmp/bench_gptoss_quality.log \
  --quality-limit "$QUALITY_LIMIT"; then
  echo 'ERROR: quality benchmark failed' >&2
  bench_failed=1
fi

echo ''
echo '=== Running speed benchmark ==='
if ! run_suite speed /tmp/bench_gptoss_speed.log; then
  echo 'ERROR: speed benchmark failed' >&2
  bench_failed=1
fi

# Cleanup
# Best-effort: pkill returns non-zero when no process matched.
echo ''
echo '=== Cleaning up ==='
pkill -f 'xserv-server.*18080' 2>/dev/null || true
pkill -f 'llama-server.*18090' 2>/dev/null || true

echo ''
if ((bench_failed)); then
  echo '=== BENCHMARK FAILED ===' >&2
else
  echo '=== BENCHMARK COMPLETE ==='
fi
echo "Quality results: /tmp/bench_gptoss_quality.log"
echo "Speed results: /tmp/bench_gptoss_speed.log"

# Propagate failure to the caller (CI, wrapper scripts).
if ((bench_failed)); then
  exit 1
fi