- tp_engine.rs: TpModel enum dispatches between Qwen3 and GptOss based on config.is_moe(). Server auto-detects model type on startup.
- tools/run_gpt_oss_bench.sh: one-click benchmark comparing xserv (TP=2) vs llama.cpp (BF16 GGUF) on GSM8K quality + speed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
77 lines
2.4 KiB
Bash
Executable File
77 lines
2.4 KiB
Bash
Executable File
#!/bin/bash
#
# One-click gpt-oss-20b benchmark: xserv (TP=2) vs. llama.cpp (BF16 GGUF).
#
# Starts llama-server on GPUs 2,3 and xserv-server on GPUs 0,1, waits until
# both answer GET /health with a successful HTTP status, then runs the
# "quality" (GSM8K) and "speed" suites of tools.bench.runner against them.
#
# Usage: run_gpt_oss_bench.sh [QUALITY_LIMIT]
#   QUALITY_LIMIT  value passed to the runner's --quality-limit (default: 200)
#
# Outputs:
#   /tmp/xserv-gptoss.log, /tmp/llama-gptoss.log                 server logs
#   /tmp/bench_gptoss_quality.log, /tmp/bench_gptoss_speed.log   runner output
#
# Exit status: 0 when both suites succeed; non-zero when the servers do not
# become ready in time or a suite fails. The servers are stopped on every
# exit path.

# -u catches misspelled variable names; pipefail makes a failing benchmark
# runner fail the script even though its output is piped through tee.
set -euo pipefail

readonly PROJECT_DIR=/opt/wjh/projects/xserv
readonly XSERV_PORT=18080
readonly LLAMA_PORT=18090
readonly MAX_SEQ_LEN=4096
readonly XSERV_LOG=/tmp/xserv-gptoss.log
readonly LLAMA_LOG=/tmp/llama-gptoss.log
readonly QUALITY_LOG=/tmp/bench_gptoss_quality.log
readonly SPEED_LOG=/tmp/bench_gptoss_speed.log
readonly POLL_ATTEMPTS=60
readonly POLL_INTERVAL_S=2

#######################################
# Stop any xserv/llama server bound to the benchmark ports.
# Globals:   XSERV_PORT, LLAMA_PORT (read)
# Returns:   always 0
#######################################
stop_servers() {
  # pkill exits non-zero when nothing matched, which is the normal case here,
  # so its status (and its diagnostics) are deliberately ignored.
  pkill -f "xserv-server.*${XSERV_PORT}" 2>/dev/null || true
  pkill -f "llama-server.*${LLAMA_PORT}" 2>/dev/null || true
}

#######################################
# Check whether the server on a local port reports healthy.
# Arguments: $1 - TCP port on localhost
# Returns:   0 if GET /health answered with an HTTP status below 400
#######################################
is_healthy() {
  # -f turns HTTP errors (e.g. a 503 sent while a model is still loading)
  # into a non-zero exit, so only a genuine success counts as "ready".
  # --max-time keeps a hung connection from stalling the polling loop.
  curl -sf --max-time 5 -o /dev/null "http://localhost:$1/health" 2>/dev/null
}

#######################################
# Poll both servers until they are healthy or the attempt budget runs out.
# Globals:   POLL_ATTEMPTS, POLL_INTERVAL_S, XSERV_PORT, LLAMA_PORT,
#            XSERV_LOG, LLAMA_LOG (read)
# Outputs:   progress on stdout; timeout diagnostics and log tails on stderr
# Returns:   0 when both servers are ready, 1 on timeout
#######################################
wait_for_servers() {
  local attempt
  for ((attempt = 1; attempt <= POLL_ATTEMPTS; attempt++)); do
    sleep "$POLL_INTERVAL_S"
    if is_healthy "$XSERV_PORT" && is_healthy "$LLAMA_PORT"; then
      echo "Both servers ready! (${attempt}x${POLL_INTERVAL_S}s)"
      return 0
    fi
  done

  {
    echo 'ERROR: Timeout waiting for servers'
    echo '--- xserv log ---'
    tail -10 "$XSERV_LOG" || true
    echo '--- llama log ---'
    tail -10 "$LLAMA_LOG" || true
  } >&2
  return 1
}

#######################################
# Run one suite of tools.bench.runner against both servers.
# Arguments: $1 - suite name, $2 - log file, $3.. - extra runner arguments
# Outputs:   runner stdout+stderr, shown on stdout and copied to the log file
# Returns:   the runner's exit status (propagated through tee by pipefail)
#######################################
run_suite() {
  local suite=$1
  local log_file=$2
  shift 2
  python3 -m tools.bench.runner \
    --xserv-base-url "http://localhost:${XSERV_PORT}" \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url "http://localhost:${LLAMA_PORT}" \
    --suite "$suite" \
    "$@" \
    --max-seq-len "$MAX_SEQ_LEN" 2>&1 | tee "$log_file"
}

main() {
  local quality_limit="${1:-200}"

  cd "$PROJECT_DIR" || { echo "ERROR: cannot cd to $PROJECT_DIR" >&2; exit 1; }

  # Kill any existing servers, and make sure the ones started below never
  # outlive this script, whichever way it ends (error, timeout, Ctrl-C).
  stop_servers
  trap stop_servers EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM
  sleep 2

  echo "=== gpt-oss-20b Benchmark: xserv (TP=2) vs llama.cpp (BF16) ==="
  echo "GSM8K limit: $quality_limit problems"

  # Start llama.cpp (GPU 2,3)
  echo 'Starting llama-server on GPU 2,3...'
  CUDA_VISIBLE_DEVICES=2,3 nohup third_party/llama.cpp/build/bin/llama-server \
    -m /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf \
    --port "$LLAMA_PORT" -ngl 999 -c "$MAX_SEQ_LEN" \
    > "$LLAMA_LOG" 2>&1 &

  # Start xserv (GPU 0,1, TP=2)
  echo 'Starting xserv-server on GPU 0,1 (TP=2)...'
  CUDA_VISIBLE_DEVICES=0,1 nohup ./target/release/xserv-server \
    /opt/wjh/models/gpt-oss-20b-bf16 \
    --port "$XSERV_PORT" --tp 2 --max-batch 1 --max-seq-len "$MAX_SEQ_LEN" \
    > "$XSERV_LOG" 2>&1 &

  # Wait for both to be ready
  echo 'Waiting for servers to start...'
  wait_for_servers || exit 1

  echo ''
  echo '=== Running GSM8K quality benchmark ==='
  run_suite quality "$QUALITY_LOG" --quality-limit "$quality_limit"

  echo ''
  echo '=== Running speed benchmark ==='
  run_suite speed "$SPEED_LOG"

  # Cleanup
  echo ''
  echo '=== Cleaning up ==='
  stop_servers
  trap - EXIT

  echo ''
  echo '=== BENCHMARK COMPLETE ==='
  echo "Quality results: $QUALITY_LOG"
  echo "Speed results: $SPEED_LOG"
}

main "$@"