Files
xserv/tools/bench_gpt_oss.sh
Gahow Wang 15c51f143e server: support GptOss in TP engine + benchmark script
- tp_engine.rs: TpModel enum dispatches between Qwen3 and GptOss based on
  config.is_moe(). Server auto-detects model type on startup.
- tools/bench_gpt_oss.sh: one-click benchmark comparing xserv (TP=2)
  vs llama.cpp (BF16 GGUF) on GSM8K quality + speed

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-30 15:39:44 +08:00

93 lines
3.0 KiB
Bash

#!/bin/bash
# Benchmark gpt-oss-20b: xserv (TP=2) vs llama.cpp (BF16 GGUF)
# Runs GSM8K problems (200 by default) on both systems and produces a
# comparison report.
#
# Usage: ./tools/bench_gpt_oss.sh [--quality-limit N]
#        ./tools/bench_gpt_oss.sh [N]        (legacy positional form)
set -euo pipefail

# Remote host that builds the project and runs both servers, plus the
# paths used on that host.
REMOTE="dash5"
REMOTE_DIR="/opt/wjh/projects/xserv"
MODEL_DIR="/opt/wjh/models/gpt-oss-20b-bf16"
GGUF="/opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf"
LLAMA_BIN="$REMOTE_DIR/third_party/llama.cpp/build/bin/llama-server"
XSERV_BIN="$REMOTE_DIR/target/release/xserv-server"

# Ports the two servers listen on (on the remote host).
XSERV_PORT=18080
LLAMA_PORT=18090

#######################################
# Resolve the number of GSM8K problems to run from the script arguments.
# Accepts `--quality-limit N`, `--quality-limit=N`, or a bare `N` (the form
# the script originally read from $1). An empty positional argument keeps
# the default, matching the old `${1:-200}` behaviour.
# Arguments: the script's command-line arguments
# Outputs:   the limit on stdout; diagnostics on stderr
# Returns:   0 on success, 2 on a usage error
#######################################
parse_quality_limit() {
  local limit=200
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --quality-limit)
        if [ "$#" -lt 2 ]; then
          echo "error: --quality-limit requires a value" >&2
          return 2
        fi
        limit="$2"
        shift 2
        ;;
      --quality-limit=*)
        limit="${1#--quality-limit=}"
        shift
        ;;
      -*)
        echo "error: unknown option '$1'" >&2
        echo "Usage: $0 [--quality-limit N]" >&2
        return 2
        ;;
      *)
        if [ -n "$1" ]; then
          limit="$1"
        fi
        shift
        ;;
    esac
  done
  # Reject anything that is not a plain decimal integer before it gets
  # interpolated into the remote benchmark command line.
  case "$limit" in
    '' | *[!0-9]*)
      echo "error: quality limit must be a non-negative integer, got '$limit'" >&2
      return 2
      ;;
  esac
  printf '%s\n' "$limit"
}

# Under `set -e` a usage error in the substitution aborts the script here.
QUALITY_LIMIT="$(parse_quality_limit "$@")"

echo "=== gpt-oss-20b Benchmark: xserv vs llama.cpp ==="
echo "Quality limit: $QUALITY_LIMIT problems"
echo ""
# Sync project first.
# The local project root is the parent of the directory holding this script.
# target/, .git/ and bench-out/ are excluded from the copy.
LOCAL_DIR="$(cd -- "$(dirname -- "$0")/.." && pwd)"
echo "Syncing project..."
rsync -az --exclude target --exclude .git --exclude bench-out \
  "$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/" >/dev/null

# Build on remote.
# Only the last three lines of cargo's output are shown. Without pipefail
# the remote pipeline's exit status would be that of `tail`, hiding a failed
# build; enabling it lets the failure propagate through ssh so this script
# stops instead of benchmarking a stale or missing binary.
# CUDA_HOME prefers /usr/local/cuda-12.9 when that directory exists.
echo "Building xserv..."
ssh "$REMOTE" "cd $REMOTE_DIR && source /etc/profile && set -o pipefail && \
  if [ -d /usr/local/cuda-12.9 ]; then export CUDA_HOME=/usr/local/cuda-12.9; else export CUDA_HOME=/usr/local/cuda; fi && \
  export PATH=\$CUDA_HOME/bin:\$PATH && \
  cargo build --release 2>&1 | tail -3"
echo ""
echo "=== Starting servers ==="

#######################################
# Stop both benchmark servers on the remote host (best effort).
# The patterns use a one-character bracket expression ("[l]lama-server")
# so that `pkill -f` does not match the remote shell that is executing this
# very command line; otherwise that shell can be killed before the second
# pkill runs. pkill's non-zero status when nothing matches is ignored.
# Globals: REMOTE, LLAMA_PORT, XSERV_PORT (read)
#######################################
stop_servers() {
  ssh "$REMOTE" "pkill -f '[l]lama-server.*$LLAMA_PORT' 2>/dev/null; \
    pkill -f '[x]serv-server.*$XSERV_PORT' 2>/dev/null" || true
}

# With `set -e` any failing step below aborts the script; the EXIT trap makes
# sure the servers do not stay behind holding the GPUs in that case.
trap stop_servers EXIT

# Start llama.cpp server (GPU 2-3, BF16 GGUF needs ~42GB → use 2 GPUs).
# stdin is redirected from /dev/null so the backgrounded server is fully
# detached from the ssh session that launched it.
echo "Starting llama-server (GPU 2,3)..."
ssh "$REMOTE" "CUDA_VISIBLE_DEVICES=2,3 nohup $LLAMA_BIN \
  -m $GGUF --port $LLAMA_PORT -ngl 999 -c 4096 \
  > /tmp/llama-gptoss.log 2>&1 < /dev/null &"
sleep 5

# Start xserv server (GPU 0,1, TP=2)
echo "Starting xserv-server (GPU 0,1, TP=2)..."
ssh "$REMOTE" "CUDA_VISIBLE_DEVICES=0,1 nohup $XSERV_BIN $MODEL_DIR \
  --port $XSERV_PORT --tp 2 --max-batch 1 --max-seq-len 4096 \
  > /tmp/xserv-gptoss.log 2>&1 < /dev/null &"
sleep 10

# Wait for servers to be ready: up to 30 probes, 2 seconds apart.
# - xserv counts as ready once /health yields any HTTP response.
#   NOTE(review): xserv's /health status codes are not visible here — confirm
#   whether `-f` can be used for it as well.
# - llama-server answers /health with an error status and a body while the
#   model is still loading, so `-f` is required to treat only success as ready.
echo "Waiting for servers..."
servers_ready=0
for attempt in $(seq 1 30); do
  if ssh "$REMOTE" "curl -s -o /dev/null http://localhost:$XSERV_PORT/health 2>/dev/null && \
      curl -sf -o /dev/null http://localhost:$LLAMA_PORT/health 2>/dev/null"; then
    servers_ready=1
    echo "Both servers ready!"
    break
  fi
  sleep 2
done
if [ "$servers_ready" -ne 1 ]; then
  echo "error: servers not ready after $attempt attempts;" \
    "check /tmp/xserv-gptoss.log and /tmp/llama-gptoss.log on $REMOTE" >&2
  exit 1
fi

echo ""
echo "=== Running GSM8K benchmark ($QUALITY_LIMIT problems) ==="
# Run quality benchmark
ssh "$REMOTE" "cd $REMOTE_DIR && python3 -m tools.bench.runner \
  --xserv-base-url http://localhost:$XSERV_PORT \
  --xserv-model-id gpt-oss-20b \
  --llama-base-url http://localhost:$LLAMA_PORT \
  --suite quality \
  --quality-limit $QUALITY_LIMIT \
  --max-seq-len 4096" 2>&1

echo ""
echo "=== Running speed benchmark ==="
ssh "$REMOTE" "cd $REMOTE_DIR && python3 -m tools.bench.runner \
  --xserv-base-url http://localhost:$XSERV_PORT \
  --xserv-model-id gpt-oss-20b \
  --llama-base-url http://localhost:$LLAMA_PORT \
  --suite speed \
  --max-seq-len 4096" 2>&1

# Cleanup
echo ""
echo "=== Cleaning up ==="
stop_servers
# The servers are already stopped; drop the trap so it does not run again.
trap - EXIT
# NOTE(review): the runner executes on $REMOTE, so bench-out/ is presumably
# under $REMOTE_DIR there rather than in the local checkout — confirm.
echo "Done! Results in bench-out/"