#!/bin/bash
# Benchmark gpt-oss-20b: xserv (TP=2) vs llama.cpp (BF16 GGUF)
# Runs GSM8K (200 problems by default) on both systems and produces a
# comparison report.
#
# Usage: ./tools/bench_gpt_oss.sh [--quality-limit N]
#        ./tools/bench_gpt_oss.sh [N]        # legacy positional form
#
# Steps: sync the project to $REMOTE, build xserv there, start both servers,
# wait until they answer /health, run the quality and speed suites, and stop
# the servers again (also on failure, via an EXIT trap).

set -euo pipefail

REMOTE="dash5"
REMOTE_DIR="/opt/wjh/projects/xserv"
MODEL_DIR="/opt/wjh/models/gpt-oss-20b-bf16"
GGUF="/opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf"
LLAMA_BIN="$REMOTE_DIR/third_party/llama.cpp/build/bin/llama-server"
XSERV_BIN="$REMOTE_DIR/target/release/xserv-server"
QUALITY_LIMIT=200
XSERV_PORT=18080
LLAMA_PORT=18090
READY_ATTEMPTS=30   # health-check polls before giving up
READY_INTERVAL=2    # seconds between polls
SERVERS_STARTED=0   # set to 1 once a server launch has been attempted

# Print an error to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print the command-line synopsis to stderr.
usage() {
  printf 'Usage: %s [--quality-limit N]\n' "${0##*/}" >&2
}

#######################################
# Parse command-line arguments into QUALITY_LIMIT.
# Accepts "--quality-limit N", "--quality-limit=N", and a bare positional N
# (the form the script historically read from $1).
# Globals:   QUALITY_LIMIT (written)
# Arguments: the script's "$@"
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --quality-limit)
        (( $# >= 2 )) || die "--quality-limit requires a value"
        QUALITY_LIMIT="$2"
        shift 2
        ;;
      --quality-limit=*)
        QUALITY_LIMIT="${1#*=}"
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      '')
        # An empty positional argument keeps the default limit.
        shift
        ;;
      -*)
        usage
        die "unknown option: $1"
        ;;
      *)
        QUALITY_LIMIT="$1"
        shift
        ;;
    esac
  done
  # The value is interpolated into a remote shell command, so restrict it
  # to digits.
  [[ "$QUALITY_LIMIT" =~ ^[0-9]+$ ]] \
    || die "quality limit must be a non-negative integer, got '$QUALITY_LIMIT'"
}

#######################################
# Stop both servers on the remote host. Safe to call more than once.
# Globals: SERVERS_STARTED (read/written), REMOTE, LLAMA_PORT, XSERV_PORT
#######################################
cleanup() {
  (( SERVERS_STARTED )) || return 0
  SERVERS_STARTED=0
  echo ""
  echo "=== Cleaning up ==="
  # The [l]/[x] brackets keep `pkill -f` from matching the remote shell whose
  # own command line contains these patterns; otherwise the first pkill can
  # terminate that shell before the second pkill runs.
  # Best-effort: a non-zero status just means nothing was left to kill.
  ssh "$REMOTE" "pkill -f '[l]lama-server.*$LLAMA_PORT' 2>/dev/null; \
    pkill -f '[x]serv-server.*$XSERV_PORT' 2>/dev/null" || true
}

#######################################
# Fetch http://localhost:PORT/health on the remote host.
# Arguments: $1 - curl flags, $2 - port
# Outputs:   the response body on stdout; nothing if the request failed
#######################################
probe_health() {
  local curl_flags=$1 port=$2
  ssh "$REMOTE" "curl $curl_flags http://localhost:$port/health 2>/dev/null" || true
}

#######################################
# Poll both /health endpoints until each returns a non-empty response.
# Returns: 0 when both servers answered, 1 after READY_ATTEMPTS polls.
#######################################
wait_for_servers() {
  local attempt xserv_ok llama_ok
  for (( attempt = 1; attempt <= READY_ATTEMPTS; attempt++ )); do
    # NOTE(review): xserv's /health contract is not visible from this script,
    # so it keeps the original "any non-empty response" check.
    xserv_ok=$(probe_health "-s" "$XSERV_PORT")
    # llama-server answers /health with HTTP 503 plus an error body while the
    # model is still loading; -f makes curl drop that body so it does not
    # count as ready.
    llama_ok=$(probe_health "-sf" "$LLAMA_PORT")
    if [[ -n "$xserv_ok" && -n "$llama_ok" ]]; then
      echo "Both servers ready!"
      return 0
    fi
    sleep "$READY_INTERVAL"
  done
  return 1
}

main() {
  parse_args "$@"

  echo "=== gpt-oss-20b Benchmark: xserv vs llama.cpp ==="
  echo "Quality limit: $QUALITY_LIMIT problems"
  echo ""

  # Sync project first
  local local_dir
  local_dir="$(cd "$(dirname "$0")/.." && pwd)"
  echo "Syncing project..."
  rsync -az --exclude target --exclude .git --exclude bench-out \
    "$local_dir/" "$REMOTE:$REMOTE_DIR/" >/dev/null

  # Build on remote. pipefail is enabled right before the build so that a
  # failing `cargo build` is not hidden by the exit status of `tail`.
  echo "Building xserv..."
  ssh "$REMOTE" "cd $REMOTE_DIR && source /etc/profile && \
    if [ -d /usr/local/cuda-12.9 ]; then export CUDA_HOME=/usr/local/cuda-12.9; else export CUDA_HOME=/usr/local/cuda; fi && \
    export PATH=\$CUDA_HOME/bin:\$PATH && \
    set -o pipefail && \
    cargo build --release 2>&1 | tail -3"

  echo ""
  echo "=== Starting servers ==="

  # From here on, make sure the remote servers are stopped on any exit path.
  trap cleanup EXIT
  SERVERS_STARTED=1

  # Start llama.cpp server (GPU 2-3, BF16 GGUF needs ~42GB -> use 2 GPUs).
  # -ngl is the short form of --n-gpu-layers, so it is passed only once.
  # stdin is detached so ssh can return while the server keeps running.
  echo "Starting llama-server (GPU 2,3)..."
  ssh "$REMOTE" "CUDA_VISIBLE_DEVICES=2,3 nohup $LLAMA_BIN \
    -m $GGUF --port $LLAMA_PORT -ngl 999 -c 4096 \
    > /tmp/llama-gptoss.log 2>&1 < /dev/null &"
  sleep 5

  # Start xserv server (GPU 0,1, TP=2)
  echo "Starting xserv-server (GPU 0,1, TP=2)..."
  ssh "$REMOTE" "CUDA_VISIBLE_DEVICES=0,1 nohup $XSERV_BIN $MODEL_DIR \
    --port $XSERV_PORT --tp 2 --max-batch 1 --max-seq-len 4096 \
    > /tmp/xserv-gptoss.log 2>&1 < /dev/null &"
  sleep 10

  # Wait for servers to be ready; benchmarking servers that never came up
  # would only produce misleading results, so a timeout is fatal.
  echo "Waiting for servers..."
  wait_for_servers \
    || die "servers not ready after $READY_ATTEMPTS attempts; see /tmp/llama-gptoss.log and /tmp/xserv-gptoss.log on $REMOTE"

  echo ""
  echo "=== Running GSM8K benchmark ($QUALITY_LIMIT problems) ==="

  # Run quality benchmark
  ssh "$REMOTE" "cd $REMOTE_DIR && python3 -m tools.bench.runner \
    --xserv-base-url http://localhost:$XSERV_PORT \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url http://localhost:$LLAMA_PORT \
    --suite quality \
    --quality-limit $QUALITY_LIMIT \
    --max-seq-len 4096" 2>&1

  echo ""
  echo "=== Running speed benchmark ==="
  ssh "$REMOTE" "cd $REMOTE_DIR && python3 -m tools.bench.runner \
    --xserv-base-url http://localhost:$XSERV_PORT \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url http://localhost:$LLAMA_PORT \
    --suite speed \
    --max-seq-len 4096" 2>&1

  # Cleanup (explicit on the success path so the final message prints last;
  # the EXIT trap then has nothing left to do).
  cleanup
  trap - EXIT

  # NOTE(review): the runner executes on $REMOTE inside $REMOTE_DIR and
  # bench-out is excluded from the sync, so this path is presumably on the
  # remote host -- confirm.
  echo "Done! Results in bench-out/"
}

main "$@"