From 15c51f143e0baa6669a6ab66f55e93c821e97c57 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Sat, 30 May 2026 15:39:44 +0800 Subject: [PATCH] server: support GptOss in TP engine + benchmark scripts - tp_engine.rs: TpModel enum dispatches between Qwen3 and GptOss based on config.is_moe(). Server auto-detects model type on startup. - tools/run_gpt_oss_bench.sh: one-click benchmark comparing xserv (TP=2) vs llama.cpp (BF16 GGUF) on GSM8K quality + speed; runs directly on the GPU host. - tools/bench_gpt_oss.sh: driver for the same comparison from a local checkout; rsyncs the project to the remote host, builds it there, and starts the servers and the benchmark runner over SSH. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/xserv-server/src/tp_engine.rs | 35 +++++++++-- tools/bench_gpt_oss.sh | 92 ++++++++++++++++++++++++++++ tools/run_gpt_oss_bench.sh | 76 +++++++++++++++++++++++ 3 files changed, 198 insertions(+), 5 deletions(-) create mode 100644 tools/bench_gpt_oss.sh create mode 100755 tools/run_gpt_oss_bench.sh diff --git a/crates/xserv-server/src/tp_engine.rs b/crates/xserv-server/src/tp_engine.rs index 3e4a203..975a138 100644 --- a/crates/xserv-server/src/tp_engine.rs +++ b/crates/xserv-server/src/tp_engine.rs @@ -19,8 +19,8 @@ use std::thread; use xserv_distributed::{TpContext, UniqueId}; use xserv_model::loader; -use xserv_model::{sample, ModelConfig, PagedKVCache, Qwen3, BLOCK_SIZE}; -use xserv_tensor::{DType, Device}; +use xserv_model::{sample, GptOss, ModelConfig, PagedKVCache, Qwen3, BLOCK_SIZE}; +use xserv_tensor::{DType, Device, Tensor}; use xserv_tokenizer::Tokenizer; use crate::engine::{GenerateEvent, GenerateRequest}; @@ -34,8 +34,29 @@ enum TpCommand { Shutdown, } +enum TpModel { + Qwen3(Qwen3), + GptOss(GptOss), +} + +impl TpModel { + fn forward_prefill_paged(&self, tokens: &[u32], slot: usize, cache: &mut PagedKVCache) -> Tensor { + match self { + TpModel::Qwen3(m) => m.forward_prefill_paged(tokens, slot, cache), + TpModel::GptOss(m) => m.forward_prefill_paged(tokens, slot, cache), + } + } + + fn forward_decode_paged(&self, tokens: &[u32], positions: &[usize], slots: &[usize], cache: &mut PagedKVCache) -> Tensor { + match self { + TpModel::Qwen3(m) => 
m.forward_decode_paged(tokens, positions, slots, cache), + TpModel::GptOss(m) => m.forward_decode_paged(tokens, positions, slots, cache), + } + } +} + struct RankCtx { - model: Qwen3, + model: TpModel, cache: PagedKVCache, } @@ -49,9 +70,13 @@ fn build_rank( tp: Option>, ) -> RankCtx { let weights = loader::load_model_dir(model_dir, Device::Cpu); - let model = Qwen3::from_weights_tp(config.clone(), weights, rank, world, device, tp); + let model = if config.is_moe() { + TpModel::GptOss(GptOss::from_weights_tp(config.clone(), weights, rank, world, device, tp)) + } else { + TpModel::Qwen3(Qwen3::from_weights_tp(config.clone(), weights, rank, world, device, tp)) + }; let local_kv = config.num_kv_heads() / world; - let max_blocks_per_seq = max_seq_len.div_ceil(BLOCK_SIZE); + let max_blocks_per_seq = (max_seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; let total_blocks = max_blocks_per_seq + 8; let cache = PagedKVCache::new_tp( config, local_kv, total_blocks, 0, 4, max_blocks_per_seq, DType::BF16, device, diff --git a/tools/bench_gpt_oss.sh b/tools/bench_gpt_oss.sh new file mode 100644 index 0000000..fa4981c --- /dev/null +++ b/tools/bench_gpt_oss.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Benchmark gpt-oss-20b: xserv (TP=2) vs llama.cpp (BF16 GGUF) +# Runs GSM8K (200 problems by default) on both systems and produces a comparison report. +# +# Usage: ./tools/bench_gpt_oss.sh [N]   (N = number of GSM8K problems; positional, default 200) +set -e + +REMOTE="dash5" +REMOTE_DIR="/opt/wjh/projects/xserv" +MODEL_DIR="/opt/wjh/models/gpt-oss-20b-bf16" +GGUF="/opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf" +LLAMA_BIN="$REMOTE_DIR/third_party/llama.cpp/build/bin/llama-server" +XSERV_BIN="$REMOTE_DIR/target/release/xserv-server" + +QUALITY_LIMIT="${1:-200}" +XSERV_PORT=18080 +LLAMA_PORT=18090 + +echo "=== gpt-oss-20b Benchmark: xserv vs llama.cpp ===" +echo "Quality limit: $QUALITY_LIMIT problems" +echo "" + +# Sync project first +LOCAL_DIR="$(cd "$(dirname "$0")/.." && pwd)" +echo "Syncing project..." 
+rsync -az --exclude target --exclude .git --exclude bench-out \ + "$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/" >/dev/null + +# Build on remote +echo "Building xserv..." +ssh $REMOTE "cd $REMOTE_DIR && source /etc/profile && \ + if [ -d /usr/local/cuda-12.9 ]; then export CUDA_HOME=/usr/local/cuda-12.9; else export CUDA_HOME=/usr/local/cuda; fi && \ + export PATH=\$CUDA_HOME/bin:\$PATH && \ + cargo build --release 2>&1 | tail -3" + +echo "" +echo "=== Starting servers ===" + +# Start llama.cpp server (GPU 2-3, BF16 GGUF needs ~42GB → use 2 GPUs) +echo "Starting llama-server (GPU 2,3)..." +ssh $REMOTE "CUDA_VISIBLE_DEVICES=2,3 nohup $LLAMA_BIN \ + -m $GGUF --port $LLAMA_PORT -ngl 999 -c 4096 --n-gpu-layers 999 \ + > /tmp/llama-gptoss.log 2>&1 &" +sleep 5 + +# Start xserv server (GPU 0,1, TP=2) +echo "Starting xserv-server (GPU 0,1, TP=2)..." +ssh $REMOTE "CUDA_VISIBLE_DEVICES=0,1 nohup $XSERV_BIN $MODEL_DIR \ + --port $XSERV_PORT --tp 2 --max-batch 1 --max-seq-len 4096 \ + > /tmp/xserv-gptoss.log 2>&1 &" +sleep 10 + +# Wait for servers to be ready +echo "Waiting for servers..." +for i in $(seq 1 30); do + XSERV_OK=$(ssh $REMOTE "curl -s http://localhost:$XSERV_PORT/health 2>/dev/null" || echo "") + LLAMA_OK=$(ssh $REMOTE "curl -s http://localhost:$LLAMA_PORT/health 2>/dev/null" || echo "") + if [ -n "$XSERV_OK" ] && [ -n "$LLAMA_OK" ]; then + echo "Both servers ready!" 
+ break + fi + sleep 2 +done + +echo "" +echo "=== Running GSM8K benchmark ($QUALITY_LIMIT problems) ===" + +# Run quality benchmark +ssh $REMOTE "cd $REMOTE_DIR && python3 -m tools.bench.runner \ + --xserv-base-url http://localhost:$XSERV_PORT \ + --xserv-model-id gpt-oss-20b \ + --llama-base-url http://localhost:$LLAMA_PORT \ + --suite quality \ + --quality-limit $QUALITY_LIMIT \ + --max-seq-len 4096" 2>&1 + +echo "" +echo "=== Running speed benchmark ===" + +ssh $REMOTE "cd $REMOTE_DIR && python3 -m tools.bench.runner \ + --xserv-base-url http://localhost:$XSERV_PORT \ + --xserv-model-id gpt-oss-20b \ + --llama-base-url http://localhost:$LLAMA_PORT \ + --suite speed \ + --max-seq-len 4096" 2>&1 + +# Cleanup +echo "" +echo "=== Cleaning up ===" +ssh $REMOTE "pkill -f 'llama-server.*18090' 2>/dev/null; pkill -f 'xserv-server.*18080' 2>/dev/null" || true + +echo "Done! Results in bench-out/" diff --git a/tools/run_gpt_oss_bench.sh b/tools/run_gpt_oss_bench.sh new file mode 100755 index 0000000..a4c56f3 --- /dev/null +++ b/tools/run_gpt_oss_bench.sh @@ -0,0 +1,76 @@ +#!/bin/bash +set -e +cd /opt/wjh/projects/xserv + +# Kill any existing servers +pkill -f 'xserv-server.*18080' 2>/dev/null || true +pkill -f 'llama-server.*18090' 2>/dev/null || true +sleep 2 + +QUALITY_LIMIT="${1:-200}" +echo "=== gpt-oss-20b Benchmark: xserv (TP=2) vs llama.cpp (BF16) ===" +echo "GSM8K limit: $QUALITY_LIMIT problems" + +# Start llama.cpp (GPU 2,3) +echo 'Starting llama-server on GPU 2,3...' +CUDA_VISIBLE_DEVICES=2,3 nohup third_party/llama.cpp/build/bin/llama-server \ + -m /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf \ + --port 18090 -ngl 999 -c 4096 \ + > /tmp/llama-gptoss.log 2>&1 & + +# Start xserv (GPU 0,1, TP=2) +echo 'Starting xserv-server on GPU 0,1 (TP=2)...' 
+CUDA_VISIBLE_DEVICES=0,1 nohup ./target/release/xserv-server \ + /opt/wjh/models/gpt-oss-20b-bf16 \ + --port 18080 --tp 2 --max-batch 1 --max-seq-len 4096 \ + > /tmp/xserv-gptoss.log 2>&1 & + +# Wait for both to be ready +echo 'Waiting for servers to start...' +for i in $(seq 1 60); do + sleep 2 + XOK=$(curl -s http://localhost:18080/health 2>/dev/null || echo '') + LOK=$(curl -s http://localhost:18090/health 2>/dev/null || echo '') + if [ -n "$XOK" ] && [ -n "$LOK" ]; then + echo "Both servers ready! (${i}x2s)" + break + fi + if [ $i -eq 60 ]; then + echo 'ERROR: Timeout waiting for servers' + echo '--- xserv log ---' + tail -10 /tmp/xserv-gptoss.log + echo '--- llama log ---' + tail -10 /tmp/llama-gptoss.log + exit 1 + fi +done + +echo '' +echo '=== Running GSM8K quality benchmark ===' +python3 -m tools.bench.runner \ + --xserv-base-url http://localhost:18080 \ + --xserv-model-id gpt-oss-20b \ + --llama-base-url http://localhost:18090 \ + --suite quality \ + --quality-limit "$QUALITY_LIMIT" \ + --max-seq-len 4096 2>&1 | tee /tmp/bench_gptoss_quality.log + +echo '' +echo '=== Running speed benchmark ===' +python3 -m tools.bench.runner \ + --xserv-base-url http://localhost:18080 \ + --xserv-model-id gpt-oss-20b \ + --llama-base-url http://localhost:18090 \ + --suite speed \ + --max-seq-len 4096 2>&1 | tee /tmp/bench_gptoss_speed.log + +# Cleanup +echo '' +echo '=== Cleaning up ===' +pkill -f 'xserv-server.*18080' 2>/dev/null || true +pkill -f 'llama-server.*18090' 2>/dev/null || true + +echo '' +echo '=== BENCHMARK COMPLETE ===' +echo "Quality results: /tmp/bench_gptoss_quality.log" +echo "Speed results: /tmp/bench_gptoss_speed.log"