From 15c51f143e0baa6669a6ab66f55e93c821e97c57 Mon Sep 17 00:00:00 2001
From: Gahow Wang <yuanqu.wjh@alibaba-inc.com>
Date: Sat, 30 May 2026 15:39:44 +0800
Subject: [PATCH] server: support GptOss in TP engine + benchmark script

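- tp_engine.rs: TpModel enum dispatches between Qwen3 and GptOss; build_rank
  picks the variant from config.is_moe(), so the server auto-detects the
  model type on startup.
- tools/run_gpt_oss_bench.sh: one-click benchmark, run on the GPU host,
  comparing xserv (TP=2) vs llama.cpp (BF16 GGUF) on GSM8K quality + speed.
- tools/bench_gpt_oss.sh: driver that rsyncs the tree to the remote host,
  builds there, and runs the same comparison over ssh.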

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/xserv-server/src/tp_engine.rs | 35 +++++++++--
 tools/bench_gpt_oss.sh               | 92 ++++++++++++++++++++++++++++
 tools/run_gpt_oss_bench.sh           | 76 +++++++++++++++++++++++
 3 files changed, 198 insertions(+), 5 deletions(-)
 create mode 100644 tools/bench_gpt_oss.sh
 create mode 100755 tools/run_gpt_oss_bench.sh
diff --git a/crates/xserv-server/src/tp_engine.rs b/crates/xserv-server/src/tp_engine.rs
index 3e4a203..975a138 100644
--- a/crates/xserv-server/src/tp_engine.rs
+++ b/crates/xserv-server/src/tp_engine.rs
@@ -19,8 +19,8 @@ use std::thread;
 
 use xserv_distributed::{TpContext, UniqueId};
 use xserv_model::loader;
-use xserv_model::{sample, ModelConfig, PagedKVCache, Qwen3, BLOCK_SIZE};
-use xserv_tensor::{DType, Device};
+use xserv_model::{sample, GptOss, ModelConfig, PagedKVCache, Qwen3, BLOCK_SIZE};
+use xserv_tensor::{DType, Device, Tensor};
 use xserv_tokenizer::Tokenizer;
 
 use crate::engine::{GenerateEvent, GenerateRequest};
@@ -34,8 +34,35 @@ enum TpCommand {
     Shutdown,
 }
 
+/// Model implementations a TP rank can hold. `build_rank` selects the variant
+/// from `config.is_moe()`.
+enum TpModel {
+    Qwen3(Qwen3),
+    GptOss(GptOss),
+}
+
+impl TpModel {
+    /// Calls `forward_prefill_paged` on the wrapped model, passing every
+    /// argument through unchanged.
+    fn forward_prefill_paged(&self, tokens: &[u32], slot: usize, cache: &mut PagedKVCache) -> Tensor {
+        match self {
+            TpModel::Qwen3(m) => m.forward_prefill_paged(tokens, slot, cache),
+            TpModel::GptOss(m) => m.forward_prefill_paged(tokens, slot, cache),
+        }
+    }
+
+    /// Calls `forward_decode_paged` on the wrapped model, passing every
+    /// argument through unchanged.
+    fn forward_decode_paged(&self, tokens: &[u32], positions: &[usize], slots: &[usize], cache: &mut PagedKVCache) -> Tensor {
+        match self {
+            TpModel::Qwen3(m) => m.forward_decode_paged(tokens, positions, slots, cache),
+            TpModel::GptOss(m) => m.forward_decode_paged(tokens, positions, slots, cache),
+        }
+    }
+}
+
 struct RankCtx {
-    model: Qwen3,
+    model: TpModel,
     cache: PagedKVCache,
 }
 
@@ -49,9 +70,13 @@ fn build_rank(
     tp: Option<Arc<TpContext>>,
 ) -> RankCtx {
     let weights = loader::load_model_dir(model_dir, Device::Cpu);
-    let model = Qwen3::from_weights_tp(config.clone(), weights, rank, world, device, tp);
+    let model = if config.is_moe() {
+        TpModel::GptOss(GptOss::from_weights_tp(config.clone(), weights, rank, world, device, tp))
+    } else {
+        TpModel::Qwen3(Qwen3::from_weights_tp(config.clone(), weights, rank, world, device, tp))
+    };
     let local_kv = config.num_kv_heads() / world;
-    let max_blocks_per_seq = max_seq_len.div_ceil(BLOCK_SIZE);
+    let max_blocks_per_seq = (max_seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
     let total_blocks = max_blocks_per_seq + 8;
     let cache = PagedKVCache::new_tp(
         config, local_kv, total_blocks, 0, 4, max_blocks_per_seq, DType::BF16, device,
diff --git a/tools/bench_gpt_oss.sh b/tools/bench_gpt_oss.sh
new file mode 100644
index 0000000..fa4981c
--- /dev/null
+++ b/tools/bench_gpt_oss.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# Benchmark gpt-oss-20b: xserv (TP=2) vs llama.cpp (BF16 GGUF)
+# Runs GSM8K (200 problems by default) on both systems and produces a
+# comparison report. The servers and the benchmark runner execute on $REMOTE
+# over ssh; this script only syncs the tree and drives the remote commands.
+#
+# Usage: ./tools/bench_gpt_oss.sh [N | --quality-limit N]
+#   N              GSM8K problem count (default: 200)
+#   WAIT_ATTEMPTS  env var: readiness polls before giving up (default: 60)
+set -e
+
+REMOTE="dash5"
+REMOTE_DIR="/opt/wjh/projects/xserv"
+MODEL_DIR="/opt/wjh/models/gpt-oss-20b-bf16"
+GGUF="/opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf"
+LLAMA_BIN="$REMOTE_DIR/third_party/llama.cpp/build/bin/llama-server"
+XSERV_BIN="$REMOTE_DIR/target/release/xserv-server"
+
+# The usage text names a flag but the limit used to be read as a bare
+# positional count; accept both forms.
+QUALITY_LIMIT=200
+case "${1:-}" in
+    --quality-limit) QUALITY_LIMIT="${2:?--quality-limit requires a value}" ;;
+    "") ;;
+    *) QUALITY_LIMIT="$1" ;;
+esac
+XSERV_PORT=18080
+LLAMA_PORT=18090
+WAIT_ATTEMPTS="${WAIT_ATTEMPTS:-60}"
+
+# Stop both remote servers. The [l]/[x] brackets keep each pattern from
+# matching the remote shell's own command line (which contains the pattern
+# text), so the first pkill cannot take that shell down before the second runs.
+cleanup() {
+    ssh $REMOTE "pkill -f '[l]lama-server.*$LLAMA_PORT' 2>/dev/null; \
+        pkill -f '[x]serv-server.*$XSERV_PORT' 2>/dev/null" || true
+}
+
+echo "=== gpt-oss-20b Benchmark: xserv vs llama.cpp ==="
+echo "Quality limit: $QUALITY_LIMIT problems"
+echo ""
+
+# Sync project first
+LOCAL_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+echo "Syncing project..."
+rsync -az --exclude target --exclude .git --exclude bench-out \
+    "$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/" >/dev/null
+
+# Build on remote. pipefail keeps `tail` from hiding a failed cargo build.
+echo "Building xserv..."
+ssh $REMOTE "set -o pipefail; cd $REMOTE_DIR && source /etc/profile && \
+    if [ -d /usr/local/cuda-12.9 ]; then export CUDA_HOME=/usr/local/cuda-12.9; else export CUDA_HOME=/usr/local/cuda; fi && \
+    export PATH=\$CUDA_HOME/bin:\$PATH && \
+    cargo build --release 2>&1 | tail -3"
+
+echo ""
+echo "=== Starting servers ==="
+
+# From here on, stop the servers on every exit path so a failed run does not
+# leave them holding the GPUs.
+trap cleanup EXIT
+
+# Start llama.cpp server (GPU 2-3, BF16 GGUF needs ~42GB → use 2 GPUs)
+echo "Starting llama-server (GPU 2,3)..."
+ssh $REMOTE "CUDA_VISIBLE_DEVICES=2,3 nohup $LLAMA_BIN \
+    -m $GGUF --port $LLAMA_PORT -ngl 999 -c 4096 \
+    > /tmp/llama-gptoss.log 2>&1 &"
+sleep 5
+
+# Start xserv server (GPU 0,1, TP=2)
+echo "Starting xserv-server (GPU 0,1, TP=2)..."
+ssh $REMOTE "CUDA_VISIBLE_DEVICES=0,1 nohup $XSERV_BIN $MODEL_DIR \
+    --port $XSERV_PORT --tp 2 --max-batch 1 --max-seq-len 4096 \
+    > /tmp/xserv-gptoss.log 2>&1 &"
+sleep 10
+
+# Wait for servers to be ready. llama-server answers /health with an HTTP
+# error status plus a JSON body while the model is still loading, so -f is
+# needed there to keep that body from being mistaken for readiness.
+echo "Waiting for servers..."
+READY=0
+for i in $(seq 1 "$WAIT_ATTEMPTS"); do
+    XSERV_OK=$(ssh $REMOTE "curl -s http://localhost:$XSERV_PORT/health 2>/dev/null" || echo "")
+    LLAMA_OK=$(ssh $REMOTE "curl -sf http://localhost:$LLAMA_PORT/health 2>/dev/null" || echo "")
+    if [ -n "$XSERV_OK" ] && [ -n "$LLAMA_OK" ]; then
+        echo "Both servers ready!"
+        READY=1
+        break
+    fi
+    sleep 2
+done
+if [ "$READY" -ne 1 ]; then
+    echo "ERROR: Timeout waiting for servers" >&2
+    ssh $REMOTE "tail -n 10 /tmp/xserv-gptoss.log /tmp/llama-gptoss.log" || true
+    exit 1
+fi
+
+echo ""
+echo "=== Running GSM8K benchmark ($QUALITY_LIMIT problems) ==="
+
+# Run quality benchmark
+ssh $REMOTE "cd $REMOTE_DIR && python3 -m tools.bench.runner \
+    --xserv-base-url http://localhost:$XSERV_PORT \
+    --xserv-model-id gpt-oss-20b \
+    --llama-base-url http://localhost:$LLAMA_PORT \
+    --suite quality \
+    --quality-limit $QUALITY_LIMIT \
+    --max-seq-len 4096" 2>&1
+
+echo ""
+echo "=== Running speed benchmark ==="
+
+ssh $REMOTE "cd $REMOTE_DIR && python3 -m tools.bench.runner \
+    --xserv-base-url http://localhost:$XSERV_PORT \
+    --xserv-model-id gpt-oss-20b \
+    --llama-base-url http://localhost:$LLAMA_PORT \
+    --suite speed \
+    --max-seq-len 4096" 2>&1
+
+# Cleanup
+echo ""
+echo "=== Cleaning up ==="
+cleanup
+trap - EXIT
+
+echo "Done! Results in bench-out/"
diff --git a/tools/run_gpt_oss_bench.sh b/tools/run_gpt_oss_bench.sh
new file mode 100755
index 0000000..a4c56f3
--- /dev/null
+++ b/tools/run_gpt_oss_bench.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# Benchmark gpt-oss-20b on this host: xserv (TP=2, GPU 0,1) vs llama.cpp
+# (BF16 GGUF, GPU 2,3). Runs the GSM8K quality suite, then the speed suite.
+#
+# Usage: ./tools/run_gpt_oss_bench.sh [N]
+#   N              GSM8K problem count (default: 200)
+#   WAIT_ATTEMPTS  env var: 2-second readiness polls before giving up (default: 60)
+#
+# pipefail: without it a failing runner is hidden behind tee's exit status.
+set -eo pipefail
+cd /opt/wjh/projects/xserv
+
+# Stop the benchmark servers, matched by binary name and port.
+stop_servers() {
+    pkill -f 'xserv-server.*18080' 2>/dev/null || true
+    pkill -f 'llama-server.*18090' 2>/dev/null || true
+}
+
+# Kill any existing servers
+stop_servers
+sleep 2
+
+# Stop the servers on every exit path (timeout, runner failure) so a failed
+# run does not leave them holding the GPUs.
+trap stop_servers EXIT
+
+QUALITY_LIMIT="${1:-200}"
+WAIT_ATTEMPTS="${WAIT_ATTEMPTS:-60}"
+echo "=== gpt-oss-20b Benchmark: xserv (TP=2) vs llama.cpp (BF16) ==="
+echo "GSM8K limit: $QUALITY_LIMIT problems"
+
+# Start llama.cpp (GPU 2,3)
+echo 'Starting llama-server on GPU 2,3...'
+CUDA_VISIBLE_DEVICES=2,3 nohup third_party/llama.cpp/build/bin/llama-server \
+    -m /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf \
+    --port 18090 -ngl 999 -c 4096 \
+    > /tmp/llama-gptoss.log 2>&1 &
+
+# Start xserv (GPU 0,1, TP=2)
+echo 'Starting xserv-server on GPU 0,1 (TP=2)...'
+CUDA_VISIBLE_DEVICES=0,1 nohup ./target/release/xserv-server \
+    /opt/wjh/models/gpt-oss-20b-bf16 \
+    --port 18080 --tp 2 --max-batch 1 --max-seq-len 4096 \
+    > /tmp/xserv-gptoss.log 2>&1 &
+
+# Wait for both to be ready. llama-server answers /health with an HTTP error
+# status plus a JSON body while the model is still loading, so -f is needed
+# there to keep that body from being mistaken for readiness.
+echo 'Waiting for servers to start...'
+for i in $(seq 1 "$WAIT_ATTEMPTS"); do
+    sleep 2
+    XOK=$(curl -s http://localhost:18080/health 2>/dev/null || echo '')
+    LOK=$(curl -sf http://localhost:18090/health 2>/dev/null || echo '')
+    if [ -n "$XOK" ] && [ -n "$LOK" ]; then
+        echo "Both servers ready! (${i}x2s)"
+        break
+    fi
+    if [ "$i" -eq "$WAIT_ATTEMPTS" ]; then
+        echo 'ERROR: Timeout waiting for servers'
+        echo '--- xserv log ---'
+        tail -10 /tmp/xserv-gptoss.log
+        echo '--- llama log ---'
+        tail -10 /tmp/llama-gptoss.log
+        exit 1
+    fi
+done
+
+echo ''
+echo '=== Running GSM8K quality benchmark ==='
+python3 -m tools.bench.runner \
+    --xserv-base-url http://localhost:18080 \
+    --xserv-model-id gpt-oss-20b \
+    --llama-base-url http://localhost:18090 \
+    --suite quality \
+    --quality-limit "$QUALITY_LIMIT" \
+    --max-seq-len 4096 2>&1 | tee /tmp/bench_gptoss_quality.log
+
+echo ''
+echo '=== Running speed benchmark ==='
+python3 -m tools.bench.runner \
+    --xserv-base-url http://localhost:18080 \
+    --xserv-model-id gpt-oss-20b \
+    --llama-base-url http://localhost:18090 \
+    --suite speed \
+    --max-seq-len 4096 2>&1 | tee /tmp/bench_gptoss_speed.log
+
+# Cleanup
+echo ''
+echo '=== Cleaning up ==='
+stop_servers
+trap - EXIT
+
+echo ''
+echo '=== BENCHMARK COMPLETE ==='
+echo "Quality results: /tmp/bench_gptoss_quality.log"
+echo "Speed results: /tmp/bench_gptoss_speed.log"