server: support GptOss in TP engine + benchmark script
- tp_engine.rs: TpModel enum dispatches between Qwen3 and GptOss based on
  config.is_moe(). Server auto-detects model type on startup.
- tools/run_gpt_oss_bench.sh: one-click benchmark comparing xserv (TP=2) vs
  llama.cpp (BF16 GGUF) on GSM8K quality + speed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,8 +19,8 @@ use std::thread;
|
|||||||
|
|
||||||
use xserv_distributed::{TpContext, UniqueId};
|
use xserv_distributed::{TpContext, UniqueId};
|
||||||
use xserv_model::loader;
|
use xserv_model::loader;
|
||||||
use xserv_model::{sample, ModelConfig, PagedKVCache, Qwen3, BLOCK_SIZE};
|
use xserv_model::{sample, GptOss, ModelConfig, PagedKVCache, Qwen3, BLOCK_SIZE};
|
||||||
use xserv_tensor::{DType, Device};
|
use xserv_tensor::{DType, Device, Tensor};
|
||||||
use xserv_tokenizer::Tokenizer;
|
use xserv_tokenizer::Tokenizer;
|
||||||
|
|
||||||
use crate::engine::{GenerateEvent, GenerateRequest};
|
use crate::engine::{GenerateEvent, GenerateRequest};
|
||||||
@@ -34,8 +34,29 @@ enum TpCommand {
|
|||||||
Shutdown,
|
Shutdown,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum TpModel {
|
||||||
|
Qwen3(Qwen3),
|
||||||
|
GptOss(GptOss),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TpModel {
|
||||||
|
fn forward_prefill_paged(&self, tokens: &[u32], slot: usize, cache: &mut PagedKVCache) -> Tensor {
|
||||||
|
match self {
|
||||||
|
TpModel::Qwen3(m) => m.forward_prefill_paged(tokens, slot, cache),
|
||||||
|
TpModel::GptOss(m) => m.forward_prefill_paged(tokens, slot, cache),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn forward_decode_paged(&self, tokens: &[u32], positions: &[usize], slots: &[usize], cache: &mut PagedKVCache) -> Tensor {
|
||||||
|
match self {
|
||||||
|
TpModel::Qwen3(m) => m.forward_decode_paged(tokens, positions, slots, cache),
|
||||||
|
TpModel::GptOss(m) => m.forward_decode_paged(tokens, positions, slots, cache),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct RankCtx {
|
struct RankCtx {
|
||||||
model: Qwen3,
|
model: TpModel,
|
||||||
cache: PagedKVCache,
|
cache: PagedKVCache,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -49,9 +70,13 @@ fn build_rank(
|
|||||||
tp: Option<Arc<TpContext>>,
|
tp: Option<Arc<TpContext>>,
|
||||||
) -> RankCtx {
|
) -> RankCtx {
|
||||||
let weights = loader::load_model_dir(model_dir, Device::Cpu);
|
let weights = loader::load_model_dir(model_dir, Device::Cpu);
|
||||||
let model = Qwen3::from_weights_tp(config.clone(), weights, rank, world, device, tp);
|
let model = if config.is_moe() {
|
||||||
|
TpModel::GptOss(GptOss::from_weights_tp(config.clone(), weights, rank, world, device, tp))
|
||||||
|
} else {
|
||||||
|
TpModel::Qwen3(Qwen3::from_weights_tp(config.clone(), weights, rank, world, device, tp))
|
||||||
|
};
|
||||||
let local_kv = config.num_kv_heads() / world;
|
let local_kv = config.num_kv_heads() / world;
|
||||||
let max_blocks_per_seq = max_seq_len.div_ceil(BLOCK_SIZE);
|
let max_blocks_per_seq = (max_seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
let total_blocks = max_blocks_per_seq + 8;
|
let total_blocks = max_blocks_per_seq + 8;
|
||||||
let cache = PagedKVCache::new_tp(
|
let cache = PagedKVCache::new_tp(
|
||||||
config, local_kv, total_blocks, 0, 4, max_blocks_per_seq, DType::BF16, device,
|
config, local_kv, total_blocks, 0, 4, max_blocks_per_seq, DType::BF16, device,
|
||||||
|
|||||||
92
tools/bench_gpt_oss.sh
Normal file
92
tools/bench_gpt_oss.sh
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
#!/bin/bash
# Benchmark gpt-oss-20b: xserv (TP=2) vs llama.cpp (BF16 GGUF)
# Syncs the project to the remote host, builds it there, starts both servers,
# then runs the GSM8K quality suite (200 problems by default) and the speed
# suite against them.
#
# Usage: ./tools/bench_gpt_oss.sh [--quality-limit N]
#        ./tools/bench_gpt_oss.sh [N]        # bare positional form, still accepted
set -e

REMOTE="dash5"
REMOTE_DIR="/opt/wjh/projects/xserv"
MODEL_DIR="/opt/wjh/models/gpt-oss-20b-bf16"
GGUF="/opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf"
LLAMA_BIN="$REMOTE_DIR/third_party/llama.cpp/build/bin/llama-server"
XSERV_BIN="$REMOTE_DIR/target/release/xserv-server"

XSERV_PORT=18080
LLAMA_PORT=18090

# Parse the problem count. The documented flag form and the bare positional
# form are both accepted; an empty positional falls back to the default.
QUALITY_LIMIT=200
while [ $# -gt 0 ]; do
    case "$1" in
        --quality-limit)
            if [ $# -lt 2 ]; then
                echo "ERROR: --quality-limit requires a value" >&2
                exit 2
            fi
            QUALITY_LIMIT="$2"
            shift 2
            ;;
        --quality-limit=*)
            QUALITY_LIMIT="${1#*=}"
            shift
            ;;
        *)
            QUALITY_LIMIT="${1:-200}"
            shift
            ;;
    esac
done

# The value is interpolated into a remote shell command below, so only plain
# non-negative integers are allowed through.
case "$QUALITY_LIMIT" in
    ''|*[!0-9]*)
        echo "ERROR: quality limit must be a non-negative integer, got '$QUALITY_LIMIT'" >&2
        exit 2
        ;;
esac

# Stop both remote servers. The bracketed first letter keeps each pattern from
# matching the remote shell's own command line, which contains the pattern text.
cleanup() {
    ssh "$REMOTE" "pkill -f '[l]lama-server.*$LLAMA_PORT' 2>/dev/null; pkill -f '[x]serv-server.*$XSERV_PORT' 2>/dev/null" || true
}

echo "=== gpt-oss-20b Benchmark: xserv vs llama.cpp ==="
echo "Quality limit: $QUALITY_LIMIT problems"
echo ""

# Sync project first
LOCAL_DIR="$(cd "$(dirname "$0")/.." && pwd)"
echo "Syncing project..."
rsync -az --exclude target --exclude .git --exclude bench-out \
    "$LOCAL_DIR/" "$REMOTE:$REMOTE_DIR/" >/dev/null

# Build on remote
echo "Building xserv..."
ssh "$REMOTE" "cd $REMOTE_DIR && source /etc/profile && \
    if [ -d /usr/local/cuda-12.9 ]; then export CUDA_HOME=/usr/local/cuda-12.9; else export CUDA_HOME=/usr/local/cuda; fi && \
    export PATH=\$CUDA_HOME/bin:\$PATH && \
    cargo build --release 2>&1 | tail -3"

echo ""
echo "=== Starting servers ==="

# From here on, make sure the servers are stopped however the script exits
# (including a `set -e` abort or the readiness timeout below).
trap cleanup EXIT

# Start llama.cpp server (GPU 2-3, BF16 GGUF needs ~42GB → use 2 GPUs)
echo "Starting llama-server (GPU 2,3)..."
ssh "$REMOTE" "CUDA_VISIBLE_DEVICES=2,3 nohup $LLAMA_BIN \
    -m $GGUF --port $LLAMA_PORT -ngl 999 -c 4096 \
    > /tmp/llama-gptoss.log 2>&1 &"
sleep 5

# Start xserv server (GPU 0,1, TP=2)
echo "Starting xserv-server (GPU 0,1, TP=2)..."
ssh "$REMOTE" "CUDA_VISIBLE_DEVICES=0,1 nohup $XSERV_BIN $MODEL_DIR \
    --port $XSERV_PORT --tp 2 --max-batch 1 --max-seq-len 4096 \
    > /tmp/xserv-gptoss.log 2>&1 &"
sleep 10

# Wait for servers to be ready
echo "Waiting for servers..."
READY=0
for i in $(seq 1 30); do
    # NOTE(review): xserv's /health status codes are not visible from this
    # script, so any non-empty response body is still treated as "up".
    XSERV_OK=$(ssh "$REMOTE" "curl -s http://localhost:$XSERV_PORT/health 2>/dev/null" || echo "")
    # -f: an HTTP error status (llama-server answers 503 while the model is
    # still loading) produces no output, so it is not mistaken for "ready".
    LLAMA_OK=$(ssh "$REMOTE" "curl -sf http://localhost:$LLAMA_PORT/health 2>/dev/null" || echo "")
    if [ -n "$XSERV_OK" ] && [ -n "$LLAMA_OK" ]; then
        echo "Both servers ready!"
        READY=1
        break
    fi
    sleep 2
done

# Do not benchmark servers that never came up; show the remote logs instead.
if [ "$READY" -ne 1 ]; then
    echo "ERROR: Timeout waiting for servers" >&2
    echo "--- xserv log ---"
    ssh "$REMOTE" "tail -10 /tmp/xserv-gptoss.log" || true
    echo "--- llama log ---"
    ssh "$REMOTE" "tail -10 /tmp/llama-gptoss.log" || true
    exit 1
fi

echo ""
echo "=== Running GSM8K benchmark ($QUALITY_LIMIT problems) ==="

# Run quality benchmark
ssh "$REMOTE" "cd $REMOTE_DIR && python3 -m tools.bench.runner \
    --xserv-base-url http://localhost:$XSERV_PORT \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url http://localhost:$LLAMA_PORT \
    --suite quality \
    --quality-limit $QUALITY_LIMIT \
    --max-seq-len 4096" 2>&1

echo ""
echo "=== Running speed benchmark ==="

ssh "$REMOTE" "cd $REMOTE_DIR && python3 -m tools.bench.runner \
    --xserv-base-url http://localhost:$XSERV_PORT \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url http://localhost:$LLAMA_PORT \
    --suite speed \
    --max-seq-len 4096" 2>&1

# Cleanup
echo ""
echo "=== Cleaning up ==="
cleanup
# Already cleaned up explicitly; no need to run it again on exit.
trap - EXIT

echo "Done! Results in bench-out/"
|
||||||
76
tools/run_gpt_oss_bench.sh
Executable file
76
tools/run_gpt_oss_bench.sh
Executable file
@@ -0,0 +1,76 @@
|
|||||||
|
#!/bin/bash
# One-click gpt-oss-20b benchmark: xserv (TP=2) vs llama.cpp (BF16 GGUF).
# Starts both servers on this machine, waits for them to answer /health, then
# runs the GSM8K quality suite and the speed suite and tees each to /tmp.
#
# Usage: ./tools/run_gpt_oss_bench.sh [N]   # N = GSM8K problem limit, default 200
set -e
# Without pipefail, `python3 ... | tee` reports tee's status and a failed
# benchmark run would go unnoticed by `set -e`.
set -o pipefail
cd /opt/wjh/projects/xserv

# Stop both benchmark servers; harmless when none are running.
stop_servers() {
    pkill -f 'xserv-server.*18080' 2>/dev/null || true
    pkill -f 'llama-server.*18090' 2>/dev/null || true
}

# Kill any existing servers
stop_servers
sleep 2

# Whatever happens from here on (timeout, failed run, Ctrl-C), do not leave
# the servers holding the GPUs.
trap stop_servers EXIT

QUALITY_LIMIT="${1:-200}"
echo "=== gpt-oss-20b Benchmark: xserv (TP=2) vs llama.cpp (BF16) ==="
echo "GSM8K limit: $QUALITY_LIMIT problems"

# Start llama.cpp (GPU 2,3)
echo 'Starting llama-server on GPU 2,3...'
CUDA_VISIBLE_DEVICES=2,3 nohup third_party/llama.cpp/build/bin/llama-server \
    -m /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-bf16.gguf \
    --port 18090 -ngl 999 -c 4096 \
    > /tmp/llama-gptoss.log 2>&1 &

# Start xserv (GPU 0,1, TP=2)
echo 'Starting xserv-server on GPU 0,1 (TP=2)...'
CUDA_VISIBLE_DEVICES=0,1 nohup ./target/release/xserv-server \
    /opt/wjh/models/gpt-oss-20b-bf16 \
    --port 18080 --tp 2 --max-batch 1 --max-seq-len 4096 \
    > /tmp/xserv-gptoss.log 2>&1 &

# Wait for both to be ready
echo 'Waiting for servers to start...'
SERVERS_UP=0
for attempt in $(seq 1 60); do
    sleep 2
    # NOTE(review): xserv's /health status codes are not visible from this
    # script, so any non-empty response body is still treated as "up".
    XOK=$(curl -s http://localhost:18080/health 2>/dev/null || echo '')
    # -f: an HTTP error status (llama-server answers 503 while the model is
    # still loading) produces no output, so it is not mistaken for "ready".
    LOK=$(curl -sf http://localhost:18090/health 2>/dev/null || echo '')
    if [ -n "$XOK" ] && [ -n "$LOK" ]; then
        echo "Both servers ready! (${attempt}x2s)"
        SERVERS_UP=1
        break
    fi
done

if [ "$SERVERS_UP" -ne 1 ]; then
    echo 'ERROR: Timeout waiting for servers'
    echo '--- xserv log ---'
    tail -10 /tmp/xserv-gptoss.log
    echo '--- llama log ---'
    tail -10 /tmp/llama-gptoss.log
    exit 1
fi

echo ''
echo '=== Running GSM8K quality benchmark ==='
python3 -m tools.bench.runner \
    --xserv-base-url http://localhost:18080 \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url http://localhost:18090 \
    --suite quality \
    --quality-limit "$QUALITY_LIMIT" \
    --max-seq-len 4096 2>&1 | tee /tmp/bench_gptoss_quality.log

echo ''
echo '=== Running speed benchmark ==='
python3 -m tools.bench.runner \
    --xserv-base-url http://localhost:18080 \
    --xserv-model-id gpt-oss-20b \
    --llama-base-url http://localhost:18090 \
    --suite speed \
    --max-seq-len 4096 2>&1 | tee /tmp/bench_gptoss_speed.log

# Cleanup
echo ''
echo '=== Cleaning up ==='
stop_servers
# Already cleaned up explicitly; no need to run it again on exit.
trap - EXIT

echo ''
echo '=== BENCHMARK COMPLETE ==='
echo "Quality results: /tmp/bench_gptoss_quality.log"
echo "Speed results: /tmp/bench_gptoss_speed.log"
|
||||||
Reference in New Issue
Block a user