#!/usr/bin/env bash
# FULL quality matrix, strictly sequential (one server at a time, same GPU group
# 0..N-1, no concurrency). Both engines x PP=1/2/4 x {aime2025, gsm8k}.
# Each (engine, pp) combination invokes tools.bench.runner exactly once; per the
# original author's note the runner itself does start -> both tasks -> stop.
# Results land in bench-out/fullq-<engine>-pp<N>/comparison-*.json and progress
# markers are appended to bench-out/FULLQ_PROGRESS.md.
#
# Environment overrides:
#   AIME_LIMIT  per-task sample cap wanted for aime2025 (default 30)
#   GSM_LIMIT   per-task sample cap wanted for gsm8k    (default 20)
#   MAXSEQ      value passed as --max-seq-len           (default 4096)
set -u

# Everything below uses paths relative to the project root (bench-out/, ./target,
# third_party/), including an `rm -rf`, so refuse to continue from the wrong dir.
cd /opt/wjh/projects/xserv || {
  echo "cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}

export PATH="$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH"
export CUDA_HOME=/usr/local/cuda-12.9

MODEL=/opt/wjh/models/qwen3-8b
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
XBIN=./target/release/xserv-server
LBIN=third_party/llama.cpp/build/bin/llama-server
AIME_LIMIT=${AIME_LIMIT:-30}
GSM_LIMIT=${GSM_LIMIT:-20}
MAXSEQ=${MAXSEQ:-4096}

# Per the original author's note, the runner takes a single --quality-limit that
# applies to ALL tasks, so the larger of the two limits is passed. With the
# defaults that is 30: aime2025 runs in full and gsm8k runs its first 30 items
# (not GSM_LIMIT); the report's n_total shows what was actually evaluated.
QUALITY_LIMIT=$(( AIME_LIMIT > GSM_LIMIT ? AIME_LIMIT : GSM_LIMIT ))

PROG=bench-out/FULLQ_PROGRESS.md
mkdir -p -- "${PROG%/*}"
: > "$PROG"
echo "# full quality matrix — $(date)" >> "$PROG"

#######################################
# Kill every leftover server / runner process, then give the system a moment.
# SIGKILL is sent directly; no graceful shutdown is attempted.
# Arguments: none
# Returns:   status of the final sleep
#######################################
kall() {
  local pattern
  # `runner.py` is kept from the original; `tools\.bench\.runner` additionally
  # matches the runner the way this script launches it (python3 -m ...).
  for pattern in xserv-server llama-server runner.py 'tools\.bench\.runner'; do
    pkill -9 -f "$pattern" 2>/dev/null
  done
  sleep 4
}

#######################################
# Wait until GPUs 0..$1 each report at most 1500 (nvidia-smi memory.used,
# nounits) so the next server starts on an empty device.
# Polls up to 90 times, 2 seconds apart.
# Arguments: $1 - index of the last GPU to check (inclusive)
# Returns:   0 once all checked GPUs are below the threshold, 1 on timeout
#######################################
drain() {
  local -r last_gpu=$1
  local attempt gpu used busy
  for (( attempt = 0; attempt < 90; attempt++ )); do
    busy=0
    for (( gpu = 0; gpu <= last_gpu; gpu++ )); do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used \
        --format=csv,noheader,nounits)
      used=${used//[[:space:]]/}
      # An empty or non-numeric reading (query failed, "[N/A]", ...) counts as
      # "not busy", matching what the original test effectively did.
      [[ "$used" =~ ^[0-9]+$ ]] || used=0
      if (( 10#$used > 1500 )); then
        busy=1
      fi
    done
    if (( busy == 0 )); then
      return 0
    fi
    sleep 2
  done
  return 1
}

#######################################
# Run the quality suite once for one engine at one pipeline-parallel degree.
# Globals:   PROG, XBIN, MODEL, LBIN, GGUF, MAXSEQ, QUALITY_LIMIT (read)
# Arguments: $1 - engine; "xserv" selects xserv, anything else selects llama.cpp
#            $2 - pp degree (GPUs 0..pp-1 are drained beforehand)
# Outputs:   bench-out/fullq-<engine>-pp<pp>/ (recreated from scratch),
#            /tmp/fq-<engine>-<pp>.log (runner stdout+stderr),
#            START / END lines appended to $PROG
#######################################
run_one() {
  local -r eng=$1 pp=$2
  local dev out log rc f n_json=0
  local -a engine_args

  # Only used in the START log line; this script does not pin devices itself.
  dev=$(seq -s, 0 "$((pp - 1))")

  kall
  drain "$((pp - 1))" ||
    echo "warn: GPUs 0..$((pp - 1)) still busy after drain timeout" >&2

  out=bench-out/fullq-$eng-pp$pp
  log=/tmp/fq-$eng-$pp.log
  rm -rf -- "${out:?}"
  echo "=== START $eng pp=$pp on GPU $dev $(date +%H:%M:%S) ===" >> "$PROG"

  if [[ "$eng" == xserv ]]; then
    engine_args=(--systems xserv --pp "$pp"
      --xserv-bin "$XBIN" --xserv-model "$MODEL")
  else
    engine_args=(--systems llama.cpp --pp "$pp"
      --llama-bin "$LBIN" --llama-gguf "$GGUF")
  fi

  python3 -u -m tools.bench.runner "${engine_args[@]}" \
    --suite quality --quality-tasks aime2025,gsm8k \
    --quality-limit "$QUALITY_LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "$out" > "$log" 2>&1
  rc=$?  # capture immediately so nothing below can clobber it

  # Count result files via the glob itself; an unmatched glob stays literal,
  # hence the existence check.
  for f in "$out"/comparison-*.json; do
    if [[ -e "$f" ]]; then
      n_json=$((n_json + 1))
    fi
  done

  echo "=== END $eng pp=$pp rc=$rc $(date +%H:%M:%S) $n_json json ===" >> "$PROG"
}

# Strictly sequential sweep: every engine at every pp degree, one at a time.
for eng in xserv llama; do
  for pp in 1 2 4; do
    run_one "$eng" "$pp"
  done
done

kall
echo "FULLQ_DONE" >> "$PROG"