xserv/tools/pp_quality_full.sh

#!/usr/bin/env bash
# FULL quality matrix, strictly sequential (one server at a time, same GPU group
# 0..N-1, no concurrency). Both engines x PP=1/2/4 x {aime2025, gsm8k}.
# Each (engine,pp) invocation runs runner.py once (it does start->both tasks->stop).
# Writes bench-out/fullq-<engine>-pp<N>/comparison-*.json ; summarized at the end.
#
# Environment overrides: AIME_LIMIT, GSM_LIMIT, MAXSEQ.
#
# NOTE: only -u is enabled. -e is intentionally not added: steps later in this
# file (e.g. pkill when no process matches) return non-zero in normal operation.
set -u

# Every path below is relative to the project root. Abort if we cannot get
# there, otherwise the progress file and the per-run `rm -rf` of output
# directories would operate on whatever directory we happened to start in.
cd /opt/wjh/projects/xserv || {
  echo "cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}

export PATH="$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH"
export CUDA_HOME=/usr/local/cuda-12.9

# Model artefacts and server binaries for the two engines.
MODEL=/opt/wjh/models/qwen3-8b
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
XBIN=./target/release/xserv-server
LBIN=third_party/llama.cpp/build/bin/llama-server

# Tunables; each keeps its environment value when one is set.
AIME_LIMIT=${AIME_LIMIT:-30}
GSM_LIMIT=${GSM_LIMIT:-20}
MAXSEQ=${MAXSEQ:-4096}

# Progress log: truncated at start-up, then appended to as the matrix advances.
PROG=bench-out/FULLQ_PROGRESS.md
# The output directory may not exist yet on a clean checkout.
mkdir -p -- "${PROG%/*}" || exit 1
: > "$PROG"
echo "# full quality matrix — $(date)" >> "$PROG"

# kall: send SIGKILL to every process whose full command line matches one of
# the server / runner patterns, then pause 4 seconds before returning
# (presumably to let the killed processes disappear — the code only shows the
# fixed sleep). pkill's stderr is discarded; its exit status is ignored.
# Takes no arguments; returns the status of the final sleep.
kall() {
  local pattern
  for pattern in xserv-server llama-server runner.py; do
    pkill -9 -f "$pattern" 2>/dev/null
  done
  sleep 4
}
# drain LAST_GPU
#
# Poll GPUs 0..LAST_GPU until none of them reports more than 1500 of
# nvidia-smi's `memory.used` units (MiB per the nvidia-smi documentation).
# Polls every 2 seconds, at most 90 times.
#
# Arguments: $1 - index of the last GPU to check (inclusive)
# Outputs:   a diagnostic on stderr if the wait times out
# Returns:   0 as soon as every checked GPU is below the threshold,
#            1 if some GPU is still above it after all 90 polls
drain() {
  local last=$1
  local round gpu used busy

  for ((round = 0; round < 90; round++)); do
    busy=0
    for ((gpu = 0; gpu <= last; gpu++)); do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      used=${used//[[:space:]]/}
      # A reading that is empty or not a plain integer (tool missing, "[N/A]",
      # error text) cannot be compared; such a GPU is not counted as busy.
      [[ $used =~ ^[0-9]+$ ]] || continue
      # 10# forces base 10 so a leading zero is never read as octal.
      if ((10#$used > 1500)); then
        busy=1
      fi
    done
    if ((busy == 0)); then
      return 0
    fi
    sleep 2
  done

  echo "drain: GPUs 0..$last still above 1500 memory.used after 90 polls" >&2
  return 1
}

# run_one ENGINE PP
#
# Run the quality suite (aime2025 + gsm8k) once for a single matrix cell.
#
# Arguments: $1 - engine; "xserv" selects xserv, any other value selects
#                 llama.cpp
#            $2 - pipeline-parallel degree passed to the runner as --pp
# Globals:   PROG, XBIN, MODEL, LBIN, GGUF, MAXSEQ (read);
#            AIME_LIMIT, GSM_LIMIT (read, defaulting to 30 / 20 if unset)
# Outputs:   START/END lines appended to $PROG; runner stdout+stderr written
#            to /tmp/fq-<engine>-<pp>.log; runner results under
#            bench-out/fullq-<engine>-pp<pp>
# Returns:   status of the final progress-log write; the runner's own exit
#            code is recorded in the END line, not returned
run_one() {
  local eng=$1 pp=$2

  # GPU list 0..pp-1, used only for the progress log below; this function
  # does not itself pin devices.
  local dev="" g
  for ((g = 0; g < pp; g++)); do
    dev+="${dev:+,}$g"
  done

  # The runner takes one --quality-limit for every task, so pass the larger of
  # the two configured per-task limits.
  local aime=${AIME_LIMIT:-30} gsm=${GSM_LIMIT:-20} limit
  limit=$((aime > gsm ? aime : gsm))

  kall
  drain $((pp - 1)) \
    || echo "WARN  $eng pp=$pp: drain returned non-zero, starting anyway" >> "$PROG"

  local out=bench-out/fullq-$eng-pp$pp
  # :? aborts instead of expanding to nothing should $out ever be empty.
  rm -rf -- "${out:?}"
  echo "=== START $eng pp=$pp on GPU $dev $(date +%H:%M:%S) ===" >> "$PROG"

  # Only the system name and its binary/model flags differ between engines.
  local -a engine_args
  if [[ $eng == xserv ]]; then
    engine_args=(--systems xserv --pp "$pp"
      --xserv-bin "$XBIN" --xserv-model "$MODEL")
  else
    engine_args=(--systems llama.cpp --pp "$pp"
      --llama-bin "$LBIN" --llama-gguf "$GGUF")
  fi

  local rc
  python3 -u -m tools.bench.runner "${engine_args[@]}" \
    --suite quality --quality-tasks aime2025,gsm8k --quality-limit "$limit" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "$out" >"/tmp/fq-$eng-$pp.log" 2>&1
  # Capture immediately, before any other command can overwrite $?.
  rc=$?

  # Count result files with a glob; an unmatched pattern stays literal and
  # fails the -e test, giving 0.
  local f njson=0
  for f in "$out"/comparison-*.json; do
    [[ -e $f ]] && njson=$((njson + 1))
  done

  echo "=== END   $eng pp=$pp rc=$rc $(date +%H:%M:%S) $njson json ===" >> "$PROG"
}

# aime2025 has 30 problems. The runner accepts a single --quality-limit that
# applies to ALL tasks, so separate per-task caps (aime 30, gsm8k 20) cannot be
# expressed. The matrix therefore runs with one shared limit (30 with the default
# settings): aime2025 runs in full and gsm8k uses its first 30 items rather than
# 20 (gsm8k.json may be larger; the report shows n_total).
# Walk the matrix engine-major: every PP degree for xserv first, then for
# llama. Afterwards kill any leftover processes and append the completion
# marker to the progress log.
engines=(xserv llama)
pp_degrees=(1 2 4)
for engine in "${engines[@]}"; do
  for degree in "${pp_degrees[@]}"; do
    run_one "$engine" "$degree"
  done
done
kall
echo "FULLQ_DONE" >> "$PROG"