runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
55 lines
2.5 KiB
Bash
55 lines
2.5 KiB
Bash
#!/usr/bin/env bash
# FULL quality matrix, strictly sequential (one server at a time, same GPU group
# 0..N-1, no concurrency). Both engines x PP=1/2/4 x {aime2025, gsm8k}.
# Each (engine,pp) invocation runs runner.py once (it does start->both tasks->stop).
# Writes bench-out/fullq-<engine>-pp<N>/comparison-*.json ; summarized at the end.
set -u

# Every path below is relative to the project root, so refuse to continue from
# anywhere else (later steps remove and create directories under bench-out/).
cd /opt/wjh/projects/xserv || { echo "cannot cd to /opt/wjh/projects/xserv" >&2; exit 1; }

export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=/usr/local/cuda-12.9

# Model files and server binaries for the two engines.
MODEL=/opt/wjh/models/qwen3-8b
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
XBIN=./target/release/xserv-server
LBIN=third_party/llama.cpp/build/bin/llama-server

# Tunables; each can be overridden from the environment.
AIME_LIMIT=${AIME_LIMIT:-30}
GSM_LIMIT=${GSM_LIMIT:-20}
MAXSEQ=${MAXSEQ:-4096}

# Progress log: truncated at start-up, then appended to as the matrix advances.
PROG=bench-out/FULLQ_PROGRESS.md
mkdir -p "${PROG%/*}"
: > "$PROG"
echo "# full quality matrix — $(date)" >> "$PROG"
|
|
|
|
# Force-kill every process this benchmark may have left behind, then pause 4s
# so the kills can take effect before the caller continues.
# pkill -f matches the pattern against the full command line. The runner is
# launched here as `python3 -u -m tools.bench.runner`, whose command line does
# not contain "runner.py", so the module form is matched as well as the script
# form. Failures (e.g. nothing matched) are deliberately ignored: best effort.
kall(){
  pkill -9 -f xserv-server 2>/dev/null
  pkill -9 -f llama-server 2>/dev/null
  pkill -9 -f 'runner.py|tools[.]bench[.]runner' 2>/dev/null
  sleep 4
}
|
|
# Wait until GPUs 0..$1 have released their memory.
#   $1  highest GPU index to check (GPUs 0 through $1 are polled)
# Polls nvidia-smi up to 90 times, 2 seconds apart. A GPU counts as busy while
# its memory.used reading is above 1500 (nvidia-smi reports this field in MiB).
# Returns 0 as soon as one poll finds every GPU at or below the threshold, and
# 1 (with a note on stderr) if they are still busy after the last poll.
drain(){
  local last=$1 gpu used busy
  for _ in $(seq 1 90); do
    busy=0
    for gpu in $(seq 0 "$last"); do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      # Keep digits only so padding or an "[N/A]" reading cannot break the
      # numeric test; an empty reading is treated as 0.
      used=${used//[^0-9]/}
      [ "${used:-0}" -gt 1500 ] && busy=1
    done
    [ "$busy" = 0 ] && return 0
    sleep 2
  done
  echo "drain: GPUs 0..$last still above 1500 used after 90 polls" >&2
  return 1
}
|
|
|
|
# Run the quality suite once for a single (engine, pp) cell of the matrix.
#   $1  engine: "xserv" selects xserv; any other value selects llama.cpp
#   $2  pp: forwarded to the runner as --pp; GPUs 0..pp-1 are drained first
# Globals read: PROG, XBIN, MODEL, LBIN, GGUF, MAXSEQ, AIME_LIMIT, GSM_LIMIT
# Effects: kills leftover servers, removes and repopulates
#   bench-out/fullq-<engine>-pp<N>, sends runner output to
#   /tmp/fq-<engine>-<pp>.log and appends START/END lines to $PROG.
# Returns: the runner's exit status.
run_one(){
  local eng=$1 pp=$2
  local dev
  dev=$(seq -s, 0 $((pp-1)))

  # The runner takes one --quality-limit for all tasks, so pass the larger of
  # the two per-task limits (30 with the default values).
  local aime=${AIME_LIMIT:-30} gsm=${GSM_LIMIT:-20}
  local limit=$(( aime > gsm ? aime : gsm ))

  kall; drain $((pp-1))

  local out=bench-out/fullq-$eng-pp$pp
  rm -rf -- "$out"
  echo "=== START $eng pp=$pp on GPU $dev $(date +%H:%M:%S) ===" >> "$PROG"

  # Only the system name and the binary/model flags differ between engines.
  local -a cmd=(python3 -u -m tools.bench.runner)
  if [ "$eng" = xserv ]; then
    cmd+=(--systems xserv --pp "$pp" --xserv-bin "$XBIN" --xserv-model "$MODEL")
  else
    cmd+=(--systems llama.cpp --pp "$pp" --llama-bin "$LBIN" --llama-gguf "$GGUF")
  fi
  cmd+=(--suite quality --quality-tasks aime2025,gsm8k --quality-limit "$limit"
        --max-batch 1 --max-seq-len "$MAXSEQ"
        --out-dir "$out")

  # Capture the status right away so nothing can run between the runner and
  # the END line that reports it.
  local rc=0
  "${cmd[@]}" >"/tmp/fq-$eng-$pp.log" 2>&1 || rc=$?

  # Count the reports with a glob rather than by parsing ls output.
  local f
  local -a reports=()
  for f in "$out"/comparison-*.json; do
    [ -e "$f" ] && reports+=("$f")
  done

  echo "=== END $eng pp=$pp rc=$rc $(date +%H:%M:%S) ${#reports[@]} json ===" >> "$PROG"
  return "$rc"
}
|
|
|
|
# aime2025 has 30 problems and the runner applies one --quality-limit to ALL
# tasks, so the matrix is run with --quality-limit 30: aime2025 runs in full
# and gsm8k is capped at its first 30 items (the report shows n_total).
#
# Cells are listed explicitly as engine:pp, in execution order: all xserv
# cells first, then all llama cells, each with pp = 1, 2, 4.
for cell in xserv:1 xserv:2 xserv:4 llama:1 llama:2 llama:4; do
  run_one "${cell%%:*}" "${cell##*:}"
done

# Leave no server running, then mark the whole matrix as finished.
kall
echo "FULLQ_DONE" >> "$PROG"
|