runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
43 lines
1.7 KiB
Bash
43 lines
1.7 KiB
Bash
#!/usr/bin/env bash
# Run the PP=1/2/4 sweep with xserv and llama.cpp CONCURRENTLY on disjoint GPU
# groups: xserv (--pp) on GPUs 0..N-1, llama.cpp (-sm layer) on GPUs 4..4+N-1.
# The 8x5090 box is grouped 0-3 / 4-7 (PHB intra-group), so each engine's P2P
# stays intra-group and the two engines never contend for a GPU.
#
# xserv splits layers across N GPUs and hands off hidden states via NCCL P2P;
# llama.cpp's default `-sm layer` does the analogous layer-wise split.
#
# Run from the repo root on the GPU host. Produces bench-out/pp{1,2,4}-{xserv,llama}.
#
# Environment overrides (each falls back to the default below when unset or
# empty):
#   MODEL   model path handed to the xserv run (--xserv-model)
#   GGUF    GGUF file handed to the llama.cpp run (--llama-gguf)
#   LIMIT   value for --quality-limit
#   MAXSEQ  value for --max-seq-len
#   PPS     space-separated list of PP degrees to sweep
#   TASKS   value for --quality-tasks

# Treat any reference to an unset variable as an error.
# NOTE(review): errexit is not enabled, so a failing command does not stop the
# sweep by itself -- presumably intentional; confirm before adding `-e`.
set -u

MODEL="${MODEL:-/opt/wjh/models/qwen3-8b}"
GGUF="${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}"
LIMIT="${LIMIT:-20}"
MAXSEQ="${MAXSEQ:-2048}"
PPS="${PPS:-1 2 4}"
TASKS="${TASKS:-gsm8k}"

# Print the comma-separated llama.cpp GPU list for a PP degree: $1 consecutive
# devices starting at GPU 4 (1 -> "4", 2 -> "4,5", 4 -> "4,5,6,7").
llama_gpus() {
  local count=$1 gpu list=4
  for ((gpu = 5; gpu < 4 + count; gpu++)); do
    list+=",${gpu}"
  done
  printf '%s\n' "$list"
}

# Run one PP degree: start the xserv and the llama.cpp benchmark runners in the
# background at the same time, then wait for both of them.
# Globals:   MODEL, GGUF, LIMIT, MAXSEQ, TASKS (read)
#            LOG_DIR (read, optional): directory for the runner logs,
#            defaults to /tmp
# Arguments: $1 - PP degree, an integer from 1 to 4
# Outputs:   banner on stdout, diagnostics on stderr; runner output goes to
#            $LOG_DIR/pp<N>-xserv.log and $LOG_DIR/pp<N>-llama.log
# Returns:   0 if both runners exited 0, 1 if either failed, 2 if $1 is invalid
run_pp() {
  local pp=$1 log_dir=${LOG_DIR:-/tmp}
  local llama_devs xserv_pid llama_pid status=0

  # The llama.cpp GPU list starts at 4 and xserv is documented to use
  # 0..PP-1, so a degree above 4 would make the two engines share GPUs.
  if [[ ! $pp =~ ^[1-4]$ ]]; then
    echo "PP=$pp: invalid degree (expected an integer from 1 to 4)" >&2
    return 2
  fi

  llama_devs=$(llama_gpus "$pp")
  echo "##### PP=$pp (xserv GPU 0..$((pp - 1)) || llama GPU $llama_devs) #####"

  # Start from empty output directories so stale results cannot be mixed in.
  rm -rf "bench-out/pp${pp}-xserv" "bench-out/pp${pp}-llama"

  python3 -u -m tools.bench.runner --systems xserv --pp "$pp" \
    --xserv-bin ./target/release/xserv-server --xserv-model "$MODEL" \
    --suite quality --quality-tasks "$TASKS" --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/pp${pp}-xserv" > "${log_dir}/pp${pp}-xserv.log" 2>&1 &
  xserv_pid=$!

  python3 -u -m tools.bench.runner --systems llama.cpp --pp "$pp" \
    --llama-devices "$llama_devs" \
    --llama-bin third_party/llama.cpp/build/bin/llama-server --llama-gguf "$GGUF" \
    --suite quality --quality-tasks "$TASKS" --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/pp${pp}-llama" > "${log_dir}/pp${pp}-llama.log" 2>&1 &
  llama_pid=$!

  # `wait pid1 pid2` only reports the status of the last PID, so reap each
  # runner on its own to notice a failure of either one.
  if ! wait "$xserv_pid"; then
    echo "PP=$pp: xserv runner failed, see ${log_dir}/pp${pp}-xserv.log" >&2
    status=1
  fi
  if ! wait "$llama_pid"; then
    echo "PP=$pp: llama.cpp runner failed, see ${log_dir}/pp${pp}-llama.log" >&2
    status=1
  fi
  return "$status"
}

# Sweep every requested PP degree. A failing degree does not stop the sweep,
# but it is remembered so the script does not exit 0 afterwards.
sweep_failed=0
# PPS is intentionally unquoted: it is a space-separated list of degrees.
for PP in ${PPS-}; do
  if run_pp "$PP"; then
    echo "PP=$PP done"
  else
    sweep_failed=1
    echo "PP=$PP FAILED" >&2
  fi
done

echo ALL_DONE
if ((sweep_failed)); then
  exit 1
fi