runner/servers gain --tp (xserv --tp N; llama.cpp --split-mode row) and --llama-devices so llama can run on a disjoint GPU group. run_tp_parallel.sh runs xserv (GPU 0..N-1) and llama.cpp (GPU 4..4+N-1) concurrently per TP, matching the box's 0-3 / 4-7 PHB groups. summarize_tp.py tabulates the sweep. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
39 lines
1.6 KiB
Bash
39 lines
1.6 KiB
Bash
#!/usr/bin/env bash
# Run the TP=1/2/4 quality sweep with xserv and llama.cpp CONCURRENTLY on
# disjoint GPU groups: xserv on GPUs 0..N-1, llama.cpp on GPUs 4..4+N-1.
# The 8x5090 box is grouped 0-3 / 4-7 (PHB intra-group), so each engine's TP
# comm stays intra-group and the two engines never contend for a GPU.
#
# Run from the repo root on the GPU host. Produces bench-out/tp{1,2,4}-{xserv,llama}.
#
# Environment overrides (defaults are set below):
#   MODEL   model directory passed to the runner as --xserv-model
#   GGUF    GGUF file passed to the runner as --llama-gguf
#   LIMIT   value for --quality-limit
#   MAXSEQ  value for --max-seq-len
#   TPS     whitespace-separated list of TP degrees to sweep (each 1..4)
#
# Each runner's stdout+stderr is captured in /tmp/tp<TP>-{xserv,llama}.log.

set -u

MODEL="${MODEL:-/opt/wjh/models/qwen3-8b}"
GGUF="${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}"
LIMIT="${LIMIT:-30}"
MAXSEQ="${MAXSEQ:-2048}"
TPS="${TPS:-1 2 4}"

# Reject bad TP values before launching anything. With TP > 4 the xserv range
# 0..TP-1 would overlap the llama range 4..3+TP (and run past GPU 7), which
# breaks the disjoint-group premise above; a non-numeric value would otherwise
# abort the sweep half-way inside the arithmetic expansion.
# $TPS is deliberately unquoted: it is a whitespace-separated list.
for TP in $TPS; do
  if ! [[ "$TP" =~ ^[1-4]$ ]]; then
    printf 'error: every TPS entry must be an integer in 1..4 (got %q)\n' "$TP" >&2
    exit 2
  fi
done

# Flags shared by the xserv and llama.cpp runner invocations.
COMMON_ARGS=(
  --suite quality --quality-tasks aime2025,gsm8k --quality-limit "$LIMIT"
  --max-batch 1 --max-seq-len "$MAXSEQ"
)

for TP in $TPS; do
  LD=$(seq -s, 4 "$((3 + TP))")   # llama GPUs: 4 / 4,5 / 4,5,6,7
  echo "##### TP=$TP (xserv GPU 0..$((TP-1)) || llama GPU $LD) #####"

  # Start each TP from a clean output directory so stale results are not mixed in.
  rm -rf "bench-out/tp$TP-xserv" "bench-out/tp$TP-llama"

  python3 -u -m tools.bench.runner --systems xserv --tp "$TP" \
    --xserv-bin ./target/release/xserv-server --xserv-model "$MODEL" \
    "${COMMON_ARGS[@]}" \
    --out-dir "bench-out/tp$TP-xserv" > "/tmp/tp$TP-xserv.log" 2>&1 &
  XP=$!

  python3 -u -m tools.bench.runner --systems llama.cpp --tp "$TP" --llama-devices "$LD" \
    --llama-bin third_party/llama.cpp/build/bin/llama-server --llama-gguf "$GGUF" \
    "${COMMON_ARGS[@]}" \
    --out-dir "bench-out/tp$TP-llama" > "/tmp/tp$TP-llama.log" 2>&1 &
  LP=$!

  # `wait PID1 PID2` only returns the status of the LAST pid, so reap the two
  # jobs separately to report each engine's real exit status. Both jobs are
  # already running, so waiting sequentially does not serialize them.
  XRC=0
  wait "$XP" || XRC=$?
  LRC=0
  wait "$LP" || LRC=$?
  echo "TP=$TP done (xserv exit=$XRC, llama exit=$LRC)"
done

echo ALL_DONE