runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
91 lines
4.5 KiB
Bash
91 lines
4.5 KiB
Bash
#!/usr/bin/env bash
# One-shot pipeline-parallel (PP) verification + benchmark for Qwen3-8B.
# Run on the GPU host from the repo root. Writes bench-out/PP_RESULTS.md.
#
#   1. NCCL P2P send/recv + AllReduce unit tests
#   2. correctness: greedy (temp=0) output single == --pp 2 == --pp 4 (byte compare)
#   3. per-GPU VRAM (health-gated; weights + a minimal KV pool, ~1/P per card)
#   4. quality+latency sweep vs llama.cpp (-sm layer), gsm8k
#
# Env: MODEL, GGUF, LIMIT (problems), PPS (e.g. "1 2 4") may be overridden.
# Treat any use of an unset variable as a fatal error.
set -u

# All relative paths below (target/, bench-out/, tools/) are resolved against
# the parent of this script's directory. Abort if it cannot be entered, so the
# results file is never created or truncated somewhere unexpected.
cd "$(dirname "$0")/.." || exit 1

# Toolchain locations: cargo from the user's home, CUDA 12.9 binaries.
export PATH="$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH"
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.9}"

# Inputs that may be overridden from the environment.
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
# NOTE(review): GGUF is not referenced anywhere else in the visible script and
# is not exported — presumably consumed by the sweep driver; confirm.
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LIMIT=${LIMIT:-20}
PPS=${PPS:-1 2 4}

# Server binary under test and the markdown report this run produces.
BIN=./target/release/xserv-server
R=bench-out/PP_RESULTS.md

# Start every run with an empty report.
mkdir -p bench-out
: > "$R"

# log ARGS... -> writes the arguments, joined by single spaces, as one line to
# stdout and appends the same line to the report file "$R".
# printf is used instead of echo so that a message which looks like an echo
# option (e.g. "-n" or "-e") is reported verbatim instead of being swallowed.
log() {
  local IFS=' '   # "$*" joins on the first IFS character; pin it to a space
  printf '%s\n' "$*" | tee -a "$R"
}

# Kill any server left over from an earlier run so ports and GPUs are free.
# pkill exits non-zero when nothing matches; that is expected, so its stderr
# is silenced and its status ignored. The pause lets the processes go away.
pkill -9 -f xserv-server 2>/dev/null
pkill -9 -f llama-server 2>/dev/null
sleep 3

log "# PP verification — $(date)"

# ---- 1. NCCL P2P + AllReduce unit tests ----
log ""
log "## 1. NCCL P2P + AllReduce test"
# Tests run single-threaded; full output is kept in /tmp/pp_t.log.
cargo test -p xserv-distributed --release -- --test-threads=1 >/tmp/pp_t.log 2>&1
cargo_rc=$?   # capture right away so no later command can clobber $?
log " cargo test exit=$cargo_rc"
# Copy only the summary lines and the two named tests into the report.
grep -hE "test result|pp_send_recv|allreduce_two_gpu" /tmp/pp_t.log | sed 's/^/ /' | tee -a "$R"

# wait_ready PORT PID -> 0 when a real generation succeeds (xserv's /health
# returns 200 before the model is loaded, so gate on a generation, not /health).
#
# Polls up to 400 times, sleeping 3s between polls; each poll is a 1-token
# chat completion capped at 8s by curl. Returns 1 as soon as PID is no longer
# alive, or when all polls are used up.
wait_ready() {
  local port=$1 pid=$2
  local status attempt
  local -r probe='{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}'

  for ((attempt = 0; attempt < 400; attempt++)); do
    # The response body is discarded; only the HTTP status code is captured.
    status=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" \
      -H 'Content-Type: application/json' \
      -d "$probe" 2>/dev/null)
    # Compare the whole status code rather than searching for a substring.
    [[ "$status" == 200 ]] && return 0
    # Stop waiting once the server process has died.
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}

# ---- 2. correctness ----
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'

# gen PORT CVD [SERVER_ARGS...]
#   Starts "$BIN" on "$MODEL" listening on PORT with CUDA_VISIBLE_DEVICES=CVD
#   (extra arguments are passed to the server), waits until it can generate,
#   requests a 64-token completion of $PROMPT at temperature 0, prints the
#   returned message content on stdout, then kills the server.
#   Server output is written to /tmp/pp_s<PORT>.log.
#   If the server never becomes ready, prints "(server PORT failed)" on stdout
#   and returns 1.
gen() {
  local port=$1 cvd=$2
  shift 2

  CUDA_VISIBLE_DEVICES=$cvd nohup "$BIN" "$MODEL" --port "$port" --max-seq-len 2048 "$@" \
    >"/tmp/pp_s$port.log" 2>&1 &
  local pid=$!

  if ! wait_ready "$port" "$pid"; then
    echo "(server $port failed)"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null   # reap the server before the next one is started
    return 1
  fi

  # Build the request with a JSON encoder so a prompt containing quotes or
  # backslashes cannot produce an invalid body.
  local payload
  payload=$(python3 -c 'import json, sys; print(json.dumps({"model": "qwen3-8b", "messages": [{"role": "user", "content": sys.argv[1]}], "max_tokens": 64, "temperature": 0, "stream": False}))' "$PROMPT")

  # Extract only the assistant text; parse errors are silenced, so a failed
  # request yields empty output.
  curl -s --max-time 200 "http://127.0.0.1:$port/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d "$payload" \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' 2>/dev/null

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3   # pause after the kill before the caller starts the next server
}

# Produce the same greedy completion with one GPU, 2-stage PP and 4-stage PP,
# each on its own port, one server at a time.
gen 8091 0 > /tmp/o_single.txt
gen 8092 0,1 --pp 2 > /tmp/o_pp2.txt
gen 8093 0,1,2,3 --pp 4 > /tmp/o_pp4.txt

# pp_same_output FILE_A FILE_B -> prints IDENTICAL when both files are
# non-empty and byte-equal, "DIFFER (empty output)" when either is empty, and
# DIFFER otherwise. Two empty files are byte-equal, so without the emptiness
# check two failed requests would be reported as a passing comparison.
pp_same_output() {
  if [[ ! -s "$1" || ! -s "$2" ]]; then
    echo "DIFFER (empty output)"
  elif cmp -s -- "$1" "$2"; then
    echo IDENTICAL
  else
    echo DIFFER
  fi
}

log ""
log "## 2. Correctness (greedy temp=0, byte compare)"
log " single==pp2: $(pp_same_output /tmp/o_single.txt /tmp/o_pp2.txt)"
log " single==pp4: $(pp_same_output /tmp/o_single.txt /tmp/o_pp4.txt)"
# Include the start of the reference text so the report is self-explanatory.
log " single text: $(head -c 160 /tmp/o_single.txt)"

# ---- 3. per-GPU VRAM (health-gated, KV pool capped so all configs comparable) ----
log ""
log "## 3. Per-GPU VRAM (XSERV_MAX_KV_BLOCKS=160; weights + minimal KV)"

# snap GPUS -> prints nvidia-smi's memory.used for each GPU selected by GPUS
# (handed to "nvidia-smi -i" unchanged), one value per GPU, joined by single
# spaces on one line. The explicit "-" makes paste read stdin on
# implementations that require a file operand.
snap() {
  nvidia-smi -i "$1" --query-gpu=memory.used --format=csv,noheader,nounits \
    | paste -sd' ' -
}

# vram LABEL CVD PORT [SERVER_ARGS...]
#   Starts "$BIN" on "$MODEL" listening on PORT with CUDA_VISIBLE_DEVICES=CVD
#   and XSERV_MAX_KV_BLOCKS=160, waits until it can generate, runs one short
#   8-token request, then polls `snap CVD` every 2s (at most 12 times) until
#   two consecutive readings are equal and logs that reading as
#   " LABEL (CVD): <values> MiB". The server is killed afterwards.
#   Server output is written to /tmp/pp_v<PORT>.log.
#   If the server never becomes ready, logs " LABEL: server failed" and
#   returns 1.
vram() {
  local label=$1 cvd=$2 port=$3
  shift 3

  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "$BIN" "$MODEL" --port "$port" --max-seq-len 2048 "$@" \
    >"/tmp/pp_v$port.log" 2>&1 &
  local pid=$!

  if ! wait_ready "$port" "$pid"; then
    log " $label: server failed"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null   # reap the server before the next one is started
    return 1
  fi

  # One short generation before sampling memory; the response is discarded.
  curl -s --max-time 120 "http://127.0.0.1:$port/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":8,"temperature":0,"stream":false}' >/dev/null

  # Sample until two consecutive readings agree, i.e. usage has settled.
  local reading="" previous="" i
  for ((i = 0; i < 12; i++)); do
    reading=$(snap "$cvd")
    [[ "$reading" == "$previous" ]] && break
    previous=$reading
    sleep 2
  done
  # An empty reading means snap produced nothing; show that explicitly
  # instead of logging a blank value.
  log " $label ($cvd): ${reading:-n/a} MiB"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 5   # pause after the kill before the next configuration starts
}

# Measure the three configurations one after another; each call starts and
# stops its own server on its own port: 1 GPU, 2-stage PP, 4-stage PP.
vram single 0 8094
vram pp2 0,1 8095 --pp 2
vram pp4 0,1,2,3 8096 --pp 4

# ---- 4. sweep vs llama.cpp ----
log ""
log "## 4. Sweep (gsm8k $LIMIT, xserv --pp 0..N-1 vs llama -sm layer 4..)"
# The sweep driver gets its settings through the environment; its full output
# is kept in /tmp/pp_sweep.log.
PPS="$PPS" LIMIT="$LIMIT" TASKS=gsm8k bash tools/bench/run_pp_parallel.sh >/tmp/pp_sweep.log 2>&1
sweep_rc=$?
# Surface a failed sweep in the report instead of silently summarizing
# whatever results happen to be on disk.
if (( sweep_rc != 0 )); then
  log " sweep exit=$sweep_rc (see /tmp/pp_sweep.log)"
fi
# The summary goes into the report only (not stdout), inside a code fence.
log '```'
python3 tools/bench/summarize_pp.py bench-out >> "$R" 2>&1
log '```'
log ""
# Final marker line, written last.
log "PP_VERIFY_DONE"