runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
90 lines
4.3 KiB
Bash
90 lines
4.3 KiB
Bash
#!/usr/bin/env bash
# Clean, strictly-sequential single-stream latency + per-GPU VRAM for PP.
# One server at a time. Readiness = first SUCCESSFUL generation (xserv's /health
# returns 200 before the model finishes loading, so we must not gate on it).
# Snapshots are therefore always post-load. Writes bench-out/PP_CLEAN.md.
#
# Env overrides: MODEL, GGUF, PPS (default "1 2 4"), LLAMA_BIN, CUDA_HOME.
set -u

# Every relative path used below (XBIN, the default LLAMA_BIN,
# tools/bench/pp_time.py, bench-out/) is resolved from the directory two levels
# above this script. Continuing after a failed cd would start the wrong
# binaries and write the report somewhere unexpected, so abort instead.
cd "$(dirname "$0")/../.." || {
  echo "${0##*/}: cannot cd to $(dirname "$0")/../.." >&2
  exit 1
}

export PATH="$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH"
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.9}"

# Model inputs and server binaries.
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LLAMA_BIN=${LLAMA_BIN:-third_party/llama.cpp/build/bin/llama-server}
XBIN=./target/release/xserv-server

# Whitespace-separated list of pipeline-parallel degrees to benchmark.
PPS=${PPS:-1 2 4}
PROMPT='Write a detailed paragraph explaining how GPUs accelerate neural network training.'
OUT=bench-out/PP_CLEAN.md

mkdir -p bench-out || {
  echo "${0##*/}: cannot create bench-out/" >&2
  exit 1
}

# Start the report from scratch: title, blank line, then the table header.
{
  echo "# PP clean single-stream latency + VRAM — $(date)"
  echo ""
  echo "| engine | PP | TTFT_ms | TPOT_ms | tok/s | per-GPU VRAM (MiB) |"
  echo "|--------|----|---------|---------|-------|--------------------|"
} > "$OUT"
|
|
|
|
# killall_servers
# Force-kill (SIGKILL) every process whose full command line matches
# "xserv-server" or "llama-server", then pause 3 s before returning.
# pkill's errors (e.g. nothing matched) are discarded.
killall_servers() {
  local server
  for server in xserv-server llama-server; do
    pkill -9 -f "$server" 2>/dev/null
  done
  sleep 3
}
|
|
|
|
# drain GPUS_CSV
# Poll nvidia-smi until every GPU index in the comma-separated list $1 reports
# at most 1500 MiB of used memory. Polls up to 60 times, 2 s apart (~120 s).
# Arguments: $1 - comma-separated GPU indices, e.g. "0,1,2,3"
# Returns:   0 as soon as all listed GPUs are at or below the threshold;
#            1 (with a warning on stderr) if they never got there in time.
drain() {
  local gpus=$1
  local gpu used busy
  for _ in $(seq 1 60); do
    busy=0
    # Deliberately unquoted: split the comma list into one word per GPU index.
    for gpu in ${gpus//,/ }; do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      used=${used//[[:space:]]/}
      # An empty or non-numeric reading (query failed, "[N/A]") counts as 0,
      # so it never blocks the run and never trips an integer-compare error.
      [[ $used =~ ^[0-9]+$ ]] || used=0
      if [ "$used" -gt 1500 ]; then busy=1; fi
    done
    [ "$busy" -eq 0 ] && return 0
    sleep 2
  done
  echo "drain: GPUs $gpus still above 1500 MiB after 60 polls" >&2
  return 1
}
|
|
|
|
# probe_ready PORT PID
# Wait until the server on 127.0.0.1:PORT answers a real one-token chat
# completion with HTTP 200. Makes up to 400 attempts; a failed attempt costs up
# to 8 s of curl timeout plus a 3 s pause, so the deadline is at least ~1200 s.
# Arguments: $1 - TCP port of the server
#            $2 - PID of the server process; polling stops early once it is gone
# Returns:   0 when a generation succeeds; 1 if the process died or every
#            attempt failed.
probe_ready() {
  local port=$1 pid=$2
  local url="http://127.0.0.1:$port/v1/chat/completions"
  local payload='{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}'
  local status
  for _ in $(seq 1 400); do
    # -o /dev/null drops the response body; -w leaves only the status code.
    status=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "$url" -H 'Content-Type: application/json' \
      -d "$payload" 2>/dev/null)
    if [ "$status" = 200 ]; then return 0; fi
    # No point polling a server that has already exited.
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}
|
|
|
|
# vram GPUS_CSV
# Print a stabilized snapshot of used memory (MiB) for the GPUs in the
# comma-separated list $1, as one space-separated line in list order.
# Samples nvidia-smi up to 12 times, 2 s apart, and stops as soon as two
# consecutive samples are identical; otherwise the last sample is printed.
# Arguments: $1 - comma-separated GPU indices, e.g. "0,1"
# Outputs:   the snapshot on stdout
vram() {
  local cvd=$1
  local current previous=""
  for _ in $(seq 1 12); do
    current=$(
      # Deliberately unquoted: split the comma list into one word per GPU.
      for g in ${cvd//,/ }; do
        nvidia-smi -i "$g" --query-gpu=memory.used --format=csv,noheader,nounits
      done | paste -sd' ' -
    )
    [ "$current" = "$previous" ] && break
    previous=$current
    sleep 2
  done
  echo "$current"
}
|
|
|
|
# run_xserv PP
# Benchmark xserv at pipeline-parallel degree PP on GPUs 0..PP-1 and append one
# table row to $OUT.
# Steps: kill stale servers, wait for the GPUs to drain, start xserv-server on
# port 8090 (log in /tmp/x<PP>.log), wait for the first successful generation,
# snapshot per-GPU VRAM, run tools/bench/pp_time.py, then kill the server.
# Globals:   XBIN, MODEL, PROMPT, OUT (read)
# Arguments: $1 - pipeline-parallel degree (integer >= 1)
# Returns:   1 if the server never became ready (a FAILED row is written);
#            otherwise the status of the final sleep.
run_xserv() {
  local pp=$1
  local cvd
  cvd=$(seq -s, 0 $((pp - 1)))
  killall_servers
  drain "$cvd"

  # Build argv as an array so a MODEL path containing spaces stays one word.
  local args=("$MODEL" --port 8090 --max-seq-len 2048)
  if [ "$pp" -gt 1 ]; then args+=(--pp "$pp"); fi
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd \
    nohup "$XBIN" "${args[@]}" >"/tmp/x$pp.log" 2>&1 &
  local pid=$!

  if ! probe_ready 8090 "$pid"; then
    echo "| xserv | $pp | FAILED (see /tmp/x$pp.log) | | | |" >> "$OUT"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null # reap it, as the success path does
    return 1
  fi

  # VRAM is sampled before the timed request.
  local mib
  mib=$(vram "$cvd")
  local m
  m=$(python3 tools/bench/pp_time.py http://127.0.0.1:8090 "$PROMPT")

  # Pull the KEY=value metrics out of the probe's output.
  local ttft tpot toks
  ttft=$(printf '%s\n' "$m" | sed -n 's/.*TTFT_ms=\([0-9.]*\).*/\1/p')
  tpot=$(printf '%s\n' "$m" | sed -n 's/.*TPOT_ms=\([0-9.a-z]*\).*/\1/p')
  toks=$(printf '%s\n' "$m" | sed -n 's/.*tok_s=\([0-9.a-z]*\).*/\1/p')
  echo "| xserv | $pp | $ttft | $tpot | $toks | $mib |" >> "$OUT"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
|
|
|
# run_llama PP
# Benchmark llama.cpp's llama-server across PP GPUs (0..PP-1) and append one
# table row to $OUT.
# PP=1 runs with "-sm none"; PP>1 runs with "-sm layer" and an even tensor
# split ("-ts 1,1,...", one entry per GPU). The server listens on
# 127.0.0.1:8090 and logs to /tmp/l<PP>.log.
# Globals:   LLAMA_BIN, GGUF, PROMPT, OUT (read)
# Arguments: $1 - number of GPUs to split layers over (integer >= 1)
# Returns:   1 if the server never became ready (a FAILED row is written);
#            otherwise the status of the final sleep.
run_llama() {
  local pp=$1
  local cvd
  cvd=$(seq -s, 0 $((pp - 1)))
  killall_servers
  drain "$cvd"

  local sm=(-sm none)
  if [ "$pp" -gt 1 ]; then
    local ratio
    # printf reuses its format once per argument: "1," repeated PP times.
    ratio=$(printf '1,%.0s' $(seq 1 "$pp"))
    sm=(-sm layer -ts "${ratio%,}")
  fi

  # Binary and model paths are quoted so a path with spaces stays one word.
  CUDA_VISIBLE_DEVICES=$cvd nohup "$LLAMA_BIN" -m "$GGUF" --port 8090 --host 127.0.0.1 \
    -c 2048 --parallel 1 -ngl 999 "${sm[@]}" >"/tmp/l$pp.log" 2>&1 &
  local pid=$!

  if ! probe_ready 8090 "$pid"; then
    echo "| llama | $pp | FAILED (see /tmp/l$pp.log) | | | |" >> "$OUT"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null # reap it, as the success path does
    return 1
  fi

  # VRAM is sampled before the timed request.
  local mib
  mib=$(vram "$cvd")
  local m
  m=$(python3 tools/bench/pp_time.py http://127.0.0.1:8090 "$PROMPT")

  # Pull the KEY=value metrics out of the probe's output.
  local ttft tpot toks
  ttft=$(printf '%s\n' "$m" | sed -n 's/.*TTFT_ms=\([0-9.]*\).*/\1/p')
  tpot=$(printf '%s\n' "$m" | sed -n 's/.*TPOT_ms=\([0-9.a-z]*\).*/\1/p')
  toks=$(printf '%s\n' "$m" | sed -n 's/.*tok_s=\([0-9.a-z]*\).*/\1/p')
  echo "| llama | $pp | $ttft | $tpot | $toks | $mib |" >> "$OUT"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
|
|
|
# Driver: benchmark every PP degree on xserv first, then on llama.cpp, and
# finish the report with a PP_CLEAN_DONE marker line.

# Background commands started by a non-interactive shell ignore SIGINT, so an
# interrupted run would otherwise leave a server holding GPU memory. Clean up
# before exiting on INT/TERM.
trap 'killall_servers; exit 1' INT TERM

# PPS is deliberately unquoted: it is a whitespace-separated list of degrees.
for pp in $PPS; do run_xserv "$pp"; done
for pp in $PPS; do run_llama "$pp"; done

# Final safety net in case the last run left a server behind.
killall_servers
echo "" >> "$OUT"
echo "PP_CLEAN_DONE" >> "$OUT"
|