runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
73 lines · 4.4 KiB · Bash
#!/usr/bin/env bash
# Definitive PP measurement, strictly sequential, with generated text captured
# for a real correctness byte-compare. Writes bench-out/PP_FINAL.md and per-config
# text files. One server at a time; readiness gated on a real generation.

# Only -u is enabled: commands such as pkill/kill legitimately return non-zero
# during a normal run, so the script must not abort on the first failing status.
set -u

# The binaries and bench-out/ are addressed relative to the repo root, so a
# failed cd has to stop the run rather than truncate a report somewhere else.
cd /opt/wjh/projects/xserv || {
  echo "cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}

export PATH="$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH"
export CUDA_HOME=/usr/local/cuda-12.9

MODEL=/opt/wjh/models/qwen3-8b                   # passed to xserv-server as its model argument
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf # passed to llama-server via -m
LBIN=third_party/llama.cpp/build/bin/llama-server
XBIN=./target/release/xserv-server
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'
R=bench-out/PP_FINAL.md # report file; every log line is appended here

# Start from an empty report; create the output directory if it is missing.
mkdir -p "${R%/*}"
: > "$R"
# log TEXT...
# Append the arguments as one line to the report file "$R". Arguments are
# joined with the first IFS character (a space by default).
# printf is used instead of echo so that text such as "-n" or "-e" is written
# literally instead of being consumed as an echo option.
log() { printf '%s\n' "$*" >> "$R"; }
# kall
# SIGKILL every process whose command line matches xserv-server or
# llama-server (pkill -f), ignoring pkill's diagnostics, then pause 3 seconds.
# The return status is that of the final sleep.
kall() {
  local server
  for server in xserv-server llama-server; do
    pkill -9 -f "$server" 2>/dev/null
  done
  sleep 3
}
# drain GPUS
# Wait until every GPU in the comma-separated index list GPUS reports at most
# 1500 for nvidia-smi's memory.used (queried with nounits). Polls up to 90
# times with a 2 second sleep between polls.
# Returns: 0 as soon as all listed GPUs are at or below the threshold,
#          1 if some GPU is still above it after the last poll.
drain() {
  local gpu_list=${1//,/ }
  local attempt gpu used busy
  for ((attempt = 0; attempt < 90; attempt++)); do
    busy=0
    for gpu in $gpu_list; do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      used=${used//[[:blank:]]/}
      # An empty or non-numeric reading (e.g. an nvidia-smi error message) is
      # not counted as "busy"; checking it here avoids an integer-test error.
      case $used in
        '' | *[!0-9]*) continue ;;
      esac
      if [ "$used" -gt 1500 ]; then
        busy=1
      fi
    done
    if [ "$busy" = 0 ]; then
      return 0
    fi
    sleep 2
  done
  # Timed out: report it instead of looking like a successful drain.
  return 1
}
# gen PORT MAXTOK -> echoes JSON; http code in $GCODE
# POST a non-streaming, temperature-0 chat completion for $PROMPT to
# http://127.0.0.1:PORT/v1/chat/completions with max_tokens=MAXTOK.
# Globals: PROMPT (read); GCODE (written with curl's %{http_code}).
# Output:  the response body on stdout; it is also left in /tmp/resp.json.
# Returns: the status of the final cat.
gen() {
  local port=$1 max_tokens=$2 content payload

  # Escape the prompt for use inside a JSON string so quotes, backslashes or
  # newlines in PROMPT cannot produce an invalid request body.
  content=${PROMPT//\\/\\\\}
  content=${content//\"/\\\"}
  content=${content//$'\n'/\\n}
  content=${content//$'\r'/\\r}
  content=${content//$'\t'/\\t}
  payload="{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$content\"}],\"max_tokens\":$max_tokens,\"temperature\":0,\"stream\":false}"

  # Empty the response file first: if curl cannot connect it writes nothing,
  # and the previous request's body must not be echoed as if it were new.
  : > /tmp/resp.json
  GCODE=$(curl -s -o /tmp/resp.json -w '%{http_code}' --max-time 300 \
    "http://127.0.0.1:${port}/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d "$payload")
  cat /tmp/resp.json
}
# ready PORT PID
# Block until the server on 127.0.0.1:PORT answers a real 1-token chat
# completion with HTTP 200. Makes up to 400 attempts, each with an 8 second
# curl timeout, sleeping 3 seconds between attempts.
# Returns: 0 once a probe returns 200;
#          1 if process PID is no longer alive, or after the last attempt.
ready() {
  local port=$1 pid=$2
  local code attempt
  local probe='{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}'

  for ((attempt = 0; attempt < 400; attempt++)); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:${port}/v1/chat/completions" \
      -H 'Content-Type: application/json' \
      -d "$probe" 2>/dev/null)
    if [ "$code" = 200 ]; then
      return 0
    fi
    # Stop waiting as soon as the server process has exited.
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}
# snap GPUS
# Print nvidia-smi's memory.used reading (nounits) for each GPU index in the
# comma-separated list GPUS, all on one line separated by single spaces.
snap() {
  local gpu_list=${1//,/ }
  local idx
  for idx in $gpu_list; do
    nvidia-smi -i "$idx" --query-gpu=memory.used --format=csv,noheader,nounits
  done | paste -s -d ' '
}
# latency: TTFT from 1-tok, TPOT from 96-tok using server completion_tokens
#
# lat PORT
# Times two gen requests against PORT (wall clock via date +%s.%N):
#   - max_tokens=1:  its full duration is reported as TTFT_ms;
#   - max_tokens=96: (its duration - the 1-token duration) divided by the
#     difference in the servers' reported completion_tokens gives TPOT_ms,
#     and tok_s = 1000 / TPOT_ms.
# Responses are kept in /tmp/g1.json and /tmp/gF.json.
# Output: one line "TTFT_ms=... TPOT_ms=... tok_s=... tokF=<96-tok count>".
#         TPOT_ms/tok_s are "nan" when no extra tokens were produced, and
#         tok_s is "nan" when the decode delta is exactly zero.
lat() {
  local port=$1
  local t0 t1 ta tb c1 cF
  local count_tokens='import json,sys;print(json.load(open(sys.argv[1]))["usage"]["completion_tokens"])'

  t0=$(date +%s.%N)
  gen "$port" 1 > /tmp/g1.json
  t1=$(date +%s.%N)
  c1=$(python3 -c "$count_tokens" /tmp/g1.json 2>/dev/null || echo 1)

  ta=$(date +%s.%N)
  gen "$port" 96 > /tmp/gF.json
  tb=$(date +%s.%N)
  cF=$(python3 -c "$count_tokens" /tmp/gF.json 2>/dev/null || echo 0)

  # A non-integer count (e.g. a null usage field printed as "None") falls back
  # to the same defaults used when the response cannot be parsed at all.
  case $c1 in '' | *[!0-9]*) c1=1 ;; esac
  case $cF in '' | *[!0-9]*) cF=0 ;; esac

  # Values are passed as argv rather than spliced into the Python source.
  python3 -c '
import sys

t0, t1, ta, tb = (float(v) for v in sys.argv[1:5])
c1, cF = int(sys.argv[5]), int(sys.argv[6])
nan = float("nan")

ttft_ms = (t1 - t0) * 1000
extra = cF - c1
tpot_ms = ((tb - ta) - (t1 - t0)) / extra * 1000 if extra > 0 else nan
# Guard the reciprocal: a zero decode delta must not raise ZeroDivisionError.
tok_s = 1000.0 / tpot_ms if extra > 0 and tpot_ms != 0 else nan
print("TTFT_ms=%.1f TPOT_ms=%.2f tok_s=%.1f tokF=%d" % (ttft_ms, tpot_ms, tok_s, cF))
' "$t0" "$t1" "$ta" "$tb" "$c1" "$cF"
}
# xserv PP
# Measure xserv-server on GPUs 0..PP-1 (CUDA_VISIBLE_DEVICES), adding
# "--pp PP" to the command line only when PP > 1.
# Steps: kill leftover servers, wait for the GPUs to drain, start the server
# on port 8090 (output in /tmp/xf<PP>.log), wait for readiness, snapshot VRAM,
# save a 64-token generation to /tmp/xtext_<PP>.txt, measure latency, log one
# report line, then kill the server.
# Globals: XBIN, MODEL (read); the report is written through log.
# Returns: 1 (after logging "FAILED") if the server never became ready.
xserv() {
  local pp=$1
  local cvd pid mib L
  local text_file=/tmp/xtext_$pp.txt
  local -a cmd

  cvd=$(seq -s, 0 $((pp - 1)))

  # Drop text left by an earlier run so that a failure of this configuration
  # cannot feed stale output into the later byte-compare.
  rm -f "$text_file"

  kall
  drain "$cvd"

  cmd=("$XBIN" "$MODEL" --port 8090 --max-seq-len 2048)
  if [ "$pp" -gt 1 ]; then
    cmd+=(--pp "$pp")
  fi

  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd \
    nohup "${cmd[@]}" > "/tmp/xf$pp.log" 2>&1 &
  pid=$!

  if ! ready 8090 "$pid"; then
    log "xserv pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    return 1
  fi

  mib=$(snap "$cvd")
  gen 8090 64 \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' \
      > "$text_file" 2>/dev/null
  L=$(lat 8090)
  log "xserv pp=$pp | VRAM=$mib MiB | $L"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
# llama PP
# Measure llama-server on GPUs 0..PP-1 (CUDA_VISIBLE_DEVICES). For PP > 1 the
# server is started with "-sm layer -ts 1,1,...,1" (one "1" per GPU);
# for PP = 1 with "-sm none".
# Steps: kill leftover servers, wait for the GPUs to drain, start the server
# on port 8090 (output in /tmp/lf<PP>.log), wait for readiness, snapshot VRAM,
# measure latency, log one report line, then kill the server.
# Globals: LBIN, GGUF (read); the report is written through log.
# Returns: 1 (after logging "FAILED") if the server never became ready.
llama() {
  local pp=$1
  local cvd pid mib L split i
  local -a cmd

  cvd=$(seq -s, 0 $((pp - 1)))
  kall
  drain "$cvd"

  cmd=("$LBIN" -m "$GGUF" --port 8090 --host 127.0.0.1 -c 2048 --parallel 1 -ngl 999)
  if [ "$pp" -gt 1 ]; then
    # Build the split list "1,1,...": PP entries, all equal.
    split=1
    for ((i = 1; i < pp; i++)); do
      split+=,1
    done
    cmd+=(-sm layer -ts "$split")
  else
    cmd+=(-sm none)
  fi

  CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" > "/tmp/lf$pp.log" 2>&1 &
  pid=$!

  if ! ready 8090 "$pid"; then
    log "llama pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    return 1
  fi

  mib=$(snap "$cvd")
  L=$(lat 8090)
  log "llama pp=$pp | VRAM=$mib MiB | $L"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
# ---- Run: xserv sweep, correctness compare, llama sweep --------------------

# text_verdict A B
# Print the byte-compare verdict for two generated-text files.
# A missing or zero-length file (server failed, or the response could not be
# parsed) is never reported as IDENTICAL: two empty files compare equal
# byte-for-byte but prove nothing about correctness.
text_verdict() {
  if [ ! -s "$1" ] || [ ! -s "$2" ]; then
    echo "DIFFER (missing or empty text)"
  elif cmp -s "$1" "$2"; then
    echo IDENTICAL
  else
    echo DIFFER
  fi
}

log "# PP FINAL — $(date)"
for pp in 1 2 4; do
  xserv "$pp"
done

log ""
log "## correctness (xserv greedy, byte compare of generated text)"
log "single==pp2: $(text_verdict /tmp/xtext_1.txt /tmp/xtext_2.txt)"
log "single==pp4: $(text_verdict /tmp/xtext_1.txt /tmp/xtext_4.txt)"
# First 200 bytes of each text as a human-readable sample; a missing file
# simply yields an empty sample.
log "single_text: $(head -c 200 /tmp/xtext_1.txt 2>/dev/null)"
log "pp2_text: $(head -c 200 /tmp/xtext_2.txt 2>/dev/null)"
log ""

for pp in 1 2 4; do
  llama "$pp"
done

# Leave no server running, then write the completion marker.
kall
log ""
log "PP_FINAL_DONE"