runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
32 lines
2.1 KiB
Bash
32 lines
2.1 KiB
Bash
#!/usr/bin/env bash
# Diagnose pp4 divergence: run single x2 and pp4 x2, same prompt, compare all.
#
# Usage: run with no arguments. Uses curl, python3 and pkill, plus the server
# binary and model at the paths configured below.
# Output: the comparison report is written to $D (relative to the project dir).

# Treat unset variables as errors.
set -u

# Everything below uses paths relative to the project root, so a failed cd
# must stop the script instead of letting it run from the wrong directory.
cd /opt/wjh/projects/xserv || {
  echo "cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}

export PATH="$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH"

MODEL=/opt/wjh/models/qwen3-8b      # model path handed to the server
XBIN=./target/release/xserv-server  # server binary, relative to the project root

# The single prompt that every run sends.
P='Explain what a transformer is in machine learning, in 3 sentences.'

# Report file; start each invocation with an empty report.
D=bench-out/PP_DIAG.md
mkdir -p -- "${D%/*}" || exit 1
: > "$D" || {
  echo "cannot write $D" >&2
  exit 1
}
|
|
# Send SIGKILL to every process whose command line matches "xserv-server",
# then pause for 3 seconds before returning.
# NOTE(review): the pause presumably gives the killed server time to release
# its port and GPU memory - confirm.
kall() {
  pkill -9 -f xserv-server 2>/dev/null
  sleep 3
}
|
|
# Wait until the server on 127.0.0.1:8090 answers a 1-token chat completion
# with HTTP 200.
# Arguments: $1 - PID of the server process being waited on
# Returns:   0 as soon as a probe gets HTTP 200;
#            1 if the process no longer exists, or after 400 failed probes
ready() {
  local pid=$1 attempt code
  for ((attempt = 1; attempt <= 400; attempt++)); do
    # Each probe is capped at 8 seconds; only the HTTP status code is kept.
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      http://127.0.0.1:8090/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' \
      2>/dev/null)
    [ "$code" = 200 ] && return 0
    # Stop polling once the server process is gone. The PID is quoted so a
    # malformed value cannot be split into several kill arguments.
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}
|
|
# Start one server instance, send the prompt $P once, and save the reply text.
# Globals:   XBIN, MODEL, P (read)
# Arguments: $1  - file that receives the reply text
#            $2  - value exported to the server as CUDA_VISIBLE_DEVICES
#            $3+ - extra flags appended to the server command line
# Outputs:   on success $1 holds the reply; on failure $1 holds one line
#            starting with "FAIL:" that names $1, so two failed runs never
#            compare byte-identical. Server output goes to /tmp/d.log
#            (overwritten on every call).
# Returns:   0 on success, 1 if the server never became ready or the request
#            or response parsing failed
run() {
  local out=$1 cvd=$2
  shift 2
  local pid body rc=0

  kall
  CUDA_VISIBLE_DEVICES=$cvd nohup "$XBIN" "$MODEL" --port 8090 --max-seq-len 2048 "$@" \
    >/tmp/d.log 2>&1 &
  pid=$!

  if ! ready "$pid"; then
    printf 'FAIL: server not ready (%s)\n' "$out" >"$out"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return 1
  fi

  # Serialise the request with json.dumps so quotes or backslashes in the
  # prompt cannot corrupt the body; compact separators keep the bytes equal
  # to the hand-written form for plain ASCII prompts.
  body=$(python3 -c 'import json, sys
print(json.dumps({"model": "qwen3-8b",
                  "messages": [{"role": "user", "content": sys.argv[1]}],
                  "max_tokens": 128, "temperature": 0, "stream": False},
                 separators=(",", ":")))' "$P") || body=

  # The pipeline status is the parser's: it fails when curl delivers nothing
  # or when the response carries no choices[0].message.content.
  if [ -z "$body" ] || ! curl -s --max-time 200 http://127.0.0.1:8090/v1/chat/completions \
    -H 'Content-Type: application/json' -d "$body" |
    python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' \
      >"$out" 2>/dev/null; then
    printf 'FAIL: request or response parsing failed (%s)\n' "$out" >"$out"
    rc=1
  fi

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
  return "$rc"
}
|
|
# Two back-to-back runs with only device 0 visible, then two runs with
# devices 0-3 visible and "--pp 4" passed to the server. Replies land in
# /tmp/s_{a,b}.txt and /tmp/p4_{a,b}.txt respectively.
for tag in a b; do
  run "/tmp/s_${tag}.txt" 0
done
for tag in a b; do
  run "/tmp/p4_${tag}.txt" 0,1,2,3 --pp 4
done
|
|
# Print IDENTICAL, DIFFER or FAILED for two reply files.
# FAILED means at least one file is missing, empty, or starts with a FAIL
# marker line, i.e. that run produced no reply; comparing such files would
# otherwise report two failed runs as IDENTICAL.
# Arguments: $1, $2 - reply files to compare
_pp_verdict() {
  local f first
  for f in "$1" "$2"; do
    if [ ! -s "$f" ]; then
      echo FAILED
      return
    fi
    first=
    IFS= read -r first <"$f"
    case $first in
      FAIL | FAIL:*)
        echo FAILED
        return
        ;;
    esac
  done
  if cmp -s -- "$1" "$2"; then
    echo IDENTICAL
  else
    echo DIFFER
  fi
}

# Same-configuration pairs first, then single-GPU against pp4.
echo "single_A==single_B: $(_pp_verdict /tmp/s_a.txt /tmp/s_b.txt)" | tee -a "$D"
echo "pp4_A==pp4_B: $(_pp_verdict /tmp/p4_a.txt /tmp/p4_b.txt)" | tee -a "$D"
echo "single_A==pp4_A: $(_pp_verdict /tmp/s_a.txt /tmp/p4_a.txt)" | tee -a "$D"

# cmp prints the position of the first differing byte (nothing if equal).
echo "--- first diff offset single_A vs pp4_A ---" | tee -a "$D"
cmp /tmp/s_a.txt /tmp/p4_a.txt 2>&1 | tee -a "$D"

# Label kept as-is for the report; note that wc -c counts bytes.
echo "--- lengths (chars) ---" | tee -a "$D"
wc -c /tmp/s_a.txt /tmp/s_b.txt /tmp/p4_a.txt /tmp/p4_b.txt | tee -a "$D"

# Completion marker: appended to the report only, not echoed to the terminal.
echo "PP_DIAG_DONE" >> "$D"
|