Files
xserv/tools/pp_final.sh
Gahow Wang d5dcf1a5ab bench: PP harness (xserv --pp vs llama.cpp -sm layer)
runner/servers: add --pp for both engines (xserv --pp N; llama.cpp
-sm layer over N GPUs). New drivers: pp_final.sh (sequential latency +
per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2
determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K
matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq,
pp_time.py latency probe.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-29 18:45:59 +08:00

73 lines
4.4 KiB
Bash

#!/usr/bin/env bash
# Definitive PP measurement, strictly sequential, with generated text captured
# for a real correctness byte-compare. Writes bench-out/PP_FINAL.md and per-config
# text files. One server at a time; readiness gated on a real generation.
set -u
# Every relative path below (LBIN, XBIN, the report) is resolved from the project
# root, so stop here rather than run the benchmark from the wrong directory.
cd /opt/wjh/projects/xserv || {
  echo "pp_final: cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=/usr/local/cuda-12.9
# Model inputs: MODEL is passed to the xserv binary, GGUF to llama-server via -m.
MODEL=/opt/wjh/models/qwen3-8b
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
# Server binaries, relative to the project root.
LBIN=third_party/llama.cpp/build/bin/llama-server
XBIN=./target/release/xserv-server
# Single user message sent by every measured generation.
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'
# Report file; recreated empty at the start of every run.
R=bench-out/PP_FINAL.md
mkdir -p -- "${R%/*}" || {
  echo "pp_final: cannot create directory ${R%/*}" >&2
  exit 1
}
: > "$R" || {
  echo "pp_final: cannot write $R" >&2
  exit 1
}
# log ARGS... -> appends ARGS, joined by single spaces, as one line to the
# report file "$R". With no arguments it appends an empty line.
# printf is used instead of echo so that a message starting with an echo option
# (-n, -e, -E) is written literally instead of being consumed as a flag.
log() {
  local IFS=' ' # "$*" joins on the first IFS character; pin it to a space
  printf '%s\n' "$*" >> "$R"
}
# kall -> sends SIGKILL to every process whose full command line matches
# xserv-server or llama-server (pkill errors are discarded), then sleeps 3 s.
kall() {
  local target
  for target in xserv-server llama-server; do
    pkill -9 -f "$target" 2>/dev/null
  done
  sleep 3
}
# drain GPUS -> waits until every GPU in the comma-separated index list GPUS
# reports at most 1500 (MiB, per nvidia-smi memory.used) of used memory.
# Polls every 2 s, at most 90 times.
# Returns: 0 as soon as one poll finds all listed GPUs at or below the limit;
#          1 (with a note on stderr) if they are still above it after the last poll.
# A reading that is not a plain integer is treated as 0, i.e. as "not busy".
drain() {
  local -r limit_mib=1500 max_polls=90
  local poll gpu used busy
  for ((poll = 0; poll < max_polls; poll++)); do
    busy=0
    # Unquoted on purpose: commas were replaced by spaces and must word-split.
    for gpu in ${1//,/ }; do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      used=${used//[[:space:]]/}
      [[ $used =~ ^[0-9]+$ ]] || used=0
      # 10# forces base 10 so a leading zero is never read as octal.
      ((10#$used > limit_mib)) && busy=1
    done
    [[ $busy == 0 ]] && return 0
    sleep 2
  done
  echo "drain: GPU(s) $1 still above ${limit_mib} MiB after ${max_polls} polls" >&2
  return 1
}
# gen PORT MAXTOK -> echoes JSON; http code in $GCODE
# Sends one non-streaming chat-completion request (model qwen3-8b, temperature 0,
# max_tokens=MAXTOK, user message $PROMPT) to http://127.0.0.1:PORT and prints
# the response body. The body is also left in /tmp/resp.json.
# Globals: PROMPT (read), GCODE (written: curl's %{http_code} output).
# Returns: the exit status of the final cat.
gen() {
  local port=$1 max_tokens=$2
  local resp=/tmp/resp.json
  local content=$PROMPT
  # Escape the characters that would otherwise break the hand-built JSON string.
  content=${content//\\/\\\\}
  content=${content//\"/\\\"}
  content=${content//$'\n'/\\n}
  content=${content//$'\r'/\\r}
  content=${content//$'\t'/\\t}
  # Start from an empty file: when curl cannot get a response it leaves the -o
  # target untouched, and the previous call's body would be echoed as if it
  # were the answer to this request.
  : > "$resp"
  GCODE=$(curl -s -o "$resp" -w '%{http_code}' --max-time 300 \
    "http://127.0.0.1:${port}/v1/chat/completions" -H 'Content-Type: application/json' \
    -d "{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"${content}\"}],\"max_tokens\":${max_tokens},\"temperature\":0,\"stream\":false}")
  cat "$resp"
}
# ready PORT PID -> waits for the server on 127.0.0.1:PORT to answer a real
# one-token chat completion with HTTP 200.
# Tries up to 400 times, sleeping 3 s between tries (each request is capped at 8 s).
# Returns: 0 on the first HTTP 200;
#          1 if process PID no longer exists, or when the tries are used up.
ready() {
  local port=$1 pid=$2
  local code attempt
  local probe='{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}'
  for ((attempt = 0; attempt < 400; attempt++)); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" \
      -H 'Content-Type: application/json' -d "$probe" 2>/dev/null)
    [[ $code == 200 ]] && return 0
    # Stop waiting as soon as the server process has gone away.
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}
# snap GPUS -> prints on a single line, separated by single spaces, the
# memory.used value nvidia-smi reports for each GPU index in the
# comma-separated list GPUS, in list order.
snap() {
  local ids=${1//,/ } gpu
  {
    # Unquoted on purpose: the space-separated index list must word-split.
    for gpu in $ids; do
      nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits
    done
  } | paste -sd' '
}
# latency: TTFT from 1-tok, TPOT from 96-tok using server completion_tokens
# lat PORT -> prints one line "TTFT_ms=… TPOT_ms=… tok_s=… tokF=…".
#   TTFT_ms  wall time of a max_tokens=1 request, in milliseconds
#   TPOT_ms  (time of the max_tokens=96 request - time of the 1-token request)
#            divided by the difference of the two reported completion_tokens
#   tok_s    1000 / TPOT_ms
#   tokF     completion_tokens reported for the 96-token request
# TPOT_ms and tok_s are nan when the token difference is not positive.
# If a response cannot be parsed the token count falls back to 1 (first
# request) or 0 (second request). Responses are kept in /tmp/g1.json and
# /tmp/gF.json.
lat() {
  local port=$1
  local begin1 end1 beginF endF n1 nF
  local count_py='import json,sys;print(json.load(open(sys.argv[1]))["usage"]["completion_tokens"])'

  begin1=$(date +%s.%N)
  gen "$port" 1 >/tmp/g1.json
  end1=$(date +%s.%N)
  n1=$(python3 -c "$count_py" /tmp/g1.json 2>/dev/null || echo 1)

  beginF=$(date +%s.%N)
  gen "$port" 96 >/tmp/gF.json
  endF=$(date +%s.%N)
  nF=$(python3 -c "$count_py" /tmp/gF.json 2>/dev/null || echo 0)

  # The shell values are spliced into the Python source as numeric literals.
  python3 -c "
first = $end1 - $begin1
extra = $nF - $n1
if extra > 0:
    tpot = (($endF - $beginF) - first) / extra * 1000
    tps = 1000.0 / tpot
else:
    tpot = tps = float('nan')
print('TTFT_ms=%.1f TPOT_ms=%.2f tok_s=%.1f tokF=$nF' % (first * 1000, tpot, tps))"
}
# xserv PP -> measures one xserv configuration and appends one line to the report.
# Steps: kill any running servers, wait for GPUs 0..PP-1 to drain, start $XBIN
# on those GPUs (port 8090, --max-seq-len 2048, plus "--pp PP" when PP > 1),
# wait until it serves a real generation, snapshot per-GPU memory, save the text
# of a 64-token generation to /tmp/xtext_PP.txt, measure latency, log, kill it.
# Server output goes to /tmp/xfPP.log.
# Globals: XBIN, MODEL (read).
# Returns: 1 after logging "FAILED" when the server never becomes ready;
#          otherwise the status of the final sleep.
xserv() {
  local pp=$1 cvd
  cvd=$(seq -s, 0 $((pp - 1)))
  local text_out=/tmp/xtext_$pp.txt
  # Remove text left by an earlier run, so that a failed start cannot feed
  # stale output into the later byte-compare.
  rm -f -- "$text_out"
  kall
  drain "$cvd"

  # Build the command as an array so paths keep their exact word boundaries.
  local -a cmd=("$XBIN" "$MODEL" --port 8090 --max-seq-len 2048)
  ((pp > 1)) && cmd+=(--pp "$pp")
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" >"/tmp/xf$pp.log" 2>&1 &
  local pid=$!

  if ! ready 8090 "$pid"; then
    log "xserv pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null # reap the killed server
    return 1
  fi

  # Memory is sampled right after readiness, before the measured generations.
  local mib
  mib=$(snap "$cvd")
  gen 8090 64 \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' \
      > "$text_out" 2>/dev/null
  local L
  L=$(lat 8090)
  log "xserv pp=$pp | VRAM=$mib MiB | $L"
  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
# llama PP -> measures one llama.cpp configuration and appends one line to the report.
# Steps: kill any running servers, wait for GPUs 0..PP-1 to drain, start $LBIN
# on those GPUs (port 8090, host 127.0.0.1, -c 2048, --parallel 1, -ngl 999),
# wait until it serves a real generation, snapshot per-GPU memory, measure
# latency, log, kill it. Server output goes to /tmp/lfPP.log.
# Split flags: PP > 1 -> "-sm layer -ts 1,1,…" (one "1" per GPU); PP = 1 -> "-sm none".
# Globals: LBIN, GGUF (read).
# Returns: 1 after logging "FAILED" when the server never becomes ready;
#          otherwise the status of the final sleep.
llama() {
  local pp=$1 cvd
  cvd=$(seq -s, 0 $((pp - 1)))
  kall
  drain "$cvd"

  # Build the command as an array so paths keep their exact word boundaries.
  local -a cmd=("$LBIN" -m "$GGUF" --port 8090 --host 127.0.0.1 -c 2048 --parallel 1 -ngl 999)
  if ((pp > 1)); then
    local ts
    # Prints "1," once per GPU; the trailing comma is stripped below.
    ts=$(printf '1%.0s,' $(seq 1 "$pp"))
    cmd+=(-sm layer -ts "${ts%,}")
  else
    cmd+=(-sm none)
  fi
  CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" >"/tmp/lf$pp.log" 2>&1 &
  local pid=$!

  if ! ready 8090 "$pid"; then
    log "llama pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null # reap the killed server
    return 1
  fi

  local mib L
  mib=$(snap "$cvd")
  L=$(lat 8090)
  log "llama pp=$pp | VRAM=$mib MiB | $L"
  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
# Main sequence: xserv at pp=1,2,4, the correctness section, llama.cpp at
# pp=1,2,4, then a final cleanup and the PP_FINAL_DONE marker.
log "# PP FINAL — $(date)"
# Remove text captured by earlier runs so the byte-compare below can only see
# what this run generated.
rm -f /tmp/xtext_1.txt /tmp/xtext_2.txt /tmp/xtext_4.txt
for pp in 1 2 4; do xserv "$pp"; done
log ""
log "## correctness (xserv greedy, byte compare of generated text)"
# same_text A B -> prints IDENTICAL only when both files exist, are non-empty
# and match byte for byte; otherwise prints a line starting with DIFFER.
# Two empty captures (both generations failed) must not count as a match.
same_text() {
  if [[ ! -s $1 || ! -s $2 ]]; then
    echo "DIFFER (missing or empty output)"
  elif cmp -s -- "$1" "$2"; then
    echo IDENTICAL
  else
    echo DIFFER
  fi
}
log "single==pp2: $(same_text /tmp/xtext_1.txt /tmp/xtext_2.txt)"
log "single==pp4: $(same_text /tmp/xtext_1.txt /tmp/xtext_4.txt)"
# Only the first 200 bytes of each text are quoted in the report.
log "single_text: $(head -c 200 /tmp/xtext_1.txt 2>/dev/null)"
log "pp2_text: $(head -c 200 /tmp/xtext_2.txt 2>/dev/null)"
log ""
for pp in 1 2 4; do llama "$pp"; done
kall
log ""
log "PP_FINAL_DONE"