#!/usr/bin/env bash
# Definitive PP (pipeline-parallel) measurement, strictly sequential, with the
# generated text captured for a real correctness byte-compare.
#
# Outputs:
#   bench-out/PP_FINAL.md   report (truncated at start, appended to via log)
#   /tmp/xtext_<pp>.txt     xserv generated text per config (byte-compared)
#   /tmp/xf<pp>.log         xserv server log per config
#   /tmp/lf<pp>.log         llama-server log per config
#
# One server at a time; readiness is gated on a real generation request.
set -u

# Everything below uses paths relative to the project root, so a failed cd
# must abort instead of silently benchmarking from the wrong directory.
cd /opt/wjh/projects/xserv || { echo "cannot cd to /opt/wjh/projects/xserv" >&2; exit 1; }

export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=/usr/local/cuda-12.9

MODEL=/opt/wjh/models/qwen3-8b
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
LBIN=third_party/llama.cpp/build/bin/llama-server
XBIN=./target/release/xserv-server
# NOTE: PROMPT is spliced verbatim into a JSON string in gen(); it must not
# contain characters that need JSON escaping (double quote, backslash, newline).
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'
R=bench-out/PP_FINAL.md

mkdir -p "$(dirname "$R")"
: > "$R"

# log TEXT... -> append one line to the report file.
log() {
  printf '%s\n' "$*" >> "$R"
}

# kall -> SIGKILL any xserv-server / llama-server process, then pause 3s.
kall() {
  pkill -9 -f xserv-server 2>/dev/null
  pkill -9 -f llama-server 2>/dev/null
  sleep 3
}

# stop_server PID -> SIGKILL the given background server and reap it.
stop_server() {
  kill -9 "$1" 2>/dev/null
  wait "$1" 2>/dev/null
}

# drain GPU_LIST -> wait until no GPU in the comma-separated list reports more
# than 1500 MiB used. Polls every 2s, at most 90 times.
# Returns 0 once drained, 1 on timeout (callers continue regardless).
drain() {
  local gpus=$1 gpu used busy
  for _ in $(seq 1 90); do
    busy=0
    # Intentional word splitting: "0,1,2" -> "0 1 2".
    for gpu in ${gpus//,/ }; do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      [ "${used:-0}" -gt 1500 ] && busy=1
    done
    [ "$busy" = 0 ] && return 0
    sleep 2
  done
  echo "drain: GPUs $gpus still above 1500 MiB used after 90 polls; continuing" >&2
  return 1
}

# gen PORT MAXTOK -> echoes the response JSON; http code in $GCODE.
# Sends one non-streaming, temperature-0 chat completion carrying $PROMPT.
gen() {
  local port=$1 max_tokens=$2 body
  body="{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$PROMPT\"}],\"max_tokens\":$max_tokens,\"temperature\":0,\"stream\":false}"
  # Drop the previous response first: if curl fails before writing anything,
  # an old file would otherwise be echoed as if it were this request's answer.
  rm -f /tmp/resp.json
  GCODE=$(curl -s -o /tmp/resp.json -w '%{http_code}' --max-time 300 \
    "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
    -d "$body")
  if [ -f /tmp/resp.json ]; then
    cat /tmp/resp.json
  fi
}

# ready PORT PID -> 0 once a 1-token generation returns HTTP 200.
# Returns 1 if the server process has exited or after 400 unsuccessful probes
# (each probe has an 8s timeout and is followed by a 3s pause).
ready() {
  local port=$1 pid=$2 code
  for _ in $(seq 1 400); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' 2>/dev/null)
    [ "$code" = 200 ] && return 0
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}

# snap GPU_LIST -> echoes memory.used (MiB) of each listed GPU, space-separated.
snap() {
  local gpu
  for gpu in ${1//,/ }; do
    nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits
  done | paste -sd' '
}

# lat PORT -> echoes "TTFT_ms=... TPOT_ms=... tok_s=... tokF=...".
# latency: TTFT from 1-tok, TPOT from 96-tok using server completion_tokens.
#   TTFT = wall time of the 1-token request.
#   TPOT = (wall time of 96-token request - TTFT) / (tokens_96 - tokens_1).
# TPOT and tok_s are reported as nan when the token delta is not positive.
lat() {
  local port=$1
  local t0 t1 ta tb c1 cF
  t0=$(date +%s.%N); gen "$port" 1 >/tmp/g1.json; t1=$(date +%s.%N)
  c1=$(python3 -c 'import json;print(json.load(open("/tmp/g1.json"))["usage"]["completion_tokens"])' 2>/dev/null || echo 1)
  ta=$(date +%s.%N); gen "$port" 96 >/tmp/gF.json; tb=$(date +%s.%N)
  cF=$(python3 -c 'import json;print(json.load(open("/tmp/gF.json"))["usage"]["completion_tokens"])' 2>/dev/null || echo 0)
  # Values are passed as argv rather than spliced into the Python source.
  python3 - "$t0" "$t1" "$ta" "$tb" "$c1" "$cF" <<'PY'
import sys
t0, t1, ta, tb = (float(v) for v in sys.argv[1:5])
c1, cF = int(sys.argv[5]), int(sys.argv[6])
ttft = (t1 - t0) * 1000
d = cF - c1
tpot = ((tb - ta) - (t1 - t0)) / d * 1000 if d > 0 else float('nan')
tok_s = 1000.0 / tpot if d > 0 else float('nan')
print('TTFT_ms=%.1f TPOT_ms=%.2f tok_s=%.1f tokF=%s' % (ttft, tpot, tok_s, sys.argv[6]))
PY
}

# xserv PP -> run xserv-server on GPUs 0..PP-1, capture generated text to
# /tmp/xtext_PP.txt, and log VRAM + latency (or FAILED) to the report.
xserv() {
  local pp=$1 cvd pid mib timing
  cvd=$(seq -s, 0 $((pp - 1)))
  kall
  drain "$cvd"
  # Remove text left by an earlier run so a failed config can never be
  # byte-compared using stale output.
  rm -f "/tmp/xtext_$pp.txt"

  local -a cmd=("$XBIN" "$MODEL" --port 8090 --max-seq-len 2048)
  [ "$pp" -gt 1 ] && cmd+=(--pp "$pp")
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" >"/tmp/xf$pp.log" 2>&1 &
  pid=$!

  if ! ready 8090 "$pid"; then
    log "xserv pp=$pp: FAILED"
    stop_server "$pid"
    return
  fi

  mib=$(snap "$cvd")
  gen 8090 64 \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' \
    > "/tmp/xtext_$pp.txt" 2>/dev/null
  timing=$(lat 8090)
  log "xserv pp=$pp | VRAM=$mib MiB | $timing"
  stop_server "$pid"
  sleep 3
}

# llama PP -> run llama-server on GPUs 0..PP-1 (layer split with equal
# tensor-split weights when PP > 1) and log VRAM + latency (or FAILED).
llama() {
  local pp=$1 cvd pid mib timing
  cvd=$(seq -s, 0 $((pp - 1)))
  kall
  drain "$cvd"

  local -a split
  if [ "$pp" -gt 1 ]; then
    # printf emits "1," once per GPU; sed trims the trailing comma -> "1,1,...".
    split=(-sm layer -ts "$(printf '1%.0s,' $(seq 1 "$pp") | sed 's/,$//')")
  else
    split=(-sm none)
  fi
  CUDA_VISIBLE_DEVICES=$cvd nohup "$LBIN" -m "$GGUF" --port 8090 --host 127.0.0.1 -c 2048 --parallel 1 -ngl 999 "${split[@]}" >"/tmp/lf$pp.log" 2>&1 &
  pid=$!

  if ! ready 8090 "$pid"; then
    log "llama pp=$pp: FAILED"
    stop_server "$pid"
    return
  fi

  mib=$(snap "$cvd")
  timing=$(lat 8090)
  log "llama pp=$pp | VRAM=$mib MiB | $timing"
  stop_server "$pid"
  sleep 3
}

# same_text FILE_A FILE_B -> echoes IDENTICAL, DIFFER, or MISSING.
# MISSING means at least one file is absent or empty; without this check two
# empty captures would compare equal and be reported as IDENTICAL.
same_text() {
  if [ ! -s "$1" ] || [ ! -s "$2" ]; then
    echo MISSING
  elif cmp -s "$1" "$2"; then
    echo IDENTICAL
  else
    echo DIFFER
  fi
}

log "# PP FINAL — $(date)"
for pp in 1 2 4; do
  xserv "$pp"
done

log ""
log "## correctness (xserv greedy, byte compare of generated text)"
log "single==pp2: $(same_text /tmp/xtext_1.txt /tmp/xtext_2.txt)"
log "single==pp4: $(same_text /tmp/xtext_1.txt /tmp/xtext_4.txt)"
log "single_text: $(head -c 200 /tmp/xtext_1.txt 2>/dev/null)"
log "pp2_text: $(head -c 200 /tmp/xtext_2.txt 2>/dev/null)"
log ""

for pp in 1 2 4; do
  llama "$pp"
done

kall
log ""
log "PP_FINAL_DONE"