From d5dcf1a5abebcb486a19c64453c4857617029b15 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Fri, 29 May 2026 18:45:59 +0800 Subject: [PATCH] bench: PP harness (xserv --pp vs llama.cpp -sm layer) runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 --- tools/bench/pp_clean_bench.sh | 89 +++++++++++++++++++++++++++++++++ tools/bench/pp_time.py | 44 +++++++++++++++++ tools/bench/run_pp_parallel.sh | 42 ++++++++++++++++ tools/bench/runner.py | 13 +++-- tools/bench/servers.py | 11 ++++- tools/bench/summarize_fullq.py | 17 +++++++ tools/bench/summarize_pp.py | 24 +++++++++ tools/pp_diag.sh | 31 ++++++++++++ tools/pp_final.sh | 72 +++++++++++++++++++++++++++ tools/pp_llama_47.sh | 25 ++++++++++ tools/pp_quality_full.sh | 54 ++++++++++++++++++++ tools/pp_verify.sh | 90 ++++++++++++++++++++++++++++++++++ 12 files changed, 505 insertions(+), 7 deletions(-) create mode 100644 tools/bench/pp_clean_bench.sh create mode 100644 tools/bench/pp_time.py create mode 100644 tools/bench/run_pp_parallel.sh create mode 100644 tools/bench/summarize_fullq.py create mode 100644 tools/bench/summarize_pp.py create mode 100644 tools/pp_diag.sh create mode 100644 tools/pp_final.sh create mode 100644 tools/pp_llama_47.sh create mode 100644 tools/pp_quality_full.sh create mode 100644 tools/pp_verify.sh diff --git a/tools/bench/pp_clean_bench.sh b/tools/bench/pp_clean_bench.sh new file mode 100644 index 0000000..9bf34b7 --- /dev/null +++ b/tools/bench/pp_clean_bench.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# Clean, strictly-sequential single-stream latency + per-GPU VRAM for PP. +# One server at a time. 
# Readiness = first SUCCESSFUL generation (xserv's /health
# returns 200 before the model finishes loading, so we must not gate on it).
# Snapshots are therefore always post-load. Writes bench-out/PP_CLEAN.md.
#
# Env overrides: MODEL, GGUF, PPS (default "1 2 4"), LLAMA_BIN.
set -u
# Every path below is repo-relative; abort rather than benchmark (and pkill)
# from whatever directory a failed cd would leave us in.
cd "$(dirname "$0")/../.." || { echo "pp_clean_bench: cannot cd to repo root" >&2; exit 1; }
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda-12.9}
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LLAMA_BIN=${LLAMA_BIN:-third_party/llama.cpp/build/bin/llama-server}
XBIN=./target/release/xserv-server
PPS=${PPS:-1 2 4}
PROMPT='Write a detailed paragraph explaining how GPUs accelerate neural network training.'
OUT=bench-out/PP_CLEAN.md
mkdir -p bench-out
: > "$OUT"
echo "# PP clean single-stream latency + VRAM — $(date)" >> "$OUT"
echo "" >> "$OUT"
echo "| engine | PP | TTFT_ms | TPOT_ms | tok/s | per-GPU VRAM (MiB) |" >> "$OUT"
echo "|--------|----|---------|---------|-------|--------------------|" >> "$OUT"

# Hard-kill every benchmark server so the next config starts on idle GPUs.
killall_servers() {
  pkill -9 -f xserv-server 2>/dev/null
  pkill -9 -f llama-server 2>/dev/null
  sleep 3
}

# stop_server PID -> kill one server we started and reap it.
stop_server() {
  kill -9 "$1" 2>/dev/null
  wait "$1" 2>/dev/null
}

# gpu_used_mib GPU -> memory.used of one GPU in MiB (empty if the query fails).
gpu_used_mib() {
  nvidia-smi -i "$1" --query-gpu=memory.used --format=csv,noheader,nounits
}

# drain CSV_GPUS -> wait until every listed GPU uses < 1500 MiB (max ~120s).
# Returns 0 once drained, 1 on timeout.
drain() {
  local gpus=$1 g used busy
  for _ in $(seq 1 60); do
    busy=0
    for g in ${gpus//,/ }; do
      used=$(gpu_used_mib "$g" 2>/dev/null)
      used=${used//[[:space:]]/}
      # A failed/non-numeric query counts as "not busy", as it always did.
      [[ $used =~ ^[0-9]+$ ]] || used=0
      (( used > 1500 )) && busy=1
    done
    (( busy == 0 )) && return 0
    sleep 2
  done
  return 1
}

# probe_ready PORT PID -> 0 when a generation succeeds (deadline ~1200s),
# 1 if the server process dies or the deadline passes.
probe_ready() {
  local port=$1 pid=$2 code
  for _ in $(seq 1 400); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
      -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' \
      2>/dev/null)
    [[ $code == 200 ]] && return 0
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}

# vram CSV_GPUS -> space-separated MiB per GPU, re-sampled every 2s until two
# consecutive snapshots agree (at most 12 samples).
vram() {
  local cvd=$1 g now="" prev=""
  for _ in $(seq 1 12); do
    now=$(for g in ${cvd//,/ }; do gpu_used_mib "$g"; done | paste -sd' ' -)
    [[ $now == "$prev" ]] && break
    prev=$now
    sleep 2
  done
  echo "$now"
}

# field KEY TEXT -> value of KEY=value in pp_time.py's one-line output.
field() {
  sed -n "s/.*$1=\([0-9.a-z]*\).*/\1/p" <<<"$2"
}

# ones_csv N -> "1,1,...,1" (N entries): an even llama.cpp tensor split.
ones_csv() {
  local s
  s=$(printf '1,%.0s' $(seq 1 "$1"))
  echo "${s%,}"
}

# record_row ENGINE PP CSV_GPUS -> snapshot VRAM, probe latency on :8090 and
# append one table row. A probe that prints nothing is recorded as FAILED
# instead of a row of empty cells.
record_row() {
  local engine=$1 pp=$2 cvd=$3 mib metrics ttft tpot toks
  mib=$(vram "$cvd")
  metrics=$(python3 tools/bench/pp_time.py http://127.0.0.1:8090 "$PROMPT")
  ttft=$(field TTFT_ms "$metrics")
  tpot=$(field TPOT_ms "$metrics")
  toks=$(field tok_s "$metrics")
  if [[ -z $ttft ]]; then
    echo "| $engine | $pp | FAILED (latency probe) | | | $mib |" >> "$OUT"
  else
    echo "| $engine | $pp | $ttft | $tpot | $toks | $mib |" >> "$OUT"
  fi
}

# run_xserv PP -> benchmark xserv on GPUs 0..PP-1 (--pp only when PP > 1).
run_xserv() {
  local pp=$1 cvd pid
  local -a extra=()
  cvd=$(seq -s, 0 $((pp - 1)))
  killall_servers
  drain "$cvd" || echo "pp_clean_bench: GPUs $cvd still busy after drain timeout" >&2
  (( pp > 1 )) && extra=(--pp "$pp")
  # ${arr[@]+...} keeps an empty array safe under `set -u` on bash < 4.4.
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "$XBIN" "$MODEL" \
    --port 8090 --max-seq-len 2048 ${extra[@]+"${extra[@]}"} >"/tmp/x$pp.log" 2>&1 &
  pid=$!
  if ! probe_ready 8090 "$pid"; then
    echo "| xserv | $pp | FAILED (see /tmp/x$pp.log) | | | |" >> "$OUT"
    stop_server "$pid"
    return
  fi
  record_row xserv "$pp" "$cvd"
  stop_server "$pid"
  sleep 3
}

# run_llama PP -> benchmark llama.cpp on GPUs 0..PP-1 (-sm layer when PP > 1).
run_llama() {
  local pp=$1 cvd pid
  local -a sm=(-sm none)
  cvd=$(seq -s, 0 $((pp - 1)))
  killall_servers
  drain "$cvd" || echo "pp_clean_bench: GPUs $cvd still busy after drain timeout" >&2
  (( pp > 1 )) && sm=(-sm layer -ts "$(ones_csv "$pp")")
  CUDA_VISIBLE_DEVICES=$cvd nohup "$LLAMA_BIN" -m "$GGUF" --port 8090 --host 127.0.0.1 \
    -c 2048 --parallel 1 -ngl 999 "${sm[@]}" >"/tmp/l$pp.log" 2>&1 &
  pid=$!
  if ! probe_ready 8090 "$pid"; then
    echo "| llama | $pp | FAILED (see /tmp/l$pp.log) | | | |" >> "$OUT"
    stop_server "$pid"
    return
  fi
  record_row llama "$pp" "$cvd"
  stop_server "$pid"
  sleep 3
}

for pp in $PPS; do run_xserv "$pp"; done
for pp in $PPS; do run_llama "$pp"; done
killall_servers
echo "" >> "$OUT"
echo "PP_CLEAN_DONE" >> "$OUT"
"""Tiny single-stream latency probe over the OpenAI HTTP API.

Usage: python3 pp_time.py BASE_URL "PROMPT"
Prints: TTFT_ms=.. TPOT_ms=.. tok_full=.. tok_s=..

TTFT ~ wall time of a max_tokens=1 request (prefill + 1 token).
TPOT ~ (t_full - t_1) / (tokens_full - tokens_1), using the server's reported
completion_tokens so it is exact even if generation stops early.
"""
import json
import sys
import time
import urllib.request

MODEL_ID = "qwen3-8b"
FULL_MAX_TOKENS = 160


def timed_completion(base, prompt, max_tokens, timeout=600):
    """POST one non-streaming chat completion and time it.

    Returns (elapsed_seconds, completion_tokens); completion_tokens is None
    when the response carries no usage block.
    """
    body = json.dumps({
        "model": MODEL_ID,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0,
        "stream": False,
    }).encode()
    request = urllib.request.Request(base.rstrip("/") + "/v1/chat/completions", body,
                                     {"Content-Type": "application/json"})
    # perf_counter is monotonic, so a wall-clock adjustment mid-request cannot
    # skew (or negate) the measured latency.
    start = time.perf_counter()
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = json.load(response)
    elapsed = time.perf_counter() - start
    return elapsed, (payload.get("usage") or {}).get("completion_tokens")


def format_metrics(t1, c1, t_full, c_full):
    """Build the one-line report parsed by the shell drivers.

    t1/c1 are the seconds and completion tokens of the 1-token request,
    t_full/c_full those of the long request. TPOT and tok_s are "nan" when
    either token count is missing/zero, the long request produced no extra
    tokens, or it was not slower than the short one (no usable time delta).
    """
    ttft = t1 * 1000.0
    if c_full and c1 and c_full > c1 and t_full > t1:
        tpot = (t_full - t1) / (c_full - c1) * 1000.0
        return (f"TTFT_ms={ttft:.1f} TPOT_ms={tpot:.2f} "
                f"tok_full={c_full} tok_s={1000.0 / tpot:.1f}")
    return f"TTFT_ms={ttft:.1f} TPOT_ms=nan tok_full={c_full} tok_s=nan"


def main(argv):
    """CLI entry point; returns the process exit status."""
    if len(argv) < 3:
        print('usage: python3 pp_time.py BASE_URL "PROMPT"', file=sys.stderr)
        return 2
    base, prompt = argv[1], argv[2]
    t1, c1 = timed_completion(base, prompt, 1)
    t_full, c_full = timed_completion(base, prompt, FULL_MAX_TOKENS)
    print(format_metrics(t1, c1, t_full, c_full))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/env bash
# Run the PP=1/2/4 sweep with xserv and llama.cpp CONCURRENTLY on disjoint GPU
# groups: xserv (--pp) on GPUs 0..N-1, llama.cpp (-sm layer) on GPUs 4..4+N-1.
# The 8x5090 box is grouped 0-3 / 4-7 (PHB intra-group), so each engine's P2P
# stays intra-group and the two engines never contend for a GPU.
#
# xserv splits layers across N GPUs and hands off hidden states via NCCL P2P;
# llama.cpp's default `-sm layer` does the analogous layer-wise split.
#
# Run from the repo root on the GPU host. Produces bench-out/pp{1,2,4}-{xserv,llama}.
# Exit status: 0 when every runner succeeded, 1 otherwise.

set -u
MODEL="${MODEL:-/opt/wjh/models/qwen3-8b}"
GGUF="${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}"
LIMIT="${LIMIT:-20}"
MAXSEQ="${MAXSEQ:-2048}"
PPS="${PPS:-1 2 4}"
TASKS="${TASKS:-gsm8k}"

mkdir -p bench-out
failed=0

for PP in $PPS; do
  # Each engine owns one 4-GPU group; a larger degree would spill llama.cpp
  # past GPU 7 and make xserv overlap llama's group.
  if (( PP < 1 || PP > 4 )); then
    echo "PP=$PP skipped: must be 1..4 (one 4-GPU group per engine)" >&2
    failed=1
    continue
  fi
  LD=$(seq -s, 4 $((3 + PP)))   # llama GPUs: 4 / 4,5 / 4,5,6,7
  echo "##### PP=$PP  (xserv GPU 0..$((PP-1))  ||  llama GPU $LD) #####"
  rm -rf "bench-out/pp$PP-xserv" "bench-out/pp$PP-llama"

  python3 -u -m tools.bench.runner --systems xserv --pp "$PP" \
    --xserv-bin ./target/release/xserv-server --xserv-model "$MODEL" \
    --suite quality --quality-tasks "$TASKS" --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/pp$PP-xserv" > "/tmp/pp$PP-xserv.log" 2>&1 &
  XP=$!

  python3 -u -m tools.bench.runner --systems llama.cpp --pp "$PP" --llama-devices "$LD" \
    --llama-bin third_party/llama.cpp/build/bin/llama-server --llama-gguf "$GGUF" \
    --suite quality --quality-tasks "$TASKS" --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/pp$PP-llama" > "/tmp/pp$PP-llama.log" 2>&1 &
  LP=$!

  # `wait A B` only reports B's status, so reap each runner separately.
  wait "$XP"; xrc=$?
  wait "$LP"; lrc=$?
  (( xrc != 0 || lrc != 0 )) && failed=1
  echo "PP=$PP done (xserv rc=$xrc, llama rc=$lrc)"
done
echo ALL_DONE
exit "$failed"
4,5,6,7) so it " @@ -113,7 +116,7 @@ def build_endpoints(args) -> list[SystemEndpoint]: model_id=args.xserv_model_id, launch_cmd=xserv_launch_cmd( args.xserv_bin, model_dir, args.xserv_port, - max_batch=args.max_batch, max_seq_len=args.max_seq_len, tp=args.tp, + max_batch=args.max_batch, max_seq_len=args.max_seq_len, tp=args.tp, pp=args.pp, ), health_path="/health", ready_timeout_s=1200.0, @@ -140,10 +143,10 @@ def build_endpoints(args) -> list[SystemEndpoint]: # so it can run concurrently with xserv on 0..N-1. --split-mode row # then tensor-parallel-splits across exactly these devices. if args.llama_devices: - devs = [d.strip() for d in args.llama_devices.split(",") if d.strip()][: max(args.tp, 1)] + devs = [d.strip() for d in args.llama_devices.split(",") if d.strip()][: max(args.tp, args.pp, 1)] llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(devs)} - elif args.tp > 1: - llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(str(d) for d in range(args.tp))} + elif args.tp > 1 or args.pp > 1: + llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(str(d) for d in range(max(args.tp, args.pp)))} else: llama_env = {} eps.append(SystemEndpoint( @@ -152,7 +155,7 @@ def build_endpoints(args) -> list[SystemEndpoint]: model_id=args.llama_model_id, launch_cmd=llama_cpp_launch_cmd( args.llama_bin, gguf, args.llama_port, - n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len, tp=args.tp, + n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len, tp=args.tp, pp=args.pp, ), launch_env=llama_env, # llama-server's health endpoint also returns 200 only when model is loaded. 
diff --git a/tools/bench/servers.py b/tools/bench/servers.py index bfdf4ee..e07d8de 100644 --- a/tools/bench/servers.py +++ b/tools/bench/servers.py @@ -114,6 +114,7 @@ def xserv_launch_cmd( max_batch: int, max_seq_len: int, tp: int = 1, + pp: int = 1, ) -> list[str]: cmd = [ bin_path, @@ -122,7 +123,9 @@ def xserv_launch_cmd( "--max-batch", str(max_batch), "--max-seq-len", str(max_seq_len), ] - if tp > 1: + if pp > 1: + cmd += ["--pp", str(pp)] # xserv binds stage s -> GPU s internally + elif tp > 1: cmd += ["--tp", str(tp)] # xserv binds rank r -> GPU r internally return cmd @@ -136,6 +139,7 @@ def llama_cpp_launch_cmd( ctx_per_slot: int, n_gpu_layers: int = 99, tp: int = 1, + pp: int = 1, ) -> list[str]: # llama.cpp DIVIDES total -c across --parallel slots: per-slot context is # n_ctx / n_parallel. xserv gives each sequence the full max_seq_len, so to @@ -153,7 +157,10 @@ def llama_cpp_launch_cmd( # NOTE: do NOT pass --log-disable; its startup log reports per-slot # n_ctx, which is exactly the diagnostic that catches ctx misconfig. ] - if tp > 1: + if pp > 1: + # Pipeline / layer split across the visible GPUs (llama.cpp default). + cmd += ["--split-mode", "layer", "-ts", ",".join(["1"] * pp)] + elif tp > 1: # Tensor-parallel split across the visible GPUs (caller restricts the # set via CUDA_VISIBLE_DEVICES in launch_env). Row-split is llama.cpp's # tensor-parallel mode (vs the default layer/pipeline split). diff --git a/tools/bench/summarize_fullq.py b/tools/bench/summarize_fullq.py new file mode 100644 index 0000000..2c4673f --- /dev/null +++ b/tools/bench/summarize_fullq.py @@ -0,0 +1,17 @@ +"""Summarize the full quality matrix: bench-out/fullq-{xserv,llama}-pp{1,2,4}. 
# ---- tools/bench/summarize_fullq.py ----
"""Summarize the full quality matrix: bench-out/fullq-{xserv,llama}-pp{1,2,4}.
Prints one row per (engine, pp, task) with accuracy + latency."""
import glob
import json
import os
import sys


def _latest_summary(run_dir):
    """Return the quality-summary rows of the lexicographically last
    comparison-*.json in run_dir, [] if that file has no quality summary,
    or None when the directory holds no result file at all."""
    files = sorted(glob.glob(os.path.join(run_dir, "comparison-*.json")))
    if not files:
        return None
    with open(files[-1]) as fh:
        doc = json.load(fh)
    return (doc.get("quality") or {}).get("summary") or []


def _num(value):
    """Map a missing/null metric to 0 so the %f formats cannot raise."""
    return 0 if value is None else value


def fullq_lines(base):
    """Return the report lines (header first) for the fullq-* directories under base."""
    lines = ["%-6s %-3s %-9s %-8s %6s %9s %9s %10s" %
             ("engine", "PP", "task", "correct", "acc%", "mean_tok", "TTFT_ms", "TPOT_ms")]
    for eng in ("xserv", "llama"):
        for pp in (1, 2, 4):
            rows = _latest_summary(os.path.join(base, f"fullq-{eng}-pp{pp}"))
            if rows is None:
                lines.append(f"{eng:<6} {pp:<3} (no results)")
                continue
            for r in rows:
                lines.append("%-6s %-3d %-9s %-8s %5.1f%% %9.0f %9.1f %10.2f" % (
                    eng, pp, r["task"], f'{r["n_correct"]}/{r["n_total"]}',
                    _num(r.get("accuracy")) * 100, _num(r.get("mean_completion_tokens")),
                    _num(r.get("mean_ttft_ms")), _num(r.get("mean_tpot_ms"))))
    return lines


def main(argv):
    """Print the fullq report for argv[1] (default: bench-out)."""
    base = argv[1] if len(argv) > 1 else "bench-out"
    print("\n".join(fullq_lines(base)))


if __name__ == "__main__":
    main(sys.argv)


# ---- tools/bench/summarize_pp.py ----
"""Summarize the concurrent PP sweep: bench-out/pp{1,2,4}-{xserv,llama}."""
import glob
import json
import os
import sys


# Same two helpers as summarize_fullq.py: the scripts are standalone files.
def _latest_summary(run_dir):
    """Return the quality-summary rows of the lexicographically last
    comparison-*.json in run_dir, [] if that file has no quality summary,
    or None when the directory holds no result file at all."""
    files = sorted(glob.glob(os.path.join(run_dir, "comparison-*.json")))
    if not files:
        return None
    with open(files[-1]) as fh:
        doc = json.load(fh)
    return (doc.get("quality") or {}).get("summary") or []


def _num(value):
    """Map a missing/null metric to 0 so the %f formats cannot raise."""
    return 0 if value is None else value


def pp_lines(base):
    """Return the report lines (header first) for the pp{N}-{engine} directories under base.

    Directories without a result file are silently skipped.
    """
    lines = ["%-3s %-7s %-9s %-9s %7s %9s %9s %10s %9s" %
             ("PP", "engine", "task", "correct", "acc%", "mean_tok", "TTFT_ms", "TPOT_ms", "wall_s")]
    for pp in (1, 2, 4):
        for sysname in ("xserv", "llama"):
            rows = _latest_summary(os.path.join(base, f"pp{pp}-{sysname}"))
            for r in rows or []:
                lines.append("%-3d %-7s %-9s %-9s %6.1f%% %9.0f %9.1f %10.2f %9.0f" % (
                    pp, sysname, r["task"], f'{r["n_correct"]}/{r["n_total"]}',
                    _num(r.get("accuracy")) * 100, _num(r.get("mean_completion_tokens")),
                    _num(r.get("mean_ttft_ms")), _num(r.get("mean_tpot_ms")),
                    _num(r.get("wall_s"))))
    return lines


def main(argv):
    """Print the PP sweep report for argv[1] (default: bench-out)."""
    base = argv[1] if len(argv) > 1 else "bench-out"
    print("\n".join(pp_lines(base)))


if __name__ == "__main__":
    main(sys.argv)
#!/usr/bin/env bash
# Diagnose pp4 divergence: run single x2 and pp4 x2, same prompt, compare all.
# Writes bench-out/PP_DIAG.md; the raw generations stay in /tmp/{s,p4}_{a,b}.txt.
#
# Env overrides: REPO (default: parent of this script's directory), MODEL.
set -u
cd "${REPO:-$(dirname "$0")/..}" || { echo "pp_diag: cannot cd to repo root" >&2; exit 1; }
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
XBIN=./target/release/xserv-server
P='Explain what a transformer is in machine learning, in 3 sentences.'
D=bench-out/PP_DIAG.md
mkdir -p bench-out
: > "$D"

# Hard-kill any xserv server so each run starts alone on the GPUs.
kall() {
  pkill -9 -f xserv-server 2>/dev/null
  sleep 3
}

# ready PID -> 0 once a real generation on :8090 succeeds (deadline ~1200s),
# 1 if the server process dies first or the deadline passes.
ready() {
  local pid=$1 code
  for _ in $(seq 1 400); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      http://127.0.0.1:8090/v1/chat/completions -H 'Content-Type: application/json' \
      -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' \
      2>/dev/null)
    [[ $code == 200 ]] && return 0
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}

# run OUT_FILE CSV_GPUS [server args...] -> start xserv, generate 128 greedy
# tokens for $P and store the text in OUT_FILE. OUT_FILE is left EMPTY when the
# server or the request fails, so a failure can never "match" another failure.
run() {
  local out=$1 cvd=$2 pid
  shift 2
  : > "$out"
  kall
  CUDA_VISIBLE_DEVICES=$cvd nohup "$XBIN" "$MODEL" --port 8090 --max-seq-len 2048 "$@" >/tmp/d.log 2>&1 &
  pid=$!
  if ! ready "$pid"; then
    echo "run $out (GPUs $cvd): server failed, see /tmp/d.log" | tee -a "$D" >&2
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi
  curl -s --max-time 200 http://127.0.0.1:8090/v1/chat/completions -H 'Content-Type: application/json' \
    -d "{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$P\"}],\"max_tokens\":128,\"temperature\":0,\"stream\":false}" \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' > "$out" 2>/dev/null
  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}

# same_text A B -> IDENTICAL / DIFFER, or INVALID when either file is missing
# or holds no text (a failed run), which must not be reported as a match.
same_text() {
  local f
  for f in "$1" "$2"; do
    if ! grep -q '[^[:space:]]' "$f" 2>/dev/null; then
      echo "INVALID (no output in $f)"
      return 0
    fi
  done
  if cmp -s "$1" "$2"; then echo IDENTICAL; else echo DIFFER; fi
}

run /tmp/s_a.txt 0
run /tmp/s_b.txt 0
run /tmp/p4_a.txt 0,1,2,3 --pp 4
run /tmp/p4_b.txt 0,1,2,3 --pp 4
echo "single_A==single_B: $(same_text /tmp/s_a.txt /tmp/s_b.txt)" | tee -a "$D"
echo "pp4_A==pp4_B: $(same_text /tmp/p4_a.txt /tmp/p4_b.txt)" | tee -a "$D"
echo "single_A==pp4_A: $(same_text /tmp/s_a.txt /tmp/p4_a.txt)" | tee -a "$D"
echo "--- first diff offset single_A vs pp4_A ---" | tee -a "$D"
cmp /tmp/s_a.txt /tmp/p4_a.txt 2>&1 | tee -a "$D"
# wc -c counts bytes, not characters.
echo "--- lengths (bytes) ---" | tee -a "$D"
wc -c /tmp/s_a.txt /tmp/s_b.txt /tmp/p4_a.txt /tmp/p4_b.txt | tee -a "$D"
echo "PP_DIAG_DONE" >> "$D"
#!/usr/bin/env bash
# Definitive PP measurement, strictly sequential, with generated text captured
# for a real correctness byte-compare. Writes bench-out/PP_FINAL.md and per-config
# text files. One server at a time; readiness gated on a real generation.
#
# Env overrides: REPO (default: parent of this script's directory), MODEL, GGUF.
set -u
cd "${REPO:-$(dirname "$0")/..}" || { echo "pp_final: cannot cd to repo root" >&2; exit 1; }
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda-12.9}
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LBIN=third_party/llama.cpp/build/bin/llama-server
XBIN=./target/release/xserv-server
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'
R=bench-out/PP_FINAL.md
mkdir -p bench-out
: > "$R"
log() { echo "$@" >> "$R"; }

# Hard-kill every benchmark server so the next config starts on idle GPUs.
kall() {
  pkill -9 -f xserv-server 2>/dev/null
  pkill -9 -f llama-server 2>/dev/null
  sleep 3
}

# drain CSV_GPUS -> wait until every listed GPU uses < 1500 MiB (max ~180s).
# Returns 0 once drained, 1 on timeout.
drain() {
  local g used busy
  for _ in $(seq 1 90); do
    busy=0
    for g in ${1//,/ }; do
      used=$(nvidia-smi -i "$g" --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null)
      used=${used//[[:space:]]/}
      [[ $used =~ ^[0-9]+$ ]] || used=0
      (( used > 1500 )) && busy=1
    done
    (( busy == 0 )) && return 0
    sleep 2
  done
  return 1
}

# gen PORT MAXTOK -> echoes the response JSON; http code in $GCODE.
# The scratch file is removed first so a failed request echoes nothing
# instead of the previous call's response.
gen() {
  rm -f /tmp/resp.json
  GCODE=$(curl -s -o /tmp/resp.json -w '%{http_code}' --max-time 300 \
    "http://127.0.0.1:$1/v1/chat/completions" -H 'Content-Type: application/json' \
    -d "{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$PROMPT\"}],\"max_tokens\":$2,\"temperature\":0,\"stream\":false}")
  cat /tmp/resp.json 2>/dev/null
}

# ready PORT PID -> 0 once a real generation succeeds (deadline ~1200s),
# 1 if the server process dies first or the deadline passes.
ready() {
  local port=$1 pid=$2 code
  for _ in $(seq 1 400); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
      -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' \
      2>/dev/null)
    [[ $code == 200 ]] && return 0
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}

# snap CSV_GPUS -> space-separated memory.used (MiB), one value per GPU.
snap() {
  local g
  for g in ${1//,/ }; do
    nvidia-smi -i "$g" --query-gpu=memory.used --format=csv,noheader,nounits
  done | paste -sd' ' -
}

# completion_tokens FILE -> usage.completion_tokens of a saved response
# (prints nothing when the file is missing or malformed).
completion_tokens() {
  python3 -c 'import json,sys;print(json.load(open(sys.argv[1]))["usage"]["completion_tokens"])' "$1" 2>/dev/null
}

# lat PORT -> "TTFT_ms=.. TPOT_ms=.. tok_s=.. tokF=..".
# TTFT from a 1-token request, TPOT from a 96-token request using the
# server-reported completion_tokens. Values are handed to Python as arguments
# (not spliced into source); TPOT/tok_s are nan when there is no usable delta.
lat() {
  local port=$1 t0 t1 ta tb c1 cF
  t0=$(date +%s.%N); gen "$port" 1 >/tmp/g1.json; t1=$(date +%s.%N)
  c1=$(completion_tokens /tmp/g1.json)
  [[ $c1 =~ ^[0-9]+$ ]] || c1=1
  ta=$(date +%s.%N); gen "$port" 96 >/tmp/gF.json; tb=$(date +%s.%N)
  cF=$(completion_tokens /tmp/gF.json)
  [[ $cF =~ ^[0-9]+$ ]] || cF=0
  python3 - "$t0" "$t1" "$ta" "$tb" "$c1" "$cF" <<'PY'
import sys
t0, t1, ta, tb = (float(x) for x in sys.argv[1:5])
c1, cF = int(sys.argv[5]), int(sys.argv[6])
nan = float('nan')
ttft = (t1 - t0) * 1000
extra_tokens = cF - c1
extra_seconds = (tb - ta) - (t1 - t0)
tpot = extra_seconds / extra_tokens * 1000 if extra_tokens > 0 else nan
tok_s = 1000.0 / tpot if extra_tokens > 0 and tpot > 0 else nan
print('TTFT_ms=%.1f TPOT_ms=%.2f tok_s=%.1f tokF=%d' % (ttft, tpot, tok_s, cF))
PY
}

# same_text A B -> IDENTICAL / DIFFER, or INVALID when either file is missing
# or holds no text (a failed run), which must not be reported as a match.
same_text() {
  local f
  for f in "$1" "$2"; do
    if ! grep -q '[^[:space:]]' "$f" 2>/dev/null; then
      echo "INVALID (no output in $f)"
      return 0
    fi
  done
  if cmp -s "$1" "$2"; then echo IDENTICAL; else echo DIFFER; fi
}

# xserv PP -> measure xserv on GPUs 0..PP-1 and capture 64 greedy tokens in
# /tmp/xtext_PP.txt for the correctness compare.
xserv() {
  local pp=$1 cvd pid mib L
  local -a ex=()
  cvd=$(seq -s, 0 $((pp - 1)))
  # Never let a previous invocation's text stand in for a run that fails now.
  rm -f "/tmp/xtext_$pp.txt"
  kall
  drain "$cvd"
  (( pp > 1 )) && ex=(--pp "$pp")
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "$XBIN" "$MODEL" \
    --port 8090 --max-seq-len 2048 ${ex[@]+"${ex[@]}"} >"/tmp/xf$pp.log" 2>&1 &
  pid=$!
  if ! ready 8090 "$pid"; then
    log "xserv pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi
  mib=$(snap "$cvd")
  gen 8090 64 \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' \
      > "/tmp/xtext_$pp.txt" 2>/dev/null
  L=$(lat 8090)
  log "xserv pp=$pp | VRAM=$mib MiB | $L"
  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}

# llama PP -> measure llama.cpp on GPUs 0..PP-1 (-sm layer when PP > 1).
llama() {
  local pp=$1 cvd pid mib L ts
  local -a sm=(-sm none)
  cvd=$(seq -s, 0 $((pp - 1)))
  kall
  drain "$cvd"
  if (( pp > 1 )); then
    ts=$(printf '1,%.0s' $(seq 1 "$pp"))
    sm=(-sm layer -ts "${ts%,}")
  fi
  CUDA_VISIBLE_DEVICES=$cvd nohup "$LBIN" -m "$GGUF" --port 8090 --host 127.0.0.1 \
    -c 2048 --parallel 1 -ngl 999 "${sm[@]}" >"/tmp/lf$pp.log" 2>&1 &
  pid=$!
  if ! ready 8090 "$pid"; then
    log "llama pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi
  mib=$(snap "$cvd")
  L=$(lat 8090)
  log "llama pp=$pp | VRAM=$mib MiB | $L"
  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}

log "# PP FINAL — $(date)"
for pp in 1 2 4; do xserv "$pp"; done
log ""
log "## correctness (xserv greedy, byte compare of generated text)"
log "single==pp2: $(same_text /tmp/xtext_1.txt /tmp/xtext_2.txt)"
log "single==pp4: $(same_text /tmp/xtext_1.txt /tmp/xtext_4.txt)"
log "single_text: $(head -c 200 /tmp/xtext_1.txt 2>/dev/null)"
log "pp2_text:    $(head -c 200 /tmp/xtext_2.txt 2>/dev/null)"
log ""
for pp in 1 2 4; do llama "$pp"; done
kall
log ""
log "PP_FINAL_DONE"
#!/usr/bin/env bash
# llama.cpp PP=1/2/4 quality (aime2025+gsm8k, 30 each) on physical GPUs 4-7,
# parallel with the xserv matrix on 0-3. Pass --llama-devices so the runner pins
# CUDA_VISIBLE_DEVICES to 4.. (it otherwise forces 0..N-1). Distinct port + dirs.
#
# Env overrides: REPO (default: parent of this script's directory), GGUF, PORT.
set -u
cd "${REPO:-$(dirname "$0")/..}" || { echo "pp_llama_47: cannot cd to repo root" >&2; exit 1; }
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda-12.9}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LBIN=third_party/llama.cpp/build/bin/llama-server
PORT=${PORT:-18181}
PROG=bench-out/LLAMA47_PROGRESS.md
mkdir -p bench-out
: > "$PROG"
echo "# llama on GPU 4-7 — $(date)" >> "$PROG"

# count_json DIR -> number of comparison-*.json files in DIR (0 if none).
count_json() {
  local -a found
  shopt -s nullglob
  found=("$1"/comparison-*.json)
  shopt -u nullglob
  echo "${#found[@]}"
}

# Only kill the llama-server bound to OUR port; other servers on the box
# (e.g. the xserv matrix running in parallel) are left alone.
kill_own_server() {
  pkill -9 -f "llama-server.*$PORT" 2>/dev/null
}

for pp in 1 2 4; do
  dev=$(seq -s, 4 $((3 + pp)))
  out=bench-out/fullq-llama-pp$pp
  rm -rf "$out"
  echo "=== START llama pp=$pp dev=$dev $(date +%H:%M:%S) ===" >> "$PROG"
  kill_own_server
  sleep 2
  python3 -u -m tools.bench.runner --systems llama.cpp --pp "$pp" --llama-devices "$dev" \
    --llama-bin "$LBIN" --llama-gguf "$GGUF" --llama-port "$PORT" \
    --suite quality --quality-tasks aime2025,gsm8k --quality-limit 30 \
    --max-batch 1 --max-seq-len 4096 --out-dir "$out" >"/tmp/fql-$pp.log" 2>&1
  rc=$?
  echo "=== END llama pp=$pp rc=$rc $(date +%H:%M:%S) $(count_json "$out") json ===" >> "$PROG"
done
kill_own_server
echo "LLAMA47_DONE" >> "$PROG"
#!/usr/bin/env bash
# FULL quality matrix, strictly sequential (one server at a time, same GPU group
# 0..N-1, no concurrency). Both engines x PP=1/2/4 x {aime2025, gsm8k}.
# Each (engine,pp) invocation runs runner.py once (it does start->both tasks->stop).
# Writes bench-out/fullq-<engine>-pp<N>/comparison-*.json ; progress goes to
# bench-out/FULLQ_PROGRESS.md.
#
# Env overrides: REPO (default: parent of this script's directory), MODEL, GGUF,
# AIME_LIMIT, GSM_LIMIT, QUALITY_LIMIT, MAXSEQ, ENGINES (default "xserv llama").
# Set ENGINES=xserv when pp_llama_47.sh covers llama.cpp in parallel on GPUs
# 4-7: this script then leaves llama-server processes and llama runners alone.
set -u
cd "${REPO:-$(dirname "$0")/..}" || { echo "pp_quality_full: cannot cd to repo root" >&2; exit 1; }
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda-12.9}
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
XBIN=./target/release/xserv-server
LBIN=third_party/llama.cpp/build/bin/llama-server
AIME_LIMIT=${AIME_LIMIT:-30}
GSM_LIMIT=${GSM_LIMIT:-20}
MAXSEQ=${MAXSEQ:-4096}
ENGINES=${ENGINES:-xserv llama}
# The runner takes ONE --quality-limit for all tasks, so per-task limits cannot
# be honoured separately: pass the larger one (30 by default, i.e. all of
# aime2025 and the first 30 gsm8k problems; the report's n_total shows what ran).
QUALITY_LIMIT=${QUALITY_LIMIT:-$(( AIME_LIMIT > GSM_LIMIT ? AIME_LIMIT : GSM_LIMIT ))}
PROG=bench-out/FULLQ_PROGRESS.md
mkdir -p bench-out
: > "$PROG"
echo "# full quality matrix — $(date)" >> "$PROG"

# Kill the servers and leftover runners of the engines THIS script benchmarks.
# The runner is started as `python3 -m tools.bench.runner --systems <engine>`,
# so that is the command-line text to match (a "runner.py" pattern never does).
kall() {
  local eng
  for eng in $ENGINES; do
    case $eng in
      xserv)
        pkill -9 -f xserv-server 2>/dev/null
        pkill -9 -f 'tools\.bench\.runner --systems xserv' 2>/dev/null
        ;;
      llama)
        pkill -9 -f llama-server 2>/dev/null
        pkill -9 -f 'tools\.bench\.runner --systems llama' 2>/dev/null
        ;;
    esac
  done
  sleep 4
}

# drain MAX_GPU_INDEX -> wait until GPUs 0..MAX all use < 1500 MiB (max ~180s).
# Returns 0 once drained, 1 on timeout.
drain() {
  local g used busy
  for _ in $(seq 1 90); do
    busy=0
    for g in $(seq 0 "$1"); do
      used=$(nvidia-smi -i "$g" --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null)
      used=${used//[[:space:]]/}
      [[ $used =~ ^[0-9]+$ ]] || used=0
      (( used > 1500 )) && busy=1
    done
    (( busy == 0 )) && return 0
    sleep 2
  done
  return 1
}

# count_json DIR -> number of comparison-*.json files in DIR (0 if none).
count_json() {
  local -a found
  shopt -s nullglob
  found=("$1"/comparison-*.json)
  shopt -u nullglob
  echo "${#found[@]}"
}

# run_one ENGINE PP -> one runner invocation (start -> both tasks -> stop) on
# GPUs 0..PP-1, logging START/END lines with the runner's exit status.
run_one() {
  local eng=$1 pp=$2 dev out rc
  local -a sys_args
  case $eng in
    xserv) sys_args=(--systems xserv --xserv-bin "$XBIN" --xserv-model "$MODEL") ;;
    llama) sys_args=(--systems llama.cpp --llama-bin "$LBIN" --llama-gguf "$GGUF") ;;
    *)
      echo "pp_quality_full: unknown engine '$eng' (expected xserv or llama)" >&2
      return 2
      ;;
  esac
  dev=$(seq -s, 0 $((pp - 1)))
  kall
  drain $((pp - 1))
  out=bench-out/fullq-$eng-pp$pp
  rm -rf "$out"
  echo "=== START $eng pp=$pp on GPU $dev $(date +%H:%M:%S) ===" >> "$PROG"
  # `--systems` stays first so kall's command-line pattern keeps matching.
  python3 -u -m tools.bench.runner "${sys_args[@]}" --pp "$pp" \
    --suite quality --quality-tasks aime2025,gsm8k --quality-limit "$QUALITY_LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "$out" >"/tmp/fq-$eng-$pp.log" 2>&1
  rc=$?
  echo "=== END $eng pp=$pp rc=$rc $(date +%H:%M:%S) $(count_json "$out") json ===" >> "$PROG"
}

for eng in $ENGINES; do
  for pp in 1 2 4; do run_one "$eng" "$pp"; done
done
kall
echo "FULLQ_DONE" >> "$PROG"
#!/usr/bin/env bash
# One-shot pipeline-parallel (PP) verification + benchmark for Qwen3-8B.
# Run on the GPU host from the repo root. Writes bench-out/PP_RESULTS.md.
#
#   1. NCCL P2P send/recv + AllReduce unit tests
#   2. correctness: greedy (temp=0) output single == --pp 2 == --pp 4 (byte compare)
#   3. per-GPU VRAM (generation-gated; weights + a minimal KV pool, ~1/P per card)
#   4. quality+latency sweep vs llama.cpp (-sm layer), gsm8k
#
# Env: MODEL, GGUF, LIMIT (problems), PPS (e.g. "1 2 4") may be overridden.
set -u
cd "$(dirname "$0")/.." || { echo "pp_verify: cannot cd to repo root" >&2; exit 1; }
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda-12.9}
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LIMIT=${LIMIT:-20}
PPS=${PPS:-1 2 4}
BIN=./target/release/xserv-server
R=bench-out/PP_RESULTS.md
mkdir -p bench-out
: > "$R"
log() { echo "$@" | tee -a "$R"; }

pkill -9 -f xserv-server 2>/dev/null
pkill -9 -f llama-server 2>/dev/null
sleep 3

log "# PP verification — $(date)"

# ---- 1. NCCL P2P + AllReduce unit tests ----
log ""; log "## 1. NCCL P2P + AllReduce test"
cargo test -p xserv-distributed --release -- --test-threads=1 >/tmp/pp_t.log 2>&1
log "  cargo test exit=$?"
grep -hE "test result|pp_send_recv|allreduce_two_gpu" /tmp/pp_t.log | sed 's/^/  /' | tee -a "$R"

# wait_ready PORT PID -> 0 when a real generation succeeds (xserv's /health
# returns 200 before the model is loaded, so gate on a generation, not /health).
# Returns 1 if the server dies or ~1200s pass.
wait_ready() {
  local port=$1 pid=$2 code
  for _ in $(seq 1 400); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
      -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' \
      2>/dev/null)
    [[ $code == 200 ]] && return 0
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}

# stop_server PID -> kill one server we started and reap it.
stop_server() {
  kill -9 "$1" 2>/dev/null
  wait "$1" 2>/dev/null
}

# same_text A B -> IDENTICAL / DIFFER, or INVALID when either file is missing
# or holds no text (a failed run), which must not be reported as a match.
same_text() {
  local f
  for f in "$1" "$2"; do
    if ! grep -q '[^[:space:]]' "$f" 2>/dev/null; then
      echo "INVALID (no output in $f)"
      return 0
    fi
  done
  if cmp -s "$1" "$2"; then echo IDENTICAL; else echo DIFFER; fi
}

# ---- 2. correctness ----
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'

# gen PORT CSV_GPUS [server args...] -> start xserv and print 64 greedy tokens
# for $PROMPT on stdout. Failures are reported on stderr only, so the captured
# file stays empty and is flagged INVALID rather than compared.
gen() {
  local port=$1 cvd=$2 pid
  shift 2
  CUDA_VISIBLE_DEVICES=$cvd nohup "$BIN" "$MODEL" --port "$port" --max-seq-len 2048 "$@" >"/tmp/pp_s$port.log" 2>&1 &
  pid=$!
  if ! wait_ready "$port" "$pid"; then
    echo "(server $port failed, see /tmp/pp_s$port.log)" >&2
    stop_server "$pid"
    return
  fi
  curl -s --max-time 200 "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
    -d "{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$PROMPT\"}],\"max_tokens\":64,\"temperature\":0,\"stream\":false}" \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' 2>/dev/null
  stop_server "$pid"
  sleep 3
}
gen 8091 0 > /tmp/o_single.txt
gen 8092 0,1 --pp 2 > /tmp/o_pp2.txt
gen 8093 0,1,2,3 --pp 4 > /tmp/o_pp4.txt
log ""; log "## 2. Correctness (greedy temp=0, byte compare)"
log "  single==pp2: $(same_text /tmp/o_single.txt /tmp/o_pp2.txt)"
log "  single==pp4: $(same_text /tmp/o_single.txt /tmp/o_pp4.txt)"
log "  single text: $(head -c 160 /tmp/o_single.txt)"

# ---- 3. per-GPU VRAM (generation-gated, KV pool capped so all configs comparable) ----
log ""; log "## 3. Per-GPU VRAM (XSERV_MAX_KV_BLOCKS=160; weights + minimal KV)"

# snap CSV_GPUS -> space-separated memory.used (MiB), one query per GPU like
# the sibling PP scripts.
snap() {
  local g
  for g in ${1//,/ }; do
    nvidia-smi -i "$g" --query-gpu=memory.used --format=csv,noheader,nounits
  done | paste -sd' ' -
}

# vram LABEL CSV_GPUS PORT [server args...] -> start xserv with a capped KV
# pool, run one short generation, then log a stabilized per-GPU snapshot
# (re-sampled every 2s until two consecutive readings agree, max 12 samples).
vram() {
  local label=$1 cvd=$2 port=$3 pid now="" prev=""
  shift 3
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "$BIN" "$MODEL" --port "$port" --max-seq-len 2048 "$@" >"/tmp/pp_v$port.log" 2>&1 &
  pid=$!
  if ! wait_ready "$port" "$pid"; then
    log "  $label: server failed"
    stop_server "$pid"
    return
  fi
  curl -s --max-time 120 "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
    -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":8,"temperature":0,"stream":false}' >/dev/null
  for _ in $(seq 1 12); do
    now=$(snap "$cvd")
    [[ $now == "$prev" ]] && break
    prev=$now
    sleep 2
  done
  log "  $label ($cvd): $now MiB"
  stop_server "$pid"
  sleep 5
}
vram single 0 8094
vram pp2 0,1 8095 --pp 2
vram pp4 0,1,2,3 8096 --pp 4

# ---- 4. sweep vs llama.cpp ----
log ""; log "## 4. Sweep (gsm8k $LIMIT, xserv --pp 0..N-1 vs llama -sm layer 4..)"
PPS="$PPS" LIMIT="$LIMIT" TASKS=gsm8k bash tools/bench/run_pp_parallel.sh >/tmp/pp_sweep.log 2>&1
log "  sweep exit=$? (log: /tmp/pp_sweep.log)"
log '```'
python3 tools/bench/summarize_pp.py bench-out >> "$R" 2>&1
log '```'
log ""; log "PP_VERIFY_DONE"