bench: PP harness (xserv --pp vs llama.cpp -sm layer)
runner/servers: add --pp for both engines (xserv --pp N; llama.cpp -sm layer over N GPUs). New drivers: pp_final.sh (sequential latency + per-GPU VRAM + byte-exact correctness), pp_diag.sh (single x2 vs pp4 x2 determinism control), pp_quality_full.sh / pp_llama_47.sh (AIME+GSM8K matrix, xserv on 0-3 || llama on 4-7), summarize_pp/summarize_fullq, pp_time.py latency probe. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
89
tools/bench/pp_clean_bench.sh
Normal file
89
tools/bench/pp_clean_bench.sh
Normal file
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env bash
# Clean, strictly-sequential single-stream latency + per-GPU VRAM for PP.
# One server at a time. Readiness = first SUCCESSFUL generation (xserv's /health
# returns 200 before the model finishes loading, so we must not gate on it).
# Snapshots are therefore always post-load. Writes bench-out/PP_CLEAN.md.
#
# Env overrides: MODEL, GGUF, PPS (default "1 2 4"), LLAMA_BIN, CUDA_HOME.
set -u

# Every relative path below (target/, third_party/, tools/, bench-out/) is
# resolved from the repo root, two levels above this script. Abort if we cannot
# get there instead of truncating files and launching servers elsewhere.
cd "$(dirname "$0")/../.." || {
  echo "pp_clean_bench: cannot cd to repo root" >&2
  exit 1
}

export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda-12.9}

MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LLAMA_BIN=${LLAMA_BIN:-third_party/llama.cpp/build/bin/llama-server}
XBIN=./target/release/xserv-server
PPS=${PPS:-1 2 4}
PROMPT='Write a detailed paragraph explaining how GPUs accelerate neural network training.'
OUT=bench-out/PP_CLEAN.md

# Start a fresh report: title, blank line, then the markdown table header that
# the per-run rows are appended under.
mkdir -p bench-out
: > "$OUT"
echo "# PP clean single-stream latency + VRAM — $(date)" >> "$OUT"
echo "" >> "$OUT"
echo "| engine | PP | TTFT_ms | TPOT_ms | tok/s | per-GPU VRAM (MiB) |" >> "$OUT"
echo "|--------|----|---------|---------|-------|--------------------|" >> "$OUT"
|
||||
|
||||
# killall_servers
# SIGKILL every process whose command line mentions xserv-server or
# llama-server (errors, e.g. "no match", are discarded), then pause 3 s.
killall_servers() {
  local target
  for target in xserv-server llama-server; do
    pkill -9 -f "$target" 2>/dev/null
  done
  sleep 3
}
|
||||
|
||||
# drain GPUS_CSV
# Poll nvidia-smi until every GPU in the comma-separated list reports at most
# 1500 MiB used. Up to 60 polls, 2 s apart (~120 s).
# Returns: 0 as soon as all listed GPUs are at or below the threshold,
#          1 if they never get there within the polling budget.
drain() {
  local gpus=$1 attempt busy gpu used
  for ((attempt = 0; attempt < 60; attempt++)); do
    busy=0
    for gpu in ${gpus//,/ }; do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      used=${used//[[:space:]]/}
      # An empty or non-numeric reading cannot be compared; count it as 0 so a
      # broken query never blocks the run (and never trips an integer error).
      case $used in
        '' | *[!0-9]*) used=0 ;;
      esac
      if [ "$used" -gt 1500 ]; then busy=1; fi
    done
    if [ "$busy" -eq 0 ]; then return 0; fi
    sleep 2
  done
  return 1
}
|
||||
|
||||
# probe_ready PORT PID
# Readiness probe: POST a 1-token chat completion to 127.0.0.1:PORT and succeed
# once the reported HTTP status contains 200. Makes at most 400 attempts with a
# 3 s pause between them (each attempt may itself take up to 8 s).
# Returns: 0 when a generation succeeds; 1 if PID is no longer alive or the
#          attempts run out.
probe_ready() {
  local port=$1 pid=$2
  local payload='{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}'
  local status tries=0
  while [ "$tries" -lt 400 ]; do
    status=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
      -d "$payload" 2>/dev/null)
    case $status in
      *200*) return 0 ;;
    esac
    # No point waiting for a server that has already exited.
    if ! kill -0 "$pid" 2>/dev/null; then
      return 1
    fi
    sleep 3
    tries=$((tries + 1))
  done
  return 1
}
|
||||
|
||||
# vram GPUS_CSV
# Print a stabilized snapshot of memory.used for the listed GPUs as one
# space-separated line. Re-samples every 2 s (at most 12 samples) and stops as
# soon as two consecutive samples are identical; the last sample is printed.
vram() {
  local gpus=$1
  local current previous="" samples=0 gpu
  while [ "$samples" -lt 12 ]; do
    current=$(
      for gpu in ${gpus//,/ }; do
        nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits
      done | paste -sd' '
    )
    if [ "$current" = "$previous" ]; then
      break
    fi
    previous=$current
    sleep 2
    samples=$((samples + 1))
  done
  echo "$current"
}
|
||||
|
||||
# run_xserv PP
# Benchmark xserv at pipeline-parallel degree PP on GPUs 0..PP-1 and append one
# table row to $OUT (or a FAILED row pointing at /tmp/xPP.log).
# Globals read: XBIN, MODEL, PROMPT, OUT.
# Steps: kill stale servers, wait for the GPUs to drain, launch the server,
# wait for a real generation, snapshot VRAM, run the latency probe, tear down.
run_xserv() {
  local pp=$1 cvd
  cvd=$(seq -s, 0 $((pp - 1)))
  killall_servers
  drain "$cvd"

  # Build argv as an array so a MODEL path containing spaces stays one
  # argument; --pp is only passed for PP > 1.
  local -a cmd=("$XBIN" "$MODEL" --port 8090 --max-seq-len 2048)
  if [ "$pp" -gt 1 ]; then cmd+=(--pp "$pp"); fi
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" >"/tmp/x$pp.log" 2>&1 &
  local pid=$!

  if ! probe_ready 8090 "$pid"; then
    echo "| xserv | $pp | FAILED (see /tmp/x$pp.log) | | | |" >> "$OUT"
    # Kill and reap so the dead server cannot linger into the next run.
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi

  local mib m ttft tpot toks
  mib=$(vram "$cvd")
  m=$(python3 tools/bench/pp_time.py http://127.0.0.1:8090 "$PROMPT")
  # pp_time.py prints "TTFT_ms=.. TPOT_ms=.. tok_full=.. tok_s=.."; TPOT and
  # tok_s may be the literal "nan", hence the a-z in those patterns.
  ttft=$(echo "$m" | sed -n 's/.*TTFT_ms=\([0-9.]*\).*/\1/p')
  tpot=$(echo "$m" | sed -n 's/.*TPOT_ms=\([0-9.a-z]*\).*/\1/p')
  toks=$(echo "$m" | sed -n 's/.*tok_s=\([0-9.a-z]*\).*/\1/p')
  echo "| xserv | $pp | $ttft | $tpot | $toks | $mib |" >> "$OUT"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
||||
|
||||
# run_llama PP
# Benchmark llama.cpp's llama-server on GPUs 0..PP-1 and append one table row
# to $OUT (or a FAILED row pointing at /tmp/lPP.log).
# Globals read: LLAMA_BIN, GGUF, PROMPT, OUT.
# PP == 1 runs with "-sm none"; PP > 1 runs "-sm layer" with an even tensor
# split ("-ts 1,1,...", one entry per GPU).
run_llama() {
  local pp=$1 cvd ts
  cvd=$(seq -s, 0 $((pp - 1)))
  killall_servers
  drain "$cvd"

  # Array argv keeps LLAMA_BIN / GGUF intact even if the paths contain spaces.
  local -a cmd=("$LLAMA_BIN" -m "$GGUF" --port 8090 --host 127.0.0.1
    -c 2048 --parallel 1 -ngl 999)
  if [ "$pp" -gt 1 ]; then
    # printf repeats "1," once per seq value; strip the trailing comma.
    ts=$(printf '1%.0s,' $(seq 1 "$pp"))
    cmd+=(-sm layer -ts "${ts%,}")
  else
    cmd+=(-sm none)
  fi
  CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" >"/tmp/l$pp.log" 2>&1 &
  local pid=$!

  if ! probe_ready 8090 "$pid"; then
    echo "| llama | $pp | FAILED (see /tmp/l$pp.log) | | | |" >> "$OUT"
    # Kill and reap so the dead server cannot linger into the next run.
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi

  local mib m ttft tpot toks
  mib=$(vram "$cvd")
  m=$(python3 tools/bench/pp_time.py http://127.0.0.1:8090 "$PROMPT")
  # TPOT and tok_s may be the literal "nan", hence the a-z in those patterns.
  ttft=$(echo "$m" | sed -n 's/.*TTFT_ms=\([0-9.]*\).*/\1/p')
  tpot=$(echo "$m" | sed -n 's/.*TPOT_ms=\([0-9.a-z]*\).*/\1/p')
  toks=$(echo "$m" | sed -n 's/.*tok_s=\([0-9.a-z]*\).*/\1/p')
  echo "| llama | $pp | $ttft | $tpot | $toks | $mib |" >> "$OUT"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
||||
|
||||
# Sweep every requested PP degree: all xserv runs first, then all llama.cpp
# runs, so only one engine is ever resident. $PPS is split on whitespace on
# purpose. Finish with a blank line and a completion marker in the report.
for pp in $PPS; do
  run_xserv "$pp"
done
for pp in $PPS; do
  run_llama "$pp"
done
killall_servers
{
  echo ""
  echo "PP_CLEAN_DONE"
} >> "$OUT"
|
||||
44
tools/bench/pp_time.py
Normal file
44
tools/bench/pp_time.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""Tiny single-stream latency probe over the OpenAI HTTP API.
|
||||
|
||||
Usage: python3 pp_time.py BASE_URL "PROMPT"
|
||||
Prints: TTFT_ms=.. TPOT_ms=.. tok_full=.. tok_s=..
|
||||
|
||||
TTFT ~ wall time of a max_tokens=1 request (prefill + 1 token).
|
||||
TPOT ~ (t_full - t_1) / (tokens_full - tokens_1), using the server's reported
|
||||
completion_tokens so it is exact even if generation stops early.
|
||||
"""
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
|
||||
# CLI arguments: BASE_URL (trailing slashes stripped) and the user prompt.
base = sys.argv[1].rstrip("/")
prompt = sys.argv[2]


def req(max_tokens):
    """POST one non-streaming, temperature-0 chat completion and time it.

    Args:
        max_tokens: value sent as the request's ``max_tokens``.

    Returns:
        ``(seconds, completion_tokens)`` where ``seconds`` spans sending the
        request through parsing the JSON reply, and ``completion_tokens`` is
        the server-reported ``usage.completion_tokens`` (``None`` if absent).

    Raises:
        urllib.error.URLError / HTTPError on transport or HTTP failure and
        json.JSONDecodeError on a malformed body (none are caught here).
    """
    body = json.dumps({
        "model": "qwen3-8b",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0,
        "stream": False,
    }).encode()
    r = urllib.request.Request(base + "/v1/chat/completions", body,
                               {"Content-Type": "application/json"})
    # perf_counter is monotonic, so a wall-clock adjustment during the request
    # cannot distort (or make negative) the measured duration.
    t = time.perf_counter()
    # Context manager closes the HTTP response instead of leaking the socket.
    with urllib.request.urlopen(r, timeout=600) as resp:
        d = json.load(resp)
        dt = time.perf_counter() - t
    # Tolerate both a missing "usage" key and an explicit "usage": null.
    ct = (d.get("usage") or {}).get("completion_tokens")
    return dt, ct
|
||||
|
||||
|
||||
# Two probes: a 1-token request (prefill + first token) and a 160-token request.
t1, c1 = req(1)
tF, cF = req(160)
ttft = t1 * 1000.0
# TPOT needs both token counts to be known and non-zero, and the long request
# to have produced strictly more tokens than the short one; otherwise print nan.
if cF and c1 and cF > c1:
    tpot = (tF - t1) / (cF - c1) * 1000.0
    print(f"TTFT_ms={ttft:.1f} TPOT_ms={tpot:.2f} tok_full={cF} tok_s={1000.0/tpot:.1f}")
else:
    print(f"TTFT_ms={ttft:.1f} TPOT_ms=nan tok_full={cF} tok_s=nan")
|
||||
42
tools/bench/run_pp_parallel.sh
Normal file
42
tools/bench/run_pp_parallel.sh
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env bash
# Run the PP=1/2/4 sweep with xserv and llama.cpp CONCURRENTLY on disjoint GPU
# groups: xserv (--pp) on GPUs 0..N-1, llama.cpp (-sm layer) on GPUs 4..4+N-1.
# The 8x5090 box is grouped 0-3 / 4-7 (PHB intra-group), so each engine's P2P
# stays intra-group and the two engines never contend for a GPU.
#
# xserv splits layers across N GPUs and hands off hidden states via NCCL P2P;
# llama.cpp's default `-sm layer` does the analogous layer-wise split.
#
# Run from the repo root on the GPU host. Produces bench-out/pp{1,2,4}-{xserv,llama}.
# Env overrides: MODEL, GGUF, LIMIT, MAXSEQ, PPS, TASKS.

set -u
MODEL="${MODEL:-/opt/wjh/models/qwen3-8b}"
GGUF="${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}"
LIMIT="${LIMIT:-20}"
MAXSEQ="${MAXSEQ:-2048}"
PPS="${PPS:-1 2 4}"
TASKS="${TASKS:-gsm8k}"

for PP in $PPS; do
  LD=$(seq -s, 4 $((3 + PP)))  # llama GPUs: 4 / 4,5 / 4,5,6,7
  echo "##### PP=$PP (xserv GPU 0..$((PP-1)) || llama GPU $LD) #####"
  rm -rf "bench-out/pp$PP-xserv" "bench-out/pp$PP-llama"

  python3 -u -m tools.bench.runner --systems xserv --pp "$PP" \
    --xserv-bin ./target/release/xserv-server --xserv-model "$MODEL" \
    --suite quality --quality-tasks "$TASKS" --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/pp$PP-xserv" > "/tmp/pp$PP-xserv.log" 2>&1 &
  XP=$!

  python3 -u -m tools.bench.runner --systems llama.cpp --pp "$PP" --llama-devices "$LD" \
    --llama-bin third_party/llama.cpp/build/bin/llama-server --llama-gguf "$GGUF" \
    --suite quality --quality-tasks "$TASKS" --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/pp$PP-llama" > "/tmp/pp$PP-llama.log" 2>&1 &
  LP=$!

  # Wait on each runner separately: `wait A B` only reports the status of the
  # last PID, which would hide an xserv failure behind a llama success.
  wait "$XP"; xrc=$?
  wait "$LP"; lrc=$?
  if [ "$xrc" -ne 0 ]; then
    echo "WARN: PP=$PP xserv runner exited $xrc (see /tmp/pp$PP-xserv.log)" >&2
  fi
  if [ "$lrc" -ne 0 ]; then
    echo "WARN: PP=$PP llama runner exited $lrc (see /tmp/pp$PP-llama.log)" >&2
  fi
  echo "PP=$PP done"
done
echo ALL_DONE
|
||||
@@ -72,6 +72,9 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--tp", type=int, default=1,
|
||||
help="Tensor-parallel degree for BOTH engines (xserv --tp N; "
|
||||
"llama.cpp --split-mode row over the first N GPUs).")
|
||||
p.add_argument("--pp", type=int, default=1,
|
||||
help="Pipeline-parallel degree for BOTH engines (xserv --pp N; "
|
||||
"llama.cpp --split-mode layer over the first N GPUs).")
|
||||
p.add_argument("--llama-devices", default=None,
|
||||
help="Comma list of GPU ordinals for llama.cpp (first --tp used). "
|
||||
"Lets llama run on a disjoint GPU group (e.g. 4,5,6,7) so it "
|
||||
@@ -113,7 +116,7 @@ def build_endpoints(args) -> list[SystemEndpoint]:
|
||||
model_id=args.xserv_model_id,
|
||||
launch_cmd=xserv_launch_cmd(
|
||||
args.xserv_bin, model_dir, args.xserv_port,
|
||||
max_batch=args.max_batch, max_seq_len=args.max_seq_len, tp=args.tp,
|
||||
max_batch=args.max_batch, max_seq_len=args.max_seq_len, tp=args.tp, pp=args.pp,
|
||||
),
|
||||
health_path="/health",
|
||||
ready_timeout_s=1200.0,
|
||||
@@ -140,10 +143,10 @@ def build_endpoints(args) -> list[SystemEndpoint]:
|
||||
# so it can run concurrently with xserv on 0..N-1. --split-mode row
|
||||
# then tensor-parallel-splits across exactly these devices.
|
||||
if args.llama_devices:
|
||||
devs = [d.strip() for d in args.llama_devices.split(",") if d.strip()][: max(args.tp, 1)]
|
||||
devs = [d.strip() for d in args.llama_devices.split(",") if d.strip()][: max(args.tp, args.pp, 1)]
|
||||
llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(devs)}
|
||||
elif args.tp > 1:
|
||||
llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(str(d) for d in range(args.tp))}
|
||||
elif args.tp > 1 or args.pp > 1:
|
||||
llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(str(d) for d in range(max(args.tp, args.pp)))}
|
||||
else:
|
||||
llama_env = {}
|
||||
eps.append(SystemEndpoint(
|
||||
@@ -152,7 +155,7 @@ def build_endpoints(args) -> list[SystemEndpoint]:
|
||||
model_id=args.llama_model_id,
|
||||
launch_cmd=llama_cpp_launch_cmd(
|
||||
args.llama_bin, gguf, args.llama_port,
|
||||
n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len, tp=args.tp,
|
||||
n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len, tp=args.tp, pp=args.pp,
|
||||
),
|
||||
launch_env=llama_env,
|
||||
# llama-server's health endpoint also returns 200 only when model is loaded.
|
||||
|
||||
@@ -114,6 +114,7 @@ def xserv_launch_cmd(
|
||||
max_batch: int,
|
||||
max_seq_len: int,
|
||||
tp: int = 1,
|
||||
pp: int = 1,
|
||||
) -> list[str]:
|
||||
cmd = [
|
||||
bin_path,
|
||||
@@ -122,7 +123,9 @@ def xserv_launch_cmd(
|
||||
"--max-batch", str(max_batch),
|
||||
"--max-seq-len", str(max_seq_len),
|
||||
]
|
||||
if tp > 1:
|
||||
if pp > 1:
|
||||
cmd += ["--pp", str(pp)] # xserv binds stage s -> GPU s internally
|
||||
elif tp > 1:
|
||||
cmd += ["--tp", str(tp)] # xserv binds rank r -> GPU r internally
|
||||
return cmd
|
||||
|
||||
@@ -136,6 +139,7 @@ def llama_cpp_launch_cmd(
|
||||
ctx_per_slot: int,
|
||||
n_gpu_layers: int = 99,
|
||||
tp: int = 1,
|
||||
pp: int = 1,
|
||||
) -> list[str]:
|
||||
# llama.cpp DIVIDES total -c across --parallel slots: per-slot context is
|
||||
# n_ctx / n_parallel. xserv gives each sequence the full max_seq_len, so to
|
||||
@@ -153,7 +157,10 @@ def llama_cpp_launch_cmd(
|
||||
# NOTE: do NOT pass --log-disable; its startup log reports per-slot
|
||||
# n_ctx, which is exactly the diagnostic that catches ctx misconfig.
|
||||
]
|
||||
if tp > 1:
|
||||
if pp > 1:
|
||||
# Pipeline / layer split across the visible GPUs (llama.cpp default).
|
||||
cmd += ["--split-mode", "layer", "-ts", ",".join(["1"] * pp)]
|
||||
elif tp > 1:
|
||||
# Tensor-parallel split across the visible GPUs (caller restricts the
|
||||
# set via CUDA_VISIBLE_DEVICES in launch_env). Row-split is llama.cpp's
|
||||
# tensor-parallel mode (vs the default layer/pipeline split).
|
||||
|
||||
17
tools/bench/summarize_fullq.py
Normal file
17
tools/bench/summarize_fullq.py
Normal file
@@ -0,0 +1,17 @@
|
||||
"""Summarize the full quality matrix: bench-out/fullq-{xserv,llama}-pp{1,2,4}.
Prints one row per (engine, pp, task) with accuracy + latency."""
import glob, json, os, sys

# Optional argv[1] overrides the results root (default: bench-out).
base = sys.argv[1] if len(sys.argv) > 1 else "bench-out"
print("%-6s %-3s %-9s %-8s %6s %9s %9s %10s" %
      ("engine","PP","task","correct","acc%","mean_tok","TTFT_ms","TPOT_ms"))
for eng in ("xserv","llama"):
    for pp in (1,2,4):
        # Several comparison-*.json files may exist; use the one that sorts last.
        files = sorted(glob.glob(os.path.join(base, f"fullq-{eng}-pp{pp}", "comparison-*.json")))
        if not files:
            print(f"{eng:<6} {pp:<3} (no results)")
            continue
        # Close the file deterministically instead of leaking the handle.
        with open(files[-1]) as fh:
            d = json.load(fh)
        # A missing quality section simply yields no rows for this cell.
        for r in d.get("quality",{}).get("summary",[]):
            print("%-6s %-3d %-9s %-8s %5.1f%% %9.0f %9.1f %10.2f" % (
                eng, pp, r["task"], f'{r["n_correct"]}/{r["n_total"]}',
                r["accuracy"]*100, r.get("mean_completion_tokens",0),
                r.get("mean_ttft_ms",0), r.get("mean_tpot_ms",0)))
|
||||
24
tools/bench/summarize_pp.py
Normal file
24
tools/bench/summarize_pp.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""Summarize the concurrent PP sweep: bench-out/pp{1,2,4}-{xserv,llama}."""
import glob
import json
import os
import sys

# Optional argv[1] overrides the results root (default: bench-out).
base = sys.argv[1] if len(sys.argv) > 1 else "bench-out"
rows = []
for pp in (1, 2, 4):
    for sysname in ("xserv", "llama"):
        # Several comparison-*.json files may exist; use the one that sorts last.
        files = sorted(glob.glob(os.path.join(base, f"pp{pp}-{sysname}", "comparison-*.json")))
        if not files:
            continue
        # Close the file deterministically instead of leaking the handle.
        with open(files[-1]) as fh:
            d = json.load(fh)
        # A result file without a quality section contributes no rows rather
        # than aborting the whole summary (same tolerance as summarize_fullq).
        for r in d.get("quality", {}).get("summary", []):
            rows.append((pp, sysname, r["task"], r["n_correct"], r["n_total"],
                         r["accuracy"] * 100, r["mean_completion_tokens"],
                         r["mean_ttft_ms"], r["mean_tpot_ms"], r["wall_s"]))

print("%-3s %-7s %-9s %-9s %7s %9s %9s %10s %9s" %
      ("PP", "engine", "task", "correct", "acc%", "mean_tok", "TTFT_ms", "TPOT_ms", "wall_s"))
for (pp, s, task, nc, nt, acc, tok, ttft, tpot, wall) in rows:
    print("%-3d %-7s %-9s %-9s %6.1f%% %9.0f %9.1f %10.2f %9.0f" %
          (pp, s, task, f"{nc}/{nt}", acc, tok, ttft, tpot, wall))
|
||||
31
tools/pp_diag.sh
Normal file
31
tools/pp_diag.sh
Normal file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
# Diagnose pp4 divergence: run single x2 and pp4 x2, same prompt, compare all.
set -u
# XBIN and the report path are relative to the checkout; stop if it is missing.
cd /opt/wjh/projects/xserv || {
  echo "pp_diag: cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
MODEL=/opt/wjh/models/qwen3-8b
XBIN=./target/release/xserv-server
P='Explain what a transformer is in machine learning, in 3 sentences.'
D=bench-out/PP_DIAG.md
# The report directory may not exist on a fresh checkout.
mkdir -p bench-out
: > "$D"
|
||||
# kall
# SIGKILL any process whose command line mentions xserv-server (errors are
# discarded), then pause 3 s.
kall() {
  pkill -9 -f xserv-server 2>/dev/null
  sleep 3
}
|
||||
# ready PID
# Poll 127.0.0.1:8090 with a 1-token chat completion until it answers HTTP 200.
# At most 400 attempts, 3 s apart (each attempt may take up to 8 s).
# Returns: 0 on the first 200; 1 if PID has exited or the attempts run out.
ready() {
  local pid=$1 code attempt
  for ((attempt = 0; attempt < 400; attempt++)); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      http://127.0.0.1:8090/v1/chat/completions -H 'Content-Type: application/json' \
      -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' \
      2>/dev/null)
    if [ "$code" = 200 ]; then return 0; fi
    # Quoted so an odd PID value cannot be word-split into extra kill args.
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}
|
||||
# run OUT_FILE GPUS_CSV [server args...]
# Start xserv on the given GPUs (port 8090, log in /tmp/d.log), generate up to
# 128 greedy tokens for $P and write the completion text to OUT_FILE.
# Globals read: XBIN, MODEL, P.
# On any failure OUT_FILE receives a "FAIL ..." marker that embeds OUT_FILE, so
# two failed runs can never compare byte-identical afterwards.
run() {
  local out=$1 cvd=$2
  shift 2
  kall
  CUDA_VISIBLE_DEVICES=$cvd nohup "$XBIN" "$MODEL" --port 8090 --max-seq-len 2048 "$@" >/tmp/d.log 2>&1 &
  local pid=$!
  if ! ready "$pid"; then
    echo "FAIL server not ready ($out)" >"$out"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi
  # The pipeline status is python's: non-zero when the reply was not the
  # expected JSON, in which case OUT_FILE would otherwise be left empty.
  if ! curl -s --max-time 200 http://127.0.0.1:8090/v1/chat/completions -H 'Content-Type: application/json' \
    -d "{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$P\"}],\"max_tokens\":128,\"temperature\":0,\"stream\":false}" \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' >"$out" 2>/dev/null; then
    echo "FAIL no completion text ($out)" >"$out"
  fi
  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
||||
# verdict FILE_A FILE_B
# Print IDENTICAL or DIFFER for two generated texts. Prints INVALID when either
# file is missing, empty, or starts with the FAIL marker, so that two failed
# runs are never reported as a matching pair.
verdict() {
  local f
  for f in "$1" "$2"; do
    if [ ! -s "$f" ] || [ "$(head -c 4 "$f")" = FAIL ]; then
      echo "INVALID (no usable output in $f)"
      return
    fi
  done
  if cmp -s "$1" "$2"; then echo IDENTICAL; else echo DIFFER; fi
}

# Two single-GPU runs and two pp4 runs of the same prompt: the A/B pairs show
# whether each configuration is deterministic, single_A vs pp4_A shows
# whether they agree with each other.
run /tmp/s_a.txt 0
run /tmp/s_b.txt 0
run /tmp/p4_a.txt 0,1,2,3 --pp 4
run /tmp/p4_b.txt 0,1,2,3 --pp 4
echo "single_A==single_B: $(verdict /tmp/s_a.txt /tmp/s_b.txt)" | tee -a "$D"
echo "pp4_A==pp4_B: $(verdict /tmp/p4_a.txt /tmp/p4_b.txt)" | tee -a "$D"
echo "single_A==pp4_A: $(verdict /tmp/s_a.txt /tmp/p4_a.txt)" | tee -a "$D"
echo "--- first diff offset single_A vs pp4_A ---" | tee -a "$D"
cmp /tmp/s_a.txt /tmp/p4_a.txt 2>&1 | tee -a "$D"
echo "--- lengths (chars) ---" | tee -a "$D"
wc -c /tmp/s_a.txt /tmp/s_b.txt /tmp/p4_a.txt /tmp/p4_b.txt | tee -a "$D"
echo "PP_DIAG_DONE" >> "$D"
|
||||
72
tools/pp_final.sh
Normal file
72
tools/pp_final.sh
Normal file
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env bash
# Definitive PP measurement, strictly sequential, with generated text captured
# for a real correctness byte-compare. Writes bench-out/PP_FINAL.md and per-config
# text files. One server at a time; readiness gated on a real generation.
set -u
# Binaries and the report path are relative to the checkout; stop if missing.
cd /opt/wjh/projects/xserv || {
  echo "pp_final: cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=/usr/local/cuda-12.9
MODEL=/opt/wjh/models/qwen3-8b
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
LBIN=third_party/llama.cpp/build/bin/llama-server
XBIN=./target/release/xserv-server
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'
R=bench-out/PP_FINAL.md
# The report directory may not exist on a fresh checkout.
mkdir -p bench-out
: > "$R"
|
||||
# log ARGS...
# Append the arguments, space-joined by echo, as one line to the report $R.
log() {
  echo "$@" >>"$R"
}
|
||||
|
||||
# kall
# SIGKILL every process whose command line mentions xserv-server or
# llama-server (errors are discarded), then pause 3 s.
kall() {
  local victim
  for victim in xserv-server llama-server; do
    pkill -9 -f "$victim" 2>/dev/null
  done
  sleep 3
}
|
||||
# drain GPUS_CSV
# Poll nvidia-smi until every GPU in the comma-separated list reports at most
# 1500 MiB used. Up to 90 polls, 2 s apart.
# Returns: 0 once all listed GPUs are at or below the threshold, 1 on timeout.
drain() {
  local gpus=$1 attempt busy gpu used
  for ((attempt = 0; attempt < 90; attempt++)); do
    busy=0
    for gpu in ${gpus//,/ }; do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      used=${used//[[:space:]]/}
      # Empty or non-numeric readings count as 0 instead of raising an
      # integer-comparison error.
      case $used in
        '' | *[!0-9]*) used=0 ;;
      esac
      if [ "$used" -gt 1500 ]; then busy=1; fi
    done
    if [ "$busy" -eq 0 ]; then return 0; fi
    sleep 2
  done
  return 1
}
|
||||
# gen PORT MAXTOK
# POST a greedy chat completion for $PROMPT with max_tokens=MAXTOK, store the
# body in /tmp/resp.json and echo it. The HTTP status goes to the global GCODE
# (only visible to the caller when gen is not run in a subshell or pipeline).
gen() {
  # Truncate first: if curl fails before writing anything, the previous
  # request's body must not be echoed as if it were this request's answer.
  : > /tmp/resp.json
  GCODE=$(curl -s -o /tmp/resp.json -w '%{http_code}' --max-time 300 \
    "http://127.0.0.1:$1/v1/chat/completions" -H 'Content-Type: application/json' \
    -d "{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$PROMPT\"}],\"max_tokens\":$2,\"temperature\":0,\"stream\":false}")
  cat /tmp/resp.json
}
|
||||
# ready PORT PID
# Poll 127.0.0.1:PORT with a 1-token chat completion until it answers HTTP 200.
# At most 400 attempts, 3 s apart (each attempt may take up to 8 s).
# Returns: 0 on the first 200; 1 if PID has exited or the attempts run out.
ready() {
  local port=$1 pid=$2 code attempt
  for ((attempt = 0; attempt < 400; attempt++)); do
    code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
      -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}' \
      2>/dev/null)
    if [ "$code" = 200 ]; then return 0; fi
    kill -0 "$pid" 2>/dev/null || return 1
    sleep 3
  done
  return 1
}
|
||||
# snap GPUS_CSV
# Print memory.used for each listed GPU, in list order, joined by single
# spaces on one line.
snap() {
  local gpu_list=${1//,/ }
  local gpu
  for gpu in $gpu_list; do
    nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits
  done | paste -sd' '
}
|
||||
# lat PORT
# Latency probe. TTFT is the wall time of a 1-token request; TPOT is the extra
# wall time of a 96-token request divided by the extra tokens it produced,
# using the server-reported completion_tokens of both requests.
# Prints: TTFT_ms=.. TPOT_ms=.. tok_s=.. tokF=..  (TPOT/tok_s are nan when the
# long request did not produce more tokens than the short one).
lat() {
  local port=$1
  local t0 t1 ta tb c1 cF

  # Short request; token count falls back to 1 if the reply cannot be parsed.
  t0=$(date +%s.%N)
  gen "$port" 1 >/tmp/g1.json
  t1=$(date +%s.%N)
  c1=$(python3 -c 'import json;print(json.load(open("/tmp/g1.json"))["usage"]["completion_tokens"])' 2>/dev/null || echo 1)

  # Long request; token count falls back to 0 if the reply cannot be parsed.
  ta=$(date +%s.%N)
  gen "$port" 96 >/tmp/gF.json
  tb=$(date +%s.%N)
  cF=$(python3 -c 'import json;print(json.load(open("/tmp/gF.json"))["usage"]["completion_tokens"])' 2>/dev/null || echo 0)

  # The shell substitutes the timestamps and counts into the Python source.
  python3 -c "
ttft=($t1-$t0)*1000
d=$cF-$c1
nan=float('nan')
tpot=(($tb-$ta)-($t1-$t0))/d*1000 if d>0 else nan
rate=(1000.0/((($tb-$ta)-($t1-$t0))/d*1000)) if d>0 else nan
print('TTFT_ms=%.1f TPOT_ms=%.2f tok_s=%.1f tokF=$cF'%(ttft,tpot,rate))"
}
|
||||
|
||||
# xserv PP
# Measure xserv at pipeline-parallel degree PP on GPUs 0..PP-1: capture a
# 64-token greedy completion in /tmp/xtext_PP.txt for the byte compare, then
# log VRAM and latency. Logs "xserv pp=PP: FAILED" if the server never becomes
# ready. Globals read: XBIN, MODEL.
xserv() {
  local pp=$1 cvd
  cvd=$(seq -s, 0 $((pp - 1)))
  kall
  drain "$cvd"
  # Remove the text from any earlier invocation: if this run fails, the
  # correctness compare must not silently use a stale file.
  rm -f "/tmp/xtext_$pp.txt"

  # Array argv keeps each path a single argument; --pp only for PP > 1.
  local -a cmd=("$XBIN" "$MODEL" --port 8090 --max-seq-len 2048)
  if [ "$pp" -gt 1 ]; then cmd+=(--pp "$pp"); fi
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" >"/tmp/xf$pp.log" 2>&1 &
  local pid=$!
  if ! ready 8090 "$pid"; then
    log "xserv pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi

  local mib L
  mib=$(snap "$cvd")
  gen 8090 64 \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' \
      >"/tmp/xtext_$pp.txt" 2>/dev/null
  L=$(lat 8090)
  log "xserv pp=$pp | VRAM=$mib MiB | $L"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
||||
# llama PP
# Measure llama.cpp's llama-server on GPUs 0..PP-1 and log VRAM and latency.
# PP == 1 runs "-sm none"; PP > 1 runs "-sm layer" with an even tensor split
# ("-ts 1,1,...", one entry per GPU). Logs "llama pp=PP: FAILED" if the server
# never becomes ready. Globals read: LBIN, GGUF.
llama() {
  local pp=$1 cvd ts
  cvd=$(seq -s, 0 $((pp - 1)))
  kall
  drain "$cvd"

  # Array argv instead of a word-split string: paths stay single arguments.
  local -a cmd=("$LBIN" -m "$GGUF" --port 8090 --host 127.0.0.1
    -c 2048 --parallel 1 -ngl 999)
  if [ "$pp" -gt 1 ]; then
    # printf repeats "1," once per seq value; strip the trailing comma.
    ts=$(printf '1%.0s,' $(seq 1 "$pp"))
    cmd+=(-sm layer -ts "${ts%,}")
  else
    cmd+=(-sm none)
  fi
  CUDA_VISIBLE_DEVICES=$cvd nohup "${cmd[@]}" >"/tmp/lf$pp.log" 2>&1 &
  local pid=$!
  if ! ready 8090 "$pid"; then
    log "llama pp=$pp: FAILED"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi

  local mib L
  mib=$(snap "$cvd")
  L=$(lat 8090)
  log "llama pp=$pp | VRAM=$mib MiB | $L"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
||||
|
||||
# cmp_text FILE_A FILE_B
# Print IDENTICAL or DIFFER for two captured generations, or NO_OUTPUT when
# either file is missing or empty, so two failed generations are never
# reported as a correctness match.
cmp_text() {
  if [ ! -s "$1" ] || [ ! -s "$2" ]; then
    echo "NO_OUTPUT"
    return
  fi
  if cmp -s "$1" "$2"; then echo IDENTICAL; else echo DIFFER; fi
}

log "# PP FINAL — $(date)"
for pp in 1 2 4; do xserv "$pp"; done
log ""
log "## correctness (xserv greedy, byte compare of generated text)"
log "single==pp2: $(cmp_text /tmp/xtext_1.txt /tmp/xtext_2.txt)"
log "single==pp4: $(cmp_text /tmp/xtext_1.txt /tmp/xtext_4.txt)"
# Text previews; stderr is dropped because a failed run leaves no file.
log "single_text: $(head -c 200 /tmp/xtext_1.txt 2>/dev/null)"
log "pp2_text: $(head -c 200 /tmp/xtext_2.txt 2>/dev/null)"
log ""
for pp in 1 2 4; do llama "$pp"; done
kall
log ""
log "PP_FINAL_DONE"
|
||||
25
tools/pp_llama_47.sh
Normal file
25
tools/pp_llama_47.sh
Normal file
@@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env bash
# llama.cpp PP=1/2/4 quality (aime2025+gsm8k, 30 each) on physical GPUs 4-7,
# parallel with the xserv matrix on 0-3. Pass --llama-devices so the runner pins
# CUDA_VISIBLE_DEVICES to 4.. (it otherwise forces 0..N-1). Distinct port + dirs.
set -u
# tools.bench.runner and all output paths are relative to the checkout.
cd /opt/wjh/projects/xserv || {
  echo "pp_llama_47: cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=/usr/local/cuda-12.9
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
LBIN=third_party/llama.cpp/build/bin/llama-server
PROG=bench-out/LLAMA47_PROGRESS.md
# The progress file lives in bench-out/, which may not exist yet.
mkdir -p bench-out
: > "$PROG"
echo "# llama on GPU 4-7 — $(date)" >> "$PROG"

for pp in 1 2 4; do
  dev=$(seq -s, 4 $((3 + pp)))  # 4 / 4,5 / 4,5,6,7
  out=bench-out/fullq-llama-pp$pp
  rm -rf "$out"
  echo "=== START llama pp=$pp dev=$dev $(date +%H:%M:%S) ===" >> "$PROG"
  # Only kill a llama-server whose command line mentions this script's port.
  pkill -9 -f "llama-server.*18181" 2>/dev/null
  sleep 2
  python3 -u -m tools.bench.runner --systems llama.cpp --pp "$pp" --llama-devices "$dev" \
    --llama-bin "$LBIN" --llama-gguf "$GGUF" --llama-port 18181 \
    --suite quality --quality-tasks aime2025,gsm8k --quality-limit 30 \
    --max-batch 1 --max-seq-len 4096 --out-dir "$out" >"/tmp/fql-$pp.log" 2>&1
  # Capture the runner's status right away, before anything can overwrite $?.
  rc=$?
  echo "=== END llama pp=$pp rc=$rc $(date +%H:%M:%S) $(ls "$out"/comparison-*.json 2>/dev/null | wc -l) json ===" >> "$PROG"
done
pkill -9 -f "llama-server.*18181" 2>/dev/null
echo "LLAMA47_DONE" >> "$PROG"
|
||||
54
tools/pp_quality_full.sh
Normal file
54
tools/pp_quality_full.sh
Normal file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env bash
# FULL quality matrix, strictly sequential (one server at a time, same GPU group
# 0..N-1, no concurrency). Both engines x PP=1/2/4 x {aime2025, gsm8k}.
# Each (engine,pp) invocation runs runner.py once (it does start->both tasks->stop).
# Writes bench-out/fullq-<engine>-pp<N>/comparison-*.json ; summarized at the end.
set -u
# tools.bench.runner and all output paths are relative to the checkout.
cd /opt/wjh/projects/xserv || {
  echo "pp_quality_full: cannot cd to /opt/wjh/projects/xserv" >&2
  exit 1
}
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=/usr/local/cuda-12.9
MODEL=/opt/wjh/models/qwen3-8b
GGUF=/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf
XBIN=./target/release/xserv-server
LBIN=third_party/llama.cpp/build/bin/llama-server
# NOTE(review): AIME_LIMIT and GSM_LIMIT are accepted here but nothing in this
# script reads them; the runner invocations pass a literal --quality-limit 30.
AIME_LIMIT=${AIME_LIMIT:-30}
GSM_LIMIT=${GSM_LIMIT:-20}
MAXSEQ=${MAXSEQ:-4096}
PROG=bench-out/FULLQ_PROGRESS.md
# The progress file lives in bench-out/, which may not exist yet.
mkdir -p bench-out
: > "$PROG"
echo "# full quality matrix — $(date)" >> "$PROG"
|
||||
|
||||
# kall
# SIGKILL leftover xserv-server, llama-server and benchmark-runner processes
# (errors are discarded), then pause 4 s.
kall() {
  pkill -9 -f xserv-server 2>/dev/null
  pkill -9 -f llama-server 2>/dev/null
  # This script starts the runner as `python3 -m tools.bench.runner`, whose
  # command line never contains "runner.py"; match the dotted module form and
  # the tools/bench/runner.py path form alike.
  pkill -9 -f 'tools[./]bench[./]runner' 2>/dev/null
  sleep 4
}
|
||||
# drain MAX_INDEX
# Poll nvidia-smi until GPUs 0..MAX_INDEX all report at most 1500 MiB used.
# Up to 90 polls, 2 s apart.
# Returns: 0 once all of them are at or below the threshold, 1 on timeout.
drain() {
  local last=$1 attempt busy gpu used
  for ((attempt = 0; attempt < 90; attempt++)); do
    busy=0
    for gpu in $(seq 0 "$last"); do
      used=$(nvidia-smi -i "$gpu" --query-gpu=memory.used --format=csv,noheader,nounits)
      used=${used//[[:space:]]/}
      # Empty or non-numeric readings count as 0 instead of raising an
      # integer-comparison error.
      case $used in
        '' | *[!0-9]*) used=0 ;;
      esac
      if [ "$used" -gt 1500 ]; then busy=1; fi
    done
    if [ "$busy" -eq 0 ]; then return 0; fi
    sleep 2
  done
  return 1
}
|
||||
|
||||
# run_one ENGINE PP
# Run the quality suite (aime2025,gsm8k, --quality-limit 30) once for ENGINE
# ("xserv"; any other value selects llama.cpp) at degree PP, after killing
# stale processes and draining GPUs 0..PP-1. Results go to
# bench-out/fullq-ENGINE-ppPP, runner output to /tmp/fq-ENGINE-PP.log, and
# START/END lines (with exit status and JSON count) to $PROG.
run_one() {
  local eng=$1 pp=$2
  local dev out rc
  dev=$(seq -s, 0 $((pp-1)))
  kall
  drain $((pp-1))
  out=bench-out/fullq-$eng-pp$pp
  rm -rf "$out"
  echo "=== START $eng pp=$pp on GPU $dev $(date +%H:%M:%S) ===" >> "$PROG"

  # Only the engine-selection flags differ between the two invocations.
  local -a sys_args
  case $eng in
    xserv)
      sys_args=(--systems xserv --pp "$pp" --xserv-bin "$XBIN" --xserv-model "$MODEL")
      ;;
    *)
      sys_args=(--systems llama.cpp --pp "$pp" --llama-bin "$LBIN" --llama-gguf "$GGUF")
      ;;
  esac
  python3 -u -m tools.bench.runner "${sys_args[@]}" \
    --suite quality --quality-tasks aime2025,gsm8k --quality-limit 30 \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "$out" >/tmp/fq-$eng-$pp.log 2>&1
  rc=$?
  echo "=== END $eng pp=$pp rc=$rc $(date +%H:%M:%S) $(ls $out/comparison-*.json 2>/dev/null | wc -l) json ===" >> "$PROG"
}
|
||||
|
||||
# The runner takes a single --quality-limit that applies to ALL tasks, so the
# per-task limits cannot differ within one invocation. The matrix therefore
# runs with --quality-limit 30: aime2025 (30 problems) is covered in full and
# gsm8k is capped to its first 30; the report's n_total shows the real count.
for eng in xserv llama; do
  for pp in 1 2 4; do
    run_one "$eng" "$pp"
  done
done
kall
echo "FULLQ_DONE" >> "$PROG"
|
||||
90
tools/pp_verify.sh
Normal file
90
tools/pp_verify.sh
Normal file
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
# One-shot pipeline-parallel (PP) verification + benchmark for Qwen3-8B.
# Run on the GPU host from the repo root. Writes bench-out/PP_RESULTS.md.
#
# 1. NCCL P2P send/recv + AllReduce unit tests
# 2. correctness: greedy (temp=0) output single == --pp 2 == --pp 4 (byte compare)
# 3. per-GPU VRAM (readiness-gated on a real generation; weights + a minimal
#    KV pool, ~1/P per card)
# 4. quality+latency sweep vs llama.cpp (-sm layer), gsm8k
#
# Env: MODEL, GGUF, LIMIT (problems), PPS (e.g. "1 2 4") may be overridden.
set -u
# Everything below uses paths relative to the repo root (one level above this
# script); abort rather than build, test and write reports somewhere else.
cd "$(dirname "$0")/.." || {
  echo "pp_verify: cannot cd to repo root" >&2
  exit 1
}
export PATH=$HOME/.cargo/bin:/usr/local/cuda-12.9/bin:$PATH
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda-12.9}
MODEL=${MODEL:-/opt/wjh/models/qwen3-8b}
GGUF=${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}
LIMIT=${LIMIT:-20}
PPS=${PPS:-1 2 4}
BIN=./target/release/xserv-server
R=bench-out/PP_RESULTS.md
mkdir -p bench-out
: > "$R"
|
||||
# log ARGS...
# Print the arguments, space-joined by echo, on stdout and append the same
# line to the report $R.
log() {
  echo "$@" | tee -a "$R"
}
|
||||
|
||||
# Start from a clean slate: no leftover servers from earlier runs.
pkill -9 -f xserv-server 2>/dev/null
pkill -9 -f llama-server 2>/dev/null
sleep 3

log "# PP verification — $(date)"

# ---- 1. NCCL P2P + AllReduce unit tests ----
log ""
log "## 1. NCCL P2P + AllReduce test"
# Full test output goes to /tmp/pp_t.log; the report gets the exit status plus
# the result lines and the two named tests, indented by one space.
cargo test -p xserv-distributed --release -- --test-threads=1 >/tmp/pp_t.log 2>&1
cargo_rc=$?
log " cargo test exit=$cargo_rc"
grep -hE "test result|pp_send_recv|allreduce_two_gpu" /tmp/pp_t.log \
  | sed 's/^/ /' \
  | tee -a "$R"
|
||||
|
||||
# wait_ready PORT PID
# Gate on a real generation rather than /health (xserv's /health returns 200
# before the model is loaded): POST a 1-token chat completion to
# 127.0.0.1:PORT until the reported HTTP status contains 200. At most 400
# attempts, 3 s apart (each attempt may take up to 8 s).
# Returns: 0 on success; 1 if PID has exited or the attempts run out.
wait_ready() {
  local port=$1 pid=$2
  local payload='{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":1,"temperature":0,"stream":false}'
  local status tries=0
  while [ "$tries" -lt 400 ]; do
    status=$(curl -s -o /dev/null -w '%{http_code}' --max-time 8 \
      "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
      -d "$payload" 2>/dev/null)
    case $status in
      *200*) return 0 ;;
    esac
    if ! kill -0 "$pid" 2>/dev/null; then
      return 1
    fi
    sleep 3
    tries=$((tries + 1))
  done
  return 1
}
|
||||
|
||||
# ---- 2. correctness ----
PROMPT='Explain what a transformer is in machine learning, in 3 sentences.'

# gen PORT GPUS_CSV [server args...]
# Start xserv on PORT with the given GPUs visible (log in /tmp/pp_sPORT.log),
# print the text of a 64-token greedy completion for $PROMPT on stdout, then
# kill the server. Prints "(server PORT failed)" if it never becomes ready.
# Globals read: BIN, MODEL, PROMPT.
gen() {
  local port=$1 cvd=$2
  shift 2
  # Quoted so BIN / MODEL paths containing spaces stay single arguments.
  CUDA_VISIBLE_DEVICES=$cvd nohup "$BIN" "$MODEL" --port "$port" --max-seq-len 2048 "$@" >"/tmp/pp_s$port.log" 2>&1 &
  local pid=$!
  if ! wait_ready "$port" "$pid"; then
    echo "(server $port failed)"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi
  curl -s --max-time 200 "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
    -d "{\"model\":\"qwen3-8b\",\"messages\":[{\"role\":\"user\",\"content\":\"$PROMPT\"}],\"max_tokens\":64,\"temperature\":0,\"stream\":false}" \
    | python3 -c 'import sys,json;print(json.load(sys.stdin)["choices"][0]["message"]["content"])' 2>/dev/null
  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 3
}
|
||||
# same_text FILE_A FILE_B
# Print IDENTICAL or DIFFER for two captured generations, or NO_OUTPUT when
# either file is missing, empty, or holds the "(server N failed)" marker, so
# failed runs are never reported as a correctness match.
same_text() {
  local f
  for f in "$1" "$2"; do
    if [ ! -s "$f" ] || [ "$(head -c 8 "$f")" = "(server " ]; then
      echo "NO_OUTPUT"
      return
    fi
  done
  if cmp -s "$1" "$2"; then echo IDENTICAL; else echo DIFFER; fi
}

# Same prompt through single-GPU, --pp 2 and --pp 4; each run uses its own port.
gen 8091 0 > /tmp/o_single.txt
gen 8092 0,1 --pp 2 > /tmp/o_pp2.txt
gen 8093 0,1,2,3 --pp 4 > /tmp/o_pp4.txt
log ""; log "## 2. Correctness (greedy temp=0, byte compare)"
log " single==pp2: $(same_text /tmp/o_single.txt /tmp/o_pp2.txt)"
log " single==pp4: $(same_text /tmp/o_single.txt /tmp/o_pp4.txt)"
log " single text: $(head -c 160 /tmp/o_single.txt)"
|
||||
|
||||
# ---- 3. per-GPU VRAM (gated on a real generation; KV pool capped so all configs are comparable) ----
log ""
log "## 3. Per-GPU VRAM (XSERV_MAX_KV_BLOCKS=160; weights + minimal KV)"
|
||||
# snap GPU_IDS
# Query memory.used for GPU_IDS (passed to nvidia-smi -i as given) and join
# the resulting lines with single spaces on one line.
snap() {
  local ids=$1
  nvidia-smi -i "$ids" --query-gpu=memory.used --format=csv,noheader,nounits \
    | paste -sd' '
}
|
||||
# vram LABEL GPUS_CSV PORT [server args...]
# Start xserv with XSERV_MAX_KV_BLOCKS=160 on the given GPUs (log in
# /tmp/pp_vPORT.log), wait for a real generation, run one extra 8-token
# request, then log a stabilized per-GPU memory.used snapshot as
# " LABEL (GPUS): <MiB...> MiB". Logs " LABEL: server failed" if the server
# never becomes ready. Globals read: BIN, MODEL.
vram() {
  local label=$1 cvd=$2 port=$3
  shift 3
  # Quoted so BIN / MODEL paths containing spaces stay single arguments.
  XSERV_MAX_KV_BLOCKS=160 CUDA_VISIBLE_DEVICES=$cvd nohup "$BIN" "$MODEL" --port "$port" --max-seq-len 2048 "$@" >"/tmp/pp_v$port.log" 2>&1 &
  local pid=$!
  if ! wait_ready "$port" "$pid"; then
    log " $label: server failed"
    kill -9 "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    return
  fi
  curl -s --max-time 120 "http://127.0.0.1:$port/v1/chat/completions" -H 'Content-Type: application/json' \
    -d '{"model":"qwen3-8b","messages":[{"role":"user","content":"hi"}],"max_tokens":8,"temperature":0,"stream":false}' >/dev/null

  # Re-sample every 2 s (at most 12 times) until two consecutive snapshots
  # agree, then report the last one.
  local a b="" attempt
  for ((attempt = 0; attempt < 12; attempt++)); do
    a=$(snap "$cvd")
    if [ "$a" = "$b" ]; then break; fi
    b=$a
    sleep 2
  done
  log " $label ($cvd): $a MiB"

  kill -9 "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  sleep 5
}
|
||||
# VRAM snapshots for single-GPU, --pp 2 and --pp 4, each on its own port.
vram single 0 8094
vram pp2 0,1 8095 --pp 2
vram pp4 0,1,2,3 8096 --pp 4

# ---- 4. sweep vs llama.cpp ----
log ""
log "## 4. Sweep (gsm8k $LIMIT, xserv --pp 0..N-1 vs llama -sm layer 4..)"
# The sweep's own output goes to /tmp/pp_sweep.log; only the summary table is
# embedded in the report, inside a fenced code block.
PPS="$PPS" LIMIT="$LIMIT" TASKS=gsm8k bash tools/bench/run_pp_parallel.sh >/tmp/pp_sweep.log 2>&1
log '```'
python3 tools/bench/summarize_pp.py bench-out >> "$R" 2>&1
log '```'
log ""
log "PP_VERIFY_DONE"
|
||||
Reference in New Issue
Block a user