bench: TP sweep harness (xserv --tp, llama row-split, concurrent groups)
runner/servers gain --tp (xserv --tp N; llama.cpp --split-mode row) and --llama-devices so llama can run on a disjoint GPU group. run_tp_parallel.sh runs xserv (GPU 0..N-1) and llama.cpp (GPU 4..4+N-1) concurrently per TP, matching the box's 0-3 / 4-7 PHB groups. summarize_tp.py tabulates the sweep. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
38
tools/bench/run_tp_parallel.sh
Normal file
38
tools/bench/run_tp_parallel.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
# Run the TP=1/2/4 quality sweep with xserv and llama.cpp CONCURRENTLY on
# disjoint GPU groups: xserv on GPUs 0..N-1, llama.cpp on GPUs 4..4+N-1.
# The 8x5090 box is grouped 0-3 / 4-7 (PHB intra-group), so each engine's TP
# comm stays intra-group and the two engines never contend for a GPU.
#
# Run from the repo root on the GPU host. Produces bench-out/tp{1,2,4}-{xserv,llama}.
#
# Overridable via environment: MODEL, GGUF, LIMIT, MAXSEQ, TPS (space-separated
# list of TP degrees). Per-run logs go to /tmp/tp<N>-{xserv,llama}.log.

# -u only: a failing engine must not abort the remaining TP degrees, so we
# deliberately do not use -e and report each engine's exit code instead.
set -u

MODEL="${MODEL:-/opt/wjh/models/qwen3-8b}"
GGUF="${GGUF:-/opt/wjh/models/qwen3-8b/qwen3-8b-bf16.gguf}"
LIMIT="${LIMIT:-30}"
MAXSEQ="${MAXSEQ:-2048}"
TPS="${TPS:-1 2 4}"

# $TPS is intentionally unquoted so it word-splits into individual degrees.
for TP in $TPS; do
  LD=$(seq -s, 4 $((3 + TP)))   # llama GPUs: 4 / 4,5 / 4,5,6,7
  echo "##### TP=$TP (xserv GPU 0..$((TP-1)) || llama GPU $LD) #####"
  # Start from clean output dirs so stale results from an earlier sweep are
  # never mistaken for this run's.
  rm -rf "bench-out/tp$TP-xserv" "bench-out/tp$TP-llama"

  python3 -u -m tools.bench.runner --systems xserv --tp "$TP" \
    --xserv-bin ./target/release/xserv-server --xserv-model "$MODEL" \
    --suite quality --quality-tasks aime2025,gsm8k --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/tp$TP-xserv" > "/tmp/tp$TP-xserv.log" 2>&1 &
  XP=$!

  python3 -u -m tools.bench.runner --systems llama.cpp --tp "$TP" --llama-devices "$LD" \
    --llama-bin third_party/llama.cpp/build/bin/llama-server --llama-gguf "$GGUF" \
    --suite quality --quality-tasks aime2025,gsm8k --quality-limit "$LIMIT" \
    --max-batch 1 --max-seq-len "$MAXSEQ" \
    --out-dir "bench-out/tp$TP-llama" > "/tmp/tp$TP-llama.log" 2>&1 &
  LP=$!

  # `wait PID1 PID2` returns only the status of the LAST pid, so a single
  # combined wait would report llama's status under the xserv label. Wait on
  # each engine separately; both still run concurrently, and waiting on an
  # already-finished child returns its saved exit status.
  wait "$XP"
  XRC=$?
  wait "$LP"
  LRC=$?
  echo "TP=$TP done (xserv exit=$XRC, llama exit=$LRC)"
done
echo ALL_DONE
|
||||
@@ -69,6 +69,13 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--max-seq-len", type=int, default=8192)
|
||||
p.add_argument("--systems", default="xserv,llama.cpp",
|
||||
help="Comma-separated subset to run, e.g. 'xserv' to skip llama.cpp")
|
||||
p.add_argument("--tp", type=int, default=1,
|
||||
help="Tensor-parallel degree for BOTH engines (xserv --tp N; "
|
||||
"llama.cpp --split-mode row over the first N GPUs).")
|
||||
p.add_argument("--llama-devices", default=None,
|
||||
help="Comma list of GPU ordinals for llama.cpp (first --tp used). "
|
||||
"Lets llama run on a disjoint GPU group (e.g. 4,5,6,7) so it "
|
||||
"can run concurrently with xserv on 0..N-1.")
|
||||
p.add_argument("--enable-thinking", action="store_true",
|
||||
help="Enable Qwen3 thinking on llama.cpp. Default OFF to match "
|
||||
"xserv, which hardcodes thinking off in its prompt builder.")
|
||||
@@ -106,10 +113,10 @@ def build_endpoints(args) -> list[SystemEndpoint]:
|
||||
model_id=args.xserv_model_id,
|
||||
launch_cmd=xserv_launch_cmd(
|
||||
args.xserv_bin, model_dir, args.xserv_port,
|
||||
max_batch=args.max_batch, max_seq_len=args.max_seq_len,
|
||||
max_batch=args.max_batch, max_seq_len=args.max_seq_len, tp=args.tp,
|
||||
),
|
||||
health_path="/health",
|
||||
ready_timeout_s=900.0,
|
||||
ready_timeout_s=1200.0,
|
||||
))
|
||||
|
||||
# Match xserv's hardcoded thinking-OFF mode unless explicitly overridden.
|
||||
@@ -128,17 +135,29 @@ def build_endpoints(args) -> list[SystemEndpoint]:
|
||||
gguf = args.llama_gguf or os.environ.get("LLAMA_GGUF")
|
||||
if not gguf:
|
||||
raise SystemExit("--llama-gguf or LLAMA_GGUF required (or pass --llama-base-url)")
|
||||
# Pick the GPUs llama.cpp runs on. Default is the first `tp` GPUs;
|
||||
# pass --llama-devices to place it on a disjoint group (e.g. 4,5,6,7)
|
||||
# so it can run concurrently with xserv on 0..N-1. --split-mode row
|
||||
# then tensor-parallel-splits across exactly these devices.
|
||||
if args.llama_devices:
|
||||
devs = [d.strip() for d in args.llama_devices.split(",") if d.strip()][: max(args.tp, 1)]
|
||||
llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(devs)}
|
||||
elif args.tp > 1:
|
||||
llama_env = {"CUDA_VISIBLE_DEVICES": ",".join(str(d) for d in range(args.tp))}
|
||||
else:
|
||||
llama_env = {}
|
||||
eps.append(SystemEndpoint(
|
||||
name=SYSTEM_LLAMA_CPP,
|
||||
base_url=f"http://127.0.0.1:{args.llama_port}",
|
||||
model_id=args.llama_model_id,
|
||||
launch_cmd=llama_cpp_launch_cmd(
|
||||
args.llama_bin, gguf, args.llama_port,
|
||||
n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len,
|
||||
n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len, tp=args.tp,
|
||||
),
|
||||
launch_env=llama_env,
|
||||
# llama-server's health endpoint also returns 200 only when model is loaded.
|
||||
health_path="/health",
|
||||
ready_timeout_s=900.0,
|
||||
ready_timeout_s=1200.0,
|
||||
extra_body=llama_extra_body,
|
||||
))
|
||||
return eps
|
||||
|
||||
@@ -113,14 +113,18 @@ def xserv_launch_cmd(
|
||||
*,
|
||||
max_batch: int,
|
||||
max_seq_len: int,
|
||||
tp: int = 1,
|
||||
) -> list[str]:
|
||||
return [
|
||||
cmd = [
|
||||
bin_path,
|
||||
model_dir,
|
||||
"--port", str(port),
|
||||
"--max-batch", str(max_batch),
|
||||
"--max-seq-len", str(max_seq_len),
|
||||
]
|
||||
if tp > 1:
|
||||
cmd += ["--tp", str(tp)] # xserv binds rank r -> GPU r internally
|
||||
return cmd
|
||||
|
||||
|
||||
def llama_cpp_launch_cmd(
|
||||
@@ -131,13 +135,14 @@ def llama_cpp_launch_cmd(
|
||||
n_parallel: int,
|
||||
ctx_per_slot: int,
|
||||
n_gpu_layers: int = 99,
|
||||
tp: int = 1,
|
||||
) -> list[str]:
|
||||
# llama.cpp DIVIDES total -c across --parallel slots: per-slot context is
|
||||
# n_ctx / n_parallel. xserv gives each sequence the full max_seq_len, so to
|
||||
# match we must set total -c = ctx_per_slot * n_parallel. Getting this wrong
|
||||
# silently truncates long generations (e.g. AIME) on llama.cpp's side.
|
||||
total_ctx = ctx_per_slot * n_parallel
|
||||
return [
|
||||
cmd = [
|
||||
bin_path,
|
||||
"-m", gguf_path,
|
||||
"--port", str(port),
|
||||
@@ -148,3 +153,9 @@ def llama_cpp_launch_cmd(
|
||||
# NOTE: do NOT pass --log-disable; its startup log reports per-slot
|
||||
# n_ctx, which is exactly the diagnostic that catches ctx misconfig.
|
||||
]
|
||||
if tp > 1:
|
||||
# Tensor-parallel split across the visible GPUs (caller restricts the
|
||||
# set via CUDA_VISIBLE_DEVICES in launch_env). Row-split is llama.cpp's
|
||||
# tensor-parallel mode (vs the default layer/pipeline split).
|
||||
cmd += ["--split-mode", "row"]
|
||||
return cmd
|
||||
|
||||
24
tools/bench/summarize_tp.py
Normal file
24
tools/bench/summarize_tp.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""Summarize the concurrent TP sweep: bench-out/tp{1,2,4}-{xserv,llama}."""
|
||||
import glob
import json
import os
import re
import sys

# Engine suffixes used by the sweep's output directories (tp<N>-<engine>).
ENGINES = ("xserv", "llama")

# Header and row layouts share column widths so the table lines up.
HEADER_FMT = "%-3s %-7s %-9s %-9s %7s %9s %9s %10s %9s"
HEADER = ("TP", "engine", "task", "correct", "acc%", "mean_tok",
          "TTFT_ms", "TPOT_ms", "wall_s")
ROW_FMT = "%-3d %-7s %-9s %-9s %6.1f%% %9.0f %9.1f %10.2f %9.0f"

# Matches a run directory name such as "tp4-llama".
_RUN_DIR_RE = re.compile(r"tp(\d+)-(.+)")


def discover_tps(base, engines=ENGINES):
    """Return the sorted TP degrees that have a run directory under *base*.

    A run directory is any entry named ``tp<N>-<engine>`` whose engine is in
    *engines*. Discovering the degrees (rather than hard-coding 1/2/4) keeps
    the summary in step with sweeps launched with a custom TP list.
    """
    degrees = set()
    for path in glob.glob(os.path.join(base, "tp*-*")):
        match = _RUN_DIR_RE.fullmatch(os.path.basename(path))
        if match and match.group(2) in engines:
            degrees.add(int(match.group(1)))
    return sorted(degrees)


def collect_rows(base, tps=None, engines=ENGINES):
    """Collect one result tuple per (TP, engine, task) found under *base*.

    For every ``tp<N>-<engine>`` directory the lexicographically last
    ``comparison-*.json`` file is read and each entry of its
    ``quality.summary`` list becomes a row:
    ``(tp, engine, task, n_correct, n_total, accuracy_percent,
    mean_completion_tokens, mean_ttft_ms, mean_tpot_ms, wall_s)``.

    Directories without a comparison file are skipped. *tps* defaults to the
    degrees discovered on disk. Raises ``KeyError`` if a comparison file lacks
    the expected ``quality.summary`` fields.
    """
    if tps is None:
        tps = discover_tps(base, engines)
    rows = []
    for tp in tps:
        for engine in engines:
            pattern = os.path.join(base, f"tp{tp}-{engine}", "comparison-*.json")
            candidates = sorted(glob.glob(pattern))
            if not candidates:
                continue
            # Context manager so the handle is closed promptly instead of
            # being left to the garbage collector.
            with open(candidates[-1], encoding="utf-8") as fh:
                report = json.load(fh)
            for entry in report["quality"]["summary"]:
                rows.append((
                    tp, engine, entry["task"],
                    entry["n_correct"], entry["n_total"],
                    entry["accuracy"] * 100,
                    entry["mean_completion_tokens"],
                    entry["mean_ttft_ms"], entry["mean_tpot_ms"],
                    entry["wall_s"],
                ))
    return rows


def format_row(row):
    """Render one tuple produced by ``collect_rows`` as a table line."""
    tp, engine, task, n_correct, n_total, acc, tok, ttft, tpot, wall = row
    return ROW_FMT % (tp, engine, task, f"{n_correct}/{n_total}",
                      acc, tok, ttft, tpot, wall)


def main(argv=None):
    """Print the sweep table; ``argv[1]`` overrides the default "bench-out"."""
    argv = sys.argv if argv is None else argv
    base = argv[1] if len(argv) > 1 else "bench-out"
    print(HEADER_FMT % HEADER)
    for row in collect_rows(base):
        print(format_row(row))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user