tools: single-stream decode benchmark vs llama.cpp
xserv_vs_llama.py runs each server one at a time on the same GPUs (draining VRAM in between), streams identical prompts through /v1/chat/completions, and reports median TTFT/TPOT/throughput. It counts llama's reasoning_content as real decode tokens so the gpt-oss CoT is measured fairly.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
226
tools/xserv_vs_llama.py
Normal file
226
tools/xserv_vs_llama.py
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Single-stream decode-speed comparison: xserv vs llama.cpp on the same GPUs.
|
||||||
|
|
||||||
|
Runs each server one at a time (drains VRAM between), streams identical prompts
|
||||||
|
through /v1/chat/completions, and reports median TTFT / TPOT / throughput. Both
|
||||||
|
servers are OpenAI-compatible, so the same streaming client drives both.
|
||||||
|
|
||||||
|
Run ON the GPU box:
|
||||||
|
|
||||||
|
python3 tools/xserv_vs_llama.py \
|
||||||
|
--xserv-model /opt/wjh/models/gpt-oss-20b-fp8 --xserv-tp 2 \
|
||||||
|
--llama-gguf /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-mxfp4.gguf \
|
||||||
|
--llama-tp 2 --gpus 0,1 --reps 6 --max-tokens 256
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Directory containing this script; the server binaries are located relative
# to its parent directory.
SCRIPT_DIR = Path(__file__).parent
XSERV_BIN = SCRIPT_DIR.parent.joinpath("target", "release", "xserv-server")
LLAMA_BIN = SCRIPT_DIR.parent.joinpath(
    "third_party", "llama.cpp", "build", "bin", "llama-server"
)

# Benchmark prompts keyed by a size label. Iteration order (short, medium,
# long) is the order in which results are measured and reported.
PROMPTS = {
    "short": "What is the capital of France? Answer in one sentence.",
    "medium": (
        "Explain how backpropagation trains a neural network, covering the "
        "forward pass, the chain rule, gradient descent, and weight updates."
    ),
    # The two adjacent literals are joined first, so the whole claim sentence
    # is repeated six times before the closing instruction is appended.
    "long": (
        "Summarize, then critique, the following claim in detail: modern large "
        "language models understand language the way humans do. " * 6
        + "Give a structured, multi-paragraph response."
    ),
}
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_max_mem_mb(gpus):
    """Return the largest ``memory.used`` value among the given GPU indices.

    Runs ``nvidia-smi --query-gpu=index,memory.used`` in unit-less CSV mode
    and parses one ``index, used`` row per GPU.

    Args:
        gpus: Iterable of integer GPU indices to inspect.

    Returns:
        The maximum reported ``memory.used`` figure (an int, in whatever unit
        nvidia-smi emits with ``nounits``) over ``gpus``. An index that
        nvidia-smi does not report counts as 0. An empty ``gpus`` yields 0
        without invoking nvidia-smi.

    Raises:
        subprocess.CalledProcessError: nvidia-smi exited non-zero.
        FileNotFoundError: nvidia-smi is not on PATH.
    """
    gpus = list(gpus)
    if not gpus:
        # max() of an empty sequence would raise ValueError; there is simply
        # nothing to measure.
        return 0

    report = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=index,memory.used", "--format=csv,noheader,nounits"],
        text=True)

    used_by_index = {}
    for row in report.strip().splitlines():
        index, used = row.split(",")
        # int() tolerates the space nvidia-smi puts after the comma.
        used_by_index[int(index)] = int(used)

    return max(used_by_index.get(g, 0) for g in gpus)
|
||||||
|
|
||||||
|
|
||||||
|
def drain(gpus, below_mb=2000, timeout=120):
    """Wait until every listed GPU reports less than ``below_mb`` memory used.

    Polls ``gpu_max_mem_mb`` every 2 seconds so that the next server is not
    launched while the previous one still holds VRAM.

    Args:
        gpus: GPU indices to watch.
        below_mb: Threshold the maximum used memory must fall below.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True once the threshold is met, False if ``timeout`` elapsed first.
        A warning is printed in the timeout case, since results measured on
        GPUs that never drained are suspect.
    """
    # Monotonic clock: immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if gpu_max_mem_mb(gpus) < below_mb:
            return True
        time.sleep(2)
    print(f"  WARNING: GPUs {gpus} did not drain below {below_mb} MB "
          f"within {timeout}s", flush=True)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def start(cmd, gpus, log_path):
    """Launch a server process pinned to the given GPUs, logging to a file.

    Args:
        cmd: Argument list passed to ``subprocess.Popen``.
        gpus: GPU indices exported to the child as ``CUDA_VISIBLE_DEVICES``.
        log_path: File that receives the child's stdout and stderr; it is
            truncated on each call.

    Returns:
        The ``subprocess.Popen`` handle. The child runs in its own session
        (``start_new_session=True``), so it leads its own process group.
    """
    env = dict(os.environ)
    env["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpus)
    # The child receives its own copy of the log descriptor when it is
    # spawned, so the parent's handle can be closed as soon as Popen returns
    # (or raises) instead of leaking for the lifetime of the script.
    with open(log_path, "wb") as logf:
        return subprocess.Popen(cmd, stdout=logf, stderr=subprocess.STDOUT,
                                env=env, start_new_session=True)
|
||||||
|
|
||||||
|
|
||||||
|
def stop(p, gpus):
    """Terminate a server started by ``start`` and wait for its VRAM to free.

    If the process is still running, its whole process group receives
    SIGTERM; if it has not exited after 30 seconds the group receives
    SIGKILL and the process is waited on again so it is reaped before the
    GPUs are polled. ``drain(gpus)`` is then called regardless of whether the
    process was still alive.

    Args:
        p: ``subprocess.Popen`` handle of the server.
        gpus: GPU indices passed on to ``drain``.
    """
    if p.poll() is None:
        try:
            os.killpg(os.getpgid(p.pid), signal.SIGTERM)
        except ProcessLookupError:
            # The process exited between poll() and the signal.
            pass
        try:
            p.wait(timeout=30)
        except subprocess.TimeoutExpired:
            try:
                os.killpg(os.getpgid(p.pid), signal.SIGKILL)
            except ProcessLookupError:
                pass
            # Reap the killed child so it does not linger as a zombie; the
            # wait is bounded in case the process is stuck and cannot die.
            try:
                p.wait(timeout=30)
            except subprocess.TimeoutExpired:
                pass
    drain(gpus)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_ready(base, model_id, timeout=900):
    """Poll a server with a one-token chat request until it answers.

    A non-streaming ``/v1/chat/completions`` request is sent repeatedly. The
    server counts as ready once a response with status 200 arrives and its
    body parses as JSON. Any exception raised by an attempt is swallowed and
    followed by a 3-second pause before the next try.

    Args:
        base: Server base URL, without a trailing slash.
        model_id: Value sent in the request's ``model`` field.
        timeout: Total number of seconds to keep trying.

    Returns:
        True when the server answered successfully, False on timeout.
    """
    probe = {
        "model": model_id,
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 1,
        "temperature": 0.0,
        "stream": False,
    }
    payload = json.dumps(probe).encode()
    started = time.time()
    while time.time() - started < timeout:
        try:
            request = urllib.request.Request(
                base + "/v1/chat/completions",
                data=payload,
                headers={"Content-Type": "application/json"},
            )
            with urllib.request.urlopen(request, timeout=120) as reply:
                if reply.status == 200:
                    # Parsing validates that the body really is JSON.
                    json.loads(reply.read())
                    return True
        except Exception:
            # Not up yet (connection refused, model still loading, ...).
            time.sleep(3)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def stream_chat(base, model_id, user, max_tokens):
    """Stream one chat completion and time its decode steps.

    Sends a single-user-message request with ``stream=True`` and
    ``temperature=0.0`` to ``/v1/chat/completions`` and reads the
    server-sent-event lines until ``[DONE]`` or end of stream.

    Args:
        base: Server base URL, without a trailing slash.
        model_id: Value sent in the request's ``model`` field.
        user: Content of the user message.
        max_tokens: ``max_tokens`` value sent to the server.

    Returns:
        A ``(ttft, tpot, n)`` tuple:
        ``ttft`` - seconds from sending the request to the first chunk that
        carries text (or to end of stream if none did);
        ``tpot`` - mean seconds between text-carrying chunks after the first,
        0.0 when fewer than two arrived;
        ``n`` - number of text-carrying chunks. This counts streamed deltas,
        not tokenizer tokens.

    Raises:
        urllib.error.URLError: The request failed or the server returned an
            HTTP error status.
    """
    body = json.dumps({"model": model_id, "messages": [{"role": "user", "content": user}],
                       "max_tokens": max_tokens, "temperature": 0.0, "stream": True}).encode()
    req = urllib.request.Request(base + "/v1/chat/completions", data=body,
                                 headers={"Content-Type": "application/json"})
    t0 = time.perf_counter()
    ttft = None
    t_last = t0
    n = 0
    with urllib.request.urlopen(req, timeout=300) as resp:
        for raw in resp:
            line = raw.decode("utf-8", "ignore").strip()
            if not line.startswith("data:"):
                continue
            data = line[5:].strip()
            if data == "[DONE]":
                break
            try:
                obj = json.loads(data)
            except json.JSONDecodeError:
                continue
            # Some chunks carry no choices at all (e.g. a trailing usage-only
            # chunk); skip them instead of crashing on choices[0].
            choices = obj.get("choices") if isinstance(obj, dict) else None
            if not choices:
                continue
            delta = choices[0].get("delta") or {}
            # gpt-oss reasoning models split CoT into reasoning_content (llama.cpp)
            # vs raw harmony in content (xserv); count BOTH as real decode steps.
            piece = delta.get("content") or delta.get("reasoning_content")
            if piece:
                now = time.perf_counter()
                if ttft is None:
                    ttft = now - t0
                n += 1
                t_last = now
    ttft = ttft if ttft is not None else (time.perf_counter() - t0)
    # (t_last - t0 - ttft) is the time between the first and last text chunk.
    tpot = (t_last - t0 - ttft) / (n - 1) if n > 1 else 0.0
    return ttft, tpot, n
|
||||||
|
|
||||||
|
|
||||||
|
def median(xs):
    """Return the median of ``xs``, or 0.0 when ``xs`` is empty.

    For an odd number of values this is the middle element of the sorted
    data. For an even number it is the mean of the two middle elements, so
    the result is not biased toward the larger of the two.
    """
    ordered = sorted(xs)
    if not ordered:
        return 0.0
    mid = len(ordered) // 2
    if len(ordered) % 2:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2
|
||||||
|
|
||||||
|
|
||||||
|
def bench(base, model_id, reps, max_tokens):
    """Benchmark every prompt in ``PROMPTS`` against one running server.

    Two short warm-up requests are issued first and discarded. Each prompt is
    then streamed ``reps`` times and summarised; one result line per prompt is
    printed as soon as it is available.

    Args:
        base: Server base URL, without a trailing slash.
        model_id: Value sent in each request's ``model`` field.
        reps: Number of measured requests per prompt.
        max_tokens: ``max_tokens`` value for the measured requests.

    Returns:
        Dict mapping prompt name to a dict with ``ttft_ms`` (median time to
        first chunk), ``tpot_ms`` (median time per subsequent chunk),
        ``tok_s`` (1000 / ``tpot_ms``, or 0.0 if no TPOT was measured) and
        ``mean_tok`` (mean chunk count per request, 0.0 when ``reps`` is 0).
    """
    # warmup
    for _ in range(2):
        stream_chat(base, model_id, PROMPTS["short"], 16)

    out = {}
    for name, prompt in PROMPTS.items():
        ttfts, tpots, toks = [], [], []
        for _ in range(reps):
            ttft, tpot, n = stream_chat(base, model_id, prompt, max_tokens)
            ttfts.append(ttft * 1000)
            # A zero TPOT means fewer than two chunks arrived; leaving it in
            # would drag the median toward a meaningless 0.
            if tpot > 0:
                tpots.append(tpot * 1000)
            toks.append(n)

        tpot_ms = median(tpots)
        stats = {
            "ttft_ms": median(ttfts), "tpot_ms": tpot_ms,
            "tok_s": 1000.0 / tpot_ms if tpot_ms > 0 else 0.0,
            # Guard the mean so reps=0 yields zeros instead of ZeroDivisionError.
            "mean_tok": sum(toks) / len(toks) if toks else 0.0,
        }
        out[name] = stats
        print(f" {name:7s} ttft={stats['ttft_ms']:7.1f}ms tpot={stats['tpot_ms']:6.2f}ms "
              f"{stats['tok_s']:6.1f} tok/s (n={stats['mean_tok']:.0f})", flush=True)
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Benchmark xserv, then llama.cpp, and print a side-by-side summary.

    Each server is started on the same port and GPUs, waited on until it
    answers, benchmarked with ``bench``, and stopped (which also drains VRAM)
    before the next one starts. If a server never becomes ready, the tail of
    its log is printed and its results are omitted. The raw results are also
    written as JSON to a timestamped file under ``/tmp``.

    ``--ctx`` is passed only to llama-server (``-c``); xserv is launched with
    a fixed ``--max-seq-len 2048`` and ``--max-batch 8``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--xserv-model", required=True)
    parser.add_argument("--xserv-tp", type=int, default=2)
    parser.add_argument("--llama-gguf", required=True)
    parser.add_argument("--llama-tp", type=int, default=2)
    parser.add_argument("--gpus", default="0,1")
    parser.add_argument("--reps", type=int, default=6)
    parser.add_argument("--max-tokens", type=int, default=256)
    parser.add_argument("--port", type=int, default=18080)
    parser.add_argument("--ctx", type=int, default=4096)
    args = parser.parse_args()

    gpus = [int(token) for token in args.gpus.split(",")]
    base = f"http://127.0.0.1:{args.port}"
    results = {}

    # ---- xserv ----
    xserv_id = Path(args.xserv_model).name
    xserv_cmd = [
        str(XSERV_BIN), str(args.xserv_model),
        "--port", str(args.port),
        "--tp", str(args.xserv_tp),
        "--max-seq-len", "2048",
        "--max-batch", "8",
    ]
    print(f"=== xserv ({xid}, tp={args.xserv_tp}, gpus={gpus}) ===".replace("{xid}", xserv_id)
          if False else f"=== xserv ({xserv_id}, tp={args.xserv_tp}, gpus={gpus}) ===",
          flush=True)
    proc = start(xserv_cmd, gpus, "/tmp/cmp_xserv.log")
    try:
        if wait_ready(base, xserv_id):
            results["xserv"] = bench(base, xserv_id, args.reps, args.max_tokens)
        else:
            log_tail = subprocess.run(["tail", "-20", "/tmp/cmp_xserv.log"],
                                      capture_output=True, text=True)
            print(" xserv NOT READY:", log_tail.stdout)
    finally:
        stop(proc, gpus)

    # ---- llama.cpp ----
    llama_cmd = [
        str(LLAMA_BIN), "-m", str(args.llama_gguf),
        "--port", str(args.port),
        "--host", "127.0.0.1",
        "-c", str(args.ctx),
        "-ngl", "99",
        "--parallel", "1",
    ]
    if args.llama_tp > 1:
        llama_cmd.extend(["--split-mode", "row"])
    print(f"\n=== llama.cpp ({Path(args.llama_gguf).name}, tp={args.llama_tp}, gpus={gpus}) ===", flush=True)
    proc = start(llama_cmd, gpus, "/tmp/cmp_llama.log")
    try:
        # llama-server accepts any model field
        if wait_ready(base, "gpt-oss", timeout=300):
            results["llama"] = bench(base, "gpt-oss", args.reps, args.max_tokens)
        else:
            log_tail = subprocess.run(["tail", "-30", "/tmp/cmp_llama.log"],
                                      capture_output=True, text=True)
            print(" llama NOT READY:", log_tail.stdout)
    finally:
        stop(proc, gpus)

    # ---- summary ----
    print(f"\n{'='*70}\n SUMMARY — single-stream decode (gpt-oss-20b)\n{'='*70}")
    print(f"{'prompt':8s} {'metric':10s} {'xserv-FP8':>12s} {'llama':>12s} {'ratio':>8s}")
    xserv_results = results.get("xserv", {})
    llama_results = results.get("llama", {})
    metrics = [("ttft_ms", "TTFT ms"), ("tpot_ms", "TPOT ms"), ("tok_s", "tok/s")]
    for name in PROMPTS:
        xserv_row = xserv_results.get(name)
        llama_row = llama_results.get(name)
        # Only prompts measured on both servers can be compared.
        if not (xserv_row and llama_row):
            continue
        for key, lab in metrics:
            xv = xserv_row[key]
            lv = llama_row[key]
            # Ratio is llama / xserv for every metric; 0 when xserv's value is 0.
            ratio = (lv / xv) if xv else 0
            print(f"{name:8s} {lab:10s} {xv:12.2f} {lv:12.2f} {ratio:7.2f}x")

    with open(f"/tmp/xserv_vs_llama_{int(time.time())}.json", "w") as f:
        json.dump(results, f, indent=2)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user