tools: single-stream decode benchmark vs llama.cpp

xserv_vs_llama.py runs each server one at a time on the same GPUs (drains VRAM between), streams identical prompts through /v1/chat/completions, and reports median TTFT/TPOT/throughput. Counts llama's reasoning_content as real decode tokens so the gpt-oss CoT is measured fairly. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-12 15:01:42 +08:00
parent d33220498a
commit cf1e9e41db
1 changed files with 226 additions and 0 deletions
--- a/tools/xserv_vs_llama.py
+++ b/tools/xserv_vs_llama.py
@@ -0,0 +1,226 @@
 #!/usr/bin/env python3
 """Single-stream decode-speed comparison: xserv vs llama.cpp on the same GPUs.
 Runs each server one at a time (drains VRAM between), streams identical prompts
 through /v1/chat/completions, and reports median TTFT / TPOT / throughput. Both
 servers are OpenAI-compatible, so the same streaming client drives both.
 Run ON the GPU box:
    python3 tools/xserv_vs_llama.py \
        --xserv-model /opt/wjh/models/gpt-oss-20b-fp8 --xserv-tp 2 \
        --llama-gguf /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-mxfp4.gguf \
        --llama-tp 2 --gpus 0,1 --reps 6 --max-tokens 256
 """
 import argparse
 import json
 import os
 import signal
 import subprocess
 import time
 import urllib.request
 from pathlib import Path
 # Server binaries are located relative to this script (one directory above
 # it), so the tool can be launched from any working directory.
 SCRIPT_DIR = Path(__file__).parent
 XSERV_BIN = SCRIPT_DIR.parent.joinpath("target", "release", "xserv-server")
 LLAMA_BIN = SCRIPT_DIR.parent.joinpath(
     "third_party", "llama.cpp", "build", "bin", "llama-server")
 # Benchmark prompts keyed by size class; they are benchmarked in this order.
 # The "long" prompt repeats one claim six times to lengthen the prefill.
 PROMPTS = {
     "short": "What is the capital of France? Answer in one sentence.",
     "medium": (
         "Explain how backpropagation trains a neural network, covering the forward "
         "pass, the chain rule, gradient descent, and weight updates."
     ),
     "long": "".join(
         ["Summarize, then critique, the following claim in detail: modern large "
          "language models understand language the way humans do. "] * 6
         + ["Give a structured, multi-paragraph response."]
     ),
 }
 def gpu_max_mem_mb(gpus):
     """Return the largest `memory.used` reading among the GPU indices in `gpus`.
     The readings come from `nvidia-smi` in unit-less CSV form (presumably MiB,
     per the function name -- confirm against the nvidia-smi docs). An index that
     nvidia-smi does not list counts as 0. Raises ValueError when `gpus` is
     empty and subprocess.CalledProcessError when nvidia-smi fails.
     """
     query = ["nvidia-smi", "--query-gpu=index,memory.used",
              "--format=csv,noheader,nounits"]
     report = subprocess.check_output(query, text=True)
     used_by_index = {}
     for row in report.strip().splitlines():
         # Each row looks like "<index>, <used>"; int() tolerates the padding.
         index, mem = row.split(",")
         used_by_index[int(index)] = int(mem)
     return max(used_by_index.get(gpu, 0) for gpu in gpus)
 def drain(gpus, below_mb=2000, timeout=120):
     """Wait until every GPU in `gpus` reports less than `below_mb` of used memory.
     Polls gpu_max_mem_mb() every 2 seconds for at most `timeout` seconds.
     Returns:
         True once usage is below the threshold, False if the timeout expired
         first. On timeout a warning is printed, because a server started
         afterwards would share the GPUs with leftover allocations.
     """
     # Monotonic clock: an NTP/wall-clock adjustment must not stretch or cut the wait.
     deadline = time.monotonic() + timeout
     while time.monotonic() < deadline:
         if gpu_max_mem_mb(gpus) < below_mb:
             return True
         time.sleep(2)
     print(f"  WARNING: GPUs {list(gpus)} still above {below_mb} MB after {timeout}s; "
           "following measurements may be affected", flush=True)
     return False
 def start(cmd, gpus, log_path):
     """Launch `cmd` as a server process restricted to `gpus`.
     Args:
         cmd: argv list passed to subprocess.Popen.
         gpus: GPU indices exported to the child as CUDA_VISIBLE_DEVICES.
         log_path: file that receives the child's stdout and stderr (truncated).
     Returns:
         The subprocess.Popen handle. The child runs in its own session, so
         its whole process group can be signalled later.
     """
     env = dict(os.environ)
     env["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpus)
     # The child gets its own duplicate of the descriptor, so the parent's copy
     # can be closed as soon as Popen returns (also when Popen raises).
     with open(log_path, "wb") as logf:
         return subprocess.Popen(cmd, stdout=logf, stderr=subprocess.STDOUT,
                                 env=env, start_new_session=True)
 def _signal_group(p, sig):
     """Send `sig` to the process group of `p`; a group that is already gone is fine."""
     try:
         os.killpg(os.getpgid(p.pid), sig)
     except ProcessLookupError:
         pass
 def stop(p, gpus):
     """Shut down the server started by start() and wait for its VRAM to be released.
     Sends SIGTERM to the process group if the process is still running, waits
     up to 30 s, escalates to SIGKILL on timeout, and finally calls drain(gpus).
     """
     if p.poll() is None:
         _signal_group(p, signal.SIGTERM)
     try:
         p.wait(timeout=30)
     except subprocess.TimeoutExpired:
         _signal_group(p, signal.SIGKILL)
         # Reap the killed child so it does not linger as a zombie and so that
         # drain() starts only after the process is really gone.
         try:
             p.wait(timeout=30)
         except subprocess.TimeoutExpired:
             print(f"  WARNING: pid {p.pid} did not exit after SIGKILL", flush=True)
     drain(gpus)
 def wait_ready(base, model_id, timeout=900):
     """Poll the server until a 1-token chat completion succeeds.
     Args:
         base: server root URL, e.g. "http://127.0.0.1:18080".
         model_id: value sent in the request's "model" field.
         timeout: overall budget in seconds (a single probe may take up to 120 s).
     Returns:
         True as soon as a probe returns HTTP 200 with a JSON body, False if
         the budget runs out first.
     """
     payload = json.dumps({"model": model_id, "messages": [{"role": "user", "content": "hi"}],
                           "max_tokens": 1, "temperature": 0.0, "stream": False}).encode()
     url = base + "/v1/chat/completions"
     # Monotonic clock so wall-clock adjustments cannot distort the budget.
     deadline = time.monotonic() + timeout
     while time.monotonic() < deadline:
         try:
             req = urllib.request.Request(url, data=payload,
                                          headers={"Content-Type": "application/json"})
             with urllib.request.urlopen(req, timeout=120) as r:
                 if r.status == 200:
                     json.loads(r.read())
                     return True
         except Exception:
             # Deliberately broad: refused connections while the model loads,
             # HTTP errors and malformed bodies all just mean "not ready yet".
             pass
         # Pause after every failed probe (not only after exceptions) so an
         # unexpected non-200 success status cannot turn this into a busy loop.
         time.sleep(3)
     return False
 def stream_chat(base, model_id, user, max_tokens):
     """Stream one chat completion and measure its latency.
     Sends `user` as a single user message (temperature 0, capped at
     `max_tokens`) to `base`/v1/chat/completions and reads the SSE stream.
     Returns:
         (ttft, tpot, n): ttft is the seconds from just before the request is
         sent to the first non-empty chunk, tpot is the mean seconds between
         the following chunks (0.0 when fewer than two arrived), and n is the
         number of non-empty chunks. n equals the token count only if the
         server emits one token per chunk. When no chunk arrives, ttft is the
         total elapsed time.
     Raises:
         RuntimeError: the stream carried an error object instead of choices.
     """
     body = json.dumps({"model": model_id, "messages": [{"role": "user", "content": user}],
                        "max_tokens": max_tokens, "temperature": 0.0, "stream": True}).encode()
     req = urllib.request.Request(base + "/v1/chat/completions", data=body,
                                  headers={"Content-Type": "application/json"})
     t0 = time.perf_counter()
     t_first = None
     t_last = t0
     n = 0
     with urllib.request.urlopen(req, timeout=300) as resp:
         for raw in resp:
             line = raw.decode("utf-8", "ignore").strip()
             if not line.startswith("data:"):
                 continue
             data = line[5:].strip()
             if data == "[DONE]":
                 break
             try:
                 obj = json.loads(data)
             except json.JSONDecodeError:
                 continue
             if not isinstance(obj, dict):
                 continue
             if "error" in obj:
                 # Fail loudly instead of reporting timings for a broken run.
                 raise RuntimeError(f"server reported an error mid-stream: {obj['error']}")
             # Some chunks carry no choices at all (e.g. a trailing usage-only
             # chunk); they are not decode steps, so skip them.
             choices = obj.get("choices")
             if not choices:
                 continue
             delta = choices[0].get("delta") or {}
             # gpt-oss reasoning models split CoT into reasoning_content (llama.cpp)
             # vs raw harmony in content (xserv); count BOTH as real decode steps.
             piece = delta.get("content") or delta.get("reasoning_content")
             if piece:
                 now = time.perf_counter()
                 if t_first is None:
                     t_first = now
                 n += 1
                 t_last = now
     if t_first is None:
         return time.perf_counter() - t0, 0.0, n
     # Decode time spans first chunk -> last chunk, i.e. n - 1 intervals.
     tpot = (t_last - t_first) / (n - 1) if n > 1 else 0.0
     return t_first - t0, tpot, n
 def median(xs):
     """Return the median of `xs`, or 0.0 when `xs` is empty.
     For an even number of samples the two middle values are averaged, so the
     result is not biased towards the slower half of the measurements.
     """
     ordered = sorted(xs)
     if not ordered:
         return 0.0
     mid = len(ordered) // 2
     if len(ordered) % 2:
         return ordered[mid]
     return (ordered[mid - 1] + ordered[mid]) / 2
 def bench(base, model_id, reps, max_tokens):
     """Benchmark every prompt in PROMPTS against one running server.
     Issues two short warm-up requests, then `reps` streamed requests per prompt,
     and prints one summary line per prompt.
     Returns:
         dict mapping prompt name to {"ttft_ms", "tpot_ms", "tok_s", "mean_tok"}:
         median TTFT and TPOT in milliseconds, throughput derived from the
         median TPOT, and the mean number of counted chunks. Values are 0.0
         when there are no samples (e.g. `reps` is 0).
     """
     # Warm-up requests; their timings are discarded.
     for _ in range(2):
         stream_chat(base, model_id, PROMPTS["short"], 16)
     out = {}
     for name, prompt in PROMPTS.items():
         ttfts, tpots, toks = [], [], []
         for _ in range(reps):
             ttft, tpot, n = stream_chat(base, model_id, prompt, max_tokens)
             ttfts.append(ttft * 1000)
             # tpot is 0.0 when fewer than two chunks arrived; such runs carry
             # no decode-speed information and would drag the median down.
             if tpot > 0:
                 tpots.append(tpot * 1000)
             toks.append(n)
         tpot_ms = median(tpots)
         row = {
             "ttft_ms": median(ttfts), "tpot_ms": tpot_ms,
             "tok_s": 1000.0 / tpot_ms if tpot_ms > 0 else 0.0,
             # Guard the mean so reps=0 reports zeros instead of dividing by zero.
             "mean_tok": sum(toks) / len(toks) if toks else 0.0,
         }
         out[name] = row
         print(f"    {name:7s} ttft={row['ttft_ms']:7.1f}ms  tpot={row['tpot_ms']:6.2f}ms  "
               f"{row['tok_s']:6.1f} tok/s  (n={row['mean_tok']:.0f})", flush=True)
     return out
 def _run_server(label, cmd, gpus, log_path, base, model_id, args, ready_timeout, tail_lines):
     """Launch one server, benchmark it once it is ready, and always stop it afterwards.
     Returns bench()'s result dict, or None when the server never became ready
     (the last `tail_lines` lines of its log are printed in that case).
     """
     p = start(cmd, gpus, log_path)
     try:
         if wait_ready(base, model_id, timeout=ready_timeout):
             return bench(base, model_id, args.reps, args.max_tokens)
         tail = subprocess.run(["tail", f"-{tail_lines}", log_path],
                               capture_output=True, text=True).stdout
         print(f"  {label} NOT READY:", tail)
         return None
     finally:
         stop(p, gpus)
 def main():
     """Benchmark xserv, then llama.cpp, on the same GPUs and print a comparison.
     The servers run one after the other on the same port. The collected
     results are also written as JSON to /tmp/xserv_vs_llama_<unix time>.json.
     """
     ap = argparse.ArgumentParser()
     ap.add_argument("--xserv-model", required=True)
     ap.add_argument("--xserv-tp", type=int, default=2)
     ap.add_argument("--llama-gguf", required=True)
     ap.add_argument("--llama-tp", type=int, default=2)
     ap.add_argument("--gpus", default="0,1", help="comma-separated GPU indices")
     ap.add_argument("--reps", type=int, default=6)
     ap.add_argument("--max-tokens", type=int, default=256)
     ap.add_argument("--port", type=int, default=18080)
     ap.add_argument("--ctx", type=int, default=4096,
                     help="context size passed to llama-server (-c)")
     args = ap.parse_args()
     # Skip empty items so a trailing comma ("0,1,") does not crash int().
     gpus = [int(g) for g in args.gpus.split(",") if g.strip()]
     base = f"http://127.0.0.1:{args.port}"
     results = {}
     # ---- xserv ----
     xid = Path(args.xserv_model).name
     xcmd = [str(XSERV_BIN), str(args.xserv_model), "--port", str(args.port),
             "--tp", str(args.xserv_tp), "--max-seq-len", "2048", "--max-batch", "8"]
     print(f"=== xserv ({xid}, tp={args.xserv_tp}, gpus={gpus}) ===", flush=True)
     res = _run_server("xserv", xcmd, gpus, "/tmp/cmp_xserv.log", base, xid, args, 900, 20)
     if res is not None:
         results["xserv"] = res
     # ---- llama.cpp ----
     lcmd = [str(LLAMA_BIN), "-m", str(args.llama_gguf), "--port", str(args.port),
             "--host", "127.0.0.1", "-c", str(args.ctx), "-ngl", "99", "--parallel", "1"]
     if args.llama_tp > 1:
         lcmd += ["--split-mode", "row"]
     print(f"\n=== llama.cpp ({Path(args.llama_gguf).name}, tp={args.llama_tp}, gpus={gpus}) ===", flush=True)
     # llama-server accepts any model field
     res = _run_server("llama", lcmd, gpus, "/tmp/cmp_llama.log", base, "gpt-oss", args, 300, 30)
     if res is not None:
         results["llama"] = res
     # ---- summary ----
     print(f"\n{'='*70}\n  SUMMARY — single-stream decode (gpt-oss-20b)\n{'='*70}")
     print(f"{'prompt':8s} {'metric':10s} {'xserv-FP8':>12s} {'llama':>12s} {'ratio':>8s}")
     missing = [backend for backend in ("xserv", "llama") if backend not in results]
     if missing:
         # Explain an otherwise empty table instead of leaving it unexplained.
         print(f"  (no comparison: no results for {', '.join(missing)})")
     for name in PROMPTS:
         xrow = results.get("xserv", {}).get(name)
         lrow = results.get("llama", {}).get(name)
         if not xrow or not lrow:
             continue
         for key, lab in [("ttft_ms", "TTFT ms"), ("tpot_ms", "TPOT ms"), ("tok_s", "tok/s")]:
             xv, lv = xrow[key], lrow[key]
             # ratio is llama / xserv: above 1 favours xserv for the latency
             # rows (TTFT, TPOT) and favours llama for the tok/s row.
             ratio = (lv / xv) if xv else 0
             print(f"{name:8s} {lab:10s} {xv:12.2f} {lv:12.2f} {ratio:7.2f}x")
     out_path = f"/tmp/xserv_vs_llama_{int(time.time())}.json"
     with open(out_path, "w") as f:
         json.dump(results, f, indent=2)
     print(f"\nresults written to {out_path}")
 # Run the comparison only when executed as a script, not when imported.
 if __name__ == "__main__":
     main()