#!/usr/bin/env python3
"""Single-stream decode-speed comparison: xserv vs llama.cpp on the same GPUs.

Runs each server one at a time (drains VRAM between), streams identical prompts
through /v1/chat/completions, and reports median TTFT / TPOT / throughput. Both
servers are OpenAI-compatible, so the same streaming client drives both.

Run ON the GPU box:

    python3 tools/xserv_vs_llama.py \
        --xserv-model /opt/wjh/models/gpt-oss-20b-fp8 --xserv-tp 2 \
        --llama-gguf /opt/wjh/models/gpt-oss-20b-gguf/gpt-oss-20b-mxfp4.gguf \
        --llama-tp 2 --gpus 0,1 --reps 6 --max-tokens 256
"""

import argparse
import http.client
import json
import os
import signal
import statistics
import subprocess
import time
import urllib.request
from pathlib import Path

# resolve() keeps the binary paths correct even when __file__ is relative
# (Python < 3.9 run from inside tools/ would otherwise yield "." for both
# SCRIPT_DIR and SCRIPT_DIR.parent).
SCRIPT_DIR = Path(__file__).resolve().parent
XSERV_BIN = SCRIPT_DIR.parent / "target" / "release" / "xserv-server"
LLAMA_BIN = SCRIPT_DIR.parent / "third_party" / "llama.cpp" / "build" / "bin" / "llama-server"

# Prompt name -> user message. The "long" prompt repeats its first two
# (implicitly concatenated) sentences six times before the final instruction.
PROMPTS = {
    "short": "What is the capital of France? Answer in one sentence.",
    "medium": ("Explain how backpropagation trains a neural network, covering the "
               "forward pass, the chain rule, gradient descent, and weight updates."),
    "long": ("Summarize, then critique, the following claim in detail: modern large "
             "language models understand language the way humans do. " * 6 +
             "Give a structured, multi-paragraph response."),
}


def gpu_max_mem_mb(gpus):
    """Return the largest `memory.used` value (MiB) among the given GPU indices.

    Queries `nvidia-smi`; indices it does not report count as 0, and an empty
    `gpus` list yields 0.

    Raises:
        subprocess.CalledProcessError / FileNotFoundError: nvidia-smi failed or
            is not installed.
    """
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=index,memory.used", "--format=csv,noheader,nounits"],
        text=True)
    used = {}
    for row in out.strip().splitlines():
        index, mem = row.split(",")
        used[int(index)] = int(mem)
    return max((used.get(g, 0) for g in gpus), default=0)


def drain(gpus, below_mb=2000, timeout=120):
    """Poll every 2 s until every GPU in `gpus` uses less than `below_mb` MiB.

    Returns:
        True once memory dropped below the threshold, False if `timeout`
        seconds elapsed first.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if gpu_max_mem_mb(gpus) < below_mb:
            return True
        time.sleep(2)
    return False


def start(cmd, gpus, log_path):
    """Launch `cmd` in its own session, restricted to `gpus`, logging to `log_path`.

    stdout and stderr are both redirected into the log file. Returns the
    `subprocess.Popen` handle.
    """
    env = dict(os.environ)
    env["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in gpus)
    # gpu_max_mem_mb() matches `gpus` against nvidia-smi indices (PCI bus
    # order); make CUDA enumerate the same way unless the caller chose
    # otherwise, so the GPUs we drain are the GPUs the server runs on.
    env.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID")
    # The child gets its own copy of the descriptor, so the parent's handle can
    # be closed as soon as Popen returns instead of leaking until exit.
    with open(log_path, "wb") as logf:
        return subprocess.Popen(cmd, stdout=logf, stderr=subprocess.STDOUT,
                                env=env, start_new_session=True)


def _signal_group(p, sig):
    """Send `sig` to the process group of `p`; ignore an already-gone process."""
    try:
        os.killpg(os.getpgid(p.pid), sig)
    except ProcessLookupError:
        pass


def stop(p, gpus):
    """Terminate the server started by `start()` and wait for its VRAM to free.

    Sends SIGTERM to the whole process group, escalates to SIGKILL after 30 s,
    reaps the child, then drains the GPUs.
    """
    if p.poll() is None:
        _signal_group(p, signal.SIGTERM)
        try:
            p.wait(timeout=30)
        except subprocess.TimeoutExpired:
            _signal_group(p, signal.SIGKILL)
            try:
                # Reap the child so it does not linger as a zombie; bounded in
                # case the process is stuck and cannot exit promptly.
                p.wait(timeout=30)
            except subprocess.TimeoutExpired:
                pass
    if not drain(gpus):
        print(" WARNING: GPU memory did not drain before timeout", flush=True)


def wait_ready(base, model_id, timeout=900, proc=None):
    """Poll the chat endpoint with a 1-token request until it answers HTTP 200.

    Args:
        base: server base URL, e.g. "http://127.0.0.1:18080".
        model_id: value sent in the request's "model" field.
        timeout: overall budget in seconds.
        proc: optional `subprocess.Popen` of the server; when given, polling
            stops as soon as that process has exited.

    Returns:
        True when a 200 response with a JSON body was received, else False.
    """
    body = json.dumps({"model": model_id, "messages": [{"role": "user", "content": "hi"}],
                       "max_tokens": 1, "temperature": 0.0, "stream": False}).encode()
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc is not None and proc.poll() is not None:
            return False  # server died; no point waiting out the timeout
        try:
            req = urllib.request.Request(base + "/v1/chat/completions", data=body,
                                         headers={"Content-Type": "application/json"})
            with urllib.request.urlopen(req, timeout=120) as r:
                if r.status == 200:
                    json.loads(r.read())
                    return True
        except (OSError, ValueError, http.client.HTTPException):
            # Connection refused / HTTP error / timeout / truncated or
            # non-JSON body: the server is not ready yet.
            pass
        time.sleep(3)
    return False


def stream_chat(base, model_id, user, max_tokens):
    """Stream one chat completion and time it.

    Returns:
        (ttft, tpot, n): seconds to the first non-empty delta, mean seconds
        between subsequent non-empty deltas (0.0 when fewer than two arrived),
        and the number of non-empty deltas. `n` counts streamed chunks, each
        treated as one decode step. If no delta arrives, `ttft` is the total
        request time.
    """
    body = json.dumps({"model": model_id, "messages": [{"role": "user", "content": user}],
                       "max_tokens": max_tokens, "temperature": 0.0, "stream": True}).encode()
    req = urllib.request.Request(base + "/v1/chat/completions", data=body,
                                 headers={"Content-Type": "application/json"})
    t0 = time.perf_counter()
    ttft = None
    t_last = t0
    n = 0
    with urllib.request.urlopen(req, timeout=300) as resp:
        for raw in resp:
            line = raw.decode("utf-8", "ignore").strip()
            if not line.startswith("data:"):
                continue
            data = line[5:].strip()
            if data == "[DONE]":
                break
            try:
                obj = json.loads(data)
            except json.JSONDecodeError:
                continue
            # Skip chunks without a usable choice (empty or missing "choices",
            # or a null "delta") instead of crashing on them.
            choices = obj.get("choices") if isinstance(obj, dict) else None
            if not choices:
                continue
            delta = choices[0].get("delta") or {}
            # gpt-oss reasoning models split CoT into reasoning_content (llama.cpp)
            # vs raw harmony in content (xserv); count BOTH as real decode steps.
            piece = delta.get("content") or delta.get("reasoning_content")
            if piece:
                now = time.perf_counter()
                if ttft is None:
                    ttft = now - t0
                n += 1
                t_last = now
    ttft = ttft if ttft is not None else (time.perf_counter() - t0)
    # (t_last - t0 - ttft) is the span from the first to the last delta.
    tpot = (t_last - t0 - ttft) / (n - 1) if n > 1 else 0.0
    return ttft, tpot, n


def median(xs):
    """Return the median of `xs`, or 0.0 when `xs` is empty.

    Even-length inputs yield the mean of the two middle values, so an even
    `--reps` is not biased toward the slower sample.
    """
    values = list(xs)
    return statistics.median(values) if values else 0.0


def bench(base, model_id, reps, max_tokens):
    """Benchmark every prompt in PROMPTS `reps` times after two warmup requests.

    Returns:
        dict mapping prompt name to {"ttft_ms", "tpot_ms", "tok_s", "mean_tok"}.
        Runs that produced fewer than two deltas are excluded from TPOT.
    """
    # warmup
    for _ in range(2):
        stream_chat(base, model_id, PROMPTS["short"], 16)
    out = {}
    for name, prompt in PROMPTS.items():
        ttfts, tpots, toks = [], [], []
        for _ in range(reps):
            ttft, tpot, n = stream_chat(base, model_id, prompt, max_tokens)
            ttfts.append(ttft * 1000)
            if tpot > 0:
                tpots.append(tpot * 1000)
            toks.append(n)
        tpot_ms = median(tpots)
        row = {
            "ttft_ms": median(ttfts),
            "tpot_ms": tpot_ms,
            "tok_s": 1000.0 / tpot_ms if tpot_ms > 0 else 0.0,
            # Guard against reps == 0 (no samples collected).
            "mean_tok": sum(toks) / len(toks) if toks else 0.0,
        }
        out[name] = row
        print(f" {name:7s} ttft={row['ttft_ms']:7.1f}ms tpot={row['tpot_ms']:6.2f}ms "
              f"{row['tok_s']:6.1f} tok/s (n={row['mean_tok']:.0f})", flush=True)
    return out


def _log_tail(log_path, lines):
    """Return the last `lines` lines of `log_path`, tolerating undecodable bytes."""
    try:
        data = Path(log_path).read_bytes()
    except OSError as exc:
        return f"<could not read {log_path}: {exc}>\n"
    text = data.decode("utf-8", "replace")
    return "".join(text.splitlines(keepends=True)[-lines:])


def main():
    """Parse arguments, benchmark xserv then llama.cpp, print and save results."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--xserv-model", required=True)
    ap.add_argument("--xserv-tp", type=int, default=2)
    ap.add_argument("--llama-gguf", required=True)
    ap.add_argument("--llama-tp", type=int, default=2)
    ap.add_argument("--gpus", default="0,1")
    ap.add_argument("--reps", type=int, default=6)
    ap.add_argument("--max-tokens", type=int, default=256)
    ap.add_argument("--port", type=int, default=18080)
    ap.add_argument("--ctx", type=int, default=4096)
    args = ap.parse_args()
    if args.reps < 1:
        ap.error("--reps must be >= 1")

    gpus = [int(g) for g in args.gpus.split(",")]
    base = f"http://127.0.0.1:{args.port}"
    results = {}

    # ---- xserv ----
    xid = Path(args.xserv_model).name
    # NOTE(review): --max-seq-len is fixed at 2048 here while llama.cpp gets
    # --ctx (default 4096); confirm whether the two should match.
    xcmd = [str(XSERV_BIN), str(args.xserv_model), "--port", str(args.port),
            "--tp", str(args.xserv_tp), "--max-seq-len", "2048", "--max-batch", "8"]
    print(f"=== xserv ({xid}, tp={args.xserv_tp}, gpus={gpus}) ===", flush=True)
    p = start(xcmd, gpus, "/tmp/cmp_xserv.log")
    try:
        if wait_ready(base, xid, proc=p):
            results["xserv"] = bench(base, xid, args.reps, args.max_tokens)
        else:
            print(" xserv NOT READY:", _log_tail("/tmp/cmp_xserv.log", 20))
    finally:
        stop(p, gpus)

    # ---- llama.cpp ----
    lcmd = [str(LLAMA_BIN), "-m", str(args.llama_gguf), "--port", str(args.port),
            "--host", "127.0.0.1", "-c", str(args.ctx), "-ngl", "99", "--parallel", "1"]
    if args.llama_tp > 1:
        lcmd += ["--split-mode", "row"]
    print(f"\n=== llama.cpp ({Path(args.llama_gguf).name}, tp={args.llama_tp}, gpus={gpus}) ===",
          flush=True)
    p = start(lcmd, gpus, "/tmp/cmp_llama.log")
    try:
        # llama-server accepts any model field
        if wait_ready(base, "gpt-oss", timeout=300, proc=p):
            results["llama"] = bench(base, "gpt-oss", args.reps, args.max_tokens)
        else:
            print(" llama NOT READY:", _log_tail("/tmp/cmp_llama.log", 30))
    finally:
        stop(p, gpus)

    # ---- summary ----
    print(f"\n{'='*70}\n SUMMARY — single-stream decode (gpt-oss-20b)\n{'='*70}")
    print(f"{'prompt':8s} {'metric':10s} {'xserv-FP8':>12s} {'llama':>12s} {'ratio':>8s}")
    for name in PROMPTS:
        xrow = results.get("xserv", {}).get(name)
        lrow = results.get("llama", {}).get(name)
        if not xrow or not lrow:
            continue
        for key, lab in [("ttft_ms", "TTFT ms"), ("tpot_ms", "TPOT ms"), ("tok_s", "tok/s")]:
            xv, lv = xrow[key], lrow[key]
            # Always llama / xserv, so the "good" direction differs per metric.
            ratio = (lv / xv) if xv else 0
            print(f"{name:8s} {lab:10s} {xv:12.2f} {lv:12.2f} {ratio:7.2f}x")
    print("ratio = llama / xserv (ms metrics: >1 means xserv is faster; "
          "tok/s: <1 means xserv is faster)")
    out_path = f"/tmp/xserv_vs_llama_{int(time.time())}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"results written to {out_path}")


if __name__ == "__main__":
    main()