fix: comprehensive review + 14 bug fixes + Phase 12/14 overhaul

Strict code review identified 30+ issues across correctness, performance, and architecture. This commit addresses 14 of them with verified fixes, restructures Phase 12 for honest continuous batching, and updates Phase 14 to target FA2 (RTX 5090 SM120 lacks TMEM required by FA4). Bug fixes: - FIX-01: Global cuBLAS handle (thread-local singleton, was per-call) - FIX-02: Remove 19 unnecessary cudaDeviceSynchronize calls from kernels - FIX-03: Qwen3 ChatML template (was plain text concatenation) - FIX-04: EOS token from tokenizer (was hardcoded 151645) - FIX-05: Storage tracks actual GPU device ordinal (was always Cuda(0)) - FIX-06: unsqueeze stride preserves contiguous layout - FIX-08: CudaDeviceProp replaced with heap buffer (was UB-prone padding) - FIX-09: Tokenizer byte_fallback to <0xNN> tokens (was panic) Feature additions: - FIX-10: SSE streaming (/v1/chat/completions, OpenAI-compatible) - FIX-11: Correct usage statistics (prompt/completion/total tokens) - FIX-13: Temperature / top-k / top-p sampling with SamplingParams Performance improvements: - FIX-07: Caching allocator wired up (thread-local pool, pooled flag) - FIX-12: KV cache staging buffers (zero-alloc get_kv_len via borrow_raw) - FIX-14: GPU strided copy kernel (eliminates contiguous() CPU round-trip) Architecture: - Phase 12 engine restructured: prefill/decode separation, honest TODO for batched GPU forward (requires Flash Attention) - Phase 14 updated: FA2 for SM120 (FA4 requires TMEM, absent on 5090) - Qwen3-7B → Qwen3-8B typo fixed across all docs (36 layers, hidden 4096) Validated on dash5 (8x RTX 5090): - 52/52 API prompts pass (EN/CN/code), SSE streaming verified - Logits match HF transformers 9/10 top-1, 4.0/5 avg top-5 overlap - 8 concurrent requests: 5.99x scheduling speedup (batch_size=4) - Throughput: 10.3 tok/s (serial), 30% of HF baseline Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-22 17:53:28 +08:00
parent d8493bd70f
commit ee68d3565d
38 changed files with 3012 additions and 259 deletions
--- a/tools/bench_vs_hf.py
+++ b/tools/bench_vs_hf.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
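+"""
+Benchmark xserv vs HuggingFace transformers on Qwen3-8B.
+
+Measures, per prompt: end-to-end request latency and completion-token
+throughput (completion tokens / wall-clock time). Prefill and decode are
+not timed separately, so the throughput figure includes prefill time.
+
+Usage:
+    # xserv server should be running on port 9090; the HF baseline loads
+    # the checkpoint from MODEL_DIR onto cuda:1.
+    python3 tools/bench_vs_hf.py
+"""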
+
+import json
+import os
+import time
+import urllib.request
+
+# Local checkpoint directory; bench_hf() passes it to from_pretrained() for
+# both the tokenizer and the model.
+MODEL_DIR = "/opt/wjh/models/qwen3-8b"
+# Base URL of the xserv server; bench_xserv() POSTs to
+# {XSERV_URL}/v1/chat/completions.
+XSERV_URL = "http://localhost:9090"
+
+# (category, prompt) pairs run against both backends. The category label is
+# what main() groups on when printing per-category averages. The token counts
+# in the section comments below are rough estimates, not measured values.
+BENCH_PROMPTS = [
+    # Short prompts (~10 tokens)
+    ("short", "What is gravity?"),
+    ("short", "Hello, how are you?"),
+    ("short", "Explain DNA briefly."),
+    # Medium prompts (~30 tokens)
+    ("medium", "Write a detailed explanation of how photosynthesis works in plants, including the light and dark reactions."),
+    ("medium", "Describe the process of machine learning training, including forward pass, loss computation, and backpropagation."),
+    ("medium", "Explain the differences between TCP and UDP protocols, including when you would use each one in practice."),
+    # Longer prompts (~60 tokens)
+    ("long", "You are an expert computer scientist. Please write a comprehensive explanation of how modern GPUs work, including the architecture of streaming multiprocessors, the memory hierarchy from registers to global memory, and how thousands of threads are scheduled concurrently. Include specific technical details."),
+    ("long", "You are a historian specializing in ancient civilizations. Please provide a detailed analysis of the rise and fall of the Roman Empire, covering the key factors that led to its expansion, the political and social structures that sustained it, and the multiple causes that contributed to its eventual decline and collapse."),
+]
+
+# Generation cap for every benchmark prompt: sent as "max_tokens" to xserv and
+# as max_new_tokens to HF generate().
+MAX_TOKENS = 64
+
+
+def bench_xserv():
+    """Benchmark xserv HTTP API."""
+    print("\n" + "=" * 60)
+    print("BENCHMARK: xserv (HTTP API, greedy, max_tokens={})".format(MAX_TOKENS))
+    print("=" * 60)
+
+    # Warmup
+    body = json.dumps({
+        "model": "qwen3-8b",
+        "messages": [{"role": "user", "content": "Hi"}],
+        "max_tokens": 8,
+        "temperature": 0.0,
+    }).encode()
+    req = urllib.request.Request(
+        f"{XSERV_URL}/v1/chat/completions",
+        data=body, headers={"Content-Type": "application/json"},
+    )
+    urllib.request.urlopen(req, timeout=120)
+    print("Warmup done.\n")
+
+    results = []
+    for category, prompt in BENCH_PROMPTS:
+        body = json.dumps({
+            "model": "qwen3-8b",
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": MAX_TOKENS,
+            "temperature": 0.0,
+        }).encode()
+        req = urllib.request.Request(
+            f"{XSERV_URL}/v1/chat/completions",
+            data=body, headers={"Content-Type": "application/json"},
+        )
+
+        t0 = time.perf_counter()
+        resp = urllib.request.urlopen(req, timeout=300)
+        elapsed = time.perf_counter() - t0
+        data = json.loads(resp.read())
+
+        usage = data.get("usage", {})
+        pt = usage.get("prompt_tokens", 0)
+        ct = usage.get("completion_tokens", 0)
+        tok_per_sec = ct / elapsed if elapsed > 0 else 0
+
+        print(f"  [{category:>6}] pt={pt:3d} ct={ct:2d} | {elapsed:6.2f}s | {tok_per_sec:5.1f} tok/s | {prompt[:50]}...")
+        results.append({
+            "category": category,
+            "prompt_tokens": pt,
+            "completion_tokens": ct,
+            "elapsed": elapsed,
+            "tok_per_sec": tok_per_sec,
+        })
+
+    # Summary
+    total_ct = sum(r["completion_tokens"] for r in results)
+    total_time = sum(r["elapsed"] for r in results)
+    avg_tok_per_sec = total_ct / total_time if total_time > 0 else 0
+
+    print(f"\n  xserv total: {total_ct} tokens in {total_time:.2f}s = {avg_tok_per_sec:.1f} tok/s")
+    return results
+
+
+def bench_hf():
+    """Benchmark HuggingFace transformers generate()."""
+    print("\n" + "=" * 60)
+    print("BENCHMARK: HuggingFace transformers (greedy, max_new_tokens={})".format(MAX_TOKENS))
+    print("=" * 60)
+
+    import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    print(f"Loading model on GPU 1...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_DIR, dtype=torch.bfloat16, device_map="cuda:1", trust_remote_code=True)
+    model.eval()
+    print("Model loaded.\n")
+
+    # Warmup
+    inputs = tokenizer("Hi", return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        model.generate(**inputs, max_new_tokens=8, do_sample=False)
+    print("Warmup done.\n")
+
+    results = []
+    for category, prompt in BENCH_PROMPTS:
+        # Apply chat template (same as xserv)
+        messages = [{"role": "user", "content": prompt}]
+        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = tokenizer(text, return_tensors="pt").to(model.device)
+        pt = inputs["input_ids"].shape[1]
+
+        torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            output = model.generate(
+                **inputs,
+                max_new_tokens=MAX_TOKENS,
+                do_sample=False,
+            )
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - t0
+
+        ct = output.shape[1] - pt
+        tok_per_sec = ct / elapsed if elapsed > 0 else 0
+
+        print(f"  [{category:>6}] pt={pt:3d} ct={ct:2d} | {elapsed:6.2f}s | {tok_per_sec:5.1f} tok/s | {prompt[:50]}...")
+        results.append({
+            "category": category,
+            "prompt_tokens": pt,
+            "completion_tokens": ct,
+            "elapsed": elapsed,
+            "tok_per_sec": tok_per_sec,
+        })
+
+    total_ct = sum(r["completion_tokens"] for r in results)
+    total_time = sum(r["elapsed"] for r in results)
+    avg_tok_per_sec = total_ct / total_time if total_time > 0 else 0
+
+    print(f"\n  HF total: {total_ct} tokens in {total_time:.2f}s = {avg_tok_per_sec:.1f} tok/s")
+
+    del model
+    torch.cuda.empty_cache()
+    return results
+
+
+def main():
+    xserv_results = bench_xserv()
+    hf_results = bench_hf()
+
+    print("\n" + "=" * 60)
+    print("COMPARISON SUMMARY")
+    print("=" * 60)
+
+    print(f"\n{'Category':<10} {'Metric':<20} {'xserv':>10} {'HF':>10} {'Ratio':>10}")
+    print("-" * 62)
+
+    for cat in ["short", "medium", "long"]:
+        xs = [r for r in xserv_results if r["category"] == cat]
+        hf = [r for r in hf_results if r["category"] == cat]
+        if xs and hf:
+            xs_avg_tps = sum(r["tok_per_sec"] for r in xs) / len(xs)
+            hf_avg_tps = sum(r["tok_per_sec"] for r in hf) / len(hf)
+            xs_avg_lat = sum(r["elapsed"] for r in xs) / len(xs)
+            hf_avg_lat = sum(r["elapsed"] for r in hf) / len(hf)
+            ratio_tps = xs_avg_tps / hf_avg_tps if hf_avg_tps > 0 else 0
+            ratio_lat = xs_avg_lat / hf_avg_lat if hf_avg_lat > 0 else 0
+
+            print(f"{cat:<10} {'Throughput (tok/s)':<20} {xs_avg_tps:>10.1f} {hf_avg_tps:>10.1f} {ratio_tps:>9.2f}x")
+            print(f"{'':<10} {'Latency (s)':<20} {xs_avg_lat:>10.2f} {hf_avg_lat:>10.2f} {ratio_lat:>9.2f}x")
+
+    xs_total_tps = sum(r["completion_tokens"] for r in xserv_results) / sum(r["elapsed"] for r in xserv_results)
+    hf_total_tps = sum(r["completion_tokens"] for r in hf_results) / sum(r["elapsed"] for r in hf_results)
+    ratio = xs_total_tps / hf_total_tps if hf_total_tps > 0 else 0
+
+    print("-" * 62)
+    print(f"{'OVERALL':<10} {'Throughput (tok/s)':<20} {xs_total_tps:>10.1f} {hf_total_tps:>10.1f} {ratio:>9.2f}x")
+    print(f"\nxserv is {ratio:.1%} of HF transformers throughput")
+
+
+# Run the benchmarks only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()