#!/usr/bin/env python3
"""
Benchmark xserv vs HuggingFace transformers on Qwen3-8B.
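Measures, per prompt: end-to-end request latency and overall generation
throughput (completion tokens / wall-clock seconds). Prefill and decode are
not timed separately.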

Usage:
    # xserv server should be running on port 9090
    python3 tools/bench_vs_hf.py
"""

import json
import os
import time
import urllib.request

# Local checkpoint directory loaded by the HuggingFace baseline.
# Defaults to the original hard-coded path; set BENCH_MODEL_DIR to point the
# benchmark at a checkpoint elsewhere without editing this file.
MODEL_DIR = os.environ.get("BENCH_MODEL_DIR") or "/opt/wjh/models/qwen3-8b"

# Base URL of the running xserv server (no trailing slash, because callers
# append "/v1/chat/completions"). Set XSERV_URL to override host or port.
XSERV_URL = (os.environ.get("XSERV_URL") or "http://localhost:9090").rstrip("/")

# (category, prompt) pairs. The category label is what the comparison summary
# groups results by, so it must be one of "short", "medium" or "long".
BENCH_PROMPTS = [
    # Short prompts (~10 tokens)
    ("short", "What is gravity?"),
    ("short", "Hello, how are you?"),
    ("short", "Explain DNA briefly."),
    # Medium prompts (~30 tokens)
    ("medium", "Write a detailed explanation of how photosynthesis works in plants, including the light and dark reactions."),
    ("medium", "Describe the process of machine learning training, including forward pass, loss computation, and backpropagation."),
    ("medium", "Explain the differences between TCP and UDP protocols, including when you would use each one in practice."),
    # Longer prompts (~60 tokens)
    ("long", "You are an expert computer scientist. Please write a comprehensive explanation of how modern GPUs work, including the architecture of streaming multiprocessors, the memory hierarchy from registers to global memory, and how thousands of threads are scheduled concurrently. Include specific technical details."),
    ("long", "You are a historian specializing in ancient civilizations. Please provide a detailed analysis of the rise and fall of the Roman Empire, covering the key factors that led to its expansion, the political and social structures that sustained it, and the multiple causes that contributed to its eventual decline and collapse."),
]

# Upper bound on generated tokens per request, applied to both backends so the
# two runs are comparable.
MAX_TOKENS = 64


def _build_chat_request(prompt, max_tokens):
    """Build a greedy chat-completions POST request for the xserv server."""
    body = json.dumps({
        "model": "qwen3-8b",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }).encode()
    return urllib.request.Request(
        f"{XSERV_URL}/v1/chat/completions",
        data=body, headers={"Content-Type": "application/json"},
    )


def _fetch_body(req, timeout):
    """Send *req*, return the complete response body, and close the connection."""
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read()


def bench_xserv():
    """Benchmark the xserv HTTP chat-completions API.

    Sends one small warmup request, then one request per entry of
    BENCH_PROMPTS with temperature 0.0 and max_tokens=MAX_TOKENS. A line is
    printed per prompt, followed by an aggregate total.

    Returns:
        A list with one dict per prompt, holding the keys "category",
        "prompt_tokens", "completion_tokens", "elapsed" (seconds) and
        "tok_per_sec". Token counts come from the response's "usage" object
        and fall back to 0 when the server omits them.

    Raises:
        urllib.error.URLError: propagated from urlopen when the server is
            unreachable or returns an HTTP error status.
    """
    print("\n" + "=" * 60)
    print("BENCHMARK: xserv (HTTP API, greedy, max_tokens={})".format(MAX_TOKENS))
    print("=" * 60)

    # Warmup: the response is drained and closed so no connection is left
    # dangling while the measured requests run.
    _fetch_body(_build_chat_request("Hi", 8), timeout=120)
    print("Warmup done.\n")

    results = []
    for category, prompt in BENCH_PROMPTS:
        # Build the request outside the timed region so only the round trip
        # is measured.
        req = _build_chat_request(prompt, MAX_TOKENS)

        # urlopen returns once the response headers have arrived, so the body
        # read must be inside the timed region for a true end-to-end latency.
        t0 = time.perf_counter()
        raw = _fetch_body(req, timeout=300)
        elapsed = time.perf_counter() - t0
        data = json.loads(raw)

        usage = data.get("usage", {})
        pt = usage.get("prompt_tokens", 0)
        ct = usage.get("completion_tokens", 0)
        tok_per_sec = ct / elapsed if elapsed > 0 else 0

        print(f"  [{category:>6}] pt={pt:3d} ct={ct:2d} | {elapsed:6.2f}s | {tok_per_sec:5.1f} tok/s | {prompt[:50]}...")
        results.append({
            "category": category,
            "prompt_tokens": pt,
            "completion_tokens": ct,
            "elapsed": elapsed,
            "tok_per_sec": tok_per_sec,
        })

    # Summary: aggregate throughput is total tokens over total time, not the
    # mean of the per-prompt rates.
    total_ct = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)
    avg_tok_per_sec = total_ct / total_time if total_time > 0 else 0

    print(f"\n  xserv total: {total_ct} tokens in {total_time:.2f}s = {avg_tok_per_sec:.1f} tok/s")
    return results


def bench_hf():
    """Benchmark HuggingFace transformers generate() as the baseline.

    Loads the checkpoint at MODEL_DIR in bfloat16 onto cuda:1, runs one short
    warmup generation, then greedily generates up to MAX_TOKENS new tokens for
    each entry of BENCH_PROMPTS (wrapped in the tokenizer's chat template). A
    line is printed per prompt, followed by an aggregate total.

    torch and transformers are imported here rather than at module level, so
    the module itself can be imported without them installed.

    Returns:
        A list with one dict per prompt, holding the keys "category",
        "prompt_tokens", "completion_tokens", "elapsed" (seconds) and
        "tok_per_sec".
    """
    print("\n" + "=" * 60)
    print("BENCHMARK: HuggingFace transformers (greedy, max_new_tokens={})".format(MAX_TOKENS))
    print("=" * 60)

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print("Loading model on GPU 1...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_DIR, dtype=torch.bfloat16, device_map="cuda:1", trust_remote_code=True)
    model.eval()
    print("Model loaded.\n")

    # Device the model actually lives on. It is passed to every synchronize()
    # call below, because a bare torch.cuda.synchronize() waits on the
    # *current* device (cuda:0 unless changed), not the one running the model.
    device = model.device

    # Warmup
    inputs = tokenizer("Hi", return_tensors="pt").to(device)
    with torch.no_grad():
        model.generate(**inputs, max_new_tokens=8, do_sample=False)
    print("Warmup done.\n")

    results = []
    for category, prompt in BENCH_PROMPTS:
        # Apply chat template (same as xserv)
        messages = [{"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(text, return_tensors="pt").to(device)
        pt = inputs["input_ids"].shape[1]

        # Fence the timed region on the model's GPU so pending work is neither
        # billed to this prompt nor left out of its measurement.
        torch.cuda.synchronize(device)
        t0 = time.perf_counter()
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=MAX_TOKENS,
                do_sample=False,
            )
        torch.cuda.synchronize(device)
        elapsed = time.perf_counter() - t0

        # generate() returns prompt + continuation, so subtract the prompt length.
        ct = output.shape[1] - pt
        tok_per_sec = ct / elapsed if elapsed > 0 else 0

        print(f"  [{category:>6}] pt={pt:3d} ct={ct:2d} | {elapsed:6.2f}s | {tok_per_sec:5.1f} tok/s | {prompt[:50]}...")
        results.append({
            "category": category,
            "prompt_tokens": pt,
            "completion_tokens": ct,
            "elapsed": elapsed,
            "tok_per_sec": tok_per_sec,
        })

    total_ct = sum(r["completion_tokens"] for r in results)
    total_time = sum(r["elapsed"] for r in results)
    avg_tok_per_sec = total_ct / total_time if total_time > 0 else 0

    print(f"\n  HF total: {total_ct} tokens in {total_time:.2f}s = {avg_tok_per_sec:.1f} tok/s")

    # Drop the model reference and release cached GPU memory before returning.
    del model
    torch.cuda.empty_cache()
    return results


def _safe_div(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _mean_of(rows, key):
    """Return the arithmetic mean of row[key] over *rows*, or 0 for no rows."""
    return _safe_div(sum(r[key] for r in rows), len(rows))


def main():
    """Run both benchmarks and print a side-by-side comparison table.

    xserv is benchmarked first, then HuggingFace transformers. For each prompt
    category present in both result sets, the table shows mean throughput and
    mean latency. An OVERALL row gives total completion tokens over total
    elapsed time. Every ratio is xserv divided by HF, and any ratio or overall
    figure whose denominator is zero is reported as 0 instead of raising.
    """
    xserv_results = bench_xserv()
    hf_results = bench_hf()

    print("\n" + "=" * 60)
    print("COMPARISON SUMMARY")
    print("=" * 60)

    print(f"\n{'Category':<10} {'Metric':<20} {'xserv':>10} {'HF':>10} {'Ratio':>10}")
    print("-" * 62)

    for cat in ["short", "medium", "long"]:
        xs = [r for r in xserv_results if r["category"] == cat]
        hf = [r for r in hf_results if r["category"] == cat]
        if not (xs and hf):
            # Nothing to compare unless both backends produced rows for this category.
            continue

        xs_avg_tps = _mean_of(xs, "tok_per_sec")
        hf_avg_tps = _mean_of(hf, "tok_per_sec")
        xs_avg_lat = _mean_of(xs, "elapsed")
        hf_avg_lat = _mean_of(hf, "elapsed")
        ratio_tps = _safe_div(xs_avg_tps, hf_avg_tps)
        ratio_lat = _safe_div(xs_avg_lat, hf_avg_lat)

        print(f"{cat:<10} {'Throughput (tok/s)':<20} {xs_avg_tps:>10.1f} {hf_avg_tps:>10.1f} {ratio_tps:>9.2f}x")
        print(f"{'':<10} {'Latency (s)':<20} {xs_avg_lat:>10.2f} {hf_avg_lat:>10.2f} {ratio_lat:>9.2f}x")

    # Overall throughput is total tokens over total time. The guard matches
    # the one used in each benchmark's own summary, so an empty result list or
    # zero elapsed time no longer raises ZeroDivisionError.
    xs_total_tps = _safe_div(
        sum(r["completion_tokens"] for r in xserv_results),
        sum(r["elapsed"] for r in xserv_results),
    )
    hf_total_tps = _safe_div(
        sum(r["completion_tokens"] for r in hf_results),
        sum(r["elapsed"] for r in hf_results),
    )
    ratio = _safe_div(xs_total_tps, hf_total_tps)

    print("-" * 62)
    print(f"{'OVERALL':<10} {'Throughput (tok/s)':<20} {xs_total_tps:>10.1f} {hf_total_tps:>10.1f} {ratio:>9.2f}x")
    print(f"\nxserv is {ratio:.1%} of HF transformers throughput")


# Run the benchmarks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()