from __future__ import annotations

import argparse
import statistics
import sys
import time
from pathlib import Path

# Directory one level above the directory containing this script
# (presumably the project root -- TODO confirm against the repo layout).
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT on the import path, once, so that the `kernels`, `reference` and
# `tools` imports below resolve even when this file is run directly as a script.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

import torch

from kernels.triton.tiled_matmul import triton_tiled_matmul
from reference.torch_matmul import torch_matmul
from tools.lab_extension import build_extension


def benchmark(fn, *args, warmup: int = 5, reps: int = 20) -> float:
    """Return the median wall-clock time of ``fn(*args)`` in milliseconds.

    ``fn`` is first called ``warmup`` times without timing, then ``reps``
    times with each call timed individually via ``time.perf_counter``.

    If any positional argument reports a truthy ``is_cuda`` attribute,
    ``torch.cuda.synchronize()`` is called after warmup and immediately
    before and after every timed call, so that pending GPU work does not
    leak into (or out of) the measured interval. Arguments without an
    ``is_cuda`` attribute (plain Python values, or no arguments at all)
    are treated as non-CUDA.

    Args:
        fn: Callable to measure; invoked as ``fn(*args)``.
        *args: Positional arguments forwarded to ``fn`` on every call.
        warmup: Number of untimed calls made before measuring.
        reps: Number of timed calls.

    Returns:
        The median of the per-call timings, in milliseconds.

    Raises:
        statistics.StatisticsError: If ``reps`` is less than 1, since there
            are then no samples to take a median of.
    """
    # Decide once whether synchronization is needed. Checking every argument
    # (rather than only the first) also covers zero-argument callables and
    # calls whose first argument is not a tensor.
    uses_cuda = any(getattr(arg, "is_cuda", False) for arg in args)

    for _ in range(warmup):
        fn(*args)
    if uses_cuda:
        torch.cuda.synchronize()

    times_ms = []
    for _ in range(reps):
        if uses_cuda:
            # Drain outstanding work so it is not billed to this repetition.
            torch.cuda.synchronize()
        start = time.perf_counter()
        fn(*args)
        if uses_cuda:
            # Wait for the work launched by fn before stopping the clock.
            torch.cuda.synchronize()
        times_ms.append((time.perf_counter() - start) * 1e3)
    return statistics.median(times_ms)


def report(name: str, elapsed_ms: float, m: int, n: int, k: int) -> None:
    """Print one result line: elapsed time and matmul throughput.

    Throughput is derived from ``2 * m * n * k`` floating-point operations
    completed in ``elapsed_ms`` milliseconds and is shown in TFLOP/s.

    Args:
        name: Label printed at the start of the line.
        elapsed_ms: Measured time in milliseconds.
        m: First dimension used in the operation count.
        n: Second dimension used in the operation count.
        k: Third dimension used in the operation count.
    """
    flop_count = 2.0 * m * n * k
    elapsed_s = elapsed_ms * 1e-3
    tera_per_s = flop_count / elapsed_s / 1e12
    line = f"{name}: {elapsed_ms:.3f} ms | throughput {tera_per_s:.3f} TFLOP/s"
    print(line)


def main() -> None:
    """Parse command-line options and benchmark the selected matmul variants.

    Options:
        --device: Device for the random inputs; defaults to "cuda" when
            ``torch.cuda.is_available()`` is true, otherwise "cpu".
        --mode: Which implementation(s) to run: "all", "torch", "triton"
            or "cuda".
        --m, --n, --k: Problem dimensions; the inputs are an ``m x k`` and a
            ``k x n`` matrix.

    The torch reference runs on any device. The Triton and CUDA-extension
    variants run only when the device is a CUDA device; otherwise, or when
    they fail, a "skipped" line is printed instead of a result.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--mode", choices=["all", "torch", "triton", "cuda"], default="all")
    parser.add_argument("--m", type=int, default=1024)
    parser.add_argument("--n", type=int, default=1024)
    parser.add_argument("--k", type=int, default=1024)
    args = parser.parse_args()

    # Normalize the device string so that indexed specs such as "cuda:0" are
    # recognized as CUDA; comparing the raw string to "cuda" would silently
    # drop the GPU benchmarks for them.
    device = torch.device(args.device)
    on_cuda = device.type == "cuda"

    a = torch.randn(args.m, args.k, device=device)
    b = torch.randn(args.k, args.n, device=device)

    if args.mode in {"all", "torch"}:
        report("torch", benchmark(torch_matmul, a, b), args.m, args.n, args.k)

    if args.mode in {"all", "triton"}:
        if not on_cuda:
            # Say why nothing was measured instead of printing nothing.
            print("triton: skipped (requires a CUDA device)")
        else:
            try:
                report("triton", benchmark(triton_tiled_matmul, a, b), args.m, args.n, args.k)
            except (NotImplementedError, RuntimeError) as exc:
                print(f"triton: skipped ({exc})")

    if args.mode in {"all", "cuda"}:
        if not on_cuda:
            print("cuda: skipped (requires a CUDA device)")
        else:
            ext = build_extension(verbose=False)
            if ext is None or not hasattr(torch.ops, "kernel_lab"):
                print("cuda: skipped (extension unavailable)")
            else:
                try:
                    report(
                        "cuda",
                        benchmark(torch.ops.kernel_lab.tiled_matmul, a, b),
                        args.m,
                        args.n,
                        args.k,
                    )
                except Exception as exc:
                    # Best-effort: any failure in the custom op is reported
                    # as a skip rather than aborting the whole run.
                    print(f"cuda: skipped ({exc})")


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()