"""Microbenchmark: Triton flash-attention forward kernel vs. a PyTorch reference."""

from __future__ import annotations

import statistics
import sys
import time
from pathlib import Path

# Make the repository root importable when this script is run directly.
ROOT = Path(__file__).resolve().parents[2]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

import torch

from kernels.triton.flash_attention_fwd import triton_flash_attention_fwd
from reference.torch_attention import torch_attention


def benchmark(fn, *args, warmup: int = 5, reps: int = 20, **kwargs) -> float:
    """Time fn(*args, **kwargs) and return the median latency in milliseconds.

    The first positional argument is assumed to be a tensor; its device decides
    whether CUDA synchronization is needed around the timed region.
    """
    # Warmup runs amortize one-time costs (kernel compilation, autotuning, caches).
    for _ in range(warmup):
        fn(*args, **kwargs)
    if args[0].is_cuda:
        torch.cuda.synchronize()

    times_ms = []
    for _ in range(reps):
        # Synchronize before and after the call so the timer captures the full
        # GPU execution, not just the asynchronous kernel launch.
        if args[0].is_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()
        fn(*args, **kwargs)
        if args[0].is_cuda:
            torch.cuda.synchronize()
        times_ms.append((time.perf_counter() - start) * 1e3)

    # The median is robust to stray outliers (clock interrupts, thermal jitter).
    return statistics.median(times_ms)


def main() -> None:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Shapes are (batch, heads, seq_len, head_dim) = (2, 8, 128, 64).
    q = torch.randn(2, 8, 128, 64, device=device)
    k = torch.randn(2, 8, 128, 64, device=device)
    v = torch.randn(2, 8, 128, 64, device=device)

    ref_ms = benchmark(torch_attention, q, k, v, causal=False)
    print(f"torch_attention: {ref_ms:.3f} ms")

    # The Triton kernel requires a CUDA device; skip gracefully elsewhere.
    if device == "cuda":
        try:
            triton_ms = benchmark(triton_flash_attention_fwd, q, k, v, causal=False)
            print(f"triton_flash_attention_fwd: {triton_ms:.3f} ms")
        except (NotImplementedError, RuntimeError) as exc:
            print(f"triton_flash_attention_fwd: skipped ({exc})")


if __name__ == "__main__":
    main()
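
# --- Correctness sketch (hypothetical helper, not invoked above) ---------------
# Before trusting benchmark numbers it is usually worth confirming that the Triton
# kernel agrees with the reference numerically. This is a minimal sketch assuming
# triton_flash_attention_fwd returns a tensor with the same shape and dtype as
# torch_attention; loosen the tolerances if running in fp16/bf16.

def check_correctness(atol: float = 1e-3, rtol: float = 1e-3) -> None:
    q = torch.randn(2, 8, 128, 64, device="cuda")
    k = torch.randn(2, 8, 128, 64, device="cuda")
    v = torch.randn(2, 8, 128, 64, device="cuda")
    ref = torch_attention(q, k, v, causal=False)
    out = triton_flash_attention_fwd(q, k, v, causal=False)
    # Raises with a detailed mismatch report if the outputs diverge.
    torch.testing.assert_close(out, ref, atol=atol, rtol=rtol)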