#pragma once #include namespace kernel_lab { torch::Tensor vector_add_cuda(torch::Tensor x, torch::Tensor y); torch::Tensor row_softmax_cuda(torch::Tensor x); torch::Tensor tiled_matmul_cuda(torch::Tensor a, torch::Tensor b); torch::Tensor online_softmax_cuda(torch::Tensor x); torch::Tensor flash_attention_fwd_cuda( torch::Tensor q, torch::Tensor k, torch::Tensor v, bool causal); } // namespace kernel_lab