pub mod activation;
pub mod argmax;
pub mod attention;
pub mod dispatch;
pub mod embedding;
pub mod gemm;
pub mod layernorm;
pub mod moe;
pub mod quantization;
pub mod rmsnorm;
pub mod rope;
pub mod softmax;
pub mod transpose;

pub use activation::{add, bias_add_2d, gelu, gpt_oss_glu, mul, scale, silu, silu_mul};
pub use argmax::{argmax_bf16_single, argmax_bf16_to_host};
pub use attention::{
    attention, copy_kv_position, decode_attention, flash_attention, flash_attention_sinks,
    paged_decode_attention, paged_decode_attention_sinks, paged_decode_attention_tree,
    reshape_and_cache_batched_bf16, reshape_and_cache_bf16,
};
pub use embedding::{embedding, embedding_device_ids};
pub use gemm::{GemmBackend, batched_matmul, matmul, matmul_batched_gemv};
pub use layernorm::layernorm;
pub use rmsnorm::{add_rmsnorm, rmsnorm};
pub use rope::{RopeCache, rope_inplace, rope_inplace_device_pos};
pub use softmax::softmax;
pub use transpose::{
    merge_heads_gpu, repeat_kv_gpu, reshape_heads_gpu, strided_to_contiguous_gpu,
    transpose_for_rope_gpu, transpose_from_rope_gpu,
};

/// Hooks this crate's GPU kernels into the tensor crate.
///
/// At present this passes `strided_to_contiguous_gpu` to
/// `xserv_tensor::register_gpu_contiguous`. Intended to be called once at
/// startup.
pub fn init() {
    // Name the kernel being registered so the call site reads as
    // "register <this kernel>" rather than a bare path argument.
    let contiguous_kernel = strided_to_contiguous_gpu;
    xserv_tensor::register_gpu_contiguous(contiguous_kernel);
}