pub mod activation; pub mod argmax; pub mod attention; pub mod dispatch; pub mod embedding; pub mod gemm; pub mod layernorm; pub mod moe; pub mod quantization; pub mod rmsnorm; pub mod rope; pub mod softmax; pub mod transpose; pub use activation::{add, bias_add_2d, gelu, gpt_oss_glu, mul, scale, silu, silu_mul}; pub use argmax::{argmax_bf16_single, argmax_bf16_to_host}; pub use attention::{ attention, copy_kv_position, decode_attention, flash_attention, flash_attention_sinks, paged_decode_attention, paged_decode_attention_sinks, paged_decode_attention_tree, reshape_and_cache_batched_bf16, reshape_and_cache_bf16, }; pub use embedding::{embedding, embedding_device_ids}; pub use gemm::{GemmBackend, batched_matmul, matmul, matmul_batched_gemv}; pub use layernorm::layernorm; pub use rmsnorm::{add_rmsnorm, rmsnorm}; pub use rope::{RopeCache, rope_inplace, rope_inplace_device_pos}; pub use softmax::softmax; pub use transpose::{ merge_heads_gpu, repeat_kv_gpu, reshape_heads_gpu, strided_to_contiguous_gpu, transpose_for_rope_gpu, transpose_from_rope_gpu, }; /// Register GPU kernels with the tensor crate. Call once at startup. pub fn init() { xserv_tensor::register_gpu_contiguous(strided_to_contiguous_gpu); }