- Batched GEMM via cublasGemmStridedBatchedEx - Causal mask CUDA kernel (F32 + BF16) - Element-wise scale CUDA kernel (F32 + BF16) - attention() composing: batched_matmul + scale + causal_mask + softmax - Fixed to_device/contiguous infinite recursion (GPU contiguous via CPU round-trip) - 5 attention tests passing (max_err < 3e-7 F32) - Total: 61 tests passing across all crates Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
30 lines
1.0 KiB
Rust
30 lines
1.0 KiB
Rust
use std::env;
|
|
|
|
fn main() {
|
|
let cuda_path = env::var("CUDA_HOME")
|
|
.or_else(|_| env::var("CUDA_PATH"))
|
|
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
|
|
|
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
|
|
println!("cargo:rustc-link-lib=dylib=cudart");
|
|
println!("cargo:rustc-link-lib=dylib=cublas");
|
|
|
|
cc::Build::new()
|
|
.cuda(true)
|
|
.cudart("shared")
|
|
.flag("-gencode=arch=compute_120,code=sm_120")
|
|
.include("../../csrc")
|
|
.file("../../csrc/gemm/naive.cu")
|
|
.file("../../csrc/gemm/tiled.cu")
|
|
.file("../../csrc/normalization/rmsnorm.cu")
|
|
.file("../../csrc/normalization/layernorm.cu")
|
|
.file("../../csrc/activation/activations.cu")
|
|
.file("../../csrc/reduce/softmax.cu")
|
|
.file("../../csrc/embedding/embedding.cu")
|
|
.file("../../csrc/embedding/rope.cu")
|
|
.file("../../csrc/attention/causal_mask.cu")
|
|
.compile("xserv_kernels");
|
|
|
|
println!("cargo:rerun-if-changed=../../csrc/");
|
|
}
|