Replace the per-token CPU-routed MoE forward with an all-GPU path: 1. moe_topk_softmax: GPU top-k + softmax (was CPU sort + softmax) 2. moe_replicate: broadcast input to all local experts 3. cublasGemmStridedBatchedEx: batched expert matmul (was per-expert cuBLAS) 4. moe_weighted_sum: FP32-accumulated weighted sum on GPU (was GPU→CPU→F32→BF16→GPU) Expert weights stored as contiguous 3D tensors for strided batched GEMM. Zero CPU↔GPU transfers per MoE layer (was ~40 per token per layer). Also: configurable geglu_alpha, LayerNorm bias auto-detect, unused-weight diagnostic at load time. GSM8K 30-problem: 11/30 → 23/30 (76.7%) vs llama.cpp 30/30 (100%). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
37 lines
1.4 KiB
Rust
37 lines
1.4 KiB
Rust
use std::env;
|
|
|
|
fn main() {
|
|
let cuda_path = env::var("CUDA_HOME")
|
|
.or_else(|_| env::var("CUDA_PATH"))
|
|
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
|
|
|
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
|
|
println!("cargo:rustc-link-lib=dylib=cudart");
|
|
println!("cargo:rustc-link-lib=dylib=cublas");
|
|
|
|
cc::Build::new()
|
|
.cuda(true)
|
|
.cudart("shared")
|
|
.flag("-gencode=arch=compute_120,code=sm_120")
|
|
.include("../../csrc")
|
|
.file("../../csrc/gemm/naive.cu")
|
|
.file("../../csrc/gemm/tiled.cu")
|
|
.file("../../csrc/gemm/gemv.cu")
|
|
.file("../../csrc/normalization/rmsnorm.cu")
|
|
.file("../../csrc/normalization/layernorm.cu")
|
|
.file("../../csrc/activation/activations.cu")
|
|
.file("../../csrc/reduce/softmax.cu")
|
|
.file("../../csrc/reduce/argmax.cu")
|
|
.file("../../csrc/embedding/embedding.cu")
|
|
.file("../../csrc/embedding/rope.cu")
|
|
.file("../../csrc/attention/causal_mask.cu")
|
|
.file("../../csrc/embedding/transpose.cu")
|
|
.file("../../csrc/attention/flash_attention.cu")
|
|
.file("../../csrc/attention/paged_attention.cu")
|
|
.file("../../csrc/attention/reshape_and_cache.cu")
|
|
.file("../../csrc/moe/moe_kernels.cu")
|
|
.compile("xserv_kernels");
|
|
|
|
println!("cargo:rerun-if-changed=../../csrc/");
|
|
}
|