New xserv-distributed crate: hand-written NCCL FFI, TpContext (one rank per thread, bound to one GPU), and in-place BF16 AllReduce on the null stream so it orders naturally with the model's kernels. 2-GPU AllReduce test included. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
14 lines
472 B
Rust
14 lines
472 B
Rust
use std::env;
|
|
|
|
/// Picks the CUDA toolkit root used to build the linker search path.
///
/// Candidates are tried in order: `cuda_home` (the value of `CUDA_HOME`),
/// then `cuda_path` (the value of `CUDA_PATH`). A candidate that is `None`
/// or an empty string is skipped, so an exported-but-empty variable does not
/// produce a bogus `/lib64` search path. When no candidate is usable the
/// function returns `/usr/local/cuda`.
fn cuda_root(cuda_home: Option<String>, cuda_path: Option<String>) -> String {
    cuda_home
        .into_iter()
        .chain(cuda_path)
        .find(|candidate| !candidate.is_empty())
        .unwrap_or_else(|| "/usr/local/cuda".to_string())
}

/// Build-script entry point: emits the Cargo directives that add the library
/// search paths and request dynamic linking against `nccl` and `cudart`.
fn main() {
    // Cargo does not track environment variables unless told to; without
    // these directives a changed CUDA_HOME/CUDA_PATH would not re-run this
    // script and the previously emitted search path would be reused.
    println!("cargo:rerun-if-env-changed=CUDA_HOME");
    println!("cargo:rerun-if-env-changed=CUDA_PATH");

    // `.ok()` folds both "unset" and "not valid Unicode" into `None`, which
    // matches the original fall-through behaviour of `or_else`.
    let cuda_path = cuda_root(env::var("CUDA_HOME").ok(), env::var("CUDA_PATH").ok());

    println!("cargo:rustc-link-search=native={cuda_path}/lib64");
    // NCCL is typically installed as a system library.
    println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu");
    println!("cargo:rustc-link-lib=dylib=nccl");
    println!("cargo:rustc-link-lib=dylib=cudart");
}
|