New crate xtrain-distributed (mirrors xserv-distributed): hand-written NCCL
FFI (GetUniqueId / CommInitRank / AllReduce / CommDestroy / Group{Start,End},
ncclUniqueId passed by value per the NCCL ABI) and a safe DdpContext wrapper —
rank 0 mints the UniqueId, every rank inits its communicator under a group, and
all_reduce_average_grads runs an in-place AllReduce(sum) on each param's .grad()
device buffer, then scales it by 1/world (reuses T7's scale_inplace kernel). AllReduce runs
on the null stream so it orders with the model's kernels (no extra barrier).
build.rs follows the per-crate convention: no nvcc -> no_cuda cfg (crate
compiles to empty, cargo check passes host-side); with nvcc, links -lnccl
-lcudart like xserv-distributed's build.rs.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
34 lines
1.2 KiB
Rust
34 lines
1.2 KiB
Rust
use std::env;
|
|
use std::path::Path;
|
|
use std::process::Command;
|
|
|
|
// Mirror the per-crate convention (see xtrain-cuda/build.rs): with no nvcc/GPU
|
|
// locally, emit `no_cuda` so the NCCL FFI + DDP code compiles (but is not linked
|
|
// or run). On dash5, link NCCL exactly like xserv-distributed's build.rs.
|
|
fn main() {
|
|
println!("cargo:rustc-check-cfg=cfg(no_cuda)");
|
|
|
|
let cuda_path = env::var("CUDA_HOME")
|
|
.or_else(|_| env::var("CUDA_PATH"))
|
|
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
|
|
|
if !nvcc_available(&cuda_path) {
|
|
println!("cargo:warning=nvcc not found — skipping NCCL link (host-only build).");
|
|
println!("cargo:rustc-cfg=no_cuda");
|
|
return;
|
|
}
|
|
|
|
println!("cargo:rustc-link-search=native={cuda_path}/lib64");
|
|
// NCCL is installed as a system library on dash5.
|
|
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu");
|
|
println!("cargo:rustc-link-lib=dylib=nccl");
|
|
println!("cargo:rustc-link-lib=dylib=cudart");
|
|
}
|
|
|
|
/// Reports whether an `nvcc` compiler can be located.
///
/// Two probes are tried in order:
/// 1. spawn `nvcc --version` through the normal executable lookup — only a
///    successful spawn is required, the exit status is not inspected;
/// 2. check whether `{cuda_path}/bin/nvcc` exists on disk.
///
/// The second probe only runs when the first one fails.
fn nvcc_available(cuda_path: &str) -> bool {
    let spawned_from_path = Command::new("nvcc").arg("--version").output().is_ok();
    spawned_from_path || Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
}
|