Stand up the xtrain project skeleton: a Cargo workspace mirroring xserv's csrc/ + crates/ layout, with a single xtrain-cuda crate that wraps the CUDA Runtime over hand-written extern "C" FFI. build.rs compiles csrc/test/vecadd.cu via the cc crate targeting sm_120 (RTX 5090) and links cudart. A gated integration test runs the vector-add kernel on the GPU and asserts the result. When nvcc is absent (local GPU-less machine), build.rs skips CUDA compilation and sets a `no_cuda` cfg so host-side cargo check still works. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
17 lines
431 B
Plaintext
17 lines
431 B
Plaintext
extern "C" {
|
|
|
|
/*
 * Element-wise float addition: c[i] = a[i] + b[i] for every i in [0, n).
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is >= n
 * do nothing, so a launch may cover more threads than there are elements.
 * Only the x dimension of the grid and block is consulted.
 *
 *   a, b : inputs, read at indices [0, n)
 *   c    : output, written at indices [0, n)
 *   n    : number of elements to process
 */
__global__ void vecadd_f32(const float* a, const float* b, float* c, int n) {
    // Form the index in 64 bits. The product blockIdx.x * blockDim.x is
    // 32-bit unsigned; narrowing it to int turns any value above INT_MAX into
    // a negative index that would slip past the bounds check below.
    const long long idx =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // Tail guard: the grid is rounded up to whole blocks, so the last block
    // may contain threads with no element to process.
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
|
|
|
|
/*
 * Host-side launcher for the vecadd_f32 kernel.
 *
 * Enqueues the kernel on `stream` using 256-thread blocks and enough blocks
 * to cover n elements (n / 256 rounded up). The call does not synchronize and
 * does not query the launch status itself; both are left to the caller.
 *
 *   a, b   : input pointers forwarded unchanged to the kernel
 *   c      : output pointer forwarded unchanged to the kernel
 *   n      : element count; values <= 0 make this call a no-op
 *   stream : opaque stream handle, cast to cudaStream_t for the launch
 */
void launch_vecadd_f32(const float* a, const float* b, float* c, int n, void* stream) {
    // Nothing to add. Launching anyway would request a zero-block (or, for
    // negative n, nonsensical) grid, which is an invalid launch configuration
    // and would leave a CUDA error pending for the caller's next status query.
    if (n <= 0) {
        return;
    }

    const int block = 256;

    // Round up in 64 bits: n + block - 1 overflows a signed int once n is
    // within block - 1 of INT_MAX. The quotient always fits back in an int.
    const int grid =
        static_cast<int>((static_cast<long long>(n) + block - 1) / block);

    vecadd_f32<<<grid, block, 0, (cudaStream_t)stream>>>(a, b, c, n);
}
|
|
|
|
}
|