tensor: add scale elementwise CUDA kernel + FFI
Adds csrc/ops/elementwise.cu, which implements the elementwise scale out[i] = in[i] * alpha. The file is compiled by xtrain-cuda/build.rs and exposed through the launch_scale_f32 FFI entry point, gated behind not(no_cuda) like the existing vecadd smoke test.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
17
csrc/ops/elementwise.cu
Normal file
17
csrc/ops/elementwise.cu
Normal file
@@ -0,0 +1,17 @@
|
||||
extern "C" {
|
||||
|
||||
// Elementwise scale: out[i] = in[i] * alpha for every i in [0, n).
//
// in    - source buffer, read from device code; must hold at least n floats.
// out   - destination buffer, written from device code; at least n floats.
//         out may be the same pointer as in: each index is read and then
//         written by the single thread that owns it, so exact aliasing is safe.
// alpha - scalar multiplier applied to every element.
// n     - element count; n <= 0 makes the kernel a no-op.
//
// Launch layout: 1D grid of 1D blocks, no shared memory. The loop below
// strides by the total number of launched threads, so any grid/block size
// produces the full result; the grid does not have to cover n.
__global__ void scale_f32(const float* in, float* out, float alpha, int n) {
    // Index math is done in 64 bits so blockIdx.x * blockDim.x (and the
    // repeated += stride) cannot wrap before being compared against n.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        out[i] = in[i] * alpha;
    }
}
|
||||
|
||||
// Host-side entry point that enqueues scale_f32 (out[i] = in[i] * alpha).
//
// in, out - buffers handed straight to the kernel, which dereferences them
//           in device code; out may be the same pointer as in.
// alpha   - scalar multiplier.
// n       - element count; n <= 0 returns without launching anything.
// stream  - opaque handle that is cast to cudaStream_t and used as the
//           launch stream.
//
// The launch is only enqueued: this function neither synchronizes nor
// inspects the launch status, so callers that need error reporting should
// query the CUDA runtime (e.g. cudaGetLastError) after calling it.
void launch_scale_f32(const float* in, float* out, float alpha, int n, void* stream) {
    // A grid of zero (or negative) blocks is an invalid launch configuration
    // and would leave a CUDA error behind for what should be a no-op.
    if (n <= 0) {
        return;
    }

    const int block = 256;
    // Ceil-divide without forming n + block - 1, which overflows int when n
    // is within block - 1 of INT_MAX.
    const int grid = n / block + ((n % block != 0) ? 1 : 0);

    scale_f32<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, alpha, n);
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user