tensor: add scale elementwise CUDA kernel + FFI

New csrc/ops/elementwise.cu (out[i]=in[i]*alpha), compiled by
xtrain-cuda/build.rs and exposed via launch_scale_f32 FFI, gated behind
not(no_cuda) like the existing vecadd smoke test.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 15:13:06 +08:00
parent 8557a289a2
commit 63dc05fd10
3 changed files with 30 additions and 2 deletions

17
csrc/ops/elementwise.cu Normal file
View File

@@ -0,0 +1,17 @@
extern "C" {

// Elementwise scale: out[i] = in[i] * alpha for every i in [0, n).
//
// Each element is read and written by the same thread at the same index, so
// `out` may be the very same pointer as `in` (in-place use).
//
// The loop strides by the total number of launched threads, so the kernel
// covers all n elements for any 1-D launch configuration, including one with
// fewer threads than elements. Indices are kept in 64 bits so that
// blockIdx.x * blockDim.x and the stride increment cannot wrap.
//
// No shared memory is used. Only the x dimension of the grid/block is read.
__global__ void scale_f32(const float* in, float* out, float alpha, int n) {
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first  = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (long long i = first; i < n; i += stride) {
        out[i] = in[i] * alpha;
    }
}

// Host-side entry point (C linkage) that enqueues scale_f32 on `stream`.
//
//   in, out : pointers dereferenced by the kernel on the device
//   alpha   : scale factor
//   n       : element count; n <= 0 is a no-op
//   stream  : CUDA stream handle passed as an opaque pointer and cast to
//             cudaStream_t (a null pointer selects the default stream)
//
// The launch is asynchronous and this function does not synchronize. It also
// does not consume the runtime's last-error state: the return type is void,
// so a launch failure stays visible to a caller that queries
// cudaGetLastError() afterwards.
void launch_scale_f32(const float* in, float* out, float alpha, int n, void* stream) {
    // A zero-block grid is an invalid launch configuration; an empty (or
    // negative) element count simply has nothing to do.
    if (n <= 0) {
        return;
    }

    const int block = 256;
    // Ceil-div written so it cannot overflow int when n is close to INT_MAX
    // (the usual `n + block - 1` form would).
    const int grid = (n - 1) / block + 1;

    scale_f32<<<grid, block, 0, static_cast<cudaStream_t>(stream)>>>(in, out, alpha, n);
}

}