New csrc/ops/elementwise.cu (out[i]=in[i]*alpha), compiled by xtrain-cuda/build.rs and exposed via launch_scale_f32 FFI, gated behind not(no_cuda) like the existing vecadd smoke test. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
18 lines
499 B
Plaintext
18 lines
499 B
Plaintext
extern "C" {
|
|
|
|
// Element-wise scale kernel: stores in[i] * alpha into out[i] for every
// i in [0, n).
//
// Each thread handles at most one element, chosen by its flat 1D global
// index (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is
// >= n do nothing. Only the x dimension of the launch is consulted.
//
// In-place safe: out may alias in, because a thread reads and writes only
// the element at its own index.
__global__ void scale_f32(const float* in, float* out, float alpha, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads past the end of the data exit without touching memory.
    if (i >= n) {
        return;
    }

    const float scaled = in[i] * alpha;
    out[i] = scaled;
}
|
|
|
|
// Host-side launcher for scale_f32: enqueues out[i] = in[i] * alpha for
// i in [0, n) on the given stream, using 256-thread blocks and enough
// blocks to cover n elements.
//
//   in, out : forwarded unchanged to the kernel, which dereferences them on
//             the device
//   alpha   : scale factor
//   n       : element count; n <= 0 is treated as a no-op
//   stream  : taken as void* and cast to cudaStream_t for the launch
//
// The launch status is not checked here and nothing is synchronized; callers
// that need to observe errors should query cudaGetLastError() and/or
// synchronize the stream afterwards.
void launch_scale_f32(const float* in, float* out, float alpha, int n, void* stream) {
    // A grid of zero (or negative) blocks is an invalid launch configuration
    // and would leave a pending CUDA error behind, so an empty request simply
    // skips the launch.
    if (n <= 0) {
        return;
    }

    const int block = 256;
    // Ceil-div written as (n - 1) / block + 1 rather than
    // (n + block - 1) / block: the latter overflows int when n is within
    // block - 1 of INT_MAX.
    const int grid = (n - 1) / block + 1;

    scale_f32<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, alpha, n);
}
|
|
|
|
}
|