extern "C" {

// Element-wise scale: out[i] = in[i] * alpha for every i in [0, n).
//
// In-place safe: `out` may be the same pointer as `in`, because each element
// is read and then written by the same thread. For that reason the pointers
// are intentionally NOT marked __restrict__.
//
// Launch layout: 1D grid of 1D blocks. The loop strides by the total number
// of launched threads (gridDim.x * blockDim.x), so any grid size covers all
// n elements. Indices are 64-bit so blockIdx.x * blockDim.x cannot wrap.
// Uses no shared memory. A non-positive n performs no work.
__global__ void scale_f32(const float* in, float* out, float alpha, int n) {
    const long long stride =
        static_cast<long long>(gridDim.x) * static_cast<long long>(blockDim.x);
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        out[i] = in[i] * alpha;
    }
}

// Host-side launcher for scale_f32.
//
//   in, out : pointers forwarded unchanged to the kernel (they must be
//             dereferenceable from device code); out may equal in.
//   alpha   : scale factor.
//   n       : element count; n <= 0 is a no-op.
//   stream  : opaque cudaStream_t handle passed as void* so the C interface
//             does not need CUDA headers; a null pointer selects the default
//             stream.
//
// The launch is asynchronous with respect to the host. This function returns
// void and does not consume the CUDA error state, so callers that need to
// detect launch failures should call cudaGetLastError() after it returns.
void launch_scale_f32(const float* in, float* out, float alpha, int n, void* stream) {
    // A zero-sized grid is an invalid launch configuration, so bail out early.
    if (n <= 0) {
        return;
    }

    const int block = 256;
    // Ceil-division written so that it cannot overflow int for n near INT_MAX
    // (the usual (n + block - 1) / block form can).
    const int grid = (n - 1) / block + 1;

    scale_f32<<<grid, block, 0, static_cast<cudaStream_t>(stream)>>>(in, out, alpha, n);
}

}  // extern "C"