softmax: cap block size at 512 threads

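launch_softmax_{f32,bf16} clamped the block size to 1024 threads when cols was larger. Halving the ceiling to 512 keeps two blocks per SM resident on the large-vocab kernels that dominate speculative-verify workloads. The grid is still launched with one block per row, so row-to-block indexing is unchanged. The block size remains min(cols, ceiling) with a floor of 32 threads, so it exceeds cols only when cols < 32, exactly as before this change.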
2026-07-01 14:15:58 +08:00
parent f5ec10c2c3
commit a67753f516
1 changed files with 2 additions and 2 deletions
--- a/csrc/reduce/softmax.cu
+++ b/csrc/reduce/softmax.cu
@@ -90,7 +90,7 @@ __global__ void softmax_bf16(
 extern "C" {

 void launch_softmax_f32(const void* x, void* out, int rows, int cols, void* stream) {
-    int block = (cols < 1024) ? cols : 1024;
+    int block = (cols < 512) ? cols : 512;
    if (block < 32) block = 32;
    softmax_f32<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const float*)x, (float*)out, cols);
@@ -98,7 +98,7 @@ void launch_softmax_f32(const void* x, void* out, int rows, int cols, void* stre
 }

 void launch_softmax_bf16(const void* x, void* out, int rows, int cols, void* stream) {
-    int block = (cols < 1024) ? cols : 1024;
+    int block = (cols < 512) ? cols : 512;
    if (block < 32) block = 32;
    softmax_bf16<<<rows, block, 0, (cudaStream_t)stream>>>(
        (const __nv_bfloat16*)x, (__nv_bfloat16*)out, cols);