perf: streams / drop per-op sync

Default-stream kernels run in order and every host read goes through a stream-ordered cudaMemcpy (to_device), so the per-op cudaDeviceSynchronize after each kernel was pure overhead — remove all 21 in tensor.rs. Host data is still correctly ordered by the D2H memcpy that reads it. Also zero op-output buffers with cudaMemset (device-side, async) instead of a blocking H2D memcpy of a host zero buffer on every allocation — that copy was itself a hidden per-op sync point. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 16:56:17 +08:00
parent 8070c1949a
commit a842e432b5
4 changed files with 11 additions and 23 deletions
--- a/crates/xtrain-cuda/src/ffi.rs
+++ b/crates/xtrain-cuda/src/ffi.rs
@@ -19,6 +19,7 @@ unsafe extern "C" {
    pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
    pub fn cudaFree(devptr: *mut u8) -> i32;
    pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
+    pub fn cudaMemset(devptr: *mut u8, value: i32, count: usize) -> i32;

    // --- Error ---
    pub fn cudaGetErrorString(error: i32) -> *const c_char;
--- a/crates/xtrain-cuda/src/memory.rs
+++ b/crates/xtrain-cuda/src/memory.rs
@@ -46,6 +46,12 @@ impl GpuBuffer {
            ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
        })
    }
+
+    /// Set every byte of the buffer to `value` on the device (no host copy).
+    /// Used to zero op-output buffers without a blocking H2D memcpy of zeros.
+    pub fn memset(&mut self, value: u8) -> Result<()> {
+        error::check(unsafe { ffi::cudaMemset(self.ptr, value as i32, self.len) })
+    }
 }

 impl Drop for GpuBuffer {
--- a/crates/xtrain-tensor/src/storage.rs
+++ b/crates/xtrain-tensor/src/storage.rs
@@ -99,9 +99,11 @@ impl Storage {
        match device {
            Device::Cpu => Ok(Storage::cpu(vec![0u8; len_bytes])),
            Device::Cuda(dev) => {
-                // No device memset in T2: stage zeros from the host.
+                // Device-side memset (Phase T7): avoids a blocking H2D memcpy of a
+                // host zero buffer on every op-output allocation. cudaMemset is
+                // async on the default stream, so it doesn't serialize the stream.
                let mut buf = GpuBuffer::alloc(len_bytes)?;
-                buf.copy_from_host(&vec![0u8; len_bytes])?;
+                buf.memset(0)?;
                Ok(Storage::cuda(buf, dev))
            }
        }
--- a/crates/xtrain-tensor/src/tensor.rs
+++ b/crates/xtrain-tensor/src/tensor.rs
@@ -153,7 +153,6 @@ impl Tensor {
                std::ptr::null_mut(), // default stream
            );
        }
-        xtrain_cuda::device::synchronize().expect("scale kernel sync failed");
        out
    }

@@ -201,7 +200,6 @@ impl Tensor {
            0.0,
            out.data_ptr() as *mut f32,
        );
-        xtrain_cuda::device::synchronize().expect("matmul kernel sync failed");
        out
    }

@@ -229,7 +227,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("transpose kernel sync failed");
        out
    }

@@ -277,7 +274,6 @@ impl Tensor {
            0.0,
            db.data_ptr() as *mut f32,
        );
-        xtrain_cuda::device::synchronize().expect("matmul_backward sync failed");
        (da, db)
    }

@@ -301,7 +297,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("add sync failed");
        out
    }

@@ -319,7 +314,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("mul sync failed");
        out
    }

@@ -342,7 +336,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("add_bias sync failed");
        out
    }

@@ -363,7 +356,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("sum_rows sync failed");
        out
    }

@@ -390,7 +382,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("rms_norm sync failed");
        (y, inv_rms)
    }

@@ -427,7 +418,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("rms_norm_backward sync failed");
        (dx, dgamma)
    }

@@ -444,7 +434,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("silu sync failed");
        out
    }

@@ -462,7 +451,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("silu_backward sync failed");
        dx
    }

@@ -485,7 +473,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("rope sync failed");
        out
    }

@@ -506,7 +493,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("rope_backward sync failed");
        dx
    }

@@ -525,7 +511,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("softmax sync failed");
        out
    }

@@ -545,7 +530,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("softmax_backward sync failed");
        dx
    }

@@ -571,7 +555,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("cross_entropy sync failed");
        (probs, loss)
    }

@@ -592,7 +575,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("cross_entropy_backward sync failed");
        dx
    }

@@ -640,7 +622,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("embedding sync failed");
        out
    }

@@ -660,7 +641,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("embedding_backward sync failed");
        dtable
    }

@@ -684,7 +664,6 @@ impl Tensor {
                std::ptr::null_mut(),
            );
        }
-        xtrain_cuda::device::synchronize().expect("transpose_3d01 sync failed");
        out
    }