perf: streams / drop per-op sync

Default-stream kernels run in order and every host read goes through a
stream-ordered cudaMemcpy (to_device), so the per-op cudaDeviceSynchronize
after each kernel was pure overhead — remove all 21 in tensor.rs. Host
data is still correctly ordered by the D2H memcpy that reads it.

Also zero op-output buffers with cudaMemset (device-side, async) instead of
a blocking H2D memcpy of a host zero buffer on every allocation — that
copy was itself a hidden per-op sync point.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 16:56:17 +08:00
parent 8070c1949a
commit a842e432b5
4 changed files with 11 additions and 23 deletions

View File

@@ -19,6 +19,7 @@ unsafe extern "C" {
pub fn cudaMalloc(devptr: *mut *mut u8, size: usize) -> i32;
pub fn cudaFree(devptr: *mut u8) -> i32;
pub fn cudaMemcpy(dst: *mut u8, src: *const u8, count: usize, kind: i32) -> i32;
pub fn cudaMemset(devptr: *mut u8, value: i32, count: usize) -> i32;
// --- Error ---
pub fn cudaGetErrorString(error: i32) -> *const c_char;

View File

@@ -46,6 +46,12 @@ impl GpuBuffer {
ffi::cudaMemcpy(dst.as_mut_ptr(), self.ptr, dst.len(), ffi::CUDA_MEMCPY_D2H)
})
}
/// Set every byte of the buffer to `value` on the device (no host copy).
/// Used to zero op-output buffers without a blocking H2D memcpy of zeros.
pub fn memset(&mut self, value: u8) -> Result<()> {
error::check(unsafe { ffi::cudaMemset(self.ptr, value as i32, self.len) })
}
}
impl Drop for GpuBuffer {

View File

@@ -99,9 +99,11 @@ impl Storage {
match device {
Device::Cpu => Ok(Storage::cpu(vec![0u8; len_bytes])),
Device::Cuda(dev) => {
// No device memset in T2: stage zeros from the host.
// Device-side memset (Phase T7): avoids a blocking H2D memcpy of a
// host zero buffer on every op-output allocation. cudaMemset is
// async on the default stream, so it doesn't serialize the stream.
let mut buf = GpuBuffer::alloc(len_bytes)?;
buf.copy_from_host(&vec![0u8; len_bytes])?;
buf.memset(0)?;
Ok(Storage::cuda(buf, dev))
}
}

View File

@@ -153,7 +153,6 @@ impl Tensor {
std::ptr::null_mut(), // default stream
);
}
xtrain_cuda::device::synchronize().expect("scale kernel sync failed");
out
}
@@ -201,7 +200,6 @@ impl Tensor {
0.0,
out.data_ptr() as *mut f32,
);
xtrain_cuda::device::synchronize().expect("matmul kernel sync failed");
out
}
@@ -229,7 +227,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("transpose kernel sync failed");
out
}
@@ -277,7 +274,6 @@ impl Tensor {
0.0,
db.data_ptr() as *mut f32,
);
xtrain_cuda::device::synchronize().expect("matmul_backward sync failed");
(da, db)
}
@@ -301,7 +297,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("add sync failed");
out
}
@@ -319,7 +314,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("mul sync failed");
out
}
@@ -342,7 +336,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("add_bias sync failed");
out
}
@@ -363,7 +356,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("sum_rows sync failed");
out
}
@@ -390,7 +382,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("rms_norm sync failed");
(y, inv_rms)
}
@@ -427,7 +418,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("rms_norm_backward sync failed");
(dx, dgamma)
}
@@ -444,7 +434,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("silu sync failed");
out
}
@@ -462,7 +451,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("silu_backward sync failed");
dx
}
@@ -485,7 +473,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("rope sync failed");
out
}
@@ -506,7 +493,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("rope_backward sync failed");
dx
}
@@ -525,7 +511,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("softmax sync failed");
out
}
@@ -545,7 +530,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("softmax_backward sync failed");
dx
}
@@ -571,7 +555,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("cross_entropy sync failed");
(probs, loss)
}
@@ -592,7 +575,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("cross_entropy_backward sync failed");
dx
}
@@ -640,7 +622,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("embedding sync failed");
out
}
@@ -660,7 +641,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("embedding_backward sync failed");
dtable
}
@@ -684,7 +664,6 @@ impl Tensor {
std::ptr::null_mut(),
);
}
xtrain_cuda::device::synchronize().expect("transpose_3d01 sync failed");
out
}