cuda: add cached_trim() to release pooled GPU buffers

Exposes the caching allocator's trim() through a public free function. It is called after weight fusion during model loading to free temporary buffers that would otherwise sit in the pool and can cause out-of-memory (OOM) failures.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-30 12:50:04 +08:00
parent 1ab6ca9c09
commit 6ce21345be
1 changed files with 7 additions and 0 deletions
--- a/crates/xserv-cuda/src/allocator.rs
+++ b/crates/xserv-cuda/src/allocator.rs
@@ -100,6 +100,13 @@ pub fn cached_alloc(size: usize) -> Result<GpuBuffer> {
    })
 }

+/// Free all cached (unused) GPU buffers back to the driver via the allocator's `trim()`.
+pub fn cached_trim() {
+    ALLOCATOR.with(|cell| { // presumably a thread-local `RefCell`; TODO confirm
+        cell.borrow_mut().trim(); // any value `trim()` returns is discarded
+    });
+}
+
 /// Return a raw GPU pointer to the caching allocator's free list.
 /// Called from `GpuBuffer::Drop` for pooled buffers. Takes raw pointer
 /// and size to avoid re-triggering Drop.