phase 0+1: fix Rust 2024 edition compat + memory query
- unsafe extern "C" blocks (Rust 2024 requirement) - unsafe blocks inside unsafe fn bodies - Use cudaMemGetInfo for accurate GPU memory reporting - Remove cc "cuda" feature (doesn't exist, built-in) - All 12 tests pass on RTX 5090 (CC 12.0, 170 SMs, 32GB) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,7 +4,7 @@ version.workspace = true
|
|||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
cc = { version = "1", features = ["cuda"] }
|
cc = "1"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
rand = "0.9"
|
rand = "0.9"
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ pub struct DeviceInfo {
|
|||||||
pub index: u32,
|
pub index: u32,
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub total_memory: usize,
|
pub total_memory: usize,
|
||||||
|
pub free_memory: usize,
|
||||||
pub compute_major: i32,
|
pub compute_major: i32,
|
||||||
pub compute_minor: i32,
|
pub compute_minor: i32,
|
||||||
pub sm_count: i32,
|
pub sm_count: i32,
|
||||||
@@ -15,8 +16,9 @@ pub struct DeviceInfo {
|
|||||||
pub max_threads_per_block: i32,
|
pub max_threads_per_block: i32,
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" {
|
unsafe extern "C" {
|
||||||
fn cudaDeviceGetAttribute(value: *mut i32, attr: i32, device: i32) -> i32;
|
fn cudaDeviceGetAttribute(value: *mut i32, attr: i32, device: i32) -> i32;
|
||||||
|
fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_attr(attr: i32, device: u32) -> Result<i32> {
|
fn get_attr(attr: i32, device: u32) -> Result<i32> {
|
||||||
@@ -42,16 +44,24 @@ pub fn current_device() -> Result<u32> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn device_info(device: u32) -> Result<DeviceInfo> {
|
pub fn device_info(device: u32) -> Result<DeviceInfo> {
|
||||||
// Use cudaGetDeviceProperties only for the name (first field, always stable).
|
// Get device name from cudaGetDeviceProperties (only use the name field).
|
||||||
let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
|
let mut prop = unsafe { std::mem::zeroed::<ffi::CudaDeviceProp>() };
|
||||||
error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;
|
error::check(unsafe { ffi::cudaGetDeviceProperties(&mut prop, device as i32) })?;
|
||||||
let name = unsafe { CStr::from_ptr(prop.name.as_ptr()) }
|
let name = unsafe { CStr::from_ptr(prop.name.as_ptr()) }
|
||||||
.to_string_lossy()
|
.to_string_lossy()
|
||||||
.into_owned();
|
.into_owned();
|
||||||
|
|
||||||
// Use cudaDeviceGetAttribute for everything else (layout-independent).
|
// Get memory info via cudaMemGetInfo (layout-independent).
|
||||||
// Attribute IDs from cuda_runtime_api.h:
|
let prev = current_device()?;
|
||||||
const TOTAL_GLOBAL_MEM: i32 = 0; // not available via attribute, use prop
|
set_device(device)?;
|
||||||
|
let mut free = 0usize;
|
||||||
|
let mut total = 0usize;
|
||||||
|
error::check(unsafe { cudaMemGetInfo(&mut free, &mut total) })?;
|
||||||
|
if prev != device {
|
||||||
|
set_device(prev)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Attribute IDs from cuda_runtime_api.h
|
||||||
const SHARED_MEM_PER_BLOCK: i32 = 8;
|
const SHARED_MEM_PER_BLOCK: i32 = 8;
|
||||||
const WARP_SIZE: i32 = 10;
|
const WARP_SIZE: i32 = 10;
|
||||||
const MAX_THREADS_PER_BLOCK: i32 = 1;
|
const MAX_THREADS_PER_BLOCK: i32 = 1;
|
||||||
@@ -62,7 +72,8 @@ pub fn device_info(device: u32) -> Result<DeviceInfo> {
|
|||||||
Ok(DeviceInfo {
|
Ok(DeviceInfo {
|
||||||
index: device,
|
index: device,
|
||||||
name,
|
name,
|
||||||
total_memory: prop.total_global_mem,
|
total_memory: total,
|
||||||
|
free_memory: free,
|
||||||
compute_major: get_attr(COMPUTE_MAJOR, device)?,
|
compute_major: get_attr(COMPUTE_MAJOR, device)?,
|
||||||
compute_minor: get_attr(COMPUTE_MINOR, device)?,
|
compute_minor: get_attr(COMPUTE_MINOR, device)?,
|
||||||
sm_count: get_attr(MULTI_PROCESSOR_COUNT, device)?,
|
sm_count: get_attr(MULTI_PROCESSOR_COUNT, device)?,
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ pub struct CudaDeviceProp {
|
|||||||
_pad: [u8; 4096],
|
_pad: [u8; 4096],
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" {
|
unsafe extern "C" {
|
||||||
// --- Device ---
|
// --- Device ---
|
||||||
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
|
pub fn cudaGetDeviceCount(count: *mut i32) -> i32;
|
||||||
pub fn cudaSetDevice(device: i32) -> i32;
|
pub fn cudaSetDevice(device: i32) -> i32;
|
||||||
|
|||||||
@@ -48,26 +48,30 @@ impl GpuBuffer {
|
|||||||
/// Safety: `src` must remain valid until the stream operation completes.
|
/// Safety: `src` must remain valid until the stream operation completes.
|
||||||
pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
|
pub unsafe fn copy_from_host_async(&mut self, src: &[u8], stream: &CudaStream) -> Result<()> {
|
||||||
assert!(src.len() <= self.len);
|
assert!(src.len() <= self.len);
|
||||||
error::check(ffi::cudaMemcpyAsync(
|
unsafe {
|
||||||
self.ptr,
|
error::check(ffi::cudaMemcpyAsync(
|
||||||
src.as_ptr(),
|
self.ptr,
|
||||||
src.len(),
|
src.as_ptr(),
|
||||||
ffi::CUDA_MEMCPY_H2D,
|
src.len(),
|
||||||
stream.as_raw(),
|
ffi::CUDA_MEMCPY_H2D,
|
||||||
))
|
stream.as_raw(),
|
||||||
|
))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Async copy from device to host on the given stream.
|
/// Async copy from device to host on the given stream.
|
||||||
/// Safety: `dst` must remain valid until the stream operation completes.
|
/// Safety: `dst` must remain valid until the stream operation completes.
|
||||||
pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
|
pub unsafe fn copy_to_host_async(&self, dst: &mut [u8], stream: &CudaStream) -> Result<()> {
|
||||||
assert!(dst.len() <= self.len);
|
assert!(dst.len() <= self.len);
|
||||||
error::check(ffi::cudaMemcpyAsync(
|
unsafe {
|
||||||
dst.as_mut_ptr(),
|
error::check(ffi::cudaMemcpyAsync(
|
||||||
self.ptr,
|
dst.as_mut_ptr(),
|
||||||
dst.len(),
|
self.ptr,
|
||||||
ffi::CUDA_MEMCPY_D2H,
|
dst.len(),
|
||||||
stream.as_raw(),
|
ffi::CUDA_MEMCPY_D2H,
|
||||||
))
|
stream.as_raw(),
|
||||||
|
))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copy from another GPU buffer (D2D).
|
/// Copy from another GPU buffer (D2D).
|
||||||
|
|||||||
@@ -7,7 +7,8 @@ fn test_device_info() {
|
|||||||
|
|
||||||
let info = device::device_info(0).expect("failed to get device info");
|
let info = device::device_info(0).expect("failed to get device info");
|
||||||
println!("GPU 0: {}", info.name);
|
println!("GPU 0: {}", info.name);
|
||||||
println!(" Memory: {} MB", info.total_memory / (1024 * 1024));
|
println!(" Total Memory: {} MB", info.total_memory / (1024 * 1024));
|
||||||
|
println!(" Free Memory: {} MB", info.free_memory / (1024 * 1024));
|
||||||
println!(
|
println!(
|
||||||
" Compute Capability: {}.{}",
|
" Compute Capability: {}.{}",
|
||||||
info.compute_major, info.compute_minor
|
info.compute_major, info.compute_minor
|
||||||
@@ -17,7 +18,7 @@ fn test_device_info() {
|
|||||||
println!(" Warp Size: {}", info.warp_size);
|
println!(" Warp Size: {}", info.warp_size);
|
||||||
println!(" Max Threads/Block: {}", info.max_threads_per_block);
|
println!(" Max Threads/Block: {}", info.max_threads_per_block);
|
||||||
|
|
||||||
assert!(info.total_memory > 0);
|
assert!(info.total_memory > 30 * 1024 * 1024 * 1024); // 5090 has 32GB
|
||||||
assert!(info.sm_count > 0);
|
assert!(info.sm_count > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user