diff --git a/.circleci/config.yml b/.circleci/config.yml
index 39995c1..cef3e69 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,7 +3,7 @@ version: 2.1
 executors:
   default:
     machine:
-      image: ubuntu-1604-cuda-10.1:201909-23
+      image: ubuntu-2004-cuda-11.2:202103-01
     working_directory: ~/gpuci
     resource_class: gpu.nvidia.medium
 
@@ -18,6 +18,14 @@ restore-cache: &restore-cache
       - repo-source-{{ .Branch }}-{{ .Revision }}
 
 commands:
+  set-env-path:
+    steps:
+      - run:
+          name: Set the PATH env variable
+          command: |
+            echo 'export PATH="$HOME:~/.cargo/bin:/usr/local/cuda-11.2/bin:$PATH"' | tee --append $BASH_ENV
+            source $BASH_ENV
+
   test_target:
     parameters:
       target:
@@ -37,10 +45,9 @@ jobs:
     steps:
       - checkout
      - run: curl https://sh.rustup.rs -sSf | sh -s -- -y
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
+      - set-env-path
       - run: echo $BASH_ENV
       - run: echo $HOME
-      - run: source $BASH_ENV
       - run: cargo --version
       - run: rustc --version
       - run:
@@ -71,8 +78,7 @@ jobs:
   test_x86_64-unknown-linux-gnu:
     executor: default
     steps:
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
-      - run: source $BASH_ENV
+      - set-env-path
      - run: sudo apt-get update -y
      - run: apt-cache search opencl
      - run: sudo apt install -y ocl-icd-opencl-dev
@@ -84,6 +90,7 @@ jobs:
     steps:
       - *restore-workspace
       - *restore-cache
+      - set-env-path
       - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
       - run: source $BASH_ENV
       - run:
@@ -95,8 +102,7 @@ jobs:
     steps:
       - *restore-workspace
       - *restore-cache
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
-      - run: source $BASH_ENV
+      - set-env-path
       - run:
           name: Run cargo clippy
           command: cargo clippy --all-features --all-targets -- -D warnings
@@ -106,8 +112,7 @@ jobs:
     steps:
       - *restore-workspace
       - *restore-cache
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
-      - run: source $BASH_ENV
+      - set-env-path
       - run:
           name: Run cargo release build
           command: cargo build --release
diff --git a/Cargo.toml b/Cargo.toml
index 53b46c3..72bf73b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,10 @@ license = "MIT/Apache-2.0"
 repository = "https://github.com/filecoin-project/rust-gpu-tools"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[features]
+default = ["opencl", "cuda"]
+opencl = ["opencl3"]
+cuda = ["rustacuda"]
 
 [dependencies]
 dirs = "2.0.2"
@@ -16,5 +20,7 @@ sha2 = "0.8.2"
 thiserror = "1.0.10"
 lazy_static = "1.2"
 log = "0.4.11"
-opencl3 = { version = "0.4.1", default-features = false, features = ["CL_VERSION_1_2"] }
 hex = "0.4.3"
+
+opencl3 = { version = "0.4.1", default-features = false, features = ["CL_VERSION_1_2"], optional = true }
+rustacuda = { package = "fil-rustacuda", version = "0.1.3", optional = true }
diff --git a/src/cuda/mod.rs b/src/cuda/mod.rs
new file mode 100644
index 0000000..a85747e
--- /dev/null
+++ b/src/cuda/mod.rs
@@ -0,0 +1,437 @@
+//! The CUDA specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
+//!
+//! The current operation mode is synchronous, in order to have higher safety guarantees. All
+//! operations happen on a single stream, which is synchronized after each operation. This is
+//! similar behaviour to CUDA's default stream. The default stream isn't used for two reasons:
+//!
+//! 1. RustaCUDA doesn't expose a higher level function to launch a kernel on the default stream
+//! 2. There was a bug when the default stream was used implicitly via RustaCUDA's synchronous
+//!    copy methods. To prevent such bugs, we are explicit about which stream is used.
+
+pub(crate) mod utils;
+
+use std::convert::TryFrom;
+use std::ffi::{c_void, CStr, CString};
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+use rustacuda::memory::{AsyncCopyDestination, DeviceBuffer};
+use rustacuda::stream::{Stream, StreamFlags};
+
+use crate::device::{DeviceUuid, PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+use crate::LocalBuffer;
+
+/// A Buffer to be used for sending and receiving data to/from the GPU.
+#[derive(Debug)]
+pub struct Buffer<T> {
+    buffer: DeviceBuffer<u8>,
+    /// The number of T-sized elements.
+    length: usize,
+    _phantom: std::marker::PhantomData<T>,
+}
+
+/// CUDA specific device.
+#[derive(Debug, Clone)]
+pub struct Device {
+    vendor: Vendor,
+    name: String,
+    /// The total memory of the GPU in bytes.
+    memory: u64,
+    pci_id: PciId,
+    uuid: Option<DeviceUuid>,
+    device: rustacuda::device::Device,
+    context: rustacuda::context::UnownedContext,
+}
+
+impl Hash for Device {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.vendor.hash(state);
+        self.name.hash(state);
+        self.memory.hash(state);
+        self.pci_id.hash(state);
+        self.uuid.hash(state);
+    }
+}
+
+impl PartialEq for Device {
+    fn eq(&self, other: &Self) -> bool {
+        self.vendor == other.vendor
+            && self.name == other.name
+            && self.memory == other.memory
+            && self.pci_id == other.pci_id
+            && self.uuid == other.uuid
+    }
+}
+
+impl Eq for Device {}
+
+impl Device {
+    /// Returns the [`Vendor`] of the GPU.
+    pub fn vendor(&self) -> Vendor {
+        self.vendor
+    }
+
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn name(&self) -> String {
+        self.name.clone()
+    }
+
+    /// Returns the memory of the GPU in bytes.
+    pub fn memory(&self) -> u64 {
+        self.memory
+    }
+
+    /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information.
+    pub fn pci_id(&self) -> PciId {
+        self.pci_id
+    }
+
+    /// Returns the UUID of the GPU if available, see the [`DeviceUuid`] type for more
+    /// information.
+    pub fn uuid(&self) -> Option<DeviceUuid> {
+        self.uuid
+    }
+}
+
+/// Abstraction that contains everything to run a CUDA kernel on a GPU.
+///
+/// The majority of methods are the same as [`crate::opencl::Program`], so you can write code
+/// using this API, which will then work with OpenCL as well as CUDA kernels.
+// When compiled without the `opencl` feature, the intra-doc link above will be broken.
+#[allow(broken_intra_doc_links)]
+#[derive(Debug)]
+pub struct Program {
+    context: rustacuda::context::UnownedContext,
+    module: rustacuda::module::Module,
+    stream: Stream,
+    device_name: String,
+}
+
+impl Program {
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn device_name(&self) -> &str {
+        &self.device_name
+    }
+
+    /// Creates a program for a specific device from a compiled CUDA binary file.
+    pub fn from_binary(device: &Device, filename: &CStr) -> GPUResult<Program> {
+        rustacuda::context::CurrentContext::set_current(&device.context)?;
+        let module = rustacuda::module::Module::load_from_file(filename).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let prog = Program {
+            module,
+            stream,
+            device_name: device.name(),
+            context: device.context.clone(),
+        };
+        Self::pop_context();
+        Ok(prog)
+    }
+
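As a quick illustration (an editorial sketch, not part of the diff): with a device in hand, a pre-compiled CUDA binary can be loaded either from a file via `from_binary` above, or from embedded bytes via the `from_bytes` function that follows. The fatbin file name is hypothetical.

```rust
use rust_gpu_tools::cuda::{Device, Program};
use rust_gpu_tools::GPUError;

// Load a CUDA program from bytes embedded at compile time. `kernel.fatbin`
// is an assumed build artifact (e.g. produced by `nvcc --fatbin`).
fn load_program(device: &Device) -> Result<Program, GPUError> {
    Program::from_bytes(device, include_bytes!("kernel.fatbin"))
}
```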
+    /// Creates a program for a specific device from a compiled CUDA binary.
+    pub fn from_bytes(device: &Device, bytes: &[u8]) -> GPUResult<Program> {
+        rustacuda::context::CurrentContext::set_current(&device.context)?;
+        let module = rustacuda::module::Module::load_from_bytes(bytes).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let prog = Program {
+            module,
+            stream,
+            device_name: device.name(),
+            context: device.context.clone(),
+        };
+        Self::pop_context();
+        Ok(prog)
+    }
+
+    /// Creates a new buffer that can be used for input/output with the GPU.
+    ///
+    /// The `length` is the number of elements to create.
+    ///
+    /// It is usually used to create buffers that are initialized by the GPU. If you want to
+    /// directly transfer data from the host to the GPU, you would use the safe
+    /// [`Program::create_buffer_from_slice`] instead.
+    ///
+    /// ### Safety
+    ///
+    /// The buffer needs to be initialized (either by the host with
+    /// [`Program::write_from_buffer`], or by the GPU) before it can be read via
+    /// [`Program::read_into_buffer`].
+    pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
+        assert!(length > 0);
+        // This is the unsafe call, the rest of the function is safe code.
+        let buffer = DeviceBuffer::<u8>::uninitialized(length * std::mem::size_of::<T>())?;
+
+        Ok(Buffer::<T> {
+            buffer,
+            length,
+            _phantom: std::marker::PhantomData,
+        })
+    }
+
+    /// Creates a new buffer on the GPU and initializes it with the given slice.
+    pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
+        // The number of bytes is used for the allocations.
+        let bytes_len = slice.len() * std::mem::size_of::<T>();
+
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts(slice.as_ptr() as *const T as *const u8, bytes_len)
+        };
+
+        // It is only unsafe as long as the buffer isn't initialized, but that's what we do next.
+        let mut buffer = unsafe { DeviceBuffer::<u8>::uninitialized(bytes_len)? };
+        // It is safe as we synchronize the stream after the call.
+        unsafe { buffer.async_copy_from(bytes, &self.stream)? };
+        self.stream.synchronize()?;
+
+        Ok(Buffer::<T> {
+            buffer,
+            length: slice.len(),
+            _phantom: std::marker::PhantomData,
+        })
+    }
+
+    /// Returns a kernel.
+    ///
+    /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
+    /// number of threads. Instead it follows CUDA's definition and is the number of
+    /// `local_work_size` sized thread groups. So the total number of threads is
+    /// `global_work_size * local_work_size`.
+    pub fn create_kernel(&self, name: &str, gws: usize, lws: usize) -> GPUResult<Kernel> {
+        let function_name = CString::new(name).expect("Kernel name must not contain nul bytes");
+        let function = self.module.get_function(&function_name)?;
+
+        Ok(Kernel {
+            function,
+            global_work_size: gws,
+            local_work_size: lws,
+            stream: &self.stream,
+            args: Vec::new(),
+        })
+    }
+
+    /// Puts data from an existing buffer onto the GPU.
+    pub fn write_from_buffer<T>(&self, buffer: &mut Buffer<T>, data: &[T]) -> GPUResult<()> {
+        assert!(data.len() <= buffer.length, "Buffer is too small");
+
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts(
+                data.as_ptr() as *const T as *const u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
+        };
+
+        // It is safe as we synchronize the stream after the call.
+        unsafe { buffer.buffer.async_copy_from(bytes, &self.stream)? };
+        self.stream.synchronize()?;
+
+        Ok(())
+    }
+
+    /// Reads data from the GPU into an existing buffer.
+    pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
+        assert!(data.len() <= buffer.length, "Buffer is too small");
+
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts_mut(
+                data.as_mut_ptr() as *mut T as *mut u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
+        };
+
+        // It is safe as we synchronize the stream after the call.
+        unsafe { buffer.buffer.async_copy_to(bytes, &self.stream)? };
+        self.stream.synchronize()?;
+
+        Ok(())
+    }
+
+    /// Run some code in the context of the program.
+    ///
+    /// It sets the correct contexts.
+    ///
+    /// It takes the program as a parameter, so that we can use the same function body for both
+    /// the OpenCL and the CUDA code path. The only difference is the type of the program.
+    pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
+    where
+        F: FnOnce(&Self, A) -> Result<R, E>,
+        E: From<GPUError>,
+    {
+        rustacuda::context::CurrentContext::set_current(&self.context).map_err(Into::into)?;
+        let result = fun(self, arg);
+        Self::pop_context();
+        result
+    }
+
+    /// Pop the current context.
+    ///
+    /// It panics as failing to do so is an unrecoverable error.
+    fn pop_context() {
+        rustacuda::context::ContextStack::pop().expect("Cannot remove context.");
+    }
+}
+
+// TODO vmx 2021-07-07: Check if RustaCUDA types used in `Program` can be made `Send`, so that
+// this manual `Send` implementation is no longer needed.
+unsafe impl Send for Program {}
+
+/// Abstraction for kernel arguments.
+///
+/// Kernel arguments implement this trait, so that they can be converted into the correct
+/// pointers needed by the actual kernel call.
+pub trait KernelArgument {
+    /// Converts into a C void pointer.
+    fn as_c_void(&self) -> *mut c_void;
+
+    /// Returns the shared memory size. This is usually 0, except for [`LocalBuffer`]s. This
+    /// information is used to allocate the memory correctly.
+    fn shared_mem(&self) -> u32 {
+        0
+    }
+}
+
+impl<T> KernelArgument for Buffer<T> {
+    fn as_c_void(&self) -> *mut c_void {
+        &self.buffer as *const _ as _
+    }
+}
+
+impl KernelArgument for i32 {
+    fn as_c_void(&self) -> *mut c_void {
+        self as *const _ as _
+    }
+}
+
+impl KernelArgument for u32 {
+    fn as_c_void(&self) -> *mut c_void {
+        self as *const _ as _
+    }
+}
+
+impl<T> KernelArgument for LocalBuffer<T> {
+    // This is a hack: on CUDA kernels, you cannot have `__shared__` (`__local` in OpenCL lingo)
+    // kernel parameters. Hence, just pass on an arbitrary valid pointer. It won't be used, so it
+    // doesn't matter where it actually points to. A null pointer cannot be used as CUDA would
+    // return an "invalid argument" error.
+    fn as_c_void(&self) -> *mut c_void {
+        self as *const _ as _
+    }
+
+    fn shared_mem(&self) -> u32 {
+        u32::try_from(self.length * std::mem::size_of::<T>())
+            .expect("__shared__ memory allocation is too big.")
+    }
+}
+
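To see how the buffer methods above compose, here is a hedged round-trip sketch (editorial, not part of the diff): every call returns only after the stream has been synchronized, so the data is immediately observable on the host.

```rust
use rust_gpu_tools::cuda::Program;
use rust_gpu_tools::GPUError;

// Copy data to the GPU and read it back, relying on the synchronous
// operation mode described in the module documentation.
fn roundtrip(program: &Program) -> Result<(), GPUError> {
    let input = vec![1u32, 2, 3, 4];
    let mut buffer = program.create_buffer_from_slice(&input)?;
    // Overwrite the buffer from the host...
    program.write_from_buffer(&mut buffer, &[5u32, 6, 7, 8])?;
    // ...and read it back into host memory.
    let mut output = vec![0u32; input.len()];
    program.read_into_buffer(&buffer, &mut output)?;
    assert_eq!(output, [5, 6, 7, 8]);
    Ok(())
}
```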
+/// A kernel that can be executed.
+pub struct Kernel<'a> {
+    function: rustacuda::function::Function<'a>,
+    global_work_size: usize,
+    local_work_size: usize,
+    stream: &'a Stream,
+    args: Vec<&'a dyn KernelArgument>,
+}
+
+impl fmt::Debug for Kernel<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let args = self
+            .args
+            .iter()
+            .map(|arg| (arg.as_c_void(), arg.shared_mem()))
+            .collect::<Vec<_>>();
+        f.debug_struct("Kernel")
+            .field("function", &self.function)
+            .field("global_work_size", &self.global_work_size)
+            .field("local_work_size", &self.local_work_size)
+            .field("stream", &self.stream)
+            .field("args", &args)
+            .finish()
+    }
+}
+
+impl<'a> Kernel<'a> {
+    /// Set a kernel argument.
+    ///
+    /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
+    /// long as the kernel is in use.
+    ///
+    /// Example where this behaviour is enforced and leads to a compile-time error:
+    ///
+    /// ```compile_fail
+    /// use rust_gpu_tools::cuda::Program;
+    ///
+    /// fn would_break(program: &Program) {
+    ///    let data = vec![1, 2, 3, 4];
+    ///    let buffer = program.create_buffer_from_slice(&data).unwrap();
+    ///    let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
+    ///    let kernel = kernel.arg(&buffer);
+    ///    // This drop would not cause an error if the arguments weren't bound to the kernel's
+    ///    // lifetime.
+    ///    drop(buffer);
+    ///    kernel.run().unwrap();
+    /// }
+    /// ```
+    pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
+        self.args.push(t);
+        self
+    }
+
+    /// Actually run the kernel.
+    ///
+    /// ### Panics
+    ///
+    /// Panics if the wrong number of arguments was provided.
+    pub fn run(self) -> GPUResult<()> {
+        // There can only be a single [`LocalBuffer`], due to CUDA restrictions.
+        let shared_mem = self
+            .args
+            .iter()
+            .try_fold(0, |acc, &arg| -> GPUResult<u32> {
+                let mem = arg.shared_mem();
+                match (mem, acc) {
+                    // No new shared memory needs to be allocated.
+                    (0, _) => Ok(acc),
+                    // Some shared memory needs to be allocated.
+                    (_, 0) => Ok(mem),
+                    // Shared memory must not be allocated more than once.
+                    (_, _) => Err(GPUError::Generic(
+                        "There cannot be more than one `LocalBuffer`.".to_string(),
+                    )),
+                }
+            })?;
+        let args = self
+            .args
+            .iter()
+            .map(|arg| arg.as_c_void())
+            .collect::<Vec<_>>();
+        // It is safe to launch the kernel as the arguments need to live when the kernel is
+        // called, and the buffers are copied synchronously. At the end of the execution, the
+        // underlying stream is synchronized.
+        unsafe {
+            self.stream.launch(
+                &self.function,
+                self.global_work_size as u32,
+                self.local_work_size as u32,
+                shared_mem,
+                &args,
+            )?;
+        };
+        // Synchronize after the kernel execution, so that the underlying pointers can be
+        // invalidated/dropped.
+        self.stream.synchronize()?;
+        Ok(())
+    }
+}
diff --git a/src/cuda/utils.rs b/src/cuda/utils.rs
new file mode 100644
index 0000000..e319377
--- /dev/null
+++ b/src/cuda/utils.rs
@@ -0,0 +1,126 @@
+use std::convert::TryFrom;
+
+use log::{debug, warn};
+
+use crate::cuda::Device;
+use crate::device::{PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+
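For orientation, a hedged launch sketch using the `Kernel` API defined above; `add` is a hypothetical kernel name and the work sizes are arbitrary (8 × 256 threads in total).

```rust
use rust_gpu_tools::cuda::{Buffer, Program};
use rust_gpu_tools::{GPUError, LocalBuffer};

// Launch a kernel with one buffer argument, one scalar and one shared-memory
// allocation. `run` synchronizes the stream before returning.
fn launch(program: &Program, buffer: &Buffer<u32>, len: u32) -> Result<(), GPUError> {
    let shared = LocalBuffer::<u32>::new(256);
    let kernel = program.create_kernel("add", 8, 256)?;
    kernel.arg(buffer).arg(&len).arg(&shared).run()
}
```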
+// NOTE vmx 2021-04-14: This is a hack to make sure contexts stay around. We wrap them, so that
+// `Sync` can be implemented. `Sync` is needed for lazy_static. These contexts are never used
+// directly, they are only accessed through [`cuda::Device`], which contains an `UnownedContext`.
+// A device cannot own its context, as then it couldn't be cloned, but that is needed for
+// creating the kernels.
+pub(crate) struct CudaContexts(Vec<rustacuda::context::Context>);
+unsafe impl Sync for CudaContexts {}
+
+/// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
+///
+/// It is the first two identifiers of e.g. `lspci`:
+///
+/// ```text
+/// 4e:00.0 VGA compatible controller
+/// || └└-- Device ID
+/// └└-- Bus ID
+/// ```
+fn get_pci_id(device: &rustacuda::device::Device) -> Result<PciId, GPUError> {
+    let bus_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciBusId)? as u16;
+    let device_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciDeviceId)? as u16;
+    let pci_id = (bus_id << 8) | device_id;
+    Ok(pci_id.into())
+}
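A small standalone check of the bit packing used in `get_pci_id` (editorial illustration): the bus ID lands in the high byte and the device ID in the low byte.

```rust
// `lspci`'s `4e:00` corresponds to bus ID 0x4e and device ID 0x00, which
// pack into the single u16 value 0x4e00.
fn main() {
    let (bus_id, device_id): (u16, u16) = (0x4e, 0x00);
    let pci_id = (bus_id << 8) | device_id;
    assert_eq!(pci_id, 0x4e00);
    assert_eq!(format!("{:02x}:{:02x}", pci_id >> 8, pci_id & 0xff), "4e:00");
}
```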
+
+fn get_memory(d: &rustacuda::device::Device) -> GPUResult<u64> {
+    let memory = d.total_memory()?;
+    Ok(u64::try_from(memory).expect("Platform must be <= 64-bit"))
+}
+
+/// Get a list of all available and supported devices.
+///
+/// If there is a failure initializing CUDA or retrieving a device, it won't lead to a hard
+/// error, but an error will be logged and the corresponding device won't be available.
+pub(crate) fn build_device_list() -> (Vec<Device>, CudaContexts) {
+    let mut all_devices = Vec::new();
+    let mut devices_without_pci_id = Vec::new();
+    let mut contexts = Vec::new();
+
+    rustacuda::init(rustacuda::CudaFlags::empty())
+        .map_err(Into::into)
+        .and_then(|_| {
+            for device in rustacuda::device::Device::devices()? {
+                let device = device?;
+                let owned_context = rustacuda::context::Context::create_and_push(
+                    rustacuda::context::ContextFlags::MAP_HOST
+                        | rustacuda::context::ContextFlags::SCHED_AUTO,
+                    device,
+                )?;
+                rustacuda::context::ContextStack::pop()?;
+
+                let vendor = Vendor::Nvidia;
+                let name = device.name()?;
+                let memory = get_memory(&device)?;
+                let uuid = device.uuid().ok().map(Into::into);
+                let context = owned_context.get_unowned();
+
+                contexts.push(owned_context);
+
+                // If a device doesn't have a PCI-ID, add it later to the list of
+                // devices with a fake PCI-ID.
+                match get_pci_id(&device) {
+                    Ok(pci_id) => {
+                        all_devices.push(Device {
+                            vendor,
+                            name,
+                            memory,
+                            pci_id,
+                            uuid,
+                            device,
+                            context,
+                        });
+                    }
+                    Err(_) => {
+                        // Use a temporary PCI-ID and replace it later with a non-colliding one.
+                        let pci_id = PciId::from(0);
+                        devices_without_pci_id.push(Device {
+                            vendor,
+                            name,
+                            memory,
+                            pci_id,
+                            uuid,
+                            device,
+                            context,
+                        });
+                    }
+                };
+            }
+
+            // Laptops might have an integrated GPU. Such devices might have neither a PCI-ID
+            // nor a UUID. As those devices are used for development and not for production use,
+            // it's good enough to provide a workaround which doesn't add much complexity to the
+            // code. We use a fake PCI-ID instead, which is generated by enumerating the
+            // available devices. In order to make that case easier to spot when debugging
+            // issues, a starting number which is pleasant to the human eye was chosen, one that
+            // works in both decimal and hexadecimal (4660 == 0x1234).
+            let mut enumerated_device: u16 = 4660;
+            for mut device in devices_without_pci_id.into_iter() {
+                // Make sure that no other device already uses this PCI-ID.
+                while all_devices
+                    .iter()
+                    .any(|d| d.pci_id() == enumerated_device.into())
+                {
+                    enumerated_device += 1;
+                }
+                device.pci_id = PciId::from(enumerated_device);
+                enumerated_device += 1;
+                all_devices.push(device);
+            }
+
+            let wrapped_contexts = CudaContexts(contexts);
+
+            debug!("Loaded CUDA devices: {:?}", all_devices);
+            Ok((all_devices, wrapped_contexts))
+        })
+        .unwrap_or_else(|error: GPUError| {
+            warn!("Unable to retrieve CUDA devices: {:?}", error);
+            (Vec::new(), CudaContexts(Vec::new()))
+        })
+}
diff --git a/src/device.rs b/src/device.rs
new file mode 100644
index 0000000..a28edd7
--- /dev/null
+++ b/src/device.rs
@@ -0,0 +1,566 @@
+use std::fmt;
+
+use lazy_static::lazy_static;
+use log::debug;
+#[cfg(all(feature = "opencl", feature = "cuda"))]
+use log::warn;
+
+use std::convert::TryFrom;
+use std::mem;
+
+use crate::error::{GPUError, GPUResult};
+
+#[cfg(feature = "cuda")]
+use crate::cuda;
+#[cfg(feature = "opencl")]
+use crate::opencl;
+
+/// The UUIDs of the devices returned by OpenCL as well as CUDA are always 16 bytes long.
+const UUID_SIZE: usize = 16;
+const AMD_DEVICE_VENDOR_STRING: &str = "Advanced Micro Devices, Inc.";
+const AMD_DEVICE_VENDOR_ID: u32 = 0x1002;
+// For some reason integrated AMD cards on Apple don't have the usual vendor name and ID
+const AMD_DEVICE_ON_APPLE_VENDOR_STRING: &str = "AMD";
+const AMD_DEVICE_ON_APPLE_VENDOR_ID: u32 = 0x1021d00;
+const NVIDIA_DEVICE_VENDOR_STRING: &str = "NVIDIA Corporation";
+const NVIDIA_DEVICE_VENDOR_ID: u32 = 0x10de;
+
+#[cfg(feature = "cuda")]
+lazy_static! {
+    // The owned CUDA contexts are stored globally. Each device contains an unowned reference,
+    // so that devices can be cloned.
+    static ref DEVICES: (Vec<Device>, cuda::utils::CudaContexts) = build_device_list();
+}
+
+#[cfg(all(feature = "opencl", not(feature = "cuda")))]
+lazy_static! {
+    // Keep it as a tuple like in the CUDA case, so that using `DEVICES` is independent of the
+    // feature set.
+    static ref DEVICES: (Vec<Device>, ()) = build_device_list();
+}
+
+/// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
+///
+/// It is the first two identifiers of e.g. `lspci`:
+///
+/// ```text
+/// 4e:00.0 VGA compatible controller
+/// || └└-- Device ID
+/// └└-- Bus ID
+/// ```
+#[derive(Copy, Clone, Debug, Default, Eq, Hash, PartialEq)]
+pub struct PciId(u16);
+
+impl From<u16> for PciId {
+    fn from(id: u16) -> Self {
+        Self(id)
+    }
+}
+
+impl From<PciId> for u16 {
+    fn from(id: PciId) -> Self {
+        id.0
+    }
+}
+
+/// Converts a PCI-ID formatted as Bus-ID:Device-ID, e.g. `e3:00`.
+impl TryFrom<&str> for PciId {
+    type Error = GPUError;
+
+    fn try_from(pci_id: &str) -> GPUResult<Self> {
+        let mut bytes = [0; mem::size_of::<u16>()];
+        hex::decode_to_slice(pci_id.replace(":", ""), &mut bytes).map_err(|_| {
+            GPUError::InvalidId(format!(
+                "Cannot parse PCI ID, expected hex-encoded string formatted as aa:bb, got {0}.",
+                pci_id
+            ))
+        })?;
+        let parsed = u16::from_be_bytes(bytes);
+        Ok(Self(parsed))
+    }
+}
+
+/// Formats the PCI-ID like `lspci`, Bus-ID:Device-ID, e.g. `e3:00`.
+impl fmt::Display for PciId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let bytes = u16::to_be_bytes(self.0);
+        write!(f, "{:02x}:{:02x}", bytes[0], bytes[1])
+    }
+}
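The `TryFrom`/`Display` pair above round-trips, which the following sketch demonstrates (editorial, not part of the diff):

```rust
use std::convert::TryFrom;

use rust_gpu_tools::PciId;

fn main() {
    // Parse the `lspci`-style form and format it back.
    let pci_id = PciId::try_from("e3:00").expect("hex-encoded Bus-ID:Device-ID");
    assert_eq!(u16::from(pci_id), 0xe300);
    assert_eq!(pci_id.to_string(), "e3:00");
}
```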
+
+/// A unique identifier based on the UUID of the device.
+#[derive(Copy, Clone, Default, Eq, Hash, PartialEq)]
+pub struct DeviceUuid([u8; UUID_SIZE]);
+
+impl From<[u8; UUID_SIZE]> for DeviceUuid {
+    fn from(uuid: [u8; UUID_SIZE]) -> Self {
+        Self(uuid)
+    }
+}
+
+impl From<DeviceUuid> for [u8; UUID_SIZE] {
+    fn from(uuid: DeviceUuid) -> Self {
+        uuid.0
+    }
+}
+
+/// Converts a UUID formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee,
+/// e.g. 46abccd6-022e-b783-572d-833f7104d05f
+impl TryFrom<&str> for DeviceUuid {
+    type Error = GPUError;
+
+    fn try_from(uuid: &str) -> GPUResult<Self> {
+        let mut bytes = [0; UUID_SIZE];
+        hex::decode_to_slice(uuid.replace("-", ""), &mut bytes)
+            .map_err(|_| {
+                GPUError::InvalidId(format!("Cannot parse UUID, expected hex-encoded string formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, got {0}.", uuid))
+            })?;
+        Ok(Self(bytes))
+    }
+}
+
+/// Formats the UUID the same way as `clinfo` does, as an example:
+/// the output looks like 46abccd6-022e-b783-572d-833f7104d05f
+impl fmt::Display for DeviceUuid {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}-{}-{}-{}-{}",
+            hex::encode(&self.0[..4]),
+            hex::encode(&self.0[4..6]),
+            hex::encode(&self.0[6..8]),
+            hex::encode(&self.0[8..10]),
+            hex::encode(&self.0[10..])
+        )
+    }
+}
+
+impl fmt::Debug for DeviceUuid {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.to_string())
+    }
+}
+
+/// Unique identifier that can either be a PCI ID or a UUID.
+#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
+pub enum UniqueId {
+    /// ID based on the PCI bus.
+    PciId(PciId),
+    /// ID based on a globally unique identifier.
+    Uuid(DeviceUuid),
+}
+
+/// If the string contains a dash, it's interpreted as UUID, else it's interpreted as PCI ID.
+impl TryFrom<&str> for UniqueId {
+    type Error = GPUError;
+
+    fn try_from(unique_id: &str) -> GPUResult<Self> {
+        Ok(match unique_id.contains('-') {
+            true => Self::Uuid(DeviceUuid::try_from(unique_id)?),
+            false => Self::PciId(PciId::try_from(unique_id)?),
+        })
+    }
+}
+
+impl fmt::Display for UniqueId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::PciId(id) => id.fmt(f),
+            Self::Uuid(id) => id.fmt(f),
+        }
+    }
+}
+
+/// Currently supported vendors of this library.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum Vendor {
+    /// GPU by AMD.
+    Amd,
+    /// GPU by NVIDIA.
+    Nvidia,
+}
+
+impl TryFrom<&str> for Vendor {
+    type Error = GPUError;
+
+    fn try_from(vendor: &str) -> GPUResult<Self> {
+        match vendor {
+            AMD_DEVICE_VENDOR_STRING => Ok(Self::Amd),
+            AMD_DEVICE_ON_APPLE_VENDOR_STRING => Ok(Self::Amd),
+            NVIDIA_DEVICE_VENDOR_STRING => Ok(Self::Nvidia),
+            _ => Err(GPUError::UnsupportedVendor(vendor.to_string())),
+        }
+    }
+}
+
+impl TryFrom<u32> for Vendor {
+    type Error = GPUError;
+
+    fn try_from(vendor: u32) -> GPUResult<Self> {
+        match vendor {
+            AMD_DEVICE_VENDOR_ID => Ok(Self::Amd),
+            AMD_DEVICE_ON_APPLE_VENDOR_ID => Ok(Self::Amd),
+            NVIDIA_DEVICE_VENDOR_ID => Ok(Self::Nvidia),
+            _ => Err(GPUError::UnsupportedVendor(format!("0x{:x}", vendor))),
+        }
+    }
+}
+
+impl fmt::Display for Vendor {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let vendor = match self {
+            Self::Amd => AMD_DEVICE_VENDOR_STRING,
+            Self::Nvidia => NVIDIA_DEVICE_VENDOR_STRING,
+        };
+        write!(f, "{}", vendor)
+    }
+}
+
+/// Which framework to use, CUDA or OpenCL.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum Framework {
+    /// CUDA.
+    #[cfg(feature = "cuda")]
+    Cuda,
+    /// OpenCL.
+    #[cfg(feature = "opencl")]
+    Opencl,
+}
+
+/// A device that may have a CUDA and/or OpenCL GPU associated with it.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Device {
+    vendor: Vendor,
+    name: String,
+    memory: u64,
+    // All devices have a PCI ID. It is used as a fallback in case there is no UUID.
+    pci_id: PciId,
+    uuid: Option<DeviceUuid>,
+    #[cfg(feature = "cuda")]
+    cuda: Option<cuda::Device>,
+    #[cfg(feature = "opencl")]
+    opencl: Option<opencl::Device>,
+}
+
+impl Device {
+    /// Returns the [`Vendor`] of the GPU.
+    pub fn vendor(&self) -> Vendor {
+        self.vendor
+    }
+
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn name(&self) -> String {
+        self.name.clone()
+    }
+
+    /// Returns the memory of the GPU in bytes.
+    pub fn memory(&self) -> u64 {
+        self.memory
+    }
+
+    /// Returns the best possible unique identifier, a UUID is preferred over a PCI ID.
+    pub fn unique_id(&self) -> UniqueId {
+        match self.uuid {
+            Some(uuid) => UniqueId::Uuid(uuid),
+            None => UniqueId::PciId(self.pci_id),
+        }
+    }
+
+    /// Returns the preferred framework (CUDA or OpenCL) to use.
+    ///
+    /// CUDA will be preferred over OpenCL. The returned framework will work on the device.
+    /// E.g. it won't return `Framework::Cuda` for an AMD device.
+    pub fn framework(&self) -> Framework {
+        #[cfg(all(feature = "opencl", feature = "cuda"))]
+        if cfg!(feature = "cuda") && self.cuda.is_some() {
+            Framework::Cuda
+        } else {
+            Framework::Opencl
+        }
+
+        #[cfg(all(feature = "cuda", not(feature = "opencl")))]
+        {
+            Framework::Cuda
+        }
+
+        #[cfg(all(feature = "opencl", not(feature = "cuda")))]
+        {
+            Framework::Opencl
+        }
+    }
+
+    /// Returns the underlying CUDA device if it is available.
+    #[cfg(feature = "cuda")]
+    pub fn cuda_device(&self) -> Option<&cuda::Device> {
+        self.cuda.as_ref()
+    }
+
+    /// Returns the underlying OpenCL device if it is available.
+    #[cfg(feature = "opencl")]
+    pub fn opencl_device(&self) -> Option<&opencl::Device> {
+        self.opencl.as_ref()
+    }
+
+    /// Returns all available GPUs that are supported.
+    pub fn all() -> Vec<&'static Device> {
+        Self::all_iter().collect()
+    }
+
+    /// Returns the device matching the PCI ID if there is one.
+    pub fn by_pci_id(pci_id: PciId) -> Option<&'static Device> {
+        Self::all_iter().find(|d| pci_id == d.pci_id)
+    }
+
+    /// Returns the device matching the UUID if there is one.
+    pub fn by_uuid(uuid: DeviceUuid) -> Option<&'static Device> {
+        Self::all_iter().find(|d| Some(uuid) == d.uuid)
+    }
+
+    /// Returns the device matching the unique ID if there is one.
+    pub fn by_unique_id(unique_id: UniqueId) -> Option<&'static Device> {
+        Self::all_iter().find(|d| unique_id == d.unique_id())
+    }
+
+    /// Returns an iterator of all available GPUs that are supported.
+    fn all_iter() -> impl Iterator<Item = &'static Device> {
+        DEVICES.0.iter()
+    }
+}
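Putting the lookup methods together, device selection from a user-supplied identifier might look like this (a sketch; the printed framework depends on the enabled features):

```rust
use std::convert::TryFrom;

use rust_gpu_tools::{Device, UniqueId};

// Interpret `id` as a UUID if it contains a dash, otherwise as a PCI-ID,
// and look the device up in the global device list.
fn find_device(id: &str) -> Option<&'static Device> {
    let unique_id = UniqueId::try_from(id).ok()?;
    Device::by_unique_id(unique_id)
}

fn main() {
    for device in Device::all() {
        println!("{} ({}): {:?}", device.name(), device.unique_id(), device.framework());
    }
}
```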
+
+/// Get a list of all available and supported devices.
+///
+/// If both the `cuda` and the `opencl` features are enabled, a device supporting both will be
+/// combined into a single device. You can then access the underlying CUDA and OpenCL device
+/// if needed.
+///
+/// If there is a failure retrieving a device, it won't lead to a hard error, but an error will
+/// be logged and the corresponding device won't be available.
+#[cfg(feature = "cuda")]
+fn build_device_list() -> (Vec<Device>, cuda::utils::CudaContexts) {
+    let mut all_devices = Vec::new();
+
+    #[cfg(feature = "opencl")]
+    let opencl_devices = opencl::utils::build_device_list();
+
+    #[cfg(all(feature = "cuda", feature = "opencl"))]
+    let (mut cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
+    #[cfg(all(feature = "cuda", not(feature = "opencl")))]
+    let (cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
+
+    // Combine OpenCL and CUDA devices into one device if it is the same GPU
+    #[cfg(feature = "opencl")]
+    for opencl_device in opencl_devices {
+        let mut device = Device {
+            vendor: opencl_device.vendor(),
+            name: opencl_device.name(),
+            memory: opencl_device.memory(),
+            pci_id: opencl_device.pci_id(),
+            uuid: opencl_device.uuid(),
+            opencl: Some(opencl_device),
+            cuda: None,
+        };
+
+        // Only devices from Nvidia can use CUDA
+        #[cfg(feature = "cuda")]
+        if device.vendor == Vendor::Nvidia {
+            for ii in 0..cuda_devices.len() {
+                if (device.uuid.is_some() && cuda_devices[ii].uuid() == device.uuid)
+                    || (cuda_devices[ii].pci_id() == device.pci_id)
+                {
+                    if device.memory() != cuda_devices[ii].memory() {
+                        warn!("OpenCL and CUDA report different amounts of memory for a device with the same identifier");
+                        break;
+                    }
+                    // Move the CUDA device out of the vector
+                    device.cuda = Some(cuda_devices.remove(ii));
+                    // Only one device can match
+                    break;
+                }
+            }
+        }
+
+        all_devices.push(device)
+    }
+
+    // All CUDA devices that don't have a corresponding OpenCL device
+    for cuda_device in cuda_devices {
+        let device = Device {
+            vendor: cuda_device.vendor(),
+            name: cuda_device.name(),
+            memory: cuda_device.memory(),
+            pci_id: cuda_device.pci_id(),
+            uuid: cuda_device.uuid(),
+            cuda: Some(cuda_device),
+            #[cfg(feature = "opencl")]
+            opencl: None,
+        };
+        all_devices.push(device);
+    }
+
+    debug!("loaded devices: {:?}", all_devices);
+    (all_devices, cuda_contexts)
+}
+
+/// Get a list of all available and supported OpenCL devices.
+///
+/// If there is a failure retrieving a device, it won't lead to a hard error, but an error will
+/// be logged and the corresponding device won't be available.
+#[cfg(all(feature = "opencl", not(feature = "cuda")))]
+fn build_device_list() -> (Vec<Device>, ()) {
+    let devices = opencl::utils::build_device_list()
+        .into_iter()
+        .map(|device| Device {
+            vendor: device.vendor(),
+            name: device.name(),
+            memory: device.memory(),
+            pci_id: device.pci_id(),
+            uuid: device.uuid(),
+            opencl: Some(device),
+        })
+        .collect();
+
+    debug!("loaded devices: {:?}", devices);
+    (devices, ())
+}
+
+#[cfg(test)]
+mod test {
+    use super::{
+        Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
+        AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
+        NVIDIA_DEVICE_VENDOR_ID, NVIDIA_DEVICE_VENDOR_STRING,
+    };
+    use std::convert::TryFrom;
+
+    #[test]
+    fn test_device_all() {
+        let devices = Device::all();
+        for device in devices.iter() {
+            println!("device: {:?}", device);
+        }
+        assert!(!devices.is_empty(), "No supported GPU found.");
+    }
+
+    #[test]
+    fn test_vendor_from_str() {
+        assert_eq!(
+            Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
+            Vendor::Amd,
+            "AMD vendor string can be converted."
+        );
+        assert_eq!(
+            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
+            Vendor::Amd,
+            "AMD vendor string (on apple) can be converted."
+ ); + assert_eq!( + Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(), + Vendor::Nvidia, + "Nvidia vendor string can be converted." + ); + assert!(matches!( + Vendor::try_from("unknown vendor"), + Err(GPUError::UnsupportedVendor(_)) + )); + } + + #[test] + fn test_vendor_from_u32() { + assert_eq!( + Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(), + Vendor::Amd, + "AMD vendor ID can be converted." + ); + assert_eq!( + Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(), + Vendor::Amd, + "AMD vendor ID (on apple) can be converted." + ); + assert_eq!( + Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(), + Vendor::Nvidia, + "Nvidia vendor ID can be converted." + ); + assert!(matches!( + Vendor::try_from(0x1abc), + Err(GPUError::UnsupportedVendor(_)) + )); + } + + #[test] + fn test_vendor_display() { + assert_eq!( + Vendor::Amd.to_string(), + AMD_DEVICE_VENDOR_STRING, + "AMD vendor can be converted to string." + ); + assert_eq!( + Vendor::Nvidia.to_string(), + NVIDIA_DEVICE_VENDOR_STRING, + "Nvidia vendor can be converted to string." + ); + } + + #[test] + fn test_uuid() { + let valid_string = "46abccd6-022e-b783-572d-833f7104d05f"; + let valid = DeviceUuid::try_from(valid_string).unwrap(); + assert_eq!(valid_string, &valid.to_string()); + + let too_short_string = "ccd6-022e-b783-572d-833f7104d05f"; + let too_short = DeviceUuid::try_from(too_short_string); + assert!(too_short.is_err(), "Parse error when UUID is too short."); + + let invalid_hex_string = "46abccd6-022e-b783-572d-833f7104d05h"; + let invalid_hex = DeviceUuid::try_from(invalid_hex_string); + assert!( + invalid_hex.is_err(), + "Parse error when UUID containts non-hex character." + ); + } + + #[test] + fn test_pci_id() { + let valid_string = "01:00"; + let valid = PciId::try_from(valid_string).unwrap(); + assert_eq!(valid_string, &valid.to_string()); + assert_eq!(valid, PciId(0x0100)); + + let too_short_string = "3f"; + let too_short = PciId::try_from(too_short_string); + assert!(too_short.is_err(), "Parse error when PCI ID is too short."); + + let invalid_hex_string = "aaxx"; + let invalid_hex = PciId::try_from(invalid_hex_string); + assert!( + invalid_hex.is_err(), + "Parse error when PCI ID containts non-hex character." + ); + } + + #[test] + fn test_unique_id() { + let valid_pci_id_string = "aa:bb"; + let valid_pci_id = UniqueId::try_from(valid_pci_id_string).unwrap(); + assert_eq!(valid_pci_id_string, &valid_pci_id.to_string()); + assert_eq!(valid_pci_id, UniqueId::PciId(PciId(0xaabb))); + + let valid_uuid_string = "aabbccdd-eeff-0011-2233-445566778899"; + let valid_uuid = UniqueId::try_from(valid_uuid_string).unwrap(); + assert_eq!(valid_uuid_string, &valid_uuid.to_string()); + assert_eq!( + valid_uuid, + UniqueId::Uuid(DeviceUuid([ + 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99 + ])) + ); + + let invalid_string = "aabbccddeeffgg"; + let invalid = UniqueId::try_from(invalid_string); + assert!( + invalid.is_err(), + "Parse error when ID matches neither a PCI Id, nor a UUID." + ); + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..53a35a6 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,68 @@ +#[cfg(feature = "opencl")] +use opencl3::{device::DeviceInfo, error_codes::ClError, program::ProgramInfo}; +#[cfg(feature = "cuda")] +use rustacuda::error::CudaError; + +/// Error types of this library. 
+#[derive(thiserror::Error, Debug)]
+#[allow(clippy::upper_case_acronyms)]
+pub enum GPUError {
+    /// Error from the underlying `opencl3` library, e.g. a memory allocation failure.
+    #[cfg(feature = "opencl")]
+    #[error("Opencl3 Error: {0}{}", match .1 {
+       Some(message) => format!(" {}", message),
+       None => "".to_string(),
+    })]
+    Opencl3(ClError, Option<String>),
+
+    /// Error for OpenCL `clGetProgramInfo()` call failures.
+    #[cfg(feature = "opencl")]
+    #[error("Program info not available!")]
+    ProgramInfoNotAvailable(ProgramInfo),
+
+    /// Error for OpenCL `clGetDeviceInfo()` call failures.
+    #[cfg(feature = "opencl")]
+    #[error("Device info not available!")]
+    DeviceInfoNotAvailable(DeviceInfo),
+
+    /// Error from the underlying `RustaCUDA` library, e.g. a memory allocation failure.
+    #[cfg(feature = "cuda")]
+    #[error("Cuda Error: {0}")]
+    Cuda(#[from] CudaError),
+
+    /// Error when a device cannot be found.
+    #[error("Device not found!")]
+    DeviceNotFound,
+
+    /// Error when a kernel with the given name cannot be found.
+    #[error("Kernel with name {0} not found!")]
+    KernelNotFound(String),
+
+    /// Error when standard I/O fails.
+    #[error("IO Error: {0}")]
+    IO(#[from] std::io::Error),
+
+    /// Error when the device is from an unsupported vendor.
+    #[error("Vendor {0} is not supported.")]
+    UnsupportedVendor(String),
+
+    /// Error when the string representation of a unique identifier (PCI-ID or UUID) cannot be
+    /// parsed.
+    #[error("{0}")]
+    InvalidId(String),
+
+    /// Errors that rarely happen and don't deserve their own error type.
+    #[error("{0}")]
+    Generic(String),
+}
+
+/// Convenience type alias for [`GPUError`] based [`Result`]s.
+#[allow(clippy::upper_case_acronyms)]
+pub type GPUResult<T> = std::result::Result<T, GPUError>;
+
+#[cfg(feature = "opencl")]
+impl From<ClError> for GPUError {
+    fn from(error: ClError) -> Self {
+        GPUError::Opencl3(error, None)
+    }
+}
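Callers typically just propagate `GPUError` with `?` via the `From` conversions above; matching on individual variants is also possible, as in this editorial sketch:

```rust
use rust_gpu_tools::GPUError;

// Map selected error variants to user-facing messages; everything else
// falls back to the `Display` implementation derived via `thiserror`.
fn describe(error: &GPUError) -> String {
    match error {
        GPUError::DeviceNotFound => "no matching GPU was found".to_string(),
        GPUError::KernelNotFound(name) => format!("kernel `{}` is missing", name),
        other => other.to_string(),
    }
}
```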
diff --git a/src/lib.rs b/src/lib.rs
index 5d7cae6..ba4b75c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1 +1,54 @@
+//! Abstraction layer for OpenCL and CUDA.
+//!
+//! Feature Flags
+//! -------------
+//!
+//! There are two [feature flags], `cuda` and `opencl`. By default `opencl` is enabled. You can
+//! enable both at the same time. At least one of them needs to be enabled at any time.
+//!
+//! [feature flags]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-features-section
+
+#![warn(missing_docs)]
+
+mod device;
+mod error;
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+mod program;
+
+#[cfg(feature = "cuda")]
+pub mod cuda;
+#[cfg(feature = "opencl")]
+pub mod opencl;
+
+pub use device::{Device, DeviceUuid, Framework, PciId, UniqueId, Vendor};
+pub use error::GPUError;
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+pub use program::Program;
+
+#[cfg(not(any(feature = "cuda", feature = "opencl")))]
+compile_error!("At least one of the features `cuda` or `opencl` must be enabled.");
+
+/// A buffer on the GPU.
+///
+/// The concept of a local buffer is from OpenCL. In CUDA you don't allocate a buffer directly
+/// via API call. Instead you pass in the amount of shared memory that should be used.
+///
+/// There can be at most a single local buffer per kernel. On CUDA, a dummy (non-null) pointer
+/// is passed in as the kernel parameter instead of an actual value. The memory that should get
+/// allocated is then passed into the kernel call automatically.
+#[derive(Debug)]
+pub struct LocalBuffer<T> {
+    /// The number of T-sized elements.
+    length: usize,
+    _phantom: std::marker::PhantomData<T>,
+}
+
+impl<T> LocalBuffer<T> {
+    /// Returns a new buffer of the specified `length`.
+    pub fn new(length: usize) -> Self {
+        LocalBuffer::<T> {
+            length,
+            _phantom: std::marker::PhantomData,
+        }
+    }
+}
diff --git a/src/opencl/mod.rs b/src/opencl/mod.rs
index 3e9558f..5f270fb 100644
--- a/src/opencl/mod.rs
+++ b/src/opencl/mod.rs
@@ -1,210 +1,31 @@
 //! The OpenCL specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
 
-mod error;
-mod utils;
+pub(crate) mod utils;
 
 use std::collections::HashMap;
-use std::convert::TryFrom;
-use std::fmt;
 use std::hash::{Hash, Hasher};
-use std::mem;
 use std::ptr;
 
-pub use error::{GPUError, GPUResult};
-
 use opencl3::command_queue::CommandQueue;
 use opencl3::context::Context;
-use opencl3::device::{DeviceInfo::CL_DEVICE_ENDIAN_LITTLE, CL_UUID_SIZE_KHR};
 use opencl3::error_codes::ClError;
 use opencl3::kernel::ExecuteKernel;
 use opencl3::memory::CL_MEM_READ_WRITE;
 use opencl3::program::ProgramInfo::CL_PROGRAM_BINARIES;
 use opencl3::types::CL_BLOCKING;
 
-const AMD_DEVICE_VENDOR_STRING: &str = "Advanced Micro Devices, Inc.";
-const AMD_DEVICE_VENDOR_ID: u32 = 0x1002;
-// For some reason integrated AMD cards on Apple don't have the usual vendor name and ID
-const AMD_DEVICE_ON_APPLE_VENDOR_STRING: &str = "AMD";
-const AMD_DEVICE_ON_APPLE_VENDOR_ID: u32 = 0x1021d00;
-const NVIDIA_DEVICE_VENDOR_STRING: &str = "NVIDIA Corporation";
-const NVIDIA_DEVICE_VENDOR_ID: u32 = 0x10de;
+use log::debug;
+
+use crate::device::{DeviceUuid, PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+use crate::LocalBuffer;
 
+/// The lowest level identifier of an OpenCL device, it changes whenever a device is initialized.
 #[allow(non_camel_case_types)]
 pub type cl_device_id = opencl3::types::cl_device_id;
 
-// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
-///
-/// It is the first two identifiers of e.g. `lspci`:
-///
-/// ```text
-/// 4e:00.0 VGA compatible controller
-/// || └└-- Device ID
-/// └└-- Bus ID
-/// ```
-#[derive(Debug, Copy, Clone, PartialEq, Hash)]
-pub struct PciId(u16);
-
-impl From<u16> for PciId {
-    fn from(id: u16) -> Self {
-        Self(id)
-    }
-}
-
-impl From<PciId> for u16 {
-    fn from(id: PciId) -> Self {
-        id.0
-    }
-}
-
-/// Converts a PCI-ID formatted as Bus-ID:Device-ID, e.g. `e3:00`.
-impl TryFrom<&str> for PciId {
-    type Error = GPUError;
-
-    fn try_from(pci_id: &str) -> GPUResult<Self> {
-        let mut bytes = [0; mem::size_of::<u16>()];
-        hex::decode_to_slice(pci_id.replace(":", ""), &mut bytes).map_err(|_| {
-            GPUError::InvalidId(format!(
-                "Cannot parse PCI ID, expected hex-encoded string formated as aa:bb, got {0}.",
-                pci_id
-            ))
-        })?;
-        let parsed = u16::from_be_bytes(bytes);
-        Ok(Self(parsed))
-    }
-}
-
-/// Formats the PCI-ID like `lspci`, Bus-ID:Device-ID, e.g. `e3:00`.
-impl fmt::Display for PciId {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let bytes = u16::to_be_bytes(self.0);
-        write!(f, "{:02x}:{:02x}", bytes[0], bytes[1])
-    }
-}
-
-/// A unique identifier based on UUID of the device.
-#[derive(Copy, Clone, Default, PartialEq, Eq, Hash)]
-pub struct DeviceUuid([u8; CL_UUID_SIZE_KHR]);
-
-impl From<[u8; CL_UUID_SIZE_KHR]> for DeviceUuid {
-    fn from(uuid: [u8; CL_UUID_SIZE_KHR]) -> Self {
-        Self(uuid)
-    }
-}
-
-impl From<DeviceUuid> for [u8; CL_UUID_SIZE_KHR] {
-    fn from(uuid: DeviceUuid) -> Self {
-        uuid.0
-    }
-}
-
-/// Converts a UUID formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee,
-/// e.g.
46abccd6-022e-b783-572d-833f7104d05f -impl TryFrom<&str> for DeviceUuid { - type Error = GPUError; - - fn try_from(uuid: &str) -> GPUResult { - let mut bytes = [0; CL_UUID_SIZE_KHR]; - hex::decode_to_slice(uuid.replace("-", ""), &mut bytes) - .map_err(|_| { - GPUError::InvalidId(format!("Cannot parse UUID, expected hex-encoded string formated as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, got {0}.", uuid)) - })?; - Ok(Self(bytes)) - } -} - -/// Formats the UUID the same way as `clinfo` does, as an example: -/// the output should looks like 46abccd6-022e-b783-572d-833f7104d05f -impl fmt::Display for DeviceUuid { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "{}-{}-{}-{}-{}", - hex::encode(&self.0[..4]), - hex::encode(&self.0[4..6]), - hex::encode(&self.0[6..8]), - hex::encode(&self.0[8..10]), - hex::encode(&self.0[10..]) - ) - } -} - -impl fmt::Debug for DeviceUuid { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.to_string()) - } -} - -/// Unique identifier that can either be a PCI ID or a UUID. -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum UniqueId { - PciId(PciId), - Uuid(DeviceUuid), -} - -/// If the string contains a dash, it's interpreted as UUID, else it's interpreted as PCI ID. -impl TryFrom<&str> for UniqueId { - type Error = GPUError; - - fn try_from(unique_id: &str) -> GPUResult { - Ok(match unique_id.contains('-') { - true => Self::Uuid(DeviceUuid::try_from(unique_id)?), - false => Self::PciId(PciId::try_from(unique_id)?), - }) - } -} - -impl fmt::Display for UniqueId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::PciId(id) => id.fmt(f), - Self::Uuid(id) => id.fmt(f), - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum Vendor { - Amd, - Nvidia, -} - -impl TryFrom<&str> for Vendor { - type Error = GPUError; - - fn try_from(vendor: &str) -> GPUResult { - match vendor { - AMD_DEVICE_VENDOR_STRING => Ok(Self::Amd), - AMD_DEVICE_ON_APPLE_VENDOR_STRING => Ok(Self::Amd), - NVIDIA_DEVICE_VENDOR_STRING => Ok(Self::Nvidia), - _ => Err(GPUError::UnsupportedVendor(vendor.to_string())), - } - } -} - -impl TryFrom for Vendor { - type Error = GPUError; - - fn try_from(vendor: u32) -> GPUResult { - match vendor { - AMD_DEVICE_VENDOR_ID => Ok(Self::Amd), - AMD_DEVICE_ON_APPLE_VENDOR_ID => Ok(Self::Amd), - NVIDIA_DEVICE_VENDOR_ID => Ok(Self::Nvidia), - _ => Err(GPUError::UnsupportedVendor(format!("0x{:x}", vendor))), - } - } -} - -impl fmt::Display for Vendor { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let vendor = match self { - Self::Amd => AMD_DEVICE_VENDOR_STRING, - Self::Nvidia => NVIDIA_DEVICE_VENDOR_STRING, - }; - write!(f, "{}", vendor) - } -} - /// A Buffer to be used for sending and receiving data to/from the GPU. +#[derive(Debug)] pub struct Buffer { buffer: opencl3::memory::Buffer, /// The number of T-sized elements. @@ -261,11 +82,6 @@ impl Device { pub fn memory(&self) -> u64 { self.memory } - pub fn is_little_endian(&self) -> GPUResult { - self.device - .endian_little() - .map_err(|_| GPUError::DeviceInfoNotAvailable(CL_DEVICE_ENDIAN_LITTLE)) - } /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information. pub fn pci_id(&self) -> PciId { @@ -278,44 +94,6 @@ impl Device { self.uuid } - /// Returns the best possible unique identifier, a UUID is preferred over a PCI ID. 
-    pub fn unique_id(&self) -> UniqueId {
-        match self.uuid {
-            Some(uuid) => UniqueId::Uuid(uuid),
-            None => UniqueId::PciId(self.pci_id),
-        }
-    }
-
-    /// Return all available GPU devices of supported vendors.
-    pub fn all() -> Vec<&'static Device> {
-        Self::all_iter().collect()
-    }
-
-    pub fn by_pci_id(pci_id: PciId) -> GPUResult<&'static Device> {
-        Self::all_iter()
-            .find(|d| pci_id == d.pci_id)
-            .ok_or(GPUError::DeviceNotFound)
-    }
-
-    pub fn by_uuid(uuid: DeviceUuid) -> GPUResult<&'static Device> {
-        Self::all_iter()
-            .find(|d| match d.uuid {
-                Some(id) => uuid == id,
-                None => false,
-            })
-            .ok_or(GPUError::DeviceNotFound)
-    }
-
-    pub fn by_unique_id(unique_id: UniqueId) -> GPUResult<&'static Device> {
-        Self::all_iter()
-            .find(|d| unique_id == d.unique_id())
-            .ok_or(GPUError::DeviceNotFound)
-    }
-
-    fn all_iter() -> impl Iterator<Item = &'static Device> {
-        utils::DEVICES.iter()
-    }
-
     /// Low-level access to the device identifier.
     ///
     /// It changes when the device is initialized and should only be used to interact with other
@@ -351,11 +129,19 @@ impl Program {
             Program::from_binary(device, bin)
         } else {
             let context = Context::from_device(&device.device)?;
+            debug!(
+                "Building kernel ({}) from source…",
+                cached.to_string_lossy()
+            );
             let mut program = opencl3::program::Program::create_from_source(&context, src)?;
             if let Err(build_error) = program.build(context.devices(), "") {
                 let log = program.get_build_log(context.devices()[0])?;
                 return Err(GPUError::Opencl3(build_error, Some(log)));
             }
+            debug!(
+                "Building kernel ({}) from source: done.",
+                cached.to_string_lossy()
+            );
             let queue = CommandQueue::create(&context, context.default_device(), 0)?;
             let kernels = opencl3::kernel::create_program_kernels(&program)?;
             let kernels_by_name = kernels
@@ -409,7 +195,16 @@ impl Program {
     /// Creates a new buffer that can be used for input/output with the GPU.
     ///
     /// The `length` is the number of elements to create.
-    pub fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
+    ///
+    /// It is usually used to create buffers that are initialized by the GPU. If you want to
+    /// directly transfer data from the host to the GPU, you would use the safe
+    /// [`Program::create_buffer_from_slice`] instead.
+    ///
+    /// # Safety
+    ///
+    /// This function isn't actually unsafe, it's marked as `unsafe` due to the CUDA version of
+    /// it, where it is unsafe. This is done to have symmetry between both APIs.
+    pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
         assert!(length > 0);
         let mut buff = opencl3::memory::Buffer::create(
             &self.context,
@@ -431,6 +226,35 @@ impl Program {
         })
     }
 
+    /// Creates a new buffer on the GPU and initializes it with the given slice.
+    pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
+        let length = slice.len();
+        // The underlying buffer is `u8`, hence we need the number of bytes.
+        let bytes_len = length * std::mem::size_of::<T>();
+
+        let mut buffer = opencl3::memory::Buffer::create(
+            &self.context,
+            CL_MEM_READ_WRITE,
+            bytes_len,
+            ptr::null_mut(),
+        )?;
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts(slice.as_ptr() as *const T as *const u8, bytes_len)
+        };
+        // Write some data right away. This makes a significant performance difference.
+        self.queue
+            .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, &[0u8], &[])?;
+        self.queue
+            .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, &bytes, &[])?;
+
+        Ok(Buffer::<T> {
+            buffer,
+            length,
+            _phantom: std::marker::PhantomData,
+        })
+    }
+
     /// Returns a kernel.
     ///
     /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
@@ -453,73 +277,60 @@ impl Program {
         Ok(Kernel {
             builder,
             queue: &self.queue,
+            num_local_buffers: 0,
         })
     }
 
     /// Puts data from an existing buffer onto the GPU.
-    ///
-    /// The `offset` is in number of `T` sized elements, not in their byte size.
     pub fn write_from_buffer<T>(
         &self,
-        buffer: &Buffer<T>,
-        offset: usize,
+        // From Rust's perspective this buffer doesn't need to be mutable. But the sub-buffer
+        // is mutating the buffer, so it really should be.
+        buffer: &mut Buffer<T>,
         data: &[T],
     ) -> GPUResult<()> {
-        assert!(offset + data.len() <= buffer.length, "Buffer is too small");
-
-        let bytes_len = data.len() * std::mem::size_of::<T>();
-        let mut buff = buffer.buffer.create_sub_buffer(
-            CL_MEM_READ_WRITE,
-            offset * std::mem::size_of::<T>(),
-            bytes_len,
-        )?;
+        assert!(data.len() <= buffer.length, "Buffer is too small");
 
+        // It is safe as long as the sizes match.
         let bytes = unsafe {
-            std::slice::from_raw_parts(data.as_ptr() as *const T as *const u8, bytes_len)
+            std::slice::from_raw_parts(
+                data.as_ptr() as *const T as *const u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
         };
 
         self.queue
-            .enqueue_write_buffer(&mut buff, CL_BLOCKING, 0, &bytes, &[])?;
+            .enqueue_write_buffer(&mut buffer.buffer, CL_BLOCKING, 0, &bytes, &[])?;
 
         Ok(())
     }
 
     /// Reads data from the GPU into an existing buffer.
-    ///
-    /// The `offset` is in number of `T` sized elements, not in their byte size.
-    pub fn read_into_buffer<T>(
-        &self,
-        buffer: &Buffer<T>,
-        offset: usize,
-        data: &mut [T],
-    ) -> GPUResult<()> {
-        assert!(offset + data.len() <= buffer.length, "Buffer is too small");
-
-        let bytes_len = data.len() * std::mem::size_of::<T>();
-        let buff = buffer.buffer.create_sub_buffer(
-            CL_MEM_READ_WRITE,
-            offset * std::mem::size_of::<T>(),
-            bytes_len,
-        )?;
+    pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
+        assert!(data.len() <= buffer.length, "Buffer is too small");
 
+        // It is safe as long as the sizes match.
         let mut bytes = unsafe {
-            std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut T as *mut u8, bytes_len)
+            std::slice::from_raw_parts_mut(
+                data.as_mut_ptr() as *mut T as *mut u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
         };
 
         self.queue
-            .enqueue_read_buffer(&buff, CL_BLOCKING, 0, &mut bytes, &[])?;
+            .enqueue_read_buffer(&buffer.buffer, CL_BLOCKING, 0, &mut bytes, &[])?;
 
         Ok(())
     }
 
-    /// Run some code in the context of the program
+    /// Run some code in the context of the program.
     ///
-    /// On CUDA it sets the correct contexts and synchronizes the stream before returning.
-    /// On OpenCL it's only executing the closure without any other side-effects.
-    pub fn run<F, R, E>(&self, fun: F) -> Result<R, E>
+    /// It takes the program as a parameter, so that we can use the same function body for both
+    /// the OpenCL and the CUDA code path. The only difference is the type of the program.
+    pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
     where
-        F: FnOnce() -> Result<R, E>,
+        F: FnOnce(&Self, A) -> Result<R, E>,
         E: From<GPUError>,
     {
-        fun()
+        fun(self, arg)
     }
 }
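Since the OpenCL `Program` mirrors the CUDA API, the earlier CUDA sketch carries over almost verbatim (again an editorial illustration with a hypothetical kernel name):

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::{GPUError, LocalBuffer};

// Upload data, then launch `mul` with 4 * 64 threads in total. At most one
// `LocalBuffer` argument is accepted, matching the CUDA restriction.
fn launch(program: &Program, data: &[u32]) -> Result<(), GPUError> {
    let buffer = program.create_buffer_from_slice(data)?;
    let shared = LocalBuffer::<u32>::new(64);
    program.create_kernel("mul", 4, 64)?.arg(&buffer).arg(&shared).run()
}
```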
@@ -551,189 +362,60 @@ impl KernelArgument for u32 {
     }
 }
 
-/// A local buffer.
-pub struct LocalBuffer<T> {
-    /// The number of T sized elements.
-    length: usize,
-    _phantom: std::marker::PhantomData<T>,
-}
-impl<T> LocalBuffer<T> {
-    /// Returns a new buffer of the specified `length`.
-    pub fn new(length: usize) -> Self {
-        LocalBuffer::<T> {
-            length,
-            _phantom: std::marker::PhantomData,
-        }
-    }
-}
-
 impl<T> KernelArgument for LocalBuffer<T> {
     fn push(&self, kernel: &mut Kernel) {
         kernel
             .builder
             .set_arg_local_buffer::<T>(self.length * std::mem::size_of::<T>());
+        kernel.num_local_buffers += 1;
     }
 }
 
 /// A kernel that can be executed.
 #[derive(Debug)]
 pub struct Kernel<'a> {
-    builder: ExecuteKernel<'a>,
+    /// The underlying kernel builder.
+    pub builder: ExecuteKernel<'a>,
     queue: &'a CommandQueue,
+    /// There can only be a single [`LocalBuffer`] as parameter due to CUDA restrictions. This
+    /// counts them, so that there can be an error if there is more than one `LocalBuffer`
+    /// argument.
+    num_local_buffers: u8,
 }
 
 impl<'a> Kernel<'a> {
     /// Set a kernel argument.
-    pub fn arg<T: KernelArgument>(mut self, t: &T) -> Self {
+    ///
+    /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
+    /// long as the kernel is in use.
+    ///
+    /// Example where this behaviour is enforced and leads to a compile-time error:
+    ///
+    /// ```compile_fail
+    /// use rust_gpu_tools::opencl::Program;
+    ///
+    /// fn would_break(program: &Program) {
+    ///    let data = vec![1, 2, 3, 4];
+    ///    let buffer = program.create_buffer_from_slice(&data).unwrap();
+    ///    let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
+    ///    let kernel = kernel.arg(&buffer);
+    ///    // This drop would not cause an error if the arguments weren't bound to the kernel's
+    ///    // lifetime.
+    ///    drop(buffer);
+    ///    kernel.run().unwrap();
+    /// }
+    /// ```
+    pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
         t.push(&mut self);
         self
     }
 
     /// Actually run the kernel.
     pub fn run(mut self) -> GPUResult<()> {
+        if self.num_local_buffers > 1 {
+            return Err(GPUError::Generic(
+                "There cannot be more than one `LocalBuffer`.".to_string(),
+            ));
+        }
         self.builder.enqueue_nd_range(&self.queue)?;
         Ok(())
     }
 }
-
-#[cfg(test)]
-mod test {
-    use super::{
-        Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
-        AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
-        NVIDIA_DEVICE_VENDOR_ID, NVIDIA_DEVICE_VENDOR_STRING,
-    };
-    use std::convert::TryFrom;
-
-    #[test]
-    fn test_device_all() {
-        let devices = Device::all();
-        for device in devices.iter() {
-            println!("device: {:?}", device);
-        }
-        assert!(!devices.is_empty(), "No supported GPU found.");
-    }
-
-    #[test]
-    fn test_vendor_from_str() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor string can be converted."
-        );
-        assert!(matches!(
-            Vendor::try_from("unknown vendor"),
-            Err(GPUError::UnsupportedVendor(_))
-        ));
-    }
-
-    #[test]
-    fn test_vendor_from_u32() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor ID can be converted."
-
-#[cfg(test)]
-mod test {
-    use super::{
-        Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
-        AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
-        NVIDIA_DEVICE_VENDOR_ID, NVIDIA_DEVICE_VENDOR_STRING,
-    };
-    use std::convert::TryFrom;
-
-    #[test]
-    fn test_device_all() {
-        let devices = Device::all();
-        for device in devices.iter() {
-            println!("device: {:?}", device);
-        }
-        assert!(!devices.is_empty(), "No supported GPU found.");
-    }
-
-    #[test]
-    fn test_vendor_from_str() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor string can be converted."
-        );
-        assert!(matches!(
-            Vendor::try_from("unknown vendor"),
-            Err(GPUError::UnsupportedVendor(_))
-        ));
-    }
-
-    #[test]
-    fn test_vendor_from_u32() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor ID can be converted."
-        );
-        assert!(matches!(
-            Vendor::try_from(0x1abc),
-            Err(GPUError::UnsupportedVendor(_))
-        ));
-    }
-
-    #[test]
-    fn test_vendor_display() {
-        assert_eq!(
-            Vendor::Amd.to_string(),
-            AMD_DEVICE_VENDOR_STRING,
-            "AMD vendor can be converted to string."
-        );
-        assert_eq!(
-            Vendor::Nvidia.to_string(),
-            NVIDIA_DEVICE_VENDOR_STRING,
-            "Nvidia vendor can be converted to string."
-        );
-    }
-
-    #[test]
-    fn test_uuid() {
-        let valid_string = "46abccd6-022e-b783-572d-833f7104d05f";
-        let valid = DeviceUuid::try_from(valid_string).unwrap();
-        assert_eq!(valid_string, &valid.to_string());
-
-        let too_short_string = "ccd6-022e-b783-572d-833f7104d05f";
-        let too_short = DeviceUuid::try_from(too_short_string);
-        assert!(too_short.is_err(), "Parse error when UUID is too short.");
-
-        let invalid_hex_string = "46abccd6-022e-b783-572d-833f7104d05h";
-        let invalid_hex = DeviceUuid::try_from(invalid_hex_string);
-        assert!(
-            invalid_hex.is_err(),
-            "Parse error when UUID containts non-hex character."
-        );
-    }
-
-    #[test]
-    fn test_pci_id() {
-        let valid_string = "01:00";
-        let valid = PciId::try_from(valid_string).unwrap();
-        assert_eq!(valid_string, &valid.to_string());
-        assert_eq!(valid, PciId(0x0100));
-
-        let too_short_string = "3f";
-        let too_short = PciId::try_from(too_short_string);
-        assert!(too_short.is_err(), "Parse error when PCI ID is too short.");
-
-        let invalid_hex_string = "aaxx";
-        let invalid_hex = PciId::try_from(invalid_hex_string);
-        assert!(
-            invalid_hex.is_err(),
-            "Parse error when PCI ID containts non-hex character."
-        );
-    }
-
-    #[test]
-    fn test_unique_id() {
-        let valid_pci_id_string = "aa:bb";
-        let valid_pci_id = UniqueId::try_from(valid_pci_id_string).unwrap();
-        assert_eq!(valid_pci_id_string, &valid_pci_id.to_string());
-        assert_eq!(valid_pci_id, UniqueId::PciId(PciId(0xaabb)));
-
-        let valid_uuid_string = "aabbccdd-eeff-0011-2233-445566778899";
-        let valid_uuid = UniqueId::try_from(valid_uuid_string).unwrap();
-        assert_eq!(valid_uuid_string, &valid_uuid.to_string());
-        assert_eq!(
-            valid_uuid,
-            UniqueId::Uuid(DeviceUuid([
-                0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
-                0x77, 0x88, 0x99
-            ]))
-        );
-
-        let invalid_string = "aabbccddeeffgg";
-        let invalid = UniqueId::try_from(invalid_string);
-        assert!(
-            invalid.is_err(),
-            "Parse error when ID matches neither a PCI Id, nor a UUID."
-        );
-    }
-}
diff --git a/src/opencl/utils.rs b/src/opencl/utils.rs
index 178f2c7..6bccd6c 100644
--- a/src/opencl/utils.rs
+++ b/src/opencl/utils.rs
@@ -1,11 +1,13 @@
 use std::convert::{TryFrom, TryInto};

-use lazy_static::lazy_static;
 use log::{debug, warn};
 use opencl3::device::DeviceInfo::CL_DEVICE_GLOBAL_MEM_SIZE;
+use opencl3::device::CL_UUID_SIZE_KHR;
 use sha2::{Digest, Sha256};

-use super::{Device, DeviceUuid, GPUError, GPUResult, PciId, Vendor, CL_UUID_SIZE_KHR};
+use crate::device::{DeviceUuid, PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+use crate::opencl::Device;

 /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
 ///
@@ -68,15 +70,11 @@ fn get_memory(d: &opencl3::device::Device) -> GPUResult<u64> {
         .map_err(|_| GPUError::DeviceInfoNotAvailable(CL_DEVICE_GLOBAL_MEM_SIZE))
 }

-lazy_static! {
-    pub(crate) static ref DEVICES: Vec<Device> = build_device_list();
-}
-
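With the `lazy_static` cache removed from the OpenCL module, enumeration is expected to keep going through `Device::all()` (the entry point the removed tests above rely on); a caller sketch, assuming the unified `Device` type is re-exported at the crate root:

```rust
use rust_gpu_tools::Device;

// Only the caching moved out of the OpenCL-specific module; the public
// entry point for listing supported GPUs stays the same.
fn list_devices() {
    for device in Device::all() {
        println!("{} ({})", device.name(), device.vendor());
    }
}
```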
 /// Get a list of all available and supported devices.
 ///
 /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
 /// logged and the corresponding device won't be available.
-fn build_device_list() -> Vec<Device> {
+pub(crate) fn build_device_list() -> Vec<Device> {
     let mut all_devices = Vec::new();
     let platforms: Vec<_> = opencl3::platform::get_platforms().unwrap_or_default();
diff --git a/src/program.rs b/src/program.rs
new file mode 100644
index 0000000..6e371d0
--- /dev/null
+++ b/src/program.rs
@@ -0,0 +1,283 @@
+#[cfg(feature = "cuda")]
+use crate::cuda;
+use crate::error::GPUError;
+#[cfg(feature = "opencl")]
+use crate::opencl;
+
+/// Abstraction for running programs on CUDA or OpenCL.
+pub enum Program {
+    /// CUDA program.
+    #[cfg(feature = "cuda")]
+    Cuda(cuda::Program),
+    /// OpenCL program.
+    #[cfg(feature = "opencl")]
+    Opencl(opencl::Program),
+}
+
+impl Program {
+    /// Run some code in the context of the program.
+    ///
+    /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
+    /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
+    /// same code can be used to run on either of them. The only difference is the type of the
+    /// `Program`.
+    ///
+    /// You need to pass in two closures, one for OpenCL and one for CUDA; both get their
+    /// corresponding program type as a parameter. For convenience there is the
+    /// [`program_closures`] macro defined, which can help reduce code duplication by creating
+    /// two closures out of a single one.
+    ///
+    /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
+    /// one of them is disabled, you still need to pass in two closures. This way the API stays
+    /// the same, but you can disable one of them at compile-time.
+    ///
+    /// The second parameter is a single arbitrary argument, which will be passed on into the
+    /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
+    /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
+    /// is used at most once.
+    #[cfg(all(feature = "cuda", feature = "opencl"))]
+    pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
+    where
+        E: From<GPUError>,
+        F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
+        F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
+    {
+        match self {
+            Self::Cuda(program) => program.run(fun.0, arg),
+            Self::Opencl(program) => program.run(fun.1, arg),
+        }
+    }
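Without the macro, the closure tuple is spelled out by hand. A sketch with both features enabled; the doubling body is a placeholder for real GPU work, and crate-root re-exports of `Program` and `GPUError` are assumed:

```rust
use rust_gpu_tools::{cuda, opencl, GPUError, Program};

// The mutable reference is threaded through as `arg`, so only the closure
// that actually runs ever touches it.
fn double_all(program: &Program, mut data: Vec<u32>) -> Result<Vec<u32>, GPUError> {
    program.run(
        (
            |_program: &cuda::Program, data: &mut Vec<u32>| -> Result<(), GPUError> {
                data.iter_mut().for_each(|x| *x *= 2);
                Ok(())
            },
            |_program: &opencl::Program, data: &mut Vec<u32>| -> Result<(), GPUError> {
                data.iter_mut().for_each(|x| *x *= 2);
                Ok(())
            },
        ),
        &mut data,
    )?;
    Ok(data)
}
```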
+
+    /// Run some code in the context of the program.
+    ///
+    /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
+    /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
+    /// same code can be used to run on either of them. The only difference is the type of the
+    /// `Program`.
+    ///
+    /// You need to pass in two closures, one for OpenCL and one for CUDA; both get their
+    /// corresponding program type as a parameter. For convenience there is the
+    /// [`program_closures`] macro defined, which can help reduce code duplication by creating
+    /// two closures out of a single one.
+    ///
+    /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
+    /// one of them is disabled, you still need to pass in two closures. This way the API stays
+    /// the same, but you can disable one of them at compile-time.
+    ///
+    /// The second parameter is a single arbitrary argument, which will be passed on into the
+    /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
+    /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
+    /// is used at most once.
+    #[cfg(all(feature = "cuda", not(feature = "opencl")))]
+    pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
+    where
+        E: From<GPUError>,
+        F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
+    {
+        match self {
+            Self::Cuda(program) => program.run(fun.0, arg),
+        }
+    }
+
+    /// Run some code in the context of the program.
+    ///
+    /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
+    /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
+    /// same code can be used to run on either of them. The only difference is the type of the
+    /// `Program`.
+    ///
+    /// You need to pass in two closures, one for OpenCL and one for CUDA; both get their
+    /// corresponding program type as a parameter. For convenience there is the
+    /// [`program_closures`] macro defined, which can help reduce code duplication by creating
+    /// two closures out of a single one.
+    ///
+    /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
+    /// one of them is disabled, you still need to pass in two closures. This way the API stays
+    /// the same, but you can disable one of them at compile-time.
+    ///
+    /// The second parameter is a single arbitrary argument, which will be passed on into the
+    /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
+    /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
+    /// is used at most once.
+    #[cfg(all(not(feature = "cuda"), feature = "opencl"))]
+    pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
+    where
+        E: From<GPUError>,
+        F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
+    {
+        match self {
+            Self::Opencl(program) => program.run(fun.1, arg),
+        }
+    }
+
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn device_name(&self) -> &str {
+        match self {
+            #[cfg(feature = "cuda")]
+            Self::Cuda(program) => program.device_name(),
+            #[cfg(feature = "opencl")]
+            Self::Opencl(program) => program.device_name(),
+        }
+    }
+}
+
+/// Creates two closures, one for CUDA and one for OpenCL, from the given one.
+///
+/// This macro makes it possible to interact with rust-gpu-tools with unified code for both
+/// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program`
+/// and it will be mapped to [`&cuda::Program`] and [`&opencl::Program`].
+///
+/// The second parameter is a single arbitrary argument, which will be passed on into the closure.
+/// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
+/// shared between closures, hence we pass it on, so that the compiler knows that it is used at
+/// most once.
+///
+/// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
+/// thing and not specify one of them if it is appropriate.
+///
+/// ### Example
+///
+/// ```
+/// use rust_gpu_tools::{cuda, opencl, program_closures};
+///
+/// let closures = program_closures!(|program, arg: u8| -> bool {
+///     true
+/// });
+///
+/// // Generates
+/// let closures = (
+///     |program: &cuda::Program, arg: u8| { true },
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+///
+/// // If e.g. the `cuda` feature is disabled, it would generate
+/// let closures_without_cuda = (
+///     (),
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+/// ```
+#[cfg(all(feature = "cuda", feature = "opencl"))]
+#[macro_export]
+macro_rules! program_closures {
+    // Additional argument without a type
+    (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
+            |$program: &$crate::opencl::Program, $arg| -> $ret { $body },
+        )
+    };
+    // Additional argument with a type
+    (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
+            |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
+        )
+    };
+}
+
+/// Creates two closures, one for CUDA and one for OpenCL, from the given one.
+///
+/// This macro makes it possible to interact with rust-gpu-tools with unified code for both
+/// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program`
+/// and it will be mapped to [`&cuda::Program`] and [`&opencl::Program`].
+///
+/// The second parameter is a single arbitrary argument, which will be passed on into the closure.
+/// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
+/// shared between closures, hence we pass it on, so that the compiler knows that it is used at
+/// most once.
+///
+/// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
+/// thing and not specify one of them if it is appropriate.
+///
+/// ### Example
+///
+/// ```
+/// use rust_gpu_tools::{cuda, opencl, program_closures};
+///
+/// let closures = program_closures!(|program, arg: u8| -> bool {
+///     true
+/// });
+///
+/// // Generates
+/// let closures = (
+///     |program: &cuda::Program, arg: u8| { true },
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+///
+/// // If e.g. the `cuda` feature is disabled, it would generate
+/// let closures_without_cuda = (
+///     (),
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+/// ```
+#[macro_export]
+#[cfg(all(feature = "cuda", not(feature = "opencl")))]
+macro_rules! program_closures {
+    // Additional argument without a type
+    (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
+            (),
+        )
+    };
+    // Additional argument with a type
+    (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
+            (),
+        )
+    };
+}
+
+/// Creates two closures, one for CUDA and one for OpenCL, from the given one.
+///
+/// This macro makes it possible to interact with rust-gpu-tools with unified code for both
+/// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program`
+/// and it will be mapped to [`&cuda::Program`] and [`&opencl::Program`].
+///
+/// The second parameter is a single arbitrary argument, which will be passed on into the closure.
+/// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
+/// shared between closures, hence we pass it on, so that the compiler knows that it is used at
+/// most once.
+///
+/// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
+/// thing and not specify one of them if it is appropriate.
+///
+/// ### Example
+///
+/// ```
+/// use rust_gpu_tools::{cuda, opencl, program_closures};
+///
+/// let closures = program_closures!(|program, arg: u8| -> bool {
+///     true
+/// });
+///
+/// // Generates
+/// let closures = (
+///     |program: &cuda::Program, arg: u8| { true },
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+///
+/// // If e.g. the `cuda` feature is disabled, it would generate
+/// let closures_without_cuda = (
+///     (),
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+/// ```
+#[macro_export]
+#[cfg(all(not(feature = "cuda"), feature = "opencl"))]
+macro_rules! program_closures {
+    // Additional argument without a type
+    (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
+        ((), |$program: &$crate::opencl::Program, $arg| -> $ret {
+            $body
+        })
+    };
+    // Additional argument with a type
+    (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
+        (
+            (),
+            |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
+        )
+    };
+}
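Putting the pieces together, a call site that combines the macro with `Program::run` might look like this sketch; the summing body stands in for real GPU work, and crate-root re-exports of `Program` and `GPUError` are assumed:

```rust
use rust_gpu_tools::{program_closures, GPUError, Program};

fn total(program: &Program, input: Vec<u32>) -> Result<u32, GPUError> {
    // Expands to the closure tuple `Program::run` expects, with `()` in
    // place of whichever backend is disabled at compile-time.
    let closures = program_closures!(|_program, input: Vec<u32>| -> Result<u32, GPUError> {
        Ok(input.iter().sum())
    });
    program.run(closures, input)
}
```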