diff --git a/.circleci/config.yml b/.circleci/config.yml
index 39995c1..cef3e69 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,7 +3,7 @@ version: 2.1
 executors:
   default:
     machine:
-      image: ubuntu-1604-cuda-10.1:201909-23
+      image: ubuntu-2004-cuda-11.2:202103-01
     working_directory: ~/gpuci
     resource_class: gpu.nvidia.medium
 
@@ -18,6 +18,14 @@ restore-cache: &restore-cache
       - repo-source-{{ .Branch }}-{{ .Revision }}
 
 commands:
+  set-env-path:
+    steps:
+      - run:
+          name: Set the PATH env variable
+          command: |
+            echo 'export PATH="$HOME:~/.cargo/bin:/usr/local/cuda-11.2/bin:$PATH"' | tee --append $BASH_ENV
+            source $BASH_ENV
+
   test_target:
     parameters:
       target:
@@ -37,10 +45,9 @@ jobs:
     steps:
       - checkout
      - run: curl https://sh.rustup.rs -sSf | sh -s -- -y
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
+      - set-env-path
       - run: echo $BASH_ENV
       - run: echo $HOME
-      - run: source $BASH_ENV
       - run: cargo --version
       - run: rustc --version
       - run:
@@ -71,8 +78,7 @@ jobs:
   test_x86_64-unknown-linux-gnu:
     executor: default
     steps:
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
-      - run: source $BASH_ENV
+      - set-env-path
      - run: sudo apt-get update -y
      - run: apt-cache search opencl
      - run: sudo apt install -y ocl-icd-opencl-dev
@@ -84,6 +90,7 @@ jobs:
     steps:
       - *restore-workspace
       - *restore-cache
+      - set-env-path
       - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
       - run: source $BASH_ENV
       - run:
@@ -95,8 +102,7 @@ jobs:
     steps:
       - *restore-workspace
       - *restore-cache
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
-      - run: source $BASH_ENV
+      - set-env-path
       - run:
           name: Run cargo clippy
           command: cargo clippy --all-features --all-targets -- -D warnings
@@ -106,8 +112,7 @@ jobs:
     steps:
       - *restore-workspace
       - *restore-cache
-      - run: echo 'export PATH="$HOME:~/.cargo/bin:$PATH"' >> $BASH_ENV
-      - run: source $BASH_ENV
+      - set-env-path
       - run:
           name: Run cargo release build
           command: cargo build --release
diff --git a/Cargo.toml b/Cargo.toml
index 53b46c3..72bf73b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,10 @@ license = "MIT/Apache-2.0"
 repository = "https://github.com/filecoin-project/rust-gpu-tools"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
+[features]
+default = ["opencl", "cuda"]
+opencl = ["opencl3"]
+cuda = ["rustacuda"]
 
 [dependencies]
 dirs = "2.0.2"
@@ -16,5 +20,7 @@ sha2 = "0.8.2"
 thiserror = "1.0.10"
 lazy_static = "1.2"
 log = "0.4.11"
-opencl3 = { version = "0.4.1", default-features = false, features = ["CL_VERSION_1_2"] }
 hex = "0.4.3"
+
+opencl3 = { version = "0.4.1", default-features = false, features = ["CL_VERSION_1_2"], optional = true }
+rustacuda = { package = "fil-rustacuda", version = "0.1.3", optional = true }
diff --git a/src/cuda/mod.rs b/src/cuda/mod.rs
new file mode 100644
index 0000000..a85747e
--- /dev/null
+++ b/src/cuda/mod.rs
@@ -0,0 +1,437 @@
+//! The CUDA specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
+//!
+//! The current operation mode is synchronous, in order to have higher safety guarantees. All
+//! operations happen on a single stream, which is synchronized after each operation. This is
+//! similar behaviour to CUDA's default stream. The default stream isn't used for two reasons:
+//!
+//! 1. RustaCUDA doesn't expose a higher level function to launch a kernel on the default stream
+//! 2. There was a bug when the default stream was used implicitly via RustaCUDA's synchronous
+//!    copy methods. To prevent such bugs, we are explicit about which stream is used.
+
+pub(crate) mod utils;
+
+use std::convert::TryFrom;
+use std::ffi::{c_void, CStr, CString};
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+use rustacuda::memory::{AsyncCopyDestination, DeviceBuffer};
+use rustacuda::stream::{Stream, StreamFlags};
+
+use crate::device::{DeviceUuid, PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+use crate::LocalBuffer;
+
+/// A Buffer to be used for sending and receiving data to/from the GPU.
+#[derive(Debug)]
+pub struct Buffer<T> {
+    buffer: DeviceBuffer<u8>,
+    /// The number of T-sized elements.
+    length: usize,
+    _phantom: std::marker::PhantomData<T>,
+}
+
+/// CUDA specific device.
+#[derive(Debug, Clone)]
+pub struct Device {
+    vendor: Vendor,
+    name: String,
+    /// The total memory of the GPU in bytes.
+    memory: u64,
+    pci_id: PciId,
+    uuid: Option<DeviceUuid>,
+    device: rustacuda::device::Device,
+    context: rustacuda::context::UnownedContext,
+}
+
+impl Hash for Device {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.vendor.hash(state);
+        self.name.hash(state);
+        self.memory.hash(state);
+        self.pci_id.hash(state);
+        self.uuid.hash(state);
+    }
+}
+
+impl PartialEq for Device {
+    fn eq(&self, other: &Self) -> bool {
+        self.vendor == other.vendor
+            && self.name == other.name
+            && self.memory == other.memory
+            && self.pci_id == other.pci_id
+            && self.uuid == other.uuid
+    }
+}
+
+impl Eq for Device {}
+
+impl Device {
+    /// Returns the [`Vendor`] of the GPU.
+    pub fn vendor(&self) -> Vendor {
+        self.vendor
+    }
+
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn name(&self) -> String {
+        self.name.clone()
+    }
+
+    /// Returns the memory of the GPU in bytes.
+    pub fn memory(&self) -> u64 {
+        self.memory
+    }
+
+    /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information.
+    pub fn pci_id(&self) -> PciId {
+        self.pci_id
+    }
+
+    /// Returns the UUID of the GPU if available, see the [`DeviceUuid`] type for more
+    /// information.
+    pub fn uuid(&self) -> Option<DeviceUuid> {
+        self.uuid
+    }
+}
+
+/// Abstraction that contains everything to run a CUDA kernel on a GPU.
+///
+/// The majority of methods are the same as [`crate::opencl::Program`], so you can write code
+/// using this API, which will then work with OpenCL as well as CUDA kernels.
+// When compiled without the `opencl` feature, the intra-doc link above will be broken.
+#[allow(broken_intra_doc_links)]
+#[derive(Debug)]
+pub struct Program {
+    context: rustacuda::context::UnownedContext,
+    module: rustacuda::module::Module,
+    stream: Stream,
+    device_name: String,
+}
+
+impl Program {
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn device_name(&self) -> &str {
+        &self.device_name
+    }
+
+    /// Creates a program for a specific device from a compiled CUDA binary file.
+    pub fn from_binary(device: &Device, filename: &CStr) -> GPUResult<Program> {
+        rustacuda::context::CurrentContext::set_current(&device.context)?;
+        let module = rustacuda::module::Module::load_from_file(filename).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let prog = Program {
+            module,
+            stream,
+            device_name: device.name(),
+            context: device.context.clone(),
+        };
+        Self::pop_context();
+        Ok(prog)
+    }
+
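As a quick illustration (an editorial sketch, not part of the diff): with a device in hand, a pre-compiled CUDA binary can be loaded either from a file via `from_binary` above, or from embedded bytes via the `from_bytes` function that follows. The fatbin file name is hypothetical.

```rust
use rust_gpu_tools::cuda::{Device, Program};
use rust_gpu_tools::GPUError;

// Load a CUDA program from bytes embedded at compile time. `kernel.fatbin`
// is an assumed build artifact (e.g. produced by `nvcc --fatbin`).
fn load_program(device: &Device) -> Result<Program, GPUError> {
    Program::from_bytes(device, include_bytes!("kernel.fatbin"))
}
```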
+    /// Creates a program for a specific device from a compiled CUDA binary.
+    pub fn from_bytes(device: &Device, bytes: &[u8]) -> GPUResult<Program> {
+        rustacuda::context::CurrentContext::set_current(&device.context)?;
+        let module = rustacuda::module::Module::load_from_bytes(bytes).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let stream = Stream::new(StreamFlags::NON_BLOCKING, None).map_err(|err| {
+            Self::pop_context();
+            err
+        })?;
+        let prog = Program {
+            module,
+            stream,
+            device_name: device.name(),
+            context: device.context.clone(),
+        };
+        Self::pop_context();
+        Ok(prog)
+    }
+
+    /// Creates a new buffer that can be used for input/output with the GPU.
+    ///
+    /// The `length` is the number of elements to create.
+    ///
+    /// It is usually used to create buffers that are initialized by the GPU. If you want to
+    /// directly transfer data from the host to the GPU, you would use the safe
+    /// [`Program::create_buffer_from_slice`] instead.
+    ///
+    /// ### Safety
+    ///
+    /// The buffer needs to be initialized (either by the host with
+    /// [`Program::write_from_buffer`], or by the GPU) before it can be read via
+    /// [`Program::read_into_buffer`].
+    pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
+        assert!(length > 0);
+        // This is the unsafe call, the rest of the function is safe code.
+        let buffer = DeviceBuffer::<u8>::uninitialized(length * std::mem::size_of::<T>())?;
+
+        Ok(Buffer::<T> {
+            buffer,
+            length,
+            _phantom: std::marker::PhantomData,
+        })
+    }
+
+    /// Creates a new buffer on the GPU and initializes it with the given slice.
+    pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
+        // The number of bytes is used for the allocations.
+        let bytes_len = slice.len() * std::mem::size_of::<T>();
+
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts(slice.as_ptr() as *const T as *const u8, bytes_len)
+        };
+
+        // It is only unsafe as long as the buffer isn't initialized, but that's what we do next.
+        let mut buffer = unsafe { DeviceBuffer::<u8>::uninitialized(bytes_len)? };
+        // It is safe as we synchronize the stream after the call.
+        unsafe { buffer.async_copy_from(bytes, &self.stream)? };
+        self.stream.synchronize()?;
+
+        Ok(Buffer::<T> {
+            buffer,
+            length: slice.len(),
+            _phantom: std::marker::PhantomData,
+        })
+    }
+
+    /// Returns a kernel.
+    ///
+    /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
+    /// number of threads. Instead it follows CUDA's definition and is the number of
+    /// `local_work_size` sized thread groups. So the total number of threads is
+    /// `global_work_size * local_work_size`.
+    pub fn create_kernel(&self, name: &str, gws: usize, lws: usize) -> GPUResult<Kernel> {
+        let function_name = CString::new(name).expect("Kernel name must not contain nul bytes");
+        let function = self.module.get_function(&function_name)?;
+
+        Ok(Kernel {
+            function,
+            global_work_size: gws,
+            local_work_size: lws,
+            stream: &self.stream,
+            args: Vec::new(),
+        })
+    }
+
+    /// Puts data from an existing buffer onto the GPU.
+    pub fn write_from_buffer<T>(&self, buffer: &mut Buffer<T>, data: &[T]) -> GPUResult<()> {
+        assert!(data.len() <= buffer.length, "Buffer is too small");
+
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts(
+                data.as_ptr() as *const T as *const u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
+        };
+
+        // It is safe as we synchronize the stream after the call.
+        unsafe { buffer.buffer.async_copy_from(bytes, &self.stream)? };
+        self.stream.synchronize()?;
+
+        Ok(())
+    }
+
+    /// Reads data from the GPU into an existing buffer.
+    pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
+        assert!(data.len() <= buffer.length, "Buffer is too small");
+
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts_mut(
+                data.as_mut_ptr() as *mut T as *mut u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
+        };
+
+        // It is safe as we synchronize the stream after the call.
+        unsafe { buffer.buffer.async_copy_to(bytes, &self.stream)? };
+        self.stream.synchronize()?;
+
+        Ok(())
+    }
+
+    /// Run some code in the context of the program.
+    ///
+    /// It sets the correct contexts.
+    ///
+    /// It takes the program as a parameter, so that we can use the same function body for both
+    /// the OpenCL and the CUDA code path. The only difference is the type of the program.
+    pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
+    where
+        F: FnOnce(&Self, A) -> Result<R, E>,
+        E: From<GPUError>,
+    {
+        rustacuda::context::CurrentContext::set_current(&self.context).map_err(Into::into)?;
+        let result = fun(self, arg);
+        Self::pop_context();
+        result
+    }
+
+    /// Pop the current context.
+    ///
+    /// It panics as failing to do so is an unrecoverable error.
+    fn pop_context() {
+        rustacuda::context::ContextStack::pop().expect("Cannot remove context.");
+    }
+}
+
+// TODO vmx 2021-07-07: Check if RustaCUDA types used in `Program` can be made `Send`, so that
+// this manual `Send` implementation is no longer needed.
+unsafe impl Send for Program {}
+
+/// Abstraction for kernel arguments.
+///
+/// Kernel arguments implement this trait, so that they can be converted into the correct
+/// pointers needed by the actual kernel call.
+pub trait KernelArgument {
+    /// Converts into a C void pointer.
+    fn as_c_void(&self) -> *mut c_void;
+
+    /// Returns the shared memory size. This is usually 0, except for [`LocalBuffer`]s. This
+    /// information is used to allocate the memory correctly.
+    fn shared_mem(&self) -> u32 {
+        0
+    }
+}
+
+impl<T> KernelArgument for Buffer<T> {
+    fn as_c_void(&self) -> *mut c_void {
+        &self.buffer as *const _ as _
+    }
+}
+
+impl KernelArgument for i32 {
+    fn as_c_void(&self) -> *mut c_void {
+        self as *const _ as _
+    }
+}
+
+impl KernelArgument for u32 {
+    fn as_c_void(&self) -> *mut c_void {
+        self as *const _ as _
+    }
+}
+
+impl<T> KernelArgument for LocalBuffer<T> {
+    // This is a hack: on CUDA kernels, you cannot have `__shared__` (`__local` in OpenCL lingo)
+    // kernel parameters. Hence, just pass on an arbitrary valid pointer. It won't be used, so it
+    // doesn't matter where it actually points to. A null pointer cannot be used as CUDA would
+    // return an "invalid argument" error.
+    fn as_c_void(&self) -> *mut c_void {
+        self as *const _ as _
+    }
+
+    fn shared_mem(&self) -> u32 {
+        u32::try_from(self.length * std::mem::size_of::<T>())
+            .expect("__shared__ memory allocation is too big.")
+    }
+}
+
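To see how the buffer methods above compose, here is a hedged round-trip sketch (editorial, not part of the diff): every call returns only after the stream has been synchronized, so the data is immediately observable on the host.

```rust
use rust_gpu_tools::cuda::Program;
use rust_gpu_tools::GPUError;

// Copy data to the GPU and read it back, relying on the synchronous
// operation mode described in the module documentation.
fn roundtrip(program: &Program) -> Result<(), GPUError> {
    let input = vec![1u32, 2, 3, 4];
    let mut buffer = program.create_buffer_from_slice(&input)?;
    // Overwrite the buffer from the host...
    program.write_from_buffer(&mut buffer, &[5u32, 6, 7, 8])?;
    // ...and read it back into host memory.
    let mut output = vec![0u32; input.len()];
    program.read_into_buffer(&buffer, &mut output)?;
    assert_eq!(output, [5, 6, 7, 8]);
    Ok(())
}
```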
+/// A kernel that can be executed.
+pub struct Kernel<'a> {
+    function: rustacuda::function::Function<'a>,
+    global_work_size: usize,
+    local_work_size: usize,
+    stream: &'a Stream,
+    args: Vec<&'a dyn KernelArgument>,
+}
+
+impl fmt::Debug for Kernel<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let args = self
+            .args
+            .iter()
+            .map(|arg| (arg.as_c_void(), arg.shared_mem()))
+            .collect::<Vec<_>>();
+        f.debug_struct("Kernel")
+            .field("function", &self.function)
+            .field("global_work_size", &self.global_work_size)
+            .field("local_work_size", &self.local_work_size)
+            .field("stream", &self.stream)
+            .field("args", &args)
+            .finish()
+    }
+}
+
+impl<'a> Kernel<'a> {
+    /// Set a kernel argument.
+    ///
+    /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
+    /// long as the kernel is in use.
+    ///
+    /// Example where this behaviour is enforced and leads to a compile-time error:
+    ///
+    /// ```compile_fail
+    /// use rust_gpu_tools::cuda::Program;
+    ///
+    /// fn would_break(program: &Program) {
+    ///    let data = vec![1, 2, 3, 4];
+    ///    let buffer = program.create_buffer_from_slice(&data).unwrap();
+    ///    let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
+    ///    let kernel = kernel.arg(&buffer);
+    ///    // This drop would not cause an error if the arguments weren't bound to the kernel's
+    ///    // lifetime.
+    ///    drop(buffer);
+    ///    kernel.run().unwrap();
+    /// }
+    /// ```
+    pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
+        self.args.push(t);
+        self
+    }
+
+    /// Actually run the kernel.
+    ///
+    /// ### Panics
+    ///
+    /// Panics if the wrong number of arguments was provided.
+    pub fn run(self) -> GPUResult<()> {
+        // There can only be a single [`LocalBuffer`], due to CUDA restrictions.
+        let shared_mem = self
+            .args
+            .iter()
+            .try_fold(0, |acc, &arg| -> GPUResult<u32> {
+                let mem = arg.shared_mem();
+                match (mem, acc) {
+                    // No new shared memory needs to be allocated.
+                    (0, _) => Ok(acc),
+                    // Some shared memory needs to be allocated.
+                    (_, 0) => Ok(mem),
+                    // Shared memory must not be allocated more than once.
+                    (_, _) => Err(GPUError::Generic(
+                        "There cannot be more than one `LocalBuffer`.".to_string(),
+                    )),
+                }
+            })?;
+        let args = self
+            .args
+            .iter()
+            .map(|arg| arg.as_c_void())
+            .collect::<Vec<_>>();
+        // It is safe to launch the kernel as the arguments need to live when the kernel is
+        // called, and the buffers are copied synchronously. At the end of the execution, the
+        // underlying stream is synchronized.
+        unsafe {
+            self.stream.launch(
+                &self.function,
+                self.global_work_size as u32,
+                self.local_work_size as u32,
+                shared_mem,
+                &args,
+            )?;
+        };
+        // Synchronize after the kernel execution, so that the underlying pointers can be
+        // invalidated/dropped.
+        self.stream.synchronize()?;
+        Ok(())
+    }
+}
diff --git a/src/cuda/utils.rs b/src/cuda/utils.rs
new file mode 100644
index 0000000..e319377
--- /dev/null
+++ b/src/cuda/utils.rs
@@ -0,0 +1,126 @@
+use std::convert::TryFrom;
+
+use log::{debug, warn};
+
+use crate::cuda::Device;
+use crate::device::{PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+
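For orientation, a hedged launch sketch using the `Kernel` API defined above; `add` is a hypothetical kernel name and the work sizes are arbitrary (8 × 256 threads in total).

```rust
use rust_gpu_tools::cuda::{Buffer, Program};
use rust_gpu_tools::{GPUError, LocalBuffer};

// Launch a kernel with one buffer argument, one scalar and one shared-memory
// allocation. `run` synchronizes the stream before returning.
fn launch(program: &Program, buffer: &Buffer<u32>, len: u32) -> Result<(), GPUError> {
    let shared = LocalBuffer::<u32>::new(256);
    let kernel = program.create_kernel("add", 8, 256)?;
    kernel.arg(buffer).arg(&len).arg(&shared).run()
}
```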
+// NOTE vmx 2021-04-14: This is a hack to make sure contexts stay around. We wrap them, so that
+// `Sync` can be implemented. `Sync` is needed for lazy_static. These contexts are never used
+// directly, they are only accessed through [`cuda::Device`], which contains an `UnownedContext`.
+// A device cannot own its context, as then it couldn't be cloned, but that is needed for
+// creating the kernels.
+pub(crate) struct CudaContexts(Vec<rustacuda::context::Context>);
+unsafe impl Sync for CudaContexts {}
+
+/// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
+///
+/// It is the first two identifiers of e.g. `lspci`:
+///
+/// ```text
+/// 4e:00.0 VGA compatible controller
+/// || └└-- Device ID
+/// └└-- Bus ID
+/// ```
+fn get_pci_id(device: &rustacuda::device::Device) -> Result<PciId, GPUError> {
+    let bus_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciBusId)? as u16;
+    let device_id = device.get_attribute(rustacuda::device::DeviceAttribute::PciDeviceId)? as u16;
+    let pci_id = (bus_id << 8) | device_id;
+    Ok(pci_id.into())
+}
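A small standalone check of the bit packing used in `get_pci_id` (editorial illustration): the bus ID lands in the high byte and the device ID in the low byte.

```rust
// `lspci`'s `4e:00` corresponds to bus ID 0x4e and device ID 0x00, which
// pack into the single u16 value 0x4e00.
fn main() {
    let (bus_id, device_id): (u16, u16) = (0x4e, 0x00);
    let pci_id = (bus_id << 8) | device_id;
    assert_eq!(pci_id, 0x4e00);
    assert_eq!(format!("{:02x}:{:02x}", pci_id >> 8, pci_id & 0xff), "4e:00");
}
```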
+
+fn get_memory(d: &rustacuda::device::Device) -> GPUResult<u64> {
+    let memory = d.total_memory()?;
+    Ok(u64::try_from(memory).expect("Platform must be <= 64-bit"))
+}
+
+/// Get a list of all available and supported devices.
+///
+/// If there is a failure initializing CUDA or retrieving a device, it won't lead to a hard
+/// error, but an error will be logged and the corresponding device won't be available.
+pub(crate) fn build_device_list() -> (Vec<Device>, CudaContexts) {
+    let mut all_devices = Vec::new();
+    let mut devices_without_pci_id = Vec::new();
+    let mut contexts = Vec::new();
+
+    rustacuda::init(rustacuda::CudaFlags::empty())
+        .map_err(Into::into)
+        .and_then(|_| {
+            for device in rustacuda::device::Device::devices()? {
+                let device = device?;
+                let owned_context = rustacuda::context::Context::create_and_push(
+                    rustacuda::context::ContextFlags::MAP_HOST
+                        | rustacuda::context::ContextFlags::SCHED_AUTO,
+                    device,
+                )?;
+                rustacuda::context::ContextStack::pop()?;
+
+                let vendor = Vendor::Nvidia;
+                let name = device.name()?;
+                let memory = get_memory(&device)?;
+                let uuid = device.uuid().ok().map(Into::into);
+                let context = owned_context.get_unowned();
+
+                contexts.push(owned_context);
+
+                // If a device doesn't have a PCI-ID, add it later to the list of
+                // devices with a fake PCI-ID.
+                match get_pci_id(&device) {
+                    Ok(pci_id) => {
+                        all_devices.push(Device {
+                            vendor,
+                            name,
+                            memory,
+                            pci_id,
+                            uuid,
+                            device,
+                            context,
+                        });
+                    }
+                    Err(_) => {
+                        // Use a temporary PCI-ID and replace it later with a non-colliding one.
+                        let pci_id = PciId::from(0);
+                        devices_without_pci_id.push(Device {
+                            vendor,
+                            name,
+                            memory,
+                            pci_id,
+                            uuid,
+                            device,
+                            context,
+                        });
+                    }
+                };
+            }
+
+            // Laptops might have an integrated GPU. Such devices might have neither a PCI-ID
+            // nor a UUID. As those devices are used for development and not for production use,
+            // it's good enough to provide a workaround which doesn't add much complexity to the
+            // code. We use a fake PCI-ID instead, which is generated by enumerating the
+            // available devices. In order to make that case easier to spot when debugging
+            // issues, a starting number which is pleasant to the human eye was chosen, one that
+            // works in both decimal and hexadecimal (4660 == 0x1234).
+            let mut enumerated_device: u16 = 4660;
+            for mut device in devices_without_pci_id.into_iter() {
+                // Make sure that no other device already uses this PCI-ID.
+                while all_devices
+                    .iter()
+                    .any(|d| d.pci_id() == enumerated_device.into())
+                {
+                    enumerated_device += 1;
+                }
+                device.pci_id = PciId::from(enumerated_device);
+                enumerated_device += 1;
+                all_devices.push(device);
+            }
+
+            let wrapped_contexts = CudaContexts(contexts);
+
+            debug!("Loaded CUDA devices: {:?}", all_devices);
+            Ok((all_devices, wrapped_contexts))
+        })
+        .unwrap_or_else(|error: GPUError| {
+            warn!("Unable to retrieve CUDA devices: {:?}", error);
+            (Vec::new(), CudaContexts(Vec::new()))
+        })
+}
diff --git a/src/device.rs b/src/device.rs
new file mode 100644
index 0000000..a28edd7
--- /dev/null
+++ b/src/device.rs
@@ -0,0 +1,566 @@
+use std::fmt;
+
+use lazy_static::lazy_static;
+use log::debug;
+#[cfg(all(feature = "opencl", feature = "cuda"))]
+use log::warn;
+
+use std::convert::TryFrom;
+use std::mem;
+
+use crate::error::{GPUError, GPUResult};
+
+#[cfg(feature = "cuda")]
+use crate::cuda;
+#[cfg(feature = "opencl")]
+use crate::opencl;
+
+/// The UUIDs of the devices returned by OpenCL as well as CUDA are always 16 bytes long.
+const UUID_SIZE: usize = 16;
+const AMD_DEVICE_VENDOR_STRING: &str = "Advanced Micro Devices, Inc.";
+const AMD_DEVICE_VENDOR_ID: u32 = 0x1002;
+// For some reason integrated AMD cards on Apple don't have the usual vendor name and ID
+const AMD_DEVICE_ON_APPLE_VENDOR_STRING: &str = "AMD";
+const AMD_DEVICE_ON_APPLE_VENDOR_ID: u32 = 0x1021d00;
+const NVIDIA_DEVICE_VENDOR_STRING: &str = "NVIDIA Corporation";
+const NVIDIA_DEVICE_VENDOR_ID: u32 = 0x10de;
+
+#[cfg(feature = "cuda")]
+lazy_static! {
+    // The owned CUDA contexts are stored globally. Each device contains an unowned reference,
+    // so that devices can be cloned.
+    static ref DEVICES: (Vec<Device>, cuda::utils::CudaContexts) = build_device_list();
+}
+
+#[cfg(all(feature = "opencl", not(feature = "cuda")))]
+lazy_static! {
+    // Keep it as a tuple like in the CUDA case, so that using `DEVICES` is independent of the
+    // feature set.
+    static ref DEVICES: (Vec<Device>, ()) = build_device_list();
+}
+
+/// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
+///
+/// It is the first two identifiers of e.g. `lspci`:
+///
+/// ```text
+/// 4e:00.0 VGA compatible controller
+/// || └└-- Device ID
+/// └└-- Bus ID
+/// ```
+#[derive(Copy, Clone, Debug, Default, Eq, Hash, PartialEq)]
+pub struct PciId(u16);
+
+impl From<u16> for PciId {
+    fn from(id: u16) -> Self {
+        Self(id)
+    }
+}
+
+impl From<PciId> for u16 {
+    fn from(id: PciId) -> Self {
+        id.0
+    }
+}
+
+/// Converts a PCI-ID formatted as Bus-ID:Device-ID, e.g. `e3:00`.
+impl TryFrom<&str> for PciId {
+    type Error = GPUError;
+
+    fn try_from(pci_id: &str) -> GPUResult<Self> {
+        let mut bytes = [0; mem::size_of::<u16>()];
+        hex::decode_to_slice(pci_id.replace(":", ""), &mut bytes).map_err(|_| {
+            GPUError::InvalidId(format!(
+                "Cannot parse PCI ID, expected hex-encoded string formatted as aa:bb, got {0}.",
+                pci_id
+            ))
+        })?;
+        let parsed = u16::from_be_bytes(bytes);
+        Ok(Self(parsed))
+    }
+}
+
+/// Formats the PCI-ID like `lspci`, Bus-ID:Device-ID, e.g. `e3:00`.
+impl fmt::Display for PciId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let bytes = u16::to_be_bytes(self.0);
+        write!(f, "{:02x}:{:02x}", bytes[0], bytes[1])
+    }
+}
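The `TryFrom`/`Display` pair above round-trips, which the following sketch demonstrates (editorial, not part of the diff):

```rust
use std::convert::TryFrom;

use rust_gpu_tools::PciId;

fn main() {
    // Parse the `lspci`-style form and format it back.
    let pci_id = PciId::try_from("e3:00").expect("hex-encoded Bus-ID:Device-ID");
    assert_eq!(u16::from(pci_id), 0xe300);
    assert_eq!(pci_id.to_string(), "e3:00");
}
```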
+
+/// A unique identifier based on the UUID of the device.
+#[derive(Copy, Clone, Default, Eq, Hash, PartialEq)]
+pub struct DeviceUuid([u8; UUID_SIZE]);
+
+impl From<[u8; UUID_SIZE]> for DeviceUuid {
+    fn from(uuid: [u8; UUID_SIZE]) -> Self {
+        Self(uuid)
+    }
+}
+
+impl From<DeviceUuid> for [u8; UUID_SIZE] {
+    fn from(uuid: DeviceUuid) -> Self {
+        uuid.0
+    }
+}
+
+/// Converts a UUID formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee,
+/// e.g. 46abccd6-022e-b783-572d-833f7104d05f
+impl TryFrom<&str> for DeviceUuid {
+    type Error = GPUError;
+
+    fn try_from(uuid: &str) -> GPUResult<Self> {
+        let mut bytes = [0; UUID_SIZE];
+        hex::decode_to_slice(uuid.replace("-", ""), &mut bytes)
+            .map_err(|_| {
+                GPUError::InvalidId(format!("Cannot parse UUID, expected hex-encoded string formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, got {0}.", uuid))
+            })?;
+        Ok(Self(bytes))
+    }
+}
+
+/// Formats the UUID the same way as `clinfo` does, as an example:
+/// the output looks like 46abccd6-022e-b783-572d-833f7104d05f
+impl fmt::Display for DeviceUuid {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}-{}-{}-{}-{}",
+            hex::encode(&self.0[..4]),
+            hex::encode(&self.0[4..6]),
+            hex::encode(&self.0[6..8]),
+            hex::encode(&self.0[8..10]),
+            hex::encode(&self.0[10..])
+        )
+    }
+}
+
+impl fmt::Debug for DeviceUuid {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.to_string())
+    }
+}
+
+/// Unique identifier that can either be a PCI ID or a UUID.
+#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
+pub enum UniqueId {
+    /// ID based on the PCI bus.
+    PciId(PciId),
+    /// ID based on a globally unique identifier.
+    Uuid(DeviceUuid),
+}
+
+/// If the string contains a dash, it's interpreted as UUID, else it's interpreted as PCI ID.
+impl TryFrom<&str> for UniqueId {
+    type Error = GPUError;
+
+    fn try_from(unique_id: &str) -> GPUResult<Self> {
+        Ok(match unique_id.contains('-') {
+            true => Self::Uuid(DeviceUuid::try_from(unique_id)?),
+            false => Self::PciId(PciId::try_from(unique_id)?),
+        })
+    }
+}
+
+impl fmt::Display for UniqueId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::PciId(id) => id.fmt(f),
+            Self::Uuid(id) => id.fmt(f),
+        }
+    }
+}
+
+/// Currently supported vendors of this library.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum Vendor {
+    /// GPU by AMD.
+    Amd,
+    /// GPU by NVIDIA.
+    Nvidia,
+}
+
+impl TryFrom<&str> for Vendor {
+    type Error = GPUError;
+
+    fn try_from(vendor: &str) -> GPUResult<Self> {
+        match vendor {
+            AMD_DEVICE_VENDOR_STRING => Ok(Self::Amd),
+            AMD_DEVICE_ON_APPLE_VENDOR_STRING => Ok(Self::Amd),
+            NVIDIA_DEVICE_VENDOR_STRING => Ok(Self::Nvidia),
+            _ => Err(GPUError::UnsupportedVendor(vendor.to_string())),
+        }
+    }
+}
+
+impl TryFrom<u32> for Vendor {
+    type Error = GPUError;
+
+    fn try_from(vendor: u32) -> GPUResult<Self> {
+        match vendor {
+            AMD_DEVICE_VENDOR_ID => Ok(Self::Amd),
+            AMD_DEVICE_ON_APPLE_VENDOR_ID => Ok(Self::Amd),
+            NVIDIA_DEVICE_VENDOR_ID => Ok(Self::Nvidia),
+            _ => Err(GPUError::UnsupportedVendor(format!("0x{:x}", vendor))),
+        }
+    }
+}
+
+impl fmt::Display for Vendor {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let vendor = match self {
+            Self::Amd => AMD_DEVICE_VENDOR_STRING,
+            Self::Nvidia => NVIDIA_DEVICE_VENDOR_STRING,
+        };
+        write!(f, "{}", vendor)
+    }
+}
+
+/// Which framework to use, CUDA or OpenCL.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum Framework {
+    /// CUDA.
+    #[cfg(feature = "cuda")]
+    Cuda,
+    /// OpenCL.
+    #[cfg(feature = "opencl")]
+    Opencl,
+}
+
+/// A device that may have a CUDA and/or OpenCL GPU associated with it.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Device {
+    vendor: Vendor,
+    name: String,
+    memory: u64,
+    // All devices have a PCI ID. It is used as a fallback in case there is no UUID.
+    pci_id: PciId,
+    uuid: Option<DeviceUuid>,
+    #[cfg(feature = "cuda")]
+    cuda: Option<cuda::Device>,
+    #[cfg(feature = "opencl")]
+    opencl: Option<opencl::Device>,
+}
+
+impl Device {
+    /// Returns the [`Vendor`] of the GPU.
+    pub fn vendor(&self) -> Vendor {
+        self.vendor
+    }
+
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn name(&self) -> String {
+        self.name.clone()
+    }
+
+    /// Returns the memory of the GPU in bytes.
+    pub fn memory(&self) -> u64 {
+        self.memory
+    }
+
+    /// Returns the best possible unique identifier, a UUID is preferred over a PCI ID.
+    pub fn unique_id(&self) -> UniqueId {
+        match self.uuid {
+            Some(uuid) => UniqueId::Uuid(uuid),
+            None => UniqueId::PciId(self.pci_id),
+        }
+    }
+
+    /// Returns the preferred framework (CUDA or OpenCL) to use.
+    ///
+    /// CUDA will be preferred over OpenCL. The returned framework will work on the device.
+    /// E.g. it won't return `Framework::Cuda` for an AMD device.
+    pub fn framework(&self) -> Framework {
+        #[cfg(all(feature = "opencl", feature = "cuda"))]
+        if cfg!(feature = "cuda") && self.cuda.is_some() {
+            Framework::Cuda
+        } else {
+            Framework::Opencl
+        }
+
+        #[cfg(all(feature = "cuda", not(feature = "opencl")))]
+        {
+            Framework::Cuda
+        }
+
+        #[cfg(all(feature = "opencl", not(feature = "cuda")))]
+        {
+            Framework::Opencl
+        }
+    }
+
+    /// Returns the underlying CUDA device if it is available.
+    #[cfg(feature = "cuda")]
+    pub fn cuda_device(&self) -> Option<&cuda::Device> {
+        self.cuda.as_ref()
+    }
+
+    /// Returns the underlying OpenCL device if it is available.
+    #[cfg(feature = "opencl")]
+    pub fn opencl_device(&self) -> Option<&opencl::Device> {
+        self.opencl.as_ref()
+    }
+
+    /// Returns all available GPUs that are supported.
+    pub fn all() -> Vec<&'static Device> {
+        Self::all_iter().collect()
+    }
+
+    /// Returns the device matching the PCI ID if there is one.
+    pub fn by_pci_id(pci_id: PciId) -> Option<&'static Device> {
+        Self::all_iter().find(|d| pci_id == d.pci_id)
+    }
+
+    /// Returns the device matching the UUID if there is one.
+    pub fn by_uuid(uuid: DeviceUuid) -> Option<&'static Device> {
+        Self::all_iter().find(|d| Some(uuid) == d.uuid)
+    }
+
+    /// Returns the device matching the unique ID if there is one.
+    pub fn by_unique_id(unique_id: UniqueId) -> Option<&'static Device> {
+        Self::all_iter().find(|d| unique_id == d.unique_id())
+    }
+
+    /// Returns an iterator of all available GPUs that are supported.
+    fn all_iter() -> impl Iterator<Item = &'static Device> {
+        DEVICES.0.iter()
+    }
+}
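Putting the lookup methods together, device selection from a user-supplied identifier might look like this (a sketch; the printed framework depends on the enabled features):

```rust
use std::convert::TryFrom;

use rust_gpu_tools::{Device, UniqueId};

// Interpret `id` as a UUID if it contains a dash, otherwise as a PCI-ID,
// and look the device up in the global device list.
fn find_device(id: &str) -> Option<&'static Device> {
    let unique_id = UniqueId::try_from(id).ok()?;
    Device::by_unique_id(unique_id)
}

fn main() {
    for device in Device::all() {
        println!("{} ({}): {:?}", device.name(), device.unique_id(), device.framework());
    }
}
```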
+
+/// Get a list of all available and supported devices.
+///
+/// If both the `cuda` and the `opencl` features are enabled, a device supporting both will be
+/// combined into a single device. You can then access the underlying CUDA and OpenCL device
+/// if needed.
+///
+/// If there is a failure retrieving a device, it won't lead to a hard error, but an error will
+/// be logged and the corresponding device won't be available.
+#[cfg(feature = "cuda")]
+fn build_device_list() -> (Vec<Device>, cuda::utils::CudaContexts) {
+    let mut all_devices = Vec::new();
+
+    #[cfg(feature = "opencl")]
+    let opencl_devices = opencl::utils::build_device_list();
+
+    #[cfg(all(feature = "cuda", feature = "opencl"))]
+    let (mut cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
+    #[cfg(all(feature = "cuda", not(feature = "opencl")))]
+    let (cuda_devices, cuda_contexts) = cuda::utils::build_device_list();
+
+    // Combine OpenCL and CUDA devices into one device if it is the same GPU
+    #[cfg(feature = "opencl")]
+    for opencl_device in opencl_devices {
+        let mut device = Device {
+            vendor: opencl_device.vendor(),
+            name: opencl_device.name(),
+            memory: opencl_device.memory(),
+            pci_id: opencl_device.pci_id(),
+            uuid: opencl_device.uuid(),
+            opencl: Some(opencl_device),
+            cuda: None,
+        };
+
+        // Only devices from Nvidia can use CUDA
+        #[cfg(feature = "cuda")]
+        if device.vendor == Vendor::Nvidia {
+            for ii in 0..cuda_devices.len() {
+                if (device.uuid.is_some() && cuda_devices[ii].uuid() == device.uuid)
+                    || (cuda_devices[ii].pci_id() == device.pci_id)
+                {
+                    if device.memory() != cuda_devices[ii].memory() {
+                        warn!("OpenCL and CUDA report different amounts of memory for a device with the same identifier");
+                        break;
+                    }
+                    // Move the CUDA device out of the vector
+                    device.cuda = Some(cuda_devices.remove(ii));
+                    // Only one device can match
+                    break;
+                }
+            }
+        }
+
+        all_devices.push(device)
+    }
+
+    // All CUDA devices that don't have a corresponding OpenCL device
+    for cuda_device in cuda_devices {
+        let device = Device {
+            vendor: cuda_device.vendor(),
+            name: cuda_device.name(),
+            memory: cuda_device.memory(),
+            pci_id: cuda_device.pci_id(),
+            uuid: cuda_device.uuid(),
+            cuda: Some(cuda_device),
+            #[cfg(feature = "opencl")]
+            opencl: None,
+        };
+        all_devices.push(device);
+    }
+
+    debug!("loaded devices: {:?}", all_devices);
+    (all_devices, cuda_contexts)
+}
+
+/// Get a list of all available and supported OpenCL devices.
+///
+/// If there is a failure retrieving a device, it won't lead to a hard error, but an error will
+/// be logged and the corresponding device won't be available.
+#[cfg(all(feature = "opencl", not(feature = "cuda")))]
+fn build_device_list() -> (Vec<Device>, ()) {
+    let devices = opencl::utils::build_device_list()
+        .into_iter()
+        .map(|device| Device {
+            vendor: device.vendor(),
+            name: device.name(),
+            memory: device.memory(),
+            pci_id: device.pci_id(),
+            uuid: device.uuid(),
+            opencl: Some(device),
+        })
+        .collect();
+
+    debug!("loaded devices: {:?}", devices);
+    (devices, ())
+}
+
+#[cfg(test)]
+mod test {
+    use super::{
+        Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
+        AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
+        NVIDIA_DEVICE_VENDOR_ID, NVIDIA_DEVICE_VENDOR_STRING,
+    };
+    use std::convert::TryFrom;
+
+    #[test]
+    fn test_device_all() {
+        let devices = Device::all();
+        for device in devices.iter() {
+            println!("device: {:?}", device);
+        }
+        assert!(!devices.is_empty(), "No supported GPU found.");
+    }
+
+    #[test]
+    fn test_vendor_from_str() {
+        assert_eq!(
+            Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
+            Vendor::Amd,
+            "AMD vendor string can be converted."
+        );
+        assert_eq!(
+            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
+            Vendor::Amd,
+            "AMD vendor string (on apple) can be converted."
+ ); + assert_eq!( + Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(), + Vendor::Nvidia, + "Nvidia vendor string can be converted." + ); + assert!(matches!( + Vendor::try_from("unknown vendor"), + Err(GPUError::UnsupportedVendor(_)) + )); + } + + #[test] + fn test_vendor_from_u32() { + assert_eq!( + Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(), + Vendor::Amd, + "AMD vendor ID can be converted." + ); + assert_eq!( + Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(), + Vendor::Amd, + "AMD vendor ID (on apple) can be converted." + ); + assert_eq!( + Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(), + Vendor::Nvidia, + "Nvidia vendor ID can be converted." + ); + assert!(matches!( + Vendor::try_from(0x1abc), + Err(GPUError::UnsupportedVendor(_)) + )); + } + + #[test] + fn test_vendor_display() { + assert_eq!( + Vendor::Amd.to_string(), + AMD_DEVICE_VENDOR_STRING, + "AMD vendor can be converted to string." + ); + assert_eq!( + Vendor::Nvidia.to_string(), + NVIDIA_DEVICE_VENDOR_STRING, + "Nvidia vendor can be converted to string." + ); + } + + #[test] + fn test_uuid() { + let valid_string = "46abccd6-022e-b783-572d-833f7104d05f"; + let valid = DeviceUuid::try_from(valid_string).unwrap(); + assert_eq!(valid_string, &valid.to_string()); + + let too_short_string = "ccd6-022e-b783-572d-833f7104d05f"; + let too_short = DeviceUuid::try_from(too_short_string); + assert!(too_short.is_err(), "Parse error when UUID is too short."); + + let invalid_hex_string = "46abccd6-022e-b783-572d-833f7104d05h"; + let invalid_hex = DeviceUuid::try_from(invalid_hex_string); + assert!( + invalid_hex.is_err(), + "Parse error when UUID containts non-hex character." + ); + } + + #[test] + fn test_pci_id() { + let valid_string = "01:00"; + let valid = PciId::try_from(valid_string).unwrap(); + assert_eq!(valid_string, &valid.to_string()); + assert_eq!(valid, PciId(0x0100)); + + let too_short_string = "3f"; + let too_short = PciId::try_from(too_short_string); + assert!(too_short.is_err(), "Parse error when PCI ID is too short."); + + let invalid_hex_string = "aaxx"; + let invalid_hex = PciId::try_from(invalid_hex_string); + assert!( + invalid_hex.is_err(), + "Parse error when PCI ID containts non-hex character." + ); + } + + #[test] + fn test_unique_id() { + let valid_pci_id_string = "aa:bb"; + let valid_pci_id = UniqueId::try_from(valid_pci_id_string).unwrap(); + assert_eq!(valid_pci_id_string, &valid_pci_id.to_string()); + assert_eq!(valid_pci_id, UniqueId::PciId(PciId(0xaabb))); + + let valid_uuid_string = "aabbccdd-eeff-0011-2233-445566778899"; + let valid_uuid = UniqueId::try_from(valid_uuid_string).unwrap(); + assert_eq!(valid_uuid_string, &valid_uuid.to_string()); + assert_eq!( + valid_uuid, + UniqueId::Uuid(DeviceUuid([ + 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99 + ])) + ); + + let invalid_string = "aabbccddeeffgg"; + let invalid = UniqueId::try_from(invalid_string); + assert!( + invalid.is_err(), + "Parse error when ID matches neither a PCI Id, nor a UUID." + ); + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..53a35a6 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,68 @@ +#[cfg(feature = "opencl")] +use opencl3::{device::DeviceInfo, error_codes::ClError, program::ProgramInfo}; +#[cfg(feature = "cuda")] +use rustacuda::error::CudaError; + +/// Error types of this library. 
+#[derive(thiserror::Error, Debug)]
+#[allow(clippy::upper_case_acronyms)]
+pub enum GPUError {
+    /// Error from the underlying `opencl3` library, e.g. a memory allocation failure.
+    #[cfg(feature = "opencl")]
+    #[error("Opencl3 Error: {0}{}", match .1 {
+       Some(message) => format!(" {}", message),
+       None => "".to_string(),
+    })]
+    Opencl3(ClError, Option<String>),
+
+    /// Error for OpenCL `clGetProgramInfo()` call failures.
+    #[cfg(feature = "opencl")]
+    #[error("Program info not available!")]
+    ProgramInfoNotAvailable(ProgramInfo),
+
+    /// Error for OpenCL `clGetDeviceInfo()` call failures.
+    #[cfg(feature = "opencl")]
+    #[error("Device info not available!")]
+    DeviceInfoNotAvailable(DeviceInfo),
+
+    /// Error from the underlying `RustaCUDA` library, e.g. a memory allocation failure.
+    #[cfg(feature = "cuda")]
+    #[error("Cuda Error: {0}")]
+    Cuda(#[from] CudaError),
+
+    /// Error when a device cannot be found.
+    #[error("Device not found!")]
+    DeviceNotFound,
+
+    /// Error when a kernel with the given name cannot be found.
+    #[error("Kernel with name {0} not found!")]
+    KernelNotFound(String),
+
+    /// Error when standard I/O fails.
+    #[error("IO Error: {0}")]
+    IO(#[from] std::io::Error),
+
+    /// Error when the device is from an unsupported vendor.
+    #[error("Vendor {0} is not supported.")]
+    UnsupportedVendor(String),
+
+    /// Error when the string representation of a unique identifier (PCI-ID or UUID) cannot be
+    /// parsed.
+    #[error("{0}")]
+    InvalidId(String),
+
+    /// Errors that rarely happen and don't deserve their own error type.
+    #[error("{0}")]
+    Generic(String),
+}
+
+/// Convenience type alias for [`GPUError`] based [`Result`]s.
+#[allow(clippy::upper_case_acronyms)]
+pub type GPUResult<T> = std::result::Result<T, GPUError>;
+
+#[cfg(feature = "opencl")]
+impl From<ClError> for GPUError {
+    fn from(error: ClError) -> Self {
+        GPUError::Opencl3(error, None)
+    }
+}
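Callers typically just propagate `GPUError` with `?` via the `From` conversions above; matching on individual variants is also possible, as in this editorial sketch:

```rust
use rust_gpu_tools::GPUError;

// Map selected error variants to user-facing messages; everything else
// falls back to the `Display` implementation derived via `thiserror`.
fn describe(error: &GPUError) -> String {
    match error {
        GPUError::DeviceNotFound => "no matching GPU was found".to_string(),
        GPUError::KernelNotFound(name) => format!("kernel `{}` is missing", name),
        other => other.to_string(),
    }
}
```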
diff --git a/src/lib.rs b/src/lib.rs
index 5d7cae6..ba4b75c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1 +1,54 @@
+//! Abstraction layer for OpenCL and CUDA.
+//!
+//! Feature Flags
+//! -------------
+//!
+//! There are two [feature flags], `cuda` and `opencl`. By default `opencl` is enabled. You can
+//! enable both at the same time. At least one of them needs to be enabled at any time.
+//!
+//! [feature flags]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-features-section
+
+#![warn(missing_docs)]
+
+mod device;
+mod error;
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+mod program;
+
+#[cfg(feature = "cuda")]
+pub mod cuda;
+#[cfg(feature = "opencl")]
+pub mod opencl;
+
+pub use device::{Device, DeviceUuid, Framework, PciId, UniqueId, Vendor};
+pub use error::GPUError;
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+pub use program::Program;
+
+#[cfg(not(any(feature = "cuda", feature = "opencl")))]
+compile_error!("At least one of the features `cuda` or `opencl` must be enabled.");
+
+/// A buffer on the GPU.
+///
+/// The concept of a local buffer is from OpenCL. In CUDA you don't allocate a buffer directly
+/// via API call. Instead you pass in the amount of shared memory that should be used.
+///
+/// There can be at most a single local buffer per kernel. On CUDA, a dummy (non-null) pointer
+/// is passed in as the kernel parameter instead of an actual value. The memory that should get
+/// allocated is then passed into the kernel call automatically.
+#[derive(Debug)]
+pub struct LocalBuffer<T> {
+    /// The number of T-sized elements.
+    length: usize,
+    _phantom: std::marker::PhantomData<T>,
+}
+
+impl<T> LocalBuffer<T> {
+    /// Returns a new buffer of the specified `length`.
+    pub fn new(length: usize) -> Self {
+        LocalBuffer::<T> {
+            length,
+            _phantom: std::marker::PhantomData,
+        }
+    }
+}
diff --git a/src/opencl/mod.rs b/src/opencl/mod.rs
index 3e9558f..5f270fb 100644
--- a/src/opencl/mod.rs
+++ b/src/opencl/mod.rs
@@ -1,210 +1,31 @@
 //! The OpenCL specific implementation of a [`Buffer`], [`Device`], [`Program`] and [`Kernel`].
 
-mod error;
-mod utils;
+pub(crate) mod utils;
 
 use std::collections::HashMap;
-use std::convert::TryFrom;
-use std::fmt;
 use std::hash::{Hash, Hasher};
-use std::mem;
 use std::ptr;
 
-pub use error::{GPUError, GPUResult};
-
 use opencl3::command_queue::CommandQueue;
 use opencl3::context::Context;
-use opencl3::device::{DeviceInfo::CL_DEVICE_ENDIAN_LITTLE, CL_UUID_SIZE_KHR};
 use opencl3::error_codes::ClError;
 use opencl3::kernel::ExecuteKernel;
 use opencl3::memory::CL_MEM_READ_WRITE;
 use opencl3::program::ProgramInfo::CL_PROGRAM_BINARIES;
 use opencl3::types::CL_BLOCKING;
 
-const AMD_DEVICE_VENDOR_STRING: &str = "Advanced Micro Devices, Inc.";
-const AMD_DEVICE_VENDOR_ID: u32 = 0x1002;
-// For some reason integrated AMD cards on Apple don't have the usual vendor name and ID
-const AMD_DEVICE_ON_APPLE_VENDOR_STRING: &str = "AMD";
-const AMD_DEVICE_ON_APPLE_VENDOR_ID: u32 = 0x1021d00;
-const NVIDIA_DEVICE_VENDOR_STRING: &str = "NVIDIA Corporation";
-const NVIDIA_DEVICE_VENDOR_ID: u32 = 0x10de;
+use log::debug;
+
+use crate::device::{DeviceUuid, PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+use crate::LocalBuffer;
 
+/// The lowest level identifier of an OpenCL device, it changes whenever a device is initialized.
 #[allow(non_camel_case_types)]
 pub type cl_device_id = opencl3::types::cl_device_id;
 
-// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
-///
-/// It is the first two identifiers of e.g. `lspci`:
-///
-/// ```text
-/// 4e:00.0 VGA compatible controller
-/// || └└-- Device ID
-/// └└-- Bus ID
-/// ```
-#[derive(Debug, Copy, Clone, PartialEq, Hash)]
-pub struct PciId(u16);
-
-impl From<u16> for PciId {
-    fn from(id: u16) -> Self {
-        Self(id)
-    }
-}
-
-impl From<PciId> for u16 {
-    fn from(id: PciId) -> Self {
-        id.0
-    }
-}
-
-/// Converts a PCI-ID formatted as Bus-ID:Device-ID, e.g. `e3:00`.
-impl TryFrom<&str> for PciId {
-    type Error = GPUError;
-
-    fn try_from(pci_id: &str) -> GPUResult<Self> {
-        let mut bytes = [0; mem::size_of::<u16>()];
-        hex::decode_to_slice(pci_id.replace(":", ""), &mut bytes).map_err(|_| {
-            GPUError::InvalidId(format!(
-                "Cannot parse PCI ID, expected hex-encoded string formated as aa:bb, got {0}.",
-                pci_id
-            ))
-        })?;
-        let parsed = u16::from_be_bytes(bytes);
-        Ok(Self(parsed))
-    }
-}
-
-/// Formats the PCI-ID like `lspci`, Bus-ID:Device-ID, e.g. `e3:00`.
-impl fmt::Display for PciId {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let bytes = u16::to_be_bytes(self.0);
-        write!(f, "{:02x}:{:02x}", bytes[0], bytes[1])
-    }
-}
-
-/// A unique identifier based on UUID of the device.
-#[derive(Copy, Clone, Default, PartialEq, Eq, Hash)]
-pub struct DeviceUuid([u8; CL_UUID_SIZE_KHR]);
-
-impl From<[u8; CL_UUID_SIZE_KHR]> for DeviceUuid {
-    fn from(uuid: [u8; CL_UUID_SIZE_KHR]) -> Self {
-        Self(uuid)
-    }
-}
-
-impl From<DeviceUuid> for [u8; CL_UUID_SIZE_KHR] {
-    fn from(uuid: DeviceUuid) -> Self {
-        uuid.0
-    }
-}
-
-/// Converts a UUID formatted as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee,
-/// e.g.
46abccd6-022e-b783-572d-833f7104d05f -impl TryFrom<&str> for DeviceUuid { - type Error = GPUError; - - fn try_from(uuid: &str) -> GPUResult { - let mut bytes = [0; CL_UUID_SIZE_KHR]; - hex::decode_to_slice(uuid.replace("-", ""), &mut bytes) - .map_err(|_| { - GPUError::InvalidId(format!("Cannot parse UUID, expected hex-encoded string formated as aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee, got {0}.", uuid)) - })?; - Ok(Self(bytes)) - } -} - -/// Formats the UUID the same way as `clinfo` does, as an example: -/// the output should looks like 46abccd6-022e-b783-572d-833f7104d05f -impl fmt::Display for DeviceUuid { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "{}-{}-{}-{}-{}", - hex::encode(&self.0[..4]), - hex::encode(&self.0[4..6]), - hex::encode(&self.0[6..8]), - hex::encode(&self.0[8..10]), - hex::encode(&self.0[10..]) - ) - } -} - -impl fmt::Debug for DeviceUuid { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.to_string()) - } -} - -/// Unique identifier that can either be a PCI ID or a UUID. -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum UniqueId { - PciId(PciId), - Uuid(DeviceUuid), -} - -/// If the string contains a dash, it's interpreted as UUID, else it's interpreted as PCI ID. -impl TryFrom<&str> for UniqueId { - type Error = GPUError; - - fn try_from(unique_id: &str) -> GPUResult { - Ok(match unique_id.contains('-') { - true => Self::Uuid(DeviceUuid::try_from(unique_id)?), - false => Self::PciId(PciId::try_from(unique_id)?), - }) - } -} - -impl fmt::Display for UniqueId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::PciId(id) => id.fmt(f), - Self::Uuid(id) => id.fmt(f), - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum Vendor { - Amd, - Nvidia, -} - -impl TryFrom<&str> for Vendor { - type Error = GPUError; - - fn try_from(vendor: &str) -> GPUResult { - match vendor { - AMD_DEVICE_VENDOR_STRING => Ok(Self::Amd), - AMD_DEVICE_ON_APPLE_VENDOR_STRING => Ok(Self::Amd), - NVIDIA_DEVICE_VENDOR_STRING => Ok(Self::Nvidia), - _ => Err(GPUError::UnsupportedVendor(vendor.to_string())), - } - } -} - -impl TryFrom for Vendor { - type Error = GPUError; - - fn try_from(vendor: u32) -> GPUResult { - match vendor { - AMD_DEVICE_VENDOR_ID => Ok(Self::Amd), - AMD_DEVICE_ON_APPLE_VENDOR_ID => Ok(Self::Amd), - NVIDIA_DEVICE_VENDOR_ID => Ok(Self::Nvidia), - _ => Err(GPUError::UnsupportedVendor(format!("0x{:x}", vendor))), - } - } -} - -impl fmt::Display for Vendor { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let vendor = match self { - Self::Amd => AMD_DEVICE_VENDOR_STRING, - Self::Nvidia => NVIDIA_DEVICE_VENDOR_STRING, - }; - write!(f, "{}", vendor) - } -} - /// A Buffer to be used for sending and receiving data to/from the GPU. +#[derive(Debug)] pub struct Buffer { buffer: opencl3::memory::Buffer, /// The number of T-sized elements. @@ -261,11 +82,6 @@ impl Device { pub fn memory(&self) -> u64 { self.memory } - pub fn is_little_endian(&self) -> GPUResult { - self.device - .endian_little() - .map_err(|_| GPUError::DeviceInfoNotAvailable(CL_DEVICE_ENDIAN_LITTLE)) - } /// Returns the PCI-ID of the GPU, see the [`PciId`] type for more information. pub fn pci_id(&self) -> PciId { @@ -278,44 +94,6 @@ impl Device { self.uuid } - /// Returns the best possible unique identifier, a UUID is preferred over a PCI ID. 
-    pub fn unique_id(&self) -> UniqueId {
-        match self.uuid {
-            Some(uuid) => UniqueId::Uuid(uuid),
-            None => UniqueId::PciId(self.pci_id),
-        }
-    }
-
-    /// Return all available GPU devices of supported vendors.
-    pub fn all() -> Vec<&'static Device> {
-        Self::all_iter().collect()
-    }
-
-    pub fn by_pci_id(pci_id: PciId) -> GPUResult<&'static Device> {
-        Self::all_iter()
-            .find(|d| pci_id == d.pci_id)
-            .ok_or(GPUError::DeviceNotFound)
-    }
-
-    pub fn by_uuid(uuid: DeviceUuid) -> GPUResult<&'static Device> {
-        Self::all_iter()
-            .find(|d| match d.uuid {
-                Some(id) => uuid == id,
-                None => false,
-            })
-            .ok_or(GPUError::DeviceNotFound)
-    }
-
-    pub fn by_unique_id(unique_id: UniqueId) -> GPUResult<&'static Device> {
-        Self::all_iter()
-            .find(|d| unique_id == d.unique_id())
-            .ok_or(GPUError::DeviceNotFound)
-    }
-
-    fn all_iter() -> impl Iterator<Item = &'static Device> {
-        utils::DEVICES.iter()
-    }
-
     /// Low-level access to the device identifier.
     ///
     /// It changes when the device is initialized and should only be used to interact with other
@@ -351,11 +129,19 @@ impl Program {
             Program::from_binary(device, bin)
         } else {
             let context = Context::from_device(&device.device)?;
+            debug!(
+                "Building kernel ({}) from source…",
+                cached.to_string_lossy()
+            );
             let mut program = opencl3::program::Program::create_from_source(&context, src)?;
             if let Err(build_error) = program.build(context.devices(), "") {
                 let log = program.get_build_log(context.devices()[0])?;
                 return Err(GPUError::Opencl3(build_error, Some(log)));
             }
+            debug!(
+                "Building kernel ({}) from source: done.",
+                cached.to_string_lossy()
+            );
             let queue = CommandQueue::create(&context, context.default_device(), 0)?;
             let kernels = opencl3::kernel::create_program_kernels(&program)?;
             let kernels_by_name = kernels
@@ -409,7 +195,16 @@ impl Program {
     /// Creates a new buffer that can be used for input/output with the GPU.
     ///
     /// The `length` is the number of elements to create.
-    pub fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
+    ///
+    /// It is usually used to create buffers that are initialized by the GPU. If you want to
+    /// directly transfer data from the host to the GPU, you would use the safe
+    /// [`Program::create_buffer_from_slice`] instead.
+    ///
+    /// # Safety
+    ///
+    /// This function isn't actually unsafe, it's marked as `unsafe` due to the CUDA version of
+    /// it, where it is unsafe. This is done to have symmetry between both APIs.
+    pub unsafe fn create_buffer<T>(&self, length: usize) -> GPUResult<Buffer<T>> {
         assert!(length > 0);
         let mut buff = opencl3::memory::Buffer::create(
             &self.context,
@@ -431,6 +226,35 @@ impl Program {
         })
     }
 
+    /// Creates a new buffer on the GPU and initializes it with the given slice.
+    pub fn create_buffer_from_slice<T>(&self, slice: &[T]) -> GPUResult<Buffer<T>> {
+        let length = slice.len();
+        // The underlying buffer is `u8`, hence we need the number of bytes.
+        let bytes_len = length * std::mem::size_of::<T>();
+
+        let mut buffer = opencl3::memory::Buffer::create(
+            &self.context,
+            CL_MEM_READ_WRITE,
+            bytes_len,
+            ptr::null_mut(),
+        )?;
+        // Transmuting types is safe as long as the sizes match.
+        let bytes = unsafe {
+            std::slice::from_raw_parts(slice.as_ptr() as *const T as *const u8, bytes_len)
+        };
+        // Write some data right away. This makes a significant performance difference.
+        self.queue
+            .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, &[0u8], &[])?;
+        self.queue
+            .enqueue_write_buffer(&mut buffer, CL_BLOCKING, 0, &bytes, &[])?;
+
+        Ok(Buffer::<T> {
+            buffer,
+            length,
+            _phantom: std::marker::PhantomData,
+        })
+    }
+
     /// Returns a kernel.
     ///
     /// The `global_work_size` does *not* follow the OpenCL definition. It is *not* the total
@@ -453,73 +277,60 @@ impl Program {
         Ok(Kernel {
             builder,
             queue: &self.queue,
+            num_local_buffers: 0,
         })
     }
 
     /// Puts data from an existing buffer onto the GPU.
-    ///
-    /// The `offset` is in number of `T` sized elements, not in their byte size.
     pub fn write_from_buffer<T>(
         &self,
-        buffer: &Buffer<T>,
-        offset: usize,
+        // From Rust's perspective this buffer doesn't need to be mutable. But the sub-buffer
+        // is mutating the buffer, so it really should be.
+        buffer: &mut Buffer<T>,
         data: &[T],
     ) -> GPUResult<()> {
-        assert!(offset + data.len() <= buffer.length, "Buffer is too small");
-
-        let bytes_len = data.len() * std::mem::size_of::<T>();
-        let mut buff = buffer.buffer.create_sub_buffer(
-            CL_MEM_READ_WRITE,
-            offset * std::mem::size_of::<T>(),
-            bytes_len,
-        )?;
+        assert!(data.len() <= buffer.length, "Buffer is too small");
 
+        // It is safe as long as the sizes match.
         let bytes = unsafe {
-            std::slice::from_raw_parts(data.as_ptr() as *const T as *const u8, bytes_len)
+            std::slice::from_raw_parts(
+                data.as_ptr() as *const T as *const u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
         };
 
         self.queue
-            .enqueue_write_buffer(&mut buff, CL_BLOCKING, 0, &bytes, &[])?;
+            .enqueue_write_buffer(&mut buffer.buffer, CL_BLOCKING, 0, &bytes, &[])?;
 
         Ok(())
     }
 
     /// Reads data from the GPU into an existing buffer.
-    ///
-    /// The `offset` is in number of `T` sized elements, not in their byte size.
-    pub fn read_into_buffer<T>(
-        &self,
-        buffer: &Buffer<T>,
-        offset: usize,
-        data: &mut [T],
-    ) -> GPUResult<()> {
-        assert!(offset + data.len() <= buffer.length, "Buffer is too small");
-
-        let bytes_len = data.len() * std::mem::size_of::<T>();
-        let buff = buffer.buffer.create_sub_buffer(
-            CL_MEM_READ_WRITE,
-            offset * std::mem::size_of::<T>(),
-            bytes_len,
-        )?;
+    pub fn read_into_buffer<T>(&self, buffer: &Buffer<T>, data: &mut [T]) -> GPUResult<()> {
+        assert!(data.len() <= buffer.length, "Buffer is too small");
 
+        // It is safe as long as the sizes match.
         let mut bytes = unsafe {
-            std::slice::from_raw_parts_mut(data.as_mut_ptr() as *mut T as *mut u8, bytes_len)
+            std::slice::from_raw_parts_mut(
+                data.as_mut_ptr() as *mut T as *mut u8,
+                data.len() * std::mem::size_of::<T>(),
+            )
         };
 
         self.queue
-            .enqueue_read_buffer(&buff, CL_BLOCKING, 0, &mut bytes, &[])?;
+            .enqueue_read_buffer(&buffer.buffer, CL_BLOCKING, 0, &mut bytes, &[])?;
 
         Ok(())
     }
 
-    /// Run some code in the context of the program
+    /// Run some code in the context of the program.
     ///
-    /// On CUDA it sets the correct contexts and synchronizes the stream before returning.
-    /// On OpenCL it's only executing the closure without any other side-effects.
-    pub fn run<F, R, E>(&self, fun: F) -> Result<R, E>
+    /// It takes the program as a parameter, so that we can use the same function body for both
+    /// the OpenCL and the CUDA code path. The only difference is the type of the program.
+    pub fn run<F, R, E, A>(&self, fun: F, arg: A) -> Result<R, E>
     where
-        F: FnOnce() -> Result<R, E>,
+        F: FnOnce(&Self, A) -> Result<R, E>,
         E: From<GPUError>,
     {
-        fun()
+        fun(self, arg)
     }
 }
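Since the OpenCL `Program` mirrors the CUDA API, the earlier CUDA sketch carries over almost verbatim (again an editorial illustration with a hypothetical kernel name):

```rust
use rust_gpu_tools::opencl::Program;
use rust_gpu_tools::{GPUError, LocalBuffer};

// Upload data, then launch `mul` with 4 * 64 threads in total. At most one
// `LocalBuffer` argument is accepted, matching the CUDA restriction.
fn launch(program: &Program, data: &[u32]) -> Result<(), GPUError> {
    let buffer = program.create_buffer_from_slice(data)?;
    let shared = LocalBuffer::<u32>::new(64);
    program.create_kernel("mul", 4, 64)?.arg(&buffer).arg(&shared).run()
}
```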
@@ -551,189 +362,60 @@ impl KernelArgument for u32 {
     }
 }
 
-/// A local buffer.
-pub struct LocalBuffer<T> {
-    /// The number of T sized elements.
-    length: usize,
-    _phantom: std::marker::PhantomData<T>,
-}
-impl<T> LocalBuffer<T> {
-    /// Returns a new buffer of the specified `length`.
-    pub fn new(length: usize) -> Self {
-        LocalBuffer::<T> {
-            length,
-            _phantom: std::marker::PhantomData,
-        }
-    }
-}
-
 impl<T> KernelArgument for LocalBuffer<T> {
     fn push(&self, kernel: &mut Kernel) {
         kernel
             .builder
             .set_arg_local_buffer::<T>(self.length * std::mem::size_of::<T>());
+        kernel.num_local_buffers += 1;
     }
 }
 
 /// A kernel that can be executed.
 #[derive(Debug)]
 pub struct Kernel<'a> {
-    builder: ExecuteKernel<'a>,
+    /// The underlying kernel builder.
+    pub builder: ExecuteKernel<'a>,
     queue: &'a CommandQueue,
+    /// There can only be a single [`LocalBuffer`] as parameter due to CUDA restrictions. This
+    /// counts them, so that there can be an error if there is more than one `LocalBuffer`
+    /// argument.
+    num_local_buffers: u8,
 }
 
 impl<'a> Kernel<'a> {
     /// Set a kernel argument.
-    pub fn arg<T: KernelArgument>(mut self, t: &T) -> Self {
+    ///
+    /// The arguments must live as long as the kernel. Hence make sure they are not dropped as
+    /// long as the kernel is in use.
+    ///
+    /// Example where this behaviour is enforced and leads to a compile-time error:
+    ///
+    /// ```compile_fail
+    /// use rust_gpu_tools::opencl::Program;
+    ///
+    /// fn would_break(program: &Program) {
+    ///    let data = vec![1, 2, 3, 4];
+    ///    let buffer = program.create_buffer_from_slice(&data).unwrap();
+    ///    let kernel = program.create_kernel("my_kernel", 4, 256).unwrap();
+    ///    let kernel = kernel.arg(&buffer);
+    ///    // This drop would not cause an error if the arguments weren't bound to the kernel's
+    ///    // lifetime.
+    ///    drop(buffer);
+    ///    kernel.run().unwrap();
+    /// }
+    /// ```
+    pub fn arg<T: KernelArgument>(mut self, t: &'a T) -> Self {
         t.push(&mut self);
         self
     }
 
     /// Actually run the kernel.
     pub fn run(mut self) -> GPUResult<()> {
+        if self.num_local_buffers > 1 {
+            return Err(GPUError::Generic(
+                "There cannot be more than one `LocalBuffer`.".to_string(),
+            ));
+        }
         self.builder.enqueue_nd_range(&self.queue)?;
         Ok(())
     }
 }
-
-#[cfg(test)]
-mod test {
-    use super::{
-        Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
-        AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
-        NVIDIA_DEVICE_VENDOR_ID, NVIDIA_DEVICE_VENDOR_STRING,
-    };
-    use std::convert::TryFrom;
-
-    #[test]
-    fn test_device_all() {
-        let devices = Device::all();
-        for device in devices.iter() {
-            println!("device: {:?}", device);
-        }
-        assert!(!devices.is_empty(), "No supported GPU found.");
-    }
-
-    #[test]
-    fn test_vendor_from_str() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor string can be converted."
-        );
-        assert!(matches!(
-            Vendor::try_from("unknown vendor"),
-            Err(GPUError::UnsupportedVendor(_))
-        ));
-    }
-
-    #[test]
-    fn test_vendor_from_u32() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor ID can be converted."
-
-#[cfg(test)]
-mod test {
-    use super::{
-        Device, DeviceUuid, GPUError, PciId, UniqueId, Vendor, AMD_DEVICE_ON_APPLE_VENDOR_ID,
-        AMD_DEVICE_ON_APPLE_VENDOR_STRING, AMD_DEVICE_VENDOR_ID, AMD_DEVICE_VENDOR_STRING,
-        NVIDIA_DEVICE_VENDOR_ID, NVIDIA_DEVICE_VENDOR_STRING,
-    };
-    use std::convert::TryFrom;
-
-    #[test]
-    fn test_device_all() {
-        let devices = Device::all();
-        for device in devices.iter() {
-            println!("device: {:?}", device);
-        }
-        assert!(!devices.is_empty(), "No supported GPU found.");
-    }
-
-    #[test]
-    fn test_vendor_from_str() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_STRING).unwrap(),
-            Vendor::Amd,
-            "AMD vendor string (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_STRING).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor string can be converted."
-        );
-        assert!(matches!(
-            Vendor::try_from("unknown vendor"),
-            Err(GPUError::UnsupportedVendor(_))
-        ));
-    }
-
-    #[test]
-    fn test_vendor_from_u32() {
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(AMD_DEVICE_ON_APPLE_VENDOR_ID).unwrap(),
-            Vendor::Amd,
-            "AMD vendor ID (on apple) can be converted."
-        );
-        assert_eq!(
-            Vendor::try_from(NVIDIA_DEVICE_VENDOR_ID).unwrap(),
-            Vendor::Nvidia,
-            "Nvidia vendor ID can be converted."
-        );
-        assert!(matches!(
-            Vendor::try_from(0x1abc),
-            Err(GPUError::UnsupportedVendor(_))
-        ));
-    }
-
-    #[test]
-    fn test_vendor_display() {
-        assert_eq!(
-            Vendor::Amd.to_string(),
-            AMD_DEVICE_VENDOR_STRING,
-            "AMD vendor can be converted to string."
-        );
-        assert_eq!(
-            Vendor::Nvidia.to_string(),
-            NVIDIA_DEVICE_VENDOR_STRING,
-            "Nvidia vendor can be converted to string."
-        );
-    }
-
-    #[test]
-    fn test_uuid() {
-        let valid_string = "46abccd6-022e-b783-572d-833f7104d05f";
-        let valid = DeviceUuid::try_from(valid_string).unwrap();
-        assert_eq!(valid_string, &valid.to_string());
-
-        let too_short_string = "ccd6-022e-b783-572d-833f7104d05f";
-        let too_short = DeviceUuid::try_from(too_short_string);
-        assert!(too_short.is_err(), "Parse error when UUID is too short.");
-
-        let invalid_hex_string = "46abccd6-022e-b783-572d-833f7104d05h";
-        let invalid_hex = DeviceUuid::try_from(invalid_hex_string);
-        assert!(
-            invalid_hex.is_err(),
-            "Parse error when UUID containts non-hex character."
-        );
-    }
-
-    #[test]
-    fn test_pci_id() {
-        let valid_string = "01:00";
-        let valid = PciId::try_from(valid_string).unwrap();
-        assert_eq!(valid_string, &valid.to_string());
-        assert_eq!(valid, PciId(0x0100));
-
-        let too_short_string = "3f";
-        let too_short = PciId::try_from(too_short_string);
-        assert!(too_short.is_err(), "Parse error when PCI ID is too short.");
-
-        let invalid_hex_string = "aaxx";
-        let invalid_hex = PciId::try_from(invalid_hex_string);
-        assert!(
-            invalid_hex.is_err(),
-            "Parse error when PCI ID containts non-hex character."
-        );
-    }
-
-    #[test]
-    fn test_unique_id() {
-        let valid_pci_id_string = "aa:bb";
-        let valid_pci_id = UniqueId::try_from(valid_pci_id_string).unwrap();
-        assert_eq!(valid_pci_id_string, &valid_pci_id.to_string());
-        assert_eq!(valid_pci_id, UniqueId::PciId(PciId(0xaabb)));
-
-        let valid_uuid_string = "aabbccdd-eeff-0011-2233-445566778899";
-        let valid_uuid = UniqueId::try_from(valid_uuid_string).unwrap();
-        assert_eq!(valid_uuid_string, &valid_uuid.to_string());
-        assert_eq!(
-            valid_uuid,
-            UniqueId::Uuid(DeviceUuid([
-                0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
-                0x77, 0x88, 0x99
-            ]))
-        );
-
-        let invalid_string = "aabbccddeeffgg";
-        let invalid = UniqueId::try_from(invalid_string);
-        assert!(
-            invalid.is_err(),
-            "Parse error when ID matches neither a PCI Id, nor a UUID."
-        );
-    }
-}
diff --git a/src/opencl/utils.rs b/src/opencl/utils.rs
index 178f2c7..6bccd6c 100644
--- a/src/opencl/utils.rs
+++ b/src/opencl/utils.rs
@@ -1,11 +1,13 @@
 use std::convert::{TryFrom, TryInto};

-use lazy_static::lazy_static;
 use log::{debug, warn};
 use opencl3::device::DeviceInfo::CL_DEVICE_GLOBAL_MEM_SIZE;
+use opencl3::device::CL_UUID_SIZE_KHR;
 use sha2::{Digest, Sha256};

-use super::{Device, DeviceUuid, GPUError, GPUResult, PciId, Vendor, CL_UUID_SIZE_KHR};
+use crate::device::{DeviceUuid, PciId, Vendor};
+use crate::error::{GPUError, GPUResult};
+use crate::opencl::Device;

 /// The PCI-ID is the combination of the PCI Bus ID and PCI Device ID.
 ///
@@ -68,15 +70,11 @@ fn get_memory(d: &opencl3::device::Device) -> GPUResult<u64> {
         .map_err(|_| GPUError::DeviceInfoNotAvailable(CL_DEVICE_GLOBAL_MEM_SIZE))
 }

-lazy_static! {
-    pub(crate) static ref DEVICES: Vec<Device> = build_device_list();
-}
-
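With the `lazy_static` cache removed from the OpenCL module, enumeration is expected to keep going through `Device::all()` (the entry point the removed tests above rely on); a caller sketch, assuming the unified `Device` type is re-exported at the crate root:

```rust
use rust_gpu_tools::Device;

// Only the caching moved out of the OpenCL-specific module; the public
// entry point for listing supported GPUs stays the same.
fn list_devices() {
    for device in Device::all() {
        println!("{} ({})", device.name(), device.vendor());
    }
}
```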
 /// Get a list of all available and supported devices.
 ///
 /// If there is a failure retrieving a device, it won't lead to a hard error, but an error will be
 /// logged and the corresponding device won't be available.
-fn build_device_list() -> Vec<Device> {
+pub(crate) fn build_device_list() -> Vec<Device> {
     let mut all_devices = Vec::new();
     let platforms: Vec<_> = opencl3::platform::get_platforms().unwrap_or_default();
diff --git a/src/program.rs b/src/program.rs
new file mode 100644
index 0000000..6e371d0
--- /dev/null
+++ b/src/program.rs
@@ -0,0 +1,283 @@
+#[cfg(feature = "cuda")]
+use crate::cuda;
+use crate::error::GPUError;
+#[cfg(feature = "opencl")]
+use crate::opencl;
+
+/// Abstraction for running programs on CUDA or OpenCL.
+pub enum Program {
+    /// CUDA program.
+    #[cfg(feature = "cuda")]
+    Cuda(cuda::Program),
+    /// OpenCL program.
+    #[cfg(feature = "opencl")]
+    Opencl(opencl::Program),
+}
+
+impl Program {
+    /// Run some code in the context of the program.
+    ///
+    /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
+    /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
+    /// same code can be used to run on either of them. The only difference is the type of the
+    /// `Program`.
+    ///
+    /// You need to pass in two closures, one for OpenCL and one for CUDA; both get their
+    /// corresponding program type as a parameter. For convenience there is the
+    /// [`program_closures`] macro defined, which can help reduce code duplication by creating
+    /// two closures out of a single one.
+    ///
+    /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
+    /// one of them is disabled, you still need to pass in two closures. This way the API stays
+    /// the same, but you can disable one of them at compile-time.
+    ///
+    /// The second parameter is a single arbitrary argument, which will be passed on into the
+    /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
+    /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
+    /// is used at most once.
+    #[cfg(all(feature = "cuda", feature = "opencl"))]
+    pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
+    where
+        E: From<GPUError>,
+        F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
+        F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
+    {
+        match self {
+            Self::Cuda(program) => program.run(fun.0, arg),
+            Self::Opencl(program) => program.run(fun.1, arg),
+        }
+    }
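Without the macro, the closure tuple is spelled out by hand. A sketch with both features enabled; the doubling body is a placeholder for real GPU work, and crate-root re-exports of `Program` and `GPUError` are assumed:

```rust
use rust_gpu_tools::{cuda, opencl, GPUError, Program};

// The mutable reference is threaded through as `arg`, so only the closure
// that actually runs ever touches it.
fn double_all(program: &Program, mut data: Vec<u32>) -> Result<Vec<u32>, GPUError> {
    program.run(
        (
            |_program: &cuda::Program, data: &mut Vec<u32>| -> Result<(), GPUError> {
                data.iter_mut().for_each(|x| *x *= 2);
                Ok(())
            },
            |_program: &opencl::Program, data: &mut Vec<u32>| -> Result<(), GPUError> {
                data.iter_mut().for_each(|x| *x *= 2);
                Ok(())
            },
        ),
        &mut data,
    )?;
    Ok(data)
}
```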
+
+    /// Run some code in the context of the program.
+    ///
+    /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
+    /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
+    /// same code can be used to run on either of them. The only difference is the type of the
+    /// `Program`.
+    ///
+    /// You need to pass in two closures, one for OpenCL and one for CUDA; both get their
+    /// corresponding program type as a parameter. For convenience there is the
+    /// [`program_closures`] macro defined, which can help reduce code duplication by creating
+    /// two closures out of a single one.
+    ///
+    /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
+    /// one of them is disabled, you still need to pass in two closures. This way the API stays
+    /// the same, but you can disable one of them at compile-time.
+    ///
+    /// The second parameter is a single arbitrary argument, which will be passed on into the
+    /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
+    /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
+    /// is used at most once.
+    #[cfg(all(feature = "cuda", not(feature = "opencl")))]
+    pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
+    where
+        E: From<GPUError>,
+        F1: FnOnce(&cuda::Program, A) -> Result<R, E>,
+    {
+        match self {
+            Self::Cuda(program) => program.run(fun.0, arg),
+        }
+    }
+
+    /// Run some code in the context of the program.
+    ///
+    /// There is an implementation for OpenCL and for CUDA. Both use different Rust types, but
+    /// [`opencl::Program`] and [`cuda::Program`] implement the same API. This means that the
+    /// same code can be used to run on either of them. The only difference is the type of the
+    /// `Program`.
+    ///
+    /// You need to pass in two closures, one for OpenCL and one for CUDA; both get their
+    /// corresponding program type as a parameter. For convenience there is the
+    /// [`program_closures`] macro defined, which can help reduce code duplication by creating
+    /// two closures out of a single one.
+    ///
+    /// CUDA and OpenCL support can be enabled/disabled by the `opencl` and `cuda` features. If
+    /// one of them is disabled, you still need to pass in two closures. This way the API stays
+    /// the same, but you can disable one of them at compile-time.
+    ///
+    /// The second parameter is a single arbitrary argument, which will be passed on into the
+    /// closure. This is useful when you e.g. need to pass in a mutable reference. Such a reference
+    /// cannot be shared between closures, hence we pass it on, so that the compiler knows that it
+    /// is used at most once.
+    #[cfg(all(not(feature = "cuda"), feature = "opencl"))]
+    pub fn run<F1, F2, R, E, A>(&self, fun: (F1, F2), arg: A) -> Result<R, E>
+    where
+        E: From<GPUError>,
+        F2: FnOnce(&opencl::Program, A) -> Result<R, E>,
+    {
+        match self {
+            Self::Opencl(program) => program.run(fun.1, arg),
+        }
+    }
+
+    /// Returns the name of the GPU, e.g. "GeForce RTX 3090".
+    pub fn device_name(&self) -> &str {
+        match self {
+            #[cfg(feature = "cuda")]
+            Self::Cuda(program) => program.device_name(),
+            #[cfg(feature = "opencl")]
+            Self::Opencl(program) => program.device_name(),
+        }
+    }
+}
+
+/// Creates two closures, one for CUDA and one for OpenCL, from the given one.
+///
+/// This macro makes it possible to interact with rust-gpu-tools with unified code for both
+/// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program`
+/// and it will be mapped to [`&cuda::Program`] and [`&opencl::Program`].
+///
+/// The second parameter is a single arbitrary argument, which will be passed on into the closure.
+/// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
+/// shared between closures, hence we pass it on, so that the compiler knows that it is used at
+/// most once.
+///
+/// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
+/// thing and not specify one of them if it is appropriate.
+///
+/// ### Example
+///
+/// ```
+/// use rust_gpu_tools::{cuda, opencl, program_closures};
+///
+/// let closures = program_closures!(|program, arg: u8| -> bool {
+///     true
+/// });
+///
+/// // Generates
+/// let closures = (
+///     |program: &cuda::Program, arg: u8| { true },
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+///
+/// // If e.g. the `cuda` feature is disabled, it would generate
+/// let closures_without_cuda = (
+///     (),
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+/// ```
+#[cfg(all(feature = "cuda", feature = "opencl"))]
+#[macro_export]
+macro_rules! program_closures {
+    // Additional argument without a type
+    (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
+            |$program: &$crate::opencl::Program, $arg| -> $ret { $body },
+        )
+    };
+    // Additional argument with a type
+    (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
+            |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
+        )
+    };
+}
+
+/// Creates two closures, one for CUDA and one for OpenCL, from the given one.
+///
+/// This macro makes it possible to interact with rust-gpu-tools with unified code for both
+/// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program`
+/// and it will be mapped to [`&cuda::Program`] and [`&opencl::Program`].
+///
+/// The second parameter is a single arbitrary argument, which will be passed on into the closure.
+/// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
+/// shared between closures, hence we pass it on, so that the compiler knows that it is used at
+/// most once.
+///
+/// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
+/// thing and not specify one of them if it is appropriate.
+///
+/// ### Example
+///
+/// ```
+/// use rust_gpu_tools::{cuda, opencl, program_closures};
+///
+/// let closures = program_closures!(|program, arg: u8| -> bool {
+///     true
+/// });
+///
+/// // Generates
+/// let closures = (
+///     |program: &cuda::Program, arg: u8| { true },
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+///
+/// // If e.g. the `cuda` feature is disabled, it would generate
+/// let closures_without_cuda = (
+///     (),
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+/// ```
+#[macro_export]
+#[cfg(all(feature = "cuda", not(feature = "opencl")))]
+macro_rules! program_closures {
+    // Additional argument without a type
+    (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg| -> $ret { $body },
+            (),
+        )
+    };
+    // Additional argument with a type
+    (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
+        (
+            |$program: &$crate::cuda::Program, $arg: $arg_type| -> $ret { $body },
+            (),
+        )
+    };
+}
+
+/// Creates two closures, one for CUDA and one for OpenCL, from the given one.
+///
+/// This macro makes it possible to interact with rust-gpu-tools with unified code for both
+/// CUDA and OpenCL, without the need to repeat the code. The input parameter is a `program`
+/// and it will be mapped to [`&cuda::Program`] and [`&opencl::Program`].
+///
+/// The second parameter is a single arbitrary argument, which will be passed on into the closure.
+/// This is useful when you e.g. need to pass in a mutable reference. Such a reference cannot be
+/// shared between closures, hence we pass it on, so that the compiler knows that it is used at
+/// most once.
+///
+/// Depending on whether the `cuda` and/or `opencl` feature is enabled, it will do the correct
+/// thing and not specify one of them if it is appropriate.
+///
+/// ### Example
+///
+/// ```
+/// use rust_gpu_tools::{cuda, opencl, program_closures};
+///
+/// let closures = program_closures!(|program, arg: u8| -> bool {
+///     true
+/// });
+///
+/// // Generates
+/// let closures = (
+///     |program: &cuda::Program, arg: u8| { true },
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+///
+/// // If e.g. the `cuda` feature is disabled, it would generate
+/// let closures_without_cuda = (
+///     (),
+///     |program: &opencl::Program, arg: u8| { true },
+/// );
+/// ```
+#[macro_export]
+#[cfg(all(not(feature = "cuda"), feature = "opencl"))]
+macro_rules! program_closures {
+    // Additional argument without a type
+    (|$program:ident, $arg:ident| -> $ret:ty $body:block) => {
+        ((), |$program: &$crate::opencl::Program, $arg| -> $ret {
+            $body
+        })
+    };
+    // Additional argument with a type
+    (|$program:ident, $arg:ident: $arg_type:ty| -> $ret:ty $body:block) => {
+        (
+            (),
+            |$program: &$crate::opencl::Program, $arg: $arg_type| -> $ret { $body },
+        )
+    };
+}
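Putting the pieces together, a call site that combines the macro with `Program::run` might look like this sketch; the summing body stands in for real GPU work, and crate-root re-exports of `Program` and `GPUError` are assumed:

```rust
use rust_gpu_tools::{program_closures, GPUError, Program};

fn total(program: &Program, input: Vec<u32>) -> Result<u32, GPUError> {
    // Expands to the closure tuple `Program::run` expects, with `()` in
    // place of whichever backend is disabled at compile-time.
    let closures = program_closures!(|_program, input: Vec<u32>| -> Result<u32, GPUError> {
        Ok(input.iter().sum())
    });
    program.run(closures, input)
}
```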