Profiling guide with new public TensorHandle struct #29

Merged · 1 commit · Jul 24, 2024
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -6,7 +6,7 @@ resolver = "2"

 members = [
     "crates/*",
-    "examples/*",
+    "examples/*", "profiling/matmul-example",
     "xtask",
 ]

18 changes: 18 additions & 0 deletions profiling/matmul-example/Cargo.toml
@@ -0,0 +1,18 @@
[package]
name = "matmul-example"
edition.workspace = true
version.workspace = true
license.workspace = true
readme.workspace = true

[dependencies]
cubecl = { version = "0.1.0", path = "../../crates/cubecl", features = [
"cuda",
"linalg",
], optional = true }
burn = { version = "0.13.2", optional = true, features = ["tch"] }
burn-tensor = { version = "0.13.2", optional = true }

[features]
burn-tch-cuda = ["burn", "burn-tensor"]
cube-cuda = ["cubecl"]
86 changes: 86 additions & 0 deletions profiling/matmul-example/README.md
@@ -0,0 +1,86 @@
# CubeCL Profiling Guide

## 1. Ensure the CUDA Runtime is Installed

To profile your GPU kernels, you must have the CUDA runtime installed on your system. Follow the official NVIDIA documentation to install the CUDA toolkit and runtime for your operating system: [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html).
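If the toolkit is set up correctly, `nvcc --version` should report the installed CUDA version, and `nvidia-smi` should list your GPU and driver.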

## 2. Ensure NVIDIA Nsight Compute is Installed

NVIDIA Nsight Compute is a powerful tool for GPU profiling. Make sure it is installed on your system. You can download and install it from the NVIDIA Developer website: [NVIDIA Nsight Compute](https://developer.nvidia.com/nsight-compute).
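Once installed, you can verify that the command-line profiler is on your `PATH` with `ncu --version`.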

## 3. Isolate the Kernel to be Profiled into a Main Function

For effective profiling, isolate the kernel you want to profile into a main function. This allows you to focus on the performance of a specific kernel without interference from other parts of your code.
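For instance, the `main.rs` accompanying this guide dispatches to a single kernel per enabled Cargo feature, so the profiler sees one isolated workload:

```rust
fn main() {
    // Each Cargo feature gates one kernel, so only the selected
    // workload is compiled into the profiled binary.
    #[cfg(feature = "burn-tch-cuda")]
    tch_gpu::run();
    #[cfg(feature = "cube-cuda")]
    cube_cuda::run();
}
```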

## 4. Use the CUDA Device and Runtime
Make sure your code uses the CUDA runtime API and a CUDA device for launching the kernel.

```rust
#[cfg(feature = "cube-cuda")]
mod cube_cuda {
    use cubecl::cuda::{CudaDevice, CudaRuntime};
    use cubecl::frontend::F32;
    use cubecl::linalg::{matmul::tiling2d, tensor::TensorHandle};
    use cubecl::prelude::*;
    use cubecl::Runtime;

    pub fn run() {
        let device = CudaDevice::new(0);
        let client = CudaRuntime::client(&device);

        let num_of_batch = 12;
        let height = 1024;
        let width = 1024;

        // Fill both inputs with the same ascending values.
        let tensor_values: Vec<f32> = (0..num_of_batch * height * width)
            .map(|x| x as f32)
            .collect();
        let tensor_a_handle = client.create(f32::as_bytes(&tensor_values));
        let tensor_b_handle = client.create(f32::as_bytes(&tensor_values));
        // Allocate an uninitialized output buffer of the same size.
        let tensor_c_handle =
            client.empty(num_of_batch * height * width * core::mem::size_of::<f32>());

        let tensor_a_shape = vec![num_of_batch, height, width];
        let tensor_b_shape = vec![num_of_batch, height, width];
        let tensor_c_shape = vec![num_of_batch, height, width];

        // Wrap each raw buffer with a shape and contiguous strides.
        let tensor_a: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_a_shape, tensor_a_handle);
        let tensor_b: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_b_shape, tensor_b_handle);
        let tensor_c: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_c_shape, tensor_c_handle);
        tiling2d::launch(&client, tensor_a, tensor_b, tensor_c, Default::default());
    }
}
```
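As its name suggests, `TensorHandle::new_contiguous` pairs a raw buffer with a shape and contiguous (row-major) strides. Note that the output only needs `client.empty` rather than `client.create`, since the matmul is expected to overwrite every element of `tensor_c`.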

## 5. Build an Executable
Compile your main function into an executable with Cargo; for the example above, run `cargo build --release --features cube-cuda`. Cargo places the resulting binary at `target/release/matmul-example`.

## 6. Ensure NVIDIA Nsight Compute Has Permission to Use Performance Counters
By default, access to GPU performance counters is restricted; either run the profiler with `sudo` or lift the restriction by modifying the NVIDIA kernel module parameters as described in [this guide](https://gist.github.com/xaliander/8173ffe623546529c99e9cdd7e0655c4).
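For the kernel module route, a minimal sketch (the file name under `/etc/modprobe.d/` is arbitrary, and a reboot or module reload is needed afterwards):

```
# /etc/modprobe.d/nvidia-profiling.conf
options nvidia NVreg_RestrictProfilingToAdminUsers=0
```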

## 7. Follow the NVIDIA Nsight Compute Guide
To profile your kernel and interpret the profiling results, refer to the official [NVIDIA Nsight Compute documentation](https://docs.nvidia.com/nsight-compute/NsightCompute/index.html).
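As a starting point, one possible invocation (assuming the binary from step 5 and sufficient permissions) collects the full metric set and writes a report that opens in the Nsight Compute UI:

```
ncu --set full -o matmul-profile ./target/release/matmul-example
```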

## Optional: Profiling Burn
To profile Burn operations, use a backend that runs on CUDA, such as LibTorch with a CUDA device or Cuda-Jit.

Example:
```rust
#[cfg(feature = "burn-tch-cuda")]
mod tch_gpu {
    use burn::backend::libtorch::{LibTorch, LibTorchDevice};
    use burn::tensor::{Distribution, Tensor};

    pub fn run() {
        let device = LibTorchDevice::Cuda(0);
        // Two random 12 x 1024 x 1024 inputs for a batched matmul.
        let tensor_1: Tensor<LibTorch, 3> =
            Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
        let tensor_2: Tensor<LibTorch, 3> =
            Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
        let _output = tensor_1.matmul(tensor_2);
    }
}
```
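Build it the same way as before, selecting the LibTorch feature instead: `cargo build --release --features burn-tch-cuda`.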
58 changes: 58 additions & 0 deletions profiling/matmul-example/src/main.rs
@@ -0,0 +1,58 @@
fn main() {
    #[cfg(feature = "burn-tch-cuda")]
    tch_gpu::run();
    #[cfg(feature = "cube-cuda")]
    cube_cuda::run();
}

#[cfg(feature = "burn-tch-cuda")]
mod tch_gpu {
    use burn::backend::libtorch::{LibTorch, LibTorchDevice};
    use burn::tensor::{Distribution, Tensor};

    pub fn run() {
        let device = LibTorchDevice::Cuda(0);
        // Two random 12 x 1024 x 1024 inputs for a batched matmul.
        let tensor_1: Tensor<LibTorch, 3> =
            Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
        let tensor_2: Tensor<LibTorch, 3> =
            Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
        let _output = tensor_1.matmul(tensor_2);
    }
}

#[cfg(feature = "cube-cuda")]
mod cube_cuda {
    use cubecl::cuda::{CudaDevice, CudaRuntime};
    use cubecl::frontend::F32;
    use cubecl::linalg::{matmul::tiling2d, tensor::TensorHandle};
    use cubecl::prelude::*;
    use cubecl::Runtime;

    pub fn run() {
        let device = CudaDevice::new(0);
        let client = CudaRuntime::client(&device);

        let num_of_batch = 12;
        let height = 1024;
        let width = 1024;

        // Fill both inputs with the same ascending values.
        let tensor_values: Vec<f32> = (0..num_of_batch * height * width)
            .map(|x| x as f32)
            .collect();
        let tensor_a_handle = client.create(f32::as_bytes(&tensor_values));
        let tensor_b_handle = client.create(f32::as_bytes(&tensor_values));
        // Allocate an uninitialized output buffer of the same size.
        let tensor_c_handle =
            client.empty(num_of_batch * height * width * core::mem::size_of::<f32>());

        let tensor_a_shape = vec![num_of_batch, height, width];
        let tensor_b_shape = vec![num_of_batch, height, width];
        let tensor_c_shape = vec![num_of_batch, height, width];

        // Wrap each raw buffer with a shape and contiguous strides.
        let tensor_a: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_a_shape, tensor_a_handle);
        let tensor_b: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_b_shape, tensor_b_handle);
        let tensor_c: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_c_shape, tensor_c_handle);
        tiling2d::launch(&client, tensor_a, tensor_b, tensor_c, Default::default());
    }
}