From 935626fae7efbf8033d02067b3af352194b1e65d Mon Sep 17 00:00:00 2001
From: mepatrick73
Date: Fri, 19 Jul 2024 11:39:30 -0400
Subject: [PATCH] Profiling guide with new public TensorHandle struct

---
 Cargo.toml                           |  2 +-
 profiling/matmul-example/Cargo.toml  | 18 ++++++
 profiling/matmul-example/README.md   | 86 ++++++++++++++++++++++++++++
 profiling/matmul-example/src/main.rs | 58 +++++++++++++++++++
 4 files changed, 163 insertions(+), 1 deletion(-)
 create mode 100644 profiling/matmul-example/Cargo.toml
 create mode 100644 profiling/matmul-example/README.md
 create mode 100644 profiling/matmul-example/src/main.rs

diff --git a/Cargo.toml b/Cargo.toml
index 2508fcf3a..c9c39e572 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ resolver = "2"
 members = [
     "crates/*",
-    "examples/*",
+    "examples/*", "profiling/matmul-example",
     "xtask",
 ]

diff --git a/profiling/matmul-example/Cargo.toml b/profiling/matmul-example/Cargo.toml
new file mode 100644
index 000000000..260689939
--- /dev/null
+++ b/profiling/matmul-example/Cargo.toml
@@ -0,0 +1,18 @@
[package]
name = "matmul-example"
edition.workspace = true
version.workspace = true
license.workspace = true
readme.workspace = true

[dependencies]
cubecl = { version = "0.1.0", path = "../../crates/cubecl", features = [
    "cuda",
    "linalg",
], optional = true }
burn = { version = "0.13.2", optional = true, features = ["tch"] }
burn-tensor = { version = "0.13.2", optional = true }

[features]
burn-tch-cuda = ["burn", "burn-tensor"]
cube-cuda = ["cubecl"]

diff --git a/profiling/matmul-example/README.md b/profiling/matmul-example/README.md
new file mode 100644
index 000000000..e2d5c94cd
--- /dev/null
+++ b/profiling/matmul-example/README.md
@@ -0,0 +1,86 @@
# CubeCL Profiling Guide

## 1. Ensure the CUDA Runtime is Installed

To profile your GPU kernels, you must have the CUDA runtime installed on your system. Follow the official NVIDIA documentation to install the CUDA toolkit and runtime for your operating system: [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html).

## 2. Ensure NVIDIA Nsight Compute is Installed

NVIDIA Nsight Compute is a powerful tool for GPU profiling. Make sure it is installed on your system. You can download and install it from the NVIDIA Developer website: [NVIDIA Nsight Compute](https://developer.nvidia.com/nsight-compute).

## 3. Isolate the Kernel to be Profiled into a Main Function

For effective profiling, isolate the kernel you want to profile into a main function. This lets you focus on the performance of a single kernel without interference from other parts of your code.

## 4. Use the CUDA Device/Runtime

Make sure your code uses the CUDA runtime API and a CUDA device for launching the kernel:
```rust
#[cfg(feature = "cube-cuda")]
mod cube_cuda {
    use cubecl::cuda::{CudaDevice, CudaRuntime};
    use cubecl::frontend::F32;
    use cubecl::linalg::{matmul::tiling2d, tensor::TensorHandle};
    use cubecl::prelude::*;
    use cubecl::Runtime;

    pub fn run() {
        let device = CudaDevice::new(0);
        let client = CudaRuntime::client(&device);

        let num_of_batch = 12;
        let height = 1024;
        let width = 1024;

        // Fill both inputs with ascending values.
        let tensor_values: Vec<f32> = (0..num_of_batch * height * width)
            .map(|x| x as f32)
            .collect();
        let tensor_a_handle = client.create(f32::as_bytes(&tensor_values));
        let tensor_b_handle = client.create(f32::as_bytes(&tensor_values));
        // Uninitialized output buffer of the same size as each input.
        let tensor_c_handle =
            client.empty(num_of_batch * height * width * core::mem::size_of::<f32>());

        let tensor_a_shape = vec![num_of_batch, height, width];
        let tensor_b_shape = vec![num_of_batch, height, width];
        let tensor_c_shape = vec![num_of_batch, height, width];

        let tensor_a: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_a_shape, tensor_a_handle);
        let tensor_b: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_b_shape, tensor_b_handle);
        let tensor_c: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_c_shape, tensor_c_handle);
        // Launch the tiling2d batched matmul kernel: C = A * B.
        tiling2d::launch(&client, tensor_a, tensor_b, tensor_c, Default::default());
    }
}
```

## 5. Build an Executable

Compile your main function into an executable with cargo. For the example above, that means running `cargo build --release --features cube-cuda`.

## 6. Ensure NVIDIA Nsight Compute Has Permission to Read Performance Counters

Either run the profiler with sudo, or grant access to all users by setting the `NVreg_RestrictProfilingToAdminUsers=0` kernel module parameter, as described in [this guide](https://gist.github.com/xaliander/8173ffe623546529c99e9cdd7e0655c4).

## 7. Follow the NVIDIA Nsight Compute Guide

To profile your executable and interpret the profiling results, refer to the official [NVIDIA Nsight Compute documentation](https://docs.nvidia.com/nsight-compute/NsightCompute/index.html).
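With permissions set up, a typical session collects a report on the command line and then opens it in the GUI. The commands below are a sketch of that workflow; the binary name `matmul-example` comes from the package name above, and the `ncu` flags shown (`--set full`, `-o`) are one common choice rather than the only valid one:

```sh
# Build the CubeCL example with the CUDA runtime enabled.
cargo build --release --features cube-cuda

# Capture the full metric set and write the report to matmul.ncu-rep.
ncu --set full -o matmul ./target/release/matmul-example

# Inspect the report in the Nsight Compute GUI.
ncu-ui matmul.ncu-rep
```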
## Optional: Profiling Burn

To profile Burn operations, use a backend that runs on CUDA, such as LibTorch with a CUDA device or the CUDA JIT backend.

Example:
```rust
#[cfg(feature = "burn-tch-cuda")]
mod tch_gpu {
    use burn::backend::libtorch::{LibTorch, LibTorchDevice};
    use burn::tensor::{Distribution, Tensor};

    pub fn run() {
        let device = LibTorchDevice::Cuda(0);
        // Two random batches of 12 matrices of shape 1024 x 1024 on the GPU.
        let tensor_1: Tensor<LibTorch<f32>, 3> =
            Tensor::random([12, 1024, 1024], Distribution::Default, &device);
        let tensor_2: Tensor<LibTorch<f32>, 3> =
            Tensor::random([12, 1024, 1024], Distribution::Default, &device);
        // The batched matmul is the operation being profiled.
        let _output = tensor_1.matmul(tensor_2);
    }
}
```

diff --git a/profiling/matmul-example/src/main.rs b/profiling/matmul-example/src/main.rs
new file mode 100644
index 000000000..77cd68277
--- /dev/null
+++ b/profiling/matmul-example/src/main.rs
@@ -0,0 +1,58 @@
fn main() {
    #[cfg(feature = "burn-tch-cuda")]
    tch_gpu::run();
    #[cfg(feature = "cube-cuda")]
    cube_cuda::run();
}

#[cfg(feature = "burn-tch-cuda")]
mod tch_gpu {
    use burn::backend::libtorch::{LibTorch, LibTorchDevice};
    use burn::tensor::{Distribution, Tensor};

    pub fn run() {
        let device = LibTorchDevice::Cuda(0);
        let tensor_1: Tensor<LibTorch<f32>, 3> =
            Tensor::random([12, 1024, 1024], Distribution::Default, &device);
        let tensor_2: Tensor<LibTorch<f32>, 3> =
            Tensor::random([12, 1024, 1024], Distribution::Default, &device);
        let _output = tensor_1.matmul(tensor_2);
    }
}

#[cfg(feature = "cube-cuda")]
mod cube_cuda {
    use cubecl::cuda::{CudaDevice, CudaRuntime};
    use cubecl::frontend::F32;
    use cubecl::linalg::{matmul::tiling2d, tensor::TensorHandle};
    use cubecl::prelude::*;
    use cubecl::Runtime;

    pub fn run() {
        let device = CudaDevice::new(0);
        let client = CudaRuntime::client(&device);

        let num_of_batch = 12;
        let height = 1024;
        let width = 1024;

        let tensor_values: Vec<f32> = (0..num_of_batch * height * width)
            .map(|x| x as f32)
            .collect();
        let tensor_a_handle = client.create(f32::as_bytes(&tensor_values));
        let tensor_b_handle = client.create(f32::as_bytes(&tensor_values));
        let tensor_c_handle =
            client.empty(num_of_batch * height * width * core::mem::size_of::<f32>());

        let tensor_a_shape = vec![num_of_batch, height, width];
        let tensor_b_shape = vec![num_of_batch, height, width];
        let tensor_c_shape = vec![num_of_batch, height, width];

        let tensor_a: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_a_shape, tensor_a_handle);
        let tensor_b: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_b_shape, tensor_b_handle);
        let tensor_c: TensorHandle<CudaRuntime, F32> =
            TensorHandle::new_contiguous(tensor_c_shape, tensor_c_handle);
        tiling2d::launch(&client, tensor_a, tensor_b, tensor_c, Default::default());
    }
}
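The Nsight Compute workflow from steps 5 through 7 applies to the Burn variant as well; only the feature flag changes. A sketch, assuming the same `matmul-example` binary (since LibTorch dispatches the matmul to cuBLAS, expect vendor kernels rather than CubeCL kernels in the report):

```sh
# Build the Burn/LibTorch variant of the example.
cargo build --release --features burn-tch-cuda

# Profile it; --target-processes all also captures kernels launched by child processes.
ncu --set full --target-processes all -o matmul-burn ./target/release/matmul-example
```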