Skip to content

Commit

Permalink
Profiling guide with new public TensorHandle struct
Browse files Browse the repository at this point in the history
  • Loading branch information
mepatrick73 committed Jul 19, 2024
1 parent 27d7459 commit 935626f
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ resolver = "2"

members = [
"crates/*",
"examples/*",
"examples/*", "profiling/matmul-example",
"xtask",
]

Expand Down
18 changes: 18 additions & 0 deletions profiling/matmul-example/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[package]
name = "matmul-example"
edition.workspace = true
version.workspace = true
license.workspace = true
readme.workspace = true

[dependencies]
cubecl = { version = "0.1.0", path = "../../crates/cubecl", features = [
"cuda",
"linalg",
], optional = true }
burn = { version = "0.13.2", optional = true, features = ["tch"] }
burn-tensor = { version = "0.13.2", optional = true }

[features]
burn-tch-cuda = ["burn", "burn-tensor"]
cube-cuda = ["cubecl"]
86 changes: 86 additions & 0 deletions profiling/matmul-example/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# CubeCL Profiling Guide

## 1. Ensure the CUDA Runtime is Installed

To profile your GPU kernels, you must have the CUDA runtime installed on your system. Follow the official NVIDIA documentation to install the CUDA toolkit and runtime for your operating system: [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html).

## 2. Ensure NVIDIA Nsight Compute is Installed

NVIDIA Nsight Compute is a powerful tool for GPU profiling. Make sure it is installed on your system. You can download and install it from the NVIDIA Developer website: [NVIDIA Nsight Compute](https://developer.nvidia.com/nsight-compute).

## 3. Isolate the Kernel to be Profiled into a Main Function

For effective profiling, isolate the kernel you want to profile into a main function. This allows you to focus on the performance of a specific kernel without interference from other parts of your code.

## 4. Use the CUDA device/runtime
Make sure your code uses the CUDA runtime API and device for lauching the kernel.

```rust
#[cfg(feature = "cube-cuda")]
mod cube_cuda {
use cubecl::cuda::{CudaDevice, CudaRuntime};
use cubecl::frontend::F32;
use cubecl::linalg::{matmul::tiling2d, tensor::TensorHandle};
use cubecl::prelude::*;
use cubecl::Runtime;

pub fn run() {
let device = CudaDevice::new(0);
let client = CudaRuntime::client(&device);

let num_of_batch = 12;
let heigth = 1024;
let width = 1024;

let tensor_values: Vec<f32> = (0..num_of_batch * heigth * width)
.map(|x| x as f32)
.collect();
let tensor_a_handle = client.create(f32::as_bytes(&tensor_values));
let tensor_b_handle = client.create(f32::as_bytes(&tensor_values));
let tensor_c_handle = client.empty(12 * 1024 * 1024 * core::mem::size_of::<f32>());

let tensor_a_shape = vec![num_of_batch, heigth, width];
let tensor_b_shape = vec![num_of_batch, heigth, width];
let tensor_c_shape = vec![num_of_batch, heigth, width];

let tensor_a: TensorHandle<CudaRuntime, F32> =
TensorHandle::new_contiguous(tensor_a_shape, tensor_a_handle);
let tensor_b: TensorHandle<CudaRuntime, F32> =
TensorHandle::new_contiguous(tensor_b_shape, tensor_b_handle);
let tensor_c: TensorHandle<CudaRuntime, F32> =
TensorHandle::new_contiguous(tensor_c_shape, tensor_c_handle);
tiling2d::launch(&client, tensor_a, tensor_b, tensor_c, Default::default());
}
}
```

## 5. Building an executable
Compile your main function into an executable using cargo, in the last examples case it would be by running the command :
cargo build --release --features cube-cuda

## 6. Ensure NVIDIA Nsight Compute has permission for Performance counter
This can be done by using sudo or by modifying Kernel Module Parameters as described at [link](https://gist.github.com/xaliander/8173ffe623546529c99e9cdd7e0655c4)

## 7. Follow the NVIDIA Nsight Compute guide
To profile and interpret the profilling results refer to the following [nvidia guide](https://docs.nvidia.com/nsight-compute/NsightCompute/index.html)

## Optional : Profiling Burn.
To profile Burn operations, use a runtime which uses Cuda, such as LibTorch with a Cuda device or Cuda-Jit

Example:
```rust
#[cfg(feature = "burn-tch-cuda")]
mod tch_gpu {
use burn::backend::libtorch::{LibTorch, LibTorchDevice, TchTensor};
use burn::tensor::{Distribution, Tensor};

pub fn run() {
let device = LibTorchDevice::Cuda(0);
let tensor_1: Tensor<LibTorch, 3> =
Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
let tensor_2: Tensor<LibTorch, 3> =
Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
let output = tensor_1.matmul(tensor_2);
}
}
```
58 changes: 58 additions & 0 deletions profiling/matmul-example/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
fn main() {
#[cfg(feature = "burn-tch-cuda")]
tch_gpu::run();
#[cfg(feature = "cube-cuda")]
cube_cuda::run();
}

#[cfg(feature = "burn-tch-cuda")]
mod tch_gpu {
use burn::backend::libtorch::{LibTorch, LibTorchDevice, TchTensor};
use burn::tensor::{Distribution, Tensor};

pub fn run() {
let device = LibTorchDevice::Cuda(0);
let tensor_1: Tensor<LibTorch, 3> =
Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
let tensor_2: Tensor<LibTorch, 3> =
Tensor::<LibTorch, 3>::random([12, 1024, 1024], Distribution::Default, &device);
let output = tensor_1.matmul(tensor_2);
}
}

#[cfg(feature = "cube-cuda")]
mod cube_cuda {
use cubecl::cuda::{CudaDevice, CudaRuntime};
use cubecl::frontend::F32;
use cubecl::linalg::{matmul::tiling2d, tensor::TensorHandle};
use cubecl::prelude::*;
use cubecl::Runtime;

pub fn run() {
let device = CudaDevice::new(0);
let client = CudaRuntime::client(&device);

let num_of_batch = 12;
let heigth = 1024;
let width = 1024;

let tensor_values: Vec<f32> = (0..num_of_batch * heigth * width)
.map(|x| x as f32)
.collect();
let tensor_a_handle = client.create(f32::as_bytes(&tensor_values));
let tensor_b_handle = client.create(f32::as_bytes(&tensor_values));
let tensor_c_handle = client.empty(12 * 1024 * 1024 * core::mem::size_of::<f32>());

let tensor_a_shape = vec![num_of_batch, heigth, width];
let tensor_b_shape = vec![num_of_batch, heigth, width];
let tensor_c_shape = vec![num_of_batch, heigth, width];

let tensor_a: TensorHandle<CudaRuntime, F32> =
TensorHandle::new_contiguous(tensor_a_shape, tensor_a_handle);
let tensor_b: TensorHandle<CudaRuntime, F32> =
TensorHandle::new_contiguous(tensor_b_shape, tensor_b_handle);
let tensor_c: TensorHandle<CudaRuntime, F32> =
TensorHandle::new_contiguous(tensor_c_shape, tensor_c_handle);
tiling2d::launch(&client, tensor_a, tensor_b, tensor_c, Default::default());
}
}

0 comments on commit 935626f

Please sign in to comment.