tracel-ai · nathanielsimard · Jul 19, 2024 · Jul 19, 2024 · Jul 19, 2024
diff --git a/crates/cubecl-linalg/src/matmul/cmma/launch.rs b/crates/cubecl-linalg/src/matmul/cmma/launch.rs
@@ -1,44 +1,116 @@
 use std::cmp::max;
 
 use cubecl_core::{
-    frontend::{Float, TensorArg, F16},
-    Compiler, Runtime,
+    client::ComputeClient,
+    frontend::{Float, TensorArg, TensorHandleRef, F16},
+    ir::{Elem, FloatKind},
+    Compiler, Feature, Runtime,
 };
 
 use crate::{
     matmul::cmma::{
         base::cmma_kernel,
         config::{cmma_cube_count, cmma_cube_dim, CmmaConfig, CmmaLaunchConfig},
     },
-    tensor::{MatrixLayout, TensorHandle},
+    tensor::{matrix_layout, MatrixLayout, TensorHandle},
 };
 
-/// Matrix multiplication using tiling 2d algorithm
+/// Matrix multiplication using [cooperative matrix-multiply and accumulate operations](cubecl_core::cmma). Owned-handle wrapper: forwards to [`matmul_cmma_ref`] and returns `out`.
 pub fn matmul_cmma<R: Runtime, F: Float>(
+    client: &ComputeClient<R::Server, R::Channel>,
     lhs: TensorHandle<R, F>,
     rhs: TensorHandle<R, F>,
     out: TensorHandle<R, F>,
-    device: &R::Device,
 ) -> TensorHandle<R, F> {
-    let rank = lhs.rank();
-    let m = lhs.shape[rank - 2];
-    let k = lhs.shape[rank - 1];
-    let n = rhs.shape[rank - 1];
+    matmul_cmma_ref::<R, F>(client, lhs.as_ref(), rhs.as_ref(), out.as_ref());
+    out
+}
 
-    let client = R::client(device);
+#[derive(Debug)]
+pub enum UnavailabilityReason { // Why `check_cmma_availability` rejected the cmma matmul.
+    TransposedInput, // TODO: Support that case. (Input layout is `MatrixLayout::MildlyPermuted`.)
+    NotMultipleOf4,  // TODO: Support that case. (m, k or n is not a multiple of 4.)
+    HiglyPermutatedInput, // Input layout is `MatrixLayout::HighlyPermuted`.
+    ShapeMemoryLimitBusted, // A block-size product of the config exceeds `max_shared_memory_size()`.
+    InvalidConfig(String), // The launch config was rejected; the message says why.
+    CmmaInstructionsUnsupported, // The client does not report the required `Feature::Cmma`.
+}
 
-    let check_layout = |tensor: &TensorHandle<R, F>| match tensor.matrix_layout() {
-        MatrixLayout::Contiguous => {}
+/// Checks if the matmul cmma can be used; returns `Ok(())` or the first [`UnavailabilityReason`] hit.
+pub fn check_cmma_availability<R: Runtime>(
+    client: &ComputeClient<R::Server, R::Channel>,
+    lhs: &TensorHandleRef<'_, R>,
+    rhs: &TensorHandleRef<'_, R>,
+    config: Option<&CmmaLaunchConfig>, // `None` skips the block/tile size checks below.
+) -> Result<(), UnavailabilityReason> {
+    let check_layout = |tensor: &TensorHandleRef<'_, R>| match matrix_layout(tensor.strides) {
+        MatrixLayout::Contiguous => Ok(()),
         MatrixLayout::MildlyPermuted {
             transposed: _,
             batch_swap: _,
-        } => panic!("Transposed input not supported yet."),
-        MatrixLayout::HighlyPermuted => {
-            panic!("Can't run on highly permuted tensor.")
-        }
+        } => Err(UnavailabilityReason::TransposedInput),
+        MatrixLayout::HighlyPermuted => Err(UnavailabilityReason::HiglyPermutatedInput),
     };
-    check_layout(&lhs);
-    check_layout(&rhs);
+    // Checked first: the client must enable `Feature::Cmma` with a/b = f16, c = f32, m = k = n = 16.
+    if !client.features().enabled(Feature::Cmma {
+        a: Elem::Float(FloatKind::F16),
+        b: Elem::Float(FloatKind::F16),
+        c: Elem::Float(FloatKind::F32),
+        m: 16,
+        k: 16,
+        n: 16,
+    }) {
+        return Err(UnavailabilityReason::CmmaInstructionsUnsupported);
+    }
+
+    check_layout(lhs)?;
+    check_layout(rhs)?;
+    // NOTE(review): `rank` comes from `lhs` but also indexes `rhs.shape`; assumes equal ranks >= 2 — confirm.
+    let rank = lhs.shape.len();
+    let m = lhs.shape[rank - 2];
+    let k = lhs.shape[rank - 1];
+    let n = rhs.shape[rank - 1];
+
+    if !(m % 4 == 0 && k % 4 == 0 && n % 4 == 0) {
+        return Err(UnavailabilityReason::NotMultipleOf4);
+    }
+
+    if let Some(config) = config {
+        let (b_m, b_k, b_n) = (
+            config.block_size_m,
+            config.block_size_k,
+            config.block_size_n,
+        );
+        // NOTE(review): both products are compared to `max_shared_memory_size()`; units look like elements — confirm.
+        if b_k * max(b_m, b_n) > <R::Compiler as Compiler>::max_shared_memory_size() {
+            return Err(UnavailabilityReason::ShapeMemoryLimitBusted);
+        }
+
+        if b_m * b_n > <R::Compiler as Compiler>::max_shared_memory_size() {
+            return Err(UnavailabilityReason::ShapeMemoryLimitBusted);
+        }
+        // Only configs whose k block size equals exactly two tile sizes are accepted.
+        if b_k != 2 * config.tile_size {
+            return Err(UnavailabilityReason::InvalidConfig(
+                "Variable tile number per coop_units not supported".to_string(),
+            ));
+        }
+    }
+
+    Ok(())
+}
+/// Matrix multiplication using [cooperative matrix-multiply and accumulate operations](cubecl_core::cmma).
+pub fn matmul_cmma_ref<R: Runtime, F: Float>(
+    client: &ComputeClient<R::Server, R::Channel>,
+    lhs: TensorHandleRef<'_, R>,
+    rhs: TensorHandleRef<'_, R>,
+    out: TensorHandleRef<'_, R>,
+) {
+    let rank = lhs.strides.len();
+
+    let m = lhs.shape[rank - 2];
+    let k = lhs.shape[rank - 1];
+    let n = rhs.shape[rank - 1];
 
     let vectorization = |shape: usize| {
         [4, 2]
@@ -53,41 +125,17 @@ pub fn matmul_cmma<R: Runtime, F: Float>(
     let rhs_vectorization = vectorization(n);
     let out_vectorization = vectorization(n);
 
-    let cube_count = cmma_cube_count::<R>(&out.shape, 64, 64);
+    let cube_count = cmma_cube_count::<R>(out.shape, 64, 64);
     let cube_dim = cmma_cube_dim();
     let launch_config = CmmaLaunchConfig::default();
-    let (b_m, b_k, b_n) = (
-        launch_config.block_size_m,
-        launch_config.block_size_k,
-        launch_config.block_size_n,
-    );
-
-    assert!(
-        lhs_vectorization == 4 && rhs_vectorization == 4 && out_vectorization == 4,
-        "Only vec4 is supported"
-    );
-    assert!(
-        b_k * max(b_m, b_n) <= <R::Compiler as Compiler>::max_shared_memory_size(),
-        "Shared memory limit will be busted. "
-    );
-    assert!(
-        b_m * b_n <= <R::Compiler as Compiler>::max_shared_memory_size(),
-        "Shared memory limit will be busted. "
-    );
-    assert!(
-        b_k == 2 * launch_config.tile_size,
-        "Variable tile number per coop_units not supported"
-    );
 
     cmma_kernel::launch::<F, F16, R>(
-        &client,
+        client,
         cube_count,
         cube_dim,
-        TensorArg::vectorized(lhs_vectorization, &lhs.handle, &lhs.strides, &lhs.shape),
-        TensorArg::vectorized(rhs_vectorization, &rhs.handle, &rhs.strides, &rhs.shape),
-        TensorArg::vectorized(out_vectorization, &out.handle, &out.strides, &out.shape),
+        TensorArg::vectorized(lhs_vectorization, lhs.handle, lhs.strides, lhs.shape),
+        TensorArg::vectorized(rhs_vectorization, rhs.handle, rhs.strides, rhs.shape),
+        TensorArg::vectorized(out_vectorization, out.handle, out.strides, out.shape),
         CmmaConfig::new(m, k, n, launch_config),
     );
-
-    out
 }
diff --git a/crates/cubecl-linalg/src/matmul/cmma/mod.rs b/crates/cubecl-linalg/src/matmul/cmma/mod.rs
@@ -7,4 +7,6 @@ mod launch;
 pub(crate) mod load_shared_memory;
 pub(crate) mod write_output;
 
-pub use launch::matmul_cmma;
+pub use launch::check_cmma_availability as is_available;
+pub use launch::matmul_cmma as launch;
+pub use launch::matmul_cmma_ref as launch_ref;
diff --git a/crates/cubecl-linalg/src/matmul/mod.rs b/crates/cubecl-linalg/src/matmul/mod.rs
@@ -1,3 +1,5 @@
+use cubecl_core::prelude::*;
+
 /// Contains algorithms for cooperative matrix multiplication.
 pub mod cmma;
 
@@ -7,3 +9,17 @@ pub mod tiling2d;
 
 #[cfg(feature = "export_tests")]
 pub mod tests;
+
+/// Launch a matrix multiplication kernel: cmma when [`cmma::is_available`] succeeds, tiling2d otherwise.
+pub fn launch_ref<R: Runtime, F: Float>(
+    client: &ComputeClient<R::Server, R::Channel>,
+    lhs: TensorHandleRef<'_, R>,
+    rhs: TensorHandleRef<'_, R>,
+    out: TensorHandleRef<'_, R>,
+) {
+    if cmma::is_available(client, &lhs, &rhs, None).is_ok() { // `None`: no launch config is validated here.
+        cmma::launch_ref::<R, F>(client, lhs, rhs, out);
+    } else {
+        tiling2d::launch_ref::<R, F>(client, lhs, rhs, out, Default::default()); // Any unavailability reason falls back to tiling2d.
+    }
+}
diff --git a/crates/cubecl-linalg/src/matmul/tests/cmma/compute_loop.rs b/crates/cubecl-linalg/src/matmul/tests/cmma/compute_loop.rs
@@ -66,9 +66,10 @@ pub fn compute_loop_k_test<R: Runtime>(device: &R::Device) {
     let m = 16;
     let k = 32;
     let n = 16;
-    let lhs = range_tensor_f16::<R>(m, k, device);
-    let rhs = range_tensor_f16::<R>(k, n, device);
-    let results = create_empty::<R>(m, n, device);
+    let client = R::client(device);
+    let lhs = range_tensor_f16::<R>(&client, m, k);
+    let rhs = range_tensor_f16::<R>(&client, k, n);
+    let results = create_empty::<R>(&client, m, n);
     let cube_dim = CubeDim::new(32, 1, 1);
     let cube_count = CubeCount::Static(1, 1, 1);
 
@@ -128,7 +129,7 @@ pub fn compute_loop_k_test<R: Runtime>(device: &R::Device) {
         3659328., 3671344., 3683360., 3695376.,
     ];
 
-    assert_equals::<R>(results, expected, device);
+    assert_equals::<R>(&client, results, expected);
 }
 
 /// Exported test
@@ -141,9 +142,10 @@ pub fn compute_loop_warp_test<R: Runtime>(device: &R::Device) {
     let m = 16;
     let k = 32;
     let n = 32;
-    let lhs = range_tensor_f16::<R>(m, k, device);
-    let rhs = range_tensor_f16::<R>(k, n, device);
-    let results = create_empty::<R>(m, n, device);
+    let client = R::client(device);
+    let lhs = range_tensor_f16::<R>(&client, m, k);
+    let rhs = range_tensor_f16::<R>(&client, k, n);
+    let results = create_empty::<R>(&client, m, n);
     let cube_dim = CubeDim::new(32, 1, 1);
     let cube_count = CubeCount::Static(1, 1, 1);
 
@@ -231,7 +233,7 @@ pub fn compute_loop_warp_test<R: Runtime>(device: &R::Device) {
         9763456., 9775472., 9787488., 9799504., 9811520., 9823536., 9835552., 9847568.,
     ];
 
-    assert_equals::<R>(results, expected, device);
+    assert_equals::<R>(&client, results, expected);
 }
 
 /// Exported test
@@ -244,10 +246,11 @@ pub fn cmma_compute_loop_two_warps_same_tile_row_test<R: Runtime>(device: &R::De
     let m = 16;
     let k = 32;
     let n = 64;
+    let client = R::client(device);
 
-    let lhs = range_tensor_f16::<R>(m, k, device);
-    let rhs = range_tensor_f16::<R>(k, n, device);
-    let results = create_empty::<R>(m, n, device);
+    let lhs = range_tensor_f16::<R>(&client, m, k);
+    let rhs = range_tensor_f16::<R>(&client, k, n);
+    let results = create_empty::<R>(&client, m, n);
     let cube_dim = CubeDim::new(32, 2, 1);
     let cube_count = CubeCount::Static(1, 1, 1);
 
@@ -263,7 +266,7 @@ pub fn cmma_compute_loop_two_warps_same_tile_row_test<R: Runtime>(device: &R::De
     };
 
     compute_loop_test::launch::<F32, F16, R>(
-        &R::client(device),
+        &client,
         cube_count,
         cube_dim,
         TensorArg::new(&lhs.handle, &lhs.strides, &lhs.shape),
@@ -413,5 +416,5 @@ pub fn cmma_compute_loop_two_warps_same_tile_row_test<R: Runtime>(device: &R::De
         22103888.0, 22115904.0, 22127920.0, 22139936.0, 22151952.0,
     ];
 
-    assert_equals::<R>(results, expected, device);
+    assert_equals::<R>(&client, results, expected);
 }