diff --git a/README.md b/README.md
index 567cfbf8c..6110e4f42 100644
--- a/README.md
+++ b/README.md
@@ -216,6 +216,7 @@ You can even ship the autotune cache with your program, reducing cold start time
 
 ## Resource
 
+We don't have many learning resources yet, but you can look at the [linear algebra library](/crates/cubecl-linalg/README.md) to see how CubeCL can be used.
 If you have any questions or want to contribute, don't hesitate to join the [Discord](https://discord.gg/uPEBbYYDB6).
 
 ## Disclaimer & History
diff --git a/crates/cubecl-core/src/ir/kernel.rs b/crates/cubecl-core/src/ir/kernel.rs
index 5133a79ef..f61309805 100644
--- a/crates/cubecl-core/src/ir/kernel.rs
+++ b/crates/cubecl-core/src/ir/kernel.rs
@@ -140,9 +140,16 @@ impl From<Elem> for Item {
 impl Display for Elem {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            // NOTE: we'll eventually want to differentiate between int/float types
-            Self::Float(_) => f.write_str("float"),
-            Self::Int(_) => f.write_str("int"),
+            Self::Float(kind) => match kind {
+                FloatKind::F16 => f.write_str("f16"),
+                FloatKind::BF16 => f.write_str("bf16"),
+                FloatKind::F32 => f.write_str("f32"),
+                FloatKind::F64 => f.write_str("f64"),
+            },
+            Self::Int(kind) => match kind {
+                IntKind::I32 => f.write_str("i32"),
+                IntKind::I64 => f.write_str("i64"),
+            },
             Self::UInt => f.write_str("uint"),
             Self::Bool => f.write_str("bool"),
         }
diff --git a/crates/cubecl-linalg/Cargo.toml b/crates/cubecl-linalg/Cargo.toml
index cb79e2a6f..2d04b1e6c 100644
--- a/crates/cubecl-linalg/Cargo.toml
+++ b/crates/cubecl-linalg/Cargo.toml
@@ -4,7 +4,7 @@ authors = [
     "louisfd <louisfd94@gmail.com>",
 ]
 categories = ["science", "mathematics", "algorithms"]
-description = "CubeCL Linear Algebra Components"
+description = "CubeCL Linear Algebra Library."
 edition.workspace = true
 keywords = []
 license.workspace = true
diff --git a/crates/cubecl-linalg/README.md b/crates/cubecl-linalg/README.md
new file mode 100644
index 000000000..7cb0782c9
--- /dev/null
+++ b/crates/cubecl-linalg/README.md
@@ -0,0 +1,60 @@
+# CubeCL Linear Algebra Library.
+
+
+The crate contains common linear algebra algorithms.
+
+## Algorithms
+
+- [X] Tiling 2D Matrix Multiplication.
+
+  The kernel is very flexible and can be used on pretty much any hardware.
+- [X] Cooperative Matrix Multiplication.
+
+  The kernel uses Automatic Mixed Precision (AMP) to leverage cooperative matrix multiply-accumulate instructions.
+  For `f32` tensors, the inputs are cast to `f16`, but the accumulation is still performed in `f32`.
+  This may cause a small loss of precision, but execution is much faster.
+
+## Benchmarks
+
+You can run the benchmarks from the workspace with the following:
+
+```bash
+cargo bench --bench matmul --features wgpu # for wgpu
+cargo bench --bench matmul --features cuda # for cuda
+```
+
+On an RTX 3070 we get the following results:
+
+```
+matmul-wgpu-f32-tiling2d
+
+―――――――― Result ―――――――――
+  Samples     100
+  Mean        13.289ms
+  Variance    28.000ns
+  Median      13.271ms
+  Min         12.582ms
+  Max         13.768ms
+―――――――――――――――――――――――――
+matmul-cuda-f32-tiling2d
+
+―――――――― Result ―――――――――
+  Samples     100
+  Mean        12.754ms
+  Variance    93.000ns
+  Median      12.647ms
+  Min         12.393ms
+  Max         14.501ms
+―――――――――――――――――――――――――
+matmul-cuda-f32-cmma
+
+―――――――― Result ―――――――――
+  Samples     100
+  Mean        4.996ms
+  Variance    35.000ns
+  Median      5.084ms
+  Min         4.304ms
+  Max         5.155ms
+―――――――――――――――――――――――――
+```
+
diff --git a/crates/cubecl/benches/matmul.rs b/crates/cubecl/benches/matmul.rs
index 0d30b3c35..ef81c4d8a 100644
--- a/crates/cubecl/benches/matmul.rs
+++ b/crates/cubecl/benches/matmul.rs
@@ -7,7 +7,7 @@ use cubecl::frontend::Float;
 use cubecl_linalg::matmul;
 use cubecl_linalg::tensor::TensorHandle;
 
-impl<R: Runtime, E: Float> Benchmark for Tiling2dBench<R, E> {
+impl<R: Runtime, E: Float> Benchmark for MatmulBench<R, E> {
     type Args = (TensorHandle<R, E>, TensorHandle<R, E>, TensorHandle<R, E>);
 
     fn prepare(&self) -> Self::Args {
@@ -36,8 +36,7 @@ impl<R: Runtime, E: Float> Benchmark for Tiling2dBench<R, E> {
     }
 
     fn name(&self) -> String {
-        let elem = E::as_elem();
-        format!("tiling2d-{}-{:?}-{:?}", R::name(), elem, self.kind)
+        format!("matmul-{}-{}-{:?}", R::name(), E::as_elem(), self.kind).to_lowercase()
     }
 
     fn sync(&self) {
@@ -46,7 +45,7 @@ impl<R: Runtime, E: Float> Benchmark for Tiling2dBench<R, E> {
 }
 
 #[allow(dead_code)]
-struct Tiling2dBench<R: Runtime, E> {
+struct MatmulBench<R: Runtime, E> {
     b: usize,
     m: usize,
     k: usize,
@@ -66,7 +65,7 @@ enum MatmulKind {
 
 #[allow(dead_code)]
 fn run<R: Runtime, E: Float>(device: R::Device, kind: MatmulKind) {
-    let bench = Tiling2dBench::<R, E> {
+    let bench = MatmulBench::<R, E> {
         b: 32,
         m: 1024,
         k: 1024,