From d0f55e5545b2d4545e6057ba52d7d4c52619a4c3 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Mon, 22 Jul 2024 17:33:20 -0400 Subject: [PATCH 1/4] Better vectorization CUDA --- crates/cubecl-core/src/compute/kernel.rs | 11 +- .../cubecl-core/src/runtime_tests/assign.rs | 44 ++++ crates/cubecl-core/src/runtime_tests/mod.rs | 2 + crates/cubecl-cuda/src/compiler/base.rs | 15 +- crates/cubecl-cuda/src/compiler/binary.rs | 180 ++++---------- crates/cubecl-cuda/src/compiler/element.rs | 221 +++++++++++------- .../cubecl-cuda/src/compiler/instruction.rs | 33 ++- crates/cubecl-cuda/src/compiler/kernel.rs | 56 ++--- crates/cubecl-cuda/src/compiler/unary.rs | 102 ++------ crates/cubecl/Cargo.toml | 4 + crates/cubecl/benches/matmul.rs | 12 +- crates/cubecl/benches/unary.rs | 111 +++++++++ 12 files changed, 429 insertions(+), 362 deletions(-) create mode 100644 crates/cubecl-core/src/runtime_tests/assign.rs create mode 100644 crates/cubecl/benches/unary.rs diff --git a/crates/cubecl-core/src/compute/kernel.rs b/crates/cubecl-core/src/compute/kernel.rs index 7b950ec03..8a5316f6f 100644 --- a/crates/cubecl-core/src/compute/kernel.rs +++ b/crates/cubecl-core/src/compute/kernel.rs @@ -1,4 +1,4 @@ -use std::marker::PhantomData; +use std::{fmt::Debug, marker::PhantomData}; use crate::{codegen::CompilerRepresentation, ir::CubeDim, Compiler, Kernel}; use alloc::sync::Arc; @@ -78,6 +78,15 @@ pub enum CubeCount { Dynamic(Binding), } +impl Debug for CubeCount { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CubeCount::Static(x, y, z) => f.write_fmt(format_args!("({x}, {y}, {z})")), + CubeCount::Dynamic(_) => f.write_str("binding"), + } + } +} + impl Clone for CubeCount { fn clone(&self) -> Self { match self { diff --git a/crates/cubecl-core/src/runtime_tests/assign.rs b/crates/cubecl-core/src/runtime_tests/assign.rs new file mode 100644 index 000000000..08dfd77f2 --- /dev/null +++ b/crates/cubecl-core/src/runtime_tests/assign.rs @@ -0,0 +1,44 @@ +use crate as cubecl; + +use cubecl::prelude::*; + +#[cube(launch)] +pub fn kernel_assign(output: &mut Array, vectorization: Comptime) { + if UNIT_POS == UInt::new(0) { + let item = F32::vectorized(5.0, Comptime::get(vectorization)); + output[0] = item; + } +} + +pub fn test_kernel_assign_scalar(client: ComputeClient) { + let handle = client.create(f32::as_bytes(&[0.0, 1.0])); + + let vectorization = 2; + + kernel_assign::launch::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::default(), + ArrayArg::vectorized(vectorization, &handle, 2), + UInt::new(vectorization as u32), + ); + + let actual = client.read(handle.binding()); + let actual = f32::from_bytes(&actual); + + assert_eq!(actual[0], 5.0); +} + +#[allow(missing_docs)] +#[macro_export] +macro_rules! testgen_assign { + () => { + use super::*; + + #[test] + fn test_assign_scalar() { + let client = TestRuntime::client(&Default::default()); + cubecl_core::runtime_tests::assign::test_kernel_assign_scalar::(client); + } + }; +} diff --git a/crates/cubecl-core/src/runtime_tests/mod.rs b/crates/cubecl-core/src/runtime_tests/mod.rs index d275d5831..e445a75f3 100644 --- a/crates/cubecl-core/src/runtime_tests/mod.rs +++ b/crates/cubecl-core/src/runtime_tests/mod.rs @@ -1,3 +1,4 @@ +pub mod assign; pub mod cmma; pub mod launch; pub mod slice; @@ -13,5 +14,6 @@ macro_rules! 
testgen_all { cubecl_core::testgen_launch!(); cubecl_core::testgen_cmma!(); cubecl_core::testgen_slice!(); + cubecl_core::testgen_assign!(); }; } diff --git a/crates/cubecl-cuda/src/compiler/base.rs b/crates/cubecl-cuda/src/compiler/base.rs index ded8b3f02..8fdc6cf7d 100644 --- a/crates/cubecl-cuda/src/compiler/base.rs +++ b/crates/cubecl-cuda/src/compiler/base.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use cubecl_core::{ ir::{self as gpu, ConstantScalarValue}, Compiler, @@ -22,6 +24,7 @@ pub struct CudaCompiler { stride: bool, num_inputs: usize, num_outputs: usize, + items: HashSet, } impl Compiler for CudaCompiler { @@ -86,6 +89,7 @@ impl CudaCompiler { wmma_activated: self.wmma, bf16: self.bf16, f16: self.f16, + items: self.items, } } @@ -548,13 +552,10 @@ impl CudaCompiler { } fn compile_item(&mut self, item: gpu::Item) -> super::Item { - match item.vectorization { - 4 => super::Item::Vec4(self.compile_elem(item.elem)), - 3 => super::Item::Vec3(self.compile_elem(item.elem)), - 2 => super::Item::Vec2(self.compile_elem(item.elem)), - 1 => super::Item::Scalar(self.compile_elem(item.elem)), - _ => panic!("Vectorization factor unsupported {:?}", item.vectorization), - } + let item = super::Item::new(self.compile_elem(item.elem), item.vectorization.into()); + self.items.insert(item.clone()); + self.items.insert(item.optimized()); + item } fn compile_elem(&mut self, value: gpu::Elem) -> super::Elem { diff --git a/crates/cubecl-cuda/src/compiler/binary.rs b/crates/cubecl-cuda/src/compiler/binary.rs index e665b3e7f..8eec9c251 100644 --- a/crates/cubecl-cuda/src/compiler/binary.rs +++ b/crates/cubecl-cuda/src/compiler/binary.rs @@ -1,4 +1,4 @@ -use super::{Component, Elem, InstructionSettings, Item, Variable}; +use super::{Component, Elem, Variable}; use std::fmt::Display; pub trait Binary { @@ -9,36 +9,7 @@ pub trait Binary { out: &Variable, ) -> std::fmt::Result { let item = out.item(); - let settings = Self::settings(*item.elem()); - - match item { - Item::Vec4(elem) => { - if settings.native_vec4 && lhs.item() == rhs.item() { - Self::format_native_vec4(f, lhs, rhs, out, elem) - } else { - Self::unroll_vec4(f, lhs, rhs, out, elem) - } - } - Item::Vec3(elem) => { - if settings.native_vec3 && lhs.item() == rhs.item() { - Self::format_native_vec3(f, lhs, rhs, out, elem) - } else { - Self::unroll_vec3(f, lhs, rhs, out, elem) - } - } - Item::Vec2(elem) => { - if settings.native_vec2 && lhs.item() == rhs.item() { - Self::format_native_vec2(f, lhs, rhs, out, elem) - } else { - Self::unroll_vec2(f, lhs, rhs, out, elem) - } - } - Item::Scalar(elem) => Self::format_scalar(f, *lhs, *rhs, *out, elem), - } - } - - fn settings(_elem: Elem) -> InstructionSettings { - InstructionSettings::default() + Self::unroll_vec(f, lhs, rhs, out, item.elem, item.vectorization.into()) } fn format_scalar( @@ -53,78 +24,39 @@ pub trait Binary { Rhs: Component, Out: Component; - fn format_native_vec4( - f: &mut std::fmt::Formatter<'_>, - lhs: &Variable, - rhs: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::format_scalar(f, *lhs, *rhs, *out, elem) - } - - fn format_native_vec3( - f: &mut std::fmt::Formatter<'_>, - lhs: &Variable, - rhs: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::format_scalar(f, *lhs, *rhs, *out, elem) - } - - fn format_native_vec2( - f: &mut std::fmt::Formatter<'_>, - lhs: &Variable, - rhs: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::format_scalar(f, *lhs, *rhs, *out, elem) - } - - fn unroll_vec2( - f: 
&mut std::fmt::Formatter<'_>, - lhs: &Variable, - rhs: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::unroll_vec(f, lhs, rhs, out, elem, 2) - } - - fn unroll_vec3( - f: &mut std::fmt::Formatter<'_>, - lhs: &Variable, - rhs: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::unroll_vec(f, lhs, rhs, out, elem, 3) - } - - fn unroll_vec4( - f: &mut std::fmt::Formatter<'_>, - lhs: &Variable, - rhs: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::unroll_vec(f, lhs, rhs, out, elem, 4) - } - fn unroll_vec( f: &mut std::fmt::Formatter<'_>, lhs: &Variable, rhs: &Variable, out: &Variable, - elem: Elem, + mut elem: Elem, index: usize, ) -> core::fmt::Result { + if index == 1 { + return Self::format_scalar(f, *lhs, *rhs, *out, elem); + } + + let lhs_optimized = lhs.optimized(); + let rhs_optimized = rhs.optimized(); + let out_optimized = out.optimized(); + let lhs_elem = lhs_optimized.elem(); + let optimized = lhs_elem == rhs_optimized.elem() + && lhs_elem == out_optimized.elem() + && lhs_elem != elem + && lhs_optimized.is_optimized(); + + let (lhs, rhs, out, index) = if optimized { + let factor = lhs.item().vectorization / lhs_optimized.item().vectorization; + elem = lhs_elem; + (lhs_optimized, rhs_optimized, out_optimized, index / factor) + } else { + (*lhs, *rhs, *out, index) + }; + for i in 0..index { - let lhsi = lhs.index(i); - let rhsi = rhs.index(i); - let outi = out.index(i); + let lhsi = lhs.index(i, optimized); + let rhsi = rhs.index(i, optimized); + let outi = out.index(i, optimized); Self::format_scalar(f, lhsi, rhsi, outi, elem)?; } @@ -158,11 +90,6 @@ macro_rules! operator { ) -> std::fmt::Result { f.write_fmt(format_args!("{out} = {lhs} {} {rhs};\n", $op)) } - - #[allow(unused_variables)] - fn settings(elem: Elem) -> InstructionSettings { - $vectorization - } } }; } @@ -192,11 +119,6 @@ macro_rules! function { ) -> std::fmt::Result { f.write_fmt(format_args!("{out} = {}({lhs}, {rhs});\n", $op)) } - - #[allow(unused_variables)] - fn settings(elem: Elem) -> InstructionSettings { - $vectorization - } } }; } @@ -242,18 +164,18 @@ impl Binary for IndexAssign { let elem_rhs = rhs.elem(); // Cast only when necessary. 
if elem != elem_rhs { - if let Elem::Bool = elem_rhs { - match rhs.item() { - Item::Vec4(_) => { - f.write_fmt(format_args!("{out}[{lhs}] = make_uint4({elem}({rhs}.x), {elem}({rhs}.y), {elem}({rhs}.z), {elem}({rhs}.w));\n")) - }, - Item::Vec3(_) => todo!(), - Item::Vec2(_) => todo!(), - Item::Scalar(_) => todo!(), - } - } else { - f.write_fmt(format_args!("{out}[{lhs}] = {elem}({rhs});\n")) - } + // if let Elem::Bool = elem_rhs { + // match rhs.item() { + // Item::Vec4(_) => { + // f.write_fmt(format_args!("{out}[{lhs}] = make_uint4({elem}({rhs}.x), {elem}({rhs}.y), {elem}({rhs}.z), {elem}({rhs}.w));\n")) + // }, + // Item::Vec3(_) => todo!(), + // Item::Vec2(_) => todo!(), + // Item::Scalar(_) => todo!(), + // } + // } else { + // } + f.write_fmt(format_args!("{out}[{lhs}] = {elem}({rhs});\n")) } else { f.write_fmt(format_args!("{out}[{lhs}] = {rhs};\n")) } @@ -267,9 +189,13 @@ impl Binary for IndexAssign { elem: Elem, index: usize, ) -> std::fmt::Result { + if index == 1 { + return Self::format_scalar(f, *lhs, *rhs, *out, elem); + } + for i in 0..index { - let lhsi = lhs.index(i); - let rhsi = rhs.index(i); + let lhsi = lhs.index(i, false); + let rhsi = rhs.index(i, false); Self::format_scalar(f, lhsi, rhsi, *out, elem)?; } @@ -292,13 +218,9 @@ impl Binary for IndexAssign { }; let elem = out.elem(); + let item = lhs.item(); - match lhs.item() { - Item::Vec4(_) => Self::unroll_vec4(f, lhs, rhs, out, elem), - Item::Vec3(_) => Self::unroll_vec3(f, lhs, rhs, out, elem), - Item::Vec2(_) => Self::unroll_vec2(f, lhs, rhs, out, elem), - Item::Scalar(_) => Self::format_scalar(f, *lhs, *rhs, *out, elem), - } + Self::unroll_vec(f, lhs, rhs, out, elem, item.vectorization.into()) } } @@ -375,8 +297,8 @@ impl IndexVector { } }; - let out = out.index(index); - let lhs = lhs.index(index); + let out = out.index(index, false); + let lhs = lhs.index(index, false); f.write_fmt(format_args!("{out} = {lhs};\n")) } @@ -397,8 +319,8 @@ impl IndexAssignVector { } }; - let out = out.index(index); - let rhs = rhs.index(index); + let out = out.index(index, false); + let rhs = rhs.index(index, false); f.write_fmt(format_args!("{out} = {rhs};\n")) } diff --git a/crates/cubecl-cuda/src/compiler/element.rs b/crates/cubecl-cuda/src/compiler/element.rs index ff797a24b..a31c5906b 100644 --- a/crates/cubecl-cuda/src/compiler/element.rs +++ b/crates/cubecl-cuda/src/compiler/element.rs @@ -4,30 +4,32 @@ use std::fmt::Display; use super::Fragment; -#[derive(Debug, Clone, PartialEq, Eq, Copy)] +#[derive(Debug, Clone, PartialEq, Eq, Copy, Hash)] pub enum Elem { F32, F16, + F162, BF16, + BF162, I32, U32, Bool, } -#[derive(Debug, Clone, PartialEq, Eq, Copy)] -pub enum Item { - Vec4(Elem), - Vec3(Elem), - Vec2(Elem), - Scalar(Elem), +#[derive(Debug, Clone, PartialEq, Eq, Copy, Hash)] +pub struct Item { + pub(crate) elem: Elem, + pub(crate) vectorization: usize, } impl Display for Elem { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Elem::F16 => f.write_str("__half"), + Elem::F162 => f.write_str("__half2"), Elem::F32 => f.write_str("float"), Elem::BF16 => f.write_str("__nv_bfloat16"), + Elem::BF162 => f.write_str("__nv_bfloat162"), Elem::I32 => f.write_str("int"), Elem::U32 => f.write_str("uint"), Elem::Bool => f.write_str("bool"), @@ -37,33 +39,11 @@ impl Display for Elem { impl Display for Item { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Item::Vec4(elem) => match elem { - Elem::F32 => f.write_str("float4"), - Elem::I32 => f.write_str("int4"), - 
Elem::U32 => f.write_str("uint4"), - Elem::Bool => f.write_str("bool4"), - Elem::BF16 => f.write_str("__nv_bfloat164"), - Elem::F16 => f.write_str("__half4"), - }, - Item::Vec3(elem) => match elem { - Elem::F32 => f.write_str("float3"), - Elem::I32 => f.write_str("int3"), - Elem::U32 => f.write_str("uint3"), - Elem::Bool => f.write_str("bool3"), - Elem::BF16 => f.write_str("__nv_bfloat164"), - Elem::F16 => f.write_str("__half3"), - }, - Item::Vec2(elem) => match elem { - Elem::F32 => f.write_str("float2"), - Elem::I32 => f.write_str("int2"), - Elem::U32 => f.write_str("uint2"), - Elem::Bool => f.write_str("bool2"), - Elem::BF16 => f.write_str("__nv_bfloat162"), - Elem::F16 => f.write_str("__half2"), - }, - Item::Scalar(elem) => f.write_fmt(format_args!("{elem}")), + if 1 == self.vectorization { + return f.write_fmt(format_args!("{}", self.elem)); } + + return f.write_fmt(format_args!("{}_{}", self.elem, self.vectorization)); } } @@ -95,38 +75,38 @@ impl Component for Variable { item, depth: _, } => *item, - Variable::ConstantScalar(_, e) => Item::Scalar(*e), - Variable::GlobalScalar(_, e, _) => Item::Scalar(*e), - Variable::IdxGlobal => Item::Scalar(Elem::U32), - Variable::ThreadIdxGlobal => Item::Scalar(Elem::U32), - Variable::ThreadIdxX => Item::Scalar(Elem::U32), - Variable::ThreadIdxY => Item::Scalar(Elem::U32), - Variable::ThreadIdxZ => Item::Scalar(Elem::U32), - Variable::Rank => Item::Scalar(Elem::U32), + Variable::ConstantScalar(_, e) => Item::scalar(*e), + Variable::GlobalScalar(_, e, _) => Item::scalar(*e), + Variable::IdxGlobal => Item::scalar(Elem::U32), + Variable::ThreadIdxGlobal => Item::scalar(Elem::U32), + Variable::ThreadIdxX => Item::scalar(Elem::U32), + Variable::ThreadIdxY => Item::scalar(Elem::U32), + Variable::ThreadIdxZ => Item::scalar(Elem::U32), + Variable::Rank => Item::scalar(Elem::U32), Variable::LocalScalar { id: _, elem, depth: _, - } => Item::Scalar(*elem), - Variable::BlockIdxX => Item::Scalar(Elem::U32), - Variable::BlockIdxY => Item::Scalar(Elem::U32), - Variable::BlockIdxZ => Item::Scalar(Elem::U32), - Variable::AbsoluteIdxX => Item::Scalar(Elem::U32), - Variable::AbsoluteIdxY => Item::Scalar(Elem::U32), - Variable::AbsoluteIdxZ => Item::Scalar(Elem::U32), - Variable::BlockDimX => Item::Scalar(Elem::U32), - Variable::BlockDimY => Item::Scalar(Elem::U32), - Variable::BlockDimZ => Item::Scalar(Elem::U32), - Variable::GridDimX => Item::Scalar(Elem::U32), - Variable::GridDimY => Item::Scalar(Elem::U32), - Variable::GridDimZ => Item::Scalar(Elem::U32), + } => Item::scalar(*elem), + Variable::BlockIdxX => Item::scalar(Elem::U32), + Variable::BlockIdxY => Item::scalar(Elem::U32), + Variable::BlockIdxZ => Item::scalar(Elem::U32), + Variable::AbsoluteIdxX => Item::scalar(Elem::U32), + Variable::AbsoluteIdxY => Item::scalar(Elem::U32), + Variable::AbsoluteIdxZ => Item::scalar(Elem::U32), + Variable::BlockDimX => Item::scalar(Elem::U32), + Variable::BlockDimY => Item::scalar(Elem::U32), + Variable::BlockDimZ => Item::scalar(Elem::U32), + Variable::GridDimX => Item::scalar(Elem::U32), + Variable::GridDimY => Item::scalar(Elem::U32), + Variable::GridDimZ => Item::scalar(Elem::U32), Variable::LocalArray(_, e, _, _) => *e, - Variable::WarpSize => Item::Scalar(Elem::U32), + Variable::WarpSize => Item::scalar(Elem::U32), Variable::WmmaFragment { id: _, frag, depth: _, - } => Item::Scalar(frag.elem), + } => Item::scalar(frag.elem), } } } @@ -242,6 +222,47 @@ impl Display for Variable { } impl Variable { + pub fn is_optimized(&self) -> bool { + let item = self.item(); + 
match item.elem { + Elem::F162 => true, + Elem::BF162 => true, + _ => false, + } + } + pub fn optimized(&self) -> Self { + match self { + Variable::GlobalInputArray(id, item) => { + Variable::GlobalInputArray(*id, item.optimized()) + } + Variable::GlobalOutputArray(id, item) => { + Variable::GlobalOutputArray(*id, item.optimized()) + } + Variable::Local { id, item, depth } => Variable::Local { + id: *id, + item: item.optimized(), + depth: *depth, + }, + Variable::Slice { id, item, depth } => Variable::Slice { + id: *id, + item: item.optimized(), + depth: *depth, + }, + Variable::SharedMemory(id, item, size) => Variable::SharedMemory( + *id, + item.optimized(), + size / 2, // TODO: Fix + ), + Variable::LocalArray(id, item, vec, size) => Variable::LocalArray( + *id, + item.optimized(), + *vec, + size / 2, // TODO: Fix + ), + _ => self.clone(), + } + } + pub fn is_always_scalar(&self) -> bool { match self { Variable::GlobalScalar(_, _, _) => true, @@ -292,54 +313,78 @@ impl Variable { } } - pub fn index(&self, index: usize) -> IndexedVariable { - IndexedVariable { var: *self, index } + pub fn index(&self, index: usize, optimized: bool) -> IndexedVariable { + IndexedVariable { + var: *self, + index, + optimized, + } } } #[derive(Debug, Clone)] pub struct IndexedVariable { var: Variable, + optimized: bool, index: usize, } impl Display for IndexedVariable { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let var = &self.var; - let item = self.var.item(); - match item { - Item::Vec4(_) => match self.index { - 0 => f.write_fmt(format_args!("{var}.x"))?, - 1 => f.write_fmt(format_args!("{var}.y"))?, - 2 => f.write_fmt(format_args!("{var}.z"))?, - 3 => f.write_fmt(format_args!("{var}.w"))?, - _ => unreachable!(), - }, - Item::Vec3(_) => match self.index { - 0 => f.write_fmt(format_args!("{var}.x"))?, - 1 => f.write_fmt(format_args!("{var}.y"))?, - 2 => f.write_fmt(format_args!("{var}.z"))?, - _ => unreachable!(), - }, - Item::Vec2(_) => match self.index { - 0 => f.write_fmt(format_args!("{var}.x"))?, - 1 => f.write_fmt(format_args!("{var}.y"))?, - _ => unreachable!(), - }, - Item::Scalar(_) => f.write_fmt(format_args!("{var}"))?, + if self.var.item().vectorization > 1 { + if self.optimized { + let item = self.var.item(); + f.write_fmt(format_args!( + "(reinterpret_cast<{item}*>(&{var}))->i_{}", + self.index + )) + } else { + f.write_fmt(format_args!("{var}.i_{}", self.index)) + } + } else { + f.write_fmt(format_args!("{var}")) } - - Ok(()) } } impl Item { pub fn elem(&self) -> &Elem { - match self { - Item::Vec4(e) => e, - Item::Vec3(e) => e, - Item::Vec2(e) => e, - Item::Scalar(e) => e, + &self.elem + } + + pub fn new(elem: Elem, vectorization: usize) -> Self { + Self { + elem, + vectorization, + } + } + pub fn scalar(elem: Elem) -> Self { + Self { + elem, + vectorization: 1, + } + } + + pub fn optimized(&self) -> Item { + if self.vectorization == 1 { + return self.clone(); + } + + if self.vectorization % 2 != 0 { + return self.clone(); + } + + match self.elem { + Elem::F16 => Item { + elem: Elem::F162, + vectorization: self.vectorization / 2, + }, + Elem::BF16 => Item { + elem: Elem::BF162, + vectorization: self.vectorization / 2, + }, + _ => self.clone(), } } } @@ -347,9 +392,11 @@ impl Item { impl Elem { pub fn size(&self) -> usize { match self { - Self::F32 => core::mem::size_of::(), Self::F16 => core::mem::size_of::(), + Self::F162 => 2 * core::mem::size_of::(), + Self::BF162 => 2 * core::mem::size_of::(), Self::BF16 => core::mem::size_of::(), + Self::F32 => 
core::mem::size_of::(), Self::I32 => core::mem::size_of::(), Self::U32 => core::mem::size_of::(), Self::Bool => core::mem::size_of::(), diff --git a/crates/cubecl-cuda/src/compiler/instruction.rs b/crates/cubecl-cuda/src/compiler/instruction.rs index 8d6ae6e10..c36268a53 100644 --- a/crates/cubecl-cuda/src/compiler/instruction.rs +++ b/crates/cubecl-cuda/src/compiler/instruction.rs @@ -262,16 +262,14 @@ for (uint {i} = {start}; {i} < {end}; {i}++) {{ Variable::GlobalOutputArray(index, _) => *index as usize + num_inputs, _ => panic!("Can only know the len of a global array."), } + 1; - let factor = match input.item() { - super::Item::Vec4(_) => 4, - super::Item::Vec3(_) => 3, - super::Item::Vec2(_) => 2, - super::Item::Scalar(_) => { - return f.write_fmt(format_args!( - "{out} = info[({offset} * 2 * info[0]) + {index}];\n" - )) - } - }; + let factor = input.item().vectorization; + + if factor == 1 { + return f.write_fmt(format_args!( + "{out} = info[({offset} * 2 * info[0]) + {index}];\n" + )); + } + f.write_fmt(format_args!( "{out} = info[({offset} * 2 * info[0]) + {index}] / {factor};\n" )) @@ -293,18 +291,13 @@ impl Fma { c: &Variable, out: &Variable, ) -> core::fmt::Result { - let num = match out.item() { - super::Item::Vec4(_) => 4, - super::Item::Vec3(_) => 3, - super::Item::Vec2(_) => 2, - super::Item::Scalar(_) => 1, - }; + let num = out.item().vectorization as usize; for i in 0..num { - let ai = a.index(i); - let bi = b.index(i); - let ci = c.index(i); - let outi = out.index(i); + let ai = a.index(i, false); + let bi = b.index(i, false); + let ci = c.index(i, false); + let outi = out.index(i, false); f.write_fmt(format_args!("{outi} = fma({ai}, {bi}, {ci});\n"))?; } diff --git a/crates/cubecl-cuda/src/compiler/kernel.rs b/crates/cubecl-cuda/src/compiler/kernel.rs index 0251b7945..3b56ebd3f 100644 --- a/crates/cubecl-cuda/src/compiler/kernel.rs +++ b/crates/cubecl-cuda/src/compiler/kernel.rs @@ -1,6 +1,6 @@ use super::{Body, Item}; use cubecl_core::{ir::CubeDim, CompilerRepresentation}; -use std::fmt::Display; +use std::{collections::HashSet, fmt::Display}; #[derive(Debug, PartialEq, Eq, Clone)] pub struct Binding { @@ -50,6 +50,7 @@ pub struct ComputeKernel { pub wmma_activated: bool, pub bf16: bool, pub f16: bool, + pub items: HashSet, } impl CompilerRepresentation for ComputeKernel { @@ -57,13 +58,7 @@ impl CompilerRepresentation for ComputeKernel { let mut current = 0usize; for var in self.body.shared_memories.iter() { - let factor = match var.item { - Item::Vec4(_) => 4, - Item::Vec3(_) => 3, - Item::Vec2(_) => 2, - Item::Scalar(_) => 1, - }; - + let factor = var.item.vectorization; let elem_size_bytes = var.item.elem().size(); current += (var.size as usize) * factor * elem_size_bytes; } @@ -89,30 +84,35 @@ impl Display for ComputeKernel { f.write_str("using namespace nvcuda;\n")?; } - if self.f16 { - f.write_str( - " -extern \"C\" struct __half4 { - __half x; - __half y; - __half z; - __half w; -}; -", - )?; + f.write_str( + " +typedef unsigned int uint; + ", + )?; + + for item in self.items.iter() { + let elem = item.elem; + let size = item.vectorization; + let alignment = elem.size() * size; + if size > 1 { + f.write_fmt(format_args!( + " +struct __align__({alignment}) {item} {{" + ))?; + + for i in 0..size { + f.write_fmt(format_args!( + " + {elem} i_{i};" + ))?; + } + + f.write_str("\n};\n")?; + } } f.write_fmt(format_args!( " -typedef unsigned int uint; - -extern \"C\" struct bool4 {{ - bool x; - bool y; - bool z; - bool w; -}}; - extern \"C\" __global__ void kernel( ", 
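For reference, a hedged sketch of the preamble the new kernel header generation above would emit. The struct names, alignment values, and field names follow the `{elem}_{vectorization}` and `i_{index}` scheme from the `Display` impls in this patch; the concrete items shown (a float item with vectorization 4 and a __half item stored in its packed __half2 form) are illustrative assumptions, not output from a real compilation:

    // Illustrative emitted preamble only (assumed items: float x4, __half x4 packed as __half2 x2).
    typedef unsigned int uint;

    // alignment = elem.size() * vectorization = 4 * 4 = 16
    struct __align__(16) float_4 {
        float i_0;
        float i_1;
        float i_2;
        float i_3;
    };

    // A __half item with vectorization 4 also gets an optimized twin:
    // two __half2 lanes, alignment = 4 * 2 = 8, named __half2_2.
    struct __align__(8) __half2_2 {
        __half2 i_0;
        __half2 i_1;
    };
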
diff --git a/crates/cubecl-cuda/src/compiler/unary.rs b/crates/cubecl-cuda/src/compiler/unary.rs index 641c4835d..a368bc9a0 100644 --- a/crates/cubecl-cuda/src/compiler/unary.rs +++ b/crates/cubecl-cuda/src/compiler/unary.rs @@ -1,4 +1,4 @@ -use super::{Component, Elem, InstructionSettings, Item, Variable}; +use super::{Component, Elem, Variable}; use std::fmt::Display; pub trait Unary { @@ -8,36 +8,8 @@ pub trait Unary { out: &Variable, ) -> std::fmt::Result { let item = out.item(); - let settings = Self::settings(*item.elem()); - - match item { - Item::Vec4(elem) => { - if settings.native_vec4 { - Self::format_native_vec4(f, input, out, elem) - } else { - Self::unroll_vec4(f, input, out, elem) - } - } - Item::Vec3(elem) => { - if settings.native_vec3 { - Self::format_native_vec3(f, input, out, elem) - } else { - Self::unroll_vec3(f, input, out, elem) - } - } - Item::Vec2(elem) => { - if settings.native_vec2 { - Self::format_native_vec2(f, input, out, elem) - } else { - Self::unroll_vec2(f, input, out, elem) - } - } - Item::Scalar(elem) => Self::format_scalar(f, *input, *out, elem), - } - } - fn settings(_elem: Elem) -> InstructionSettings { - InstructionSettings::default() + Self::unroll_vec(f, input, out, item.elem, item.vectorization) } fn format_scalar( @@ -50,70 +22,30 @@ pub trait Unary { Input: Component, Out: Component; - fn format_native_vec4( - f: &mut std::fmt::Formatter<'_>, - input: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::format_scalar(f, *input, *out, elem) - } - - fn format_native_vec3( - f: &mut std::fmt::Formatter<'_>, - input: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::format_scalar(f, *input, *out, elem) - } - - fn format_native_vec2( - f: &mut std::fmt::Formatter<'_>, - input: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::format_scalar(f, *input, *out, elem) - } - - fn unroll_vec2( + fn unroll_vec( f: &mut std::fmt::Formatter<'_>, input: &Variable, out: &Variable, elem: Elem, + index: usize, ) -> std::fmt::Result { - Self::unroll_vec(f, input, out, elem, 2) - } + if index == 1 { + return Self::format_scalar(f, *input, *out, elem); + } - fn unroll_vec3( - f: &mut std::fmt::Formatter<'_>, - input: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::unroll_vec(f, input, out, elem, 3) - } + // let input_optimized = input.optimized(); + // let output_optimized = out.optimized(); + // let optimized = input_optimized.elem() == output_optimized.elem(); - fn unroll_vec4( - f: &mut std::fmt::Formatter<'_>, - input: &Variable, - out: &Variable, - elem: Elem, - ) -> std::fmt::Result { - Self::unroll_vec(f, input, out, elem, 4) - } + // let (input, out) = if optimized { + // (input_optimized, output_optimized) + // } else { + // (*input, *out) + // }; - fn unroll_vec( - f: &mut std::fmt::Formatter<'_>, - input: &Variable, - out: &Variable, - elem: Elem, - index: usize, - ) -> std::fmt::Result { for i in 0..index { - let inputi = input.index(i); - let outi = out.index(i); + let inputi = input.index(i, false); + let outi = out.index(i, false); Self::format_scalar(f, inputi, outi, elem)?; } diff --git a/crates/cubecl/Cargo.toml b/crates/cubecl/Cargo.toml index 83c0935ee..0d129d7bb 100644 --- a/crates/cubecl/Cargo.toml +++ b/crates/cubecl/Cargo.toml @@ -36,3 +36,7 @@ cubecl-linalg = { path = "../cubecl-linalg", version = "0.1.1", default-features [[bench]] name = "matmul" harness = false + +[[bench]] +name = "unary" +harness = false diff --git 
a/crates/cubecl/benches/matmul.rs b/crates/cubecl/benches/matmul.rs index ef81c4d8a..65e31cafb 100644 --- a/crates/cubecl/benches/matmul.rs +++ b/crates/cubecl/benches/matmul.rs @@ -80,11 +80,13 @@ fn run(device: R::Device, kind: MatmulKind) { } fn main() { - #[cfg(feature = "wgpu")] - run::(Default::default(), MatmulKind::Tiling2d); + // #[cfg(feature = "wgpu")] + // run::(Default::default(), MatmulKind::Tiling2d); + // #[cfg(feature = "cuda")] + // run::(Default::default(), MatmulKind::Tiling2d); #[cfg(feature = "cuda")] - run::(Default::default(), MatmulKind::Tiling2d); + run::(Default::default(), MatmulKind::Tiling2d); - #[cfg(feature = "cuda")] - run::(Default::default(), MatmulKind::Cmma); + // #[cfg(feature = "cuda")] + // run::(Default::default(), MatmulKind::Cmma); } diff --git a/crates/cubecl/benches/unary.rs b/crates/cubecl/benches/unary.rs new file mode 100644 index 000000000..74a32467c --- /dev/null +++ b/crates/cubecl/benches/unary.rs @@ -0,0 +1,111 @@ +use cubecl::{ + calculate_cube_count_elemwise, prelude::*, tensor_vectorization_factor, SUBCUBE_DIM_APPROX, +}; +use std::marker::PhantomData; + +use cubecl::benchmark::Benchmark; +use cubecl::client::SyncType; +use cubecl::frontend::Float; +use cubecl_linalg::tensor::TensorHandle; + +#[cube(launch)] +fn execute(lhs: &Tensor, rhs: &Tensor, out: &mut Tensor) { + if ABSOLUTE_POS < out.len() { + for i in range(0, 160, Comptime::new(false)) { + out[ABSOLUTE_POS] += F::erf(lhs[ABSOLUTE_POS] * rhs[ABSOLUTE_POS] + F::cast_from(i)); + } + } +} + +impl Benchmark for UnaryBench { + type Args = (TensorHandle, TensorHandle, TensorHandle); + + fn prepare(&self) -> Self::Args { + let client = R::client(&self.device); + let lhs = TensorHandle::zeros(client.clone(), self.shape.clone()); + let rhs = TensorHandle::zeros(client.clone(), self.shape.clone()); + let out = TensorHandle::zeros(client.clone(), self.shape.clone()); + + (lhs, rhs, out) + } + + fn execute(&self, (lhs, rhs, out): Self::Args) { + let num_elems: usize = out.shape.iter().product(); + + let cube_count = + calculate_cube_count_elemwise::(num_elems / self.vectorization as usize, 16); + + execute::launch::( + &self.client, + cube_count, + CubeDim::new(16, 16, 1), + TensorArg::vectorized(self.vectorization, &lhs.handle, &lhs.strides, &lhs.shape), + TensorArg::vectorized(self.vectorization, &rhs.handle, &rhs.strides, &rhs.shape), + TensorArg::vectorized(self.vectorization, &out.handle, &out.strides, &out.shape), + ) + } + + fn num_samples(&self) -> usize { + 10 + } + + fn name(&self) -> String { + format!( + "unary-{}-{}-{:?}", + R::name(), + E::as_elem(), + self.vectorization + ) + .to_lowercase() + } + + fn sync(&self) { + self.client.sync(SyncType::Wait); + } +} + +#[allow(dead_code)] +struct UnaryBench { + shape: Vec, + vectorization: u8, + device: R::Device, + client: ComputeClient, + _e: PhantomData, +} + +#[allow(dead_code)] +#[derive(Debug)] +enum MatmulKind { + Tiling2d, + Cmma, +} + +#[allow(dead_code)] +fn run(device: R::Device, vectorization: u8) { + let bench = UnaryBench:: { + shape: vec![32, 512, 2048], + vectorization, + client: R::client(&device), + device, + _e: PhantomData, + }; + println!("{}", bench.name()); + println!("{}", bench.run()); +} + +fn main() { + #[cfg(feature = "cuda")] + run::(Default::default(), 1); + #[cfg(feature = "cuda")] + run::(Default::default(), 4); + #[cfg(feature = "cuda")] + run::(Default::default(), 1); + #[cfg(feature = "cuda")] + run::(Default::default(), 4); + #[cfg(feature = "cuda")] + run::(Default::default(), 8); + 
#[cfg(feature = "wgpu")] + run::(Default::default(), 4); + // #[cfg(feature = "cuda")] + // run::(Default::default(), 8); +} From cefea1c15afbc7325fc925ab95e3a48991361ac8 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Mon, 22 Jul 2024 20:19:10 -0400 Subject: [PATCH 2/4] WIP --- crates/cubecl-core/src/runtime_tests/mod.rs | 2 + .../cubecl-core/src/runtime_tests/topology.rs | 57 +++++++++++++++++++ crates/cubecl-cuda/src/compiler/binary.rs | 30 ++++------ crates/cubecl-cuda/src/compiler/element.rs | 50 ++++++++++++++++ crates/cubecl-cuda/src/compiler/kernel.rs | 8 +++ crates/cubecl-cuda/src/compiler/unary.rs | 34 +++++------ crates/cubecl-cuda/src/compute/server.rs | 1 + crates/cubecl-cuda/src/runtime.rs | 2 +- crates/cubecl-linalg/src/tensor/base.rs | 10 ++-- crates/cubecl/benches/matmul.rs | 6 +- crates/cubecl/benches/unary.rs | 28 ++++----- 11 files changed, 165 insertions(+), 63 deletions(-) create mode 100644 crates/cubecl-core/src/runtime_tests/topology.rs diff --git a/crates/cubecl-core/src/runtime_tests/mod.rs b/crates/cubecl-core/src/runtime_tests/mod.rs index e445a75f3..005c5bb3c 100644 --- a/crates/cubecl-core/src/runtime_tests/mod.rs +++ b/crates/cubecl-core/src/runtime_tests/mod.rs @@ -3,6 +3,7 @@ pub mod cmma; pub mod launch; pub mod slice; pub mod subcube; +pub mod topology; #[allow(missing_docs)] #[macro_export] @@ -15,5 +16,6 @@ macro_rules! testgen_all { cubecl_core::testgen_cmma!(); cubecl_core::testgen_slice!(); cubecl_core::testgen_assign!(); + cubecl_core::testgen_topology!(); }; } diff --git a/crates/cubecl-core/src/runtime_tests/topology.rs b/crates/cubecl-core/src/runtime_tests/topology.rs new file mode 100644 index 000000000..0d469b8b3 --- /dev/null +++ b/crates/cubecl-core/src/runtime_tests/topology.rs @@ -0,0 +1,57 @@ +use crate as cubecl; + +use cubecl::prelude::*; + +#[cube(launch)] +pub fn kernel_absolute_pos(output1: &mut Array, output2: &mut Array) { + if ABSOLUTE_POS >= output1.len() { + return; + } + + output1[ABSOLUTE_POS] = ABSOLUTE_POS; + output2[ABSOLUTE_POS] = ABSOLUTE_POS; +} + +pub fn test_kernel_topology_absolute_pos(client: ComputeClient) { + let cube_count = (300, 5, 7); + let cube_dim = (32, 32, 1); + let extra: u32 = 3u32; + + let length = + (cube_count.0 * cube_count.1 * cube_count.2 * cube_dim.0 * cube_dim.1 * cube_dim.2) + extra; + let handle1 = client.empty(length as usize * core::mem::size_of::()); + let handle2 = client.empty(length as usize * core::mem::size_of::()); + + kernel_absolute_pos::launch::( + &client, + CubeCount::Static(cube_count.0, cube_count.1, cube_count.2), + CubeDim::new(cube_dim.0, cube_dim.1, cube_dim.2), + ArrayArg::new(&handle1, length as usize), + ArrayArg::new(&handle2, length as usize), + ); + + let actual = client.read(handle1.binding()); + let actual = u32::from_bytes(&actual); + let mut expect: Vec = (0..length - extra).into_iter().map(|v| v as u32).collect(); + expect.push(0); + expect.push(0); + expect.push(0); + + assert_eq!(actual, &expect); +} + +#[allow(missing_docs)] +#[macro_export] +macro_rules! 
testgen_topology { + () => { + use super::*; + + #[test] + fn test_topology_scalar() { + let client = TestRuntime::client(&Default::default()); + cubecl_core::runtime_tests::topology::test_kernel_topology_absolute_pos::( + client, + ); + } + }; +} diff --git a/crates/cubecl-cuda/src/compiler/binary.rs b/crates/cubecl-cuda/src/compiler/binary.rs index 8eec9c251..674fd75a7 100644 --- a/crates/cubecl-cuda/src/compiler/binary.rs +++ b/crates/cubecl-cuda/src/compiler/binary.rs @@ -8,7 +8,7 @@ pub trait Binary { rhs: &Variable, out: &Variable, ) -> std::fmt::Result { - let item = out.item(); + let item = out.item().de_optimized(); Self::unroll_vec(f, lhs, rhs, out, item.elem, item.vectorization.into()) } @@ -29,34 +29,24 @@ pub trait Binary { lhs: &Variable, rhs: &Variable, out: &Variable, - mut elem: Elem, + elem: Elem, index: usize, ) -> core::fmt::Result { if index == 1 { return Self::format_scalar(f, *lhs, *rhs, *out, elem); } - let lhs_optimized = lhs.optimized(); - let rhs_optimized = rhs.optimized(); - let out_optimized = out.optimized(); - let lhs_elem = lhs_optimized.elem(); - let optimized = lhs_elem == rhs_optimized.elem() - && lhs_elem == out_optimized.elem() - && lhs_elem != elem - && lhs_optimized.is_optimized(); - - let (lhs, rhs, out, index) = if optimized { - let factor = lhs.item().vectorization / lhs_optimized.item().vectorization; - elem = lhs_elem; - (lhs_optimized, rhs_optimized, out_optimized, index / factor) - } else { - (*lhs, *rhs, *out, index) + let optimized = Variable::optimized_args([*lhs, *rhs, *out]); + let [lhs, rhs, out] = optimized.args; + let (is_optimized, index) = match optimized.optimization_factor { + Some(factor) => (true, index / factor), + None => (false, index), }; for i in 0..index { - let lhsi = lhs.index(i, optimized); - let rhsi = rhs.index(i, optimized); - let outi = out.index(i, optimized); + let lhsi = lhs.index(i, is_optimized); + let rhsi = rhs.index(i, is_optimized); + let outi = out.index(i, is_optimized); Self::format_scalar(f, lhsi, rhsi, outi, elem)?; } diff --git a/crates/cubecl-cuda/src/compiler/element.rs b/crates/cubecl-cuda/src/compiler/element.rs index a31c5906b..90eca63e2 100644 --- a/crates/cubecl-cuda/src/compiler/element.rs +++ b/crates/cubecl-cuda/src/compiler/element.rs @@ -43,6 +43,14 @@ impl Display for Item { return f.write_fmt(format_args!("{}", self.elem)); } + if 2 == self.vectorization { + match self.elem { + Elem::F16 => return f.write_fmt(format_args!("{}", Elem::F162)), + Elem::BF16 => return f.write_fmt(format_args!("{}", Elem::BF162)), + _ => (), + } + } + return f.write_fmt(format_args!("{}_{}", self.elem, self.vectorization)); } } @@ -221,6 +229,12 @@ impl Display for Variable { } } +#[derive(new)] +pub struct OptimizedArgs { + pub args: [Variable; N], + pub optimization_factor: Option, +} + impl Variable { pub fn is_optimized(&self) -> bool { let item = self.item(); @@ -230,6 +244,34 @@ impl Variable { _ => false, } } + + pub fn optimized_args(args: [Self; N]) -> OptimizedArgs { + let args_after = args.map(|a| a.optimized()); + + let item_reference_after = args_after[0].item(); + + let is_optimized = args_after + .iter() + .all(|var| var.elem() == item_reference_after.elem && var.is_optimized()); + + if is_optimized { + let vectorization_before = args + .iter() + .map(|var| var.item().vectorization) + .max() + .unwrap(); + let vectorization_after = args_after + .iter() + .map(|var| var.item().vectorization) + .max() + .unwrap(); + + OptimizedArgs::new(args_after, Some(vectorization_before / vectorization_after)) 
+ } else { + OptimizedArgs::new(args, None) + } + } + pub fn optimized(&self) -> Self { match self { Variable::GlobalInputArray(id, item) => { @@ -353,6 +395,14 @@ impl Item { &self.elem } + pub fn de_optimized(&self) -> Self { + match self.elem { + Elem::F162 => Item::new(Elem::F16, self.vectorization * 2), + Elem::BF162 => Item::new(Elem::BF16, self.vectorization * 2), + _ => return *self, + } + } + pub fn new(elem: Elem, vectorization: usize) -> Self { Self { elem, diff --git a/crates/cubecl-cuda/src/compiler/kernel.rs b/crates/cubecl-cuda/src/compiler/kernel.rs index 3b56ebd3f..2b5b0b0df 100644 --- a/crates/cubecl-cuda/src/compiler/kernel.rs +++ b/crates/cubecl-cuda/src/compiler/kernel.rs @@ -91,6 +91,14 @@ typedef unsigned int uint; )?; for item in self.items.iter() { + if item.vectorization == 2 { + match item.elem { + super::Elem::BF16 => continue, + super::Elem::F16 => continue, + _ => (), + }; + } + let elem = item.elem; let size = item.vectorization; let alignment = elem.size() * size; diff --git a/crates/cubecl-cuda/src/compiler/unary.rs b/crates/cubecl-cuda/src/compiler/unary.rs index a368bc9a0..e7237c87a 100644 --- a/crates/cubecl-cuda/src/compiler/unary.rs +++ b/crates/cubecl-cuda/src/compiler/unary.rs @@ -29,23 +29,16 @@ pub trait Unary { elem: Elem, index: usize, ) -> std::fmt::Result { - if index == 1 { - return Self::format_scalar(f, *input, *out, elem); - } - - // let input_optimized = input.optimized(); - // let output_optimized = out.optimized(); - // let optimized = input_optimized.elem() == output_optimized.elem(); - - // let (input, out) = if optimized { - // (input_optimized, output_optimized) - // } else { - // (*input, *out) - // }; + let optimized = Variable::optimized_args([*input, *out]); + let [input, out] = optimized.args; + let (is_optimized, index, elem) = match optimized.optimization_factor { + Some(factor) => (true, index / factor, out.elem()), + None => (false, index, elem), + }; for i in 0..index { - let inputi = input.index(i, false); - let outi = out.index(i, false); + let inputi = input.index(i, is_optimized); + let outi = out.index(i, is_optimized); Self::format_scalar(f, inputi, outi, elem)?; } @@ -63,9 +56,16 @@ macro_rules! 
function { f: &mut std::fmt::Formatter<'_>, input: Input, out: Out, - _elem: Elem, + elem: Elem, ) -> std::fmt::Result { - f.write_fmt(format_args!("{out} = {}({input});\n", $func)) + match elem { + Elem::F16 => f.write_fmt(format_args!("{out} = h{}({input});\n", $func)), + Elem::F162 => f.write_fmt(format_args!("{out} = h2{}({input});\n", $func)), + Elem::BF16 => f.write_fmt(format_args!("{out} = h{}({input});\n", $func)), + Elem::BF162 => f.write_fmt(format_args!("{out} = h2{}({input});\n", $func)), + Elem::F32 => f.write_fmt(format_args!("{out} = __{}f({input});\n", $func)), + _ => f.write_fmt(format_args!("{out} = {}({input});\n", $func)), + } } } }; diff --git a/crates/cubecl-cuda/src/compute/server.rs b/crates/cubecl-cuda/src/compute/server.rs index c22b067b1..9d1fc8e38 100644 --- a/crates/cubecl-cuda/src/compute/server.rs +++ b/crates/cubecl-cuda/src/compute/server.rs @@ -188,6 +188,7 @@ impl> CudaContext { fn compile_kernel(&mut self, kernel_id: &str, kernel: Box, arch: i32) { let kernel_compiled = kernel.compile(); + // println!("{}", kernel_compiled.source); let shared_mem_bytes = kernel_compiled.shared_mem_bytes; let cube_dim = kernel_compiled.cube_dim; let arch = format!("--gpu-architecture=sm_{}", arch); diff --git a/crates/cubecl-cuda/src/runtime.rs b/crates/cubecl-cuda/src/runtime.rs index 1a5cf7dcf..a0c7186f8 100644 --- a/crates/cubecl-cuda/src/runtime.rs +++ b/crates/cubecl-cuda/src/runtime.rs @@ -47,7 +47,7 @@ impl Runtime for CudaRuntime { ) .unwrap(); let storage = CudaStorage::new(stream); - let options = DynamicMemoryManagementOptions::preset(2048 * 1024 * 1024, 32); + let options = DynamicMemoryManagementOptions::preset(2048 * 1024 * 1024, 128); let memory_management = DynamicMemoryManagement::new(storage, options); CudaContext::new(memory_management, stream, ctx) } diff --git a/crates/cubecl-linalg/src/tensor/base.rs b/crates/cubecl-linalg/src/tensor/base.rs index 5716c711c..3cd6c93e8 100644 --- a/crates/cubecl-linalg/src/tensor/base.rs +++ b/crates/cubecl-linalg/src/tensor/base.rs @@ -109,7 +109,7 @@ where R: Runtime, E: Numeric, { - pub fn zeros(client: ComputeClient, shape: Vec) -> Self { + pub fn zeros(client: &ComputeClient, shape: Vec) -> Self { let num_elements: usize = shape.iter().product(); let size = E::as_elem().size(); @@ -125,7 +125,7 @@ where ); init::zeros_array::launch::( - &client, + client, cube_count, CubeDim::default(), ArrayArg::new(&handle, num_elements), @@ -141,8 +141,10 @@ pub(crate) mod init { #[cube(launch)] pub fn zeros_array(output: &mut Array) { - if ABSOLUTE_POS < output.len() { - output[ABSOLUTE_POS] = C::from_int(0); + if ABSOLUTE_POS >= output.len() { + return; } + + output[ABSOLUTE_POS] = C::from_int(0); } } diff --git a/crates/cubecl/benches/matmul.rs b/crates/cubecl/benches/matmul.rs index 65e31cafb..47ce625b0 100644 --- a/crates/cubecl/benches/matmul.rs +++ b/crates/cubecl/benches/matmul.rs @@ -13,9 +13,9 @@ impl Benchmark for MatmulBench { fn prepare(&self) -> Self::Args { let (b, m, k, n) = (self.b, self.m, self.k, self.n); let client = R::client(&self.device); - let lhs = TensorHandle::zeros(client.clone(), vec![b, m, k]); - let rhs = TensorHandle::zeros(client.clone(), vec![b, k, n]); - let out = TensorHandle::zeros(client.clone(), vec![b, m, n]); + let lhs = TensorHandle::zeros(&client, vec![b, m, k]); + let rhs = TensorHandle::zeros(&client, vec![b, k, n]); + let out = TensorHandle::zeros(&client, vec![b, m, n]); (lhs, rhs, out) } diff --git a/crates/cubecl/benches/unary.rs b/crates/cubecl/benches/unary.rs index 
74a32467c..03c53f3de 100644 --- a/crates/cubecl/benches/unary.rs +++ b/crates/cubecl/benches/unary.rs @@ -1,6 +1,4 @@ -use cubecl::{ - calculate_cube_count_elemwise, prelude::*, tensor_vectorization_factor, SUBCUBE_DIM_APPROX, -}; +use cubecl::{calculate_cube_count_elemwise, prelude::*}; use std::marker::PhantomData; use cubecl::benchmark::Benchmark; @@ -11,8 +9,8 @@ use cubecl_linalg::tensor::TensorHandle; #[cube(launch)] fn execute(lhs: &Tensor, rhs: &Tensor, out: &mut Tensor) { if ABSOLUTE_POS < out.len() { - for i in range(0, 160, Comptime::new(false)) { - out[ABSOLUTE_POS] += F::erf(lhs[ABSOLUTE_POS] * rhs[ABSOLUTE_POS] + F::cast_from(i)); + for _ in range(0, 256, Comptime::new(false)) { + out[ABSOLUTE_POS] += F::cos(lhs[ABSOLUTE_POS] * rhs[ABSOLUTE_POS]); } } } @@ -22,9 +20,9 @@ impl Benchmark for UnaryBench { fn prepare(&self) -> Self::Args { let client = R::client(&self.device); - let lhs = TensorHandle::zeros(client.clone(), self.shape.clone()); - let rhs = TensorHandle::zeros(client.clone(), self.shape.clone()); - let out = TensorHandle::zeros(client.clone(), self.shape.clone()); + let lhs = TensorHandle::zeros(&client, self.shape.clone()); + let rhs = TensorHandle::zeros(&client, self.shape.clone()); + let out = TensorHandle::zeros(&client, self.shape.clone()); (lhs, rhs, out) } @@ -46,7 +44,7 @@ impl Benchmark for UnaryBench { } fn num_samples(&self) -> usize { - 10 + 100 } fn name(&self) -> String { @@ -95,17 +93,11 @@ fn run(device: R::Device, vectorization: u8) { fn main() { #[cfg(feature = "cuda")] - run::(Default::default(), 1); + run::(Default::default(), 8); #[cfg(feature = "cuda")] run::(Default::default(), 4); - #[cfg(feature = "cuda")] - run::(Default::default(), 1); - #[cfg(feature = "cuda")] - run::(Default::default(), 4); - #[cfg(feature = "cuda")] - run::(Default::default(), 8); + #[cfg(feature = "wgpu")] + run::(Default::default(), 1); #[cfg(feature = "wgpu")] run::(Default::default(), 4); - // #[cfg(feature = "cuda")] - // run::(Default::default(), 8); } From e7089ee06c55533f89596ecaeac08e1c3ac64b20 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Wed, 24 Jul 2024 11:48:13 -0400 Subject: [PATCH 3/4] Cleanup --- .../cubecl-core/src/runtime_tests/topology.rs | 6 +- crates/cubecl-cuda/src/compiler/base.rs | 2 +- crates/cubecl-cuda/src/compiler/binary.rs | 25 ++----- crates/cubecl-cuda/src/compiler/element.rs | 67 +++++++++++-------- .../cubecl-cuda/src/compiler/instruction.rs | 2 +- crates/cubecl-cuda/src/compiler/kernel.rs | 8 +-- crates/cubecl-cuda/src/compute/server.rs | 1 - crates/cubecl-cuda/src/runtime.rs | 2 +- crates/cubecl/benches/matmul.rs | 16 +++-- 9 files changed, 60 insertions(+), 69 deletions(-) diff --git a/crates/cubecl-core/src/runtime_tests/topology.rs b/crates/cubecl-core/src/runtime_tests/topology.rs index 0d469b8b3..1b2df08e3 100644 --- a/crates/cubecl-core/src/runtime_tests/topology.rs +++ b/crates/cubecl-core/src/runtime_tests/topology.rs @@ -13,8 +13,8 @@ pub fn kernel_absolute_pos(output1: &mut Array, output2: &mut Array) } pub fn test_kernel_topology_absolute_pos(client: ComputeClient) { - let cube_count = (300, 5, 7); - let cube_dim = (32, 32, 1); + let cube_count = (3, 5, 7); + let cube_dim = (16, 16, 1); let extra: u32 = 3u32; let length = @@ -32,7 +32,7 @@ pub fn test_kernel_topology_absolute_pos(client: ComputeClient = (0..length - extra).into_iter().map(|v| v as u32).collect(); + let mut expect: Vec = (0..length - extra).collect(); expect.push(0); expect.push(0); expect.push(0); diff --git a/crates/cubecl-cuda/src/compiler/base.rs 
b/crates/cubecl-cuda/src/compiler/base.rs index 8fdc6cf7d..c63cd8632 100644 --- a/crates/cubecl-cuda/src/compiler/base.rs +++ b/crates/cubecl-cuda/src/compiler/base.rs @@ -553,7 +553,7 @@ impl CudaCompiler { fn compile_item(&mut self, item: gpu::Item) -> super::Item { let item = super::Item::new(self.compile_elem(item.elem), item.vectorization.into()); - self.items.insert(item.clone()); + self.items.insert(item); self.items.insert(item.optimized()); item } diff --git a/crates/cubecl-cuda/src/compiler/binary.rs b/crates/cubecl-cuda/src/compiler/binary.rs index 674fd75a7..6b22437c1 100644 --- a/crates/cubecl-cuda/src/compiler/binary.rs +++ b/crates/cubecl-cuda/src/compiler/binary.rs @@ -9,7 +9,7 @@ pub trait Binary { out: &Variable, ) -> std::fmt::Result { let item = out.item().de_optimized(); - Self::unroll_vec(f, lhs, rhs, out, item.elem, item.vectorization.into()) + Self::unroll_vec(f, lhs, rhs, out, item.elem, item.vectorization) } fn format_scalar( @@ -144,31 +144,14 @@ impl Binary for IndexAssign { lhs: Lhs, rhs: Rhs, out: Out, - elem: Elem, + _elem: Elem, ) -> std::fmt::Result where Lhs: Component, Rhs: Component, Out: Component, { - let elem_rhs = rhs.elem(); - // Cast only when necessary. - if elem != elem_rhs { - // if let Elem::Bool = elem_rhs { - // match rhs.item() { - // Item::Vec4(_) => { - // f.write_fmt(format_args!("{out}[{lhs}] = make_uint4({elem}({rhs}.x), {elem}({rhs}.y), {elem}({rhs}.z), {elem}({rhs}.w));\n")) - // }, - // Item::Vec3(_) => todo!(), - // Item::Vec2(_) => todo!(), - // Item::Scalar(_) => todo!(), - // } - // } else { - // } - f.write_fmt(format_args!("{out}[{lhs}] = {elem}({rhs});\n")) - } else { - f.write_fmt(format_args!("{out}[{lhs}] = {rhs};\n")) - } + f.write_fmt(format_args!("{out}[{lhs}] = {rhs};\n")) } fn unroll_vec( @@ -210,7 +193,7 @@ impl Binary for IndexAssign { let elem = out.elem(); let item = lhs.item(); - Self::unroll_vec(f, lhs, rhs, out, elem, item.vectorization.into()) + Self::unroll_vec(f, lhs, rhs, out, elem, item.vectorization) } } diff --git a/crates/cubecl-cuda/src/compiler/element.rs b/crates/cubecl-cuda/src/compiler/element.rs index 90eca63e2..22ce1ec16 100644 --- a/crates/cubecl-cuda/src/compiler/element.rs +++ b/crates/cubecl-cuda/src/compiler/element.rs @@ -43,12 +43,9 @@ impl Display for Item { return f.write_fmt(format_args!("{}", self.elem)); } - if 2 == self.vectorization { - match self.elem { - Elem::F16 => return f.write_fmt(format_args!("{}", Elem::F162)), - Elem::BF16 => return f.write_fmt(format_args!("{}", Elem::BF162)), - _ => (), - } + if self.is_vec_native() { + let elem = self.optimized().elem; + return f.write_fmt(format_args!("{elem}")); } return f.write_fmt(format_args!("{}_{}", self.elem, self.vectorization)); @@ -237,12 +234,7 @@ pub struct OptimizedArgs { impl Variable { pub fn is_optimized(&self) -> bool { - let item = self.item(); - match item.elem { - Elem::F162 => true, - Elem::BF162 => true, - _ => false, - } + self.item().is_optimized() } pub fn optimized_args(args: [Self; N]) -> OptimizedArgs { @@ -290,18 +282,23 @@ impl Variable { item: item.optimized(), depth: *depth, }, - Variable::SharedMemory(id, item, size) => Variable::SharedMemory( - *id, - item.optimized(), - size / 2, // TODO: Fix - ), - Variable::LocalArray(id, item, vec, size) => Variable::LocalArray( - *id, - item.optimized(), - *vec, - size / 2, // TODO: Fix - ), - _ => self.clone(), + Variable::SharedMemory(id, item, size) => { + let before = item.vectorization; + let item = item.optimized(); + let after = item.vectorization; + let 
scaling = (before / after) as u32; + + Variable::SharedMemory(*id, item, size / scaling) + } + Variable::LocalArray(id, item, vec, size) => { + let before = item.vectorization; + let item = item.optimized(); + let after = item.vectorization; + let scaling = (before / after) as u32; + + Variable::LocalArray(*id, item.optimized(), *vec, size / scaling) + } + _ => *self, } } @@ -399,7 +396,7 @@ impl Item { match self.elem { Elem::F162 => Item::new(Elem::F16, self.vectorization * 2), Elem::BF162 => Item::new(Elem::BF16, self.vectorization * 2), - _ => return *self, + _ => *self, } } @@ -416,13 +413,27 @@ impl Item { } } + pub fn is_optimized(&self) -> bool { + matches!(self.elem, Elem::F162 | Elem::BF162) + } + + pub fn is_vec_native(&self) -> bool { + match &self.elem { + Elem::F16 => self.vectorization == 2, + Elem::BF16 => self.vectorization == 2, + Elem::F162 => self.vectorization == 1, + Elem::BF162 => self.vectorization == 1, + _ => false, + } + } + pub fn optimized(&self) -> Item { if self.vectorization == 1 { - return self.clone(); + return *self; } if self.vectorization % 2 != 0 { - return self.clone(); + return *self; } match self.elem { @@ -434,7 +445,7 @@ impl Item { elem: Elem::BF162, vectorization: self.vectorization / 2, }, - _ => self.clone(), + _ => *self, } } } diff --git a/crates/cubecl-cuda/src/compiler/instruction.rs b/crates/cubecl-cuda/src/compiler/instruction.rs index c36268a53..4e30af5c0 100644 --- a/crates/cubecl-cuda/src/compiler/instruction.rs +++ b/crates/cubecl-cuda/src/compiler/instruction.rs @@ -291,7 +291,7 @@ impl Fma { c: &Variable, out: &Variable, ) -> core::fmt::Result { - let num = out.item().vectorization as usize; + let num = out.item().vectorization; for i in 0..num { let ai = a.index(i, false); diff --git a/crates/cubecl-cuda/src/compiler/kernel.rs b/crates/cubecl-cuda/src/compiler/kernel.rs index 2b5b0b0df..c239d154f 100644 --- a/crates/cubecl-cuda/src/compiler/kernel.rs +++ b/crates/cubecl-cuda/src/compiler/kernel.rs @@ -91,12 +91,8 @@ typedef unsigned int uint; )?; for item in self.items.iter() { - if item.vectorization == 2 { - match item.elem { - super::Elem::BF16 => continue, - super::Elem::F16 => continue, - _ => (), - }; + if item.is_vec_native() { + continue; } let elem = item.elem; diff --git a/crates/cubecl-cuda/src/compute/server.rs b/crates/cubecl-cuda/src/compute/server.rs index 9d1fc8e38..c22b067b1 100644 --- a/crates/cubecl-cuda/src/compute/server.rs +++ b/crates/cubecl-cuda/src/compute/server.rs @@ -188,7 +188,6 @@ impl> CudaContext { fn compile_kernel(&mut self, kernel_id: &str, kernel: Box, arch: i32) { let kernel_compiled = kernel.compile(); - // println!("{}", kernel_compiled.source); let shared_mem_bytes = kernel_compiled.shared_mem_bytes; let cube_dim = kernel_compiled.cube_dim; let arch = format!("--gpu-architecture=sm_{}", arch); diff --git a/crates/cubecl-cuda/src/runtime.rs b/crates/cubecl-cuda/src/runtime.rs index a0c7186f8..1a5cf7dcf 100644 --- a/crates/cubecl-cuda/src/runtime.rs +++ b/crates/cubecl-cuda/src/runtime.rs @@ -47,7 +47,7 @@ impl Runtime for CudaRuntime { ) .unwrap(); let storage = CudaStorage::new(stream); - let options = DynamicMemoryManagementOptions::preset(2048 * 1024 * 1024, 128); + let options = DynamicMemoryManagementOptions::preset(2048 * 1024 * 1024, 32); let memory_management = DynamicMemoryManagement::new(storage, options); CudaContext::new(memory_management, stream, ctx) } diff --git a/crates/cubecl/benches/matmul.rs b/crates/cubecl/benches/matmul.rs index 47ce625b0..962df6169 100644 --- 
a/crates/cubecl/benches/matmul.rs +++ b/crates/cubecl/benches/matmul.rs @@ -80,13 +80,15 @@ fn run(device: R::Device, kind: MatmulKind) { } fn main() { - // #[cfg(feature = "wgpu")] - // run::(Default::default(), MatmulKind::Tiling2d); - // #[cfg(feature = "cuda")] - // run::(Default::default(), MatmulKind::Tiling2d); + #[cfg(feature = "wgpu")] + run::(Default::default(), MatmulKind::Tiling2d); + + #[cfg(feature = "cuda")] + run::(Default::default(), MatmulKind::Tiling2d); #[cfg(feature = "cuda")] run::(Default::default(), MatmulKind::Tiling2d); - - // #[cfg(feature = "cuda")] - // run::(Default::default(), MatmulKind::Cmma); + #[cfg(feature = "cuda")] + run::(Default::default(), MatmulKind::Cmma); + #[cfg(feature = "cuda")] + run::(Default::default(), MatmulKind::Cmma); } From b454fc5155d80cc1c466be1436c0083003f5b812 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Wed, 24 Jul 2024 12:00:51 -0400 Subject: [PATCH 4/4] Fix profiling --- profiling/matmul-example/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/profiling/matmul-example/Cargo.toml b/profiling/matmul-example/Cargo.toml index 260689939..bbfa1830e 100644 --- a/profiling/matmul-example/Cargo.toml +++ b/profiling/matmul-example/Cargo.toml @@ -10,9 +10,9 @@ cubecl = { version = "0.1.0", path = "../../crates/cubecl", features = [ "cuda", "linalg", ], optional = true } -burn = { version = "0.13.2", optional = true, features = ["tch"] } -burn-tensor = { version = "0.13.2", optional = true } +burn = { git = "https://github.com/tracel-ai/burn", optional = true, features = ["tch"] } +burn-tensor = { git = "https://github.com/tracel-ai/burn", optional = true } [features] -burn-tch-cuda = ["burn", "burn-tensor"] +burn-tch-cuda = ["burn"] cube-cuda = ["cubecl"]
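
The common thread in these four patches: when an f16/bf16 variable has an even vectorization factor, the compiler reinterprets pairs of lanes as __half2/__nv_bfloat162 so arithmetic can use packed intrinsics instead of one scalar operation per lane. A minimal hand-written sketch of that generated pattern is shown below, assuming a hypothetical __half_4 struct like the one the kernel preamble would emit for a __half item with vectorization 4; the kernel name, parameter names, and the use of pointer indexing instead of the generated i_0/i_1 struct fields are illustrative, while __hmul2 is the standard packed-half multiply from cuda_fp16.h:

    #include <cuda_fp16.h>

    // Assumed layout: four __half lanes, 8-byte aligned, as emitted by the compiler preamble.
    struct __align__(8) __half_4 {
        __half i_0;
        __half i_1;
        __half i_2;
        __half i_3;
    };

    // Illustrative kernel: element-wise multiply of two vectorized half tensors.
    __global__ void mul_packed(const __half_4* lhs, const __half_4* rhs,
                               __half_4* out, unsigned int n) {
        unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= n) return;

        __half_4 l = lhs[idx];
        __half_4 r = rhs[idx];
        __half_4 o;

        // Unoptimized form: four scalar __hmul calls, one per lane.
        // Optimized form (what the reinterpret_cast indexing in element.rs produces):
        // two packed __half2 multiplies covering the same four lanes.
        (reinterpret_cast<__half2*>(&o))[0] =
            __hmul2((reinterpret_cast<const __half2*>(&l))[0],
                    (reinterpret_cast<const __half2*>(&r))[0]);
        (reinterpret_cast<__half2*>(&o))[1] =
            __hmul2((reinterpret_cast<const __half2*>(&l))[1],
                    (reinterpret_cast<const __half2*>(&r))[1]);

        out[idx] = o;
    }

The unary.rs change in the second patch applies the same idea to math functions, dispatching to the h2-prefixed intrinsics (for example h2cos on __half2) when the operands are in the packed representation.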