diff --git a/README.md b/README.md index 567cfbf8c..6110e4f42 100644 --- a/README.md +++ b/README.md @@ -216,6 +216,7 @@ You can even ship the autotune cache with your program, reducing cold start time ## Resource +For now, there are not many learning resources, but you can look at the [linear algebra library](/crates/cubecl-linalg/README.md) to see how CubeCL can be used. If you have any questions or want to contribute, don't hesitate to join the [Discord](https://discord.gg/uPEBbYYDB6). ## Disclaimer & History diff --git a/crates/cubecl-core/src/ir/kernel.rs b/crates/cubecl-core/src/ir/kernel.rs index 5133a79ef..f61309805 100644 --- a/crates/cubecl-core/src/ir/kernel.rs +++ b/crates/cubecl-core/src/ir/kernel.rs @@ -140,9 +140,16 @@ impl From for Item { impl Display for Elem { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - // NOTE: we'll eventually want to differentiate between int/float types - Self::Float(_) => f.write_str("float"), - Self::Int(_) => f.write_str("int"), + Self::Float(kind) => match kind { + FloatKind::F16 => f.write_str("f16"), + FloatKind::BF16 => f.write_str("bf16"), + FloatKind::F32 => f.write_str("f32"), + FloatKind::F64 => f.write_str("f64"), + }, + Self::Int(kind) => match kind { + IntKind::I32 => f.write_str("i32"), + IntKind::I64 => f.write_str("i64"), + }, Self::UInt => f.write_str("uint"), Self::Bool => f.write_str("bool"), } diff --git a/crates/cubecl-linalg/Cargo.toml b/crates/cubecl-linalg/Cargo.toml index cb79e2a6f..2d04b1e6c 100644 --- a/crates/cubecl-linalg/Cargo.toml +++ b/crates/cubecl-linalg/Cargo.toml @@ -4,7 +4,7 @@ authors = [ "louisfd ", ] categories = ["science", "mathematics", "algorithms"] -description = "CubeCL Linear Algebra Components" +description = "CubeCL Linear Algebra Library." 
edition.workspace = true keywords = [] license.workspace = true diff --git a/crates/cubecl-linalg/README.md b/crates/cubecl-linalg/README.md new file mode 100644 index 000000000..7cb0782c9 --- /dev/null +++ b/crates/cubecl-linalg/README.md @@ -0,0 +1,60 @@ +# CubeCL Linear Algebra Library. + + +The crate contains common linear algebra algorithms. + +## Algorithms + +- [X] Tiling 2D Matrix Multiplication. + + The kernel is very flexible and can be used on pretty much any hardware. +- [X] Cooperative Matrix Multiplication. + + The kernel uses Automatic Mixed Precision (AMP) to leverage cooperative matrix-multiply and accumulate instructions. + For `f32` tensors, the inputs are cast to `f16`, but the accumulation is still performed in `f32`. + This may cause a small loss in precision, but execution is much faster. + +## Benchmarks + +You can run the benchmarks from the workspace with the following: + +```bash +cargo bench --bench matmul --features wgpu # for wgpu +cargo bench --bench matmul --features cuda # for cuda +``` + +On an RTX 3070 we get the following results: + +``` +matmul-wgpu-f32-tiling2d + +―――――――― Result ――――――――― + Samples 100 + Mean 13.289ms + Variance 28.000ns + Median 13.271ms + Min 12.582ms + Max 13.768ms +――――――――――――――――――――――――― +matmul-cuda-f32-tiling2d + +―――――――― Result ――――――――― + Samples 100 + Mean 12.754ms + Variance 93.000ns + Median 12.647ms + Min 12.393ms + Max 14.501ms +――――――――――――――――――――――――― +matmul-cuda-f32-cmma + +―――――――― Result ――――――――― + Samples 100 + Mean 4.996ms + Variance 35.000ns + Median 5.084ms + Min 4.304ms + Max 5.155ms +――――――――――――――――――――――――― +``` + diff --git a/crates/cubecl/benches/matmul.rs b/crates/cubecl/benches/matmul.rs index 0d30b3c35..ef81c4d8a 100644 --- a/crates/cubecl/benches/matmul.rs +++ b/crates/cubecl/benches/matmul.rs @@ -7,7 +7,7 @@ use cubecl::frontend::Float; use cubecl_linalg::matmul; use cubecl_linalg::tensor::TensorHandle; -impl Benchmark for 
MatmulBench { type Args = (TensorHandle, TensorHandle, TensorHandle); fn prepare(&self) -> Self::Args { @@ -36,8 +36,7 @@ impl Benchmark for Tiling2dBench { } fn name(&self) -> String { - let elem = E::as_elem(); - format!("tiling2d-{}-{:?}-{:?}", R::name(), elem, self.kind) + format!("matmul-{}-{}-{:?}", R::name(), E::as_elem(), self.kind).to_lowercase() } fn sync(&self) { @@ -46,7 +45,7 @@ impl Benchmark for Tiling2dBench { } #[allow(dead_code)] -struct Tiling2dBench { +struct MatmulBench { b: usize, m: usize, k: usize, @@ -66,7 +65,7 @@ enum MatmulKind { #[allow(dead_code)] fn run(device: R::Device, kind: MatmulKind) { - let bench = Tiling2dBench:: { + let bench = MatmulBench:: { b: 32, m: 1024, k: 1024,