From 2bb04969c9aa4cd0f7793e1d0495a6f85b439592 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Wed, 18 Sep 2024 19:10:25 -0400 Subject: [PATCH 1/4] WIP Book --- README.md | 2 +- crates/cubecl-cuda/src/compiler/warp.rs | 4 +- cubecl-book/.gitignore | 18 +++++ cubecl-book/.prettierrc.json | 4 + cubecl-book/book.toml | 11 +++ cubecl-book/src/SUMMARY.md | 10 +++ cubecl-book/src/core-features/autotune.md | 11 +++ cubecl-book/src/core-features/comptime.md | 56 +++++++++++++ cubecl-book/src/core-features/summary.md | 0 .../src/core-features/vectorization.md | 9 +++ .../src/getting-started/cubecl.drawio.svg | 4 + cubecl-book/src/getting-started/design.md | 57 +++++++++++++ cubecl-book/src/getting-started/examples.md | 0 .../src/getting-started/installation.md | 0 cubecl-book/src/getting-started/summary.md | 0 cubecl-book/src/motivation.md | 33 ++++++++ cubecl-book/src/overview.md | 6 ++ examples/loop_unrolling/Cargo.toml | 16 ++++ .../loop_unrolling/examples/loop_unrolling.rs | 6 ++ examples/loop_unrolling/src/lib.rs | 55 +++++++++++++ xtask/src/commands/book.rs | 80 +++++++++++++++++++ xtask/src/commands/mod.rs | 1 + xtask/src/main.rs | 3 + 23 files changed, 383 insertions(+), 3 deletions(-) create mode 100644 cubecl-book/.gitignore create mode 100644 cubecl-book/.prettierrc.json create mode 100644 cubecl-book/book.toml create mode 100644 cubecl-book/src/SUMMARY.md create mode 100644 cubecl-book/src/core-features/autotune.md create mode 100644 cubecl-book/src/core-features/comptime.md create mode 100644 cubecl-book/src/core-features/summary.md create mode 100644 cubecl-book/src/core-features/vectorization.md create mode 100644 cubecl-book/src/getting-started/cubecl.drawio.svg create mode 100644 cubecl-book/src/getting-started/design.md create mode 100644 cubecl-book/src/getting-started/examples.md create mode 100644 cubecl-book/src/getting-started/installation.md create mode 100644 cubecl-book/src/getting-started/summary.md create mode 100644 cubecl-book/src/motivation.md 
create mode 100644 cubecl-book/src/overview.md create mode 100644 examples/loop_unrolling/Cargo.toml create mode 100644 examples/loop_unrolling/examples/loop_unrolling.rs create mode 100644 examples/loop_unrolling/src/lib.rs create mode 100644 xtask/src/commands/book.rs diff --git a/README.md b/README.md index 420ddb4b5..41de1ba0f 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ fn gelu_array(input: &Array, output: &mut Array) { #[cube] fn gelu_scalar(x: F) -> F { - x * (F::erf(x / F::sqrt(2.0.into())) + 1.0) / 2.0 + x * (F::erf(x / F::sqrt(2.0f32.into())) + F::new(1.0)) / F::new(2.0) } ``` diff --git a/crates/cubecl-cuda/src/compiler/warp.rs b/crates/cubecl-cuda/src/compiler/warp.rs index 7836caf24..e0059ae3b 100644 --- a/crates/cubecl-cuda/src/compiler/warp.rs +++ b/crates/cubecl-cuda/src/compiler/warp.rs @@ -105,8 +105,8 @@ fn reduce_operator( " {out} = {input}; {{ - for (int offset = warpSizeChecked / 2; offset > 0; offset /= 2) {{ - {out} {op} __shfl_down_sync(0xFFFFFFFF, {out}, offset); + for (int offset = 1; offset < warpSizeChecked; offset *=2 ) {{ + {out} {op} __shfl_xor_sync(-1, {out}, offset); }} }} " diff --git a/cubecl-book/.gitignore b/cubecl-book/.gitignore new file mode 100644 index 000000000..e8ab3065b --- /dev/null +++ b/cubecl-book/.gitignore @@ -0,0 +1,18 @@ +target + +# MacOS temp file +.DS_Store + +book-test +guide/book + +.vscode +tests/cubecl-book/book/ +book/ + +# Ignore Jetbrains specific files. +.idea/ + +# Ignore Vim temporary and swap files. +*.sw? 
+*~ diff --git a/cubecl-book/.prettierrc.json b/cubecl-book/.prettierrc.json new file mode 100644 index 000000000..0a59bf728 --- /dev/null +++ b/cubecl-book/.prettierrc.json @@ -0,0 +1,4 @@ +{ + "printWidth": 100, + "proseWrap": "always" +} diff --git a/cubecl-book/book.toml b/cubecl-book/book.toml new file mode 100644 index 000000000..67854e188 --- /dev/null +++ b/cubecl-book/book.toml @@ -0,0 +1,11 @@ +[book] +authors = [ + "Nathaniel Simard", +] +language = "en" +multilingual = false +src = "src" +title = "The CubeCL Book 🧊" + +[output.html] +mathjax-support = true diff --git a/cubecl-book/src/SUMMARY.md b/cubecl-book/src/SUMMARY.md new file mode 100644 index 000000000..828803beb --- /dev/null +++ b/cubecl-book/src/SUMMARY.md @@ -0,0 +1,10 @@ +- [Overview](./overview.md) +- [Why CubeCL](./motivation.md) +- [Getting Started](./getting-started/summary.md) + - [Installation](./getting-started/installation.md) + - [Design](./getting-started/design.md) + - [Examples](./getting-started/examples.md) +- [Core Features](./core-features/summary.md) + - [Comptime](./core-features/comptime.md) + - [Vectorization](./core-features/vectorization.md) + - [Autotune](./core-features/autotune.md) diff --git a/cubecl-book/src/core-features/autotune.md b/cubecl-book/src/core-features/autotune.md new file mode 100644 index 000000000..31c41d8e6 --- /dev/null +++ b/cubecl-book/src/core-features/autotune.md @@ -0,0 +1,11 @@ +# Autotune + +Autotuning drastically simplifies kernel selection by running small benchmarks at runtime to figure +out the best kernels with the best configurations to run on the current hardware; an essential +feature for portability. This feature combines gracefully with comptime to test the effect of +different comptime values on performance; sometimes it can be surprising! + +Even if the benchmarks may add some overhead when running the application for the first time, the +information gets cached on the device and will be reused. 
It is usually a no-brainer trade-off for +throughput-oriented programs such as deep learning models. You can even ship the autotune cache with +your program, reducing cold start time when you have more control over the deployment target. diff --git a/cubecl-book/src/core-features/comptime.md b/cubecl-book/src/core-features/comptime.md new file mode 100644 index 000000000..0d2d37838 --- /dev/null +++ b/cubecl-book/src/core-features/comptime.md @@ -0,0 +1,56 @@ +# Comptime + +CubeCL isn't just a new compute language: though it feels like you are writing GPU kernels, you are, +in fact, writing compiler plugins that you can fully customize! Comptime is a way to modify the +compiler IR at runtime when compiling a kernel for the first time. + +This enables a lot of optimizations and flexibility without having to write many separate variants +of the same kernels to ensure maximal performance. + +## Loop Unrolling + +You can easily unroll loops in CubeCL using the `unroll` attribute on top of a for loop. + +```rust +#[cube(launch)] +fn sum(input: &Array, output: &mut Array, #[comptime] end: Option) { + let unroll = end.is_some(); + let end = end.unwrap_or_else(|| input.len()); + let mut sum = F::new(0.0); + + #[unroll(unroll)] + for i in 0..end { + sum += input[i]; + } + + output[ABSOLUTE_POS] = sum; +} +``` + +Note that if you provide a variable `end` that can't be determined at compile time, a panic will +arise when trying to execute that kernel. + +## Feature Specialization + +You could also achieve the sum using subcube operations. We will write a kernel that uses that +instruction when available based on a comptime feature flag. When it isn't available, it will +fall back on the previous implementation, essentially making it portable.
+ +```rust +#[cube(launch)] +fn sum_subgroup( + input: &Array, + output: &mut Array, + #[comptime] subgroup: bool, + #[comptime] end: Option, +) { + if subgroup { + output[UNIT_POS] = subcube_sum(input[UNIT_POS]); + } else { + sum_basic(input, output, end); + } +} +``` + +Note that no branching will actually occured on the GPU, since three different kernels can be +generated from the last code snippet. diff --git a/cubecl-book/src/core-features/summary.md b/cubecl-book/src/core-features/summary.md new file mode 100644 index 000000000..e69de29bb diff --git a/cubecl-book/src/core-features/vectorization.md b/cubecl-book/src/core-features/vectorization.md new file mode 100644 index 000000000..80cbb9d61 --- /dev/null +++ b/cubecl-book/src/core-features/vectorization.md @@ -0,0 +1,9 @@ +# Vectorization + +High-performance kernels should rely on SIMD instructions whenever possible, but doing so can +quickly get pretty complicated! With CubeCL, you can specify the vectorization factor of each input +variable when launching a kernel. Inside the kernel code, you still use only one type, which is +dynamically vectorized and supports automatic broadcasting. The runtimes are able to compile kernels +and have all the necessary information to use the best instruction! However, since the algorithmic +behavior may depend on the vectorization factor, CubeCL allows you to access it directly in the +kernel when needed, without any performance loss, using the comptime system! diff --git a/cubecl-book/src/getting-started/cubecl.drawio.svg b/cubecl-book/src/getting-started/cubecl.drawio.svg new file mode 100644 index 000000000..e0278413a --- /dev/null +++ b/cubecl-book/src/getting-started/cubecl.drawio.svg @@ -0,0 +1,4 @@ + + + +
(2, 2, 0)
(1, 2, 0)
(0, 2, 0)
(2, 1, 0)
(1, 1, 0)
(0, 1, 0)
(2, 0, 0)
(1, 0, 0)
(0, 0, 0)
Cube
[2, 2, 0]
[1, 2, 0]
[0, 2, 0]
[2, 1, 0]
[1, 1, 0]
[0, 1, 0]
[2, 0, 0]
[1, 0, 0]
Hyper-Cube
(0, 0, 0)
Unit
(0, 0, 0)
(0, 0, 0)
diff --git a/cubecl-book/src/getting-started/design.md b/cubecl-book/src/getting-started/design.md new file mode 100644 index 000000000..0b96edcf8 --- /dev/null +++ b/cubecl-book/src/getting-started/design.md @@ -0,0 +1,57 @@ +# Design + +CubeCL is designed around - you guessed it - Cubes! More specifically, it's based on cuboids, +because not all axes are the same size. Since all compute APIs need to map to the hardware, which +are tiles that can be accessed using a 3D representation, our topology can easily be mapped to +concepts from other APIs. + +
+ +### CubeCL - Topology + + +
+
+
+ +_A cube is composed of units, so a 3x3x3 cube has 27 units that can be accessed by their positions +along the x, y, and z axes. Similarly, a hyper-cube is composed of cubes, just as a cube is composed +of units. Each cube in the hyper-cube can be accessed by its position relative to the hyper-cube +along the x, y, and z axes. Hence, a hyper-cube of 3x3x3 will have 27 cubes. In this example, the +total number of working units would be 27 x 27 = 729._ + +### Topology Equivalence + +Since all topology variables are constant within the kernel entry point, we chose to use the Rust +constant syntax with capital letters. Often when creating kernels, we don't always care about the +relative position of a unit within a cube along each axis, but often we only care about its position +in general. Therefore, each kind of variable also has its own axis-independent variable, which is +often not present in other languages, except WebGPU with `local_invocation_index`. + +
+ +| CubeCL | CUDA | WebGPU | +| -------------- | ----------- | ---------------------- | +| CUBE_COUNT | N/A | N/A | +| CUBE_COUNT_X | gridDim.x | num_workgroups.x | +| CUBE_COUNT_Y | gridDim.y | num_workgroups.y | +| CUBE_COUNT_Z | gridDim.z | num_workgroups.z | +| CUBE_POS | N/A | N/A | +| CUBE_POS_X | blockIdx.x | workgroup.x | +| CUBE_POS_Y | blockIdx.y | workgroup.y | +| CUBE_POS_Z | blockIdx.z | workgroup.z | +| CUBE_DIM | N/A | N/A | +| CUBE_DIM_X | blockDim.x | workgroup_size.x | +| CUBE_DIM_Y | blockDim.y | workgroup_size.y | +| CUBE_DIM_Z | blockDim.z | workgroup_size.z | +| UNIT_POS | N/A | local_invocation_index | +| UNIT_POS_X | threadIdx.x | local_invocation_id.x | +| UNIT_POS_Y | threadIdx.y | local_invocation_id.y | +| UNIT_POS_Z | threadIdx.z | local_invocation_id.z | +| SUBCUBE_DIM | warpSize | subgroup_size | +| ABSOLUTE_POS | N/A | N/A | +| ABSOLUTE_POS_X | N/A | global_id.x | +| ABSOLUTE_POS_Y | N/A | global_id.y | +| ABSOLUTE_POS_Z | N/A | global_id.z | + + diff --git a/cubecl-book/src/getting-started/examples.md b/cubecl-book/src/getting-started/examples.md new file mode 100644 index 000000000..e69de29bb diff --git a/cubecl-book/src/getting-started/installation.md b/cubecl-book/src/getting-started/installation.md new file mode 100644 index 000000000..e69de29bb diff --git a/cubecl-book/src/getting-started/summary.md b/cubecl-book/src/getting-started/summary.md new file mode 100644 index 000000000..e69de29bb diff --git a/cubecl-book/src/motivation.md b/cubecl-book/src/motivation.md new file mode 100644 index 000000000..dc0e07ba3 --- /dev/null +++ b/cubecl-book/src/motivation.md @@ -0,0 +1,33 @@ +# Why CubeCL is Important + +In the current landscape of high-performance computing, developers face several significant +challenges that CubeCL aims to address: + +### Complexity in Performance Optimization + +Optimizing compute kernels for different hardware is inherently complex. 
Developers must understand +the intricacies of each platform’s architecture and API, leading to a steep learning curve and the +risk of suboptimal performance. The need for manual tuning and platform-specific adjustments often +results in code that is difficult to maintain and extend. + +The simplest way to solve this problem is to provide high-level abstractions that can be composed in +a variety of ways. All of those variations can be autotuned to select the best settings for the +current hardware and problem at hand. + +### Lack of Portability + +Portability remains a significant issue. Code written for one API or even for a single GPU +architecture often cannot be easily transferred to another, hindering the ability to develop +software that can run across diverse hardware environments. + +The GPU computing ecosystem is fragmented, with different hardware platforms like CUDA, ROCm, Metal, +and Vulkan requiring their own specialized codebases. This fragmentation forces developers to write +and maintain separate implementations for each platform, increasing both development time and +complexity. + +## Conclusion + +In essence, these challenges underscore the need for a more unified and developer-friendly approach +to high-performance computing. CubeCL seeks to bridge these gaps by addressing the core issues +within the current ecosystem, offering a new direction for high-performance and portable computing +solutions. diff --git a/cubecl-book/src/overview.md b/cubecl-book/src/overview.md new file mode 100644 index 000000000..b0d557a24 --- /dev/null +++ b/cubecl-book/src/overview.md @@ -0,0 +1,6 @@ +# Overview + +Welcome to The CubeCL Book 👋 + +This book will help you get started with your high-performance computing project using CubeCL, +making sure you leverage the most out of any hardware.
diff --git a/examples/loop_unrolling/Cargo.toml b/examples/loop_unrolling/Cargo.toml new file mode 100644 index 000000000..ce545ce55 --- /dev/null +++ b/examples/loop_unrolling/Cargo.toml @@ -0,0 +1,16 @@ +[package] +authors = [] +name = "loop_unrolling" +publish = false +edition.workspace = true +license.workspace = true +version.workspace = true + +[features] +default = [] +wgpu = ["cubecl/wgpu"] +cuda = ["cubecl/cuda"] + +[dependencies] +cubecl = { path = "../../crates/cubecl", version = "0.2.0" } +half = { workspace = true } diff --git a/examples/loop_unrolling/examples/loop_unrolling.rs b/examples/loop_unrolling/examples/loop_unrolling.rs new file mode 100644 index 000000000..70b1572b9 --- /dev/null +++ b/examples/loop_unrolling/examples/loop_unrolling.rs @@ -0,0 +1,6 @@ +fn main() { + #[cfg(feature = "cuda")] + loop_unrolling::basic::(&Default::default()); + #[cfg(feature = "wgpu")] + loop_unrolling::basic::(&Default::default()); +} diff --git a/examples/loop_unrolling/src/lib.rs b/examples/loop_unrolling/src/lib.rs new file mode 100644 index 000000000..6f1aa9fc9 --- /dev/null +++ b/examples/loop_unrolling/src/lib.rs @@ -0,0 +1,55 @@ +use cubecl::{prelude::*, OutputInfo}; + +#[cube(launch_unchecked)] +fn sum_basic(input: &Array, output: &mut Array, #[comptime] end: Option) { + let unroll = end.is_some(); + let end = end.unwrap_or_else(|| input.len()); + + let mut sum = F::new(0.0); + + #[unroll(unroll)] + for i in 0..end { + sum += input[i]; + } + + output[UNIT_POS] = sum; +} + +#[cube(launch_unchecked)] +fn sum_subgroup( + input: &Array, + output: &mut Array, + #[comptime] subgroup: bool, + #[comptime] end: Option, +) { + if subgroup { + output[UNIT_POS] = subcube_sum(input[UNIT_POS]); + } else { + sum_basic(input, output, end); + } +} + +pub fn basic(device: &R::Device) { + let client = R::client(device); + let input = &[-1., 10., 1., 5.]; + + let output_handle = client.empty(input.len() * core::mem::size_of::()); + let input_handle = 
client.create(f32::as_bytes(input)); + + unsafe { + sum_subgroup::launch_unchecked::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(input.len() as u32, 1, 1), + ArrayArg::from_raw_parts(&input_handle, input.len(), 1), + ArrayArg::from_raw_parts(&output_handle, input.len(), 1), + client.features().enabled(cubecl::Feature::Subcube), + Some(input.len() as u32), + ); + } + + let bytes = client.read(output_handle.binding()); + let output = f32::from_bytes(&bytes); + + println!("Executed sum with runtime {:?} => {output:?}", R::name()); +} diff --git a/xtask/src/commands/book.rs b/xtask/src/commands/book.rs new file mode 100644 index 000000000..8a3123da3 --- /dev/null +++ b/xtask/src/commands/book.rs @@ -0,0 +1,80 @@ +use std::path::Path; + +use tracel_xtask::prelude::*; + +#[derive(clap::Args)] +pub struct BookArgs { + #[command(subcommand)] + command: BookSubCommand, +} + +#[derive(clap::Subcommand, strum::Display)] +pub(crate) enum BookSubCommand { + /// Build the book + Build, + /// Open the book on the specified port or random port and rebuild it automatically upon changes + Open(OpenArgs), +} + +#[derive(clap::Args)] +pub(crate) struct OpenArgs { + /// Specify the port to open the book on (defaults to a random port if not specified) + #[clap(long, default_value_t = random_port())] + port: u16, +} + +/// Book information +pub(crate) struct Book { + name: &'static str, + path: &'static Path, +} + +impl BookArgs { + pub(crate) fn parse(&self) -> anyhow::Result<()> { + Book::run(&self.command) + } +} + +impl Book { + const BOOK_NAME: &'static str = "CubeCL Book"; + const BOOK_PATH: &'static str = "./cubecl-book"; + + pub(crate) fn run(args: &BookSubCommand) -> anyhow::Result<()> { + let book = Self { + name: Self::BOOK_NAME, + path: Path::new(Self::BOOK_PATH), + }; + book.execute(args) + } + + fn execute(&self, command: &BookSubCommand) -> anyhow::Result<()> { + ensure_cargo_crate_is_installed("mdbook", None, None, false)?; + group!("{}: {}", self.name, 
command); + match command { + BookSubCommand::Build => self.build(), + BookSubCommand::Open(args) => self.open(args), + }?; + endgroup!(); + Ok(()) + } + + fn build(&self) -> anyhow::Result<()> { + run_process( + "mdbook", + &vec!["build"], + None, + Some(self.path), + "mdbook should build the book successfully", + ) + } + + fn open(&self, args: &OpenArgs) -> anyhow::Result<()> { + run_process( + "mdbook", + &vec!["serve", "--open", "--port", &args.port.to_string()], + None, + Some(self.path), + "mdbook should open the book successfully", + ) + } +} diff --git a/xtask/src/commands/mod.rs b/xtask/src/commands/mod.rs index 726f3ad4c..f3ac0a74e 100644 --- a/xtask/src/commands/mod.rs +++ b/xtask/src/commands/mod.rs @@ -1,2 +1,3 @@ +pub(crate) mod book; pub(crate) mod build; pub(crate) mod test; diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 945661188..0210ab05c 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -23,6 +23,8 @@ pub enum Command { Build(commands::build::CubeCLBuildCmdArgs), /// Test Burn. Test(commands::test::CubeCLTestCmdArgs), + /// Run commands to manage the book. 
+ Book(commands::book::BookArgs), } fn main() -> anyhow::Result<()> { @@ -31,6 +33,7 @@ fn main() -> anyhow::Result<()> { match args.command { Command::Build(cmd_args) => commands::build::handle_command(cmd_args), Command::Test(cmd_args) => commands::test::handle_command(cmd_args), + Command::Book(cmd_args) => cmd_args.parse(), _ => dispatch_base_commands(args), }?; let duration = start.elapsed(); From 1bbfeee8ec570e9891ef02c7625a58604e9071f1 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Thu, 19 Sep 2024 10:18:24 -0400 Subject: [PATCH 2/4] Add basic book --- cubecl-book/src/SUMMARY.md | 2 + cubecl-book/src/core-features/comptime.md | 3 +- cubecl-book/src/core-features/summary.md | 4 + cubecl-book/src/getting-started/examples.md | 11 + .../src/getting-started/installation.md | 19 ++ cubecl-book/src/getting-started/summary.md | 6 + cubecl-book/src/language-support/summary.md | 4 + cubecl-book/src/language-support/trait.md | 90 +++++++ .../loop_unrolling/examples/loop_unrolling.rs | 6 - examples/loop_unrolling/src/lib.rs | 55 ----- .../{loop_unrolling => sum_things}/Cargo.toml | 2 +- examples/sum_things/examples/sum_things.rs | 6 + examples/sum_things/src/lib.rs | 224 ++++++++++++++++++ 13 files changed, 369 insertions(+), 63 deletions(-) create mode 100644 cubecl-book/src/language-support/summary.md create mode 100644 cubecl-book/src/language-support/trait.md delete mode 100644 examples/loop_unrolling/examples/loop_unrolling.rs delete mode 100644 examples/loop_unrolling/src/lib.rs rename examples/{loop_unrolling => sum_things}/Cargo.toml (92%) create mode 100644 examples/sum_things/examples/sum_things.rs create mode 100644 examples/sum_things/src/lib.rs diff --git a/cubecl-book/src/SUMMARY.md b/cubecl-book/src/SUMMARY.md index 828803beb..39fd78a14 100644 --- a/cubecl-book/src/SUMMARY.md +++ b/cubecl-book/src/SUMMARY.md @@ -8,3 +8,5 @@ - [Comptime](./core-features/comptime.md) - [Vectorization](./core-features/vectorization.md) - 
[Autotune](./core-features/autotune.md) +- [Language Support](./language-support/summary.md) + - [Trait Support](./language-support/trait.md) diff --git a/cubecl-book/src/core-features/comptime.md b/cubecl-book/src/core-features/comptime.md index 0d2d37838..fb1e84819 100644 --- a/cubecl-book/src/core-features/comptime.md +++ b/cubecl-book/src/core-features/comptime.md @@ -53,4 +53,5 @@ fn sum_subgroup( ``` Note that no branching will actually occured on the GPU, since three different kernels can be -generated from the last code snippet. +generated from the last code snippet. You can also use the +[trait system](../language-support/trait.md) to achieve a similar behavior. diff --git a/cubecl-book/src/core-features/summary.md b/cubecl-book/src/core-features/summary.md index e69de29bb..e77110ad4 100644 --- a/cubecl-book/src/core-features/summary.md +++ b/cubecl-book/src/core-features/summary.md @@ -0,0 +1,4 @@ +# Core Features + +In this section, we'll explore the core features of CubeCL and what sets it apart from other +high-performance computing languages like CUDA, OpenCL, and HIP. diff --git a/cubecl-book/src/getting-started/examples.md b/cubecl-book/src/getting-started/examples.md index e69de29bb..2c0c67ae2 100644 --- a/cubecl-book/src/getting-started/examples.md +++ b/cubecl-book/src/getting-started/examples.md @@ -0,0 +1,11 @@ +# Examples + +For now we only have a limited amount of examples listed in the table bellow. Note that you can also +look at how the [linalg](https://github.com/tracel-ai/cubecl/tree/main/crates/cubecl-linalg) is +implemented. + +| Example | Description | +| :------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------- | +| [GeLU](https://github.com/tracel-ai/cubecl/tree/main/examples/gelu) | Implement the GeLU activation function using CubeCL. 
| +| [Sum Things](https://github.com/tracel-ai/cubecl/tree/main/examples/sum_things) | Sum some number using many different variation leveraging the CubeCL core features and trait support. | +| [Normalization](https://github.com/tracel-ai/cubecl/tree/main/examples/normalization) | Show how to use normalization on vectorized elements. | diff --git a/cubecl-book/src/getting-started/installation.md b/cubecl-book/src/getting-started/installation.md index e69de29bb..f9650585d 100644 --- a/cubecl-book/src/getting-started/installation.md +++ b/cubecl-book/src/getting-started/installation.md @@ -0,0 +1,19 @@ +# Installation + +Installing CubeCL is straightforward. It’s available as a Rust crate, and you can add it to your +project by updating your `Cargo.toml`: + +```toml +[dependencies] +cubecl = { version = "{version}", features = ["cuda", "wgpu"] } +``` + +The more challenging aspect is ensuring that you have the necessary drivers to run the selected +runtime. + +For `wgpu` on Linux and Windows, Vulkan drivers are required. These drivers are usually included +with the default OS installation. However, on certain setups, such as Windows Subsystem for Linux +(WSL), you may need to install them manually if they are missing. + +For `cuda`, simply install the CUDA drivers on your device. You can follow the installation +instructions provided on the [NVIDIA website](https://developer.nvidia.com/cuda-downloads). diff --git a/cubecl-book/src/getting-started/summary.md b/cubecl-book/src/getting-started/summary.md index e69de29bb..5738ccf82 100644 --- a/cubecl-book/src/getting-started/summary.md +++ b/cubecl-book/src/getting-started/summary.md @@ -0,0 +1,6 @@ +# Getting Started + +In this section, we’ll walk through the installation process for CubeCL, explore the design +principles behind its language extension, and provide some examples to help you start experimenting +with the tool. 
By the end, you'll have a solid understanding of how CubeCL integrates into your +workflow and how to leverage its features for high-performance computing. diff --git a/cubecl-book/src/language-support/summary.md b/cubecl-book/src/language-support/summary.md new file mode 100644 index 000000000..3f84a2c9b --- /dev/null +++ b/cubecl-book/src/language-support/summary.md @@ -0,0 +1,4 @@ +# Language Support + +In this section, we will highlight key language features of CubeCL and demonstrate how to use them +in your kernels to enhance performance, portability, and maintainability. diff --git a/cubecl-book/src/language-support/trait.md b/cubecl-book/src/language-support/trait.md new file mode 100644 index 000000000..c52eae47c --- /dev/null +++ b/cubecl-book/src/language-support/trait.md @@ -0,0 +1,90 @@ +# Trait Support + +CubeCL partially supports traits to modularize your kernel code without any overhead. For now, most +features are supported except stateful functions. + +```rust
#[cube]
trait MyTrait {
    /// Supported
    fn my_function(x: &Array) -> f32;
    /// Unsupported
    fn my_function_2(&self, x: &Array) -> f32;
}
```

The trait system allows you to do specialization quite easily. Let's take the same example as in the
[comptime section](../core-features/comptime.md).

First you can define your trait. Note that if you use your trait from the launch function, you will
need to add `'static + Send + Sync`.
+ +```rust +#[cube] +trait SumKind: 'static + Send + Sync { + fn sum(input: &Slice, #[comptime] end: Option) -> F; +} +``` + +Then we can define some implementations: + +```rust +struct SumBasic; +struct SumSubcube; + +#[cube] +impl SumKind for SumBasic { + fn sum(input: &Slice, #[comptime] end: Option) -> F { + let unroll = end.is_some(); + let end = end.unwrap_or_else(|| input.len()); + + let mut sum = F::new(0.0); + + #[unroll(unroll)] + for i in 0..end { + sum += input[i]; + } + + sum + } +} + +#[cube] +impl SumKind for SumSubcube { + fn sum(input: &Slice, #[comptime] _end: Option) -> F { + subcube_sum(input[UNIT_POS]) + } +} +``` + +Associated types are also supported. Let say you want to create a series from a sum. + +```rust +#[cube] +trait CreateSeries: 'static + Send + Sync { + type SumKind: SumKind; + + fn execute(input: &Slice, #[comptime] end: Option) -> F; +} +``` + +You may want to define what kind of series you want to create using an implementation. + +```rust +struct SumThenMul { + _p: PhantomData, +} + +#[cube] +impl CreateSeries for SumThenMul { + type SumKind = K; + + fn execute(input: &Slice, #[comptime] end: Option) -> F { + let val = Self::SumKind::sum(input, end); + val * input[UNIT_POS] + } +} +``` + +It's actually not the best example of using associated types, but it shows how they are totally +supported with CubeCL. 
diff --git a/examples/loop_unrolling/examples/loop_unrolling.rs b/examples/loop_unrolling/examples/loop_unrolling.rs deleted file mode 100644 index 70b1572b9..000000000 --- a/examples/loop_unrolling/examples/loop_unrolling.rs +++ /dev/null @@ -1,6 +0,0 @@ -fn main() { - #[cfg(feature = "cuda")] - loop_unrolling::basic::(&Default::default()); - #[cfg(feature = "wgpu")] - loop_unrolling::basic::(&Default::default()); -} diff --git a/examples/loop_unrolling/src/lib.rs b/examples/loop_unrolling/src/lib.rs deleted file mode 100644 index 6f1aa9fc9..000000000 --- a/examples/loop_unrolling/src/lib.rs +++ /dev/null @@ -1,55 +0,0 @@ -use cubecl::{prelude::*, OutputInfo}; - -#[cube(launch_unchecked)] -fn sum_basic(input: &Array, output: &mut Array, #[comptime] end: Option) { - let unroll = end.is_some(); - let end = end.unwrap_or_else(|| input.len()); - - let mut sum = F::new(0.0); - - #[unroll(unroll)] - for i in 0..end { - sum += input[i]; - } - - output[UNIT_POS] = sum; -} - -#[cube(launch_unchecked)] -fn sum_subgroup( - input: &Array, - output: &mut Array, - #[comptime] subgroup: bool, - #[comptime] end: Option, -) { - if subgroup { - output[UNIT_POS] = subcube_sum(input[UNIT_POS]); - } else { - sum_basic(input, output, end); - } -} - -pub fn basic(device: &R::Device) { - let client = R::client(device); - let input = &[-1., 10., 1., 5.]; - - let output_handle = client.empty(input.len() * core::mem::size_of::()); - let input_handle = client.create(f32::as_bytes(input)); - - unsafe { - sum_subgroup::launch_unchecked::( - &client, - CubeCount::Static(1, 1, 1), - CubeDim::new(input.len() as u32, 1, 1), - ArrayArg::from_raw_parts(&input_handle, input.len(), 1), - ArrayArg::from_raw_parts(&output_handle, input.len(), 1), - client.features().enabled(cubecl::Feature::Subcube), - Some(input.len() as u32), - ); - } - - let bytes = client.read(output_handle.binding()); - let output = f32::from_bytes(&bytes); - - println!("Executed sum with runtime {:?} => {output:?}", R::name()); -} 
diff --git a/examples/loop_unrolling/Cargo.toml b/examples/sum_things/Cargo.toml similarity index 92% rename from examples/loop_unrolling/Cargo.toml rename to examples/sum_things/Cargo.toml index ce545ce55..40e705943 100644 --- a/examples/loop_unrolling/Cargo.toml +++ b/examples/sum_things/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = [] -name = "loop_unrolling" +name = "sum_things" publish = false edition.workspace = true license.workspace = true diff --git a/examples/sum_things/examples/sum_things.rs b/examples/sum_things/examples/sum_things.rs new file mode 100644 index 000000000..9e52b40a0 --- /dev/null +++ b/examples/sum_things/examples/sum_things.rs @@ -0,0 +1,6 @@ +fn main() { + #[cfg(feature = "cuda")] + sum_things::launch::(&Default::default()); + #[cfg(feature = "wgpu")] + sum_things::launch::(&Default::default()); +} diff --git a/examples/sum_things/src/lib.rs b/examples/sum_things/src/lib.rs new file mode 100644 index 000000000..47dd11520 --- /dev/null +++ b/examples/sum_things/src/lib.rs @@ -0,0 +1,224 @@ +use cubecl::{prelude::*, server::Handle}; +use std::marker::PhantomData; + +#[cube(launch_unchecked)] +fn sum_basic(input: &Array, output: &mut Array, #[comptime] end: Option) { + let unroll = end.is_some(); + let end = end.unwrap_or_else(|| input.len()); + + let mut sum = F::new(0.0); + + #[unroll(unroll)] + for i in 0..end { + sum += input[i]; + } + + output[UNIT_POS] = sum; +} + +#[cube(launch_unchecked)] +fn sum_subgroup( + input: &Array, + output: &mut Array, + #[comptime] subgroup: bool, + #[comptime] end: Option, +) { + if subgroup { + output[UNIT_POS] = subcube_sum(input[UNIT_POS]); + } else { + sum_basic(input, output, end); + } +} + +#[cube] +trait SumKind: 'static + Send + Sync { + fn sum(input: &Slice, #[comptime] end: Option) -> F; +} + +struct SumBasic; +struct SumSubcube; + +#[cube] +impl SumKind for SumBasic { + fn sum(input: &Slice, #[comptime] end: Option) -> F { + let unroll = end.is_some(); + let end = end.unwrap_or_else(|| 
input.len()); + + let mut sum = F::new(0.0); + + #[unroll(unroll)] + for i in 0..end { + sum += input[i]; + } + + sum + } +} + +#[cube] +impl SumKind for SumSubcube { + fn sum(input: &Slice, #[comptime] _end: Option) -> F { + subcube_sum(input[UNIT_POS]) + } +} + +#[cube(launch_unchecked)] +fn sum_trait( + input: &Array, + output: &mut Array, + #[comptime] end: Option, +) { + output[UNIT_POS] = K::sum(input.as_slice(), end); +} + +#[cube] +trait CreateSeries: 'static + Send + Sync { + type SumKind: SumKind; + + fn execute(input: &Slice, #[comptime] end: Option) -> F; +} + +#[cube(launch_unchecked)] +fn series( + input: &Array, + output: &mut Array, + #[comptime] end: Option, +) { + output[UNIT_POS] = S::execute(input.as_slice(), end); +} + +struct SumThenMul { + _p: PhantomData, +} + +#[cube] +impl CreateSeries for SumThenMul { + type SumKind = K; + + fn execute(input: &Slice, #[comptime] end: Option) -> F { + let val = Self::SumKind::sum(input, end); + val * input[UNIT_POS] + } +} + +fn launch_basic( + client: &ComputeClient, + input: &Handle, + output: &Handle, + len: usize, +) { + unsafe { + sum_basic::launch_unchecked::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(len as u32, 1, 1), + ArrayArg::from_raw_parts(&input, len, 1), + ArrayArg::from_raw_parts(&output, len, 1), + Some(len as u32), + ); + } +} + +fn launch_subgroup( + client: &ComputeClient, + input: &Handle, + output: &Handle, + len: usize, +) { + unsafe { + sum_subgroup::launch_unchecked::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(len as u32, 1, 1), + ArrayArg::from_raw_parts(&input, len, 1), + ArrayArg::from_raw_parts(&output, len, 1), + client.features().enabled(cubecl::Feature::Subcube), + Some(len as u32), + ); + } +} + +fn launch_trait( + client: &ComputeClient, + input: &Handle, + output: &Handle, + len: usize, +) { + unsafe { + sum_trait::launch_unchecked::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(len as u32, 1, 1), + ArrayArg::from_raw_parts(&input, 
len, 1), + ArrayArg::from_raw_parts(&output, len, 1), + Some(len as u32), + ); + } +} + +fn launch_series( + client: &ComputeClient, + input: &Handle, + output: &Handle, + len: usize, +) { + unsafe { + series::launch_unchecked::( + &client, + CubeCount::Static(1, 1, 1), + CubeDim::new(len as u32, 1, 1), + ArrayArg::from_raw_parts(&input, len, 1), + ArrayArg::from_raw_parts(&output, len, 1), + Some(len as u32), + ); + } +} + +#[derive(Debug)] +enum KernelKind { + Basic, + Subcube, + TraitSum, + SeriesSumThenMul, +} + +pub fn launch(device: &R::Device) { + let client = R::client(device); + let input = &[-1., 10., 1., 5.]; + let len = input.len(); + + let output = client.empty(input.len() * core::mem::size_of::()); + let input = client.create(f32::as_bytes(input)); + + for kind in [ + KernelKind::Basic, + KernelKind::Subcube, + KernelKind::TraitSum, + KernelKind::SeriesSumThenMul, + ] { + match kind { + KernelKind::Basic => launch_basic::(&client, &input, &output, len), + KernelKind::Subcube => launch_subgroup::(&client, &input, &output, len), + KernelKind::TraitSum => { + // When using traits, it's normally a good idea to check if the variation can be + // executed.
+ if client.features().enabled(cubecl::Feature::Subcube) { + launch_trait::(&client, &input, &output, len) + } else { + launch_trait::(&client, &input, &output, len) + } + } + KernelKind::SeriesSumThenMul => { + if client.features().enabled(cubecl::Feature::Subcube) { + launch_series::>(&client, &input, &output, len) + } else { + launch_series::>(&client, &input, &output, len) + } + } + } + let bytes = client.read(output.clone().binding()); + let output = f32::from_bytes(&bytes); + + println!("[{:?} - {kind:?}]\n {output:?}", R::name()); + } +} From 0d18e4ef4c173690a416e6424921401ebd1004d5 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Thu, 19 Sep 2024 10:35:09 -0400 Subject: [PATCH 3/4] Fix CI --- examples/sum_things/src/lib.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/sum_things/src/lib.rs b/examples/sum_things/src/lib.rs index 47dd11520..f9293fea7 100644 --- a/examples/sum_things/src/lib.rs +++ b/examples/sum_things/src/lib.rs @@ -109,11 +109,11 @@ fn launch_basic( ) { unsafe { sum_basic::launch_unchecked::( - &client, + client, CubeCount::Static(1, 1, 1), CubeDim::new(len as u32, 1, 1), - ArrayArg::from_raw_parts(&input, len, 1), - ArrayArg::from_raw_parts(&output, len, 1), + ArrayArg::from_raw_parts(input, len, 1), + ArrayArg::from_raw_parts(output, len, 1), Some(len as u32), ); } @@ -127,11 +127,11 @@ fn launch_subgroup( ) { unsafe { sum_subgroup::launch_unchecked::( - &client, + client, CubeCount::Static(1, 1, 1), CubeDim::new(len as u32, 1, 1), - ArrayArg::from_raw_parts(&input, len, 1), - ArrayArg::from_raw_parts(&output, len, 1), + ArrayArg::from_raw_parts(input, len, 1), + ArrayArg::from_raw_parts(output, len, 1), client.features().enabled(cubecl::Feature::Subcube), Some(len as u32), ); @@ -146,11 +146,11 @@ fn launch_trait( ) { unsafe { sum_trait::launch_unchecked::( - &client, + client, CubeCount::Static(1, 1, 1), CubeDim::new(len as u32, 1, 1), - ArrayArg::from_raw_parts(&input, len, 1), - 
ArrayArg::from_raw_parts(&output, len, 1), + ArrayArg::from_raw_parts(input, len, 1), + ArrayArg::from_raw_parts(output, len, 1), Some(len as u32), ); } @@ -164,11 +164,11 @@ fn launch_series( ) { unsafe { series::launch_unchecked::( - &client, + client, CubeCount::Static(1, 1, 1), CubeDim::new(len as u32, 1, 1), - ArrayArg::from_raw_parts(&input, len, 1), - ArrayArg::from_raw_parts(&output, len, 1), + ArrayArg::from_raw_parts(input, len, 1), + ArrayArg::from_raw_parts(output, len, 1), Some(len as u32), ); } From a7b4c54b621cbcaab6abeb1317613c82b963aef6 Mon Sep 17 00:00:00 2001 From: nathaniel Date: Thu, 19 Sep 2024 12:57:39 -0400 Subject: [PATCH 4/4] Fix typos --- cubecl-book/src/core-features/comptime.md | 8 ++++---- cubecl-book/src/core-features/vectorization.md | 2 +- cubecl-book/src/getting-started/examples.md | 10 +++++----- cubecl-book/src/language-support/trait.md | 4 ++-- cubecl-book/src/motivation.md | 2 +- cubecl-book/src/overview.md | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cubecl-book/src/core-features/comptime.md b/cubecl-book/src/core-features/comptime.md index fb1e84819..de0b3f9be 100644 --- a/cubecl-book/src/core-features/comptime.md +++ b/cubecl-book/src/core-features/comptime.md @@ -32,9 +32,9 @@ arise when trying to execute that kernel. ## Feature Specialization -You could also achieve the sum using subcube operations. We will write a kernel that use that -instruction when available based on a comptime feature flag. When it isn't available, it will -fallback on the previous implementation essentially making it portable. +You could also achieve the sum using subcube operations. We will write a kernel that uses that +instruction when available based on a comptime feature flag. When it isn't available, it will fall +back on the previous implementation essentially making it portable. 
```rust #[cube(launch)] @@ -52,6 +52,6 @@ fn sum_subgroup( } ``` -Note that no branching will actually occured on the GPU, since three different kernels can be +Note that no branching will actually occur on the GPU, since three different kernels can be generated from the last code snippet. You can also use the [trait system](../language-support/trait.md) to achieve a similar behavior. diff --git a/cubecl-book/src/core-features/vectorization.md b/cubecl-book/src/core-features/vectorization.md index 80cbb9d61..d46651b23 100644 --- a/cubecl-book/src/core-features/vectorization.md +++ b/cubecl-book/src/core-features/vectorization.md @@ -4,6 +4,6 @@ High-performance kernels should rely on SIMD instructions whenever possible, but quickly get pretty complicated! With CubeCL, you can specify the vectorization factor of each input variable when launching a kernel. Inside the kernel code, you still use only one type, which is dynamically vectorized and supports automatic broadcasting. The runtimes are able to compile kernels -and have all the necessary information to use the best instruction! However, since the algorithmic +and have all the necessary information to use the best instructions! However, since the algorithmic behavior may depend on the vectorization factor, CubeCL allows you to access it directly in the kernel when needed, without any performance loss, using the comptime system! diff --git a/cubecl-book/src/getting-started/examples.md b/cubecl-book/src/getting-started/examples.md index 2c0c67ae2..57658d9ad 100644 --- a/cubecl-book/src/getting-started/examples.md +++ b/cubecl-book/src/getting-started/examples.md @@ -4,8 +4,8 @@ For now we only have a limited amount of examples listed in the table bellow. No look at how the [linalg](https://github.com/tracel-ai/cubecl/tree/main/crates/cubecl-linalg) is implemented. 
-| Example | Description | -| :------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------- | -| [GeLU](https://github.com/tracel-ai/cubecl/tree/main/examples/gelu) | Implement the GeLU activation function using CubeCL. | -| [Sum Things](https://github.com/tracel-ai/cubecl/tree/main/examples/sum_things) | Sum some number using many different variation leveraging the CubeCL core features and trait support. | -| [Normalization](https://github.com/tracel-ai/cubecl/tree/main/examples/normalization) | Show how to use normalization on vectorized elements. | +| Example | Description | +| :------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------- | +| [GeLU](https://github.com/tracel-ai/cubecl/tree/main/examples/gelu) | Implement the GeLU activation function using CubeCL. | +| [Sum Things](https://github.com/tracel-ai/cubecl/tree/main/examples/sum_things) | Sum some numbers using many different variations leveraging the CubeCL core features and trait support. | +| [Normalization](https://github.com/tracel-ai/cubecl/tree/main/examples/normalization) | Show how to use normalization on vectorized elements. | diff --git a/cubecl-book/src/language-support/trait.md b/cubecl-book/src/language-support/trait.md index c52eae47c..7fa462f9c 100644 --- a/cubecl-book/src/language-support/trait.md +++ b/cubecl-book/src/language-support/trait.md @@ -1,6 +1,6 @@ # Trait Support -CubeCL partially supports trait to modularize your kernel code without any overhead. For now most +CubeCL partially supports traits to modularize your kernel code without any overhead. For now most features are supported except stateful functions. ```rust @@ -57,7 +57,7 @@ impl SumKind for SumSubcube { } ``` -Associated types are also supported. 
Let say you want to create a series from a sum. +Associated types are also supported. Let's say you want to create a series from a sum. ```rust #[cube] diff --git a/cubecl-book/src/motivation.md b/cubecl-book/src/motivation.md index dc0e07ba3..6a0f9050c 100644 --- a/cubecl-book/src/motivation.md +++ b/cubecl-book/src/motivation.md @@ -11,7 +11,7 @@ risk of suboptimal performance. The need for manual tuning and platform-specific results in code that is difficult to maintain and extend. The simplest way to solve this problem is to provide high level abstractions that can be composed in -a variety of ways. All of those variation can be autotuned to select the best settings for the +a variety of ways. All of those variations can be autotuned to select the best settings for the current hardware and problem at hand. ### Lack of Portability diff --git a/cubecl-book/src/overview.md b/cubecl-book/src/overview.md index b0d557a24..ab3da552b 100644 --- a/cubecl-book/src/overview.md +++ b/cubecl-book/src/overview.md @@ -3,4 +3,4 @@ Welcome to The CubeCL Book 👋 This book will help you get started with your high-performance computing project using CubeCL, -making sure you leverage the most any hardware. +making sure you leverage the most of any hardware.