diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c2d564dfda..5a883b64ed 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-raft-dask: needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +99,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8c99e3de6a..c8bd28d4bb 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,41 +24,41 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 with: 
enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -68,34 +68,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12 with: build_command: | sccache -z; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4e45ae29f6..1c2395cb68 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 56d422b489..5b1297b63c 100755 --- a/README.md +++ b/README.md @@ -255,106 +255,54 @@ pairwise_distance(in1, in2, out=output, metric="euclidean") ## Installing -RAFT itself can be installed through conda, [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), pip, or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for more a comprehensive guide on installing and building RAFT and using it in downstream projects. +RAFT's C++ and Python libraries can both be installed through Conda, and the Python libraries can also be installed through Pip. -### Conda + +### Installing C++ and Python through Conda The easiest way to install RAFT is through conda and several packages are provided. -- `libraft-headers` RAFT headers -- `libraft` (optional) shared library of pre-compiled template instantiations and runtime APIs. -- `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives. -- `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters. +- `libraft-headers` C++ headers +- `libraft` (optional) C++ shared library containing pre-compiled template instantiations and runtime API. +- `pylibraft` (optional) Python library +- `raft-dask` (optional) Python library for deployment of multi-node multi-GPU algorithms that use the RAFT `raft::comms` abstraction layer in Dask clusters. +- `raft-ann-bench` (optional) Benchmarking tool for easily producing benchmarks that compare RAFT's vector search algorithms against other state-of-the-art implementations. +- `raft-ann-bench-cpu` (optional) Reproducible benchmarking tool similar to the above, but doesn't require CUDA to be installed on the machine. Can be used to test in environments with competitive CPUs. + +Use the following command, depending on your CUDA version, to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command. +```bash +# for CUDA 11.8 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=11.8 +``` -Use the following command to install all of the RAFT packages with conda (replace `rapidsai` with `rapidsai-nightly` to install more up-to-date but less stable nightly packages). `mamba` is preferred over the `conda` command.
```bash -mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft +# for CUDA 12.0 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0 ``` -You can also install the conda packages individually using the `mamba` command above. +Note that the above commands will also install `libraft-headers` and `libraft`. + +You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers and pre-compiled shared library to use in your project: +```bash +# for CUDA 12.0 +mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 +``` -After installing RAFT, `find_package(raft COMPONENTS compiled distributed)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. +If installing the C++ APIs, please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-23.12/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against the RAFT development artifacts installed above. -### Pip +### Installing Python through Pip -pylibraft and raft-dask both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): +`pylibraft` and `raft-dask` both have experimental packages that can be [installed through pip](https://rapids.ai/pip.html#install): ```bash pip install pylibraft-cu11 --extra-index-url=https://pypi.nvidia.com pip install raft-dask-cu11 --extra-index-url=https://pypi.nvidia.com ``` -### CMake & CPM - -RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it easy to include in downstream cmake projects. RAPIDS-CMake provides a convenience layer around CPM. Please refer to [these instructions](https://github.com/rapidsai/rapids-cmake#installation) to install and use rapids-cmake in your project. - -#### Example Template Project +These packages statically build RAFT's pre-compiled instantiations and so the C++ headers and pre-compiled shared library won't be readily available to use in your code. -You can find an [example RAFT](cpp/template/README.md) project template in the `cpp/template` directory, which demonstrates how to build a new application with RAFT or incorporate RAFT into an existing cmake project. +The [build instructions](https://docs.rapids.ai/api/raft/nightly/build/) contain more details on building RAFT from source and including it in downstream projects. You can also find a comprehensive CPM code snippet in the [Building RAFT C++ and Python from source](https://docs.rapids.ai/api/raft/nightly/build/#building-c-and-python-from-source) section of the build instructions. -#### CMake Targets - -Additional CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. RAFT headers require, at a minimum, the CUDA toolkit libraries and RMM dependencies.
- -| Component | Target | Description | Base Dependencies | -|-------------|---------------------|----------------------------------------------------------|----------------------------------------| -| n/a | `raft::raft` | Full RAFT header library | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS | -| compiled | `raft::compiled` | Pre-compiled template instantiations and runtime library | raft::raft | -| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs | raft::raft, UCX, NCCL | - -### Source - -The easiest way to build RAFT from source is to use the `build.sh` script at the root of the repository: -1. Create an environment with the needed dependencies: -``` -mamba env create --name raft_dev_env -f conda/environments/all_cuda-118_arch-x86_64.yaml -mamba activate raft_dev_env -``` -``` -./build.sh raft-dask pylibraft libraft tests bench --compile-lib -``` +You can find an example [RAFT project template](cpp/template/README.md) in the `cpp/template` directory, which demonstrates how to build a new application with RAFT or incorporate RAFT into an existing CMake project. -The [build](docs/source/build.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](docs/source/build.md#building-raft-c-from-source-in-cmake) section of the build instructions. - -## Folder Structure and Contents - -The folder structure mirrors other RAPIDS repos, with the following folders: - -- `bench/ann`: Python scripts for running ANN benchmarks -- `ci`: Scripts for running CI in PRs -- `conda`: Conda recipes and development conda environments -- `cpp`: Source code for C++ libraries. - - `bench`: Benchmarks source code - - `cmake`: CMake modules and templates - - `doxygen`: Doxygen configuration - - `include`: The C++ API headers are fully-contained here (deprecated directories are excluded from the listing below) - - `cluster`: Basic clustering primitives and algorithms. - - `comms`: A multi-node multi-GPU communications abstraction layer for NCCL+UCX and MPI+NCCL, which can be deployed in Dask clusters using the `raft-dask` Python package. - - `core`: Core API headers which require minimal dependencies aside from RMM and Cudatoolkit. These are safe to expose on public APIs and do not require `nvcc` to build. This is the same for any headers in RAFT which have the suffix `*_types.hpp`. - - `distance`: Distance primitives - - `linalg`: Dense linear algebra - - `matrix`: Dense matrix operations - - `neighbors`: Nearest neighbors and knn graph construction - - `random`: Random number generation, sampling, and data generation primitives - - `solver`: Iterative and combinatorial solvers for optimization and approximation - - `sparse`: Sparse matrix operations - - `convert`: Sparse conversion functions - - `distance`: Sparse distance computations - - `linalg`: Sparse linear algebra - - `neighbors`: Sparse nearest neighbors and knn graph construction - - `op`: Various sparse operations such as slicing and filtering (Note: this will soon be renamed to `sparse/matrix`) - - `solver`: Sparse solvers for optimization and approximation - - `stats`: Moments, summary statistics, model performance measures - - `util`: Various reusable tools and utilities for accelerated algorithm development - - `internal`: A private header-only component that hosts the code shared between benchmarks and tests. 
- - `scripts`: Helpful scripts for development - - `src`: Compiled APIs and template instantiations for the shared libraries - - `template`: A skeleton template containing the bare-bones file structure and cmake configuration for writing applications with RAFT. - - `test`: Googletests source code -- `docs`: Source code and scripts for building library documentation (Uses breath, doxygen, & pydocs) -- `notebooks`: IPython notebooks with usage examples and tutorials -- `python`: Source code for Python libraries. - - `pylibraft`: Python build and source code for pylibraft library - - `raft-dask`: Python build and source code for raft-dask library -- `thirdparty`: Third-party licenses ## Contributing diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index a867a71f68..63e0fd5ba9 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -82,7 +82,7 @@ for FILE in .github/workflows/*.yaml; do done for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu index c804ecb3a1..19d78f4cd9 100644 --- a/cpp/bench/prims/distance/masked_nn.cu +++ b/cpp/bench/prims/distance/masked_nn.cu @@ -46,10 +46,10 @@ struct Params { AdjacencyPattern pattern; }; // struct Params -__global__ void init_adj(AdjacencyPattern pattern, - int n, - raft::device_matrix_view adj, - raft::device_vector_view group_idxs) +RAFT_KERNEL init_adj(AdjacencyPattern pattern, + int n, + raft::device_matrix_view adj, + raft::device_vector_view group_idxs) { int m = adj.extent(0); int num_groups = adj.extent(1); diff --git a/cpp/bench/prims/sparse/convert_csr.cu b/cpp/bench/prims/sparse/convert_csr.cu index c9dcae6985..634c749a54 100644 --- a/cpp/bench/prims/sparse/convert_csr.cu +++ b/cpp/bench/prims/sparse/convert_csr.cu @@ -30,7 +30,7 @@ struct bench_param { }; template -__global__ void init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor) +RAFT_KERNEL init_adj_kernel(bool* adj, index_t num_rows, index_t num_cols, index_t divisor) { index_t r = blockDim.y * blockIdx.y + threadIdx.y; index_t c = blockDim.x * blockIdx.x + threadIdx.x; diff --git a/cpp/include/raft/cluster/detail/agglomerative.cuh b/cpp/include/raft/cluster/detail/agglomerative.cuh index 624e67b7fa..f2c83abdd3 100644 --- a/cpp/include/raft/cluster/detail/agglomerative.cuh +++ b/cpp/include/raft/cluster/detail/agglomerative.cuh @@ -155,9 +155,7 @@ void build_dendrogram_host(raft::resources const& handle, } template -__global__ void write_levels_kernel(const value_idx* children, - value_idx* parents, - value_idx n_vertices) +RAFT_KERNEL write_levels_kernel(const value_idx* children, value_idx* parents, value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { @@ -179,12 +177,12 @@ __global__ void write_levels_kernel(const value_idx* children, * @param labels */ template -__global__ void inherit_labels(const value_idx* children, - const value_idx* levels, - std::size_t n_leaves, - value_idx* labels, - int cut_level, - value_idx n_vertices) +RAFT_KERNEL inherit_labels(const value_idx* children, + const value_idx* levels, + std::size_t n_leaves, + value_idx* labels, + int cut_level, + value_idx n_vertices) { value_idx tid = 
blockDim.x * blockIdx.x + threadIdx.x; diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh index ef046ab4ff..49ac6ae704 100644 --- a/cpp/include/raft/cluster/detail/connectivities.cuh +++ b/cpp/include/raft/cluster/detail/connectivities.cuh @@ -107,7 +107,7 @@ struct distance_graph_impl -__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz) +RAFT_KERNEL fill_indices2(value_idx* indices, size_t m, size_t nnz) { value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh index ade3a6e348..593d7d8fa9 100644 --- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -434,7 +434,7 @@ template -__global__ void __launch_bounds__((WarpSize * BlockDimY)) +__launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL adjust_centers_kernel(MathT* centers, // [n_clusters, dim] IdxT n_clusters, IdxT dim, diff --git a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh index 5a1479a81f..0b5dec4e19 100644 --- a/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_deprecated.cuh @@ -92,12 +92,12 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) +RAFT_KERNEL computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { // Loop index index_type_t i; @@ -173,11 +173,11 @@ static __global__ void computeDistances(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, - index_type_t k, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) +RAFT_KERNEL minDistances(index_type_t n, + index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { // Loop index index_type_t i, j; @@ -233,11 +233,11 @@ static __global__ void minDistances(index_type_t n, * @param code_new Index associated with new centroid. */ template -static __global__ void minDistances2(index_type_t n, - value_type_t* __restrict__ dists_old, - const value_type_t* __restrict__ dists_new, - index_type_t* __restrict__ codes_old, - index_type_t code_new) +RAFT_KERNEL minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) { // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -275,9 +275,9 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. 
*/ template -static __global__ void computeClusterSizes(index_type_t n, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) +RAFT_KERNEL computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { @@ -308,10 +308,10 @@ static __global__ void computeClusterSizes(index_type_t n, * column is the mean position of a cluster). */ template -static __global__ void divideCentroids(index_type_t d, - index_type_t k, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) +RAFT_KERNEL divideCentroids(index_type_t d, + index_type_t k, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { // Global indices index_type_t gidx, gidy; diff --git a/cpp/include/raft/common/detail/scatter.cuh b/cpp/include/raft/common/detail/scatter.cuh index 87a8826aa6..6e7522853e 100644 --- a/cpp/include/raft/common/detail/scatter.cuh +++ b/cpp/include/raft/common/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ namespace raft::detail { template -__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +RAFT_KERNEL scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) { typedef TxN_t DataVec; typedef TxN_t IdxVec; diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index b23660fefe..dd50f47786 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -329,8 +329,8 @@ __device__ auto increment_indices(IdxType* indices, * parameters. */ template -__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, - SrcType src) + +RAFT_KERNEL mdspan_copy_kernel(DstType dst, SrcType src) { using config = mdspan_copyable; diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index bb4207938b..364914043e 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -86,6 +86,38 @@ // as a weak symbol rather than a global." #define RAFT_WEAK_FUNCTION __attribute__((weak)) +// The RAFT_HIDDEN_FUNCTION macro specifies that the function will be hidden +// and therefore not callable by consumers of raft when compiled as +// a shared library. +// +// Hidden visibility also ensures that the linker doesn't de-duplicate the +// symbol across multiple `.so`. This allows multiple libraries to embed raft +// without issue. +#define RAFT_HIDDEN_FUNCTION __attribute__((visibility("hidden"))) + +// The RAFT_KERNEL macro specifies that a kernel has hidden visibility. +// +// RAFT needs to ensure that its __global__ function templates have +// hidden visibility (the default is weak visibility). +// +// When kernels have weak visibility, if two dynamic libraries +// both contain identical instantiations of a RAFT template, then the linker +// will discard one of the two instantiations and use only one of them.
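+// +// For example (`fill_kernel`, `libfoo.so`, and `libbar.so` below are +// illustrative placeholders, not RAFT code): if libfoo.so and libbar.so +// each compile +// +//   template <typename T> +//   __global__ void fill_kernel(T* out, int n) +//   { +//     int i = blockIdx.x * blockDim.x + threadIdx.x; +//     if (i < n) { out[i] = T{1}; } +//   } +// +// for T = float, both libraries emit the same weak symbol, and only one +// copy is kept when both are loaded into the same process.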
+// +// Due to unique requirements of how CUDA works, this de-duplication +// can lead to the wrong kernels being called (the SM version being wrong), +// silently calling no kernel at all, or CUDA runtime errors being +// thrown. +// +// https://github.com/rapidsai/raft/issues/1722 +#if defined(__CUDACC_RDC__) +#define RAFT_KERNEL RAFT_HIDDEN_FUNCTION __global__ void +#elif defined(_RAFT_HAS_CUDA) +#define RAFT_KERNEL static __global__ void +#else +#define RAFT_KERNEL static void +#endif + /** * Some macro magic to remove optional parentheses of a macro argument. * See https://stackoverflow.com/a/62984543 diff --git a/cpp/include/raft/distance/detail/compress_to_bits.cuh b/cpp/include/raft/distance/detail/compress_to_bits.cuh index fa0df25461..5ffb717c42 100644 --- a/cpp/include/raft/distance/detail/compress_to_bits.cuh +++ b/cpp/include/raft/distance/detail/compress_to_bits.cuh @@ -35,7 +35,7 @@ namespace raft::distance::detail { * Note: the division (`/`) is a ceilDiv. */ template ::value>> -__global__ void compress_to_bits_kernel( +RAFT_KERNEL compress_to_bits_kernel( raft::device_matrix_view in, raft::device_matrix_view out) { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index f0f12acdb1..2468dcd740 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -87,7 +87,7 @@ struct MinReduceOpImpl { }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +RAFT_KERNEL initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; if (tid < m) { redOp.init(min + tid, maxVal); } @@ -139,20 +139,20 @@ template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - OpT distance_op, - FinalLambda fin_op) +__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + OpT distance_op, + FinalLambda fin_op) { // compile only if below non-ampere arch.
#if __CUDA_ARCH__ < 800 diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh index f02e29c797..8d5b2c766e 100644 --- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -36,7 +36,7 @@ namespace raft::distance::kernels::detail { * @param offset */ template -__global__ void polynomial_kernel_nopad( +RAFT_KERNEL polynomial_kernel_nopad( math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) { for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; @@ -56,7 +56,7 @@ __global__ void polynomial_kernel_nopad( * @param offset */ template -__global__ void polynomial_kernel( +RAFT_KERNEL polynomial_kernel( math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; @@ -75,7 +75,7 @@ __global__ void polynomial_kernel( * @param offset */ template -__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) { for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; tid += blockDim.x * gridDim.x) { @@ -93,7 +93,7 @@ __global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t * @param offset */ template -__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; tidy += blockDim.y * gridDim.y) @@ -121,7 +121,7 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga * @param gain */ template -__global__ void rbf_kernel_expanded( +RAFT_KERNEL rbf_kernel_expanded( math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) { for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh index 0e13783c19..4de9f4764a 100644 --- a/cpp/include/raft/distance/detail/masked_nn.cuh +++ b/cpp/include/raft/distance/detail/masked_nn.cuh @@ -40,24 +40,24 @@ template -__global__ __launch_bounds__(P::Nthreads, 2) void masked_l2_nn_kernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const uint64_t* adj, - const IdxT* group_idxs, - IdxT num_groups, - IdxT m, - IdxT n, - IdxT k, - bool sqrt, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - CoreLambda core_op, - FinalLambda fin_op) +__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL masked_l2_nn_kernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const uint64_t* adj, + const IdxT* group_idxs, + IdxT num_groups, + IdxT m, + IdxT n, + IdxT k, + bool sqrt, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) { extern __shared__ char smem[]; diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index 2d0a98862e..5393bf7389 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -31,8 +31,8 @@ template -__global__ 
__launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel( - OpT distance_op, pairwise_matrix_params params) +__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL + pairwise_matrix_kernel(OpT distance_op, pairwise_matrix_params params) { // Early exit to minimize the size of the kernel when it is not supposed to be compiled. constexpr SM_compat_t sm_compat_range{}; diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh index 64d8b4bfae..6e432e050c 100644 --- a/cpp/include/raft/label/detail/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -119,13 +119,13 @@ void getOvrlabels( // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type* map_ids, - size_t N_labels, - Type* in, - Type* out, - size_t N, - Lambda filter_op, - bool zero_based = false) +RAFT_KERNEL map_label_kernel(Type* map_ids, + size_t N_labels, + Type* in, + Type* out, + size_t N, + Lambda filter_op, + bool zero_based = false) { int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh index f93a97d52b..166bb2122a 100644 --- a/cpp/include/raft/label/detail/merge_labels.cuh +++ b/cpp/include/raft/label/detail/merge_labels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,13 +32,12 @@ namespace detail { * For an additional cost we can build the graph with edges * E={(A[i], B[i]) | M[i]=1} and make this step faster */ template -__global__ void __launch_bounds__(TPB_X) - propagate_label_kernel(const value_idx* __restrict__ labels_a, - const value_idx* __restrict__ labels_b, - value_idx* __restrict__ R, - const bool* __restrict__ mask, - bool* __restrict__ m, - value_idx N) +RAFT_KERNEL __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, + const value_idx* __restrict__ labels_b, + value_idx* __restrict__ R, + const bool* __restrict__ mask, + bool* __restrict__ m, + value_idx N) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { @@ -65,12 +64,11 @@ __global__ void __launch_bounds__(TPB_X) } template -__global__ void __launch_bounds__(TPB_X) - reassign_label_kernel(value_idx* __restrict__ labels_a, - const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, - value_idx N, - value_idx MAX_LABEL) +RAFT_KERNEL __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, + const value_idx* __restrict__ labels_b, + const value_idx* __restrict__ R, + value_idx N, + value_idx MAX_LABEL) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh index bf9b2bd1d8..121ac10e24 100644 --- a/cpp/include/raft/linalg/detail/add.cuh +++ b/cpp/include/raft/linalg/detail/add.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,10 +38,10 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st } template -__global__ void add_dev_scalar_kernel(OutT* outDev, - const InT* inDev, - const InT* singleScalarDev, - IdxType len) +RAFT_KERNEL add_dev_scalar_kernel(OutT* outDev, + const InT* inDev, + const InT* singleScalarDev, + IdxType len) { IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 5b01196cf4..f3c150cbee 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -40,7 +40,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedReductionThinKernel(OutType* dots, const InType* data, IdxType D, @@ -137,15 +137,15 @@ template -__global__ void __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda final_op, - bool inplace = false) +RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda final_op, + bool inplace = false) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -225,7 +225,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedReductionThickKernel(OutType* buffer, const InType* data, IdxType D, diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 0c79dec248..4ff3aa9754 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -65,7 +65,7 @@ __device__ __forceinline__ void map_kernel_mainloop( } template -__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +RAFT_KERNEL map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) { const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; if constexpr (R <= 1) { diff --git a/cpp/include/raft/linalg/detail/map_then_reduce.cuh b/cpp/include/raft/linalg/detail/map_then_reduce.cuh index 6fae16117f..d1e211f8d2 100644 --- a/cpp/include/raft/linalg/detail/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/detail/map_then_reduce.cuh @@ -52,13 +52,13 @@ template -__global__ void mapThenReduceKernel(OutType* out, - IdxType len, - OutType neutral, - MapOp map, - ReduceLambda op, - const InType* in, - Args... args) +RAFT_KERNEL mapThenReduceKernel(OutType* out, + IdxType len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) { OutType acc = neutral; auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); diff --git a/cpp/include/raft/linalg/detail/normalize.cuh b/cpp/include/raft/linalg/detail/normalize.cuh index 78c773ab35..d1ca4816e5 100644 --- a/cpp/include/raft/linalg/detail/normalize.cuh +++ b/cpp/include/raft/linalg/detail/normalize.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ template -__global__ void __launch_bounds__(Policy::ThreadsPerBlock) +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalesced_normalize_thin_kernel(Type* out, const Type* in, IdxType D, @@ -92,15 +92,15 @@ template -__global__ void __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out, - const Type* in, - IdxType D, - IdxType N, - Type init, - MainLambda main_op, - ReduceLambda reduce_op, - FinalLambda fin_op, - Type eps) +RAFT_KERNEL __launch_bounds__(TPB) coalesced_normalize_medium_kernel(Type* out, + const Type* in, + IdxType D, + IdxType N, + Type init, + MainLambda main_op, + ReduceLambda reduce_op, + FinalLambda fin_op, + Type eps) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; diff --git a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh index a85e04acca..b726e3ea5a 100644 --- a/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_cols_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ namespace detail { ///@todo: specialize this to support shared-mem based atomics template -__global__ void reduce_cols_by_key_direct_kernel( +RAFT_KERNEL reduce_cols_by_key_direct_kernel( const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys) { typedef typename std::iterator_traits::value_type KeyType; @@ -44,7 +44,7 @@ __global__ void reduce_cols_by_key_direct_kernel( } template -__global__ void reduce_cols_by_key_cached_kernel( +RAFT_KERNEL reduce_cols_by_key_cached_kernel( const T* data, const KeyIteratorT keys, T* out, IdxType nrows, IdxType ncols, IdxType nkeys) { typedef typename std::iterator_traits::value_type KeyType; diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh index 572d6b738c..ce11825e12 100644 --- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,7 +33,7 @@ namespace detail { // template -void __global__ convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) +RAFT_KERNEL convert_array_kernel(IteratorT1 dst, IteratorT2 src, int n) { for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += gridDim.x * blockDim.x) { dst[idx] = src[idx]; @@ -95,14 +95,14 @@ struct quadSum { template __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4) - __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, - IdxT lda, - const char* d_keys, - const WeightT* d_weights, - IdxT nrows, - IdxT ncols, - IdxT nkeys, - SumsT* d_sums) + RAFT_KERNEL sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A, + IdxT lda, + const char* d_keys, + const WeightT* d_weights, + IdxT nrows, + IdxT ncols, + IdxT nkeys, + SumsT* d_sums) { typedef typename std::iterator_traits::value_type DataType; typedef cub::BlockReduce, SUM_ROWS_SMALL_K_DIMX> BlockReduce; @@ -193,15 +193,15 @@ template -__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, - IdxT lda, - KeysIteratorT d_keys, - const WeightT* d_weights, - IdxT nrows, - IdxT ncols, - int key_offset, - IdxT nkeys, - SumsT* d_sums) +RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A, + IdxT lda, + KeysIteratorT d_keys, + const WeightT* d_weights, + IdxT nrows, + IdxT ncols, + int key_offset, + IdxT nkeys, + SumsT* d_sums) { typedef typename std::iterator_traits::value_type KeyType; typedef typename std::iterator_traits::value_type DataType; @@ -269,13 +269,13 @@ template -__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, - IdxT lda, - const WeightT* d_weights, - KeysIteratorT d_keys, - IdxT nrows, - IdxT ncols, - SumsT* d_sums) +RAFT_KERNEL sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A, + IdxT lda, + const WeightT* d_weights, + KeysIteratorT d_keys, + IdxT nrows, + IdxT ncols, + SumsT* d_sums) { IdxT gid = threadIdx.x + (blockDim.x * static_cast(blockIdx.x)); IdxT j = gid % ncols; diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 42e79a9285..aef346bd4b 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -30,7 +30,7 @@ namespace detail { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel( +RAFT_KERNEL stridedSummationKernel( Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) { // Thread reduction @@ -68,13 +68,13 @@ template -__global__ void stridedReductionKernel(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op) +RAFT_KERNEL stridedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) { // Thread reduction OutType thread_data = init; diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index 6df09df8ed..6519d58fa1 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,10 +38,10 @@ void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream } template -__global__ void subtract_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) +RAFT_KERNEL subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) { // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh index 5df7ba3cdc..652c4fda0f 100644 --- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh +++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ struct SmemPerBlock { }; template -__global__ void devLayoutIdx(InType* in, int n_cols, int totalElements) +RAFT_KERNEL devLayoutIdx(InType* in, int n_cols, int totalElements) { int idx = threadIdx.x + blockDim.x * blockIdx.x; int n = n_cols; @@ -63,7 +63,7 @@ __global__ void devLayoutIdx(InType* in, int n_cols, int totalElements) } template -__global__ void devOffsetKernel(T* in, T value, int n_times) +RAFT_KERNEL devOffsetKernel(T* in, T value, int n_times) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < n_times) in[idx] = idx * value; @@ -76,12 +76,12 @@ template < int BLOCK_SIZE, int ITEMS_PER_THREAD, typename std::enable_if::IsValid, InType>::type* = nullptr> -__global__ void __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) +RAFT_KERNEL __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys, + InType* outputKeys, + OutType* inputVals, + int n_rows, + int n_cols, + InType MAX_VALUE) { typedef cub::BlockLoad BlockLoadTypeKey; @@ -124,12 +124,12 @@ template < int BLOCK_SIZE, int ITEMS_PER_THREAD, typename std::enable_if::IsValid), InType>::type* = nullptr> -__global__ void devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) +RAFT_KERNEL devKeyValSortColumnPerRow(const InType* inputKeys, + InType* outputKeys, + OutType* inputVals, + int n_rows, + int n_cols, + InType MAX_VALUE) { // place holder function // so that compiler unrolls for all template types successfully diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index 59fcf606c8..73072ec841 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -47,14 +47,14 @@ template -__global__ void gather_kernel(const InputIteratorT in, - IndexT D, - IndexT len, - const MapIteratorT map, - StencilIteratorT stencil, - OutputIteratorT out, - PredicateOp pred_op, - MapTransformOp transform_op) +RAFT_KERNEL gather_kernel(const InputIteratorT in, + IndexT D, + IndexT len, + const MapIteratorT map, + StencilIteratorT stencil, + OutputIteratorT out, + PredicateOp pred_op, + MapTransformOp transform_op) { typedef typename std::iterator_traits::value_type MapValueT; typedef typename std::iterator_traits::value_type StencilValueT; diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh 
b/cpp/include/raft/matrix/detail/linewise_op.cuh index 514d0dc51b..6061fe6aee 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -260,7 +260,7 @@ template -__global__ void __launch_bounds__(BlockSize) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecColsMainKernel(Type* out, const Type* in, const IdxType arrOffset, @@ -304,15 +304,14 @@ __global__ void __launch_bounds__(BlockSize) * @param [in] vecs pointers to the argument vectors */ template -__global__ void __launch_bounds__(MaxOffset, 2) - matrixLinewiseVecColsTailKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType arrTail, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(MaxOffset, 2) matrixLinewiseVecColsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... vecs) { // Note, L::VecElems == 1 typedef Linewise L; @@ -370,14 +369,13 @@ template -__global__ void __launch_bounds__(BlockSize) - matrixLinewiseVecRowsMainKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecRowsMainKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -413,14 +411,13 @@ template -__global__ void __launch_bounds__(BlockSize) - matrixLinewiseVecRowsSpanKernel(Type* out, - const Type* in, - const IdxType rowLen, - const IdxType rowLenPadded, - const IdxType lenPadded, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(BlockSize) matrixLinewiseVecRowsSpanKernel(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType rowLenPadded, + const IdxType lenPadded, + Lambda op, + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -457,15 +454,14 @@ __global__ void __launch_bounds__(BlockSize) * @param [in] vecs pointers to the argument vectors */ template -__global__ void __launch_bounds__(MaxOffset, 2) - matrixLinewiseVecRowsTailKernel(Type* out, - const Type* in, - const IdxType arrOffset, - const IdxType arrTail, - const IdxType rowLen, - const IdxType len, - Lambda op, - const Vecs*... vecs) +RAFT_KERNEL __launch_bounds__(MaxOffset, 2) matrixLinewiseVecRowsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + const Vecs*... 
vecs) { // Note, L::VecElems == 1 constexpr uint workSize = MaxOffset; diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index d2707e1254..9e9d7f8b3b 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -331,7 +331,7 @@ void matrixVectorBinarySub(Type* data, // Computes an argmin/argmax column-wise in a DxN matrix template -__global__ void argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out) +RAFT_KERNEL argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out) { typedef cub:: BlockReduce, TPB, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> @@ -396,7 +396,7 @@ void argmax(const math_t* in, idx_t D, idx_t N, out_t* out, cudaStream_t stream) // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T* d_in, int D, int N) +RAFT_KERNEL signFlipKernel(T* d_in, int D, int N) { typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 48821df5b2..2fa741fd96 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -169,8 +169,7 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) * (1-based) */ template -__global__ void slice( - const m_t* src_d, idx_t lda, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) +RAFT_KERNEL slice(const m_t* src_d, idx_t lda, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; @@ -211,7 +210,7 @@ void sliceMatrix(const m_t* in, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +RAFT_KERNEL getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; @@ -239,7 +238,7 @@ void copyUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, c * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t lda, idx_t k) +RAFT_KERNEL copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; @@ -254,7 +253,7 @@ __global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t ld * @param k: dimensionality */ template -__global__ void copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t lda, idx_t k) +RAFT_KERNEL copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t lda, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; @@ -290,7 +289,7 @@ void getDiagonalMatrix( * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t* in, idx_t len) +RAFT_KERNEL matrixDiagonalInverse(m_t* in, idx_t len) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index edde924892..b3c07b9d3a 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -422,16 +422,16 @@ _RAFT_DEVICE void last_filter(const T* in_buf, } template -__global__ void 
last_filter_kernel(const T* in, - const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out, - IdxT* out_idx, - IdxT len, - IdxT k, - Counter* counters, - const bool select_min) +RAFT_KERNEL last_filter_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out, + IdxT* out_idx, + IdxT len, + IdxT k, + Counter* counters, + const bool select_min) { const size_t batch_id = blockIdx.y; // size_t to avoid multiplication overflow @@ -525,20 +525,20 @@ __global__ void last_filter_kernel(const T* in, * their indices. */ template -__global__ void radix_kernel(const T* in, - const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, - T* out, - IdxT* out_idx, - Counter* counters, - IdxT* histograms, - const IdxT len, - const IdxT k, - const bool select_min, - const int pass) +RAFT_KERNEL radix_kernel(const T* in, + const IdxT* in_idx, + const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + Counter* counters, + IdxT* histograms, + const IdxT len, + const IdxT k, + const bool select_min, + const int pass) { const size_t batch_id = blockIdx.y; auto counter = counters + batch_id; @@ -920,17 +920,17 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, } template -__global__ void radix_topk_one_block_kernel(const T* in, - const IdxT* in_idx, - const IdxT len, - const IdxT k, - T* out, - IdxT* out_idx, - const bool select_min, - T* buf1, - IdxT* idx_buf1, - T* buf2, - IdxT* idx_buf2) +RAFT_KERNEL radix_topk_one_block_kernel(const T* in, + const IdxT* in_idx, + const IdxT len, + const IdxT k, + T* out, + IdxT* out_idx, + const bool select_min, + T* buf1, + IdxT* idx_buf1, + T* buf2, + IdxT* idx_buf2) { constexpr int num_buckets = calc_num_buckets(); __shared__ Counter counter; diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 2927604e7d..0ee87de4f7 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -56,7 +56,7 @@ the top-k result. Example: - __global__ void kernel() { + RAFT_KERNEL kernel() { block_sort queue(...); for (IdxT i = threadIdx.x; i < len, i += blockDim.x) { @@ -80,7 +80,7 @@ (see the usage of LaunchThreshold::len_factor_for_choosing). Example: - __global__ void kernel() { + RAFT_KERNEL kernel() { warp_sort_immediate<...> queue(...); int warp_id = threadIdx.x / WarpSize; int lane_id = threadIdx.x % WarpSize; @@ -750,8 +750,8 @@ template