diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 27f619f391..a2b12764c5 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
"args": {
"CUDA": "11.8",
"PYTHON_PACKAGE_MANAGER": "conda",
- "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index db2a5dbcc6..f319536b18 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,27 +5,24 @@
"args": {
"CUDA": "11.8",
"PYTHON_PACKAGE_MANAGER": "pip",
- "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": {
- "version": "1.15.0"
- },
- "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": {
+ "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {
"version": "11.8",
"installcuBLAS": true,
"installcuSOLVER": true,
"installcuRAND": true,
"installcuSPARSE": true
},
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index 836a5feacd..adc79408a3 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -5,17 +5,17 @@
"args": {
"CUDA": "12.5",
"PYTHON_PACKAGE_MANAGER": "conda",
- "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index 28798cbbf5..26b9a56e48 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,27 +5,24 @@
"args": {
"CUDA": "12.5",
"PYTHON_PACKAGE_MANAGER": "pip",
- "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04"
+ "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
- "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip"
+ "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
- "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": {
- "version": "1.15.0"
- },
- "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": {
+ "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {
"version": "12.5",
"installcuBLAS": true,
"installcuSOLVER": true,
"installcuRAND": true,
"installcuSPARSE": true
},
- "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
},
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index e6f7043f82..2b0ae5099c 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
jobs:
cpp-build:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
python-build:
needs: [cpp-build]
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
upload-conda:
needs: [cpp-build, python-build]
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
if: github.ref_type == 'branch'
needs: python-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
with:
arch: "amd64"
branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
sha: ${{ inputs.sha }}
wheel-build-pylibraft:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
wheel-publish-pylibraft:
needs: wheel-build-pylibraft
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
package-name: pylibraft
wheel-build-raft-dask:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -98,7 +98,7 @@ jobs:
wheel-publish-raft-dask:
needs: wheel-build-raft-dask
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index e6c9604221..381ca6b378 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,49 +25,49 @@ jobs:
- wheel-tests-raft-dask
- devcontainer
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
checks:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
with:
enable_check_generated_files: false
conda-cpp-build:
needs: checks
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
with:
build_type: pull-request
node_type: cpu16
conda-cpp-tests:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
with:
build_type: pull-request
conda-cpp-checks:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
with:
build_type: pull-request
enable_check_symbols: true
- symbol_exclusions: _ZN\d+raft_cutlass
+ symbol_exclusions: raft_cutlass
conda-python-build:
needs: conda-cpp-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
with:
build_type: pull-request
conda-python-tests:
needs: conda-python-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
with:
build_type: pull-request
docs-build:
needs: conda-python-build
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
with:
build_type: pull-request
node_type: "gpu-v100-latest-1"
@@ -77,34 +77,34 @@ jobs:
wheel-build-pylibraft:
needs: checks
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
with:
build_type: pull-request
script: ci/build_wheel_pylibraft.sh
wheel-tests-pylibraft:
needs: wheel-build-pylibraft
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
with:
build_type: pull-request
script: ci/test_wheel_pylibraft.sh
wheel-build-raft-dask:
needs: wheel-tests-pylibraft
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
with:
build_type: pull-request
script: "ci/build_wheel_raft_dask.sh"
wheel-tests-raft-dask:
needs: wheel-build-raft-dask
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
with:
build_type: pull-request
script: ci/test_wheel_raft_dask.sh
devcontainer:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
with:
arch: '["amd64"]'
cuda: '["12.5"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 0eba0f27d1..ad0456d526 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,17 +16,17 @@ on:
jobs:
conda-cpp-checks:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
enable_check_symbols: true
- symbol_exclusions: _ZN\d+raft_cutlass
+ symbol_exclusions: raft_cutlass
conda-cpp-tests:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
sha: ${{ inputs.sha }}
conda-python-tests:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
sha: ${{ inputs.sha }}
wheel-tests-pylibraft:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
script: ci/test_wheel_pylibraft.sh
wheel-tests-raft-dask:
secrets: inherit
- uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+ uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a0a4b4be87..458d8b1b51 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
# Explicitly specify the pyproject.toml at the repo root, not per-project.
args: ["--config", "pyproject.toml"]
- repo: https://github.com/PyCQA/flake8
- rev: 5.0.4
+ rev: 7.1.1
hooks:
- id: flake8
args: ["--config=.flake8"]
@@ -99,7 +99,7 @@ repos:
hooks:
- id: check-json
- repo: https://github.com/rapidsai/pre-commit-hooks
- rev: v0.2.0
+ rev: v0.4.0
hooks:
- id: verify-copyright
files: |
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0685145dca..9caa5ef571 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,48 @@
+# raft 24.10.00 (9 Oct 2024)
+
+## 🚨 Breaking Changes
+
+- [Feat] add `repeat`, `sparsity`, `eval_n_elements` APIs to `bitset` ([#2439](https://github.com/rapidsai/raft/pull/2439)) [@rhdong](https://github.com/rhdong)
+
+## 🐛 Bug Fixes
+
+- Disable NN Descent Batch tests temporarily ([#2453](https://github.com/rapidsai/raft/pull/2453)) [@divyegala](https://github.com/divyegala)
+- Fix sed syntax in `update-version.sh` ([#2441](https://github.com/rapidsai/raft/pull/2441)) [@raydouglass](https://github.com/raydouglass)
+- Use runtime check of cudart version for eig ([#2430](https://github.com/rapidsai/raft/pull/2430)) [@lowener](https://github.com/lowener)
+- [BUG] Fix bitset function visibility ([#2429](https://github.com/rapidsai/raft/pull/2429)) [@lowener](https://github.com/lowener)
+- Exclude any kernel symbol that uses cutlass ([#2425](https://github.com/rapidsai/raft/pull/2425)) [@robertmaynard](https://github.com/robertmaynard)
+
+## 🚀 New Features
+
+- [Feat] add `repeat`, `sparsity`, `eval_n_elements` APIs to `bitset` ([#2439](https://github.com/rapidsai/raft/pull/2439)) [@rhdong](https://github.com/rhdong)
+- [Opt] Enforce the UT Coverity and add benchmark for `transpose` ([#2438](https://github.com/rapidsai/raft/pull/2438)) [@rhdong](https://github.com/rhdong)
+- [FEA] Support for half-float mixed precise in brute-force ([#2382](https://github.com/rapidsai/raft/pull/2382)) [@rhdong](https://github.com/rhdong)
+
+## 🛠️ Improvements
+
+- bump NCCL floor to 2.19 ([#2458](https://github.com/rapidsai/raft/pull/2458)) [@jameslamb](https://github.com/jameslamb)
+- Deprecating vector search APIs and updating README accordingly ([#2448](https://github.com/rapidsai/raft/pull/2448)) [@cjnolet](https://github.com/cjnolet)
+- Update update-version.sh to use packaging lib ([#2447](https://github.com/rapidsai/raft/pull/2447)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Switch traceback to `native` ([#2446](https://github.com/rapidsai/raft/pull/2446)) [@galipremsagar](https://github.com/galipremsagar)
+- bump NCCL floor to 2.18.1.1 ([#2443](https://github.com/rapidsai/raft/pull/2443)) [@jameslamb](https://github.com/jameslamb)
+- Add missing `cuda_suffixed: true` ([#2440](https://github.com/rapidsai/raft/pull/2440)) [@trxcllnt](https://github.com/trxcllnt)
+- Use CI workflow branch 'branch-24.10' again ([#2437](https://github.com/rapidsai/raft/pull/2437)) [@jameslamb](https://github.com/jameslamb)
+- Update to flake8 7.1.1. ([#2435](https://github.com/rapidsai/raft/pull/2435)) [@bdice](https://github.com/bdice)
+- Update fmt (to 11.0.2) and spdlog (to 1.14.1). ([#2433](https://github.com/rapidsai/raft/pull/2433)) [@jameslamb](https://github.com/jameslamb)
+- Allow coo_sort to work on int64_t indices ([#2432](https://github.com/rapidsai/raft/pull/2432)) [@benfred](https://github.com/benfred)
+- Adding NCCL clique to the RAFT handle ([#2431](https://github.com/rapidsai/raft/pull/2431)) [@viclafargue](https://github.com/viclafargue)
+- Add support for Python 3.12 ([#2428](https://github.com/rapidsai/raft/pull/2428)) [@jameslamb](https://github.com/jameslamb)
+- Update rapidsai/pre-commit-hooks ([#2420](https://github.com/rapidsai/raft/pull/2420)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Drop Python 3.9 support ([#2417](https://github.com/rapidsai/raft/pull/2417)) [@jameslamb](https://github.com/jameslamb)
+- Use CUDA math wheels ([#2415](https://github.com/rapidsai/raft/pull/2415)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Remove NumPy <2 pin ([#2414](https://github.com/rapidsai/raft/pull/2414)) [@seberg](https://github.com/seberg)
+- Update pre-commit hooks ([#2409](https://github.com/rapidsai/raft/pull/2409)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Improve update-version.sh ([#2408](https://github.com/rapidsai/raft/pull/2408)) [@bdice](https://github.com/bdice)
+- Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version ([#2406](https://github.com/rapidsai/raft/pull/2406)) [@jameslamb](https://github.com/jameslamb)
+- [FEA] Batching NN Descent ([#2403](https://github.com/rapidsai/raft/pull/2403)) [@jinsolp](https://github.com/jinsolp)
+- Update pip devcontainers to UCX v1.17.0 ([#2401](https://github.com/rapidsai/raft/pull/2401)) [@jameslamb](https://github.com/jameslamb)
+- Merge branch-24.08 into branch-24.10 ([#2397](https://github.com/rapidsai/raft/pull/2397)) [@jameslamb](https://github.com/jameslamb)
+
# raft 24.08.00 (7 Aug 2024)
## 🚨 Breaking Changes
diff --git a/README.md b/README.md
index d6f3ef9320..af2219fdd1 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# RAFT: Reusable Accelerated Functions and Tools for Vector Search and More
> [!IMPORTANT]
-> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.08 (August) release.
+> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by the RAPIDS 24.10 (October) release and will remove them altogether in the 24.12 (December) release.
![RAFT tech stack](img/raft-tech-stack-vss.png)
@@ -36,7 +36,7 @@
## What is RAFT?
-RAFT contains fundamental widely-used algorithms and primitives for machine learning and information retrieval. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications.
+RAFT contains fundamental widely-used algorithms and primitives for machine learning and data mining. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications.
By taking a primitives-based approach to algorithm development, RAFT
- accelerates algorithm construction time
@@ -47,12 +47,10 @@ While not exhaustive, the following general categories help summarize the accelerated functions in RAFT:
#####
| Category | Accelerated Functions in RAFT |
|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------|
-| **Nearest Neighbors** | vector search, neighborhood graph construction, epsilon neighborhoods, pairwise distances |
-| **Basic Clustering** | spectral clustering, hierarchical clustering, k-means |
-| **Solvers** | combinatorial optimization, iterative solvers |
| **Data Formats** | sparse & dense, conversions, data generation |
| **Dense Operations** | linear algebra, matrix and vector operations, reductions, slicing, norms, factorization, least squares, svd & eigenvalue problems |
| **Sparse Operations** | linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling |
+| **Solvers** | combinatorial optimization, iterative solvers |
| **Statistics** | sampling, moments and summary statistics, metrics, model evaluation |
| **Tools & Utilities** | common tools and utilities for developing CUDA applications, multi-node multi-gpu infrastructure |
@@ -67,42 +65,6 @@ In addition being a C++ library, RAFT also provides 2 Python libraries:
![RAFT is a C++ header-only template library with optional shared library and lightweight Python wrappers](img/arch.png)
-## Use cases
-
-### Vector Similarity Search
-
-RAFT contains state-of-the-art implementations of approximate nearest neighbors search (ANNS) algorithms on the GPU, such as:
-
-* [Brute force](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#brute-force). Performs a brute force nearest neighbors search without an index.
-* [IVF-Flat](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-flat) and [IVF-PQ](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-pq). Use an inverted file index structure to map contents to their locations. IVF-PQ additionally uses product quantization to reduce the memory usage of vectors. These methods were originally popularized by the [FAISS](https://github.com/facebookresearch/faiss) library.
-* [CAGRA](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#cagra) (Cuda Anns GRAph-based). Uses a fast ANNS graph construction and search implementation optimized for the GPU. CAGRA outperforms state-of-the art CPU methods (i.e. HNSW) for large batch queries, single queries, and graph construction time.
-
-Projects that use the RAFT ANNS algorithms for accelerating vector search include: [Milvus](https://milvus.io/), [Redis](https://redis.io/), and [Faiss](https://github.com/facebookresearch/faiss).
-
-Please see the example [Jupyter notebook](https://github.com/rapidsai/raft/blob/HEAD/notebooks/VectorSearch_QuestionRetrieval.ipynb) to get started RAFT for vector search in Python.
-
-
-
-### Information Retrieval
-
-RAFT contains a catalog of reusable primitives for composing algorithms that require fast neighborhood computations, such as
-
-1. Computing distances between vectors and computing kernel gramm matrices
-2. Performing ball radius queries for constructing epsilon neighborhoods
-3. Clustering points to partition a space for smaller and faster searches
-4. Constructing neighborhood "connectivities" graphs from dense vectors
-
-### Machine Learning
-
-RAFT's primitives are used in several RAPIDS libraries, including [cuML](https://github.com/rapidsai/cuml), [cuGraph](https://github.com/rapidsai/cugraph), and [cuOpt](https://github.com/rapidsai/cuopt) to build many end-to-end machine learning algorithms that span a large spectrum of different applications, including
-- data generation
-- model evaluation
-- classification and regression
-- clustering
-- manifold learning
-- dimensionality reduction.
-
-RAFT is also used by the popular collaborative filtering library [implicit](https://github.com/benfred/implicit) for recommender systems.
## Is RAFT right for me?
@@ -293,7 +255,7 @@ You can also install the conda packages individually using the `mamba` command a
mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.5
```
-If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.08/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
+If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.10/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
### Installing Python through Pip
@@ -327,70 +289,3 @@ When citing RAFT generally, please consider referencing this Github project.
year={2022}
}
```
-If citing the sparse pairwise distances API, please consider using the following bibtex:
-```bibtex
-@article{nolet2021semiring,
- title={Semiring primitives for sparse neighborhood methods on the gpu},
- author={Nolet, Corey J and Gala, Divye and Raff, Edward and Eaton, Joe and Rees, Brad and Zedlewski, John and Oates, Tim},
- journal={arXiv preprint arXiv:2104.06357},
- year={2021}
-}
-```
-
-If citing the single-linkage agglomerative clustering APIs, please consider the following bibtex:
-```bibtex
-@misc{nolet2023cuslink,
- title={cuSLINK: Single-linkage Agglomerative Clustering on the GPU},
- author={Corey J. Nolet and Divye Gala and Alex Fender and Mahesh Doijade and Joe Eaton and Edward Raff and John Zedlewski and Brad Rees and Tim Oates},
- year={2023},
- eprint={2306.16354},
- archivePrefix={arXiv},
- primaryClass={cs.LG}
-}
-```
-
-If citing CAGRA, please consider the following bibtex:
-```bibtex
-@misc{ootomo2023cagra,
- title={CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search for GPUs},
- author={Hiroyuki Ootomo and Akira Naruse and Corey Nolet and Ray Wang and Tamas Feher and Yong Wang},
- year={2024},
- series = {ICDE '24}
-}
-```
-
-If citing the k-selection routines, please consider the following bibtex:
-
-```bibtex
-@proceedings{10.1145/3581784,
- title = {Parallel Top-K Algorithms on GPU: A Comprehensive Study and New Methods},
- author={Jingrong Zhang, Akira Naruse, Xipeng Li, and Yong Wang},
- year = {2023},
- isbn = {9798400701092},
- publisher = {Association for Computing Machinery},
- address = {New York, NY, USA},
- location = {Denver, CO, USA},
- series = {SC '23}
-}
-```
-
-If citing the nearest neighbors descent API, please consider the following bibtex:
-```bibtex
-@inproceedings{10.1145/3459637.3482344,
- author = {Wang, Hui and Zhao, Wan-Lei and Zeng, Xiangxiang and Yang, Jianye},
- title = {Fast K-NN Graph Construction by GPU Based NN-Descent},
- year = {2021},
- isbn = {9781450384469},
- publisher = {Association for Computing Machinery},
- address = {New York, NY, USA},
- url = {https://doi.org/10.1145/3459637.3482344},
- doi = {10.1145/3459637.3482344},
- abstract = {NN-Descent is a classic k-NN graph construction approach. It is still widely employed in machine learning, computer vision, and information retrieval tasks due to its efficiency and genericness. However, the current design only works well on CPU. In this paper, NN-Descent has been redesigned to adapt to the GPU architecture. A new graph update strategy called selective update is proposed. It reduces the data exchange between GPU cores and GPU global memory significantly, which is the processing bottleneck under GPU computation architecture. This redesign leads to full exploitation of the parallelism of the GPU hardware. In the meantime, the genericness, as well as the simplicity of NN-Descent, are well-preserved. Moreover, a procedure that allows to k-NN graph to be merged efficiently on GPU is proposed. It makes the construction of high-quality k-NN graphs for out-of-GPU-memory datasets tractable. Our approach is 100-250\texttimes{} faster than the single-thread NN-Descent and is 2.5-5\texttimes{} faster than the existing GPU-based approaches as we tested on million as well as billion scale datasets.},
- booktitle = {Proceedings of the 30th ACM International Conference on Information \& Knowledge Management},
- pages = {1929–1938},
- numpages = {10},
- keywords = {high-dimensional, nn-descent, gpu, k-nearest neighbor graph},
- location = {Virtual Event, Queensland, Australia},
- series = {CIKM '21}
-}
-```
diff --git a/VERSION b/VERSION
index ec8489fda9..7c7ba04436 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.08.00
+24.10.00
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 62d93a668e..e7ae52f33a 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -20,10 +20,29 @@ rapids-generate-version > VERSION
cd "${package_dir}"
+case "${RAPIDS_CUDA_VERSION}" in
+ 12.*)
+ EXCLUDE_ARGS=(
+ --exclude "libcublas.so.12"
+ --exclude "libcublasLt.so.12"
+ --exclude "libcurand.so.10"
+ --exclude "libcusolver.so.11"
+ --exclude "libcusparse.so.12"
+ --exclude "libnvJitLink.so.12"
+ --exclude "libucp.so.0"
+ )
+ ;;
+ 11.*)
+ EXCLUDE_ARGS=(
+ --exclude "libucp.so.0"
+ )
+ ;;
+esac
+
# Hardcode the output dir
python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
mkdir -p final_dist
-python -m auditwheel repair -w final_dist --exclude "libucp.so.0" dist/*
+python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh
index 895c311f46..ce9f0ed172 100755
--- a/ci/build_wheel_pylibraft.sh
+++ b/ci/build_wheel_pylibraft.sh
@@ -3,7 +3,16 @@
set -euo pipefail
+case "${RAPIDS_CUDA_VERSION}" in
+ 12.*)
+ EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
+ ;;
+ 11.*)
+ EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF"
+ ;;
+esac
+
# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
+export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF${EXTRA_CMAKE_ARGS}"
ci/build_wheel.sh pylibraft python/pylibraft
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 194ad9a07b..032b88b4aa 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -18,16 +18,15 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
-#Get <major>.<minor> for next version
+# Get <major>.<minor> for next version
NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
-NEXT_UCX_PY_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"
-NEXT_UCX_PY_VERSION="${NEXT_UCX_PY_SHORT_TAG}.*"
+NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"
# Need to distutils-normalize the original version
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
-NEXT_UCX_PY_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCX_PY_SHORT_TAG}'))")
+NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
+NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))")
echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
@@ -37,8 +36,8 @@ function sed_runner() {
}
sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake
-sed_runner 's/'"find_and_configure_ucxx(VERSION .*"'/'"find_and_configure_ucxx(VERSION ${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake
-sed_runner 's/'"branch-.*"'/'"branch-${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake
+sed_runner 's/'"find_and_configure_ucxx(VERSION .*"'/'"find_and_configure_ucxx(VERSION ${NEXT_UCXX_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake
+sed_runner 's/'"branch-.*"'/'"branch-${NEXT_UCXX_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake
# Centralized version file update
echo "${NEXT_FULL_TAG}" > VERSION
@@ -46,35 +45,33 @@ echo "${NEXT_FULL_TAG}" > VERSION
DEPENDENCIES=(
dask-cuda
pylibraft
- pylibraft-cu11
- pylibraft-cu12
rmm
- rmm-cu11
- rmm-cu12
rapids-dask-dependency
- # ucx-py and ucxx are handled separately below
+)
+UCXX_DEPENDENCIES=(
+ ucx-py
+ libucxx
+ distributed-ucxx
)
for FILE in dependencies.yaml conda/environments/*.yaml; do
for DEP in "${DEPENDENCIES[@]}"; do
- sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
+ sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
+ done
+ for DEP in "${UCXX_DEPENDENCIES[@]}"; do
+ sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXX_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
done
- sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
- sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
- sed_runner "/-.* ucx-py-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
- sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
- sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
- sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
- sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE};
done
for FILE in python/*/pyproject.toml; do
for DEP in "${DEPENDENCIES[@]}"; do
- sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
+ sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
+ done
+ for DEP in "${UCXX_DEPENDENCIES[@]}"; do
+ sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_UCXX_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
done
- sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
done
-sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml
-sed_runner "/^ucxx_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml
+sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCXX_SHORT_TAG_PEP440}.*\"/}" conda/recipes/raft-dask/conda_build_config.yaml
+sed_runner "/^ucxx_version:$/ {n;s/.*/ - \"${NEXT_UCXX_SHORT_TAG_PEP440}.*\"/}" conda/recipes/raft-dask/conda_build_config.yaml
for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index f2ae41822c..0b84772fad 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -20,8 +20,8 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
-- dask-cuda==24.8.*,>=0.0.0a0
-- distributed-ucxx==0.39.*,>=0.0.0a0
+- dask-cuda==24.10.*,>=0.0.0a0
+- distributed-ucxx==0.40.*,>=0.0.0a0
- doxygen>=1.8.20
- gcc_linux-aarch64=11.*
- graphviz
@@ -35,27 +35,27 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- libucxx==0.39.*,>=0.0.0a0
-- nccl>=2.9.9
+- libucxx==0.40.*,>=0.0.0a0
+- nccl>=2.19
- ninja
- numba>=0.57
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-aarch64=11.8
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.8.*,>=0.0.0a0
+- pylibraft==24.10.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
- recommonmark
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- scikit-learn
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sysroot_linux-aarch64==2.17
-- ucx-py==0.39.*,>=0.0.0a0
+- ucx-py==0.40.*,>=0.0.0a0
name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 8f5f4d3088..d1c01f1b16 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -20,8 +20,8 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
-- dask-cuda==24.8.*,>=0.0.0a0
-- distributed-ucxx==0.39.*,>=0.0.0a0
+- dask-cuda==24.10.*,>=0.0.0a0
+- distributed-ucxx==0.40.*,>=0.0.0a0
- doxygen>=1.8.20
- gcc_linux-64=11.*
- graphviz
@@ -35,27 +35,27 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- libucxx==0.39.*,>=0.0.0a0
-- nccl>=2.9.9
+- libucxx==0.40.*,>=0.0.0a0
+- nccl>=2.19
- ninja
- numba>=0.57
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-64=11.8
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.8.*,>=0.0.0a0
+- pylibraft==24.10.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
- recommonmark
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- scikit-learn
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sysroot_linux-64==2.17
-- ucx-py==0.39.*,>=0.0.0a0
+- ucx-py==0.40.*,>=0.0.0a0
name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 2042156224..4c506f5297 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -21,8 +21,8 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
-- dask-cuda==24.8.*,>=0.0.0a0
-- distributed-ucxx==0.39.*,>=0.0.0a0
+- dask-cuda==24.10.*,>=0.0.0a0
+- distributed-ucxx==0.40.*,>=0.0.0a0
- doxygen>=1.8.20
- gcc_linux-aarch64=11.*
- graphviz
@@ -32,26 +32,26 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- libucxx==0.39.*,>=0.0.0a0
-- nccl>=2.9.9
+- libucxx==0.40.*,>=0.0.0a0
+- nccl>=2.19
- ninja
- numba>=0.57
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
- numpydoc
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.8.*,>=0.0.0a0
+- pylibraft==24.10.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
- recommonmark
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- scikit-learn
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sysroot_linux-aarch64==2.17
-- ucx-py==0.39.*,>=0.0.0a0
+- ucx-py==0.40.*,>=0.0.0a0
name: all_cuda-125_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index a2586cc211..a123950e3a 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -21,8 +21,8 @@ dependencies:
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
-- dask-cuda==24.8.*,>=0.0.0a0
-- distributed-ucxx==0.39.*,>=0.0.0a0
+- dask-cuda==24.10.*,>=0.0.0a0
+- distributed-ucxx==0.40.*,>=0.0.0a0
- doxygen>=1.8.20
- gcc_linux-64=11.*
- graphviz
@@ -32,26 +32,26 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- libucxx==0.39.*,>=0.0.0a0
-- nccl>=2.9.9
+- libucxx==0.40.*,>=0.0.0a0
+- nccl>=2.19
- ninja
- numba>=0.57
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
- numpydoc
- pre-commit
- pydata-sphinx-theme
-- pylibraft==24.8.*,>=0.0.0a0
+- pylibraft==24.10.*,>=0.0.0a0
- pytest-cov
- pytest==7.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
- recommonmark
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- scikit-learn
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sysroot_linux-64==2.17
-- ucx-py==0.39.*,>=0.0.0a0
+- ucx-py==0.40.*,>=0.0.0a0
name: all_cuda-125_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 000a8f4a1c..864eb2130b 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -30,9 +30,9 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- libucxx==0.39.*,>=0.0.0a0
+- libucxx==0.40.*,>=0.0.0a0
- matplotlib
-- nccl>=2.9.9
+- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-aarch64=11.8
@@ -40,7 +40,7 @@ dependencies:
- pandas
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- sysroot_linux-aarch64==2.17
name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 52b3a8dc69..5da6eaf17e 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -30,9 +30,9 @@ dependencies:
- libcusolver=11.4.1.48
- libcusparse-dev=11.7.5.86
- libcusparse=11.7.5.86
-- libucxx==0.39.*,>=0.0.0a0
+- libucxx==0.40.*,>=0.0.0a0
- matplotlib
-- nccl>=2.9.9
+- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-64=11.8
@@ -40,7 +40,7 @@ dependencies:
- pandas
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- sysroot_linux-64==2.17
name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
index 27baeda4b8..65de97c170 100644
--- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
@@ -27,16 +27,16 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- libucxx==0.39.*,>=0.0.0a0
+- libucxx==0.40.*,>=0.0.0a0
- matplotlib
-- nccl>=2.9.9
+- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- openblas
- pandas
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- sysroot_linux-aarch64==2.17
name: bench_ann_cuda-120_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
index 5274d56bf6..7e1adbc483 100644
--- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
@@ -27,16 +27,16 @@ dependencies:
- libcurand-dev
- libcusolver-dev
- libcusparse-dev
-- libucxx==0.39.*,>=0.0.0a0
+- libucxx==0.40.*,>=0.0.0a0
- matplotlib
-- nccl>=2.9.9
+- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- openblas
- pandas
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.8.*,>=0.0.0a0
-- scikit-build-core>=0.7.0
+- rmm==24.10.*,>=0.0.0a0
+- scikit-build-core>=0.10.0
- sysroot_linux-64==2.17
name: bench_ann_cuda-120_arch-x86_64
diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index 00b133c821..bc0ff1fae7 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
- ">=3.26.4,!=3.30.0"
nccl_version:
- - ">=2.9.9"
+ - ">=2.19"
glog_version:
- ">=0.6.0"
@@ -73,7 +73,7 @@ cuda11_cuda_profiler_api_run_version:
- ">=11.4.240,<12"
spdlog_version:
- - ">=1.12.0,<1.13"
+ - ">=1.14.1,<1.15"
fmt_version:
- - ">=10.1.1,<11"
+ - ">=11.0.2,<12"
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index 31086e30aa..9d91af712e 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -54,7 +54,7 @@ requirements:
- libraft-headers {{ version }}
- python x.x
- rmm ={{ minor_version }}
- - scikit-build-core >=0.7.0
+ - scikit-build-core >=0.10.0
- rapids-build-backend>=0.3.0,<0.4.0.dev0
run:
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
@@ -65,7 +65,7 @@ requirements:
{% endif %}
- libraft {{ version }}
- libraft-headers {{ version }}
- - numpy >=1.23,<2.0a0
+ - numpy >=1.23,<3.0a0
- python x.x
- rmm ={{ minor_version }}
diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
index 70d1f0490e..ed6f708e14 100644
--- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
+++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
@@ -23,7 +23,7 @@ nlohmann_json_version:
- ">=3.11.2"
spdlog_version:
- - ">=1.12.0,<1.13"
+ - ">=1.14.1,<1.15"
fmt_version:
- - ">=10.1.1,<11"
+ - ">=11.0.2,<12"
diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml
index db0083b583..47bd730daf 100644
--- a/conda/recipes/raft-ann-bench/conda_build_config.yaml
+++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
- ">=3.26.4,!=3.30.0"
nccl_version:
- - ">=2.9.9"
+ - ">=2.19"
glog_version:
- ">=0.6.0"
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index decd1fad18..65c589fc0c 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -17,10 +17,13 @@ c_stdlib_version:
- "2.17"
ucx_py_version:
- - "0.39.*"
+ - "0.40.*"
ucxx_version:
- - "0.39.*"
+ - "0.40.*"
cmake_version:
- ">=3.26.4,!=3.30.0"
+
+nccl_version:
+ - ">=2.19"
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 641a11a241..bc13d352b7 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -50,11 +50,11 @@ requirements:
{% endif %}
- cuda-version ={{ cuda_version }}
- cython >=3.0.0
- - nccl >=2.9.9
+ - nccl {{ nccl_version }}
- pylibraft {{ version }}
- python x.x
- rmm ={{ minor_version }}
- - scikit-build-core >=0.7.0
+ - scikit-build-core >=0.10.0
- ucx-py {{ ucx_py_version }}
- ucxx {{ ucxx_version }}
- rapids-build-backend>=0.3.0,<0.4.0.dev0
@@ -68,7 +68,7 @@ requirements:
- dask-cuda ={{ minor_version }}
- rapids-dask-dependency ={{ minor_version }}
- joblib >=0.11
- - nccl >=2.9.9
+ - nccl {{ nccl_version }}
- pylibraft {{ version }}
- python x.x
- rmm ={{ minor_version }}
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 9d80cbaac2..52c63ad73b 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -132,6 +132,7 @@ if(BUILD_PRIMS_BENCH)
linalg/reduce_rows_by_key.cu
linalg/reduce.cu
linalg/sddmm.cu
+ linalg/transpose.cu
main.cpp
)
diff --git a/cpp/bench/prims/linalg/transpose.cu b/cpp/bench/prims/linalg/transpose.cu
new file mode 100644
index 0000000000..e60e50c125
--- /dev/null
+++ b/cpp/bench/prims/linalg/transpose.cu
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/linalg/transpose.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+template <typename IdxT>
+struct transpose_input {
+ IdxT rows, cols;
+};
+
+template <typename IdxT>
+inline auto operator<<(std::ostream& os, const transpose_input<IdxT>& p) -> std::ostream&
+{
+ os << p.rows << "#" << p.cols;
+ return os;
+}
+
+template <typename T, typename IdxT>
+struct TransposeBench : public fixture {
+  TransposeBench(const transpose_input<IdxT>& p)
+ : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream)
+ {
+ raft::random::RngState rng{1234};
+ raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0);
+ }
+
+ void run_benchmark(::benchmark::State& state) override
+ {
+ std::ostringstream label_stream;
+ label_stream << params;
+ state.SetLabel(label_stream.str());
+
+ loop_on_state(state, [this]() {
+ auto input_view =
+ raft::make_device_matrix_view(in.data(), params.rows, params.cols);
+ auto output_view = raft::make_device_vector_view(out.data(), params.rows);
+ raft::linalg::transpose(handle,
+ input_view.data_handle(),
+ output_view.data_handle(),
+ params.rows,
+ params.cols,
+ handle.get_stream());
+ });
+ }
+
+ private:
+  transpose_input<IdxT> params;
+  rmm::device_uvector<T> in, out;
+}; // struct TransposeBench
+
+const std::vector<transpose_input<int>> transpose_inputs_i32 =
+  raft::util::itertools::product<transpose_input<int>>({10, 128, 256, 512, 1024},
+                                                       {10000, 100000, 1000000});
+
+RAFT_BENCH_REGISTER((TransposeBench<float, int>), "", transpose_inputs_i32);
+RAFT_BENCH_REGISTER((TransposeBench<half, int>), "", transpose_inputs_i32);
+
+RAFT_BENCH_REGISTER((TransposeBench<int32_t, int>), "", transpose_inputs_i32);
+RAFT_BENCH_REGISTER((TransposeBench<int64_t, int>), "", transpose_inputs_i32);
+
+} // namespace raft::bench::linalg
diff --git a/cpp/bench/prims/util/popc.cu b/cpp/bench/prims/util/popc.cu
index 249dc13d1e..c6249fb2bd 100644
--- a/cpp/bench/prims/util/popc.cu
+++ b/cpp/bench/prims/util/popc.cu
@@ -89,10 +89,9 @@ struct popc_bench : public fixture {
auto bits_view =
raft::make_device_vector_view(bits_d.data_handle(), bits_d.size());
- index_t max_len = params.n_rows * params.n_cols;
- auto max_len_view = raft::make_host_scalar_view(&max_len);
- auto nnz_actual_view =
- nnz_actual_d.view(); // raft::make_device_scalar_view(nnz_actual_d.data_handle());
+ index_t max_len = params.n_rows * params.n_cols;
+ auto max_len_view = raft::make_host_scalar_view(&max_len);
+ auto nnz_actual_view = nnz_actual_d.view();
raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view);
});
}
diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake
index 7be7804c7e..57e38c2638 100644
--- a/cpp/cmake/thirdparty/get_spdlog.cmake
+++ b/cpp/cmake/thirdparty/get_spdlog.cmake
@@ -1,5 +1,5 @@
# =============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -19,15 +19,6 @@ function(find_and_configure_spdlog)
rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports)
rapids_export_package(BUILD spdlog rmm-exports)
- if(spdlog_ADDED)
- rapids_export(
- BUILD spdlog
- EXPORT_SET spdlog
- GLOBAL_TARGETS spdlog spdlog_header_only
- NAMESPACE spdlog::)
- include("${rapids-cmake-dir}/export/find_package_root.cmake")
- rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET rmm-exports)
- endif()
endfunction()
find_and_configure_spdlog()
\ No newline at end of file
diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh
index eb28cc1626..38318e8ec8 100644
--- a/cpp/include/raft/cluster/kmeans.cuh
+++ b/cpp/include/raft/cluster/kmeans.cuh
@@ -86,13 +86,14 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
* @param[out] n_iter Number of iterations run.
*/
template <typename DataT, typename IndexT>
-void fit(raft::resources const& handle,
-         const KMeansParams& params,
-         raft::device_matrix_view<const DataT, IndexT> X,
-         std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
-         raft::device_matrix_view<DataT, IndexT> centroids,
-         raft::host_scalar_view<DataT> inertia,
-         raft::host_scalar_view<IndexT> n_iter)
+[[deprecated("Use cuVS instead")]] void fit(
+  raft::resources const& handle,
+  const KMeansParams& params,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
+  raft::device_matrix_view<DataT, IndexT> centroids,
+  raft::host_scalar_view<DataT> inertia,
+  raft::host_scalar_view<IndexT> n_iter)
{
  detail::kmeans_fit<DataT, IndexT>(handle, params, X, sample_weight, centroids, inertia, n_iter);
}
@@ -150,14 +151,15 @@ void fit(raft::resources const& handle,
* their closest cluster center.
*/
template <typename DataT, typename IndexT>
-void predict(raft::resources const& handle,
-             const KMeansParams& params,
-             raft::device_matrix_view<const DataT, IndexT> X,
-             std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
-             raft::device_matrix_view<const DataT, IndexT> centroids,
-             raft::device_vector_view<IndexT, IndexT> labels,
-             bool normalize_weight,
-             raft::host_scalar_view<DataT> inertia)
+[[deprecated("Use cuVS instead")]] void predict(
+  raft::resources const& handle,
+  const KMeansParams& params,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
+  raft::device_matrix_view<const DataT, IndexT> centroids,
+  raft::device_vector_view<IndexT, IndexT> labels,
+  bool normalize_weight,
+  raft::host_scalar_view<DataT> inertia)
{
  detail::kmeans_predict<DataT, IndexT>(
handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
@@ -213,14 +215,15 @@ void predict(raft::resources const& handle,
* @param[out] n_iter Number of iterations run.
*/
template <typename DataT, typename IndexT>
-void fit_predict(raft::resources const& handle,
-                 const KMeansParams& params,
-                 raft::device_matrix_view<const DataT, IndexT> X,
-                 std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
-                 std::optional<raft::device_matrix_view<DataT, IndexT>> centroids,
-                 raft::device_vector_view<IndexT, IndexT> labels,
-                 raft::host_scalar_view<DataT> inertia,
-                 raft::host_scalar_view<IndexT> n_iter)
+[[deprecated("Use cuVS instead")]] void fit_predict(
+  raft::resources const& handle,
+  const KMeansParams& params,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  std::optional<raft::device_vector_view<const DataT, IndexT>> sample_weight,
+  std::optional<raft::device_matrix_view<DataT, IndexT>> centroids,
+  raft::device_vector_view<IndexT, IndexT> labels,
+  raft::host_scalar_view<DataT> inertia,
+  raft::host_scalar_view<IndexT> n_iter)
{
  detail::kmeans_fit_predict<DataT, IndexT>(
handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
@@ -252,13 +255,13 @@ void transform(raft::resources const& handle,
}
template <typename DataT, typename IndexT>
-void transform(raft::resources const& handle,
- const KMeansParams& params,
- const DataT* X,
- const DataT* centroids,
- IndexT n_samples,
- IndexT n_features,
- DataT* X_new)
+[[deprecated("Use cuVS instead")]] void transform(raft::resources const& handle,
+ const KMeansParams& params,
+ const DataT* X,
+ const DataT* centroids,
+ IndexT n_samples,
+ IndexT n_features,
+ DataT* X_new)
{
  detail::kmeans_transform<DataT, IndexT>(
handle, params, X, centroids, n_samples, n_features, X_new);
diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh
index a1a182608b..7479047fce 100644
--- a/cpp/include/raft/cluster/kmeans_balanced.cuh
+++ b/cpp/include/raft/cluster/kmeans_balanced.cuh
@@ -73,11 +73,11 @@ namespace raft::cluster::kmeans_balanced {
* datatype. If DataT == MathT, this must be the identity.
*/
template <typename DataT, typename MathT, typename IndexT, typename MappingOpT = raft::identity_op>
-void fit(const raft::resources& handle,
-         kmeans_balanced_params const& params,
-         raft::device_matrix_view<const DataT, IndexT> X,
-         raft::device_matrix_view<MathT, IndexT> centroids,
-         MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void fit(const raft::resources& handle,
+                                            kmeans_balanced_params const& params,
+                                            raft::device_matrix_view<const DataT, IndexT> X,
+                                            raft::device_matrix_view<MathT, IndexT> centroids,
+                                            MappingOpT mapping_op = raft::identity_op())
{
RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
"Number of features in dataset and centroids are different");
@@ -131,12 +131,13 @@ template <typename DataT,
          typename MathT,
          typename IndexT,
          typename LabelT,
          typename MappingOpT = raft::identity_op>
-void predict(const raft::resources& handle,
-             kmeans_balanced_params const& params,
-             raft::device_matrix_view<const DataT, IndexT> X,
-             raft::device_matrix_view<const MathT, IndexT> centroids,
-             raft::device_vector_view<LabelT, IndexT> labels,
-             MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void predict(
+  const raft::resources& handle,
+  kmeans_balanced_params const& params,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  raft::device_matrix_view<const MathT, IndexT> centroids,
+  raft::device_vector_view<LabelT, IndexT> labels,
+  MappingOpT mapping_op = raft::identity_op())
{
RAFT_EXPECTS(X.extent(0) == labels.extent(0),
"Number of rows in dataset and labels are different");
@@ -196,12 +197,13 @@ template <typename DataT,
          typename MathT,
          typename IndexT,
          typename LabelT,
          typename MappingOpT = raft::identity_op>
-void fit_predict(const raft::resources& handle,
-                 kmeans_balanced_params const& params,
-                 raft::device_matrix_view<const DataT, IndexT> X,
-                 raft::device_matrix_view<MathT, IndexT> centroids,
-                 raft::device_vector_view<LabelT, IndexT> labels,
-                 MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void fit_predict(
+  const raft::resources& handle,
+  kmeans_balanced_params const& params,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  raft::device_matrix_view<MathT, IndexT> centroids,
+  raft::device_vector_view<LabelT, IndexT> labels,
+  MappingOpT mapping_op = raft::identity_op())
{
   auto centroids_const = raft::make_device_matrix_view<const MathT, IndexT>(
     centroids.data_handle(), centroids.extent(0), centroids.extent(1));
@@ -255,14 +257,15 @@ template <typename DataT,
           typename MathT,
           typename IndexT,
           typename LabelT,
           typename CounterT,
           typename MappingOpT = raft::identity_op>
-void build_clusters(const raft::resources& handle,
-                    const kmeans_balanced_params& params,
-                    raft::device_matrix_view<const DataT, IndexT> X,
-                    raft::device_matrix_view<MathT, IndexT> centroids,
-                    raft::device_vector_view<LabelT, IndexT> labels,
-                    raft::device_vector_view<CounterT, IndexT> cluster_sizes,
-                    MappingOpT mapping_op = raft::identity_op(),
-                    std::optional<raft::device_vector_view<const MathT, IndexT>> X_norm = std::nullopt)
+[[deprecated("Use cuVS instead")]] void build_clusters(
+  const raft::resources& handle,
+  const kmeans_balanced_params& params,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  raft::device_matrix_view<MathT, IndexT> centroids,
+  raft::device_vector_view<LabelT, IndexT> labels,
+  raft::device_vector_view<CounterT, IndexT> cluster_sizes,
+  MappingOpT mapping_op = raft::identity_op(),
+  std::optional<raft::device_vector_view<const MathT, IndexT>> X_norm = std::nullopt)
{
RAFT_EXPECTS(X.extent(0) == labels.extent(0),
"Number of rows in dataset and labels are different");
@@ -334,13 +337,14 @@ template <typename DataT,
           typename MathT,
           typename IndexT,
           typename LabelT,
           typename CounterT,
           typename MappingOpT = raft::identity_op>
-void calc_centers_and_sizes(const raft::resources& handle,
-                            raft::device_matrix_view<const DataT, IndexT> X,
-                            raft::device_vector_view<const LabelT, IndexT> labels,
-                            raft::device_matrix_view<MathT, IndexT> centroids,
-                            raft::device_vector_view<CounterT, IndexT> cluster_sizes,
-                            bool reset_counters = true,
-                            MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void calc_centers_and_sizes(
+  const raft::resources& handle,
+  raft::device_matrix_view<const DataT, IndexT> X,
+  raft::device_vector_view<const LabelT, IndexT> labels,
+  raft::device_matrix_view<MathT, IndexT> centroids,
+  raft::device_vector_view<CounterT, IndexT> cluster_sizes,
+  bool reset_counters = true,
+  MappingOpT mapping_op = raft::identity_op())
{
RAFT_EXPECTS(X.extent(0) == labels.extent(0),
"Number of rows in dataset and labels are different");
diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh
index d9eba6edc5..067445c542 100644
--- a/cpp/include/raft/cluster/single_linkage.cuh
+++ b/cpp/include/raft/cluster/single_linkage.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -50,14 +50,14 @@ namespace raft::cluster {
 template <typename value_idx,
           typename value_t,
           LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
-void single_linkage(raft::resources const& handle,
-                    const value_t* X,
-                    size_t m,
-                    size_t n,
-                    raft::distance::DistanceType metric,
-                    linkage_output<value_idx>* out,
-                    int c,
-                    size_t n_clusters)
+[[deprecated("Use cuVS instead")]] void single_linkage(raft::resources const& handle,
+                                                       const value_t* X,
+                                                       size_t m,
+                                                       size_t n,
+                                                       raft::distance::DistanceType metric,
+                                                       linkage_output<value_idx>* out,
+                                                       int c,
+                                                       size_t n_clusters)
 {
   detail::single_linkage<value_idx, value_t, dist_type>(
     handle, X, m, n, metric, out, c, n_clusters);
@@ -87,13 +87,14 @@ constexpr int DEFAULT_CONST_C = 15;
control of k. The algorithm will set `k = log(n) + c`
*/
 template <typename value_t,
           typename idx_t,
           LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
-void single_linkage(raft::resources const& handle,
-                    raft::device_matrix_view<const value_t, idx_t, row_major> X,
-                    raft::device_matrix_view<idx_t, idx_t, row_major> dendrogram,
-                    raft::device_vector_view<idx_t, idx_t> labels,
-                    raft::distance::DistanceType metric,
-                    size_t n_clusters,
-                    std::optional<int> c = std::make_optional<int>(DEFAULT_CONST_C))
+[[deprecated("Use cuVS instead")]] void single_linkage(
+  raft::resources const& handle,
+  raft::device_matrix_view<const value_t, idx_t, row_major> X,
+  raft::device_matrix_view<idx_t, idx_t, row_major> dendrogram,
+  raft::device_vector_view<idx_t, idx_t> labels,
+  raft::distance::DistanceType metric,
+  size_t n_clusters,
+  std::optional<int> c = std::make_optional<int>(DEFAULT_CONST_C))
 {
   linkage_output<idx_t> out_arrs;
out_arrs.children = dendrogram.data_handle();
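For context, a hedged usage sketch of the mdspan overload above (extents and metric are illustrative; a dendrogram holds n_rows - 1 merge steps of two children each):

  void run_single_linkage(raft::resources const& handle, int n_rows, int n_cols, size_t n_clusters)
  {
    auto X          = raft::make_device_matrix<float, int>(handle, n_rows, n_cols);
    auto dendrogram = raft::make_device_matrix<int, int>(handle, n_rows - 1, 2);
    auto labels     = raft::make_device_vector<int, int>(handle, n_rows);
    // ... fill X with row-major feature data ...
    raft::cluster::single_linkage(handle,
                                  raft::make_const_mdspan(X.view()),
                                  dendrogram.view(),
                                  labels.view(),
                                  raft::distance::DistanceType::L2SqrtExpanded,
                                  n_clusters);
  }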
diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp
index c5d64f6a29..ed869e6cae 100644
--- a/cpp/include/raft/comms/detail/std_comms.hpp
+++ b/cpp/include/raft/comms/detail/std_comms.hpp
@@ -310,13 +310,13 @@ class std_comms : public comms_iface {
       // Wait for a UCXX progress thread roundtrip; avoid waiting longer than
       // 10ms per operation, since the next iteration will retry.
ucxx::utils::CallbackNotifier callbackNotifierPre{};
- worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); },
- 10000000 /* 10ms */);
+ (void)worker->registerGenericPre(
+ [&callbackNotifierPre]() { callbackNotifierPre.set(); }, 10000000 /* 10ms */);
callbackNotifierPre.wait();
ucxx::utils::CallbackNotifier callbackNotifierPost{};
- worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); },
- 10000000 /* 10ms */);
+ (void)worker->registerGenericPost(
+ [&callbackNotifierPost]() { callbackNotifierPost.set(); }, 10000000 /* 10ms */);
callbackNotifierPost.wait();
} else {
// Causes UCXX to progress through the send/recv message queue
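The (void) casts above are deliberate: newer UCXX releases flag the return value of registerGenericPre/registerGenericPost so that callers must acknowledge it. Here the result (whether the callback ran within the timeout) is intentionally unused, because the surrounding progress loop retries on the next iteration. A generic sketch of the idiom (the function below is hypothetical, not the UCXX API):

  [[nodiscard]] bool try_progress_once();  // hypothetical nodiscard call

  void progress_loop()
  {
    // Explicitly discard the result: a timed-out attempt is retried by the
    // caller's loop, so ignoring one return value here is intentional.
    (void)try_progress_once();
  }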
diff --git a/cpp/include/raft/comms/nccl_clique.hpp b/cpp/include/raft/comms/nccl_clique.hpp
new file mode 100644
index 0000000000..c6520af753
--- /dev/null
+++ b/cpp/include/raft/comms/nccl_clique.hpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/device_resources.hpp>
+
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <nccl.h>
+
+/**
+ * @brief Error checking macro for NCCL runtime API functions.
+ *
+ * Invokes an NCCL runtime API function; if the call does not return ncclSuccess, throws an
+ * exception detailing the NCCL error that occurred
+ */
+#define RAFT_NCCL_TRY(call) \
+ do { \
+ ncclResult_t const status = (call); \
+ if (ncclSuccess != status) { \
+ std::string msg{}; \
+ SET_ERROR_MSG(msg, \
+ "NCCL error encountered at: ", \
+ "call='%s', Reason=%d:%s", \
+ #call, \
+ status, \
+ ncclGetErrorString(status)); \
+ throw raft::logic_error(msg); \
+ } \
+ } while (0);
+
+namespace raft::comms {
+void build_comms_nccl_only(raft::resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank);
+}
+
+namespace raft::comms {
+
+struct nccl_clique {
+  using pool_mr = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+
+ /**
+   * Instantiates an NCCL clique with all available GPUs
+   *
+   * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as a memory pool
+ *
+ */
+ nccl_clique(int percent_of_free_memory = 80)
+ : root_rank_(0),
+ percent_of_free_memory_(percent_of_free_memory),
+ per_device_pools_(0),
+ device_resources_(0)
+ {
+ cudaGetDeviceCount(&num_ranks_);
+ device_ids_.resize(num_ranks_);
+ std::iota(device_ids_.begin(), device_ids_.end(), 0);
+ nccl_comms_.resize(num_ranks_);
+ nccl_clique_init();
+ }
+
+ /**
+   * Instantiates an NCCL clique
+ *
+ * Usage example:
+ * @code{.cpp}
+ * int n_devices;
+ * cudaGetDeviceCount(&n_devices);
+   *   std::vector<int> device_ids(n_devices);
+   *   std::iota(device_ids.begin(), device_ids.end(), 0);
+   *   raft::comms::nccl_clique clique(device_ids); // first device is the root rank
+ * @endcode
+ *
+   * @param[in] device_ids list of device IDs used to initialize the clique
+   * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as a memory pool
+ *
+ */
+  nccl_clique(const std::vector<int>& device_ids, int percent_of_free_memory = 80)
+ : root_rank_(0),
+ num_ranks_(device_ids.size()),
+ percent_of_free_memory_(percent_of_free_memory),
+ device_ids_(device_ids),
+ nccl_comms_(device_ids.size()),
+ per_device_pools_(0),
+ device_resources_(0)
+ {
+ nccl_clique_init();
+ }
+
+ void nccl_clique_init()
+ {
+ RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), num_ranks_, device_ids_.data()));
+
+ for (int rank = 0; rank < num_ranks_; rank++) {
+ RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
+
+ // create a pool memory resource for each device
+ auto old_mr = rmm::mr::get_current_device_resource();
+      per_device_pools_.push_back(std::make_unique<pool_mr>(
+        old_mr, rmm::percent_of_free_device_memory(percent_of_free_memory_)));
+ rmm::cuda_device_id id(device_ids_[rank]);
+ rmm::mr::set_per_device_resource(id, per_device_pools_.back().get());
+
+ // create a device resource handle for each device
+ device_resources_.emplace_back();
+
+ // add NCCL communications to the device resource handle
+ raft::comms::build_comms_nccl_only(
+ &device_resources_[rank], nccl_comms_[rank], num_ranks_, rank);
+ }
+
+ for (int rank = 0; rank < num_ranks_; rank++) {
+ RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
+ raft::resource::sync_stream(device_resources_[rank]);
+ }
+ }
+
+ const raft::device_resources& set_current_device_to_root_rank() const
+ {
+ int root_device_id = device_ids_[root_rank_];
+ RAFT_CUDA_TRY(cudaSetDevice(root_device_id));
+ return device_resources_[root_rank_];
+ }
+
+ ~nccl_clique()
+ {
+#pragma omp parallel for // necessary to avoid hangs
+ for (int rank = 0; rank < num_ranks_; rank++) {
+ cudaSetDevice(device_ids_[rank]);
+ ncclCommDestroy(nccl_comms_[rank]);
+ rmm::cuda_device_id id(device_ids_[rank]);
+ rmm::mr::set_per_device_resource(id, nullptr);
+ }
+ }
+
+ int root_rank_;
+ int num_ranks_;
+ int percent_of_free_memory_;
+  std::vector<int> device_ids_;
+  std::vector<ncclComm_t> nccl_comms_;
+  std::vector<std::unique_ptr<pool_mr>> per_device_pools_;
+  std::vector<raft::device_resources> device_resources_;
+};
+
+} // namespace raft::comms
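For orientation, a hedged sketch of how this new header might be used (the surrounding function is illustrative, not part of the header):

  #include <raft/comms/nccl_clique.hpp>

  void example()
  {
    // One NCCL communicator, one pooled memory resource, and one
    // raft::device_resources handle per visible GPU (80% pool by default).
    raft::comms::nccl_clique clique;

    // Switches the current CUDA device to the root GPU and returns rank 0's
    // handle, whose comms were injected by build_comms_nccl_only().
    const raft::device_resources& root_handle = clique.set_current_device_to_root_rank();
    (void)root_handle;  // issue per-rank collectives / kernels from here
  }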
diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh
index cafd1977ab..024b1244a6 100644
--- a/cpp/include/raft/core/bitmap.cuh
+++ b/cpp/include/raft/core/bitmap.cuh
@@ -35,9 +35,9 @@ _RAFT_HOST_DEVICE inline bool bitmap_view<bitset_t, index_t>::test(const index_t
 }
 
 template <typename bitset_t, typename index_t>
-_RAFT_HOST_DEVICE void bitmap_view<bitset_t, index_t>::set(const index_t row,
-                                                           const index_t col,
-                                                           bool new_value) const
+_RAFT_DEVICE void bitmap_view<bitset_t, index_t>::set(const index_t row,
+                                                      const index_t col,
+                                                      bool new_value) const
{
set(row * cols_ + col, new_value);
}
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index 0cdb4c1fb6..d1bffdb81e 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -26,6 +26,8 @@
#include
#include
+#include
+
#include
namespace raft::core {
@@ -46,8 +48,8 @@ _RAFT_HOST_DEVICE bool bitset_view<bitset_t, index_t>::operator[](const index_t
 }
 
 template <typename bitset_t, typename index_t>
-_RAFT_HOST_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index,
-                                                           bool set_value) const
+_RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index,
+                                                      bool set_value) const
{
const index_t bit_element = sample_index / bitset_element_size;
const index_t bit_index = sample_index % bitset_element_size;
@@ -61,9 +63,106 @@ _RAFT_HOST_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_
 }
 
 template <typename bitset_t, typename index_t>
-_RAFT_HOST_DEVICE inline index_t bitset_view<bitset_t, index_t>::n_elements() const
+void bitset_view<bitset_t, index_t>::count(const raft::resources& res,
+                                           raft::device_scalar_view<index_t> count_gpu_scalar) const
+{
+  auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
+  auto values  = raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
+  raft::popc(res, values, max_len, count_gpu_scalar);
+}
+
+template <typename bitset_t, typename index_t>
+RAFT_KERNEL bitset_repeat_kernel(const bitset_t* src,
+ bitset_t* output,
+ index_t src_bit_len,
+ index_t repeat_times)
+{
+ constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+ int output_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+ index_t total_bits = src_bit_len * repeat_times;
+ index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;
+ index_t src_size = (src_bit_len + bits_per_element - 1) / bits_per_element;
+
+ if (output_idx < output_size) {
+ bitset_t result = 0;
+ index_t bit_written = 0;
+
+ index_t start_bit = output_idx * bits_per_element;
+
+ while (bit_written < bits_per_element && start_bit + bit_written < total_bits) {
+ index_t bit_idx = (start_bit + bit_written) % src_bit_len;
+ index_t src_word_idx = bit_idx / bits_per_element;
+ index_t src_offset = bit_idx % bits_per_element;
+
+ index_t remaining_bits = min(bits_per_element - bit_written, src_bit_len - bit_idx);
+
+ bitset_t src_value = (src[src_word_idx] >> src_offset);
+
+ if (src_offset + remaining_bits > bits_per_element) {
+ bitset_t next_value = src[(src_word_idx + 1) % src_size];
+ src_value |= (next_value << (bits_per_element - src_offset));
+ }
+ src_value &= ((bitset_t{1} << remaining_bits) - 1);
+ result |= (src_value << bit_written);
+ bit_written += remaining_bits;
+ }
+ output[output_idx] = result;
+ }
+}
+
+template <typename bitset_t, typename index_t>
+void bitset_repeat(raft::resources const& handle,
+ const bitset_t* d_src,
+ bitset_t* d_output,
+ index_t src_bit_len,
+ index_t repeat_times)
+{
+ if (src_bit_len == 0 || repeat_times == 0) return;
+ auto stream = resource::get_cuda_stream(handle);
+
+ constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+ const index_t total_bits = src_bit_len * repeat_times;
+ const index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;
+
+ int threadsPerBlock = 128;
+ int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;
+  bitset_repeat_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+ d_src, d_output, src_bit_len, repeat_times);
+
+ return;
+}
+
+template <typename bitset_t, typename index_t>
+void bitset_view<bitset_t, index_t>::repeat(const raft::resources& res,
+ index_t times,
+ bitset_t* output_device_ptr) const
+{
+ auto thrust_policy = raft::resource::get_thrust_policy(res);
+ constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+
+ if (bitset_len_ % bits_per_element == 0) {
+ index_t num_elements_to_copy = bitset_len_ / bits_per_element;
+
+ for (index_t i = 0; i < times; ++i) {
+ raft::copy(output_device_ptr + i * num_elements_to_copy,
+ bitset_ptr_,
+ num_elements_to_copy,
+ raft::resource::get_cuda_stream(res));
+ }
+ } else {
+ bitset_repeat(res, bitset_ptr_, output_device_ptr, bitset_len_, times);
+ }
+}
+
+template <typename bitset_t, typename index_t>
+double bitset_view<bitset_t, index_t>::sparsity(const raft::resources& res) const
{
- return raft::ceildiv(bitset_len_, bitset_element_size);
+ index_t size_h = this->size();
+  if (0 == size_h) { return static_cast<double>(1.0); }
+ index_t count_h = this->count(res);
+
+  return static_cast<double>((1.0 * (size_h - count_h)) / (1.0 * size_h));
}
 template <typename bitset_t, typename index_t>
@@ -71,7 +170,7 @@ bitset<bitset_t, index_t>::bitset(const raft::resources& res,
                                   raft::device_vector_view<const index_t, index_t> mask_index,
index_t bitset_len,
bool default_value)
- : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+ : bitset_{std::size_t(raft::div_rounding_up_safe(bitset_len, bitset_element_size)),
raft::resource::get_cuda_stream(res)},
bitset_len_{bitset_len}
{
@@ -83,26 +182,20 @@ template <typename bitset_t, typename index_t>
 bitset<bitset_t, index_t>::bitset(const raft::resources& res,
index_t bitset_len,
bool default_value)
- : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+ : bitset_{std::size_t(raft::div_rounding_up_safe(bitset_len, bitset_element_size)),
raft::resource::get_cuda_stream(res)},
bitset_len_{bitset_len}
{
reset(res, default_value);
}
-template <typename bitset_t, typename index_t>
-index_t bitset<bitset_t, index_t>::n_elements() const
-{
- return raft::ceildiv(bitset_len_, bitset_element_size);
-}
-
 template <typename bitset_t, typename index_t>
 void bitset<bitset_t, index_t>::resize(const raft::resources& res,
index_t new_bitset_len,
bool default_value)
{
- auto old_size = raft::ceildiv(bitset_len_, bitset_element_size);
- auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size);
+ auto old_size = raft::div_rounding_up_safe(bitset_len_, bitset_element_size);
+ auto new_size = raft::div_rounding_up_safe(new_bitset_len, bitset_element_size);
bitset_.resize(new_size);
bitset_len_ = new_bitset_len;
if (old_size < new_size) {
@@ -167,7 +260,7 @@ template <typename bitset_t, typename index_t>
 void bitset<bitset_t, index_t>::count(const raft::resources& res,
                                       raft::device_scalar_view<index_t> count_gpu_scalar)
{
-  auto max_len = raft::make_host_scalar_view<index_t>(&bitset_len_);
+  auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
   auto values =
     raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
raft::popc(res, values, max_len, count_gpu_scalar);
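To make the bit arithmetic in bitset_repeat_kernel concrete, here is a small host-side model of the same tiling; it mirrors the kernel's bit-level wraparound, not its word-at-a-time optimization, and uses 8-bit words only to keep the numbers readable (the kernel uses sizeof(bitset_t) * 8):

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    // Repeat a 5-bit pattern three times: output bit i equals source bit (i % 5),
    // so repeats stay tightly packed across word boundaries with no gaps.
    const unsigned src_bit_len = 5, repeat_times = 3, bits_per_element = 8;
    const unsigned total_bits  = src_bit_len * repeat_times;  // 15 bits -> 2 words
    uint8_t src = 0b10110, out[2] = {0, 0};

    for (unsigned bit = 0; bit < total_bits; ++bit) {
      const unsigned value = (src >> (bit % src_bit_len)) & 1u;
      out[bit / bits_per_element] |= value << (bit % bits_per_element);
    }
    std::printf("0x%02x 0x%02x\n", out[0], out[1]);
    return 0;
  }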
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
index 0df12f25e6..be828def87 100644
--- a/cpp/include/raft/core/bitset.hpp
+++ b/cpp/include/raft/core/bitset.hpp
@@ -20,6 +20,9 @@
#include
#include
#include
+#include <raft/util/integer_utils.hpp>
+
+#include
namespace raft::core {
/**
@@ -89,7 +92,10 @@ struct bitset_view {
/**
* @brief Get the number of elements used by the bitset representation.
*/
- inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t;
+ inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t
+ {
+ return raft::div_rounding_up_safe(bitset_len_, bitset_element_size);
+ }
   inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
{
@@ -99,6 +105,80 @@ struct bitset_view {
{
     return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
}
+ /**
+ * @brief Returns the number of bits set to true in count_gpu_scalar.
+ *
+ * @param[in] res RAFT resources
+ * @param[out] count_gpu_scalar Device scalar to store the count
+ */
+  void count(const raft::resources& res, raft::device_scalar_view<index_t> count_gpu_scalar) const;
+ /**
+ * @brief Returns the number of bits set to true.
+ *
+ * @param res RAFT resources
+ * @return index_t Number of bits set to true
+ */
+ auto count(const raft::resources& res) const -> index_t
+ {
+    auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
+ count(res, count_gpu_scalar.view());
+ index_t count_cpu = 0;
+ raft::update_host(
+ &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
+ resource::sync_stream(res);
+ return count_cpu;
+ }
+
+ /**
+ * @brief Repeats the bitset data and copies it to the output device pointer.
+ *
+ * This function takes the original bitset data stored in the device memory
+ * and repeats it a specified number of times into a new location in the device memory.
+ * The bits are copied bit-by-bit to ensure that even if the number of bits (bitset_len_)
+ * is not a multiple of the bitset element size (e.g., 32 for uint32_t), the bits are
+ * tightly packed without any gaps between rows.
+ *
+ * @param res RAFT resources for managing CUDA streams and execution policies.
+ * @param times Number of times the bitset data should be repeated in the output.
+ * @param output_device_ptr Device pointer where the repeated bitset data will be stored.
+ *
+ * The caller must ensure that the output device pointer has enough memory allocated
+ * to hold `times * bitset_len` bits, where `bitset_len` is the number of bits in the original
+   * bitset. The operation runs on the GPU, either as batched copies (when the bitset length is
+   * word-aligned) or with a custom bit-packing kernel.
+ */
+ void repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const;
+
+ /**
+ * @brief Calculate the sparsity (fraction of 0s) of the bitset.
+ *
+ * This function computes the sparsity of the bitset, defined as the ratio of unset bits (0s)
+ * to the total number of bits in the set. If the total number of bits is zero, the function
+ * returns 1.0, indicating the set is fully sparse.
+ *
+ * @param res RAFT resources for managing CUDA streams and execution policies.
+ * @return double The sparsity of the bitset, i.e., the fraction of unset bits.
+ *
+ * This API will synchronize on the stream of `res`.
+ */
+ double sparsity(const raft::resources& res) const;
+
+ /**
+ * @brief Calculates the number of `bitset_t` elements required to store a bitset.
+ *
+ * This function computes the number of `bitset_t` elements needed to store a bitset, ensuring
+ * that all bits are accounted for. If the bitset length is not a multiple of the `bitset_t` size
+ * (in bits), the calculation rounds up to include the remaining bits in an additional `bitset_t`
+ * element.
+ *
+ * @param bitset_len The total length of the bitset in bits.
+ * @return size_t The number of `bitset_t` elements required to store the bitset.
+ */
+ static inline size_t eval_n_elements(size_t bitset_len)
+ {
+ const size_t bits_per_element = sizeof(bitset_t) * 8;
+ return (bitset_len + bits_per_element - 1) / bits_per_element;
+ }
private:
bitset_t* bitset_ptr_;
@@ -173,7 +253,10 @@ struct bitset {
/**
* @brief Get the number of elements used by the bitset representation.
*/
- inline auto n_elements() const -> index_t;
+ inline auto n_elements() const -> index_t
+ {
+ return raft::div_rounding_up_safe(bitset_len_, bitset_element_size);
+ }
/** @brief Get an mdspan view of the current bitset */
   inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
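A short usage sketch of the view API declared above (the handle `res`, the length, and the element type are illustrative):

  raft::core::bitset<uint32_t, uint32_t> bits(res, 1000, /*default_value=*/false);
  auto view = bits.view();

  uint32_t set_bits = view.count(res);     // device popcount; syncs on res's stream
  double   frac0    = view.sparsity(res);  // 1.0 here, since every bit is unset

  // Words needed to hold three tightly packed repeats of the bitset:
  size_t n_words = raft::core::bitset_view<uint32_t, uint32_t>::eval_n_elements(3 * 1000);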
diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp
index 04e74c4e58..4faded5041 100644
--- a/cpp/include/raft/core/detail/copy.hpp
+++ b/cpp/include/raft/core/detail/copy.hpp
@@ -32,6 +32,7 @@
#include
#include
#ifdef __CUDACC__
+#include <raft/linalg/transpose.cuh>
#include
#endif
#endif
@@ -449,38 +450,51 @@ mdspan_copyable_t<DstType, SrcType> copy(resources const& res, DstType&& dst, SrcType&& src)
#endif
} else if constexpr (config::can_use_cublas) {
#ifndef RAFT_DISABLE_CUDA
-    auto constexpr const alpha = typename std::remove_reference_t<DstType>::value_type{1};
-    auto constexpr const beta  = typename std::remove_reference_t<DstType>::value_type{0};
-    if constexpr (std::is_same_v<typename config::dst_layout_type, layout_c_contiguous>) {
- CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res),
- CUBLAS_OP_T,
- CUBLAS_OP_N,
- dst.extent(1),
- dst.extent(0),
- &alpha,
- src.data_handle(),
- src.extent(0),
- &beta,
- dst.data_handle(),
- dst.extent(1),
- dst.data_handle(),
- dst.extent(1),
- resource::get_cuda_stream(res)));
+    if constexpr (!((std::is_same_v<typename std::remove_reference_t<DstType>::value_type, half>)&&(
+                    std::is_same_v<typename std::remove_reference_t<SrcType>::value_type, half>))) {
+      auto constexpr const alpha = typename std::remove_reference_t<DstType>::value_type{1};
+      auto constexpr const beta  = typename std::remove_reference_t<DstType>::value_type{0};
+      if constexpr (std::is_same_v<typename config::dst_layout_type, layout_c_contiguous>) {
+ CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res),
+ CUBLAS_OP_T,
+ CUBLAS_OP_N,
+ dst.extent(1),
+ dst.extent(0),
+ &alpha,
+ src.data_handle(),
+ src.extent(0),
+ &beta,
+ dst.data_handle(),
+ dst.extent(1),
+ dst.data_handle(),
+ dst.extent(1),
+ resource::get_cuda_stream(res)));
+ } else {
+ CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res),
+ CUBLAS_OP_T,
+ CUBLAS_OP_N,
+ dst.extent(0),
+ dst.extent(1),
+ &alpha,
+ src.data_handle(),
+ src.extent(1),
+ &beta,
+ dst.data_handle(),
+ dst.extent(0),
+ dst.data_handle(),
+ dst.extent(0),
+ resource::get_cuda_stream(res)));
+ }
} else {
- CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res),
- CUBLAS_OP_T,
- CUBLAS_OP_N,
- dst.extent(0),
- dst.extent(1),
- &alpha,
- src.data_handle(),
- src.extent(1),
- &beta,
- dst.data_handle(),
- dst.extent(0),
- dst.data_handle(),
- dst.extent(0),
- resource::get_cuda_stream(res)));
+#ifdef __CUDACC__
+ raft::linalg::transpose(res, dst, src);
+#else
+ // Should never actually reach this because of enable_ifs. Included for
+ // safety.
+ RAFT_FAIL(
+ "raft::copy called in a way that requires custom kernel. Please use "
+ "raft/core/copy.cuh and include the header in a .cu file");
+#endif
}
#else
// Not possible to reach this due to enable_ifs. Included for safety.
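The restructuring above exists because cuBLAS geam has no half-precision instantiation: a layout-changing copy of half data now falls through to raft::linalg::transpose, which is only available when the translation unit is compiled as CUDA. A hedged sketch of the case that takes the new path (must live in a .cu file that includes raft/core/copy.cuh; `res`, `m`, `n` are assumed):

  // Layout change + half value_type: cublasgeam for half does not exist, so
  // raft::copy dispatches to raft::linalg::transpose instead.
  auto src = raft::make_device_matrix<half, int, raft::col_major>(res, m, n);
  auto dst = raft::make_device_matrix<half, int, raft::row_major>(res, m, n);
  raft::copy(res, dst.view(), src.view());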
diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp
index e082aaf41a..c5de345082 100644
--- a/cpp/include/raft/core/math.hpp
+++ b/cpp/include/raft/core/math.hpp
@@ -106,7 +106,13 @@ template <typename T>
RAFT_INLINE_FUNCTION auto asin(T x)
{
#ifdef __CUDA_ARCH__
- return ::asin(x);
+  if constexpr (std::is_same<T, half>::value) {
+ float x_float = __half2float(x);
+ float result_float = ::asin(x_float);
+ return __float2half(result_float);
+ } else {
+ return ::asin(x);
+ }
#else
return std::asin(x);
#endif
@@ -337,6 +343,12 @@ RAFT_INLINE_FUNCTION auto max(const T1& x, const T2& y)
                 ((std::is_same_v<T1, float> || std::is_same_v<T1, double>)&&(
                   std::is_same_v<T2, float> || std::is_same_v<T2, double>))) {
return ::max(x, y);
+  } else if constexpr (std::is_same_v<T1, float> && std::is_same_v<T2, half>) {
+    const float f_y = __half2float(y);
+    return (x < f_y) ? f_y : x;
+  } else if constexpr (std::is_same_v<T1, half> && std::is_same_v<T2, float>) {
+    const float f_x = __half2float(x);
+    return (f_x < y) ? y : f_x;
}
// Else, check that the types are the same and provide a generic implementation
else {
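Net effect of the math.hpp changes above: raft::asin on half computes in float and narrows the result back to half, while raft::max with mixed half/float operands widens the half side and returns float. A device-side sketch (assumes raft/core/math.hpp is included and the code is compiled for the GPU):

  __device__ void half_math_example(half h, float f)
  {
    half  a = raft::asin(h);  // asin evaluated in float, converted back to half
    float m = raft::max(f, h);  // h widened via __half2float; comparison in float
    (void)a; (void)m;
  }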
diff --git a/cpp/include/raft/core/operators.hpp b/cpp/include/raft/core/operators.hpp
index e42801fe32..6b10baa332 100644
--- a/cpp/include/raft/core/operators.hpp
+++ b/cpp/include/raft/core/operators.hpp
@@ -19,6 +19,8 @@
#include
#include
+#include <cuda_fp16.h>
+
#include
#include
#include
@@ -104,13 +106,27 @@ struct sq_op {
{
return in * in;
}
+
+  template <typename... UnusedArgs>
+ constexpr RAFT_INLINE_FUNCTION auto operator()(const half& in, UnusedArgs...) const
+ {
+ return __half2float(in) * __half2float(in);
+ }
};
struct add_op {
 template <typename T1, typename T2>
constexpr RAFT_INLINE_FUNCTION auto operator()(const T1& a, const T2& b) const
{
- return a + b;
+    if constexpr (std::is_same_v<T1, half> && std::is_same_v<T2, half>) {
+      return __half2float(a) + __half2float(b);
+    } else if constexpr (std::is_same_v<T1, half>) {
+      return __half2float(a) + b;
+    } else if constexpr (std::is_same_v<T2, half>) {
+      return a + __half2float(b);
+ } else {
+ return a + b;
+ }
}
};
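With the overloads above, sq_op and add_op accumulate half inputs in float rather than doing the arithmetic in half, which avoids precision and overflow surprises when these functors drive reductions. A small sketch (host or device code with cuda_fp16.h available):

  raft::sq_op  sq;
  raft::add_op add;

  half x   = __float2half(0.5f);
  auto x2  = sq(x);         // float: __half2float(x) * __half2float(x)
  auto sum = add(x, 1.0f);  // float: the half operand is widened first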
diff --git a/cpp/include/raft/core/resource/nccl_clique.hpp b/cpp/include/raft/core/resource/nccl_clique.hpp
new file mode 100644
index 0000000000..edda5043ae
--- /dev/null
+++ b/cpp/include/raft/core/resource/nccl_clique.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/comms/nccl_clique.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+#include <memory>
+
+namespace raft::resource {
+
+class nccl_clique_resource : public resource {
+ public:
+  nccl_clique_resource() : clique_(std::make_unique<raft::comms::nccl_clique>()) {}
+ ~nccl_clique_resource() override {}
+ void* get_resource() override { return clique_.get(); }
+
+ private:
+  std::unique_ptr<raft::comms::nccl_clique> clique_;
+};
+
+/** Factory that knows how to construct a specific raft::resource to populate the res_t. */
+class nccl_clique_resource_factory : public resource_factory {
+ public:
+ resource_type get_resource_type() override { return resource_type::NCCL_CLIQUE; }
+ resource* make_resource() override { return new nccl_clique_resource(); }
+};
+
+/**
+ * @defgroup nccl_clique_resource resource functions
+ * @{
+ */
+
+/**
+ * Retrieves an NCCL clique from the raft resources object if it exists; otherwise initializes it
+ * and returns it.
+ *
+ * @param[in] res the raft resources object
+ * @return NCCL clique
+ */
+inline const raft::comms::nccl_clique& get_nccl_clique(resources const& res)
+{
+ if (!res.has_resource_factory(resource_type::NCCL_CLIQUE)) {
+    res.add_resource_factory(std::make_shared<nccl_clique_resource_factory>());
+ }
+  return *res.get_resource<raft::comms::nccl_clique>(resource_type::NCCL_CLIQUE);
+};
+
+/**
+ * @}
+ */
+
+} // namespace raft::resource
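Usage sketch for the accessor above: the clique is constructed lazily on first request and cached in the resources object, so repeated calls observe the same per-GPU communicators (the handle construction below is illustrative):

  raft::device_resources res;
  const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(res);
  const raft::comms::nccl_clique& again  = raft::resource::get_nccl_clique(res);
  assert(&clique == &again);  // same cached instance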
diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp
index d9126251c9..4fa84c3bdb 100644
--- a/cpp/include/raft/core/resource/resource_types.hpp
+++ b/cpp/include/raft/core/resource/resource_types.hpp
@@ -46,6 +46,7 @@ enum resource_type {
CUBLASLT_HANDLE, // cublasLt handle
CUSTOM, // runtime-shared default-constructible resource
LARGE_WORKSPACE_RESOURCE, // rmm device memory resource for somewhat large temporary allocations
+ NCCL_CLIQUE, // nccl clique
LAST_KEY // reserved for the last key
};
diff --git a/cpp/include/raft/distance/detail/masked_distance_base.cuh b/cpp/include/raft/distance/detail/masked_distance_base.cuh
index 55da634145..96b778f11f 100644
--- a/cpp/include/raft/distance/detail/masked_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/masked_distance_base.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -266,7 +266,7 @@ struct MaskedDistances : public BaseClass {
for (int i = 0; i < P::AccRowsPerTh; ++i) {
#pragma unroll
for (int j = 0; j < P::AccColsPerTh; ++j) {
- acc[i][j] = BaseClass::Zero;
+ acc[i][j] = BaseClass::Zero();
}
}
}
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
index c6b09be31e..a8a541bf53 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -200,7 +200,7 @@ struct PairwiseDistances : public BaseClass {
for (int i = 0; i < P::AccRowsPerTh; ++i) {
#pragma unroll
for (int j = 0; j < P::AccColsPerTh; ++j) {
- acc[i][j] = BaseClass::Zero;
+ acc[i][j] = BaseClass::Zero();
}
}
}
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 2d41e029fe..dcbfbfdbc3 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -35,42 +35,43 @@ template <raft::distance::DistanceType distanceType,
           typename DataT,
           typename AccT,
           typename OutT,
           typename FinalLambda,
           typename IdxT = int>
-void distance(raft::resources const& handle,
-              const DataT* x,
-              const DataT* y,
-              OutT* dist,
-              IdxT m,
-              IdxT n,
-              IdxT k,
-              void* workspace,
-              size_t worksize,
-              FinalLambda fin_op,
-              bool isRowMajor  = true,
-              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle,
+                                                 const DataT* x,
+                                                 const DataT* y,
+                                                 OutT* dist,
+                                                 IdxT m,
+                                                 IdxT n,
+                                                 IdxT k,
+                                                 void* workspace,
+                                                 size_t worksize,
+                                                 FinalLambda fin_op,
+                                                 bool isRowMajor  = true,
+                                                 DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 template <raft::distance::DistanceType distanceType,
           typename DataT,
           typename AccT,
           typename OutT,
           typename IdxT = int>
-void distance(raft::resources const& handle,
- const DataT* x,
- const DataT* y,
- OutT* dist,
- IdxT m,
- IdxT n,
- IdxT k,
- void* workspace,
- size_t worksize,
- bool isRowMajor = true,
- DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle,
+ const DataT* x,
+ const DataT* y,
+ OutT* dist,
+ IdxT m,
+ IdxT n,
+ IdxT k,
+ void* workspace,
+ size_t worksize,
+ bool isRowMajor = true,
+ DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 template <raft::distance::DistanceType distanceType,
           typename DataT,
           typename AccT,
           typename OutT,
           typename IdxT = int>
-size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] size_t getWorkspaceSize(
+ const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT;
 template <raft::distance::DistanceType distanceType,
           typename DataT,
           typename AccT,
           typename OutT,
           typename IdxT = int>
-void distance(raft::resources const& handle,
- const DataT* x,
- const DataT* y,
- OutT* dist,
- IdxT m,
- IdxT n,
- IdxT k,
- bool isRowMajor = true,
- DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle,
+ const DataT* x,
+ const DataT* y,
+ OutT* dist,
+ IdxT m,
+ IdxT n,
+ IdxT k,
+ bool isRowMajor = true,
+ DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 template <typename Type, typename IdxT = int>
-void pairwise_distance(raft::resources const& handle,
-                       const Type* x,
-                       const Type* y,
-                       Type* dist,
-                       IdxT m,
-                       IdxT n,
-                       IdxT k,
-                       rmm::device_uvector<Type>& workspace,
-                       raft::distance::DistanceType metric,
-                       bool isRowMajor = true,
-                       Type metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void pairwise_distance(raft::resources const& handle,
+                                                          const Type* x,
+                                                          const Type* y,
+                                                          Type* dist,
+                                                          IdxT m,
+                                                          IdxT n,
+                                                          IdxT k,
+                                                          rmm::device_uvector<Type>& workspace,
+                                                          raft::distance::DistanceType metric,
+                                                          bool isRowMajor = true,
+                                                          Type metric_arg = 2.0f) RAFT_EXPLICIT;
 template <typename Type, typename IdxT = int>
-void pairwise_distance(raft::resources const& handle,
- const Type* x,
- const Type* y,
- Type* dist,
- IdxT m,
- IdxT n,
- IdxT k,
- raft::distance::DistanceType metric,
- bool isRowMajor = true,
- Type metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void pairwise_distance(raft::resources const& handle,
+ const Type* x,
+ const Type* y,
+ Type* dist,
+ IdxT m,
+ IdxT n,
+ IdxT k,
+ raft::distance::DistanceType metric,
+ bool isRowMajor = true,
+ Type metric_arg = 2.0f) RAFT_EXPLICIT;
 template <raft::distance::DistanceType DistT,
           typename DataT,
           typename OutT   = DataT,
           typename IdxT   = int,
           typename layout = raft::layout_c_contiguous>
-void distance(raft::resources const& handle,
-              raft::device_matrix_view<const DataT, IdxT, layout> const x,
-              raft::device_matrix_view<const DataT, IdxT, layout> const y,
-              raft::device_matrix_view<OutT, IdxT, layout> dist,
-              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(
+  raft::resources const& handle,
+  raft::device_matrix_view<const DataT, IdxT, layout> const x,
+  raft::device_matrix_view<const DataT, IdxT, layout> const y,
+  raft::device_matrix_view<OutT, IdxT, layout> dist,
+  DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 template <typename Type, typename IdxT = int, typename layout = raft::layout_c_contiguous>
-void pairwise_distance(raft::resources const& handle,
-                       device_matrix_view<const Type, IdxT, layout> const x,
- device_matrix_view