diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 536537f07f..27f619f391 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 92e7613a9b..db2a5dbcc6 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,27 +5,27 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json similarity index 84% rename from .devcontainer/cuda12.2-conda/devcontainer.json rename to .devcontainer/cuda12.5-conda/devcontainer.json index 948680eaf6..836a5feacd 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -3,24 +3,24 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - 
"initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json similarity index 80% rename from .devcontainer/cuda12.2-pip/devcontainer.json rename to .devcontainer/cuda12.5-pip/devcontainer.json index cd287569d8..28798cbbf5 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -3,36 +3,36 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { "version": "1.15.0" }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { - "version": "12.2", + "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { + "version": "12.5", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent", @@ -40,7 +40,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d1cc52592c..a7f455e200 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -17,7 +17,7 @@ build.sh @rapidsai/raft-cmake-codeowners /.pre-commit-config.yaml @rapidsai/ci-codeowners #packaging code owners -/.devcontainers/ @rapidsai/packaging-codeowners +/.devcontainer/ @rapidsai/packaging-codeowners /conda/ @rapidsai/packaging-codeowners /dependencies.yaml @rapidsai/packaging-codeowners /build.sh @rapidsai/packaging-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e013d4f1c5..e6f7043f82 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} 
@@ -88,7 +88,7 @@ jobs: package-name: pylibraft wheel-build-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c2d9556859..e6c9604221 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,37 +77,37 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 
with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 with: arch: '["amd64"]' - cuda: '["12.2"]' + cuda: '["12.5"]' build_command: | sccache -z; build-all -DBUILD_PRIMS_BENCH=ON -DBUILD_ANN_BENCH=ON --verbose; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 18094cc05a..0eba0f27d1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: _ZN\d+raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2b89948ec1..692fce5e16 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -94,17 +94,12 @@ repos: additional_dependencies: [tomli] args: ["--toml", "pyproject.toml"] exclude: (?x)^(^CHANGELOG.md$) - - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.8.0 - hooks: - - id: rapids-dependency-file-generator - args: ["--clean"] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: check-json - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.0.3 + rev: v0.2.0 hooks: - id: verify-copyright files: | @@ -120,6 +115,13 @@ repos: cpp/include/raft/thirdparty/| docs/source/sphinxext/github_link[.]py| 
cpp/cmake/modules/FindAVX[.]cmake + - id: verify-alpha-spec + args: ["--fix", "--mode=release"] + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.13.11 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index e0599dae8a..0685145dca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,53 @@ +# raft 24.08.00 (7 Aug 2024) + +## 🚨 Breaking Changes + +- [Refactor] move `popc` to under util ([#2394](https://github.com/rapidsai/raft/pull/2394)) [@rhdong](https://github.com/rhdong) +- [Opt] Expose the `detail::popc` as public API ([#2346](https://github.com/rapidsai/raft/pull/2346)) [@rhdong](https://github.com/rhdong) + +## 🐛 Bug Fixes + +- Add timeout to UCXX generic operations ([#2398](https://github.com/rapidsai/raft/pull/2398)) [@pentschev](https://github.com/pentschev) +- [Fix] bitmap set/test issue ([#2371](https://github.com/rapidsai/raft/pull/2371)) [@rhdong](https://github.com/rhdong) +- Fix 0 recall issue in `raft_cagra_hnswlib` ANN benchmark ([#2369](https://github.com/rapidsai/raft/pull/2369)) [@divyegala](https://github.com/divyegala) +- Fix `ef` setting in HNSW wrapper ([#2367](https://github.com/rapidsai/raft/pull/2367)) [@divyegala](https://github.com/divyegala) +- Fix cagra graph opt bug ([#2365](https://github.com/rapidsai/raft/pull/2365)) [@enp1s0](https://github.com/enp1s0) +- Fix a bug where the wrong API is used to free the memory ([#2361](https://github.com/rapidsai/raft/pull/2361)) [@PointKernel](https://github.com/PointKernel) +- Allow anonymous user in devcontainer name ([#2355](https://github.com/rapidsai/raft/pull/2355)) [@bdice](https://github.com/bdice) +- Fix compilation error when _CLK_BREAKDOWN is defined in cagra. 
([#2350](https://github.com/rapidsai/raft/pull/2350)) [@jiangyinzuo](https://github.com/jiangyinzuo) +- ensure raft-dask wheel tests install pylibraft wheel from the same CI run, fix wheel dependencies ([#2349](https://github.com/rapidsai/raft/pull/2349)) [@jameslamb](https://github.com/jameslamb) +- Change --config-setting to --config-settings ([#2342](https://github.com/rapidsai/raft/pull/2342)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add workaround for syevd in CUDA 12.0 ([#2332](https://github.com/rapidsai/raft/pull/2332)) [@lowener](https://github.com/lowener) + +## 🚀 New Features + +- [FEA] add the support of `masked_matmul` ([#2362](https://github.com/rapidsai/raft/pull/2362)) [@rhdong](https://github.com/rhdong) +- [FEA] Dice Distance for Dense Inputs ([#2359](https://github.com/rapidsai/raft/pull/2359)) [@aamijar](https://github.com/aamijar) +- [Opt] Expose the `detail::popc` as public API ([#2346](https://github.com/rapidsai/raft/pull/2346)) [@rhdong](https://github.com/rhdong) +- Enable distance return for NN Descent ([#2345](https://github.com/rapidsai/raft/pull/2345)) [@jinsolp](https://github.com/jinsolp) + +## 🛠️ Improvements + +- [Refactor] move `popc` to under util ([#2394](https://github.com/rapidsai/raft/pull/2394)) [@rhdong](https://github.com/rhdong) +- split up CUDA-suffixed dependencies in dependencies.yaml ([#2388](https://github.com/rapidsai/raft/pull/2388)) [@jameslamb](https://github.com/jameslamb) +- Use workflow branch 24.08 again ([#2385](https://github.com/rapidsai/raft/pull/2385)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add cusparseSpMV_preprocess to cusparse wrapper ([#2384](https://github.com/rapidsai/raft/pull/2384)) [@Kh4ster](https://github.com/Kh4ster) +- Consolidate SUM reductions ([#2381](https://github.com/rapidsai/raft/pull/2381)) [@mfoerste4](https://github.com/mfoerste4) +- Use slicing kernel to copy distances inside NN Descent ([#2380](https://github.com/rapidsai/raft/pull/2380)) [@jinsolp](https://github.com/jinsolp) +- Build and test with CUDA 12.5.1 ([#2378](https://github.com/rapidsai/raft/pull/2378)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add CUDA_STATIC_MATH_LIBRARIES ([#2376](https://github.com/rapidsai/raft/pull/2376)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- skip CMake 3.30.0 ([#2375](https://github.com/rapidsai/raft/pull/2375)) [@jameslamb](https://github.com/jameslamb) +- Use verify-alpha-spec hook ([#2373](https://github.com/rapidsai/raft/pull/2373)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Binarize Dice Distance for Dense Inputs ([#2370](https://github.com/rapidsai/raft/pull/2370)) [@aamijar](https://github.com/aamijar) +- [FEA] Add distance epilogue for NN Descent ([#2364](https://github.com/rapidsai/raft/pull/2364)) [@jinsolp](https://github.com/jinsolp) +- resolve dependency-file-generator warning, other rapids-build-backend followup ([#2360](https://github.com/rapidsai/raft/pull/2360)) [@jameslamb](https://github.com/jameslamb) +- Remove text builds of documentation ([#2354](https://github.com/rapidsai/raft/pull/2354)) [@vyasr](https://github.com/vyasr) +- Use default init in reduction ([#2351](https://github.com/rapidsai/raft/pull/2351)) [@akifcorduk](https://github.com/akifcorduk) +- ensure update-version.sh preserves alpha spec, add tests on version constants ([#2344](https://github.com/rapidsai/raft/pull/2344)) [@jameslamb](https://github.com/jameslamb) +- remove unnecessary 'setuptools' dependencies 
([#2343](https://github.com/rapidsai/raft/pull/2343)) [@jameslamb](https://github.com/jameslamb) +- Use rapids-build-backend ([#2331](https://github.com/rapidsai/raft/pull/2331)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add FAISS with RAFT enabled Benchmarking to raft-ann-bench ([#2026](https://github.com/rapidsai/raft/pull/2026)) [@tarang-jain](https://github.com/tarang-jain) + # raft 24.06.00 (5 Jun 2024) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index fc56859557..d6f3ef9320 100755 --- a/README.md +++ b/README.md @@ -281,19 +281,19 @@ mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-vers ``` ```bash -# for CUDA 12.0 -mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0 +# for CUDA 12.5 +mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.5 ``` Note that the above commands will also install `libraft-headers` and `libraft`. You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers and pre-compiled shared library to use in your project: ```bash -# for CUDA 12.0 -mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 +# for CUDA 12.5 +mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.5 ``` -If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.06/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.08/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. 
### Installing Python through Pip @@ -369,7 +369,7 @@ If citing the k-selection routines, please consider the following bibtex: isbn = {9798400701092}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, - location = {Denver, CO, USA} + location = {Denver, CO, USA}, series = {SC '23} } ``` diff --git a/VERSION b/VERSION index 0bff6981a3..ec8489fda9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.06.00 +24.08.00 diff --git a/build.sh b/build.sh index 148d23c9c1..a77dd188f4 100755 --- a/build.sh +++ b/build.sh @@ -496,18 +496,18 @@ fi # Build and (optionally) install the pylibraft Python package if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ - python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/pylibraft + python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/pylibraft fi # Build and (optionally) install the raft-dask Python package if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \ - python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-dask + python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/raft-dask fi # Build and (optionally) install the raft-ann-bench Python package if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then - python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-ann-bench -vvv + python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/raft-ann-bench -vvv fi if hasArg docs; then diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 2778c2a7d7..c456bcae80 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -13,10 +13,8 @@ export CMAKE_GENERATOR=Ninja rapids-print-env -version=$(rapids-generate-version) - rapids-logger "Begin cpp build" -RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild conda/recipes/libraft +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/libraft rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 9605b52f8b..a2447f5f06 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -8,7 +8,7 @@ rapids-logger "Create test conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key docs \ + --file-key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n docs @@ -41,10 +41,8 @@ popd rapids-logger "Build Python docs" pushd docs sphinx-build -b dirhtml source _html -sphinx-build -b text source _text -mkdir -p "${RAPIDS_DOCS_DIR}/raft/"{html,txt} +mkdir -p "${RAPIDS_DOCS_DIR}/raft/"html mv _html/* "${RAPIDS_DOCS_DIR}/raft/html" -mv _text/* "${RAPIDS_DOCS_DIR}/raft/txt" popd rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index a8b76269ae..80d37b5ae3 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -22,12 +22,6 @@ git_commit=$(git rev-parse HEAD) export RAPIDS_PACKAGE_VERSION=${version} echo "${version}" > VERSION -package_dir="python" -for package_name in pylibraft raft-dask; do - underscore_package_name=$(echo "${package_name}" | tr "-" "_") - sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/${underscore_package_name}/_version.py" -done - # TODO: Remove `--no-test` flags once importing on a CPU # node works 
correctly rapids-conda-retry mambabuild \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index e3e7ce9c89..62d93a668e 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -14,47 +14,9 @@ rm -rf /usr/lib64/libuc* source rapids-configure-sccache source rapids-date-string -version=$(rapids-generate-version) -git_commit=$(git rev-parse HEAD) - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# This is the version of the suffix with a preceding hyphen. It's used -# everywhere except in the final wheel name. -PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" - -# Patch project metadata files to include the CUDA version suffix and version override. -pyproject_file="${package_dir}/pyproject.toml" -version_file="${package_dir}/${underscore_package_name}/_version.py" - -sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} -echo "${version}" > VERSION -sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file} - -# For nightlies we want to ensure that we're pulling in alphas as well. The -# easiest way to do so is to augment the spec with a constraint containing a -# min alpha version that doesn't affect the version bounds but does allow usage -# of alpha versions for that dependency without --pre -alpha_spec='' -if ! rapids-is-release-build; then - alpha_spec=',>=0.0.0a0' -fi - -if [[ ${package_name} == "raft-dask" ]]; then - sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} - sed -r -i "s/distributed-ucxx==(.*)\"/distributed-ucxx${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} -else - sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} -fi - -if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} -fi +rapids-generate-version > VERSION cd "${package_dir}" diff --git a/ci/check_style.sh b/ci/check_style.sh index d7baa88e8f..d7ba4cae25 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -8,7 +8,7 @@ rapids-logger "Create checks conda environment" rapids-dependency-file-generator \ --output conda \ - --file_key checks \ + --file-key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n checks diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 9554a7dde8..194ad9a07b 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -56,21 +56,21 @@ DEPENDENCIES=( ) for FILE in dependencies.yaml conda/environments/*.yaml; do for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; done - sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* ucx-py-cu12==/ 
s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; - sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* ucx-py-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; + sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*,>=0.0.0a0/g" ${FILE}; done for FILE in python/*/pyproject.toml; do for DEP in "${DEPENDENCIES[@]}"; do - sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE} + sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} done - sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE} + sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE} done sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index f83ddf616d..05323e4f5d 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -11,7 +11,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_cpp \ + --file-key test_cpp \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/ci/test_python.sh b/ci/test_python.sh index 59da1f0bc4..01e5ac9456 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -11,7 +11,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ - --file_key test_python \ + --file-key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh index bd531e7e85..9b1187592d 100755 --- a/ci/test_wheel_raft_dask.sh +++ b/ci/test_wheel_raft_dask.sh @@ -11,7 +11,8 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl -python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links dist/ +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install -v "$(echo ./dist/raft_dask_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" test_dir="python/raft-dask/raft_dask/test" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml 
b/conda/environments/all_cuda-118_arch-aarch64.yaml index 590c3eb68b..0728f1257d 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 @@ -20,8 +20,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - graphviz @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -44,16 +44,18 @@ dependencies: - nvcc_linux-aarch64=11.8 - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-py==0.38.* +- ucx-py==0.39.* name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 00ed8fa65e..a46ace8a29 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-python>=11.7.1,<12.0a0 @@ -20,8 +20,8 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-64=11.* - graphviz @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -44,16 +44,18 @@ dependencies: - nvcc_linux-64=11.8 - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-py==0.38.* +- ucx-py==0.39.* name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml similarity index 76% rename from conda/environments/all_cuda-122_arch-aarch64.yaml rename to conda/environments/all_cuda-125_arch-aarch64.yaml index f1f346706d..1f7604a5b5 100644 --- a/conda/environments/all_cuda-122_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -11,18 +11,18 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.0,<13.0a0 -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - graphviz @@ -32,7 +32,7 @@ dependencies: - 
libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -40,16 +40,18 @@ dependencies: - numpydoc - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-py==0.38.* -name: all_cuda-122_arch-aarch64 +- ucx-py==0.39.* +name: all_cuda-125_arch-aarch64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 76% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index 505a4f1a97..576666e2f4 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -11,18 +11,18 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.0,<13.0a0 -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.6.* -- distributed-ucxx==0.38.* +- dask-cuda==24.8.* +- distributed-ucxx==0.39.* - doxygen>=1.8.20 - gcc_linux-64=11.* - graphviz @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -40,16 +40,18 @@ dependencies: - numpydoc - pre-commit - pydata-sphinx-theme +- pylibraft==24.8.* - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-dask-dependency==24.8.* - recommonmark -- rmm==24.6.* +- rmm==24.8.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-py==0.38.* -name: all_cuda-122_arch-x86_64 +- ucx-py==0.39.* +name: all_cuda-125_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 7315f82c13..1116bbb971 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-version=11.8 @@ -30,7 +30,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -39,7 +39,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index ff973acc0c..85121af42f 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 - cuda-version=11.8 @@ -30,7 +30,7 @@ dependencies: - 
libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -39,7 +39,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 056550fc07..7abf661289 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev @@ -27,7 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -35,7 +35,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 41a48f4a12..e9f299cf67 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - c-compiler - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev @@ -27,7 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libucxx==0.38.* +- libucxx==0.39.* - matplotlib - nccl>=2.9.9 - ninja @@ -35,7 +35,8 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.6.* +- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rmm==24.8.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index bb9c715e3a..00b133c821 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" nccl_version: - ">=2.9.9" diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml index e3ca633eb9..001878ff25 100644 --- a/conda/recipes/pylibraft/conda_build_config.yaml +++ b/conda/recipes/pylibraft/conda_build_config.yaml @@ -17,4 +17,4 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index cbeaec3b55..31086e30aa 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -55,7 +55,7 @@ requirements: - python x.x - rmm ={{ minor_version }} - scikit-build-core >=0.7.0 - - setuptools + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml index 4de3b98f48..70d1f0490e 100644 --- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml @@ -11,7 +11,7 @@ 
c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml index d0748fdb16..94f7102726 100644 --- a/conda/recipes/raft-ann-bench-cpu/meta.yaml +++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml @@ -53,6 +53,7 @@ requirements: - python - pyyaml - pandas + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - glog {{ glog_version }} diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index cf025a06a4..db0083b583 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" nccl_version: - ">=2.9.9" diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml index 8a6a3d033d..d6aeb5f860 100644 --- a/conda/recipes/raft-ann-bench/meta.yaml +++ b/conda/recipes/raft-ann-bench/meta.yaml @@ -82,6 +82,7 @@ requirements: - pyyaml # rmm is needed to determine if package is gpu-enabled - rmm ={{ minor_version }} + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: - python diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index b157e41753..decd1fad18 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -17,10 +17,10 @@ c_stdlib_version: - "2.17" ucx_py_version: - - "0.38.*" + - "0.39.*" ucxx_version: - - "0.38.*" + - "0.39.*" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index af22c8853e..641a11a241 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -55,9 +55,9 @@ requirements: - python x.x - rmm ={{ minor_version }} - scikit-build-core >=0.7.0 - - setuptools - ucx-py {{ ucx_py_version }} - ucxx {{ ucxx_version }} + - rapids-build-backend>=0.3.0,<0.4.0.dev0 run: {% if cuda_major == "11" %} - cudatoolkit diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 39472cae67..d7eeb60b27 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -59,7 +59,8 @@ option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF ) -option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) +option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(CUDA_STATIC_MATH_LIBRARIES "Statically link the CUDA math libraries" OFF) option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) @@ -105,7 +106,10 @@ message(VERBOSE "RAFT: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELIN message(VERBOSE "RAFT: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "RAFT: Enable nvtx markers: ${RAFT_NVTX}") message(VERBOSE - "RAFT: Statically link the CUDA toolkit runtime and libraries: ${CUDA_STATIC_RUNTIME}" + "RAFT: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}" +) +message(VERBOSE + "RAFT: Statically link the CUDA math libraries: ${CUDA_STATIC_MATH_LIBRARIES}" ) # Set RMM logging level @@ -135,7 
+139,7 @@ endif() # * compiler options ---------------------------------------------------------- set(_ctk_static_suffix "") -if(CUDA_STATIC_RUNTIME) +if(CUDA_STATIC_MATH_LIBRARIES) set(_ctk_static_suffix "_static") endif() @@ -302,6 +306,8 @@ if(RAFT_COMPILE_LIBRARY) src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index f489cc62c6..35df378438 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -58,10 +58,6 @@ if(BUILD_CPU_ONLY) set(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OFF) set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF) set(RAFT_ANN_BENCH_USE_GGNN OFF) -elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) - # Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled. - # https://github.com/rapidsai/raft/issues/1627 - set(RAFT_FAISS_ENABLE_GPU OFF) endif() set(RAFT_ANN_BENCH_USE_RAFT OFF) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 8762ccd1fe..185d54a0a3 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -459,8 +459,14 @@ void register_search(std::shared_ptr> dataset, */ ->MeasureProcessCPUTime() ->UseRealTime(); - - if (metric_objective == Objective::THROUGHPUT) { b->ThreadRange(threads[0], threads[1]); } + if (metric_objective == Objective::THROUGHPUT) { + if (index.algo.find("faiss_gpu") != std::string::npos) { + log_warn( + "FAISS GPU does not work in throughput mode because the underlying " + "StandardGpuResources object is not thread-safe. 
This will cause unexpected results"); + } + b->ThreadRange(threads[0], threads[1]); + } } } } diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp index 965522d929..234b33d80a 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp +++ b/cpp/bench/ann/src/faiss/faiss_cpu_benchmark.cpp @@ -51,10 +51,10 @@ void parse_build_param(const nlohmann::json& conf, { parse_base_build_param(conf, param); param.M = conf.at("M"); - if (conf.contains("usePrecomputed")) { - param.usePrecomputed = conf.at("usePrecomputed"); + if (conf.contains("use_precomputed_table")) { + param.use_precomputed_table = conf.at("use_precomputed_table"); } else { - param.usePrecomputed = false; + param.use_precomputed_table = false; } if (conf.contains("bitsPerCode")) { param.bitsPerCode = conf.at("bitsPerCode"); diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h index 3caca15b7f..c7ce4595b5 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -229,7 +229,7 @@ class FaissCpuIVFPQ : public FaissCpu { struct BuildParam : public FaissCpu::BuildParam { int M; int bitsPerCode; - bool usePrecomputed; + bool use_precomputed_table; }; FaissCpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissCpu(metric, dim, param) diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu index c5056cb364..b47c497e3d 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu +++ b/cpp/bench/ann/src/faiss/faiss_gpu_benchmark.cu @@ -45,6 +45,11 @@ void parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::FaissGpuIVFFlat::BuildParam& param) { parse_base_build_param(conf, param); + if (conf.contains("use_raft")) { + param.use_raft = conf.at("use_raft"); + } else { + param.use_raft = false; + } } template @@ -63,6 +68,16 @@ void parse_build_param(const nlohmann::json& conf, } else { param.useFloat16 = false; } + if (conf.contains("use_raft")) { + param.use_raft = conf.at("use_raft"); + } else { + param.use_raft = false; + } + if (conf.contains("bitsPerCode")) { + param.bitsPerCode = conf.at("bitsPerCode"); + } else { + param.bitsPerCode = 8; + } } template @@ -160,5 +175,18 @@ REGISTER_ALGO_INSTANCE(std::uint8_t); #ifdef ANN_BENCH_BUILD_MAIN #include "../common/benchmark.hpp" -int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); } +int main(int argc, char** argv) +{ + rmm::mr::cuda_memory_resource cuda_mr; + // Construct a resource that uses a coalescing best-fit pool allocator + // and is initially sized to half of free device memory. 
+ rmm::mr::pool_memory_resource pool_mr{ + &cuda_mr, rmm::percent_of_free_device_memory(50)}; + // Updates the current device resource pointer to `pool_mr` + auto old_mr = rmm::mr::set_current_device_resource(&pool_mr); + auto ret = raft::bench::ann::run_main(argc, argv); + // Restores the current device resource pointer to its previous value + rmm::mr::set_current_device_resource(old_mr); + return ret; +} #endif diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index 2effe631e5..6955201c5d 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -17,15 +17,29 @@ #define FAISS_WRAPPER_H_ #include "../common/ann_types.hpp" +#include "../raft/raft_ann_bench_utils.h" +#include +#include +#include +#include #include +#include +#include #include +#include + +#include +#include +#include + #include #include #include #include #include +#include #include #include #include @@ -43,7 +57,7 @@ namespace { -faiss::MetricType parse_metric_type(raft::bench::ann::Metric metric) +faiss::MetricType parse_metric_faiss(raft::bench::ann::Metric metric) { if (metric == raft::bench::ann::Metric::kInnerProduct) { return faiss::METRIC_INNER_PRODUCT; @@ -95,7 +109,7 @@ class FaissGpu : public ANN, public AnnGPU { FaissGpu(Metric metric, int dim, const BuildParam& param) : ANN(metric, dim), gpu_resource_{std::make_shared()}, - metric_type_(parse_metric_type(metric)), + metric_type_(parse_metric_faiss(metric)), nlist_{param.nlist}, training_sample_fraction_{1.0 / double(param.ratio)} { @@ -127,7 +141,7 @@ class FaissGpu : public ANN, public AnnGPU { AlgoProperty property; // to enable building big dataset which is larger than GPU memory property.dataset_memory_type = MemoryType::Host; - property.query_memory_type = MemoryType::Host; + property.query_memory_type = MemoryType::Device; return property; } @@ -162,8 +176,10 @@ class FaissGpu : public ANN, public AnnGPU { int device_; double training_sample_fraction_; std::shared_ptr search_params_; + std::shared_ptr refine_search_params_{nullptr}; const T* dataset_; float refine_ratio_ = 1.0; + Objective metric_objective_; }; template @@ -201,19 +217,65 @@ template void FaissGpu::search( const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { + ASSERT(Objective::LATENCY, "l2Knn: rowMajorIndex and rowMajorQuery should have same layout"); + using IdxT = faiss::idx_t; static_assert(sizeof(size_t) == sizeof(faiss::idx_t), "sizes of size_t and faiss::idx_t are different"); - if (this->refine_ratio_ > 1.0) { - // TODO: FAISS changed their search APIs to accept the search parameters as a struct object - // but their refine API doesn't allow the struct to be passed in. 
Once this is fixed, we - // need to re-enable refinement below - // index_refine_->search(batch_size, queries, k, distances, - // reinterpret_cast(neighbors), this->search_params_.get()); Related FAISS issue: - // https://github.com/facebookresearch/faiss/issues/3118 - throw std::runtime_error( - "FAISS doesn't support refinement in their new APIs so this feature is disabled in the " - "benchmarks for the time being."); + if (refine_ratio_ > 1.0) { + if (raft::get_device_for_address(queries) >= 0) { + uint32_t k0 = static_cast(refine_ratio_ * k); + auto distances_tmp = raft::make_device_matrix( + gpu_resource_->getRaftHandle(device_), batch_size, k0); + auto candidates = + raft::make_device_matrix(gpu_resource_->getRaftHandle(device_), batch_size, k0); + index_->search(batch_size, + queries, + k0, + distances_tmp.data_handle(), + candidates.data_handle(), + this->search_params_.get()); + + auto queries_host = raft::make_host_matrix(batch_size, index_->d); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + auto dataset_v = raft::make_host_matrix_view( + this->dataset_, index_->ntotal, index_->d); + + raft::device_resources handle_ = gpu_resource_->getRaftHandle(device_); + + raft::copy(queries_host.data_handle(), queries, queries_host.size(), handle_.get_stream()); + raft::copy(candidates_host.data_handle(), + candidates.data_handle(), + candidates_host.size(), + handle_.get_stream()); + + // wait for the queries to copy to host in 'stream` + handle_.sync_stream(); + + raft::runtime::neighbors::refine(handle_, + dataset_v, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + parse_metric_type(this->metric_)); + + raft::copy(neighbors, + (size_t*)neighbors_host.data_handle(), + neighbors_host.size(), + handle_.get_stream()); + raft::copy( + distances, distances_host.data_handle(), distances_host.size(), handle_.get_stream()); + } else { + index_refine_->search(batch_size, + queries, + k, + distances, + reinterpret_cast(neighbors), + this->refine_search_params_.get()); + } } else { index_->search(batch_size, queries, @@ -255,13 +317,16 @@ void FaissGpu::load_(const std::string& file) template class FaissGpuIVFFlat : public FaissGpu { public: - using typename FaissGpu::BuildParam; + struct BuildParam : public FaissGpu::BuildParam { + bool use_raft; + }; FaissGpuIVFFlat(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) { faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = this->device_; - this->index_ = std::make_shared( + config.device = this->device_; + config.use_raft = param.use_raft; + this->index_ = std::make_shared( this->gpu_resource_.get(), dim, param.nlist, this->metric_type_, config); } @@ -295,6 +360,8 @@ class FaissGpuIVFPQ : public FaissGpu { int M; bool useFloat16; bool usePrecomputed; + bool use_raft; + int bitsPerCode; }; FaissGpuIVFPQ(Metric metric, int dim, const BuildParam& param) : FaissGpu(metric, dim, param) @@ -302,16 +369,17 @@ class FaissGpuIVFPQ : public FaissGpu { faiss::gpu::GpuIndexIVFPQConfig config; config.useFloat16LookupTables = param.useFloat16; config.usePrecomputedTables = param.usePrecomputed; + config.use_raft = param.use_raft; + config.interleavedLayout = param.use_raft; config.device = this->device_; - this->index_ = - std::make_shared(this->gpu_resource_.get(), - dim, - param.nlist, - param.M, - 8, // FAISS only supports 
bitsPerCode=8 - this->metric_type_, - config); + this->index_ = std::make_shared(this->gpu_resource_.get(), + dim, + param.nlist, + param.M, + param.bitsPerCode, + this->metric_type_, + config); } void set_search_param(const typename FaissGpu::AnnSearchParam& param) override @@ -329,6 +397,11 @@ class FaissGpuIVFPQ : public FaissGpu { this->index_refine_ = std::make_shared(this->index_.get(), this->dataset_); this->index_refine_.get()->k_factor = search_param.refine_ratio; + faiss::IndexRefineSearchParameters faiss_refine_search_params; + faiss_refine_search_params.k_factor = this->index_refine_.get()->k_factor; + faiss_refine_search_params.base_index_params = this->search_params_.get(); + this->refine_search_params_ = + std::make_unique(faiss_refine_search_params); } } @@ -385,6 +458,11 @@ class FaissGpuIVFSQ : public FaissGpu { this->index_refine_ = std::make_shared(this->index_.get(), this->dataset_); this->index_refine_.get()->k_factor = search_param.refine_ratio; + faiss::IndexRefineSearchParameters faiss_refine_search_params; + faiss_refine_search_params.k_factor = this->index_refine_.get()->k_factor; + faiss_refine_search_params.base_index_params = this->search_params_.get(); + this->refine_search_params_ = + std::make_unique(faiss_refine_search_params); } } diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h index 1c4b847d1a..1d2a1076ab 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h @@ -31,7 +31,7 @@ class RaftCagraHnswlib : public ANN, public AnnGPU { RaftCagraHnswlib(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) : ANN(metric, dim), - cagra_build_{metric, dim, param, concurrent_searches}, + cagra_build_{metric, dim, param, concurrent_searches, true}, // HnswLib param values don't matter since we don't build with HnswLib hnswlib_search_{metric, dim, typename HnswLib::BuildParam{50, 100}} { diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 0b892dec35..b03f875a8e 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -72,11 +72,16 @@ class RaftCagra : public ANN, public AnnGPU { std::optional ivf_pq_search_params = std::nullopt; }; - RaftCagra(Metric metric, int dim, const BuildParam& param, int concurrent_searches = 1) + RaftCagra(Metric metric, + int dim, + const BuildParam& param, + int concurrent_searches = 1, + bool shall_include_dataset = false) : ANN(metric, dim), index_params_(param), dimension_(dim), need_dataset_update_(true), + shall_include_dataset_(shall_include_dataset), dataset_(std::make_shared>( std::move(make_device_matrix(handle_, 0, 0)))), graph_(std::make_shared>( @@ -135,6 +140,7 @@ class RaftCagra : public ANN, public AnnGPU { float refine_ratio_; BuildParam index_params_; bool need_dataset_update_; + bool shall_include_dataset_; raft::neighbors::cagra::search_params search_params_; std::shared_ptr> index_; int dimension_; @@ -161,7 +167,7 @@ void RaftCagra::build(const T* dataset, size_t nrow) auto& params = index_params_.cagra_params; // Do include the compressed dataset for the CAGRA-Q - bool shall_include_dataset = params.compression.has_value(); + bool include_dataset = params.compression.has_value() || shall_include_dataset_; index_ = std::make_shared>( std::move(raft::neighbors::cagra::detail::build(handle_, @@ -171,7 +177,7 @@ void RaftCagra::build(const T* dataset, 
size_t nrow) index_params_.ivf_pq_refine_rate, index_params_.ivf_pq_build_params, index_params_.ivf_pq_search_params, - shall_include_dataset))); + include_dataset))); } inline std::string allocator_to_string(AllocatorType mem_type) diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 0771a60e58..9d80cbaac2 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -75,7 +75,20 @@ endfunction() if(BUILD_PRIMS_BENCH) ConfigureBench( - NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp + NAME + CORE_BENCH + PATH + core/bitset.cu + core/copy.cu + main.cpp + ) + + ConfigureBench( + NAME + UTIL_BENCH + PATH + util/popc.cu + main.cpp ) ConfigureBench( @@ -111,6 +124,7 @@ if(BUILD_PRIMS_BENCH) PATH linalg/add.cu linalg/map_then_reduce.cu + linalg/masked_matmul.cu linalg/matrix_vector_op.cu linalg/norm.cu linalg/normalize.cu diff --git a/cpp/bench/prims/linalg/masked_matmul.cu b/cpp/bench/prims/linalg/masked_matmul.cu new file mode 100644 index 0000000000..eda9cb1710 --- /dev/null +++ b/cpp/bench/prims/linalg/masked_matmul.cu @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft::bench::linalg { + +template +struct MaskedMatmulBenchParams { + size_t m; + size_t k; + size_t n; + float sparsity; + value_t alpha = 1.0; + value_t beta = 0.0; +}; + +template +inline auto operator<<(std::ostream& os, const MaskedMatmulBenchParams& params) + -> std::ostream& +{ + os << " m*k*n=" << params.m << "*" << params.k << "*" << params.n + << "\tsparsity=" << params.sparsity; + if (params.sparsity == 1.0) { os << "<-inner product for comparison"; } + return os; +} + +template +struct MaskedMatmulBench : public fixture { + MaskedMatmulBench(const MaskedMatmulBenchParams& p) + : fixture(true), + params(p), + handle(stream), + a_data_d(0, stream), + b_data_d(0, stream), + c_indptr_d(0, stream), + c_indices_d(0, stream), + c_data_d(0, stream), + bitmap_d(0, stream), + c_dense_data_d(0, stream) + { + index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bitmap_t) * 8)); + std::vector bitmap_h(element); + + a_data_d.resize(params.m * params.k, stream); + b_data_d.resize(params.k * params.n, stream); + bitmap_d.resize(element, stream); + + raft::random::RngState rng(2024ULL); + raft::random::uniform( + handle, rng, a_data_d.data(), params.m * params.k, value_t(-1.0), value_t(1.0)); + raft::random::uniform( + handle, rng, b_data_d.data(), params.k * params.n, value_t(-1.0), value_t(1.0)); + + std::vector c_dense_data_h(params.m * params.n); + + c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h); + + std::vector values(c_true_nnz); + std::vector indices(c_true_nnz); + std::vector indptr(params.m + 1); + + c_data_d.resize(c_true_nnz, stream); + c_indptr_d.resize(params.m + 1, 
stream); + c_indices_d.resize(c_true_nnz, stream); + c_dense_data_d.resize(params.m * params.n, stream); + + cpu_convert_to_csr(bitmap_h, params.m, params.n, indices, indptr); + RAFT_EXPECTS(c_true_nnz == c_indices_d.size(), + "Something wrong. The c_true_nnz != c_indices_d.size()!"); + + update_device(c_data_d.data(), values.data(), c_true_nnz, stream); + update_device(c_indices_d.data(), indices.data(), c_true_nnz, stream); + update_device(c_indptr_d.data(), indptr.data(), params.m + 1, stream); + update_device(bitmap_d.data(), bitmap_h.data(), element, stream); + } + + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bitmap_t& element = bitmap[index / (8 * sizeof(bitmap_t))]; + index_t bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void cpu_convert_to_csr(std::vector& bitmap, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + index_t index = 0; + bitmap_t element = 0; + index_t bit_position = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + index = i * cols + j; + element = bitmap[index / (8 * sizeof(bitmap_t))]; + bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1)) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = static_cast(offset_values); + } + } + + ~MaskedMatmulBench() {} + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + auto a = raft::make_device_matrix_view( + a_data_d.data(), params.m, params.k); + + auto b = raft::make_device_matrix_view( + b_data_d.data(), params.n, params.k); + + auto c_structure = raft::make_device_compressed_structure_view( + c_indptr_d.data(), + c_indices_d.data(), + params.m, + params.n, + static_cast(c_indices_d.size())); + + auto mask = + raft::core::bitmap_view(bitmap_d.data(), params.m, params.n); + + auto c = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); + + if (params.sparsity < 1.0) { + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } else { + raft::distance::pairwise_distance(handle, + a_data_d.data(), + b_data_d.data(), + c_dense_data_d.data(), + static_cast(params.m), + static_cast(params.n), + static_cast(params.k), + raft::distance::DistanceType::InnerProduct, + true); + } + resource::sync_stream(handle); + + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + resource::sync_stream(handle); + + loop_on_state(state, [this, &a, &b, &mask, &c]() { + if (params.sparsity < 1.0) { + raft::sparse::linalg::masked_matmul(handle, a, b, mask, c); + } else { + raft::distance::pairwise_distance(handle, + a_data_d.data(), + b_data_d.data(), + c_dense_data_d.data(), + static_cast(params.m), + static_cast(params.n), + static_cast(params.k), + raft::distance::DistanceType::InnerProduct, + true); + } + resource::sync_stream(handle); + }); + 
} + + private: + const raft::device_resources handle; + MaskedMatmulBenchParams params; + + rmm::device_uvector a_data_d; + rmm::device_uvector b_data_d; + rmm::device_uvector bitmap_d; + + rmm::device_uvector c_dense_data_d; + + size_t c_true_nnz = 0; + rmm::device_uvector c_indptr_d; + rmm::device_uvector c_indices_d; + rmm::device_uvector c_data_d; +}; + +template +static std::vector> getInputs() +{ + std::vector> param_vec; + struct TestParams { + size_t m; + size_t k; + size_t n; + float sparsity; + }; + + const std::vector params_group = + raft::util::itertools::product({size_t(10), size_t(1024)}, + {size_t(128), size_t(1024)}, + {size_t(1024 * 1024)}, + {0.01f, 0.1f, 0.2f, 0.5f, 1.0f}); + + param_vec.reserve(params_group.size()); + for (TestParams params : params_group) { + param_vec.push_back( + MaskedMatmulBenchParams({params.m, params.k, params.n, params.sparsity})); + } + return param_vec; +} + +RAFT_BENCH_REGISTER((MaskedMatmulBench), "", getInputs()); + +} // namespace raft::bench::linalg diff --git a/cpp/bench/prims/util/popc.cu b/cpp/bench/prims/util/popc.cu new file mode 100644 index 0000000000..249dc13d1e --- /dev/null +++ b/cpp/bench/prims/util/popc.cu @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +namespace raft::bench::core { + +template +struct PopcInputs { + index_t n_rows; + index_t n_cols; + float sparsity; +}; + +template +inline auto operator<<(std::ostream& os, const PopcInputs& params) -> std::ostream& +{ + os << params.n_rows << "#" << params.n_cols << "#" << params.sparsity; + return os; +} + +template +struct popc_bench : public fixture { + popc_bench(const PopcInputs& p) + : params(p), + n_element(raft::ceildiv(params.n_rows * params.n_cols, index_t(sizeof(bits_t) * 8))), + bits_d{raft::make_device_vector(res, n_element)}, + nnz_actual_d{raft::make_device_scalar(res, 0)} + { + } + + index_t create_bitmap(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bits_t& element = bitmap[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + std::vector bits_h(n_element); + auto stream = raft::resource::get_cuda_stream(res); + + create_bitmap(params.n_rows, params.n_cols, params.sparsity, bits_h); + update_device(bits_d.data_handle(), bits_h.data(), bits_h.size(), stream); + + resource::sync_stream(res); + + loop_on_state(state, [this]() { + auto bits_view = + raft::make_device_vector_view(bits_d.data_handle(), bits_d.size()); + + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + auto nnz_actual_view = + nnz_actual_d.view(); // raft::make_device_scalar_view(nnz_actual_d.data_handle()); + raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view); + }); + } + + private: + raft::resources res; + PopcInputs params; + index_t n_element; + + raft::device_vector bits_d; + raft::device_scalar nnz_actual_d; +}; + +template +const std::vector> popc_input_vecs{ + {2, 131072, 0.4}, {8, 131072, 0.5}, {16, 131072, 0.2}, {2, 8192, 0.4}, {16, 8192, 0.5}, + {128, 8192, 0.2}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, + + {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, {1024, 8192, 0.1}, + + {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, + {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, + {1024, 8192, 0.5}, {1024, 8192, 0.2}, + + {1024, 8192, 0.5}, {1024, 8192, 0.2}, {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}, + {1024, 8192, 0.4}, {1024, 8192, 0.5}, {1024, 8192, 0.2}}; + +using PopcBenchI64 = popc_bench; + +RAFT_BENCH_REGISTER(PopcBenchI64, "", popc_input_vecs); + +} // namespace raft::bench::core diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 288da763bf..706b0c2f11 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -55,6 +55,7 @@ function(find_and_configure_faiss) EXCLUDE_FROM_ALL ${exclude} OPTIONS "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}" + "FAISS_ENABLE_RAFT ${PKG_ENABLE_GPU}" 
"FAISS_ENABLE_PYTHON OFF" "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}" "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" @@ -115,4 +116,4 @@ endfunction() find_and_configure_faiss( BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU} -) +) \ No newline at end of file diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index cb1accc95e..c5d64f6a29 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -307,13 +307,16 @@ class std_comms : public comms_iface { bool restart = false; // resets the timeout when any progress was made if (worker->isProgressThreadRunning()) { - // Wait for a UCXX progress thread roundtrip + // Wait for a UCXX progress thread roundtrip, prevent waiting for longer + // than 10ms for each operation, will retry in next iteration. ucxx::utils::CallbackNotifier callbackNotifierPre{}; - worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }); + worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }, + 10000000 /* 10ms */); callbackNotifierPre.wait(); ucxx::utils::CallbackNotifier callbackNotifierPost{}; - worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }); + worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }, + 10000000 /* 10ms */); callbackNotifierPost.wait(); } else { // Causes UCXX to progress through the send/recv message queue diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh index 2c23a77e47..cafd1977ab 100644 --- a/cpp/include/raft/core/bitmap.cuh +++ b/cpp/include/raft/core/bitmap.cuh @@ -39,7 +39,7 @@ _RAFT_HOST_DEVICE void bitmap_view::set(const index_t row, const index_t col, bool new_value) const { - set(row * cols_ + col, &new_value); + set(row * cols_ + col, new_value); } } // end namespace raft::core diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp index 5c77866164..86b2d77478 100644 --- a/cpp/include/raft/core/bitmap.hpp +++ b/cpp/include/raft/core/bitmap.hpp @@ -41,6 +41,9 @@ namespace raft::core { */ template struct bitmap_view : public bitset_view { + using bitset_view::set; + using bitset_view::test; + static_assert((std::is_same::type, uint32_t>::value || std::is_same::type, uint64_t>::value), "The bitmap_t must be uint32_t or uint64_t."); diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index d7eedee92e..0cdb4c1fb6 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include #include @@ -167,9 +167,10 @@ template void bitset::count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) { + auto max_len = raft::make_host_scalar_view(&bitset_len_); auto values = raft::make_device_vector_view(bitset_.data(), n_elements()); - raft::detail::popc(res, values, bitset_len_, count_gpu_scalar); + raft::popc(res, values, max_len, count_gpu_scalar); } } // end namespace raft::core diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp index 9058f4847d..5a1968b529 100644 --- a/cpp/include/raft/core/cusparse_macros.hpp +++ b/cpp/include/raft/core/cusparse_macros.hpp @@ -34,7 +34,8 @@ // // (i.e., before including this header) // -#define CUDA_VER_10_1_UP (CUDART_VERSION >= 10100) +#define 
CUDA_VER_10_1_UP (CUDART_VERSION >= 10010) +#define CUDA_VER_12_4_UP (CUDART_VERSION >= 12040) namespace raft { @@ -59,7 +60,7 @@ namespace detail { inline const char* cusparse_error_to_string(cusparseStatus_t err) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10010 return cusparseGetErrorString(err); #else // CUDART_VERSION switch (err) { diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index 9ce768bf40..e082aaf41a 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -56,7 +56,11 @@ constexpr RAFT_INLINE_FUNCTION auto abs(T x) !std::is_same_v, T> { - return x < T{0} ? -x : x; + if constexpr (std::is_unsigned_v) { + return x; + } else { + return x < T{0} ? -x : x; + } } #if defined(_RAFT_HAS_CUDA) template diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index a5c8c0ef4b..a39dbf6700 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -55,6 +55,7 @@ using distance_tag = std::integral_constant; * - DistanceType::Canberra: * - DistanceType::CorrelationExpanded: * - DistanceType::CosineExpanded: + * - DistanceType::DiceExpanded: * - DistanceType::HammingUnexpanded: * - DistanceType::HellingerExpanded: * - DistanceType::JensenShannon: @@ -238,6 +239,61 @@ void distance_impl(raft::resources const& handle, distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } +template +void distance_impl(raft::resources const& handle, + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + bool is_row_major, + DataT) // unused +{ + // raft distance support inputs as float/double and output as uint8_t/float/double. + static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); + + ASSERT(!(worksize < (m + n) * sizeof(AccT)), "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + DataT* x_norm = workspace; + DataT* y_norm = workspace; + // TODO: Column major case looks to have lower accuracy for X == Y, + // perhaps the use of stridedSummationKernel could be causing this, + // need to investigate and fix. 
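  // For DiceExpanded the "norms" are per-row nonzero counts (raft::nz_op
  // reduced with raft::add_op). When x == y in row-major layout, a single
  // reduction over max(m, n) rows fills the shared workspace used for both
  // x_norm and y_norm; otherwise y_norm is offset by m and the two operands
  // are reduced separately.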
+ if (x == y && is_row_major) { + raft::linalg::reduce(x_norm, + x, + k, + std::max(m, n), + (AccT)0, + is_row_major, + true, + stream, + false, + raft::nz_op(), + raft::add_op()); + } else { + y_norm += m; + raft::linalg::reduce( + x_norm, x, k, m, (AccT)0, is_row_major, true, stream, false, raft::nz_op(), raft::add_op()); + raft::linalg::reduce( + y_norm, y, k, n, (AccT)0, is_row_major, true, stream, false, raft::nz_op(), raft::add_op()); + } + + ops::dice_distance_op distance_op{}; + pairwise_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + template void distance_impl(raft::resources const& handle, distance_tag distance_type, @@ -794,9 +850,11 @@ template size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) { - size_t worksize = 0; - constexpr bool is_allocated = (distanceType <= raft::distance::DistanceType::CosineExpanded) || - (distanceType == raft::distance::DistanceType::CorrelationExpanded); + size_t worksize = 0; + constexpr bool is_allocated = + (distanceType <= raft::distance::DistanceType::CosineExpanded) || + (distanceType == raft::distance::DistanceType::CorrelationExpanded) || + (distanceType == raft::distance::DistanceType::DiceExpanded); constexpr int numOfBuffers = (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2 : 1; diff --git a/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh b/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh index 3e8f4e86fb..84eb3c705b 100644 --- a/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/all_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/distance/detail/distance_ops/dice.cuh b/cpp/include/raft/distance/detail/distance_ops/dice.cuh new file mode 100644 index 0000000000..362ba7eab7 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/dice.cuh @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include // DI + +namespace raft::distance::detail::ops { + +// Epilogue operator for CUTLASS based kernel +template +struct dice_cutlass_op { + __device__ dice_cutlass_op() noexcept {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + return static_cast(1.0) - static_cast(2 * accVal / (aNorm + bNorm)); + } + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + +/** + * @brief the expanded dice distance matrix calculation + * + * It computes the following equation: + * + * d(x, y) = 1 - 2*(x ⋅ y) / ( Σ(x) + Σ(y) ) + */ +template +struct dice_distance_op { + using DataT = DataType; + using AccT = AccType; + using IdxT = IdxType; + + // Load norms of input data + static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + static constexpr size_t shared_mem_size() + { + return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + DI void core(AccT& acc, DataT& x, DataT& y) const + { + acc += (x != DataT(0) ? DataT(1) : DataT(0)) * (y != DataT(0) ? DataT(1) : DataT(0)); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = 1.0 - (2 * acc[i][j] / (regxn[i] + regyn[j])); + } + } + } + + constexpr dice_cutlass_op get_cutlass_op() const + { + return dice_cutlass_op(); + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh index e1dc6f9b37..bced721ec8 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -120,6 +120,10 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( raft::distance::detail::ops::cosine_distance_op, float, float, float, raft::identity_op, int); instantiate_raft_distance_detail_pairwise_matrix_dispatch( raft::distance::detail::ops::cosine_distance_op, double, double, double, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, float, float, float, raft::identity_op, int); +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, double, double, double, raft::identity_op, int); instantiate_raft_distance_detail_pairwise_matrix_dispatch( raft::distance::detail::ops::hamming_distance_op, float, float, float, raft::identity_op, int); instantiate_raft_distance_detail_pairwise_matrix_dispatch( diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh index a634e8c995..2d41e029fe 100644 --- a/cpp/include/raft/distance/distance-ext.cuh +++ b/cpp/include/raft/distance/distance-ext.cuh @@ -204,6 +204,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( @@ -286,6 +290,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -362,6 +370,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -429,6 +441,10 @@ instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); 
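// With the DiceExpanded instantiations above, the new metric is reachable
// through the same runtime entry point exercised elsewhere in this diff.
// A minimal usage sketch; here x, y, out, m, n and k are hypothetical
// row-major device buffers and extents, not names taken from this patch:
//
//   raft::device_resources handle;
//   raft::distance::pairwise_distance(handle,
//                                     x,    // [m, k] device pointer
//                                     y,    // [n, k] device pointer
//                                     out,  // [m, n] device pointer
//                                     m,
//                                     n,
//                                     k,
//                                     raft::distance::DistanceType::DiceExpanded,
//                                     true /* row-major */);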
instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( @@ -547,6 +563,22 @@ instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineE double, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, float, float, @@ -822,6 +854,22 @@ instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, double, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, float, float, diff --git a/cpp/include/raft/distance/distance-inl.cuh b/cpp/include/raft/distance/distance-inl.cuh index 647c5b2908..13c9d57efd 100644 --- a/cpp/include/raft/distance/distance-inl.cuh +++ b/cpp/include/raft/distance/distance-inl.cuh @@ -306,6 +306,9 @@ void pairwise_distance(raft::resources const& handle, case DistanceType::RusselRaoExpanded: dispatch(std::integral_constant{}); break; + case DistanceType::DiceExpanded: + dispatch(std::integral_constant{}); + break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 9f3be7ce0e..9680cbc636 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -42,6 +42,18 @@ struct ReductionThinPolicy { static constexpr bool NoSequentialReduce = noLoop; }; +template +DI void KahanBabushkaNeumaierSum(Type& sum, Type& compensation, const Type& cur_value) +{ + const Type t = sum + cur_value; + if (abs(sum) >= abs(cur_value)) { + compensation += (sum - t) + cur_value; + } else { + compensation += (cur_value - t) + sum; + } + sum = t; +} + template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThinKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + /* The strategy to achieve near-SOL memory bandwidth differs based on D: + * - For small D, we need to process multiple rows per logical warp in order to have + * multiple loads per thread and increase bytes in flight and amortize latencies. 
+ * - For large D, we start with a sequential reduction. The compiler partially unrolls + * that loop (e.g. first a loop of stride 16, then 8, 4, and 1). + */ + IdxType i0 = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i0 >= N) return; + + OutType acc[Policy::RowsPerLogicalWarp]; + OutType thread_c[Policy::RowsPerLogicalWarp]; + +#pragma unroll + for (int k = 0; k < Policy::RowsPerLogicalWarp; k++) { + acc[k] = init; + thread_c[k] = 0; + } + + if constexpr (Policy::NoSequentialReduce) { + IdxType j = threadIdx.x; + if (j < D) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + // Only the first row is known to be within bounds. Clamp to avoid out-of-mem read. + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } else { + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } + + /* This vector reduction has two benefits compared to naive separate reductions: + * - It avoids the LSU bottleneck when the number of columns is around 32 (e.g. for 32, 5 shuffles + * are required and there is no initial sequential reduction to amortize that cost). + * - It distributes the outputs to multiple threads, enabling a coalesced store when the number of + * rows per logical warp and logical warp size are equal. + */ + raft::logicalWarpReduceVector( + acc, threadIdx.x, raft::add_op()); + + raft::logicalWarpReduceVector( + thread_c, threadIdx.x, raft::add_op()); + + constexpr int reducOutVecWidth = + std::max(1, Policy::RowsPerLogicalWarp / Policy::LogicalWarpSize); + constexpr int reducOutGroupSize = + std::max(1, Policy::LogicalWarpSize / Policy::RowsPerLogicalWarp); + constexpr int reducNumGroups = Policy::LogicalWarpSize / reducOutGroupSize; + + if (threadIdx.x % reducOutGroupSize == 0) { + const int groupId = threadIdx.x / reducOutGroupSize; + if (inplace) { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(dots[i] + acc[k] + thread_c[k]); } + } + } else { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(acc[k] + thread_c[k]); } + } + } + } +} + template (Policy::NoSequentialReduce)); dim3 threads(Policy::LogicalWarpSize, Policy::NumLogicalWarps, 1); dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); - coalescedReductionThinKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumThinKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionThinKernel<<>>( + dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -240,6 +350,44 @@ RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, } } +template +RAFT_KERNEL __launch_bounds__(TPB) 
coalescedSumMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(dots[blockIdx.x] + block_acc + block_c); + } else { + dots[blockIdx.x] = final_op(block_acc + block_c); + } + } +} + template fun_scope("coalescedReductionMedium<%d>", TPB); - coalescedReductionMediumKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumMediumKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionMediumKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -322,6 +475,32 @@ RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) if (threadIdx.x == 0) { buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = acc; } } +template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThickKernel( + OutType* buffer, const InType* data, IdxType D, IdxType N, OutType init, MainLambda main_op) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = blockIdx.y * Policy::ThreadsPerBlock + threadIdx.x; i < D; + i += Policy::BlockStride) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = block_acc + block_c; + } +} + template - <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + if constexpr (std::is_same_v) { + coalescedSumThickKernel + <<>>(buffer.data(), data, D, N, init, main_op); + } else { + coalescedReductionThickKernel + <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); coalescedReductionThin(dots, @@ -391,18 +574,16 @@ void coalescedReductionThickDispatcher(OutType* dots, { // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling - if (D < IdxType(32768)) { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. 
reduce along // rows for row major or reduce along columns for column major layout. Can do an inplace reduction // adding to original values of dots if requested. +// In case of an add-reduction, a compensated summation will be performed in order to reduce +// numerical error. Note that the compensation will only be performed 'per-thread' for performance +// reasons and therefore not be equivalent to a sequential compensation. template = IdxType(4) * numSMs) { + if (D <= IdxType(512) || (N >= IdxType(16) * numSMs && D < IdxType(2048))) { coalescedReductionThinDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (N < numSMs && D >= IdxType(16384)) { + } else if (N < numSMs && D >= IdxType(1 << 17)) { coalescedReductionThickDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 2a4cfd52ec..ba7ed3dcdf 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -19,10 +19,12 @@ #include "cusolver_wrappers.hpp" #include +#include #include #include #include +#include #include #include @@ -90,7 +92,19 @@ void eigDC(raft::resources const& handle, { #if CUDART_VERSION < 11010 eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); + return; +#endif + +#if CUDART_VERSION <= 12040 + // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. + rmm::cuda_stream stream_new_wrapper; + cudaStream_t stream_new = stream_new_wrapper.value(); + cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle); + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); #else + cudaStream_t stream_new = stream; +#endif cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); cusolverDnParams_t dn_params = nullptr; @@ -108,15 +122,13 @@ void eigDC(raft::resources const& handle, eig_vals, &workspaceDevice, &workspaceHost, - stream)); + stream_new)); - rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream); - rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream_new); + rmm::device_scalar d_dev_info(stream_new); std::vector h_work(workspaceHost / sizeof(math_t)); - raft::matrix::copy(handle, - make_device_matrix_view(in, n_rows, n_cols), - make_device_matrix_view(eig_vectors, n_rows, n_cols)); + raft::copy(eig_vectors, in, n_rows * n_cols, stream_new); RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, dn_params, @@ -131,14 +143,19 @@ void eigDC(raft::resources const& handle, h_work.data(), workspaceHost, d_dev_info.data(), - stream)); + stream_new)); RAFT_CUDA_TRY(cudaGetLastError()); RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); - int dev_info = d_dev_info.value(stream); + int dev_info = d_dev_info.value(stream_new); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. 
" "This usually occurs when some of the features do not vary enough."); + +#if CUDART_VERSION <= 12040 + // Synchronize the created stream with the original stream before return + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); #endif } diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 617ac6d874..567dc6220e 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -28,38 +28,63 @@ namespace raft { namespace linalg { namespace detail { -// Kernel to perform reductions along the strided dimension +// Kernel to perform summation along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout +// A compensated summation will be performed in order to reduce numerical error. +// Note that the compensation will only be performed 'per-block' for performance +// reasons and therefore not be equivalent to a sequential compensation. + template RAFT_KERNEL stridedSummationKernel( - Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) + Type* out, const Type* data, int D, int N, Type init, MainLambda main_op) { // Thread reduction - Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + Type thread_sum = Type(init); + Type thread_c = Type(0); + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; - thread_data += main_op(data[idx], j); + + // KahanBabushkaNeumaierSum + const Type cur_value = main_op(data[idx], j); + const Type t = thread_sum + cur_value; + if (abs(thread_sum) >= abs(cur_value)) { + thread_c += (thread_sum - t) + cur_value; + } else { + thread_c += (cur_value - t) + thread_sum; + } + thread_sum = t; } } // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - Type* temp = (Type*)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; - temp[myidx] = thread_data; + extern __shared__ char tmp[]; + auto* block_sum = (Type*)tmp; + auto* block_c = block_sum + blockDim.x; + + if (threadIdx.y == 0) { + block_sum[threadIdx.x] = Type(0); + block_c[threadIdx.x] = Type(0); + } __syncthreads(); - for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; - __syncthreads(); + // also compute compensation for block-sum + const Type old_sum = atomicAdd(block_sum + threadIdx.x, thread_sum); + const Type t = old_sum + thread_sum; + if (abs(old_sum) >= abs(thread_sum)) { + thread_c += (old_sum - t) + thread_sum; + } else { + thread_c += (thread_sum - t) + old_sum; } + raft::myAtomicAdd(block_c + threadIdx.x, thread_c); + __syncthreads(); // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); + if (colStart < D && (threadIdx.y == 0)) + raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x] + block_c[threadIdx.x]); } // Kernel to perform reductions along the strided dimension @@ -127,23 +152,35 @@ void stridedReduction(OutType* dots, /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) raft::linalg::unaryOp(dots, dots, D, raft::const_op(init), stream); - // Arbitrary numbers for now, 
probably need to tune - const dim3 thrds(32, 16); - IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; - const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), - raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); - const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; - ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if constexpr (std::is_same::value && - std::is_same::value) + std::is_same::value) { + constexpr int TPB = 256; + constexpr int ColsPerBlk = 8; + constexpr dim3 Block(ColsPerBlk, TPB / ColsPerBlk); + constexpr int MinRowsPerThread = 16; + constexpr int MinRowsPerBlk = Block.y * MinRowsPerThread; + constexpr int MaxBlocksDimY = 8192; + + const dim3 grid(raft::ceildiv(D, (IdxType)ColsPerBlk), + raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk))); + const size_t shmemSize = sizeof(OutType) * Block.x * 2; + stridedSummationKernel - <<>>(dots, data, D, N, init, main_op); - else + <<>>(dots, data, D, N, init, main_op); + } else { + // Arbitrary numbers for now, probably need to tune + const dim3 thrds(32, 16); + IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), + raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); + const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; + stridedReductionKernel <<>>(dots, data, D, N, init, main_op, reduce_op); + } ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index a4523c6926..8fd6e45d37 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -31,6 +31,9 @@ namespace linalg { /** * @brief Compute reduction of the input matrix along the requested dimension + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. * * @tparam InType the data type of the input * @tparam OutType the data type of the output (as well as the data type for @@ -92,7 +95,9 @@ void reduce(OutType* dots, * is either row-major or column-major, while allowing the choose the * dimension for reduction. Depending upon the dimension chosen for * reduction, the memory accesses may be coalesced or strided. - * + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. 
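 *
 * The underlying kernels use the Kahan-Babushka-Neumaier update added in
 * coalesced_reduction-inl.cuh; as an illustrative scalar equivalent of one
 * accumulation loop (with `values` standing in for the reduced elements):
 * @code{.cpp}
 *   double sum = 0.0, c = 0.0;                  // running sum and compensation
 *   for (double v : values) {
 *     const double t = sum + v;
 *     if (std::abs(sum) >= std::abs(v)) { c += (sum - t) + v; }
 *     else                              { c += (v - t) + sum; }
 *     sum = t;                                  // keep the rounded partial sum
 *   }
 *   const double total = sum + c;               // apply the compensation once
 * @endcode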
* @tparam InElementType the input data-type of underlying raft::matrix_view * @tparam LayoutPolicy The layout of Input/Output (row or col major) * @tparam OutElementType the output data-type of underlying raft::matrix_view and reduction diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index e4e3ea3512..93faf9dd19 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -423,24 +423,39 @@ void optimize(raft::resources const& res, const auto num_full = host_stats.data_handle()[1]; // Create pruned kNN graph - uint32_t max_detour = 0; -#pragma omp parallel for reduction(max : max_detour) +#pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { - uint64_t pk = 0; - for (uint32_t num_detour = 0; num_detour < output_graph_degree; num_detour++) { - if (max_detour < num_detour) { max_detour = num_detour; /* stats */ } + // Find the `output_graph_degree` smallest detourable count nodes by checking the detourable + // count of the neighbors while increasing the target detourable count from zero. + uint64_t pk = 0; + uint32_t num_detour = 0; + while (pk < output_graph_degree) { + uint32_t next_num_detour = std::numeric_limits::max(); for (uint64_t k = 0; k < input_graph_degree; k++) { - if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; } + const auto num_detour_k = detour_count.data_handle()[k + (input_graph_degree * i)]; + // Find the detourable count to check in the next iteration + if (num_detour_k > num_detour) { + next_num_detour = std::min(static_cast(num_detour_k), next_num_detour); + } + + // Store the neighbor index if its detourable count is equal to `num_detour`. 
+ if (num_detour_k != num_detour) { continue; } output_graph_ptr[pk + (output_graph_degree * i)] = input_graph_ptr[k + (input_graph_degree * i)]; pk += 1; if (pk >= output_graph_degree) break; } if (pk >= output_graph_degree) break; + + assert(next_num_detour != std::numeric_limits::max()); + num_detour = next_num_detour; } - assert(pk == output_graph_degree); + RAFT_EXPECTS(pk == output_graph_degree, + "Couldn't find the output_graph_degree (%u) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + static_cast(i)); } - // RAFT_LOG_DEBUG("# max_detour: %u\n", max_detour); const double time_prune_end = cur_time(); RAFT_LOG_DEBUG( diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh index cfbb1e100c..16bb555aa4 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -209,7 +210,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( #if 0 /* debug */ - for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += BLOCK_SIZE) { + for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += blockDim.x) { result_indices_buffer[i] = utils::get_max_value(); result_distances_buffer[i] = utils::get_max_value(); } @@ -351,16 +352,19 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel( } #ifdef _CLK_BREAKDOWN - if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && (blockIdx.x == 0) && + if ((threadIdx.x == 0 || threadIdx.x == blockDim.x - 1) && (blockIdx.x == 0) && ((query_id * 3) % gridDim.y < 3)) { - RAFT_LOG_DEBUG( + printf( + "%s:%d " "query, %d, thread, %d" - ", init, %d" + ", init, %lu" ", 1st_distance, %lu" ", topk, %lu" ", pickup_parents, %lu" ", distance, %lu" "\n", + __FILE__, + __LINE__, query_id, threadIdx.x, clk_init, diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index e8104bd6f6..232dcb782a 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -448,14 +449,6 @@ __device__ inline void hashmap_restore(INDEX_T* const hashmap_ptr, } } -template -__device__ inline void set_value_device(T* const ptr, const T fill, const std::uint32_t count) -{ - for (std::uint32_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { - ptr[i] = fill; - } -} - // One query one thread block template neighbors, raft::host_matrix_view distances) { + idx.set_ef(params.ef); auto const* hnswlib_index = reinterpret_cast::type> const*>( idx.get_index()); diff --git a/cpp/include/raft/neighbors/detail/hnsw_types.hpp b/cpp/include/raft/neighbors/detail/hnsw_types.hpp index 9d35effd1a..8d601f59ae 100644 --- a/cpp/include/raft/neighbors/detail/hnsw_types.hpp +++ b/cpp/include/raft/neighbors/detail/hnsw_types.hpp @@ -93,6 +93,11 @@ struct index_impl : index { */ auto get_index() const -> void const* override { return appr_alg_.get(); } + /** + @brief Set ef for search + */ + void set_ef(int ef) const override { appr_alg_->ef_ = ef; } + private: std::unique_ptr::type>> appr_alg_; std::unique_ptr::type>> space_; diff --git 
a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh index cd3c6f3947..9c37ee146d 100644 --- a/cpp/include/raft/neighbors/detail/nn_descent.cuh +++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh @@ -19,10 +19,14 @@ #include "../nn_descent_types.hpp" #include +#include #include #include +#include +#include #include #include +#include #include #include #include // raft::util::arch::SM_* @@ -45,6 +49,7 @@ #include #include +#include #include #include @@ -217,6 +222,7 @@ struct BuildConfig { // If internal_node_degree == 0, the value of node_degree will be assigned to it size_t max_iterations{50}; float termination_threshold{0.0001}; + size_t output_graph_degree{32}; }; template @@ -338,14 +344,19 @@ struct GnndGraph { ~GnndGraph(); }; -template +template class GNND { public: GNND(raft::resources const& res, const BuildConfig& build_config); GNND(const GNND&) = delete; GNND& operator=(const GNND&) = delete; - void build(Data_t* data, const Index_t nrow, Index_t* output_graph); + void build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances, + epilogue_op distance_epilogue = raft::identity_op()); ~GNND() = default; using ID_t = InternalID_t; @@ -355,7 +366,7 @@ class GNND { Index_t* d_rev_graph_ptr, int2* list_sizes, cudaStream_t stream = 0); - void local_join(cudaStream_t stream = 0); + void local_join(cudaStream_t stream = 0, epilogue_op distance_epilogue = raft::identity_op()); raft::resources const& res; @@ -688,7 +699,9 @@ __device__ __forceinline__ void remove_duplicates( // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore -template > +template , + typename epilogue_op = raft::identity_op> RAFT_KERNEL #ifdef __CUDA_ARCH__ #if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890) @@ -710,7 +723,8 @@ __launch_bounds__(BLOCK_SIZE, 4) DistData_t* dists, int graph_width, int* locks, - DistData_t* l2_norms) + DistData_t* l2_norms, + epilogue_op distance_epilogue) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -820,14 +834,17 @@ __launch_bounds__(BLOCK_SIZE, 4) __syncthreads(); for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { - if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size && - i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { + auto row_idx = i % SKEWED_MAX_NUM_BI_SAMPLES; + auto col_idx = i / SKEWED_MAX_NUM_BI_SAMPLES; + if (row_idx < list_new_size && col_idx < list_new_size) { + auto r = new_neighbors[row_idx]; + auto c = new_neighbors[col_idx]; if (l2_norms == nullptr) { - s_distances[i] = -s_distances[i]; + auto dist_val = -s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } else { - s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + - l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - - 2.0 * s_distances[i]; + auto dist_val = l2_norms[r] + l2_norms[c] - 2.0 * s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } } else { s_distances[i] = std::numeric_limits::max(); @@ -899,14 +916,17 @@ __launch_bounds__(BLOCK_SIZE, 4) __syncthreads(); for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { - if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size && - i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { + auto row_idx = i % 
SKEWED_MAX_NUM_BI_SAMPLES; + auto col_idx = i / SKEWED_MAX_NUM_BI_SAMPLES; + if (row_idx < list_old_size && col_idx < list_new_size) { + auto r = old_neighbors[row_idx]; + auto c = new_neighbors[col_idx]; if (l2_norms == nullptr) { - s_distances[i] = -s_distances[i]; + auto dist_val = -s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } else { - s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + - l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - - 2.0 * s_distances[i]; + auto dist_val = l2_norms[r] + l2_norms[c] - 2.0 * s_distances[i]; + s_distances[i] = distance_epilogue(dist_val, r, c); } } else { s_distances[i] = std::numeric_limits::max(); @@ -1134,8 +1154,9 @@ GnndGraph::~GnndGraph() assert(h_graph == nullptr); } -template -GNND::GNND(raft::resources const& res, const BuildConfig& build_config) +template +GNND::GNND(raft::resources const& res, + const BuildConfig& build_config) : res(res), build_config_(build_config), graph_(build_config.max_dataset_size, @@ -1174,12 +1195,12 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0); }; -template -void GNND::add_reverse_edges(Index_t* graph_ptr, - Index_t* h_rev_graph_ptr, - Index_t* d_rev_graph_ptr, - int2* list_sizes, - cudaStream_t stream) +template +void GNND::add_reverse_edges(Index_t* graph_ptr, + Index_t* h_rev_graph_ptr, + Index_t* d_rev_graph_ptr, + int2* list_sizes, + cudaStream_t stream) { add_rev_edges_kernel<<>>( graph_ptr, d_rev_graph_ptr, NUM_SAMPLES, list_sizes); @@ -1187,8 +1208,9 @@ void GNND::add_reverse_edges(Index_t* graph_ptr, h_rev_graph_ptr, d_rev_graph_ptr, nrow_ * NUM_SAMPLES, raft::resource::get_cuda_stream(res)); } -template -void GNND::local_join(cudaStream_t stream) +template +void GNND::local_join(cudaStream_t stream, + epilogue_op distance_epilogue) { thrust::fill(thrust::device.on(stream), dists_buffer_.data_handle(), @@ -1208,11 +1230,17 @@ void GNND::local_join(cudaStream_t stream) dists_buffer_.data_handle(), DEGREE_ON_DEVICE, d_locks_.data_handle(), - l2_norms_.data_handle()); + l2_norms_.data_handle(), + distance_epilogue); } -template -void GNND::build(Data_t* data, const Index_t nrow, Index_t* output_graph) +template +void GNND::build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances, + epilogue_op distance_epilogue) { using input_t = typename std::remove_const::type; @@ -1308,7 +1336,7 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out raft::util::arch::SM_range(raft::util::arch::SM_70(), raft::util::arch::SM_future()); if (wmma_range.contains(runtime_arch)) { - local_join(stream); + local_join(stream, distance_epilogue); } else { THROW("NN_DESCENT cannot be run for __CUDA_ARCH__ < 700"); } @@ -1338,6 +1366,26 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // Reuse graph_.h_dists as the buffer for shrink the lists in graph static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t)); + + if (return_distances) { + auto graph_d_dists = raft::make_device_matrix( + res, nrow_, build_config_.node_degree); + raft::copy(graph_d_dists.data_handle(), + graph_.h_dists.data_handle(), + nrow_ * build_config_.node_degree, + raft::resource::get_cuda_stream(res)); + + auto output_dist_view = raft::make_device_matrix_view( + output_distances, nrow_, build_config_.output_graph_degree); + + raft::matrix::slice_coordinates 
coords{static_cast(0), + static_cast(0), + static_cast(nrow_), + static_cast(build_config_.output_graph_degree)}; + raft::matrix::slice( + res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords); + } + Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle(); #pragma omp parallel for @@ -1365,13 +1413,15 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out } template , memory_type::host>> void build(raft::resources const& res, const index_params& params, mdspan, row_major, Accessor> dataset, - index& idx) + index& idx, + epilogue_op distance_epilogue = raft::identity_op()) { RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", @@ -1410,10 +1460,25 @@ void build(raft::resources const& res, .node_degree = extended_graph_degree, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, - .termination_threshold = params.termination_threshold}; - - GNND nnd(res, build_config); - nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle()); + .termination_threshold = params.termination_threshold, + .output_graph_degree = params.graph_degree}; + + GNND nnd(res, build_config); + + if (idx.distances().has_value() || !params.return_distances) { + nnd.build(dataset.data_handle(), + dataset.extent(0), + int_graph.data_handle(), + params.return_distances, + idx.distances() + .value_or(raft::make_device_matrix(res, 0, 0).view()) + .data_handle(), + distance_epilogue); + } else { + RAFT_EXPECTS(!params.return_distances, + "Distance view not allocated. Using return_distances set to true requires " + "distance view to be allocated."); + } #pragma omp parallel for for (size_t i = 0; i < static_cast(dataset.extent(0)); i++) { @@ -1425,12 +1490,14 @@ void build(raft::resources const& res, } template , memory_type::host>> index build(raft::resources const& res, const index_params& params, - mdspan, row_major, Accessor> dataset) + mdspan, row_major, Accessor> dataset, + epilogue_op distance_epilogue = raft::identity_op()) { size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1444,9 +1511,10 @@ index build(raft::resources const& res, graph_degree = intermediate_degree; } - index idx{res, dataset.extent(0), static_cast(graph_degree)}; + index idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; - build(res, params, dataset, idx); + build(res, params, dataset, idx, distance_epilogue); return idx; } diff --git a/cpp/include/raft/neighbors/hnsw.hpp b/cpp/include/raft/neighbors/hnsw.hpp index 964c3ffacd..ee3f61e550 100644 --- a/cpp/include/raft/neighbors/hnsw.hpp +++ b/cpp/include/raft/neighbors/hnsw.hpp @@ -35,7 +35,7 @@ namespace raft::neighbors::hnsw { /** * @brief Construct an hnswlib base-layer-only index from a CAGRA index - * NOTE: 1. This method uses the filesystem to write the CAGRA index in `/tmp/cagra_index.bin` + * NOTE: 1. This method uses the filesystem to write the CAGRA index in `/tmp/.bin` * before reading it as an hnswlib index, then deleting the temporary file. * 2. 
This function is only offered as a compiled symbol in `libraft.so` * diff --git a/cpp/include/raft/neighbors/hnsw_types.hpp b/cpp/include/raft/neighbors/hnsw_types.hpp index 645a0903b7..f90de6f01b 100644 --- a/cpp/include/raft/neighbors/hnsw_types.hpp +++ b/cpp/include/raft/neighbors/hnsw_types.hpp @@ -62,6 +62,11 @@ struct index : ann::index { auto metric() const -> raft::distance::DistanceType { return metric_; } + /** + @brief Set ef for search + */ + virtual void set_ef(int ef) const; + private: int dim_; raft::distance::DistanceType metric_; diff --git a/cpp/include/raft/neighbors/nn_descent.cuh b/cpp/include/raft/neighbors/nn_descent.cuh index ceb5ae5643..a46a2006d6 100644 --- a/cpp/include/raft/neighbors/nn_descent.cuh +++ b/cpp/include/raft/neighbors/nn_descent.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,19 +48,22 @@ namespace raft::neighbors::experimental::nn_descent { * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param[in] res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] distance_epilogue epilogue operation for distances * @return index index containing all-neighbors knn graph in host memory */ -template +template index build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) + raft::device_matrix_view dataset, + epilogue_op distance_epilogue = raft::identity_op()) { - return detail::build(res, params, dataset); + return detail::build(res, params, dataset, distance_epilogue); } /** @@ -85,6 +88,7 @@ index build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm @@ -92,14 +96,16 @@ index build(raft::resources const& res, * in device memory * @param[out] idx raft::neighbors::experimental::nn_descentindex containing all-neighbors knn graph * in host memory + * @param[in] distance_epilogue epilogue operation for distances */ -template +template void build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset, - index& idx) + index& idx, + epilogue_op distance_epilogue = raft::identity_op()) { - detail::build(res, params, dataset, idx); + detail::build(res, params, dataset, idx, distance_epilogue); } /** @@ -122,19 +128,22 @@ void build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] distance_epilogue epilogue operation for distances * @return 
index index containing all-neighbors knn graph in host memory */ -template +template index build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) + raft::host_matrix_view dataset, + epilogue_op distance_epilogue = raft::identity_op()) { - return detail::build(res, params, dataset); + return detail::build(res, params, dataset, distance_epilogue); } /** @@ -159,6 +168,7 @@ index build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index + * @tparam epilogue_op epilogue operation type for distances * @param[in] res raft::resources is an object mangaging resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm @@ -166,14 +176,16 @@ index build(raft::resources const& res, * in host memory * @param[out] idx raft::neighbors::experimental::nn_descentindex containing all-neighbors knn graph * in host memory + * @param[in] distance_epilogue epilogue operation for distances */ -template +template void build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset, - index& idx) + index& idx, + epilogue_op distance_epilogue = raft::identity_op()) { - detail::build(res, params, dataset, idx); + detail::build(res, params, dataset, idx, distance_epilogue); } /** @} */ // end group nn-descent diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp index e1fc96878a..5d23ff2c2e 100644 --- a/cpp/include/raft/neighbors/nn_descent_types.hpp +++ b/cpp/include/raft/neighbors/nn_descent_types.hpp @@ -18,6 +18,8 @@ #include "ann_types.hpp" +#include +#include #include #include #include @@ -25,6 +27,8 @@ #include #include +#include + namespace raft::neighbors::experimental::nn_descent { /** * @ingroup nn-descent @@ -51,6 +55,7 @@ struct index_params : ann::index_params { size_t intermediate_graph_degree = 128; // Degree of input graph for pruning. size_t max_iterations = 20; // Number of nn-descent iterations. float termination_threshold = 0.0001; // Termination threshold of nn-descent. 
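A hedged usage sketch for the distance_epilogue and return_distances additions in these nn-descent hunks (the return_distances field is introduced just below; header paths and the default template arguments are taken from this diff, everything else is illustrative):

#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/neighbors/nn_descent.cuh>

// Build an all-neighbors graph and also retrieve the distances. IdxT defaults to uint32_t;
// a custom distance_epilogue (invoked as op(distance, row_id, col_id) in the local-join
// kernel) could be passed as an extra argument, raft::identity_op() being the default.
void nn_descent_with_distances(raft::resources const& res,
                               raft::device_matrix_view<const float, int64_t> dataset)
{
  namespace nnd = raft::neighbors::experimental::nn_descent;

  nnd::index_params params;
  params.graph_degree     = 64;
  params.return_distances = true;  // makes the index allocate a device-side distances matrix

  auto idx = nnd::build(res, params, dataset);

  auto graph     = idx.graph();      // [n_rows, graph_degree] neighbor ids, host memory
  auto distances = idx.distances();  // std::optional device view, engaged because of the flag
  (void)graph;
  (void)distances;
}

When return_distances is left at its default of false, distances() returns a disengaged optional and no device-side distances matrix is allocated.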
+ bool return_distances = false; // return distances if true }; /** @@ -79,14 +84,20 @@ struct index : ann::index { * @param res raft::resources is an object mangaging resources * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph + * @param return_distances whether to allocate and get distances information */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols) + index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false) : ann::index(), res_{res}, metric_{raft::distance::DistanceType::L2Expanded}, graph_{raft::make_host_matrix(n_rows, n_cols)}, - graph_view_{graph_.view()} + graph_view_{graph_.view()}, + return_distances_(return_distances) { + if (return_distances) { + distances_ = raft::make_device_matrix(res_, n_rows, n_cols); + distances_view_ = distances_.value().view(); + } } /** @@ -98,14 +109,23 @@ struct index : ann::index { * * @param res raft::resources is an object mangaging resources * @param graph_view raft::host_matrix_view for storing knn-graph + * @param distances_view std::optional> for + * storing knn-graph distances + * @param return_distances whether to allocate and get distances information */ index(raft::resources const& res, - raft::host_matrix_view graph_view) + raft::host_matrix_view graph_view, + std::optional> distances_view = + std::nullopt, + bool return_distances = false) : ann::index(), res_{res}, metric_{raft::distance::DistanceType::L2Expanded}, graph_{raft::make_host_matrix(0, 0)}, - graph_view_{graph_view} + distances_{raft::make_device_matrix(res_, 0, 0)}, + graph_view_{graph_view}, + distances_view_(distances_view), + return_distances_(return_distances) { } @@ -133,6 +153,13 @@ struct index : ann::index { return graph_view_; } + /** neighborhood graph distances [size, graph-degree] */ + [[nodiscard]] inline auto distances() noexcept + -> std::optional> + { + return distances_view_; + } + // Don't allow copying the index for performance reasons (try avoiding copying data) index(const index&) = delete; index(index&&) = default; @@ -144,8 +171,11 @@ struct index : ann::index { raft::resources const& res_; raft::distance::DistanceType metric_; raft::host_matrix graph_; // graph to return for non-int IdxT + std::optional> distances_; raft::host_matrix_view graph_view_; // view of graph for user provided matrix + std::optional> distances_view_; + bool return_distances_; }; /** @} */ diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h index 08efbb7106..ae552cc687 100644 --- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h @@ -393,6 +393,34 @@ inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } +// cusparseSpMV_preprocess is only available starting CUDA 12.4 +#if CUDA_VER_12_4_UP +template < + typename T, + typename std::enable_if_t || std::is_same_v>* = nullptr> +cusparseStatus_t cusparsespmv_preprocess(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + T* externalBuffer, + cudaStream_t stream) +{ + auto constexpr float_type = []() constexpr { + if constexpr (std::is_same_v) { + return CUDA_R_32F; + } else if constexpr 
(std::is_same_v) { + return CUDA_R_64F; + } + }(); + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_preprocess( + handle, opA, alpha, matA, vecX, beta, vecY, float_type, alg, externalBuffer); +} +#endif /** @} */ #else /** diff --git a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh new file mode 100644 index 0000000000..ef74316d04 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain A copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace raft { +namespace sparse { +namespace linalg { +namespace detail { + +template +void masked_matmul(raft::resources const& handle, + raft::device_matrix_view& A, + raft::device_matrix_view& B, + raft::core::bitmap_view& mask, + raft::device_csr_matrix_view& C, + std::optional> alpha, + std::optional> beta) +{ + index_t m = A.extent(0); + index_t n = B.extent(0); + index_t dim = A.extent(1); + + auto compressed_C_view = C.structure_view(); + + RAFT_EXPECTS(A.extent(1) == B.extent(1), "The dim of A must be equal to the dim of B."); + RAFT_EXPECTS(A.extent(0) == compressed_C_view.get_n_rows(), + "Number of rows in C must match the number of rows in A."); + RAFT_EXPECTS(B.extent(0) == compressed_C_view.get_n_cols(), + "Number of columns in C must match the number of columns in B."); + + auto stream = raft::resource::get_cuda_stream(handle); + + auto C_matrix = raft::make_device_csr_matrix(handle, compressed_C_view); + + // fill C + raft::sparse::convert::bitmap_to_csr(handle, mask, C_matrix); + + if (m > 10 || alpha.has_value() || beta.has_value()) { + auto C_view = raft::make_device_csr_matrix_view( + C.get_elements().data(), compressed_C_view); + + // create B col_major view + auto B_col_major = raft::make_device_matrix_view( + B.data_handle(), dim, n); + + value_t default_alpha = static_cast(1.0f); + value_t default_beta = static_cast(0.0f); + + if (!alpha.has_value()) { alpha = raft::make_host_scalar_view(&default_alpha); } + if (!beta.has_value()) { beta = raft::make_host_scalar_view(&default_beta); } + + raft::sparse::linalg::sddmm(handle, + A, + B_col_major, + C_view, + raft::linalg::Operation::NON_TRANSPOSE, + raft::linalg::Operation::NON_TRANSPOSE, + *alpha, + *beta); + } else { + raft::sparse::distance::detail::faster_dot_on_csr(handle, + C.get_elements().data(), + compressed_C_view.get_nnz(), + compressed_C_view.get_indptr().data(), + compressed_C_view.get_indices().data(), + A.data_handle(), + B.data_handle(), + compressed_C_view.get_n_rows(), + dim); + } +} + +} // namespace detail +} // namespace linalg +} // namespace sparse +} // namespace raft diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.hpp b/cpp/include/raft/sparse/linalg/masked_matmul.hpp new file mode 100644 index 
0000000000..560cd3f715 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/masked_matmul.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain A copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * @defgroup masked_matmul Masked Matrix Multiplication + * @{ + */ + +/** + * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise + * multiplication with the sparsity pattern defined by the mask, resulting in the computation + * C = alpha * ((A * B) ∘ spy(mask)) + beta * C. + * + * This function multiplies two dense matrices A and B, and then applies an element-wise + * multiplication using the sparsity pattern provided by the mask. The result is scaled by alpha + * and added to beta times the original matrix C. + * + * @tparam value_t Data type of elements in the input/output matrices (e.g., float, double) + * @tparam index_t Type used for matrix indices + * @tparam nnz_t Type used for the number of non-zero entries in CSR format + * @tparam bitmap_t Type of the bitmap used for the mask + * + * @param[in] handle RAFT handle for resource management + * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k] + * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k] + * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape + * [m, n]. Each bit in the mask indicates whether the corresponding element pair in A and B is + * included (1) or masked out (0). + * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m, + * n] + * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if + * std::nullopt) + * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt) + */ +template +void masked_matmul(raft::resources const& handle, + raft::device_matrix_view A, + raft::device_matrix_view B, + raft::core::bitmap_view mask, + raft::device_csr_matrix_view C, + std::optional> alpha = std::nullopt, + std::optional> beta = std::nullopt) +{ + detail::masked_matmul(handle, A, B, mask, C, alpha, beta); +} + +/** @} */ // end of masked_matmul + +} // end namespace linalg +} // end namespace sparse +} // end namespace raft diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 6c330acb26..ee39c87a68 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,61 +26,23 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! 
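For the masked_matmul API added above, a naive host-side reference of the computation it performs, C = alpha * ((A * B^T) o spy(mask)) + beta * C, using plain vectors for clarity (the GPU path instead builds the CSR structure from the bitmap and dispatches to SDDMM or a fused dot kernel):

#include <cstdint>
#include <vector>

// A is m x k, B is n x k (both row-major); mask is m x n with 1 = entry kept.
// Entries masked out are simply absent from the CSR output; they are written as 0 here.
void masked_matmul_reference(const std::vector<float>& A,
                             const std::vector<float>& B,
                             const std::vector<uint8_t>& mask,
                             std::vector<float>& C,
                             int m, int n, int k,
                             float alpha = 1.0f, float beta = 0.0f)
{
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      if (!mask[i * n + j]) {
        C[i * n + j] = 0.0f;
        continue;
      }
      float dot = 0.0f;
      for (int l = 0; l < k; ++l) {
        dot += A[i * k + l] * B[j * k + l];
      }
      C[i * n + j] = alpha * dot + beta * C[i * n + j];
    }
  }
}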
-template -RAFT_KERNEL meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) - thread_data += (colId < D) ? data[i * D + colId] : Type(0); - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - thread_data += data[idx]; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } -} - template void mean( Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor<<>>(mu, data, D, N); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); - } else { - meanKernelColMajor<<>>(mu, data, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + raft::linalg::reduce(mu, + data, + D, + N, + Type(0), + rowMajor, + false, + stream, + false, + raft::identity_op(), + raft::add_op(), + raft::mul_const_op(ratio)); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index bc2644a233..4c861b49fb 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -16,7 +16,9 @@ #pragma once +#include #include +#include #include #include @@ -25,63 +27,6 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColPerBlk has been tested only for 32! -template -RAFT_KERNEL stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - Type val = (colId < D) ? 
data[i * D + colId] : Type(0); - thread_data += val * val; - } - __shared__ Type sstd[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) sstd[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(sstd + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(std + colId, sstd[thisColId]); -} - -template -RAFT_KERNEL stddevKernelColMajor(Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { std[blockIdx.x] = raft::sqrt(acc / N); } -} - -template -RAFT_KERNEL varsKernelColMajor(Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } -} - /** * @brief Compute stddev of the input matrix * @@ -110,26 +55,22 @@ void stddev(Type* std, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(std, data, D, N); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - std, - std, - mu, - D, - [ratio] __device__(Type a, Type b) { return raft::sqrt(a * ratio - b * b); }, - stream); - } else { - stddevKernelColMajor<<>>(std, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? ratio * Type(N) : Type(1); + raft::linalg::binaryOp(std, + std, + mu, + D, + raft::compose_op(raft::sqrt_op(), + raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } /** @@ -160,21 +101,21 @@ void vars(Type* var, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(var, data, D, N); - Type ratio = Type(1) / (sample ? 
Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); - } else { - varsKernelColMajor<<>>(var, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? ratio * Type(N) : Type(1); + raft::linalg::binaryOp(var, + var, + mu, + D, + raft::compose_op(raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 4f85536e6c..39bd2c3b6c 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,106 +26,10 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - thread_sum += (colId < D) ? data[i * D + colId] : Type(0); - } - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - Type thread_c = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - // KahanBabushkaNeumaierSum - const Type cur_value = (colId < D) ? 
data[i * D + colId] : Type(0); - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_sum = Type(0); - Type thread_c = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - // KahanBabushkaNeumaierSum - IdxType idx = colStart + i; - const Type cur_value = data[idx]; - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - Type acc = BlockReduce(temp_storage).Sum(thread_sum); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } -} - template void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int ColsPerBlk = 8; - static const int MinRowsPerThread = 16; - static const int MinRowsPerBlk = (TPB / ColsPerBlk) * MinRowsPerThread; - static const int MaxBlocksDimX = 8192; - - const IdxType grid_y = raft::ceildiv(D, (IdxType)ColsPerBlk); - const IdxType grid_x = - raft::min((IdxType)MaxBlocksDimX, raft::ceildiv(N, (IdxType)MinRowsPerBlk)); - - dim3 grid(grid_x, grid_y); - RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D)); - sumKahanKernelRowMajor - <<>>(output, input, D, N); - } else { - sumKahanKernelColMajor<<>>(output, input, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce(output, input, D, N, Type(0), rowMajor, false, stream); } } // namespace detail diff --git a/cpp/include/raft/core/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh similarity index 86% rename from cpp/include/raft/core/detail/popc.cuh rename to cpp/include/raft/util/detail/popc.cuh index d74b68b715..20b4814216 100644 --- a/cpp/include/raft/core/detail/popc.cuh +++ b/cpp/include/raft/util/detail/popc.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -28,15 +29,15 @@ namespace raft::detail { * @tparam value_t the value type of the vector. * @tparam index_t the index type of vector and scalar. * - * @param[in] res raft handle for managing expensive resources - * @param[in] values Number of row in the matrix. + * @param[in] res RAFT handle for managing expensive resources + * @param[in] values Device vector view containing the values to be processed. * @param[in] max_len Maximum number of bits to count. - * @param[out] counter Number of bits that are set to 1. + * @param[out] counter Device scalar view to store the number of bits that are set to 1. 
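For the mean/stddev/vars/sum rework above, the identity the new stddev path relies on, condensed into a host-side reference (illustrative, not part of the patch): reduce() first produces sum(x^2) per column, and the binaryOp then forms sum(x^2) * ratio - mu^2 * ratio_mean, which for the sample case (ratio = 1/(N-1), ratio_mean = N/(N-1)) equals (sum(x^2) - N * mu^2) / (N - 1).

#include <cmath>
#include <vector>

float sample_stddev_reference(const std::vector<float>& x)
{
  const float N = static_cast<float>(x.size());
  float sum = 0.f, sum_sq = 0.f;
  for (float v : x) { sum += v; sum_sq += v * v; }
  const float mu         = sum / N;
  const float ratio      = 1.f / (N - 1.f);       // sample case
  const float ratio_mean = ratio * N;             // 1.0 in the population case
  // abs() mirrors the raft::abs_op in the composed epilogue, guarding tiny negative rounding.
  return std::sqrt(std::abs(sum_sq * ratio - mu * mu * ratio_mean));
}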
*/ template void popc(const raft::resources& res, device_vector_view values, - index_t max_len, + raft::host_scalar_view max_len, raft::device_scalar_view counter) { auto values_size = values.size(); @@ -46,7 +47,7 @@ void popc(const raft::resources& res, static constexpr index_t len_per_item = sizeof(value_t) * 8; - value_t tail_len = (max_len % len_per_item); + value_t tail_len = (max_len[0] % len_per_item); value_t tail_mask = tail_len ? (value_t)((value_t{1} << tail_len) - value_t{1}) : ~value_t{0}; raft::linalg::coalesced_reduction( res, diff --git a/cpp/include/raft/util/popc.cuh b/cpp/include/raft/util/popc.cuh new file mode 100644 index 0000000000..153694e45e --- /dev/null +++ b/cpp/include/raft/util/popc.cuh @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { + +/** + * @brief Count the number of bits that are set to 1 in a vector. + * + * @tparam value_t the value type of the vector. + * @tparam index_t the index type of vector and scalar. + * + * @param[in] res RAFT handle for managing expensive resources + * @param[in] values Device vector view containing the values to be processed. + * @param[in] max_len Host scalar view to store the Maximum number of bits to count. + * @param[out] counter Device scalar view to store the number of bits that are set to 1. + */ +template +void popc(const raft::resources& res, + device_vector_view values, + raft::host_scalar_view max_len, + raft::device_scalar_view counter) +{ + detail::popc(res, values, max_len, counter); +} + +} // namespace raft diff --git a/cpp/include/raft/util/reduction.cuh b/cpp/include/raft/util/reduction.cuh index c0d3da7609..504f35fa0f 100644 --- a/cpp/include/raft/util/reduction.cuh +++ b/cpp/include/raft/util/reduction.cuh @@ -98,7 +98,7 @@ DI T blockReduce(T val, char* smem, ReduceLambda reduce_op = raft::add_op{}) val = warpReduce(val, reduce_op); if (lid == 0) sTemp[wid] = val; __syncthreads(); - val = lid < nWarps ? sTemp[lid] : T(0); + val = lid < nWarps ? sTemp[lid] : T(); return warpReduce(val, reduce_op); } diff --git a/cpp/include/raft_runtime/neighbors/refine.hpp b/cpp/include/raft_runtime/neighbors/refine.hpp index fba7d0fc0e..592c8be82b 100644 --- a/cpp/include/raft_runtime/neighbors/refine.hpp +++ b/cpp/include/raft_runtime/neighbors/refine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
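A short usage sketch for the relocated raft::popc wrapper above, counting every set bit in a device vector of 32-bit words; the factory calls and header paths are assumptions based on the signatures quoted in this diff:

#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/util/popc.cuh>

void count_bits(raft::resources const& res,
                raft::device_vector_view<uint32_t, uint32_t> values)
{
  // Count all bits of every word; a smaller max_len would mask off the tail of the last word.
  uint32_t max_len = static_cast<uint32_t>(values.size()) * 32;
  auto counter     = raft::make_device_scalar<uint32_t>(res, 0);
  raft::popc(res,
             values,
             raft::make_host_scalar_view<uint32_t>(&max_len),
             counter.view());
}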
@@ -17,8 +17,9 @@ #pragma once #include +#include #include -// #include +#include namespace raft::runtime::neighbors { @@ -29,7 +30,7 @@ namespace raft::runtime::neighbors { raft::device_matrix_view neighbor_candidates, \ raft::device_matrix_view indices, \ raft::device_matrix_view distances, \ - distance::DistanceType metric); \ + raft::distance::DistanceType metric); \ \ void refine(raft::resources const& handle, \ raft::host_matrix_view dataset, \ @@ -37,7 +38,7 @@ namespace raft::runtime::neighbors { raft::host_matrix_view neighbor_candidates, \ raft::host_matrix_view indices, \ raft::host_matrix_view distances, \ - distance::DistanceType metric); + raft::distance::DistanceType metric); RAFT_INST_REFINE(int64_t, float); RAFT_INST_REFINE(int64_t, uint8_t); diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py index 97fe120458..6adff0eee1 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ # NOTE: this template is not perfectly formatted. Use pre-commit to get # everything in shape again. header = """/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -95,6 +95,11 @@ OpT="raft::distance::detail::ops::cosine_distance_op", archs = [60, 80], ), + dict( + path_prefix="dice", + OpT="raft::distance::detail::ops::dice_distance_op", + archs = [60, 80], + ), dict( path_prefix="hamming_unexpanded", OpT="raft::distance::detail::ops::hamming_distance_op", diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu new file mode 100644 index 0000000000..a259f8b3b0 --- /dev/null +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by dispatch_00_generate.py + * + * Make changes there and run in this directory: + * + * > python dispatch_00_generate.py + * + */ + +#include // raft::identity_op +#include // ops::* +#include // dispatch +#include +#include +#define instantiate_raft_distance_detail_pairwise_matrix_dispatch( \ + OpT, DataT, AccT, OutT, FinOpT, IdxT) \ + template void raft::distance::detail:: \ + pairwise_matrix_dispatch, DataT, AccT, OutT, FinOpT, IdxT>( \ + OpT distance_op, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + const DataT* x, \ + const DataT* y, \ + const DataT* x_norm, \ + const DataT* y_norm, \ + OutT* out, \ + FinOpT fin_op, \ + cudaStream_t stream, \ + bool is_row_major) + +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, double, double, double, raft::identity_op, int); + +#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu new file mode 100644 index 0000000000..e89f8b422c --- /dev/null +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by dispatch_00_generate.py + * + * Make changes there and run in this directory: + * + * > python dispatch_00_generate.py + * + */ + +#include // raft::identity_op +#include // ops::* +#include // dispatch +#include +#include +#define instantiate_raft_distance_detail_pairwise_matrix_dispatch( \ + OpT, DataT, AccT, OutT, FinOpT, IdxT) \ + template void raft::distance::detail:: \ + pairwise_matrix_dispatch, DataT, AccT, OutT, FinOpT, IdxT>( \ + OpT distance_op, \ + IdxT m, \ + IdxT n, \ + IdxT k, \ + const DataT* x, \ + const DataT* y, \ + const DataT* x_norm, \ + const DataT* y_norm, \ + OutT* out, \ + FinOpT fin_op, \ + cudaStream_t stream, \ + bool is_row_major) + +instantiate_raft_distance_detail_pairwise_matrix_dispatch( + raft::distance::detail::ops::dice_distance_op, float, float, float, raft::identity_op, int); + +#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu index 8c94608311..8fe0bf2007 100644 --- a/cpp/src/distance/distance.cu +++ b/cpp/src/distance/distance.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -72,6 +72,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::identity_op, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, raft::identity_op, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int); instantiate_raft_distance_distance( @@ -154,6 +158,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -230,6 +238,10 @@ instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_distance( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_distance( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_distance( @@ -297,6 +309,10 @@ instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::CosineExpanded, double, double, double, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, double, double, double, int); instantiate_raft_distance_getWorkspaceSize( raft::distance::DistanceType::HammingUnexpanded, float, float, float, int); instantiate_raft_distance_getWorkspaceSize( @@ -415,6 +431,22 @@ instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineE double, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_c_contiguous); +instantiate_raft_distance_getWorkspaceSize( + raft::distance::DistanceType::DiceExpanded, float, float, float, int, raft::layout_f_contiguous); +instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + int, + raft::layout_f_contiguous); instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded, float, float, @@ -690,6 +722,22 @@ instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded, double, 
raft::layout_f_contiguous, int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_c_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_c_contiguous, + int); +instantiate_raft_distance_distance( + raft::distance::DistanceType::DiceExpanded, float, float, float, raft::layout_f_contiguous, int); +instantiate_raft_distance_distance(raft::distance::DistanceType::DiceExpanded, + double, + double, + double, + raft::layout_f_contiguous, + int); instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded, float, float, diff --git a/cpp/src/raft_runtime/neighbors/hnsw.cpp b/cpp/src/raft_runtime/neighbors/hnsw.cpp index 6eb770abd6..5356e708d2 100644 --- a/cpp/src/raft_runtime/neighbors/hnsw.cpp +++ b/cpp/src/raft_runtime/neighbors/hnsw.cpp @@ -21,6 +21,8 @@ #include #include +#include +#include namespace raft::neighbors::hnsw { #define RAFT_INST_HNSW(T) \ @@ -28,7 +30,11 @@ namespace raft::neighbors::hnsw { std::unique_ptr> from_cagra( \ raft::resources const& res, raft::neighbors::cagra::index cagra_index) \ { \ - std::string filepath = "/tmp/cagra_index.bin"; \ + std::random_device dev; \ + std::mt19937 rng(dev()); \ + std::uniform_int_distribution dist(0); \ + auto uuid = std::to_string(dist(rng)); \ + std::string filepath = "/tmp/" + uuid + ".bin"; \ raft::runtime::neighbors::cagra::serialize_to_hnswlib(res, filepath, cagra_index); \ auto hnsw_index = raft::runtime::neighbors::hnsw::deserialize_file( \ res, filepath, cagra_index.dim(), cagra_index.metric()); \ diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake index 11d2403963..0f1d5ff020 100644 --- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake +++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. 
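A hedged sketch of calling the newly instantiated DiceExpanded metric from the public pairwise_distance API; the mdspan overload and its exact parameter types are assumptions, only the DistanceType::DiceExpanded value comes from this diff:

#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>
#include <raft/distance/distance.cuh>

// Compute an m x n Dice distance matrix between the rows of x [m, k] and y [n, k].
void dice_pairwise(raft::resources const& handle,
                   raft::device_matrix_view<float, int> x,
                   raft::device_matrix_view<float, int> y,
                   raft::device_matrix_view<float, int> out)
{
  raft::distance::pairwise_distance(
    handle, x, y, out, raft::distance::DistanceType::DiceExpanded);
}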
# Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "24.06") +set(RAPIDS_VERSION "24.08") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index ff0518a4d0..e3af6ebb78 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -113,6 +113,7 @@ if(BUILD_TESTS) NAME CORE_TEST PATH + core/bitmap.cu core/bitset.cu core/device_resources_manager.cpp core/device_setter.cpp @@ -156,6 +157,7 @@ if(BUILD_TESTS) distance/dist_canberra.cu distance/dist_correlation.cu distance/dist_cos.cu + distance/dist_dice.cu distance/dist_hamming.cu distance/dist_hellinger.cu distance/dist_inner_product.cu @@ -315,6 +317,7 @@ if(BUILD_TESTS) sparse/csr_transpose.cu sparse/degree.cu sparse/filter.cu + sparse/masked_matmul.cu sparse/norm.cu sparse/normalize.cu sparse/reduce.cu @@ -505,6 +508,7 @@ if(BUILD_TESTS) util/integer_utils.cpp util/integer_utils.cu util/memory_type_dispatcher.cu + util/popc.cu util/pow2_utils.cu util/reduction.cu ) diff --git a/cpp/test/core/bitmap.cu b/cpp/test/core/bitmap.cu new file mode 100644 index 0000000000..358c08a50f --- /dev/null +++ b/cpp/test/core/bitmap.cu @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace raft::core { + +template +struct test_spec_bitmap { + index_t rows; + index_t cols; + index_t mask_len; + index_t query_len; +}; + +template +auto operator<<(std::ostream& os, const test_spec_bitmap& ss) -> std::ostream& +{ + os << "bitmap{rows: " << ss.rows << ", cols: " << ss.cols << ", mask_len: " << ss.mask_len + << ", query_len: " << ss.query_len << "}"; + return os; +} + +template +void create_cpu_bitmap(std::vector& bitmap, + std::vector& mask_idx, + const index_t rows, + const index_t cols) +{ + for (size_t i = 0; i < bitmap.size(); i++) { + bitmap[i] = ~bitmap_t(0x00); + } + constexpr size_t bitmap_element_size = sizeof(bitmap_t) * 8; + for (size_t i = 0; i < mask_idx.size(); i++) { + auto row = mask_idx[i] / cols; + auto col = mask_idx[i] % cols; + auto idx = row * cols + col; + bitmap[idx / bitmap_element_size] &= ~(bitmap_t{1} << (idx % bitmap_element_size)); + } +} + +template +void test_cpu_bitmap(const std::vector& bitmap, + const std::vector& queries, + std::vector& result, + index_t rows, + index_t cols) +{ + constexpr size_t bitmap_element_size = sizeof(bitmap_t) * 8; + for (size_t i = 0; i < queries.size(); i++) { + auto row = queries[i] / cols; + auto col = queries[i] % cols; + auto idx = row * cols + col; + result[i] = uint8_t( + (bitmap[idx / bitmap_element_size] & (bitmap_t{1} << (idx % bitmap_element_size))) != 0); + } +} + +template +class BitmapTest : public testing::TestWithParam> { + protected: + index_t static constexpr const bitmap_element_size = sizeof(bitmap_t) * 8; + const test_spec_bitmap spec; + std::vector bitmap_result; + std::vector bitmap_ref; + raft::resources res; + + public: + explicit BitmapTest() + : spec(testing::TestWithParam>::GetParam()), + bitmap_result(raft::ceildiv(spec.rows * spec.cols, index_t(bitmap_element_size))), + bitmap_ref(raft::ceildiv(spec.rows * spec.cols, index_t(bitmap_element_size))) + { + } + + void run() + { + auto stream = resource::get_cuda_stream(res); + + raft::random::RngState rng(42); + auto mask_device = raft::make_device_vector(res, spec.mask_len); + std::vector mask_cpu(spec.mask_len); + raft::random::uniformInt( + res, rng, mask_device.view(), index_t(0), index_t(spec.rows * spec.cols)); + raft::update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream); + resource::sync_stream(res, stream); + + create_cpu_bitmap(bitmap_ref, mask_cpu, spec.rows, spec.cols); + + auto bitset_d = raft::core::bitset( + res, raft::make_const_mdspan(mask_device.view()), index_t(spec.rows * spec.cols)); + + auto bitmap_view_d = + raft::core::bitmap_view(bitset_d.data(), spec.rows, spec.cols); + + ASSERT_EQ(bitmap_view_d.get_n_rows(), spec.rows); + ASSERT_EQ(bitmap_view_d.get_n_cols(), spec.cols); + + auto query_device = raft::make_device_vector(res, spec.query_len); + auto result_device = raft::make_device_vector(res, spec.query_len); + auto query_cpu = std::vector(spec.query_len); + auto result_cpu = std::vector(spec.query_len); + auto result_ref = std::vector(spec.query_len); + + raft::random::uniformInt( + res, rng, query_device.view(), index_t(0), index_t(spec.rows * spec.cols)); + raft::update_host(query_cpu.data(), query_device.data_handle(), query_device.extent(0), stream); + + auto queries_device_view = + raft::make_device_vector_view(query_device.data_handle(), spec.query_len); + + raft::linalg::map( + res, + result_device.view(), + [bitmap_view_d] __device__(index_t 
query) { + auto row = query / bitmap_view_d.get_n_cols(); + auto col = query % bitmap_view_d.get_n_cols(); + return (uint8_t)(bitmap_view_d.test(row, col)); + }, + queries_device_view); + + raft::update_host(result_cpu.data(), result_device.data_handle(), query_device.size(), stream); + resource::sync_stream(res, stream); + + test_cpu_bitmap(bitmap_ref, query_cpu, result_ref, spec.rows, spec.cols); + + ASSERT_TRUE(hostVecMatch(result_cpu, result_ref, Compare())); + + raft::random::uniformInt( + res, rng, mask_device.view(), index_t(0), index_t(spec.rows * spec.cols)); + raft::update_host(mask_cpu.data(), mask_device.data_handle(), mask_device.extent(0), stream); + resource::sync_stream(res, stream); + + thrust::for_each_n(raft::resource::get_thrust_policy(res), + mask_device.data_handle(), + mask_device.extent(0), + [bitmap_view_d] __device__(const index_t sample_index) { + auto row = sample_index / bitmap_view_d.get_n_cols(); + auto col = sample_index % bitmap_view_d.get_n_cols(); + bitmap_view_d.set(row, col, false); + }); + + raft::update_host(bitmap_result.data(), bitmap_view_d.data(), bitmap_result.size(), stream); + + for (size_t i = 0; i < mask_cpu.size(); i++) { + auto row = mask_cpu[i] / spec.cols; + auto col = mask_cpu[i] % spec.cols; + auto idx = row * spec.cols + col; + bitmap_ref[idx / bitmap_element_size] &= ~(bitmap_t{1} << (idx % bitmap_element_size)); + } + resource::sync_stream(res, stream); + ASSERT_TRUE(hostVecMatch(bitmap_ref, bitmap_result, raft::Compare())); + } +}; + +template +auto inputs_bitmap = + ::testing::Values(test_spec_bitmap{32, 32, 5, 10}, + test_spec_bitmap{100, 100, 30, 10}, + test_spec_bitmap{1024, 1024, 55, 100}, + test_spec_bitmap{10000, 10000, 1000, 1000}, + test_spec_bitmap{1 << 15, 1 << 15, 1 << 3, 1 << 12}, + test_spec_bitmap{1 << 15, 1 << 15, 1 << 24, 1 << 13}); + +using BitmapTest_Uint32_32 = BitmapTest; +TEST_P(BitmapTest_Uint32_32, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint32_32, inputs_bitmap); + +using BitmapTest_Uint64_32 = BitmapTest; +TEST_P(BitmapTest_Uint64_32, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint64_32, inputs_bitmap); + +using BitmapTest_Uint32_64 = BitmapTest; +TEST_P(BitmapTest_Uint32_64, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint32_64, inputs_bitmap); + +using BitmapTest_Uint64_64 = BitmapTest; +TEST_P(BitmapTest_Uint64_64, Run) { run(); } +INSTANTIATE_TEST_CASE_P(BitmapTest, BitmapTest_Uint64_64, inputs_bitmap); + +} // namespace raft::core diff --git a/cpp/test/core/memory_type.cpp b/cpp/test/core/memory_type.cpp index 07ca66ccc1..f020215595 100644 --- a/cpp/test/core/memory_type.cpp +++ b/cpp/test/core/memory_type.cpp @@ -49,7 +49,7 @@ TEST(MemoryTypeFromPointer, Host) auto ptr1 = static_cast(nullptr); cudaMallocHost(&ptr1, 1); EXPECT_EQ(memory_type_from_pointer(ptr1), memory_type::host); - cudaFree(ptr1); + cudaFreeHost(ptr1); auto ptr2 = static_cast(nullptr); EXPECT_EQ(memory_type_from_pointer(ptr2), memory_type::host); } diff --git a/cpp/test/distance/dist_dice.cu b/cpp/test/distance/dist_dice.cu new file mode 100644 index 0000000000..e127659dc6 --- /dev/null +++ b/cpp/test/distance/dist_dice.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceExpDice : public DistanceTest { +}; + +template +class DistanceExpDiceXequalY + : public DistanceTestSameBuffer {}; + +const std::vector> inputsf = { + {0.001f, 128, (65536 + 128) * 128, 8, true, 1234ULL}, + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, (65536 + 128) * 128, 128, 8, false, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; + +const std::vector> inputsXeqYf = { + {0.01f, 1024, 1024, 32, true, 1234ULL}, + {0.01f, 1024, 32, 1024, true, 1234ULL}, + {0.01f, 32, 1024, 1024, true, 1234ULL}, + {0.03f, 1024, 1024, 1024, true, 1234ULL}, + {0.01f, 1024, 1024, 32, false, 1234ULL}, + {0.01f, 1024, 32, 1024, false, 1234ULL}, + {0.01f, 32, 1024, 1024, false, 1234ULL}, + {0.03f, 1024, 1024, 1024, false, 1234ULL}, +}; + +typedef DistanceExpDice DistanceExpDiceF; +TEST_P(DistanceExpDiceF, Result) +{ + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApproxNaN(params.tolerance), stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceF, ::testing::ValuesIn(inputsf)); + +typedef DistanceExpDiceXequalY DistanceExpDiceXequalYF; +TEST_P(DistanceExpDiceXequalYF, Result) +{ + int m = params.m; + int n = params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref[0].data(), + dist[0].data(), + m, + n, + raft::CompareApproxNaN(params.tolerance), + stream)); + n = params.isRowMajor ? m : m / 2; + m = params.isRowMajor ? m / 2 : m; + + ASSERT_TRUE(raft::devArrMatch(dist_ref[1].data(), + dist[1].data(), + m, + n, + raft::CompareApproxNaN(params.tolerance), + stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceXequalYF, ::testing::ValuesIn(inputsXeqYf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceExpDice DistanceExpDiceD; +TEST_P(DistanceExpDiceD, Result) +{ + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? 
params.n : params.m; + ASSERT_TRUE(devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApproxNaN(params.tolerance), stream)); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceD, ::testing::ValuesIn(inputsd)); + +class BigMatrixDice : public BigMatrixDistanceTest {}; +TEST_F(BigMatrixDice, Result) {} + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index 2854a8f3df..f44fb18519 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -96,6 +96,34 @@ RAFT_KERNEL naiveL1_Linf_CanberraDistanceKernel(DataType* dist, dist[outidx] = acc; } +template +RAFT_KERNEL naiveDiceDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) { return; } + + DataType acc_a = DataType(0); + DataType acc_b = DataType(0); + DataType acc_ab = DataType(0); + + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + acc_a += a; + acc_b += b; + acc_ab += a * b; + } + + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + + // Use 1.0 - (dice dissimilarity) to calc the distance + dist[outidx] = (DataType)1.0 - (2 * acc_ab / ((acc_a) + (acc_b))); +} + template RAFT_KERNEL naiveCosineDistanceKernel( DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) @@ -391,6 +419,9 @@ void naiveDistance(DataType* dist, naiveCorrelationDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor); break; + case raft::distance::DistanceType::DiceExpanded: + naiveDiceDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); + break; default: FAIL() << "should be here\n"; } RAFT_CUDA_TRY(cudaPeekAtLastError()); @@ -482,7 +513,8 @@ class DistanceTest : public ::testing::TestWithParam> { // Hellinger works only on positive numbers uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0)); uniform(handle, r, y.data(), n * k, DataType(0.0), DataType(1.0)); - } else if (distanceType == raft::distance::DistanceType::RusselRaoExpanded) { + } else if (distanceType == raft::distance::DistanceType::RusselRaoExpanded || + distanceType == raft::distance::DistanceType::DiceExpanded) { uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0)); uniform(handle, r, y.data(), n * k, DataType(0.0), DataType(1.0)); // Russel rao works on boolean values. 
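For reference, the naive kernel added above computes dist(x, y) = 1 − 2·Σᵢ xᵢyᵢ / (Σᵢ xᵢ + Σᵢ yᵢ), which for 0/1 vectors is the usual Dice dissimilarity 1 − 2|X∩Y| / (|X| + |Y|). The following is a minimal NumPy sketch of that reference, illustrative only (the function names are hypothetical and not part of the RAFT API), assuming row vectors of equal length:

```python
# Host-side reference for the DiceExpanded distance exercised by the tests above.
# Illustrative sketch only -- not a RAFT API; names are hypothetical.
import numpy as np


def dice_distance(x: np.ndarray, y: np.ndarray) -> float:
    """1 - 2*sum(x*y) / (sum(x) + sum(y)); for 0/1 vectors this is the Dice dissimilarity."""
    num = 2.0 * np.dot(x, y)
    den = x.sum() + y.sum()
    return float(1.0 - num / den)


def pairwise_dice(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """All-pairs distances between the rows of a (m x k) and b (n x k)."""
    return np.array([[dice_distance(x, y) for y in b] for x in a])


# Tiny usage example with binarized inputs:
rng = np.random.default_rng(1234)
a = (rng.random((4, 8)) > 0.5).astype(np.float32)
b = (rng.random((3, 8)) > 0.5).astype(np.float32)
print(pairwise_dice(a, b))
```

A pair of all-zero rows drives the denominator to zero and the result to NaN (here and in the device kernel alike), which is presumably why these tests compare results through the NaN-aware CompareApproxNaN matcher rather than plain CompareApprox.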
@@ -571,7 +603,8 @@ class DistanceTestSameBuffer : public ::testing::TestWithParam> { eig_vals_large, eig_vals_jacobi_large; }; +TEST(Raft, EigStream) +{ + // Separate test to check eig_dc stream workaround for CUDA 12+ + raft::resources handle; + auto n_rows = 5000; + auto cov_matrix_stream = + raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vectors_stream = + raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vals_stream = raft::make_device_vector(handle, n_rows); + + raft::linalg::eig_dc(handle, + raft::make_const_mdspan(cov_matrix_stream.view()), + eig_vectors_stream.view(), + eig_vals_stream.view()); + raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle)); +} + const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 495af081f1..f74cadb415 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -15,11 +15,11 @@ */ #pragma once -#include "../test_utils.cuh" #include "ann_utils.cuh" #include #include +#include #include #include @@ -65,7 +65,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { { size_t queries_size = ps.n_rows * ps.graph_degree; std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); { rmm::device_uvector distances_naive_dev(queries_size, stream_); @@ -81,6 +83,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam { ps.graph_degree, ps.metric); update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); resource::sync_stream(handle_); } @@ -91,6 +94,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam { index_params.graph_degree = ps.graph_degree; index_params.intermediate_graph_degree = 2 * ps.graph_degree; index_params.max_iterations = 100; + index_params.return_distances = true; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -102,20 +106,39 @@ class AnnNNDescentTest : public ::testing::TestWithParam { auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); auto index = nn_descent::build(handle_, index_params, database_host_view); - update_host( + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + } else { auto index = nn_descent::build(handle_, index_params, database_view); - update_host( + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } }; } resource::sync_stream(handle_); } double min_recall = ps.min_recall; - EXPECT_TRUE(eval_recall( - indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall)); + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.001, + min_recall)); } } diff --git a/cpp/test/sparse/masked_matmul.cu 
b/cpp/test/sparse/masked_matmul.cu new file mode 100644 index 0000000000..0ece716a1b --- /dev/null +++ b/cpp/test/sparse/masked_matmul.cu @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace raft { +namespace sparse { + +template +struct MaskedMatmulInputs { + value_t tolerance; + + index_t m; + index_t k; + index_t n; + + value_t sparsity; + + unsigned long long int seed; +}; + +template +struct sum_abs_op { + __host__ __device__ value_t operator()(const value_t& x, const value_t& y) const + { + return y >= value_t(0.0) ? (x + y) : (x - y); + } +}; + +template +::std::ostream& operator<<(::std::ostream& os, const MaskedMatmulInputs& params) +{ + os << " m: " << params.m << "\tk: " << params.k << "\tn: " << params.n + << "\tsparsity: " << params.sparsity; + + return os; +} + +template +class MaskedMatmulTest : public ::testing::TestWithParam> { + public: + MaskedMatmulTest() + : params(::testing::TestWithParam>::GetParam()), + stream(resource::get_cuda_stream(handle)), + a_data_d(0, resource::get_cuda_stream(handle)), + b_data_d(0, resource::get_cuda_stream(handle)), + bitmap_d(0, resource::get_cuda_stream(handle)), + c_indptr_d(0, resource::get_cuda_stream(handle)), + c_indices_d(0, resource::get_cuda_stream(handle)), + c_data_d(0, resource::get_cuda_stream(handle)), + c_expected_data_d(0, resource::get_cuda_stream(handle)) + { + } + + protected: + index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bitmap_t& element = bitmap[index / (8 * sizeof(bitmap_t))]; + index_t bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void cpu_convert_to_csr(std::vector& bitmap, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + index_t index = 0; + bitmap_t element = 0; + index_t bit_position = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + index = i * cols + j; + element = bitmap[index / (8 * sizeof(bitmap_t))]; + bit_position = index % (8 * sizeof(bitmap_t)); + + if (((element >> bit_position) & 1)) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = static_cast(offset_values); + } + } + + void cpu_sddmm(const std::vector& A, + const std::vector& B, + std::vector& 
vals, + const std::vector& cols, + const std::vector& row_ptrs, + bool is_row_major_A, + bool is_row_major_B) + { + if (params.m * params.k != static_cast(A.size()) || + params.k * params.n != static_cast(B.size())) { + std::cerr << "Matrix dimensions and vector size do not match!" << std::endl; + return; + } + + for (index_t i = 0; i < params.m; ++i) { + for (index_t j = row_ptrs[i]; j < row_ptrs[i + 1]; ++j) { + value_t sum = 0; + for (index_t l = 0; l < params.k; ++l) { + index_t a_index = i * params.k + l; + index_t b_index = cols[j] * params.k + l; + sum += A[a_index] * B[b_index]; + } + vals[j] = sum; + } + } + } + + void make_data() + { + index_t a_size = params.m * params.k; + index_t b_size = params.k * params.n; + index_t c_size = params.m * params.n; + + index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bitmap_t) * 8)); + std::vector bitmap_h(element); + + std::vector a_data_h(a_size); + std::vector b_data_h(b_size); + + a_data_d.resize(a_size, stream); + b_data_d.resize(b_size, stream); + bitmap_d.resize(bitmap_h.size(), stream); + + auto blobs_a_b = raft::make_device_matrix(handle, 1, a_size + b_size); + auto labels = raft::make_device_vector(handle, 1); + + raft::random::make_blobs(blobs_a_b.data_handle(), + labels.data_handle(), + 1, + a_size + b_size, + 1, + stream, + false, + nullptr, + nullptr, + value_t(1.0), + false, + value_t(-1.0f), + value_t(1.0f), + uint64_t(2024)); + + raft::copy(a_data_h.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_h.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + + raft::copy(a_data_d.data(), blobs_a_b.data_handle(), a_size, stream); + raft::copy(b_data_d.data(), blobs_a_b.data_handle() + a_size, b_size, stream); + + resource::sync_stream(handle); + + index_t c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h); + + std::vector c_indptr_h(params.m + 1); + std::vector c_indices_h(c_true_nnz); + std::vector c_data_h(c_true_nnz); + + cpu_convert_to_csr(bitmap_h, params.m, params.n, c_indices_h, c_indptr_h); + + c_data_d.resize(c_data_h.size(), stream); + + update_device(c_data_d.data(), c_data_h.data(), c_data_h.size(), stream); + update_device(bitmap_d.data(), bitmap_h.data(), bitmap_h.size(), stream); + resource::sync_stream(handle); + + cpu_sddmm(a_data_h, b_data_h, c_data_h, c_indices_h, c_indptr_h, true, true); + + c_indptr_d.resize(c_indptr_h.size(), stream); + c_indices_d.resize(c_indices_h.size(), stream); + c_expected_data_d.resize(c_data_h.size(), stream); + + update_device(c_indptr_d.data(), c_indptr_h.data(), c_indptr_h.size(), stream); + update_device(c_indices_d.data(), c_indices_h.data(), c_indices_h.size(), stream); + update_device(c_expected_data_d.data(), c_data_h.data(), c_data_h.size(), stream); + + resource::sync_stream(handle); + } + + void SetUp() override { make_data(); } + + void Run() + { + auto A = + raft::make_device_matrix_view(a_data_d.data(), params.m, params.k); + auto B = + raft::make_device_matrix_view(b_data_d.data(), params.n, params.k); + + auto mask = + raft::core::bitmap_view(bitmap_d.data(), params.m, params.n); + + auto c_structure = raft::make_device_compressed_structure_view( + c_indptr_d.data(), + c_indices_d.data(), + params.m, + params.n, + static_cast(c_indices_d.size())); + + auto C = raft::make_device_csr_matrix_view(c_data_d.data(), c_structure); + + raft::sparse::linalg::masked_matmul(handle, A, B, mask, C); + + resource::sync_stream(handle); + + ASSERT_TRUE(raft::devArrMatch(c_expected_data_d.data(), + 
C.get_elements().data(), + c_expected_data_d.size(), + raft::CompareApprox(params.tolerance), + stream)); + + thrust::device_ptr expected_data_ptr = + thrust::device_pointer_cast(c_expected_data_d.data()); + value_t sum_abs = thrust::reduce(thrust::cuda::par.on(stream), + expected_data_ptr, + expected_data_ptr + c_expected_data_d.size(), + value_t(0.0f), + sum_abs_op()); + value_t avg = sum_abs / (1.0f * c_expected_data_d.size()); + + ASSERT_GE(avg, (params.tolerance * static_cast(0.001f))); + } + + raft::resources handle; + cudaStream_t stream; + MaskedMatmulInputs params; + + rmm::device_uvector a_data_d; + rmm::device_uvector b_data_d; + rmm::device_uvector bitmap_d; + + rmm::device_uvector c_indptr_d; + rmm::device_uvector c_indices_d; + rmm::device_uvector c_data_d; + + rmm::device_uvector c_expected_data_d; +}; + +using MaskedMatmulTestF = MaskedMatmulTest; +TEST_P(MaskedMatmulTestF, Result) { Run(); } + +using MaskedMatmulTestD = MaskedMatmulTest; +TEST_P(MaskedMatmulTestD, Result) { Run(); } + +const std::vector> sddmm_inputs_f = { + {0.0001f, 10, 5, 32, 0.1, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, + {0.0003f, 32, 1024, 1024, 0.2, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.19, 1234ULL}, + {0.0001f, 1024, 1024, 32, 0.3, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.4, 1234ULL}, + {0.0003f, 32, 1024, 1024, 0.19, 1234ULL}, + {0.001f, 1024, 1024, 1024, 0.1, 1234ULL}}; + +const std::vector> sddmm_inputs_d = { + {0.0001f, 10, 5, 32, 0.01, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.1, 1234ULL}, + {0.0001f, 32, 1024, 1024, 0.2, 1234ULL}, + {0.0001f, 1024, 1024, 1024, 0.19, 1234ULL}, + {0.0001f, 1024, 1024, 32, 0.3, 1234ULL}, + {0.0001f, 1024, 32, 1024, 0.4, 1234ULL}, + {0.0001f, 32, 1024, 1024, 0.19, 1234ULL}, + {0.0001f, 1024, 1024, 1024, 0.1, 1234ULL}}; + +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestF, ::testing::ValuesIn(sddmm_inputs_f)); + +INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestD, ::testing::ValuesIn(sddmm_inputs_d)); + +} // namespace sparse +} // namespace raft diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index 41812979b6..602f356b9f 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -40,7 +40,8 @@ struct CovInputs { template ::std::ostream& operator<<(::std::ostream& os, const CovInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template @@ -71,8 +72,7 @@ class CovTest : public ::testing::TestWithParam> { cov_act.resize(cols * cols, stream); normal(handle, r, data.data(), len, params.mean, var); - raft::stats::mean( - mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream); if (params.rowMajor) { using layout = raft::row_major; cov(handle, @@ -102,7 +102,7 @@ class CovTest : public ::testing::TestWithParam> { raft::update_device(data_cm.data(), data_h, 6, stream); raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream); - raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream); + raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream); cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream); } diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 61b57ce739..c5fe83d95b 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,12 +35,14 @@ struct MeanInputs { 
int rows, cols; bool sample, rowMajor; unsigned long long int seed; + T stddev = (T)1.0; }; template ::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl; } template @@ -61,7 +63,7 @@ class MeanTest : public ::testing::TestWithParam> { { raft::random::RngState r(params.seed); int len = rows * cols; - normal(handle, r, data.data(), len, params.mean, (T)1.0); + normal(handle, r, data.data(), len, params.mean, params.stddev); meanSGtest(data.data(), stream); } @@ -96,38 +98,72 @@ class MeanTest : public ::testing::TestWithParam> { // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, - {2.0f, -1.f, 31, 120, false, false, 1234ULL}, {2.0f, -1.f, 1, 130, true, false, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, - {2.0f, -1.f, 31, 120, false, true, 1234ULL}, {2.0f, -1.f, 1, 130, false, true, 1234ULL}, - {2.0f, -1.f, 1, 1, false, false, 1234ULL}, {2.0f, -1.f, 1, 1, false, true, 1234ULL}, - {2.0f, -1.f, 7, 23, false, false, 1234ULL}, {2.0f, -1.f, 7, 23, false, true, 1234ULL}, - {2.0f, -1.f, 17, 5, false, false, 1234ULL}, {2.0f, -1.f, 17, 5, false, true, 1234ULL}}; + {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, + {2.0f, -1.f, 31, 120, false, false, 1234ULL}, + {2.0f, -1.f, 1, 130, false, false, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, + {2.0f, -1.f, 31, 120, false, true, 1234ULL}, + {2.0f, -1.f, 1, 130, false, true, 1234ULL}, + {2.0f, -1.f, 1, 1, false, false, 1234ULL}, + {2.0f, -1.f, 1, 
1, false, true, 1234ULL}, + {2.0f, -1.f, 7, 23, false, false, 1234ULL}, + {2.0f, -1.f, 7, 23, false, true, 1234ULL}, + {2.0f, -1.f, 17, 5, false, false, 1234ULL}, + {2.0f, -1.f, 17, 5, false, true, 1234ULL}, + {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f}, + {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}}; const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, {0.15, -1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, {0.15, -1.0, 1024, 256, false, true, 1234ULL}, - {0.15, -1.0, 1030, 1, false, false, 1234ULL}, {0.15, -1.0, 1030, 60, true, false, 1234ULL}, - {2.0, -1.0, 31, 120, false, false, 1234ULL}, {2.0, -1.0, 1, 130, true, false, 1234ULL}, - {0.15, -1.0, 1030, 1, false, true, 1234ULL}, {0.15, -1.0, 1030, 60, true, true, 1234ULL}, - {2.0, -1.0, 31, 120, false, true, 1234ULL}, {2.0, -1.0, 1, 130, false, true, 1234ULL}, - {2.0, -1.0, 1, 1, false, false, 1234ULL}, {2.0, -1.0, 1, 1, false, true, 1234ULL}, - {2.0, -1.0, 7, 23, false, false, 1234ULL}, {2.0, -1.0, 7, 23, false, true, 1234ULL}, - {2.0, -1.0, 17, 5, false, false, 1234ULL}, {2.0, -1.0, 17, 5, false, true, 1234ULL}}; + {0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}, + {0.15, -1.0, 1030, 1, false, false, 1234ULL}, + {0.15, -1.0, 1030, 60, true, false, 1234ULL}, + {2.0, -1.0, 31, 120, false, false, 1234ULL}, + {2.0, -1.0, 1, 130, false, false, 1234ULL}, + {0.15, -1.0, 1030, 1, false, true, 1234ULL}, + {0.15, -1.0, 1030, 60, true, true, 1234ULL}, + {2.0, -1.0, 31, 120, false, true, 1234ULL}, + {2.0, -1.0, 1, 130, false, true, 1234ULL}, + {2.0, -1.0, 1, 1, false, false, 1234ULL}, + {2.0, -1.0, 1, 1, false, true, 1234ULL}, + {2.0, -1.0, 7, 23, false, false, 1234ULL}, + {2.0, -1.0, 7, 23, false, true, 1234ULL}, + {2.0, -1.0, 17, 5, false, false, 1234ULL}, + {2.0, -1.0, 17, 5, false, true, 1234ULL}, + {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001}, + {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}}; typedef MeanTest MeanTestF; TEST_P(MeanTestF, Result) diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 641621c1c6..f4c5f92f49 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -39,7 +39,8 @@ struct StdDevInputs { template ::std::ostream& 
operator<<(::std::ostream& os, const StdDevInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template @@ -81,7 +82,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), @@ -99,7 +100,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), @@ -147,13 +148,15 @@ const std::vector> inputsf = { {0.5f, -1.f, 2.f, 31, 1, true, true, 1234ULL}, {1.f, -1.f, 2.f, 1, 257, false, true, 1234ULL}, {0.5f, -1.f, 2.f, 31, 1, false, false, 1234ULL}, - {1.f, -1.f, 2.f, 1, 257, true, false, 1234ULL}, + {1.f, -1.f, 2.f, 1, 257, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, false, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, false, 1234ULL}, {1.f, -1.f, 2.f, 17, 5, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, true, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, true, 1234ULL}, - {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}}; + {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, false, 1234ULL}}; const std::vector> inputsd = { {0.1, 1.0, 2.0, 1024, 32, true, false, 1234ULL}, @@ -177,13 +180,15 @@ const std::vector> inputsd = { {0.5, -1.0, 2.0, 31, 1, true, true, 1234ULL}, {1.0, -1.0, 2.0, 1, 257, false, true, 1234ULL}, {0.5, -1.0, 2.0, 31, 1, false, false, 1234ULL}, - {1.0, -1.0, 2.0, 1, 257, true, false, 1234ULL}, + {1.0, -1.0, 2.0, 1, 257, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, false, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, false, 1234ULL}, {1.0, -1.0, 2.0, 17, 5, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, true, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, true, 1234ULL}, - {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}}; + {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, false, 1234ULL}}; typedef StdDevTest StdDevTestF; TEST_P(StdDevTestF, Result) diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index bf2aa44a2c..fbb398cc5b 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -40,7 +40,8 @@ struct SumInputs { template ::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.rowMajor << ", " << dims.value << "}" << std::endl; } template @@ -57,13 +58,31 @@ class SumTest : public ::testing::TestWithParam> { } protected: - void runTest() + void runTest(bool checkErrorCompensation = false) { int len = rows * cols; + double large_factor = 1e7; + + if constexpr (std::is_same_v) large_factor = 1e12; + std::vector data_h(len); for (int i = 0; i < len; i++) { - data_h[i] = T(params.value); + data_h[i] = double(params.value); + int row = params.rowMajor ? 
i / cols : i % rows; + + // every 3 elements (in a column) contain 2 large dummy elements + // (one of them negative) and one element with 3x compensating + // for the 2 missing elements + if (checkErrorCompensation && row % 3 == 2) { + data_h[i] = double(params.value) * large_factor; + // compensate with opposite error 3 rows up + int idx2 = params.rowMajor ? (i - cols) : (i - 1); + data_h[idx2] = -1 * double(params.value) * large_factor; + // compensate 2 missing values + int idx3 = params.rowMajor ? (i - 2 * cols) : (i - 2); + data_h[idx3] = 3.0 * double(params.value); + } } raft::update_device(data.data(), data_h.data(), len, stream); @@ -83,8 +102,10 @@ class SumTest : public ::testing::TestWithParam> { double expected = double(params.rows) * params.value; + double tolerance = checkErrorCompensation ? 100 * params.tolerance : params.tolerance; + ASSERT_TRUE(raft::devArrMatch( - T(expected), sum_act.data(), params.cols, raft::CompareApprox(params.tolerance))); + T(expected), sum_act.data(), params.cols, raft::CompareApprox(tolerance))); } protected: @@ -96,43 +117,29 @@ class SumTest : public ::testing::TestWithParam> { rmm::device_uvector data, sum_act; }; -const std::vector> inputsf = {{0.0001f, 4, 5, true, 1}, - {0.0001f, 1024, 32, true, 1}, - {0.0001f, 1024, 256, true, 1}, - {0.0001f, 100000000, 1, true, 0.001}, - {0.0001f, 1, 30, true, 0.001}, - {0.0001f, 1, 1, true, 0.001}, - {0.0001f, 17, 5, true, 0.001}, - {0.0001f, 7, 23, true, 0.001}, - {0.0001f, 3, 97, true, 0.001}, - {0.0001f, 4, 5, false, 1}, - {0.0001f, 1024, 32, false, 1}, - {0.0001f, 1024, 256, false, 1}, - {0.0001f, 100000000, 1, false, 0.001}, - {0.0001f, 1, 30, false, 0.001}, - {0.0001f, 1, 1, false, 0.001}, - {0.0001f, 17, 5, false, 0.001}, - {0.0001f, 7, 23, false, 0.001}, - {0.0001f, 3, 97, false, 0.001}}; - -const std::vector> inputsd = {{0.000001, 1024, 32, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 100000000, 1, true, 0.001}, - {0.000001, 1, 30, true, 0.0001}, - {0.000001, 1, 1, true, 0.0001}, - {0.000001, 17, 5, true, 0.0001}, - {0.000001, 7, 23, true, 0.0001}, - {0.000001, 3, 97, true, 0.0001}, - {0.000001, 1024, 32, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 100000000, 1, false, 0.001}, - {0.000001, 1, 30, false, 0.0001}, - {0.000001, 1, 1, false, 0.0001}, - {0.000001, 17, 5, false, 0.0001}, - {0.000001, 7, 23, false, 0.0001}, - {0.000001, 3, 97, false, 0.0001}}; +const std::vector> inputsf = { + {0.0001f, 4, 5, true, 1}, {0.0001f, 1024, 32, true, 1}, + {0.0001f, 1024, 256, true, 1}, {0.0001f, 100000000, 1, true, 0.001}, + {0.0001f, 1 << 27, 2, true, 0.1}, {0.0001f, 1, 30, true, 0.001}, + {0.0001f, 1, 1, true, 0.001}, {0.0001f, 17, 5, true, 0.001}, + {0.0001f, 7, 23, true, 0.001}, {0.0001f, 3, 97, true, 0.001}, + {0.0001f, 4, 5, false, 1}, {0.0001f, 1024, 32, false, 1}, + {0.0001f, 1024, 256, false, 1}, {0.0001f, 100000000, 1, false, 0.001}, + {0.0001f, 1 << 27, 2, false, 0.1}, {0.0001f, 1, 30, false, 0.001}, + {0.0001f, 1, 1, false, 0.001}, {0.0001f, 17, 5, false, 0.001}, + {0.0001f, 7, 23, false, 0.001}, {0.0001f, 3, 97, false, 0.001}}; + +const std::vector> inputsd = { + {0.000001, 1024, 32, true, 1}, {0.000001, 1024, 256, true, 1}, + {0.000001, 1024, 256, true, 1}, {0.000001, 100000000, 1, true, 0.001}, + {1e-9, 1 << 27, 2, true, 0.1}, {0.000001, 1, 30, true, 0.0001}, + {0.000001, 1, 1, true, 0.0001}, {0.000001, 17, 5, true, 0.0001}, + {0.000001, 7, 23, true, 0.0001}, {0.000001, 3, 97, true, 0.0001}, + 
{0.000001, 1024, 32, false, 1}, {0.000001, 1024, 256, false, 1}, + {0.000001, 1024, 256, false, 1}, {0.000001, 100000000, 1, false, 0.001}, + {1e-9, 1 << 27, 2, false, 0.1}, {0.000001, 1, 30, false, 0.0001}, + {0.000001, 1, 1, false, 0.0001}, {0.000001, 17, 5, false, 0.0001}, + {0.000001, 7, 23, false, 0.0001}, {0.000001, 3, 97, false, 0.0001}}; typedef SumTest SumTestF; typedef SumTest SumTestD; @@ -140,6 +147,9 @@ typedef SumTest SumTestD; TEST_P(SumTestF, Result) { runTest(); } TEST_P(SumTestD, Result) { runTest(); } +TEST_P(SumTestF, Accuracy) { runTest(true); } +TEST_P(SumTestD, Accuracy) { runTest(true); } + INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf)); INSTANTIATE_TEST_CASE_P(SumTests, SumTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index cf9a885cfb..63ec74d1da 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -55,6 +55,23 @@ struct CompareApprox { T eps; }; +template +struct CompareApproxNaN { + CompareApproxNaN(T eps_) : eps(eps_) {} + bool operator()(const T& a, const T& b) const + { + T diff = std::abs(a - b); + T m = std::max(std::abs(a), std::abs(b)); + T ratio = diff > eps ? diff / m : diff; + + if (std::isnan(a) && std::isnan(b)) { return true; } + return (ratio <= eps); + } + + private: + T eps; +}; + template ::std::ostream& operator<<(::std::ostream& os, const raft::KeyValuePair& kv) { diff --git a/cpp/test/util/popc.cu b/cpp/test/util/popc.cu new file mode 100644 index 0000000000..c08faacb07 --- /dev/null +++ b/cpp/test/util/popc.cu @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +template +struct PopcInputs { + index_t n_rows; + index_t n_cols; + float sparsity; + bool owning; +}; + +template +class PopcTest : public ::testing::TestWithParam> { + public: + PopcTest() + : stream(resource::get_cuda_stream(handle)), + params(::testing::TestWithParam>::GetParam()), + bits_d(0, stream) + { + } + + protected: + index_t create_bitmap(index_t m, index_t n, float sparsity, std::vector& bitmap) + { + index_t total = static_cast(m * n); + index_t num_ones = static_cast((total * 1.0f) * sparsity); + index_t res = num_ones; + + for (auto& item : bitmap) { + item = static_cast(0); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, total - 1); + + while (num_ones > 0) { + index_t index = dis(gen); + + bits_t& element = bitmap[index / (8 * sizeof(bits_t))]; + index_t bit_position = index % (8 * sizeof(bits_t)); + + if (((element >> bit_position) & 1) == 0) { + element |= (static_cast(1) << bit_position); + num_ones--; + } + } + return res; + } + + void SetUp() override + { + index_t element = raft::ceildiv(params.n_rows * params.n_cols, index_t(sizeof(bits_t) * 8)); + std::vector bits_h(element); + + nnz_expected = create_bitmap(params.n_rows, params.n_cols, params.sparsity, bits_h); + bits_d.resize(bits_h.size(), stream); + update_device(bits_d.data(), bits_h.data(), bits_h.size(), stream); + + resource::sync_stream(handle); + } + + void Run() + { + auto bits_view = + raft::make_device_vector_view(bits_d.data(), bits_d.size()); + + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + + index_t nnz_actual_h = 0; + rmm::device_scalar nnz_actual_d(0, stream); + auto nnz_actual_view = raft::make_device_scalar_view(nnz_actual_d.data()); + + raft::popc(handle, bits_view, max_len_view, nnz_actual_view); + raft::copy(&nnz_actual_h, nnz_actual_d.data(), 1, stream); + resource::sync_stream(handle); + + ASSERT_EQ(nnz_expected, nnz_actual_h); + } + + protected: + raft::resources handle; + cudaStream_t stream; + + PopcInputs params; + rmm::device_uvector bits_d; + index_t nnz_expected; +}; + +using PopcTestI32 = PopcTest; +TEST_P(PopcTestI32, Result) { Run(); } + +template +const std::vector> popc_inputs = { + {0, 0, 0.2}, + {10, 32, 0.4}, + {10, 3, 0.2}, + {32, 1024, 0.4}, + {1024, 1048576, 0.01}, + {1024, 1024, 0.4}, + {64 * 1024 + 10, 2, 0.3}, + {16, 16, 0.3}, + {17, 16, 0.3}, + {18, 16, 0.3}, + {32 + 9, 33, 0.2}, + {2, 33, 0.2}, + {0, 0, 0.2}, + {10, 32, 0.4}, + {10, 3, 0.2}, + {32, 1024, 0.4}, + {1024, 1048576, 0.01}, + {1024, 1024, 0.4}, + {64 * 1024 + 10, 2, 0.3}, + {16, 16, 0.3}, + {17, 16, 0.3}, + {18, 16, 0.3}, + {32 + 9, 33, 0.2}, + {2, 33, 0.2}, +}; + +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32, ::testing::ValuesIn(popc_inputs)); + +} // namespace raft diff --git a/dependencies.yaml b/dependencies.yaml index 98fc7fa8fc..34e7998ddf 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,10 +3,10 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64, aarch64] includes: - - build + - rapids_build - build_pylibraft - cuda - cuda_version @@ -14,9 +14,10 @@ files: - depends_on_distributed_ucxx - develop - checks - - build_wheels - test_libraft - docs + - rapids_build_setuptools + - rapids_build_skbuild - 
run_raft_dask - run_pylibraft - test_python_common @@ -27,12 +28,13 @@ files: cuda: ["11.8", "12.0"] arch: [x86_64, aarch64] includes: - - build + - rapids_build - cuda - cuda_version - develop - nn_bench - nn_bench_python + - rapids_build_skbuild test_cpp: output: none includes: @@ -65,7 +67,15 @@ files: extras: table: build-system includes: - - build + - rapids_build_skbuild + py_rapids_build_pylibraft: + output: pyproject + pyproject_dir: python/pylibraft + extras: + table: tool.rapids-build-backend + key: requires + includes: + - rapids_build - build_pylibraft py_run_pylibraft: output: pyproject @@ -90,7 +100,15 @@ files: extras: table: build-system includes: - - build + - rapids_build_skbuild + py_rapids_build_raft_dask: + output: pyproject + pyproject_dir: python/raft-dask + extras: + table: tool.rapids-build-backend + key: requires + includes: + - rapids_build - depends_on_ucx_build py_run_raft_dask: output: pyproject @@ -114,7 +132,7 @@ files: extras: table: build-system includes: - - build_wheels + - rapids_build_setuptools py_run_raft_ann_bench: output: pyproject pyproject_dir: python/raft-ann-bench @@ -129,11 +147,22 @@ channels: - conda-forge - nvidia dependencies: - build: + rapids_build_skbuild: + common: + - output_types: [conda, requirements, pyproject] + packages: + - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 + - output_types: [conda] + packages: + - scikit-build-core>=0.7.0 + - output_types: [requirements, pyproject] + packages: + - scikit-build-core[pyproject]>=0.7.0 + rapids_build: common: - output_types: [conda, requirements, pyproject] packages: - - &cmake_ver cmake>=3.26.4 + - &cmake_ver cmake>=3.26.4,!=3.30.0 - cython>=3.0.0 - ninja - output_types: [conda] @@ -141,11 +170,7 @@ dependencies: - c-compiler - cxx-compiler - nccl>=2.9.9 - - libucxx==0.38.* - - scikit-build-core>=0.7.0 - - output_types: [requirements, pyproject] - packages: - - scikit-build-core[pyproject]>=0.7.0 + - libucxx==0.39.* specific: - output_types: conda matrices: @@ -184,7 +209,7 @@ dependencies: common: - output_types: [conda] packages: - - &rmm_conda rmm==24.6.* + - &rmm_unsuffixed rmm==24.8.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -198,18 +223,26 @@ dependencies: cuda: "12.*" packages: - &cuda_python12 cuda-python>=12.0,<13.0a0 - - matrix: # All CUDA 11 versions + - matrix: + cuda: "11.*" packages: - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - matrix: + packages: + - &cuda_python cuda-python - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - &rmm_cu12 rmm-cu12==24.6.* - - matrix: {cuda: "11.*"} + - &rmm_cu12 rmm-cu12==24.8.* + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - &rmm_cu11 rmm-cu11==24.6.* - - {matrix: null, packages: [*rmm_conda] } + - &rmm_cu11 rmm-cu11==24.8.* + - {matrix: null, packages: [*rmm_unsuffixed] } checks: common: - output_types: [conda, requirements] @@ -231,7 +264,7 @@ dependencies: - h5py>=3.8.0 - benchmark>=1.8.2 - openblas - - *rmm_conda + - *rmm_unsuffixed nn_bench_python: common: - output_types: [conda] @@ -268,6 +301,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda @@ -375,12 +412,13 @@ dependencies: - recommonmark - sphinx-copybutton - sphinx-markdown-tables - build_wheels: + rapids_build_setuptools: common: - output_types: [requirements, 
pyproject] packages: - wheel - setuptools + - *rapids_build_backend py_version: specific: - output_types: conda @@ -407,7 +445,7 @@ dependencies: - &numpy numpy>=1.23,<2.0a0 - output_types: [conda] packages: - - *rmm_conda + - *rmm_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -421,9 +459,13 @@ dependencies: cuda: "12.*" packages: - *cuda_python12 - - matrix: # All CUDA 11 versions + - matrix: + cuda: "11.*" packages: - *cuda_python11 + - matrix: + packages: + - *cuda_python - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.*"} @@ -432,23 +474,20 @@ dependencies: - matrix: {cuda: "11.*"} packages: - *rmm_cu11 - - {matrix: null, packages: [*rmm_conda]} + - {matrix: null, packages: [*rmm_unsuffixed]} run_raft_dask: common: - output_types: [conda, pyproject] packages: - - dask-cuda==24.6.* + - dask-cuda==24.8.* - joblib>=0.11 - numba>=0.57 - *numpy - - rapids-dask-dependency==24.6.* - - ucx-py==0.38.* + - rapids-dask-dependency==24.8.* - output_types: conda packages: - - &ucx_py_conda ucx-py==0.38.* - - output_types: pyproject - packages: - - &pylibraft_conda pylibraft==24.6.* + - &pylibraft_unsuffixed pylibraft==24.8.* + - &ucx_py_unsuffixed ucx-py==0.39.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -458,15 +497,19 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - &pylibraft_cu12 pylibraft-cu12==24.6.* - - &ucx_py_cu12 ucx-py-cu12==0.38.* - - matrix: {cuda: "11.*"} + - &pylibraft_cu12 pylibraft-cu12==24.8.* + - &ucx_py_cu12 ucx-py-cu12==0.39.* + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - &pylibraft_cu11 pylibraft-cu11==24.6.* - - &ucx_py_cu11 ucx-py-cu11==0.38.* - - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]} + - &pylibraft_cu11 pylibraft-cu11==24.8.* + - &ucx_py_cu11 ucx-py-cu11==0.39.* + - {matrix: null, packages: [*pylibraft_unsuffixed, *ucx_py_unsuffixed]} test_python_common: common: - output_types: [conda, requirements, pyproject] @@ -485,7 +528,7 @@ dependencies: packages: # UCXX is not currently a hard-dependency thus only installed during tests, # this will change in the future. 
- - &distributed_ucxx_conda distributed-ucxx==0.38.* + - &distributed_ucxx_unsuffixed distributed-ucxx==0.39.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -494,18 +537,22 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - distributed-ucxx-cu12==0.38.* - - matrix: {cuda: "11.*"} + - distributed-ucxx-cu12==0.39.* + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - distributed-ucxx-cu11==0.38.* - - {matrix: null, packages: [*distributed_ucxx_conda]} + - distributed-ucxx-cu11==0.39.* + - {matrix: null, packages: [*distributed_ucxx_unsuffixed]} depends_on_ucx_build: common: - output_types: conda packages: - - &ucx_conda_build ucx==1.15.0 + - ucx==1.15.0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -514,10 +561,14 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - libucx-cu12==1.15.0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - libucx-cu11==1.15.0 - matrix: null @@ -527,7 +578,7 @@ dependencies: common: - output_types: conda packages: - - &ucx_conda_run ucx>=1.15.0 + - ucx>=1.15.0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -536,10 +587,14 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - libucx-cu12>=1.15.0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - libucx-cu11>=1.15.0 - matrix: null diff --git a/docs/source/ann_benchmarks_build.md b/docs/source/ann_benchmarks_build.md index 80730c5d68..56af8e555c 100644 --- a/docs/source/ann_benchmarks_build.md +++ b/docs/source/ann_benchmarks_build.md @@ -36,9 +36,12 @@ You can limit the algorithms that are built by providing a semicolon-delimited l ``` Available targets to use with `--limit-bench-ann` are: -- FAISS_IVF_FLAT_ANN_BENCH -- FAISS_IVF_PQ_ANN_BENCH -- FAISS_BFKNN_ANN_BENCH +- FAISS_GPU_IVF_FLAT_ANN_BENCH +- FAISS_GPU_IVF_PQ_ANN_BENCH +- FAISS_CPU_IVF_FLAT_ANN_BENCH +- FAISS_CPU_IVF_PQ_ANN_BENCH +- FAISS_GPU_FLAT_ANN_BENCH +- FAISS_CPU_FLAT_ANN_BENCH - GGNN_ANN_BENCH - HNSWLIB_ANN_BENCH - RAFT_CAGRA_ANN_BENCH diff --git a/docs/source/build.md b/docs/source/build.md index c0abf3f995..64f3bd01a2 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -56,7 +56,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 ``` -If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.06/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. 
You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.08/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. ## Installing Python through Pip @@ -106,7 +106,7 @@ In addition to the libraries included with cudatoolkit 11.0+, there are some oth Conda environment scripts are provided for installing the necessary dependencies to build both the C++ and Python libraries from source. It is preferred to use `mamba`, as it provides significant speedup over `conda`: ```bash -mamba env create --name rapids_raft -f conda/environments/all_cuda-122_arch-x86_64.yaml +mamba env create --name rapids_raft -f conda/environments/all_cuda-125_arch-x86_64.yaml mamba activate rapids_raft ``` @@ -257,19 +257,20 @@ RAFT's CMake has the following configurable flags available: | Flag | Possible Values | Default Value | Behavior | |---------------------------------|----------------------| --- |------------------------------------------------------------------------------| -| BUILD_TESTS | ON, OFF | ON | Compile Googletests | +| BUILD_TESTS | ON, OFF | ON | Compile Googletests | | BUILD_PRIMS_BENCH | ON, OFF | OFF | Compile benchmarks | -| BUILD_ANN_BENCH | ON, OFF | OFF | Compile end-to-end ANN benchmarks | +| BUILD_ANN_BENCH | ON, OFF | OFF | Compile end-to-end ANN benchmarks | | CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` | | CUDA_ENABLE_LINEINFO | ON, OFF | OFF | Enable the -lineinfo option for nvcc | | CUDA_STATIC_RUNTIME | ON, OFF | OFF | Statically link the CUDA runtime | -| DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | -| raft_FIND_COMPONENTS | compiled distributed | | Configures the optional components as a space-separated list | +| CUDA_STATIC_MATH_LIBRARIES | ON, OFF | OFF | Statically link the CUDA math libraries | +| DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | +| raft_FIND_COMPONENTS | compiled distributed | | Configures the optional components as a space-separated list | | RAFT_COMPILE_LIBRARY | ON, OFF | ON if either BUILD_TESTS or BUILD_PRIMS_BENCH is ON; otherwise OFF | Compiles all `libraft` shared libraries (these are required for Googletests) | -| RAFT_ENABLE_CUBLAS_DEPENDENCY | ON, OFF | ON | Link against cublas library in `raft::raft` | -| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against cusolver library in `raft::raft` | -| RAFT_ENABLE_CUSPARSE_DEPENDENCY | ON, OFF | ON | Link against cusparse library in `raft::raft` | -| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against curand library in `raft::raft` | +| RAFT_ENABLE_CUBLAS_DEPENDENCY | ON, OFF | ON | Link against cublas library in `raft::raft` | +| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against cusolver library in `raft::raft` | +| RAFT_ENABLE_CUSPARSE_DEPENDENCY | ON, OFF | ON | Link against cusparse library in `raft::raft` | +| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against curand library in `raft::raft` | | RAFT_NVTX | ON, OFF | OFF | Enable NVTX Markers | ### Build documentation diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index 5e288e9f2f..36588f3450 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. 
Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. ### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. +Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.08/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. ### Usage ```cpp diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 4b3aef5600..21a8404212 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -62,7 +62,7 @@ Nightly images are located in [dockerhub](https://hub.docker.com/r/rapidsai/raft - The following command pulls the nightly container for python version 10, cuda version 12, and RAFT version 23.10: ```bash -docker pull rapidsai/raft-ann-bench:24.06a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. +docker pull rapidsai/raft-ann-bench:24.08a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. ``` The CUDA and python versions can be changed for the supported values: @@ -83,7 +83,7 @@ You can see the exact versions as well in the dockerhub site: [//]: # () [//]: # (```bash) -[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.06-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) +[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.08-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) 
[//]: # (```) @@ -344,7 +344,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe export DATA_FOLDER=path/to/store/datasets/and/results docker run --gpus all --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10 \ + rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms raft_cagra,raft_ivf_pq --batch-size 10 -k 10" \ @@ -355,7 +355,7 @@ Usage of the above command is as follows: | Argument | Description | |-----------------------------------------------------------|----------------------------------------------------------------------------------------------------| -| `rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | +| `rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | | `"--dataset deep-image-96-angular"` | Dataset name | | `"--normalize"` | Whether to normalize the dataset | | `"--algorithms raft_cagra,hnswlib --batch-size 10 -k 10"` | Arguments passed to the `run` script, such as the algorithms to benchmark, the batch size, and `k` | @@ -372,7 +372,7 @@ The container arguments in the above section also be used for the CPU-only conta export DATA_FOLDER=path/to/store/datasets/and/results docker run --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench-cpu:24.06a-py3.10 \ + rapidsai/raft-ann-bench-cpu:24.08a-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms hnswlib --batch-size 10 -k 10" \ @@ -389,7 +389,7 @@ docker run --gpus all --rm -it -u $(id -u) \ --entrypoint /bin/bash \ --workdir /data/benchmarks \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10 + rapidsai/raft-ann-bench:24.08a-cuda11.8-py3.10 ``` This will drop you into a command line in the container, with the `raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above: diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index 7a2d77041d..6cbe8e4cbf 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -53,6 +53,7 @@ if(NOT raft_FOUND) set(BUILD_ANN_BENCH OFF) set(RAFT_COMPILE_LIBRARY ON) set(CUDA_STATIC_RUNTIME ON) + set(CUDA_STATIC_MATH_LIBRARIES ON) add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL) diff --git a/python/pylibraft/pylibraft/_version.py b/python/pylibraft/pylibraft/_version.py index 3b359e2ff3..0fa0ba80bc 100644 --- a/python/pylibraft/pylibraft/_version.py +++ b/python/pylibraft/pylibraft/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,22 @@ # limitations under the License. 
# - import importlib.resources __version__ = ( - importlib.resources.files("pylibraft") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__version__", "__git_commit__"] diff --git a/python/pylibraft/pylibraft/neighbors/hnsw.pyx b/python/pylibraft/pylibraft/neighbors/hnsw.pyx index aa589ffb65..e6f2d69eb8 100644 --- a/python/pylibraft/pylibraft/neighbors/hnsw.pyx +++ b/python/pylibraft/pylibraft/neighbors/hnsw.pyx @@ -52,6 +52,7 @@ from pylibraft.common.mdspan cimport ( from pylibraft.neighbors.common cimport _get_metric_string import os +import uuid import numpy as np @@ -292,7 +293,7 @@ def from_cagra(Index index, handle=None): Returns an hnswlib base-layer-only index from a CAGRA index. NOTE: This method uses the filesystem to write the CAGRA index in - `/tmp/cagra_index.bin` before reading it as an hnswlib index, + `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary file. Saving / loading the index is experimental. The serialization format is @@ -320,7 +321,8 @@ def from_cagra(Index index, handle=None): >>> # Serialize the CAGRA index to hnswlib base layer only index format >>> hnsw_index = hnsw.from_cagra(index, handle=handle) """ - filename = "/tmp/cagra_index.bin" + uuid_num = uuid.uuid4() + filename = f"/tmp/{uuid_num}.bin" save(filename, index, handle=handle) hnsw_index = load(filename, index.dim, np.dtype(index.active_index_type), _get_metric_string(index.metric), handle=handle) diff --git a/python/pylibraft/pylibraft/test/test_version.py b/python/pylibraft/pylibraft/test/test_version.py new file mode 100644 index 0000000000..6d6c107bcc --- /dev/null +++ b/python/pylibraft/pylibraft/test/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pylibraft + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(pylibraft.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(pylibraft.__version__, str) + assert len(pylibraft.__version__) > 0 diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index df1001538e..8b61df5d1b 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -15,14 +15,10 @@ [build-system] requires = [ - "cmake>=3.26.4", - "cuda-python>=11.7.1,<12.0a0", - "cython>=3.0.0", - "ninja", - "rmm==24.6.*", + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -build-backend = "scikit_build_core.build" +build-backend = "rapids_build_backend.build" [project] name = "pylibraft" @@ -35,9 +31,9 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python", "numpy>=1.23,<2.0a0", - "rmm==24.6.*", + "rmm==24.8.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -117,6 +113,18 @@ provider = "scikit_build_core.metadata.regex" input = "pylibraft/VERSION" regex = "(?P.*)" +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +requires = [ + "cmake>=3.26.4,!=3.30.0", + "cuda-python", + "cython>=3.0.0", + "ninja", + "rmm==24.8.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" + [tool.pytest.ini_options] filterwarnings = [ "error", diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml index e1f0e18304..d22dd567fe 100644 --- a/python/raft-ann-bench/pyproject.toml +++ b/python/raft-ann-bench/pyproject.toml @@ -1,8 +1,9 @@ # Copyright (c) 2023, NVIDIA CORPORATION. [build-system] -build-backend = "setuptools.build_meta" +build-backend = "rapids_build_backend.build" requires = [ + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -61,3 +62,10 @@ skip = [ [tool.setuptools.dynamic] version = { file = "raft_ann_bench/VERSION" } + +[tool.rapids-build-backend] +build-backend = "setuptools.build_meta" +requires = [] +dependencies-file = "../../dependencies.yaml" +commit-files = ["src/raft_ann_bench/GIT_COMMIT"] +matrix-entry = "cuda_suffixed=true" diff --git a/python/raft-ann-bench/src/raft_ann_bench/_version.py b/python/raft-ann-bench/src/raft_ann_bench/_version.py index 394acd755d..0fa0ba80bc 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/_version.py +++ b/python/raft-ann-bench/src/raft_ann_bench/_version.py @@ -13,13 +13,22 @@ # limitations under the License. # - import importlib.resources __version__ = ( - importlib.resources.files("raft_ann_bench") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__version__", "__git_commit__"] diff --git a/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py b/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py index 2b7b2728fe..e94ee56c92 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py +++ b/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
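> Editor's note: the `_version.py` rewrite in this PR (applied identically to `pylibraft`, `raft_ann_bench`, and `raft_dask`) resolves both module constants from package data instead of hard-coding `__git_commit__ = ""`. Below is a minimal standalone sketch of that behavior, assuming a package that ships a `VERSION` file and, in built distributions, a `GIT_COMMIT` file (per the `commit-files` entry under `[tool.rapids-build-backend]` above); the helper name and the `pylibraft` example in the `__main__` block are illustrative, not part of the patch.

```python
# Sketch of the VERSION/GIT_COMMIT lookup pattern used by the _version.py
# changes in this diff. RAFT's packages do the same thing with
# importlib.resources.files(__package__) instead of an explicit name.
import importlib.resources


def read_version_constants(package: str) -> tuple[str, str]:
    # VERSION is always bundled with the package, so this read should succeed.
    version = (
        importlib.resources.files(package).joinpath("VERSION").read_text().strip()
    )
    # GIT_COMMIT is only present in built distributions (written at build time,
    # per the rapids-build-backend commit-files setting); fall back to "" in,
    # for example, an editable install from a source checkout.
    try:
        commit = (
            importlib.resources.files(package)
            .joinpath("GIT_COMMIT")
            .read_text()
            .strip()
        )
    except FileNotFoundError:
        commit = ""
    return version, commit


if __name__ == "__main__":
    # Example: inspect an installed package that ships a VERSION file,
    # e.g. pylibraft (requires it to be installed in this environment).
    version, commit = read_version_constants("pylibraft")
    print(version, commit or "<GIT_COMMIT not present in this install>")
```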
@@ -51,3 +51,27 @@ def raft_cagra_search_constraints(params, build_params, k, batch_size): def hnswlib_search_constraints(params, build_params, k, batch_size): if "ef" in params: return params["ef"] >= k + + +def faiss_gpu_ivf_pq_build_constraints(params, dims): + ret = True + # M must be defined + ret = params["M"] <= dims and dims % params["M"] == 0 + if "use_raft" in params and params["use_raft"]: + return ret + pq_bits = 8 + if "bitsPerCode" in params: + pq_bits = params["bitsPerCode"] + lookup_table_size = 4 + if "useFloat16" in params and params["useFloat16"]: + lookup_table_size = 2 + # FAISS constraint to check if lookup table fits in shared memory + # for now hard code maximum shared memory per block to 49 kB (the value for A100 and V100) + return ret and lookup_table_size * params["M"] * (2**pq_bits) <= 49152 + + +def faiss_gpu_ivf_pq_search_constraints(params, build_params, k, batch_size): + ret = True + if "nlist" in build_params and "nprobe" in params: + ret = ret and build_params["nlist"] >= params["nprobe"] + return ret diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_flat.yaml new file mode 100644 index 0000000000..29c145f86d --- /dev/null +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_flat.yaml @@ -0,0 +1,10 @@ +name: faiss_cpu_ivf_flat +groups: + base: + build: + nlist: [2048] + ratio: [10] + useFloat16: [False] + search: + nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_pq.yaml new file mode 100644 index 0000000000..a531ec8294 --- /dev/null +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_ivf_pq.yaml @@ -0,0 +1,18 @@ +name: faiss_cpu_ivf_pq +groups: + base: + build: + nlist: [1024, 2048, 4096, 8192] + M: [48, 32, 16] + ratio: [10] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [1, 5, 10, 50, 100, 200] + large: + build: + nlist: [8192, 16384, 32768, 65536] + M: [48, 32, 16] + ratio: [10] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml index ed237becb3..e4abc35f5c 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml @@ -3,8 +3,19 @@ groups: base: build: nlist: [2048] - ratio: [1, 4, 10] - useFloat16: [False] + ratio: [10] + useFloat16: [False, True] + use_raft: [False] search: - nprobe: [2048] + nprobe: [1, 5, 10, 50, 100, 200] refine_ratio: [1] +groups: + baseraft: + build: + nlist: [2048] + ratio: [10] + useFloat16: [False, True] + use_raft: [True] + search: + nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml index 87c3afc727..7560ceaa9c 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml @@ -1,12 +1,77 @@ name: faiss_gpu_ivf_pq +constraints: + build: 
raft-ann-bench.constraints.faiss_gpu_ivf_pq_build_constraints + search: raft-ann-bench.constraints.faiss_gpu_ivf_pq_search_constraints groups: base: build: nlist: [1024, 2048, 4096, 8192] - M: [8, 16] - ratio: [10, 25] + M: [64, 32, 16] + ratio: [10] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [False] + bitsPerCode: [8] + search: + nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1, 2, 4] + baseraft: + build: + nlist: [1024, 2048, 4096, 8192] + M: [64, 32, 16] + ratio: [10] usePrecomputed: [False] - useFloat16: [False] + useFloat16: [False, True] + use_raft: [True] + bitsPerCode: [8, 6, 5, 4] search: nprobe: [1, 5, 10, 50, 100, 200] + refine_ratio: [1, 2, 4] + large: + build: + nlist: [8192, 16384, 32768, 65536] + M: [48, 32, 16] + ratio: [4] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [False] + bitsPerCode: [8] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + refine_ratio: [1, 2, 4] + largeraft: + build: + nlist: [8192, 16384, 32768, 65536] + M: [48, 32, 16] + ratio: [4] + usePrecomputed: [False] + useFloat16: [False, True] + use_raft: [True] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + refine_ratio: [1, 2, 4] + 100M: + build: + nlist: [50000] + M: [48] + ratio: [10] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [False] + bitsPerCode: [8] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + refine_ratio: [1] + 100Mraft: + build: + nlist: [50000] + M: [48] + ratio: [10] + usePrecomputed: [False, True] + useFloat16: [False, True] + use_raft: [True] + bitsPerCode: [8, 6, 5, 4] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] refine_ratio: [1] \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml index 7eaec2b77b..bcdcde42a2 100644 --- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml +++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml @@ -6,12 +6,36 @@ groups: base: build: nlist: [1024, 2048, 4096, 8192] - pq_dim: [64, 32] + pq_dim: [64, 32, 16] pq_bits: [8, 6, 5, 4] - ratio: [10, 25] + ratio: [10] niter: [25] search: nprobe: [1, 5, 10, 50, 100, 200] internalDistanceDtype: ["float"] smemLutDtype: ["float", "fp8", "half"] - refine_ratio: [1, 2, 4] \ No newline at end of file + refine_ratio: [1, 2, 4] + large: + build: + nlist: [8192, 16384, 32768, 65536] + pq_dim: [48, 32, 16] + pq_bits: [8, 6, 5, 4] + ratio: [4] + niter: [20] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + internalDistanceDtype: ["float"] + smemLutDtype: ["float", "fp8", "half"] + refine_ratio: [1, 2, 4] + 100M: + build: + nlist: [50000] + pq_dim: [48] + pq_bits: [8, 6, 5, 4] + ratio: [10] + niter: [10] + search: + nprobe: [20, 30, 40, 50, 100, 200, 500, 1000] + internalDistanceDtype: ["float"] + smemLutDtype: ["float", "fp8", "half"] + refine_ratio: [1] diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index 2c629f3b73..197ddae05f 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -51,6 +51,7 @@ if(NOT raft_FOUND) set(RAFT_COMPILE_DIST_LIBRARY OFF) set(RAFT_COMPILE_NN_LIBRARY OFF) set(CUDA_STATIC_RUNTIME ON) + set(CUDA_STATIC_MATH_LIBRARIES ON) set(RAFT_DASK_UCXX_STATIC ON) add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL) diff --git a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake 
b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake index 8e340eec73..de6a4b109c 100644 --- a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake +++ b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake @@ -47,9 +47,9 @@ endfunction() # Change pinned tag here to test a commit in CI # To use a different RAFT locally, set the CMake variable # CPM_raft_SOURCE=/path/to/local/raft -find_and_configure_ucxx(VERSION 0.38 +find_and_configure_ucxx(VERSION 0.39 FORK rapidsai - PINNED_TAG branch-0.38 + PINNED_TAG branch-0.39 EXCLUDE_FROM_ALL YES UCXX_STATIC ${RAFT_DASK_UCXX_STATIC} ) diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index e8ded4cd4a..29c0db0048 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -14,12 +14,9 @@ [build-system] -build-backend = "scikit_build_core.build" +build-backend = "rapids_build_backend.build" requires = [ - "cmake>=3.26.4", - "cython>=3.0.0", - "libucx==1.15.0", - "ninja", + "rapids-build-backend>=0.3.0,<0.4.0.dev0", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -34,14 +31,14 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "dask-cuda==24.6.*", - "distributed-ucxx==0.38.*", + "dask-cuda==24.8.*", + "distributed-ucxx==0.39.*", "joblib>=0.11", "numba>=0.57", "numpy>=1.23,<2.0a0", - "pylibraft==24.6.*", - "rapids-dask-dependency==24.6.*", - "ucx-py==0.38.*", + "pylibraft==24.8.*", + "rapids-dask-dependency==24.8.*", + "ucx-py==0.39.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -118,3 +115,14 @@ wheel.packages = ["raft_dask"] provider = "scikit_build_core.metadata.regex" input = "raft_dask/VERSION" regex = "(?P.*)" + +[tool.rapids-build-backend] +build-backend = "scikit_build_core.build" +requires = [ + "cmake>=3.26.4,!=3.30.0", + "cython>=3.0.0", + "libucx==1.15.0", + "ninja", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/raft-dask/raft_dask/_version.py b/python/raft-dask/raft_dask/_version.py index 03cddef557..0fa0ba80bc 100644 --- a/python/raft-dask/raft_dask/_version.py +++ b/python/raft-dask/raft_dask/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,22 @@ # limitations under the License. 
# - import importlib.resources __version__ = ( - importlib.resources.files("raft_dask") + importlib.resources.files(__package__) .joinpath("VERSION") .read_text() .strip() ) -__git_commit__ = "" +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__version__", "__git_commit__"] diff --git a/python/raft-dask/raft_dask/test/test_version.py b/python/raft-dask/raft_dask/test/test_version.py new file mode 100644 index 0000000000..94ba33d051 --- /dev/null +++ b/python/raft-dask/raft_dask/test/test_version.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import raft_dask + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(raft_dask.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(raft_dask.__version__, str) + assert len(raft_dask.__version__) > 0
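> Editor's note: the new `faiss_gpu_ivf_pq_build_constraints` / `faiss_gpu_ivf_pq_search_constraints` functions are wired into `faiss_gpu_ivf_pq.yaml` through the added top-level `constraints:` keys, so invalid parameter combinations can be dropped before any index is built. The sketch below shows how such a constraint can prune a YAML-style build grid; the filter loop and helper names are hypothetical (not the actual raft-ann-bench runner), and only the constraint logic mirrors the function added in `raft_ann_bench/constraints/__init__.py`. The search-side counterpart simply rejects combinations where `nprobe` exceeds the index's `nlist`.

```python
# Illustrative sketch: pruning a FAISS GPU IVF-PQ build-parameter grid with
# the constraint logic introduced in this PR.
import itertools

MAX_SMEM_PER_BLOCK = 49152  # bytes; the A100/V100 figure hard-coded in the diff


def build_constraint(params: dict, dims: int) -> bool:
    # M must not exceed, and must evenly divide, the dataset dimensionality.
    ok = params["M"] <= dims and dims % params["M"] == 0
    if params.get("use_raft"):
        return ok
    pq_bits = params.get("bitsPerCode", 8)
    lookup_table_size = 2 if params.get("useFloat16") else 4
    # FAISS requires the PQ lookup table to fit in shared memory per block.
    return ok and lookup_table_size * params["M"] * (2 ** pq_bits) <= MAX_SMEM_PER_BLOCK


def expand_group(group: dict, dims: int):
    # Cartesian product of the "build" lists, keeping only valid combinations.
    keys = list(group["build"])
    for values in itertools.product(*(group["build"][k] for k in keys)):
        params = dict(zip(keys, values))
        if build_constraint(params, dims):
            yield params


if __name__ == "__main__":
    # A trimmed version of the "base" group from faiss_gpu_ivf_pq.yaml.
    base = {"build": {"nlist": [1024], "M": [64, 32, 16], "bitsPerCode": [8],
                      "useFloat16": [False, True], "use_raft": [False]}}
    for cfg in expand_group(base, dims=96):  # e.g. deep-image-96-angular
        print(cfg)
```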