From 6f72fe7c4161b030c212e68c5858dcd59199a813 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 19 Sep 2024 12:06:59 -0400 Subject: [PATCH 01/47] DOC v24.12 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 ++--- .devcontainer/cuda11.8-pip/devcontainer.json | 8 +++---- .../cuda12.5-conda/devcontainer.json | 6 ++--- .devcontainer/cuda12.5-pip/devcontainer.json | 8 +++---- .github/workflows/build.yaml | 14 +++++------ .github/workflows/pr.yaml | 24 +++++++++---------- .github/workflows/test.yaml | 8 +++---- README.md | 2 +- VERSION | 2 +- .../all_cuda-118_arch-aarch64.yaml | 4 ++-- .../all_cuda-118_arch-x86_64.yaml | 4 ++-- .../all_cuda-125_arch-aarch64.yaml | 4 ++-- .../all_cuda-125_arch-x86_64.yaml | 4 ++-- .../bench_ann_cuda-118_arch-aarch64.yaml | 4 ++-- .../bench_ann_cuda-118_arch-x86_64.yaml | 4 ++-- .../bench_ann_cuda-125_arch-aarch64.yaml | 4 ++-- .../bench_ann_cuda-125_arch-x86_64.yaml | 4 ++-- dependencies.yaml | 12 +++++----- docs/source/developer_guide.md | 6 ++--- examples/cmake/thirdparty/fetch_rapids.cmake | 2 +- python/cuvs/pyproject.toml | 2 +- rust/Cargo.toml | 2 +- rust/cuvs/Cargo.toml | 2 +- 23 files changed, 68 insertions(+), 68 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 13103e8f7..05f11c005 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 74d62afcc..b4c507f86 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index d6902d3f9..4f8d628c2 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - 
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 3dcf52e83..8e6ba4de8 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": { "version": "12.5", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index db20bdbc1..7ac02e365 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: rust-build: needs: cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -50,7 +50,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -70,7 +70,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -82,7 +82,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cuvs: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-publish-cuvs: needs: wheel-build-cuvs secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 07b10e85a..4e3fb600a 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,29 +24,29 @@ jobs: - wheel-tests-cuvs - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: pull-request enable_check_symbols: true @@ -54,19 +54,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: 
pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -76,7 +76,7 @@ jobs: rust-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -86,20 +86,20 @@ jobs: wheel-build-cuvs: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_cuvs.sh wheel-tests-cuvs: needs: wheel-build-cuvs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: pull-request script: ci/test_wheel_cuvs.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0821233a1..5f60c0a34 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-cuvs: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index e697c61ed..e23b94616 100755 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ mamba install -c conda-forge -c nvidia -c rapidsai cuvs If installing a version that has not yet been released, the `rapidsai` channel can be replaced with `rapidsai-nightly`: ```bash -mamba install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.10 +mamba install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.12 ``` Please see the [Build and Install Guide](https://docs.rapids.ai/api/cuvs/stable/build/) for more information on installing cuVS and building from source. 
diff --git a/VERSION b/VERSION index 7c7ba0443..af28c42b5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.10.00 +24.12.00 diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index cfcb56225..cb5c804d9 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.9.9 - ninja @@ -45,7 +45,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index dc519d1b5..3b126c1dc 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -35,7 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.9.9 - ninja @@ -45,7 +45,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index b32650e44..0eafb709c 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.9.9 - ninja @@ -41,7 +41,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.*,>=0.0.0a0 +- 
pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index d40fc3b99..fc15743c0 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - make - nccl>=2.9.9 - ninja @@ -41,7 +41,7 @@ dependencies: - openblas - pre-commit - pydata-sphinx-theme -- pylibraft==24.10.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 - pytest-cov - pytest==7.* - rapids-build-backend>=0.3.0,<0.4.0.dev0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index c6e8b05a2..47d012c03 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja @@ -41,7 +41,7 @@ dependencies: - nvcc_linux-aarch64=11.8 - openblas - pandas -- pylibraft==24.10.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index d6c023ae9..ae7a64e44 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja @@ -41,7 +41,7 @@ dependencies: - nvcc_linux-64=11.8 - openblas - pandas -- 
pylibraft==24.10.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 4d0ca9496..3807661eb 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -30,14 +30,14 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 - openblas - pandas -- pylibraft==24.10.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-125_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index 7dd67ab5e..14182f865 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -30,14 +30,14 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==24.10.*,>=0.0.0a0 +- librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 - openblas - pandas -- pylibraft==24.10.*,>=0.0.0a0 +- pylibraft==24.12.*,>=0.0.0a0 - pyyaml - sysroot_linux-64==2.17 name: bench_ann_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 9fcbeaae2..956f33196 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -481,7 +481,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==24.10.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -494,18 +494,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==24.10.*,>=0.0.0a0 + - librmm-cu12==24.12.*,>=0.0.0a0 - 
matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==24.10.*,>=0.0.0a0 + - librmm-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*librmm_unsuffixed]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_unsuffixed pylibraft==24.10.*,>=0.0.0a0 + - &pylibraft_unsuffixed pylibraft==24.12.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -518,10 +518,10 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibraft-cu12==24.10.*,>=0.0.0a0 + - pylibraft-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - pylibraft-cu11==24.10.*,>=0.0.0a0 + - pylibraft-cu11==24.12.*,>=0.0.0a0 - {matrix: null, packages: [*pylibraft_unsuffixed]} diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index 516819b1c..c4a099fab 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. 
### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.10/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. +Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. ### Usage ```cpp diff --git a/examples/cmake/thirdparty/fetch_rapids.cmake b/examples/cmake/thirdparty/fetch_rapids.cmake index f64a924cf..6f4c627ed 100644 --- a/examples/cmake/thirdparty/fetch_rapids.cmake +++ b/examples/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "24.10") +set(RAPIDS_VERSION "24.12") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index 68bd9a868..bf62f5adf 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "nvidia-curand", "nvidia-cusolver", "nvidia-cusparse", - "pylibraft==24.10.*,>=0.0.0a0", + "pylibraft==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 52125bef3..79aa5756a 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -6,7 +6,7 @@ members = [ resolver = "2" [workspace.package] -version = "24.10.0" +version = "24.12.0" edition = "2021" repository = "https://github.com/rapidsai/cuvs" homepage = "https://github.com/rapidsai/cuvs" diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml index 7e5b18143..13cc658e3 100644 --- a/rust/cuvs/Cargo.toml +++ b/rust/cuvs/Cargo.toml @@ -9,7 +9,7 @@ authors.workspace = true license.workspace = true [dependencies] -ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "24.10.0" } +ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "24.12.0" } ndarray = "0.15" [dev-dependencies] From 2ad639702b3912e4eb037a6817335e48dc90ad73 Mon Sep 17 00:00:00 2001 From: Micka Date: Fri, 27 Sep 2024 20:21:10 +0200 Subject: [PATCH 02/47] Fix Question Retrieval notebook (#352) Authors: - Micka (https://github.com/lowener) - Corey J. Nolet (https://github.com/cjnolet) - rhdong (https://github.com/rhdong) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/352 --- notebooks/VectorSearch_QuestionRetrieval.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/VectorSearch_QuestionRetrieval.ipynb b/notebooks/VectorSearch_QuestionRetrieval.ipynb index 21d59975b..1115a5920 100644 --- a/notebooks/VectorSearch_QuestionRetrieval.ipynb +++ b/notebooks/VectorSearch_QuestionRetrieval.ipynb @@ -160,7 +160,7 @@ }, "outputs": [], "source": [ - "pq_index_mem = pq_index.pq_dim * pq_index.size * pq_index.pq_bits\n", + "pq_index_mem = params.pq_dim * corpus_embeddings.shape[0] * params.pq_bits\n", "print(\"IVF-PQ memory footprint: {:.1f} MB\".format(pq_index_mem / 2**20))\n", "\n", "original_mem = corpus_embeddings.shape[0] * corpus_embeddings.shape[1] * 4\n", From 397e56e0df4a430edc7f8e16e572fc8a03a0e0c0 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 4 Oct 2024 13:21:33 -0400 Subject: [PATCH 03/47] Prune workflows based on changed files (#392) Contributes to https://github.com/rapidsai/build-planning/issues/94 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cuvs/pull/392 --- .github/workflows/pr.yaml | 46 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4e3fb600a..e18e82df0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,7 @@ concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-tests @@ -25,6 +26,42 @@ jobs: - devcontainer secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 + if: always() + with: + needs: ${{ toJSON(needs) }} + changed-files: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - 
'!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!python/**' + - '!rust/**' + - '!thirdparty/LICENSES/**' + test_notebooks: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!README.md' + - '!rust/**' + - '!thirdparty/LICENSES/**' + test_python: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!rust/**' + - '!thirdparty/LICENSES/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 @@ -38,9 +75,10 @@ jobs: build_type: pull-request node_type: cpu16 conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-cpp-checks: @@ -58,9 +96,10 @@ jobs: with: build_type: pull-request conda-python-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request docs-build: @@ -91,9 +130,10 @@ jobs: build_type: pull-request script: ci/build_wheel_cuvs.sh wheel-tests-cuvs: - needs: wheel-build-cuvs + needs: [wheel-build-cuvs, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel_cuvs.sh From 7debf51ae3bd9817143544b4f6593688fcb159f2 Mon Sep 17 00:00:00 2001 From: "Artem M. 
Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:19:51 +0200 Subject: [PATCH 04/47] Fix NVTX annotations (#400) 1) Replace the domain name from `raft` to `cuvs` to avoid confusion when using tools such as NSYS to analyze the program timeline. 2) Use C++17 feature `__has_include` instead of a CMake script to find out if NVTX available in the benchmark executable. It turns out our CMake check has been not reliable due to not finding include directories correctly. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Micka (https://github.com/lowener) URL: https://github.com/rapidsai/cuvs/pull/400 --- cpp/bench/ann/CMakeLists.txt | 25 +++---------------- cpp/bench/ann/src/common/util.hpp | 3 ++- cpp/src/cluster/detail/kmeans.cuh | 18 ++++++------- cpp/src/cluster/detail/kmeans_balanced.cuh | 10 ++++---- .../neighbors/detail/cagra/cagra_build.cuh | 3 ++- .../neighbors/detail/cagra/cagra_search.cuh | 4 +-- .../detail/cagra/cagra_serialize.cuh | 8 +++--- cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh | 6 +++++ 8 files changed, 33 insertions(+), 44 deletions(-) diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 8cbf8c8b3..ac1301221 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -87,21 +87,6 @@ if(CUVS_ANN_BENCH_USE_FAISS) include(cmake/thirdparty/get_faiss) endif() -# ################################################################################################## -# * Enable NVTX if available - -# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx.They track gbench -# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just -# raft/cuVS. 
-if(TARGET CUDA::nvtx3) - set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES}) - get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES) - unset(NVTX3_HEADERS_FOUND CACHE) - # Check the headers explicitly to make sure the cpu-only build succeeds - CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND) - set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG}) -endif() - # ################################################################################################## # * Target function ------------------------------------------------------------- @@ -127,12 +112,9 @@ function(ConfigureAnnBench) add_dependencies(${BENCH_NAME} ANN_BENCH) else() add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH}) - target_compile_definitions( - ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN - $<$:ANN_BENCH_NVTX3_HEADERS_FOUND> - ) + target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN>) target_link_libraries( - ${BENCH_NAME} PRIVATE benchmark::benchmark $<$:CUDA::nvtx3> + ${BENCH_NAME} PRIVATE benchmark::benchmark $<$:CUDA::nvtx3> ) endif() @@ -300,7 +282,7 @@ if(CUVS_ANN_BENCH_SINGLE_EXE) target_link_libraries( ANN_BENCH PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only - spdlog::spdlog_header_only $<$:CUDA::nvtx3> + spdlog::spdlog_header_only $<$:CUDA::nvtx3> ) set_target_properties( ANN_BENCH @@ -318,7 +300,6 @@ if(CUVS_ANN_BENCH_SINGLE_EXE) ANN_BENCH PRIVATE $<$:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}"> - $<$:ANN_BENCH_NVTX3_HEADERS_FOUND> ) target_link_options(ANN_BENCH PRIVATE -export-dynamic) diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index c3db2bb4b..dbde74ccc 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -18,7 +18,8 @@ #include "ann_types.hpp" #include "cuda_stub.hpp" // cuda-related utils -#ifdef 
ANN_BENCH_NVTX3_HEADERS_FOUND +#if __has_include() +#define ANN_BENCH_NVTX3_HEADERS_FOUND #include #endif diff --git a/cpp/src/cluster/detail/kmeans.cuh b/cpp/src/cluster/detail/kmeans.cuh index 9b673bca3..3d054f0fd 100644 --- a/cpp/src/cluster/detail/kmeans.cuh +++ b/cpp/src/cluster/detail/kmeans.cuh @@ -15,12 +15,12 @@ */ #pragma once +#include "../../core/nvtx.hpp" #include "kmeans_common.cuh" #include #include -#include #include #include #include @@ -71,7 +71,7 @@ void initRandom(raft::resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids) { - raft::common::nvtx::range fun_scope("initRandom"); + raft::common::nvtx::range fun_scope("initRandom"); auto n_clusters = params.n_clusters; cuvs::cluster::kmeans::detail::shuffleAndGather( handle, X, centroids, n_clusters, params.rng_state.seed); @@ -98,7 +98,7 @@ void kmeansPlusPlus(raft::resources const& handle, raft::device_matrix_view centroidsRawData, rmm::device_uvector& workspace) { - raft::common::nvtx::range fun_scope("kmeansPlusPlus"); + raft::common::nvtx::range fun_scope("kmeansPlusPlus"); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); auto n_features = X.extent(1); @@ -372,7 +372,7 @@ void kmeans_fit_main(raft::resources const& handle, raft::host_scalar_view n_iter, rmm::device_uvector& workspace) { - raft::common::nvtx::range fun_scope("kmeans_fit_main"); + raft::common::nvtx::range fun_scope("kmeans_fit_main"); raft::logger::get(RAFT_NAME).set_level(params.verbosity); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); @@ -590,7 +590,7 @@ void initScalableKMeansPlusPlus(raft::resources const& handle, raft::device_matrix_view centroidsRawData, rmm::device_uvector& workspace) { - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "initScalableKMeansPlusPlus"); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); @@ -841,7 
+841,7 @@ void kmeans_fit(raft::resources const& handle, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { - raft::common::nvtx::range fun_scope("kmeans_fit"); + raft::common::nvtx::range fun_scope("kmeans_fit"); auto n_samples = X.extent(0); auto n_features = X.extent(1); auto n_clusters = pams.n_clusters; @@ -1009,7 +1009,7 @@ void kmeans_predict(raft::resources const& handle, bool normalize_weight, raft::host_scalar_view inertia) { - raft::common::nvtx::range fun_scope("kmeans_predict"); + raft::common::nvtx::range fun_scope("kmeans_predict"); auto n_samples = X.extent(0); auto n_features = X.extent(1); cudaStream_t stream = raft::resource::get_cuda_stream(handle); @@ -1153,7 +1153,7 @@ void kmeans_fit_predict(raft::resources const& handle, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { - raft::common::nvtx::range fun_scope("kmeans_fit_predict"); + raft::common::nvtx::range fun_scope("kmeans_fit_predict"); if (!centroids.has_value()) { auto n_features = X.extent(1); auto centroids_matrix = @@ -1217,7 +1217,7 @@ void kmeans_transform(raft::resources const& handle, raft::device_matrix_view centroids, raft::device_matrix_view X_new) { - raft::common::nvtx::range fun_scope("kmeans_transform"); + raft::common::nvtx::range fun_scope("kmeans_transform"); raft::logger::get(RAFT_NAME).set_level(pams.verbosity); cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto n_samples = X.extent(0); diff --git a/cpp/src/cluster/detail/kmeans_balanced.cuh b/cpp/src/cluster/detail/kmeans_balanced.cuh index 34bb22e85..3f1ad2334 100644 --- a/cpp/src/cluster/detail/kmeans_balanced.cuh +++ b/cpp/src/cluster/detail/kmeans_balanced.cuh @@ -20,10 +20,10 @@ #include "kmeans_common.cuh" #include +#include "../../core/nvtx.hpp" #include "../../distance/distance.cuh" #include -#include #include #include #include @@ -378,7 +378,7 @@ void compute_norm(const raft::resources& handle, FinOpT norm_fin_op, std::optional mr = std::nullopt) { - 
raft::common::nvtx::range fun_scope("compute_norm"); + raft::common::nvtx::range fun_scope("compute_norm"); auto stream = raft::resource::get_cuda_stream(handle); rmm::device_uvector mapped_dataset( 0, stream, mr.value_or(raft::resource::get_workspace_resource(handle))); @@ -434,7 +434,7 @@ void predict(const raft::resources& handle, const MathT* dataset_norm = nullptr) { auto stream = raft::resource::get_cuda_stream(handle); - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "predict(%zu, %u)", static_cast(n_rows), n_clusters); auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle)); auto [max_minibatch_size, _mem_per_row] = @@ -603,7 +603,7 @@ auto adjust_centers(MathT* centers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref device_memory) -> bool { - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); if (n_clusters == 0) { return false; } constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, @@ -1036,7 +1036,7 @@ void build_hierarchical(const raft::resources& handle, auto stream = raft::resource::get_cuda_stream(handle); using LabelT = uint32_t; - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "build_hierarchical(%zu, %u)", static_cast(n_rows), n_clusters); IdxT n_mesoclusters = std::min(n_clusters, static_cast(std::sqrt(n_clusters) + 0.5)); diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index e5495dc3e..9e4d453e3 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "../../../core/nvtx.hpp" #include "../../vpq_dataset.cuh" #include "graph_core.cuh" #include @@ -130,7 +131,7 @@ void build_knn_graph( "Currently only L2Expanded or InnerProduct metric are supported"); uint32_t node_degree = 
knn_graph.extent(1); - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "cagra::build_graph(%zu, %zu, %u)", size_t(dataset.extent(0)), size_t(dataset.extent(1)), diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 4c15b8e14..95c158675 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -16,6 +16,7 @@ #pragma once +#include "../../../core/nvtx.hpp" #include "factory.cuh" #include "sample_filter_utils.cuh" #include "search_plan.cuh" @@ -23,7 +24,6 @@ #include #include -#include #include #include @@ -66,7 +66,7 @@ void search_main_core(raft::resources const& res, params.max_queries = std::min(queries.extent(0), deviceProp.maxGridSize[1]); } - raft::common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh index f86ed9ef6..4c3fe5e81 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh @@ -21,10 +21,10 @@ #include #include #include -#include #include #include +#include "../../../core/nvtx.hpp" #include "../dataset_serialize.hpp" #include @@ -53,7 +53,7 @@ void serialize(raft::resources const& res, const index& index_, bool include_dataset) { - raft::common::nvtx::range fun_scope("cagra::serialize"); + raft::common::nvtx::range fun_scope("cagra::serialize"); RAFT_LOG_DEBUG( "Saving CAGRA index, size %zu, dim %u", static_cast(index_.size()), index_.dim()); @@ -103,7 +103,7 @@ void serialize_to_hnswlib(raft::resources const& res, { // static_assert(std::is_same_v or std::is_same_v, // "An hnswlib index can only be trained with int32 or uint32 IdxT"); - raft::common::nvtx::range fun_scope("cagra::serialize"); + raft::common::nvtx::range 
fun_scope("cagra::serialize"); RAFT_LOG_DEBUG("Saving CAGRA index to hnswlib format, size %zu, dim %u", static_cast(index_.size()), index_.dim()); @@ -234,7 +234,7 @@ void serialize_to_hnswlib(raft::resources const& res, template void deserialize(raft::resources const& res, std::istream& is, index* index_) { - raft::common::nvtx::range fun_scope("cagra::deserialize"); + raft::common::nvtx::range fun_scope("cagra::deserialize"); char dtype_string[4]; is.read(dtype_string, 4); diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index c65ea8108..f0f464950 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -1729,6 +1729,12 @@ auto build(raft::resources const& handle, if constexpr (std::is_same_v) { raft::matrix::sample_rows(handle, random_state, dataset, trainset.view()); } else { + raft::common::nvtx::range fun_scope( + " ivf_pq::build(%zu, %zu)/sample rows with tmp trainset (%zu rows).", + size_t(n_rows), + size_t(dim), + size_t(n_rows_train)); + // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp. auto trainset_tmp = raft::make_device_mdarray( handle, big_memory_resource, raft::make_extents(n_rows_train, dim)); From e55e655e1ac6fb10ba846e808d3003ce20c580f5 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Oct 2024 15:08:21 -0500 Subject: [PATCH 05/47] make conda installs in CI stricter (#406) Contributes to https://github.com/rapidsai/build-planning/issues/106 Proposes specifying the RAPIDS version in `conda install` calls that install CI artifacts, to reduce the risk of CI jobs picking up artifacts from other releases. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/cuvs/pull/406 --- ci/build_docs.sh | 11 ++++++----- ci/build_rust.sh | 6 ++++-- ci/test_cpp.sh | 5 ++++- ci/test_python.sh | 5 ++++- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 460cc3899..bce93c605 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -6,6 +6,9 @@ set -euo pipefail rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" +export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-dependency-file-generator \ --output conda \ --file-key docs \ @@ -28,11 +31,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - libcuvs cuvs + "libcuvs=${RAPIDS_VERSION}" \ + "cuvs=${RAPIDS_VERSION}" -export RAPIDS_VERSION="$(rapids-version)" -export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" @@ -54,4 +55,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/cuvs/"html mv _html/* "${RAPIDS_DOCS_DIR}/cuvs/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/build_rust.sh b/ci/build_rust.sh index 31d0de053..309501c32 100755 --- a/ci/build_rust.sh +++ b/ci/build_rust.sh @@ -6,6 +6,8 @@ set -euo pipefail rapids-logger "Create test conda environment" . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-dependency-file-generator \ --output conda \ --file-key rust \ @@ -32,7 +34,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # installing libcuvs/libraft will speed up the rust build substantially rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcuvs \ - libraft + "libcuvs=${RAPIDS_VERSION}" \ + "libraft=${RAPIDS_VERSION}" bash ./build.sh rust diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 6dfc2cf71..134dc4421 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ @@ -26,7 +28,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - libcuvs libcuvs-tests + "libcuvs=${RAPIDS_VERSION}" \ + "libcuvs-tests=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 93bc597cf..b9c394062 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -5,6 +5,8 @@ set -euo pipefail . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ --output conda \ @@ -31,7 +33,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - libcuvs cuvs + "libcuvs=${RAPIDS_VERSION}" \ + "cuvs=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi From f62b217f97c9e14b340f11bcbfe556fcad9ed816 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:07:41 +0200 Subject: [PATCH 06/47] Add --no-lap-sync cmd option to ann-bench (#405) Add a command-line option to disable the CUDA event/stream synchronization on each iteration. Only one sync is done per benchmark loop in this case instead. 
As a result, the measured QPS is higher due to: 1) A small `cudaEventSynchronize` is removed from the benchmark loop; 2) If a GPU algorithm has little to no sync between the GPU and CPU, the kernel launch latency and other CPU overheads are completely hidden. The new option is experimental and disabled by default. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/cuvs/pull/405 --- cpp/bench/ann/src/common/benchmark.hpp | 161 +++++++++++++++---------- 1 file changed, 98 insertions(+), 63 deletions(-) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index db3e533e0..06e1e27af 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -119,7 +119,8 @@ template void bench_build(::benchmark::State& state, std::shared_ptr> dataset, configuration::index index, - bool force_overwrite) + bool force_overwrite, + bool no_lap_sync) { // NB: these two thread-local vars can be used within algo wrappers cuvs::bench::benchmark_thread_id = state.thread_index(); @@ -149,9 +150,22 @@ void bench_build(::benchmark::State& state, cuda_timer gpu_timer{algo}; { nvtx_case nvtx{state.name()}; + /* Note: GPU timing + + The GPU time is measured between construction and destruction of `cuda_lap` objects (`gpu_all` + and `gpu_lap` variables) and added to the `gpu_timer` object. + + We sync with the GPU (cudaEventSynchronize) either each iteration (lifetime of the `gpu_lap` + variable) or once per benchmark loop (lifetime of the `gpu_all` variable). The decision is + + controlled by the `no_lap_sync` argument. In either case, we need at least one sync throughout + the benchmark loop to make sure the GPU has finished its work before we measure the total run + time.
+ */ + [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync); for (auto _ : state) { [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync); try { algo->build(base_set, index_size); } catch (const std::exception& e) { @@ -173,7 +187,8 @@ template void bench_search(::benchmark::State& state, configuration::index index, std::size_t search_param_ix, - std::shared_ptr> dataset) + std::shared_ptr> dataset, + bool no_lap_sync) { // NB: these two thread-local vars can be used within algo wrappers cuvs::bench::benchmark_thread_id = state.thread_index(); @@ -300,25 +315,29 @@ void bench_search(::benchmark::State& state, // Initialize with algo, so that the timer.lap() object can sync with algo::get_sync_stream() cuda_timer gpu_timer{a}; auto start = std::chrono::high_resolution_clock::now(); - for (auto _ : state) { - [[maybe_unused]] auto ntx_lap = nvtx.lap(); - [[maybe_unused]] auto gpu_lap = gpu_timer.lap(); - try { - a->search(query_set + batch_offset * dataset->dim(), - n_queries, - k, - neighbors_ptr + out_offset * k, - distances_ptr + out_offset * k); - } catch (const std::exception& e) { - state.SkipWithError("Benchmark loop: " + std::string(e.what())); - break; - } + { + /* See the note above: GPU timing */ + [[maybe_unused]] auto gpu_all = gpu_timer.lap(no_lap_sync); + for (auto _ : state) { + [[maybe_unused]] auto ntx_lap = nvtx.lap(); + [[maybe_unused]] auto gpu_lap = gpu_timer.lap(!no_lap_sync); + try { + a->search(query_set + batch_offset * dataset->dim(), + n_queries, + k, + neighbors_ptr + out_offset * k, + distances_ptr + out_offset * k); + } catch (const std::exception& e) { + state.SkipWithError("Benchmark loop: " + std::string(e.what())); + break; + } - // advance to the next batch - batch_offset = (batch_offset + queries_stride) % query_set_size; - out_offset = (out_offset + n_queries) % query_set_size; + // advance to the next batch + batch_offset 
= (batch_offset + queries_stride) % query_set_size; + out_offset = (out_offset + n_queries) % query_set_size; - queries_processed += n_queries; + queries_processed += n_queries; + } } auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast>(end - start).count(); @@ -379,44 +398,51 @@ void bench_search(::benchmark::State& state, inline void printf_usage() { ::benchmark::PrintDefaultHelp(); - fprintf(stdout, - " [--build|--search] \n" - " [--force]\n" - " [--data_prefix=]\n" - " [--index_prefix=]\n" - " [--override_kv=]\n" - " [--mode=\n" - " [--threads=min[:max]]\n" - " .json\n" - "\n" - "Note the non-standard benchmark parameters:\n" - " --build: build mode, will build index\n" - " --search: search mode, will search using the built index\n" - " one and only one of --build and --search should be specified\n" - " --force: force overwriting existing index files\n" - " --data_prefix=:" - " prepend to dataset file paths specified in the .json (default = " - "'data/').\n" - " --index_prefix=:" - " prepend to index file paths specified in the .json (default = " - "'index/').\n" - " --override_kv=:" - " override a build/search key one or more times multiplying the number of configurations;" - " you can use this parameter multiple times to get the Cartesian product of benchmark" - " configs.\n" - " --mode=" - " run the benchmarks in latency (accumulate times spent in each batch) or " - " throughput (pipeline batches and measure end-to-end) mode\n" - " --threads=min[:max] specify the number threads to use for throughput benchmark." - " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified," - " then a single test is run with 'min' threads. 
By default min=1, max=.\n"); + fprintf( + stdout, + " [--build|--search] \n" + " [--force]\n" + " [--data_prefix=]\n" + " [--index_prefix=]\n" + " [--override_kv=]\n" + " [--mode=\n" + " [--threads=min[:max]]\n" + " [--no-lap-sync]\n" + " .json\n" + "\n" + "Note the non-standard benchmark parameters:\n" + " --build: build mode, will build index\n" + " --search: search mode, will search using the built index\n" + " one and only one of --build and --search should be specified\n" + " --force: force overwriting existing index files\n" + " --data_prefix=:" + " prepend to dataset file paths specified in the .json (default = " + "'data/').\n" + " --index_prefix=:" + " prepend to index file paths specified in the .json (default = " + "'index/').\n" + " --override_kv=:" + " override a build/search key one or more times multiplying the number of configurations;" + " you can use this parameter multiple times to get the Cartesian product of benchmark" + " configs.\n" + " --mode=" + " run the benchmarks in latency (accumulate times spent in each batch) or " + " throughput (pipeline batches and measure end-to-end) mode\n" + " --threads=min[:max] specify the number threads to use for throughput benchmark." + " Power of 2 values between 'min' and 'max' will be used. If only 'min' is specified," + " then a single test is run with 'min' threads. By default min=1, max=.\n" + " --no-lap-sync disable CUDA event synchronization between benchmark iterations. If a GPU" + " algorithm has no sync with CPU, this can make the GPU processing significantly lag behind the" + " CPU scheduling. Then this also hides the scheduling latencies and thus improves the measured" + " throughput (QPS). 
Note there's a sync at the end of the benchmark loop in any case.\n"); } template void register_build(std::shared_ptr> dataset, std::vector indices, - bool force_overwrite) + bool force_overwrite, + bool no_lap_sync) { for (auto index : indices) { auto suf = static_cast(index.build_param["override_suffix"]); @@ -425,7 +451,7 @@ void register_build(std::shared_ptr> dataset, std::replace(file_suf.begin(), file_suf.end(), '/', '-'); index.file += file_suf; auto* b = ::benchmark::RegisterBenchmark( - index.name + suf, bench_build, dataset, index, force_overwrite); + index.name + suf, bench_build, dataset, index, force_overwrite, no_lap_sync); b->Unit(benchmark::kSecond); b->MeasureProcessCPUTime(); b->UseRealTime(); @@ -436,14 +462,16 @@ template void register_search(std::shared_ptr> dataset, std::vector indices, Mode metric_objective, - const std::vector& threads) + const std::vector& threads, + bool no_lap_sync) { for (auto index : indices) { for (std::size_t i = 0; i < index.search_params.size(); i++) { auto suf = static_cast(index.search_params[i]["override_suffix"]); index.search_params[i].erase("override_suffix"); - auto* b = ::benchmark::RegisterBenchmark(index.name + suf, bench_search, index, i, dataset) + auto* b = ::benchmark::RegisterBenchmark( + index.name + suf, bench_search, index, i, dataset, no_lap_sync) ->Unit(benchmark::kMillisecond) /** * The following are important for getting accuracy QPS measurements on both CPU @@ -470,7 +498,8 @@ void dispatch_benchmark(std::string cmdline, std::string index_prefix, kv_series override_kv, Mode metric_objective, - const std::vector& threads) + const std::vector& threads, + bool no_lap_sync) { ::benchmark::AddCustomContext("command_line", cmdline); for (auto [key, value] : host_info()) { @@ -514,7 +543,7 @@ void dispatch_benchmark(std::string cmdline, more_indices.push_back(modified_index); } } - register_build(dataset, more_indices, force_overwrite); + register_build(dataset, more_indices, force_overwrite, 
no_lap_sync); } else if (search_mode) { if (file_exists(query_file)) { log_info("Using the query file '%s'", query_file.c_str()); @@ -543,7 +572,7 @@ void dispatch_benchmark(std::string cmdline, index.search_params = apply_overrides(index.search_params, override_kv); index.file = combine_path(index_prefix, index.file); } - register_search(dataset, indices, metric_objective, threads); + register_search(dataset, indices, metric_objective, threads, no_lap_sync); } } @@ -571,6 +600,7 @@ inline auto run_main(int argc, char** argv) -> int bool force_overwrite = false; bool build_mode = false; bool search_mode = false; + bool no_lap_sync = false; std::string data_prefix = "data"; std::string index_prefix = "index"; std::string new_override_kv = ""; @@ -604,6 +634,7 @@ inline auto run_main(int argc, char** argv) -> int if (parse_bool_flag(argv[i], "--force", force_overwrite) || parse_bool_flag(argv[i], "--build", build_mode) || parse_bool_flag(argv[i], "--search", search_mode) || + parse_bool_flag(argv[i], "--no-lap-sync", no_lap_sync) || parse_string_flag(argv[i], "--data_prefix", data_prefix) || parse_string_flag(argv[i], "--index_prefix", index_prefix) || parse_string_flag(argv[i], "--mode", mode) || @@ -686,7 +717,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - threads); + threads, + no_lap_sync); } else if (dtype == "half") { dispatch_benchmark(cmdline, conf, @@ -697,7 +729,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - threads); + threads, + no_lap_sync); } else if (dtype == "uint8") { dispatch_benchmark(cmdline, conf, @@ -708,7 +741,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - threads); + threads, + no_lap_sync); } else if (dtype == "int8") { dispatch_benchmark(cmdline, conf, @@ -719,7 +753,8 @@ inline auto run_main(int argc, char** argv) -> int index_prefix, override_kv, metric_objective, - 
threads); + threads, + no_lap_sync); } else { log_error("datatype '%s' is not supported", dtype.c_str()); return -1; From c86e74d6d12cd3396fdf1ed9fa3b96d858b1fa5f Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Thu, 17 Oct 2024 11:38:13 -0400 Subject: [PATCH 07/47] Add `click` package to `cuvs-bench` conda recipe (#408) This package is available in `dependencies.yaml`, but due to an oversight was not added to conda metas. Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cuvs/pull/408 --- conda/recipes/cuvs_bench/meta.yaml | 1 + conda/recipes/cuvs_bench_cpu/meta.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/conda/recipes/cuvs_bench/meta.yaml b/conda/recipes/cuvs_bench/meta.yaml index 9ecbf82bb..67d66efce 100644 --- a/conda/recipes/cuvs_bench/meta.yaml +++ b/conda/recipes/cuvs_bench/meta.yaml @@ -82,6 +82,7 @@ requirements: run: - benchmark + - click - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit diff --git a/conda/recipes/cuvs_bench_cpu/meta.yaml b/conda/recipes/cuvs_bench_cpu/meta.yaml index 0ce5db744..95bea92ef 100644 --- a/conda/recipes/cuvs_bench_cpu/meta.yaml +++ b/conda/recipes/cuvs_bench_cpu/meta.yaml @@ -55,6 +55,7 @@ requirements: run: - benchmark + - click - glog {{ glog_version }} - h5py {{ h5py_version }} - matplotlib From f708fe388ee206105fd7894388283995e88ab7f9 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 17 Oct 2024 14:13:24 -0400 Subject: [PATCH 08/47] We need to enable the c_api by default (#416) Remove a collection of unnecessarily complex CMake logic. Major change is that we explicitly opt-in to building the C API bindings by default since it is a hard requirement for our python bindings, and the project has numerous conditions to disable it.
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/cuvs/pull/416 --- cpp/CMakeLists.txt | 18 ++++-------------- cpp/test/CMakeLists.txt | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3e98a247e..746245791 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -53,8 +53,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON) option(BUILD_TESTS "Build cuvs unit-tests" ON) -option(BUILD_C_LIBRARY "Build cuVS C API library" OFF) -option(BUILD_C_TESTS "Build cuVS C API tests" OFF) +option(BUILD_C_LIBRARY "Build cuVS C API library" ON) option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF) option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON) option(BUILD_MG_ALGOS "Build with multi-GPU support" ON) @@ -72,21 +71,12 @@ option(DISABLE_OPENMP "Disable OpenMP" OFF) option(CUVS_NVTX "Enable nvtx markers" OFF) option(CUVS_RAFT_CLONE_ON_PIN "Explicitly clone RAFT branch when pinned to non-feature branch" ON) -if((BUILD_TESTS OR BUILD_C_LIBRARY) AND NOT BUILD_CPU_ONLY) - -endif() - if(BUILD_CPU_ONLY) set(BUILD_SHARED_LIBS OFF) set(BUILD_TESTS OFF) set(BUILD_C_LIBRARY OFF) -endif() - -if(NOT BUILD_C_LIBRARY) - set(BUILD_C_TESTS OFF) -endif() - -if(NOT BUILD_SHARED_LIBS) + set(BUILD_CAGRA_HNSWLIB OFF) +elseif(NOT BUILD_SHARED_LIBS) set(BUILD_TESTS OFF) set(BUILD_C_LIBRARY OFF) set(BUILD_CAGRA_HNSWLIB OFF) @@ -771,7 +761,7 @@ endif() # ################################################################################################## # * build test executable ---------------------------------------------------- -if(BUILD_TESTS OR BUILD_C_TESTS) +if(BUILD_TESTS) add_subdirectory(internal) add_subdirectory(test) endif() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index f4d35e438..60007825c 100644 --- 
a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -215,7 +215,9 @@ if(BUILD_TESTS) ) endif() -if(BUILD_C_TESTS) +if(TARGET cuvs::c_api) + enable_language(C) + ConfigureTest(NAME INTEROP_TEST PATH core/interop.cu C_LIB) ConfigureTest( NAME DISTANCE_C_TEST PATH distance/run_pairwise_distance_c.c distance/pairwise_distance_c.cu @@ -239,19 +241,37 @@ if(BUILD_C_TESTS) target_link_libraries(NEIGHBORS_HNSW_TEST PRIVATE hnswlib::hnswlib) target_compile_definitions(NEIGHBORS_HNSW_TEST PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) endif() -endif() -# ################################################################################################## -# Install tests #################################################################################### -# ################################################################################################## -rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcuvs) - -if(BUILD_C_TESTS) - enable_language(C) add_executable(cuvs_c_test core/c_api.c) target_link_libraries(cuvs_c_test PUBLIC cuvs::c_api) add_executable(cuvs_c_neighbors_test neighbors/c_api.c) target_link_libraries(cuvs_c_neighbors_test PUBLIC cuvs::c_api) + + set_target_properties( + cuvs_c_test cuvs_c_neighbors_test + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + INSTALL_RPATH "\$ORIGIN/../../../lib" + ) + + rapids_test_add( + NAME cuvs_c_test + COMMAND cuvs_c_test + GPUS 1 + PERCENT 100 + INSTALL_COMPONENT_SET testing + ) + rapids_test_add( + NAME cuvs_c_neighbors_test + COMMAND cuvs_c_neighbors_test + GPUS 1 + PERCENT 100 + INSTALL_COMPONENT_SET testing + ) endif() + +# ################################################################################################## +# Install tests #################################################################################### +# ################################################################################################## +rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION 
bin/gtests/libcuvs) From 801945fb1acfe4ca12b4d6dd30592f824166a389 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 18 Oct 2024 12:33:14 -0500 Subject: [PATCH 09/47] Use dashes in cuvs-bench package name. (#417) This attempts to rename `cuvs_bench` to `cuvs-bench` and similarly for the CPU package. This follows from this thread: https://github.com/rapidsai/docker/pull/715#discussion_r1805232894 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/cuvs/pull/417 --- build.sh | 2 +- ci/build_python.sh | 8 ++++---- .../{cuvs_bench_cpu => cuvs-bench-cpu}/build.sh | 0 .../conda_build_config.yaml | 0 .../{cuvs_bench_cpu => cuvs-bench-cpu}/meta.yaml | 2 +- conda/recipes/{cuvs_bench => cuvs-bench}/build.sh | 0 .../conda_build_config.yaml | 0 conda/recipes/{cuvs_bench => cuvs-bench}/meta.yaml | 2 +- docs/source/cuvs_bench/index.rst | 12 ++++++------ 9 files changed, 13 insertions(+), 13 deletions(-) rename conda/recipes/{cuvs_bench_cpu => cuvs-bench-cpu}/build.sh (100%) rename conda/recipes/{cuvs_bench_cpu => cuvs-bench-cpu}/conda_build_config.yaml (100%) rename conda/recipes/{cuvs_bench_cpu => cuvs-bench-cpu}/meta.yaml (98%) rename conda/recipes/{cuvs_bench => cuvs-bench}/build.sh (100%) rename conda/recipes/{cuvs_bench => cuvs-bench}/conda_build_config.yaml (100%) rename conda/recipes/{cuvs_bench => cuvs-bench}/meta.yaml (99%) diff --git a/build.sh b/build.sh index b787d3a41..29e8fe7c6 100755 --- a/build.sh +++ b/build.sh @@ -447,7 +447,7 @@ if (( ${NUMARGS} == 0 )) || hasArg python; then python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs fi -# Build and (optionally) install the cuvs_bench Python package +# Build and (optionally) install the cuvs-bench Python package if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then python -m pip install --no-build-isolation 
--no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs_bench fi diff --git a/ci/build_python.sh b/ci/build_python.sh index 7b0c639af..deb67e91c 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -31,14 +31,14 @@ rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/cuvs -# Build cuvs_bench for each cuda and python version +# Build cuvs-bench for each cuda and python version rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cuvs_bench + conda/recipes/cuvs-bench -# Build cuvs_bench_cpu only in CUDA 12 jobs since it only depends on python +# Build cuvs-bench-cpu only in CUDA 12 jobs since it only depends on python # version RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then @@ -46,7 +46,7 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cuvs_bench_cpu + conda/recipes/cuvs-bench-cpu fi rapids-upload-conda-to-s3 python diff --git a/conda/recipes/cuvs_bench_cpu/build.sh b/conda/recipes/cuvs-bench-cpu/build.sh similarity index 100% rename from conda/recipes/cuvs_bench_cpu/build.sh rename to conda/recipes/cuvs-bench-cpu/build.sh diff --git a/conda/recipes/cuvs_bench_cpu/conda_build_config.yaml b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml similarity index 100% rename from conda/recipes/cuvs_bench_cpu/conda_build_config.yaml rename to conda/recipes/cuvs-bench-cpu/conda_build_config.yaml diff --git a/conda/recipes/cuvs_bench_cpu/meta.yaml b/conda/recipes/cuvs-bench-cpu/meta.yaml similarity index 98% rename from conda/recipes/cuvs_bench_cpu/meta.yaml rename to conda/recipes/cuvs-bench-cpu/meta.yaml index 95bea92ef..02c11346f 100644 --- a/conda/recipes/cuvs_bench_cpu/meta.yaml +++ b/conda/recipes/cuvs-bench-cpu/meta.yaml @@ -8,7 +8,7 @@ {% set date_string = environ['RAPIDS_DATE_STRING'] %} 
package: - name: cuvs_bench_cpu + name: cuvs-bench-cpu version: {{ version }} script: build.sh diff --git a/conda/recipes/cuvs_bench/build.sh b/conda/recipes/cuvs-bench/build.sh similarity index 100% rename from conda/recipes/cuvs_bench/build.sh rename to conda/recipes/cuvs-bench/build.sh diff --git a/conda/recipes/cuvs_bench/conda_build_config.yaml b/conda/recipes/cuvs-bench/conda_build_config.yaml similarity index 100% rename from conda/recipes/cuvs_bench/conda_build_config.yaml rename to conda/recipes/cuvs-bench/conda_build_config.yaml diff --git a/conda/recipes/cuvs_bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml similarity index 99% rename from conda/recipes/cuvs_bench/meta.yaml rename to conda/recipes/cuvs-bench/meta.yaml index 67d66efce..3e81edc58 100644 --- a/conda/recipes/cuvs_bench/meta.yaml +++ b/conda/recipes/cuvs-bench/meta.yaml @@ -10,7 +10,7 @@ {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: - name: cuvs_bench + name: cuvs-bench version: {{ version }} script: build.sh diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst index 61ac622d2..81fb7537c 100644 --- a/docs/source/cuvs_bench/index.rst +++ b/docs/source/cuvs_bench/index.rst @@ -93,20 +93,20 @@ We provide images for GPU enabled systems, as well as systems without a GPU. The - `cuvs-bench-datasets`: Contains the GPU and CPU benchmarks with million-scale datasets already included in the container. Best suited for users that want to run multiple million scale datasets already included in the image. - `cuvs-bench-cpu`: Contains only CPU benchmarks with minimal size. Best suited for users that want the smallest containers to reproduce benchmarks on systems without a GPU. -Nightly images are located in `dockerhub `_, meanwhile release (stable) versions are located in `NGC `_, starting with release 24.10. +Nightly images are located in `dockerhub `_, meanwhile release (stable) versions are located in `NGC `_, starting with release 24.10. 
-The following command pulls the nightly container for python version 10, cuda version 12, and CUVS version 23.10: +The following command pulls the nightly container for Python version 3.10, CUDA version 12.0, and cuVS version 24.10: .. code-block:: bash - docker pull rapidsai/cuvs_bench:24.10a-cuda12.0-py3.10 #substitute cuvs_bench for the exact desired container. + docker pull rapidsai/cuvs-bench:24.10a-cuda12.0-py3.10 #substitute cuvs-bench for the exact desired container. The CUDA and python versions can be changed for the supported values: - Supported CUDA versions: 11.4 and 12.x - Supported Python versions: 3.9 and 3.10. You can see the exact versions as well in the dockerhub site: -- `cuVS bench images `_ +- `cuVS bench images `_ - `cuVS bench with datasets preloaded images `_ - `cuVS bench CPU only images `_ @@ -583,7 +583,7 @@ A default `datasets.yaml` is provided by CUVS in `${CUVS_HOME}/python/cuvs-ann-b dims: 128 distance: euclidean -Configuration files for ANN algorithms supported by `cuvs-bench` are provided in `${CUVS_HOME}/python/cuvs-bench/src/cuvs_bench/run/conf`. `cuvs_cagra` algorithm configuration looks like: +Configuration files for ANN algorithms supported by `cuvs-bench` are provided in `${CUVS_HOME}/python/cuvs_bench/cuvs_bench/config/algos`. `cuvs_cagra` algorithm configuration looks like: .. code-block:: yaml @@ -767,4 +767,4 @@ Add a new entry to `algos.yaml` to map the name of the algorithm to its binary e requires_gpu: true `executable` : specifies the name of the binary that will build/search the index. It is assumed to be available in `cuvs/cpp/build/`. -`requires_gpu` : denotes whether an algorithm requires GPU to run. \ No newline at end of file +`requires_gpu` : denotes whether an algorithm requires GPU to run. From 009bb8de03ce9708d4d797166187250f77a59a36 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 21 Oct 2024 14:24:34 -0500 Subject: [PATCH 10/47] Use Python for sccache hit rate computation. 
(#420) Fixes an issue in CI computations of sccache hit rates. See https://github.com/rapidsai/cuvs/pull/414 for details. --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index 29e8fe7c6..9503dff21 100755 --- a/build.sh +++ b/build.sh @@ -410,14 +410,14 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has if [[ ${CACHE_TOOL} == "sccache" && -x "$(command -v sccache)" ]]; then COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }') CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }') - HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}") + HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')") MSG="${MSG}
cache hit rate ${HIT_RATE} %" elif [[ ${CACHE_TOOL} == "ccache" && -x "$(command -v ccache)" ]]; then CACHE_STATS_LINE=$(ccache -s | grep "Hits: \+ [0-9]\+ / [0-9]\+" | tail -n1) if [[ ! -z "$CACHE_STATS_LINE" ]]; then CACHE_HITS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $2 }') COMPILE_REQUESTS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $4 }') - HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}") + HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')") MSG="${MSG}
cache hit rate ${HIT_RATE} %" fi fi From e7f1085b71c340b9600f5f38f7f0059a5c7aa806 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 22 Oct 2024 07:57:08 -0500 Subject: [PATCH 11/47] Use environment variables in cache hit rate computation. (#422) Follow-up PR to address feedback: https://github.com/rapidsai/raft/pull/2474#discussion_r1809398110 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cuvs/pull/422 --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index 9503dff21..c08c2900e 100755 --- a/build.sh +++ b/build.sh @@ -410,14 +410,14 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has if [[ ${CACHE_TOOL} == "sccache" && -x "$(command -v sccache)" ]]; then COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }') CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }') - HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')") + HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")):.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')") MSG="${MSG}
cache hit rate ${HIT_RATE} %" elif [[ ${CACHE_TOOL} == "ccache" && -x "$(command -v ccache)" ]]; then CACHE_STATS_LINE=$(ccache -s | grep "Hits: \+ [0-9]\+ / [0-9]\+" | tail -n1) if [[ ! -z "$CACHE_STATS_LINE" ]]; then CACHE_HITS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $2 }') COMPILE_REQUESTS=$(echo "$CACHE_STATS_LINE" - | awk '{ print $4 }') - HIT_RATE=$(python3 -c "print(f'{${CACHE_HITS} / ${COMPILE_REQUESTS}:.2f}' if ${COMPILE_REQUESTS} else 'nan')") + HIT_RATE=$(COMPILE_REQUESTS="${COMPILE_REQUESTS}" CACHE_HITS="${CACHE_HITS}" python3 -c "import os; print(f'{int(os.getenv(\"CACHE_HITS\")) / int(os.getenv(\"COMPILE_REQUESTS\")):.2f}' if int(os.getenv(\"COMPILE_REQUESTS\")) else 'nan')") MSG="${MSG}
cache hit rate ${HIT_RATE} %" fi fi From 12b10e88e8ea6e944e91dee8a0380c89999b3b21 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 28 Oct 2024 11:56:53 -0500 Subject: [PATCH 12/47] Fix correct call to brute force in generate groundtruth of cuvs-bench (#427) Fixes issue with helper script for generating ground truthset in cuvs-bench, which was using the old RAFT NN API. Authors: - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Divye Gala (https://github.com/divyegala) - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/cuvs/pull/427 --- .../cuvs_bench/generate_groundtruth/__main__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py index 2b4213016..dbee6cd36 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -24,7 +24,7 @@ from pylibraft.common import DeviceResources from rmm.allocators.cupy import rmm_cupy_allocator -from cuvs.neighbors.brute_force import knn +from cuvs.neighbors.brute_force import build, search from .utils import memmap_bin_file, suffix_from_dtype, write_bin @@ -49,7 +49,7 @@ def choose_random_queries(dataset, n_queries): def calc_truth(dataset, queries, k, metric="sqeuclidean"): - handle = DeviceResources() + resources = DeviceResources() n_samples = dataset.shape[0] n = 500000 # batch size for processing neighbors i = 0 @@ -63,8 +63,9 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) - D, Ind = knn(X, queries, k, metric=metric, handle=handle) - handle.sync() + index = build(X, metric=metric, resources=resources) + D, Ind = search(index, queries, k, resources=resources) + resources.sync() D, Ind = cp.asarray(D), cp.asarray(Ind) Ind += i # shift neighbor index by offset i From 
9f035d8e0e44c8eabdf6983049dfe58f9f1ef807 Mon Sep 17 00:00:00 2001 From: abner-ma <969023674@qq.com> Date: Tue, 29 Oct 2024 05:25:13 +0800 Subject: [PATCH 13/47] Ivf c example (#404) Add examples of ivf-flat and ivf-pq in C language Authors: - https://github.com/abner-ma Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/cuvs/pull/404 --- examples/c/CMakeLists.txt | 8 + examples/c/src/common.h | 109 ++++++++++++ examples/c/src/ivf_flat_c_example.c | 259 ++++++++++++++++++++++++++++ examples/c/src/ivf_pq_c_example.c | 189 ++++++++++++++++++++ 4 files changed, 565 insertions(+) create mode 100644 examples/c/src/common.h create mode 100644 examples/c/src/ivf_flat_c_example.c create mode 100644 examples/c/src/ivf_pq_c_example.c diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index ec8ca827a..2a7e70522 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -42,3 +42,11 @@ target_link_libraries(CAGRA_C_EXAMPLE PRIVATE cuvs::c_api $") target_link_libraries(L2_C_EXAMPLE PRIVATE cuvs::c_api $) + +add_executable(IVF_FLAT_C_EXAMPLE src/ivf_flat_c_example.c) +target_include_directories(IVF_FLAT_C_EXAMPLE PUBLIC "$") +target_link_libraries(IVF_FLAT_C_EXAMPLE PRIVATE cuvs::c_api $) + +add_executable(IVF_PQ_C_EXAMPLE src/ivf_pq_c_example.c) +target_include_directories(IVF_PQ_C_EXAMPLE PUBLIC "$") +target_link_libraries(IVF_PQ_C_EXAMPLE PRIVATE cuvs::c_api $) diff --git a/examples/c/src/common.h b/examples/c/src/common.h new file mode 100644 index 000000000..60b9b73cf --- /dev/null +++ b/examples/c/src/common.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +/** + * @brief Initialize Tensor for kDLFloat. + * + * @param[in] t_d Pointer to a vector + * @param[in] t_shape[] Two-dimensional array, which stores the number of rows and columns of vectors. + * @param[out] t_tensor Stores the initialized DLManagedTensor. + */ +void float_tensor_initialize(float* t_d, int64_t t_shape[2], DLManagedTensor* t_tensor) { + t_tensor->dl_tensor.data = t_d; + t_tensor->dl_tensor.device.device_type = kDLCUDA; + t_tensor->dl_tensor.ndim = 2; + t_tensor->dl_tensor.dtype.code = kDLFloat; + t_tensor->dl_tensor.dtype.bits = 32; + t_tensor->dl_tensor.dtype.lanes = 1; + t_tensor->dl_tensor.shape = t_shape; + t_tensor->dl_tensor.strides = NULL; +} + +/** + * @brief Initialize Tensor for kDLInt. + * + * @param[in] t_d Pointer to a vector + * @param[in] t_shape[] Two-dimensional array, which stores the number of rows and columns of vectors. + * @param[out] t_tensor Stores the initialized DLManagedTensor. + */ +void int_tensor_initialize(int64_t* t_d, int64_t t_shape[], DLManagedTensor* t_tensor) { + t_tensor->dl_tensor.data = t_d; + t_tensor->dl_tensor.device.device_type = kDLCUDA; + t_tensor->dl_tensor.ndim = 2; + t_tensor->dl_tensor.dtype.code = kDLInt; + t_tensor->dl_tensor.dtype.bits = 64; + t_tensor->dl_tensor.dtype.lanes = 1; + t_tensor->dl_tensor.shape = t_shape; + t_tensor->dl_tensor.strides = NULL; +} + +/** + * @brief Fill a vector with random values. + * + * @param[out] Vec Pointer to a vector + * @param[in] n_rows the number of rows in the matrix. 
+ * @param[in] n_cols the number of columns in the matrix. + * @param[in] min Minimum value among random values. + * @param[in] max Maximum value among random values. + */ +void generate_dataset(float * Vec,int n_rows, int n_cols, float min, float max) { + float scale; + float * ptr = Vec; + srand((unsigned int)time(NULL)); + for (int i = 0; i < n_rows; i++) { + for (int j = 0; j < n_cols; j++) { + scale = rand()/(float)RAND_MAX; + ptr = Vec + i * n_cols + j; + *ptr = min + scale * (max - min); + } + } +} + +/** + * @brief print the result. + * + * @param[in] neighbor Pointer to a neighbor vector + * @param[in] distances Pointer to a distances vector. + * @param[in] n_rows the number of rows in the matrix. + * @param[in] n_cols the number of columns in the matrix. + */ +void print_results(int64_t * neighbor, float* distances,int n_rows, int n_cols) { + int64_t * pn = neighbor; + float * pd = distances; + for (int i = 0; i < n_rows; ++i) { + printf("Query %d neighbor indices: =[", i); + for (int j = 0; j < n_cols; ++j) { + pn = neighbor + i * n_cols + j; + printf(" %ld", *pn); + } + printf("]\n"); + printf("Query %d neighbor distances: =[", i); + for (int j = 0; j < n_cols; ++j) { + pd = distances + i * n_cols + j; + printf(" %f", *pd); + } + printf("]\n"); + } +} + diff --git a/examples/c/src/ivf_flat_c_example.c b/examples/c/src/ivf_flat_c_example.c new file mode 100644 index 000000000..c068d04f8 --- /dev/null +++ b/examples/c/src/ivf_flat_c_example.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include "common.h" + +void ivf_flat_build_search_simple(cuvsResources_t *res, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) { + // Create default index params + cuvsIvfFlatIndexParams_t index_params; + cuvsIvfFlatIndexParamsCreate(&index_params); + index_params->n_lists = 1024; // default value + index_params->kmeans_n_iters = 20; // default value + index_params->kmeans_trainset_fraction = 0.1; + //index_params->metric default is L2Expanded + + // Create IVF-Flat index + cuvsIvfFlatIndex_t index; + cuvsIvfFlatIndexCreate(&index); + + printf("Building IVF-Flat index\n"); + // Build the IVF-Flat Index + cuvsError_t build_status = cuvsIvfFlatBuild(*res, index_params, dataset_tensor, index); + if (build_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + cuvsIvfFlatIndexDestroy(index); + cuvsIvfFlatIndexParamsDestroy(index_params); + return; + } + + // Create output arrays. 
+ int64_t topk = 10; + int64_t n_queries = queries_tensor->dl_tensor.shape[0]; + + //Allocate memory for `neighbors` and `distances` output + int64_t *neighbors_d; + float *distances_d; + cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk); + + DLManagedTensor neighbors_tensor; + int64_t neighbors_shape[2] = {n_queries, topk}; + int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor); + + DLManagedTensor distances_tensor; + int64_t distances_shape[2] = {n_queries, topk}; + float_tensor_initialize(distances_d, distances_shape, &distances_tensor); + + // Create default search params + cuvsIvfFlatSearchParams_t search_params; + cuvsIvfFlatSearchParamsCreate(&search_params); + search_params->n_probes = 50; + + // Search the `index` built using `ivfFlatBuild` + cuvsError_t search_status = cuvsIvfFlatSearch(*res, search_params, index, + queries_tensor, &neighbors_tensor, &distances_tensor); + if (build_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + } + + int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t)); + float *distances = (float *)malloc(n_queries * topk * sizeof(float)); + memset(neighbors, 0, n_queries * topk * sizeof(int64_t)); + memset(distances, 0, n_queries * topk * sizeof(float)); + + cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault); + cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault); + + print_results(neighbors, distances, 2, topk); + + free(distances); + free(neighbors); + + cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk); + + cuvsIvfFlatSearchParamsDestroy(search_params); + cuvsIvfFlatIndexDestroy(index); + cuvsIvfFlatIndexParamsDestroy(index_params); +} + +void ivf_flat_build_extend_search(cuvsResources_t *res, DLManagedTensor 
* trainset_tensor, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) { + int64_t *data_indices_d; + int64_t n_dataset = dataset_tensor->dl_tensor.shape[0]; + cuvsRMMAlloc(*res, (void**) &data_indices_d, sizeof(int64_t) * n_dataset); + DLManagedTensor data_indices_tensor; + int64_t data_indices_shape[1] = {n_dataset}; + int_tensor_initialize(data_indices_d, data_indices_shape, &data_indices_tensor); + data_indices_tensor.dl_tensor.ndim = 1; + + printf("\nRun k-means clustering using the training set\n"); + + int64_t *data_indices = (int64_t *)malloc(n_dataset * sizeof(int64_t)); + int64_t * ptr = data_indices; + for (int i = 0; i < n_dataset; i++) { + *ptr = i; + ptr++; + } + ptr = NULL; + cudaMemcpy(data_indices_d, data_indices, sizeof(int64_t) * n_dataset, cudaMemcpyDefault); + + // Create default index params + cuvsIvfFlatIndexParams_t index_params; + cuvsIvfFlatIndexParamsCreate(&index_params); + index_params->n_lists = 100; + index_params->add_data_on_build = false; + //index_params->metric default is L2Expanded + + // Create IVF-Flat index + cuvsIvfFlatIndex_t index; + cuvsIvfFlatIndexCreate(&index); + + // Build the IVF-Flat Index + cuvsError_t build_status = cuvsIvfFlatBuild(*res, index_params, trainset_tensor, index); + if (build_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + cuvsIvfFlatIndexDestroy(index); + cuvsIvfFlatIndexParamsDestroy(index_params); + return; + } + + printf("Filling index with the dataset vectors\n"); + cuvsError_t extend_status = cuvsIvfFlatExtend(*res, dataset_tensor, &data_indices_tensor, index); + if (extend_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + return; + } + + // Create output arrays. 
+ int64_t topk = 10; + int64_t n_queries = queries_tensor->dl_tensor.shape[0]; + + //Allocate memory for `neighbors` and `distances` output + int64_t *neighbors_d; + float *distances_d; + cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk); + + DLManagedTensor neighbors_tensor; + int64_t neighbors_shape[2] = {n_queries, topk}; + int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor); + + DLManagedTensor distances_tensor; + int64_t distances_shape[2] = {n_queries, topk}; + float_tensor_initialize(distances_d, distances_shape, &distances_tensor); + + // Create default search params + cuvsIvfFlatSearchParams_t search_params; + cuvsIvfFlatSearchParamsCreate(&search_params); + search_params->n_probes = 10; + + // Search the `index` built using `ivfFlatBuild` + cuvsError_t search_status = cuvsIvfFlatSearch(*res, search_params, index, + queries_tensor, &neighbors_tensor, &distances_tensor); + if (search_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + exit(-1); + } + + int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t)); + float *distances = (float *)malloc(n_queries * topk * sizeof(float)); + memset(neighbors, 0, n_queries * topk * sizeof(int64_t)); + memset(distances, 0, n_queries * topk * sizeof(float)); + + cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault); + cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault); + + print_results(neighbors, distances, 2, topk); + + free(distances); + free(neighbors); + free(data_indices); + cuvsRMMFree(*res, data_indices_d, sizeof(int64_t) * n_dataset); + cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk); + + cuvsIvfFlatSearchParamsDestroy(search_params); + cuvsIvfFlatIndexDestroy(index); + 
cuvsIvfFlatIndexParamsDestroy(index_params); +} + +int main() { + // Create input arrays. + int64_t n_samples = 10000; + int64_t n_dim = 3; + int64_t n_queries = 10; + float *dataset = (float *)malloc(n_samples * n_dim * sizeof(float)); + float *queries = (float *)malloc(n_queries * n_dim * sizeof(float)); + generate_dataset(dataset, n_samples, n_dim, -10.0, 10.0); + generate_dataset(queries, n_queries, n_dim, -1.0, 1.0); + + // Create a cuvsResources_t object + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // Allocate memory for `queries` + float *dataset_d; + cuvsRMMAlloc(res, (void**) &dataset_d, sizeof(float) * n_samples * n_dim); + // Use DLPack to represent `dataset_d` as a tensor + cudaMemcpy(dataset_d, dataset, sizeof(float) * n_samples * n_dim, cudaMemcpyDefault); + + DLManagedTensor dataset_tensor; + int64_t dataset_shape[2] = {n_samples,n_dim}; + float_tensor_initialize(dataset_d, dataset_shape, &dataset_tensor); + + // Allocate memory for `queries` + float *queries_d; + cuvsRMMAlloc(res, (void**) &queries_d, sizeof(float) * n_queries * n_dim); + + // Use DLPack to represent `queries` as tensors + cudaMemcpy(queries_d, queries, sizeof(float) * n_queries * n_dim, cudaMemcpyDefault); + + DLManagedTensor queries_tensor; + int64_t queries_shape[2] = {n_queries, n_dim}; + float_tensor_initialize(queries_d, queries_shape, &queries_tensor); + + // Simple build and search example. 
+ ivf_flat_build_search_simple(&res, &dataset_tensor, &queries_tensor); + + float *trainset_d; + int64_t n_trainset = n_samples * 0.1; + float *trainset = (float *)malloc(n_trainset * n_dim * sizeof(float)); + for (int i = 0; i < n_trainset; i++) { + for (int j = 0; j < n_dim; j++) { + *(trainset + i * n_dim + j) = *(dataset + i * n_dim + j); + } + } + cuvsRMMAlloc(res, (void**) &trainset_d, sizeof(float) * n_trainset * n_dim); + cudaMemcpy(trainset_d, trainset, sizeof(float) * n_trainset * n_dim, cudaMemcpyDefault); + DLManagedTensor trainset_tensor; + int64_t trainset_shape[2] = {n_trainset, n_dim}; + float_tensor_initialize(trainset_d, trainset_shape, &trainset_tensor); + + // Build and extend example. + ivf_flat_build_extend_search(&res, &trainset_tensor, &dataset_tensor, &queries_tensor); + + cuvsRMMFree(res, trainset_d, sizeof(float) * n_trainset * n_dim); + cuvsRMMFree(res, queries_d, sizeof(float) * n_queries * n_dim); + cuvsRMMFree(res, dataset_d, sizeof(float) * n_samples * n_dim); + cuvsResourcesDestroy(res); + free(trainset); + free(dataset); + free(queries); +} diff --git a/examples/c/src/ivf_pq_c_example.c b/examples/c/src/ivf_pq_c_example.c new file mode 100644 index 000000000..b6d6b485b --- /dev/null +++ b/examples/c/src/ivf_pq_c_example.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include "common.h" + +void ivf_pq_build_search(cuvsResources_t *res, DLManagedTensor * dataset_tensor, DLManagedTensor * queries_tensor) { + // Create default index params + cuvsIvfPqIndexParams_t index_params; + cuvsIvfPqIndexParamsCreate(&index_params); + index_params->n_lists = 1024; // default value + index_params->kmeans_trainset_fraction = 0.1; + //index_params->metric default is L2Expanded + index_params->pq_bits = 8; + index_params->pq_dim = 2; + + // Create IVF-PQ index + cuvsIvfPqIndex_t index; + cuvsIvfPqIndexCreate(&index); + + printf("Building IVF-PQ index\n"); + + // Build the IVF-PQ Index + cuvsError_t build_status = cuvsIvfPqBuild(*res, index_params, dataset_tensor, index); + if (build_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + cuvsIvfPqIndexDestroy(index); + cuvsIvfPqIndexParamsDestroy(index_params); + return; + } + + // Create output arrays. + int64_t topk = 10; + int64_t n_queries = queries_tensor->dl_tensor.shape[0]; + + //Allocate memory for `neighbors` and `distances` output + int64_t *neighbors_d; + float *distances_d; + cuvsRMMAlloc(*res, (void**) &neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMAlloc(*res, (void**) &distances_d, sizeof(float) * n_queries * topk); + + DLManagedTensor neighbors_tensor; + int64_t neighbors_shape[2] = {n_queries, topk}; + int_tensor_initialize(neighbors_d, neighbors_shape, &neighbors_tensor); + + DLManagedTensor distances_tensor; + int64_t distances_shape[2] = {n_queries, topk}; + float_tensor_initialize(distances_d, distances_shape, &distances_tensor); + + // Create default search params + cuvsIvfPqSearchParams_t search_params; + cuvsIvfPqSearchParamsCreate(&search_params); + search_params->n_probes = 50; + search_params->internal_distance_dtype = CUDA_R_16F; + search_params->lut_dtype = CUDA_R_16F; + + // Search the `index` built using `cuvsIvfPqBuild` + cuvsError_t search_status = cuvsIvfPqSearch(*res, search_params, 
index, + queries_tensor, &neighbors_tensor, &distances_tensor); + if (search_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + exit(-1); + } + + int64_t *neighbors = (int64_t *)malloc(n_queries * topk * sizeof(int64_t)); + float *distances = (float *)malloc(n_queries * topk * sizeof(float)); + memset(neighbors, 0, n_queries * topk * sizeof(int64_t)); + memset(distances, 0, n_queries * topk * sizeof(float)); + + cudaMemcpy(neighbors, neighbors_d, sizeof(int64_t) * n_queries * topk, cudaMemcpyDefault); + cudaMemcpy(distances, distances_d, sizeof(float) * n_queries * topk, cudaMemcpyDefault); + + printf("\nOriginal results:\n"); + print_results(neighbors, distances, 2, topk); + + // Re-ranking operation: refine the initial search results by computing exact distances + int64_t topk_refined = 7; + int64_t *neighbors_refined_d; + float *distances_refined_d; + cuvsRMMAlloc(*res, (void**) &neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined); + cuvsRMMAlloc(*res, (void**) &distances_refined_d, sizeof(float) * n_queries * topk_refined); + + DLManagedTensor neighbors_refined_tensor; + int64_t neighbors_refined_shape[2] = {n_queries, topk_refined}; + int_tensor_initialize(neighbors_refined_d, neighbors_refined_shape, &neighbors_refined_tensor); + + DLManagedTensor distances_refined_tensor; + int64_t distances_refined_shape[2] = {n_queries, topk_refined}; + float_tensor_initialize(distances_refined_d, distances_refined_shape, &distances_refined_tensor); + + // Note, refinement requires the original dataset and the queries. + // Don't forget to specify the same distance metric as used by the index. 
+ cuvsError_t refine_status = cuvsRefine(*res, dataset_tensor, queries_tensor, + &neighbors_tensor, index_params->metric, + &neighbors_refined_tensor, &distances_refined_tensor); + if (refine_status != CUVS_SUCCESS) { + printf("%s.\n", cuvsGetLastErrorText()); + exit(-1); + } + + int64_t *neighbors_refine = (int64_t *)malloc(n_queries * topk_refined * sizeof(int64_t)); + float *distances_refine = (float *)malloc(n_queries * topk_refined * sizeof(float)); + memset(neighbors_refine, 0, n_queries * topk_refined * sizeof(int64_t)); + memset(distances_refine, 0, n_queries * topk_refined * sizeof(float)); + + cudaMemcpy(neighbors_refine, neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined, cudaMemcpyDefault); + cudaMemcpy(distances_refine, distances_refined_d, sizeof(float) * n_queries * topk_refined, cudaMemcpyDefault); + + printf("\nRefined results:\n"); + print_results(neighbors, distances, 2, topk_refined); + + free(distances_refine); + free(neighbors_refine); + + free(distances); + free(neighbors); + + cuvsRMMFree(*res, neighbors_refined_d, sizeof(int64_t) * n_queries * topk_refined); + cuvsRMMFree(*res, distances_refined_d, sizeof(float) * n_queries * topk_refined); + + cuvsRMMFree(*res, neighbors_d, sizeof(int64_t) * n_queries * topk); + cuvsRMMFree(*res, distances_d, sizeof(float) * n_queries * topk); + + cuvsIvfPqSearchParamsDestroy(search_params); + cuvsIvfPqIndexDestroy(index); + cuvsIvfPqIndexParamsDestroy(index_params); +} + +int main() { + // Create input arrays. 
+ int64_t n_samples = 10000; + int64_t n_dim = 3; + int64_t n_queries = 10; + float *dataset = (float *)malloc(n_samples * n_dim * sizeof(float)); + float *queries = (float *)malloc(n_queries * n_dim * sizeof(float)); + generate_dataset(dataset, n_samples, n_dim, -10.0, 10.0); + generate_dataset(queries, n_queries, n_dim, -1.0, 1.0); + + // Create a cuvsResources_t object + cuvsResources_t res; + cuvsResourcesCreate(&res); + + // Allocate memory for `dataset` + float *dataset_d; + cuvsRMMAlloc(res, (void**) &dataset_d, sizeof(float) * n_samples * n_dim); + // Use DLPack to represent `dataset_d` as a tensor + cudaMemcpy(dataset_d, dataset, sizeof(float) * n_samples * n_dim, cudaMemcpyDefault); + + DLManagedTensor dataset_tensor; + int64_t dataset_shape[2] = {n_samples,n_dim}; + float_tensor_initialize(dataset_d, dataset_shape, &dataset_tensor); + + // Allocate memory for `queries` + float *queries_d; + cuvsRMMAlloc(res, (void**) &queries_d, sizeof(float) * n_queries * n_dim); + + // Use DLPack to represent `queries` as tensors + cudaMemcpy(queries_d, queries, sizeof(float) * n_queries * n_dim, cudaMemcpyDefault); + + DLManagedTensor queries_tensor; + int64_t queries_shape[2] = {n_queries, n_dim}; + float_tensor_initialize(queries_d, queries_shape, &queries_tensor); + + // Simple build and search example. + ivf_pq_build_search(&res, &dataset_tensor, &queries_tensor); + + cuvsRMMFree(res, queries_d, sizeof(float) * n_queries * n_dim); + cuvsRMMFree(res, dataset_d, sizeof(float) * n_samples * n_dim); + cuvsResourcesDestroy(res); + free(dataset); + free(queries); +} From d296d811e3d0f9917068c9d5d2ef04fccacdcd08 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 29 Oct 2024 18:03:51 -0500 Subject: [PATCH 14/47] print sccache stats in builds (#413) Contributes to https://github.com/rapidsai/build-planning/issues/111 Proposes some small packaging/CI changes, matching similar changes being made across RAPIDS.
* printing `sccache` stats to CI logs * reducing `pip`'s verbosity in wheel building scripts * updating to the latest `rapids-dependency-file-generator` (v1.16.0) * always explicitly specifying `cpp` / `python` in calls to `rapids-upload-wheels-to-s3` * modifying `dependencies.yaml` to match RAPIDS-wide naming conventions ## Notes for Reviewers This originally also ran wheel builds with `--no-build-isolation`, but I reverted that based on https://github.com/rapidsai/build-planning/issues/108#issuecomment-2436764212. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cuvs/pull/413 --- .pre-commit-config.yaml | 2 +- ci/build_cpp.sh | 4 ++++ ci/build_python.sh | 10 ++++++++++ ci/build_wheel.sh | 16 +++++++++++++--- dependencies.yaml | 10 +++++----- 5 files changed, 33 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 439b42959..f4fdf202e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -113,7 +113,7 @@ repos: cpp/cmake/modules/FindAVX\.cmake| - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 7bc0be5a7..db4c496cc 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,6 +15,10 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/libcuvs +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index deb67e91c..3241a2c2b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -24,6 +24,8 @@ version=$(rapids-generate-version) export RAPIDS_PACKAGE_VERSION=${version} echo "${version}" > VERSION +sccache --zero-stats + # TODO: Remove `--no-test` flags once 
importing on a CPU # node works correctly rapids-conda-retry mambabuild \ @@ -31,6 +33,9 @@ rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/cuvs +sccache --show-adv-stats +sccache --zero-stats + # Build cuvs-bench for each cuda and python version rapids-conda-retry mambabuild \ --no-test \ @@ -38,6 +43,9 @@ rapids-conda-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cuvs-bench +sccache --show-adv-stats +sccache --zero-stats + # Build cuvs-bench-cpu only in CUDA 12 jobs since it only depends on python # version RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" @@ -47,6 +55,8 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cuvs-bench-cpu + + sccache --show-adv-stats fi rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index d1030276f..4994374a8 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -32,10 +32,20 @@ case "${RAPIDS_CUDA_VERSION}" in ;; esac -# Hardcode the output dir -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +rapids-logger "Building '${package_name}' wheel" + +sccache --zero-stats + +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . 
+ +sccache --show-adv-stats mkdir -p final_dist python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/* -RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist diff --git a/dependencies.yaml b/dependencies.yaml index a68a550bb..cf9b68c8a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -74,14 +74,14 @@ files: - rapids_build - cuda - rust - py_build_py_cuvs: + py_build_cuvs: output: pyproject pyproject_dir: python/cuvs extras: table: build-system includes: - build - py_rapids_build_py_cuvs: + py_rapids_build_cuvs: output: pyproject pyproject_dir: python/cuvs extras: @@ -90,7 +90,7 @@ files: includes: - rapids_build - build_py_cuvs - py_run_py_cuvs: + py_run_cuvs: output: pyproject pyproject_dir: python/cuvs extras: @@ -99,7 +99,7 @@ files: - cuda_wheels - run_py_cuvs - depends_on_pylibraft - py_test_py_cuvs: + py_test_cuvs: output: pyproject pyproject_dir: python/cuvs extras: @@ -116,7 +116,7 @@ files: table: build-system includes: - rapids_build_setuptools - py_rapids_build_py_cuvs_bench: + py_rapids_build_cuvs_bench: output: pyproject pyproject_dir: python/cuvs_bench extras: From b422cbeec92fe925ee59f5f966ac9834440200e2 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 30 Oct 2024 11:20:31 -0400 Subject: [PATCH 15/47] Add ci run_ scripts needed for build infra (#434) These `run_*` scripts are needed by the build infra team and bring the cuvs project in line with the rest of RAPIDS Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cuvs/pull/434 --- ci/run_ctests.sh | 9 +++++++++ ci/run_cuvs_pytests.sh | 9 +++++++++ 2 files changed, 18 insertions(+) create mode 100755 ci/run_ctests.sh create mode 100755 ci/run_cuvs_pytests.sh diff --git a/ci/run_ctests.sh 
b/ci/run_ctests.sh new file mode 100755 index 000000000..6bf83961b --- /dev/null +++ b/ci/run_ctests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support customizing the ctests' install location +cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcuvs/" + +ctest --output-on-failure --no-tests=error "$@" diff --git a/ci/run_cuvs_pytests.sh b/ci/run_cuvs_pytests.sh new file mode 100755 index 000000000..4de8927b1 --- /dev/null +++ b/ci/run_cuvs_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cuvs_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuvs/cuvs + +pytest --cache-clear --verbose "$@" tests From 6041a81ce8e534ac79f5b27c595c9231b88d1d10 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:25:10 +0100 Subject: [PATCH 16/47] Enable NVTX in cuvs-cagra-search component (#439) Since parts of CAGRA code have been separated into a static library component `cuvs-cagra-search` (to selectively enable CUDA separable compilation on them), the NVTX flags are not passed to the affected sources anymore. This PR fixes that. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Micka (https://github.com/lowener) URL: https://github.com/rapidsai/cuvs/pull/439 --- cpp/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 746245791..e56e21383 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -603,6 +603,9 @@ SECTIONS # This enables NVTX within the project with no option to disable it downstream.
target_link_libraries(cuvs PUBLIC CUDA::nvtx3) target_compile_definitions(cuvs PUBLIC NVTX_ENABLED) + + target_link_libraries(cuvs-cagra-search PUBLIC CUDA::nvtx3) + target_compile_definitions(cuvs-cagra-search PUBLIC NVTX_ENABLED) else() # Allow enable NVTX downstream if not set here. This creates a new option at build/install time, # which is set by default to OFF, but can be enabled in the dependent project. From 71deb26c457bbf398c9af0142740aefadf83220a Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:18:32 +0100 Subject: [PATCH 17/47] Fix index overflow in edge cases of CAGRA graph optimize (#435) Force `input_graph_degree`, `output_graph_degree`, and `graph_size` variables to `uint64_t`. Before the PR, they've been `uint32_t`, and the product of them could overflow. This would lead to `cudaMemsetAsync` not filling in a large fraction of the graph. It's not known whether this bug has surfaced for anyone until now, but it's better to be safe than sorry. The change shouldn't have any impact on performance. Authors: - Artem M. 
Chirkin (https://github.com/achirkin) Approvers: - Micka (https://github.com/lowener) URL: https://github.com/rapidsai/cuvs/pull/435 --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 43bf1ba2b..4253cb781 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -1076,11 +1076,11 @@ void optimize( "Each input array is expected to have the same number of rows"); RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1), "output graph cannot have more columns than input graph"); - const uint32_t input_graph_degree = knn_graph.extent(1); - const uint32_t output_graph_degree = new_graph.extent(1); + const uint64_t input_graph_degree = knn_graph.extent(1); + const uint64_t output_graph_degree = new_graph.extent(1); + const uint64_t graph_size = new_graph.extent(0); auto input_graph_ptr = knn_graph.data_handle(); auto output_graph_ptr = new_graph.data_handle(); - const IdxT graph_size = new_graph.extent(0); // MST optimization auto mst_graph_num_edges = raft::make_host_vector(graph_size); @@ -1148,7 +1148,7 @@ void optimize( constexpr int MAX_DEGREE = 1024; if (input_graph_degree > MAX_DEGREE) { RAFT_FAIL( - "The degree of input knn graph is too large (%u). " + "The degree of input knn graph is too large (%zu). 
" "It must be equal to or smaller than %d.", input_graph_degree, 1024); @@ -1217,11 +1217,12 @@ void optimize( assert(next_num_detour != std::numeric_limits::max()); num_detour = next_num_detour; } - RAFT_EXPECTS(pk == output_graph_degree, - "Couldn't find the output_graph_degree (%u) smallest detourable count nodes for " - "node %lu in the rank-based node reranking process", - output_graph_degree, - static_cast(i)); + RAFT_EXPECTS( + pk == output_graph_degree, + "Couldn't find the output_graph_degree (%lu) smallest detourable count nodes for " + "node %lu in the rank-based node reranking process", + output_graph_degree, + i); } const double time_prune_end = cur_time(); @@ -1317,7 +1318,7 @@ void optimize( uint32_t kf = 0; uint32_t k = mst_graph_num_edges_ptr[i]; - const uint64_t num_protected_edges = max(k, output_graph_degree / 2); + const auto num_protected_edges = std::max(k, output_graph_degree / 2); assert(num_protected_edges <= output_graph_degree); if (num_protected_edges == output_graph_degree) continue; @@ -1342,7 +1343,7 @@ void optimize( assert(kf <= output_graph_degree); // Replace some edges of the output graph with edges of the reverse graph. 
- uint32_t kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); + auto kr = std::min(rev_graph_count.data_handle()[i], output_graph_degree); while (kr) { kr -= 1; if (my_rev_graph[kr] < graph_size) { From 9bea21585ae121194c4df49e2ad4ce1bd16e3408 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 4 Nov 2024 14:12:05 -0500 Subject: [PATCH 18/47] call `enable_testing` in root CMakeLists.txt (#437) Required to allow `ctest` to be called in the root of the build directory Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cuvs/pull/437 --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e56e21383..c493af488 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -765,6 +765,7 @@ endif() # * build test executable ---------------------------------------------------- if(BUILD_TESTS) + enable_testing() add_subdirectory(internal) add_subdirectory(test) endif() From 3ac206364afdd9f413de2175763cc37fdefd58b3 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 4 Nov 2024 14:24:59 -0500 Subject: [PATCH 19/47] Don't presume pointers location infers usability. (#441) Here are the results of looking at the cudaPointerGetAttributes of different allocation types on Grace + Hopper. Allocations made with `malloc` are still usable on the GPU.
``` cudaPointerGetAttributes attributes malloc ptr is_dev_ptr -> 1 is_host_ptr -> 1 memory loc -> unregistered cudaPointerGetAttributes attributes cudaMalloc ptr is_dev_ptr -> 1 is_host_ptr -> 0 memory loc -> device cudaPointerGetAttributes attributes cudaMallocManaged cudaMemAttachGlobal ptr is_dev_ptr -> 1 is_host_ptr -> 1 memory loc -> managed ``` Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/cuvs/pull/441 --- cpp/src/neighbors/detail/ann_utils.cuh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 29f790ec5..652d41c85 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -63,14 +63,9 @@ struct pointer_residency_count { auto [on_device, on_host] = pointer_residency_count::run(ptrs...); cudaPointerAttributes attr; RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, ptr)); - switch (attr.type) { - case cudaMemoryTypeUnregistered: return std::make_tuple(on_device, on_host + 1); - case cudaMemoryTypeHost: - return std::make_tuple(on_device + int(attr.devicePointer == ptr), on_host + 1); - case cudaMemoryTypeDevice: return std::make_tuple(on_device + 1, on_host); - case cudaMemoryTypeManaged: return std::make_tuple(on_device + 1, on_host + 1); - default: return std::make_tuple(on_device, on_host); - } + if (attr.devicePointer || attr.type == cudaMemoryTypeDevice) { ++on_device; } + if (attr.hostPointer || attr.type == cudaMemoryTypeUnregistered) { ++on_host; } + return std::make_tuple(on_device, on_host); } }; From eff2cc5ccd83ba25083436d74f8ae3a3d6836f97 Mon Sep 17 00:00:00 2001 From: "Artem M.
Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Mon, 4 Nov 2024 20:49:04 +0100 Subject: [PATCH 20/47] BUG: CAGRA multi-cta illegal access with bad queries (#438) The CAGRA search kernel fails with `cudaErrorIllegalAddress` under some conditions, which are specified in the new test case. According to compute-sanitizer, the illegal access happens in [compute_distance_to_child_nodes(...) function](https://github.com/rapidsai/cuvs/blob/b422cbeec92fe925ee59f5f966ac9834440200e2/cpp/src/neighbors/detail/cagra/device_common.hpp#L185) accessing the graph. The `parent_id` variable in that function sometimes appears to be out-of-bounds (larger than the graph size / number of records); it's invalid and the same value seems to be reported by multiple threads, yet it is neither `invalid_index`, nor `index_msb_1_mask`, nor any derivative of the two. Further observations: - I've checked the graph just before calling the search kernel; it does not contain any invalid indices. - One should disable any fancy pool memory resources to make it easier to reproduce the error (so that `parent_id` does not hit other user allocations in the pool) - It seems important that the query yields infinite distance to the dataset. - Running the search with a newly created `raft::resources` seems to increase the chance of hitting the error - Even with all conditions satisfied, the error does not reproduce every time... ### Reproducer ``` ./build.sh -n tests --limit-tests=NEIGHBORS_ANN_CAGRA_TEST && ./cpp/build/gtests/NEIGHBORS_ANN_CAGRA_TEST --gtest_filter=AnnCagraBugMultiCTACrash* ``` Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Corey J.
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/438 --- .../neighbors/detail/cagra/device_common.hpp | 5 +- cpp/test/CMakeLists.txt | 2 +- .../ann_cagra/bug_multi_cta_crash.cu | 108 ++++++++++++++++++ 3 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu diff --git a/cpp/src/neighbors/detail/cagra/device_common.hpp b/cpp/src/neighbors/detail/cagra/device_common.hpp index b7cb9c42d..7ec3d4d9e 100644 --- a/cpp/src/neighbors/detail/cagra/device_common.hpp +++ b/cpp/src/neighbors/detail/cagra/device_common.hpp @@ -120,7 +120,7 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( for (uint32_t i = threadIdx.x >> team_size_bits; i < max_i; i += (blockDim.x >> team_size_bits)) { const bool valid_i = (i < num_pickup); - IndexT best_index_team_local; + IndexT best_index_team_local = raft::upper_bound(); DistanceT best_norm2_team_local = raft::upper_bound(); for (uint32_t j = 0; j < num_distilation; j++) { // Select a node randomly and compute the distance to it @@ -145,7 +145,8 @@ RAFT_DEVICE_INLINE_FUNCTION void compute_distance_to_random_nodes( const unsigned lane_id = threadIdx.x & ((1u << team_size_bits) - 1u); if (valid_i && lane_id == 0) { - if (hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { + if (best_index_team_local != raft::upper_bound() && + hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) { result_distances_ptr[i] = best_norm2_team_local; result_indices_ptr[i] = best_index_team_local; } else { diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 60007825c..1ed8466b3 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -137,6 +137,7 @@ if(BUILD_TESTS) NAME NEIGHBORS_ANN_CAGRA_TEST PATH + neighbors/ann_cagra/bug_multi_cta_crash.cu neighbors/ann_cagra/test_float_uint32_t.cu neighbors/ann_cagra/test_half_uint32_t.cu neighbors/ann_cagra/test_int8_t_uint32_t.cu @@ -242,7 
+243,6 @@ if(TARGET cuvs::c_api) target_compile_definitions(NEIGHBORS_HNSW_TEST PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) endif() - add_executable(cuvs_c_test core/c_api.c) target_link_libraries(cuvs_c_test PUBLIC cuvs::c_api) diff --git a/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu b/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu new file mode 100644 index 000000000..6f4aa059e --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/bug_multi_cta_crash.cu @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../ann_cagra.cuh" + +#include + +#include +#include + +#include + +namespace cuvs::neighbors::cagra { + +class AnnCagraBugMultiCTACrash : public ::testing::TestWithParam { + public: + using data_type = half; + + protected: + void run() + { + cagra::index_params cagra_index_params; + cagra_index_params.graph_degree = 32; + cagra_index_params.intermediate_graph_degree = 48; + + auto cagra_index = + cagra::build(res, cagra_index_params, raft::make_const_mdspan(dataset->view())); + raft::resource::sync_stream(res); + + cagra::search_params cagra_search_params; + cagra_search_params.itopk_size = 32; + cagra_search_params.thread_block_size = 256; + cagra_search_params.search_width = 1; + cagra_search_params.max_iterations = 0; + cagra_search_params.algo = ::testing::TestWithParam::GetParam(); + + // NOTE: when using one resource/stream for everything, the bug is NOT reproducible + raft::resources res_search; + cagra::search(res_search, + cagra_search_params, + cagra_index, + raft::make_const_mdspan(queries->view()), + neighbors->view(), + distances->view()); + + raft::resource::sync_stream(res_search); + } + + void SetUp() override + { + dataset.emplace(raft::make_device_matrix(res, n_samples, n_dim)); + queries.emplace(raft::make_device_matrix(res, n_queries, n_dim)); + neighbors.emplace(raft::make_device_matrix(res, n_queries, k)); + distances.emplace(raft::make_device_matrix(res, n_queries, k)); + raft::random::RngState r(1234ULL); + InitDataset(res, dataset->data_handle(), n_samples, n_dim, metric, r); + // NOTE: when initializing queries with "normal" data, the bug is NOT reproducible + raft::linalg::map( + res, queries->view(), raft::const_op{raft::upper_bound()}); + // InitDataset(res, queries->data_handle(), n_queries, n_dim, metric, r); + raft::resource::sync_stream(res); + } + + void TearDown() override + { + dataset.reset(); + queries.reset(); + neighbors.reset(); + distances.reset(); + raft::resource::sync_stream(res); + } + + 
private: + raft::resources res; + std::optional> dataset = std::nullopt; + std::optional> queries = std::nullopt; + std::optional> neighbors = std::nullopt; + std::optional> distances = std::nullopt; + + constexpr static int64_t n_samples = 1183514; + constexpr static int64_t n_dim = 100; + constexpr static int64_t n_queries = 30; + constexpr static int64_t k = 10; + constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; +}; + +TEST_P(AnnCagraBugMultiCTACrash, AnnCagraBugMultiCTACrash) { this->run(); } + +INSTANTIATE_TEST_CASE_P(AnnCagraBugMultiCTACrashReproducer, + AnnCagraBugMultiCTACrash, + ::testing::Values(cagra::search_algo::MULTI_CTA)); + +} // namespace cuvs::neighbors::cagra From 6b35b65923933e6396ae61322ce2e9b0772eea4a Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Wed, 6 Nov 2024 08:24:44 +0100 Subject: [PATCH 21/47] CAGRA tech debt: distance descriptor and workspace memory (#436) This PR introduces two changes: 1. Refactor `dataset_descriptor_host` to pass and cache it by value while keeping the state in a thread-safe object held by a shared pointer. Before this, the descriptor host itself was kept in a shared pointer in the LRU cache and was passed by reference; as a result, it could in theory die due to cache eviction while still being used via references to it. 2. Adjust the temporary buffers to always use the workspace resource in all CAGRA algo implementations (as of now, only SINGLE_CTA algo does this; the PR expands the change to MULTI_CTA and MULTI_KERNEL). Both of the changes are required for effective use of stream-ordered dynamic batching https://github.com/rapidsai/cuvs/pull/261 (1. fixes crashes and 2. fixes thread-blocking behavior). Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Corey J.
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/436 --- .../neighbors/detail/cagra/cagra_search.cuh | 4 +- .../detail/cagra/compute_distance.hpp | 77 +++++++++++++------ cpp/src/neighbors/detail/cagra/factory.cuh | 20 ++--- .../detail/cagra/search_multi_cta.cuh | 12 +-- .../detail/cagra/search_multi_kernel.cuh | 53 +++++++------ .../neighbors/detail/cagra/search_plan.cuh | 2 +- 6 files changed, 100 insertions(+), 68 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 95c158675..5778d85a6 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -151,7 +151,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *strided_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); @@ -161,7 +161,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *vpq_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 297eb1f55..7eb798459 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -31,8 +31,10 @@ #include #include +#include #include #include +#include #include #include @@ -232,52 +234,77 @@ struct 
alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { */ template struct dataset_descriptor_host { - using dev_descriptor_t = dataset_descriptor_base_t; - using dd_ptr_t = std::shared_ptr; - using init_f = - std::tuple, size_t>; + using dev_descriptor_t = dataset_descriptor_base_t; uint32_t smem_ws_size_in_bytes = 0; uint32_t team_size = 0; + struct state { + using ready_t = std::tuple; + using init_f = + std::tuple, size_t>; + + std::mutex mutex; + std::atomic ready; // Not sure if std::holds_alternative is thread-safe + std::variant value; + + template + state(InitF init, size_t size) : ready{false}, value{std::make_tuple(init, size)} + { + } + + ~state() noexcept + { + if (std::holds_alternative(value)) { + auto& [ptr, stream] = std::get(value); + RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); + } + } + + void eval(rmm::cuda_stream_view stream) + { + std::lock_guard lock(mutex); + if (std::holds_alternative(value)) { + auto& [fun, size] = std::get(value); + dev_descriptor_t* ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream)); + fun(ptr, stream); + value = std::make_tuple(ptr, stream); + ready.store(true, std::memory_order_release); + } + } + + auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t* + { + if (!ready.load(std::memory_order_acquire)) { eval(stream); } + return std::get<0>(std::get(value)); + } + }; + template dataset_descriptor_host(const DescriptorImpl& dd_host, InitF init) - : value_{std::make_tuple(init, sizeof(DescriptorImpl))}, + : value_{std::make_shared(init, sizeof(DescriptorImpl))}, smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, team_size{dd_host.team_size()} { } + dataset_descriptor_host() = default; + /** * Return the device pointer, possibly evaluating it in the given thread. 
*/ [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) const -> const dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } + [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) -> dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } private: - mutable std::variant value_; - - static auto eval(init_f init, rmm::cuda_stream_view stream) -> dd_ptr_t - { - using raft::RAFT_NAME; - auto& [fun, size] = init; - dd_ptr_t dev_ptr{ - [stream, s = size]() { - dev_descriptor_t* p; - RAFT_CUDA_TRY(cudaMallocAsync(&p, s, stream)); - return p; - }(), - [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }}; - fun(dev_ptr.get(), stream); - return dev_ptr; - } + mutable std::shared_ptr value_; }; /** diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index abc907da5..e6e7ff64f 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -135,11 +135,9 @@ template struct store { /** Number of descriptors to cache. 
*/ static constexpr size_t kDefaultSize = 100; - raft::cache::lru, - std::shared_ptr>> - value{kDefaultSize}; + raft::cache:: + lru, dataset_descriptor_host> + value{kDefaultSize}; }; } // namespace descriptor_cache @@ -159,20 +157,18 @@ auto dataset_descriptor_init_with_cache(const raft::resources& res, const cagra::search_params& params, const DatasetT& dataset, cuvs::distance::DistanceType metric) - -> const dataset_descriptor_host& + -> dataset_descriptor_host { - using desc_t = dataset_descriptor_host; - auto key = descriptor_cache::make_key(params, dataset, metric); + auto key = descriptor_cache::make_key(params, dataset, metric); auto& cache = raft::resource::get_custom_resource>(res) ->value; - std::shared_ptr desc{nullptr}; + dataset_descriptor_host desc; if (!cache.get(key, &desc)) { - desc = std::make_shared( - std::move(dataset_descriptor_init(params, dataset, metric))); + desc = dataset_descriptor_init(params, dataset, metric); cache.set(key, desc); } - return *desc; + return desc; } }; // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index 0003f2495..ecfd856f1 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -93,10 +93,10 @@ struct search : public search_plan_impl intermediate_indices; - rmm::device_uvector intermediate_distances; + lightweight_uvector intermediate_indices; + lightweight_uvector intermediate_distances; size_t topk_workspace_size; - rmm::device_uvector topk_workspace; + lightweight_uvector topk_workspace; search(raft::resources const& res, search_params params, @@ -105,9 +105,9 @@ struct search : public search_plan_impl<<<1, 1, 0, cuda_stream>>>(host_ptr, dev_ptr); } +template +auto get_value(const T* const dev_ptr, cudaStream_t stream) -> T +{ + T value; + RAFT_CUDA_TRY(cudaMemcpyAsync(&value, dev_ptr, sizeof(value), cudaMemcpyDefault, stream)); + 
RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + return value; +} + // MAX_DATASET_DIM : must equal to or greater than dataset_dim template RAFT_KERNEL random_pickup_kernel( @@ -609,18 +618,18 @@ struct search : search_plan_impl { using base_type::num_seeds; size_t result_buffer_allocation_size; - rmm::device_uvector result_indices; // results_indices_buffer - rmm::device_uvector result_distances; // result_distances_buffer - rmm::device_uvector parent_node_list; - rmm::device_uvector topk_hint; - rmm::device_scalar terminate_flag; // dev_terminate_flag, host_terminate_flag.; - rmm::device_uvector topk_workspace; + lightweight_uvector result_indices; // results_indices_buffer + lightweight_uvector result_distances; // result_distances_buffer + lightweight_uvector parent_node_list; + lightweight_uvector topk_hint; + lightweight_uvector terminate_flag; // dev_terminate_flag, host_terminate_flag.; + lightweight_uvector topk_workspace; // temporary storage for _find_topk - rmm::device_uvector input_keys_storage; - rmm::device_uvector output_keys_storage; - rmm::device_uvector input_values_storage; - rmm::device_uvector output_values_storage; + lightweight_uvector input_keys_storage; + lightweight_uvector output_keys_storage; + lightweight_uvector input_values_storage; + lightweight_uvector output_values_storage; search(raft::resources const& res, search_params params, @@ -629,16 +638,16 @@ struct search : search_plan_impl { int64_t graph_degree, uint32_t topk) : base_type(res, params, dataset_desc, dim, graph_degree, topk), - result_indices(0, raft::resource::get_cuda_stream(res)), - result_distances(0, raft::resource::get_cuda_stream(res)), - parent_node_list(0, raft::resource::get_cuda_stream(res)), - topk_hint(0, raft::resource::get_cuda_stream(res)), - topk_workspace(0, raft::resource::get_cuda_stream(res)), - terminate_flag(raft::resource::get_cuda_stream(res)), - input_keys_storage(0, raft::resource::get_cuda_stream(res)), - output_keys_storage(0, 
raft::resource::get_cuda_stream(res)), - input_values_storage(0, raft::resource::get_cuda_stream(res)), - output_values_storage(0, raft::resource::get_cuda_stream(res)) + result_indices(res), + result_distances(res), + parent_node_list(res), + topk_hint(res), + topk_workspace(res), + terminate_flag(res), + input_keys_storage(res), + output_keys_storage(res), + input_values_storage(res), + output_values_storage(res) { set_params(res); } @@ -662,7 +671,7 @@ struct search : search_plan_impl { itopk_size, max_queries, result_buffer_size, utils::get_cuda_data_type()); RAFT_LOG_DEBUG("# topk_workspace_size: %lu", topk_workspace_size); topk_workspace.resize(topk_workspace_size, raft::resource::get_cuda_stream(res)); - + terminate_flag.resize(1, raft::resource::get_cuda_stream(res)); hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res)); } @@ -847,7 +856,7 @@ struct search : search_plan_impl { stream); // termination (2) - if (iter + 1 >= min_iterations && terminate_flag.value(stream)) { + if (iter + 1 >= min_iterations && get_value(terminate_flag.data(), stream)) { iter++; break; } diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index f23b96631..99254aa50 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -151,7 +151,7 @@ struct search_plan_impl : public search_plan_impl_base { lightweight_uvector hashmap; lightweight_uvector num_executed_iterations; // device or managed? lightweight_uvector dev_seed; - const dataset_descriptor_host& dataset_desc; + dataset_descriptor_host dataset_desc; search_plan_impl(raft::resources const& res, search_params params, From 2d4afb515e3b509152adc652e3a9d97816b7bc3b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 7 Nov 2024 11:23:15 -0500 Subject: [PATCH 22/47] Put a ceiling on cuda-python (#445) This project is incompatible with newer versions of `cuda-python`. 
This puts ceilings of `<=11.8.3` (CUDA 11) and `<=12.6.0` (CUDA 12) on that library. Those ceilings should be removed and replaced with `!=` constraints once new releases of `cuda-python` are up that this project is compatible with. See https://github.com/rapidsai/build-planning/issues/116 for more information. Authors: - Bradley Dice (https://github.com/bdice) - James Lamb (https://github.com/jameslamb) Approvers: - James Lamb (https://github.com/jameslamb) - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/cuvs/pull/445 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cuvs/meta.yaml | 8 +++++--- cpp/test/neighbors/ann_ivf_flat.cuh | 1 + dependencies.yaml | 4 ++-- python/cuvs/pyproject.toml | 1 + 12 files changed, 17 insertions(+), 13 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index aa12b4ed6..80bfb0c24 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 494ec394d..07937726c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ 
dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index f4f03ccee..b7fd6fcfa 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index a295e93f4..83a457465 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index a73839457..21cb98180 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 3f869da9a..432509bcb 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - 
cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 407fb6058..0c5043ac2 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index 81943b184..cbb22333c 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml index e7e2daf0c..560c95feb 100644 --- a/conda/recipes/cuvs/meta.yaml +++ b/conda/recipes/cuvs/meta.yaml @@ -26,6 +26,7 @@ build: - {{ compiler('cuda') }} - cuda-cudart-dev {% endif %} + - cuda-python requirements: build: @@ -42,10 +43,10 @@ requirements: - {{ stdlib("c") }} host: {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 - cudatoolkit {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,<=12.6.0 - cuda-cudart-dev {% endif %} - cuda-version ={{ cuda_version }} @@ -60,13 +61,14 @@ requirements: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - pylibraft 
{{ minor_version }} - libcuvs {{ version }} - python x.x - - cuda-python - numpy >=1.23,<3.0a0 tests: diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 8cc46b2f7..23d84ca98 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/dependencies.yaml b/dependencies.yaml index cf9b68c8a..e909ad0dc 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -213,11 +213,11 @@ dependencies: - matrix: cuda: "12.*" packages: - - &cuda_python12 cuda-python>=12.0,<13.0a0 + - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: cuda: "11.*" packages: - - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3 - matrix: packages: - &cuda_python cuda-python diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index bf62f5adf..30d784c67 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -136,4 +136,5 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" [tool.pytest.ini_options] filterwarnings = [ "error", + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning" ] From e559d581acec030d8e71833aee1295fe442facb3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 7 Nov 2024 21:04:57 -0500 Subject: [PATCH 23/47] Adding tech stack to docs (#448) Authors: - Corey J. 
Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/cuvs/pull/448 --- README.md | 9 ++++++ docs/source/index.rst | 72 +++++++++++++++++++++++++++++++++++------- img/tech_stack.png | Bin 0 -> 125904 bytes 3 files changed, 70 insertions(+), 11 deletions(-) create mode 100644 img/tech_stack.png diff --git a/README.md b/README.md index c1b74a9e8..572e8d098 100755 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Finally, faster vector search enables interactions between dense vectors and gra Below are some common use-cases for vector search + - ### Semantic search - Generative AI & Retrieval augmented generation (RAG) - Recommender systems @@ -68,6 +69,14 @@ There are several benefits to using cuVS and GPUs for vector search, including In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale. +## cuVS Technology Stack + +cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +![cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU](img/tech_stack.png "cuVS Technology Stack") + + + ## Installing cuVS cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python) and [pip](https://pip.pypa.io/en/stable/). 
Different packages are available for the different languages supported by cuVS: diff --git a/docs/source/index.rst b/docs/source/index.rst index 647061ae5..286836c18 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,19 +1,8 @@ cuVS: Vector Search and Clustering on the GPU ============================================= - Welcome to cuVS, the premier library for GPU-accelerated vector search and clustering! cuVS provides several core building blocks for constructing new algorithms, as well as end-to-end vector search and clustering algorithms for use either standalone or through a growing list of :doc:`integrations `. -There are several benefits to using cuVS and GPUs for vector search, including - -#. Fast index build -#. Latency critical and high throughput search -#. Parameter tuning -#. Cost savings -#. Interoperability (build on GPU, deploy on CPU) -#. Multiple language support -#. Building blocks for composing new or accelerating existing algorithms - Useful Resources ################ @@ -26,6 +15,67 @@ Useful Resources - `Issue tracker `_: Report issues or request features. + +What is cuVS? +############# + +cuVS contains state-of-the-art implementations of several algorithms for running approximate and exact nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering. + +Vector search is an information retrieval method that has been growing in popularity over the past few years, partly because of the rising importance of multimedia embeddings created from unstructured data and the need to perform semantic search on the embeddings to find items which are semantically similar to each other. 
+ +Vector search is also used in *data mining and machine learning* tasks and comprises an important step in many *clustering* and *visualization* algorithms like `UMAP `_, `t-SNE `_, K-means, and `HDBSCAN `_. + +Finally, faster vector search enables interactions between dense vectors and graphs. Converting a pile of dense vectors into nearest neighbors graphs unlocks the entire world of graph analysis algorithms, such as those found in `GraphBLAS `_ and `cuGraph `_. + +Below are some common use-cases for vector search + +Semantic search +~~~~~~~~~~~~~~~ +- Generative AI & Retrieval augmented generation (RAG) +- Recommender systems +- Computer vision +- Image search +- Text search +- Audio search +- Molecular search +- Model training + + +Data mining +~~~~~~~~~~~ +- Clustering algorithms +- Visualization algorithms +- Sampling algorithms +- Class balancing +- Ensemble methods +- k-NN graph construction + +Why cuVS? +######### + +There are several benefits to using cuVS and GPUs for vector search, including + +1. Fast index build +2. Latency critical and high throughput search +3. Parameter tuning +4. Cost savings +5. Interoperability (build on GPU, deploy on CPU) +6. Multiple language support +7. Building blocks for composing new or accelerating existing algorithms + +In addition to the items above, cuVS shoulders the responsibility of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a delightful development experience, guaranteeing that any libraries, databases, or applications built on top of it will always be receiving the best performance and scale. + +cuVS Technology Stack +##################### + +cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +.. 
image:: ../../img/tech_stack.png + :width: 600 + :alt: cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU + + + Contents ######## diff --git a/img/tech_stack.png b/img/tech_stack.png new file mode 100644 index 0000000000000000000000000000000000000000..2b3eeedba99957e985ccea233c8970c396acadb0 GIT binary patch literal 125904 zcmc%xWmuI_7d4D-xw8UCRR?p|Pe#sW5DR+;J2Pe%6K69sdlyRw*Auu-A#f2h z^dfO*Gb2|k2YU)ND?2lYhM6e^8xMtogF6KW8wVE!8^>ENe(;k*NuJ`ps@gouoIM0W z0g;gqRrAO`TJhFZy?RDEoj2@S|B4anAU>I`!tvF(cw3s;!aYMdN|;;fX;^9GmmkAk zqsV#RGC>Uo^8|kM{1&zra8|+}vUR`)A6K_^J?Cv;Xrylzk`&!~eT3 z0v9zIfA6jFUQ26X7A^Ue;Qi6}_>t|u3-JG2 z!qk^buXlfZuUB}gSF1$$X=vcTeEIVK`hVZnxsvX-|1IeM_f?Apa{TXh8~=X^%m1|- zL^gfL6#;?KRU93>WScond)nv<9iqV6eakJswJ3fYfh|_W1l@3r@Ui{pCFkAR<_Gyt z)#cU2ICs5>gynK<^QQsnaIaA$HOYy#9t+p@GJlBBa^$@V(Gu_&X15U$RC^Fs51zK z?z1)eugdh$Gx44^a+0u8%38uAvfwk^K@*<`!WiX#<6mP&r&xdEen&EB8fY#oFwn8w zjBT!;==%BGnW!6Clth8N+h2s{rLI$AI4+z1!2?Fz*UGm)=+zohs2sBNPw{y@1aXYt zvIUYB;~a7;@A*J?2romT4_PC^eLoEjg+SiUGyW%R`thilP8{7DK2Dgf_{^H0 z{CS))1^l}y-M=E9$I2N<@#lW7VD=)6B{Jl*54qLGM31eOGS)?Pjy*f>bRwal#E}+s z3;&U!+(re1^5&Avk7geCqTy<2*&$#(9p)~H&SOZXFNt79s!)2EyIaN1$tT7^^}7vt zY-wm|R>5L2$_s9d`%>FPo(r+ACHVmH>+Fdv?*s}0J8sSlYj9&BG``BB?*IBNV=6elz0d*pW z^*2I2PHREgLNnLd*kW(-R@6bV4E}=elVa($L3-7(rd-2~l`@S>X=hEEMb=_h6jV)9 zz)eX0=hA)@ACYh(y*oIVMi)^_T1PT)+epYhU~)A0r7Si9OR$=Avm5%*PX=r6%8bua zX73+`D0e7O5=80jc99!+IMKr~*p_JG-wzAUL%emXkr4z!Fg9s7=R~eM5UWZ=amvIt z4qzn^`o*ritTxXy4&vK;V{TB=_?_B_!Zs1pZ6FFbh+0{Q2%AP#Y&`m(tnMdrZao;# zE%>CVw&J-zA@guD#=SDCSnoy%rDW1wJik~@&K$nWl#s@rQ$(~<5@?*K^hl2*GGx10 zqv05$qOttPHZf!H++h8Qe`k!1gtKcDPB#L*&M!^I!G_0o0P^U>$ye5Pt6XfuXjEEC zeiMs^Dz4d?y7k%pkCb*0@BD5xF&T1;tN&qy;GARUI;-CIxm3|VF~g|TG3J~Usk$xX zuFG!JA@S-ab~PSn#ypU^Iw#hl|W9)41IldaS!yVZ|OTILCp zZGtr4T^bIprev93+8Er{BOOMwDMyB+uo2C~@o6&V3MSqh+J1&~E@T!fq|F|Dxp#Ua zby!X0-%Zf+xQ;rXs9)ub4-=ZC`tCCc1-TTyZ~pOc{yJaOjV>Ashb-Fp)@-hN*!<@v 
z7n74a@idFUAcxC^R^;S{r&f7nimkLsrV$6-i&m0P@i7dqOKL~qvV&y!(*E|-ewTaPo{&;y(mLyL zew|ud2%Bc+EVxT;&RzGg9VtgN-n=DGJI&BkjEJeW~Wu()-aq!tZ;D>C3m8;Z|Ca7w(m`asn!I>AXE=Obqiq>Gj!W6Y4xW z3-J=EyP3HM9I5Rz31tOEEZ4!Z^UkhRva9;Xsl3|$4-`mRaY0Z4b?%f8zotKdgNJ16C80u8m#1eyY=0BHOYl zJxD%fVcQ=CSdEO#vT!n%#pCzy5@O^`Zsm`M`=V06WsFV-XWBprDvb`ag9sWa`ZmK& zGD+t32x>2#>If>(5ETYyWTOP(ZDVRlK0Cg8@v~wmeGuoq8NQ_wC7F3aZqr$zNcBSS zq_h6pd4~Is%kx1J8jDPM`J#oP^BsnFf~Ve1`Y(u6J!hur=Gf~gC_Zxh4Rmg>&z5NF zu*tV&Fgc_f(QzF!+nmypsb7j+f!ofw9sQv=qDDDp%PP@7bzRS_X)lvM>dY;-kr~G% znaW?(EI7=*Fk@@(x+wQFN}#Kwp!clul|7VHHTLRI)2DN1Za3Xnf$WX90kdFnI+s!Y z=y19_{cBCd$Bu-Q=09<|ux2BHZVId=nYR(Na?MJbxjO#jP7_jzvaGtxRnLKLa=0r& zt}y5-w6b$V@9aw~W92k?icL9XKMKjlTA42yGpKeN?A&^NOsk1$BKQN>uTreN-1nc| z`1MC5;|>bgxe@R*=%4JREhM|u^>7~vF@}CJ<8z57f0hjyE{n|fJYJc@h^HX`JQ`X}x$ZyjO=6|c+-*5|boCP!Q0TSF^(G&xSU=VNSU_ zY8KaE<-?#KtwC)TE&98ULmE{p1~v-0tfraLA%Nv0#?@ibu{;OAzjSsMW7Zh`V(I&x zGxtJ8CMmVT)AC^~9QG$OQgrTI4XYPfWFvI3gF&PUIeGjH$+$QPT(40wC+h}^SGc`Q z#|VRy#jYhw!#?38)O^)s8%m&23B6$;wj@(WzLUKetc&N{<~9=k1X+6sm2a~%Pe+|M zT$tO*MAM);+-Fa3&|+=F#39;ZM@`%57DU%k$+xGP;J=V!Cy<-{n%^wwRt!Pgsp!Uk zMKyNilm3}rD=VUKEX+I;=7P;9J_JE$aUQo>e7Lp}uUbAQG<%a})qrs@N}7T1l-}2d zz}%NnkWS&q`Dm@{7LC{B+vq^uY`=V7bS7+}TskeuLu``Ri;_U|4}!@r6g+#)CC?5~ zq^)*6Z4yil+5IlLOiA&Iv(d^XzT3+WzKO22{YuObdTh1Hmu5DGXgV(6Rrff?h1d8| zj8_=pa0StROF2F>Kuag5Dl|QqNfNnN{owXNtHX0GPo?;=An-xRd-FTKs{$7kt(DfvyLI7V!7G8lrfJMYX`m_ zCsR?^-JL7AOgHoRgJf~7k5o}qXv!`PXQ-@QnytH^Z<1JfsFoa^`C@LL={R?GcA}N6 z{HQ4hyJ~%?R>kL~H(GmB@T54!**QOlv#rVHkvO!b4AU}+6}o>EvFuuT<0PT}dt-5q z%;u<4oSiuOLv}jD-H1ch;cIFsgMGaP26-gFX`0#}vl&eO2K=1$`7++Y=d+CKJhUH* z@wv)=)YM&XA`JFo%++!1`f8qQCCXNQF?DS*B-(sUyk3tjt2O1Gwx~J3vu%u$B=zCY zMn>v&-4EJ*CSMt1S$(}N%eU6UQ^*tCe8uTGE5Aurtxa#EKlWtWHwx~4p;KZs%hv2k z=H%3Tf}!TjWiq)vdCd`OljK~RUG&%H_SQYSE_H^!b%(k(S*FNPS3+=lC%MMCAR#4f zhutoFlvz=Fz>H_^QtMzu{HJl+18&CXc{%Zj_@>)ly0h(_#O7R&2nScjHWBf)*`4;t zjLNm(7{)AzQH}-w2q8n8WZlr6I((ufx2$&h_*9y;)?%AmY^G&n8*^$cO^zxtGXVY3 zqT9P6^c_pi$%A`oS?lBxtpRBxG!*!s89R?SCC)ZR+s#(Tl|)oYnjNw>5s9IwX^R#N 
zNm1Q}1=E`5PW+teL|O~~(6Q?0NmQHNSJ=!rKf63(s?>2c?#vRX*J(44rrFm&hhR@} zS$oMNH#eknWJsPW(HUiuT+*5pvKMVl7hhmA4f?ibIbVD6lTwCt)V?^n*D)ixqtMq^KUK{()ryaDh! z#P>-TE?c(SS;hVk)`T||rD#`k)4Ma87yk9e!1|kK ziH(acmgqy#Wl1Z%)^hLo{QSEvn653(i$@@wFA-U|;7*KDurMRj>D1Bu zV@sPPHJr|*J4_==viWr-ZhmiZOMpJttc7RHVc)k9A+WBWdk=9iS@s=oFi%T6d?MQw z;(DkfS~6=+E6%)&bC;Vf3fs>u+jZY^wGq7W`)GSANJDC^+cYBcth)VbYcn1yX9#l2D$ipS68DVX72%HZUB;j3M%^ zdb4U@U8M--3Ke<@-xKyEqi8maiMy=yBK#tCxzk|^wf-^Hi)f}KTw!VJWzU)4<+*jS zM`N0n`eoTLq;?oSxqRx-&eYd{0W?;zap*ac~jkzHC;^nN*taP&aJXS%-oWRD}q^+WbjCB88Nos25O( zoW{yei!;zXYV0=FR(gd4ELNZwgtp9w8M@yX5}kJ1gOg8m5I9HvQVvKEWuVv9RvukwA&hTGQ_)L|94F z_BgTvh~6);aJHvuMup6n((6vh{6O49LhIamgMq!J%N}x3MKUgSH=uIOU(oMN(~%eH z=1?Mjufma7{EedERvVd817*b?d?iAf-8zf+tr zz$Q9*(RcTpK|;H3LtNLmxKOZRdXQ%9CO2+Yp_4AbhQp^~B&WZ5)I|o@5H6 z4$O>K90}hmA@7miWc~fFeQQUcLN+UUU5B}V_9|H6NhDHM*vOyo+JbUKyi<2XW_>j* z*sL^U>S6@`6_!g;tsf)nE3Drhr)@97tmn5`ynn>U#)f~3lW>JkYwLkEqQ2qKq_;LOM8cF zVGz@NXrhU>_&X!;BS@*b>r5(-$Vtk=T7Giu1~%h_eZcj|MRI`ic!f1$gvH9Q9nZQn z5U`Ekc|9=RMmd!)H1R-_7;7bxB zV0|O$>FL$>=Brc^RxAxrL)7Agk<@HL7jfEr+3bxhe4rsWMbl!iU!g$?Ii@ar|i!$!O&t7C(e}nbNP> zDF}DixTmzdDAHE2fX>#0nUBvRKCPt=Yt;lFFpW^w=IU>I&{V`Rl-V>3kJQ|8;P{g{ z8}(j1{lFES_1{rCR*C**O13;q^o4L%ycD{;C zkxpNmf&Zl**;a`mUpYqq`(XL#P6gwKAzYeu&Ek-*%RXi5?h4tz<~MhDzrsd zNs!W3nJpVsl$k_N@qIG43Y|;oXejM&-){NpA*z1lKj$r?6q?$mw?#DVNZI!|ve%F= zT|iGVX-yf*U`GxnlkdK>BgLng-wX=MdH}kSW%Z`l4STKOq{+m<)-{=KL@Ycd0h4i% zCt7E$VhMR{ytAJ2Q8&JD!Fd=sK?QiV>Ho~tcaeTVAY=q-b(GDc?E|ZBr4FZQ&GSE4 zSBHx&#IG9=qjrupxDoLjw1Sgw=!ZU{DTxH@171({O_?sl6*ZI&CJG--$!lMdSXT`vyYR>Ut5|Ncxd*jQII z_dT+T)`z2WZzM!ZQc7vWSv+PnCzE&%?NBpeuHw{HjSwm!l(X#cuJ=sg<}!8pt5!)( z^b&e^^TRbfFCuNOf?hJzO-4#*ehm{N2wS6ouu*#dBW!DEbW_%Jfx+7)z|-9cMX(A4 zVZcz;;h&eAOAnwsAeZO1YjPa>H_a+CoQ!9L{aRUsQvhW%g(D;saeg;Xg?Wav4xYPI zBlf;%5lLFJJA|`hvAX<3{sK3`R7AUJrS;sD{@D9qxI+ zGJ>pFBU9#3YX^sPYbW!-bK)cXdW7_4x(1og1Jf?w9j&;BA_dmbFF$dB%E;R1J2#zu zq0!ajDhXDVU<|$_*HyrZUv4HZvEdqw8SYs1XMDuwv;4Xn$i}ntJkm;UXaIbCpE2og=pH)c8}qlxyZt#`rOfM^4F~zN74ny6z{)@;oFO{&RI5hpMRZP 
z`>5YkGDMJ)MJr>)bf@2(Nqcw^MV0+q(346rk-^(q@2QUIfB(x{drmhX2(yFmPNn@c zmy!&AR{Hv-?A;^nalu;y24j9aR3qt8a(XdkRXzM09K64{xi2HB8gQ-c`K1d|loaJ9 z@4|eN_b=k{{$73`Lw@W|UH*(rS@;Mwd|iq6`7qRCcV)=#NNwUj?p#$RDk^3Pm(SOAj)+eUS4xC>RmYmn0A z7P1WQXyuv$NJ}OE(85=wRo(AKudx2~ikEGzdz6?+`WpQC%%2;4VUnTQDSs1vwC>TT z|1(@eA-0q)$uNM$co*&Qb|lxGOHX{uWdv=6CG=OD;TbYt^JVZ{Lt%Fv9O-m>XF=$0<5LQ z5tAhIc^{)WqXon?4%1E6)!Bl``=MgCv#22x{>(*hJhBp}<+c`>^u+o)S;0Z%m%qq4 zyBwSF`a}6OpAWcVh&jk?;9)S4w)^1eiO1_TBl}8Hf_`K*GCA%$pr0bm8_q4y*X)eV z*BCxA$z>(Pj25K%p(l#&oNw9x&J4{Cv$5waaQ({_&PwlcetVEry{v*0|42YF$n5yH$+;qzL!Ym+N(QbVX_z`5o>bqbCQLLsI^#$7dG2>VzF9(5N>g|e zn=D&iEm6yim1d1NAC>xcEP`};7WJOMSU(M=*>cJ&e5FoG6qJdgwR-%+gnBa!g3$?U zpEFA7Y|IRS>hG8N9 zkGS?*Imb9VKK!sH$F0!IrMi*9s6zF?EZv>h+0bO$-*>({M6}!TjLokqRAGK#ts$@BLTqa4vROcMS!ht=ll zZESkRM6!&ITBY2#^cvKoud*{M%bgcWSSH(TomJ?_i;ai7Kcw#bc#dTl+_qie!}9E{ z)GgQCFl}BFurEzTkyW>;dSP-DkjkJta#&PXO~T}oY{o*DQPjL;z@jpisQ|5ekMA;VxIE`88t=nmTE#tIhVAV`MO~FFw)+mc+UKNX2i? zwf03+ZGCrxc3qvGG$^NjP4Qib+)~qB2hMDOkaMSY>kD(M{gjav)9PMB;31Wg`Rpwd z!`Da`>eRZzzvF11l=yk@Yk%wvCrflEI*({gwqar>XGCsKbc{Yy>yB|?YNj*H_oBfE zFc75iH_X=EOOH%j0xL&totdd~ow1ntR2TCtmy&r*ihttoZ}(^~lZNUr#&i)uj3$92 zn+cfP+m|n+0W~X_c2-iD`C&*wgp$miS(im2PBr8rpNLH{0u!kx^pNR)a^=f zUmKIY#3p#XA#A1i-&p{A1?B`%?vLP|&4zL-n|kD{R@^cBkK~f~qHFApsMOe}c1jcz zhreWI$He?KqaTqEt}A<5dgRj7&<#53C#Jvmd`pEnMaZ4F$|_dIg7INql{6#ds7H?G zv#aogZV0N(t0<)!U5rT0*sc2gUlM4%T{6K-^Cv3W{wwEcuSN0qtz=1n#Z1$l6E`Ks ztd<*2_;iS-Lisq2-gtopi66qIBP?zS9+o8fQw1|a4K`8&7|N(;E*$X& z+95G~iiZu2rZV~3`T=dUXG%>w-U_Jwrp)J2pl6SX;*|8?@iR*EuX7ktW}E0#w#5CQ zaqFD|uM8SZ<5CLq7E-Uu!^uqJP*%x|&_-?dQ?~y4hHB?a^*EtBwJx+}= zyT5P@6_{PAiqwZK1WW~kS+=JLNlc9p#}IWEviZBUH<$G9tV0lLW>yF{Z8~TThQ!e0bybdS>`-6PZ`= z3p%p37BE2+!)*RxX3_E__{wmDAO1Qrh-z_~djmS~tVLWCGmWozTw)dqlAT{v>kURj zmHa)1I+TShS*0k*h?Vwr)Dp%A@Uawe0!ph6CO0*H`yhKs!hs6Aqmq%&;{itAD!9{h z^@(HXy-zrV-#qel2u&wZ8~v^z2O}z>C~bU%M(ZkZ*WnGzQY>-dx9MbeNs{T-N_iG$ z&O`TlIcqoP`y8_qF24JJsG{S==S;EXktxTPzFS~{a}|0qgzT^rLie`i^Cka1$l?D& 
za1`iXMwkV|jPN-)ueM^65QJ5pSQ6TtT%^tWgYpb%F#pyN^@cNpx{}~5iA^S$TcEe_ zqdy5fAsG<;tW3k{gitBs5vg=pSGaAx|p|7cYWiUVF z&U23==_&`9DZVt+Qaucz|DsnOn%L3>&Xk-5SpVk;i}P60#HeI<7)>dQKsqtfehp2< zl5)V0FL<=zP|I1&e=j{~H9 zIy!PPe9m9?UIVbb;_Q`mX)i zC#?2YXa=`PEuT&>6Qto;S&6?z#P9r{Q%+Ne7^3H$&yNmFT6OT9Pj_!1A~ZPNtE=Rn z{&>E|Lq+hqKE^OGFvu48gt@)FeRJG;;&{2I{QIxFCKEy1siLjze_Lc z>+5T@D`SPm^!4?Fh^@r3xw)?Wjpl4`Zf=dcdqQu#uI<4KBih=#kK7|5N2{IWQhq`~ zAK$1O8k*7&w6(?Gjo?n9|2GT876aJB6oqc#(O$Cjm8q7Htgd>Ww0mr)XgD5LHd3*% zVH?*Hg8S4g{cUnwhaENNe7eYc7Ac?IxL)-~nz*<*A7Op*@+A_8>;qULxa{=&Tuf7w z(AdO;-KYniKIH=>-!OxZZNPD7l!!_DhU5IxF+5yNr>iib2_cU${2%Y%C$LcJUxq+@A$dGLz;C$3L69@Hm)|614TUgAzoPdgpVFnD--Q}gi=P6$5>s;H`FRg_&^UA=F# zUtzObD9h;j*pN3yU~GMy;3{0^|%G?Vs~N?95)Nkedn#e z-%C9Y`O~wRA}d-)!$-)w;)?6{a3%#3_7VjPY4-$6I95DeE$aN-S+i1o`2}w9bcrJM zMKG3Oy}raj5W?PE<*R_Zou#brb@8{|;Z2U+E-!=r86l&icX4uBY}D48RCoPD+2_Nf zPqI<5vt#yqe*E}!T&lJ->TwJsxiH_u5Isl<>95ReSIxvLDh6@IBcYdz_R4o1R)g}}nWC1hqsSxpyn z6VdRt)jVEI2-mtDs@pC%y`W1%ac^z^>C4^r!z{*VwZm6hPOk5FtqHrsAHkt*=K~8b z((>~1UbOEP)c(5)%M=NQs_Jy0Nn6)9EG*PD^`vtj_2Ja_eEDFw*0! 
zC@sE|ll*&bPNbsn6MVaO_-3<~OsqQA^Pid*uN8A8u3y>j-AF7EF)Cz>>=a|2?wQ0X z7s2mO0Y6|Ky>%t_`RTgzce^+52U*#(!|KkB##O)G{pk{;TFov(@$2Ii%9IZ;aBzY; zeD9&P0RsbbdUfT-bymAz3re!IvhqFH0y#Qd#y4+j{h$4^-+9kXHZpN>;kA2T56JVb zz=w(jg8F(1!c!cM<2<(1?DCq-_flDtDRo@oLwM6>tU!&$1exo}pZ4!bVpX^?>?C}S z5OUwk1nQw#OD+&2*O^gI6@W`YL9i9N%{Z(6&+XLG1d$SJe3;_lnEf?Iy}vsB{oxT2 zH;xv*^SSO}pI=-ktE-2#w7i9aizQ;^Q6=&}; z-~f_;2FbqhommI+;#b0#iUl76C#-U1J-z8b9ontoe31-SPmpxs3MIVA+Ne;_|w5}WzMifzc z2)XQ_W0LR%BqrkEy?nWOf6^)CE8B2L7>z`z_Ib7ZUb z#YJtwi&4%RO{OM*q?3&ftK$kPt$6tO>m_-9J>xk7FUZMP7k*OC2ZA864a9@y0m?*_ z24{WXl|!HK^Q~fBWgM|KXUn zc5z|2Q5qBxGC6rr%D{aspjdq*n_5aRG!}*N*QZMNQy&c z7HFhWth2K-kJD}>$^>ba z&M)eFOO33E7{np3^L%h1j>pUVK3B6ELqC6tDl5N$!jmNzF**?kymlqQhaw!1=A@a3 zVpW9z*9wY@pX8r-SAC}US5P@PIcv?wh#g773kzxb1_w8R6QEgR2&0fI7%4^O2@j$b z)NkDV^?mdy;M(h{I`-Y5`$y^6HzTmNox(!l(!lSwNHs3_p`Lp*;bB@)*y#S>UuaWN z*493_@tv#C0sxprQEKTy>Alg1a(s_S>zX_gE%Gj8S%>Tg&9v+^H zC>a2^$#RpE5@U+AynMe!o^Ke~ei9)cLWj09L=Fy)UjjnhvMMXaK?iu&Zoh217#HxkUitI_8yixujZv@tIh@=N zivH6dWu*X+K%1jnIvWfCrM?EcrTDb8@RN>v%(!>|2#Xc-VnIA5BqVg}xn4|>jasRU zjpq_dbC^X@%Ohcu1gMntEjPQ=nhat>L}X-8>|6ixki7H4gFa=z9fJZoB3#Gq z&MPSm_`(vp$$0BGP!lEL|*RPSFSP1}<-FZtIa8LIG`5%gTLh0-#hy$NrK{+2KH8q?=`xSxTUqy8>6 zZl3r*3t*B99qlPD1EvM-8(ds!HTv)0BS7h7b>|Z?c&}Lgc>Y*%kjJ_HPo99{&A*`G zf~OTRk=GZ`UuR@uLI-u|eSO>y(q2hb6|Pb}S|*7OaF^!~IjB;sSl|(aZ2=`!~D9`(gHcfBzU_Zq&HgSJ7GaEoBR-_#wJ>fT6Gy z0q$WlU)BH8z$XyoE)|Hf|NYWx%M3&t9LXpA?+O3Fh3^)dphFCTh8fZy{gPK80*z8) zFW37xP>u1unHg=sUALu}xVY2vzM?MHD9x-bS5kmOZ)v|>5 z)oXt{kS);IB`FG_B>jXRAOzB6vRs1|z{MKi8-V-A5_4N{nvZtHk_wPKCc(kNI?n5T zT2Il?7qzvmSo%!_Xygr8&e?)R9wjgDORRU7*mMfn5j8bz5J$9eadGil_rKaPWbcSy zvgihM8+N5K>*P;bR!uEX>V@X8gH8eC=jR7#aSNAY8S=|Y^%=d)=vM&+!oIyX%ov`- zUT0M4E0}=%Ls^$GIXU6*h*9IxIuI9r97N-JjUGoTqg~LZF2x(1g@h?#*hqm+$Odtr z*`0*~DNG@tsgFqmOK?k!e&qzyYtre#A`BPG z-u`V|J^lGQZ*5M;q5tmc(5W{iJu@H9!_)IW{jT%riWREo7Zw&g?)FQZmh2kc+^3MO z*vn@R2r)W1LqbE>vm83QK`KqQdw;6Xu18&IaqIo|u^gL(B*yptpV6N-&nAF=PP;ju zzJkbmfX;IEOB{=mn|qd>f}E0c%)ZfNzDn+6&8*e^${w08G!R~gT 
z2HgjhvYA@_UVjrHL2Q4sZSC#`vxmhA-61E@$>YF_ZdFg9bw&KXON$jx`7oN37Nqan zLb#)0&DK}Z;JJ=Y!w=)_>ax6`w3fs@W%m1CAA*8g z6-j3bgo?*nAkyjGVRf%1mrA}|6~m`h-%n{?`5mBjEO%rH`wQ(%SJPfi}^|FB68G6_4mK8)z) zzoUiVkLBv?OAej=dHzrPmS64awQGSVoSdAz)^UH_JDS1CZaqT}+AW%O0lC*6B_19g zUvc?c?|)1|!NJviR+M@X(lkMU)Eu|m+aEnU2{Zcher;4fZ1|&qr;cYzgV=XJKKX;lWH9ApCZ-BSoMD7AB=;7NDZ)eKdj5bg?|3|DjMl0wsjS#m4 zR#w(vAq~+5u3Dk&B3SVszk56Iu&(YZne~9(?YXv$DYaViAT#j`>*zO=XKfDe*GtUEQ{CV?_&M$PJdyK zTq%&jJhs0Bas)n6(b6I~IXOYa3XqrUCns3tv#6~r3UJGMQW8|kzG*4=ZyY=R3EIa+ zc1M@70v7)T2#u#F5`u?KGa!q^0Idhm#i#0q_>xs$!cd7qKV1OH;J~%ClrOU90T#Xa zkOdMz`Z;*`qng^x;1AN@=X;>dS3Hwr;o`fV%wcfm`XYuI_;mx6uc7iKQtzlI21*wYd zG025T-e^=wn3@(+m;SrH9t7Pja=ka4T0r0xx!(;X_`DmT37_XVjlR#>d!MuZmw-5; z(%?iEd<>PQ`RX56#YvTb3^>ir6EK?Y)M$Yz(-#ndT%p@QVsNq$ymfnV+Lz1#3BVso zrM`}4Jq7T+e#_6gmeua9Y|+>Yypt)}-~n!^R0Ck#Q#dHVC5^b2e!)+})n}4{6WyPF zK9fYx#MBpef4y0wA18o>b^MD9drIXpM6{(4EEKB#0BEs9MMa%nPAhJL=HAWRxIATr z^$%Etdy@q>J6R4jPMea+sj0@l@HT`Dyat{Vo`2ujJ>b7%KL`;&sn?FEi!Xct}W%Ik74pGOgfeZu0;g#dl zgJ-s|KiTc=?F}Gm(SZK;7}xU6?N1g+WlhUey#rEB`hHMsX5sJ9-KqGf*FS#O!#^12 zNkmkvFEB~pZfUv$D$i#j6s;(H3*XWv_+P_L&htPT|OmuGwHq1iVzBPXa`P5smhzihxHHq+Sjs znWvLdjcKvlD4je(z#1Gr)k zzptN8{*bzp4b_VtF%K+nnL@3&Jl}H^(A&e8PMSyoN2d1U_d+)dA9g(3c%RMhOAOFk zOiWBk*$D^iaQgXHl5_?ymR}m;DeS?;RDsE$nA`JXlgHTxzRdT|PGMMB*vy9&u%B_U zzUWJhzWelXdRA*P&W})!K?bBXSR>TGWV4#0v0eOu@_aYn2^DM`;8&Ogl_;=QNJU-b zQ;1(cG8j&1|+BndS&Jigdozz*N;l|lyz z4Xb|lV`tCcK6ZaFP0)c~3&>rN*q5%YpGjwZxQc5I{`apShQJW`-Q+}F_x(E*=h{wt z(15<~{$VxE2KwgZ(E=`nq(FCYV8D2(!ER`Dv=8XRh?CA|vKV$LC1J-k5mdS=)kH&- zDFPOe6a99IbDZ_b!2uvUuh-WC;nEY*kp)cFJjr|=T2QJx&lK@?UWzQfL61|+MUUNF zZg`Co`KwZ=QGqW}lKoR3P##f9uLZbCx9%i?K%D|HBylMPB*4_%+!!PopT{W$w1NP8 zTL@G3yT1|#APUT(8qW)JXx0Kuk+8C2P-Dc; z_5G){3_Pr${d^i66!X#aK#;i5O&*BCe?v0X2X;3O(8EoDr4Fko6!V3^*P`6NU5M=r z2Tdusd+kP`O_98>uaAO*V~X8TLt_rGSr5-hLw<+-qr&(@LWJS1%*jd}Ug+nE(}jHB zDrEBof&l%;Gy?db{P82g_k|d!>TGHv1hNkxG|1Ag`NHYn!){?7A0DtlX?1mppaV4x z^;V&Z1;{X{Wibe({A9I(FfmPNJPSSOXcK}L==M$PVdDy@x6zK6vE|>sEiNA=v4@r9 
zdZCt;m64Uay*xjM9dv3stZH=xlze@rOcki7Ppj77w#>wu*&PfU z4z>yWX)BhKL&+8)*+!QWe*ASSa}(I+8FGjja$%9UuQxXOqrq*3p>96Z@aOwW^LsdgO+WnY)Zmg8NevL<2Iz7aI&Jw#Dr*?0BWMKq`(583jdjBKKYE1Ic)Zt^;(G zlaLSyW-h0KU|#&82r4+v)`GFzoezc#`U7C#Nc5~b9{=h5{`-#yYiIWKL|!)4#)Ig- zvGWl_r3?=+xo+_0N=ixr$;o)7v(}f#D@!ka+~3|BO%=-AY^NF2826(>5eTRZNicH) zxHVIJnb?ULDFLR?&I{WI2M5Gb3Q~i-D;6=LN7VLiM2wFPC~a-+ze-4{$8o3hp|*N1 z$mpJ)9_$^Cc2T(26_&lh*%Ikk259Z; zSiGP`_W^f+iM|tP$lncium1gA3LhF8!dxM57U~}u*Z_S948_2w-TAyO>9?Gl4Ku=; ztzgywOXqW3gA}TiZ3?um{5g2!c@O~D6bfbJz$5F2LInt?H&vtM|IPv!k7x6P&1OB_ zN>bdY9V83R%Oh`VYs1FF3jzMMfMjdVKgOOAu;@nkJ>Ef`7Vm#Y-T9)yHFir4?td-Tx&k92 zBc0GK;UTYhY#|WfoEFs7U;_cCR6YwxV>cKpEY=H)5!2(LLiMnSum3inf~}8Pm#4j0 zCPVR*(D@SZIG`pOv;~1N6kvyXtxu2V|NadCRviVrg%^N-s#e1VOKCOBp4Pqc>R|EGR$+qA72nHge%S(xJjmLgofW2NZ=gZJ_3uO^Wj|9;&pKRo7i1X z;4vB9o^J!rG5|^jXjaq#LQFEj2;fA59_|B@oX`D;kj(pt9opo3LQt6hgs<2S0imV8%%gIFN`41bjz?b@1&c znzQxWy^OYo5- z@!hWipwd8*LOF)ZjvUle2PQ7G#Ni8^d6~*5E#ngtgTO+-AQDKtwX%|2YYpNdgDhzp zSQB)MF>^;47(T~6!|)TRwrsi|@8&yn_P0khK?h^X-m6VlS=H+-SwmXODmuvk7T?B>P;D*x5XXYHCtxcC2nneNwuf`s(+ zh@VNXsd;$jnB(aR89-AQ1jdgO7;N>}H*W6c`40@Iym12VvJ@H>a1((dF$BKz(AZc% zc%m0%WKfzVQZTs&Ah+_^Yt8C5jEFGX(|V2+$b3riFi0sD4*;np^}kuVySgpKZha+2 z-pq=yz0EGWrw3o(|8DjbC{$qpG9%oaiy|iqDiT1EtONA?j|h5txE)sjPFa5t*q-&~ z-CSHQQQEC)%bn2-&2}$tpr*e}??e5cK?e|0D5>x1?IndPI>TdBZ=5WLCp)S#I@ZV0J6hL({`GZiJ3 zrKvfpm_KtdKT$4-ql<0#dQf+ZgV6(J^un**BZV_>B4gKuWn$NJ+;iLo@W3J927E3l zd5tA_3bC@XN_+ea@-z?lc*gbGVBOxQ*Vns%j+%fpWEt$z<13%WcJaHunm6DCj0cRw zhy*>@^jch@Tmd@h0i&(8Z--*%gM-u4Ng$AHz>F0X6aA+t-!UWu&&RHib4e}0LW@X&`^HcHdtQ=wofeD@_-2p^dFy}9yMq`0S>G8 z?Ei3cR?aBv?BZm{)AF%INCA997FwViOcrq!PZvsTz>$T`oa;Z-5+zBIZ8R8B=;e#=d_Dq^J0e{-PTXE94;CZq>gJCVub>b>2I695?{Yy((WbeFc99BCJ z6J)5GbAh7B1(=(ZnfXCU3HueVeIek)vsGH1e(wNR1hc)BlogFi^$S21m9(^e0$G^B zW&{iH{Urv%BWPM4{}wF3Kv6E|Z3^%OvOvCa0vn6RZm}=#`N2Xmj%+N0(?Y3Ct*XGe zE0>axf|>aJf;gHNc!K0EmzX`(;;DFdcW?+XK)Y z2Q^`nGx7jZf!+VfN9b;k+VB2vV|n5dc=^>cctDMhsapdjb#;l~Ehe0SpFCWijiHYr zT4@K{`CttEEXC;tI|1zkmzr-Ln-w{8c01_mC1`t1%W3U!6u39)sB 
z%I83@g2EjKZp%;qrwelMKp>TN{&=5k1Jknzu(%)gD|amxS3eePv){T!0Rns0izQrR zzw)-hb|DB%^@0&Gj6ne&t+b{AfOu*drcB_1S1p#K1i=Q%|NkNFtKza+gML*+Q9vXF zX{7}LX(R<{B$XDBZX~5c1*DPg?(QyW>23t+2I+=x-hIw@ajwsWd;j)UWW8%WGxM)m zl(<}cvEfqBH&HD3_`qGGu<%G7)O#3ldf+|r98c(EeV`=)N~SuVuTts-7!iT?W*s-_ zd3br(LArq1_cxGx=M$X50@EIP{{2e=#oYP& zY;^{1H1m99S|2C1?uc=@$&(2Xh*$|64hKcgVDj6;A6cKR=LX5l-rhb*G1q9S>_w_b z03ujZcRXd{3_TMpqOeE}N1vXacN+Hd$iR)oz==?+Fx7%Cgh+~jS1@9H^kmlzR}f&; z-*dZM#ry(sUNQ5RaJIt3!%gRo>2rsNCMxY1m!B-7Ot+2u_b2iP1ucesV=?HnxXF?W ziwwt1L7gx4N9#-A?T(U5eZj~`o~6!xUA==UAtCV;nXtsnhnvkRF+GXL? zl1j!oEn?9GB*w6CISoZG)%=y~`=o!@4HA-VH^=sMvBXo~aCL=<_(-H{!wW4ywu)qI zybL+o9501D0Vu&@Vqz#D9X(@Z4Fjk-oGyvS^<_(}_RDqw*IDGKF&uGMMB5?avHuAr zVe$9ZZ7}@05mfj1Sfp^A1zHbGQh``@vnAt_C5%p>E|Co@l8&J)wr)WGt`DQFTh|9Y+<{J6okHASQF&d_V0E~L#>e3Q|OUr%u z&3}8{QHFjCTb{HQ<}k$i3$fvt>*equ2NyHt7~U~C)_Ij+An*@ zA(UIc`vlVE2g{7!$FQ5MOyMpD0_{a7!Y|_G<$dt*VMGoE`2h+_668k^sV%sYsmjkXP%?;b561#L>dUJu zIh?j-A*Bx=WEB-fzzEeZjRxxyxG5qrbq2p)1EHs+q=YC&#KM9$%om_OdQjR3?zm>d zrCMR?pip{)C9Y!trRxliEh27px&7G|tuAw0f*lQiYQ9d*1Pv9;u zLwiYvi+icrFq7ZH3zQwYWee176)0N>s}*p=-6ZJ!b!RKFkzwUwIR*17RWg!+=WHodX^UoeqPV4;Q6*g-lbF~A z-S~}FD%`2|{jd-6e5dXyDP&xBJ3ZfZI;>1~XANWSFKlnS46=lXJ|Y$HY>6-$%flyi zj_R6RC}qVc`=Jbud+AFyPdHI`V(+KOXT#*wQ8A_k*VR_d{L{h8DB1PYy~@Qaj0gqP z`g>0_W;IyNrf`Jima6BVb`j5-L`Ftts?6vu6x!2QpKgKny$W0y0ztuGg~ovVkpR3# zyL9q*ceiysx-yHUFZT!9aFgSqvl|i;63}n0z?(xjOUy75f%{RlxKnTt!`E^FBme=> zvY+<^Asd44`S<)99@ecHht+uVT{keH9L|;_VXi`KW@!1cW)T{GD*%rm{_Ch8B#z!t z9KiNrkh#H11H#n;$sh6pRRA~(b9g#?g-3Z?ptS>}791{gQBJU%A(1;*d(q|s|NLdeRHaD1W{_u-s=rD z0{t1?e*?XKpibC2rqWPjd-m6_U&a?d8Nr6&fC~S7DC#Of!3HC7$*ZNMmwFuR#- z)@1SfsI&1DS`_GLX|wY3gye8QIDQQvU!_F%!M*QaAx(np4xt(!0Hw^1hlW%#3GX1R zhM?Y`8?V?QErpPDXRNyEfhfelQ=19*?h|*p$Q;J(|NZ+fB)&vY5WWQ7~QGT8PFrwgc+7m-Z~ z_A8L;7_E4PM!@O~)*vW{!N%i?ii#l-5$)iA8k;$+%~Z3*a(piMVyXb`P7o!A)u+@m~) zd3wLNxS09Q232xen#b!>10B8#!r=@Vg9Y-Ss^cz>dufaG)r0>bGWp1&_4B{$>kfb7 z7S9PdusQ?806jy-mx7Yg5}NJ-Oz08_vmpBOg9n!C9FU;GeW~mYYy$=fo^8l&1onh* 
zhI**7Ac}N67xFL2c;%>H0G``xMr8ybEe@0P>1!(rM7kDHhk$?=5Eu*O)q)454C*xl zHMS;TpdgZra1<@WpFDp2tIpv7(X9&N9rZ8k=Hpuqx4fiNG$iNmk2E|89YryC!_XzRm3E}(x5P2_T!(JdK5l~&BBRP&o|H?i0qLBnusMcM+ahzFJM;@A}l^CQ0B zcTE$m+=;ACz2?OBD34H?pyZ$#k^^!Au8BFy{SPQj(D9f?C;wP-Xr;*G2NAsXjbDI3vM{iM!{2Vsd-?Hk)=nk|JQNrfak(Dqc;+ zlNY|t(8mxkI6c%k^i#R&#aF;_62n{|}U2@w`F;|YbKIUHl7U5Wo zHuFG2ZIkQZRK6#XpVS>@&ZsJHflt#XM4IGqb3Ev^^~#57x5@1H)QD6f7=YqXzvUM| zR6cBO>H4)nu&n-UuY>e#yz#mXYKW08cWI?1`tKh6KLfBlL1=-@rV21!$YoO)mrpH! z05$={!DuiA8R2LFtXM(}oPM`!7MNQ3E6$Kr=<4ofdWr_h(;ie3uv3-slcuI7p3}eI zM|HEavMeYsxZJLJ3LDO8L8=CJh7~U1DUzzSHBk(1We_B2=90fq1bhWn?aE_u>-jSY z2){lEsr~V2c_;bp*Yn-bLR9uxDUvgaYOVhY+6v+s8SkQ-!BGSGW#S2SMwLcbc=#UV z{PrNer^8A>s7mCM)>Jj5a3^zK)0XXAo0E7K21kvQ{0L&8lp6~R2sG(+gKYODe?r|T zL#wm*EwFZkNPv(AKXrys0@qwkQF{oH3Lk%e3T9@ktQ~s@MU+6a7N!oOqe6`M`CZy@ z#IP0#BfpzN)ks$9IG-8mL@v+A z>@~La``C91PF#;;9@UK+rxz9d1Wo zci2O9^wpTx7A_!z}S8PI`AQs z#F*Wj^8xx|N6;!nO%`GF>$Q4gUV=~T&O09|Wc31y0iV<28RU7aUKhh(pbN)1PpvW# z0_uL!$50K>=9MeWiJ6#~-X-$Ihf>P`qy(l%1@us*R3DLJf%YSd)AnS4yRH#tm%l%n zQgprJ(E=P_JrIo$0u-PnB~+xy86K!U=U7$&l%~nKK8@pZuitOXBDXpCR2$;H1_#&G ze7X&k~>x9E>`ghvL5)PB?r?1f$cusTSQ#?=Y}&X zz$>G*v`x{oWWLJj1!txN;y7f!l^iZ6N>}95CpD97bwJu|Tc=<>Lw z482-~F319=vvtuR(<9=_UTFB=CQVl$1Zok`1afI@*JHs~PRFKDXF%W!k1k*iq-Zv} zj_*C7bi)DO<*<>RiC8kAQ_OpV(ga-Q*INu;AsZW3xEMqmgjAHXiwmHPW$24rQ{{f3 zAzfeXy45znQ=okU*alIL`Y?Swe*74hm|bqwFGXaDVSYWDc&iJwBurxe_xAcz$$y83 zX#o}J0OQEuoU>1{N7h{fNP;x`PR;Ly-CY?t202HP&+h^aS*|wF-`}n*KHty3i6ejYNfdpvGd22<{1p8Wb%+EN!4zG=rG^8m@x>=8^@Zg1>)1AyfxlT?)h+Q01&69UmW` z?L)$FF^cl_sq+7XOax+P2{dCT1Z+zQK?R8~Qsl)sUE7SY5U2d^~(}xysK2 zxq`6B$W%Cre`+}Z5HdtlaBu*$^YHWvF6E8QqgQ*%X_z$K_N}W+f={JBMoA;VuT8s} zp`^@s)EBw1q3dUDs=7-lfKh|Q=+3dcm#URUEt0^d-tOXWRP1_rdSX6#(v}mRt6XEx z1kVra@!$SKd=>_-V2a>N5;0Mgp*3K~$!<5-h;xVFa!~MAAn}0p4zz0-^7@5zrTNfl zfnC%1A~I?4tiUc*Y8Qqn*sI^7U#eBE3R5x(k^nHEjf}E4DN_aw*C4C`pGQbU1dD*c z7w8Z8Z&thhLRJB56UF4E`n@^FDxmNp%v+Es^KH%Gz#wc4l^UBD2tzmt)Zv%vHD!uc zl;9LUz5_#w4`zs}6F+6hIo8vszX5%&fntY1)PP_h>Z0)_y|1MO2^fKf!}1egFaC*h 
zZ6~2zp{&a*E6ji(lti`wp(5~HrxPUy9>_m4BuwryeAiLi(~WKRr?1p&N2NFP*ke`d zjoA=tGX{D+1A9*sz=iF~$XA?>-hcYA*oXKp;k2BZ-5pE4;L)e=bW4qW)&mQQUErzL3gSK%znC6+7FIH<4*sx zGaWFgdm%Z6&uaJ>_yQvNf>6G}d#$4S9H){;M@I*P7mzsg3WTDXm4yPCD?ctYNF!3Q zu!KUe9cDX!)eh`<&;m<48&oq%pQ|MY`3#aM>4k zbNR14z;)8Fe(mUa^mnB5eSw=&n9@H;RLMj2Z#7==U)x+HZr~s%%V&EJiSA7-1|7gc zmvfo(Yc~?spP+ebSVvQa)I3An*&xuI;U1;JiHYRF9>u4k4|?`ian+k(_<@JadUXe}-XR}-}CKu5IIXLL36%cX_EchT44@kcFtgRV>GD;IP5d4u1 z4hk4+0eBkrUlGM|d20C(74;0P8TH9=BqXHH&d&K>jyaga9`M`D27N@Z2ZztBi-OqA z0f{gJWG=v^5P50^*4hF-@?^bJC0%zp=&>-cA$%=WKqYCMO!Kg?;hG?nG`bZQO9T%h z=L&oySGJ05*{O92!=EsE;9vJ3@9S2A<`v}zd?^- z-Yo3L#=>ge{QOdT;@?xn%yi3E!=)cL*GixP-+K%Hh`qP|Op@YQVG?YkB3O$A<#YD$ zb2Bid&fc3(ZZ?$ zolF+%+(%S0#OLO1op3vQJr9T$a}2&J@iuG~)b2EQcl!a|u!6)G$nYEu*XJgP1&i@2 zYYjS&U;3rhL>Q}Ik^u`v!C4xS2%`w*e1MKlY1C(`-!HLnWj6;J5yt+|&4pXRl19bc z>*?}d{l#HOt4QYSZ&g`FZdF2n0CX7Ce*S(pYwJPVq30e+P;0Gdqr2s z!~=mO_FG6!-GbU$0xt^D907I}2qef^9mtMGDdBj+)JcE^LI5G?Kw@D2C~scuoqB$k&q0smUg+~huT%( z0CA1K8ys)kZ-K`3^&t(9OAO3%sQgUuOHu9`JakJ>OKWwFU-Wk>65S>SY6w_Z_7n49 z4us&BG9G_}$34HiEDUav$#QjGo_}TJ%lSr!+6UW%-#aq-=DVm_C>&8FXC3`6T`mgL z{)kgVB954dFtm1J?k~Eq?Scqz1-K?!sHotB7xSNg1piKM3l`h(fWie8^Rpq-M}$WM z9|_1gAaz!FcIhDUGy>Om21C!@(GlvjAAys=33%T9YPpX=(rR4Z#oHA!_XK7Xd=m)0LhRq&?P^og8Dd0$s8CY}*CcLb$Y;yqcml}7 z0>cX|2_V%r5aw$l=c5Q9g)aq&ox0nYSn#K2Z!d>+%w-RmPATt`tttU*brn2t`N}J} zy4R6malRy8xm^C>;X>+@8*l=DP-pOcU2lCbWw-XLWCRC?)RP)J)ADk{l52x&f9GPi z_Hg5~u<#}2^>f<1cbI)K0kwjG?mt)~9| zAb3Efz!}Hu0W;o3kQIcl_W+v;@*7cLy@D9M0E0#!QQ1q8mDpxzF=q48}Ne}8+rVX}S6qZ10VbR|^gtnp+{H2-)h zpv`anCUbg5RO+|bLvbWBC6L@70-69M(E}U+p~WDUY@o+efjRtTdy36>pcHm1{=vrw z9|Eyfq0{cK1$PLsO;=`!)rsx{xDO$xA>3dvKoDDjMVsH#B_ploTuyHP-Q4U3cPuNb z>aob$YS3{Y+Z@Z+L9Er~E9&T1 zmr+k9CI&tJc7Jiv!G1`{+1RucTd>GzSmf?Nsew#(EhW0we@Ta_auZR|ViBPU*!4r0 zgP>pyWXh02BnVtCP(=RKbR#ybBN#RNem;MTK>3Ufr(}pv1{F%p%^eGtvN%^KRwoFBPRnn=g%TWA@Qn<`DX=<^TIXNojv6nJ zI-;oQQIZx)SjSU78pyUbn7#Rge|3Ih0G?nS^X?PpCv&R8S5^ymHO(fxHAAT`*N^$H z3WA;oe-JL9ktCo8YY0%tckl!e>l*v8%JLmC)-S=6(1EiI8;YKg&={!vtO*{Zu%RIh 
zDBkVk<9J}ffVL!;J6d3^6E7|YeHX$VP;TJw(J(S*!mNPV1W~bG5Q2e&V?eA7ItLyi z)oH^;>~e98;IJzoG8-QHv{Qr-Vq#!m0Lqd4ydf!gSj%F9gk{`Vmj%9NAT9+7VyG>Svzuspabw^ZBQ@}S``Fr zZlw5v(Tb zLyq5y4ImQdtKGu0zc<$ygx@-_6O$bLR?G{-#auDcQeRdluMMT6LZCwHgQeKidR!CGx9AZ89=uOWxcemv~951Cqi46l%N zR!bA{_W@Y$y1(0qAOS)Adw0)i>UqedhLQm2S-o8V=mG=k^i1UO1AnE1Y&ECc(vU446fEKyPV_sd9ZDPmE0fakS7F@_~ zx&*Tw#$yKG-rWa5e7-+2Z9*u~{h(vIp@3^6i-VjlK_vqHD_9JSLE_bjk~u;kycztw z-+{MX?+uC)U9VF|G#uf6X4Lq&&2{^f%}B6cqu!p3DY*7f&Z7OrVKG$fPxE3Z#So&ptDhOAP@jZqV)09Q<&ZXHy?f-(_ro!|J1E9Nj=C+cf!v zvQwS}g)L-h0?;KN`DHLtj!%8};0PJe)Mr)uQrA*uSNix9ow{nd zZd-@Z|a zi3VLeE-Z9l4es59@ zjtL)luz9e(50Tnn8Gdyb{bSb1c~^XKay@sDcf4{wZlwKLOMsXb2|nHML=kmvL9UrcwQsZ0NLi!Nk#9Z3{4DhI%I2Au0U7?aJ|FHavpWK*@+dt; zo3W=qTT#z5F#SyyyWpDIgk-mPLORxFK58}nJ7<+Yc{A4$!EBwxEfA3Bs?Q$nDS(z3 z`_B1yc}PJ5*QC1BI|~*?1kW#U2c+a{x!CRl9Y>>cJ48~RVyNBluAxxRkBn}OSUzd; zH<(P_40hY2a>im<=W7(f3YO|_8SGEi$!1A!`wkeE3&OHnz9Kp6HHyXM*`~+!I%{#^ z>4w`*49{wfee&&OC$5VPJxR%m7GP~&&i>N!&a7s?)l!pa6#CS zxZP4tBSTCxl)*}!)+RBE;|(Sa6WQ|N5l&e{n6FFhu<&_|spaX(SW7^_(97Vb-`)hg z$;W0vM`ulv$@G4ToQxJlef*lAJA!^_Jk77CZqDFtjf=rmAM?y9d^stV_rD0~SV6kx zdqOBKA3RVnMjs;AqNfB%U`2Nr&((i6uev2PG4jr`DoZRZ;^A}OVd1)a0^Lu8;|#Eb zg^&4SuF9m{MOq7U_oo9seT>v^MGYWGMw7%e&+i<17yQ_?kaTPN zV=i3(nU^U2WtwECJ!dHc;r!z+wg+HC+Gh62AI1|J4ldSKUM`aODSs_Jf_6!D)t}NE zA}ofBVo$AI3<3@+bwB-P(rzedw0gj5nMT3_fumQ^zBIJBVKK+SM_O2x&xN1P63HwT zO)h;*cSpaD3!zJM_wTsJa@|YR^W3s$+9HZf{XUmtGnpvD& zbbbYn+lcC6s`h{pE6u}C>~f!Orz@dZx|A8R;;G#;BU)7TAqbo$efgU3RTYc-7yMT^ zG&I)_k<~bU8sxk9IBec4p-=J&6qs|zWHg_COI9stzCUiZDC8sYjPHPw-s0%fo+KycuTeR>ZzXR#OrtH4&^CV)lr^L{7tk_f7@CS$ba+~^qRpx%tLlZR z!WLHDDmL54GgN`!L?CpHi*%3jC&~Bb!I~}>olExEIA`(ikUb{z*GkD&4R0!^(J^0ht?3cV$^e;H@kpFB1|YR)a!Rkd_Ji8M3~t z3CaeR58sCNwtn)b$gf+xRPS(QRqqXQ2+G(OIjQvguzIz$)uLOGR5mhbH_%p?4%b$} zNPr$w__^6lS@BzXVzw47T4TsAp5V zd~4yR{oV0%cP+L_wy4|vdA}Iu*R|%OwF{F!l`6y?F=w)msyp7;somvrLv7C|e0ACH zElljOQz03q`Mr+*p1-1J)Ox9p_3$n$l75Me0N*gX;HO~xtof^O&Lv*$X4BEfBk7Fi zSr%oZz9pY#f&+?Ky&L4T$^}Orej%Jt!VwEXS#`nnuPtFx{kF#a3hm41XD+e9zXTS2 
za<#KG<@-mHG_HMXcl6Fv|DRo%QzF5(IsLQG z(|o?FM{nldzrAi}Mf>5r&P|nolD}x@YqF6}usojJ2Y2oFtWhKG%`=;NGM~H`lBwzL zWy|P*O|(wj7JTgM$0!cS-Tfv#rF zq=Iv&_K=c__Hy5GU@zgoj8t4Gl7LNBo5H~4QMIV=FMjVL6y9@LyS^=jhSHwMkYom~ zzYlFA(%tLKCYMN==>(Ffe&@Zz|vf;}73#ieINY$qNaR29W0| zU=p~sJ$Xc2nj~_Hm~H}?n1y;iVaoZ7HQgSfV{HCIVN?sMWdFwfmcA?S@kcOV-8WR< zf6-^4VB+bjIEH-2a1dw9&~XHB>c|`x>hgFcIuUUogIOadY2tAm;XA{z;L(RKT|jgE zSoN`lVVziK;h=fDP4I-~dv!%oWy{l%!;N{h^b4V>)cIps^>Q*+f;NVrlqct_%%0h* z@wh%)HVWd3W$67uedOO7^a7^D;hy}vyWHg06_@h-aW!JY>;k9{!H|_H^3Q|uV z8T8;_sv39Z-rW=)@}fOb45kZlB>NYHy1f{rRSFmcg2B1UU|~R*UDVrPmp~I)}_!Iopys{&RiB{K9Ttw{^bYeecES zH#E}3K4dnLdjq&3a4Vgv+0K*L zX~lKfa%2zuiVPA^lpe>(8KsD!Slat&(BKhbKe=eFyFPCCSwXL|_We)U@(OLZQOS=$ z+Lj34JUz?u^Slh1Z(-+1sUxoid)sfX{`y@+?ApH;yNPVVD+Goon_ z%})KG-r7%clwT^!`^F;Ci8}-HFWT@ZpZtAd=a<5*r{n{i8x95`D7lmOPQ452osGx8 zvkT}dR-x8^VJYw%5{he7$5_&*@U@T%DEX$-I>gJSCyB>$-&md9qZ`)*p8-p0eA`7! zHKlSoUPgcPbfrj)Bnw5*!1ra#yX5zDf{ke+(djB=Oi#bMJ*gFs5*j0_Hz9gK5YSyQ zcqoHoq+nJqxiRSHp`k>CYqV%{r(9$WZ^>7_ay!}Eo8yrt>tT&Kd*0#+o8_o&Vby@! 
z8uNSIar-%4-fZlwmewAky^eu;4jLswM|$GrcHbAmt%i^9QhUGSaQZA+*^8rP?15GP zp=w0IwPe6#OTuP$rR5m!y>ReErGAF@z%vx_$I%M+7U%Vt0K^&UtornP7fvZl`4IIb z!`mal*TsTR!ZVBJiE#V7;E6VK&djG$&APNDk?LWTSZ^_Dw8&glDqg*w^$>O%64G`P zP8#z)WmC{-AjI-e3_u~oC2zi<_FiIBAv3WqxVU;HyQzpp772IK|M(ME47>s23skrcS~{UsfgM zsGCi)2&8RWcIjYEN-lUv2gu%|Qj*)E3TBrem)+d5-T9fm_9;S{i{Ty3=y3&}Pfs>i zt=xfO%x~K=m-S-PVjKUf1Ml&iI%Xn9e5ns1zkBR>oU+B23Zh>e(qnq`;YEn5;mF`OkdIW_e>mR3C7gtg3yHG@h?~Bn{8E!fhDSRHtNkn zm$j3}Oq#}6le_JHuaE^S>cs36uG16U=I(T)twyDjO{Y^>3r~@2mB-^}e&u;4(t0@J z*J?K`Jht+(c(?3rfoh=D)kY8B?OgAF*?oL8LhI`;WYz-y138#eqCwa?>t2m70&1C7`}+J&*+gyYt1;TBoS>7RNca z$cMFp*#%4(b$Ye5xb(4*JEY)8w8;q3pimko{O0|VY__pPgU3BO^uin&J)xVopKWPJeLKr&8RXP1%G9Nd=yF{>N2*ADr+V>3OU{IYfPKn z0MJ2hia3e+1H9Gbtbx$*AH?m8n|8 zAYk}Jcr8Ppi^$zH7)K(1RCBl%PoLBz`%~r+QIvC2@LZfB$Ntrkc6M>#s4Pb6Ns%@C zNVHHctI|ssR*hCEWz}wX+TJ2=k*RD2eQC!1xFd#+d7ES(%ol9fZcDEfZ#!z(V@5NF z2sKx6GO?ttgnd0gR!wi9B`U2j*Jl*AED&?j$rhVlq3PHgdVk)JePOJjw)D!BNa7=k zffFs+O3T#M>OZe^G6$6vHu0!0y_RFyYHufr1Mx~!sr9)u9sAg=$2kr{jLTGbJ7O%J z#*aizSbgG%n;Ca9dq;NAX)&HxhDF4nP1S>QFe`b;53r7?%uMa-XI z)Y#=$W1HJF+)9Uk*1nyuMLQmk`&s1^={?peCv%K@fRnqiEXmVIgQZeP>3z!NLzX_Z z;ZsDIC=uYQw(;ucs49oqa(hyNQhd1Yp=Dr2?s6Hl=Mvpw8!7We_A%cf;kpWaej2QG z?6Sb{lbHBLLA4r_*s>V2b(^q10cHkwn1mTa0#c@qBEN(Y?^=~hQok$RGsup~aSZNM|Q8NaJBe~ix7VX>QXYWaF@<-a~oBfUU>8t85nOX3n>_C&(;VaAaBqeen&9~W#~G&Squ ztn#%V!eb2uZfNJ9injU+!aqs+c)}nDR<%hl#<5# z8Cw)8DnsJkm51GO=6+iCz4ax$M{D1|zI9mQ+7RY8CV7!NEKZ_e{ayWW&9QqqN@(e$ zQGvTgPdG!GhQb}9a;ZlCVNhSM3yoVoLsmmyQh8>o`*AzDW|Zjj;M3>oj| zP{kInXiw5`;(@wx*c`L>GqAk~jIt1yi6WiQscr#Dg@vaQQHfAQZOo zop=RFFS^m&wj(O^<#%6vnmvEy*Fc};YT3`J&$rIwm|v{bqRWQu;)QxcW}Tb4jF91s;vi9&yB6^Bf~WZK!0^5`vpPcep5nihtfT*=uWbQ5gyn5Jrsjh`(3bFlw2_{oX*i^tt=XPhST3 z3!=n^zwv)x9r=`r zEKNy@gRjDQ2_upIcyJ##2SwL?d;+o~bbs3qcq6|ui|ReSj4{(qeuhy*(8;g=_4Y%F zEz^*WrB~afXK1_Uk7{m1ksPk+XnSXe+oxhC(-__gZIc&88v53DCo5ca@)g_8d4{S| zeecK-k%>aXco|v_WkBJ<1=oovvbqgf=b#UTTJKXkJ)H^tX9lrfd-7lBxl;ZZ?H*RH zp$u)6{bky!jBf8Lo)A;?{XQj{@s?e<~k$0KIH7&1iuM}v1v#olc%a;8)emzl~N~_ym 
zevVKoD4eS~l}Lw&{hmu?yLkq__{D&`iqGH4Mh~S65g!7XkCArDWqjc_cH;FTyaS_| z^xs=%;zn~Gr=j2Rk3wY}voNQ5QPha^{X0sIWM#eLZeGra()|%71LPosS;^c^7fYo6 zM?*}EtRYc`yyt6Ay+p^QamMUrWiP)9qNdx%xfJ0{&#_4S+WjX>&vv)w;^Xa)eGyP@ zcW{EQ&JEk$WZyF6PVPj7b6)&PcVpI$BKZ@$I}yK2jXj`T%4ACSW}T&%PYZ1=<bX`J>&k)$sB1`iTs8fET)!C{tH zC*s;IB~`gal$%B^UQ^Uv@|C)w>LFrr{7YuN?Rh>FB{EOHe*aeFtp9A9;D?6Ub0-sa z{Ts0hUCNoiH76?~1vDJS&0NO z5Wru`$a`;n@W&)NlHa1^vP7mt9_~xr3r3BXM-25q00Vc>&C+Osr1 z+^MwbO~f8Exc7j9C&Im9g%&5tN22{_q;OZ<-9uGuuWFGCiNWxK6^4VXW+f5;vk zB=5&voF%mXnZd^AGM7$M79a8D?C_Klf)Yl?H{Ra)UY#_Nt@tT-0IiK ze<8ZPdLPxnjr5)Jh`qjB97a=cYiof1r6!hG6zfy~%gUQ=4ZD%FSzGu?rV@R8%#l45 z*X+oP5g%6gSrVGmzpt@c@p*ldyi|T$N*6vtj9au4%9U}S*HPgNRpwdhtyhF%R}<(+2mRr!NVhTQ>ccctjhec`cYPFpFD~igI}GQQBQ1+s6nUx z(R67vPYI(yjTjYUzX?x=oq{m2uANuj*%E0^H#)ArsK5I+h9%!joJ|4sLwB78yjOV9 zCYx~Hzt$hF-Cv*K{=?=|R2Zjfh4xg!{bn-to6YybDyuI)HU@vZxmH9vbJvUgx`j0$ z+)+j4MUT5xqd8CJB5iq8z2VXlPh!iL^VMPaJ=2|tKe-MDR|+(Fj?+a1hjK1*5>~Ia z?*8s5>h)$8lS#WX|D8t3<~!y!f)U3>%+#b*uQYb*Lql=p10^rHqto4%^w7YZlB1JrIh zzg0h#rL%oxR72ZykC`3U9ZTR{m^w=|e{9Fr5wf8dJta4JagAI}i_pfj7gv0wbNX_X z;ICY*rjNe$8aTgVPHrY^Pu8+_E@m%28%egH5u{;8Z zFVJ$B{j`<3ATE!pbzNM%Bc!;IC5!1kVvx@h!TDmh9v+Zc;(eq%}>efYa=5;AN2dBJ3O zy=4xmD!!$Lp76mcN*_z=MZJd9bymjNt?NGAOWwaWPF{SU$IMr|=VoL!k2WVQ?eA}H zm%rw9%5vSu4{TL;Fibt~nQ_Q-u|YK<{iB`!kgF(^nBU&lK*QO;cHD`x+^uCz?a4klWKowfP?Dav-2uchsmIYnKo;unL=0}X3bW{`e)y%r?f7=w?0Zw07&1-r)B_bACBHamV1siSPxNg~u2LBgPEdfXC$e7nRlPzKK;=Hv?4IBM+Kd za`bNaq7dzSo&zWJJhOy5jfF^yM(h zZS9-+d+e`IN^gj8TzFHao5Ofce2v4Xx%*XV`GW87ZZB0MErd!ui}J zEBoj>OHkeB*D>0_#Cs3N)U~aazp#OjyXl|W*guWF zdzA(lv19gs53No4AMY9YAJ#e-a;1bZ*T&qD1td(xD%;%3B^Aq^9~#+kd+i-F6>* z8Tqza@Q{Q^%HoTH{R`SSoj-o{Y$W*&abbloIwq}yb5ddRr0QW~X57BK!349OUHJTZ zcFO)g%Rr*+2LV2x)G`czTJy_V*AQ69f54%uw5dx8FWu`r!l9q~b^N)oW4i7KBmZUV z{;f#h1z}=RJwH zjF5L@%EQ!qfBeqxaMjvgN@bACm1lSw@>_*gDsJaOS`I%?Gg zHWl#A8^_t*|izPjw>#V!=ph&{`$3tOL`?*P!pz^(t z;J}+x*TBn5riROUdyL{sA_qtA3+|@U9RCoC+&a6$7kAt!xm}5vEFR`>ovo7?JLWA; zBK>5&FOxC6F-czm-EQ5G`tvfIqowd*_g~Xf-pK&7N(`VUP1Vji>7~VLQ6U 
z+Uc>K2rJ`O_DXc)v;$C0xb3ThHOri*Qvdu;rF`V#bi zU$i=QEmp|x@BhjiTS)eb=MQ{3LN{wy&CzE!S)CZqDC8{B>onmqfsr8J6&aZHPu#Ei zLSyrE`g+lu>i25J!Nh9gFLD`GJ;Fj0#U5t=C8Ae?qx_CPOH#NUwvN^WIp3=P{Mff= zUdSwsYxb$zz}JF(cOLbGuh}|N$K|JMj@#ig3}+Yq`~)kK3;i|)d;`6*t{rtr4SBi? zH=M=Gi6TGIuHBX3T_Mffg}s^b2&>A(#rqW{7s_;fL;G2aeWfXJA!&+%byeY4nN6h& zJOA{o#-Ra+)=pTmYWH80Z`ogSx~^*PG}zhLX3fg*wH){344%jKez^Q|Ef@Dz6b^q` z)V_klY0&m+bp3_VQH0#A=66jYhjpS$(c^=jue6urk)>bc499TzP9D&6IQ* zE!e0!OppYHN|xy*Rwqw%T`69#DO#^yOO$T>1Hc|t6!qRZO0DHPSlKYaNUgkst+u} zRcuWYs}I&6nh?(ZtMhkRPp*^e+`2y)xzVPgTW~dV&-(3+rf0RQRCe_0qx8-f9}UcT zaru5VGP1_;SDWcy*;p|;b7qyU;*bV^{!-C;QGLu#OGeUW zqgL^engQY)mDc9#xs~_MR8jTYEU@}Dloj$*HhB(|%<4T^i7>E>y$Jje6MkVA%iVx~ zE;UouG{YNxfH$(u%JFhP%H7AVxcZnSwx?I-Gh3yrlVA1vpsl`9LAYacX;$-$3U;wM z4R4s;pf(HRK6h;={$6}EIp0{`2RdoG6Mc_{vXAL9B2-#FV9n_sPnSYMerWMIFu8u!d3H=Z(5oLOSE#Esr0nbX?gR@74@o;5RfDH}PY z(sDGPHdUPT{X!eY&|aF>JX2#b_iH!FKE9ey!!e8Vg`$@mqGH(!uB}B9RIV*NRWJ`e zf3KztQ(`rDq*>K2J**%ry_B5p`6HUslS5wnKg4}?RNc?B8153>C3u3nTkznJ;O_43 z?hq`vyF+ky=i=_}?hd`=`+KkbqwQ&5U)#Co+{58~?(WR&=+1t2M$0o*E!<)mcuVUS zrWg^g-cr9EMx=@CJr$PU*E}!M8ueLf)f`Ng?-eYUX-r{^|4c4C@&Gs0#k7n{N8|YV z>q?xjCOjMXs;~VeC41_!SKYXp(>Rj-$o}_o{dY8OsBv>|+ME0`iLXrT1bBLj?dURR z?K2*puJPf3zVNH=j*;=c2fF7eq1D<24UcKPnytfhqutA|HAlOad;LGPjx}Cb)rYG# z_vG6fr^^QGyyDC|s%KZ4j2oJ+TT%gafpeS&t2J|ld|LBmNru>)oI1k$W!u?2lTEvO z{nkMFC#t9&e(RCmYyNhuvgBY<)M_;!eIDO*8mFXXpQcrFgN6r?mQ|aN8r5FbsZigl z+cS($!PAc7`&OamR;04;+a9lkgRy6|%xN+}Sst(HdT+(l@H*T~zwp{Ky4S3C{4|@0 z?Beq6K22h-uHQ?RG3$Jx8rXU@a^{}x&YA_Qg>i_Mo&C)wQ`NO~xNN0nbOrs|gKxo+ zOo`BNnuYt@V{Y6P08ZU)bh98B_ma2FG^L#vqkw(L)NMtH&iuJ@slmMM@G$khc(vYD z@n(fvn(Eh8u&{PU>u38<+AGFZk28AH$F*SMon%A$n5%V@E0;vd}@ z538Pgfl2clm^D|a=Fk47oeo;sW>zkD@rUr_rS}0cSDx(Pf7bU0>$ly#G!H0phRdFG zsceqa$6FT49yW_F(maiwlkgH;diXn@*;AiWq^3R0OPlW8s5|Y0=u+BixJRY$v62am zAB?vEw5-(anm8!p4*);%~BIZyQshnkz zQgB$hW@_ctxI!}=@{sY(_4mA|X{zcXx_)2E>SPC=Gf+{I%Z+~iD7jPItCzifttjW0 z1saXxOHAtETKhjuP5hk%JaTNKo0-OqTI@D4s?Hz-9QA zkSdZ9GTx(TP~qt?kOzoc!{Xsev}D}oAwg1n#5ms6JB%v?RN2*l^{q+iFtYA&7 
zw1PsQbupa}JUE{JRMt9}L>9RJqrUV-P5xk^Y8UgDjqQ6v(R$$ZlY;$g+%pMSk37V1 zf$7uQl3qfv-;!lE69T^+o0t=_c}n)mkS!sS1_G~1kA6isK$#D6yluKq1$-A`nq_oz zmKXV6eIqz2%f90UAqH| ztOA^=wzUHkVRtptqBe2KUvW3b!E5UQSL|~3XCz+x_*Yg}LC8x}OJVDWu-^QaYy092 z(}jhgondM4o9;wuiY{jyLEMec1NS<6MuSl ziMpZxX}5VerR!~+O%QpYg2(Trw%#(>H>dQ%af#q`vgF7ncdZ5gmglgygLJv3W?tlF z?b4B~``d+MKe=7=*T%hD-EHD>s+iV}EeAfY7w*{X#MHLIm^8{jT*q&v4zzI&e1o1h zfZGd8tFn;fd-d8SQH-ZT(O&2jDxrIBnx$h)@gVkAi^ba5D(9~%?@7#(D2qH8{_;9j zjT47hKCJ;K^MePx6K-5!D8)m{2^|sr+Ioz0dTCdFn>pRJjq|j`RebYcT2!8$k9N$? zCt+v*m}*{RnuNc9okVd&cqe3kmd^9>R_A@kyC?82!y)arUF|3rPsm{sk{I9W5Gn9U z->a-A@ZHfL1dF{eK{%)=F%l+azSU6IsK8!pK=>n{MMLrz^l}p}Nk|%gRPDAHHd^6m zmcXGTN9hcHveZL6ub|l{sJ3Zya=#677#4ZsltIhOMtLI1sv-y z{I?hhsQ&X@uPBh1e;^Xf@PBte3EYw4(a}*BR@O-6e+?*H(ZmQjybf#^I8ciJFSO#i zfkpxNt|dX0*}vgOkqd105M|c(p&qt$|3Y;zKEJE=AH4n_5%Axv(f^bqsd<0OR%SW0 z`ab=`A{?SGs`~2QQodKI{J!v&5#=ENB6ZtzO;fJf_U5?su7qAKFLqkDH{`yzKHK+xN@!-xvGJdNe-*(_{-r4DPWePT< zFuwAnEfeZLI#9X5gq?vjm`9&Bcl`!6eN+g8w)gNWLS{HcX(y8rcA>=~_|Z?d^%lB4zk2^4D;r0o7`f5o3Y@CiF_hnmBPU%vYE2$XLQ z-ZV}%$NfDh(k+Ug&-EGECXjw=`yX914l6^}(RU3osoY|S|FS4a2WS5+a6PGu{zF*U z08Z}1d`)Li6IF+Gro+-vLbQCgfnD-Kjx7AlFmo|VCq5E&R=HCNtzCxH|5+3&uRuf! 
zLBL(JH)OKz;!IVX9zlCX9%0d{D3V>RPI_z#+cR*veCeo)X%(Gx0#Z#YjDT$U@$2T` z_~tL7ur=j5J%%VVG7BI_2|qYGcw^vmPuTd#cVRT3O|t-YOWzZkDX5;7HLn6z$}Ymr z&G+75bEEQNY0M8seIAmufB^jlc62)1D_Fz}(HDpUT7+k%0#*mTeGcw}KwYOuw$4}- zj!-?V&yoq{fhcrBnzu{NVUNZ^Cuj`rFehY8P&-d8j)@X;DD+_trTB5=uCUQDklKo3 zkT|`e7O6LS>XRGFh=6J02AMo?a4R-_|N7c zH`0ql;gcuLgk$LuT5Jm-?bwqxOSn|3Y%AR#i);O{xCUxNuRQ2G!m`M7J;_8}=CYmU z>YFdijApWmx|`a4gy`lZGejXVikui$Dm;VVi*1B#Z+7a3IdbK+efsyXr#0m+?rp}` zg(e9$oRBL4KtY_2`4GgxTHW~0Zj8xkf8&jf`RuMa?S?yYm-kBD-J=jo^iEpm&TLQR zx*|qMt>7o4!C!;cX>|w@!waK11S7j+On(uxYL+S45%m@Q8V|HD(EteiN$j zw8mM*xG6H<>?1tJ|IYT|W0KkI<^+C-n0y+0$ppSOCcDa3qz=Vdyg6N??4!SaF~t-~ zO*BIo_~PD8+47xu1cebqMf|S%psUHxIJ=(i&1%UD@+Um7FN@%?o)7o>(}*hCl2tAS z!#t7y()Wum)*lO~_?KI5HeHDD`cjpFDK6j+bv-SWs$Du0(4a&Rub)gHJh;qB{{1Wf zV??dIvS3f?L_O5h$!r^r()H0(Nl4)8pHxkD0-yS>VL~6~^?*)e-9s}Qgqq@}7|1o6 zi#1sCQ@IALwI|M61X9o6At8f5az4CB0i-^#@u{fneR}YnpkM!zx9?J^Qi(hVyklK7 zCS#F`5)Ssy&C$xpzG#fc3{Rz!;?Iv|_gWoTVRM3S1D&kBS-QFk(4z2_ZeU{wG5){- z4Zcq5XaOaN7Z9jot2O_uNYi0WRcL3Byh=X#B}rv>SkA z|IGS_Y*Lv|^W6C*aYDG+XAmx=P|L$=NC9Amx!Bdyq;uv$Df0ux2z(}-Mz86hHbQr1 zjk{>kC{5a(KaCK>oZ3Cp?&wKK>=27T9H$g~7st=!&`UExN??l?4?taTuDS8_cPKj^ zV=tQKV%L3nlZfxVJThF_URW<~q%&;&N#}k5NM+M?pp>CW{Z(T4_PYho7~?$6Hc#9k z|8eKM2(M+|??u1L(no!p7JWb$8ATr|xEaFU7NQoU=eA3N(K;g9GUFq#(SR{j0WoXa zhdtQzyg*48<0Kz|gV=_Qj=~?j_+|}K;qa{z87y6CW0ihvcx=%>egL$_8ry19{~#bG zH@ansZPceMUrrNp?IdK^99qJERyj{&kS?ee|Crv9u90~Ipqzbl+BIKy&8k8)g#8r& zoR5O_BZy<7g`gLZTud9BWf*c~FfgF=*fTkOP2_XbP!5gQvpF0}Xo=V9 zZJLA!3}KRY{<^#ZXrfkkrv4imYB3wB;%KZ~So;eEmgSs&9kd61i0q2|Xf%BtY7Mzo zKBwlA7?=#7=H?62elV3PjvO|25LW2mHLZPJlc>wiPI)M5qA-FNQ6{^roi%J`un7kv z?7tI%{k7(`^7p6?{V*o>M7}R+A)!cv-Yia8>K({jr)apF3S}$3hjLdK7fbcJ1_GD! z*^NGbC~c6wNgMXo{uqoDhsP)jT6)s8s3epjh~lNz=UutBRB=Hz*b#=B++K)z{h;&gfH&7a%ciah%NT$URKf6F7y;zstdD4{kfbVA? 
zncLiKV6y@BhY7Y=^uvs0Vsbn-T)YufZD)#)H;AmL| z#mWlF2s(_~R6!m&eVc{ffcKlUvScpPn?ot2@xLJO>z5UBGB~5odfTA#Up3-*5!HpEN=>tfwOqv%Zgf*`tKy@e#FTa-;)p~i zpqRd?g%J>Hco*MJ*)-xPk+Kb2GN?2j)5z7RkAWng2nNWd& z_`w;w&EH-7$4Qtt-kOnfw=5Yg8MRToD9O0ZxSRZM^SvkL-C&04A;O|YMQB|E@8`Y0 zMTGmU)-*4Ul()y{8EKi`g4`8(XZ&eUn(p>iS^r$_1k6$S11IQVJg?~5UzSDXDiEgV z$c{2~m#&U7jk%n3R!om&bx<5-_hzyk<>rThYfP~Y!HoLf+}4{&{EeL+3K8`wq+l_G3cJ2--C zV<6@PpZZp(aK@x^=%4roD>~~(jd7LW(__gBj}{Y(;-mqNy#au7F1hu7;;m|h^?E%= zQ=tO4Q}(6C2$4slAQ2k$BR_`r+Zp@Lal=x(KxAYrFr+W{117Jhn?HD|@K)}7P=SDJ z*cG1%d@Ogi8+;lEuGd82Iyf~yycKlbD!7RIrh(>7y-}r4PT(gVILbRD&n<@5=nkY6 z!HsAz!(=bq%)t4tJ%xrZIAv}N&DbqU0%yOnZ0Yg}z`7>?230{<>n1*JFF@F=B5?N2@j5IIb)|Z>0ONkp#uWj|+VtRlv`NsmQV^!< zGCSvtfCMKS`hc|5^85Kawwd$^=>RGEKT+oR(a*|NFEYo5{X~NO^jVxRqDTh&YSS&p zS=n>-8F;M?->N}tg*v7dX@@-X2HECe!LOkk;p#V_0*W*V)=nj^%i;z84}wUo zvR(2AdU~2K)m90qtUkVjMXVhlx5xrkUpIVe@v>g^tC(gA8?O^DrtZ7xTUC*}hiB*B zX~+G~x1%jfMPxe0)jCesH}i=C&npYD?Ak#_&i~LIcrM*_w|j$TTBgLnOTOb?!RL$+ z6#2)gXFmm4cHASxm)_aBIDethM*EYQxVbUS5QMo$5~mGfrX8@wO*{%R)n%#TTit@7 z#AiV}S5KYuQ;Cgc5nXMcr}aj}1b!61tkPhAQ*IuSXom-aoE;fpAK!U}x?#Cc$M}Mc z6|+zs2K{Eujc|KOrxs~*Ku?(`&J+o(Orjg03y7K>1R|ShM|N2Bu(_C$;=5zw{9kESha;_^3kTkybipWB(?94!c)hhp|DL~i{(h$ zD)$SvLXfG_i%&Xk=ri1@^P~u&18h0Mg`b* zFjNN+eB>ciRp;Kj;^f>xJ+y-hI)Z*?FRkol_V_01pVNDOPV)wfQUXJwdc&kanB_C1 z1<##oP9AfGsTSJ3A_!`eMVJvbe8(tL18>YS;Z1vgK2R%(K;^hbV7HmJFt1`r1g%rgwqo_zsw_7zsf5#dr6jN> z=`r%rbr&GG&~C{YyaAd>I3|tytXbq$(xwuwtc}VnV(R|ZZz_+@_4AGw#~lH8`Bg{o zf7lbRRjZJx)ZQIl@VsM+Nj&RVzd90$u^G_gW7G61!#ZZGtt}~MbIdfbw;)8wjmkd_ zWOXdhbj7geBH;U%w7p;}eHwI) zGCBoTslJVe(42{qplQG~yb8g2M^vMGOLR7mKkT}obfxX{6t@B2nM^6L zh%3^sJmJ*I6Bhoode1)7W;(dEc8ECqrIww0q2khF(&n<=N7=y%rAV#6vp zt!m^%A_(nIE;vq$Q1HxQI=2+&@5XPKGIIz3`1)UOmZ4IZU6`d%McN=Qh!b+K##%@O zSe{&)FXqwi$kB+LA`=ss-^ZaIqQ0$CiT1*25%zx@s`sU6{*1lttNK3eM0z)Uw~7}B_8N_8Mlp4BY&*PR1BK)Pp5TRRcO^89zd={^c6A=yeBPD4nWu*y z^O-db^4qp=9-V)!Hxaf9eFXUTjDVZ_?7LlGTX12hT%a_;Wz+<-|j_8SsOTjqeTnQ5qsc?Im|laW}F&C$kC;* 
zE&G$gVVvmOf}$XVoO^Vq*%xk`Se|ceM@3)5d%@ZngZjh|!^b-qi~Og4ek0(t(aNDa z{a)8m;)$pAIsf4`j?_E{(K&6&w9^>@_!cnK)Cip~0Bn$^UO{lPs+k4@-=DeJDJ(*D zq!hfIBLYn2iqhs{J-4!D5kpQ6M)b1_e3qqub=Tko%hBzR8KiwJBA4W6h8+W8dZV50 zDd^ATAaH!};R&)Tup)dKX61ryox_o8RU9)0O-N=kNY!g9%dG6(`R&l!bb~Zu3mzzW z_pGBR zUrEcJ@Td!`V~`$?KlAtJbX{L8H9%dpiJHAY;`73=Ae{&Cs(}BLRhONopZ)bj5F$l= zC)kF7rF6D~%{jKbPJ|sX?ulHcZp1pf@=Tw*Y<)exVZexvr8YWt_VU@G-iMTQ1|1J3 zEhv|Jh6s4)(K&HiBX|NWxQ5(|YteJb>;R(*q~onzG$IG_JM|oqzzCg5BXTR0PC&{& zWcD2~I=T-je8M|O*l{@keJDA4F0uaW!V!2jr~@61 zN_%@&oKG_=xeuu&qJg^F4Lj>08 zcm#`{5Pn43GRvsT6hHaf+EF^Ik&b5$`J!VvunVdeW!m=K>kQm)pCgF#7dfv9!Lco6lblUEELvsn`E)a#)$XrWSos*a5 zE1O{g7&gAc*Uk^{5gylndmSfc1$SG{W*)O6VUprA;r1?v|2*vrwrff@Ve%R}*lSM{ zxK%w7?#5wnbFu5`XbX18*sQjU@yLKTYKvjvn~;#52u|(o8ct{x9wPPiBjpqw0qxTc zZmKEwZF+hVo_$5YodHM!(@=w91z>4_3F<7BxF;c_{1MAW%Jbo)>{!6^v4@q5;bXs? zMGJmVFFIRR*)|vWQ4fvz^UrTap#?SJ|2~`8Ypx0MPhuA{B>W#JLh{Cc)Ym+U3|HYa zBmY)Olcij;(j1S4&*crwB!+MouRfB|27c^K#u9xsgRS3It#jm`Vup;m!FT!H>~ij@ zm^||h4wF!mWvZk;rtv$ZFQ z7uR&I0+EWaes*YUh1WURjq*_Hs1T{oumz?mxtfoIifDMV|0-xA7_MpRA1h_c1Gj1Z z!he2QHkQK97Qo;~@TH=GHJO@{2d$p} zV{h{>{O6a1|9d(0|51(kKWRWR&0{d2ybKCU^u{Fy)Vwn`NepnWI^0T4y>cKj?c-v5 zR#rloy3vc^=r#JBo@_U7iMj6G=+)vLpr=a~k-$~=+r8Wl?hn#}6?wzjZ z?$tS$*}Ph?$6vASHR4~2!{ggwrTOmOtK#O?Y-vhELc|e1)QiHGW2TkSeYw)~P&ohfVOKDa84!venw&hE(lwet#6k!>bvYfn zlhlCSI6nAmz(8`$;np6i{sAO#{GAkeVAWzU&j1yHOoyXP zdCU(T?i#FBo;ir3WsIyk+<0SDg2YgXWGIwkHNDM2qqV!mT5D5bK<*gN1wP%y9>Hkz zd#??dKo-_JH1EqjC;xfcFJ9csvhL)au|sz{^C`EI@aBF9o^S;!8Y#U3G`hC6E<8%o zIT$i%3I5wie{W1tHKs*)qv&MI=#*G-J=7bg)CYBU z6VH7($HJDwYIg37U7a^n=&@d5CVNYNn^7sFwXf`}ZJ#V!x))2%Q_*C#)2yIe--~`p z%6o30*{pH+yY?Yfw2W1GQ4;IQ4$?W=Y98E6RHZJhYh>$J?H#twIVoB> z*y;cAKdzAMWa#oPPJQP)+)j6P3tn&Q)ZJ*C6>+jcEoS!hk#N`dd18NILEw5w{~4ru zoxTH*?2pR`=}87@qwtL)CO`zmm4P%d3$KYR!=X`$Ug3`txPRu8$%Nf%ofO%O#c?qb zyhtCx=lulp9Lcc{ATFHfa!iIAn{b}RV>o=u3Pz@TDCRB&eE^m`{+?cl`!OxSUxUdh zApwDkXw|wZa>|`3HmS@8I!?k)UYh2j_e0WnEVHZCQ1A$;R`KU78mWo|v7sQGXweL 
znl13;xVsP>FqSGf7K^U7ez4J@)$~nyW@@q#fP|Ag`-j*UW~x@&)u@qveLxwG)+AWBotiA zpVADN;L~>2?UHlfQOFtqfeq)^r{A-R?Lq{SgalS z1~^bk64}I|T`ni3?-Mul_64q%x7utK1BH*)b4#uXy5bS$1o>5vAWJ47oo#ii0e zxo-E#9QE4=*rTA}pndp{(iD+1TJRaHraFY0K7WX=tchYwka zI@&y&41-P7k@<}EnLO*GmiM9%LvLEmTqmT0lp6q9-NiOK>Z)_&DFPge#`G191pEfm z8b@j!o1q$^%B;sd`hj+=~=U5CzdCZ2oC2z&CDtG0E;z9{6FVbW0&wr|@YW;%enB zh_0mdUjF>K(rb%UuK+Teb#C&5}$8+ z7(9*geH-N!G;?mS-HM-;;^G|X6C<>VONSz(i1ieM<21IE_Ct$H>gdM7d~s@99JBWF z46Gm&&)|;R^-k$NnLvuY)&z`SUiWj?r!X?NCRm7OhqGWoa5tN}btGz$`U<$4BBW#>QY7C*TCHDKZLJ<4LtFNwjG#h3~xW)Rr65*KON-Fy((mUDJ zQ_Rrte2G~MU!iHU54z2WjZIQM+Ue%zYv#MSZqatwZXA;sO0VbJq*Zp&&&(WY^Wh(~ z;eTQX-5~Bfrv?XTqQRXJ3;{Y;B~IyIde#cA=g=`}s{%DGfsY$QW8q2@`>MjP(6lTM zmE}zsI~ImNWj<~d=7#O02yRyITy&oKQ@7q8)2$W5$GWT{?5!UbR1sH?zsRv>Lp6<{ zN?084efyn>ywv5I8~50-WZs81L}$Cja~l5cs=@A`CLAYO>3p$wtb71NX16-}WyS6p z;W*;)kN3Zw1>p6+ZA#$ec<=u0yJR|#V5N#iIN`+psn@E4hiew9Ltx9Yv5ZwmHFG~v>Fnyf2VN%6`g-G>ieGQ7k@RUxRHnJIE;+KXF zo;G0j1)yTpSau1{4~@M6_i9@v2hNcw;#{?hI8DR+SJV-XyztcYl$eUgw3JwO>umNMt8wAUvUc%H1@y7@+``pDZ$VBBR#i6m*b}^Zz zrI+S6^0QcE;1A85Z4Kc7u@wU@A7TGARA}q@0KXhniM%DVcK`EdXg>p=46P40seTx` zQ+T^|e$=-f#4fP z$ry6(kY$db)ewSuWSPWkc02K++p29z9J^htDhe(Q&7&w5``D%J(bh#+(o3*R^+{a{ z@9N82pc5U7D!sq7GRG|u#aM?pDZ8c1a_w+NNDaHdtn6};C9E{JW8hrIjqf^}jsZFI z@@6id)^LgNMdryyo${041~FoXxP&|9W`?|r;g^psPeKL2SJU5<5a%Q zW$=PI#Y7-mK(vt^D;*f9$svC&hIh0jj*RLsIGh5tUc>WmP{$fqFidw8t}rA6Dy}mp zrjT?$$1o*APO&*!Gmn^}!9_%%)I{sD(r+IGQ1FuR;P~0~>rpg8_3ZV0yw(Wzfa3k^i zji*6Fb+UP5`m7^Caa&odz(3g1Z06?9@to;p^)lT~air8-1b^-@$@N2Iiw1g)X8Ab7 z3O5JLB2E&k`7#k(DbhN}*EyrBIJ^h84YkqAO2Ng_ZWY5Ul?3w|y3wXOcx9KgCD6n( z*PK`!>37}XD0OWDpm!Qejtf;H8MGHZ-rawu&5OK*$o@Fhi|%*R&`IS_89gT_u)2Ne zQti8!3$?4L22XIo>T;e$ul@Zgg=Hrr^6gQHtj}k}ZgV|h!0Log$hrMp4!Yg?dCpzV zDfH^}Xt8!}|6={eA-cbuZKvm~OfGSn!Chq`Y8~P}Z zHngm4I_z0zw;Tv_I{TxvxO_;Z$4DBi2AV7ThubcHRZ?osvY^;C;Z{Q{BN0CFB%HNz zo?YZvh7ev7*G34V{6O!w#8fvuq>9-~FJ!M+(dcTL!F+he?-Nr!yMz|fInbhRzlP>} zpksX9P29ww4GQk1#mVx|-}A_L<`=QpOg;I>64z&i%<&BF=&d(~qz_w~l% 
z+ljKu2T+dKKt&7UVr99T^2>wg`U%yJTTDw3Cmd`cay@|u!U8_hE+$Ba*0sJsJ=ymS zsj+T!xOJEd(N$vVCRE#iK3-S=ofD&bQF=ah9@#f$`k0_d33+HPlUn`L_;*BaKLZUC_cxN2D|B6BxU;h3} zTJcRFdvN-vwi$wEDKD$U|v3x1{VNE_oNotk5}O(Ey?KT6>|Vq)rH)pQ0=zk4a7tHu8;oRPnsUlYQdw zyn!pvJput9Qd0K+C`u>8xixdX;;AFrQEASU^jdbNePpkYu zMIpOSdf;|DG=tx|kEEGEfZC$xzhZ0yMYUNV89b{=$Mt)mKjEPPSgz#y$UMO5*9?+jT} z=6Lc)!VNBoIVIT9{oZT$&hKvhywdMT;r7;yw`!6F3PuJVLoymo0v?(zlH4O+O(Pa& z5!pX~>B5t2ux4k^Tnw0*nVBnZu5&aufB$IZyEo#Iv^qLkP+jAiOtyXL!mHI+Z!u6! z!%3qSk0V}7#Z*f*1tS=(1CDsJ>;)nYMl25I5wnx$4eZM2E1KD`&$-V7{BX{m9aO6= zM{2>%nR}cwhRHpH^#Oiij;wbUUsF9`F zZ?1g8Ry@0|M{LI@0JEq}y?Aka0`m9HucKdnXPZFy@e9qg6N|w6m-nZ(sT8SwV@MBiQ!;oAi%~L$iIR?ti}E^Losk;Fu2TPIEPS#o6v>!qOhr=h9I+QGyHLHq1K?%%Lu!nsog9!S+MH*i!g2xP@L9UX3Q5}TA2H3*BXm49SXMRSo>&hnRZcU}wLBvu2j*^fDB zo)B)eENwWU1hdqhG0D`@wPNK;Cb8cc`r6~XwsWRt;F!255B_>_^jm|N0FKF_R_?R1 z@C!nQ;@36(6(%zc(x1>dzkIDl59wDEBQwOFh(@G-jzm-CK#dqo+D$meW&rglcpzgN zXBgN6g)ozi32J*Dtat3GezlFY-_Z(rpOyz)oSDyJylO(0_ zWmWN5g$BUgtADKCyd;Mv8w^i#Ffmg;fr?TUM^PLtHBz#ZcN2F*3YCFj$|aiG0rfuarqE z43&q-NK^1?;fJhm#;s+0nAd>#Q{1Iy={4QaqHprpFC#FMrqIIS#me@1cingm0j|`N zTlJaQ?Wax^ntK%n5`;)7)T;JVUBt%iw9NK30&|$K{5cboZEIRiQ^dBpz5u$N#@{gU zI7?ZCQyka)kc^_G{fH*Ya(Pt@}Tq8K@g4a5pYH?il%Xv(-r^`2tn!ce@8p z#4EPmET3f({@}qCM*Uu+2y@_>2L$?p4M9Tt@#$s8!6vE?CG+SfvaKy@U`6!-q{AP& zV%6LB_WyK*P4Vf*YS~p0MEjvm67my3&wCqr#2Z_UYaH?d8)EP{`5ROH)p$4wZFOBY zhVSQ2@^-Z0msrdu-h>6_vq>=GwaCz>m0tBa%(r$yp}-=ahEe67xTg%Yy*UQV{i7=P zSu!(@w=2%J13YcMyRW>*O&ap_^(EYpD59Z8Y$Yu-Zd93AE~ZG%D`?IuUZ%9B>Xw@7 zjmfiqx_0~S_KOiHVQOt?bD4yS3YH`lKY}-dR(vLj|6;Lg_bI>MT5fmVuaj zCi?)eZ7M$RYN_#fgU;uHI`24+S25qZmf(D&L=*E)uOcrd(uLwv(4hXYO?{n4$*ezPC0!THLbj&3z zj0@UV9G@hsR~jw}?i744GYyj#1F;16)Pnv*Q3(Zy2S#@@5%e$-r1Ds=jWAwa6X5QB zh&7>m)W2AX`vtu+{K%#KPIUwh^IGG%Li#E~^hG||=C(;|r^Yu@w;cL~3 z7xW-Hh{KSdXG@392>sgA$r8;rrh#{JpES#l$CEg*@XW-%tig@wT@{8KM48_pKbJbe zhnwO#!%JySHo1;eT*W&{w6*jecfc~(e9vzaD#1x#99zt@+7o{v5vyLpwzHtnQvLk$ zEu=!laLH+IHAeEWer1Hot7rT$`&8Bi4~@F@kvERi*#G9{>FYTR)ZilE^{d!cIezg9 
z(v@CP#QQYK;Q;Fs-#IBR3JR$G5Xqx_R4BFU)ad80bF_C3n^_1=c4CWTYJk=h^3q_O z7e9q2aEk@Nt6dFndQ@n9nY(Z8&YTqIP3PD;T zE(AYX?iFkSNpx7xR5ugz8^{-86uNxMFmlzmND(7{LRX-Zb(!gw7}?xBfrlr|1XSi-GvuyL#}uhXCWYE?zZ3TP(7Vjz zX(Kj`#>LqD?{i+I7YK$Qiv;wUB+Xqj<;16ABNbV0_1KO)$qs^XA$_mHBi)+I@K9)x z!gF`PhNc%659k~J_yFLP?|(bf=C_YdKOcD<)Wmwyrcx37oQu779C zhq(m8@V}G&mv$ia{}s`H(Cx`Z7?-D*_K}kF<~gnu7iJLD#*Xd|d_;&`p(e9^+iNOZm7?&t^&1)H#*H#qfNhc&qqLrv z_qB^Qiip}X_zfE@Kyxh4_RM(8otB7%mtL6D?o)LrLLz-K-a7U0&AUiaMZ{Qs;#HD+ z?3sUFJ6Am6B*%5I*9&BLD!;0cndvt3l|YicG5J}0ZSTXXXvI9rg>&%@lF+7TLX&7W zayaU#5GdE#BAKXJ?fjBGvVzP#(rpaLC7Rb#iw6WNC~TEuYJM_UM|*>s^|J8wg}L?rcB%^h9#e07a?OnDBDqTG;Jrb zXiq|U0>5gOse1o(Y;|{PlggkO26*lOQiEy>Zmn<$UfZD}|Uwsb;h%iWQG^}jsGhzz`FW8K3N;N(* zi`r4^ApXRSrTf4j)e%o&Tj|v1$h!{c6b!#Sr)9og&MJNOF{ZK(f-$QJFj5Ds)w7>+ zoXqU4uVQhrj(M&Q%G5 zPQL*C7@bO1YcwGfz`+!=W^xglB}(2MB7Skm9)Dae)FED_`b|V{E;X1U4wK{B4(pUTannE7|z>khp|oBD-#5y+gs6Z-xa3Hc8$(GG^MyegYb1=`mI8SCq4xX(HuA z&RRe!{|I|KsA4Yvq9Wmrh7q#|D(|?MOPMyK?TNKcU)>3b$Vk>9lFkr`Lt8K0(eH5fM-d zIq)7Cp9D)v0kD>}rpRiQuzsTYs9g~g2ZwXfxQ_{^Zs80s$BbQXjx4^abE5gWt+fr= z9|;n5r8k<-q*}F&vt6~Dp$BG+OD;GoiHG3&i-Fa)nKG%58|Wn7Dzmq|v}rCF`K+G2 zwv;^+j0`x@sO9}6x`YQc>c}K-)Ak2mRAPy2QiMd1c+)EMU=z{>ye}n`7vPcG0-^qA z-tC&9c6yS5a%^}I9%|mJwHu(f+bdvpcfF7qbjZ2fi-n2R#MC~5a4wRpZkPl573&8+ zmOVW}_UP6A^`xDE&!MNdJ-w}kmnUP7!O!-*dXU0*CgB0j$f#bXnmQ*7=jlwP_g+fk zO+A|5uQuG8Wj_a6bXnJLZ$R(WvCQk^1Y88R;k`c*zYU-~$fx{Sb^Cg9&ts74ywFeR zewu&n@$h|iMd3Hji)j4ACb8xpo=}6=M{u)y%E*`YsQvo%5ti8g3X~9D4BbZUZyj~r z?)1Cj%D2#V#y*PPxH;P%Ne@yfp@GBain-JgWF-cQzi}SIVP1uBFOlIRJX^UJf45+$ z38#i0DvXN!kZJvtCsLjCGQ^bI@YUQwN)h{Inqq692VqrxXE?HYxQv|L$Eh}yp7*@) zJt8yNo2@f&>Tzch}(V?iSqL-Q8V-LvU%_-CcsE zp>cP2cZbv8-e;e4?-+O7_XqT=)vIdGXV!vqdC87pZUF7mo|0wEl{>va3*Ok}v>MQN z`ZT}SVo}(#^mtGH>NVA;qQDYMuloa z$vf^oOVz#8zSq1b%UdT0HPyKK`Gka@EXH2e66Pm~X4%35s#nIC>ntgWz=3O$t# zPiduSiUkIXzNn|x!%)GV1j=+CumG6SMM`=V@=Fa&-J2SyEV=Ra2bChC0N7;~B6kEy z34RO*!N9p-gGiE2&p@NdXcp>6H_tc zt1-xexc>cse@qrZb0sWK#l!@9s1+jhzF@rqd=FZV{fC90Hosh95n|p2X+Z}ItJSB^ 
z)L(Kqk_C0ljrEHjD^Yf~p~yB3NUWmneA!1)mbX?`8T=-{9$TU%(nZY0V)%R22m11X zrliBlC=E|BS<_OVw_Z9P|329LwEsz@TnPwr5$^EzErVYQAD<}_-nr+|d#Qqo8e=fv zA~HSl%$3z>0f@|Hnftxe|5}|-NfjD%jFgPP(AFL*SWUK1?#S33;B+a9sXncvBsG=P zkSa&<1}vm!OfaRb?k%%_)s_$UbX`)AZ4BIN!$zQN)NmWFsqK6g2gKYwnJ_)?vY`QY zF8@lhhs#$su^}Y1O8b?ux{r-ax%kJ7S=IbGP55%M{euepinO7NGI{i9BrF2a4rQH> z+m0H&>_1-NRe%j2KZ$2wnyxlUM~2a117|Cu8?INnUtdV1bj5vvebrklOw#P|LnmeP<^3Xge) zLi;obkBZm(TUwAUurG=ReO9XTbc01t>EAYJsuIb_Uzwn1*DD|E!dTXfWfQ|kwy9?{ zK{F~kcAdn>Sv%S&?o2lZ>u@+tpU@$i?Illv3R&J8Wr4(9w4uGHEm0|lCWLMkKq~f` zCsNdINtYRo=^^PlT$@=OrRit3E_>z`Qf?KyWJFZWoZj78frrFtK=;FkK}@4@VDM2T zW*u6Uh~I*32avY7e`lW?)vBJp#^Ny4@ZB{kP-vB?>1yAFL*=t$(AJ|!25WJ-Dy~-7 z!%A+EGu5J`v;jE*3B!!y{RNe^8dsDG{VyaWV%8pb8@|Pxqe{I%%Gpm?5^o>BvL54C zxt{VuqFAP>4jztkHQIL5VGU0zWQ72Fq02NVxp$S^2f?VRRaL{YS6VfMZK(QNrSNzz zxew1MpyHQ(W#2pB@+vBhgPyUE3L12FFH#9+Vl4ypPnEoYM+9D)N zL_rKN^QUSS>2hmz zkwky2{KP`zJsQPGfAEr%(c_R~$piopX9yjoyPKz_V1yW%9&X@g>i&tu)>mPAla8Ee z33d~(C;^mk(fA&qCKWfxV#HF*3&#=pQAh=pw3~09xtvLaKpKX7GRIXVq^wW#l-}&o zQ(s_V9>_1AxVckzVSH5<=|nNG*VUJRFE$4X0y-9RSB`dubF{s=E{VWRKBrVzv z?MyX_2-%6|=;~z>KK6LYdY+4tsKX5z3ju@UHyw?cfYN4_tOE@sshU#z(z6NVv!DnI4E^THD&`eN{HmZ&+Sqv7vT zFhZu-t75_j7WvZ$=`Q=Es+~xdh!5-1tc21bM}JS6i8W=PVylP`UeQGm`j6?q49;$) z#gShJNMEe^If%1tUf%JsiOkv`a-&giUG|y){fxYHz7}C1*SM$!V$l(D7S2Ywbr-2) zA)9)Di~6h{0~7mBL2BnP`_I7$GqDhMEIzxoC@My3bbE{Et)!?6c(GS@82K>n*nv}r z^rdpXR!K3K+?zsQ>E5Fq z$A4##>MK1e(NXB>X6`A7`uF+J&@d5b1F2Cylc_^;kX#nku{%vfHA_xa6Y>Sq6nlEb zj8(jxwAykhjK1RVP&V9E()H$gNHGR`7Aa%V-n3{DELNXprC68{K@8Q`+>52FcXF8@ z#r+Fnw7erkzSW%ByD9rpAa;3+0)y0Ev|(W6Ix5V)MfKr})BbQ@BtyVk{Q=5L~36JYMh zujLzaD_BAQNuVVZ{dD9<8Oofzo3=Y zhg_#MqJ!&Dv6rknf_Cx5wLf<9g--a3I zme{YjelNdkGQ(MA(!x~2N^5^|_OF&dP#0tH9a30K0F5^^jw`c*`~tkr5zo))(K2{5=SQiBvbw@QgmQ(x{zX69k$CVxrpS3DEATVlb?vyK`Y(E)&IE% zl#xtA_=}E(r(KIqs{Q>X>~?_v|nY*eAeRwA5S%f-E1iXatlxKgbDp* z%sv3Iza~`K6U@7vM-S{Y478D|BTQ9CjPEdVfb%h%Gzr>sM>9g0oRtIT%nrvfROs^f z2(tjqwY77ms+%J$U#9NA`)SU_dZ@{J1bD|fN;{=cgEfoVW>`nq>eDhWt+-_-_A|yu 
zt4iKRA*WgBP8shP${F&!p`C@NJl!w8LW6Bx|57R+v~!L=%qQ?Qj*sW1DRuWgApBsV zXD%6TH*_j_VJLwxBW-;N4teI3p86wImU z8cH$XvvSKTV&`a3&`Ee5>~@_G5^_DZ5~NfrA!4z2-Nwwn{VKSM9=h*iY=vJsho=&t zav_E)_wg#RP=$$?#`gG5DH}czUe<_{Tuy5yOaB|x#~J(&I(m?{a^pMH_KzaL3FiBx z2{~XiiPwnm>F*hzL4`fLWYuBQgA%#Tv^S}8s9Cb1+eT%#fc2^wL2@-91^MMtWcBzR zGVD>Cbq#F5a!r*EeF3|SP=2FqVAs&!3c6K>NP&Vd#M-obQRiNXuB?J=eeJZ_b{~e7 zR?$HP+O>~mb4b|e7Y9rK@VF>!R+!}7V!iF^n^tK8G^DxOQ}!_x+VuiZ=od_f01v+) z7k6Ruhic;Lb(R@R!p>jCwK{!WDcd>G#W(VM(g%fnevnJ8Pn1XqjYxc?C~@CR?f0Z} zQ=`6@l`Xp1bkN+RcaJJ-AhIAy-KUNnilW&mbKdCs*mcVvVv1|`U=%ePY==ZOhzpVL zJ+nFVD4n=a5)}9;c_TfizfSM0XbRTKHu>=jHI2cxT1!P7k$s3`{F-#xA35#ZZDeIw z*$h46DY@D{w}aAxG3$tfT>vOi6!81=WD=C<>n$3c&=s~z2aqCvjS_?^mzCAkBA`Np zZs^C9M=o=1-p6JX+r?qf- za_crLXTu}?MJXt89{7~he>91a>#x6C--tST3B=}1#i18ONCPq7gv9owA2ft$!G}U} zD5uFqLzS0{3=K}U^$f}`YJ6H7mRC-=E&?W9T78{&mrT=w$|4cBZl_l$g-R5iskDa7 z{bz#h+vDWEH6cGo4?o1t0~=&-B!WPSfpH`m*FN*ZC9*f>W;y99&@-ukhc5!EAU6{` zl1Wf^LsX^7Eih)hPdLi5vu}l^!oS_lM@(YqZy&|| z13cO~$_|zY_M@pc6y>g9`RpgUNBiLB?+nmf$0c$P@DHc}lN#=!St5fdxB`G8SxQ1+) z@IV<)U|zI@Ati;0!a?x=qjo zYkouvH>~8{K3m47+|{n#HyW~})E)a33{wyRMN;H>O*J$ZkDDjy5B$|OUKq-)5O&)D zN5VwpblRO}rX!SIjWkea%FKfG92uC!Y~?P2Lr@+TV=H8bhI{$Nj8d2R7?hWP_I@!Y zv@Ln28!G*Tj;kb7wY|pqre(*JC}B3sQh9=M*Yw4h`S;7j4@*I!lHTuN<1Y<*#wEkY zCaV2pgaFnA+7iX8sLeS|WDr(1IZUSA)%Kvo?j`;;k^nuuUFF5sV3cNt2VICJ9<}m0 z(yxc>TCS%=BS*nqt4)m%LYkUBd|B?jLPj3!bB~M4mWos|1{kcN<1OBw+}OtXQz=w* zF<{!c!Ulir;0p^-qoM05d%$B1XZe#!-o(g_)X218Fq9~_}5CRfm-sP6!^NPg!@?cF?>^#d%aWL5X`06F!C zV;4rotJyT!fEsP6`#2OlM`?N0(bPl+7-iMBA%7=9Q^jixEL8+Nb#gIdQJQM){Ze<9 zNwJhlm!|}ouJW@lTBIw7rG+YeQb%%UH@1FlBa%z?ZQU<~?bz#?T_NpgcwOE>(2(tF zC5;9owp!%=+c^pwHhiL9ya!;az__Mmi`&q&f^v74Hs*od zTZ(39=m}QBs5Xhtiy$^bj+fSQ=dIp-QK$Zo)*ElA(zbuy7qcVr4alo5 zG2?08o%-`U%Qlya!I(HD5?JNDbrT2Vctw^u8qOj0S50GQuds~i|--J-YX8gEHEYx&r$qnTT8oM z4Bd@R9Qt!EL8-G!x!hXMIYp7g!I(A8w^}p3VX-f1&#_^8ryL+Vw+^ul;o8j8<&2#? 
z2H0q{5kt+da(5>f&?Ei-Ivnf%^v<_v)dN#Y_5QIwxP)A>(u4#|{}q}rw;J9{UWJ+E zl)HoZj#av4_@$xaeJ6hp3lhr*cJYCYJS#O?X&R-&hKFH#DejqOrJiX(6jj1F!$zaB z!$tv~n1Roi31i`f)yW^btZP(AePeOXe3uz+FxUM95$k2(Qaq;%(orWO(MXJjzs9Hoz^+wAbl;!K**56v?)9sl3Wqz7 zqj71^Uz<3>E~wbUKR}Z~8Z$OZ>v*xMBJ0TTSYE&nwAd=Jd4r9fkuoh$;lBNo=fE-tgvajhdh$O#Wkc3gSvQlaPUkU&RZ&uBJ1cc|rgjP| zEEdeS!ubz-LJn6*Bf-FkTAU`oUuIX?A~*BDPO9VB@ECMrTPiu}5t9dzl|inwuD%lr zRb!GP*FW?|LdZ9@4{ka;dkf@<^hl;RL0jcNu62%M4~qtO$SGY$3^~+xq2tDGM5fnW zShKt17*1^GvVOVc509{)f|j4!{mBM~R=HOMFH3h5s@%+qRwZ@f&~%~59U}q%*l2J zX^0E8LeWWH@8Nu^Z!@ODt^2L9Te@Q@yLqgk7t1!P$vtvQvq&}w;+z+z5rO6!G9`74 z)>PoIbp6utahUS`UL8Dv`)r6}ST;VWkT+f!rf%b+M)cPsYdPFsmXr}b82-d)>a{62 zH4RL~qb-8!q}vt95p?*|ohFUy8KU{uKM(WsBM=KwH9H@1xnIYk$|BJ~yA%ZGe5}%?g`fB(xOBXs7#Ax4MvY)^jq&O zjCw`n^DaFpin7C;hJz#23a$Rx1)b3}1?uMl6~A^4yxV^t6~30mn5L8K1M7`;Mee2_ zZ4;K1Qm!nuN{Q9QL+JcUo`UuZI_GGV>yGR-dq~S*qC2IBQ*)A4-UQm}ZJ!2rc1Dz! zkuoZno?>K}GEDAj{%&0}g{VZwY%Is;TBO+>b75=sSuj89yH6HmXSQl!<7sfYu?-eR zA-WIAdv#mEyPm1b<1qlD1#&Emc_zOOrM!;xpq3mqNj%1~CA(c5+2#^uMJ+bC`imBR z`UaX7IH#;rx6fxiBBEY$7R+5_emslQ@DT;0SoyrK!UtHTRaadU89I^t17Y4e?ERJ~ z!OcHmK&>wWq&CLJrTo(Z#cG0LAqs@XfRyt5#Vn39h7ZPH5HNsHz+&R=bsGNa;c!}^ zg~&2jH-tLXtqb3I4>c%CS~EzShhk(q2yyc*?$~t8L{F_vZ0T;b;!>TyH|hc2+CN^* zQ34(AOrv;X;_X%JA78%|3%Gjd&_66?S_9hpS zT%x$wPKglLWawzUKx?^3@F2#oNc?ACD?#&TxK7dOqkfeI^Uipx@w?x>Uyp>Xpy^bX z3bC4Eq@RG$YiGu2puYODW+xUQ8V>B2sZ75t+fX>Rq2JMU-anO_=ZdEaxIph6woJ`* z;OW!O01N*Bs>-D;%czxqxV~b&qy{$8CU7`9x<{AK9NiWpFBPGEhinD$$bCRxw^!D) zfBGRwGccc(<)IowqSpL*s~YOg3X#YFU|;Sp(>Z6>i$>uc_fx8>b1{>ZQWjIp=g;p( zGcfSN_{j*^K6Z&6_dog7YEPGS=kPnISetC$K&t897Qk69TU_})Mn}t@@&>d~@sjq> z*=I(Wr)ahic?T>ojxp*^0Ii^Vnl-_#p|ZvG_oBiNrK7VdZ7X#% zZvfoXMn^J+W!kbV*cc-3n1n4Wdw`i)YtCNCQU|izY-twLEhBhQ3Oe^up|4*)20!C3 z23V00&R}zqtIElPGN5Erg_;(7w-K${1clzpbYE@D&E^*b0dk9n9_=5;wGe$F zQ4a(2vXZ3dBO!OLm(EJ%knUBdK8fx#B9g=_KQ;bnoIh%uY!Q++Lvvm01~PHXUhdIg zX0vkM6v3m~jRd>i!}-gVL^*^Gpji+#F}^{p#!xxUn?MlZFX8_U0WQ7%VhaP6_jBZ7P=-C5TnfszX_Iv35#!s}$5k21XZV8jR|Nh`mXLB4A!Z 
zx1k|cp&TT=ctu$F6~1zMd~?RftidC{6?g&6M=pESym#$ka(4WUh{(d>K`RO9@U(Bn zcA6it04PfJI0%cCoc@vU*%0$UpZg-fHY9Ao)aFOq7X=xMrqE74Y0F7QI8kQ`Ag%`ZuY?t{#pn)p~&;5wYx)EQ$Pc5HPLjKWkV{jLKVr@);A|T`XHD zw3M;0x>Fy9VgE7uH;(2`u(*Jm18uPqs%44A zZ_*0eVNa?CPAop`=n|F50S{!`u5!c@Fi-5qC3V?o#PXTDe-Rx05INd6>p!O(be(8S zQTc8}$V-{z?+}&cq1R`6Rv>&f3B~8$)Ax65RjkK09cHCL=+|?)kDz3Q0l1MK(~FIk zxj8|RdB#Q~$&MaFvWvv^?>GtI9*$3?5ZHt{hp@3oT;rLwt+LIUSLn7X2YjE}JeO2a zPj9m`(uA_^6!{cl#$K{c=y-9pJ>`(q(>JDMLi#3+aciixh_d} zQmBR|b@1S+G{TbMyIFJ?4Cog`{F?%$C0(%V6pJ)(b|FC^yc9en;ArPh3|FR<@!%iE z#G#TVi_(Yp( zZ%c|rqU=p9qRIF##D@qko&$@Q0Vo87V*g&Xl4gzkU6gOB@G%CoXK!UeU@U$;@7yb; z{}Y>n$C`=r)bRCdCeOPok;)W)^q|f1j1y9nW?CqJ*1G5Vfk<4UhFQ;372s(xGm7E{ zB$ssr4SCOcv!a=TGtSg7kDhS0N6uRBjP3waC!E5;1+54Vgc z@<3VZHDWi_!gt+J_c7Buq8YgbY?l-HN8bA^x8Zxl-WYafHvGLg&gfMKunpVmyKa0H z29e}|X2M-HbIXl#&?q-3NVb^NKLk-{xI9 z^r1;rzh`lzBRn@_y7u!*{TK?g8Nam`i`CcFGv}kx&pue(R zW-!uHWSHbhCONVQWr>0Cj{!_eK$+o2_8`kdLn9O(p;ghPlJ!??P|DKF;9Nd4-h&kUKcTUZ89>d4iI>&q!ZJg`QL)Xn8_Qf}{r?I&% zN_V3ggcMVw1vH4SJ0`}(it)1E6DOA3h(t$%vp?K_I8HlQ;CRWJK2-*>KMwOJj-TKw z;K)&6?7Wjq+;xI_+WMN|mr>zbo`3VkGN*vy5_~$bYvq@0-@e_r||lHz!$B;HI2a_Xii%?h@H*H}m03xjrjX-?W%SGz1A0+iypCNEFa1+ z&swoHrI|;4L+Ol6&!h0ZKV{h3-1y??Q7AelQjRlsjZct{#Zv1l!Kf#U1Rcyh%Zu7x z2A>LVmjEdD9}CBc$x-@nM(->HmdT3Yo`LIGvVgXHMfS&}0P3cmsk?==?eC8dRE<3* z(NZ?~A=%fF@bNP|*|!^S0t_srqksw;YLy!7KUQ}#slfQ0@G&m^Lcw+xdzOb}vnS`s z3yZ=#h)S3I#Lre&yn*++ukO?#s}HQu$x%e~Ndo25f_tTpRmxq2C=gg=R?Ohfa&VsS zST8(;0~Kcj+|oVwn`#_m9C$@zvNzsfsBM(-rMiyIej(f8gfrF4PicReD#R7+5OURZ zR9_fV{-QKkc)og3zvejUr+AjgY-J@OXA!J9E>H=Q;*8qN&2;dy?;v}zJK?L zPx&y-^7)X=*FuA0hha2C+C69a@!PEgI$A@*^e$p(y)LL_O+;WXG8B`hMbEsyo3Zlu zYH7QNv$n=FdZJ@xC|$>%3Ws;~dPIktE!LCz?RJ7*6Z6RhO|-@pSG)1^UOt?GzdGgf z4OCj~rHH;C)OI>DumMPJBME>`V2g50hJf~gWocSjfRgVQHmKvAt7V%SWR?8I@Ik$} zR3wZM#gDTOl1`k%i`MOEaGN0iMT);(16Ya>P(@eF692b^-MMb2Hn_81ATh6*4>v^p zKJ48NzF1{P>oK2uUL6YZ5n%;;TR`{}RQ9t?#%d*Y+|U1E0Y2+MXMXPY$xuTN7~q;4 z9P33iv)L5q0iO5i+fCn&yS6)?I2ed+Yv(2#(>PK-{{1_^?^s9nt3$VIJYW z0=xMMx5C@T6Ot9>XS)!JW?~3c2;VQxm#w#{EkEkuuP2N 
zUF*Q{10B(D_>1tK8Qq4{%31HbLixEik*8!RS>!Fg?3_mF!;1g|jm9{g(k#2VxOMI?+0v4jZ-L9W9a@KK8&Z=*a!=$6`ofo*C zz}YFRo2?ZH*0XXPufvqB{fzyWH!11ES;WXuLw0|^+c`2}$6Cu~J43XXfjlM*@OaP* zCmi&>Y=s2l_y3hPC}3Qowu5{zh%E3+pvpm;BD|g@z?yO}%~+usy)1=R8f>p*^>&F! zomY;V=0u;+Rm6T%r@f|tWzE>VnS7WYpEp7Ks#@5$l%)eOcwB;|Ls&Q%k%&Z!8nr2XMvA!sK17#7|fT1_Dm8lz=VR2 zr)?NGBjhXN=p+X#T6?&miN(6RX5hhr9RE7*8|UG_H4`91;Vwo&8QgBndOHlyS&Mmc z+v2(8WP9L_2KxFhhR*3a7m!QFlDSndnT5_r_6)g|B)=ymcx3w~zr*r6ikhY(?7a)G zHxjMUW>)H=+gYHIOSbVPXBDx1S>chHXu>&YFq(&xicj>&BLc>h$%KXdWy|?wTPe;% z;(v!N{yW34n}6UBP3UI^|6g{ zV+xtSt>gbcUa=YMB>kT}L~9q8UeRF~wIq@VN8VL(|0AjqyUwl4&%u7B(yUVa z*ap$+3}Ud~ApGw*2#c&1rJ&m4*_k?#`-1yxK7z7X$*XDLX7wE(|Bvbzmsn zCziN?LpIcfS7&HGPV(u!5YxfmqBvY>yPg6%aw2h>HVF@acQ2IFkl!DHsto zcx1khx1zL0jOi3VL2^2wBF_zp@xX<4Q8g8 zb;8r&6|9}PTgd;5`@=Ht6*ep6AKIejT+?az=Ow@Wd6>N+iKQ}tQwKj+`|D{!4V%s)VZfv-<_$ zyKSiiSfrOuTsr>IjV3F{!146@UKY|l6^qM8qcHqAx$*Ab>a-b=8!(9ogKY?bO8zDJ zKN|r2`p*Ucve7McIxV(pYI=)qXGNTLEv>7`6+OMi<+T{C0)4!mAPIqkZp;h)VkQ;Q$*gG-W^=XsP-4rk7~pyDb@+C`ve>#p`z7_a2MhqZ6q$ z*$(f!gnr~y6-N3?1XyED6&yD)Pv{r4%)8W#$=8w1?$zCoxLmkxoOSKh;VoNq(1BU2 zyuaHNLZ#k>Ug@V+;Kw7sKVV^{rxYal)xAoHu|!wRSifA-A|BK-dA{^&EF{;o<>H)ERnjXnMO&}>+cwpQr$ z3$uLAjC3zx=dGB>nL=$SlE#p?2(vEQ3(l}#n7-blmn;M8<=$2?9C9v4`lnAc$lZV$ z$`!zN{F>Qw-L(N?e!iel`FKS|oO=J`z{__Q2E}@)fz#PEUCjL9nJr1=+a=g>8Nn#bozrb<}VC*l;Ho2lQY8kz;fitiAE+JvrcDwpQeg*D|-@ zo0(D%_c5kozuLa$Rp)l)dG;{f7osNvcPzXm)B7#{x;9xI)yR6U?98^>UidJ}oK_oOsU<8YfLSSn4y6CR&x4eO6HX?WeX$fhY8e1u3$a^f8$n6`T=nOu1c4C+O@ z1SFax#ZHtK4Edm|Oz9%~wD+x9hS%Rd5I9s@`m;>@LrDV$)G^7cXP>TC#yroAElfs9 zdI_lRLgVB$<01-R+zL2 z>4pP35(8|t+~+%M`xYi6*D?0@Twr2j^X@qvAM(a_I>95TNj#HwFN5pw`sx@?bi}!M zcUfES*{r!?bnf``oSL5>GA1yB9bf_G0y4;A5t~%K)S)1~bK_?M%9;R8pp7kUL}`o%}p?*NlL0RFdXXuSmTw@%KpJW3|5gh7y z>fJp;izO_1J#3m9d?doRJxM~(HHuSs?Aj8>O!2 z$PjjcaCSB0u`b*TslPJ-8(8~MwGrPa=_&Q)lSj)33E^fxEx%A1({|n7cw>=37b=kn zm?lN?$OZhdFxvGlWtQK%Ine3BX!DBzR^2m7Ql65Zb zTWb*6Bs1zDnrXku=#7pO84-%q1j{$4ZU-VYW1~s1u)+A=r2lqapN)miOz8h$F}WFh 
z=1pvI;vO+0tUVErQ%NH?GfQ128^QFJ8-51Ru1PsKz0X5@VH`e+=UwCK9bF2BEs{ zM<2U|oSKpT2od&RfrmpFI26yC4WQXNfm|VEKK5#XWbYgt0^Vc6(3r~=-J3QL~kPU z{1ocJAd$ET4>OM}5=C0yx!b zk17i`j8A9~aG)D!x2_5)<0bev-m=cl+es@@_1rFHNu0d1EcURtYxGocJbA>fT%w~C z5USQ!F8)$$V$I6o+Z$iiWAi8fEWvD9>e+Z}XsA+V_s(@N-O3x;8L5ofaQwD`RdD$X zlN}shu8!2JXeoeG$Zf^^F^cO&CUl90(lIHf)tO;T+FoyVWEm4=n_IrvXsT$1$8qf6 zfWqGC1c(X3rUD@6uQ#MWNoT66_n&=Zy^@Ma{sr&wS?3C{-Ya|&irKXY;Y!vTp1`>6 zmWmMGtfw9!V{rIrli@tbfjhbDofF#BfR}r-bbBSnareJm=xr%%QWPf^8WH|ph0mJS zBIOkZ)rdvQ`nqzQx6x2@!6PPJ#yFZ_;<7ug+(GG~BUM)LiZmLyI9c7buR?TR(tkaTbxjRYoO(QLa`U5oCFVtuhD>txz^H$;6} zxl&velrHJmy{MVoU%2@@k$Nw~n|9!DrpTKO%8NWZ>TH_V;XCfVx0?%&q6cmjRc@Kv z=3tFfp05Tvnm?WHR`;cysWKQOi4u&Rnkg=fJ&@8LoqdoB>WLo^z=6;&J8l$LyPK!qm=gWwQ+ezWA^!NI2~5F^6oxvXFy?yhv#u(DpELnuCR{ z+$xcAx|>zMaxNIZqrkjpol`J>69@pFl@VF=zj=QZ6gAZ0m@`aGW9+5Ml96VK$|Us-!i<2zkC@9C}GAB1c$ ziW;`&BP_aamfhHsjo5IG4{`|)`eAs5`sd)8(?O4V80d%wkiFUF-k1!%ApaF;GxS?) z{m}~#moS>BC_@0h^icKB!zpbmU{dCVB)Ny-_vGm+-sI^TJ@PHD(u@4p_J>`3cjCZu z9yqU* zmn#9K3uwa``v_8mJ-76{44zKM_ok!HykQk0Ul%=bqH+^V#AR1BZ{@Z7v}bi1Txb zjSt7}QrW-@`R_?KG2QjdBx`44xpyMzOWO4f^;wF`d_M|U6${0gRSX|G0!VQ&Ov+!{ z-I0!Uy~e}s0BB48rm-TcWGJZH#vORK#hpdPH@uS(N3Q@VP?rw1`i}0DHoAH5Awoh4 zpD`?6AJYyJS0sO#YwTbSo#EYawf+oS(Sr0zB9p(Cz zK})52fz}M(H1h&iLg1-ab}FU>QrM9th^1a^ydhe%t;eq&*KU=c%qlB`mZw`GYzp&M z&Y;Srs^FCAewHpL0Drr`wDkm~@JyiBmi_2l8T{}6ymdb_Xqe2M)Xp`T4au=jK{G6J ztwne6!HpThW1#>mr2TzdDEIMWLgO+WW_PlniGV#a)?5lL?@|noEN%C%;L}Zt z72cTT&Y)uY21E4pseY`lYX12*f&qG_7whoy`)C*T$(U&4u)F- z5x{_zwOJckx6Mv)L@Eq-1TP9wIwCAb9$$@?;OpJX7*AO(l9DmZ`LFvKbA#?CYg}+u z!2Rj&FgYLK?Y(Z+|I%-@w`s8RHWYj7xDp)0m*-+YM#peYvJPuK7dv;p;G@~PQE|ca z9D*?{v%6(ci+Nj;OGbivsGd=&RLfX+LHvCIN8um1TZJSgWp6agq6^&%>jXta41%>~ zG|(GFY*%eKxtC5qw#}cE%#n(_)l+k`YWeFhy_X=Ay2rT?HBhAq%EE$d$vjSie4sql*$19m%b{YX%S z)7tBCO`@!|>bO2NelqwP*0&W*`BKY$9>d7{mxgV=-g1$}M?|Xi)T~^*d?9bteOpC8 z6r>Zx^%)wHAi;j%N1Lw+SQr|`=)8WwpX0btqnsQ(EoKqQl|8m@&cfpzc1&iebS}lJSI?883NkQ(iyKa1(rgYs>dT$b~?l_d-xM zZF!8HQcL=v{n)|JNpC?tC{NpI6TFb=1O*KH>2xq;s3B{M+GpC)&c^~6LfR0?RO*`}g 
z>JGg-YwF~WzOVcq17+cu+r{S2qGZ~1Ni24_W`+<(YtVK)Vc6R1dk|xiCsKn4bFK45 ztxAjr2tQZLNmy-q{Dw_i-~HUsz~c<(r?Y)E^GkimiUo^jkwmi<@^i7j5LJP9QB4|n zuavQ6Tr_?RfcR&N`{EIK5V$ z;-{Y*@2zRo`Xq7+4s3bu-p(D-m=K*AsT(wNrFvZDjl`yrl3J za@9Q>y|w>7{izxHG2-`0$SM^&##t!`%gba93(5qrc zMk>QLGVDf8a{;P0J#ZfzhnGpBv#AZmKy~#013=SGNcY19V$MdE;RymO2DHt>t-l`m z(kbIt&Xg6v{)yG#%+khNA0IKGhVcoFUM$Py9J>GKn`!nD1)~isl3qA;VSWD2v(hjW zTdrqXrXG~xF$?NKz{VY47rr>1vl7HFx&^|l@MuJ@TQ9V3Uc9K-^0Oj^<6tvOHCbVk zK>siH-ZCn#pxYKr2oAwrgNESl?hYZiySux)ySr;}hsNFA=>%vTI=I8@@0@$?c)#Df zKi-d5V~^h5V|4GXy=qnMs#SB%Ig^pSMw?J{{e0vS`?gH(zH0anH3xs3mws;D*MG+N zxk-l&WCkqT|9ALs)U~#Fl?DN!4aFOVk___$(Dy5fE4~i@ZoczG7;*! z2f4hDg;Q=T$#!+3qgD;P>gi5~i(j0QAR@Cf{os{Qj3E=YW@ckncea_gGiiFCBXRk= zUEHh?=ZO%p`8~KfC;#aACU~Z-8C^Ykz+Pf1^~NiR1DAlV&(I6RAZ*{rVs%rqa;-* zi#)lxi>$Jh*^=?sx>74yhFQ|CdlcIxOM&H_X*xI7xCGV+^2}>jH|_qE^$0R z(6d?DVJA&Yacb0MHm(Ni+s#V-BCTWH-RS;@PdJwV=2u!m4O7X1 z+QdV(FOQDlVG035Eq-^VSFJ|$cC=p=fq+2*+cK!p(<*Mk1JQJ~-wm60j&l*Wo}aqE z#u?eem`0LwE>0%nJ_s$3L5upDjF%T{5%=tDu!CyK$ccPvPdq)!1M2p}C^smi%%0;6 z`=0lA-!37jDq|4ugeV1E-`MdddCtf2+u**1Sk8lv*IUpuh`rumYVp1m?E=nCn^)|X zl=avLf(-`Rxqeig39ikU-Os3*cM}}*nq1d_S7;`5GY-sRR9oSe-J;XT*oOa+a?DrZ zWV}_Z}5pA!d>qS;r2> z|2$IxH9zcC?F(_fW5^n`XX@3u`;yQhm3@1f;QmgEYN$8Idgb7?NH=nMcjy?KRH6pZ zfrOcQ>-ipHx7AP4m!1_u27ZYo*RMSG>K$Z^G<83NFP^A$TU|BmG}LZJ3Hl%n3=sx^ zP_ethSV3x9%qINONRu~4P6^ceyB*P;wO~N8u%9gJJ^Z2o8(`!D*)FN+3cfVpol{e` zfw8;W?MB=x53@#8ypj-=3rzo3>gTYPJ4N?8H9Y!}qr~ZSt(gX_>=EH(I6JKyx=24u z)DLeP{d9#^>dyOj17$M!;+$tkql)lt^PKW)bhjiF$R{_&eY_J*{wV9!&|@E^*0yoc zm|fHhGb$SWXLKFbdn{wif^oZI3>J`R(}pDe%&6{j5R{9M&X(T4b>sP+GQ$tn8MyjG zPFig1z*%dhvTCe^l>(P)!~wJhl^)5LF^VUd_~|6(i#GzcPP7^8Ai4~X<&&;;@7I-2 z1EA>NJZIN1_KG-o$M9HV>~&k0&n5HT%l)dv)c#E>xvn+h4jV==ygG)L#zCDifz|V` z(qG03IPpA(jqksDfFC-uUE>=cZYXC7kn_e_e)0@xm2;z5fN*WhQWjNN<3p_yu*Y`v z|0x|}E&0`{>(qD{oSnnu5R$HMKL+ku-nG2_4;H|7V~&$~+hH+ad~a~L%>Woawg*Xs zLLE8rVwHVKKzamntQ6qd+FLca=(BvyvhAsZYcaQTm?~LGV)V%USXObAQ80xZ_)-JM zagMJK6t{V?j{Dx`b4;{tts(rDQ%47!i;PR&y 
zM~7hM_%K2yhIYjH`>%Z%UHk+HO#oz+Pr=d=CiP~Kd_YL~J@up9-S-LtJk#5Ib<0co zC=0p-Mm1*Y9*(Lw88p{dWbO>q+j)VlNd{M~imF9}a@L5w!`*ljthfJVDEyAt@6Jeb z5nDDU+BMQ@w<9)AU`Pmqa#=$08{*v=BH$9NbW)o}6Jx^TE~$x4=zh__vE|L@%eh=j zYu&4_cu)c7H~D%;OmSRLewB^6+WBsh3YMq_kCr0Q(+BP17q=P{#kltueq;v7D|3*< zuyMv$I{^oxDb)#pAYc281UGnSY1a&mHWT8{$oz{pFpfx+4qfwjr6ncBD)z^>{5a)x zW@klu7oG*@)HMGvs?;$de7{0W z7BAEfp$p;6BnNh8=goC~6#P$V>Gz+M7pUC3&rnAuo6d}%Ea=oZx)N}@-Kw@qZQq0= z-z$b%o(hQ9OsAgp)9`WQBXQ)LCX%Bb1MM5>dte)sbEpkUFS z?=4pLk(vRS0E)By3bosB4Tq^rcWZdX-R-VHZB=)WL9qKi)^i%rJ87$4a^{Nn@iNnU zkm24W3>)Sdq3yOo(u>rnn3wsn9k=Hxje}Tv-QP5`=jgkdC8z{sHKaApawmriCF?{ppx6l+i5X3uV*%Mz5C-8#AEUi)``wMbT|2Xw z7G)cuiG*fKhOpuN#yyW}7S~HYWuRE+e$(%G-r?QUS(f^F-=3g>UD_k`Wr#$StWt*J z^d}3U$zWLz@||vywy&Xd0K#P%qJMCcFgW&|XiHW$cVL4OaKJkv(;N7!>oXgJTTGx$ zy_NFzPsldgFWqx~G^F{fKt-2`@WV}cztk~KG2~vuMStNT2*E}KIsdb9YS2De8(m24 z6-RkSH0ByI3N-T&;$aZlmpLg*B<|_gT4U<*9?`uCJilH7NlhyEaZ^j~PY_~ZJiUa@ zgl76fj;{Q;g*DuMX0_=TF=W7U|9pzkCPAAZ_0bKK71A_-S&x`N(j>6Wghf%XuE}>Q zBvF$8FR8~_(Vc{srREqi9Uh!0vY?$|7fN%6yU@=VJ8X+9d1PoSfKcE`4zjWL_NW!d z@d<=Z%HbM=VtV2u_fMHnRH{ZK-JBEeR2l+qFWqF7kiDTHx42G*;>{cHh13x8TLd>M zF=VB0*2%e#gmc#MUgg!kZhWJes3}k79uOG>*#ey2Fx6$gL^8Bz37=ST< zVng@7)Nl-Jor@We*;39FPl-;NKM~v+AIej7vQ^<1j;XTU=+XrO_$?6@Z%kv8mSCgQ zzYeKJxrK7STB4OG8yb~s230-rUkEpXhDP|Y$S;NN1rA_Nlo)Vs5bcyWnVAchw$V#p zTq#hA|J`>S?%kg_k3A_-Nl`hmPyYdr9D~=QMpR?VKiLia?Aq$@(#nNMnwEShVfNe> z5T5L^mp6B58g%#?=S2tjy@68jU{D?0e%}dw-_l!pN}yNyEjicB(gj|dha|@V;3?)p znVY<*xftl0fziE$<;jS4Z`gI}ayS}e#JBn>`xeOAIjXUmfo)O*-0!-n??J*=T4|zA zZ;zoOK;cgm?zPn%2Vv(mInxjnB|yS74y{f1s#3NL0c%E@mb?H*IRTr&Pw(IO&My^tk%1(lxwqK-F${c9KUjjwuQdkuI+mi8aL0l?C-Kk0T{AB_^#g$p;ju>irm$Ip3}ODxY|jd9%>77LLTZ$A*@ zLpJOPF_m6DnCkb?jGUMEEcD5XWG+C*rswGmxoQDh5Ples7yab!TA&T5Fj#3$9v8c=db!w{4K_x7A~>9*p%+P!}A59cu=`z{2Kate4dP0eezd#DDoMWz9Yl)=tC=D1!jY`!~L5?Y`8U?u_KkvFduN868hBQvmP7G}4E* zBj|ja07OIj;2ZVBX5B8dy@P#hwt+*tQUBENW@>?oeb<62?zY8inb`vZlakNj@!7er zO4NUh6m)c7E0AX?K4+7%a9{PoWIpbyTrhgOUDCZZv*VJpz(yFGnM|(M52bmnUwTy+ 
zXb_G0H3Z#9AMQ@u-vs~1jTZ3yM+-l~xZi$DJ!_{oX6SkMVeas2C=<7MFM$+q3S`ye z7Xq}yfSf$9#*smHb9xEOD{7J-3>tzYu5ir_qY0U+iV=T~blozuX*irZ7`b*^nU$6W z*Shjkcxg9h>yVG$5yF)s@ZS?)hx#vGUQ=BC}fgH6+4(+Mmn@%wIw5&&||% zN?h~t|Ftu{qUK4f$lsKs<@K2#XSTj+ofvdbz=d_Qao04s7RL2<#d`FGB|Kh==gDGW zF9Kqq4E))*Gwn2lrdM72o-WaY4VvBx!V}#&v8x7q?N~s5)#g{fiYcCQL3y^-h2eP$ z7Lu+kTEhAsOb+mlx^;QjW^EL^0~6>(2*2n?IdkB%!P=)OGL96p+anu!10X=hXUA;k zK_hy@K)|iTFR`;{Fh>p&eb)DiCvjgiKTSg}(5JOMnDb#kqP?Kn>HB#m6~iP+jmkdn zQ5$3t|M~aSo>Wb&Kgr(%lgW#52)_C6w$RXc7Bi%i*qy&z)8ejWc-5%ae+(SvG_#e5KjSbP@F%Y1CIZqYiZNAZ| z$RcQjg}FvK35y15a6Er>WCg*-xO^yO^I#cAebOEv%d5@HI*ZeGvJF#Pf6duKs2;{3 z$Oa|q|GS{5#IkT}rJZYZ{TfVSBLin6M4%o_TaNGqiY`MT7|MOMPaOx%r2Vif3~H9L z)&DSRy95#&Q8IIa;b z%BlSpt2?qXxDaHTM~1mBvncs7SS7?3rK^9~s&T5T;a;=9L9J(>^|9=7UglGWqq~|; zR6FTo6VCJ0cdGjmgAGv+T~I4}a>cBO9~geX#qU!a>s-CbUG8+~l^Kc+Y@Xu(h)1j( z)@FBYbrq_{23H4X!K10@Ey@^lsbI%4uV?YrD+s(4$6JIN>*Gv!n>Kk|dSCMPA=4gv zj`+&0;J@1E3UW>51hjTlOg63J6|x|9N>to?)eE(MPS{S~r~7p(aDV4>u(F;Yx@U`t z`kV2_z0Rr5KoaoUukWTM*X@>|bidVBOLzVorJV`t^unSdpVg|~Ok@SPb>v#l1rvQ& zGcs7NH9Ym>n1O($q>1s650HKLjOe2f()W2gRq-;VPTwixjORJ$-o1c#@+g8{4C?&;Q{6wVGJW{8xGP z;=5^Ps5P5^W7CL8Q7cypn0g zR@ta~&^qhl-N;izKHb2zcRER7U~*VFvg*5FCT<}>7BWM7U7^LDbwE~&Bt}`R^rqk4 z+~RG%Bbkf8F31Mf`;|%>B(5KMrGqW!4tQr<^N&hr{Wj!aRNPAE3}S-iy!~?s`p%NF z+PMsn)@n3iZAtGX{GMui&(CSVyK%&zjL&(1Bl%5499)BqWB+Pm_=Q#I`_SG8t-A~) zdiym@qxw0!T`E7DGlbpEu5WkG4w^UXf*`o9bhos+9kcRbSuWU>32?gh7M!*W%O_6(-6f^!QXw#xK*n$@w+0dT)1{ z*&C#ofQe;XK+m(j@M-5eA?~bnse}Ep_L*~W{m@3uh}7G zyLtAl50n(bFNGr0Zkc|3Gn%b;qnS9E^fEj9WugSed-+o|(Q`ih68cayQAl;r_VV4^$7rUmmH+G0e_j%xC;$KV{|D`$ z^vC{kIzL`Hlk4)3G3>vsF6ukrwYp=s-&+&ZWMSo*?#|`$<9{`X=y{}TFV6q8fG?%; z|MT~kr`O6szmc`K0Xf}RkSkvDbSmxfZ3=pF#~E{b99u&EG18~N!s?)Y_;U08(z>vg zfU)|&T7Njc(n$x(4B{Vs3){;(I|cuzs`UTncK*NoTd2OvFKByHliX;OBw8N@5gZmN zldrr9(hxuDSi*{3{e-^r@UyLdFZjWRub2t7vrTkc0Wt&?RS%Cwm=RegqE8^8Vvw##NoDIty%)!}AgcFc1TmxTHa*_nsg>KS+fdy%9NOWu#wlIOx7(!shF}hOD+6uV?gJ_^ zQ!jTQzIR&M8j9x&31UbQFRem$sUim1A))xZ@qafO&GgzH?|5KZwGwbn+->WRKfLnq 
zdeIXH^)H1G|3iV8sZs9vb4ZcUa#UZ_^hng?!zoF#LuqF35*#|x!a~*dNiW3Aj~I(6 zp*#?x0E|P$zACh{1m~uoplzVJw0~AGC><>ijsqFnQSa8rYEx&;=(_KXZ|!yLzB=9aqG7YiW{x`ryn`XO)(+C3wVWP-=)vq2mtXamT=^;xeWgfm7IOGj6siU zqQl^_KOMm-8U31(-4fO7qN;z*^;o9>{T>r!cUt*`M{Mk*4u2kh4eV{12O0(fmW<5( z8zi6a9}U9}i{ObB&gyzePOgx-4!OW`Hp^C|@JW_ymdt9)pE}wi(r6R2XIa>cQYKyi zh1M-X1mz=vngxKLRzKA?6~$Q)12qf}^G^d6BWUKv!PXu?FhmT8{KP^^--C#>tz zIjvGzn3`2<=2zd;S@Bl;OiCX31A85w4I@3wgB+@?sOx!T*5nHs6Tw7qi9WnT7zI`-`ySe%NlHMP6D*d zY4tkGf;Vg&AOAAj=@Cepb)I^g9^OpHn`iZynmkRyy;**-$p3LEhWY~dPi*5}KMi-B%Pte;Y60mQ?>*Fe4;+rwsG;Lz`>FF8u4ao8`BRH}C>5S>@@7HZ;IhWP(BKSOMOM;JIa=BmFF z*7{oIDoD74MG3^2KNcjUa*D!UfK|MHdp$`{^c%H9sKF^J~7f+sjaO9cR>e#Y$e zib~G@LQA2zYGj9fJGw_`uNwB~82eo{fy-b9h{=|ML^j+z%x#-knYqF1;+00{hD_#c zZ}xP1Btr5UJv3N#PHWz=~B=)0o0e^1Nr=`42>*Z1AB_XMSd)^1J@sw$k4}L z(}vc(a^;3>PuHx?WRT?o1z{R-d;jtj{-N_H`zj7~GN)~Nq~tsut=TurG*ejS5M;la zp!`*pfmH~R``=H^;OF9B8BKo-Y+{(v)EfnxGoLBvWJStA(wX`CKtgK@=?BBWA0Oah z!x z>c9^2zA>@0Ix2t0nlAZClB;RlB3r}+d5(2rW%GRjFovAEV4k1=l5cQXtN$U`Q5?jsawuAdAL@K++C zzx`6eEzt~wH4r)8ZsCMTZU;~8jaiLmt9lda)yBrO9^e2wx=A2{LMMTu7Fu z4QW*k0Rw}fnE?CO+i&#xlUO09f%5rEQg&Ul_O5BsJM|{Y|I|omp%i{eyx68ayuV1= zfX~HAUJ`Qfj@La?h-{cX@*anbHs4_E7WW3HHnw1#t;vVN3P-b!R=d6E*ipl+-ZyAt z7>`jyPwqU33+9}i1f*fdH|2Y>$*y)RuRuZ73p_*zOJ_V*YB-{iZq*Pn$>KY9oo`xZ zC~-{CY9%f-$4JQnI6|6-As!b^J;37_=e;wdy|V231=6Ay-S{ZV^^A0{r|95Sd{7py z*tS3E8Dvgk7d;VB!QI`WM~gZ{2Alr_HMwp=ZTF?(ToU(8GxD!c^nxA%bt%w+)scOy z^QXaqCcZch_GVgmDUi8y5B}LF zGk*WWgMLtxhZqCEogPhs7r-8B%Pk!lDX3*lx%~vwJZF0F0GT#b9?vojgUOJ z*xi&sMx0>T3r2%9MC|>*J_7T?zW(9ReY$l4a|7T$#xMJzsE@YsVf7q(9?gF5?cWm7 zQ^uMcKq$VEsKHkzXx#|HeHauE+m>XTpKkjes&V(1*XPuCF}6=SHvQS-tP6jQaH6p> z_%!;}K5G<}X)tvIvQBy&u3M5p5QRyxz4yI_;4m~?7l>y1ng zIC|AM447ZX**aMe?+D^zvz;@)OtKy^fq__B*+u^e z+xaj#v=my*)+hel<9C@9f|f7o)C)U1dr`Ln*AtPe8(yc~eZ%jkVC$TGYG&yYdz+XxHXYSWsTCM7;cXP^w#tMUjQ^@o4xvyyH!I(`zE8eCetwm@YeOti^h%jhSw-p1u7_v@J^7{Pkh_!)@gNA1%t4?>R zV7+teHD^k_y3L1LRC_9E+A4wZpOg7=h8f4JuUwzV;yf{@{eSAnhN2k*t1Q!6`+6eY 
z!s0A@t}lBb@B~bw!*_-$<2K*!&i0I~t($i&j2&#fwG^*I4lB&}|G@&Jb2(T6#~Urf zEcCB2u#O#xtC0~1%$5YtbH5YixrOY?j{uE`ZbUMSdWz62t)I@Ir~y-Dj$ zI;|rQ+|-3@V6^CbGEKY`a3jRs+x;)Qm~yIyrld;&cB8{>*sc`Q+v+2to>Yn;!AjEQ zNK)j5H#|cg`uD}X)6VmIImVUZYD$u*>pzX(Ads5u-A0afZyOi;m{#JMXnbisR3AII zBTVH53J@v2o)NHikE4=SG6bqgHx*5E`Cj6ZMTaKwc{?P85R}7%23jUdkzX{wQV&|5 z;h#8zQQ^~qtx6XTeAJ+wTnv=la2hv+kZtrvux1m~B~T+*f;dv!Hl8m0~vGVI*)omp&n-TP&-{yNCXZa~XTtH#|y z&#~>a73U|{*B{LqHuX8SWqtx(kOj|wZezz1@oGge!01oo>&79=&#lJtX5B}99D$;x zRYLLw^ujU@;nJT!P?19*sbF!}Pt%Cad^Fpw`Y9p-iMo!NesnX(m2qmcM6JJR`tB~6 zJzCmT0mV99hy#U#u3QxH(N}XD<7( zX3SeBHF!&kif&NgzqEYvSUq^Y;*CTeb_nO{D~GZ#Uw?IzKnLah%d7^Q;$C8vDkbD_ z%i*sYxkA4yD4cfYjhEVRi&1fw;?3(b9k`)wFUa9gL7;f=;K7S115MQTq_Z@c(e;u1 z8-=i3#Grid)X9h|FCUqg+kV+uUNLNHmp7IfwT71iuUs*(uvw|1j7_+9baOnD_pJ{Y zc=v|&i_d=Gj&p|gVp>NRpJ};Y!P`AASJQ0XMQXKv7!Ovf0nhF z1D|#*2t6z7U$Jctox@M8_XFCDErQR@vv%lad&sH*EivDH!U5f#-?j`9eA=yFM9p<#t2bBk3orF zKb-yY*M}t_gXpJ^g5er$NF2{e3Bd6(Z;f&fe*t5 zh$wd_yA=74J)%TQ1vJ|Z>&+`O5%TWqVWyC@`?bceC%XB5SVFR+a=2M^Pcbfdw;M7# zxV5xIl$nb`JKc2*=9A(K3K&g2SA1=1g8Ji@D^ZZ01u7OSUsz)6nR%SlZ zAi0$&L;ksn)VA?_q5KPt;(G^rWsP`t&>St-vh;S}R@Nv#1k|+wITq+4(=$JT*+Pe= zA8hEtW)VTo14d>GA)|4?h3qfGP`29F|B#sW8?hdb!~je5Gq3+H`j!2OdR zh5I*Yse=08hEx;tO2;R zc&XGFx`$f|{P7{wbIkN!+tl208Q5ErGg3sUFNl+UadFB&nPPe9L|XVnY=DH`{gw_i zxztq;HiZPqQz1#7NeV|+z-0ew$%mV`f6s^oy44HDp5vwmkYwH|>jXx&qh`MB=|wFD zMk%xH*=uZVlrUySEwkR5TL8R3o=!UrAp~OJW<8t@m8n3xT81q-R>L5qX=puYUdW}n znx+%7$uLkYI$vpf;-1y_v!Ca<+g(8fuvyf=t#>sDA@TUrE{c~P(RyfTri=6ao;hiK zN4-HS_wM-eW+w!KZz!V2rG8!0By)QGumI~7Z1IA9fOFbNNSgiR@QL)4(a2)i?3CF! 
zX5lzg+HJmdr@X&&Iux_px-U$d`0@mNW~aRxKOz+kTzQX1PBxmAJlqv?T4Zud-O`rT zi1qB`nkpTail>#0hvkfZmZB$r3xc6H+?hoRB!>^C?2ob6FVMhvtUdC75tw`M%KsGC z?H@li0@~dpm}F$s8E}`5wMPEn7t^hIcRBkNXQ2D<-+7F)zr+q5I*;b1RTLC{NK%1y zO`PD?!97EarPx9u_+j34{7YLH3w#QN)hOzec+A}W<=tYN#|DI*<{{a)7~0DVd|k+- z9MBQ&%{KPucrPA*avjIydrGAN>LS24)rX2)&ur07dsvZywaI$%FPc#AurjmwT+^){ z7J|n!f=9adALanBq#&D1d~;k}+ze;RdTM;H9(%JZjO(!&{rZY0$bi?5oE*a$&hTwb zmc7MmV~f6yhpARiiACjV2Os7T8AD5GfE<)gR9Eyt6;U*$g%K|8rx7T|L|0iJ0@C}) zZCXA*sxl7g1B7?-$rvM1IbBP)xKGx+F6G+htGyckj_d?@sa!ThP~COHXAqNSTgok zJF&RzwKaFKC=6=L=(G>0bw;}9(~_rIZ}n2E4Bp-2HoMn*tM18sOd76_oTR`G|rOF2+vM%(z| zR>-q>bb_MY-yNWC{CcLsRMB3LQX!x(J>x|~BKbs)M^p-XnY=<}p|ADK$~Z!-Wd++G z{^|IIEo)XQ#dL9?N#s7JGL9<42ZUT6u0Hj<&MT$`!Fu7G+mn0R!6VbMJn*fC5?sL6 zkjy)jfkmL744xb-ixmB6@1eZRJm&daw$SVy+sr^cXN{W2ztvI2!U@vrEiCr5^eIbk z(}*TIlrJ}dQXu|4aDILK=RY2<9scmqM0?>1vmW~)mV5PehuhM`3U+ky4>p1%rK;Mr z(1_vsF+$8prvjtY1U`h`F8?Yq;u(mR|bP9L<&p0i^8gSCvUZiu2Z6Ok(xDr9>iF}g-(S<9q>lig5onv zdYl#qo_k?zQTtQLyvky9&Wd&Di>43EZd$TY%RE3a{m0QK=be!QghZ~>zU{yXnBm$^ z8S_n}Qa_Ds&`&Vn`=hXtmPWR&E@Ef{6`W~C=D(A~6EnQ)3Df8Ip7IC^RuuutFth~a z&d?D!eGr;Zcw#0zQesEt{FM$P6?nvXWpPkW`9nX~9hZe4fT&U(e?Mpo$eaOXgFqScs&?|1pU9l3r@<*#6MJbcf0b@A2sH{9Mydo4Z$e>5 z1cL76SLboD27S@1Bc|@ACe9PD6!DWzAh9T-ccnwBvMDorO6QjP`{R5%TH+byVA$gq zO&&tO0qwfdP3TUM@*)xO;R(V}CmXSml!FX-4)4putVYx1N85b|5(a4PUE9L?X3M0# z_)(t#jS;(JM#db&@1{+p5p?cAf2jN?2OZS8ouBSb|EqMyG+MIud4iXD(`fQ($2QPg zlUOX6e}(N$OlnRm@ejsm6$h$(N3RUK4eh+w?V>@q*X&JkfQZ-XQEY(7MmbXTbpLl* zEhMT1Jx=LEonFuE{_8#WyDw7z-4M|4Kf0xR8}y!su#nnAqxf0Yc5V-IBH z4GmX)o#Kl-W?gwU9y8-e^1qxy3;v|mR}nELR~6$fXq$9bmzC1qfC zo$9^X1pOWRzZnB^UATy+8oAs~M)VDEN#lqZGi5OiM0CE*zGNnAc&vFFV3x*z1lqg) z_g~M$|88H%D`APwaD7>nK{pWH$*Gg8W6|b+?(e_O8RWVUrb6Si&JVS)=f%I2_xfl4Z-hXdK z87~h1ec_CV94TPa)#LHgue9FXWS||hYtF$1eToJ|9&N>kVdd$V`<$XP)7;m(f{6j` zUn@<^gXpDqb{5011D&*iGS54opzny#rlBc)G8-&g-*S|nw)0LO&4TB;(t!png_yaT z!=X%cpd~NfA&U6Bhy+#?|1U9*cDT{xTp_)a?qG5|0|XaC(o(4uU4D&2zMthcsKG6i z9J6`Tf>1}n(56D`_!dL_T{=oG)Li~O!v}lXT+SMaxCI#Kb#ceV)xU?N(J9^ncpuzdvX 
zdUtidwf}eXP7TeLIMBotF3VJ=Puyls?C|8#6fS#N@~VhUr3i)xG>os?x}{K^X~?`^ z)TgW;+RHZ)9nLX1UW+w=LE=l_F)C9M_jv4q6LY&%34yw2WQ+S{AjiO2^td1oHp+B1 zwu+mDk2HxEE>8Q^KUdew1#88*Is1F9QZB``9FNET0@5$~4g65<3T|5w&UK;MaMlK2 z;YKlRz6zqJHi)C4Barxub!ve|gsz#NgfL>$Sp2~%l@*8>dVMD*hhti+?3fbdn&0;r z^eDGdUzYDl$gl4fB^j$`Ra^q!xzxxz=|d_=btyJRKtCmSNWX0&vSomG$X{l^DFpiIMx@(cTT=v(UCjal`?knE27o_)6&+*4TKhpL_MFLuaN5{K~U$EWB7fRGvj zP)i+qhm>P;_8ZzRxWX@C+=A5Blm_fhKzG|$s2NLd-toAPtQ|%AcSyt=)rZ->6puS+ z%kg(}@nLh+n`=^%(%J^UN`ie z77{G%&7uF8@Mug~AI^?_==Mf%Gskq0Bo80+x+a^0f8(qSpeT+puqeMbWF_b?6r-Lg zESIqIZ_q}65f@XJr0FK^L5(SjO(q#G#8rX{2aZ+qrB_n}x6{+C3?a1OtIMq%`H2Dj z)q7bQey2C(uuB9?`K}^_L2G)U!tnJ1+Qg}!V&Pi!>%w=_V3E%g?#ws}=Hcw&OoS47 z4Z?3p1o`K8KMgWGD(3Lz4HobK>Ur$mZsz#0z0L6Nyj>-bd>ae=j+4MKDR9BcJagLK zboWNj2N|gz6P%@0n(`Y|CW~5Q$NUKuQCg zKJxsv1cKTRU%nKxSKH`D(7@1s$-|uLAXhT-_SM2ov1lnHn3E^T=us~&tC&Q6vu6HH zG_YJ9VdTqd@xblJDjq;J)gd~fZQoijU`4c}y*XA-ngy_1*h+>H6(nKrn<|F%43RPg z} zQ`Ml{?86~HonYpsS$H!OGSWHP15#$O(4oqCO)EXP_!B%xkLx$M=9tn?t$RU`g@qS0 z%z76X$;b0g-@mpnoglx4jJ|rq*1(?>Vcp0^-e;&8|B^B1^xJQ2YD4#LE4?5?nz~K40Fv z{n(C`a#hTGlwlR1RaP?8tiomCBCofIM@%=oc~%0}p*@YmD$xA=96vdZDdYc4L2Q$9 z-=S9;Sx{n7PiAa^@5?uFu)TS&A|vo)DTj| zFIKjRpWq~R2U_@PYI7te=5un0esBB{z!5`T=QY3_(LWKoU@vZbTOr?T;7 zKSCTc3ew$~Ce!2r%~(_feEc+m$|M;-nXigPrML&QHdKQ z2>U9VI8F{;ONXL9Mgg!y8*x29qG8~d`|u7Q*)UF$*h9eh$Hn3YRi75thM@V8bcwb{ zhu7QiK4|1x)QYYUm=AHPOJ1N97T zX@@k$0`tW%CJ3#l!RymDfg0s6b=sS1DHEtEzC<2qV*0%K4@++xpZ5b8S+bzT|5alQ zjHy2J676XuZ_5S|e4Tfr5q4NTSe^)8(64a3B@`hykH2araIp&dhxxgfDI9(>Cb!Sf zFbZP^T@*C@&WfnC~F%+1EX8nYQRFA95w9Gk4k@3H5B4yzs@%RS2`x zkAC2B8}?F6@cF$VoqECW^&F%9$o8NYF>kJ#C@xa>wHxb-C{TFGhRnO$@V$BzfiDWuS-Sj*Mp<9wz$PM zxJ47%G5v=UdehD;w!F|E3+oy~3pj8N(u1@~9y&&0fFrxT5E<%>)vAn7Bw61LQe@&S zk^TRC_l`rdfoRb(^<{TlzZ%2^C z&Q2w2R5{zESX%v2G^Zu~mCqIFfI{V{ZW~864zynPtQ|*s*0IGN_s}>iH0gW!)vRhm ziV}J?_2+sg2K=h<-V$=kcMjt);_AoHxJ``mx4?w?7s)5R--Gx&j=dwGk2F)>l!tn% z!}(R(U*H+$yf412cj%Eoc1us)a{YHqW%|D;RIZy0WX4)@uKXQ7&qdCoUYy zLC7wWCEx?oQiMfnpr4d1pO96W;58uwHCmxnZj87SL3z8yyFX=-N?$}H$XQ< 
ztrYK-5TT*uX?2R0N#uO}bqBUe#PFF9L2@6aVjpE-Otgh#YT2i0SRrUU=~J9PWQ<^B z(dFLf^J6O@y@2x*XC#;w_>Ex~He5y}$lKo2Y{~30Ci15Kb-_g6{%y&xhkgpuJ-x!y zg;4$-$RRchEl$ZMr_W`v!!SUIAT^l_1Al~g=^YH8A?8J}4%ch{xhjmtxmjyQ&egA? zV?!dp10v6FpMdro>2XlfVM6((k8rp>CaQVqt2X3%_n3WXCyu2Db~MvZ_;B9oPL>@F3SOnzU7r5A=i7bETwOS3;(JQjvhIlpprzA1Wk9y z%(V};3}pvNA@axvSTt7ZZ$j$~(cL+3^cslwb@YSGS|?_Rvg_mbBd|9&Ird6WAc`*W zexd+-ea)(zE51Vin();UM3Z(_%eD}^kZnqgk6oytDcZ=yfTeqQom6m`?`^M^SPIba zpj$3BgDttHD#h$Y`IQ&KE~V{JAS;yGVzU z+D+15zf`bNa-e1EG)`8tCbEciWLdI0RW{B2tV)zo$0Lnpp%QL|=^%}y1pjy5Ozls6 z>92y`{&HolKk8W*A-OnjKG(miYi$YWuKLKaS5|*^TYFuqk?v-WMhI1nO3#)k9H61n zwYWwN!)l)V?E>6~ux81gdU&6hSO)|>Qs_7fuU)qMILxWm@L-~GC zOTSQ5YBFFK4pKzEI8+o!Hw&Mqhb+_o0=r}u&`@+X+O?#9VGt}9;0dqCKc+U+8D=cK z>0kjj;mVBMPobjzl+w7+tPDgid~3wo$5a;- z(2iet;Kb+H!-wtbQx~$K-@t?Sd?Uv?Qskey*xOOtBJYI`MRLNVB4@y$WShnU!{@*b z1rS=`eF_>KGsW66zRsZf0@97o(yDa0U;HzHQ~LA(ccz$KR3jMwP~%EE?eyfGWhU~I ze35p*I9xMopb|LE>M+xW)&$CPM3neGW{rLr+`W-pIT^`dC@-A zB4(i#=Zve-psj)#UjC+Wc&&KsI)2>YSa3@p#-Dp&T8527r=2H1yEwG;VdPkhKV&8u zuiYZ_>yWI{^LMHKX6dim<*Dwzoaxu5j!~XzrL5x!zt9IS!{yQw`xSl#Yz4#bLw20-_xE>T!U+#=gDcUz-r;3 zCOOq&c1_&RJd_KLl+xT*j{7^v>g!TR&4OL}HB_~7%=w478pO+AX!YQ}+zMvwABFjF z99Vvx^IJ*a`pxFeDnm|(;k&JLw4failffPEf6?{M(UC^ox^E+$bc~KWww;b`+eXE< zZQD-AcE?V|wo|cf-TKbm`|NY?7^nWPF>2L#*Ids#pWk|Bz1{QH<`-aBIf}GN>$Ob; zhYY{90=u2^zFBmC`X%7T^qxwE<*eDGu&;w!Wk9~p&;nNlMup;hQ7PDs+sTJkcnL4_ z_-l0hl+~WMPl1?)+aHKSpEOENjAO!=5aAb-4F6c14rF|&nCGJp+|1T)8AjU_2_umY zwd#UZn9&cg+t4-${6%!8P-0PqKI1tOk~kSKAr6kJhf9k+ zLjx54n+51y7WV1UjGma+h~}{^Z|uM>-Y3o6!wq~=PWi+fJphdH6HpKl%KOrcSBR;; z61a;X3BbeHnf|U2$NQ=Q>EQB=yQrPAlY0uGr*iZW=+VZ69?2LuWpga8!*)@vULAUq zmRrpmEM6D{`V|*pVcs@hcaewjo9B-@0`)$rWGO3upYC}z%Xr?G0%4%mePhzw#3__8 z^5FNgfB7`iPlmKJLj^aJ##p@Rl#cP?{coWX#RUeddX@R+hT+Fn;fecqqcGmS7qpC; z5U7aay#XjalQk@&vCrlfBbPqIoA8@S6Dvo9Ys(8ktnmlC)OarlP)DD)Uxorkw8!-i zOhU>SEngGL{!U)~z>imZ$KlM_Uk|l}Ex&12cqs}4K^-hE=I_&bmHLWLw=AOEV=o5#WmEnqow3{Jxph{5iwv2VOu-aIqbO6(4x!&H8J?l5u5ub7d23&_)wS#eELsQ5yJxvl$tPW(S$e%=3H3hePW(} 
z&t~L1v2x`bCb;t3x3p&1yDL>2qOF#*sBZg~3MLniwR!o#?Q>yYzHh8*)hrQ6Hu|zk z%YluQCKjPXnMRi7X8y4X!h#)`zIsV29VVcuBa-z)K^SDb3G%y3OrtqU_xbcDNC3^6 zNH4CIcP^c_rfe`{!tU9&Aqf9*W_|}C7n%A1p7o8-i1nJa5V zF8|Strx+={{QV`X)k9D>$c1ObvxSL;cc<`}F+BtwKl>G}K+Fl!H;$w^rc(8ybSHT0 zwvePD$~(@)XOb+{QD4u`RPR3d)2yV{8non$vM(yJwt7_-aZWJ$%^`1?f8z)xGOL%o z#QDKTCjV80{#xlY2{1(RU|;=4P>AJjBy>OX8rsx2OjbvYT{wsCG1)sL^@v|4EZOZy z!RKE}LStnjOa9bC2tHErqpNgJxZYXt6vH;>|2@g2TW{5u!WUtlR~EK+ha62nBo-l1 z+5A4GM_=aBfwC%*W~@;v2Kdlv;XSNwwXEW?4gA4;4(>q=I&ZAY!=u3#^EzReU<%AeY~Ppk;Dp z?pv{pA&Zb93pgQKQq1pT3m%CFPv1e1`fNZP)743|=3>Z7zTjSp7o5#%Ys2(Bk~R*P zF%I{w0NfMOe9==p;w)f(*KMV&2UVvsq#n;$ zZi%p6dpCAB8nMYk_&Lz;grVpI9hS3Bv8&zw?kb;gz&?L)YQhE}=YP z@bw7yFZjIzJu04T?1E;n5&fFgyB7+Xh@Zg6{HRIWnN5}b>I^AL{nN4kNj)lCcJ2*G zm0j}`9$Ve@WO!=bPiXojB23PBL6`(7Vf36_EylJ=Q2-`vm+N28(@jkHH2 zom8)g9%VHc@!o<8TWX@z_2~r$E%U#dd#`-;jO&#JAhy-OM?yvuY^I2ERT`B_5&wc@ zyP`@M$~e99NixkSRxs})-Q-4{rlu)jgW_j}qJm0<-TKduD^z>$b%bX7DMgI<$`MgpDJZ6I*VXO zlT_UQt=!3GM7L><*!9WW;>#$!GFmOWc^-a>`5E%h6|zs{u@)ah*+_Ss6BgSwYwm-h* zc{l~)yHWx_}ORUr}_A##eFXu}0#vM&?mkTdvcQQ9j)0J0kQ_JUN~fcs3>F z*;E9|pp*Y>6cmRTc$2w0Ep7$X68zhe(;6^*Q=Extu|a7!nO-Q%o448(EHgA=)_x6+ z-JL3R0!*m1-wdg)g_tKiHGQYW!xm)d)@I{jqLs61fW3DMYlN0Ymq`kkz(d73FqP>s z;5E^A9yxQDHZ*Q_6Cm!E()TvmQ+rzqA9YzZVpVsYh3lZc?o&f;+o~jd?^NdPpgP%W zZcR04O)7yY@<00(7peR=p~HiyExSmUILF8$`E-Px|Hn;^{IZ+?|;-yx8(}b$>oa81&!1y!C_qbaW=xFA8Hi^aB^R)x+(>%cV#S~8zF0@+ZGnTb{C{L;68pR$mfhvr{c~XzTSbNDL(Y5s zhe(=Y=rOSD1PO|MGP~5P#{1?lEhH40DxU3UPu|}%a?i!5m0->RD)Fuvz9P+j*5g8o z3%i2|bHhC_a(_-ua;!!E=|UlVAQL4LI8X^6>;42{)&;4_{2Xb#t6(vKXIYD^2L_%r zsf|Vc88CPN55xBLdsqrQGG!i6Lb^~qehct(p&g4t7&6NkAto$%C-i8L7_v#j)J2RA zwFEnN({y`jS)+#{d&0Ot#K-|3?o?kf4;~`kg4lgE$LrB2VWiWJ#zzd%k)YnEBXrvp zX%nojIO3a(SW&D(z}xiMOqeD1>weg-}CI=$mM<6~SHC}b3y zJ6tI{OFZKyPP%H$SmPcR#KThBSDtAsb=zEDi(_^*`_T~^;@3mI1eZVw#X&RYFdu1{ zAw6`HYKqzRy&zOPHMKQO#$~9|0b|JE!!Y6+*e}clP=(jgvibdIju=vo_n_$3)q;V} zM5A5=J6F}bx$4F=iJBBD{xCooK6<$^a2-@qvh+QYGjaF1ttj?yIwwkTC!hGEqVA-` 
zvec>6S8N{iMEok(?;eB?Xw726;gft-C6=}^z1_OT4zQ96-YkP+&L}{=ySYcS$VcBK zLIYJ@@|LMixG{jzA4TjmNa8KMyr>&@t@d}-St83owsfPje)!2u592Pk>A=R)+iST9 zdK{Fi`ibE74P{#^{|Gdt(}|z1{N|~1H{Ng9fN=yKHKp*+HB|)_KgMLk?Z$!YCDfdc z^5Boa0;x{$KL3yOVM zjPz(k9@q{$IWOLvbirNvw%>$iRB~ZseExWmeLDWguqEB9F2VGwiL>Ph+-XU?Dwlkv zHdvKS8hBPT4`91|D)NTWsc}Je9h!k@mwb4Syz`uXOgwFidn#|Hu|4N{w1=94jhk zV)~Ai@1#6l|I}KjJH?m)XRoZ}{qCa+-vRyF68k$bZ;US>a?T_EEdYq-e{{Jxr^Ak8 zLVJMutPzpEFTDgGvZWh9U(x7W6l1nyN^Q>YVUw^Ejc!@m!Ltd_`ug*x6wUAs zr4o}<%kgQ0-r46-PrN=>tNjO{Q`Bn{&yud?_kxb^`M#1o=05y5GGh{&E>}+ihr9wy zQn|3JF)XGfCLpB*nCntIi9Q*|zg~?ow0M6k{NdnPu-s>q(993d$qJA z^#?y34lkbSGQC12d60{@9BLO zjJAUuIWa!#mmNa~nn(_-i@t_GMpbe_2M%10kWZhS8P0J1 zFZGTlAJOLecbA9%iUP9jYpudQom(B6FkfGvGGH<97^5G7v%~Nx0)VD&SB-I|w|*Mp z55bSsKO-ElMk-gS(m zoCJAG;c1x!$D-H;t$|G){TD&r&cy@?CXFL2@LK00i3$Kldw#xQ%w4i9R4`!G&YP|O zQt7u!uyY%b-7HR5>0-Je>v-|GsGu@Ty9xlP)w=Vr+7z+hrvg?DMXZgi*S>T4LXQT& z0hnCK@NHYg1O|FX(QYQj9a;-K!0G+j>|?-sBVx(`WW)sX_^28lYQO{l>*bZEzg)Oh z)i0~*PwWzWd*6eZR#aCu(M>ME;WzeJ=Dd_uvD{?M@f=kd`RPMG+My*_f@^R1wx8kc zCquTQrdc!ikczEY@_KX?^toq0lgej!JH4Cui82Cb$u&@#B@8F?ixdb~lZ}i=32Uds zfQ~SrMreibZC&K&2hJL1PWRh9leCw114!% zYp-~ai89Dn6Wo?6SHuCCrat|nleSfhD};S0FY%*#*!4&;w)G3MV0L%Gg=SQ)Oqq4n zX4ELtgp+^6`Q)<=6+ibv84EToX7g0^TKHVJ${)h3f zP0o_JaY;|fa%^_#a)qj6i40N|NVQxgF}*yzuSOUOXmu!Vz-bQTRm(`Z?{Z`)*G-{) z_L=;^dRw3i|DuyA6DgllbO}7iSKAMHOVdyQgx2 zZb0t7`udQSzsZe45eIQwaV0M4=%*F z146Gl|GvqNx7&W2Z-U{eX7$CQ&0_Xn`X^dNDT<|SNHsdRgIpp7@yRai)Cqh3$4Hgj z1t_OR(0=f6u%KL;vKsC{lh4$o4*giLX{)5s_$#74Vm@ju59RZaTE+b1+-B4Z&jtc$ z-^|_L{(sVJk89&j_A7V}4YBA8#&5-3Uq9Dtm!=|pm4!NuP{1Qxh=}uidc`c21-rWj zo)g>Ij=Nod263SUL~_=hV}HO`!OE| zm1~w5g+Uo6{B;m9M74aNjAk8v;rEyF+}%%&gT*ZxMsGw{$G`AsN}1cVC5_)Z1mkC` z1K1&MM=>o&G{JI|AoIDqa#X9R;2Vi=e8h>tppt)rUFfJi8feIZ@k?1niW(wz$7BC^ zbM%14vbshKBbSm|Q1n97nhEn~>TBU_H|($o)kNqIF2cn3e>a(jt8zWepVp^4tz2Lx zRK;+8vYJ}R#a%dk;%es+m-7{k2nv;m2$M2$c?FfNdJpd_F;>qAACGHy;SQ+Yi2ca% zVCa0#W!faVi3(#)<9%H!d58DsNwkAidD?0iuHveOM5J&sc}KYc^QWt%$UVPexTXag 
zYGP*jAjE@<+qCKI$vP)^bz0!^4fP)$7z9%MI)t0O-**hLW>oSJ-5T5{?CHc@ z2V=Yc^C|lI{{ig4cpxSC`^JgA41UK6k(cavYDo)&59OHiFjw$$<&{o82OwS;ioE2}-pKOrsZD>lN1=-K z1pkW+n;Sm^UxGj*{C2wEDOUXc1OFsz@@hJeAxvfHdvfEY81AyB+vlit!3*#X>dD6+9${v3aXD$d`=_T- zOsDYtzu~31&c#>_lHWIl4lg0iw_tDa$)2tP>O3&sD}}*bR1d@-N8TgbhnJgjh5UIw zxlrLcc++DC=FhcNSq=6+XTmzANnle2m*`~}#dLF|nhscjWs_UF`(zM28E6g)R9QUU zaA`>-srS0E@wgvY2fySrhHM_9zH}ihZjqX8VykRynqr{v68DoIGLL8SdUUG@If{Q#*je_yH;5_%M>G7rwo z+0c9V)$C07@yOC!6JquVzstZwiH#y6HGRLJox8AGbEWsUO-SrdDWA#7!jmkx;c zZ%63Yr`Uld4Q48`G(~w~L;(+{4q}ivpiBCjO_j_$hio;kJ8vaY_7p?ANwFzM-jl#@ zdsYXw#jOvzA@s4vrZ%(PByd?fuF6vf(n&@F1=+(Jcxkatt^EcKT`uu|P{hJ6I($>_N&S}yRn&_Eb8xI2 zbFLHw5sweH5D_Fw6%Uh937p!GIwh8hjwdlZvQM#qI*~7-&$qByZ%AjJ8+mx_&dW9w zwiiYpDIx^x_@E~#rh$e5IumgcYnoVbyWH56Kp&6pBZ22o5%6AXsOu=M_St;mo(GD7 zLG3P#d3WeRoQt4U(0w2cz%8a2^Jh^6j|(bH?}~uhneTf*y<$<=u?!alIKAcsCz%jrQR={ zQ~OWPo~JSq5ZIi?s8)tT8jCz_7T8hQOsd{XPm*bcp(3-y=8#*=)5zxRe;u}ykqiLw zw-`4Dn=+TDd7o4hJz@x(VX@zVUQWI%CKqdz2QqQ(0e9UegV_@5Z5q7kDrDR8zD-r47+OH;_3X)JmUTMAnop2<4&{jT%JP>a&|9)HZ@)wbF^#b3W@T&IJ?ZR)#TO3jQ415SC&gPm}M7)D0e~+&f0Vt6EuLNMKh7Ej# zQ|Dg)Yx?97IoUDSvu&m>?9AGaTQyd?*v#en&>>=~^O^-B6Ld_Ru~dnobHll|=YBW| zt-Oyx24HTy>S`ecvO5+3-95wCsc#riRim>mgRHrD938sgdmgo9ict|%8&Gd&Jhlw) z1kxuLMIaL{11qHF|3J%kMEr9rANE-mbFT#D{ZtB@R>6`tkGf^H+)U?U#(TkVm#KJ` z*rN=*3S(ooFJTX!tt#br<$tQzoYb1DRk4R;4-env#m-Yu&IHP1W7(_oy%6KDxl4(T za?=;&{?S4SBkpJ?F!Rl-!HtDM*^lUZs2g34I*Yjy)qy|Jq5p*G|0emP$wE;io8KT7P=<;jXL51jHAGG`5W>mn z`Oc!l7)H-E`t;u{0Ig{)Rs5?y*y}}H-x@by^*Hv_Hj(>za0r2om#AQsnD5FuluGQ$ z415cajD%qVH(@xiV0#BjFza-`VLG39&`HaT+D4j(mIg&0C;%gVe6?&_eT0 zcIj&hu*=T*74F5%FH{l5l2m0w^lOUYEk5?c)a%Y?Zp>dJ@HtI@+XN|>WqL6lTf&*) zc7s$D###TR_^b21)A{2Z$~aRF>ifY1TO?tGlJVpT0;El)yy+yFS;Z6Nm^?WhTyQ)c&)|%3@bxzOC9#BU#2vf!hO*=u zS=2J@+zpo&48Gmdd@o5|3+^2c5PxU;&f@Cv>{eMS&1Q{(Hb2pcC93!zB#_cP7V;Vz z_Bhj^;N6F4NpavDW)|{7c9%G0wrIMPb3EomnrLoir>IKF*keL4i~lN5a*8qdh4X7Y z`faWm+}7RnZ5!gdVZd0h?VbtW2K}qKO;Vh7=3deyxhecZherP<7dGGLiidY9e=T!Q 
z;yW*qC+S}MXg*8-p)*P_qY?SMQZj{7a|y~Xb5CYzHBVjm+5gQ|hj_F6zZkp7LC((^ zS0uyF`r+Y{3ZTZmJS{!fdAbhKxxhW?45hd)&(X_lpGIM!g8&Kz#3`46@eA{AS zX8V}uc0-kOeT%)?i=+czv0#8kC{^lH%iAEHgk^L;Lvr}3O3Y@uKPH?M_Msc*QZu<7 zihx@KvD~gET#wtBHW~fMLKcatn!E%ZKg`U^{Y*-KpWo3A{!5NDzySI3-)b8%N(~Oz z1(zz62&hVtz~j#AN?4pGrdn?_;l_~j{V7kl*ao2`KXIe3yl2=@J4y_ovD z(IUy8H0p$2&hZVJ74K+M%(o9s?}j>KBZBSwR5%Ha#PsU|S(6z-s7kEBbfVZfiQll$ zWl!jO1RU{N&hg19qo3rw230|Zt4HQHhm4Clgs7@EBGn)d+txv;;qQ^VtqV8$YlbGH z1-G@K!@T~>0}k47CO)m4Elok$h+Bx;dF0keJdL=wjGqE2!r-JYLkba)jTc*3lC^`U zF}0GHe~2p9O7oLd$CGDhdhWz^Vy+6(!^V?)SYue+d^#P6p3xf>FzcB3<_A_Vnt9`& zctDq%olfnv)z2r%69Wur@i|{)8qAAfQJgHP z`mb(IY^J4lbs20-E_3j^GOHQCzsJS$R1pz3tf{9oSvzp>lWbK>!UNVsr8uTR#H)!$ zPZ|C;2e%|HgiUo@#l6dXq?pw2-o1h?BXaAK>$dB3R27-C0_h$#{cjT}tzG7Jo_$0f z%TgrYuDRC4GspVFDP~;6MHe03fHF{lG|U;X#X>N>mK9UiQn6(?o*S|Car;A4PAJ8~ zMtbmKS~wkq(3b8vir(B{R8}ToWbc`@7=z#+AOBRQZWJVRk#e|7&@9^C({J9tT%3iM zBof9D^DcZkVgWcczhGdXm#aO>4FRLg%7zI4e|$Uh+t>ev)Z43PG3{vZ(aO2~yEd6evErqFn)cu`KbQeO%$9$G4mlBRt>~Cs42$GXuQ6-M)od5A4`%S zq>XA1k#ImJqV;Lxa`fW1NFJxfZG1Y>`|B5^Su=f$O%loq$!mA@hZ5)LcJ3A3Y>SR2 zM@^gjMr<_ZK9Z!S2&)5NurTU$_j>#q^7E6=^AR6+(0l9iqqX~QysBViP#phHBz5%! 
zT9|=8bV~N2K?L>>EOHl}N6sowP*#m6X!9>yCmXMX>|ElVWy?tCNZH(zC&=e?UUq$; z+joF*emf1qYVVt9b5Oj1-xDDXiE5>4#=_*qCZ9>LXhrh!0%Fh8Jv)%xz{Q0(H~oFw zZ1dLLm-n_?$f!|;p_o94iI^F_x}z{o(K7D4R-e;j`_K6UrT26q7;hJJ5Jh^kSW9Y~ z7Hk>x0Q-|F{=^3Kl_!N>irmBmo*M|2mdB_ONHf_UwN^M4+&w+k_Z!CEHM6l}b}DZN zV8A5R*iAgVX;WOnqq~PnkqaB>GJZ5EbIxP0pqwbTu}_JtP$ThZW6FHjBtwD`2@Cz=+4WLNWx?wNd=*FZo zaNc+LohfU*hr(%>(4WI%^%x>v9_PyN2{*C|!o+FMLx}677Y~TFJ|i}^jy12t7}!LO z64XxPfgr;Ek~2~#J#-H@#7O?MN?bTy9mui5$YsMO%8L{gi>GHb$VmKvFWx_q&d;%m zFSmA^;O^^2)$R&#ef_fCM{%prMi4qH!l6tvX(Xb8;BP8d=h?lR_7GidOOo}5dz%0Y zN1q)1Orv{NR686!b&Bod7H0z2frZ_wSNGtvxv^8U6X8|;;=ntjgua2R!1)MoS4ztP z**z>&{o5-N<|*-W2ftOI1vd&9!R;pQJI(}hNlXmfcu)d|7J+b9QA{4He5h&=<=thn zrmrbn!Gu7vwTf?=$R}tpM!0<|550oZNDT_-jCP^ z{(cLlW9M!4PK?dxa95tRN8aS<2^a7aK=45-bV%>*tM|!>MXWk9yny-6$N13Yl+ih| zpAAKm@(%D5u`9NIp&u(8&&@l$pF5K-K5=apVYJPW5TYCy@`%}gQ0%0_AV$eQ#u6JE z&Zj75OjH@WQSaz6c6}vjC|TDqE|zAJqM0|jQIO${JQ#v*MzE`0;!Syu*@$r#W?#$} zb9Sa1)GFxj6cTAwR&EQ!W8lDs)QiqN`GK_VlhLw;d14(1@}Ne^49o813#I3&K2 zZRWBpEJJo%%s}4&he;jJs|ANKSV)y*5Tcsi3;ID<@GAIbpmd|&EOHb9q27*=Hq7IP z2Dto9R>X!$)6tX`p`_5pMbpL!d2A7j0vCPlxz`%r6%Lqp|Ma6C(w^N@WjrC) z@owa@P3EYD4y}SVaqx?lJScUy11=z>#XLKPDdl;g3^&m8-RMLbIw>hfpkq2$`(bO~ z-`8aBvj^wV{;r6RM*G>?e^uK`2}O>Wet7lqXxoKDAk&xS&F$i4yB!5AEBtZp#{mV& zk>>RY<@Nn|W$HL0E&S)yJ8EYUWn@5>$y#{kT^<_qgJZmtBz0Dj-m!)v?q&@XNpgkV$8eUz|cYMNtI?B3w!p;OTGj;PuNF}%T1l=3wg#~1( zH$ON~m-ZC8=JP!C(yXW8v)uDg&$0KWKkm8@^bbkEiF?}kgQWbnQFBJzm=W2S+f}(D zKeZFa)(1JptH%vyGPKdF8qGWz*|QXLM{~y___76xrV5P2(mH{LX@St7J$c{M+}Zhi zW)6BL^GqAAG=!#1oKuNgckbTfe7v{ti%c)LR&cqKA_Q*$Hm7&#>HAHBe1QA7&Ozb6 zn?vi8ltdYeJRB02I8NqlSLNtDAVzvZ|kfnAsj(H!8%{>Y+biqY4a zZg7(wxQR~^5qU#}FUmdCyO@+7*ba5f{dEwHBgu`}saUE%OW$QVNQUTksDWS(E-k;F zX$qZ7DCf3U}Am^bM84;)N5`MaA6}TQHi8ZnQ+t6`|AhMdFRQs&Zt1R}W$esV& z?Gpji)^W*sXOhI9U39T6{B+8B$Kk#XqSQVn`bPB;LUU8wUpmRW5srC>+uLENCXRCS z^ygSFOK!eW#m_i)o)lGg{APZ#rod3NqGwYb7V=My9fW|d?ZgFc_PmkhFYF9X-airR zUFY0*_^YF0uhVbob|2F6Dy)sUm;>!W|At4JVmF+cfRX9n_8oWT3GkhtDRV4q^ZK}% z>3)5&H0FFL^?YyCPNHMIYBxIb-eI8yhVoq}XN~2V5urGp1 
zZKW5S;-^dgfp@df-+nUE;%S*3x-=~J5%ZltytfZrc<@7>`YU4a;9F8hX=*fQE-0Wr zk@mOoYk@zia(g4N`BY1BB!(;%oTzIJw#RJurca-vvF19;ASxO+np{9cr15(9_DR4gCAPJSiYNXU>cR?UC3NwXwa_Br8QFL^=(CI#kd&G+G$q9 z&=lE=Pw;BgYc$2}x=}2=l}DNG8U!tJ^fWD-%?hKl{ZRXKJE&aj#YaxB;Y$D4n+pWd z&`)LqTA@rg?IFQ1iKlgGhNC6sFfUBU9zz`Z8S0m0V6zhDEN13O>wc-YSLoy)J4^N2 z>pKz6k%WOL=K#Y`BFj#S5FHE>-rXK+IZZdu9L68LIn6u@oHXZ!TE~0)!EbkW50Hct zHT$4X>^p@&xuhw!ArQ3{ONxES#POi#eB*3nspq|9vWc}XD4W7UAvTISwu;4|{xNze zxDM*kLPp=rlDdES7VwD$QqO=5Dq#stAYy_^iqby2&X}yzut9`PJ=ivII=KrbX|wfv zCwC~-I5QnAq1}8DIw2vTlUzbo)H@_z1QDT!9?ekXyCL`ob6{Sk^*>(PAU0sQ^+ZiO zj(_atBDaLzP{Tz`{KRM11^haVvD+5ngROB|p_~xHAa{jCBnZi=oN5eQWOe|@DBCYm z3&@3voH^h(V7QxcFnVFZR#KXrC)-^eE1ltB+w$nHqzsAycWKJ&JXzXe5PbKv(u4@0 z1jWt>TtH3!Dy&h$;^@8k3*qf^0;97O`{7Hx$ z!}b!BtGnUf+FKy*gA@jjtV;?ynAE+xT9zUI^B7%-xdJ1(q&Dn+pCgp|BiT3M`RQNgaL7leIGv}bkx+|sDe~3Z2iEbnekBD}6 z#=L_gYIa4iN#5=c_?=hghYAh~Z z^3Q4|((GI$hp-q%y5|x{(4}^;R`s!!p7sAM{w{KqOPb_)Z%o>=-v^J^`zYf+=t3*n zm7dx=lCExa9{VdJEuy;al>2>-=1n(D<`s&fW&&W79GftzZIObhT*I>RoJ+_#BxN*t z(D91d42%p1*S!HN=oyx^36*F_wT;t#^#+N%#$Hmh)3beSz1NWLggf427Q3z^Vu` zr)FW$3yPbG8)X$^n$`}PPh)Oz^a!+koAm0#!HoPa`0@SKc!o!kwOa&=_*Q;M^79;% zotfD|6a*dh>^@I3JeWHtw&nvn=U|X@D6e$rzxlPT?JMX(3MAE`tPKjZ%i3fF8j&7d zMD=#yl-Ep>-o=Q}Y)Oe2@schMcmFhz;tN%URCii{Q-!;;-)6^5hIAg|8EAuIccX~! 
zI@>KA3 zXDFWyO5IJ27uGf-yw23mD3nI_F_3&o#z2G)R&7raGuD3F+R-@HaU2Nx{g$Q$d3554 ztggu~QxtjRlNf_EKoc1Uy}#g2V;1gMIVFa`F_Qfm@}uZyr=j`{+!ef#qsWA z?B>u?xA$7Ao$h+z9}B!fDjm^`IH|H9YK3i!n;vk zw-0&OxR*z*76a;QNMkEfURo`50(dUrW^G%c!S8b2oEU|Lu|~8r0&H>jiUXt`|6zdyuVGw}|J$8wwCc=_dKJlglft_X~Uu{x(5QBGZGfI}3 zFDgt8yfVZzG%VFqB=4}cgsA2hW+s$xeaF-!n-{>YTq z`XT3aD8x{h>=!w{`Q2Be<_}U2D9EFXuTx>9rtw9P$u0>y)hl zT`88qLw;@!ZXyn`v9Qp(Uu9eB=k`4XjwK?8vYy$XWafV-*nw@yWWXUenUlTA_L20M z2e&$naa( zR{MGl(YryI(L7N4uU|}O0CPK>Bxo5&YsdErr0A~y_pWeLN21bcSr67A3pVo_;f>6Y%>RQX_q0SWz83my@$(P z-D{zjDqCB~)W2SeiK(m*_#ZUt`1(8YD`Jq#&zW{N0umYZxtQ;2j4s^c&4&hqP41OX zj)7=GYn*rnkFfd7zs#iP-GHnvB;$)EhT6@Xxubv|NqRd^3^@ez2KBFq1_*!0hjF=3 zOl)oG#=4Sn&%9e6JCB=;`FW7?-K7wMmk#RBK8FEVNL;MLP&GLR`nq35*s$ll%7PMP zc&tuk^;)La2s3VA%k)%-EyOwvI~5Q*P(Xou)VY1_a<@>ngk!4v_6w;YA08yJTbAC$ z9_c{kB%uk`m@HCi{}oGnLovSBO(tM&<7iav>of1a$*H221I0sjpI&e8v&}SV9{eyn zJ6pWL)ZZ-Y0HjT8vp68Dx=fYf{CN5oe3BVl6pRA_kBonptV{(_$8%IjrkdUpn0 zVRQQSvSlZWM({j6?H;obkS&Lxk$zmOTShi5@wjUQm{T&FR}VMN^X^s4w(;r!wWAm~ ziyav3_+rj$fy|xmCrM8#GR>TXyMM2saoWKufpiwpi$Xt`OvldZL~F00nHrdDU#-xo z;Nd|iA1`Ds8#Kx6E2oW0-)L%eah4X~16Kah=`l2Qc9uao3qO){6CyQsf`1usPIjEo z3Al!eIDr*9DW0b2c*0rn8_f4YYC&NBJXUh33<;N2l6qF!k1lRDa8QtxoCaM{aL@F0 z8W;;_0lx<@J~%QJ=%5>TK~Qie`A?_lsTA zUp^TIW+9THNO45 zlC7oWfiOVZ77Hw^zE#jYC^(gd3JpEs%anMp6v+6vX8|g9ry6+PO^g8SqcgX)zB?gg zFI_|MaLIV^W(#R#ggQ2lhF)w}`O27p=bHZ=5{}dG%cQpS*}*Kq+?fnbn`?vj{BZ z=;|)-KDm10*Ss5@=!Zb?)I46Dol-{bE`~{o z>5@&0(BTmAJ#!s3#qDZfH>llESnTaO+L2)6;~S5bhnq{Jk$_uVN{Of0NL|aJBYD`} zz)!@eH(Eh1PxqeGxq8>)r4JpC%DuF_;nFR}MO=eNUtu%4 zJ9+kgku#?#d5XY#-_60zB*TejhPBm~>xsMVsZezWQHmm&qEFhowLZRRE7-bD!p+s= z{dzdaN8#9vGoMl8mZf0d;?Q**jO$+Z9VJFjI4#a_cHo~M$IB;#@}LmjNd3!UM!)=x z68EY>8G|yYSS;dso}@NRP}6sB%!~YzLZdS^?xDJ+K%6yDpB{*i&9w3%Iqxug&dSI1 zAFk*ePIn_DamRfi4L&r`=uGArzEBL(?4{)^w+~|QS2i1J+0Qb}gD^f#drsYx{974B zj=*c?^Tyo;B^=MW58bmy*7XqG%bk5|oAo+l6T#xcAL&gpuefCboVxg&z6s8=Jaqpma?qjkbqW+YoA@a( zCz7`~RS&>36zjt=%MD1{adS#Yr(6Mqi^zB`ZqK&iv~huiIf4I*5YnA+ZW=(dUg$5s 
z?ySP2jx-`K&JVXoiOOrJ3`IzU*Pzg6aOn5dQ02S(gq|h@5bz|s&Gije3<{eKX6CZ< zI*4Ll0O+1;yKW|j^6%ipaS0l-c=x%vWCO@?7N0KbYzS6*nq8AE!da!_X#jNu`JK&FSEVV`;QXQi7yHiVN8eu-px z4TV^B&uGNdfsrP%`H{9KxwtCET`BIFBW@;M&JL7q+`IuFJaWeRbN;z3{g9}GT+p-) zGTp#4%JYn-UIbv1(UN(WCC%RzItzGCIv@#U5O{fDU%9nGskeyzMj_AZ3%MIac59>h z1w~j+qhAh`J;tY6qa&g3qNQ=?HVBQI+t+P|Wkvr7i8x8*#o}cP5M>=|2my3w(?0;L zofD=RwC>(`Q0jf@mC?X49rGDfRh5~FLgwb2be!vD&o4ATVMrV9=p10KQcF3v!yf2k z4A{BOB^L8ED)cY_zUW%%J21QCR#|_0c&%stmjbA_yBfI6Wsm*|=>2y)6|{N0PYXVzs;TtvQxj{F!pMJmW16hlz5=lVGqXobij8wf7sccv zC8qX$M){P4ThRcIavs2+eZO$5vHo^(Vi?)Qby8l+n=(k=)(F{k=87yqsWMHP?S+^J2*tg|hzhtRy$ucw`$DL{m z8aq(?S~lhOS((!^y_g65)n2zxzWBdgZ>h&G@&{#qB9q@?@kOTWDRd8T;PZcLpdR&C z>+9$y2dM6g2X$Wd^kqhr$$d!IZ3<}W9{)zbX14h?LeuR=mxrhYK6_i2+^_w|k1mm!?`-f=b@JHbWxB zMPa;YwBd9-3zR&+#~an2eA4{A>!0g*8^0T8qP?+axN29Rzr;?p2O!ez8uA!vKiwo~ z$otu9Apc%`J@P2ZwDuI;EO3LvfLrZk86Q7n)_X|&)TOWKOrYCX>&+ggxh4zCipU=6 z$%yeD;bb<4X8iKsEWqHkj$Q^0quq|9aroCbqgX!Nv*;3I6`3e<`t)n$*k31jbA)80=?>P|8QTq0`Bj1MNTHm{lWi&bJ^@u9i;f#-!EHQ$th zP|I(&-lllVZYL&o@Ba^bZy6O=)BKC#E=h1tAVGq=O9Fx57A$!1;6B)30TSGTYp~$% z&IETG++BvjVHo60p67knI^XYF_rK0Pw?FMQ)4Qd*>Q~)WyQ+881s=Y>1evmM`0}pY zUO={5TBg_fiuSVM^amqC}=P3=`R}zj$0* zaeGP!?H7}GYwF#5Xa`N4)h5R}r*oNoi;l^rNb=Ipj8+;%n{==p$vUn>7;4OBJ@fya zrrtn_I1aNKX`HJK*ZXPJu{#H*=P)!Z&;9|H;k?~lq(@H%G)#lry`7qz#SgdbKmr2% zCrQNB8|xMuhxFP%Q{3G9oySKC&Ryu-@T(d>czU=5*0A*?rB4;0-&e0G_HU4%iP7y` zH4_ua+AM4gd~<63L5E)pM{UQEDwG$1s3V~d(bLnrNMw+&6RN)X-8__YsS&IpCNItQPy1U36gKv_WjN(;_L;ezWQ20PAqalpR`Mh*K6Qy3`X!E&Bc&%JBQut;@0_r^ z$Mf9{q)uQLUg*CRFc88T^k%*V%vXLZ6JiF^5r zI7yK7pLRHihfmTBa>@qV{^7L$Pw`r}PX{Z49eCLB=1+~)3=?}j*{XV;iJ&m0=R$<7 zZBtEPM!FATWx8BclweZ8WvSW~mz}O;vwhSlvX~cBMqQy(Izrm^ zQvZrovo(e5E4BEC8_QYgS#RrG3^c*+KF9Y*9mfqD_5mcNuVQy=gG6)JS@wsW^odpo znQT33j4Ym|{B3TcC>Ol9?-nE{?zT*xZSkVE(XY9x@qiRw07SWfc z!e3z?SV-(zL}Z4&tED@X6dl{cm|2k^At6HW`$3K!%k$^3?J5&j20O{LN3%RU1)cW~ z#@*+kiG+KgG|qQRF=IxN)dXLMsd1I8S5WGstGh28g3sxs_hkC6KUY;Fr$t+w{WKRY zzv)?96YRqez;swdzZz>Hq^SjM+39R0e303u&-)8!66$QzSyil=9jMp-ka6j`X{VLL 
zxa?$ORCGxZ`26PKQnwR_b%ai9)HO%aH%b|ZWog>oCAame8^1CapkScc5=Y`+L^-yM${&w{|CKn zbmek6>wRXC*7WF?y4clrJRPtWjnH~L?ykPOJAP9@MX-5u*FV@O4{Ali(Fopjz={&2 zox`nfEWCWt*psCVe`;vYh;I<5JFH9%Ukpc3_LIh61dSgG2<0q@cQ?-myE)ZL)%_t$Cehi*swR-$Z}< zcFE|jInkmu=c;Sq1A??`E71#;iaC$6)UUnkqX- zxndDl<$M3A5_5&(IYe6rx=t>vb{#~t4~KevjHp`10RgUosqr2V-PMeEAjJOL9n1zy z@)vR6izBqs`QVpAI;&&l%;r+6)q5U3x?z)a;O{59Lk!bd+$xvID><&56OZ{v5Z=%W z#7j0s|FDiQ+x$Gz5@8YBP0!(4&*#0!SqrBXNhx@{mFf=xu(bQazCEe=q@oyIhatE@Pa(z1`m#?!=wlYVWY&n5mRsAx~Mc zf^pUh&g0e_r);YrY7Zo;#UKRkqXt~M%JyM^qF4`&ssl9!$o^0^7pR}hV=>}<$_?12 z+Uj&oebHn)H}KLI9Hn+;XiewnZI9r;RFA{Iy-`fzr*{f^E4i%G>eh;k-WmPwbidTx z1MiK!c+E8Y707F`;QL!uCmPXXhUPyEMSYhQD%hCMcKIFz;^K6F>B2t4uBz8S_?gGo z*m>z9&!@Q5QKuytF+y13UyA-23@UW&i8~$WT@`8CIARE%Kj)P4QuyR^ZXpyuI5Hd1 zjJ?<@$HA(LS-%za12Lrkti)?A+_vle{mX-Mcy6@>j1X0OIi)5Dy?oqKU4@gc=-Mb* z%J0wv!RXr=|4K$*oPVexrUvPq>8NBk8z+D&*IRvPuY6)E%~05WJcgFY&>ff9T+jIL zaztDwq@l}Yf zkqkzwW8xF`xUU5s!uDDPct~)--JxN*9+P_7d^sC%rxo$N{Lf@DB5(soUB3=i9_K(R zh=7K;=hp@ucS$?IGfS^=!&C&032qtCe~=k_jw*0Z_KD^j6ExH$O{?u_eKqXWrRHsK z9?LKgqZ9DDmA;)s&7UrX$rN6ATq<@1Y}%@nHI`u~dJoTi@Ur%%961&gZ-6;1Ip!`B zcB7g+U@Q`-S|DdrTLC+b<^jBQ*M-KF>ncrq&@S8V-Ek>bz~PZb)eFiKRHGm;>Q>o) z;2}KcP;35PNLSPNCpWu49+)k&!;CAlEdsLTbN-CaUOWt)snT}#F>!}9rG26v)#CxT zj!qO_h(npXVZ`N+ZBJ1q$-z0&N<#TI2ZHIuvk)8kqN*iqX*My5h5np7AHj2Dl7z%; zL3*Ts_WT&6@RwCsDg554p4w?wMX8>oBcIW_*(ZQA)oob2W#8%rd#1EbZ|8CxONI>4 zwLUm)LpQ4FR3>F5b3=iW0Xu^4kBAQDN}n`b#Bn1p9>{p{IzO9ZLiTt*VsO1Zc7?Rj z1!Gqkh0u*+vbsd$su<0oFxGV(0<3pKf_l|_BxlwLs3ZxO()-tD;-Ed!gn|5hsF`I5 zWU#b84-)=VjL~#sF~5Mo8&&DQizH14lJugdpEfUh=|@fTM8GMLT~mbx?m#Mq25NPN zMlBT;vN={f)K37~2Z-h!`j!UFmlhJU4_Y)0PoCkb&<2!V>C8w`AZ>>p{&4PrO*4xa@!?O?w_65o>j$u6yy4Z_O(S^J^i)@6kS8 z2(y@rb2koF{_1m;I;H1v5#JE|h~_3h?)5uexD{6^-Uw*o?x~vq_2d@kxDB^K#M_ZA z=g_ulgZ29E@;zMhbW`w^&?MEK(fqZdwE@9T*1%k;W~?MhtWpR0B#ZY|*bRP!X`$dV zkbP%ZDrfgD*7CI$S3{RAsX_aYQNf+*?VYE|T(aqf$0>NpEXh{qT_7En^j>-uWND(! 
z@g5>^`DVU~X15KE^EjA__M4Y0gm?rTw%*H>^JuE;%8GVx($JM|90E~}mhM0U`%)dw zN#!s%8@HPDuYs;}A9&Ua&4tI0zu~V1mVCHwVQTE^!_0wvGfS6-*HPl&RS`%ZJ%vU| zx}g!LQjgbKuc6@INcqv&*;ByuV3}&PfrdV^m_rm0SB-VjrW4297Vu-<(i~1JZO|xZ z!Kc1}P&w1lsB5S-P4Mm$b|-~GtqLt@sQh$ar+er+g#(dyGxV;05OVW=j}dU8ubKWi z-9|W-78F`!@)Ma;0%Pt0Yl!`}SA+SMqKqZLsso+%5&MwpI5M%ra{k&M-x1`4zYI4M zx*9(%?GJnMyVsv+d+l|S9Q(LDe(oI~wSF;i_mZzHKp3Mg)eLI8U+UDK4T=^u@xe{> z$3MBn1zd121DYw zMQzMV+MLObI$+n;M0m*V^E&zNd0rxo9*U`oVH?~OV*Jj62lbaDU7Z99 zRkjn_{)H(-QtII=R^n^_CvEG;cWJKDGTUN*4}oMq&pU&cIxzgw*L(ZB*+O^xaRqPQ zn{eD}fH~)psFdbm_VC@NL)L3RA3hYsNt-S6A=LYe*x=LE`>g~gL)5wlGyw;Uw>gi< z^QU;^bA)S7RIa9Ag(e4NH!}CMuEaFmY(l8S1=XF{hLkT7y6C;`IU% zu>4N634*@<*;+#2+T$!h9itZ9gI{X0wgwX5WP1|*{Zv8oY2-L$^ai=U>iVeanl2Yo zJ4nkTX>jwJFL9yU*cKPC$HYYK^I}9Y)UaPR=>q0H@yI!G`5a9G*A^O@?!6&*^TzQP zSvOgB^-b}?(!?abt3zNPRLX`{>{`64Mks3wyB$aFB(T7<2ka;+!DZ4LW#SO%L6jug znRp<9+85x#xcK_mW*~L7`7nNs@R)Y4Pvc zFmZOJa{x{D7Km$HmLR!cROPiGu%8KA2){q3aapkOJ$9XH@BEh1@Zcu7H|n$B0kdlu zwh5V(zC%G$NGOWycmDPM{Z@3yd8A-pnhH3#TUjUrqZ(2m%!EV;xC&E)k^HVRvEt#X zxGSC912)Vhl<7&GoQ}@x!oQ*&pP#!Vu&~A-CIkRy^2|*b;8k2IAWApS{w>Av zAx%eN`3lZ>2O6N8IZP_m(4k**;xh(5{=~JW53~G)678CKDp){hCu~vQ6wJWj({JV1 zouymlWy|yNkzo0PA4>vpiKU;>?-oo-x0k3c}Hy{PG~w(eEqgd1TB zchwgYJ}LN~Y@t1-)l#5~b-K7Zxzts#nifi{Ls*pbyE$!ZXb2{4h?YO{=o)I@Vg<79 zx*?iRMNpynXU{M?S^#+o>gmABAZ-KUR(fv?wpfx%`Ax)a*z75Rwx7=A8V%Oo=jvaZ^D?so&7;Qni7q?}c4xR(-BVh-8W1k3NL>wc6+Qnrt?6^FW!>3J zkQgJ0*&08G*yI-&S+AniH-e_#RC)a__cA^3Uaxu(b~T0gb!h|6J2tXl*X-A@rCUL@ ztJihop(Wl}5(J&F$y3aIRNp0OWvE`wq`SgOGYINe zRY|e?COA!?N=FEXT{&{x7E_%%7@WA^JE{XohnC1Pzf7$VTlLVF9m}!Ls?s5JpiK>; z75kcOdmCs3_u*XS@pj$i7F{1^m51fbo>+*1f;8m@IJXjA<$6 z)59Qs7nz8A8eNoh%rJRbAoH*xt=w4_zRb7Yg%0yQ7pk?<~Xm_{E{Ons$IoukXWC27VXMtIJs z^L@GWMDlXwAYQM759Qs-w?loJKo{{D@cUD4cwnSw)Wfs#t%Af{R>nHJz&`k~>n}gL zZ{DV>BGye9X$`NB+))~UWVYJSydaYrr2A!wk>9t!Rqfz;8B$wf?9KN{&sz2M?0?r+tFQ1zfp?Q~qc(Gef| zVD2d+W@hC0gH#5YoJz8`H)`wd%im+_S4@URy+*_uEe)ar?G!n=<4 z@hSP=X-{<6NP-lue`OAe*K58Pwv((PSM2NO4VGH)9Xk=O=s=B3?qUnRb9XvlgQPzl zy6sq1jjy6#SF6s^PJt36` 
z>(4uo6wRVMHm{P00(8!{C1P~nOs}&__lY#!V7K15!=)#x2{G;hNKUP$A8<7%zX`#5rv+#c%voOd$Q6o7jz$OF>eDDue?Pf^N zxg-7Ej`3Ef`;aR4ta77wT)KAkIp@E2u4ZdpCP@0{Jae~jQ*SIc+F3`}d>JQSK)(fnf)!oZox7^~1l ze{3db;38GG597a(a$!H@iaSRp|l)j=DIv?G*sDq~XWng&pfR*~%Jf)Y+uKfX!g&OEBN7;j|^+O-} zU6+)M1o!e@D+lyNGq8YlT)!M?f1aR_tVB}quJ#aa0z1S;!(yz1 zqV2wSS9q*;#veHY4boV7Ngll>Ax0+sMj%A$Z3*LuXDGTH&@zUK^W%+^yv#PW=3n@Y;X3vv`5jCJM4OFLbI0{Pb3UaN>D1!Q z4mM<0dlYn<_Pf^Y2zpa>ALg_Ks;Y7R(#~Z6^d*MG?v|U)vCgM+XbxomgoM6Q1+miX zU}d0QLnW`eJi3+`{rat5wQDMe#+uZ?a-1zIsv-Nm6&fWFCF&^GSD5WdqRi&oy5jX8 zT}_}E`#j7x=N0T^+j;7Mch!o>4j<&D&=uR*WO79>Pd?0mD;D9cncfy7@blR1Y86<-s zthr~%O;5;OL1XoIe#j?Y#oI+dT-h-O60MXH;3pp#y+TGx} zpagppmB|!!v*&0E$2r^Pp}g#>BxZ3BMvEPO&)h~5?GpoMGc!WG=$8z?be6EXx2jCL zi;aJ7w7Xuw4g4$$xikQC)0hEWJPwV7njP=FPk30YTK>xmeU48?yBn88Z!Jdy^k%O| zHo8%-+3sz06dRCP@w)vqKQ(;EsygOqf*CVC`XS85F6za=XOvVq2S(O3ha_x4=|I9^ z_FJzlJ{8N?^kgK4stChk2O||TE5a1d@zyXb=aR~4dRPrc{ViZ9J;y_$&QDpeo<3gq zpNh>03*^9uH@9eFtC+SOXtVwfo%Y3!l6D_w+9wXAStmPZSA2RgZ{96C|g`UH1 z(?7+W=dM0n%Bm=98Oku)9UQYF&0kE4?lm(a;8SrFZUQF!JOL-M(Zg=@ z^Z+$s;oCs|Xm<7Yg4)V;H7L;N@lpAxalK@evZ8zGx3StZ@_2QG-W z5$RZK1S_se?F}L~muReIREA4`nH3)&g5r zd;D+&DHm&HEDE8F8R|F8Xcw-gbWh2XE?$Gu96sC&>JenaKMGf*|B!Bb*f^q&jMCIzRj4p_5=MRr zIJ7Vljk|~Yy)uN#a1awUNzx|um&Id9XlY1$0l9&dgqx@`^mLOHi^C^dqC%83t@~xr zqwx$-7{F)kXb}DuCZwK|F|=a_j-(_dL-{0iE1h+q@HE$FtOnXl70(b$yZez`BkbvX zaX*~8gS?4~#vp##`%^MsAG%Gkus~lI$ zo-r{L{aDX?`SgN=z3rRg@SM}-ZKc=UgwMAylrK;_(y_8!yHh^L)_`gh3mxtY6n-|3 zZlk#go_^RfsxBqq+|d{)eMGL=<&hoD$MdxHI@#-qty@hGMp?mDfn*X0Wrkk2jSW^G z^2*W-TP?c3KG_-gaRXRFM86O3m2FN@7?~hlFu;A&Vl#d&8`MUw=lXAXwrf(KW(xPc zy5sqsN3z&w3tt0K3W-d%3(K*o_+Ii44HBzY+6=rK6Fh!T!2S5957Jirfh5MKR`qtY@`_woc_dR#ePbt)#Bhx6r{MWhqi1EedM0LC|I>>ib-@s4lzv^4ZvMYCXBN5`M+y-FCh za=#`hWGJ^aB-ER3zI6(Dz5boMru)_Vmx1Q58HB&89WAzaDEg^E!(^RizOHn?gu%4V z0=2WJ9qA4-E+qEaXAOPQi3}(4^>EAPR5L)PitP$X#OON*-;_t3cs?|@vYdA+Pvfbk z{3@{WOXwWpWaJk6XbbN=Pd~bHTa(zr%NSEAInOQ}XijJ+47-KxOgDmp;tYC?Ng;pC zQf_@zoqpA)C9P*p&pIX6h&u`{X(!jXJ^)7|B#Rs_EBtJ|Ox|c}IK_%naA4ItcRLi?kQC8L#OcHzypEp>$i 
z{GePUk0lfdP|rt}!k2^6!{NPMPzsEsx!z?yu9sUX0iJnkjRn;}Z6aLS5T##-cp9ue|jZ z`DDFBwnj6c`+p4dWM>yQZCZ?bW7Xtm7r(rq$ls}@I|uK96R^AfKqW`xLOrd?QGFEN z?2PG;vu=vn4I}Ky0i74@Krp~%>H|_2-i+qTtq2nw#e~(ebjg*hqfBi7*rpXG8;Uz3 zP*QmxOO%-ui^nDLy>ZKM^}a6jq$+Uff%wBsOAYa1em(qk|It&3@#^Up%4R#xW%+Kt zn6=|EIhF>H)0!}FZuTP=N5Zd>%@FZ&+=fX3jFUPXv1ze_D|5R)%1o=d2bu+Ov5IkU z`G(4sK}}&*jjW^y*C{P-ipJ8!^E$g@sV`Hq;RXc&?Y>1lM)cG;Kl|~DoGxzV0?%sg zU5)Lx1uwb#9zjxVxT@Y3`P5lB;V%-3+Ss{^Q|NWL9jM5!-wW&=oBM=XO}RTt*${B>t1QpsDadp9|4)qTV3l{|2A83&%rm z)IB+yvmLId7SA~3+U!$=m_M!!B(k9N7%jq4$yqECRa}L+GW3~MxX$oZ4$?gO4O>=! zu<89>)ot-hwjG2YXJeY$9*}$ij>UU+J~eSCy?WJP+SkmicfmaN!`1%pik@GlXQ)uS z`S^cWIZS3AUmn#IAI&%&UzJEc0VblLjyxuK_EGfmXb(dvF<~HG*463(U{s>qLWa52 z@2tbrKKc+*;irY8OY-(Q0^4U(=gCSxL^ZH4-0#DACs%&31w9hAc{XQF=33tKM*V0_ z?+LD}Yad=nOkT*+?QV!ns9XO2*s4^>T$>`}ExfieavJ+i@ps}29rO?OQG6q<2TO{s z#vC6`G!!)bqXh4kGpq}+vkJZH5`$QlL$Aw*#SgxK^E|9SMnyr&DG83WFc}?As z-%xjgMn096g>+Ur)A3y^7H3@!NHenU+}bW#Igz0>eoal5l`-QCe|A?E++aI*C`Vqp zG1&wV-H-8dtarD~v4%Z2i)fU=&Zh~*irUTCu`pLsYAkJUA$cR`;84;j-}YC2+yiuo zBN7ejOvJm|g%paZ;(9!7Ru`#EERh;i&snvk`SHsqDK6>J40~Ykp#(Z=vwvu48(v(* zbdr(=wP$D>(x242oBG(Jw*`D4(872gs4DnZWYp_u9~ej~)}EuVdj2SLV5;HwUA4NB zmp9{tqTy1dO=d%f>88DZ*pna<%|m7*6$7beWpaV)J1-Gu4B4jCL$E6`u}+mR)aH!h`0X_{n3eSTI}z6Plk;`OYMy;v$?#P&DE$3-JU*6> z&2d`pEOG!+$Kx(%zHUOKPw~-t4Lf&o#>~~cV}{w3nur{_eg*+K?AslXs3&UX>RMxy z!_w882cPD`cby26Mm+hnm6c_Tc{jfks9_RWHx*F%@IqYnAaN8@VKtwi`Ek=)oT=iG z)g3iBh1GxWB0)4tkM|+>78sS!55iHHhiVTylI4 z!>D!T=kYE8%PIB*82@KuI%D6?V&K@ESuDoPwe*E)XyEN^qNNq@v8b%OKWF#k0n=M0 z;|qEpR~&BHG3Pdzkh?T8msq0MC7bA+*kG1BYGZ3T#n@YqUE+8y1CUR4X1)!xMp9cH z&8Ho$s`hn?MccK4od|E{*Dv05pU`7-jlK*0)%v-QBIVIFE-dIEX4B?Ir9ygMKqKw2 zi@L19rYzQe+HZiq(vx0s<;!8etP8iWz{~lM(_L)H9&HI=`JXqvvqu&OntSznXz*y) z8lPuoT5nYM0G7a-XV5+y!I#U5H*Naf_)|1-C*k4@bNuLG%W<(3HeS4<2fzTjXgqu$ z3KTpaMjQ2V|Ev)nu`|Yw6ITETiLpHM$}J4vh3D-@V?+S9Af`mF4cZGGNtf1M!7sml z7TPaj;>w50`xx`O`-}9!0`(265ZHBIL=<<{8T|!^CeDQb(+7LCWH~L@s3o`^PZh^< zaN>qTZjtJM09g>zrrkLRmmy%|)gJ9tSJqG$ZOIP66~DY2X%`T1vT8=%xq%+fGF0W} 
zN{A|SNWN@*zh<(NFXuxkTY#YUjOZmRgukpagjyKN%mPtq7~T}I>)V% zBB<ij05Xezgtt5 znfgj_vc_2YUP}3>njPbukIEkUV0L%Ju@@)UJ{!Aa-=;)fY_X~g;N10Qs;OhI0NTwP zzDBWJ7ooYxboW+AU@O7QFR@Z2W&3_>mkz1PG^+)$UkWe7bo*SiIPFUrCuI4#?=i~{ z?+aUAg!kf5u_8HO00LZjc1NXN50r6L=~>MgsgEQ@Dm5*Pgm5S}7Ua6};Yki^%Hw9t zupSpwv__rR_a^2uX$nrREKMm)KSQ#LRBjdZ?~T(H|J~D$7lllKDc{gMM!BS@=y-j#yyDfVB%I_zprvaoKlY|IbSs-4*?_t`{C}r!(Fs(e` zqyL4w8R6ZRrTkTyx1o}DZ&ae;GAQpZ-I<)L_WF10bSd*ex>%{V=bU_r6lCFyVsl)AUwz!gYmS8YOR-2bhAZ*l zYR*sE5lsl?+%1xXAW~?(P=3G#IB!ToT>i}jicVQ3 zFS#r)?a8~8fWzWWrYSxz1WniV)M1BY8MsWbeuRE!<9BcUS;#u2z-!n$nWd}XwJYoL zf>*wN<@3qgu~5H`B4eShiNv#yxl4#hf%W)jk0y~E<9Vof$Op3Kcm?iWT>e#pyWH6~ z)z~+ibeRv0RSc$YY^JFw;zd01!QA>yg= zb76qf4^?xG_@q}7<8_mh2sPLE0@Fjc(MzHtP;mFXbTFG{o*!OFx4mnELvWIpAgiiU z+n?p2YZQ=NPszxDZu_K)w*wUPLEh3F4B+)*!W{NgT4f#!L?$m~Wv=0$;)=1DMR zk$zG+*G^WC2gDz4{LoHQj5~>*HrRYIowX%dR@y~DmgAFy?&y!(ogSC&tbLb2JmQrG z$Nb@Cd+eNenz`m*m%gQ0$%88{uxrp#*Q+{m3hjDsyX%B;ice^wA_gG8ak(DqoXFA} z1lL7vzai78D~$ccFSZx%!+-QheOOv}1yfj8P3isp%TYhxFmt3H zBg&Sg`6T_1@}UteN9}QXC0mHy$86*vHLDqJ+kf~lCYs;{+OY?rjdOkD-dsw;2QUIv zr?uQGdNMN{#pMtBR*1`1l;zj-xmS6mj?_3+-?R=Wv-YiOiC*CI<;MO(qhm)XESaP* zta$k^Us;-2DXp?{7&#K!W(7p<+P-S(oj5d1NuZ$1)8b4SWVA~dl67MFxNz&`T7G7_ z!%lsLg2bwAnqBPfx>HUn$IJE9Su8|WP$7n=)P_3h`FKJ^XQ3F5Y{}UnRVKwdU_Bo! 
znwv6uz(MISf=DiGzcw@b@n>md{H4Lzw+P4jsTq|u1tQ}CqGl4IEgzfPOD?Fp{_xL{ zhQtEOM11YCD(-@G&$d8(6EQxYolPUU^z^`w6d0;!yFXoPG4TRUUmC4pw{;Lb%Z?Ov z7w@Hr(ah2wxpwE|ayygD$exatIU(@h;~vZ3)A)HutZFrPVcF^lb}el!O}fr7IG-B& zcvG*QmW0hvRGe+^k}FT@)J~*#PO2j*fVp(K%9huquH?MQ?JLdemlWvZ{z?#&ia}9| zy~K!fII!?tYD^p@&<_-0!@5J|Ma5lMx0(WIrB^LIE-eKN2~V?0a0u*mc6CfaYdBe_ zRwgd}-%m5go?@w%rP^$Ap?Y3l9J4=Cb!h2|-0){v*tE=kSorgj(`Rqwh+#>g*!%V+ zywNuTLKNYhHg`aQuDu%%;|%0wLQ zKy7pUShBPqc#W(+oCG>CS?Q~8wcD(pi48aW5v~)CoYk143;6RTe5ye$Ho(FK)SP~E zrD9H@`{_aGlVZChl5=XJanMJm1Uy5yP>5bY1d??HvS7fgw?ld7$xXGEGC$q{+{5Ic!cY_4S; zP{q;rW8g1RQQ?G<9%PR{ongNi~PeQz{Hu8Fbv=&^Y-pmAJU|tB`@CTQ7Y%#&^P>T5SwwAs^53c0ilwa`Lk&7!S$L?%tDQV_b%>(E==c+38iv8Y^ksKW9ODWMion}8%+ zN2i`^daVkBhRP?7H^EVL_;=)#m}UiGZL=Xw_868`SzA&4*3~l?p{=VMD`=wQ;-5Zt zFDBn=Tk_gJ9=WBz(nY9~`jj9of~Ey|!wafUgevXt7@Iw6#<`V!3HUEItoSde@?PDn z0jOq`p|PLt#nOpb7izs2VqldM52x`*aj)8EQG58!trdvCUTT%H z&Nm9jfIBSP0^f}QFvk$Qn30P390_r1+K_Q9;}#-tnG+k&xB%(Ng!?9O ze?-g%#OO6hZ0uVdwVxLd@eCkB!g{Bh-Lbu zJlFh3!>2YJVN3#-+M8NV*R(H$V5eo#Y03r06-(OIi0qv$s;gV2;F%eP2CL>SyIo(o zrQ;7N(CSoZP~3*fm_IN&ep4YFr=zz+i+_gD<4a%bfV@w}xf@~MfX!X7R8w+vfV_q* zg{Y!kKAdCnIi5+MuoGi)HTyJ)4a>gf$5?pxpCj=xuQ^3K#fr^?V_6_r;^JJX@QH|0 zZkV}|$?Q5aANWqSdCv<0iHxLLlC;93K02BJhm{1tRQ8VA*Ei)4qqmGFEb_g?`7KpM zT}zJ`a+UgYU)U>*2&FQ@*pE8oH~7}dTQ6uz%)ss2Kut9xq}nGmCea9(;tii6YhQbY zROZJ@$OL~GL)J}4@V9+Oxu)Y4aaOMmJu;l+I29wo`zBPVx=6!e#4olkY z*{D(53cv4tQ34ISFJt0DFW`^NG_0*l)1q+(JW)!^u1CnJJ5g&Fm zpG(KqYr4Bd^61BhXdmGvK#Ie-N|ux3YtFoQu@gH{F@y1iBc|e$G>5XxK;C-0^T?7d z%-vR_H7Zm8H5bl4_o zry+1qQdAZU5@k_X^JBeF$Ge$xx*y{HtWBV?zx;#g0{yyZ<`f)aJh`YRt$62vqw%&_ zA@JlV%x1rHNsu4aO5vcc7My0VdF!U%M&0Z}OhoG0xRBg*Y~1x zYCVjPft1mWwCPW4;R7o!wW+-J{KOlX) zKxLAmdS_$8;L&H6hP*q3;_|S=_vcf*(3#*K}?CDAh=-$XJj=`LLC|=1|#Sl-Q&cvKKK|rHUCY% zln@uxKAA9BCJeI(UpCmNrCd?0SM(#F2_JD&?TDr?&jA7Et}YbSzYR~H(S`ssbRojg z{8d8u$i+NcT?Q+!e61)~AL=b?CZ4M3yNJM*4aU*kfKwz;pvpbrb>(`+9rWiDZore? 
zh4G22yv*yQ5+DInL4UN?5U_Be?;#r9Di_85Y%x~A3X}tnh4#^ zM-xtoxA7&>^GIjjlFm+vjGNU?sHR@5RmpR$_m;LGJWpNpNBxW-WXEHlhWl>a&@<2aLo{u?_G^8Y`5|KDg(of0Kz;b5V;3vSgSjYF4L*2NE)KKbIgVZ6LI z_5J%YRZGVYu@uI`SH^z(<9}EC&)3amgWpX~Lew3K{|yjDYz$|A`JSjgh3%dkBHH;6 z)jzN*`G<#xJ$iR9adI&GKk}ye06S9*|8;|YUv`hc9!eWg{5NXw&--_OgI*Ek|GKB1 zrZx0uh&a5SKMV5m2CCts(a$lLFL}wEWXE++fyqh;n$W()K}1j0lnHCxoX;)$6O}26aBKiSzmF)zEKS$lL@gVIC7FWGbQD$YY46%jfSklp9>}sSW3+uvY z#K1g}v^!*09mBI{Kgco?CC}wNnk^S}gudI{VMngySncCL+KZZ&y>N`J33gKAGTqa3 zk)9LYTbLPN@2ofo*sFdn&-a=MTjam_bDO${bVRkb_bVPD!fdXTd5Dm^;b^lfPr$*C z1To&QSnAQFx@4<{*>erxLLw5pD`K2}{NgQUr$=fs7?N`1iRolcVc=_{hpliLIJ0VF136~1cY7$c zBA}u9x_0m^p*d<0A$xkiW`6ypQ78jk>v7p4#+(0pr8`}v->tnwB+ejG?&Rk&!IQ0e z!Y2kYxnG`5$uwR^P}3iK3?HqR&pBt_sXnoAG<5WEO5EW*ZEZoF-TAz!M7=`7V4u&I zC0?<^k-Wfi`&;}o{O!H`NAGywE*uUTFY6#g-m8lR5oA8V6E|+w+*;^HMR3(Ibd(&; z!!-=f?an^!3-RysY-RO`4eX}$?kiGhxOz#s?;AqrcaKv(sfrTW5Bu6(vCt{K>`gnE zCB`8-8f1&mic?`;|)P zHtglRqNXXY@awLr&&I+tPvWZ%z))y~zpuu+@}S5c?=3%$G+w(5mw-`~qchwo?=37; zQ^{n%A}os)>?MIciH1Von5_rTCs71MiDCUq#h_6wd6i2;`)%UA);%H*(Wcu2Pni7@ zT#@6o>S_H$0PAG=5G(6KmB`(z9&@j(_HAALAET6qBJ_&p_-@oYCH&_7KdT2XZVg!~ z5%G7yAOFH!Qb=O{eep^Kro=D5`wrs0M+(v2qR0K%><`+9X$Orz*1Ey`3H@Kf?xJ0s zzw}g0G8i0RtKK#Hj3S?Zig@-s&O)J42tf&qk*XA7GszgLdaXJQYi-hI04`j4dk5z2Lf3{o=nO@b(|JC$(cOta!9d^K;cF zwb8?Vfu#%)aM0&A{9Ruwu}3jg#+h2w1jsX+YDJ-Oba9{R9W@#^N5GPK2l3zh z0v#iEt@9be&HNHGeu*vn1%-X9MHRL#H%EzOsAW1D2&Az1&#Wg#yAtQ{mJT^hBZ@K; zf0h=tQDXHv7n(NB?uGx2MW_wQOCgvtiB+BNZb{rHb6h{VQT2nC_$3y}#TYhhIc+C2yzP?=a~-O7&WBn}&unZULKPL$ z{_ZWk$=NGEZc%F3K-|_t$BKdUse9W#4_iHq_l5{B3UQ@pueHL~QH5|4VRlFLF@{ zi*n;hx#TYFKGM7zS`& zDlLmp8O+u6J=KdG|0v!Cp4s1Q8?t@6Zr2O)kudh(V7z_&Ps^L(pggMq@4B&Coo&bN&`5IZxVC$pdWP z)F(-*44kzb?%@VywiP{CtXg;QTw$Jx&bL@TY|r)C7Pr!FwO3#JY$<=9lN=Kn-txFf zL*3>FPf@P)>LY|1TjL+4Zn4b{JRAIK7t=wS7c#4&FtwHcdbB2{D!Vkw2WDeiwobl% z_!G4)+dEwRKvVxW%@2z*#f)d$^y>!>P$RiF+ECm|Seg zaa_FThLg5{$x`sE=n0O?$oqUb{TSsekcUmHqF=N-QdC zv(bOPYhe}+`+D*UZ1pLAE5(wc$SIcg32xB*NF-tHlbzM<7A9Z|yX|2xa1i~Y)9LbE 
znPS?#8Uu@G!WgteCAk3tH(Jx_?R4t`k2EBgQ|Kia zQ&r}k+|_&eU3Nv~8LU)A;3uWYgu{`JqHdanicYo>vPM{e*j~4-e_#PR?H=*n+&^-V z${}K^SDnk=f-)0T7l&r&wWS7*8MwSdJf27}NNG-v^I!i&SU#9~ce_XhtDkId?`#j6 zsWkNwi9iN2K8an<9dp7Df*N%q5cuS4gC_gchNigSD(hH$pld=756{m{^%_BZdSmK; z!rlLF->cRBn*d PnTo;F)z4*}Q$iB}_Z^$> literal 0 HcmV?d00001 From 29c0f5b8693bc8ca6e534e054cc91102f2bcf8f9 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 8 Nov 2024 00:59:05 -0500 Subject: [PATCH 24/47] Migrate feature diff for NN Descent from RAFT to cuVS (#421) This PR is an amalgamation of the diff of 3 PRs in RAFT: 1. https://github.com/rapidsai/raft/pull/2345 2. https://github.com/rapidsai/raft/pull/2380 3. https://github.com/rapidsai/raft/pull/2403 This PR also addresses part 1 and 2 of #419, closes https://github.com/rapidsai/cuvs/issues/391 and makes CAGRA use the compiled headers of NN Descent, which seemed to have been a pending TODO https://github.com/rapidsai/cuvs/blob/009bb8de03ce9708d4d797166187250f77a59a36/cpp/src/neighbors/detail/cagra/cagra_build.cuh#L36-L37 Also, batch tests are disabled in this PR due to issue https://github.com/rapidsai/raft/issues/2450. PR https://github.com/rapidsai/cuvs/pull/424 will attempt to re-enable them. Authors: - Divye Gala (https://github.com/divyegala) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/421 --- cpp/include/cuvs/neighbors/nn_descent.hpp | 92 ++- .../neighbors/detail/cagra/cagra_build.cuh | 8 +- cpp/src/neighbors/detail/nn_descent.cuh | 294 ++++--- cpp/src/neighbors/detail/nn_descent_batch.cuh | 736 ++++++++++++++++++ cpp/src/neighbors/nn_descent.cuh | 45 +- cpp/src/neighbors/nn_descent_float.cu | 47 +- cpp/src/neighbors/nn_descent_half.cu | 48 +- cpp/src/neighbors/nn_descent_int8.cu | 48 +- cpp/src/neighbors/nn_descent_uint8.cu | 48 +- cpp/test/neighbors/ann_nn_descent.cuh | 191 ++++- .../ann_nn_descent/test_float_uint32_t.cu | 6 + cpp/test/neighbors/ann_utils.cuh | 20 +- python/cuvs/cuvs/test/test_hnsw.py | 2 +- 13 files changed, 1361 insertions(+), 224 deletions(-) create mode 100644 cpp/src/neighbors/detail/nn_descent_batch.cuh diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 347ccf889..bd41d1ff7 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -55,6 +55,8 @@ struct index_params : cuvs::neighbors::index_params { size_t intermediate_graph_degree = 128; // Degree of input graph for pruning. size_t max_iterations = 20; // Number of nn-descent iterations. float termination_threshold = 0.0001; // Termination threshold of nn-descent. 
+ bool return_distances = true; // return distances if true + size_t n_clusters = 1; // defaults to not using any batching /** @brief Construct NN descent parameters for a specific kNN graph degree * @@ -100,14 +102,20 @@ struct index : cuvs::neighbors::index { * @param res raft::resources is an object mangaging resources * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph + * @param return_distances whether to return distances */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols) + index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false) : cuvs::neighbors::index(), res_{res}, metric_{cuvs::distance::DistanceType::L2Expanded}, graph_{raft::make_host_matrix(n_rows, n_cols)}, - graph_view_{graph_.view()} + graph_view_{graph_.view()}, + return_distances_{return_distances} { + if (return_distances) { + distances_ = raft::make_device_matrix(res_, n_rows, n_cols); + distances_view_ = distances_.value().view(); + } } /** @@ -119,14 +127,20 @@ struct index : cuvs::neighbors::index { * * @param res raft::resources is an object mangaging resources * @param graph_view raft::host_matrix_view for storing knn-graph + * @param distances_view optional raft::device_matrix_view for storing + * distances */ index(raft::resources const& res, - raft::host_matrix_view graph_view) + raft::host_matrix_view graph_view, + std::optional> distances_view = + std::nullopt) : cuvs::neighbors::index(), res_{res}, metric_{cuvs::distance::DistanceType::L2Expanded}, graph_{raft::make_host_matrix(0, 0)}, - graph_view_{graph_view} + graph_view_{graph_view}, + distances_view_{distances_view}, + return_distances_{distances_view.has_value()} { } @@ -155,6 +169,13 @@ struct index : cuvs::neighbors::index { return graph_view_; } + /** neighborhood graph distances [size, graph-degree] */ + [[nodiscard]] inline auto distances() noexcept + -> std::optional> + { + return distances_view_; + } + // Don't allow copying 
the index for performance reasons (try avoiding copying data) index(const index&) = delete; index(index&&) = default; @@ -166,8 +187,11 @@ struct index : cuvs::neighbors::index { raft::resources const& res_; cuvs::distance::DistanceType metric_; raft::host_matrix graph_; // graph to return for non-int IdxT + std::optional> distances_; raft::host_matrix_view graph_view_; // view of graph for user provided matrix + std::optional> distances_view_; + bool return_distances_; }; /** @} */ @@ -200,12 +224,15 @@ struct index : cuvs::neighbors::index { * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -232,12 +259,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -262,12 +292,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset 
expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -294,12 +327,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -324,12 +360,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -356,12 +395,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset 
raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -386,14 +428,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; - -/** @} */ + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -420,12 +463,17 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; + +/** @} */ /** * @brief Test if we have enough GPU memory to run NN descent algorithm. 
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 9e4d453e3..6209ff819 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -33,8 +33,7 @@ #include #include -// TODO: Fixme- this needs to be migrated -#include "../../nn_descent.cuh" +#include // TODO: This shouldn't be calling spatial/knn APIs #include "../ann_utils.cuh" @@ -356,8 +355,8 @@ void build_knn_graph( raft::host_matrix_view knn_graph, cuvs::neighbors::nn_descent::index_params build_params) { - auto nn_descent_idx = cuvs::neighbors::nn_descent::index(res, knn_graph); - cuvs::neighbors::nn_descent::build(res, build_params, dataset, nn_descent_idx); + std::optional> graph_view = knn_graph; + auto nn_descent_idx = cuvs::neighbors::nn_descent::build(res, build_params, dataset, graph_view); using internal_IdxT = typename std::make_unsigned::type; using g_accessor = typename decltype(nn_descent_idx.graph())::accessor_type; @@ -471,6 +470,7 @@ index build( } // Use nn-descent to build CAGRA knn graph + nn_descent_params.return_distances = false; build_knn_graph(res, dataset, knn_graph->view(), nn_descent_params); } diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 8c5767c50..883d82d76 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -16,42 +16,41 @@ #pragma once -#include - #include "ann_utils.cuh" #include "cagra/device_common.hpp" + +#include + #include +#include #include #include +#include +#include +#include #include #include - +#include +#include #include // raft::util::arch::SM_* #include #include #include #include -#include +#include + #include -#include -#include -#include -#include -#include #include #include #include +#include #include #include namespace cuvs::neighbors::nn_descent::detail { -static const std::string RAFT_NAME = "raft"; -using pinned_memory_resource = 
thrust::universal_host_pinned_memory_resource; -template -using pinned_memory_allocator = thrust::mr::stateless_resource_allocator; using DistData_t = float; constexpr int DEGREE_ON_DEVICE{32}; @@ -216,6 +215,7 @@ struct BuildConfig { // If internal_node_degree == 0, the value of node_degree will be assigned to it size_t max_iterations{50}; float termination_threshold{0.0001}; + size_t output_graph_degree{32}; }; template @@ -300,6 +300,7 @@ class BloomFilter { template struct GnndGraph { + raft::resources const& res; static constexpr int segment_size = 32; InternalID_t* h_graph; @@ -310,16 +311,17 @@ struct GnndGraph { raft::host_matrix h_dists; - thrust::host_vector> h_graph_new; - thrust::host_vector> h_list_sizes_new; + raft::pinned_matrix h_graph_new; + raft::pinned_vector h_list_sizes_new; - thrust::host_vector> h_graph_old; - thrust::host_vector> h_list_sizes_old; + raft::pinned_matrix h_graph_old; + raft::pinned_vector h_list_sizes_old; BloomFilter bloom_filter; GnndGraph(const GnndGraph&) = delete; GnndGraph& operator=(const GnndGraph&) = delete; - GnndGraph(const size_t nrow, + GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples); @@ -344,9 +346,14 @@ class GNND { GNND(const GNND&) = delete; GNND& operator=(const GNND&) = delete; - void build(Data_t* data, const Index_t nrow, Index_t* output_graph); + void build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances); ~GNND() = default; using ID_t = InternalID_t; + void reset(raft::resources const& res); private: void add_reverse_edges(Index_t* graph_ptr, @@ -371,15 +378,14 @@ class GNND { raft::device_matrix graph_buffer_; raft::device_matrix dists_buffer_; - // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827 - thrust::host_vector> graph_host_buffer_; - thrust::host_vector> dists_host_buffer_; + raft::pinned_matrix 
graph_host_buffer_; + raft::pinned_matrix dists_host_buffer_; raft::device_vector d_locks_; - thrust::host_vector> h_rev_graph_new_; - thrust::host_vector> h_graph_old_; - thrust::host_vector> h_rev_graph_old_; + raft::pinned_matrix h_rev_graph_new_; + raft::pinned_matrix h_graph_old_; + raft::pinned_matrix h_rev_graph_old_; // int2.x is the number of forward edges, int2.y is the number of reverse edges raft::device_vector d_list_sizes_new_; @@ -971,19 +977,21 @@ int insert_to_ordered_list(InternalID_t* list, } // namespace template -GnndGraph::GnndGraph(const size_t nrow, +GnndGraph::GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples) - : nrow(nrow), + : res(res), + nrow(nrow), node_degree(node_degree), num_samples(num_samples), bloom_filter(nrow, internal_node_degree / segment_size, 3), h_dists{raft::make_host_matrix(nrow, node_degree)}, - h_graph_new(nrow * num_samples), - h_list_sizes_new(nrow), - h_graph_old(nrow * num_samples), - h_list_sizes_old{nrow} + h_graph_new{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_new{raft::make_pinned_vector(res, nrow)}, + h_graph_old{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_old{raft::make_pinned_vector(res, nrow)} { // node_degree must be a multiple of segment_size; assert(node_degree % segment_size == 0); @@ -1001,9 +1009,9 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - auto list_new = h_graph_new.data() + i * num_samples; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + auto list_new = h_graph_new.data_handle() + i * num_samples; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; for (size_t j = 0; j < width; j++) { auto new_neighb_id = new_neighbors[i * width + j].id(); @@ -1011,8 +1019,8 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, if 
(bloom_filter.check(i, new_neighb_id)) { continue; } bloom_filter.add(i, new_neighb_id); new_neighbors[i * width + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = new_neighb_id; - if (h_list_sizes_new[i].x == num_samples) break; + list_new[h_list_sizes_new.data_handle()[i].x++] = new_neighb_id; + if (h_list_sizes_new.data_handle()[i].x == num_samples) break; } } } @@ -1051,31 +1059,37 @@ void GnndGraph::sample_graph(bool sample_new) { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - h_list_sizes_old[i].x = 0; - h_list_sizes_old[i].y = 0; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + h_list_sizes_old.data_handle()[i].x = 0; + h_list_sizes_old.data_handle()[i].y = 0; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; auto list = h_graph + i * node_degree; - auto list_old = h_graph_old.data() + i * num_samples; - auto list_new = h_graph_new.data() + i * num_samples; + auto list_old = h_graph_old.data_handle() + i * num_samples; + auto list_new = h_graph_new.data_handle() + i * num_samples; for (int j = 0; j < segment_size; j++) { for (int k = 0; k < num_segments; k++) { auto neighbor = list[k * segment_size + j]; if ((size_t)neighbor.id() >= nrow) continue; if (!neighbor.is_new()) { - if (h_list_sizes_old[i].x < num_samples) { - list_old[h_list_sizes_old[i].x++] = neighbor.id(); + if (h_list_sizes_old.data_handle()[i].x < num_samples) { + list_old[h_list_sizes_old.data_handle()[i].x++] = neighbor.id(); } } else if (sample_new) { - if (h_list_sizes_new[i].x < num_samples) { + if (h_list_sizes_new.data_handle()[i].x < num_samples) { list[k * segment_size + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = neighbor.id(); + list_new[h_list_sizes_new.data_handle()[i].x++] = neighbor.id(); } } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + 
break; + } + } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + break; } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } } } } @@ -1137,7 +1151,8 @@ template GNND::GNND(raft::resources const& res, const BuildConfig& build_config) : res(res), build_config_(build_config), - graph_(build_config.max_dataset_size, + graph_(res, + build_config.max_dataset_size, align32::roundUp(build_config.node_degree), align32::roundUp(build_config.internal_node_degree ? build_config.internal_node_degree : build_config.node_degree), @@ -1151,28 +1166,38 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, - graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE), - dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE), + graph_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + dists_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, d_locks_{raft::make_device_vector(res, nrow_)}, - h_rev_graph_new_(nrow_ * NUM_SAMPLES), - h_graph_old_(nrow_ * NUM_SAMPLES), - h_rev_graph_old_(nrow_ * NUM_SAMPLES), + h_rev_graph_new_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, + h_graph_old_( + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)), + h_rev_graph_old_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); - - thrust::fill(thrust::device, - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, - reinterpret_cast(graph_buffer_.data_handle()), - reinterpret_cast(graph_buffer_.data_handle()) + graph_buffer_.size(), - std::numeric_limits::max()); - 
thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); }; +template +void GNND::reset(raft::resources const& res) +{ + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); +} + template void GNND::add_reverse_edges(Index_t* graph_ptr, Index_t* h_rev_graph_ptr, @@ -1189,34 +1214,35 @@ void GNND::add_reverse_edges(Index_t* graph_ptr, template void GNND::local_join(cudaStream_t stream) { - thrust::fill(thrust::device.on(stream), - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - local_join_kernel<<>>( - thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), - d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_.data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle()); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + local_join_kernel<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + 
NUM_SAMPLES, + d_data_.data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle()); } template -void GNND::build(Data_t* data, const Index_t nrow, Index_t* output_graph) +void GNND::build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances) { using input_t = typename std::remove_const::type; cudaStream_t stream = raft::resource::get_cuda_stream(res); nrow_ = nrow; + graph_.nrow = nrow; graph_.h_graph = (InternalID_t*)output_graph; cudaPointerAttributes data_ptr_attr; @@ -1226,24 +1252,18 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; for (auto const& batch : vec_batches) { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_.data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset()); + preprocess_data_kernel<<< + batch.size(), + raft::warp_size(), + sizeof(Data_t) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * + raft::warp_size(), + stream>>>(batch.data(), + d_data_.data_handle(), + build_config_.dataset_dim, + l2_norms_.data_handle(), + batch.offset()); } - thrust::fill(thrust::device.on(stream), - (Index_t*)graph_buffer_.data_handle(), - (Index_t*)graph_buffer_.data_handle() + graph_buffer_.size(), - std::numeric_limits::max()); - graph_.clear(); graph_.init_random_graph(); graph_.sample_graph(true); @@ -1251,8 +1271,8 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out auto update_and_sample = [&](bool update_graph) { if (update_graph) { update_counter_ = 0; - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + 
graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); if (update_counter_ < build_config_.termination_threshold * nrow_ * @@ -1265,15 +1285,15 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out for (size_t it = 0; it < build_config_.max_iterations; it++) { raft::copy(d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_new.data()), + graph_.h_list_sizes_new.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); - raft::copy(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(graph_.h_graph_old.data()), + raft::copy(h_graph_old_.data_handle(), + graph_.h_graph_old.data_handle(), nrow_ * NUM_SAMPLES, raft::resource::get_cuda_stream(res)); raft::copy(d_list_sizes_old_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_old.data()), + graph_.h_list_sizes_old.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); @@ -1286,13 +1306,13 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // contains some information for local_join. 
static_assert(DEGREE_ON_DEVICE * sizeof(*(dists_buffer_.data_handle())) >= NUM_SAMPLES * sizeof(*(graph_buffer_.data_handle()))); - add_reverse_edges(thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), + add_reverse_edges(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_new_.data_handle(), stream); - add_reverse_edges(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), + add_reverse_edges(h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_old_.data_handle(), stream); @@ -1316,21 +1336,21 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out update_and_sample_thread.join(); if (update_counter_ == -1) { break; } - raft::copy(thrust::raw_pointer_cast(graph_host_buffer_.data()), + raft::copy(graph_host_buffer_.data_handle(), graph_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); - raft::copy(thrust::raw_pointer_cast(dists_host_buffer_.data()), + raft::copy(dists_host_buffer_.data_handle(), dists_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); - graph_.sample_graph_new(thrust::raw_pointer_cast(graph_host_buffer_.data()), DEGREE_ON_DEVICE); + graph_.sample_graph_new(graph_host_buffer_.data_handle(), DEGREE_ON_DEVICE); } - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); raft::resource::sync_stream(res); @@ -1338,6 +1358,27 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // Reuse graph_.h_dists as the buffer for shrink the lists in graph 
static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t)); + + if (return_distances) { + auto graph_d_dists = raft::make_device_matrix( + res, nrow_, build_config_.node_degree); + raft::copy(graph_d_dists.data_handle(), + graph_.h_dists.data_handle(), + nrow_ * build_config_.node_degree, + raft::resource::get_cuda_stream(res)); + + auto output_dist_view = raft::make_device_matrix_view( + output_distances, nrow_, build_config_.output_graph_degree); + + raft::matrix::slice_coordinates coords{static_cast(0), + static_cast(0), + static_cast(nrow_), + static_cast(build_config_.output_graph_degree)}; + raft::matrix::slice( + res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords); + raft::resource::sync_stream(res); + } + Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle(); #pragma omp parallel for @@ -1410,10 +1451,24 @@ void build(raft::resources const& res, .node_degree = extended_graph_degree, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, - .termination_threshold = params.termination_threshold}; + .termination_threshold = params.termination_threshold, + .output_graph_degree = params.graph_degree}; GNND nnd(res, build_config); - nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle()); + + if (idx.distances().has_value() || !params.return_distances) { + nnd.build(dataset.data_handle(), + dataset.extent(0), + int_graph.data_handle(), + params.return_distances, + idx.distances() + .value_or(raft::make_device_matrix(res, 0, 0).view()) + .data_handle()); + } else { + RAFT_EXPECTS(!params.return_distances, + "Distance view not allocated. 
Using return_distances set to true requires " + "distance view to be allocated."); + } #pragma omp parallel for for (size_t i = 0; i < static_cast(dataset.extent(0)); i++) { @@ -1445,11 +1500,12 @@ index build( graph_degree = intermediate_degree; } - index idx{res, dataset.extent(0), static_cast(graph_degree)}; + index idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; build(res, params, dataset, idx); return idx; } -} // namespace cuvs::neighbors::nn_descent::detail +} // namespace cuvs::neighbors::nn_descent::detail diff --git a/cpp/src/neighbors/detail/nn_descent_batch.cuh b/cpp/src/neighbors/detail/nn_descent_batch.cuh new file mode 100644 index 000000000..842dbe788 --- /dev/null +++ b/cpp/src/neighbors/detail/nn_descent_batch.cuh @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY + +#include "nn_descent.cuh" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace cuvs::neighbors::nn_descent::detail::experimental { + +// +// Run balanced kmeans on a subsample of the dataset to get centroids +// +template , memory_type::host>> +void get_balanced_kmeans_centroids( + raft::resources const& res, + cuvs::distance::DistanceType metric, + mdspan, row_major, Accessor> dataset, + raft::device_matrix_view centroids) +{ + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + size_t n_clusters = centroids.extent(0); + size_t num_subsamples = + std::min(static_cast(num_rows / n_clusters), static_cast(num_rows * 0.1)); + + auto d_subsample_dataset = + raft::make_device_matrix(res, num_subsamples, num_cols); + raft::matrix::sample_rows( + res, raft::random::RngState{0}, dataset, d_subsample_dataset.view()); + + cuvs::cluster::kmeans::balanced_params kmeans_params; + kmeans_params.metric = metric; + + auto d_subsample_dataset_const_view = + raft::make_device_matrix_view( + d_subsample_dataset.data_handle(), num_subsamples, num_cols); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + cuvs::cluster::kmeans::fit(res, kmeans_params, d_subsample_dataset_const_view, centroids_view); +} + +// +// Get the top k closest centroid indices for each data point +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_global_nearest_k( + raft::resources const& res, + size_t k, + size_t num_rows, + size_t n_clusters, + const T* dataset, + raft::host_matrix_view global_nearest_cluster, + raft::device_matrix_view centroids, + cuvs::distance::DistanceType metric) +{ + size_t num_cols = 
centroids.extent(1); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); + float* ptr = reinterpret_cast(attr.devicePointer); + + size_t num_batches = n_clusters; + size_t batch_size = (num_rows + n_clusters) / n_clusters; + if (ptr == nullptr) { // data on host + + auto d_dataset_batch = + raft::make_device_matrix(res, batch_size, num_cols); + + auto nearest_clusters_idx = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, batch_size, k); + + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + raft::copy(d_dataset_batch.data_handle(), + dataset + i * batch_size * num_cols, + batch_size_ * num_cols, + resource::get_cuda_stream(res)); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, centroids_view, norms_view, metric); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + raft::make_const_mdspan(d_dataset_batch.view()), + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle(), + nearest_clusters_idx.data_handle() + nearest_clusters_idx.size(), + nearest_clusters_idxt.data_handle()); + raft::copy(global_nearest_cluster.data_handle() + i * batch_size * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } else { // data on device + auto nearest_clusters_idx = + raft::make_device_matrix(res, num_rows, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, num_rows, k); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, 
centroids_view, norms_view, metric); + auto dataset_view = + raft::make_device_matrix_view(dataset, num_rows, num_cols); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + dataset_view, + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle() + i * batch_size_ * k, + nearest_clusters_idx.data_handle() + (i + 1) * batch_size_ * k, + nearest_clusters_idxt.data_handle()); + raft::copy(global_nearest_cluster.data_handle() + i * batch_size_ * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } +} + +// +// global_nearest_cluster [num_rows X k=2] : top 2 closest clusters for each data point +// inverted_indices [num_rows x k vector] : sparse vector for data indices for each cluster +// cluster_size [n_cluster] : cluster size for each cluster +// offset [n_cluster] : offset in inverted_indices for each cluster +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_inverted_indices(raft::resources const& res, + size_t n_clusters, + size_t& max_cluster_size, + size_t& min_cluster_size, + raft::host_matrix_view global_nearest_cluster, + raft::host_vector_view inverted_indices, + raft::host_vector_view cluster_size, + raft::host_vector_view offset) +{ + // build sparse inverted indices and get number of data points for each cluster + size_t num_rows = global_nearest_cluster.extent(0); + size_t k = global_nearest_cluster.extent(1); + + auto local_offset = raft::make_host_vector(n_clusters); + + max_cluster_size = 0; + min_cluster_size = std::numeric_limits::max(); + + std::fill(cluster_size.data_handle(), 
cluster_size.data_handle() + n_clusters, 0); + std::fill(local_offset.data_handle(), local_offset.data_handle() + n_clusters, 0); + + // TODO: this part isn't really a bottleneck but maybe worth trying omp parallel + // for with atomic add + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + cluster_size(cluster_id) += 1; + } + } + + offset(0) = 0; + for (size_t i = 1; i < n_clusters; i++) { + offset(i) = offset(i - 1) + cluster_size(i - 1); + } + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + inverted_indices(offset(cluster_id) + local_offset(cluster_id)) = i; + local_offset(cluster_id) += 1; + } + } + + max_cluster_size = static_cast( + *std::max_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); + min_cluster_size = static_cast( + *std::min_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); +} + +template +struct KeyValuePair { + KeyType key; + ValueType value; +}; + +template +struct CustomKeyComparator { + __device__ bool operator()(const KeyValuePair& a, + const KeyValuePair& b) const + { + if (a.key == b.key) { return a.value < b.value; } + return a.key < b.key; + } +}; + +template +RAFT_KERNEL merge_subgraphs(IdxT* cluster_data_indices, + size_t graph_degree, + size_t num_cluster_in_batch, + float* global_distances, + float* batch_distances, + IdxT* global_indices, + IdxT* batch_indices) +{ + size_t batch_row = blockIdx.x; + typedef cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD> + BlockMergeSortType; + __shared__ typename cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD>:: + TempStorage tmpSmem; + + extern __shared__ char sharedMem[]; + float* blockKeys = reinterpret_cast(sharedMem); + IdxT* blockValues = reinterpret_cast(&sharedMem[graph_degree * 2 * sizeof(float)]); + int16_t* uniqueMask = + reinterpret_cast(&sharedMem[graph_degree * 2 * 
(sizeof(float) + sizeof(IdxT))]); + + if (batch_row < num_cluster_in_batch) { + // load batch or global depending on threadIdx + size_t global_row = cluster_data_indices[batch_row]; + + KeyValuePair threadKeyValuePair[ITEMS_PER_THREAD]; + + size_t halfway = BLOCK_SIZE / 2; + size_t do_global = threadIdx.x < halfway; + + float* distances; + IdxT* indices; + + if (do_global) { + distances = global_distances; + indices = global_indices; + } else { + distances = batch_distances; + indices = batch_indices; + } + + size_t idxBase = (threadIdx.x * do_global + (threadIdx.x - halfway) * (1lu - do_global)) * + static_cast(ITEMS_PER_THREAD); + size_t arrIdxBase = (global_row * do_global + batch_row * (1lu - do_global)) * graph_degree; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < graph_degree) { + threadKeyValuePair[i].key = distances[arrIdxBase + colId]; + threadKeyValuePair[i].value = indices[arrIdxBase + colId]; + } else { + threadKeyValuePair[i].key = std::numeric_limits::max(); + threadKeyValuePair[i].value = std::numeric_limits::max(); + } + } + + __syncthreads(); + + BlockMergeSortType(tmpSmem).Sort(threadKeyValuePair, CustomKeyComparator{}); + + // load sorted result into shared memory to get unique values + idxBase = threadIdx.x * ITEMS_PER_THREAD; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < 2 * graph_degree) { + blockKeys[colId] = threadKeyValuePair[i].key; + blockValues[colId] = threadKeyValuePair[i].value; + } + } + + __syncthreads(); + + // get unique mask + if (threadIdx.x == 0) { uniqueMask[0] = 1; } + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + uniqueMask[colId] = static_cast(blockValues[colId] != blockValues[colId - 1]); + } + } + + __syncthreads(); + + // prefix sum + if (threadIdx.x == 0) { + for (int i = 1; i < 2 * graph_degree; i++) { + uniqueMask[i] += uniqueMask[i - 1]; + } + } + + 
__syncthreads(); + // load unique values to global memory + if (threadIdx.x == 0) { + global_distances[global_row * graph_degree] = blockKeys[0]; + global_indices[global_row * graph_degree] = blockValues[0]; + } + + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + bool is_unique = uniqueMask[colId] != uniqueMask[colId - 1]; + int16_t global_colId = uniqueMask[colId] - 1; + if (is_unique && static_cast(global_colId) < graph_degree) { + global_distances[global_row * graph_degree + global_colId] = blockKeys[colId]; + global_indices[global_row * graph_degree + global_colId] = blockValues[colId]; + } + } + } + } +} + +// +// builds knn graph using NN Descent and merge with global graph +// +template , memory_type::host>> +void build_and_merge(raft::resources const& res, + const index_params& params, + size_t num_data_in_cluster, + size_t graph_degree, + size_t int_graph_node_degree, + T* cluster_data, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_d, + float* global_distances_d, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + GNND& nnd) +{ + nnd.build(cluster_data, num_data_in_cluster, int_graph, true, batch_distances_d); + + // remap indices +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < graph_degree; j++) { + size_t local_idx = int_graph[i * int_graph_node_degree + j]; + batch_indices_h[i * graph_degree + j] = inverted_indices[local_idx]; + } + } + + raft::copy(batch_indices_d, + batch_indices_h, + num_data_in_cluster * graph_degree, + raft::resource::get_cuda_stream(res)); + + size_t num_elems = graph_degree * 2; + size_t sharedMemSize = num_elems * (sizeof(float) + sizeof(IdxT) + sizeof(int16_t)); + + if (num_elems <= 128) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + 
global_indices_d, + batch_indices_d); + } else if (num_elems <= 512) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 1024) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 2048) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else { + // this is as far as we can get due to the shared mem usage of cub::BlockMergeSort + RAFT_FAIL("The degree of knn is too large (%lu). It must be smaller than 1024", graph_degree); + } + raft::resource::sync_stream(res); +} + +// +// For each cluster, gather the data samples that belong to that cluster, and +// call build_and_merge +// +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::host_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_host_matrix(max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on host. 
Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < num_cols; j++) { + size_t global_row = (inverted_indices + offset)[i]; + cluster_data_matrix(i, j) = dataset(global_row, j); + } + } + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_matrix.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::device_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_device_matrix(res, max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on device. 
Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + + auto cluster_data_view = raft::make_device_matrix_view( + cluster_data_matrix.data_handle(), num_data_in_cluster, num_cols); + auto cluster_data_indices_view = raft::make_device_vector_view( + cluster_data_indices + offset, num_data_in_cluster); + + auto dataset_IdxT = + raft::make_device_matrix_view(dataset.data_handle(), num_rows, num_cols); + raft::matrix::gather(res, dataset_IdxT, cluster_data_indices_view, cluster_data_view); + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_view.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template , memory_type::host>> +void batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset, + index& global_idx) +{ + size_t graph_degree = params.graph_degree; + size_t intermediate_degree = params.intermediate_graph_degree; + + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + + auto centroids = + raft::make_device_matrix(res, params.n_clusters, num_cols); + get_balanced_kmeans_centroids(res, params.metric, dataset, centroids.view()); + + size_t k = 2; + auto global_nearest_cluster = raft::make_host_matrix(num_rows, k); + get_global_nearest_k(res, + k, + num_rows, + params.n_clusters, + dataset.data_handle(), + global_nearest_cluster.view(), + centroids.view(), + params.metric); + + auto inverted_indices = raft::make_host_vector(num_rows * k); + auto cluster_size = raft::make_host_vector(params.n_clusters); + auto offset = raft::make_host_vector(params.n_clusters); + + size_t max_cluster_size, min_cluster_size; + 
get_inverted_indices(res, + params.n_clusters, + max_cluster_size, + min_cluster_size, + global_nearest_cluster.view(), + inverted_indices.view(), + cluster_size.view(), + offset.view()); + + if (intermediate_degree >= min_cluster_size) { + RAFT_LOG_WARN( + "Intermediate graph degree cannot be larger than minimum cluster size, reducing it to %lu", + dataset.extent(0)); + intermediate_degree = min_cluster_size - 1; + } + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + size_t extended_graph_degree = + align32::roundUp(static_cast(graph_degree * (graph_degree <= 32 ? 1.0 : 1.3))); + size_t extended_intermediate_degree = align32::roundUp( + static_cast(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3))); + + auto int_graph = raft::make_host_matrix( + max_cluster_size, static_cast(extended_graph_degree)); + + BuildConfig build_config{.max_dataset_size = max_cluster_size, + .dataset_dim = num_cols, + .node_degree = extended_graph_degree, + .internal_node_degree = extended_intermediate_degree, + .max_iterations = params.max_iterations, + .termination_threshold = params.termination_threshold, + .output_graph_degree = graph_degree}; + + auto global_indices_h = raft::make_managed_matrix(res, num_rows, graph_degree); + auto global_distances_h = raft::make_managed_matrix(res, num_rows, graph_degree); + + std::fill(global_indices_h.data_handle(), + global_indices_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + std::fill(global_distances_h.data_handle(), + global_distances_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + + auto batch_indices_h = + raft::make_host_matrix(max_cluster_size, graph_degree); + auto batch_indices_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + auto 
batch_distances_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + + auto cluster_data_indices = raft::make_device_vector(res, num_rows * k); + raft::copy(cluster_data_indices.data_handle(), + inverted_indices.data_handle(), + num_rows * k, + resource::get_cuda_stream(res)); + + cluster_nnd(res, + params, + graph_degree, + extended_graph_degree, + max_cluster_size, + dataset, + offset.data_handle(), + cluster_size.data_handle(), + cluster_data_indices.data_handle(), + int_graph.data_handle(), + inverted_indices.data_handle(), + global_indices_h.data_handle(), + global_distances_h.data_handle(), + batch_indices_h.data_handle(), + batch_indices_d.data_handle(), + batch_distances_d.data_handle(), + build_config); + + raft::copy(global_idx.graph().data_handle(), + global_indices_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + if (params.return_distances && global_idx.distances().has_value()) { + raft::copy(global_idx.distances().value().data_handle(), + global_distances_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + } +} + +template , memory_type::host>> +index batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset) +{ + size_t intermediate_degree = params.intermediate_graph_degree; + size_t graph_degree = params.graph_degree; + + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + index idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; + + batch_build(res, params, dataset, idx); + + return idx; +} + +} // namespace cuvs::neighbors::nn_descent::detail::experimental diff --git a/cpp/src/neighbors/nn_descent.cuh b/cpp/src/neighbors/nn_descent.cuh index 582da72c1..ed91dac91 100644 --- 
a/cpp/src/neighbors/nn_descent.cuh +++ b/cpp/src/neighbors/nn_descent.cuh @@ -17,9 +17,14 @@ #pragma once #include "detail/nn_descent.cuh" +#include "detail/nn_descent_batch.cuh" + +#include +#include #include #include +#include #include namespace cuvs::neighbors::nn_descent { @@ -61,7 +66,15 @@ auto build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset) -> index { - return detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -100,7 +113,15 @@ void build(raft::resources const& res, raft::device_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @@ -135,7 +156,15 @@ auto build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset) -> index { - return detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -174,7 +203,15 @@ void build(raft::resources const& res, raft::host_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float 
precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @} */ // end group nn-descent diff --git a/cpp/src/neighbors/nn_descent_float.cu b/cpp/src/neighbors/nn_descent_float.cu index c6d356671..fa85db127 100644 --- a/cpp/src/neighbors/nn_descent_float.cu +++ b/cpp/src/neighbors/nn_descent_float.cu @@ -19,21 +19,38 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + }; \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index 
idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(float, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_half.cu b/cpp/src/neighbors/nn_descent_half.cu index 587993031..2ee45d435 100644 --- a/cpp/src/neighbors/nn_descent_half.cu +++ b/cpp/src/neighbors/nn_descent_half.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> 
distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(half, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_int8.cu b/cpp/src/neighbors/nn_descent_int8.cu index 813a01746..e150f511b 100644 --- a/cpp/src/neighbors/nn_descent_int8.cu +++ b/cpp/src/neighbors/nn_descent_int8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return 
cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(int8_t, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_uint8.cu b/cpp/src/neighbors/nn_descent_uint8.cu index 9d73dd90f..d8657777b 100644 --- a/cpp/src/neighbors/nn_descent_uint8.cu +++ b/cpp/src/neighbors/nn_descent_uint8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + 
->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(uint8_t, uint32_t); diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index bce0f9899..7d2575c2b 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -18,9 +18,13 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include #include + #include +#include #include +#include #include "naive_knn.cuh" @@ -42,6 +46,15 @@ struct AnnNNDescentInputs { double min_recall; }; +struct AnnNNDescentBatchInputs { + std::pair recall_cluster; + int n_rows; + int dim; + int graph_degree; + cuvs::distance::DistanceType metric; + bool host_dataset; +}; + inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p) { os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree @@ -50,6 +63,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& return os; } +inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentBatchInputs& p) +{ + os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree + << ", metric=" << static_cast(p.metric) << (p.host_dataset ? 
", host" : ", device") + << ", clusters=" << p.recall_cluster.second << std::endl; + return os; +} + template class AnnNNDescentTest : public ::testing::TestWithParam { public: @@ -65,7 +86,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { { size_t queries_size = ps.n_rows * ps.graph_degree; std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); { rmm::device_uvector distances_naive_dev(queries_size, stream_); @@ -81,16 +104,18 @@ class AnnNNDescentTest : public ::testing::TestWithParam { ps.graph_degree, ps.metric); raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); } { { - cuvs::neighbors::nn_descent::index_params index_params; + nn_descent::index_params index_params; index_params.metric = ps.metric; index_params.graph_degree = ps.graph_degree; index_params.intermediate_graph_degree = 2 * ps.graph_degree; index_params.max_iterations = 100; + index_params.return_distances = true; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -101,22 +126,40 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); - auto index = - cuvs::neighbors::nn_descent::build(handle_, index_params, database_host_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + 
index.distances().value().data_handle(), + queries_size, + stream_); + } + } else { - auto index = cuvs::neighbors::nn_descent::build(handle_, index_params, database_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } }; } raft::resource::sync_stream(handle_); } double min_recall = ps.min_recall; - EXPECT_TRUE(eval_recall( - indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall)); + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.001, + min_recall)); } } @@ -146,6 +189,125 @@ class AnnNNDescentTest : public ::testing::TestWithParam { rmm::device_uvector database; }; +template +class AnnNNDescentBatchTest : public ::testing::TestWithParam { + public: + AnnNNDescentBatchTest() + : stream_(raft::resource::get_cuda_stream(handle_)), + ps(::testing::TestWithParam::GetParam()), + database(0, stream_) + { + } + + void testNNDescentBatch() + { + size_t queries_size = ps.n_rows * ps.graph_degree; + std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + database.data(), + database.data(), + ps.n_rows, + ps.n_rows, + ps.dim, + ps.graph_degree, + ps.metric); + raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), 
queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + { + { + nn_descent::index_params index_params; + index_params.metric = ps.metric; + index_params.graph_degree = ps.graph_degree; + index_params.intermediate_graph_degree = 2 * ps.graph_degree; + index_params.max_iterations = 10; + index_params.return_distances = true; + index_params.n_clusters = ps.recall_cluster.second; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.n_rows, ps.dim); + + { + if (ps.host_dataset) { + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + auto database_host_view = raft::make_host_matrix_view( + (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + + } else { + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + }; + } + raft::resource::sync_stream(handle_); + } + double min_recall = ps.recall_cluster.first; + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.01, + min_recall, + true, + static_cast(ps.graph_degree * 0.1))); + } + } + + void SetUp() override + { + database.resize(((size_t)ps.n_rows) * ps.dim, stream_); + raft::random::RngState r(1234ULL); + if constexpr (std::is_same{}) { + raft::random::normal(handle_, r, database.data(), 
ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + } else { + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + } + raft::resource::sync_stream(handle_); + } + + void TearDown() override + { + raft::resource::sync_stream(handle_); + database.resize(0, stream_); + } + + private: + raft::resources handle_; + rmm::cuda_stream_view stream_; + AnnNNDescentBatchInputs ps; + rmm::device_uvector database; +}; + const std::vector inputs = raft::util::itertools::product( {1000, 2000}, // n_rows {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, // dim @@ -154,4 +316,15 @@ const std::vector inputs = raft::util::itertools::product inputsBatch = + raft::util::itertools::product( + {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters + {4000, 5000}, // n_rows + {192, 512}, // dim + {32, 64}, // graph_degree + {cuvs::distance::DistanceType::L2Expanded}, + {false, true}); + +} // namespace cuvs::neighbors::nn_descent diff --git a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu index 64c0e0291..7a24f96a1 100644 --- a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu @@ -23,6 +23,12 @@ namespace cuvs::neighbors::nn_descent { typedef AnnNNDescentTest AnnNNDescentTestF_U32; TEST_P(AnnNNDescentTestF_U32, AnnNNDescent) { this->testNNDescent(); } +// typedef AnnNNDescentBatchTest AnnNNDescentBatchTestF_U32; +// TEST_P(AnnNNDescentBatchTestF_U32, AnnNNDescentBatch) { this->testNNDescentBatch(); } + INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestF_U32, ::testing::ValuesIn(inputs)); +// INSTANTIATE_TEST_CASE_P(AnnNNDescentBatchTest, +// AnnNNDescentBatchTestF_U32, +// ::testing::ValuesIn(inputsBatch)); } // namespace cuvs::neighbors::nn_descent diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index b08e1d725..94bccade2 100644 --- 
a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include // raft::make_device_matrix #include @@ -165,9 +166,14 @@ auto calc_recall(const std::vector& expected_idx, /** check uniqueness of indices */ template -auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t cols) +auto check_unique_indices(const std::vector& actual_idx, + size_t rows, + size_t cols, + size_t max_duplicates = 0) { size_t max_count; + size_t dup_count = 0lu; + std::set unique_indices; for (size_t i = 0; i < rows; ++i) { unique_indices.clear(); @@ -180,8 +186,11 @@ auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t } else if (unique_indices.find(act_idx) == unique_indices.end()) { unique_indices.insert(act_idx); } else { - return testing::AssertionFailure() - << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! "; + dup_count++; + if (dup_count > max_duplicates) { + return testing::AssertionFailure() + << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! "; + } } } } @@ -264,7 +273,8 @@ auto eval_neighbours(const std::vector& expected_idx, size_t cols, double eps, double min_recall, - bool test_unique = true) -> testing::AssertionResult + bool test_unique = true, + size_t max_duplicates = 0) -> testing::AssertionResult { auto [actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); @@ -284,7 +294,7 @@ auto eval_neighbours(const std::vector& expected_idx, << min_recall << "); eps = " << eps << ". 
"; } if (test_unique) - return check_unique_indices(actual_idx, rows, cols); + return check_unique_indices(actual_idx, rows, cols, max_duplicates); else return testing::AssertionSuccess(); } diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py index 0ae97266b..8bd2e8b76 100644 --- a/python/cuvs/cuvs/test/test_hnsw.py +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -23,7 +23,7 @@ def run_hnsw_build_search_test( - n_rows=1000, + n_rows=10000, n_cols=10, n_queries=100, k=10, From fdb118002a482e878ec48fcaa7f11a15efd59140 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 13 Nov 2024 21:32:29 -0600 Subject: [PATCH 25/47] enforce wheel size limits, README formatting in CI (#464) Contributes to https://github.com/rapidsai/build-planning/issues/110 Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI. * checks on wheel size (compressed), - *to be sure they're under PyPI limits* - *and to prompt discussion on PRs that significantly increase wheel sizes* * checks on README formatting - *to ensure they'll render properly as the PyPI project homepages* - *e.g. 
like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/* Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cuvs/pull/464 --- ci/build_wheel_cuvs.sh | 5 ++++- ci/validate_wheel.sh | 21 +++++++++++++++++++++ python/cuvs/pyproject.toml | 8 ++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100755 ci/validate_wheel.sh diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh index e03da9f19..444657cc0 100755 --- a/ci/build_wheel_cuvs.sh +++ b/ci/build_wheel_cuvs.sh @@ -3,6 +3,8 @@ set -euo pipefail +package_dir="python/cuvs" + case "${RAPIDS_CUDA_VERSION}" in 12.*) EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" @@ -15,4 +17,5 @@ esac # Set up skbuild options. Enable sccache in skbuild config options export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}" -ci/build_wheel.sh cuvs python/cuvs +ci/build_wheel.sh cuvs ${package_dir} +ci/validate_wheel.sh ${package_dir} final_dist diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 000000000..5910a5c59 --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index 30d784c67..d40026776 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -133,6 +133,14 @@ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# detect when package size grows significantly +max_allowed_size_compressed = '1.4G' + [tool.pytest.ini_options] filterwarnings = [ "error", From bb9c669500cf0401114f4a5810d0f3a0ea1db6b3 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Thu, 14 Nov 2024 21:25:58 +0100 Subject: [PATCH 26/47] Fix include errors, header, and unsafe locks in iface.hpp (#467) Fix a few issues with the internal header `neighbors/iface/iface.hpp` leading to compile time errors and dangerous runtime behavior: - Add missing includes - Use `std::lock_guard` to avoid a deadlock on exception - Add NVIDIA header - Avoid an extra stream sync during search. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Victor Lafargue (https://github.com/viclafargue) - Corey J. 
Nolet (https://github.com/cjnolet) - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/cuvs/pull/467 --- cpp/src/neighbors/cagra_c.cpp | 2 ++ cpp/src/neighbors/iface/iface.hpp | 53 +++++++++++++++++-------------- cpp/src/neighbors/ivf_flat_c.cpp | 2 ++ cpp/src/neighbors/mg/mg.cuh | 2 ++ examples/cpp/src/common.cuh | 4 +++ 5 files changed, 39 insertions(+), 24 deletions(-) mode change 100755 => 100644 cpp/src/neighbors/ivf_flat_c.cpp diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp index 6985ff094..326a89665 100644 --- a/cpp/src/neighbors/cagra_c.cpp +++ b/cpp/src/neighbors/cagra_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp index a329db429..9b3da75a4 100644 --- a/cpp/src/neighbors/iface/iface.hpp +++ b/cpp/src/neighbors/iface/iface.hpp @@ -1,4 +1,20 @@ -#include +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once #include #include @@ -6,6 +22,9 @@ #include #include +#include +#include + namespace cuvs::neighbors { using namespace raft; @@ -16,7 +35,7 @@ void build(const raft::device_resources& handle, const cuvs::neighbors::index_params* index_params, raft::mdspan, row_major, Accessor> index_dataset) { - interface.mutex_->lock(); + std::lock_guard lock(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = cuvs::neighbors::ivf_flat::build( @@ -32,8 +51,6 @@ void build(const raft::device_resources& handle, interface.index_.emplace(std::move(idx)); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -44,7 +61,7 @@ void extend( std::optional, layout_c_contiguous, Accessor2>> new_indices) { - interface.mutex_->lock(); + std::lock_guard lock(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = @@ -58,8 +75,6 @@ void extend( RAFT_FAIL("CAGRA does not implement the extend method"); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -70,7 +85,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view neighbors, raft::device_matrix_view distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { cuvs::neighbors::ivf_flat::search( handle, @@ -94,9 +109,7 @@ void search(const raft::device_resources& handle, neighbors, distances); } - resource::sync_stream(handle); - - // interface.mutex_->unlock(); + // resource::sync_stream(handle); } // for MG ANN only @@ -108,7 +121,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view d_neighbors, raft::device_matrix_view d_distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); int64_t n_rows = h_queries.extent(0); int64_t n_dims = h_queries.extent(1); @@ -120,8 +133,6 @@ void search(const raft::device_resources& handle, auto d_query_view = raft::make_const_mdspan(d_queries.view()); search(handle, interface, 
search_params, d_query_view, d_neighbors, d_distances); - - // interface.mutex_->unlock(); } template @@ -129,7 +140,7 @@ void serialize(const raft::device_resources& handle, const cuvs::neighbors::iface& interface, std::ostream& os) { - interface.mutex_->lock(); + std::lock_guard lock(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::serialize(handle, os, interface.index_.value()); @@ -138,8 +149,6 @@ void serialize(const raft::device_resources& handle, } else if constexpr (std::is_same>::value) { cagra::serialize(handle, os, interface.index_.value(), true); } - - interface.mutex_->unlock(); } template @@ -147,7 +156,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, std::istream& is) { - interface.mutex_->lock(); + std::lock_guard lock(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::index idx(handle); @@ -162,8 +171,6 @@ void deserialize(const raft::device_resources& handle, cagra::deserialize(handle, is, &idx); interface.index_.emplace(std::move(idx)); } - - interface.mutex_->unlock(); } template @@ -171,7 +178,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, const std::string& filename) { - interface.mutex_->lock(); + std::lock_guard lock(*interface.mutex_); std::ifstream is(filename, std::ios::in | std::ios::binary); if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } @@ -191,8 +198,6 @@ void deserialize(const raft::device_resources& handle, } is.close(); - - interface.mutex_->unlock(); } -}; // namespace cuvs::neighbors \ No newline at end of file +}; // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/ivf_flat_c.cpp b/cpp/src/neighbors/ivf_flat_c.cpp old mode 100755 new mode 100644 index c14c1edc0..2acc6b678 --- a/cpp/src/neighbors/ivf_flat_c.cpp +++ b/cpp/src/neighbors/ivf_flat_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/mg/mg.cuh b/cpp/src/neighbors/mg/mg.cuh 
index d3f635bc4..e9cdc30f6 100644 --- a/cpp/src/neighbors/mg/mg.cuh +++ b/cpp/src/neighbors/mg/mg.cuh @@ -25,6 +25,8 @@ #include #include +#include + namespace cuvs::neighbors { using namespace raft; diff --git a/examples/cpp/src/common.cuh b/examples/cpp/src/common.cuh index 1c93dec0e..8e109a764 100644 --- a/examples/cpp/src/common.cuh +++ b/examples/cpp/src/common.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -28,6 +30,8 @@ #include #include +#include + // Fill dataset and queries with synthetic data. void generate_dataset(raft::device_resources const &dev_resources, raft::device_matrix_view dataset, From 7ab2bfdd250613137a5622471212dab528319306 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 15 Nov 2024 12:16:17 -0500 Subject: [PATCH 27/47] Add `InnerProduct` and `CosineExpanded` metric support in NN Descent (#177) Closes #171 Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/177 --- cpp/CMakeLists.txt | 1 + cpp/include/cuvs/neighbors/nn_descent.hpp | 24 ++--- .../neighbors/detail/cagra/cagra_build.cuh | 12 ++- cpp/src/neighbors/detail/nn_descent.cuh | 87 +++++++++++++------ cpp/src/neighbors/nn_descent_index.cpp | 29 +++++++ cpp/test/neighbors/ann_cagra.cuh | 10 +-- cpp/test/neighbors/ann_nn_descent.cuh | 32 ++++--- python/cuvs/cuvs/test/test_cagra.py | 4 +- python/cuvs/cuvs/test/test_hnsw.py | 4 +- 9 files changed, 139 insertions(+), 64 deletions(-) create mode 100644 cpp/src/neighbors/nn_descent_index.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c493af488..81b82aa7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -436,6 +436,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/nn_descent.cu src/neighbors/nn_descent_float.cu src/neighbors/nn_descent_half.cu + src/neighbors/nn_descent_index.cpp src/neighbors/nn_descent_int8.cu src/neighbors/nn_descent_uint8.cu 
src/neighbors/reachability.cu diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index bd41d1ff7..9cd8192b5 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -61,11 +61,10 @@ struct index_params : cuvs::neighbors::index_params { /** @brief Construct NN descent parameters for a specific kNN graph degree * * @param graph_degree output graph degree + * @param metric distance metric to use */ - index_params(size_t graph_degree = 64) - : graph_degree(graph_degree), intermediate_graph_degree(1.5 * graph_degree) - { - } + index_params(size_t graph_degree = 64, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded); }; /** @@ -103,11 +102,16 @@ struct index : cuvs::neighbors::index { * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph * @param return_distances whether to return distances + * @param metric distance metric to use */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false) + index(raft::resources const& res, + int64_t n_rows, + int64_t n_cols, + bool return_distances = false, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(n_rows, n_cols)}, graph_view_{graph_.view()}, return_distances_{return_distances} @@ -129,14 +133,16 @@ struct index : cuvs::neighbors::index { * @param graph_view raft::host_matrix_view for storing knn-graph * @param distances_view optional raft::device_matrix_view for storing * distances + * @param metric distance metric to use */ index(raft::resources const& res, raft::host_matrix_view graph_view, std::optional> distances_view = - std::nullopt) + std::nullopt, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : 
cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(0, 0)}, graph_view_{graph_view}, distances_view_{distances_view}, @@ -473,8 +479,6 @@ auto build(raft::resources const& res, std::optional> graph = std::nullopt) -> cuvs::neighbors::nn_descent::index; -/** @} */ - /** * @brief Test if we have enough GPU memory to run NN descent algorithm. * diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 6209ff819..b7fec724b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -436,11 +436,11 @@ index build( auto knn_build_params = params.graph_build_params; if (std::holds_alternative(params.graph_build_params)) { // Heuristic to decide default build algo and its params. - if (params.metric == cuvs::distance::DistanceType::L2Expanded && - cuvs::neighbors::nn_descent::has_enough_device_memory( + if (cuvs::neighbors::nn_descent::has_enough_device_memory( res, dataset.extents(), sizeof(IdxT))) { RAFT_LOG_DEBUG("NN descent solver"); - knn_build_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + knn_build_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); @@ -453,9 +453,6 @@ index build( std::get(knn_build_params); build_knn_graph(res, dataset, knn_graph->view(), ivf_pq_params); } else { - RAFT_EXPECTS( - params.metric == cuvs::distance::DistanceType::L2Expanded, - "L2Expanded is the only distance metrics supported for CAGRA build with nn_descent"); auto nn_descent_params = std::get(knn_build_params); @@ -466,7 +463,8 @@ index build( "nn-descent graph_degree.", nn_descent_params.graph_degree, intermediate_degree); - nn_descent_params = 
cagra::graph_build_params::nn_descent_params(intermediate_degree); + nn_descent_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } // Use nn-descent to build CAGRA knn graph diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 883d82d76..c62a52540 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -19,6 +19,7 @@ #include "ann_utils.cuh" #include "cagra/device_common.hpp" +#include #include #include @@ -216,6 +217,7 @@ struct BuildConfig { size_t max_iterations{50}; float termination_threshold{0.0001}; size_t output_graph_degree{32}; + cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded}; }; template @@ -454,11 +456,13 @@ __device__ __forceinline__ void load_vec(Data_t* vec_buffer, // TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827 /** Calculate L2 norm, and cast data to __half */ template -RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, - __half* output_data, - int dim, - DistData_t* l2_norms, - size_t list_offset = 0) +RAFT_KERNEL preprocess_data_kernel( + const Data_t* input_data, + __half* output_data, + int dim, + DistData_t* l2_norms, + size_t list_offset = 0, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) { extern __shared__ char buffer[]; __shared__ float l2_norm; @@ -468,26 +472,32 @@ RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, load_vec(s_vec, input_data + blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size()); if (threadIdx.x == 0) { l2_norm = 0; } __syncthreads(); - int lane_id = threadIdx.x % raft::warp_size(); - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + lane_id; - float part_dist = 0; - if (idx < dim) { - part_dist = s_vec[idx]; - part_dist = part_dist * part_dist; - } - __syncwarp(); - for (int offset = 
raft::warp_size() >> 1; offset >= 1; offset >>= 1) { - part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + + if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::CosineExpanded) { + int lane_id = threadIdx.x % raft::warp_size(); + for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { + int idx = step * raft::warp_size() + lane_id; + float part_dist = 0; + if (idx < dim) { + part_dist = s_vec[idx]; + part_dist = part_dist * part_dist; + } + __syncwarp(); + for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { + part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + } + if (lane_id == 0) { l2_norm += part_dist; } + __syncwarp(); } - if (lane_id == 0) { l2_norm += part_dist; } - __syncwarp(); } for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { int idx = step * raft::warp_size() + threadIdx.x; if (idx < dim) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { + output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { output_data[list_id * dim + idx] = (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm); } else { @@ -715,7 +725,8 @@ __launch_bounds__(BLOCK_SIZE, 4) DistData_t* dists, int graph_width, int* locks, - DistData_t* l2_norms) + DistData_t* l2_norms, + cuvs::distance::DistanceType metric) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -827,8 +838,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == 
cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -906,8 +919,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -1161,7 +1176,7 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build ndim_(build_config.dataset_dim), d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>( res, nrow_, build_config.dataset_dim)}, - l2_norms_{raft::make_device_vector(res, nrow_)}, + l2_norms_{raft::make_device_vector(res, 0)}, graph_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ @@ -1181,11 +1196,16 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); auto graph_buffer_view = raft::make_device_matrix_view( reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); raft::matrix::fill(res, d_locks_.view(), 0); + + if (build_config.metric == cuvs::distance::DistanceType::L2Expanded) { + l2_norms_ = raft::make_device_vector(res, nrow_); + } }; template @@ -1228,7 +1248,8 @@ void GNND::local_join(cudaStream_t stream) 
dists_buffer_.data_handle(), DEGREE_ON_DEVICE, d_locks_.data_handle(), - l2_norms_.data_handle()); + l2_norms_.data_handle(), + build_config_.metric); } template @@ -1261,7 +1282,8 @@ void GNND::build(Data_t* data, d_data_.data_handle(), build_config_.dataset_dim, l2_norms_.data_handle(), - batch.offset()); + batch.offset(), + build_config_.metric); } graph_.clear(); @@ -1417,6 +1439,11 @@ void build(raft::resources const& res, RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", std::numeric_limits::max() - 1); + auto allowed_metrics = params.metric == cuvs::distance::DistanceType::L2Expanded || + params.metric == cuvs::distance::DistanceType::CosineExpanded || + params.metric == cuvs::distance::DistanceType::InnerProduct; + RAFT_EXPECTS(allowed_metrics && idx.metric() == params.metric, + "The metric for NN Descent should be L2Expanded, CosineExpanded or InnerProduct"); size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1452,7 +1479,8 @@ void build(raft::resources const& res, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, .termination_threshold = params.termination_threshold, - .output_graph_degree = params.graph_degree}; + .output_graph_degree = params.graph_degree, + .metric = params.metric}; GNND nnd(res, build_config); @@ -1500,8 +1528,11 @@ index build( graph_degree = intermediate_degree; } - index idx{ - res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; + index idx{res, + dataset.extent(0), + static_cast(graph_degree), + params.return_distances, + params.metric}; build(res, params, dataset, idx); diff --git a/cpp/src/neighbors/nn_descent_index.cpp b/cpp/src/neighbors/nn_descent_index.cpp new file mode 100644 index 000000000..25d5b6af8 --- /dev/null +++ b/cpp/src/neighbors/nn_descent_index.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace cuvs::neighbors::nn_descent { + +index_params::index_params(size_t graph_degree, cuvs::distance::DistanceType metric) +{ + this->graph_degree = graph_degree; + this->intermediate_graph_degree = 1.5 * graph_degree; + this->metric = metric; +} +} // namespace cuvs::neighbors::nn_descent \ No newline at end of file diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 37d42dd1d..660246c67 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -361,8 +361,8 @@ class AnnCagraTest : public ::testing::TestWithParam { // not used for knn_graph building. 
switch (ps.build_algo) { case graph_build_algo::IVF_PQ: - index_params.graph_build_params = - graph_build_params::ivf_pq_params(raft::matrix_extent(ps.n_rows, ps.dim)); + index_params.graph_build_params = graph_build_params::ivf_pq_params( + raft::matrix_extent(ps.n_rows, ps.dim), index_params.metric); if (ps.ivf_pq_search_refine_ratio) { std::get( index_params.graph_build_params) @@ -370,8 +370,8 @@ class AnnCagraTest : public ::testing::TestWithParam { } break; case graph_build_algo::NN_DESCENT: { - index_params.graph_build_params = - graph_build_params::nn_descent_params(index_params.intermediate_graph_degree); + index_params.graph_build_params = graph_build_params::nn_descent_params( + index_params.intermediate_graph_degree, index_params.metric); break; } case graph_build_algo::AUTO: @@ -389,7 +389,7 @@ class AnnCagraTest : public ::testing::TestWithParam { (const DataT*)database.data(), ps.n_rows, ps.dim); { - cagra::index index(handle_); + cagra::index index(handle_, index_params.metric); if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 7d2575c2b..09861a219 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -27,6 +27,7 @@ #include #include "naive_knn.cuh" +#include #include @@ -107,7 +108,6 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); } - { { nn_descent::index_params index_params; @@ -124,6 +124,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam { if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + 
raft::resource::sync_stream(handle_); auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); auto index = nn_descent::build(handle_, index_params, database_host_view); @@ -151,6 +152,13 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::resource::sync_stream(handle_); } + if (ps.metric == cuvs::distance::DistanceType::InnerProduct) { + std::transform( + distances_naive.begin(), distances_naive.end(), distances_naive.begin(), [](auto x) { + return -x; + }); + } + double min_recall = ps.min_recall; EXPECT_TRUE(eval_neighbours(indices_naive, indices_NNDescent, @@ -169,9 +177,11 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); - } else { + } else if constexpr (std::is_same{}) { raft::random::uniformInt( - handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(-5), DataT(5)); + } else { + raft::random::uniformInt(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0), DataT(5)); } raft::resource::sync_stream(handle_); } @@ -308,13 +318,15 @@ class AnnNNDescentBatchTest : public ::testing::TestWithParam database; }; -const std::vector inputs = raft::util::itertools::product( - {1000, 2000}, // n_rows - {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, // dim - {32, 64}, // graph_degree - {cuvs::distance::DistanceType::L2Expanded}, - {false, true}, - {0.90}); +const std::vector inputs = + raft::util::itertools::product({2000, 4000}, // n_rows + {4, 16, 64, 256, 1024}, // dim + {32, 64}, // graph_degree + {cuvs::distance::DistanceType::L2Expanded, + cuvs::distance::DistanceType::InnerProduct, + cuvs::distance::DistanceType::CosineExpanded}, + {false, true}, + {0.90}); // TODO : Investigate why this test is failing Reference issue https // : 
// github.com/rapidsai/raft/issues/2450 diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py index 92b88f013..56e132c23 100644 --- a/python/cuvs/cuvs/test/test_cagra.py +++ b/python/cuvs/cuvs/test/test_cagra.py @@ -122,8 +122,9 @@ def run_cagra_build_search_test( @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("array_type", ["device", "host"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) +@pytest.mark.parametrize("metric", ["euclidean"]) def test_cagra_dataset_dtype_host_device( - dtype, array_type, inplace, build_algo + dtype, array_type, inplace, build_algo, metric ): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only sqeuclidean metric here. @@ -132,6 +133,7 @@ def test_cagra_dataset_dtype_host_device( inplace=inplace, array_type=array_type, build_algo=build_algo, + metric=metric, ) diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py index 8bd2e8b76..20a35401e 100644 --- a/python/cuvs/cuvs/test/test_hnsw.py +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -41,8 +41,6 @@ def run_hnsw_build_search_test( pytest.skip( "inner_product metric is not supported for int8/uint8 data" ) - if build_algo == "nn_descent": - pytest.skip("inner_product metric is not supported for nn_descent") build_params = cagra.IndexParams( metric=metric, @@ -83,7 +81,7 @@ def run_hnsw_build_search_test( @pytest.mark.parametrize("k", [10, 20]) @pytest.mark.parametrize("ef", [30, 40]) @pytest.mark.parametrize("num_threads", [2, 4]) -@pytest.mark.parametrize("metric", ["sqeuclidean"]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) def test_hnsw(dtype, k, ef, num_threads, metric, build_algo): # Note that inner_product tests use normalized input which we cannot From 7b879116684501f36ca5a19a74c01fcecb52e962 Mon Sep 17 00:00:00 
2001 From: James Lamb Date: Fri, 15 Nov 2024 16:12:42 -0600 Subject: [PATCH 28/47] use different wheel-size thresholds based on CUDA version (#469) `cuvs-cu11` wheels are significantly larger than `cuvs-cu12` wheels, because (among other reasons) they are not able to dynamically link to CUDA math library wheels. In #464, I proposed a size limit for CI checks of "max CUDA 11 wheel size + a buffer". This PR proposes using different thresholds based on CUDA major version, following these discussions: * https://github.com/rapidsai/cugraph/pull/4754#discussion_r1842526907 * https://github.com/rapidsai/cuml/pull/6136#discussion_r1841774811 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/cuvs/pull/469 --- ci/validate_wheel.sh | 14 ++++++++++++++ python/cuvs/pyproject.toml | 4 +--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh index 5910a5c59..f2b235765 100755 --- a/ci/validate_wheel.sh +++ b/ci/validate_wheel.sh @@ -6,12 +6,26 @@ set -euo pipefail package_dir=$1 wheel_dir_relative_path=$2 +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" + +# some packages are much larger on CUDA 11 than on CUDA 12 +if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '1.4G' + ) +else + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '950M' + ) +fi + cd "${package_dir}" rapids-logger "validate packages with 'pydistcheck'" pydistcheck \ --inspect \ + "${PYDISTCHECK_ARGS[@]}" \ "$(echo ${wheel_dir_relative_path}/*.whl)" rapids-logger "validate packages with 'twine'" diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index d40026776..92e4993c7 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -135,12 +135,10 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" [tool.pydistcheck] select = [ + # NOTE: size threshold is managed via CLI args in CI 
scripts "distro-too-large-compressed", ] -# detect when package size grows significantly -max_allowed_size_compressed = '1.4G' - [tool.pytest.ini_options] filterwarnings = [ "error", From 27d45533d91f13ce00eabed409468a2b47452f4d Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Mon, 18 Nov 2024 14:55:13 -0800 Subject: [PATCH 29/47] Move check_input_array from pylibraft (#474) With the changes in https://github.com/rapidsai/raft/pull/2498 we no longer have a pylibraft.neighbors module - but were still using a utility function `_check_input_array` from it in cuvs. Move this over to cuvs to unblock ci Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/474 --- .../neighbors/brute_force/brute_force.pyx | 2 +- python/cuvs/cuvs/neighbors/cagra/cagra.pyx | 3 +- python/cuvs/cuvs/neighbors/common.py | 36 +++++++++++++++++++ .../cuvs/cuvs/neighbors/filters/filters.pyx | 2 +- python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx | 2 +- .../cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx | 2 +- python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx | 2 +- python/cuvs/cuvs/neighbors/refine.pyx | 2 +- 8 files changed, 44 insertions(+), 7 deletions(-) create mode 100644 python/cuvs/cuvs/neighbors/common.py diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx index 559302ccc..9d1d24eae 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from 
cuvs.common.c_api cimport cuvsResources_t diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index 95209dbeb..752aef741 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -32,7 +32,8 @@ from cuvs.common cimport cydlpack from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array + +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/common.py b/python/cuvs/cuvs/neighbors/common.py new file mode 100644 index 000000000..c14b9f8c9 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/common.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def _check_input_array(cai, exp_dt, exp_rows=None, exp_cols=None): + if cai.dtype not in exp_dt: + raise TypeError("dtype %s not supported" % cai.dtype) + + if not cai.c_contiguous: + raise ValueError("Row major input is expected") + + if exp_cols is not None and cai.shape[1] != exp_cols: + raise ValueError( + "Incorrect number of columns, expected {} got {}".format( + exp_cols, cai.shape[1] + ) + ) + + if exp_rows is not None and cai.shape[0] != exp_rows: + raise ValueError( + "Incorrect number of rows, expected {} , got {}".format( + exp_rows, cai.shape[0] + ) + ) diff --git a/python/cuvs/cuvs/neighbors/filters/filters.pyx b/python/cuvs/cuvs/neighbors/filters/filters.pyx index 3a81cb786..9bc2a905c 100644 --- a/python/cuvs/cuvs/neighbors/filters/filters.pyx +++ b/python/cuvs/cuvs/neighbors/filters/filters.pyx @@ -20,11 +20,11 @@ import numpy as np from libc.stdint cimport uintptr_t from cuvs.common cimport cydlpack +from cuvs.neighbors.common import _check_input_array from .filters cimport BITMAP, NO_FILTER, cuvsFilter from pylibraft.common.cai_wrapper import wrap_array -from pylibraft.neighbors.common import _check_input_array cdef class Prefilter: diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index 018fcfef9..bcfaf167e 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -21,6 +21,7 @@ from libcpp.string cimport string from cuvs.common.exceptions import check_cuvs from cuvs.common.resources import auto_sync_resources +from cuvs.neighbors.common import _check_input_array from cuvs.common cimport cydlpack @@ -36,7 +37,6 @@ import uuid from pylibraft.common import auto_convert_output from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array cdef class SearchParams: diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx 
b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx index 25b9b2aee..7a169e1a0 100644 --- a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index 3add1df75..531302ee6 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/refine.pyx b/python/cuvs/cuvs/neighbors/refine.pyx index 0eccc4108..b7aa35dca 100644 --- a/python/cuvs/cuvs/neighbors/refine.pyx +++ b/python/cuvs/cuvs/neighbors/refine.pyx @@ -31,13 +31,13 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES from cuvs.common.c_api cimport cuvsResources_t from cuvs.common.exceptions import check_cuvs +from 
cuvs.neighbors.common import _check_input_array @auto_sync_resources From f127b06b83e3c9e3c3034fdc902441edbf841b90 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:01:22 +0100 Subject: [PATCH 30/47] Fix an OOB error in device-side cuvs::neighbors::refine and CAGRA kern_prune (#460) IVF-Flat index expects all valid indices during build, which may not be the case in the context of refinement. At the same time, `cagra::detail::graph::kern_prune` fails with OOB error if some indices are invalid. This PR tweaks both kernels to avoid touching the input data with an invalid index. Fixes https://github.com/rapidsai/cuvs/issues/337 Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Micka (https://github.com/lowener) URL: https://github.com/rapidsai/cuvs/pull/460 --- cpp/src/neighbors/detail/cagra/graph_core.cuh | 1 + cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh | 8 +- cpp/test/CMakeLists.txt | 1 + .../ann_cagra/bug_extreme_inputs_oob.cu | 73 +++++++++++++++++++ 4 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 4253cb781..daeac82b9 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -156,6 +156,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g // count number of detours (A->D->B) for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) { const uint64_t iD = knn_graph[kAD + (graph_degree * iA)]; + if (iD >= graph_size) { continue; } for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) { const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)]; for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) { diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh 
b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh index fb110d810..d6ffc1218 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -132,6 +132,10 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, { const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; if (i >= n_rows) { return; } + auto source_ix = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + // In the context of refinement, some indices may be invalid (the generating NN algorithm does + // not return enough valid items). Do not add the item to the index in this case. + if (source_ix == ivf::kInvalidRecord || source_ix == raft::upper_bound()) { return; } auto list_id = labels[i]; auto inlist_id = atomicAdd(list_sizes_ptr + list_id, 1); @@ -139,7 +143,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, auto* list_data = list_data_ptrs[list_id]; // Record the source vector id in the index - list_index[inlist_id] = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + list_index[inlist_id] = source_ix; // The data is written in interleaved groups of `index::kGroupSize` vectors using interleaved_group = raft::Pow2; @@ -151,7 +155,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, // Point to the source vector if constexpr (gather_src) { - source_vecs += source_ixs[i] * dim; + source_vecs += source_ix * dim; } else { source_vecs += i * dim; } diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 1ed8466b3..7754a5043 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -137,6 +137,7 @@ if(BUILD_TESTS) NAME NEIGHBORS_ANN_CAGRA_TEST PATH + neighbors/ann_cagra/bug_extreme_inputs_oob.cu neighbors/ann_cagra/bug_multi_cta_crash.cu neighbors/ann_cagra/test_float_uint32_t.cu neighbors/ann_cagra/test_half_uint32_t.cu diff --git a/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu new file mode 100644 index 000000000..e21a54e9e --- 
/dev/null +++ b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +namespace cuvs::neighbors::cagra { + +class cagra_extreme_inputs_oob_test : public ::testing::Test { + public: + using data_type = float; + + protected: + void run() + { + cagra::index_params ix_ps; + graph_build_params::ivf_pq_params gb_params{}; + gb_params.refinement_rate = 2; + ix_ps.graph_build_params = gb_params; + ix_ps.graph_degree = 64; + ix_ps.intermediate_graph_degree = 128; + + [[maybe_unused]] auto ix = cagra::build(res, ix_ps, raft::make_const_mdspan(dataset->view())); + raft::resource::sync_stream(res); + } + + void SetUp() override + { + dataset.emplace(raft::make_device_matrix(res, n_samples, n_dim)); + raft::random::RngState r(1234ULL); + raft::random::normal( + res, r, dataset->data_handle(), n_samples * n_dim, data_type(0), data_type(1e20)); + raft::resource::sync_stream(res); + } + + void TearDown() override + { + dataset.reset(); + raft::resource::sync_stream(res); + } + + private: + raft::resources res; + std::optional> dataset = std::nullopt; + + constexpr static int64_t n_samples = 100000; + constexpr static int64_t n_dim = 200; + constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; +}; + +TEST_F(cagra_extreme_inputs_oob_test, 
cagra_extreme_inputs_oob_test) { this->run(); } + +} // namespace cuvs::neighbors::cagra From 06afd5bd27d07ad6e58544c06f920d570b7df983 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Wed, 20 Nov 2024 15:26:05 -0800 Subject: [PATCH 31/47] Migrate sparse knn and distances code from raft (#457) Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/457 --- cpp/CMakeLists.txt | 2 + cpp/include/cuvs/distance/distance.hpp | 81 ++ cpp/include/cuvs/neighbors/brute_force.hpp | 104 +++ .../distance/detail/sparse/bin_distance.cuh | 231 +++++ cpp/src/distance/detail/sparse/common.hpp | 59 ++ cpp/src/distance/detail/sparse/coo_spmv.cuh | 211 +++++ .../detail/sparse/coo_spmv_kernel.cuh | 229 +++++ .../coo_spmv_strategies/base_strategy.cuh | 149 +++ .../coo_mask_row_iterators.cuh | 234 +++++ .../dense_smem_strategy.cuh | 121 +++ .../coo_spmv_strategies/hash_strategy.cuh | 296 ++++++ .../distance/detail/sparse/ip_distance.cuh | 89 ++ .../distance/detail/sparse/l2_distance.cuh | 502 +++++++++++ .../distance/detail/sparse/lp_distance.cuh | 333 +++++++ cpp/src/distance/detail/sparse/utils.cuh | 171 ++++ cpp/src/distance/sparse_distance.cu | 85 ++ cpp/src/distance/sparse_distance.cuh | 115 +++ cpp/src/neighbors/detail/sparse_knn.cuh | 437 +++++++++ cpp/src/neighbors/sparse_brute_force.cu | 72 ++ cpp/test/CMakeLists.txt | 3 +- cpp/test/distance/sparse_distance.cu | 850 ++++++++++++++++++ cpp/test/neighbors/sparse_brute_force.cu | 175 ++++ 22 files changed, 4548 insertions(+), 1 deletion(-) create mode 100644 cpp/src/distance/detail/sparse/bin_distance.cuh create mode 100644 cpp/src/distance/detail/sparse/common.hpp create mode 100644 cpp/src/distance/detail/sparse/coo_spmv.cuh create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh 
create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh create mode 100644 cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh create mode 100644 cpp/src/distance/detail/sparse/ip_distance.cuh create mode 100644 cpp/src/distance/detail/sparse/l2_distance.cuh create mode 100644 cpp/src/distance/detail/sparse/lp_distance.cuh create mode 100644 cpp/src/distance/detail/sparse/utils.cuh create mode 100644 cpp/src/distance/sparse_distance.cu create mode 100644 cpp/src/distance/sparse_distance.cuh create mode 100644 cpp/src/neighbors/detail/sparse_knn.cuh create mode 100644 cpp/src/neighbors/sparse_brute_force.cu create mode 100644 cpp/test/distance/sparse_distance.cu create mode 100644 cpp/test/neighbors/sparse_brute_force.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 81b82aa7b..32093776c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,6 +369,7 @@ if(BUILD_SHARED_LIBS) src/distance/detail/fused_distance_nn.cu src/distance/distance.cu src/distance/pairwise_distance.cu + src/distance/sparse_distance.cu src/neighbors/brute_force.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_half.cu @@ -449,6 +450,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/refine/detail/refine_host_int8_t_float.cpp src/neighbors/refine/detail/refine_host_uint8_t_float.cpp src/neighbors/sample_filter.cu + src/neighbors/sparse_brute_force.cu src/neighbors/vamana_build_float.cu src/neighbors/vamana_build_uint8.cu src/neighbors/vamana_build_int8.cu diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp index def72641e..42c574e58 100644 --- a/cpp/include/cuvs/distance/distance.hpp +++ b/cpp/include/cuvs/distance/distance.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -331,6 +332,86 @@ void pairwise_distance( cuvs::distance::DistanceType metric, float 
metric_arg = 2.0f); +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... + * + * auto out = raft::make_device_matrix(handle, x_n_rows, y_n_rows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * cuvs::distance::pairwise_distance(handle, x.view(), y.view(), out.view(), metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... 
+ * + * auto out = raft::make_device_matrix(handle, x_n_rows, y_n_rows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * cuvs::distance::pairwise_distance(handle, x.view(), y.view(), out.view(), metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + /** @} */ // end group pairwise_distance_runtime }; // namespace cuvs::distance diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp index 428fa592a..ba67797ee 100644 --- a/cpp/include/cuvs/neighbors/brute_force.hpp +++ b/cpp/include/cuvs/neighbors/brute_force.hpp @@ -18,6 +18,7 @@ #include "common.hpp" #include +#include #include #include #include @@ -375,4 +376,107 @@ void search(raft::resources const& handle, * @} */ +/** + * @defgroup sparse_bruteforce_cpp_index Sparse Brute Force index + * @{ + */ +/** + * @brief Sparse Brute Force index. 
+ * + * @tparam T Data element type + * @tparam IdxT Index element type + */ +template +struct sparse_index { + public: + sparse_index(const sparse_index&) = delete; + sparse_index(sparse_index&&) = default; + sparse_index& operator=(const sparse_index&) = delete; + sparse_index& operator=(sparse_index&&) = default; + ~sparse_index() = default; + + /** Construct a sparse brute force sparse_index from dataset */ + sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg); + + /** Distance metric used for retrieval */ + cuvs::distance::DistanceType metric() const noexcept { return metric_; } + + /** Metric argument */ + T metric_arg() const noexcept { return metric_arg_; } + + raft::device_csr_matrix_view dataset() const noexcept + { + return dataset_; + } + + private: + raft::device_csr_matrix_view dataset_; + cuvs::distance::DistanceType metric_; + T metric_arg_; +}; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_build Sparse Brute Force index build + * @{ + */ + +/** + * @brief Build the Sparse index from the dataset + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // create and fill the index from a CSR dataset + * auto index = brute_force::build(handle, dataset, metric); + * @endcode + * + * @param[in] handle + * @param[in] dataset A sparse CSR matrix in device memory to search against + * @param[in] metric cuvs::distance::DistanceType + * @param[in] metric_arg metric argument + * + * @return the constructed Sparse brute-force index + */ +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded, + float metric_arg = 0) -> cuvs::neighbors::brute_force::sparse_index; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_search Sparse Brute Force index search + * @{ + */ +struct sparse_search_params { + int 
batch_size_index = 2 << 14; + int batch_size_query = 2 << 14; +}; + +/** + * @brief Search the sparse bruteforce index for nearest neighbors + * + * @param[in] handle + * @param[in] index Sparse brute-force constructed index + * @param[in] dataset a sparse CSR matrix of query vectors on the device + * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + */ +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view dataset, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); +/** + * @} + */ } // namespace cuvs::neighbors::brute_force diff --git a/cpp/src/distance/detail/sparse/bin_distance.cuh b/cpp/src/distance/detail/sparse/bin_distance.cuh new file mode 100644 index 000000000..1a63a8eb9 --- /dev/null +++ b/cpp/src/distance/detail/sparse/bin_distance.cuh @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +// @TODO: Move this into sparse prims (coo_norm) +template +RAFT_KERNEL compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { + // We do conditional here only because it's + // possible there could be some stray zeros in + // the sparse structure and removing them would be + // more expensive. + atomicAdd(&out[coo_rows[i]], data[i] == 1.0); + } +} + +template +RAFT_KERNEL compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; + C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); +} + +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_binary_warp_kernel<<>>( + C, Q_norms, R_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ + rmm::device_uvector Q_norms(m, stream); + 
rmm::device_uvector R_norms(n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t), stream)); + + compute_binary_row_norm_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_binary_row_norm_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); +} + +/** + * Jaccard distance using the expanded form: + * 1 - (sum(x_k * y_k) / ((sum(x_k) + sum(y_k)) - sum(x_k * y_k))) + */ +template +class jaccard_expanded_distances_t : public distances_t { + public: + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); + } + + ~jaccard_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + 
rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Dice distance using the expanded form: + * 1 - ((2 * sum(x_k * y_k)) / (sum(x_k) + sum(y_k))) + */ +template +class dice_expanded_distances_t : public distances_t { + public: + explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / ((q_r_union == 0) + q_r_union); + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); + } + + ~dice_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/common.hpp b/cpp/src/distance/detail/sparse/common.hpp new file mode 100644 index 000000000..803dabe56 --- /dev/null +++ b/cpp/src/distance/detail/sparse/common.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +struct distances_config_t { + distances_config_t(raft::resources const& handle_) : handle(handle_) {} + + // left side + value_idx a_nrows; + value_idx a_ncols; + value_idx a_nnz; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; + + // right side + value_idx b_nrows; + value_idx b_ncols; + value_idx b_nnz; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; + + raft::resources const& handle; +}; + +template +class distances_t { + public: + virtual void compute(value_t* out) {} + virtual ~distances_t() = default; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv.cuh b/cpp/src/distance/detail/sparse/coo_spmv.cuh new file mode 100644 index 000000000..181b531f7 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv_strategies/dense_smem_strategy.cuh" +#include "coo_spmv_strategies/hash_strategy.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Performs generalized sparse-matrix-sparse-matrix multiplication via a + * sparse-matrix-sparse-vector layout `out=A*B` where generalized product() + * and sum() operations can be used in place of the standard sum and product: + * + * out_ij = sum_k(product(A_ik, B_ik)) The sum goes through values of + * k=0..n_cols-1 where B_kj is nonzero. + * + * The product and sum operations shall form a semiring algebra with the + * following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 1} is a product monoid with identity element 1 + * 3. Multiplication by 0 annihilates x. e.g. 
product(x, 0) = 0 + * + * Each vector of A is loaded into shared memory in dense form and the + * non-zeros of B load balanced across the threads of each block. + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n in row-major + * format. + * @param[in] config_ distance config object + * @param[in] coo_rows_b coo row array for B + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + int max_cols = max_cols_per_block(); + + if (max_cols > config_.a_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } +}; + +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + 
strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Used for computing distances where the reduction (e.g. product()) function + * requires an implicit union (product(x, 0) = x) to capture the difference A-B. + * This is necessary in some applications because the standard semiring algebra + * endowed with the default multiplication product monoid will only + * compute the intersection & B-A. + * + * This particular function is meant to accompany the function + * `balanced_coo_pairwise_generalized_spmv` and executes the product operation + * on only those columns that exist in B and not A. + * + * The product and sum operations shall enable the computation of a + * non-annihilating semiring algebra with the following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 0} is a product monoid with identity element 0 + * 3. Multiplication by 0 does not annihilate x. e.g. product(x, 0) = x + * + * Manhattan distance sum(abs(x_k-y_k)) is a great example of when this type of + * execution pattern is necessary. 
+ * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n + * @param[in] config_ distance config object + * @param[in] coo_rows_a coo row array for A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + // try dense first + int max_cols = max_cols_per_block(); + + if (max_cols > config_.b_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh new file mode 100644 index 000000000..1f4b19af4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ + return __ffs(peer_group) - 1; +} + +/** + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. + * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). 
+ * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +RAFT_KERNEL balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + typedef cub::WarpReduce warp_reduce; + + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; + + // chunk starting offset + value_idx ind_offset = cur_chunk_offset * chunk_size * 
tpb; + // how many total cols will be processed by this block (should be <= chunk_size * n_threads) + value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); + + int tid = threadIdx.x; + int warp_id = tid / raft::warp_size(); + + // compute id relative to current warp + unsigned int lane_id = tid & (raft::warp_size() - 1); + value_idx ind = ind_offset + threadIdx.x; + + extern __shared__ char smem[]; + + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); + + auto inserter = strategy.init_insert(A, dim); + + __syncthreads(); + + value_idx start_offset_a, stop_offset_a; + bool first_a_chunk, last_a_chunk; + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); + + // Convert current row vector in A to dense + for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) { + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); + } + + __syncthreads(); + + auto finder = strategy.init_find(A, dim); + + if (cur_row_a > m || cur_chunk_offset > n_blocks_per_row) return; + if (ind >= nnz_b) return; + + value_idx start_index_a = 0, stop_index_a = b_ncols - 1; + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); + + value_idx cur_row_b = -1; + value_t c = 0.0; + + auto warp_red = warp_reduce(*(temp_storage + warp_id)); + + if (tid < active_chunk_size) { + cur_row_b = rowsB[ind]; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } + } + } + + // loop through chunks in parallel, reducing when a new row is + // encountered by each 
thread + for (int i = tid; i < active_chunk_size; i += blockDim.x) { + value_idx ind_next = ind + blockDim.x; + value_idx next_row_b = -1; + + if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; + + bool diff_rows = next_row_b != cur_row_b; + + if (__any_sync(0xffffffff, diff_rows)) { + // grab the threads currently participating in loops. + // because any other threads should have returned already. + unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + + // thread with lowest lane id among peers writes out + if (is_leader && v != 0.0) { + // this conditional should be uniform, since rev is constant + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; + write_func(out + idx, v); + } + + c = 0.0; + } + + if (next_row_b != -1) { + ind = ind_next; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } + } + + cur_row_b = next_row_b; + } + } +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh new file mode 100644 index 000000000..457b25eea --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../common.hpp" +#include "../coo_spmv_kernel.cuh" +#include "../utils.cuh" +#include "coo_mask_row_iterators.cuh" + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +/** + * Base class for the COO SpMV strategies (e.g. dense_smem_strategy, hash_strategy). + * It keeps a reference to the distances configuration, caches the value returned by + * raft::getSharedMemPerBlock() in `smem`, and provides the two launch helpers below. + * The referenced configuration must outlive the strategy object. + */ +template +class coo_spmv_strategy { + public: + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { + smem = raft::getSharedMemPerBlock(); + } + /** + * Launches balanced_coo_generalized_spmv_kernel with A addressed through + * `a_indptr` and B passed in COO form (`coo_rows_b`, config.b_indices, + * config.b_data). Before the launch the kernel's cache configuration is set + * to cudaFuncCachePreferShared. + */ + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); + } + /** + * Mirror of _dispatch_base with the roles of A and B swapped: B is addressed + * through `b_indptr` and A is passed in COO form (`coo_rows_a`, + * config.a_indices, config.a_data). + */ + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + +
balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); + } + + protected: + int smem; + const distances_config_t& config; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh new file mode 100644 index 000000000..a9040e1d8 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include "../common.hpp" +#include "../utils.cuh" + +#include // raft::ceildiv + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +/** + * Row iterator over a CSR `indptr` array. When `mask_row_idx` is given, the row + * handled by a block is looked up through it; otherwise the row is derived + * directly from blockIdx.x. The chunk-related hooks do nothing here and are + * re-implemented by chunked_mask_row_it. + */ +template +class mask_row_it { + public: + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + if (mask_row_idx != NULL) { + return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; + } else { + return blockIdx.x / n_blocks_nnz_b; + } + } + /** Returns the first and last (inclusive) nnz offsets of `row_idx`; the chunk flags are left untouched. */ + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_offset = full_indptr[row_idx]; + stop_offset = full_indptr[row_idx + 1] - 1; + } + + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + // do nothing; + } + + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return true; + } + /* NOTE(review): `n_rows` is a reference member bound to the constructor argument, so the referenced object must outlive this iterator; no method of this class reads it. */ + const value_idx *full_indptr, &n_rows; + value_idx* mask_row_idx; +}; +/** + * For each row `tid` < n_rows, writes `tid` into chunk_indices[start, end), where + * start and end are consecutive entries of `n_chunks_per_row` (read here as an + * offsets array with n_rows + 1 entries). + */ +template +RAFT_KERNEL fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < n_rows) { + auto start = n_chunks_per_row[tid]; + auto end = n_chunks_per_row[tid + 1]; + +#pragma unroll + for (int i = start; i < end; i++) { + chunk_indices[i] = tid; + } + } +} +/** + * Variant of mask_row_it that splits each masked row into chunks of at most + * `row_chunk_size` nonzeros; each group of `n_blocks_nnz_b` consecutive blocks + * is mapped to one chunk of one row. + */ +template +class chunked_mask_row_it : public mask_row_it { + public: + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx&
n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, + const cudaStream_t stream_) + : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), + row_chunk_size(row_chunk_size_), + n_chunks_per_row(n_chunks_per_row_), + chunk_indices(chunk_indices_), + stream(stream_) + { + } + /** + * Fills `n_chunks_per_row` with the running offsets of the per-row chunk counts + * (element 0 is set to 0), copies the total into the static `total_row_blocks`, + * and populates `chunk_indices` with the masked-row position owning each chunk. + */ + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto policy = rmm::exec_policy(stream); + + constexpr value_idx first_element = 0; + n_chunks_per_row.set_element_async(0, first_element, stream); + n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); + + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); + /* NOTE(review): total_row_blocks is read on the host by fill_chunk_indices() right after this copy with no visible stream synchronization - confirm the copy has completed by then. */ + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); + + fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; + } + /** Computes the inclusive [start_offset, stop_offset] nnz range of this block's chunk and whether it is the first / last chunk of the row. */ + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; + auto relative_chunk = chunk_index - prev_n_chunks; + first_a_chunk = relative_chunk == 0; + + start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; + stop_offset = start_offset + row_chunk_size; + + auto final_stop_offset = this->full_indptr[row_idx + 1]; + +
last_a_chunk = stop_offset >= final_stop_offset; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + } + + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + } + + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return (index_b >= start_index_a && index_b <= stop_index_a); + } + /* Total number of chunks written by the most recent init(); static, hence shared by all instances of the same class instantiation. */ + inline static value_idx total_row_blocks = 0; + const cudaStream_t stream; + const value_idx *n_chunks_per_row, *chunk_indices; + value_idx row_chunk_size; + + struct n_chunks_per_row_functor { + public: + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } + + __host__ __device__ value_idx operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + return raft::ceildiv(degree, (value_idx)row_chunk_size); + } + + const value_idx* indptr; + value_idx row_chunk_size; + }; + + private: + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto n_threads = std::min(n_rows, 256); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + + chunk_indices.resize(total_row_blocks, stream); + + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh new
file mode 100644 index 000000000..baa913a6c --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "base_strategy.cuh" + +#include // raft::ceildiv + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +/** + * Strategy that caches a row as a dense array (`smem_type` is value_t*) indexed + * directly by column id: init_insert() zero-fills the cache, insert() writes + * cache[key] and find() reads cache[key]. + */ +template +class dense_smem_strategy : public coo_spmv_strategy { + public: + using smem_type = value_t*; + using insert_type = smem_type; + using find_type = smem_type; + + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } + /** Bytes needed for `n_cols` values plus (1024 / warp_size) additional values. */ + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); + } + /** + * Computes the grid size as a_nrows * ceildiv(b_nnz, chunk_size * 1024) and + * forwards to _dispatch_base with an unmasked row iterator over A. + */ + template + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + /** + * Same as dispatch() with A and B swapped: the grid size is + * b_nrows * ceildiv(a_nnz, chunk_size * 1024) and the call is forwarded to + * _dispatch_base_rev with an unmasked row iterator over B. + */ + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, +
product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + /** Zero-fills the first `cache_size` entries, with the threads of the block striding over the array. */ + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { + cache[k] = 0.0; + } + return cache; + } + + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + cache[key] = value; + } + + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return cache; + } + + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh new file mode 100644 index 000000000..cf212076b --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "base_strategy.cuh" + +#include +#include + +#include +#include +#include + +// this is needed by cuco as key, value must be bitwise comparable. +// compilers don't declare float/double as bitwise comparable +// but that is too strict +// for example, the following is true (or 0): +// float a = 5; +// float b = 5; +// memcmp(&a, &b, sizeof(float)); +CUCO_DECLARE_BITWISE_COMPARABLE(float); +CUCO_DECLARE_BITWISE_COMPARABLE(double); + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +/** + * Strategy that caches a row in a cuco::legacy::static_map keyed by column + * index. Rows whose degree is at least capacity_threshold * map_size are + * processed in chunks through chunked_mask_row_it (see dispatch()). + */ +template +class hash_strategy : public coo_spmv_strategy { + public: + using insert_type = typename cuco::legacy:: + static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; + using find_type = + typename cuco::legacy::static_map::device_view; + + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) + : coo_spmv_strategy(config_), + capacity_threshold(capacity_threshold_), + map_size(map_size_) + { + } + /** + * Writes into `mask_indptr` first the ids of the rows whose degree is below + * capacity_threshold * map_size, then the ids of the rows at or above it, and + * returns the two counts through `n_rows_divided`. The `stream` argument is + * not used here; the thrust policy is taken from the handle. + */ + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { + auto policy = raft::resource::get_thrust_policy(this->config.handle); + + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + std::get<0>(n_rows_divided) = less - mask_indptr.data(); + + auto more = thrust::copy_if( + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); + std::get<1>(n_rows_divided) = more - less; + } + /** + * Forward pass over the rows of A: rows that fit in the hash table are launched + * through a mask_row_it, the remaining rows through a chunked_mask_row_it. + */ + template + void dispatch(value_t*
out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.a_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); + } + } + /** + * Same as dispatch() with A and B swapped: the rows of B are partitioned and + * launched through _dispatch_base_rev. + */ + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f
accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.b_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); + } + } + /** Builds a mutable map view over `cache`, using -1 as the empty key and 0 as the empty value. */ + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + return
insert_type::make_from_uninitialized_slots(cooperative_groups::this_thread_block(), + cache, + cache_size, + cuco::empty_key{value_idx{-1}}, + cuco::empty_value{value_t{0}}); + } + /** Inserts (key, value); the result of the insertion is not checked. */ + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + auto success = cache.insert(cuco::pair(key, value)); + } + /** Builds a read-only map view over `cache` with the same empty key / value sentinels as init_insert(). */ + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return find_type( + cache, cache_size, cuco::empty_key{value_idx{-1}}, cuco::empty_value{value_t{0}}); + } + /** Returns the value stored for `key`, or 0 when the key is absent. */ + __device__ inline value_t find(find_type cache, const value_idx& key) + { + auto a_pair = cache.find(key); + + value_t a_col = 0.0; + if (a_pair != cache.end()) { a_col = a_pair->second; } + return a_col; + } + /* Predicate: true for rows whose degree lies in [degree_l, degree_r). */ + struct fits_in_hash_table { + public: + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } + + __host__ __device__ bool operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + + return degree >= degree_l && degree < degree_r; + } + + private: + const value_idx* indptr; + const value_idx degree_l, degree_r; + }; + /** Number of map slots that fit in raft::getSharedMemPerBlock() bytes after reserving (tpb / warp_size) values. */ + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(typename insert_type::slot_type); + } + + private: + float capacity_threshold; + int map_size; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/ip_distance.cuh b/cpp/src/distance/detail/sparse/ip_distance.cuh new file mode 100644 index 000000000..3a11d4e99 --- /dev/null +++ b/cpp/src/distance/detail/sparse/ip_distance.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +/** + * Sparse pairwise inner product between the rows of A and the rows of B. + * Holds a pointer to the configuration (which must outlive this object) and + * the COO row indices of B. + */ +template +class ip_distances_t : public distances_t { + public: + /** + * Computes simple sparse inner product distances as sum(x_k * y_k). + * The constructor converts the CSR indptr of B into COO row indices on the + * stream of the handle. + * @param[in] config specifies inputs, outputs, and sizes + */ + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, raft::resource::get_cuda_stream(config.handle)) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + } + + /** + * Performs pairwise distance computation and computes output distances + * @param[out] out_distances dense output matrix (size a_nrows * b_nrows) + */ + void compute(value_t* out_distances) + { + /** + * Compute pairwise distances and return dense matrix in row-major format: + * products are formed with mul_op, accumulated with add_op and written + * out with atomic_add_op. + */ + balanced_coo_pairwise_generalized_spmv(out_distances, + *config_, + coo_rows_b.data(), + raft::mul_op(), + raft::add_op(), + raft::atomic_add_op()); + } + /** COO row indices of B computed in the constructor. */ + value_idx* b_rows_coo() { return coo_rows_b.data(); } + /** Nonzero values of B, taken unchanged from the configuration. */ + value_t* b_data_coo() { return config_->b_data; } + + private: + const distances_config_t* config_; + rmm::device_uvector coo_rows_b; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git
a/cpp/src/distance/detail/sparse/l2_distance.cuh b/cpp/src/distance/detail/sparse/l2_distance.cuh new file mode 100644 index 000000000..40e7070fc --- /dev/null +++ b/cpp/src/distance/detail/sparse/l2_distance.cuh @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +// @TODO: Move this into sparse prims (coo_norm) +/** Accumulates data[i]^2 into out[coo_rows[i]] for every nonzero i < nnz; `out` must be zeroed beforehand. */ +template +RAFT_KERNEL compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } +} + +/** Accumulates data[i] into out[coo_rows[i]] for every nonzero i < nnz; `out` must be zeroed beforehand. */ +template +RAFT_KERNEL compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } +} + +/** Applies expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]) in place to every element (i, j) of the row-major matrix C. */ +template +RAFT_KERNEL compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + // widen before multiplying so the flat index cannot wrap in 32-bit arithmetic +
std::size_t tid = (std::size_t)blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + + // e.g. Euclidean expansion func = -2.0 * dot + q_norm + r_norm + value_t val = expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +/** Converts the inner products stored in C in place into correlation distances, using per-row sums and sums of squares over `n` columns. */ +template +RAFT_KERNEL compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ + // widen before multiplying so the flat index cannot wrap in 32-bit arithmetic + std::size_t tid = (std::size_t)blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + value_t Q_l1 = Q_norms[i]; + value_t R_l1 = R_norms[j]; + + value_t Q_l2 = Q_sq_norms[i]; + value_t R_l2 = R_sq_norms[j]; + + value_t numer = n * dot - (Q_l1 * R_l1); + value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); + value_t R_denom = n * R_l2 - (R_l1 * R_l1); + + value_t val = 1 - (numer / raft::sqrt(Q_denom * R_denom)); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_euclidean_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); +} + +/** + * Computes the squared row norms of Q (m rows) and R (n rows) from their COO + * representation and applies `expansion_func` to the m x n matrix of inner + * products already stored in `out`. All work, including the zero-fill of the + * temporary norm buffers, is enqueued on `stream`. + */ +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, +
expansion_f expansion_func) +{ + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + // zero the accumulators on the same stream that owns the buffers + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t), stream)); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( + R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); +} + +template +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_correlation_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); +} + +/** + * Computes per-row sums and sums of squares of Q (m rows) and R (n rows) from + * their COO representation and converts the m x n inner products stored in + * `out` into correlation distances. All work, including the zero-fill of the + * temporary buffers, is enqueued on `stream`. + */ +template +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, + cudaStream_t stream) +{ + // sum_sq for std dev + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + + // sum for mean + rmm::device_uvector Q_norms(m, stream); + rmm::device_uvector R_norms(n, stream); + + // zero the accumulators on the same stream that owns the buffers + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t), stream)); + + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t), stream)); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( + R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + +
compute_row_sum_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_sum_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); +} + +/** + * L2 distance using the expanded form: sum(x_k^2) + sum(y_k^2) - 2 * sum(x_k * y_k) + * The expanded form is more efficient for sparse data. + */ +template +class l2_expanded_distances_t : public distances_t { + public: + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); + } + + ~l2_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * L2 sqrt distance performing the sqrt operation after the distance computation + * The expanded form is more efficient for sparse data.
+ */ +template +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { + public: + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } + + void compute(value_t* out_dists) override + { + l2_expanded_distances_t::compute(out_dists); + // Signed sqrt post-processing: sqrt(|d|) * sign(d), so that slightly negative + // values left by floating-point cancellation do not turn into NaN. + raft::linalg::unaryOp( + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, + [] __device__(value_t input) { + int neg = input < 0 ? -1 : 1; + return raft::sqrt(abs(input)) * neg; + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } + + ~l2_sqrt_expanded_distances_t() = default; +}; + +/** + * Correlation distance: the inner products are computed first and then combined + * with the per-row sums and sums of squares of A and B by compute_corr(). + */ +template +class correlation_expanded_distances_t : public distances_t { + public: + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~correlation_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * Cosine distance using the expanded form: + * 1 - (sum(x_k * y_k) / (sqrt(sum(x_k^2)) * sqrt(sum(y_k^2)))) + * The expanded form is more efficient for sparse data.
+ */ +template +class cosine_expanded_distances_t : public distances_t { + public: + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + /* compute_l2 supplies the squared row norms; the functor below turns (dot, q_norm, r_norm) into 1 - cosine similarity. */ + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = raft::sqrt(q_norm) * raft::sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0, so that two empty rows get distance 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); + } + + ~cosine_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Hellinger distance using the expanded form: sqrt(1 - sum(sqrt(x_k) * sqrt(y_k))) + * The expanded form is more efficient for sparse data. + * + * This distance computation modifies A and B by computing a sqrt + * and then performing a `pow(x, 2)` to convert it back. Because of this, + * it is possible that the values in A and B might differ slightly + * after this is invoked. + * + * NOTE(review): no write to A or B is visible in compute(); the sqrt is applied + * inside the product functor. The paragraph above may be stale - confirm.
+ */ +template +class hellinger_expanded_distances_t : public distances_t { + public: + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, raft::resource::get_cuda_stream(config.handle)) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return raft::sqrt(a) * raft::sqrt(b); }, + raft::add_op(), + raft::atomic_add_op()); + + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + bool rectifier = (1 - input) > 0; + return raft::sqrt(rectifier * (1 - input)); + }, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~hellinger_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; +}; + +template +class russelrao_expanded_distances_t : public distances_t { + public: + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_t n_cols = config_->a_ncols; + value_t n_cols_inv = 1.0 / n_cols; + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, + raft::resource::get_cuda_stream(config_->handle)); + + auto exec_policy = 
rmm::exec_policy(raft::resource::get_cuda_stream(config_->handle)); + auto diags = thrust::counting_iterator(0); + value_idx b_nrows = config_->b_nrows; + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); + } + + ~russelrao_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/lp_distance.cuh b/cpp/src/distance/detail/sparse/lp_distance.cuh new file mode 100644 index 000000000..18e7b04e4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/lp_distance.cuh @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); + + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv_rev( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); +} + +/** + * Computes L1 distances for sparse input. This does not have + * an equivalent expanded form, so it is only executed in + * an unexpanded form. 
+ * @tparam value_idx + * @tparam value_t + */ +template +class l1_unexpanded_distances_t : public distances_t { + public: + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class l2_unexpanded_distances_t : public distances_t { + public: + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::sqdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + protected: + const distances_config_t* config_; +}; + +template +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { + public: + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } + + void compute(value_t* out_dists) + { + l2_unexpanded_distances_t::compute(out_dists); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + // Sqrt Post-processing + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [] __device__(value_t input) { + int neg = input < 0 ? 
-1 : 1; + return raft::sqrt(abs(input) * neg); + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } +}; + +template +class linf_unexpanded_distances_t : public distances_t { + public: + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::max_op(), raft::atomic_max_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class canberra_unexpanded_distances_t : public distances_t { + public: + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t d = fabs(a) + fabs(b); + + // deal with potential for 0 in denominator by + // forcing 1/0 instead + return ((d != 0) * fabs(a - b)) / (d + (d == 0)); + }, + raft::add_op(), + raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class lp_unexpanded_distances_t : public distances_t { + public: + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + raft::compose_op(raft::pow_const_op(p), raft::sub_op()), + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + value_t one_over_p = value_t{1} / p; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::pow_const_op(one_over_p), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; + value_t p; +}; + +template +class hamming_unexpanded_distances_t : public distances_t { + public: + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : 
config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::notequal_op(), raft::add_op(), raft::atomic_add_op()); + + uint64_t n = (uint64_t)config_->a_nrows * (uint64_t)config_->b_nrows; + value_t n_cols = 1.0 / config_->a_ncols; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(n_cols), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class jensen_shannon_unexpanded_distances_t : public distances_t { + public: + explicit jensen_shannon_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t m = 0.5f * (a + b); + bool a_zero = a == 0; + bool b_zero = b == 0; + + value_t x = (!a_zero * m) / (a_zero + a); + value_t y = (!b_zero * m) / (b_zero + b); + + bool x_zero = x == 0; + bool y_zero = y == 0; + + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); + }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [=] __device__(value_t input) { return raft::sqrt(0.5 * input); }, + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class kl_divergence_unexpanded_distances_t : public distances_t { + public: + explicit kl_divergence_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + 
raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(0.5), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/utils.cuh b/cpp/src/distance/detail/sparse/utils.cuh new file mode 100644 index 000000000..dc7ae6df6 --- /dev/null +++ b/cpp/src/distance/detail/sparse/utils.cuh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +/** + * Computes the maximum number of columns that can be stored + * in shared memory in dense form with the given block size + * and precision. 
+ * @return the maximum number of columns that can be stored in smem + */ +template +inline int max_cols_per_block() +{ + // max cols = (total smem available - cub reduction smem) + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(value_t); +} + +template +RAFT_KERNEL faster_dot_on_csr_kernel(dot_t* __restrict__ dot, + const value_idx* __restrict__ indptr, + const value_idx* __restrict__ cols, + const value_t* __restrict__ A, + const value_t* __restrict__ B, + const value_idx nnz, + const value_idx n_rows, + const value_idx dim) +{ + auto vec_id = threadIdx.x; + auto lane_id = threadIdx.x & 0x1f; + + extern __shared__ char smem[]; + value_t* s_A = (value_t*)smem; + value_idx cur_row = -1; + + for (int row = blockIdx.x; row < n_rows; row += gridDim.x) { + for (int dot_id = blockIdx.y + indptr[row]; dot_id < indptr[row + 1]; dot_id += gridDim.y) { + if (dot_id >= nnz) { return; } + const value_idx col = cols[dot_id] * dim; + const value_t* __restrict__ B_col = B + col; + + if (threadIdx.x == 0) { dot[dot_id] = 0.0; } + __syncthreads(); + + if (cur_row != row) { + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + s_A[k] = A[row * dim + k]; + } + cur_row = row; + } + + dot_t l_dot_ = 0.0; + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + asm("prefetch.global.L2 [%0];" ::"l"(B_col + k + blockDim.x)); + if constexpr ((std::is_same_v && std::is_same_v)) { + l_dot_ += __half2float(s_A[k]) * __half2float(__ldcg(B_col + k)); + } else { + l_dot_ += s_A[k] * __ldcg(B_col + k); + } + } + + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + dot_t warp_sum = WarpReduce(temp_storage).Sum(l_dot_); + + if (lane_id == 0) { atomicAdd_block(dot + dot_id, warp_sum); } + } + } +} + +template +void faster_dot_on_csr(raft::resources const& handle, + dot_t* dot, + const value_idx nnz, + const value_idx* indptr, + const value_idx* cols, + const value_t* A, + const value_t* B, + 
const value_idx n_rows, + const value_idx dim) +{ + if (nnz == 0 || n_rows == 0) return; + + auto stream = raft::resource::get_cuda_stream(handle); + + constexpr value_idx MAX_ROW_PER_ITER = 500; + int dev_id, sm_count, blocks_per_sm; + + const int smem_size = dim * sizeof(value_t); + cudaGetDevice(&dev_id); + cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); + + if (dim < 128) { + constexpr int tpb = 64; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + + } else if (dim < 256) { + constexpr int tpb = 128; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else if (dim < 512) { + constexpr int tpb = 256; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else { + constexpr int tpb = 512; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 
blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } + + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/sparse_distance.cu b/cpp/src/distance/sparse_distance.cu new file mode 100644 index 000000000..338c4e908 --- /dev/null +++ b/cpp/src/distance/sparse_distance.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "sparse_distance.cuh" + +namespace cuvs { +namespace distance { + +template +void pairwise_distance( + raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f) +{ + auto x_structure = x.structure_view(); + auto y_structure = y.structure_view(); + + RAFT_EXPECTS(x_structure.get_n_cols() == y_structure.get_n_cols(), + "Number of columns must be equal"); + + RAFT_EXPECTS(dist.extent(0) == x_structure.get_n_rows(), + "Number of rows in output must be equal to " + "number of rows in X"); + RAFT_EXPECTS(dist.extent(1) == y_structure.get_n_rows(), + "Number of columns in output must be equal to " + "number of rows in Y"); + + detail::sparse::distances_config_t input_config(handle); + input_config.a_nrows = x_structure.get_n_rows(); + input_config.a_ncols = x_structure.get_n_cols(); + input_config.a_nnz = x_structure.get_nnz(); + input_config.a_indptr = const_cast(x_structure.get_indptr().data()); + input_config.a_indices = const_cast(x_structure.get_indices().data()); + input_config.a_data = const_cast(x.get_elements().data()); + + input_config.b_nrows = y_structure.get_n_rows(); + input_config.b_ncols = y_structure.get_n_cols(); + input_config.b_nnz = y_structure.get_nnz(); + input_config.b_indptr = const_cast(y_structure.get_indptr().data()); + input_config.b_indices = const_cast(y_structure.get_indices().data()); + input_config.b_data = const_cast(y.get_elements().data()); + + pairwiseDistance(dist.data_handle(), input_config, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + 
raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/sparse_distance.cuh b/cpp/src/distance/sparse_distance.cuh new file mode 100644 index 000000000..0d6dc0e6f --- /dev/null +++ b/cpp/src/distance/sparse_distance.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/sparse/bin_distance.cuh" +#include "detail/sparse/common.hpp" +#include "detail/sparse/ip_distance.cuh" +#include "detail/sparse/l2_distance.cuh" +#include "detail/sparse/lp_distance.cuh" + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +/** + * Compute pairwise distances between A and B, using the provided + * input configuration and distance function. 
+ * + * @tparam value_idx index type + * @tparam value_t value type + * @param[out] out dense output array (size A.nrows * B.nrows) + * @param[in] input_config input argument configuration + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwiseDistance(value_t* out, + detail::sparse::distances_config_t input_config, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + detail::sparse::l2_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtExpanded: + detail::sparse::l2_sqrt_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::InnerProduct: + detail::sparse::ip_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2Unexpanded: + detail::sparse::l2_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtUnexpanded: + detail::sparse::l2_sqrt_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L1: + detail::sparse::l1_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::LpUnexpanded: + detail::sparse::lp_unexpanded_distances_t(input_config, metric_arg) + .compute(out); + break; + case cuvs::distance::DistanceType::Linf: + detail::sparse::linf_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::Canberra: + detail::sparse::canberra_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::JaccardExpanded: + detail::sparse::jaccard_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CosineExpanded: + detail::sparse::cosine_expanded_distances_t(input_config).compute(out); + break; + case 
cuvs::distance::DistanceType::HellingerExpanded: + detail::sparse::hellinger_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::DiceExpanded: + detail::sparse::dice_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CorrelationExpanded: + detail::sparse::correlation_expanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::RusselRaoExpanded: + detail::sparse::russelrao_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::HammingUnexpanded: + detail::sparse::hamming_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::JensenShannon: + detail::sparse::jensen_shannon_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::KLDivergence: + detail::sparse::kl_divergence_unexpanded_distances_t(input_config) + .compute(out); + break; + + default: THROW("Unsupported distance: %d", metric); + } +} +}; // namespace distance +}; // namespace cuvs diff --git a/cpp/src/neighbors/detail/sparse_knn.cuh b/cpp/src/neighbors/detail/sparse_knn.cuh new file mode 100644 index 000000000..9c8e971b9 --- /dev/null +++ b/cpp/src/neighbors/detail/sparse_knn.cuh @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "../../distance/sparse_distance.cuh" +#include "knn_merge_parts.cuh" +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::detail { + +template +struct csr_batcher_t { + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) + : batch_start_(0), + batch_stop_(0), + batch_rows_(0), + total_rows_(n_rows), + batch_size_(batch_size), + csr_indptr_(csr_indptr), + csr_indices_(csr_indices), + csr_data_(csr_data), + batch_csr_start_offset_(0), + batch_csr_stop_offset_(0) + { + } + + void set_batch(int batch_num) + { + batch_start_ = batch_num * batch_size_; + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing + + batch_rows_ = (batch_stop_ - batch_start_) + 1; + } + + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); + + return batch_csr_stop_offset_ - batch_csr_start_offset_; + } + + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); + } + + value_idx batch_rows() const { return batch_rows_; } + + value_idx batch_start() const { return batch_start_; } + + value_idx batch_stop() const { return batch_stop_; } + + private: + value_idx batch_size_; + value_idx batch_start_; + value_idx batch_stop_; + value_idx batch_rows_; + + value_idx total_rows_; + + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* 
csr_data_; + + value_idx batch_csr_start_offset_; + value_idx batch_csr_stop_offset_; +}; + +template +class sparse_knn_t { + public: + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + raft::resources const& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + cuvs::distance::DistanceType metric_ = cuvs::distance::DistanceType::L2Expanded, + float metricArg_ = 0) + : idxIndptr(idxIndptr_), + idxIndices(idxIndices_), + idxData(idxData_), + idxNNZ(idxNNZ_), + n_idx_rows(n_idx_rows_), + n_idx_cols(n_idx_cols_), + queryIndptr(queryIndptr_), + queryIndices(queryIndices_), + queryData(queryData_), + queryNNZ(queryNNZ_), + n_query_rows(n_query_rows_), + n_query_cols(n_query_cols_), + output_indices(output_indices_), + output_dists(output_dists_), + k(k_), + handle(handle_), + batch_size_index(batch_size_index_), + batch_size_query(batch_size_query_), + metric(metric_), + metricArg(metricArg_) + { + } + + void run() + { + using namespace raft::sparse; + + int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); + csr_batcher_t query_batcher( + batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData); + + size_t rows_processed = 0; + + for (int i = 0; i < n_batches_query; i++) { + /** + * Compute index batch info + */ + query_batcher.set_batch(i); + + /** + * Slice CSR to rows in batch + */ + + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + + value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( + query_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + 
rmm::device_uvector query_batch_indices(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector query_batch_data(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + + query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), + query_batch_data.data(), + raft::resource::get_cuda_stream(handle)); + + // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent + // batches and 1 space for the results of the merge, which get copied back to the top + rmm::device_uvector merge_buffer_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector merge_buffer_dists(0, raft::resource::get_cuda_stream(handle)); + + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; + + int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); + csr_batcher_t idx_batcher( + batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData); + + for (int j = 0; j < n_batches_idx; j++) { + idx_batcher.set_batch(j); + + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + + /** + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_data(0, raft::resource::get_cuda_stream(handle)); + + value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( + idx_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + idx_batch_indices.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + idx_batch_data.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + + idx_batcher.get_batch_csr_indices_data( + idx_batch_indices.data(), idx_batch_data.data(), 
raft::resource::get_cuda_stream(handle)); + + /** + * Compute distances + */ + uint64_t dense_size = + (uint64_t)idx_batcher.batch_rows() * (uint64_t)query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, + raft::resource::get_cuda_stream(handle)); + + RAFT_CUDA_TRY(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); + + // Build batch indices array + rmm::device_uvector batch_indices(batch_dists.size(), + raft::resource::get_cuda_stream(handle)); + + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + iota_fill( + batch_indices.data(), batch_rows, batch_cols, raft::resource::get_cuda_stream(handle)); + + /** + * Perform k-selection on batch & merge with other k-selections + */ + size_t merge_buffer_offset = batch_rows * k; + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, + indices_merge_buffer_ptr); + + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + + // Merge results of difference batches if necessary + if (idx_batcher.batch_start() > 0) { + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + 
dists_merge_buffer_tmp_ptr, + indices_merge_buffer_tmp_ptr); + } + + // copy merged output back into merge buffer partition for next iteration + raft::copy_async(merge_buffer_indices.data(), + indices_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(merge_buffer_dists.data(), + dists_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + } + + // Copy final merged batch to output array + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + + rows_processed += query_batcher.batch_rows(); + } + } + + private: + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + rmm::device_uvector trans(id_ranges.size(), raft::resource::get_cuda_stream(handle)); + raft::update_device( + trans.data(), id_ranges.data(), id_ranges.size(), raft::resource::get_cuda_stream(handle)); + + // combine merge buffers only if there's more than 1 partition to combine + cuvs::neighbors::detail::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + raft::resource::get_cuda_stream(handle), + trans.data()); + } + + void perform_k_selection(csr_batcher_t idx_batcher, + csr_batcher_t query_batcher, + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { + // populate batch indices array + value_idx batch_rows = 
query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + // in the case where the number of idx rows in the batch is < k, we + // want to adjust k. + value_idx n_neighbors = std::min(static_cast(k), batch_cols); + + bool ascending = cuvs::distance::is_min_close(metric); + + // kernel to slice first (min) k cols and copy into batched merge buffer + cuvs::selection::select_k( + handle, + raft::make_device_matrix_view(batch_dists, batch_rows, batch_cols), + raft::make_device_matrix_view( + batch_indices, batch_rows, batch_cols), + raft::make_device_matrix_view(out_dists, batch_rows, n_neighbors), + raft::make_device_matrix_view(out_indices, batch_rows, n_neighbors), + ascending, + true); + } + + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { + /** + * Compute distances + */ + cuvs::distance::detail::sparse::distances_config_t dist_config(handle); + dist_config.b_nrows = idx_batcher.batch_rows(); + dist_config.b_ncols = n_idx_cols; + dist_config.b_nnz = idx_batch_nnz; + + dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indices = idx_batch_indices; + dist_config.b_data = idx_batch_data; + + dist_config.a_nrows = query_batcher.batch_rows(); + dist_config.a_ncols = n_query_cols; + dist_config.a_nnz = query_batch_nnz; + + dist_config.a_indptr = query_batch_indptr; + dist_config.a_indices = query_batch_indices; + dist_config.a_data = query_batch_data; + + cuvs::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); + } + + const value_idx *idxIndptr, *idxIndices, 
*queryIndptr, *queryIndices; + value_idx* output_indices; + const value_t *idxData, *queryData; + value_t* output_dists; + + size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; + + cuvs::distance::DistanceType metric; + + float metricArg; + + int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; + + raft::resources const& handle; +}; + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/sparse_brute_force.cu b/cpp/src/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..e277961ec --- /dev/null +++ b/cpp/src/neighbors/sparse_brute_force.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "detail/sparse_knn.cuh" + +namespace cuvs::neighbors::brute_force { +template +sparse_index::sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg) + : dataset_(dataset), metric_(metric), metric_arg_(metric_arg) +{ +} + +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + float metric_arg) -> cuvs::neighbors::brute_force::sparse_index +{ + return sparse_index(handle, dataset, metric, metric_arg); +} + +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view query, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + auto idx_structure = index.dataset().structure_view(); + auto query_structure = query.structure_view(); + int k = neighbors.extent(1); + + detail::sparse_knn_t(idx_structure.get_indptr().data(), + idx_structure.get_indices().data(), + index.dataset().get_elements().data(), + idx_structure.get_nnz(), + idx_structure.get_n_rows(), + idx_structure.get_n_cols(), + query_structure.get_indptr().data(), + query_structure.get_indices().data(), + query.get_elements().data(), + query_structure.get_nnz(), + query_structure.get_n_rows(), + query_structure.get_n_cols(), + neighbors.data_handle(), + distances.data_handle(), + k, + handle, + params.batch_size_index, + params.batch_size_query, + index.metric(), + index.metric_arg()) + .run(); +} +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 7754a5043..286d721d7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -94,7 +94,7 @@ endfunction() if(BUILD_TESTS) ConfigureTest( NAME NEIGHBORS_TEST PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu - neighbors/refine.cu GPUS 1 PERCENT 100 + neighbors/sparse_brute_force.cu 
neighbors/refine.cu GPUS 1 PERCENT 100 ) ConfigureTest( @@ -206,6 +206,7 @@ if(BUILD_TESTS) distance/dist_lp_unexp.cu distance/dist_russell_rao.cu distance/masked_nn.cu + distance/sparse_distance.cu sparse/neighbors/cross_component_nn.cu GPUS 1 diff --git a/cpp/test/distance/sparse_distance.cu b/cpp/test/distance/sparse_distance.cu new file mode 100644 index 000000000..f95487414 --- /dev/null +++ b/cpp/test/distance/sparse_distance.cu @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseDistanceInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + + cuvs::distance::DistanceType metric; + + float metric_arg = 0.0; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) +{ + return os; +} + +template +class SparseDistanceTest + : public ::testing::TestWithParam> { + public: + SparseDistanceTest() + : params(::testing::TestWithParam>::GetParam()), + indptr(0, resource::get_cuda_stream(handle)), + indices(0, resource::get_cuda_stream(handle)), + data(0, resource::get_cuda_stream(handle)), + out_dists(0, resource::get_cuda_stream(handle)), + out_dists_ref(0, resource::get_cuda_stream(handle)) + { + } + + void SetUp() override + { + make_data(); + + int out_size = static_cast(params.indptr_h.size() - 1) * + static_cast(params.indptr_h.size() - 1); + + out_dists.resize(out_size, resource::get_cuda_stream(handle)); + + auto out = raft::make_device_matrix_view( + out_dists.data(), + static_cast(params.indptr_h.size() - 1), + static_cast(params.indptr_h.size() - 1)); + + auto x_structure = raft::make_device_compressed_structure_view( + indptr.data(), + indices.data(), + static_cast(params.indptr_h.size() - 1), + params.n_cols, + static_cast(params.indices_h.size())); + auto x = raft::make_device_csr_matrix_view(data.data(), x_structure); + + cuvs::distance::pairwise_distance(handle, x, x, out, params.metric, params.metric_arg); + + RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); + } + + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref.data(), + out_dists.data(), + params.out_dists_ref_h.size(), + CompareApprox(1e-3))); + } + + protected: + void make_data() + { + 
std::vector indptr_h = params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + auto stream = resource::get_cuda_stream(handle); + indptr.resize(indptr_h.size(), stream); + indices.resize(indices_h.size(), stream); + data.resize(data_h.size(), stream); + + update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); + update_device(indices.data(), indices_h.data(), indices_h.size(), stream); + update_device(data.data(), data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + + out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream); + + update_device(out_dists_ref.data(), + out_dists_ref_h.data(), + out_dists_ref_h.size(), + resource::get_cuda_stream(handle)); + } + + raft::resources handle; + + // input data + rmm::device_uvector indptr, indices; + rmm::device_uvector data; + + // output data + rmm::device_uvector out_dists, out_dists_ref; + + SparseDistanceInputs params; +}; + +const std::vector> inputs_i32_f = { + {5, + {0, 0, 1, 2}, + + {1, 2}, + {0.5, 0.5}, + {0, 1, 1, 1, 0, 1, 1, 1, 0}, + cuvs::distance::DistanceType::CosineExpanded, + 0.0}, + {5, + {0, 0, 1, 2}, + + {1, 2}, + {1.0, 1.0}, + {0, 1, 1, 1, 0, 1, 1, 1, 0}, + cuvs::distance::DistanceType::JaccardExpanded, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Expanded, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, + cuvs::distance::DistanceType::InnerProduct, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 
28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Unexpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + 0.61547536, 0.68185144, 1., 0.}, + 
cuvs::distance::DistanceType::CosineExpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {0.0, + 0.42857142857142855, + 0.7142857142857143, + 0.75, + 0.2857142857142857, + 0.75, + 0.7142857142857143, + 0.5, + 1.0, + 0.6666666666666666, + 0.42857142857142855, + 0.0, + 0.75, + 0.625, + 0.375, + 0.42857142857142855, + 0.75, + 0.375, + 0.75, + 0.7142857142857143, + 0.7142857142857143, + 0.75, + 0.0, + 0.7142857142857143, + 0.42857142857142855, + 0.7142857142857143, + 0.6666666666666666, + 0.625, + 0.6666666666666666, + 1.0, + 0.75, + 0.625, + 0.7142857142857143, + 0.0, + 0.5, + 0.5714285714285714, + 1.0, + 0.8, + 0.5, + 0.6666666666666666, + 0.2857142857142857, + 0.375, + 0.42857142857142855, + 0.5, + 0.0, + 0.6666666666666666, + 0.7777777777777778, + 0.4444444444444444, + 0.7777777777777778, + 0.75, + 0.75, + 0.42857142857142855, + 0.7142857142857143, + 0.5714285714285714, + 0.6666666666666666, + 0.0, + 0.7142857142857143, + 0.5, + 0.5, + 0.8571428571428571, + 0.7142857142857143, + 0.75, + 0.6666666666666666, + 1.0, + 0.7777777777777778, + 0.7142857142857143, + 0.0, + 0.42857142857142855, + 0.8571428571428571, + 0.8333333333333334, + 0.5, + 0.375, + 0.625, + 0.8, + 0.4444444444444444, + 0.5, + 0.42857142857142855, + 0.0, + 0.7777777777777778, + 0.75, + 1.0, + 0.75, + 0.6666666666666666, + 0.5, + 0.7777777777777778, + 0.5, + 0.8571428571428571, + 0.7777777777777778, + 0.0, + 1.0, + 0.6666666666666666, + 0.7142857142857143, + 1.0, + 0.6666666666666666, + 0.75, + 0.8571428571428571, + 0.8333333333333334, + 0.75, + 1.0, + 0.0}, + cuvs::distance::DistanceType::JaccardExpanded, + 
0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 
7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + cuvs::distance::DistanceType::Canberra, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 
1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + cuvs::distance::DistanceType::LpUnexpanded, + 2.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 
0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + cuvs::distance::DistanceType::Linf, + 0.0}, + + {15, + {0, 5, 8, 9, 
15, 20, 26, 31, 34, 38, 45}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, + {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, + 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, + 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, + 1.00000000e+00, 7.33151506e-01, 1.00000000e+00, 9.86880955e-01, + 9.19154851e-01, 5.38849774e-01, 1.00000000e+00, 8.98332369e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 8.03303970e-01, + 6.64465915e-01, 8.69374690e-01, 1.00000000e+00, 1.00000000e+00, + 1.00000000e+00, 1.00000000e+00, 9.24116813e-01, 7.33151506e-01, + 8.03303970e-01, 0.00000000e+00, 8.16225843e-01, 9.39818306e-01, + 7.27700415e-01, 7.30155528e-01, 8.89451011e-01, 8.05419635e-01, + 9.90039274e-01, 1.00000000e+00, 6.64465915e-01, 8.16225843e-01, + 0.00000000e+00, 6.38804490e-01, 1.00000000e+00, 1.00000000e+00, + 9.52559809e-01, 9.53789212e-01, 7.97613546e-01, 9.86880955e-01, + 8.69374690e-01, 9.39818306e-01, 6.38804490e-01, 0.0, + 1.00000000e+00, 9.72569112e-01, 8.24907516e-01, 8.07933016e-01, + 8.91271059e-01, 9.19154851e-01, 1.00000000e+00, 7.27700415e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 7.63596268e-01, + 8.40131263e-01, 7.40428532e-01, 1.00000000e+00, 5.38849774e-01, + 1.00000000e+00, 7.30155528e-01, 1.00000000e+00, 9.72569112e-01, + 7.63596268e-01, 
0.00000000e+00, 1.00000000e+00, 7.95485011e-01, + 6.64669302e-01, 1.00000000e+00, 1.00000000e+00, 8.89451011e-01, + 9.52559809e-01, 8.24907516e-01, 8.40131263e-01, 1.00000000e+00, + 0.00000000e+00, 8.51370877e-01, 8.59439512e-01, 8.98332369e-01, + 1.00000000e+00, 8.05419635e-01, 9.53789212e-01, 8.07933016e-01, + 7.40428532e-01, 7.95485011e-01, 8.51370877e-01, 1.49011612e-08}, + // Dataset is L1 normalized into pdfs + cuvs::distance::DistanceType::HellingerExpanded, + 0.0}, + + {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + cuvs::distance::DistanceType::L1, + 0.0}, + {5, + {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, + {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, + 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, + {// dense output + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 
1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 1.88812175, 1.92660889, 0.24992619, 0.}, + cuvs::distance::DistanceType::CorrelationExpanded, + 0.0}, + {5, + {0, 1, 2, 4, 4, 5, 6, 7, 9, 9, 10}, + {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + cuvs::distance::DistanceType::RusselRaoExpanded, + 0.0}, + {5, + {0, 1, 1, 3, 3, 4, 4, 6, 9, 10, 10}, + {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 
0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + cuvs::distance::DistanceType::HammingUnexpanded, + 0.0}, + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 0.83255, 0.83255, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {2, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 0.5, 0.5}, + {0, 0.4645014, 0.4645014, 0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {3, + {0, 1, 2}, + {0, 0}, + {1.0, 1.0}, + {0.0, 0.0, 0.0, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 1.0, 1.0, 0.0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + {3, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 1.0, 1.0}, + {0, 0.333333, 0.333333, 0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + +}; + +typedef SparseDistanceTest SparseDistanceTestF; +TEST_P(SparseDistanceTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, + ::testing::ValuesIn(inputs_i32_f)); + +} // end namespace distance +} // end namespace cuvs diff --git a/cpp/test/neighbors/sparse_brute_force.cu b/cpp/test/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..cb68989d4 --- /dev/null +++ b/cpp/test/neighbors/sparse_brute_force.cu @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" + +#include +#include +#include + +#include +#include + +namespace cuvs { +namespace neighbors { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseKNNInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + std::vector out_indices_ref_h; + + int k; + + int batch_size_index = 2; + int batch_size_query = 2; + + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2SqrtExpanded; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ + return os; +} + +template +class SparseKNNTest : public ::testing::TestWithParam> { + public: + SparseKNNTest() + : params(::testing::TestWithParam>::GetParam()), + indptr(0, resource::get_cuda_stream(handle)), + indices(0, resource::get_cuda_stream(handle)), + data(0, resource::get_cuda_stream(handle)), + out_indices(0, resource::get_cuda_stream(handle)), + out_dists(0, resource::get_cuda_stream(handle)), + out_indices_ref(0, resource::get_cuda_stream(handle)), + out_dists_ref(0, resource::get_cuda_stream(handle)) + { + } + + protected: + void SetUp() override + { + n_rows = params.indptr_h.size() - 1; + nnz = params.indices_h.size(); + k = params.k; + + make_data(); + + auto index_structure = + raft::make_device_compressed_structure_view( + indptr.data(), indices.data(), n_rows, params.n_cols, nnz); + auto index_csr = raft::make_device_csr_matrix_view(data.data(), index_structure); + + auto index = cuvs::neighbors::brute_force::build(handle, index_csr, params.metric); + + cuvs::neighbors::brute_force::sparse_search_params search_params; + search_params.batch_size_index = params.batch_size_index; + search_params.batch_size_query = params.batch_size_query; + + cuvs::neighbors::brute_force::search( + handle, + search_params, + index, + index_csr, + raft::make_device_matrix_view(out_indices.data(), n_rows, k), + 
raft::make_device_matrix_view(out_dists.data(), n_rows, k)); + + RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); + } + + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); + } + + protected: + void make_data() + { + std::vector indptr_h = params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + auto stream = resource::get_cuda_stream(handle); + indptr.resize(indptr_h.size(), stream); + indices.resize(indices_h.size(), stream); + data.resize(data_h.size(), stream); + + update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); + update_device(indices.data(), indices_h.data(), indices_h.size(), stream); + update_device(data.data(), data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_indices_ref_h = params.out_indices_ref_h; + + out_indices_ref.resize(out_indices_ref_h.size(), stream); + out_dists_ref.resize(out_dists_ref_h.size(), stream); + + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); + + out_dists.resize(n_rows * k, stream); + out_indices.resize(n_rows * k, stream); + } + + raft::resources handle; + + int n_rows, nnz, k; + + // input data + rmm::device_uvector indptr, indices; + rmm::device_uvector data; + + // output data + rmm::device_uvector out_indices; + rmm::device_uvector out_dists; + + rmm::device_uvector out_indices_ref; + rmm::device_uvector out_dists_ref; + + SparseKNNInputs params; +}; + +const std::vector> inputs_i32_f = { + {9, // ncols + {0, 2, 4, 6, 8}, // indptr + {0, 4, 0, 3, 0, 2, 0, 8}, // indices + {0.0f, 1.0f, 5.0f, 6.0f, 5.0f, 6.0f, 0.0f, 1.0f}, // data + {0, 1.41421, 0, 7.87401, 
0, 7.87401, 0, 1.41421}, // dists + {0, 3, 1, 0, 2, 0, 3, 0}, // inds + 2, + 2, + 2, + cuvs::distance::DistanceType::L2SqrtExpanded}}; +typedef SparseKNNTest SparseKNNTestF; +TEST_P(SparseKNNTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); + +}; // end namespace neighbors +}; // end namespace cuvs From 710e9f5a541c518deffb91f75a87cd4fe1372a8a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 22 Nov 2024 09:25:27 -0500 Subject: [PATCH 32/47] Add `kIsSingleSource` to `PairwiseDistanceEpilogueElementwise` (#485) With raft having recently migrated to cutlass 3.5.1, this field is now required. Also remove `raft_cutlass` from symbol exclusions. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Micka (https://github.com/lowener) URL: https://github.com/rapidsai/cuvs/pull/485 --- .github/workflows/pr.yaml | 2 +- .github/workflows/test.yaml | 2 +- .../distance/detail/pairwise_distance_epilogue_elementwise.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e18e82df0..78648235f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -88,7 +88,7 @@ jobs: with: build_type: pull-request enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f60c0a34..27dc99a11 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-cpp-tests: secrets: inherit uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 diff --git a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h index f9955334d..f4a7feaba 100644 --- a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h +++ b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h @@ -61,6 +61,7 @@ class PairwiseDistanceEpilogueElementwise { using ElementT = ElementT_; static int const kElementsPerAccess = ElementsPerAccess; static int const kCount = kElementsPerAccess; + static bool const kIsSingleSource = true; using DistanceOp = DistanceOp_; using FinalOp = FinalOp_; From 96d98b12df0030bc21c8588e8905df9cdc00784e Mon Sep 17 00:00:00 2001 From: Azurethi Date: Sat, 23 Nov 2024 11:02:30 -0500 Subject: [PATCH 33/47] Fix broken link in README.md references (#473) Fixed the broken link for "Top-K Algorithms on GPU: A Comprehensive Study and New Methods" Authors: - https://github.com/Azurethi Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/473 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 572e8d098..23759f598 100755 --- a/README.md +++ b/README.md @@ -242,7 +242,7 @@ If you are interested in contributing to the cuVS library, please read our [Cont For the interested reader, many of the accelerated implementations in cuVS are also based on research papers which can provide a lot more background. We also ask you to please cite the corresponding algorithms by referencing them in your own research. 
- [CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search](https://arxiv.org/abs/2308.15136) -- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062>) +- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062) - [Fast K-NN Graph Construction by GPU Based NN-Descent](https://dl.acm.org/doi/abs/10.1145/3459637.3482344?casa_token=O_nan1B1F5cAAAAA:QHWDEhh0wmd6UUTLY9_Gv6c3XI-5DXM9mXVaUXOYeStlpxTPmV3nKvABRfoivZAaQ3n8FWyrkWw>) - [cuSLINK: Single-linkage Agglomerative Clustering on the GPU](https://arxiv.org/abs/2306.16354) - [GPU Semiring Primitives for Sparse Neighborhood Methods](https://arxiv.org/abs/2104.06357) From e1359e1a36ee48d2474a03a3b05c67b6610b220c Mon Sep 17 00:00:00 2001 From: Micka Date: Mon, 25 Nov 2024 21:09:26 +0100 Subject: [PATCH 34/47] Add serialization API to brute-force (#461) I noticed it was missing while switching Milvus to cuVS Authors: - Micka (https://github.com/lowener) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/461 --- .gitignore | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cuvs/neighbors/brute_force.h | 60 +++++ cpp/include/cuvs/neighbors/brute_force.hpp | 243 ++++++++++++++++++ cpp/src/neighbors/brute_force.cu | 15 ++ cpp/src/neighbors/brute_force_c.cpp | 55 +++- cpp/src/neighbors/brute_force_serialize.cu | 169 ++++++++++++ cpp/test/neighbors/ann_brute_force.cuh | 18 +- docs/source/c_api/neighbors_bruteforce_c.rst | 8 + docs/source/c_api/neighbors_hnsw_c.rst | 4 +- docs/source/c_api/neighbors_ivf_flat_c.rst | 8 + docs/source/c_api/neighbors_ivf_pq_c.rst | 8 + docs/source/cpp_api/neighbors_bruteforce.rst | 8 + .../python_api/neighbors_brute_force.rst | 10 + docs/source/python_api/neighbors_cagra.rst | 10 + docs/source/python_api/neighbors_hnsw.rst | 10 + docs/source/python_api/neighbors_ivf_flat.rst | 10 + docs/source/python_api/neighbors_ivf_pq.rst | 10 + .../cuvs/neighbors/brute_force/__init__.py | 4 +- .../neighbors/brute_force/brute_force.pxd | 8 + .../neighbors/brute_force/brute_force.pyx | 86 +++++++ python/cuvs/cuvs/test/test_serialization.py | 38 ++- 22 files changed, 767 insertions(+), 17 deletions(-) create mode 100644 cpp/src/neighbors/brute_force_serialize.cu diff --git a/.gitignore b/.gitignore index 97eab287d..da6eb07f6 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ compile_commands.json .clangd/ # serialized ann indexes +brute_force_index cagra_index ivf_flat_index ivf_pq_index diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 32093776c..eb2e7c7a4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -371,6 +371,7 @@ if(BUILD_SHARED_LIBS) src/distance/pairwise_distance.cu src/distance/sparse_distance.cu src/neighbors/brute_force.cu + src/neighbors/brute_force_serialize.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_half.cu src/neighbors/cagra_build_int8.cu diff --git a/cpp/include/cuvs/neighbors/brute_force.h 
b/cpp/include/cuvs/neighbors/brute_force.h index c9e172f62..33b92f11b 100644 --- a/cpp/include/cuvs/neighbors/brute_force.h +++ b/cpp/include/cuvs/neighbors/brute_force.h @@ -166,6 +166,66 @@ cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, * @} */ +/** + * @defgroup bruteforce_c_index_serialize BRUTEFORCE C-API serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsBruteForceBuild` + * cuvsBruteForceSerialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the file name for saving the index + * @param[in] index BRUTEFORCE index + * + */ +cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work.
+ * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // Deserialize an index previously built with `cuvsBruteForceBuild` + * cuvsBruteForceIndex_t index; + * cuvsBruteForceIndexCreate(&index); + * cuvsBruteForceDeserialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the name of the file that stores the index + * @param[out] index BRUTEFORCE index loaded from disk + */ +cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * @} + */ #ifdef __cplusplus } #endif diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp index ba67797ee..d040e03db 100644 --- a/cpp/include/cuvs/neighbors/brute_force.hpp +++ b/cpp/include/cuvs/neighbors/brute_force.hpp @@ -48,6 +48,14 @@ struct index : cuvs::neighbors::index { index& operator=(index&&) = default; ~index() = default; + /** + * @brief Construct an empty index. + * + * Constructs an empty index. This index will either need to be trained with `build` + * or loaded from a saved copy with `deserialize` + */ + index(raft::resources const& handle); + /** Construct a brute force index from dataset * * Constructs a brute force index from a dataset. This lets us precompute norms for @@ -479,4 +487,239 @@ void search(raft::resources const& handle, /** * @} */ + +/** + * @defgroup bruteforce_cpp_index_serialize Bruteforce index serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work.
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + * + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, &index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, &index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work.
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, &index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, &index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * @} + */ + } // namespace cuvs::neighbors::brute_force diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu index b0f87e9ac..d534676e3 100644 --- a/cpp/src/neighbors/brute_force.cu +++ b/cpp/src/neighbors/brute_force.cu @@ -21,6 +21,21 @@ #include namespace cuvs::neighbors::brute_force { + +template +index::index(raft::resources const& res) + // this constructor is just for a temporary index, for use in the deserialization + // api. all the parameters here will get replaced with loaded values - that aren't + // necessarily known ahead of time before deserialization. + // TODO: do we even need a handle here - could just construct one?
+ : cuvs::neighbors::index(), + metric_(cuvs::distance::DistanceType::L2Expanded), + dataset_(raft::make_device_matrix(res, 0, 0)), + norms_(std::nullopt), + metric_arg_(0) +{ +} + template index::index(raft::resources const& res, raft::host_matrix_view dataset, diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp index eda79aa31..f1a8c995d 100644 --- a/cpp/src/neighbors/brute_force_c.cpp +++ b/cpp/src/neighbors/brute_force_c.cpp @@ -17,10 +17,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -91,6 +93,22 @@ void _search(cuvsResources_t res, } } +template +void _serialize(cuvsResources_t res, const char* filename, cuvsBruteForceIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + cuvs::neighbors::brute_force::serialize(*res_ptr, std::string(filename), *index_ptr); +} + +template +void* _deserialize(cuvsResources_t res, const char* filename) +{ + auto res_ptr = reinterpret_cast(res); + auto index = new cuvs::neighbors::brute_force::index(*res_ptr); + cuvs::neighbors::brute_force::deserialize(*res_ptr, std::string(filename), index); + return index; +} } // namespace extern "C" cuvsError_t cuvsBruteForceIndexCreate(cuvsBruteForceIndex_t* index) @@ -129,7 +147,7 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res, if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) { index->addr = reinterpret_cast(_build(res, dataset_tensor, metric, metric_arg)); - index->dtype.code = kDLFloat; + index->dtype = dataset.dtype; } else { RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", dataset.dtype.code, @@ -174,3 +192,38 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, } }); } + +extern "C" cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + // read the numpy dtype from the beginning 
of the file + std::ifstream is(filename, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", filename); } + char dtype_string[4]; + is.read(dtype_string, 4); + auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4)); + + index->dtype.bits = dtype.itemsize * 8; + if (dtype.kind == 'f' && dtype.itemsize == 4) { + index->dtype.code = kDLFloat; + index->addr = reinterpret_cast(_deserialize(res, filename)); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} + +extern "C" cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { + _serialize(res, filename, *index); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} \ No newline at end of file diff --git a/cpp/src/neighbors/brute_force_serialize.cu b/cpp/src/neighbors/brute_force_serialize.cu new file mode 100644 index 000000000..1b5b5111e --- /dev/null +++ b/cpp/src/neighbors/brute_force_serialize.cu @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::brute_force { + +int constexpr serialization_version = 0; + +template +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset = true) +{ + RAFT_LOG_DEBUG( + "Saving brute force index, size %zu, dim %u", static_cast(index.size()), index.dim()); + + auto dtype_string = raft::detail::numpy_serializer::get_numpy_dtype().to_string(); + dtype_string.resize(4); + os << dtype_string; + + raft::serialize_scalar(handle, os, serialization_version); + raft::serialize_scalar(handle, os, index.size()); + raft::serialize_scalar(handle, os, index.dim()); + raft::serialize_scalar(handle, os, index.metric()); + raft::serialize_scalar(handle, os, index.metric_arg()); + raft::serialize_scalar(handle, os, include_dataset); + if (include_dataset) { raft::serialize_mdspan(handle, os, index.dataset()); } + auto has_norms = index.has_norms(); + raft::serialize_scalar(handle, os, has_norms); + if (has_norms) { raft::serialize_mdspan(handle, os, index.norms()); } + raft::resource::sync_stream(handle); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + 
std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +template +auto deserialize(raft::resources const& handle, std::istream& is) +{ + auto dtype_string = std::array{}; + is.read(dtype_string.data(), 4); + + auto ver = raft::deserialize_scalar(handle, is); + if (ver != serialization_version) { + RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver); + } + std::int64_t rows = raft::deserialize_scalar(handle, is); + std::int64_t dim = raft::deserialize_scalar(handle, is); + auto metric = raft::deserialize_scalar(handle, is); + auto metric_arg = raft::deserialize_scalar(handle, is); + + auto dataset_storage = raft::make_host_matrix(std::int64_t{}, std::int64_t{}); + auto include_dataset = raft::deserialize_scalar(handle, is); + if (include_dataset) { + dataset_storage = raft::make_host_matrix(rows, dim); + raft::deserialize_mdspan(handle, is, dataset_storage.view()); + } + + auto has_norms = raft::deserialize_scalar(handle, is); + auto norms_storage = has_norms ? std::optional{raft::make_host_vector(rows)} + : std::optional>{}; + // TODO(wphicks): Use mdbuffer here when available + auto norms_storage_dev = + has_norms ? 
std::optional{raft::make_device_vector(handle, rows)} + : std::optional>{}; + if (has_norms) { + raft::deserialize_mdspan(handle, is, norms_storage->view()); + raft::copy(handle, norms_storage_dev->view(), norms_storage->view()); + } + + auto result = index(handle, + raft::make_const_mdspan(dataset_storage.view()), + std::move(norms_storage_dev), + metric, + metric_arg); + raft::resource::sync_stream(handle); + + return result; +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/test/neighbors/ann_brute_force.cuh b/cpp/test/neighbors/ann_brute_force.cuh index c2afa4e8b..03d6e820c 100644 --- a/cpp/test/neighbors/ann_brute_force.cuh +++ b/cpp/test/neighbors/ann_brute_force.cuh @@ -114,12 +114,28 @@ class AnnBruteForceTest : public ::testing::TestWithParam(handle_); + brute_force::deserialize(handle_, std::string{"brute_force_index"}, &index_loaded); + brute_force::search(handle_, - idx, + index_loaded, search_queries_view, indices_out_view, dists_out_view, cuvs::neighbors::filtering::none_sample_filter{}); + raft::resource::sync_stream(handle_); 
+ + ASSERT_TRUE(cuvs::neighbors::devArrMatchKnnPair(indices_naive_dev.data(), + indices_bruteforce_dev.data(), + distances_naive_dev.data(), + distances_bruteforce_dev.data(), + ps.num_queries, + ps.k, + 0.001f, + stream_, + true)); } } diff --git a/docs/source/c_api/neighbors_bruteforce_c.rst b/docs/source/c_api/neighbors_bruteforce_c.rst index af0356eee..a12175209 100644 --- a/docs/source/c_api/neighbors_bruteforce_c.rst +++ b/docs/source/c_api/neighbors_bruteforce_c.rst @@ -32,3 +32,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: bruteforce_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst index 4d83cd3e3..988e5b6f3 100644 --- a/docs/source/c_api/neighbors_hnsw_c.rst +++ b/docs/source/c_api/neighbors_hnsw_c.rst @@ -29,13 +29,13 @@ Index Index search ------------ -.. doxygengroup:: cagra_c_index_search +.. doxygengroup:: hnsw_c_index_search :project: cuvs :members: :content-only: Index serialize ------------- +--------------- .. doxygengroup:: hnsw_c_index_serialize :project: cuvs diff --git a/docs/source/c_api/neighbors_ivf_flat_c.rst b/docs/source/c_api/neighbors_ivf_flat_c.rst index 9e1ccc0d1..1254d70ef 100644 --- a/docs/source/c_api/neighbors_ivf_flat_c.rst +++ b/docs/source/c_api/neighbors_ivf_flat_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_flat_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_ivf_pq_c.rst b/docs/source/c_api/neighbors_ivf_pq_c.rst index 070719609..260057b8c 100644 --- a/docs/source/c_api/neighbors_ivf_pq_c.rst +++ b/docs/source/c_api/neighbors_ivf_pq_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. 
doxygengroup:: ivf_pq_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_bruteforce.rst b/docs/source/cpp_api/neighbors_bruteforce.rst index 3adcb01c5..f75e26b3c 100644 --- a/docs/source/cpp_api/neighbors_bruteforce.rst +++ b/docs/source/cpp_api/neighbors_bruteforce.rst @@ -34,3 +34,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: bruteforce_cpp_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/python_api/neighbors_brute_force.rst b/docs/source/python_api/neighbors_brute_force.rst index 5fdc3658f..d756a6c80 100644 --- a/docs/source/python_api/neighbors_brute_force.rst +++ b/docs/source/python_api/neighbors_brute_force.rst @@ -20,3 +20,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.brute_force.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.brute_force.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.brute_force.load diff --git a/docs/source/python_api/neighbors_cagra.rst b/docs/source/python_api/neighbors_cagra.rst index 09b2e2694..e7155efb8 100644 --- a/docs/source/python_api/neighbors_cagra.rst +++ b/docs/source/python_api/neighbors_cagra.rst @@ -34,3 +34,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.cagra.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.cagra.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.cagra.load diff --git a/docs/source/python_api/neighbors_hnsw.rst b/docs/source/python_api/neighbors_hnsw.rst index 9922805b3..64fe5493b 100644 --- a/docs/source/python_api/neighbors_hnsw.rst +++ b/docs/source/python_api/neighbors_hnsw.rst @@ -28,3 +28,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.hnsw.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.hnsw.save + +Index load +########## + +.. 
autofunction:: cuvs.neighbors.hnsw.load diff --git a/docs/source/python_api/neighbors_ivf_flat.rst b/docs/source/python_api/neighbors_ivf_flat.rst index 5514e5e43..f2c21e68a 100644 --- a/docs/source/python_api/neighbors_ivf_flat.rst +++ b/docs/source/python_api/neighbors_ivf_flat.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_flat.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.load diff --git a/docs/source/python_api/neighbors_ivf_pq.rst b/docs/source/python_api/neighbors_ivf_pq.rst index e3625ba67..57668fbc3 100644 --- a/docs/source/python_api/neighbors_ivf_pq.rst +++ b/docs/source/python_api/neighbors_ivf_pq.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_pq.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_pq.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.ivf_pq.load diff --git a/python/cuvs/cuvs/neighbors/brute_force/__init__.py b/python/cuvs/cuvs/neighbors/brute_force/__init__.py index b88c4b464..6aa0e4bb2 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/__init__.py +++ b/python/cuvs/cuvs/neighbors/brute_force/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. 
-from .brute_force import Index, build, search +from .brute_force import Index, build, load, save, search -__all__ = ["Index", "build", "search"] +__all__ = ["Index", "build", "search", "save", "load"] diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd index 183827916..f1fc14ba7 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd @@ -47,3 +47,11 @@ cdef extern from "cuvs/neighbors/brute_force.h" nogil: DLManagedTensor* neighbors, DLManagedTensor* distances, cuvsFilter filter) except + + + cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + + + cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx index 9d1d24eae..9d43bfb29 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx @@ -24,6 +24,7 @@ from cuvs.common.resources import auto_sync_resources from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t from libcpp cimport bool +from libcpp.string cimport string from cuvs.common cimport cydlpack from cuvs.distance_type cimport cuvsDistanceType @@ -256,3 +257,88 @@ def search(Index index, )) return (distances, neighbors) + + +@auto_sync_resources +def save(filename, Index index, bool include_dataset=True, resources=None): + """ + Saves the index to a file. + + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + Parameters + ---------- + filename : string + Name of the file. + index : Index + Trained Brute Force index. 
+ {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef string c_filename = filename.encode('utf-8') + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cuvsBruteForceSerialize(res, + c_filename.c_str(), + index.index)) + + +@auto_sync_resources +def load(filename, resources=None): + """ + Loads index from file. + + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + + Parameters + ---------- + filename : string + Name of the file. + {resources_docstring} + + Returns + ------- + index : Index + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... 
dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef Index idx = Index() + cdef cuvsResources_t res = resources.get_c_obj() + cdef string c_filename = filename.encode('utf-8') + + check_cuvs(cuvsBruteForceDeserialize( + res, + c_filename.c_str(), + idx.index + )) + idx.trained = True + return idx diff --git a/python/cuvs/cuvs/test/test_serialization.py b/python/cuvs/cuvs/test/test_serialization.py index 4ffccf121..1f4a54e87 100644 --- a/python/cuvs/cuvs/test/test_serialization.py +++ b/python/cuvs/cuvs/test/test_serialization.py @@ -17,7 +17,7 @@ import pytest from pylibraft.common import device_ndarray -from cuvs.neighbors import cagra, ivf_flat, ivf_pq +from cuvs.neighbors import brute_force, cagra, ivf_flat, ivf_pq from cuvs.test.ann_utils import generate_data @@ -35,6 +35,10 @@ def test_save_load_ivf_pq(): run_save_load(ivf_pq, np.float32) +def test_save_load_brute_force(): + run_save_load(brute_force, np.float32) + + def run_save_load(ann_module, dtype): n_rows = 10000 n_cols = 50 @@ -43,8 +47,11 @@ def run_save_load(ann_module, dtype): dataset = generate_data((n_rows, n_cols), dtype) dataset_device = device_ndarray(dataset) - build_params = ann_module.IndexParams() - index = ann_module.build(build_params, dataset_device) + if ann_module == brute_force: + index = ann_module.build(dataset_device) + else: + build_params = ann_module.IndexParams() + index = ann_module.build(build_params, dataset_device) assert index.trained filename = "my_index.bin" @@ -54,20 +61,29 @@ def run_save_load(ann_module, dtype): queries = generate_data((n_queries, n_cols), dtype) queries_device = device_ndarray(queries) - search_params = ann_module.SearchParams() k = 10 - - distance_dev, neighbors_dev = ann_module.search( - search_params, index, queries_device, k - ) + if ann_module == 
brute_force: + distance_dev, neighbors_dev = ann_module.search( + index, queries_device, k + ) + else: + search_params = ann_module.SearchParams() + distance_dev, neighbors_dev = ann_module.search( + search_params, index, queries_device, k + ) neighbors = neighbors_dev.copy_to_host() dist = distance_dev.copy_to_host() del index - distance_dev, neighbors_dev = ann_module.search( - search_params, loaded_index, queries_device, k - ) + if ann_module == brute_force: + distance_dev, neighbors_dev = ann_module.search( + loaded_index, queries_device, k + ) + else: + distance_dev, neighbors_dev = ann_module.search( + search_params, loaded_index, queries_device, k + ) neighbors2 = neighbors_dev.copy_to_host() dist2 = distance_dev.copy_to_host() From 5062594138a40231475299c7bac61083b0669fd1 Mon Sep 17 00:00:00 2001 From: tsuki <12711693+enp1s0@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:50:41 +0900 Subject: [PATCH 35/47] [Doc] Fix CAGRA search sample code (#484) `.view()` is required Authors: - tsuki (https://github.com/enp1s0) Approvers: - Micka (https://github.com/lowener) URL: https://github.com/rapidsai/cuvs/pull/484 --- cpp/include/cuvs/neighbors/cagra.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index e48050756..5ceb3010e 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -363,7 +363,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * In the above example, we have passed a host dataset to build. The returned index will own a * device copy of the dataset and the knn_graph. 
In contrast, if we pass the dataset as a @@ -530,7 +530,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -567,7 +567,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -604,7 +604,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -640,7 +640,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -676,7 +676,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, 
distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -713,7 +713,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -750,7 +750,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -787,7 +787,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res From 441d2f1bcceb8f653a0fdaec5658c54c5201155b Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Mon, 2 Dec 2024 17:34:08 -0500 Subject: [PATCH 36/47] HNSW CPU Hierarchy (#465) This PR adds an option to build the full HNSW hierarchy on the CPU when converting a CAGRA index to an hnswlib index. This lets us enable an `extend()` API. For hnswlib: 1. Update to `v0.7.0` 2. Remove dependency as symbols are compiled within DSO Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Bradley Dice (https://github.com/bdice) - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/465 --- .../bench_ann_cuda-118_arch-aarch64.yaml | 1 - .../bench_ann_cuda-118_arch-x86_64.yaml | 1 - .../bench_ann_cuda-125_arch-aarch64.yaml | 1 - .../bench_ann_cuda-125_arch-x86_64.yaml | 1 - cpp/CMakeLists.txt | 1 + cpp/bench/ann/CMakeLists.txt | 4 +- cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu | 34 +- .../ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h | 57 ++- cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h | 2 + .../ann/src/hnswlib/hnswlib_benchmark.cpp | 4 +- cpp/cmake/modules/ConfigureCUDA.cmake | 8 +- cpp/cmake/patches/hnswlib.diff | 327 ++++++------- cpp/cmake/patches/hnswlib_override.json | 28 +- cpp/cmake/thirdparty/get_hnswlib.cmake | 1 + cpp/include/cuvs/neighbors/hnsw.h | 279 ++++++++++- cpp/include/cuvs/neighbors/hnsw.hpp | 440 +++++++++++++++--- cpp/src/neighbors/detail/hnsw.hpp | 218 ++++++++- cpp/src/neighbors/hnsw.cpp | 57 ++- cpp/src/neighbors/hnsw_c.cpp | 157 ++++++- cpp/src/neighbors/iface/iface.hpp | 1 + cpp/test/neighbors/ann_hnsw_c.cu | 4 +- cpp/test/neighbors/hnsw.cu | 3 +- dependencies.yaml | 1 - docs/source/c_api/neighbors_hnsw_c.rst | 22 + docs/source/cpp_api/neighbors_hnsw.rst | 23 +- python/cuvs/cuvs/neighbors/hnsw/__init__.py | 15 +- python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd | 49 +- python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx | 276 ++++++++--- python/cuvs/cuvs/test/test_hnsw.py | 89 +++- .../config/algos/cuvs_cagra_hnswlib.yaml | 5 +- 30 files changed, 1683 insertions(+), 426 deletions(-) diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 21cb98180..1e602ccf1 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -24,7 +24,6 @@ dependencies: - gcc_linux-aarch64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 diff --git 
a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 432509bcb..b060e78c2 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -24,7 +24,6 @@ dependencies: - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 0c5043ac2..485122273 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -25,7 +25,6 @@ dependencies: - gcc_linux-aarch64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev - libcurand-dev - libcusolver-dev diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index cbb22333c..d5f48dadb 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -25,7 +25,6 @@ dependencies: - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev - libcurand-dev - libcusolver-dev diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eb2e7c7a4..34b7cb898 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -577,6 +577,7 @@ if(BUILD_SHARED_LIBS) if(BUILD_CAGRA_HNSWLIB) target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib) + target_compile_definitions(cuvs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 0f6b42ae9..c161a68bc 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -225,9 +225,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA) endif() if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) - 
ConfigureAnnBench( - NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib - ) + ConfigureAnnBench(NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs) endif() if(CUVS_ANN_BENCH_USE_CUVS_MG) diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu index 558ba01e0..e45a3bd5a 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -24,12 +24,35 @@ namespace cuvs::bench { +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_cagra_hnswlib::build_param& param) +{ + if (conf.contains("hierarchy")) { + if (conf.at("hierarchy") == "none") { + param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::NONE; + } else if (conf.at("hierarchy") == "cpu") { + param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU; + } else { + THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get().c_str()); + } + } + if (conf.contains("ef_construction")) { + param.hnsw_index_params.ef_construction = conf.at("ef_construction"); + } + if (conf.contains("num_threads")) { + param.hnsw_index_params.num_threads = conf.at("num_threads"); + } +} + template void parse_search_param(const nlohmann::json& conf, typename cuvs::bench::cuvs_cagra_hnswlib::search_param& param) { - param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + param.hnsw_search_param.ef = conf.at("ef"); + if (conf.contains("num_threads")) { + param.hnsw_search_param.num_threads = conf.at("num_threads"); + } } template @@ -43,9 +66,10 @@ auto create_algo(const std::string& algo_name, if constexpr (std::is_same_v or std::is_same_v) { if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") { - typename cuvs::bench::cuvs_cagra_hnswlib::build_param param; - parse_build_param(conf, param); - a = std::make_unique>(metric, dim, 
param); + typename cuvs::bench::cuvs_cagra_hnswlib::build_param bparam; + ::parse_build_param(conf, bparam.cagra_build_param); + parse_build_param(conf, bparam); + a = std::make_unique>(metric, dim, bparam); } } diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h index 875fe0bba..e4169f6f8 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h @@ -15,8 +15,8 @@ */ #pragma once -#include "../hnswlib/hnswlib_wrapper.h" #include "cuvs_cagra_wrapper.h" +#include #include @@ -26,14 +26,20 @@ template class cuvs_cagra_hnswlib : public algo, public algo_gpu { public: using search_param_base = typename algo::search_param; - using build_param = typename cuvs_cagra::build_param; - using search_param = typename hnsw_lib::search_param; + + struct build_param { + typename cuvs_cagra::build_param cagra_build_param; + cuvs::neighbors::hnsw::index_params hnsw_index_params; + }; + + struct search_param : public search_param_base { + cuvs::neighbors::hnsw::search_params hnsw_search_param; + }; cuvs_cagra_hnswlib(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) : algo(metric, dim), - cagra_build_{metric, dim, param, concurrent_searches}, - // hnsw_lib param values don't matter since we don't build with hnsw_lib - hnswlib_search_{metric, dim, typename hnsw_lib::build_param{50, 100}} + build_param_{param}, + cagra_build_{metric, dim, param.cagra_build_param, concurrent_searches} { } @@ -69,40 +75,67 @@ class cuvs_cagra_hnswlib : public algo, public algo_gpu { } private: + raft::resources handle_{}; + build_param build_param_; + search_param search_param_; cuvs_cagra cagra_build_; - hnsw_lib hnswlib_search_; + std::shared_ptr> hnsw_index_; }; template void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow) { cagra_build_.build(dataset, nrow); + auto* cagra_index = cagra_build_.get_index(); + auto host_dataset_view = 
raft::make_host_matrix_view(dataset, nrow, this->dim_); + auto opt_dataset_view = + std::optional>(std::move(host_dataset_view)); + hnsw_index_ = cuvs::neighbors::hnsw::from_cagra( + handle_, build_param_.hnsw_index_params, *cagra_index, opt_dataset_view); } template void cuvs_cagra_hnswlib::set_search_param(const search_param_base& param_) { - hnswlib_search_.set_search_param(param_); + search_param_ = dynamic_cast(param_); } template void cuvs_cagra_hnswlib::save(const std::string& file) const { - cagra_build_.save_to_hnswlib(file); + cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get())); } template void cuvs_cagra_hnswlib::load(const std::string& file) { - hnswlib_search_.load(file); - hnswlib_search_.set_base_layer_only(); + cuvs::neighbors::hnsw::index* idx = nullptr; + cuvs::neighbors::hnsw::deserialize(handle_, + build_param_.hnsw_index_params, + file, + this->dim_, + parse_metric_type(this->metric_), + &idx); + hnsw_index_ = std::shared_ptr>(idx); } template void cuvs_cagra_hnswlib::search( const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const { - hnswlib_search_.search(queries, batch_size, k, neighbors, distances); + // Only Latency mode is supported for now + auto queries_view = + raft::make_host_matrix_view(queries, batch_size, this->dim_); + auto neighbors_view = raft::make_host_matrix_view( + reinterpret_cast(neighbors), batch_size, k); + auto distances_view = raft::make_host_matrix_view(distances, batch_size, k); + + cuvs::neighbors::hnsw::search(handle_, + search_param_.hnsw_search_param, + *(hnsw_index_.get()), + queries_view, + neighbors_view, + distances_view); } } // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h index b2ba35eee..f6d3d60fc 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -154,6 +154,8 @@ class cuvs_cagra : public algo, public 
algo_gpu { void save_to_hnswlib(const std::string& file) const; std::unique_ptr> copy() override; + auto get_index() const -> const cuvs::neighbors::cagra::index* { return index_.get(); } + private: // handle_ must go first to make sure it dies last and all memory allocated in pool configured_raft_resources handle_{}; diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index 755c7c8d6..6e219d2a7 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -33,7 +33,7 @@ void parse_build_param(const nlohmann::json& conf, { param.ef_construction = conf.at("efConstruction"); param.m = conf.at("M"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); } } template @@ -41,7 +41,7 @@ void parse_search_param(const nlohmann::json& conf, typename cuvs::bench::hnsw_lib::search_param& param) { param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); } } template class Algo> diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 74da25660..3e91d9995 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -22,8 +22,12 @@ endif() # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with # clang) if(CMAKE_COMPILER_IS_GNUCXX) - list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) - list(APPEND CUVS_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) + list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations + -Wno-reorder + ) + list(APPEND CUVS_CUDA_FLAGS + 
-Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations,-Wno-reorder + ) # set warnings as errors if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) diff --git a/cpp/cmake/patches/hnswlib.diff b/cpp/cmake/patches/hnswlib.diff index e7f89a8cc..f20c27d91 100644 --- a/cpp/cmake/patches/hnswlib.diff +++ b/cpp/cmake/patches/hnswlib.diff @@ -1,188 +1,159 @@ +diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h +index bef0017..0ee7931 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h -@@ -3,6 +3,7 @@ - #include "visited_list_pool.h" - #include "hnswlib.h" - #include -+#include - #include - #include - #include -@@ -16,6 +17,8 @@ namespace hnswlib { - template - class HierarchicalNSW : public AlgorithmInterface { - public: -+ bool base_layer_only{false}; -+ int num_seeds=32; - static const tableint max_update_element_locks = 65536; - HierarchicalNSW(SpaceInterface *s) { - } -@@ -56,7 +59,7 @@ namespace hnswlib { - visited_list_pool_ = new VisitedListPool(1, max_elements); - - //initializations for special treatment of the first node -- enterpoint_node_ = -1; -+ enterpoint_node_ = std::numeric_limits::max(); - maxlevel_ = -1; - - linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); -@@ -527,7 +530,7 @@ namespace hnswlib { - tableint *datal = (tableint *) (data + 1); - for (int i = 0; i < size; i++) { - tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -+ if (cand > max_elements_) - throw std::runtime_error("cand error"); - dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); - -@@ -1067,7 +1070,7 @@ namespace hnswlib { - tableint *datal = (tableint *) (data + 1); - for (int i = 0; i < size; i++) { - tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -+ if (cand > max_elements_) - throw std::runtime_error("cand error"); - dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); - if (d < curdist) { -@@ -1119,28 +1122,41 @@ namespace hnswlib { - tableint currObj = 
enterpoint_node_; - dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); - -- for (int level = maxlevel_; level > 0; level--) { -- bool changed = true; -- while (changed) { -- changed = false; -- unsigned int *data; -+ if (base_layer_only) { -+ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale -+ for (int i = 0; i < num_seeds; i++) { -+ tableint obj = i * (max_elements_ / num_seeds); -+ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); -+ if (dist < curdist) { -+ curdist = dist; -+ currObj = obj; -+ } +@@ -16,6 +16,9 @@ typedef unsigned int linklistsizeint; + template + class HierarchicalNSW : public AlgorithmInterface { + public: ++ bool base_layer_only = false; ++ int num_seeds = 32; ++ bool base_layer_init = true; + static const tableint MAX_LABEL_OPERATION_LOCKS = 65536; + static const unsigned char DELETE_MARK = 0x01; + +@@ -1098,7 +1101,7 @@ class HierarchicalNSW : public AlgorithmInterface { + + std::unique_lock lock_el(link_list_locks_[cur_c]); + int curlevel = getRandomLevel(mult_); +- if (level > 0) ++ if (level > -1) + curlevel = level; + + element_levels_[cur_c] = curlevel; +@@ -1116,6 +1119,9 @@ class HierarchicalNSW : public AlgorithmInterface { + memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype)); + memcpy(getDataByInternalId(cur_c), data_point, data_size_); + ++ if (!base_layer_init && curlevel == 0) ++ return cur_c; ++ + if (curlevel) { + linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1); + if (linkLists_[cur_c] == nullptr) +@@ -1138,7 +1144,7 @@ class HierarchicalNSW : public AlgorithmInterface { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (static_cast(cand) < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(data_point, 
getDataByInternalId(cand), dist_func_param_); + if (d < curdist) { +@@ -1188,28 +1194,41 @@ class HierarchicalNSW : public AlgorithmInterface { + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + +- for (int level = maxlevel_; level > 0; level--) { +- bool changed = true; +- while (changed) { +- changed = false; +- unsigned int *data; ++ if (base_layer_only) { ++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale ++ for (int i = 0; i < num_seeds; i++) { ++ tableint obj = i * (max_elements_ / num_seeds); ++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); ++ if (dist < curdist) { ++ curdist = dist; ++ currObj = obj; + } + } -+ else{ -+ for (int level = maxlevel_; level > 0; level--) { -+ bool changed = true; -+ while (changed) { -+ changed = false; -+ unsigned int *data; - -- data = (unsigned int *) get_linklist(currObj, level); -- int size = getListCount(data); -- metric_hops++; -- metric_distance_computations+=size; -+ data = (unsigned int *) get_linklist(currObj, level); -+ int size = getListCount(data); -+ metric_hops++; -+ metric_distance_computations+=size; - -- tableint *datal = (tableint *) (data + 1); -- for (int i = 0; i < size; i++) { -- tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -- throw std::runtime_error("cand error"); -- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); -+ tableint *datal = (tableint *) (data + 1); -+ for (int i = 0; i < size; i++) { -+ tableint cand = datal[i]; -+ if (cand > max_elements_) -+ throw std::runtime_error("cand error"); -+ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); - -- if (d < curdist) { -- curdist = d; -- currObj = cand; -- changed = true; -+ if (d < curdist) { -+ curdist = d; -+ currObj = cand; -+ changed = true; -+ } - } ++ } ++ else { ++ for (int 
level = maxlevel_; level > 0; level--) { ++ bool changed = true; ++ while (changed) { ++ changed = false; ++ unsigned int *data; + +- data = (unsigned int *) get_linklist(currObj, level); +- int size = getListCount(data); +- metric_hops++; +- metric_distance_computations+=size; ++ data = (unsigned int *) get_linklist(currObj, level); ++ int size = getListCount(data); ++ metric_hops++; ++ metric_distance_computations+=size; ++ ++ tableint *datal = (tableint *) (data + 1); ++ for (int i = 0; i < size; i++) { ++ tableint cand = datal[i]; ++ if (static_cast(cand) < 0 || cand > max_elements_) ++ throw std::runtime_error("cand error"); ++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +- tableint *datal = (tableint *) (data + 1); +- for (int i = 0; i < size; i++) { +- tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) +- throw std::runtime_error("cand error"); +- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); +- +- if (d < curdist) { +- curdist = d; +- currObj = cand; +- changed = true; ++ if (d < curdist) { ++ curdist = d; ++ currObj = cand; ++ changed = true; ++ } } } + } diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h -index 4413537..c3240f3 100644 +index 834d19f..0c0af26 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h -@@ -252,13 +252,14 @@ namespace hnswlib { - ~L2Space() {} - }; - -+ template - static int - L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { - - size_t qty = *((size_t *) qty_ptr); - int res = 0; -- unsigned char *a = (unsigned char *) pVect1; -- unsigned char *b = (unsigned char *) pVect2; -+ T *a = (T *) pVect1; -+ T *b = (T *) pVect2; - - qty = qty >> 2; - for (size_t i = 0; i < qty; i++) { -@@ -279,11 +280,12 @@ namespace hnswlib { - return (res); - } - -+ template - static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { - size_t qty = 
*((size_t*)qty_ptr); - int res = 0; -- unsigned char* a = (unsigned char*)pVect1; -- unsigned char* b = (unsigned char*)pVect2; -+ T* a = (T*)pVect1; -+ T* b = (T*)pVect2; - - for(size_t i = 0; i < qty; i++) - { -@@ -294,6 +296,7 @@ namespace hnswlib { - return (res); - } - -+ template - class L2SpaceI : public SpaceInterface { - - DISTFUNC fstdistfunc_; -@@ -302,10 +305,10 @@ namespace hnswlib { - public: - L2SpaceI(size_t dim) { - if(dim % 4 == 0) { -- fstdistfunc_ = L2SqrI4x; -+ fstdistfunc_ = L2SqrI4x; - } - else { -- fstdistfunc_ = L2SqrI; -+ fstdistfunc_ = L2SqrI; - } - dim_ = dim; - data_size_ = dim * sizeof(unsigned char); -diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h -index 5e1a4a5..4195ebd 100644 ---- a/hnswlib/visited_list_pool.h -+++ b/hnswlib/visited_list_pool.h -@@ -3,6 +3,7 @@ - #include - #include - #include -+#include - - namespace hnswlib { - typedef unsigned short int vl_type; -@@ -14,7 +15,7 @@ namespace hnswlib { - unsigned int numelements; - - VisitedList(int numelements1) { -- curV = -1; -+ curV = std::numeric_limits::max(); - numelements = numelements1; - mass = new vl_type[numelements]; +@@ -252,12 +252,13 @@ class L2Space : public SpaceInterface { + ~L2Space() {} + }; + ++template + static int + L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + int res = 0; +- unsigned char *a = (unsigned char *) pVect1; +- unsigned char *b = (unsigned char *) pVect2; ++ T *a = (T *) pVect1; ++ T *b = (T *) pVect2; + + qty = qty >> 2; + for (size_t i = 0; i < qty; i++) { +@@ -277,11 +278,12 @@ L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const voi + return (res); + } + ++template + static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { + size_t qty = *((size_t*)qty_ptr); + int res = 0; +- unsigned char* a = (unsigned char*)pVect1; +- unsigned char* b = 
(unsigned char*)pVect2; ++ T* a = (T*)pVect1; ++ T* b = (T*)pVect2; + + for (size_t i = 0; i < qty; i++) { + res += ((*a) - (*b)) * ((*a) - (*b)); +@@ -291,6 +293,7 @@ static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, + return (res); + } + ++template + class L2SpaceI : public SpaceInterface { + DISTFUNC fstdistfunc_; + size_t data_size_; +@@ -299,9 +302,9 @@ class L2SpaceI : public SpaceInterface { + public: + L2SpaceI(size_t dim) { + if (dim % 4 == 0) { +- fstdistfunc_ = L2SqrI4x; ++ fstdistfunc_ = L2SqrI4x; + } else { +- fstdistfunc_ = L2SqrI; ++ fstdistfunc_ = L2SqrI; } --- -2.43.0 - + dim_ = dim; + data_size_ = dim * sizeof(unsigned char); diff --git a/cpp/cmake/patches/hnswlib_override.json b/cpp/cmake/patches/hnswlib_override.json index aef2da772..c50220e24 100644 --- a/cpp/cmake/patches/hnswlib_override.json +++ b/cpp/cmake/patches/hnswlib_override.json @@ -1,16 +1,16 @@ { - "packages" : { - "hnswlib" : { - "version": "0.6.2", - "git_url": "https://github.com/nmslib/hnswlib.git", - "git_tag": "v${version}", - "patches" : [ - { - "file" : "${current_json_dir}/hnswlib.diff", - "issue" : "Correct compilation issues", - "fixed_in" : "" - } - ] - } + "packages": { + "hnswlib": { + "version": "0.7.0", + "git_url": "https://github.com/nmslib/hnswlib.git", + "git_tag": "v${version}", + "patches": [ + { + "file": "${current_json_dir}/hnswlib.diff", + "issue": "Correct compilation issues", + "fixed_in": "" + } + ] } - } \ No newline at end of file + } +} \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index 2e6c895e5..5b4d89aa2 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -15,6 +15,7 @@ #============================================================================= function(find_and_configure_hnswlib) + message(STATUS "Finding or building hnswlib") set(oneValueArgs) 
include(${rapids-cmake-dir}/cpm/package_override.cmake) diff --git a/cpp/include/cuvs/neighbors/hnsw.h b/cpp/include/cuvs/neighbors/hnsw.h index 0495c574a..b7eda54b8 100644 --- a/cpp/include/cuvs/neighbors/hnsw.h +++ b/cpp/include/cuvs/neighbors/hnsw.h @@ -16,6 +16,8 @@ #pragma once +#include "cagra.h" + #include #include #include @@ -27,32 +29,51 @@ extern "C" { #endif /** - * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params + * @defgroup hnsw_c_index_params C API for HNSW index params * @{ */ -struct cuvsHnswSearchParams { - int32_t ef; - int32_t numThreads; +/** + * @brief Hierarchy for HNSW index when converting from CAGRA index + * + * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index. + */ +enum cuvsHnswHierarchy { + /* Flat hierarchy, search is base-layer only */ + NONE, + /* Full hierarchy is built using the CPU */ + CPU }; -typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t; +struct cuvsHnswIndexParams { + /* hierarchy of the hnsw index */ + cuvsHnswHierarchy hierarchy; + /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ + int ef_construction; + /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` + NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive + to parallelism, and increasing the number of threads can reduce the quality of the index. 
+ */ + int num_threads; +}; + +typedef struct cuvsHnswIndexParams* cuvsHnswIndexParams_t; /** - * @brief Allocate HNSW search params, and populate with default values + * @brief Allocate HNSW Index params, and populate with default values * - * @param[in] params cuvsHnswSearchParams_t to allocate + * @param[in] params cuvsHnswIndexParams_t to allocate * @return cuvsError_t */ -cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params); +cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params); /** - * @brief De-allocate HNSW search params + * @brief De-allocate HNSW Index params * - * @param[in] params cuvsHnswSearchParams_t to de-allocate + * @param[in] params * @return cuvsError_t */ -cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params); +cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params); /** * @} @@ -90,6 +111,184 @@ cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index); */ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); +/** + * @} + */ + +/** + * @defgroup hnsw_c_extend_params Parameters for extending HNSW index + * @{ + */ + +struct cuvsHnswExtendParams { + /** Number of CPU threads used to extend additional vectors */ + int num_threads; +}; + +typedef struct cuvsHnswExtendParams* cuvsHnswExtendParams_t; + +/** + * @brief Allocate HNSW extend params, and populate with default values + * + * @param[in] params cuvsHnswExtendParams_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params); + +/** + * @brief De-allocate HNSW extend params + * + * @param[in] params cuvsHnswExtendParams_t to de-allocate + * @return cuvsError_t + */ + +cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_index_load Load CAGRA index as hnswlib index + * @{ + */ + +/** + * @brief Convert a CAGRA Index to an HNSW index. + * NOTE: When hierarchy is: + * 1. 
`NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index + * @param[in] cagra_index cuvsCagraIndex_t to convert to HNSW index + * @param[out] hnsw_index cuvsHnswIndex_t to return the HNSW index + * + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create a CAGRA index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ +cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_index_extend Extend HNSW index with additional vectors + * @{ + */ + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the hierarchy is `CPU` + * when converting from a CAGRA 
index. + + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswExtendParams_t used to extend Hnsw index + * @param[in] additional_dataset DLManagedTensor* additional dataset to extend the index + * @param[inout] index cuvsHnswIndex_t to extend + * + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // Extend the HNSW index with additional vectors + * DLManagedTensor additional_dataset; + * cuvsHnswExtendParams_t extend_params; + * cuvsHnswExtendParamsCreate(&extend_params); + * cuvsHnswExtend(res, extend_params, additional_dataset, hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index`, `extend_params` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t extend_params_destroy_status = cuvsHnswExtendParamsDestroy(extend_params); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ + +cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex_t index); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params + * @{ + */ + +struct cuvsHnswSearchParams { + int32_t ef; + int32_t num_threads; +}; + +typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t; + +/** + * @brief Allocate HNSW search params, and populate with default values + * + * @param[in] params 
cuvsHnswSearchParams_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params); + +/** + * @brief De-allocate HNSW search params + * + * @param[in] params cuvsHnswSearchParams_t to de-allocate + * @return cuvsError_t + */ +cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params); + /** * @} */ @@ -111,8 +310,8 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8` * 2. `neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 64` * 3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * NOTE: When hierarchy is `NONE`, the HNSW index can only be searched by the hnswlib wrapper in + * cuVS, as the format is not compatible with the original hnswlib. * * @code {.c} * #include * #include @@ -131,7 +330,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * cuvsHnswSearchParams_t params; * cuvsError_t params_create_status = cuvsHnswSearchParamsCreate(&params); * - * // Search the `index` built using `cuvsHnswBuild` + * // Search the `index` built using `cuvsHnswFromCagra` * cuvsError_t search_status = cuvsHnswSearch(res, params, index, &queries, &neighbors, * &distances); * @@ -142,7 +341,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * * @param[in] res cuvsResources_t opaque C handle * @param[in] params cuvsHnswSearchParams_t used to search Hnsw index - * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswBuild` + * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswFromCagra` * @param[in] queries DLManagedTensor* queries dataset to search * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries * @param[out] distances DLManagedTensor* output `k` distances for queries @@ -163,9 +362,50 @@ cuvsError_t
cuvsHnswSearch(cuvsResources_t res, * @{ */ +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the name of the file to save the index + * @param[in] index cuvsHnswIndex_t to serialize + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // Serialize the HNSW index + * cuvsHnswSerialize(res, "/path/to/index", hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ +cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHnswIndex_t index); + /** * Load hnswlib index from file which was serialized from a HNSW index. - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable, and only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. 
* Experimental, both the API and the serialization format are subject to change. * @@ -185,17 +425,22 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res, * // The index should have the same dtype as the one used to build CAGRA the index * cuvsHnswIndex_t hnsw_index; * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * hnsw_params->hierarchy = NONE; * hnsw_index->dtype = index->dtype; - * cuvsCagraDeserialize(res, "/path/to/index", hnsw_index); + * cuvsHnswDeserialize(res, hnsw_params, "/path/to/index", dim, metric, hnsw_index); * @endcode * * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index * @param[in] filename the name of the file that stores the index * @param[in] dim the dimension of the vectors in the index * @param[in] metric the distance metric used to build the index * @param[out] index HNSW index loaded disk */ cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char* filename, int dim, cuvsDistanceType metric, diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index d5abd6d55..f0b433d8e 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -34,14 +34,30 @@ namespace cuvs::neighbors::hnsw { /** - * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib + * @defgroup hnsw_cpp_index_params hnswlib index wrapper params * @{ */ -struct search_params : cuvs::neighbors::search_params { - int ef; // size of the candidate list - int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 - // automatically maximizes parallelism +/** + * @brief Hierarchy for HNSW index when converting from CAGRA index + * + * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index.
+ */ +enum class HnswHierarchy { + NONE, // base-layer-only index + CPU // full index with CPU-built hierarchy +}; + +struct index_params : cuvs::neighbors::index_params { + /** Hierarchy build type for HNSW index when converting from CAGRA index */ + HnswHierarchy hierarchy = HnswHierarchy::NONE; + /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ + int ef_construction = 200; + /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` + NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive + to parallelism, and increasing the number of threads can reduce the quality of the index. + */ + int num_threads = 2; }; /**@}*/ @@ -62,8 +78,12 @@ struct index : cuvs::neighbors::index { * * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct") + * @param[in] hierarchy hierarchy used for upper HNSW layers */ - index(int dim, cuvs::distance::DistanceType metric) : dim_{dim}, metric_{metric} {} + index(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy = HnswHierarchy::NONE) + : dim_{dim}, metric_{metric}, hierarchy_{hierarchy} + { + } virtual ~index() {} @@ -76,6 +96,8 @@ struct index : cuvs::neighbors::index { auto metric() const -> cuvs::distance::DistanceType { return metric_; } + auto hierarchy() const -> HnswHierarchy { return hierarchy_; } + /** @brief Set ef for search */ @@ -84,24 +106,41 @@ struct index : cuvs::neighbors::index { private: int dim_; cuvs::distance::DistanceType metric_; + HnswHierarchy hierarchy_; }; /**@}*/ +/** + * @defgroup hnsw_cpp_extend_params HNSW index extend parameters + * @{ + */ + +struct extend_params { + /** Number of host threads to use to add additional vectors to the index. + Value of 0 automatically maximizes parallelism. 
*/ + int num_threads = 0; +}; + /** * @defgroup hnsw_cpp_index_load Load CAGRA index as hnswlib index * @{ */ /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. + * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. 
* * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -110,24 +149,34 @@ struct index : cuvs::neighbors::index { * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. + * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. 
The + * serialized index is also compatible with the original hnswlib library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -136,24 +185,34 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. + * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. 
`CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -162,14 +221,138 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); + +/**@}*/ + +/** + * @defgroup hnsw_cpp_index_extend Extend HNSW index with additional vectors + * @{ + */ + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. 
+ * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. 
+ * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. 
+ * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/**@} */ + +/** + * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib + * @{ + */ + +struct search_params : cuvs::neighbors::search_params { + int ef; // size of the candidate list + int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 + // automatically maximizes parallelism +}; /**@}*/ @@ -181,9 +364,9 @@ std::unique_ptr> from_cagra( */ /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. 
* * @param[in] res raft resources * @param[in] params configure the search @@ -201,10 +384,11 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -224,9 +408,9 @@ void search(raft::resources const& res, raft::host_matrix_view distances); /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. 
* * @param[in] res raft resources * @param[in] params configure the search @@ -244,10 +428,11 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -267,9 +452,9 @@ void search(raft::resources const& res, raft::host_matrix_view distances); /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. 
* * @param[in] res raft resources * @param[in] params configure the search @@ -287,10 +472,11 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -312,16 +498,106 @@ void search(raft::resources const& res, /**@}*/ /** - * @defgroup hnsw_cpp_index_deserialize Deserialize CAGRA index as hnswlib index + * @defgroup hnsw_cpp_index_serialize Serialize and deserialize CAGRA index as hnswlib index * @{ */ +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
+ * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx HNSW index to serialize + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", *hnsw_index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
+ * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx HNSW index to serialize + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", *hnsw_index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
+ * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx HNSW index to serialize + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", *hnsw_index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") @@ -334,19 +610,23 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, @@ -354,10 +634,13 @@ void deserialize(raft::resources const& res, /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
* * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct") @@ -370,19 +653,23 @@ void deserialize(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, @@ -390,10 +677,13 @@ void deserialize(raft::resources const& res, /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. 
+ * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct") @@ -406,19 +696,23 @@ void deserialize(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index ce1e03264..e129d23e8 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -22,9 +22,63 @@ #include #include #include +#include namespace cuvs::neighbors::hnsw::detail { +// Multithreaded executor +// The helper function is copied from the hnswlib 
repository +// as for some reason, adding vectors to the hnswlib index does not +// work well with omp parallel for +template +inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) +{ + if (numThreads <= 0) { numThreads = std::thread::hardware_concurrency(); } + + if (numThreads == 1) { + for (size_t id = start; id < end; id++) { + fn(id, 0); + } + } else { + std::vector threads; + std::atomic current(start); + + // keep track of exceptions in threads + // https://stackoverflow.com/a/32428427/1713196 + std::exception_ptr lastException = nullptr; + std::mutex lastExceptMutex; + + for (size_t threadId = 0; threadId < numThreads; ++threadId) { + threads.push_back(std::thread([&, threadId] { + while (true) { + size_t id = current.fetch_add(1); + + if (id >= end) { break; } + + try { + fn(id, threadId); + } catch (...) { + std::unique_lock lastExcepLock(lastExceptMutex); + lastException = std::current_exception(); + /* + * This will work even when current is the largest value that + * size_t can fit, because fetch_add returns the previous value + * before the increment (what will result in overflow + * and produce 0 instead of current + 1). + */ + current = end; + break; + } + } + })); + } + for (auto& thread : threads) { + thread.join(); + } + if (lastException) { std::rethrow_exception(lastException); } + } +} + template struct hnsw_dist_t { using type = void; @@ -54,9 +108,10 @@ struct index_impl : index { * @param[in] filepath path to the index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") + * @param[in] hierarchy hierarchy used for upper HNSW layers */ - index_impl(const std::string& filepath, int dim, cuvs::distance::DistanceType metric) - : index{dim, metric} + index_impl(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy) + : index{dim, metric, hierarchy} { if constexpr (std::is_same_v) { if (metric == cuvs::distance::DistanceType::L2Expanded) { @@ -71,11 +126,6 @@ struct index_impl : index { } RAFT_EXPECTS(space_ != nullptr, "Unsupported metric type was used"); - - appr_alg_ = std::make_unique::type>>( - space_.get(), filepath); - - appr_alg_->base_layer_only = true; } /** @@ -88,14 +138,32 @@ struct index_impl : index { */ void set_ef(int ef) const override { appr_alg_->ef_ = ef; } + /** + @brief Set index + */ + void set_index(std::unique_ptr::type>>&& index) + { + appr_alg_ = std::move(index); + } + + /** + @brief Get space + */ + auto get_space() const -> hnswlib::SpaceInterface::type>* + { + return space_.get(); + } + private: std::unique_ptr::type>> appr_alg_; std::unique_ptr::type>> space_; }; -template -std::unique_ptr> from_cagra(raft::resources const& res, - const cuvs::neighbors::cagra::index& cagra_index) +template +std::enable_if_t>> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index) { std::random_device dev; std::mt19937 rng(dev()); @@ -103,13 +171,125 @@ std::unique_ptr> from_cagra(raft::resources const& res, auto uuid = std::to_string(dist(rng)); std::string filepath = "/tmp/" + uuid + ".bin"; cuvs::neighbors::cagra::serialize_to_hnswlib(res, filepath, cagra_index); + index* hnsw_index = nullptr; cuvs::neighbors::hnsw::deserialize( - res, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index); + res, params, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index); std::filesystem::remove(filepath); return std::unique_ptr>(hnsw_index); } +template +std::enable_if_t>> 
from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset) +{ + // auto host_dataset = raft::make_host_matrix(dataset.extent(0), dataset.extent(1)); + auto host_dataset = raft::make_host_matrix(0, 0); + raft::host_matrix_view host_dataset_view( + host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1)); + if (dataset.has_value()) { + host_dataset_view = dataset.value(); + } else { + // move dataset to host, remove padding + auto cagra_dataset = cagra_index.dataset(); + host_dataset = + raft::make_host_matrix(cagra_dataset.extent(0), cagra_dataset.extent(1)); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), + sizeof(T) * host_dataset.extent(1), + cagra_dataset.data_handle(), + sizeof(T) * cagra_dataset.stride(0), + sizeof(T) * host_dataset.extent(1), + cagra_dataset.extent(0), + cudaMemcpyDefault, + raft::resource::get_cuda_stream(res))); + raft::resource::sync_stream(res); + host_dataset_view = host_dataset.view(); + } + // build upper layers of hnsw index + auto hnsw_index = + std::make_unique>(cagra_index.dim(), cagra_index.metric(), hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), + host_dataset_view.extent(0), + cagra_index.graph().extent(1) / 2, + params.ef_construction); + appr_algo->base_layer_init = false; // tell hnswlib to build upper layers only + ParallelFor(0, host_dataset_view.extent(0), params.num_threads, [&](size_t i, size_t threadId) { + appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)), + i); + }); + appr_algo->base_layer_init = true; // reset to true to allow addition of new points + + // move cagra graph to host + auto graph = cagra_index.graph(); + auto host_graph = + raft::make_host_matrix(graph.extent(0), graph.extent(1)); + raft::copy(host_graph.data_handle(), + graph.data_handle(), + graph.size(), + raft::resource::get_cuda_stream(res)); + 
raft::resource::sync_stream(res); + +// copy cagra graph to hnswlib base layer +#pragma omp parallel for + for (size_t i = 0; i < static_cast(host_graph.extent(0)); ++i) { + auto ll_i = appr_algo->get_linklist0(i); + appr_algo->setListCount(ll_i, host_graph.extent(1)); + auto* data = (uint32_t*)(ll_i + 1); + for (size_t j = 0; j < static_cast(host_graph.extent(1)); ++j) { + data[j] = host_graph(i, j); + } + } + + hnsw_index->set_index(std::move(appr_algo)); + return hnsw_index; +} + +template +std::unique_ptr> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset) +{ + if (params.hierarchy == HnswHierarchy::NONE) { + return from_cagra(res, params, cagra_index); + } else if (params.hierarchy == HnswHierarchy::CPU) { + return from_cagra(res, params, cagra_index, dataset); + } + { + RAFT_FAIL("Unsupported hierarchy type"); + } +} + +template +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx) +{ + auto* hnswlib_index = reinterpret_cast::type>*>( + const_cast(idx.get_index())); + auto current_element_count = hnswlib_index->getCurrentElementCount(); + auto new_element_count = additional_dataset.extent(0); + auto num_threads = params.num_threads == 0 ? 
std::thread::hardware_concurrency() + : static_cast(params.num_threads); + + hnswlib_index->resizeIndex(current_element_count + new_element_count); + ParallelFor(current_element_count, + current_element_count + new_element_count, + num_threads, + [&](size_t i, size_t threadId) { + hnswlib_index->addPoint( + (void*)(additional_dataset.data_handle() + + (i - current_element_count) * additional_dataset.extent(1)), + i); + }); +} + template void get_search_knn_results(hnswlib::HierarchicalNSW::type> const* idx, const T* query, @@ -171,14 +351,28 @@ void search(raft::resources const& res, } } +template +void serialize(raft::resources const& res, const std::string& filename, const index& idx) +{ + auto* hnswlib_index = reinterpret_cast::type>*>( + const_cast(idx.get_index())); + hnswlib_index->saveIndex(filename); +} + template void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, index** idx) { - *idx = new detail::index_impl(filename, dim, metric); + auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), filename); + if (params.hierarchy == HnswHierarchy::NONE) { appr_algo->base_layer_only = true; } + hnsw_index->set_index(std::move(appr_algo)); + *idx = hnsw_index.release(); } } // namespace cuvs::neighbors::hnsw::detail diff --git a/cpp/src/neighbors/hnsw.cpp b/cpp/src/neighbors/hnsw.cpp index e6f3fbcc7..f165176ec 100644 --- a/cpp/src/neighbors/hnsw.cpp +++ b/cpp/src/neighbors/hnsw.cpp @@ -21,11 +21,14 @@ namespace cuvs::neighbors::hnsw { -#define CUVS_INST_HNSW_FROM_CAGRA(T) \ - std::unique_ptr> from_cagra( \ - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index) \ - { \ - return detail::from_cagra(res, cagra_index); \ +#define CUVS_INST_HNSW_FROM_CAGRA(T) \ + std::unique_ptr> from_cagra( \ + raft::resources const& res, \ + const index_params& params, \ + const 
cuvs::neighbors::cagra::index& cagra_index, \ + std::optional> dataset) \ + { \ + return detail::from_cagra(res, params, cagra_index, dataset); \ } CUVS_INST_HNSW_FROM_CAGRA(float); @@ -34,6 +37,21 @@ CUVS_INST_HNSW_FROM_CAGRA(int8_t); #undef CUVS_INST_HNSW_FROM_CAGRA +#define CUVS_INST_HNSW_EXTEND(T) \ + void extend(raft::resources const& res, \ + const extend_params& params, \ + raft::host_matrix_view additional_dataset, \ + index& idx) \ + { \ + detail::extend(res, params, additional_dataset, idx); \ + } + +CUVS_INST_HNSW_EXTEND(float); +CUVS_INST_HNSW_EXTEND(uint8_t); +CUVS_INST_HNSW_EXTEND(int8_t); + +#undef CUVS_INST_HNSW_EXTEND + #define CUVS_INST_HNSW_SEARCH(T) \ void search(raft::resources const& res, \ const search_params& params, \ @@ -51,20 +69,25 @@ CUVS_INST_HNSW_SEARCH(int8_t); #undef CUVS_INST_HNSW_SEARCH -#define CUVS_INST_HNSW_DESERIALIZE(T) \ - void deserialize(raft::resources const& res, \ - const std::string& filename, \ - int dim, \ - cuvs::distance::DistanceType metric, \ - index** idx) \ - { \ - detail::deserialize(res, filename, dim, metric, idx); \ +#define CUVS_INST_HNSW_SERIALIZE(T) \ + void serialize(raft::resources const& res, const std::string& filename, const index& idx) \ + { \ + detail::serialize(res, filename, idx); \ + } \ + void deserialize(raft::resources const& res, \ + const index_params& params, \ + const std::string& filename, \ + int dim, \ + cuvs::distance::DistanceType metric, \ + index** idx) \ + { \ + detail::deserialize(res, params, filename, dim, metric, idx); \ } -CUVS_INST_HNSW_DESERIALIZE(float); -CUVS_INST_HNSW_DESERIALIZE(uint8_t); -CUVS_INST_HNSW_DESERIALIZE(int8_t); +CUVS_INST_HNSW_SERIALIZE(float); +CUVS_INST_HNSW_SERIALIZE(uint8_t); +CUVS_INST_HNSW_SERIALIZE(int8_t); -#undef CUVS_INST_HNSW_DESERIALIZE +#undef CUVS_INST_HNSW_SERIALIZE } // namespace cuvs::neighbors::hnsw diff --git a/cpp/src/neighbors/hnsw_c.cpp b/cpp/src/neighbors/hnsw_c.cpp index a19875641..0233a510a 100644 --- 
a/cpp/src/neighbors/hnsw_c.cpp +++ b/cpp/src/neighbors/hnsw_c.cpp @@ -31,6 +31,44 @@ #include namespace { + +template +void _from_cagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) +{ + auto res_ptr = reinterpret_cast(res); + auto index = reinterpret_cast*>(cagra_index->addr); + auto cpp_params = cuvs::neighbors::hnsw::index_params(); + cpp_params.hierarchy = static_cast(params->hierarchy); + cpp_params.ef_construction = params->ef_construction; + cpp_params.num_threads = params->num_threads; + std::optional> dataset = std::nullopt; + + auto hnsw_index_unique_ptr = + cuvs::neighbors::hnsw::from_cagra(*res_ptr, cpp_params, *index, dataset); + auto hnsw_index_ptr = hnsw_index_unique_ptr.release(); + hnsw_index->addr = reinterpret_cast(hnsw_index_ptr); +} + +template +void _extend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + auto cpp_params = cuvs::neighbors::hnsw::extend_params(); + cpp_params.num_threads = params->num_threads; + + using additional_dataset_mdspan_type = raft::host_matrix_view; + auto additional_dataset_mds = + cuvs::core::from_dlpack(additional_dataset); + cuvs::neighbors::hnsw::extend(*res_ptr, cpp_params, additional_dataset_mds, *index_ptr); +} + template void _search(cuvsResources_t res, cuvsHnswSearchParams params, @@ -44,7 +82,7 @@ void _search(cuvsResources_t res, auto search_params = cuvs::neighbors::hnsw::search_params(); search_params.ef = params.ef; - search_params.num_threads = params.numThreads; + search_params.num_threads = params.num_threads; using queries_mdspan_type = raft::host_matrix_view; using neighbors_mdspan_type = raft::host_matrix_view; @@ -57,26 +95,42 @@ void _search(cuvsResources_t res, } template -void* _deserialize(cuvsResources_t res, const char* filename, int dim, cuvsDistanceType 
metric) +void _serialize(cuvsResources_t res, const char* filename, cuvsHnswIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + cuvs::neighbors::hnsw::serialize(*res_ptr, std::string(filename), *index_ptr); +} + +template +void* _deserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, + const char* filename, + int dim, + cuvsDistanceType metric) { auto res_ptr = reinterpret_cast(res); cuvs::neighbors::hnsw::index* index = nullptr; - cuvs::neighbors::hnsw::deserialize(*res_ptr, std::string(filename), dim, metric, &index); + auto cpp_params = cuvs::neighbors::hnsw::index_params(); + cpp_params.hierarchy = static_cast(params->hierarchy); + cuvs::neighbors::hnsw::deserialize( + *res_ptr, cpp_params, std::string(filename), dim, metric, &index); return index; } } // namespace -extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params) +extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) { - return cuvs::core::translate_exceptions( - [=] { *params = new cuvsHnswSearchParams{.ef = 200, .numThreads = 0}; }); + return cuvs::core::translate_exceptions([=] { + *params = new cuvsHnswIndexParams{ + .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 2}; + }); } -extern "C" cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params) +extern "C" cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params) { return cuvs::core::translate_exceptions([=] { delete params; }); } - extern "C" cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index) { return cuvs::core::translate_exceptions([=] { *index = new cuvsHnswIndex{}; }); @@ -101,6 +155,66 @@ extern "C" cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index_c_ptr) }); } +extern "C" cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params) +{ + return cuvs::core::translate_exceptions( + [=] { *params = new cuvsHnswExtendParams{.num_threads = 0}; }); +} + +extern 
"C" cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + +extern "C" cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) +{ + return cuvs::core::translate_exceptions([=] { + auto index = *cagra_index; + hnsw_index->dtype = index.dtype; + if (index.dtype.code == kDLFloat) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else if (index.dtype.code == kDLUInt) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else if (index.dtype.code == kDLInt) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else { + RAFT_FAIL("Unsupported dtype: %d", index.dtype.code); + } + }); +} + +extern "C" cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat) { + _extend(res, params, additional_dataset, *index); + } else if (index->dtype.code == kDLUInt) { + _extend(res, params, additional_dataset, *index); + } else if (index->dtype.code == kDLInt) { + _extend(res, params, additional_dataset, *index); + } else { + RAFT_FAIL("Unsupported dtype: %d", index->dtype.code); + } + }); +} + +extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params) +{ + return cuvs::core::translate_exceptions( + [=] { *params = new cuvsHnswSearchParams{.ef = 200, .num_threads = 0}; }); +} + +extern "C" cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res, cuvsHnswSearchParams_t params, cuvsHnswIndex_t index_c_ptr, @@ -140,7 +254,25 @@ extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res, }); } +extern "C" cuvsError_t cuvsHnswSerialize(cuvsResources_t res, 
+ const char* filename, + cuvsHnswIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat) { + _serialize(res, filename, *index); + } else if (index->dtype.code == kDLInt) { + _serialize(res, filename, *index); + } else if (index->dtype.code == kDLUInt) { + _serialize(res, filename, *index); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} + extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char* filename, int dim, cuvsDistanceType metric, @@ -148,11 +280,14 @@ extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, { return cuvs::core::translate_exceptions([=] { if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else { RAFT_FAIL("Unsupported dtype in file %s", filename); } diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp index 9b3da75a4..98ef3fdd3 100644 --- a/cpp/src/neighbors/iface/iface.hpp +++ b/cpp/src/neighbors/iface/iface.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/test/neighbors/ann_hnsw_c.cu b/cpp/test/neighbors/ann_hnsw_c.cu index fc740b924..2a6401b1d 100644 --- a/cpp/test/neighbors/ann_hnsw_c.cu +++ b/cpp/test/neighbors/ann_hnsw_c.cu @@ -111,7 +111,9 @@ TEST(CagraHnswC, 
BuildSearch) cuvsHnswIndex_t hnsw_index; cuvsHnswIndexCreate(&hnsw_index); hnsw_index->dtype = index->dtype; - cuvsHnswDeserialize(res, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index); + cuvsHnswIndexParams_t hnsw_params; + cuvsHnswIndexParamsCreate(&hnsw_params); + cuvsHnswDeserialize(res, hnsw_params, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index); // search index cuvsHnswSearchParams_t search_params; diff --git a/cpp/test/neighbors/hnsw.cu b/cpp/test/neighbors/hnsw.cu index 9fb88be05..20ee83a11 100644 --- a/cpp/test/neighbors/hnsw.cu +++ b/cpp/test/neighbors/hnsw.cu @@ -108,7 +108,8 @@ class AnnHNSWTest : public ::testing::TestWithParam { cuvs::neighbors::hnsw::search_params search_params; search_params.ef = ps.ef; - auto hnsw_index = cuvs::neighbors::hnsw::from_cagra(handle_, index); + cuvs::neighbors::hnsw::index_params hnsw_params; + auto hnsw_index = cuvs::neighbors::hnsw::from_cagra(handle_, hnsw_params, index); auto queries_HNSW_view = raft::make_host_matrix_view(queries_h.data(), ps.n_queries, ps.dim); auto indices_HNSW_view = diff --git a/dependencies.yaml b/dependencies.yaml index e909ad0dc..80a7d2024 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -470,7 +470,6 @@ dependencies: common: - output_types: [conda, pyproject, requirements] packages: - - hnswlib=0.6.2 - nlohmann_json>=3.11.2 - glog>=0.6.0 - h5py>=3.8.0 diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst index 988e5b6f3..22ffc236d 100644 --- a/docs/source/c_api/neighbors_hnsw_c.rst +++ b/docs/source/c_api/neighbors_hnsw_c.rst @@ -26,6 +26,28 @@ Index :members: :content-only: +Index extend parameters +----------------------- + +.. doxygengroup:: hnsw_c_extend_params + :project: cuvs + :members: + :content-only: + +Index extend +------------ +.. doxygengroup:: hnsw_c_index_extend + :project: cuvs + :members: + :content-only: + +Index load +---------- +.. 
doxygengroup:: hnsw_c_index_load + :project: cuvs + :members: + :content-only: + Index search ------------ diff --git a/docs/source/cpp_api/neighbors_hnsw.rst b/docs/source/cpp_api/neighbors_hnsw.rst index b0af88af0..00dd3a213 100644 --- a/docs/source/cpp_api/neighbors_hnsw.rst +++ b/docs/source/cpp_api/neighbors_hnsw.rst @@ -27,10 +27,25 @@ Index :members: :content-only: -Index load +Index extend parameters +----------------------- + +.. doxygengroup:: hnsw_cpp_extend_params + :project: cuvs + :members: + :content-only: + +Index extend ------------ +.. doxygengroup:: hnsw_cpp_index_extend + :project: cuvs + :members: + :content-only: -.. doxygengroup:: hnsw_cpp_index_search +Index load +---------- + +.. doxygengroup:: hnsw_cpp_index_load :project: cuvs :members: :content-only: @@ -43,10 +58,10 @@ Index search :members: :content-only: -Index deserialize +Index serialize --------------- -.. doxygengroup:: hnsw_cpp_index_deserialize +.. doxygengroup:: hnsw_cpp_index_serialize :project: cuvs :members: :content-only: diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.py b/python/cuvs/cuvs/neighbors/hnsw/__init__.py index 5efcdf68b..fafff7d03 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/__init__.py +++ b/python/cuvs/cuvs/neighbors/hnsw/__init__.py @@ -13,10 +13,23 @@ # limitations under the License. 
-from .hnsw import Index, SearchParams, from_cagra, load, save, search +from .hnsw import ( + ExtendParams, + Index, + IndexParams, + SearchParams, + extend, + from_cagra, + load, + save, + search, +) __all__ = [ + "IndexParams", "Index", + "ExtendParams", + "extend", "SearchParams", "load", "save", diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd index 1cdc97406..e0c517933 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd @@ -20,14 +20,25 @@ from libc.stdint cimport int32_t, uintptr_t from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor from cuvs.distance_type cimport cuvsDistanceType +from cuvs.neighbors.cagra.cagra cimport cuvsCagraIndex_t cdef extern from "cuvs/neighbors/hnsw.h" nogil: - ctypedef struct cuvsHnswSearchParams: - int32_t ef - int32_t numThreads - ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t + ctypedef enum cuvsHnswHierarchy: + NONE + CPU + + ctypedef struct cuvsHnswIndexParams: + cuvsHnswHierarchy hierarchy + int32_t ef_construction + int32_t num_threads + + ctypedef cuvsHnswIndexParams* cuvsHnswIndexParams_t + + cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) + + cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params) ctypedef struct cuvsHnswIndex: uintptr_t addr @@ -39,6 +50,31 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index) + ctypedef struct cuvsHnswExtendParams: + int32_t num_threads + + ctypedef cuvsHnswExtendParams* cuvsHnswExtendParams_t + + cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params) + + cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params) + + cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) except + + + cuvsError_t 
cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* data, + cuvsHnswIndex_t index) except + + + ctypedef struct cuvsHnswSearchParams: + int32_t ef + int32_t num_threads + + ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t + cuvsError_t cuvsHnswSearch(cuvsResources_t res, cuvsHnswSearchParams* params, cuvsHnswIndex_t index, @@ -46,7 +82,12 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: DLManagedTensor* neighbors, DLManagedTensor* distances) except + + cuvsError_t cuvsHnswSerialize(cuvsResources_t res, + const char * filename, + cuvsHnswIndex_t index) except + + cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char * filename, int32_t dim, cuvsDistanceType metric, diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index bcfaf167e..4c44350e8 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -39,41 +39,63 @@ from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -cdef class SearchParams: +cdef class IndexParams: """ - HNSW search parameters + Parameters to build index for HNSW nearest neighbor search Parameters ---------- - ef: int, default = 200 - Maximum number of candidate list size used during search. - num_threads: int, default = 0 - Number of CPU threads used to increase search parallelism. - When set to 0, the number of threads is automatically determined - using OpenMP's `omp_get_max_threads()`. + hierarchy : string, default = "none" (optional) + The hierarchy of the HNSW index. Valid values are ["none", "cpu"]. + - "none": No hierarchy is built. + - "cpu": Hierarchy is built using CPU. + ef_construction : int, default = 200 (optional) + Maximum number of candidate list size used during construction + when hierarchy is `cpu`. 
+ num_threads : int, default = 2 (optional) + Number of CPU threads used to increase construction parallelism + when hierarchy is `cpu`. + NOTE: Constructing the hierarchy when converting from a CAGRA graph + is highly sensitive to parallelism, and increasing the number of + threads can reduce the quality of the index. """ - cdef cuvsHnswSearchParams params + cdef cuvsHnswIndexParams* params + + def __cinit__(self): + check_cuvs(cuvsHnswIndexParamsCreate(&self.params)) + + def __dealloc__(self): + check_cuvs(cuvsHnswIndexParamsDestroy(self.params)) def __init__(self, *, - ef=200, - num_threads=0): - self.params.ef = ef - self.params.numThreads = num_threads + hierarchy="none", + ef_construction=200, + num_threads=2): + if hierarchy == "none": + self.params.hierarchy = cuvsHnswHierarchy.NONE + elif hierarchy == "cpu": + self.params.hierarchy = cuvsHnswHierarchy.CPU + else: + raise ValueError("Invalid hierarchy type." + " Valid values are 'none' and 'cpu'.") + self.params.ef_construction = ef_construction + self.params.num_threads = num_threads - def __repr__(self): - attr_str = [attr + "=" + str(getattr(self, attr)) - for attr in [ - "ef", "num_threads"]] - return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")" + @property + def hierarchy(self): + if self.params.hierarchy == cuvsHnswHierarchy.NONE: + return "none" + elif self.params.hierarchy == cuvsHnswHierarchy.CPU: + return "cpu" @property - def ef(self): - return self.params.ef + def ef_construction(self): + return self.params.ef_construction @property def num_threads(self): - return self.params.numThreads + return self.params.num_threads cdef class Index: @@ -103,13 +125,44 @@ cdef class Index: return "Index(type=HNSW, metric=L2" + (", ".join(attr_str)) + ")" +cdef class ExtendParams: + """ + Parameters to extend the HNSW index with new data + + Parameters + ---------- + num_threads : int, default = 0 (optional) + Number of CPU threads used to increase construction parallelism. 
+ When set to 0, the number of threads is automatically determined. + """ + + cdef cuvsHnswExtendParams* params + + def __cinit__(self): + check_cuvs(cuvsHnswExtendParamsCreate(&self.params)) + + def __dealloc__(self): + check_cuvs(cuvsHnswExtendParamsDestroy(self.params)) + + def __init__(self, *, + num_threads=0): + self.params.num_threads = num_threads + + @property + def num_threads(self): + return self.params.num_threads + + @auto_sync_resources -def save(filename, cagra.Index index, resources=None): +def save(filename, Index index, resources=None): """ Saves the CAGRA index to a file as an hnswlib index. - The saved index is immutable and can only be searched by the hnswlib - wrapper in cuVS, as the format is not compatible with the original - hnswlib. + If the index was constructed with `hnsw.IndexParams(hierarchy="none")`, + then the saved index is immutable and can only be searched by the hnswlib + wrapper in cuVS, as the format is not compatible with the original hnswlib. + However, if the index was constructed with + `hnsw.IndexParams(hierarchy="cpu")`, then the saved index is mutable and + compatible with the original hnswlib. Saving / loading the index is experimental. The serialization format is subject to change. @@ -119,7 +172,7 @@ def save(filename, cagra.Index index, resources=None): filename : string Name of the file. index : Index - Trained CAGRA index. + Trained HNSW index. {resources_docstring} Examples @@ -131,23 +184,28 @@ def save(filename, cagra.Index index, resources=None): >>> dataset = cp.random.random_sample((n_samples, n_features), ... 
dtype=cp.float32) >>> # Build index - >>> index = cagra.build(cagra.IndexParams(), dataset) + >>> cagra_index = cagra.build(cagra.IndexParams(), dataset) >>> # Serialize and deserialize the cagra index built - >>> hnsw.save("my_index.bin", index) + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), cagra_index) + >>> hnsw.save("my_index.bin", hnsw_index) """ cdef string c_filename = filename.encode('utf-8') cdef cuvsResources_t res = resources.get_c_obj() - check_cuvs(cagra.cuvsCagraSerializeToHnswlib(res, - c_filename.c_str(), - index.index)) + check_cuvs(cuvsHnswSerialize(res, + c_filename.c_str(), + index.index)) @auto_sync_resources -def load(filename, dim, dtype, metric="sqeuclidean", resources=None): +def load(IndexParams index_params, filename, dim, dtype, metric="sqeuclidean", + resources=None): """ - Loads base-layer-only hnswlib index from file, which was originally - saved as a built CAGRA index. The loaded index is immutable and can only - be searched by the hnswlib wrapper in cuVS, as the format is not + Loads an HNSW index. + If the index was constructed with `hnsw.IndexParams(hierarchy="none")`, + then the loaded index is immutable and can only be searched by the hnswlib + wrapper in cuVS, as the format is not compatible with the original hnswlib. + However, if the index was constructed with + `hnsw.IndexParams(hierarchy="cpu")`, then the loaded index is mutable and compatible with the original hnswlib. Saving / loading the index is experimental. The serialization format is @@ -156,6 +214,8 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): Parameters ---------- + index_params : IndexParams + Parameters that were used to convert CAGRA index to HNSW index. filename : string Name of the file. 
dim : int @@ -214,6 +274,7 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): check_cuvs(cuvsHnswDeserialize( res, + index_params.params, c_filename.c_str(), dim, distance_type, @@ -224,26 +285,30 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): @auto_sync_resources -def from_cagra(cagra.Index index, temporary_index_path=None, resources=None): +def from_cagra(IndexParams index_params, cagra.Index cagra_index, + temporary_index_path=None, resources=None): """ - Returns an hnsw base-layer-only index from a CAGRA index. - - NOTE: This method uses the filesystem to write the CAGRA index in - `/tmp/.bin` or the parameter `temporary_index_path` - if not None before reading it as an hnsw index, - then deleting the temporary file. The returned index is immutable - and can only be searched by the hnsw wrapper in cuVS, as the - format is not compatible with the original hnswlib library. - By `base_layer_only`, we mean that the hnsw index is created - without the additional layers that are used for the hierarchical - search in hnswlib. Instead, the base layer is used for the search. + Returns an HNSW index from a CAGRA index. + + NOTE: When `index_params.hierarchy` is: + 1. `NONE`: This method uses the filesystem to write the CAGRA index + in `/tmp/.bin` before reading it as an + hnswlib index, then deleting the temporary file. The + returned index is immutable and can only be searched by + the hnswlib wrapper in cuVS, as the format is not + compatible with the original hnswlib. + 2. `CPU`: The returned index is mutable and can be extended with + additional vectors. The serialized index is also compatible + with the original hnswlib library. Saving / loading the index is experimental. The serialization format is subject to change. Parameters ---------- - index : Index + index_params : IndexParams + Parameters to convert the CAGRA index to HNSW index. + cagra_index : cagra.Index Trained CAGRA index. 
temporary_index_path : string, default = None Path to save the temporary index file. If None, the temporary file @@ -262,18 +327,107 @@ def from_cagra(cagra.Index index, temporary_index_path=None, resources=None): >>> # Build index >>> index = cagra.build(cagra.IndexParams(), dataset) >>> # Serialize the CAGRA index to hnswlib base layer only index format - >>> hnsw_index = hnsw.from_cagra(index) + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index) """ - uuid_num = uuid.uuid4() - filename = temporary_index_path if temporary_index_path else \ - f"/tmp/{uuid_num}.bin" - save(filename, index, resources=resources) - hnsw_index = load(filename, index.dim, np.dtype(index.active_index_type), - "sqeuclidean", resources=resources) - os.remove(filename) + + cdef Index hnsw_index = Index() + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cuvsHnswFromCagra( + res, + index_params.params, + cagra_index.index, + hnsw_index.index + )) + + hnsw_index.trained = True return hnsw_index +@auto_sync_resources +def extend(ExtendParams extend_params, Index index, data, resources=None): + """ + Extends the HNSW index with new data. + + Parameters + ---------- + extend_params : ExtendParams + index : Index + Trained HNSW index. 
+ data : Host array interface compliant matrix shape (n_samples, dim) + Supported dtype [float32, int8, uint8] + {resources_docstring} + + Examples + -------- + >>> import numpy as np + >>> from cuvs.neighbors import hnsw, cagra + >>> + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = np.random.random_sample((n_samples, n_features)) + >>> + >>> # Build index + >>> index = cagra.build(hnsw.IndexParams(), dataset) + >>> # Load index + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(hierarchy="cpu"), index) + >>> # Extend the index with new data + >>> new_data = np.random.random_sample((n_samples, n_features)) + >>> hnsw.extend(hnsw.ExtendParams(), hnsw_index, new_data) + """ + + data_ai = wrap_array(data) + _check_input_array(data_ai, [np.dtype('float32'), + np.dtype('uint8'), + np.dtype('int8')]) + + cdef cydlpack.DLManagedTensor* data_dlpack = cydlpack.dlpack_c(data_ai) + cdef cuvsResources_t res = resources.get_c_obj() + + check_cuvs(cuvsHnswExtend( + res, + extend_params.params, + data_dlpack, + index.index + )) + + +cdef class SearchParams: + """ + HNSW search parameters + + Parameters + ---------- + ef: int, default = 200 + Maximum number of candidate list size used during search. + num_threads: int, default = 0 + Number of CPU threads used to increase search parallelism. + When set to 0, the number of threads is automatically determined + using OpenMP's `omp_get_max_threads()`. 
+ """ + + cdef cuvsHnswSearchParams params + + def __init__(self, *, + ef=200, + num_threads=0): + self.params.ef = ef + self.params.num_threads = num_threads + + def __repr__(self): + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in [ + "ef", "num_threads"]] + return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")" + + @property + def ef(self): + return self.params.ef + + @property + def num_threads(self): + return self.params.num_threads + + @auto_sync_resources @auto_convert_output def search(SearchParams search_params, @@ -290,15 +444,15 @@ def search(SearchParams search_params, ---------- search_params : SearchParams index : Index - Trained CAGRA index. - queries : CUDA array interface compliant matrix shape (n_samples, dim) + Trained HNSW index. + queries : CPU array interface compliant matrix shape (n_samples, dim) Supported dtype [float, int] k : int The number of neighbors. - neighbors : Optional CUDA array interface compliant matrix shape + neighbors : Optional CPU array interface compliant matrix shape (n_queries, k), dtype uint64_t. If supplied, neighbor indices will be written here in-place. (default None) - distances : Optional CUDA array interface compliant matrix shape + distances : Optional CPU array interface compliant matrix shape (n_queries, k) If supplied, the distances to the neighbors will be written here in-place. (default None) {resources_docstring} @@ -323,7 +477,7 @@ def search(SearchParams search_params, ... num_threads=0 ... ) >>> # Convert CAGRA index to HNSW - >>> hnsw_index = hnsw.from_cagra(index) + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index) >>> # Using a pooling allocator reduces overhead of temporary array >>> # creation during search. This is useful if multiple searches >>> # are performed with same query size. 
diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py index 20a35401e..20f583ae8 100644 --- a/python/cuvs/cuvs/test/test_hnsw.py +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -32,6 +32,7 @@ def run_hnsw_build_search_test( build_algo="ivf_pq", intermediate_graph_degree=128, graph_degree=64, + hierarchy="none", search_params={}, ): dataset = generate_data((n_rows, n_cols), dtype) @@ -53,7 +54,8 @@ def run_hnsw_build_search_test( assert index.trained - hnsw_index = hnsw.from_cagra(index) + hnsw_params = hnsw.IndexParams(hierarchy=hierarchy, num_threads=1) + hnsw_index = hnsw.from_cagra(hnsw_params, index) queries = generate_data((n_queries, n_cols), dtype) @@ -83,10 +85,93 @@ def run_hnsw_build_search_test( @pytest.mark.parametrize("num_threads", [2, 4]) @pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) -def test_hnsw(dtype, k, ef, num_threads, metric, build_algo): +@pytest.mark.parametrize("hierarchy", ["none", "cpu"]) +def test_hnsw(dtype, k, ef, num_threads, metric, build_algo, hierarchy): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only sqeuclidean metric here. 
run_hnsw_build_search_test( + dtype=dtype, + k=k, + metric=metric, + build_algo=build_algo, + hierarchy=hierarchy, + search_params={"ef": ef, "num_threads": num_threads}, + ) + + +def run_hnsw_extend_test( + n_rows=10000, + add_rows=2000, + n_cols=10, + n_queries=100, + k=10, + dtype=np.float32, + metric="sqeuclidean", + build_algo="ivf_pq", + intermediate_graph_degree=128, + graph_degree=64, + search_params={}, +): + dataset = generate_data((n_rows, n_cols), dtype) + add_dataset = generate_data((add_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + add_dataset = normalize(add_dataset, norm="l2", axis=1) + if dtype in [np.int8, np.uint8]: + pytest.skip( + "inner_product metric is not supported for int8/uint8 data" + ) + if build_algo == "nn_descent": + pytest.skip("inner_product metric is not supported for nn_descent") + + build_params = cagra.IndexParams( + metric=metric, + intermediate_graph_degree=intermediate_graph_degree, + graph_degree=graph_degree, + build_algo=build_algo, + ) + + index = cagra.build(build_params, dataset) + + assert index.trained + + hnsw_params = hnsw.IndexParams(hierarchy="cpu", num_threads=1) + hnsw_index = hnsw.from_cagra(hnsw_params, index) + hnsw.extend(hnsw.ExtendParams(), hnsw_index, add_dataset) + + queries = generate_data((n_queries, n_cols), dtype) + + search_params = hnsw.SearchParams(**search_params) + + out_dist, out_idx = hnsw.search(search_params, hnsw_index, queries, k) + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "euclidean": "euclidean", + }[metric] + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(np.vstack([dataset, add_dataset])) + skl_dist, skl_idx = nn_skl.kneighbors(queries, return_distance=True) + + recall = calc_recall(out_idx, skl_idx) + print(recall) + assert recall > 0.95 + + +@pytest.mark.parametrize("dtype", [np.float32, 
np.int8, np.uint8]) +@pytest.mark.parametrize("k", [10, 20]) +@pytest.mark.parametrize("ef", [30, 40]) +@pytest.mark.parametrize("num_threads", [2, 4]) +@pytest.mark.parametrize("metric", ["sqeuclidean"]) +@pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) +def test_hnsw_extend(dtype, k, ef, num_threads, metric, build_algo): + # Note that inner_product tests use normalized input which we cannot + # represent in int8, therefore we test only sqeuclidean metric here. + run_hnsw_extend_test( dtype=dtype, k=k, metric=metric, diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml index f1a7f272c..90a561bca 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml @@ -4,8 +4,11 @@ constraints: groups: base: build: - graph_degree: [32, 64, 128, 256] + graph_degree: [32, 64, 96, 128] intermediate_graph_degree: [32, 64, 96, 128] graph_build_algo: ["NN_DESCENT"] + hierarchy: ["none", "cpu"] + ef_construction: [64, 128, 256, 512] + num_threads: [2, 5, 10] search: ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800] From e0aebfd0c499189585319f5a5fbf46dfb9ce04f9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 3 Dec 2024 14:41:47 -0600 Subject: [PATCH 37/47] add a README for wheels (#504) Wheel-building CI jobs are failing like this: > Checking final_dist/cuvs_cu12-25.2.0a26-cp310-cp310-manylinux_2_28_aarch64.whl: FAILED due to warnings > WARNING `long_description` missing. > Error: Process completed with exit code 1. 
([build link](https://github.com/rapidsai/cuvs/actions/runs/12133882036)) Looks like the root cause is a combination of the following: * there was a new `twine` release (6.x) 3 days ago: https://pypi.org/project/twine/#history * it contains https://github.com/pypa/twine/pull/1168, which makes `twine check --strict` fail if the wheel's `long_description` is empty * the `cuvs` wheel README (used as the wheel `long_description`) is empty This proposes adding a small README, with just 2 sentences copied from the project's root-level README, to get past that check. ## Notes for Reviewers The `long_description` becomes the project homepage when a project is hosted on PyPI. The wheels produced from this repo aren't currently being published to pypi.org so this change won't be seen there, but a more user-friendly README should be added if/when we decide to publish `cuvs-cu{11,12}` to pypi.org. ref: https://github.com/rapidsai/build-planning/issues/70 --- python/cuvs/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cuvs/README.md b/python/cuvs/README.md index e69de29bb..27b494811 100644 --- a/python/cuvs/README.md +++ b/python/cuvs/README.md @@ -0,0 +1,3 @@ +# cuVS + +cuVS contains state-of-the-art implementations of several algorithms for running approximate nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering. From fbbca0570db27d476b500ef021c03482b0d989e2 Mon Sep 17 00:00:00 2001 From: Micka Date: Wed, 4 Dec 2024 00:57:52 +0100 Subject: [PATCH 38/47] Add Question Retrieval notebook using Milvus (#451) This notebook is adapting the Question Retrieval nb to use Milvus. It can serve as a good example on how to do Bulk ingest, how to use cuVS, and especially CAGRA+HNSW on Milvus Authors: - Micka (https://github.com/lowener) - Corey J. 
Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/451 --- ...ectorSearch_QuestionRetrieval_Milvus.ipynb | 732 ++++++++++++++++++ 1 file changed, 732 insertions(+) create mode 100644 notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb diff --git a/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb new file mode 100644 index 000000000..09a6cca43 --- /dev/null +++ b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f5499b54", + "metadata": {}, + "source": [ + "\n", + "# Similar Questions Retrieval - Milvus - CAGRA-HNSW\n", + "\n", + "This notebook is inspired by the [similar search example of Sentence-Transformers](https://www.sbert.net/examples/applications/semantic-search/README.html#similar-questions-retrieval), and adapted to be used with [Milvus](https://milvus.io) and [cuVS](https://rapids.ai/cuvs/).\n", + "\n", + "The model was pre-trained on the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions). It consists of about 100k real Google search queries, together with an annotated passage from Wikipedia that provides the answer. It is an example of an asymmetric search task. As corpus, we use the smaller [Simple English Wikipedia](http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz) so that it fits easily into memory.\n", + "\n", + "The steps to install the latest Milvus package are available in the [Milvus documentation](https://milvus.io/docs/quickstart.md)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8d55ede", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:21.149465Z", + "iopub.status.busy": "2024-11-08T14:47:21.149218Z", + "iopub.status.idle": "2024-11-08T14:47:23.440275Z", + "shell.execute_reply": "2024-11-08T14:47:23.439436Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "!pip install sentence_transformers torch pymilvus pymilvus[bulk_writer] dask dask[distributed]\n", + "\n", + "# Note: if you have a Hopper based GPU, like an H100, use these to install:\n", + "# pip install torch --index-url https://download.pytorch.org/whl/cu118\n", + "# pip install sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb1e81c3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:23.444058Z", + "iopub.status.busy": "2024-11-08T14:47:23.443683Z", + "iopub.status.idle": "2024-11-08T14:47:24.219903Z", + "shell.execute_reply": "2024-11-08T14:47:24.219228Z" + } + }, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee4c5cc0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:24.223131Z", + "iopub.status.busy": "2024-11-08T14:47:24.222874Z", + "iopub.status.idle": "2024-11-08T14:47:34.024085Z", + "shell.execute_reply": "2024-11-08T14:47:34.023435Z" + } + }, + "outputs": [], + "source": [ + "import dask.array as da\n", + "import gzip\n", + "import json\n", + "import math\n", + "import numpy as np\n", + "import os\n", + "import pymilvus\n", + "import time\n", + "import torch\n", + "\n", + "from minio import Minio\n", + "from multiprocessing import Process\n", + "from sentence_transformers import SentenceTransformer, CrossEncoder, util\n", + "from typing import List\n", + "\n", + "\n", + "from pymilvus import (\n", + " connections, utility\n", + ")\n", + "from pymilvus.bulk_writer import LocalBulkWriter, 
BulkFileType # pip install pymilvus[bulk_writer]\n", + "\n", + "if not torch.cuda.is_available():\n", + " print(\"Warning: No GPU found. Please add GPU to your notebook\")" + ] + }, + { + "cell_type": "markdown", + "id": "47cabaca", + "metadata": {}, + "source": [ + "# Setup Milvus Collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcd259c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:34.027677Z", + "iopub.status.busy": "2024-11-08T14:47:34.027288Z", + "iopub.status.idle": "2024-11-08T14:47:34.109212Z", + "shell.execute_reply": "2024-11-08T14:47:34.108609Z" + } + }, + "outputs": [], + "source": [ + "DIM = 768\n", + "MILVUS_PORT = 30004\n", + "MILVUS_HOST = f\"http://localhost:{MILVUS_PORT}\"\n", + "ID_FIELD=\"id\"\n", + "EMBEDDING_FIELD=\"embedding\"\n", + "\n", + "collection_name = \"simple_wiki\"\n", + "\n", + "def get_milvus_client():\n", + " return pymilvus.MilvusClient(uri=MILVUS_HOST)\n", + "\n", + "client = get_milvus_client()\n", + "\n", + "fields = [\n", + " pymilvus.FieldSchema(name=ID_FIELD, dtype=pymilvus.DataType.INT64, is_primary=True),\n", + " pymilvus.FieldSchema(name=EMBEDDING_FIELD, dtype=pymilvus.DataType.FLOAT_VECTOR, dim=DIM)\n", + "]\n", + "\n", + "schema = pymilvus.CollectionSchema(fields)\n", + "schema.verify()\n", + "\n", + "if collection_name in client.list_collections():\n", + " print(f\"Collection '{collection_name}' already exists. 
Deleting collection...\")\n", + " client.drop_collection(collection_name)\n", + "\n", + "client.create_collection(collection_name, schema=schema, dimension=DIM, vector_field_name=EMBEDDING_FIELD)\n", + "collection = pymilvus.Collection(name=collection_name, using=client._using)\n", + "collection.release()\n", + "collection.drop_index()\n" + ] + }, + { + "cell_type": "markdown", + "id": "00bd20f5", + "metadata": {}, + "source": [ + "# Setup Sentence Transformer model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a1a6307", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:34.111782Z", + "iopub.status.busy": "2024-11-08T14:47:34.111556Z", + "iopub.status.idle": "2024-11-08T14:47:39.654323Z", + "shell.execute_reply": "2024-11-08T14:47:39.653386Z" + } + }, + "outputs": [], + "source": [ + "# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search\n", + "model_name = 'nq-distilbert-base-v1'\n", + "bi_encoder = SentenceTransformer(model_name)\n", + "\n", + "# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only\n", + "# about 170k articles. 
We split these articles into paragraphs and encode them with the bi-encoder\n", + "\n", + "wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'\n", + "\n", + "if not os.path.exists(wikipedia_filepath):\n", + " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n", + "\n", + "passages = []\n", + "with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:\n", + " for line in fIn:\n", + " data = json.loads(line.strip())\n", + " for paragraph in data['paragraphs']:\n", + " # We encode the passages as [title, text]\n", + " passages.append([data['title'], paragraph])\n", + "\n", + "# If you like, you can also limit the number of passages you want to use\n", + "print(\"Passages:\", len(passages))\n", + "\n", + "# To speed things up, pre-computed embeddings are downloaded.\n", + "# The provided file encoded the passages with the model 'nq-distilbert-base-v1'\n", + "if model_name == 'nq-distilbert-base-v1':\n", + " embeddings_filepath = 'simplewiki-2020-11-01-nq-distilbert-base-v1.pt'\n", + " if not os.path.exists(embeddings_filepath):\n", + " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01-nq-distilbert-base-v1.pt', embeddings_filepath)\n", + "\n", + " corpus_embeddings = torch.load(embeddings_filepath, map_location='cpu', weights_only=True).float() # Convert embedding file to float\n", + " #if torch.cuda.is_available():\n", + " # corpus_embeddings = corpus_embeddings.to('cuda')\n", + "else: # Here, we compute the corpus_embeddings from scratch (which can take a while depending on the GPU)\n", + " corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True).to('cpu')" + ] + }, + { + "cell_type": "markdown", + "id": "1f4e9b9d", + "metadata": {}, + "source": [ + "# Vector Search using Milvus and RAPIDS cuVS \n", + "Now that our embeddings are ready to be indexed and that the model has been loaded, we can use Milvus and RAPIDS cuVS to do our vector search.\n", + "\n", + 
"This is done in 3 steps: First we ingest all the vectors in the Milvus collection, then we build the Milvus index, to finally search it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "563751c1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:39.658832Z", + "iopub.status.busy": "2024-11-08T14:47:39.658374Z", + "iopub.status.idle": "2024-11-08T14:49:47.244768Z", + "shell.execute_reply": "2024-11-08T14:49:47.244162Z" + } + }, + "outputs": [], + "source": [ + "# minio\n", + "MINIO_PORT = 30009\n", + "MINIO_URL = f\"localhost:{MINIO_PORT}\"\n", + "MINIO_SECRET_KEY = \"minioadmin\"\n", + "MINIO_ACCESS_KEY = \"minioadmin\"\n", + "\n", + "def upload_to_minio(file_paths: List[List[str]], remote_paths: List[List[str]], bucket_name=\"milvus-bucket\"):\n", + " minio_client = Minio(endpoint=MINIO_URL, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)\n", + " if not minio_client.bucket_exists(bucket_name):\n", + " minio_client.make_bucket(bucket_name)\n", + "\n", + " for local_batch, remote_batch in zip(file_paths, remote_paths):\n", + " for local_file, remote_file in zip(local_batch, remote_batch):\n", + " minio_client.fput_object(bucket_name, \n", + " object_name=remote_file,\n", + " file_path=local_file,\n", + " part_size=512 * 1024 * 1024,\n", + " num_parallel_uploads=5)\n", + " \n", + " \n", + "def ingest_data_bulk(collection_name, vectors, schema: pymilvus.CollectionSchema, log_times=True, bulk_writer_type=\"milvus\", debug=False):\n", + " print(f\"- Ingesting {len(vectors) // 1000}k vectors, Bulk\")\n", + " tic = time.perf_counter()\n", + " collection = pymilvus.Collection(collection_name, using=get_milvus_client()._using)\n", + " remote_path = None\n", + "\n", + " if bulk_writer_type == 'milvus':\n", + " # # Prepare source data for faster ingestion\n", + " writer = LocalBulkWriter(\n", + " schema=schema,\n", + " local_path='bulk_data',\n", + " segment_size=512 * 1024 * 1024, # Default value\n", 
+ " file_type=BulkFileType.NPY\n", + " )\n", + " for id, vec in enumerate(vectors):\n", + " writer.append_row({ID_FIELD: id, EMBEDDING_FIELD: vec})\n", + "\n", + " if debug:\n", + " print(writer.batch_files)\n", + " def callback(file_list):\n", + " if debug:\n", + " print(f\" - Commit successful\")\n", + " print(file_list)\n", + " writer.commit(call_back=callback)\n", + " files_to_upload = writer.batch_files\n", + " elif bulk_writer_type == 'dask':\n", + " # Prepare source data for faster ingestion\n", + " if not os.path.isdir(\"bulk_data\"):\n", + " os.mkdir(\"bulk_data\")\n", + "\n", + " from dask.distributed import Client, LocalCluster\n", + " cluster = LocalCluster(n_workers=1, threads_per_worker=1)\n", + " client = Client(cluster)\n", + "\n", + " chunk_size = 100000\n", + " da_vectors = da.from_array(vectors, chunks=(chunk_size, vectors.shape[1]))\n", + " da_ids = da.arange(len(vectors), chunks=(chunk_size,))\n", + " da.to_npy_stack(\"bulk_data/da_embedding/\", da_vectors)\n", + " da.to_npy_stack(\"bulk_data/da_id/\", da_ids)\n", + " files_to_upload = []\n", + " remote_path = []\n", + " for chunk_nb in range(math.ceil(len(vectors) / chunk_size)):\n", + " files_to_upload.append([f\"bulk_data/da_embedding/{chunk_nb}.npy\", f\"bulk_data/da_id/{chunk_nb}.npy\"])\n", + " remote_path.append([f\"bulk_data/da_{chunk_nb}/embedding.npy\", f\"bulk_data/da__{chunk_nb}/id.npy\"])\n", + "\n", + " elif bulk_writer_type == 'numpy':\n", + " # Directly save NPY files\n", + " np.save(\"bulk_data/embedding.npy\", vectors)\n", + " np.save(\"bulk_data/id.npy\", np.arange(len(vectors)))\n", + " files_to_upload = [[\"bulk_data/embedding.npy\", \"bulk_data/id.npy\"]]\n", + " else:\n", + " raise ValueError(\"Invalid bulk writer type\")\n", + " \n", + " toc = time.perf_counter()\n", + " if log_times:\n", + " print(f\" - File save time: {toc - tic:.2f} seconds\")\n", + " # Import data\n", + " if remote_path is None:\n", + " remote_path = files_to_upload\n", + " 
upload_to_minio(files_to_upload, remote_path)\n", + " \n", + " job_ids = [utility.do_bulk_insert(collection_name, batch, using=get_milvus_client()._using) for batch in remote_path]\n", + "\n", + " while True:\n", + " tasks = [utility.get_bulk_insert_state(job_id, using=get_milvus_client()._using) for job_id in job_ids]\n", + " success = all(task.state_name == \"Completed\" for task in tasks)\n", + " failure = any(task.state_name == \"Failed\" for task in tasks)\n", + " for i in range(len(tasks)):\n", + " task = tasks[i]\n", + " if debug:\n", + " print(f\" - Task {i}/{len(tasks)} state: {task.state_name}, Progress percent: {task.infos['progress_percent']}, Imported row count: {task.row_count}\")\n", + " if task.state_name == \"Failed\":\n", + " print(task)\n", + " if success or failure:\n", + " break\n", + " time.sleep(2)\n", + "\n", + " added_entities = str(sum([task.row_count for task in tasks]))\n", + " failure = failure or added_entities != str(len(vectors))\n", + " if failure:\n", + " print(f\"- Ingestion failed. Added entities: {added_entities}\")\n", + " toc = time.perf_counter()\n", + " if log_times:\n", + " datasize = vectors.nbytes / 1024 / 1024\n", + " print(f\"- Ingestion time: {toc - tic:.2f} seconds. 
({(datasize / (toc-tic)):.2f}MB/s)\")\n", + "\n", + "ingest_data_bulk(collection_name, np.array(corpus_embeddings), schema, bulk_writer_type='dask', log_times=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad90b4be", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:49:47.247498Z", + "iopub.status.busy": "2024-11-08T14:49:47.247268Z", + "iopub.status.idle": "2024-11-08T14:50:00.737502Z", + "shell.execute_reply": "2024-11-08T14:50:00.736808Z" + } + }, + "outputs": [], + "source": [ + "# Setups the IVFPQ index\n", + "\n", + "index_params = dict(\n", + " index_type=\"GPU_IVF_PQ\",\n", + " metric_type=\"L2\",\n", + " params={\"nlist\": 150, # Number of clusters\n", + " \"m\": 96}) # Product Quantization dimension\n", + "\n", + "# Drop the index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the index\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + "toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. 
({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c75acea7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:00.740443Z", + "iopub.status.busy": "2024-11-08T14:50:00.740142Z", + "iopub.status.idle": "2024-11-08T14:50:00.745403Z", + "shell.execute_reply": "2024-11-08T14:50:00.744672Z" + } + }, + "outputs": [], + "source": [ + "# Search the index\n", + "def search_cuvs_pq(query, top_k = 5, n_probe = 30):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + " search_params = {\"nprobe\": n_probe}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "markdown", + "id": "07935bca", + "metadata": {}, + "source": [ + "The ideal use-case for the IVF-PQ algorithm is when there is a need to reduce the memory footprint while keeping a good accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27d4715", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:00.748001Z", + "iopub.status.busy": "2024-11-08T14:50:00.747783Z", + "iopub.status.idle": "2024-11-08T14:50:01.785914Z", + "shell.execute_reply": "2024-11-08T14:50:01.785223Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc375518", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.788877Z", + "iopub.status.busy": "2024-11-08T14:50:01.788640Z", + "iopub.status.idle": "2024-11-08T14:50:01.813820Z", + "shell.execute_reply": "2024-11-08T14:50:01.813153Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab154181", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.816625Z", + "iopub.status.busy": "2024-11-08T14:50:01.816362Z", + "iopub.status.idle": "2024-11-08T14:50:01.839593Z", + "shell.execute_reply": "2024-11-08T14:50:01.838986Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "836344ec", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.842319Z", + "iopub.status.busy": "2024-11-08T14:50:01.842022Z", + "iopub.status.idle": "2024-11-08T14:50:15.969324Z", + "shell.execute_reply": "2024-11-08T14:50:15.968562Z" + } + }, + "outputs": [], + "source": [ + "# Drop the current index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the IVF Flat index\n", + "index_params = dict(\n", + " index_type=\"GPU_IVF_FLAT\",\n", + " metric_type=\"L2\",\n", + " params={\"nlist\": 150}) # Number of clusters)\n", + "tic = 
time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + "toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. ({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2d6017ed", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:15.972764Z", + "iopub.status.busy": "2024-11-08T14:50:15.972368Z", + "iopub.status.idle": "2024-11-08T14:50:15.977806Z", + "shell.execute_reply": "2024-11-08T14:50:15.977064Z" + } + }, + "outputs": [], + "source": [ + "def search_cuvs_flat(query, top_k = 5, n_probe = 30):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + " \n", + " search_params = {\"nprobe\": n_probe}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5cfb644", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:15.980796Z", + "iopub.status.busy": "2024-11-08T14:50:15.980408Z", + "iopub.status.idle": "2024-11-08T14:50:16.009271Z", + "shell.execute_reply": "2024-11-08T14:50:16.008579Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5694d00", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.012253Z", 
+ "iopub.status.busy": "2024-11-08T14:50:16.011924Z", + "iopub.status.idle": "2024-11-08T14:50:16.043432Z", + "shell.execute_reply": "2024-11-08T14:50:16.042751Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcfc3c5b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.046439Z", + "iopub.status.busy": "2024-11-08T14:50:16.046093Z", + "iopub.status.idle": "2024-11-08T14:50:16.071322Z", + "shell.execute_reply": "2024-11-08T14:50:16.070614Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "markdown", + "id": "a59d7b32-0832-4c3a-864e-aeb2e6e7fe1f", + "metadata": {}, + "source": [ + "## Using CAGRA: Hybrid GPU-CPU graph-based Vector Search\n", + "\n", + "CAGRA is a graph-based nearest neighbors implementation with state-of-the-art performance for both small- and large-batch sized vector searches. \n", + "\n", + "CAGRA follows the same steps as IVF-FLAT and IVF-PQ in Milvus, but is also able to be adapted for querying on CPU.\n", + "This means that CAGRA is able to profit from a high training speed on GPU, as well as a low inference time on CPU, which minimizes latency even on the smallest queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5ce4dab", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.074449Z", + "iopub.status.busy": "2024-11-08T14:50:16.074128Z", + "iopub.status.idle": "2024-11-08T14:50:30.479027Z", + "shell.execute_reply": "2024-11-08T14:50:30.478265Z" + } + }, + "outputs": [], + "source": [ + "# Drop the current index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the CAGRA index\n", + "index_params = dict(\n", + " index_type=\"GPU_CAGRA\",\n", + " metric_type=\"L2\",\n", + " params={\"graph_degree\": 64, \"intermediate_graph_degree\": 128, \"build_algo\": \"NN_DESCENT\", \"adapt_for_cpu\": True})\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + "toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. 
({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "df229e21-f6b6-4d6c-ad54-2724f8738934", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.481748Z", + "iopub.status.busy": "2024-11-08T14:50:30.481474Z", + "iopub.status.idle": "2024-11-08T14:50:30.486324Z", + "shell.execute_reply": "2024-11-08T14:50:30.485696Z" + } + }, + "outputs": [], + "source": [ + "def search_cuvs_cagra(query, top_k = 5, itopk = 32):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + " search_params = {\"params\": {\"itopk\": itopk, \"ef\": 35}}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5e862fd-b7e5-4423-8fbf-36918f02c8f3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.489077Z", + "iopub.status.busy": "2024-11-08T14:50:30.488790Z", + "iopub.status.idle": "2024-11-08T14:50:30.513998Z", + "shell.execute_reply": "2024-11-08T14:50:30.513319Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb8a5b7b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.516748Z", + "iopub.status.busy": "2024-11-08T14:50:30.516521Z", + "iopub.status.idle": "2024-11-08T14:50:30.538982Z", + "shell.execute_reply": 
"2024-11-08T14:50:30.538269Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c89810a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.541508Z", + "iopub.status.busy": "2024-11-08T14:50:30.541287Z", + "iopub.status.idle": "2024-11-08T14:50:30.562722Z", + "shell.execute_reply": "2024-11-08T14:50:30.562085Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"What is creating tides?\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From acbd097ed15afe186367b5a46a5d4b366ac9d804 Mon Sep 17 00:00:00 2001 From: tsuki <12711693+enp1s0@users.noreply.github.com> Date: Wed, 4 Dec 2024 18:06:17 +0900 Subject: [PATCH 39/47] [BUG] Fix CAGRA filter (#489) Ref : https://github.com/rapidsai/cuvs/issues/472 ## The cause of the bug The bitonic sort was used on an array that was not a power of 2 long. In the current search implementation, the bitonic sort is used to move the invalid elements to the end of the buffer as: https://github.com/rapidsai/cuvs/blob/5062594138a40231475299c7bac61083b0669fd1/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh#L758-L763 https://github.com/rapidsai/cuvs/blob/5062594138a40231475299c7bac61083b0669fd1/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh#L644-L649 The problem is that the (max) array length (=`MAX_ITOPK + MAX_CANDIDATES`) is not always the power of two. 
These bitonic sorts are called even if no elements are filtered out unless `cuvs::neighbors::filtering::none_sample_filter` is specified as the filter, so #472 occurs. ## Fix This PR changes the filtering process so that the bitonic sort is not used to move the invalid elements to the end of the buffer. Authors: - tsuki (https://github.com/enp1s0) Approvers: - Artem M. Chirkin (https://github.com/achirkin) URL: https://github.com/rapidsai/cuvs/pull/489 --- .../detail/cagra/search_single_cta.cuh | 16 +- .../cagra/search_single_cta_kernel-inl.cuh | 182 +++++++++++++----- cpp/test/neighbors/ann_cagra.cuh | 6 +- 3 files changed, 153 insertions(+), 51 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 2bed19009..fa71dbaf9 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -129,17 +129,27 @@ struct search : search_plan_impl { (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width + sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t); - smem_size = base_smem_size; + + std::uint32_t additional_smem_size = 0; if (num_itopk_candidates > 256) { // Tentatively calculate the required share memory size when radix // sort based topk is used, assuming the block size is the maximum. 
if (itopk_size <= 256) { - smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t); + additional_smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t); } else { - smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t); + additional_smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t); } } + if (!std::is_same_v) { + // For filtering postprocess + using scan_op_t = cub::WarpScan; + additional_smem_size = + std::max(additional_smem_size, sizeof(scan_op_t::TempStorage)); + } + + smem_size = base_smem_size + additional_smem_size; + uint32_t block_size = thread_block_size; if (block_size == 0) { block_size = min_block_size; diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 79cb6bc10..678ed0cb4 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -111,7 +111,7 @@ RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const termin } template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_full( float* candidate_distances, // [num_candidates] IdxT* candidate_indices, // [num_candidates] const std::uint32_t num_candidates, @@ -215,7 +215,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( } template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge( float* itopk_distances, // [num_itopk] IdxT* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, @@ -424,7 +424,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge( float* itopk_distances, // [num_itopk] IdxT* 
itopk_indices, // [num_itopk] const std::uint32_t num_itopk, @@ -437,20 +437,62 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( const unsigned MULTI_WARPS_2) { // The results in candidate_distances/indices are sorted by bitonic sort. - topk_by_bitonic_sort_1st( + topk_by_bitonic_sort_and_full( candidate_distances, candidate_indices, num_candidates, num_itopk, MULTI_WARPS_1); // The results sorted above are merged with the internal intermediate top-k // results so far using bitonic merge. - topk_by_bitonic_sort_2nd(itopk_distances, - itopk_indices, - num_itopk, - candidate_distances, - candidate_indices, - num_candidates, - work_buf, - first, - MULTI_WARPS_2); + topk_by_bitonic_sort_and_merge(itopk_distances, + itopk_indices, + num_itopk, + candidate_distances, + candidate_indices, + num_candidates, + work_buf, + first, + MULTI_WARPS_2); +} + +// This function moves the invalid index element to the end of the itopk list. +// Requires: array_length % 32 == 0 and there is only one invalid entry. 
+template +RAFT_DEVICE_INLINE_FUNCTION void move_invalid_to_end_of_list(IdxT* const index_array, + float* const distance_array, + const std::uint32_t array_length) +{ + constexpr std::uint32_t warp_size = 32; + constexpr std::uint32_t invalid_index = utils::get_max_value(); + const std::uint32_t lane_id = threadIdx.x % warp_size; + + if (threadIdx.x >= warp_size) { return; } + + bool found_invalid = false; + if (array_length % warp_size == 0) { + for (std::uint32_t i = lane_id; i < array_length; i += warp_size) { + const auto index = index_array[i]; + const auto distance = distance_array[i]; + + if (found_invalid) { + index_array[i - 1] = index; + distance_array[i - 1] = distance; + } else { + // Check if the index is invalid + const auto I_found_invalid = (index == invalid_index); + const auto who_has_invalid = raft::ballot(I_found_invalid); + // if a value that is loaded by a smaller lane id thread, shift the array + if (who_has_invalid << (warp_size - lane_id)) { + index_array[i - 1] = index; + distance_array[i - 1] = distance; + } + + found_invalid = who_has_invalid; + } + } + } + if (lane_id == 0) { + index_array[array_length - 1] = invalid_index; + distance_array[array_length - 1] = utils::get_max_value(); + } } template @@ -589,10 +631,10 @@ __device__ void search_core( // sort if constexpr (TOPK_BY_BITONIC_SORT) { // [Notice] - // It is good to use multiple warps in topk_by_bitonic_sort() when + // It is good to use multiple warps in topk_by_bitonic_sort_and_merge() when // batch size is small (short-latency), but it might not be always good // when batch size is large (high-throughput). - // topk_by_bitonic_sort() consists of two operations: + // topk_by_bitonic_sort_and_merge() consists of two operations: // if MAX_CANDIDATES is greater than 128, the first operation uses two warps; // if MAX_ITOPK is greater than 256, the second operation used two warps. const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 
1 : 0; @@ -601,9 +643,9 @@ __device__ void search_core( // reset small-hash table. if ((iter + 1) % small_hash_reset_interval == 0) { // Depending on the block size and the number of warps used in - // topk_by_bitonic_sort(), determine which warps are used to reset + // topk_by_bitonic_sort_and_merge(), determine which warps are used to reset // the small hash and whether they are performed in overlap with - // topk_by_bitonic_sort(). + // topk_by_bitonic_sort_and_merge(). _CLK_START(); unsigned hash_start_tid; if (blockDim.x == 32) { @@ -627,28 +669,28 @@ __device__ void search_core( // topk with bitonic sort _CLK_START(); - if (std::is_same::value || - *filter_flag == 0) { - topk_by_bitonic_sort(result_distances_buffer, - result_indices_buffer, - internal_topk, - result_distances_buffer + internal_topk, - result_indices_buffer + internal_topk, - search_width * graph_degree, - topk_ws, - (iter == 0), - multi_warps_1, - multi_warps_2); - __syncthreads(); - } else { - topk_by_bitonic_sort_1st( - result_distances_buffer, - result_indices_buffer, - internal_topk + search_width * graph_degree, - internal_topk, - false); + if (!(std::is_same::value || + *filter_flag == 0)) { + // Move the filtered out index to the end of the itopk list + for (unsigned i = 0; i < search_width; i++) { + move_invalid_to_end_of_list( + result_indices_buffer, result_distances_buffer, internal_topk); + } + if (threadIdx.x == 0) { *terminate_flag = 0; } } + topk_by_bitonic_sort_and_merge( + result_distances_buffer, + result_indices_buffer, + internal_topk, + result_distances_buffer + internal_topk, + result_indices_buffer + internal_topk, + search_width * graph_degree, + topk_ws, + (iter == 0), + multi_warps_1, + multi_warps_2); + __syncthreads(); _CLK_REC(clk_topk); } else { _CLK_START(); @@ -755,12 +797,66 @@ __device__ void search_core( } __syncthreads(); - topk_by_bitonic_sort_1st( - result_distances_buffer, - result_indices_buffer, - internal_topk + search_width * graph_degree, - top_k, - 
false); + // Move invalid index items to the end of the buffer without sorting the entire buffer + using scan_op_t = cub::WarpScan; + auto& temp_storage = *reinterpret_cast(smem_work_ptr); + + constexpr std::uint32_t warp_size = 32; + if (threadIdx.x < warp_size) { + std::uint32_t num_found_valid = 0; + for (std::uint32_t buffer_offset = 0; buffer_offset < internal_topk; + buffer_offset += warp_size) { + // Calculate the new buffer index + const auto src_position = buffer_offset + threadIdx.x; + const std::uint32_t is_valid_index = + (result_indices_buffer[src_position] & (~index_msb_1_mask)) == invalid_index ? 0 : 1; + std::uint32_t new_position; + scan_op_t(temp_storage).InclusiveSum(is_valid_index, new_position); + if (is_valid_index) { + const auto dst_position = num_found_valid + (new_position - 1); + result_indices_buffer[dst_position] = result_indices_buffer[src_position]; + result_distances_buffer[dst_position] = result_distances_buffer[src_position]; + } + + // Calculate the largest valid position within a warp and bcast it for the next iteration + num_found_valid += new_position; + for (std::uint32_t offset = (warp_size >> 1); offset > 0; offset >>= 1) { + const auto v = raft::shfl_xor(num_found_valid, offset); + if ((threadIdx.x & offset) == 0) { num_found_valid = v; } + } + + // If the enough number of items are found, do early termination + if (num_found_valid >= top_k) { break; } + } + + if (num_found_valid < top_k) { + // Fill the remaining buffer with invalid values so that `topk_by_bitonic_sort_and_merge` is + // usable in the next step + for (std::uint32_t i = num_found_valid + threadIdx.x; i < internal_topk; i += warp_size) { + result_indices_buffer[i] = invalid_index; + result_distances_buffer[i] = utils::get_max_value(); + } + } + } + + // If the sufficient number of valid indexes are not in the internal topk, pick up from the + // candidate list. 
+ if (top_k > internal_topk || result_indices_buffer[top_k - 1] == invalid_index) { + __syncthreads(); + const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0; + const unsigned multi_warps_2 = ((blockDim.x >= 64) && (MAX_ITOPK > 256)) ? 1 : 0; + topk_by_bitonic_sort_and_merge( + result_distances_buffer, + result_indices_buffer, + internal_topk, + result_distances_buffer + internal_topk, + result_indices_buffer + internal_topk, + search_width * graph_degree, + topk_ws, + (iter == 0), + multi_warps_1, + multi_warps_2); + } __syncthreads(); } diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 660246c67..8d5701439 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -758,11 +758,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; - - // TODO: setting search_params.itopk_size here breaks the filter tests, but is required for - // k>1024 skip these tests until fixed - if (ps.k >= 1024) { GTEST_SKIP(); } - // search_params.itopk_size = ps.itopk_size; + search_params.itopk_size = ps.itopk_size; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); From a96b72086320ff1dab7b843c67a3c96352a7563d Mon Sep 17 00:00:00 2001 From: Ajit Mistry <55892788+ajit283@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:18:45 +0100 Subject: [PATCH 40/47] [WIP] Add pinned memory resource to C API (#311) Let me know if this is out of scope for cuVS! Authors: - Ajit Mistry (https://github.com/ajit283) - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/cuvs/pull/311 --- cpp/include/cuvs/core/c_api.h | 16 ++++++++++++++++ cpp/src/core/c_api.cpp | 16 ++++++++++++++++ cpp/test/core/c_api.c | 9 +++++++++ 3 files changed, 41 insertions(+) diff --git a/cpp/include/cuvs/core/c_api.h b/cpp/include/cuvs/core/c_api.h index c8c8d3934..400d162ad 100644 --- a/cpp/include/cuvs/core/c_api.h +++ b/cpp/include/cuvs/core/c_api.h @@ -151,6 +151,22 @@ cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, */ cuvsError_t cuvsRMMMemoryResourceReset(); +/** + * @brief Allocates pinned memory on the host using RMM + * @param[out] ptr Pointer to allocated host memory + * @param[in] bytes Size in bytes to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes); + +/** + * @brief Deallocates pinned memory on the host using RMM + * @param[in] ptr Pointer to allocated host memory to free + * @param[in] bytes Size in bytes to deallocate + * @return cuvsError_t + */ +cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes); + /** @} */ #ifdef __cplusplus diff --git a/cpp/src/core/c_api.cpp b/cpp/src/core/c_api.cpp index cfbeed2d5..4333bff0c 100644 --- a/cpp/src/core/c_api.cpp +++ b/cpp/src/core/c_api.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) @@ -130,6 +131,21 @@ extern "C" cuvsError_t cuvsRMMMemoryResourceReset() }); } +thread_local std::unique_ptr pinned_mr; + +extern "C" cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes) +{ + return cuvs::core::translate_exceptions([=] { + if (pinned_mr == nullptr) { pinned_mr = std::make_unique(); } + *ptr = pinned_mr->allocate(bytes); + }); +} + +extern "C" cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes) +{ + return cuvs::core::translate_exceptions([=] { pinned_mr->deallocate(ptr, bytes); }); +} + thread_local std::string last_error_text = 
""; extern "C" const char* cuvsGetLastErrorText() diff --git a/cpp/test/core/c_api.c b/cpp/test/core/c_api.c index a3dae6004..a51824d2b 100644 --- a/cpp/test/core/c_api.c +++ b/cpp/test/core/c_api.c @@ -73,6 +73,15 @@ int main() error = cuvsRMMMemoryResourceReset(); if (error == CUVS_ERROR) { exit(EXIT_FAILURE); } + // Alloc memory on host (pinned) + void* ptr3; + cuvsError_t alloc_error_pinned = cuvsRMMHostAlloc(&ptr3, 1024); + if (alloc_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); } + + // Free memory + cuvsError_t free_error_pinned = cuvsRMMHostFree(ptr3, 1024); + if (free_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); } + // Destroy resources error = cuvsResourcesDestroy(res); if (error == CUVS_ERROR) { exit(EXIT_FAILURE); } From 9fb21adc82e625deb7cc0f20b68c0f42902246f1 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:38:05 +0100 Subject: [PATCH 41/47] Dynamic Batching (#261) Non-blocking / stream-ordered dynamic batching as a new index type. ## API This PR implements dynamic batching as a new index type, mirroring the API of other indices. * [_building is wrapping_] Building the index means creating a lightweight wrapper on top of an existing index and initializing necessary components, such as IO batch buffers and synchronization primitives. * [_type erasure_] The underlying/upstream index type is erased once the dynamic_batching wrapper is created, i.e. there's no way to recover the original search index type or parameters. * [_explicit control over batching_] To allow multiple user requests group into a dynamic batch request, the users must use copies of the same dynamic batching index (the user-facing index type is a thin wrapper on top of a shared pointer, hence the copy is shallow and cheap). The search function is thread-safe. 
## Feature: stream-ordered dynamic batching Non-blocking / stream-ordered dynamic batching means the batching does not involve synchronizing with a GPU stream. The control is returned to the user as soon as the necessary work is submitted to the GPU. This entails a few good-to-know features: 1. The dynamic batching index has the same blocking properties as the upstream index: if the upstream index does not involve stream sync during search, then the dynamic batching index does not involve it either (otherwise, the dynamic batching search obviously waits till the upstream search synchronizes under the hood). 2. It's the responsibility of the user to synchronize the stream before getting the results back - even if the upstream index search does not need it (the batch results are scattered back to the request threads in a post-processing kernel). 3. If the upstream index does not synchronize during search, the dynamic batching index can group the queries even in a single-threaded application (_try it with --no-lap-sync option in the ann-bench benchmarks_). Overall, stream-ordered dynamic batching makes it easy to modify existing cuVS indexes, because the wrapped index has the same execution behavior as the upstream index. ## Work-in-progress TODO - [x] Add dynamic batching option to more indices in ann-bench - [x] Add tests - [x] **(postponed to 25.02)** Do proper benchmarking and possibly fine-tune the inter-thread communication - [x] Review the API side (`cpp/include/cuvs/neighbors/dynamic_batching.hpp`) [ready for review CC @cjnolet] - [x] Review the algorithm side (`cpp/src/neighbors/detail/dynamic_batching.cuh`) [ready for preliminary review: requests for algorithm docstring/clarifications are especially welcome] Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) - Corey J.
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/261 --- cpp/CMakeLists.txt | 1 + .../src/cuvs/cuvs_ann_bench_param_parser.h | 26 + cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h | 97 +- cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h | 40 +- cpp/include/cuvs/neighbors/cagra.hpp | 4 + .../cuvs/neighbors/dynamic_batching.hpp | 290 ++++ cpp/include/cuvs/neighbors/ivf_flat.hpp | 4 + cpp/include/cuvs/neighbors/ivf_pq.hpp | 3 + cpp/src/neighbors/detail/dynamic_batching.cuh | 1197 +++++++++++++++++ cpp/src/neighbors/dynamic_batching.cu | 91 ++ cpp/test/CMakeLists.txt | 13 + cpp/test/neighbors/dynamic_batching.cuh | 292 ++++ .../neighbors/dynamic_batching/test_cagra.cu | 84 ++ .../dynamic_batching/test_ivf_flat.cu | 44 + .../neighbors/dynamic_batching/test_ivf_pq.cu | 41 + docs/source/cpp_api/neighbors.rst | 1 + .../cpp_api/neighbors_dynamic_batching.rst | 45 + examples/cpp/CMakeLists.txt | 4 + examples/cpp/src/dynamic_batching_example.cu | 282 ++++ 19 files changed, 2539 insertions(+), 20 deletions(-) create mode 100644 cpp/include/cuvs/neighbors/dynamic_batching.hpp create mode 100644 cpp/src/neighbors/detail/dynamic_batching.cuh create mode 100644 cpp/src/neighbors/dynamic_batching.cu create mode 100644 cpp/test/neighbors/dynamic_batching.cuh create mode 100644 cpp/test/neighbors/dynamic_batching/test_cagra.cu create mode 100644 cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu create mode 100644 cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu create mode 100644 docs/source/cpp_api/neighbors_dynamic_batching.rst create mode 100644 examples/cpp/src/dynamic_batching_example.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 34b7cb898..6af423bd5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -397,6 +397,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/iface/iface_pq_uint8_t_int64_t.cu src/neighbors/detail/cagra/cagra_build.cpp src/neighbors/detail/cagra/topk_for_cagra/topk.cu + src/neighbors/dynamic_batching.cu 
$<$:src/neighbors/hnsw.cpp> src/neighbors/ivf_flat_index.cpp src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 57d5b1910..7617bfa66 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -56,6 +56,26 @@ extern template class cuvs::bench::cuvs_cagra; #include "cuvs_mg_cagra_wrapper.h" #endif +template +void parse_dynamic_batching_params(const nlohmann::json& conf, ParamT& param) +{ + if (!conf.value("dynamic_batching", false)) { return; } + param.dynamic_batching = true; + if (conf.contains("dynamic_batching_max_batch_size")) { + param.dynamic_batching_max_batch_size = conf.at("dynamic_batching_max_batch_size"); + } + param.dynamic_batching_conservative_dispatch = + conf.value("dynamic_batching_conservative_dispatch", false); + if (conf.contains("dynamic_batching_dispatch_timeout_ms")) { + param.dynamic_batching_dispatch_timeout_ms = conf.at("dynamic_batching_dispatch_timeout_ms"); + } + if (conf.contains("dynamic_batching_n_queues")) { + param.dynamic_batching_n_queues = conf.at("dynamic_batching_n_queues"); + } + param.dynamic_batching_k = + uint32_t(uint32_t(conf.at("k")) * float(conf.value("refine_ratio", 1.0f))); +} + #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG) template void parse_build_param(const nlohmann::json& conf, @@ -138,6 +158,9 @@ void parse_search_param(const nlohmann::json& conf, param.refine_ratio = conf.at("refine_ratio"); if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } } + + // enable dynamic batching + parse_dynamic_batching_params(conf, param); } #endif @@ -291,5 +314,8 @@ void parse_search_param(const nlohmann::json& conf, } // Same ratio as in IVF-PQ param.refine_ratio = conf.value("refine_ratio", 1.0f); + + // enable dynamic batching + 
parse_dynamic_batching_params(conf, param); } #endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h index f6d3d60fc..8c9cb2d4f 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,13 @@ class cuvs_cagra : public algo, public algo_gpu { AllocatorType graph_mem = AllocatorType::kDevice; AllocatorType dataset_mem = AllocatorType::kDevice; [[nodiscard]] auto needs_dataset() const -> bool override { return true; } + /* Dynamic batching */ + bool dynamic_batching = false; + int64_t dynamic_batching_k; + int64_t dynamic_batching_max_batch_size = 4; + double dynamic_batching_dispatch_timeout_ms = 0.01; + size_t dynamic_batching_n_queues = 8; + bool dynamic_batching_conservative_dispatch = false; }; struct build_param { @@ -173,6 +181,12 @@ class cuvs_cagra : public algo, public algo_gpu { std::shared_ptr> dataset_; std::shared_ptr> input_dataset_v_; + std::shared_ptr> dynamic_batcher_; + cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{}; + int64_t dynamic_batching_max_batch_size_; + size_t dynamic_batching_n_queues_; + bool dynamic_batching_conservative_dispatch_; + inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) { switch (mem_type) { @@ -216,26 +230,33 @@ inline auto allocator_to_string(AllocatorType mem_type) -> std::string template void cuvs_cagra::set_search_param(const search_param_base& param) { - auto sp = dynamic_cast(param); - search_params_ = sp.p; - refine_ratio_ = sp.refine_ratio; + auto sp = dynamic_cast(param); + bool needs_dynamic_batcher_update = + (dynamic_batching_max_batch_size_ != sp.dynamic_batching_max_batch_size) || + (dynamic_batching_n_queues_ != sp.dynamic_batching_n_queues) || + (dynamic_batching_conservative_dispatch_ != sp.dynamic_batching_conservative_dispatch); + dynamic_batching_max_batch_size_ = 
sp.dynamic_batching_max_batch_size; + dynamic_batching_n_queues_ = sp.dynamic_batching_n_queues; + dynamic_batching_conservative_dispatch_ = sp.dynamic_batching_conservative_dispatch; + search_params_ = sp.p; + refine_ratio_ = sp.refine_ratio; if (sp.graph_mem != graph_mem_) { // Move graph to correct memory space graph_mem_ = sp.graph_mem; RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); // We create a new graph and copy to it from existing graph - auto mr = get_mr(graph_mem_); - auto new_graph = raft::make_device_mdarray( + auto mr = get_mr(graph_mem_); + *graph_ = raft::make_device_mdarray( handle_, mr, raft::make_extents(index_->graph().extent(0), index_->graph_degree())); - raft::copy(new_graph.data_handle(), + raft::copy(graph_->data_handle(), index_->graph().data_handle(), index_->graph().size(), raft::resource::get_cuda_stream(handle_)); - index_->update_graph(handle_, make_const_mdspan(new_graph.view())); - // update_graph() only stores a view in the index. We need to keep the graph object alive. - *graph_ = std::move(new_graph); + // NB: update_graph() only stores a view in the index. We need to keep the graph object alive. 
+ index_->update_graph(handle_, make_const_mdspan(graph_->view())); + needs_dynamic_batcher_update = true; } if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) { @@ -256,7 +277,26 @@ void cuvs_cagra::set_search_param(const search_param_base& param) dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1)); index_->update_dataset(handle_, dataset_view); - need_dataset_update_ = false; + need_dataset_update_ = false; + needs_dynamic_batcher_update = true; + } + + // dynamic batching + if (sp.dynamic_batching) { + if (!dynamic_batcher_ || needs_dynamic_batcher_update) { + dynamic_batcher_ = std::make_shared>( + handle_, + cuvs::neighbors::dynamic_batching::index_params{{}, + sp.dynamic_batching_k, + sp.dynamic_batching_max_batch_size, + sp.dynamic_batching_n_queues, + sp.dynamic_batching_conservative_dispatch}, + *index_, + search_params_); + } + dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms; + } else { + if (dynamic_batcher_) { dynamic_batcher_.reset(); } } } @@ -306,7 +346,7 @@ void cuvs_cagra::load(const std::string& file) template std::unique_ptr> cuvs_cagra::copy() { - return std::make_unique>(*this); // use copy constructor + return std::make_unique>(std::cref(*this)); // use copy constructor } template @@ -330,8 +370,17 @@ void cuvs_cagra::search_base(const T* queries, raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - cuvs::neighbors::cagra::search( - handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + if (dynamic_batcher_) { + cuvs::neighbors::dynamic_batching::search(handle_, + dynamic_batcher_sp_, + *dynamic_batcher_, + queries_view, + neighbors_view, + distances_view); + } else { + cuvs::neighbors::cagra::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + } if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { if 
(raft::get_device_for_address(neighbors) < 0 && @@ -367,11 +416,23 @@ void cuvs_cagra::search( const raft::resources& res = handle_; auto mem_type = raft::get_device_for_address(neighbors) >= 0 ? MemoryType::kDevice : MemoryType::kHostPinned; - auto& tmp_buf = get_tmp_buffer_from_global_pool( - ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) + - (kNeedsIoMapping ? sizeof(IdxT) : 0)) * - batch_size * k0); - auto* candidates_ptr = reinterpret_cast(tmp_buf.data(mem_type)); + + // If dynamic batching is used and there's no sync between benchmark laps, multiple sequential + // requests can group together. The data is copied asynchronously, and if the same intermediate + // buffer is used for multiple requests, they can override each other's data. Hence, we need to + // allocate as much space as required by the maximum number of sequential requests. + auto max_dyn_grouping = dynamic_batcher_ ? raft::div_rounding_up_safe( + dynamic_batching_max_batch_size_, batch_size) * + dynamic_batching_n_queues_ + : 1; + auto tmp_buf_size = ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) + + (kNeedsIoMapping ? sizeof(IdxT) : 0)) * + batch_size * k0; + auto& tmp_buf = get_tmp_buffer_from_global_pool(tmp_buf_size * max_dyn_grouping); + thread_local static int64_t group_id = 0; + auto* candidates_ptr = reinterpret_cast( + reinterpret_cast(tmp_buf.data(mem_type)) + tmp_buf_size * group_id); + group_id = (group_id + 1) % max_dyn_grouping; auto* candidate_dists_ptr = reinterpret_cast(candidates_ptr + (disable_refinement ? 
0 : batch_size * k0)); auto* neighbors_idx_t = diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h index 4c8a91f23..dac766669 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h @@ -19,7 +19,9 @@ #include "cuvs_ann_bench_utils.h" #include +#include #include + #include #include #include @@ -46,6 +48,13 @@ class cuvs_ivf_pq : public algo, public algo_gpu { cuvs::neighbors::ivf_pq::search_params pq_param; float refine_ratio = 1.0f; [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + /* Dynamic batching */ + bool dynamic_batching = false; + int64_t dynamic_batching_k; + int64_t dynamic_batching_max_batch_size = 128; + double dynamic_batching_dispatch_timeout_ms = 0.01; + size_t dynamic_batching_n_queues = 3; + bool dynamic_batching_conservative_dispatch = true; }; using build_param = cuvs::neighbors::ivf_pq::index_params; @@ -98,6 +107,9 @@ class cuvs_ivf_pq : public algo, public algo_gpu { int dimension_; float refine_ratio_ = 1.0; raft::device_matrix_view dataset_; + + std::shared_ptr> dynamic_batcher_; + cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{}; }; template @@ -138,6 +150,21 @@ void cuvs_ivf_pq::set_search_param(const search_param_base& param) search_params_ = sp.pq_param; refine_ratio_ = sp.refine_ratio; assert(search_params_.n_probes <= index_params_.n_lists); + + if (sp.dynamic_batching) { + dynamic_batcher_ = std::make_shared>( + handle_, + cuvs::neighbors::dynamic_batching::index_params{{}, + sp.dynamic_batching_k, + sp.dynamic_batching_max_batch_size, + sp.dynamic_batching_n_queues, + sp.dynamic_batching_conservative_dispatch}, + *index_, + search_params_); + dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms; + } else { + dynamic_batcher_.reset(); + } } template @@ -168,8 +195,17 @@ void cuvs_ivf_pq::search_base( 
raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - cuvs::neighbors::ivf_pq::search( - handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + if (dynamic_batcher_) { + cuvs::neighbors::dynamic_batching::search(handle_, + dynamic_batcher_sp_, + *dynamic_batcher_, + queries_view, + neighbors_view, + distances_view); + } else { + cuvs::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + } if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { raft::linalg::unaryOp(neighbors, diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 5ceb3010e..a4684ce26 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -272,6 +272,10 @@ static_assert(std::is_aggregate_v); */ template struct index : cuvs::neighbors::index { + using index_params_type = cagra::index_params; + using search_params_type = cagra::search_params; + using index_type = IdxT; + using value_type = T; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); diff --git a/cpp/include/cuvs/neighbors/dynamic_batching.hpp b/cpp/include/cuvs/neighbors/dynamic_batching.hpp new file mode 100644 index 000000000..410800357 --- /dev/null +++ b/cpp/include/cuvs/neighbors/dynamic_batching.hpp @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuvs::neighbors::dynamic_batching { + +namespace detail { +template +class batch_runner; +} + +/** + * @defgroup dynamic_batching_cpp_index_params Dynamic Batching index parameters + * @{ + */ +struct index_params : cuvs::neighbors::index_params { + /** The number of neighbors to search is fixed at construction time. */ + int64_t k; + /** Maximum size of the batch to submit to the upstream index. */ + int64_t max_batch_size = 100; + /** + * The number of independent request queues. + * + * Each queue is associated with a unique CUDA stream and IO device buffers. If the number of + * concurrent requests is high, using multiple queues allows to fill-in data and prepare the batch + * while the other queue is busy. Moreover, the queues are submitted concurrently; this allows to + * better utilize the GPU by hiding the kernel launch latencies, which helps to improve the + * throughput. + */ + size_t n_queues = 3; + /** + * By default (`conservative_dispatch = false`) the first CPU thread to commit a query to a batch + * dispatches the upstream search function as soon as possible (before the batch is full). In that + * case, it does not know the final batch size at the time of calling the upstream search and thus + * runs the upstream search with the maximum batch size every time, even if only one valid query + * is present in the batch. This reduces the latency at the cost of wasted GPU resources. + * + * The alternative behavior (`conservative_dispatch = true`) is more conservative: the dispatcher + * thread starts the kernel that gathers input queries, but waits till the batch is full or the + * waiting time is exceeded. Only then it acquires the actual batch size and launches the upstream + * search. As a result, less GPU resources are wasted at the cost of exposing upstream search + * latency.
+ * + * *Rule of Thumb*: + * for a large `max_batch_size` set `conservative_dispatch = true`, otherwise keep it disabled. + */ + bool conservative_dispatch = false; +}; +/** @} */ + +/** + * @defgroup dynamic_batching_cpp_search_params Dynamic Batching search parameters + * @{ + */ +struct search_params : cuvs::neighbors::search_params { + /** + * How long a request can stay in the queue (milliseconds). + * Note, this only affects the dispatch time and does not reflect full request latency; + * the latter depends on the upstream search parameters and the batch size. + */ + double dispatch_timeout_ms = 1.0; +}; +/** @} */ + +/** + * @defgroup dynamic_batching_cpp_index Dynamic Batching index type + * @{ + */ + +/** + * @brief Lightweight dynamic batching index wrapper + * + * @tparam T data type + * @tparam IdxT index type + * + * One lightweight dynamic batching index manages a single index and a single search parameter set. + * This structure should be shared among multiple users via copy semantics: access to the + * underlying implementation is managed via a shared pointer, and concurrent search among the + * participants is thread-safe. + * + * __Usage example__ + * @code{.cpp} + * using namespace cuvs::neighbors; + * // When creating a dynamic batching index, k parameter has to be passed explicitly. + * // The first empty braces default-initialize the parent `neighbors::index_params` (unused). + * dynamic_batching::index_params dynb_index_params{{}, k}; + * // Construct the index by wrapping the upstream index and search parameters. 
+ * dynamic_batching::index index{ + * res, dynb_index_params, upstream_index, upstream_search_params + * }; + * // Use default search parameters + * dynamic_batching::search_params search_params; + * // Search K nearest neighbours + * auto neighbors = raft::make_device_matrix(res, n_queries, k); + * auto distances = raft::make_device_matrix(res, n_queries, k); + * dynamic_batching::search( + * res, search_params, index, queries, neighbors.view(), distances.view() + * ); + * @endcode + * + * + * __Priority queues__ + * + * The dynamic batching index has limited support for prioritizing individual requests. + * There's only one pool of queues in the batcher and no functionality to prioritize one batch over + * the other. The `search_params::dispatch_timeout_ms` parameters passed in each request are + * aggregated internally and the batch is dispatched no later than any of the timeouts is exceeded. + * In this logic, a high-priority request can never be processed earlier than any lower-priority + * requests submitted earlier. + * + * However, dynamic batching indexes are lightweight and do not contain any global or static state. + * This means it's easy to combine multiple batchers. + * As an example, you can construct one batching index per priority class: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // Large batch size (128), couple queues (2), + * // enabled conservative dispatch - all for better throughput + * dynamic_batching::index_params low_priority_params{{}, k, 128, 2, true}; + * // Small batch size (16), more queues (4), + * // disabled conservative dispatch - to minimize latency with reasonable throughput + * dynamic_batching::index_params high_priority_params{{}, k, 16, 4, false}; + * // Construct the indexes by wrapping the upstream index and search parameters.
+ * dynamic_batching::index low_priority_index{ + * res, low_priority_params, upstream_index, upstream_search_params + * }; + * dynamic_batching::index high_priority_index{ + * res, high_priority_params, upstream_index, upstream_search_params + * }; + * // Define a combined search function with priority selection + * double high_priority_threshold_ms = 0.1; + * auto search_function = + * [low_priority_index, high_priority_index, high_priority_threshold_ms]( + * raft::resources const &res, + * dynamic_batching::search_params search_params, + * raft::device_matrix_view queries, + * raft::device_matrix_view neighbors, + * raft::device_matrix_view distances) { + * dynamic_batching::search( + * res, + * search_params, + * search_params.dispatch_timeout_ms < high_priority_threshold_ms + * ? high_priority_index : low_priority_index, + * queries, + * neighbors, + * distances + * ); + * }; + * @endcode + */ +template +struct index : cuvs::neighbors::index { + std::shared_ptr> runner; + + /** + * @brief Construct a dynamic batching index by wrapping the upstream index. 
+ * + * @tparam Upstream the upstream index type + * + * @param[in] res raft resources + * @param[in] params dynamic batching parameters + * @param[in] upstream_index the original index to perform the search + * (the reference must be alive for the lifetime of the dynamic batching index) + * @param[in] upstream_params the original index search parameters for all queries in a batch + * (the parameters are captured by value for the lifetime of the dynamic batching index) + * @param[in] sample_filter + * filtering function, if any, must be the same for all requests in a batch + * (the pointer must be alive for the lifetime of the dynamic batching index) + */ + template + index(const raft::resources& res, + const cuvs::neighbors::dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + const cuvs::neighbors::filtering::base_filter* sample_filter = nullptr); +}; +/** @} */ + +/** + * + * @defgroup dynamic_batching_cpp_search Dynamic Batching search + * + * @{ + */ + +/** + * @brief Search ANN using a dynamic batching index. + * + * The search parameters of the upstream index and the optional filtering function are configured at + * the dynamic batching index construction time. + * + * Like with many other indexes, the dynamic batching search has the stream-ordered semantics: the + * host function may return the control before the results are ready. Synchronize with the main CUDA + * stream in the given resource object to wait for arrival of the search results. + * + * Dynamic batching search is thread-safe: call the search function with copies of the same index in + * multiple threads to increase the occupancy of the batches. 
+ * + * @param[in] res + * @param[in] params query-specific batching parameters, such as the maximum waiting time + * @param[in] index a dynamic batching index + * @param[in] queries a device matrix view to a row-major matrix + * [n_queries, dim] + * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device matrix view to the distances to the selected neighbors + * [n_queries, k] + * + */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + 
cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @} */ + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/include/cuvs/neighbors/ivf_flat.hpp b/cpp/include/cuvs/neighbors/ivf_flat.hpp index 7f852d635..e017946d9 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat.hpp @@ -138,6 +138,10 @@ using list_data = ivf::list; */ template struct index : cuvs::neighbors::index { + using index_params_type = ivf_flat::index_params; + using search_params_type = ivf_flat::search_params; + using index_type = IdxT; + using value_type = T; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index ae543c9e9..d85753b7f 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -319,6 +319,9 @@ using list_data = ivf::list; */ template struct index : cuvs::neighbors::index { + using index_params_type = ivf_pq::index_params; + using search_params_type = ivf_pq::search_params; + using index_type = IdxT; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); diff --git 
a/cpp/src/neighbors/detail/dynamic_batching.cuh b/cpp/src/neighbors/detail/dynamic_batching.cuh new file mode 100644 index 000000000..5c6b1654e --- /dev/null +++ b/cpp/src/neighbors/detail/dynamic_batching.cuh @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../sample_filter.cuh" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef CUVS_SYSTEM_LITTLE_ENDIAN +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define CUVS_SYSTEM_LITTLE_ENDIAN 0 +#else +#define CUVS_SYSTEM_LITTLE_ENDIAN 1 +#endif +#endif + +namespace cuvs::neighbors::dynamic_batching::detail { + +using raft::RAFT_NAME; // TODO: a workaround for RAFT_LOG_XXX macros + +/** + * A helper to make the requester threads more cooperative when busy-spinning. + * It is used in the wait loops across this file to reduce the CPU usage. + * + * Ideally, we should be using atomics notify/wait feature, but that is not always possible + * (e.g. waiting on multiple things or waiting on GPU volatile stores). 
+ */ +struct local_waiter { + static constexpr inline int64_t kNonSleepIterations = 10; + + explicit local_waiter(std::chrono::nanoseconds base_sleep_time, + int64_t start_iteration = 0) noexcept + : base_sleep_time_{base_sleep_time}, iteration_{start_iteration} + { + } + + inline void wait() noexcept + { + if (iteration_ < 2) { + // Don't wait for the first few iterations: + // maybe there's a weak CAS op in the loop, or something else that could return quickly + } else if (iteration_ < kNonSleepIterations) { + std::this_thread::yield(); + } else { + auto k = iteration_ + 1 - kNonSleepIterations; + std::this_thread::sleep_for(base_sleep_time_ * k); + } + ++iteration_; + } + + inline void reset(int64_t start_iteration = 0) noexcept { iteration_ = start_iteration; } + + private: + std::chrono::nanoseconds base_sleep_time_; + int64_t iteration_; +}; + +class cuda_event { + public: + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + ~cuda_event() = default; + cuda_event(cuda_event const&) = delete; // Copying disallowed: one event one owner + cuda_event& operator=(cuda_event&) = delete; + + cuda_event() + : event_{[]() { + cudaEvent_t* e = new cudaEvent_t; + RAFT_CUDA_TRY(cudaEventCreateWithFlags(e, cudaEventDisableTiming)); + return e; + }(), + [](cudaEvent_t* e) { + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(*e)); + delete e; + }} + { + } + + cudaEvent_t value() const { return *event_; } + + private: + std::unique_ptr> event_; +}; + +template +struct get_accessor_type_t { + using type = typename MdSpanOrArray::accessor_type; +}; + +template +struct get_accessor_type_t> { + using mdarray_type = raft::mdarray; + using view_type = typename mdarray_type::view_type; + using type = typename view_type::accessor_type; +}; + +template +using get_accessor_type = typename get_accessor_type_t::type; + +template +constexpr inline auto slice_3d(typename Source3DT::index_type i, + const Source3DT& source3d, + typename Source3DT::index_type n_rows 
= 0) +{ + using element_type = typename Source3DT::element_type; + using index_type = typename Source3DT::index_type; + using layout_type = typename Source3DT::layout_type; + using accessor_type = get_accessor_type; + auto extent2d = + raft::make_extents(n_rows == 0 ? source3d.extent(1) : n_rows, source3d.extent(2)); + auto stride = uint64_t(source3d.extent(1)) * uint64_t(source3d.extent(2)); + return raft::mdspan{ + const_cast(source3d.data_handle()) + stride * i, extent2d}; +} + +template +constexpr inline auto slice_2d(typename Source2DT::index_type i, const Source2DT& source2d) +{ + using element_type = typename Source2DT::element_type; + using index_type = typename Source2DT::index_type; + using layout_type = typename Source2DT::layout_type; + using accessor_type = get_accessor_type; + auto extent1d = raft::make_extents(source2d.extent(1)); + auto stride = uint64_t(extent1d.extent(0)); + return raft::mdspan{ + const_cast(source2d.data_handle()) + stride * i, extent1d}; +} + +// --------------------------------------------- + +constexpr size_t kCacheLineBytes = 64; + +template +using upstream_search_type_const = void(raft::resources const&, + typename Upstream::search_params_type const&, + Upstream const&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template +using upstream_search_type = void(raft::resources const&, + typename Upstream::search_params_type const&, + Upstream&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template +using function_search_type = void(raft::resources const&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view); + +/** + * State of the batch token slot. + * + * In a nutshell, there are only two batch slot states that matter: empty or full. + * Initially, all slots are empty. The host threads can commit (i.e. 
subscribe) to a batch slot even + * if it's empty (when they know it will be filled-in at some point in future). With this logic, we + * smooth out the bottleneck that occurs when many threads try to submit their work using a single + * atomic counter (the batch queue head). + * + * Once a GPU IO buffer is available, its owner returns the buffer to the queue by marking a slot as + * full. By that time, it may be partially or fully committed (i.e. several host threads are + * committed to submit a certain number of queries). + * + * If we had an infinite buffer, these two states would suffice. However, we have a finite ring + * buffer, so the used-up slots must be emptied again, so that they are usable in the following + * rounds through the ring buffer. + * + * The slot state depends not only on the value stored in it, but on the accessing thread as well + * (see `batch_queue_t::batch_status` below). The accessing thread may be ahead or behind the others + * (as defined by the sequential order id below). Depending on the accessor state, it may view the + * slot as being emptied/filled in the future, current, or previous rounds. This affects the + * decision whether the slot can be used and whether the thread has the right to advance tail or + * head counters of the batch queue. + * + */ +enum struct slot_state : int32_t { + /** The slot is empty, cleared-up in this round (hence the head should be past it). */ + kEmptyPast = 1025, + /** The slot is empty, cleared-up in previous round. */ + kEmpty = 1024, + /** The slot is empty, cleared-up two round ago and cannot be used yet (due to be filled). */ + kEmptyBusy = 1023, + /** The current thread has been sleeping for too long and is way behind the others. */ + kFullPast = 1, + /** The slot is full, filled-in in this round. */ + kFull = 0, + /** This state is considered full, filled-in in previous round. */ + kFullBusy = -1 + /** The rest of the values are impossible states indicating an error in the algo. 
*/ +}; + +/** + * Identifies the batch and its job-commit state. + * Should be in the pinned memory for fast shared access on CPU and GPU side. + * + * The batch token packs the IO buffer address (id) and a number of committed queries in a single + * 64-bit atomic. This is to allow conflict-free atomic updates of both values. + * + */ +struct batch_token { + uint64_t value = 0; + + constexpr inline batch_token() {} + explicit constexpr inline batch_token(uint32_t buffer_id) { id() = buffer_id; } + + /** + * Sequential id of the batch in the array of batches. + * + * The `id` field, in practice, stores not only the IO buffer address, but also an extra + * sequential "round" id. The latter identifies how many rounds through the batch ring buffer have + * already been done (computed from the `seq_order_id` counter in the batch queue) and is used + * by `batch_queue_t::batch_status` below to compute the `slot_state`. This is to avoid the ABA + * atomic updates problem when using the ring buffer. + * + * There cannot be more IO buffers than the size of the ring buffer. The size of the ring buffer + * is always a power-of-two. Hence the IO buffer address needs only `log2(Size)` bits, and the + * rest is used for the ring buffer round id (see `batch_queue_t::make_seq_batch_id`). + * + */ + RAFT_INLINE_FUNCTION auto id() noexcept -> uint32_t& + { + return *(reinterpret_cast(&value) + kOffsetOfId); + } + /** + * How many queries are promised by the participating CPU threads (requesters). + * + * The CPU threads atomically increment this counter until its size reaches `max_batch_size`. + * + * Any thread (CPU or GPU) may atomically write to the highest byte of this value, which indicates + * that no one can commit to this batch anymore (e.g. the wait timeout is exceeded). + * Hence, the actual number of committed queries is `size_committed & 0x00ffffff`. + * + * The gather kernel cannot finish while `size_committed < max_batch_size`. 
+ * + * NB: we use the trick of writing to the highest byte to allow the GPU to write atomically to the pinned + * host memory. This way, we don't need to use device RMW atomics on host memory, which are not + * available on a broad class of GPUs. If not for this workaround, we could simply do atomic add/or + * with value 0x01000000. + */ + RAFT_INLINE_FUNCTION auto size_committed() noexcept -> uint32_t& + { + return *(reinterpret_cast(&value) + kOffsetOfSC); + } + + private: + /** Offset of the `id()` value in the token if it's interpreted as uint32_t[2]. */ + static constexpr inline uint32_t kOffsetOfId = CUVS_SYSTEM_LITTLE_ENDIAN; + /** Offset of the `size_committed()` value in the token if it's interpreted as uint32_t[2]. */ + static constexpr inline uint32_t kOffsetOfSC = 1 - kOffsetOfId; +}; +static_assert(sizeof(batch_token) == sizeof(uint64_t)); +static_assert(cuda::std::atomic::is_always_lock_free); + +/** + * The batch queue consists of several ring buffers and two counters determining where are the head + * and the tail of the queue in those buffers. + * + * There is an internal sequentially consistent order in the queue, defined by `seq_order_id` + * counter. The head and tail members define where the participants should look for full and + * empty slots in the queue respectively. + * + * The slots in the queue have their own states (see `slot_state` above). The states are updated + * concurrently in many threads, so the head and tail counters do not always accurately represent + * the actual compound state of the queue. + * + * `.head()` is where a host thread starts looking for a batch token. All slots earlier than + * the one returned by this method are not usable anymore (the batches are either "fully committed", + * dispatched, or emptied earlier). If a host thread determines that the current slot is not usable + * anymore, it increments the counter by calling `.pop()`. 
+ * + * The tail is where a host thread reserves an empty slot to be filled-in by a GPU worker thread + * once it releases the owned IO buffer. There's no `.tail()` method, but `.push()` method returns + * the tail position (before advancing it). `.push()` blocks the host thread until it knows the slot + * isn't used by any other threads anymore (i.e. cleaned-up from the previous round). + * + * There's no strict relation between the head and the tail. + * Normally there is a single batch in the ring buffer being partially filled. It is followed by + * contiguous list of empty idle batches and reserved empty slots. The head and the tail loosely + * correspond to the beginning and the end of this sequence. + * + * Sometimes, the head can go further than the tail. This means all batches are busy and there are + * more threads committed to the slots that are not populated with the batches (and not even + * reserved for filling-in yet). + * + * + */ +template +struct batch_queue_t { + static constexpr uint32_t kSize = Size; + static constexpr uint32_t kMinElemSize = sizeof(uint32_t); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(raft::is_a_power_of_two(kSize), "The size must be a power-of-two for efficiency."); + + static constexpr auto kMemOrder = cuda::std::memory_order_relaxed; + + /** Type-safe synonym for the internal head & tail counters. */ + struct seq_order_id { + uint32_t value; + }; + + explicit batch_queue_t(const raft::resources& res, bool use_batch_sizes) noexcept + : tokens_{raft::make_pinned_vector, + uint32_t>(res, kSize)}, + rem_time_us_{ + raft::make_pinned_vector, uint32_t>( + res, kSize)}, + dispatch_sequence_id_(kSize), + batch_sizes_{ + use_batch_sizes + ? 
std::make_optional( + raft::make_pinned_vector, uint32_t>( + res, kSize)) + : std::nullopt} + { + tail_.store(0, kMemOrder); + head_.store(0, kMemOrder); + auto past_seq_id = seq_order_id{static_cast(-1)}; + for (uint32_t i = 0; i < kSize; i++) { + rem_time_us_(i).store(std::numeric_limits::max(), kMemOrder); + if (batch_sizes_.has_value()) { batch_sizes_.value()(i).store(0, kMemOrder); } + dispatch_sequence_id_[i].store(past_seq_id.value, kMemOrder); + tokens_(i).store(make_empty_token(past_seq_id), kMemOrder); + } + } + + /** + * Advance the tail position, ensure the slot is empty, and return the sequential id of the new slot. + * The calling side is responsible for filling-in the slot with an actual value at a later time. + * + * Conceptually, this method reserves a ring buffer slot on the host side, so that the GPU worker + * thread can return the IO buffer (filling the token slot) asynchronously. + */ + inline auto push() -> seq_order_id + { + seq_order_id seq_id{tail_.fetch_add(1, kMemOrder)}; + auto& loc = token(seq_id); + auto ss = batch_status(loc.load(kMemOrder), seq_id); + /* [Note: very small waiting time] + + Only a few (dispatcher) threads are going to call this function at the same time as opposed to + potentially any number of threads waiting on new batches to arrive. + This is a performance-critical code path. + + Hence the small base sleep time. + */ + local_waiter till_empty{std::chrono::nanoseconds{1000}}; + while (ss == slot_state::kFull || ss == slot_state::kFullBusy || ss == slot_state::kEmptyBusy) { + // Wait till the slot becomes empty (doesn't matter future or past). + // The batch id is only ever updated in the scatter/gather kernels, which are the only source + // of truth whether a batch buffer is currently used by the GPU. 
+ till_empty.wait(); + ss = batch_status(loc.load(kMemOrder), seq_id); + } + return seq_id; + } + + /** + * Return the offset of the given slot w.r.t. the tail of the queue. 
+ * Negative value means the given slot is in the body of the queue and should be dispatched soon. + * Positive value means the given slot is ahead of the queue and should wait longer. + * + * That is the lower the value the higher the priority. + */ + [[nodiscard]] inline auto niceness(seq_order_id id) const noexcept -> int32_t + { + return static_cast(id.value - tail_.load(kMemOrder)); + } + + /** Get the reference to the first element in the queue. */ + inline auto head() noexcept -> seq_order_id + { + auto h = head_.load(kMemOrder); + // The head cannot go ahead of the tail by more than the queue buffer size. + // If the head is ahead by not more than kSize elements though, everything is fine; + // the slots too far ahead are protected by busy tokens. + local_waiter for_tail(std::chrono::nanoseconds{100000}); + while (static_cast(h - tail_.load(kMemOrder)) >= static_cast(kSize)) { + for_tail.wait(); + h = head_.load(kMemOrder); + } + return seq_order_id{h}; + } + + /** Batch commit state and IO buffer id (see `batch_token`) */ + inline auto token(seq_order_id id) -> cuda::atomic& + { + return tokens_(cache_friendly_idx(id.value)); + } + + /** + * How much time has this batch left for waiting. + * It is an approximate value by design - to minimize the synchronization between CPU and GPU. + * + * The clocks on GPU and CPU may have different values, so the running kernel and the CPU thread + * have different ideas on how much time is left. Rather than trying to synchronize the clocks, we + * maintain independent timers and accept the uncertainty. + * + * Access pattern: CPU write-only (producer); GPU read-only (consumer). + */ + inline auto rem_time_us(seq_order_id id) -> cuda::atomic& + { + return rem_time_us_(cache_friendly_idx(id.value)); + } + + /** + * The actual batch size - the final number of committed queries. + * This is only used if `conservative_dispatch = true`. 
+ */ + inline auto batch_size(seq_order_id id) noexcept + -> cuda::atomic* + { + if (batch_sizes_.has_value()) { return &batch_sizes_.value()(cache_friendly_idx(id.value)); } + return nullptr; + } + + /** + * This value is updated by the host thread after it submits the job completion event to indicate + * that other threads can wait on the event to get the results back. + * Other threads get the value from the batch queue and compare that value against this atomic. + * + * Access pattern: CPU-only; dispatching thread writes the id once, other threads wait on it. + */ + inline auto dispatch_sequence_id(seq_order_id id) -> cuda::std::atomic& + { + return dispatch_sequence_id_[cache_friendly_idx(id.value)]; + } + + /** + * An `atomicMax` on the queue head in disguise. + * This makes the given batch slot and all prior slots unreachable (not possible to commit). + */ + inline void pop(seq_order_id id) noexcept + { + const auto desired = id.value + 1; + auto observed = id.value; + while (observed < desired && + !head_.compare_exchange_weak(observed, desired, kMemOrder, kMemOrder)) {} + } + + static constexpr inline auto batch_id(batch_token token) noexcept -> uint32_t + { + return token.id() & kCounterLocMask; + } + + /** + * Construct a token that is interpreted as having been emptied in the current round + * (the round is derived from seq_id). + * + * NB: "round" is the number of times the queue counters went over the whole ring buffer. + * It's used to avoid the ABA problem for atomic token updates. 
+ */ + static constexpr inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token + { + // Modify the seq_id to identify that the token slot is empty + auto empty_round = static_cast(slot_state::kEmptyPast) * kSize; + auto empty_round_id = seq_order_id{seq_id.value + empty_round}; + // Id of empty slot is ignored and can be anything + auto empty_id = kCounterLocMask; + return batch_token{make_seq_batch_id(empty_round_id, empty_id)}; + } + + /** + * Construct a sequential batch id by combining the current round and the real batch id. + * + * The "round" part gives a hint when the token slot was filled-in to avoid the ABA problem + * (see above). + */ + static constexpr inline auto make_seq_batch_id(seq_order_id seq_id, uint32_t batch_id) noexcept + -> uint32_t + { + return seq_round(seq_id) | batch_id; + } + + /** + * Get the state of the batch slot w.r.t. the given seq_order_id counter. + * This gives the information whether the slot is emptied/filled by another thread and whether + * that thread is ahead or behind the current thread. + * By introducing these future/past flavours of states we solve the ABA problem for atomic updates + * of the ring buffer slots. + */ + static inline auto batch_status(batch_token token, seq_order_id seq_id) -> slot_state + { + /* + The "round" part of the id is just a seq_id without the low bits. + Essentially, we are comparing here seq_ids of two threads: the one that wrote to the slot in the + past and the one that reads from it now. + + `kSize` determines the number of bits we use for the IO buffer id and for the round id. 
+ */ + auto v = + static_cast(seq_round(token) - seq_round(seq_id)) / static_cast(kSize); + if (v < static_cast(slot_state::kFullBusy)) { RAFT_FAIL("Invalid batch state %d", v); } + if (v < static_cast(slot_state::kEmptyBusy)) { + return static_cast(std::min(v, static_cast(slot_state::kFullPast))); + } + return static_cast(std::min(v, static_cast(slot_state::kEmptyPast))); + } + + private: + alignas(kCacheLineBytes) cuda::std::atomic tail_{}; + alignas(kCacheLineBytes) cuda::std::atomic head_{}; + + alignas(kCacheLineBytes) + raft::pinned_vector, uint32_t> tokens_; + raft::pinned_vector, uint32_t> rem_time_us_; + std::vector> dispatch_sequence_id_; + std::optional, uint32_t>> + batch_sizes_; + + /* [Note: cache-friendly indexing] + To avoid false sharing, the queue pushes and pops values not sequentially, but with an + increment that is larger than the cache line size. + Hence we introduce the `kCounterIncrement > kCacheLineBytes`. + However, to make sure all indices are used, we choose the increment to be coprime with the + buffer size. We also require that the buffer size is a power-of-two for two reasons: + 1) Fast modulus operation - reduces to binary `and` (with `kCounterLocMask`). + 2) Easy to ensure GCD(kCounterIncrement, kSize) == 1 by construction + (see the definition below). + */ + static constexpr uint32_t kElemsPerCacheLine = + raft::div_rounding_up_safe(kCacheLineBytes, kMinElemSize); + static constexpr uint32_t kCounterIncrement = raft::bound_by_power_of_two(kElemsPerCacheLine) + 1; + static constexpr uint32_t kCounterLocMask = kSize - 1; + // These props hold by design, but we add them here as a documentation and a sanity check. 
+ static_assert( + kCounterIncrement * kMinElemSize >= kCacheLineBytes, + "The counter increment should be larger than the cache line size to avoid false sharing."); + static_assert( + std::gcd(kCounterIncrement, kSize) == 1, + "The counter increment and the size must be coprime to allow using all of the queue slots."); + /** Map the sequential index onto a cache-friendly strided index. */ + static constexpr inline auto cache_friendly_idx(uint32_t source_idx) noexcept -> uint32_t + { + return (source_idx * kCounterIncrement) & kCounterLocMask; + } + + /** The "round": the number of times the queue counter went over the whole ring buffer. */ + static constexpr inline auto seq_round(seq_order_id id) noexcept -> uint32_t + { + return id.value & ~kCounterLocMask; + } + + /** The "round": the number of times the queue counter went over the whole ring buffer. */ + static constexpr inline auto seq_round(batch_token token) noexcept -> uint32_t + { + return token.id() & ~kCounterLocMask; + } +}; + +template +struct alignas(kCacheLineBytes) request_pointers { + /** + * A pointer to `dim` values of a single query (input). + * + * Serves as a synchronization point between the CPU thread (producer) and a GPU block in the + * `gather_inputs` kernel (consumer). + */ + cuda::atomic query{nullptr}; + /** A pointer to `k` nearest neighbors (output) */ + IdxT* neighbors{nullptr}; + /** A pointer to distances of `k` nearest neighbors (output) */ + float* distances{nullptr}; +}; + +/** + * Check the current timestamp at the moment of construction and repeatedly compare the elapsed time + * to the timeout value provided by the host (passed via an atomic). + * + * This is used in the gather inputs kernel to make it stop waiting for new queries in a batch + * once the deadline is reached. 
+ */ +struct gpu_time_keeper { + /** + * @param[in] cpu_provided_remaining_time_us + * a pointer to a shared atomic representing the remaining waiting time in microseconds. 
+ * Note, the remaining time is updated atomically by each participating host thread in their + * "private coordinate systems". That's ok, we don't expect a single reference time for all host + * and device threads. + * We tolerate the errors coming from the time difference between the host thread writing their + * remaining waiting time and the GPU thread reading that value. + */ + RAFT_DEVICE_INLINE_FUNCTION explicit gpu_time_keeper( + cuda::atomic* cpu_provided_remaining_time_us) + : cpu_provided_remaining_time_us_{cpu_provided_remaining_time_us} + { + update_timestamp(); + } + + /** + * Check whether the deadline is not reached yet: + * 1) Compare the internal clock against the last-read deadline value + * 2) Read the deadline value from the host-visible atomic and check the internal clock again. + */ + RAFT_DEVICE_INLINE_FUNCTION auto has_time() noexcept -> bool + { + if (timeout) { return false; } + update_local_remaining_time(); + if (local_remaining_time_us_ <= 0) { + timeout = true; + return false; + } + update_cpu_provided_remaining_time(); + if (local_remaining_time_us_ <= 0) { + timeout = true; + return false; + } + return true; + } + + private: + cuda::atomic* cpu_provided_remaining_time_us_; + uint64_t timestamp_ns_ = 0; + int32_t local_remaining_time_us_ = std::numeric_limits::max(); + bool timeout = false; + + RAFT_DEVICE_INLINE_FUNCTION void update_timestamp() noexcept + { + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(timestamp_ns_)); + } + + RAFT_DEVICE_INLINE_FUNCTION void update_local_remaining_time() noexcept + { + auto prev_timestamp = timestamp_ns_; + update_timestamp(); + // subtract the time passed since the last check + // (assuming local time is updated every time timestamp is read) + local_remaining_time_us_ -= static_cast((timestamp_ns_ - prev_timestamp) / 1000ull); + } + + RAFT_DEVICE_INLINE_FUNCTION void update_cpu_provided_remaining_time() noexcept + { + local_remaining_time_us_ = + std::min(local_remaining_time_us_, + 
cpu_provided_remaining_time_us_->load(cuda::std::memory_order_relaxed)); + } +}; + +/** + * Copy the queries from the submitted pointers to the batch store, one query per block. + * Upon completion of this kernel, the submitted queries are all in the contiguous buffer + * `batch_queries`. + * + * Block size: (n, 1, 1) any number of threads copying a single row of data. + * Grid size: (max_batch_size, 1, 1) - one block per query + * + * Note, we view the incoming queries and the batch as going through multiple stages: + * 1) A host thread "commits" a query: it reserves a slot for the query in the batch and promises + * to fill-in the corresponding query pointer. + * 2) A host thread "submits" the query: it fills-in the pointer to the query data in the reserved + * slot. + * 3) This kernel copies the query data to the contiguous query buffer owned by the batch. + * + * The batch is "fully committed" when the number of committed queries reaches the maximum batch + * size (all slots are reserved). Committing, submitting, and copying of the queries is somewhat + * overlapped among multiple host and device threads. Only the copying happens in a CUDA stream in + * this kernel, and the upstream search is dispatched right after this kernel (in the same stream). + * + */ +template +RAFT_KERNEL gather_inputs( + raft::device_matrix_view batch_queries, + raft::pinned_vector_view, uint32_t> request_ptrs, + /* The remaining time may be updated on the host side: a thread with a tighter deadline may reduce + it (but not increase). */ + cuda::atomic* remaining_time_us, + /* The token contains the current number of queries committed and is cleared in this kernel. */ + cuda::atomic* batch_token_ptr, + /* The host-visible batch size counter (used in `conservative_dispatch`). 
*/ + cuda::atomic* batch_size_out, + /** + * The token value considered empty depends on the round over the ring buffer + * (which is defined by the seq_order_id) + */ + batch_token empty_token_value, + /** + * The counter is used to find the last CTA to finish and to share the batch size with the + * scatter_outputs kernel. + */ + cuda::atomic* kernel_progress_counter) +{ + const uint32_t query_id = blockIdx.x; + __shared__ const T* query_ptr; + + if (threadIdx.x == 0) { + query_ptr = nullptr; + + // NB: we have to read/write to `batch_token_ptr`, `bs_committed`, and `batch_fully_committed` + // using volatile assembly ops, because otherwise the compiler seems to fail to understand that + // this is the same location in memory. The order of reads and writes here is extremely + // important, as it involves multiple host and device threads (the host threads do RMW atomic + // increments on the commit counter). + volatile uint32_t* bs_committed = + reinterpret_cast(batch_token_ptr) + 1 - CUVS_SYSTEM_LITTLE_ENDIAN; + volatile uint8_t* batch_fully_committed = + reinterpret_cast(bs_committed) + (CUVS_SYSTEM_LITTLE_ENDIAN * 3); + + gpu_time_keeper runtime{remaining_time_us}; + bool committed = false; // if the query is committed, we have to wait for it to arrive + auto& request_query_ptr = request_ptrs(query_id).query; + while (true) { + query_ptr = request_query_ptr.load(cuda::std::memory_order_acquire); + if (query_ptr != nullptr) { + // The query is submitted to this block's slot; erase the pointer buffer for future use and + // exit the loop. 
+ request_query_ptr.store(nullptr, cuda::std::memory_order_relaxed); + break; + } + // The query hasn't been submitted, but is already committed; other checks may be skipped + if (committed) { continue; } + // Check if the query is committed + uint32_t committed_count; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed = (committed_count & 0x00ffffff) > query_id; + if (committed) { continue; } + // If the query is not committed, but the batch is past the deadline, we exit without copying + // the query + if (committed_count > 0x00ffffff) { break; } + // The query hasn't been submitted yet; check if we're past the deadline + if (runtime.has_time()) { continue; } + // Otherwise, let the others know time is out + // Set the highest byte of the commit counter to 1 (thus avoiding RMW atomic) + // This prevents any more CPU threads from committing to this batch. + asm volatile("st.volatile.global.u8 [%0], %1;" + : + : "l"(batch_fully_committed), "r"(1) + : "memory"); + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed = (committed_count & 0x00ffffff) > query_id; + if (committed) { continue; } + break; + } + auto progress = kernel_progress_counter->fetch_add(1, cuda::std::memory_order_acq_rel) + 1; + if (progress >= gridDim.x) { + // read the last value of the committed count to know the batch size for sure + uint32_t committed_count; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed_count &= 0x00ffffff; // Clear the timeout bit + if (batch_size_out != nullptr) { + // Inform the dispatcher about the final batch size if `conservative_dispatch` is enabled + batch_size_out->store(committed_count, cuda::std::memory_order_relaxed); + } + // store the batch size in the progress counter, so we can read it in the scatter kernel + 
kernel_progress_counter->store(committed_count, cuda::std::memory_order_relaxed); + // Clear the batch token slot, so it can be re-used by others + asm volatile("st.volatile.global.u64 [%0], %1;" + : + : "l"(reinterpret_cast(batch_token_ptr)), + "l"(reinterpret_cast(empty_token_value)) + : "memory"); + } + } + // The block waits till the leading thread gets the query pointer + cooperative_groups::this_thread_block().sync(); + auto query_ptr_local = query_ptr; + if (query_ptr_local == nullptr) { return; } + // block-wide copy input query + auto dim = batch_queries.extent(1); + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + batch_queries(query_id, i) = query_ptr_local[i]; + } +} + +/** Copy the results of the search back to the requesters. */ +template +RAFT_KERNEL scatter_outputs( + raft::pinned_vector_view, uint32_t> request_ptrs, + raft::device_matrix_view batch_neighbors, + raft::device_matrix_view batch_distances, + cuda::atomic* kernel_progress_counter, + cuda::atomic* next_token, + uint32_t batch_id) +{ + __shared__ uint32_t batch_size; + if (threadIdx.x == 0 && threadIdx.y == 0) { + batch_size = kernel_progress_counter->exchange(0, cuda::std::memory_order_relaxed); + } + // Copy output + cooperative_groups::this_thread_block().sync(); + auto k = batch_neighbors.extent(1); + for (uint32_t i = threadIdx.y; i < batch_size; i += blockDim.y) { + auto* request_neighbors = request_ptrs(i).neighbors; + auto* request_distances = request_ptrs(i).distances; + for (uint32_t j = threadIdx.x; j < k; j += blockDim.x) { + request_neighbors[j] = batch_neighbors(i, j); + request_distances[j] = batch_distances(i, j); + } + } + // Clear the batch state after all threads copied the data, so the batch can be reused + cuda::atomic_thread_fence(cuda::std::memory_order_release, cuda::thread_scope_system); + cooperative_groups::this_thread_block().sync(); + if (threadIdx.x != 0 || threadIdx.y != 0) { return; } + reinterpret_cast*>( + &reinterpret_cast(next_token)->id()) 
+ ->store(batch_id, cuda::std::memory_order_relaxed); +} + +/** + * Batch runner is shared among the users of the `dynamic_batching::index` (i.e. the index can be + * copied, but the copies hold shared pointers to a single batch runner). + * + * Constructor and destructor of this class do not need to be thread-safe, as their execution is + * guaranteed to happen in one thread by the holding shared pointer. + * + * The search function must be thread-safe. We only have to pay attention to the `mutable` members + * though, because the function is marked const. + */ +template +class batch_runner { + public: + constexpr static uint32_t kMaxNumQueues = 256; + + using batch_queue = batch_queue_t; + using seq_order_id = typename batch_queue::seq_order_id; + + // Save the parameters and the upstream batched search function to invoke + template + batch_runner(const raft::resources& res, + const dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + upstream_search_type_const* upstream_search, + const cuvs::neighbors::filtering::base_filter* sample_filter) + : res_{res}, + upstream_search_{[&upstream_index, upstream_search, upstream_params, sample_filter]( + raft::resources const& res, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + /* Note: passing sample_filter by pointer + + Ideally, dynamic batching would capture the filter by value. Unfortunately, one cannot use + the copy constructor of the `base_filter` (it would erase the actual filter type). + Therefore, we can only pass the filter by pointer or reference and require the user to keep + the filter alive for the lifetime of the dynamic batching index. 
+ This, however, may lead to a segfault when the user doesn't provide the filter argument and + the argument is passed by reference: the lifetime of the none_sample_filter default argument + is limited to the search function call, so it is destroyed while the dynamic batching index + is still alive. + Hence the solution is to pass the filter by pointer and default it to nullptr. + */ + if (sample_filter == nullptr) { + using base_filter_type = cuvs::neighbors::filtering::base_filter; + const auto none_filter = cuvs::neighbors::filtering::none_sample_filter{}; + return upstream_search(res, + upstream_params, + upstream_index, + queries, + neighbors, + distances, + static_cast(none_filter)); + + } else { + return upstream_search( + res, upstream_params, upstream_index, queries, neighbors, distances, *sample_filter); + } + }}, + k_{uint32_t(params.k)}, + dim_{uint32_t(upstream_index.dim())}, + max_batch_size_{uint32_t(params.max_batch_size)}, + n_queues_{uint32_t(params.n_queues)}, + batch_queue_{res_, params.conservative_dispatch}, + completion_events_(n_queues_), + input_extents_{n_queues_, max_batch_size_, dim_}, + output_extents_{n_queues_, max_batch_size_, k_}, + queries_{raft::make_device_mdarray(res_, input_extents_)}, + neighbors_{raft::make_device_mdarray(res_, output_extents_)}, + distances_{raft::make_device_mdarray(res_, output_extents_)}, + kernel_progress_counters_{ + raft::make_device_vector>( + res_, n_queues_)}, + request_ptrs_{raft::make_pinned_matrix, uint32_t>( + res_, n_queues_, max_batch_size_)} + { + RAFT_CUDA_TRY(cudaMemsetAsync( + kernel_progress_counters_.data_handle(), + 0, + sizeof(*kernel_progress_counters_.data_handle()) * kernel_progress_counters_.size(), + raft::resource::get_cuda_stream(res_))); + // Make sure to initialize the atomic values in the batch_state structs. 
+ for (uint32_t i = 0; i < n_queues_; i++) { + auto seq_id = batch_queue_.push(); + batch_queue_.token(seq_id).store(batch_token{batch_queue::make_seq_batch_id(seq_id, i)}); + // Make sure to initialize query pointers, because they are used for synchronization + for (uint32_t j = 0; j < max_batch_size_; j++) { + new (&request_ptrs_(i, j)) request_pointers{}; + } + } + } + + // A workaround for algos, which have non-const `index` type in their arguments + template + batch_runner(const raft::resources& res, + const dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + upstream_search_type* upstream_search, + const cuvs::neighbors::filtering::base_filter* sample_filter) + : batch_runner{ + res, + params, + upstream_index, + upstream_params, + reinterpret_cast*>(upstream_search), + sample_filter} + { + } + + void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) const + { + uint32_t n_queries = queries.extent(0); + if (n_queries >= max_batch_size_) { + return upstream_search_(res, queries, neighbors, distances); + } + + if (neighbors.extent(1) != int64_t(k_)) { + // TODO: the check can be relaxed to `neighbors.extent(1) > int64_t(k_)`; + // this, however, would require an extra bounds check per-query in the scatter kernel. 
+ RAFT_LOG_WARN( + "The requested number of neighbors (%zd) doesn't match the configured " + "dynamic_batching::index_params::k (%u); dynamic batching is disabled for the request.", + neighbors.extent(1), + k_); + return upstream_search_(res, queries, neighbors, distances); + } + + auto deadline = std::chrono::system_clock::now() + + std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1000000.0)); + + int64_t local_io_offset = 0; + batch_token batch_token_observed{0}; + local_waiter to_commit{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 3e5)), + local_waiter::kNonSleepIterations}; + while (true) { + const auto seq_id = batch_queue_.head(); + const auto commit_result = try_commit(seq_id, n_queries); + // The bool (busy or not) returned if no queries were committed: + if (std::holds_alternative(commit_result)) { + // Pause if the system is busy + // (otherwise the progress is guaranteed due to update of the head counter) + if (std::get(commit_result)) { to_commit.wait(); } + continue; // Try to get a new batch token + } + batch_token_observed = std::get(std::get<0>(commit_result)); + const auto queries_committed = std::get(std::get<0>(commit_result)); + const auto batch_offset = batch_token_observed.size_committed(); + auto& batch_token_ref = batch_queue_.token(seq_id); + auto& rem_time_us_ref = batch_queue_.rem_time_us(seq_id); + auto& dispatch_sequence_id_ref = batch_queue_.dispatch_sequence_id(seq_id); + auto* batch_size_ptr = batch_queue_.batch_size(seq_id); + // sleep for 1/10 of deadline time or more + // (if couldn't get the value in the first few iterations). + local_waiter till_full{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5)), + batch_queue_.niceness(seq_id)}; + while (batch_queue::batch_status(batch_token_observed, seq_id) != slot_state::kFull) { + /* Note: waiting for batch IO buffers + The CPU threads can commit to the incoming batches in the queue in advance (this happens in + try_commit). 
+ In this loop, a thread waits for the batch IO buffer to be released by a running search on + the GPU side (scatter_outputs kernel). Hence, this loop is engaged only if all buffers are + currently used, which suggests that the GPU is busy (or there's not enough IO buffers). + This also means the current search is not likely to meet the deadline set by the user. + + The scatter kernel returns its buffer id into an acquired slot in the batch queue; in this + loop we wait for that id to arrive. + + Generally, we want to waste as little as possible CPU cycles here to let other threads wait + on dispatch_sequence_id_ref below more efficiently. At the same time, we shouldn't use + `.wait()` here, because `.notify_all()` would have to come from GPU. + */ + till_full.wait(); + batch_token_observed = batch_token_ref.load(cuda::std::memory_order_acquire); + } + // Whether this thread is responsible for dispatching the batch. + bool is_dispatcher = batch_offset == 0; + auto stream = raft::resource::get_cuda_stream(res); + auto batch_id = batch_queue::batch_id(batch_token_observed); + auto request_ptrs = slice_2d(batch_id, request_ptrs_); + + if (is_dispatcher) { + // Conservatively initialize the remaining time + // TODO (achirkin): this initialization may happen after the other requesters update the + // time and thus erase their deadlines. 
+ rem_time_us_ref.store(static_cast(params.dispatch_timeout_ms * 1000), + cuda::std::memory_order_relaxed); + // run the gather kernel before submitting the data to reduce the latency + gather_inputs<<>>( + slice_3d(batch_id, queries_), + request_ptrs, + &rem_time_us_ref, + &batch_token_ref, + batch_size_ptr, + // This indicates the empty token slot, which can only be used in the following round + batch_queue::make_empty_token(seq_id), + kernel_progress_counters_.data_handle() + batch_id); + } + + // *** Set the pointers to queries, neighbors, distances - query-by-query + for (uint32_t i = 0; i < queries_committed; i++) { + const auto o = local_io_offset + i; + auto& ptrs = request_ptrs(batch_offset + i); + ptrs.neighbors = neighbors.data_handle() + o * k_; + ptrs.distances = distances.data_handle() + o * k_; + ptrs.query.store(queries.data_handle() + o * dim_, cuda::std::memory_order_release); + } + + // Submit estimated remaining time + { + auto rem_time_us = static_cast( + std::max(0, (deadline - std::chrono::system_clock::now()).count()) / 1000); + rem_time_us_ref.fetch_min(rem_time_us, cuda::std::memory_order_relaxed); + } + + if (is_dispatcher) { + uint32_t batch_size = max_batch_size_; + if (batch_size_ptr != nullptr) { + // Block until the real batch size is available if conservative dispatch is used. 
+ local_waiter for_dispatch{ + std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5))}; + batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed); + while (batch_size == 0) { + for_dispatch.wait(); + batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed); + } + batch_size_ptr->store(0, cuda::std::memory_order_relaxed); + } + auto batch_neighbors = slice_3d(batch_id, neighbors_, batch_size); + auto batch_distances = slice_3d(batch_id, distances_, batch_size); + upstream_search_( + res, slice_3d(batch_id, queries_, batch_size), batch_neighbors, batch_distances); + auto next_seq_id = batch_queue_.push(); + auto& next_token_ref = batch_queue_.token(next_seq_id); + // next_batch_token); + auto bs = dim3(128, 8, 1); + scatter_outputs + <<<1, bs, 0, stream>>>(request_ptrs, + batch_neighbors, + batch_distances, + kernel_progress_counters_.data_handle() + batch_id, + &next_token_ref, + batch_queue::make_seq_batch_id(next_seq_id, batch_id)); + RAFT_CUDA_TRY(cudaEventRecord(completion_events_[batch_id].value(), stream)); + dispatch_sequence_id_ref.store(seq_id.value, cuda::std::memory_order_release); + dispatch_sequence_id_ref.notify_all(); + + } else { + // Wait till the dispatch_sequence_id counter is updated, which means the event is recorded + auto dispatched_id_observed = + dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire); + while (static_cast(seq_id.value - dispatched_id_observed) > 0) { + dispatch_sequence_id_ref.wait(dispatched_id_observed, cuda::std::memory_order_relaxed); + dispatched_id_observed = dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire); + } + // Now we can safely make this stream wait on the recorded event + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, completion_events_[batch_id].value())); + } + + n_queries -= queries_committed; + + if (n_queries == 0) { return; } + // If not all queries were committed, continue in the loop.
+ // TODO: it could potentially be more efficient to first commit everything and only then + // submit the work/wait for the event + local_io_offset += queries_committed; + to_commit.reset( + local_waiter::kNonSleepIterations); // reset the waiter for the next iteration. + } + } + + private: + raft::resources res_; // Sic! Store by value to copy the resource. + std::function> upstream_search_; + uint32_t k_; + uint32_t dim_; + uint32_t max_batch_size_; + uint32_t n_queues_; + + mutable batch_queue batch_queue_; + std::vector completion_events_; + + using batch_extents = raft::extent_3d; + batch_extents input_extents_; + batch_extents output_extents_; + + mutable raft::device_mdarray queries_; + mutable raft::device_mdarray neighbors_; + mutable raft::device_mdarray distances_; + mutable raft::device_vector> + kernel_progress_counters_; + + mutable raft::pinned_matrix, uint32_t, raft::row_major> request_ptrs_; + + /** + * Try to commit n_queries at most; returns the last observed batch_token (where `size_committed` + * represents offset at which new queries are committed if successful), the number of committed + * queries, or whether the ring buffer appears to be busy (on unsuccessful commit). + */ + auto try_commit(seq_order_id seq_id, uint32_t n_queries) const + -> std::variant, bool> + { + auto& batch_token_ref = batch_queue_.token(seq_id); + batch_token batch_token_observed = batch_token_ref.load(cuda::std::memory_order_relaxed); + batch_token batch_token_updated; + slot_state token_status; + do { + // The interpretation of the token status depends on the current seq_order_id and a similar + // counter in the token. This is to prevent conflicts when too many parallel requests wrap + // over the whole ring buffer (batch_queue_t). + token_status = batch_queue::batch_status(batch_token_observed, seq_id); + // Busy status means the current thread is a whole ring buffer ahead of the token. + // The thread should wait for the rest of the system. 
+ if (token_status == slot_state::kFullBusy || token_status == slot_state::kEmptyBusy) { + return true; + } + // This branch checks if the token was recently filled or dispatched. + // This means the head counter of the ring buffer is slightly outdated. + if (token_status == slot_state::kEmptyPast || token_status == slot_state::kFullPast || + batch_token_observed.size_committed() >= max_batch_size_) { + batch_queue_.pop(seq_id); + return false; + } + batch_token_updated = batch_token_observed; + batch_token_updated.size_committed() = + std::min(batch_token_observed.size_committed() + n_queries, max_batch_size_); + } while (!batch_token_ref.compare_exchange_weak(batch_token_observed, + batch_token_updated, + cuda::std::memory_order_acq_rel, + cuda::std::memory_order_relaxed)); + if (batch_token_updated.size_committed() >= max_batch_size_) { + // The batch is already full, let's try to pop it from the queue + // (if nobody has done so already) + batch_queue_.pop(seq_id); + } + return std::make_tuple( + batch_token_observed, + batch_token_updated.size_committed() - batch_token_observed.size_committed()); + } +}; + +} // namespace cuvs::neighbors::dynamic_batching::detail diff --git a/cpp/src/neighbors/dynamic_batching.cu b/cpp/src/neighbors/dynamic_batching.cu new file mode 100644 index 000000000..6be70353b --- /dev/null +++ b/cpp/src/neighbors/dynamic_batching.cu @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "detail/dynamic_batching.cuh" + +#include +#include +#include + +#include +#include + +namespace cuvs::neighbors::dynamic_batching { + +// NB: the (template) index parameter should be the last; it may contain spaces and so be split +// into multiple preprocessor tokens. It is then consumed as __VA_ARGS__ +// +#define CUVS_INST_DYNAMIC_BATCHING_INDEX(T, IdxT, Namespace, ...) \ + template <> \ + template <> \ + index::index( \ + const raft::resources& res, \ + const cuvs::neighbors::dynamic_batching::index_params& params, \ + const Namespace ::__VA_ARGS__& upstream_index, \ + const typename Namespace ::__VA_ARGS__::search_params_type& upstream_params, \ + const cuvs::neighbors::filtering::base_filter* sample_filter) \ + : runner{new detail::batch_runner( \ + res, params, upstream_index, upstream_params, Namespace ::search, sample_filter)} \ + { \ + } + +#define CUVS_INST_DYNAMIC_BATCHING_SEARCH(T, IdxT) \ + void search(raft::resources const& res, \ + cuvs::neighbors::dynamic_batching::search_params const& params, \ + cuvs::neighbors::dynamic_batching::index const& index, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances) \ + { \ + return index.runner->search(res, params, queries, neighbors, distances); \ + } + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(half, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, + uint32_t, + cuvs::neighbors::cagra, + index); + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(half, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, int64_t,
cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, int64_t, cuvs::neighbors::ivf_pq, index); + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_flat, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, + int64_t, + cuvs::neighbors::ivf_flat, + index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, + int64_t, + cuvs::neighbors::ivf_flat, + index); + +CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, uint32_t); // uint32_t index type is needed for CAGRA +CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, uint32_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, uint32_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, uint32_t); + +#undef CUVS_INST_DYNAMIC_BATCHING_INDEX +#undef CUVS_INST_DYNAMIC_BATCHING_SEARCH + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 286d721d7..1c8de2ad0 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -175,6 +175,19 @@ if(BUILD_TESTS) 100 ) + ConfigureTest( + NAME + NEIGHBORS_DYNAMIC_BATCHING_TEST + PATH + neighbors/dynamic_batching/test_cagra.cu + neighbors/dynamic_batching/test_ivf_flat.cu + neighbors/dynamic_batching/test_ivf_pq.cu + GPUS + 1 + PERCENT + 100 + ) + if(BUILD_CAGRA_HNSWLIB) ConfigureTest(NAME NEIGHBORS_HNSW_TEST PATH neighbors/hnsw.cu GPUS 1 PERCENT 100) target_link_libraries(NEIGHBORS_HNSW_TEST PRIVATE hnswlib::hnswlib) diff --git a/cpp/test/neighbors/dynamic_batching.cuh b/cpp/test/neighbors/dynamic_batching.cuh new file mode 100644 index 000000000..b64c5b01e --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching.cuh @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "ann_utils.cuh" + +#include + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuvs::neighbors::dynamic_batching { + +struct dynamic_batching_spec { + int64_t n_queries = 1000; + int64_t n_rows = 100000; + int64_t dim = 128; + int64_t k = 10; + int64_t max_batch_size = 64; + size_t n_queues = 3; + bool conservative_dispatch = false; + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; + int64_t max_concurrent_threads = 128; +}; + +inline ::std::ostream& operator<<(::std::ostream& os, const dynamic_batching_spec& p) +{ + os << "{n_queries=" << p.n_queries; + os << ", dataset shape=" << p.n_rows << "x" << p.dim; + os << ", metric=" << print_metric{p.metric}; + os << ", k=" << p.k; + os << ", max_batch_size=" << p.max_batch_size; + os << ", n_queues=" << p.n_queues; + os << ", conservative_dispatch=" << p.conservative_dispatch; + os << '}' << std::endl; + return os; +} + +template +using build_function = UpstreamT(const raft::resources&, + const typename UpstreamT::index_params_type&, + raft::device_matrix_view); + +template +using search_function = void(const raft::resources&, + const typename UpstreamT::search_params_type& params, + const UpstreamT& index, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template 
UpstreamBuildF, + search_function UpstreamSearchF> +struct dynamic_batching_test : public ::testing::TestWithParam { + using distance_type = float; + using data_type = DataT; + using index_type = IdxT; + using upstream_type = UpstreamT; + + dynamic_batching_spec ps = ::testing::TestWithParam::GetParam(); + raft::resources res; + + // input data + std::optional> dataset = std::nullopt; + std::optional> queries = std::nullopt; + std::optional> neighbors_upsm = std::nullopt; + std::optional> neighbors_dynb = std::nullopt; + std::optional> distances_upsm = std::nullopt; + std::optional> distances_dynb = std::nullopt; + + // build parameters + cuvs::neighbors::index_params build_params_base{ps.metric}; + typename upstream_type::index_params_type build_params_upsm{build_params_base}; + dynamic_batching::index_params build_params_dynb{ + build_params_base, ps.k, ps.max_batch_size, ps.n_queues, ps.conservative_dispatch}; + + // search parameters + typename upstream_type::search_params_type search_params_upsm{}; + dynamic_batching::search_params search_params_dynb{}; + + // indexes + std::optional index_upsm = std::nullopt; + std::optional> index_dynb = std::nullopt; + + void build_all() + { + index_dynb.reset(); + index_upsm.reset(); + index_upsm = UpstreamBuildF(res, build_params_upsm, dataset->view()); + index_dynb.emplace(res, build_params_dynb, index_upsm.value(), search_params_upsm); + } + + void search_all() + { + // Search using upstream index - all queries at once + UpstreamSearchF(res, + search_params_upsm, + index_upsm.value(), + queries->view(), + neighbors_upsm->view(), + distances_upsm->view(), + filtering::none_sample_filter{}); + raft::resource::sync_stream(res); + + // Search with dynamic batching + // Streaming scenario: prepare concurrent resources + rmm::cuda_stream_pool worker_streams(ps.max_concurrent_threads); + std::vector> futures(ps.max_concurrent_threads); + std::vector resource_pool(0); + for (int64_t i = 0; i < ps.max_concurrent_threads; i++) { 
+ resource_pool.push_back(res); // copies the resource + raft::resource::set_cuda_stream(resource_pool[i], worker_streams.get_stream(i)); + } + + // Try multiple batch sizes in a round-robin to improve test coverage + std::vector minibatch_sizes{1, 3, 7, 10}; + auto get_bs = [&minibatch_sizes](auto i) { + return minibatch_sizes[i % minibatch_sizes.size()]; + }; + int64_t i = 0; + for (int64_t offset = 0; offset < ps.n_queries; offset += get_bs(i++)) { + auto bs = std::min(get_bs(i), ps.n_queries - offset); + auto j = i % ps.max_concurrent_threads; + // wait for previous job in the same slot to finish + if (i >= ps.max_concurrent_threads) { futures[j].wait(); } + // submit a new job + futures[j] = std::async( + std::launch::async, + [&res = resource_pool[j], + &params = search_params_dynb, + index = index_dynb.value(), + query_view = raft::make_device_matrix_view( + queries->data_handle() + offset * ps.dim, bs, ps.dim), + neighbors_view = raft::make_device_matrix_view( + neighbors_dynb->data_handle() + offset * ps.k, bs, ps.k), + distances_view = raft::make_device_matrix_view( + distances_dynb->data_handle() + offset * ps.k, bs, ps.k)]() { + dynamic_batching::search(res, params, index, query_view, neighbors_view, distances_view); + }); + } + + // finalize all resources + for (int64_t j = 0; j < ps.max_concurrent_threads && j < i; j++) { + futures[j].wait(); + raft::resource::sync_stream(resource_pool[j]); + } + raft::resource::sync_stream(res); + } + + /* + Check the dynamic batching generated neighbors against the upstream index. They both may be + imperfect w.r.t. the ground truth, but they shouldn't differ too much.
+ */ + void check_neighbors() + { + auto stream = raft::resource::get_cuda_stream(res); + size_t queries_size = ps.n_queries * ps.k; + std::vector neighbors_upsm_host(queries_size); + std::vector neighbors_dynb_host(queries_size); + std::vector distances_upsm_host(queries_size); + std::vector distances_dynb_host(queries_size); + raft::copy(neighbors_upsm_host.data(), neighbors_upsm->data_handle(), queries_size, stream); + raft::copy(neighbors_dynb_host.data(), neighbors_dynb->data_handle(), queries_size, stream); + raft::copy(distances_upsm_host.data(), distances_upsm->data_handle(), queries_size, stream); + raft::copy(distances_dynb_host.data(), distances_dynb->data_handle(), queries_size, stream); + raft::resource::sync_stream(res); + ASSERT_TRUE(eval_neighbours(neighbors_upsm_host, + neighbors_dynb_host, + distances_upsm_host, + distances_dynb_host, + ps.n_queries, + ps.k, + 0.001, + 0.9)) + << ps; + } + + void SetUp() override + { + dataset.emplace(raft::make_device_matrix(res, ps.n_rows, ps.dim)); + queries.emplace(raft::make_device_matrix(res, ps.n_queries, ps.dim)); + neighbors_upsm.emplace(raft::make_device_matrix(res, ps.n_queries, ps.k)); + neighbors_dynb.emplace(raft::make_device_matrix(res, ps.n_queries, ps.k)); + distances_upsm.emplace( + raft::make_device_matrix(res, ps.n_queries, ps.k)); + distances_dynb.emplace( + raft::make_device_matrix(res, ps.n_queries, ps.k)); + + raft::random::RngState rng(666ULL); + if constexpr (std::is_same_v || std::is_same_v) { + raft::random::uniform( + res, rng, dataset->data_handle(), dataset->size(), data_type(0.1), data_type(2.0)); + raft::random::uniform( + res, rng, queries->data_handle(), queries->size(), data_type(0.1), data_type(2.0)); + } else { + raft::random::uniformInt( + res, rng, dataset->data_handle(), dataset->size(), data_type(1), data_type(20)); + raft::random::uniformInt( + res, rng, queries->data_handle(), queries->size(), data_type(1), data_type(20)); + } + raft::resource::sync_stream(res); + } + + 
void TearDown() override + { + index_dynb.reset(); + index_upsm.reset(); + dataset.reset(); + queries.reset(); + neighbors_upsm.reset(); + neighbors_dynb.reset(); + distances_upsm.reset(); + distances_dynb.reset(); + raft::resource::sync_stream(res); + } +}; + +inline std::vector generate_inputs() +{ + std::vector inputs{dynamic_batching_spec{}}; + + for (auto alt_n_queries : {10, 50, 100}) { + dynamic_batching_spec input{}; + input.n_queries = alt_n_queries; + inputs.push_back(input); + } + + for (auto alt_k : {100, 200}) { + dynamic_batching_spec input{}; + input.k = alt_k; + inputs.push_back(input); + } + + for (auto alt_max_batch_size : {4, 16, 128, 256, 512, 1024}) { + dynamic_batching_spec input{}; + input.max_batch_size = alt_max_batch_size; + inputs.push_back(input); + } + + for (auto alt_n_queues : {1, 2, 16, 32}) { + dynamic_batching_spec input{}; + input.n_queues = alt_n_queues; + inputs.push_back(input); + } + + for (auto alt_max_concurrent_threads : {1, 2, 16, 32}) { + dynamic_batching_spec input{}; + input.max_concurrent_threads = alt_max_concurrent_threads; + inputs.push_back(input); + } + + { + auto n = inputs.size(); + for (size_t i = 0; i < n; i++) { + auto input = inputs[i]; + input.conservative_dispatch = !input.conservative_dispatch; + inputs.push_back(input); + } + } + + return inputs; +} + +const std::vector inputs = generate_inputs(); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_cagra.cu b/cpp/test/neighbors/dynamic_batching/test_cagra.cu new file mode 100644 index 000000000..604fc29cf --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_cagra.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using cagra_F32 = dynamic_batching_test, + cagra::build, + cagra::search>; + +using cagra_U8 = dynamic_batching_test, + cagra::build, + cagra::search>; + +template +static void set_default_cagra_params(fixture& that) +{ + that.build_params_upsm.intermediate_graph_degree = 128; + that.build_params_upsm.graph_degree = 64; + that.search_params_upsm.itopk_size = + std::clamp(raft::bound_by_power_of_two(that.ps.k) * 16, 128, 512); +} + +TEST_P(cagra_F32, single_cta) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::SINGLE_CTA; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_F32, multi_cta) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::MULTI_CTA; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_F32, multi_kernel) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::MULTI_KERNEL; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_U8, defaults) +{ + set_default_cagra_params(*this); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_F32, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_U8, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu 
b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu new file mode 100644 index 000000000..4922cffa3 --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using ivf_flat_i8 = dynamic_batching_test, + ivf_flat::build, + ivf_flat::search>; + +TEST_P(ivf_flat_i8, defaults) +{ + build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows)); + search_params_upsm.n_probes = + std::max(std::min(build_params_upsm.n_lists, 10), + raft::div_rounding_up_safe(build_params_upsm.n_lists, 50)); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_flat_i8, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu new file mode 100644 index 000000000..ec57e0b57 --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using ivf_pq_f16 = + dynamic_batching_test, ivf_pq::build, ivf_pq::search>; + +TEST_P(ivf_pq_f16, defaults) +{ + build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows)); + search_params_upsm.n_probes = + std::max(std::min(build_params_upsm.n_lists, 10), + raft::div_rounding_up_safe(build_params_upsm.n_lists, 50)); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_pq_f16, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index d55d58eb0..ab810ab53 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -11,6 +11,7 @@ Nearest Neighbors neighbors_bruteforce.rst neighbors_cagra.rst + neighbors_dynamic_batching.rst neighbors_hnsw.rst neighbors_ivf_flat.rst neighbors_ivf_pq.rst diff --git a/docs/source/cpp_api/neighbors_dynamic_batching.rst b/docs/source/cpp_api/neighbors_dynamic_batching.rst new file mode 100644 index 000000000..adc5cb56a --- /dev/null +++ b/docs/source/cpp_api/neighbors_dynamic_batching.rst @@ -0,0 +1,45 @@ +Dynamic Batching +================ + +Dynamic Batching allows grouping small search requests into batches to increase the device occupancy and throughput while keeping the latency within limits. + +.. 
role:: py(code) + :language: c++ + :class: highlight + +``#include `` + +namespace *cuvs::neighbors::dynamic_batching* + +Index build parameters +---------------------- + +.. doxygengroup:: dynamic_batching_cpp_index_params + :project: cuvs + :members: + :content-only: + +Index search parameters +----------------------- + +.. doxygengroup:: dynamic_batching_cpp_search_params + :project: cuvs + :members: + :content-only: + +Index +----- + +.. doxygengroup:: dynamic_batching_cpp_index + :project: cuvs + :members: + :content-only: + + +Index search +------------ + +.. doxygengroup:: dynamic_batching_cpp_search + :project: cuvs + :members: + :content-only: diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 092b65ed9..951e0ad0c 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -38,6 +38,7 @@ include(../cmake/thirdparty/get_cuvs.cmake) # -------------- compile tasks ----------------- # add_executable(CAGRA_EXAMPLE src/cagra_example.cu) add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu) +add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu) add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu) add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu) add_executable(VAMANA_EXAMPLE src/vamana_example.cu) @@ -48,6 +49,9 @@ target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads ) +target_link_libraries( + DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads +) target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $) diff --git a/examples/cpp/src/dynamic_batching_example.cu b/examples/cpp/src/dynamic_batching_example.cu new file mode 100644 index 000000000..95f66a454 --- /dev/null +++ b/examples/cpp/src/dynamic_batching_example.cu @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.cuh" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// A helper to split the dataset into chunks +template +auto slice_matrix(const DeviceMatrixOrView &source, + typename DeviceMatrixOrView::index_type offset_rows, + typename DeviceMatrixOrView::index_type count_rows) { + auto n_cols = source.extent(1); + return raft::make_device_matrix_view< + typename DeviceMatrixOrView::element_type, + typename DeviceMatrixOrView::index_type>( + const_cast( + source.data_handle()) + + offset_rows * n_cols, + count_rows, n_cols); +} + +// A helper to measure the execution time of a function +template +void time_it(std::string label, F f, Args &&...xs) { + auto start = std::chrono::system_clock::now(); + f(std::forward(xs)...); + auto end = std::chrono::system_clock::now(); + auto t = std::chrono::duration_cast(end - start); + auto t_ms = double(t.count()) / 1000.0; + std::cout << "[" << label << "] execution time: " << t_ms << " ms" + << std::endl; +} + +/** + * Wrap waiting on a stream work into an async C++ future object. + * This is similar to recording and waiting on CUDA events, but in C++11 API. 
+ */ +struct cuda_work_completion_promise { + + cuda_work_completion_promise(const raft::resources &res) { + auto *promise = new std::promise; + RAFT_CUDA_TRY(cudaLaunchHostFunc(raft::resource::get_cuda_stream(res), + completion_callback, + reinterpret_cast(promise))); + value_ = promise->get_future(); + } + + /** + * Waiting on the produced `future` object has the same effect as + * cudaEventSynchronize if an event was recorded at the time of creation of + * this promise object. + */ + auto get_future() -> std::future && { return std::move(value_); } + +private: + std::future value_; + + static void completion_callback(void *ptr) { + auto *promise = reinterpret_cast *>(ptr); + promise->set_value(); + delete promise; + } +}; + +void dynamic_batching_example( + raft::resources const &res, + raft::device_matrix_view dataset, + raft::device_matrix_view queries) { + using namespace cuvs::neighbors; + + // Number of neighbors to search + int64_t topk = 100; + + // Streaming scenario: maximum number of requests in-flight + constexpr int64_t kMaxJobs = 1000; + // Streaming scenario: number of concurrent CUDA streams + constexpr int64_t kNumWorkerStreams = 5; + + // Split the queries into two subsets to run every experiment twice and thus + // surface any initialization overheads. 
+ int64_t n_queries_a = queries.extent(0) / 2; + int64_t n_queries_b = queries.extent(0) - n_queries_a; + + auto queries_a = slice_matrix(queries, 0, n_queries_a); + auto queries_b = slice_matrix(queries, n_queries_a, n_queries_b); + + // create output arrays + auto neighbors = + raft::make_device_matrix(res, queries.extent(0), topk); + auto distances = + raft::make_device_matrix(res, queries.extent(0), topk); + // slice them same as queries + auto neighbors_a = slice_matrix(neighbors, 0, n_queries_a); + auto distances_a = slice_matrix(distances, 0, n_queries_a); + auto neighbors_b = slice_matrix(neighbors, n_queries_a, n_queries_b); + auto distances_b = slice_matrix(distances, n_queries_a, n_queries_b); + + // use default index parameters + cagra::index_params orig_index_params; + + std::cout << "Building CAGRA index (search graph)" << std::endl; + auto orig_index = cagra::build(res, orig_index_params, dataset); + + std::cout << "CAGRA index has " << orig_index.size() << " vectors" + << std::endl; + std::cout << "CAGRA graph has degree " << orig_index.graph_degree() + << ", graph size [" << orig_index.graph().extent(0) << ", " + << orig_index.graph().extent(1) << "]" << std::endl; + + // use default search parameters + cagra::search_params orig_search_params; + // get a decent recall by increasing the internal topk list + orig_search_params.itopk_size = 512; + orig_search_params.algo = cagra::search_algo::SINGLE_CTA; + + // Set up dynamic batching parameters + dynamic_batching::index_params dynb_index_params{ + /* default-initializing the parent `neighbors::index_params` + (not used anyway) */ + {}, + /* Set the K in advance (the batcher needs to allocate buffers) */ + topk, + /* Configure the number and the size of IO buffers */ + 64, + kNumWorkerStreams}; + + // "build" the index (it's a low-cost index wrapping), + // that is we need to pass the original index and its search params here + dynamic_batching::index dynb_index( + res, dynb_index_params, orig_index, 
orig_search_params); + + // You can implement job priorities by varying the deadlines of individual + // requests + dynamic_batching::search_params dynb_search_params; + dynb_search_params.dispatch_timeout_ms = 0.1; + + // Define the big-batch setting as a baseline for measuring the throughput. + auto search_batch_orig = + [&res, &orig_index, &orig_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + cagra::search(res, orig_search_params, orig_index, queries, neighbors, + distances); + raft::resource::sync_stream(res); + }; + + // Launch the baseline search: check the big-batch performance + time_it("standard/batch A", search_batch_orig, queries_a, neighbors_a, + distances_a); + time_it("standard/batch B", search_batch_orig, queries_b, neighbors_b, + distances_b); + + // Streaming scenario: prepare concurrent resources + rmm::cuda_stream_pool worker_streams{kNumWorkerStreams}; + std::vector resource_pool(0); + for (int64_t i = 0; i < kNumWorkerStreams; i++) { + resource_pool.push_back(res); + raft::resource::set_cuda_stream(resource_pool[i], + worker_streams.get_stream(i)); + } + + // Streaming scenario: + // send queries one-by-one, with a maximum kMaxJobs in-flight + auto search_async_orig = + [&resource_pool, &orig_index, &orig_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + auto work_size = queries.extent(0); + std::array, kMaxJobs> futures; + for (int64_t i = 0; i < work_size + kMaxJobs; i++) { + // wait for previous job in the same slot to finish + if (i >= kMaxJobs) { + futures[i % kMaxJobs].wait(); + } + // submit a new job + if (i < work_size) { + auto &res = resource_pool[i % kNumWorkerStreams]; + cagra::search(res, orig_search_params, orig_index, + slice_matrix(queries, i, 1), + slice_matrix(neighbors, i, 1), + slice_matrix(distances, i, 1)); + futures[i % kMaxJobs] = + 
cuda_work_completion_promise(res).get_future(); + } + } + }; + + // Streaming scenario with dynamic batching: + // send queries one-by-one, with a maximum kMaxJobs in-flight, + // yet allow grouping the sequential requests (subject to deadlines) + auto search_async_dynb = + [&resource_pool, &dynb_index, &dynb_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + auto work_size = queries.extent(0); + std::array, kMaxJobs> futures; + for (int64_t i = 0; i < work_size + kMaxJobs; i++) { + // wait for previous job in the same slot to finish + if (i >= kMaxJobs) { + futures[i % kMaxJobs].wait(); + } + // submit a new job + if (i < work_size) { + auto &res = resource_pool[i % kNumWorkerStreams]; + dynamic_batching::search(res, dynb_search_params, dynb_index, + slice_matrix(queries, i, 1), + slice_matrix(neighbors, i, 1), + slice_matrix(distances, i, 1)); + futures[i % kMaxJobs] = + cuda_work_completion_promise(res).get_future(); + } + } + }; + + // Try to handle the same amount of work in the async setting using the + // standard implementation. + time_it("standard/async A", search_async_orig, queries_a, neighbors_a, + distances_a); + time_it("standard/async B", search_async_orig, queries_b, neighbors_b, + distances_b); + + // Do the same using dynamic batching + time_it("dynamic_batching/async A", search_async_dynb, queries_a, neighbors_a, + distances_a); + time_it("dynamic_batching/async B", search_async_dynb, queries_b, neighbors_b, + distances_b); +} + +int main() { + raft::device_resources res; + + // Set the raft resource to use a pool for internal memory allocations + // (workspace) and limit the available workspace size. + raft::resource::set_workspace_to_pool_resource(res, + 12ull * 1024 * 1024 * 1024ull); + + // Create input arrays. 
+ int64_t n_samples = 1000000; + int64_t n_dim = 128; + int64_t n_queries = 10000; + auto dataset = + raft::make_device_matrix(res, n_samples, n_dim); + auto queries = + raft::make_device_matrix(res, n_queries, n_dim); + generate_dataset(res, dataset.view(), queries.view()); + + // run the interesting part of the program + dynamic_batching_example(res, raft::make_const_mdspan(dataset.view()), + raft::make_const_mdspan(queries.view())); +} From b051f805129fab36ee5da7299ed0fb98850fa44c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20F=C3=B6rster?= <97973773+mfoerste4@users.noreply.github.com> Date: Thu, 5 Dec 2024 06:27:33 +0100 Subject: [PATCH 42/47] Add C++ API scalar quantization (#494) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First draft for scalar quantization. WIP status: * only int8_t target type * quantile computation inefficient (via sampling & sorting) Authors: - Malte Förster (https://github.com/mfoerste4) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Tamas Bela Feher (https://github.com/tfeher) - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/494 --- cpp/CMakeLists.txt | 1 + .../cuvs/preprocessing/quantize/scalar.hpp | 489 ++++++++++++++++++ .../preprocessing/quantize/detail/scalar.cuh | 227 ++++++++ cpp/src/preprocessing/quantize/scalar.cu | 74 +++ cpp/test/CMakeLists.txt | 5 + cpp/test/preprocessing/scalar_quantization.cu | 291 +++++++++++ docs/source/cpp_api.rst | 1 + docs/source/cpp_api/preprocessing.rst | 12 + .../source/cpp_api/preprocessing_quantize.rst | 20 + 9 files changed, 1120 insertions(+) create mode 100644 cpp/include/cuvs/preprocessing/quantize/scalar.hpp create mode 100644 cpp/src/preprocessing/quantize/detail/scalar.cuh create mode 100644 cpp/src/preprocessing/quantize/scalar.cu create mode 100644 cpp/test/preprocessing/scalar_quantization.cu create mode 100644 docs/source/cpp_api/preprocessing.rst create mode 100644 docs/source/cpp_api/preprocessing_quantize.rst diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6af423bd5..199bb232d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -459,6 +459,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/vamana_serialize_float.cu src/neighbors/vamana_serialize_uint8.cu src/neighbors/vamana_serialize_int8.cu + src/preprocessing/quantize/scalar.cu src/selection/select_k_float_int64_t.cu src/selection/select_k_float_int32_t.cu src/selection/select_k_float_uint32_t.cu diff --git a/cpp/include/cuvs/preprocessing/quantize/scalar.hpp b/cpp/include/cuvs/preprocessing/quantize/scalar.hpp new file mode 100644 index 000000000..49b4bb7a6 --- /dev/null +++ b/cpp/include/cuvs/preprocessing/quantize/scalar.hpp @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::preprocessing::quantize::scalar { + +/** + * @defgroup scalar Scalar quantizer utilities + * @{ + */ + +/** + * @brief quantizer parameters. + */ +struct params { + /* + * specifies how many outliers at top & bottom will be ignored + * needs to be within range of (0, 1] + */ + float quantile = 0.99; +}; + +/** + * @brief Stores the scalar quantization range (min_, max_) determined during training + * + * The quantization is performed by a linear mapping of an interval in the + * float data type to the full range of the quantized int type. + * + * @tparam T data element type + * + */ +template +struct quantizer { + T min_; + T max_; +}; + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on device + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::device_matrix_view dataset); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on host + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::host_matrix_view dataset); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_device_matrix(handle, samples, + * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a 
row-major matrix view on host + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that, depending on the chosen data types and the training dataset, the conversion is + * not lossless. + * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * quantized_dataset.view(), dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that, depending on the chosen data types and the training dataset, the conversion is
+ * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix(samples, + * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on device + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::device_matrix_view dataset); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. 
quantile + * @param[in] dataset a row-major matrix view on host + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::host_matrix_view dataset); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_device_matrix(handle, samples, + * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that, depending on the chosen data types and the training dataset,
the conversion is + * not lossless. + * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix(handle, + * samples, features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * quantized_dataset.view(), dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that, depending on the chosen data types and the training dataset, the conversion is + * not lossless. + * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix(samples, + * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * quantized_dataset.view(), dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset.
+ * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on device + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::device_matrix_view dataset); + +/** + * @brief Initializes a scalar quantizer to be used later for quantizing the dataset. + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); + * @endcode + * + * @param[in] res raft resource + * @param[in] params configure scalar quantizer, e.g. quantile + * @param[in] dataset a row-major matrix view on host + * + * @return quantizer + */ +quantizer train(raft::resources const& res, + const params params, + raft::host_matrix_view dataset); + +/** + * @brief Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_device_matrix(handle, samples, + * features); cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief 
Applies quantization transform to given dataset + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::preprocessing::quantize::scalar::params params; + * auto quantizer = cuvs::preprocessing::quantize::scalar::train(handle, params, + * dataset); auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. + * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_device_matrix(handle, samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_device_matrix(handle, + * samples, features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on device + * @param[out] out a row-major matrix view on device + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out); + +/** + * @brief Perform inverse quantization step on previously quantized dataset + * + * Note that depending on the chosen data types train dataset the conversion is + * not lossless. 
+ * + * Usage example: + * @code{.cpp} + * auto quantized_dataset = raft::make_host_matrix(samples, features); + * cuvs::preprocessing::quantize::scalar::transform(handle, quantizer, dataset, + * quantized_dataset.view()); auto dataset_revert = raft::make_host_matrix(samples, + * features); cuvs::preprocessing::quantize::scalar::inverse_transform(handle, quantizer, + * dataset_revert.view()); + * @endcode + * + * @param[in] res raft resource + * @param[in] quantizer a scalar quantizer + * @param[in] dataset a row-major matrix view on host + * @param[out] out a row-major matrix view on host + * + */ +void inverse_transform(raft::resources const& res, + const quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out); + +/** @} */ // end of group scalar + +} // namespace cuvs::preprocessing::quantize::scalar diff --git a/cpp/src/preprocessing/quantize/detail/scalar.cuh b/cpp/src/preprocessing/quantize/detail/scalar.cuh new file mode 100644 index 000000000..fc132eb7f --- /dev/null +++ b/cpp/src/preprocessing/quantize/detail/scalar.cuh @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::preprocessing::quantize::detail { + +template +_RAFT_HOST_DEVICE bool fp_lt(const T& a, const T& b) +{ + return a < b; +} + +template <> +_RAFT_HOST_DEVICE bool fp_lt(const half& a, const half& b) +{ + return static_cast(a) < static_cast(b); +} + +template +struct quantize_op { + const T min_; + const T max_; + const QuantI q_type_min_ = std::numeric_limits::min(); + const QuantI q_type_max_ = std::numeric_limits::max(); + const TempT scalar_; + const TempT offset_; + + constexpr explicit quantize_op(T min, T max) + : min_(min), + max_(max), + scalar_(static_cast(max_) > static_cast(min_) + ? ((static_cast(q_type_max_) - static_cast(q_type_min_)) / + (static_cast(max_) - static_cast(min_))) + : static_cast(1)), + offset_(static_cast(q_type_min_) - static_cast(min_) * scalar_) + { + } + + constexpr RAFT_INLINE_FUNCTION QuantI operator()(const T& x) const + { + if (!fp_lt(min_, x)) return q_type_min_; + if (!fp_lt(x, max_)) return q_type_max_; + return static_cast(lroundf(scalar_ * static_cast(x) + offset_)); + } + + constexpr RAFT_INLINE_FUNCTION T operator()(const QuantI& x) const + { + return static_cast((static_cast(x) - offset_) / scalar_); + } +}; + +template +std::tuple quantile_min_max(raft::resources const& res, + raft::device_matrix_view dataset, + double quantile) +{ + // settings for quantile approximation + constexpr size_t max_num_samples = 1000000; + constexpr int seed = 137; + + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // select subsample + raft::random::RngState rng(seed); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + size_t subset_size = std::min(max_num_samples, n_elements); + auto subset = raft::make_device_vector(res, subset_size); + auto dataset_view = raft::make_device_vector_view(dataset.data_handle(), n_elements); + raft::random::sample_without_replacement( + res, rng, dataset_view, 
std::nullopt, subset.view(), std::nullopt); + + // quantile / sort and pick for now + thrust::sort(raft::resource::get_thrust_policy(res), + subset.data_handle(), + subset.data_handle() + subset_size); + + double half_quantile_pos = (0.5 + 0.5 * quantile) * subset_size; + int pos_max = std::ceil(half_quantile_pos) - 1; + int pos_min = subset_size - pos_max - 1; + + T minmax_h[2]; + raft::update_host(&(minmax_h[0]), subset.data_handle() + pos_min, 1, stream); + raft::update_host(&(minmax_h[1]), subset.data_handle() + pos_max, 1, stream); + raft::resource::sync_stream(res); + + return {minmax_h[0], minmax_h[1]}; +} + +template +std::tuple quantile_min_max(raft::resources const& res, + raft::host_matrix_view dataset, + double quantile) +{ + // settings for quantile approximation + constexpr size_t max_num_samples = 1000000; + constexpr int seed = 137; + + // select subsample + std::mt19937 rng(seed); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + size_t subset_size = std::min(max_num_samples, n_elements); + std::vector subset; + std::sample(dataset.data_handle(), + dataset.data_handle() + n_elements, + std::back_inserter(subset), + subset_size, + rng); + + // quantile / sort and pick for now + thrust::sort(thrust::omp::par, subset.data(), subset.data() + subset_size, fp_lt); + double half_quantile_pos = (0.5 + 0.5 * quantile) * subset_size; + int pos_max = std::ceil(half_quantile_pos) - 1; + int pos_min = subset_size - pos_max - 1; + + return {subset[pos_min], subset[pos_max]}; +} + +template +cuvs::preprocessing::quantize::scalar::quantizer train( + raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::params params, + raft::device_matrix_view dataset) +{ + RAFT_EXPECTS(params.quantile > 0.0 && params.quantile <= 1.0, + "quantile for scalar quantization needs to be within (0, 1] but is %f", + params.quantile); + + auto [min, max] = detail::quantile_min_max(res, dataset, params.quantile); + + RAFT_LOG_DEBUG("quantizer train min=%lf 
max=%lf.", double(min), double(max)); + + return cuvs::preprocessing::quantize::scalar::quantizer{min, max}; +} + +template +cuvs::preprocessing::quantize::scalar::quantizer train( + raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::params params, + raft::host_matrix_view dataset) +{ + RAFT_EXPECTS(params.quantile > 0.0 && params.quantile <= 1.0, + "quantile for scalar quantization needs to be within (0, 1] but is %f", + params.quantile); + + auto [min, max] = detail::quantile_min_max(res, dataset, params.quantile); + + RAFT_LOG_DEBUG("quantizer train min=%lf max=%lf.", double(min), double(max)); + + return cuvs::preprocessing::quantize::scalar::quantizer{min, max}; +} + +template +void transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + raft::linalg::map(res, out, quantize_op(quantizer.min_, quantizer.max_), dataset); +} + +template +void transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + raft::host_matrix_view dataset, + raft::host_matrix_view out) +{ + auto main_op = quantize_op(quantizer.min_, quantizer.max_); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + +#pragma omp parallel for + for (size_t i = 0; i < n_elements; ++i) { + out.data_handle()[i] = main_op(dataset.data_handle()[i]); + } +} + +template +void inverse_transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + raft::device_matrix_view dataset, + raft::device_matrix_view out) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + raft::linalg::map(res, out, quantize_op(quantizer.min_, quantizer.max_), dataset); +} + +template +void inverse_transform(raft::resources const& res, + const cuvs::preprocessing::quantize::scalar::quantizer& quantizer, + 
raft::host_matrix_view dataset, + raft::host_matrix_view out) +{ + auto main_op = quantize_op(quantizer.min_, quantizer.max_); + size_t n_elements = dataset.extent(0) * dataset.extent(1); + +#pragma omp parallel for + for (size_t i = 0; i < n_elements; ++i) { + out.data_handle()[i] = main_op(dataset.data_handle()[i]); + } +} + +} // namespace cuvs::preprocessing::quantize::detail diff --git a/cpp/src/preprocessing/quantize/scalar.cu b/cpp/src/preprocessing/quantize/scalar.cu new file mode 100644 index 000000000..9624ad4fe --- /dev/null +++ b/cpp/src/preprocessing/quantize/scalar.cu @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "./detail/scalar.cuh" + +#include + +namespace cuvs::preprocessing::quantize::scalar { + +#define CUVS_INST_QUANTIZATION(T, QuantI) \ + auto train(raft::resources const& res, \ + const params params, \ + raft::device_matrix_view dataset) \ + ->quantizer \ + { \ + return detail::train(res, params, dataset); \ + } \ + auto train(raft::resources const& res, \ + const params params, \ + raft::host_matrix_view dataset) \ + ->quantizer \ + { \ + return detail::train(res, params, dataset); \ + } \ + void transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view out) \ + { \ + detail::transform(res, quantizer, dataset, out); \ + } \ + void transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view out) \ + { \ + detail::transform(res, quantizer, dataset, out); \ + } \ + void inverse_transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::device_matrix_view dataset, \ + raft::device_matrix_view out) \ + { \ + detail::inverse_transform(res, quantizer, dataset, out); \ + } \ + void inverse_transform(raft::resources const& res, \ + const quantizer& quantizer, \ + raft::host_matrix_view dataset, \ + raft::host_matrix_view out) \ + { \ + detail::inverse_transform(res, quantizer, dataset, out); \ + } \ + template struct quantizer; + +CUVS_INST_QUANTIZATION(double, int8_t); +CUVS_INST_QUANTIZATION(float, int8_t); +CUVS_INST_QUANTIZATION(half, int8_t); + +#undef CUVS_INST_QUANTIZATION + +} // namespace cuvs::preprocessing::quantize::scalar \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 1c8de2ad0..0ecac6ec2 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -226,6 +226,11 @@ if(BUILD_TESTS) PERCENT 100 ) + + ConfigureTest( + NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100 + ) + 
ConfigureTest( NAME STATS_TEST PATH stats/trustworthiness.cu stats/silhouette_score.cu GPUS 1 PERCENT 100 ) diff --git a/cpp/test/preprocessing/scalar_quantization.cu b/cpp/test/preprocessing/scalar_quantization.cu new file mode 100644 index 000000000..2fdfe7555 --- /dev/null +++ b/cpp/test/preprocessing/scalar_quantization.cu @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::preprocessing::quantize::scalar { + +template +struct QuantizationInputs { + cuvs::preprocessing::quantize::scalar::params quantization_params; + int rows; + int cols; + T min = T(-1.0); + T max = T(1.0); + double threshold = 2e-2; +}; + +template +std::ostream& operator<<(std::ostream& os, const QuantizationInputs& inputs) +{ + return os << "quantization_quantile:<" << inputs.quantization_params.quantile + << "> rows:" << inputs.rows << " cols:" << inputs.cols << " min:" << (double)inputs.min + << " max:" << (double)inputs.max; +} + +template +class QuantizationTest : public ::testing::TestWithParam> { + public: + QuantizationTest() + : params_(::testing::TestWithParam>::GetParam()), + stream(raft::resource::get_cuda_stream(handle)), + input_(0, stream) + { + } + + double getRelativeErrorStddev(const T* array_a, const T* array_b, size_t size, float quantile) + { + // relative error elementwise + 
rmm::device_uvector relative_error(size, stream); + raft::linalg::binaryOp( + relative_error.data(), + array_a, + array_b, + size, + [] __device__(double a, double b) { + return a != b ? (raft::abs(a - b) / raft::max(raft::abs(a), raft::abs(b))) : 0; + }, + stream); + + // sort by size --> remove largest errors to account for quantile chosen + thrust::sort(raft::resource::get_thrust_policy(handle), + relative_error.data(), + relative_error.data() + size); + int elements_to_consider = + std::ceil(double(params_.quantization_params.quantile) * double(size)); + + rmm::device_uvector mu(1, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(mu.data(), 0, sizeof(double), stream)); + + rmm::device_uvector error_stddev(1, stream); + raft::stats::stddev(error_stddev.data(), + relative_error.data(), + mu.data(), + 1, + elements_to_consider, + false, + true, + stream); + + double error_stddev_h; + raft::update_host(&error_stddev_h, error_stddev.data(), 1, stream); + raft::resource::sync_stream(handle, stream); + return error_stddev_h; + } + + protected: + void testScalarQuantization() + { + // dataset identical on host / device + auto dataset = raft::make_device_matrix_view( + (const T*)(input_.data()), rows_, cols_); + auto dataset_h = raft::make_host_matrix_view( + (const T*)(host_input_.data()), rows_, cols_); + + size_t print_size = std::min(input_.size(), 20ul); + + // train quantizer_1 on device + auto quantizer_1 = + cuvs::preprocessing::quantize::scalar::train(handle, params_.quantization_params, dataset); + std::cerr << "Q1: min = " << (double)quantizer_1.min_ << ", max = " << (double)quantizer_1.max_ + << std::endl; + + { + auto quantized_input_h = raft::make_host_matrix(rows_, cols_); + auto quantized_input_d = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_1, dataset, quantized_input_d.view()); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_1, dataset_h, 
quantized_input_h.view()); + + { + raft::print_device_vector("Input array: ", input_.data(), print_size, std::cerr); + + rmm::device_uvector quantization_for_print(print_size, stream); + raft::linalg::unaryOp(quantization_for_print.data(), + quantized_input_d.data_handle(), + print_size, + raft::cast_op{}, + stream); + raft::resource::sync_stream(handle, stream); + raft::print_device_vector( + "Quantized array 1: ", quantization_for_print.data(), print_size, std::cerr); + } + + // test (inverse) transform host/device equal + ASSERT_TRUE(devArrMatchHost(quantized_input_h.data_handle(), + quantized_input_d.data_handle(), + input_.size(), + cuvs::Compare(), + stream)); + + auto quantized_input_h_const_view = raft::make_host_matrix_view( + quantized_input_h.data_handle(), rows_, cols_); + auto re_transformed_input_h = raft::make_host_matrix(rows_, cols_); + cuvs::preprocessing::quantize::scalar::inverse_transform( + handle, quantizer_1, quantized_input_h_const_view, re_transformed_input_h.view()); + + auto quantized_input_d_const_view = raft::make_device_matrix_view( + quantized_input_d.data_handle(), rows_, cols_); + auto re_transformed_input_d = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::inverse_transform( + handle, quantizer_1, quantized_input_d_const_view, re_transformed_input_d.view()); + raft::print_device_vector( + "re-transformed array: ", re_transformed_input_d.data_handle(), print_size, std::cerr); + + { + double l2_error = getRelativeErrorStddev(dataset.data_handle(), + re_transformed_input_d.data_handle(), + input_.size(), + params_.quantization_params.quantile); + std::cerr << "error stddev = " << l2_error << ", threshold = " << params_.threshold + << std::endl; + // test (inverse) transform close to original dataset + ASSERT_TRUE(l2_error < params_.threshold); + } + } + + // train quantizer_2 on host + auto quantizer_2 = + cuvs::preprocessing::quantize::scalar::train(handle, params_.quantization_params, 
dataset_h); + std::cerr << "Q2: min = " << (double)quantizer_2.min_ << ", max = " << (double)quantizer_2.max_ + << std::endl; + + // check both quantizers are the same (valid if sampling is identical) + if (input_.size() <= 1000000) { + ASSERT_TRUE((double)quantizer_1.min_ == (double)quantizer_2.min_); + ASSERT_TRUE((double)quantizer_1.max_ == (double)quantizer_2.max_); + } + + { + // test transform host/device equal + auto quantized_input_h = raft::make_host_matrix(rows_, cols_); + auto quantized_input_d = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_2, dataset, quantized_input_d.view()); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_2, dataset_h, quantized_input_h.view()); + + { + rmm::device_uvector quantization_for_print(print_size, stream); + raft::linalg::unaryOp(quantization_for_print.data(), + quantized_input_d.data_handle(), + print_size, + raft::cast_op{}, + stream); + raft::resource::sync_stream(handle, stream); + raft::print_device_vector( + "Quantized array 2: ", quantization_for_print.data(), print_size, std::cerr); + } + + ASSERT_TRUE(devArrMatchHost(quantized_input_h.data_handle(), + quantized_input_d.data_handle(), + input_.size(), + cuvs::Compare(), + stream)); + } + + // sort_by_key (input, quantization) -- check <= on result + { + auto quantized_input = raft::make_device_matrix(handle, rows_, cols_); + cuvs::preprocessing::quantize::scalar::transform( + handle, quantizer_1, dataset, quantized_input.view()); + thrust::sort_by_key(raft::resource::get_thrust_policy(handle), + input_.data(), + input_.data() + input_.size(), + quantized_input.data_handle()); + std::vector quantized_input_sorted_host(input_.size()); + raft::update_host( + quantized_input_sorted_host.data(), quantized_input.data_handle(), input_.size(), stream); + raft::resource::sync_stream(handle, stream); + + for (size_t i = 0; i < input_.size() - 1; ++i) { + 
ASSERT_TRUE(quantized_input_sorted_host[i] <= quantized_input_sorted_host[i + 1]); + } + } + } + + void SetUp() override + { + rows_ = params_.rows; + cols_ = params_.cols; + + int n_elements = rows_ * cols_; + input_.resize(n_elements, stream); + host_input_.resize(n_elements); + + // random input + unsigned long long int seed = 1234ULL; + raft::random::RngState r(seed); + uniform(handle, r, input_.data(), input_.size(), params_.min, params_.max); + + raft::update_host(host_input_.data(), input_.data(), input_.size(), stream); + + raft::resource::sync_stream(handle, stream); + } + + private: + raft::resources handle; + cudaStream_t stream; + + QuantizationInputs params_; + int rows_; + int cols_; + rmm::device_uvector input_; + std::vector host_input_; +}; + +template +const std::vector> inputs = { + {{1.0}, 5, 5, T(0.0), T(1.0)}, + {{0.98}, 10, 20, T(0.0), T(1.0)}, + {{0.90}, 1000, 1500, T(-500.0), T(100.0)}, + {{0.59}, 100, 200}, + {{0.1}, 1, 1, T(0.0), T(1.0)}, + {{0.01}, 50, 50, T(0.0), T(1.0)}, + {{0.94}, 10, 20, T(-1.0), T(0.0)}, + {{0.95}, 10, 2, T(50.0), T(100.0)}, + {{0.95}, 10, 20, T(-500.0), T(-100.0)}, + {{0.95}, 10, 20, T(5.0), T(5.0)}, +}; + +typedef QuantizationTest QuantizationTest_float_int8t; +TEST_P(QuantizationTest_float_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); } + +typedef QuantizationTest QuantizationTest_double_int8t; +TEST_P(QuantizationTest_double_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); } + +typedef QuantizationTest QuantizationTest_half_int8t; +TEST_P(QuantizationTest_half_int8t, ScalarQuantizationTest) { this->testScalarQuantization(); } + +INSTANTIATE_TEST_CASE_P(QuantizationTest, + QuantizationTest_float_int8t, + ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(QuantizationTest, + QuantizationTest_double_int8t, + ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(QuantizationTest, + QuantizationTest_half_int8t, + ::testing::ValuesIn(inputs)); + +} // namespace 
cuvs::preprocessing::quantize::scalar diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index 49732dc92..34f48a88f 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -10,5 +10,6 @@ C++ API Documentation cpp_api/cluster.rst cpp_api/distance.rst cpp_api/neighbors.rst + cpp_api/preprocessing.rst cpp_api/selection.rst cpp_api/stats.rst diff --git a/docs/source/cpp_api/preprocessing.rst b/docs/source/cpp_api/preprocessing.rst new file mode 100644 index 000000000..1c2b0f051 --- /dev/null +++ b/docs/source/cpp_api/preprocessing.rst @@ -0,0 +1,12 @@ +Preprocessing +============= + +.. role:: py(code) + :language: c++ + :class: highlight + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + preprocessing_quantize.rst diff --git a/docs/source/cpp_api/preprocessing_quantize.rst b/docs/source/cpp_api/preprocessing_quantize.rst new file mode 100644 index 000000000..b660c61c5 --- /dev/null +++ b/docs/source/cpp_api/preprocessing_quantize.rst @@ -0,0 +1,20 @@ +Quantize +======== + +This page provides C++ class references for the publicly-exposed elements of the +`cuvs/preprocessing/quantize` package. + +.. role:: py(code) + :language: c++ + :class: highlight + +Scalar +------ + +``#include `` + +namespace *cuvs::preprocessing::quantize::scalar* + +.. doxygengroup:: scalar + :project: cuvs + From c9b38623932039722214caf02a516ce12883c9a4 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:44:33 +0100 Subject: [PATCH 43/47] Skip IVF-PQ packing test for lists with not enough data (#512) Skip some checks involving hard-coded offsets into the data when the number of records in the checked PQ list is smaller than needed. Authors: - Artem M. 
Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) URL: https://github.com/rapidsai/cuvs/pull/512 --- cpp/test/neighbors/ann_ivf_pq.cuh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index fd4e330db..3a92b5e3d 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -379,7 +379,14 @@ class ivf_pq_test : public ::testing::TestWithParam { // Pack a few vectors back to the list. int row_offset = 5; int n_vec = 3; - ASSERT_TRUE(row_offset + n_vec < n_rows); + if (static_cast(row_offset + n_vec) > n_rows) { + RAFT_LOG_INFO( + "Skipping IVF-PQ check_packing/pack test for label %u due to insufficient data (%u " + "records)", + label, + uint32_t(n_rows)); + return; + } size_t offset = row_offset * index->pq_dim(); auto codes_to_pack = raft::make_device_matrix_view( codes.data_handle() + offset, n_vec, index->pq_dim()); @@ -393,7 +400,14 @@ class ivf_pq_test : public ::testing::TestWithParam { // Another test with the API that take list_data directly [[maybe_unused]] auto list_data = index->lists()[label]->data.view(); uint32_t n_take = 4; - ASSERT_TRUE(row_offset + n_take < n_rows); + if (static_cast(row_offset + n_take) > n_rows) { + RAFT_LOG_INFO( + "Skipping IVF-PQ check_packing/take test for label %u due to insufficient data (%u " + "records)", + label, + uint32_t(n_rows)); + return; + } auto codes2 = raft::make_device_matrix(handle_, n_take, index->pq_dim()); ivf_pq::helpers::codepacker::unpack( handle_, list_data, index->pq_bits(), row_offset, codes2.view()); From c5e03f2eaf5e30053a248a866428249909e99180 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 5 Dec 2024 07:51:00 -0800 Subject: [PATCH 44/47] Update cuvs to match raft's cutlass changes (#516) Due to the tight integration between cuvs and raft, we need to ensure that cuvs is updated for rapidsai/raft#2503 or builds 
of cuvs that rely on cloning raft will get an incompatible version of cutlass due to raft's update. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/cuvs/pull/516 --- .pre-commit-config.yaml | 5 ++- cpp/cmake/patches/cutlass/build-export.patch | 27 +++++++++++++++ cpp/cmake/patches/cutlass_override.json | 16 +++++++++ cpp/cmake/thirdparty/get_cutlass.cmake | 35 ++++++++++---------- 4 files changed, 64 insertions(+), 19 deletions(-) create mode 100644 cpp/cmake/patches/cutlass/build-export.patch create mode 100644 cpp/cmake/patches/cutlass_override.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f4fdf202e..5e53abd92 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -91,7 +91,10 @@ repos: - id: codespell additional_dependencies: [tomli] args: ["--toml", "pyproject.toml"] - exclude: (?x)^(^CHANGELOG.md$) + exclude: | + (?x) + ^CHANGELOG[.]md$| + ^cpp/cmake/patches/cutlass/build-export[.]patch$ - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: diff --git a/cpp/cmake/patches/cutlass/build-export.patch b/cpp/cmake/patches/cutlass/build-export.patch new file mode 100644 index 000000000..a6423e9c0 --- /dev/null +++ b/cpp/cmake/patches/cutlass/build-export.patch @@ -0,0 +1,27 @@ +From e0a9597946257a01ae8444200f836ee51d5597ba Mon Sep 17 00:00:00 2001 +From: Kyle Edwards +Date: Wed, 20 Nov 2024 16:37:38 -0500 +Subject: [PATCH] Remove erroneous include directories + +These directories are left over from when CuTe was a separate +CMake project. Remove them. 
+--- + CMakeLists.txt | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 7419bdf5e..545384d82 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -665,8 +665,6 @@ target_include_directories( + $ + $ + $ +- $ +- $ + ) + + # Mark CTK headers as system to supress warnings from them +-- +2.34.1 + diff --git a/cpp/cmake/patches/cutlass_override.json b/cpp/cmake/patches/cutlass_override.json new file mode 100644 index 000000000..7bf818987 --- /dev/null +++ b/cpp/cmake/patches/cutlass_override.json @@ -0,0 +1,16 @@ +{ + "packages" : { + "cutlass" : { + "version": "3.5.1", + "git_url": "https://github.com/NVIDIA/cutlass.git", + "git_tag": "v${version}", + "patches" : [ + { + "file" : "${current_json_dir}/cutlass/build-export.patch", + "issue" : "Fix build directory export", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/thirdparty/get_cutlass.cmake b/cpp/cmake/thirdparty/get_cutlass.cmake index 61065318b..71bd2d26c 100644 --- a/cpp/cmake/thirdparty/get_cutlass.cmake +++ b/cpp/cmake/thirdparty/get_cutlass.cmake @@ -13,10 +13,11 @@ # ============================================================================= function(find_and_configure_cutlass) - set(oneValueArgs VERSION REPOSITORY PINNED_TAG) + set(options) + set(oneValueArgs) + set(multiValueArgs) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - # if(RAFT_ENABLE_DIST_DEPENDENCIES OR RAFT_COMPILE_LIBRARIES) set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library" @@ -34,13 +35,22 @@ function(find_and_configure_cutlass) set(CUDART_LIBRARY "${CUDA_cudart_static_LIBRARY}" CACHE FILEPATH "fixing cutlass cmake code" FORCE) endif() + include("${rapids-cmake-dir}/cpm/package_override.cmake") + rapids_cpm_package_override("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches/cutlass_override.json") + + include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") + rapids_cpm_package_details(cutlass version 
repository tag shallow exclude) + + include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake") + rapids_cpm_generate_patch_command(cutlass ${version} patch_command) + rapids_cpm_find( - NvidiaCutlass ${PKG_VERSION} + NvidiaCutlass ${version} GLOBAL_TARGETS nvidia::cutlass::cutlass CPM_ARGS - GIT_REPOSITORY ${PKG_REPOSITORY} - GIT_TAG ${PKG_PINNED_TAG} - GIT_SHALLOW TRUE + GIT_REPOSITORY ${repository} + GIT_TAG ${tag} + GIT_SHALLOW ${shallow} ${patch_command} OPTIONS "CUDAToolkit_ROOT ${CUDAToolkit_LIBRARY_DIR}" ) @@ -56,7 +66,6 @@ function(find_and_configure_cutlass) NAMESPACE nvidia::cutlass:: ) endif() - # endif() # We generate the cutlass-config files when we built cutlass locally, so always do # `find_dependency` @@ -79,14 +88,4 @@ function(find_and_configure_cutlass) ) endfunction() -if(NOT RAFT_CUTLASS_GIT_TAG) - set(RAFT_CUTLASS_GIT_TAG v2.10.0) -endif() - -if(NOT RAFT_CUTLASS_GIT_REPOSITORY) - set(RAFT_CUTLASS_GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git) -endif() - -find_and_configure_cutlass( - VERSION 2.10.0 REPOSITORY ${RAFT_CUTLASS_GIT_REPOSITORY} PINNED_TAG ${RAFT_CUTLASS_GIT_TAG} -) +find_and_configure_cutlass() From 007c3d2c9efddec6dc46549a5aa2e9f48d4d1612 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 5 Dec 2024 15:48:41 -0500 Subject: [PATCH 45/47] Moving spectral embedding and kernel gramm APIs to cuVS (#463) Partially addresses #455 Authors: - Corey J. 
Nolet (https://github.com/cjnolet) - Ben Frederickson (https://github.com/benfred) Approvers: - Ben Frederickson (https://github.com/benfred) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cuvs/pull/463 --- build.sh | 12 +- cpp/CMakeLists.txt | 4 + cpp/include/cuvs/cluster/agglomerative.hpp | 1 + cpp/include/cuvs/distance/grammian.hpp | 665 +++++++++++++++ cpp/include/cuvs/embed/spectral.hpp | 40 + .../distance/detail/kernels/gram_matrix.cu | 481 +++++++++++ .../distance/detail/kernels/gram_matrix.cuh | 488 ----------- .../distance/detail/kernels/kernel_factory.cu | 61 ++ .../detail/kernels/kernel_factory.cuh | 65 -- .../detail/kernels/kernel_matrices.cu | 726 ++++++++++++++++ .../detail/kernels/kernel_matrices.cuh | 777 ------------------ .../distance/detail/kernels/rbf_fin_op.cuh | 4 +- .../detail/pairwise_matrix/dispatch-ext.cuh | 4 +- .../detail/pairwise_matrix/dispatch_rbf.cu | 6 +- cpp/src/distance/distance-ext.cuh | 4 +- cpp/src/distance/distance.cu | 4 +- cpp/src/embed/spectral.cu | 53 ++ cpp/src/sparse/cluster/cluster_solvers.cuh | 100 +++ cpp/src/sparse/cluster/detail/spectral.cuh | 111 +++ .../spectral/modularity_maximization.hpp | 176 ++++ .../cluster/detail/spectral/partition.hpp | 188 +++++ .../cluster/detail/spectral/spectral_util.cuh | 181 ++++ cpp/src/sparse/cluster/eigen_solvers.cuh | 107 +++ .../cluster/modularity_maximization.cuh | 86 ++ cpp/src/sparse/cluster/partition.cuh | 95 +++ cpp/test/CMakeLists.txt | 6 + cpp/test/distance/gram.cu | 174 ++++ cpp/test/distance/gram_base.cuh | 91 ++ cpp/test/sparse/cluster/cluster_solvers.cu | 105 +++ cpp/test/sparse/cluster/eigen_solvers.cu | 119 +++ cpp/test/sparse/cluster/spectral.cu | 109 +++ cpp/test/sparse/cluster/spectral_matrix.cu | 84 ++ cpp/test/sparse/gram.cu | 330 ++++++++ 33 files changed, 4105 insertions(+), 1352 deletions(-) create mode 100644 cpp/include/cuvs/distance/grammian.hpp create mode 100644 
cpp/include/cuvs/embed/spectral.hpp create mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cu delete mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cuh create mode 100644 cpp/src/embed/spectral.cu create mode 100644 cpp/src/sparse/cluster/cluster_solvers.cuh create mode 100644 cpp/src/sparse/cluster/detail/spectral.cuh create mode 100644 cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp create mode 100644 cpp/src/sparse/cluster/detail/spectral/partition.hpp create mode 100644 cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh create mode 100644 cpp/src/sparse/cluster/eigen_solvers.cuh create mode 100644 cpp/src/sparse/cluster/modularity_maximization.cuh create mode 100644 cpp/src/sparse/cluster/partition.cuh create mode 100644 cpp/test/distance/gram.cu create mode 100644 cpp/test/distance/gram_base.cuh create mode 100644 cpp/test/sparse/cluster/cluster_solvers.cu create mode 100644 cpp/test/sparse/cluster/eigen_solvers.cu create mode 100644 cpp/test/sparse/cluster/spectral.cu create mode 100644 cpp/test/sparse/cluster/spectral_matrix.cu create mode 100644 cpp/test/sparse/gram.cu diff --git a/build.sh b/build.sh index c08c2900e..bd5fa649b 100755 --- a/build.sh +++ b/build.sh @@ -76,8 +76,8 @@ BUILD_REPORT_METRICS="" BUILD_REPORT_INCL_CACHE_STATS=OFF BUILD_SHARED_LIBS=ON -TEST_TARGETS="NEIGHBORS_ANN_CAGRA_TEST" -ANN_BENCH_TARGETS="CUVS_ANN_BENCH_ALL" +TEST_TARGETS="" +ANN_BENCH_TARGETS="" CACHE_ARGS="" NVTX=ON @@ -273,14 +273,6 @@ fi if hasArg tests || (( ${NUMARGS} == 0 )); then BUILD_TESTS=ON CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}" - - # Force compile library when needed test targets are specified - if [[ $CMAKE_TARGET == *"CAGRA_C_TEST"* || \ 
- $CMAKE_TARGET == *"INTEROP_TEST"* || \ - $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* ]]; then - echo "-- Enabling compiled lib for gtests" - COMPILE_LIBRARY=ON - fi fi if hasArg bench-ann || (( ${NUMARGS} == 0 )); then diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 199bb232d..95fb7e63b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -324,6 +324,9 @@ if(BUILD_SHARED_LIBS) src/cluster/kmeans_transform_float.cu src/cluster/single_linkage_float.cu src/core/bitset.cu + src/distance/detail/kernels/gram_matrix.cu + src/distance/detail/kernels/kernel_factory.cu + src/distance/detail/kernels/kernel_matrices.cu src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu @@ -370,6 +373,7 @@ if(BUILD_SHARED_LIBS) src/distance/distance.cu src/distance/pairwise_distance.cu src/distance/sparse_distance.cu + src/embed/spectral.cu src/neighbors/brute_force.cu src/neighbors/brute_force_serialize.cu src/neighbors/cagra_build_float.cu diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp index e1da04085..8f7e8675a 100644 --- a/cpp/include/cuvs/cluster/agglomerative.hpp +++ b/cpp/include/cuvs/cluster/agglomerative.hpp @@ -18,6 +18,7 @@ #include #include + #include #include diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp new file mode 100644 index 000000000..0c904d493 --- /dev/null +++ b/cpp/include/cuvs/distance/grammian.hpp @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels { + +template +using dense_input_matrix_view_t = raft::device_matrix_view; +template +using dense_output_matrix_view_t = raft::device_matrix_view; +template +using csr_input_matrix_view_t = raft::device_csr_matrix_view; + +/** + * Base class for general Gram matrices + * A Gram matrix is the Hermitian matrix of inner probucts G_ik = + * Here, the inner product is evaluated for all elements from vectors sets X1, + * and X2. + * + * To be more precise, on exit the output buffer will store: + * - if is_row_major == true: out[j+k*n1] = , + * - if is_row_major == false: out[j*n2 + k] = , + * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector + * from the x2 set. + */ +template +class GramMatrixBase { + protected: + cublasHandle_t cublas_handle; + bool legacy_interface; + + public: + GramMatrixBase() : legacy_interface(false){}; + [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) + : cublas_handle(cublas_handle), legacy_interface(true){}; + + virtual ~GramMatrixBase(){}; + + /** Convenience function to evaluate the Gram matrix for two vector sets. 
+ * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr); + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr); + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
+ */ + void operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1 = nullptr, + math_t* norm_x2 = nullptr); + + // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + virtual void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. 
+ */ + virtual void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] virtual void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); + + /** Convenience function to evaluate the Gram matrix for two vector sets. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + [[deprecated]] void operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1 = 0, + int ld2 = 0, + int ld_out = 0); + + protected: + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + [[deprecated]] void linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); + + protected: + bool get_is_row_major(dense_output_matrix_view_t matrix); + bool get_is_row_major(dense_input_matrix_view_t matrix); + bool get_is_col_major(dense_output_matrix_view_t matrix); + bool get_is_col_major(dense_input_matrix_view_t matrix); + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out); + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
+ * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out); + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ + void linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out); +}; + +template +class KernelFactory { + public: + static GramMatrixBase* create(KernelParams params); + [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle); +}; + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +class PolynomialKernel : public GramMatrixBase { + exp_t exponent; + math_t gain; + math_t offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream); + + public: + /** + * Constructs a polynomial kernel object. 
+ * It evaluates the kernel matrix using the following formula: + * K_ij = (gain* + offset)^exponent + * + * @tparam math_t floating point type + * @tparam exp_t type of exponent + * @param exponent + * @param gain + * @param offset + */ + PolynomialKernel(exp_t exponent, math_t gain, math_t offset) + : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset){}; + + [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset){}; + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. 
+ */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; + +/** + * Create a kernel matrix using tanh kernel function. 
+ */ +template +class TanhKernel : public GramMatrixBase { + math_t gain, offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream); + + public: + /** + * Constructs a tanh kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = tanh(gain* + offset) + * + * @tparam math_t floating point type + * @param gain + * @param offset + */ + TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} + + [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain), offset(offset){}; + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. 
+ */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; + +/** + * Create a kernel matrix using RBF kernel function. 
+ */ +template +class RBFKernel : public GramMatrixBase { + math_t gain; + + void applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream); + + public: + /** + * Constructs a RBF kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = exp(-gain*|x1_i- x2_k|^2) + * + * @tparam math_t floating point type + * @param gain + */ + RBFKernel(math_t gain) : GramMatrixBase(), gain(gain){}; + + [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain){}; + + void matrixRowNormL2(raft::resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target); + + void matrixRowNormL2(raft::resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. 
+ * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; +}; // end namespace cuvs::distance::kernels diff --git a/cpp/include/cuvs/embed/spectral.hpp b/cpp/include/cuvs/embed/spectral.hpp new file mode 100644 index 000000000..1a8fed96a --- /dev/null +++ b/cpp/include/cuvs/embed/spectral.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace cuvs::embed::spectral { + +/** + * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings + * (lowest n_components eigenvectors), using Lanczos min cut algorithm. 
Please note that this + * algorithm does not compute a full laplacian eigenmap, as the laplacian eigenmap would embed each + * connected component. Laplacian eigenmaps can be built from this algorithm by running it on the + * vectors for each connected component. + + * @param[in] handle + * @param[in] knn_graph KNN Graph + * @param[in] n_components the number of components to project into + * @param[out] out output array for embedding (size n*n_comonents) + * @param[in] seed + */ +void fit(const raft::resources& handle, + raft::device_coo_matrix_view knn_graph, + int n_components, + raft::device_matrix_view out, + unsigned long long seed = 0L); +}; // namespace cuvs::embed::spectral diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cu b/cpp/src/distance/detail/kernels/gram_matrix.cu new file mode 100644 index 000000000..0e4f3e639 --- /dev/null +++ b/cpp/src/distance/detail/kernels/gram_matrix.cu @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../distance.cuh" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels { + +/** + * Base class for general Gram matrices + * A Gram matrix is the Hermitian matrix of inner probucts G_ik = + * Here, the inner product is evaluated for all elements from vectors sets X1, + * and X2. 
+ * + * To be more precise, on exit the output buffer will store: + * - if is_row_major == true: out[j*n2 + k] = <x1_j, x2_k>, + * - if is_row_major == false: out[j + k*n1] = <x1_j, x2_k>, + * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector + * from the x2 set. + */ + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +// unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. 
+ */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void GramMatrixBase::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2, + ld2, + x1, + ld1, + &beta, + out, + ld_out, + stream)); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1, + ld1, + x2, + ld2, + &beta, + out, + ld_out, + stream)); + } +} + +template +bool GramMatrixBase::get_is_row_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} +template +bool GramMatrixBase::get_is_row_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. 
+ * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x1, x2 and out do not match"); + + // check dimensions + int n1 = out.extent(0); + int n2 = out.extent(1); + int n_cols = x1.extent(1); + ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); + + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + true, + false, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } else { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + false, + true, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match"); + + // check dimensions + auto x1_structure = x1.structure_view(); + ASSERT(x1_structure.get_n_rows() == out.extent(0), + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == out.extent(1), + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == x1_structure.get_n_cols(), + "GramMatrix input matrix 
dimensions for x1 and x2 do not match"); + + math_t alpha = 1.0; + math_t beta = 0.0; + + raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check layout consistency (w.r.t. strides a matrix might be both row & col major) + bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); + bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); + + ASSERT(is_row_major_nopad || is_col_major_nopad, + "Sparse linear Kernel distance does not support ld_out parameter"); + + // switch a,b based on is_row_major + if (is_col_major_nopad) { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(1), out.extent(0)); + + cuvs::distance::pairwise_distance( + handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } else { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(0), out.extent(1)); + cuvs::distance::pairwise_distance( + handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } +} + +template class GramMatrixBase; +template class GramMatrixBase; + +}; // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cuh b/cpp/src/distance/detail/kernels/gram_matrix.cuh deleted file mode 100644 index d435fb4d1..000000000 --- 
a/cpp/src/distance/detail/kernels/gram_matrix.cuh +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "../../distance.cuh" -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -using dense_input_matrix_view_t = raft::device_matrix_view; -template -using dense_output_matrix_view_t = raft::device_matrix_view; -template -using csr_input_matrix_view_t = raft::device_csr_matrix_view; - -/** - * Base class for general Gram matrices - * A Gram matrix is the Hermitian matrix of inner probucts G_ik = - * Here, the inner product is evaluated for all elements from vectors sets X1, - * and X2. - * - * To be more precise, on exit the output buffer will store: - * - if is_row_major == true: out[j+k*n1] = , - * - if is_row_major == false: out[j*n2 + k] = , - * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector - * from the x2 set. - */ -template -class GramMatrixBase { - protected: - cublasHandle_t cublas_handle; - bool legacy_interface; - - public: - GramMatrixBase() : legacy_interface(false){}; - [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) - : cublas_handle(cublas_handle), legacy_interface(true){}; - - virtual ~GramMatrixBase(){}; - - /** Convenience function to evaluate the Gram matrix for two vector sets. 
- * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. 
- * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. 
- * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - protected: - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void linear(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n2, - n1, - n_cols, - &alpha, - x2, - ld2, - x1, - ld1, - &beta, - out, - ld_out, - stream)); - } else { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n1, - n2, - n_cols, - &alpha, - x1, - ld1, - x2, - ld2, - &beta, - out, - ld_out, - stream)); - } - } - - protected: - bool get_is_row_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_row_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_col_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - bool get_is_col_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. 
- * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x1, x2 and out do not match"); - - // check dimensions - int n1 = out.extent(0); - int n2 = out.extent(1); - int n_cols = x1.extent(1); - ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - // extract major stride - int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); - int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); - int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - true, - false, - n2, - n1, - n_cols, - &alpha, - x2.data_handle(), - ld2, - x1.data_handle(), - ld1, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } else { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - false, - true, - n1, - n2, - n_cols, - &alpha, - x1.data_handle(), - ld1, - x2.data_handle(), - ld2, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x2 and out do not match"); - - // check dimensions - auto x1_structure = x1.structure_view(); - ASSERT(x1_structure.get_n_rows() == out.extent(0), - "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == out.extent(1), - "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == x1_structure.get_n_cols(), - "GramMatrix input matrix dimensions for x1 and x2 do not 
match"); - - math_t alpha = 1.0; - math_t beta = 0.0; - - raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check layout consistency (w.r.t. strides a matrix might be both row & col major) - bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); - bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); - - ASSERT(is_row_major_nopad || is_col_major_nopad, - "Sparse linear Kernel distance does not support ld_out parameter"); - - // switch a,b based on is_row_major - if (is_col_major_nopad) { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(1), out.extent(0)); - raft::sparse::distance::pairwise_distance( - handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } else { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(0), out.extent(1)); - raft::sparse::distance::pairwise_distance( - handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cu b/cpp/src/distance/detail/kernels/kernel_factory.cu new file mode 100644 index 000000000..25f9e9b84 --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_factory.cu @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cuvs::distance::kernels { + +template +GramMatrixBase* KernelFactory::create(KernelParams params) +{ + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(); break; + case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; + case TANH: res = new TanhKernel(gamma, coef0); break; + case RBF: res = new RBFKernel(gamma); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; +} + +template +[[deprecated]] GramMatrixBase* KernelFactory::create(KernelParams params, + cublasHandle_t handle) +{ + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(handle); break; + case POLYNOMIAL: + res = new PolynomialKernel(params.degree, gamma, coef0, handle); + break; + case TANH: res = new TanhKernel(gamma, coef0, handle); break; + case RBF: res = new RBFKernel(gamma, handle); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; +} + +template class KernelFactory; +template class KernelFactory; + +}; // end namespace cuvs::distance::kernels diff --git 
a/cpp/src/distance/detail/kernels/kernel_factory.cuh b/cpp/src/distance/detail/kernels/kernel_factory.cuh deleted file mode 100644 index 5c50a95a3..000000000 --- a/cpp/src/distance/detail/kernels/kernel_factory.cuh +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" -#include "kernel_matrices.cuh" - -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -class KernelFactory { - public: - static GramMatrixBase* create(KernelParams params) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(); break; - case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; - case TANH: res = new TanhKernel(gamma, coef0); break; - case RBF: res = new RBFKernel(gamma); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } - - [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(handle); break; - case POLYNOMIAL: - res = 
new PolynomialKernel(params.degree, gamma, coef0, handle); - break; - case TANH: res = new TanhKernel(gamma, coef0, handle); break; - case RBF: res = new RBFKernel(gamma, handle); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cu b/cpp/src/distance/detail/kernels/kernel_matrices.cu new file mode 100644 index 000000000..526ca106f --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_matrices.cu @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../../distance/distance.cuh" +#include + +#include "rbf_fin_op.cuh" +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels { + +/** Epiloge function for polynomial kernel without padding. 
+ * Calculates output = (gain*in + offset)^exponent + * @param inout device vector in column major format, size [len] + * @param len array length + * @param exponent + * @param gain + * @param offset + */ +template +RAFT_KERNEL polynomial_kernel_nopad( + math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = pow(gain * inout[tid] + offset, exponent); + } +} + +/** Epiloge function for polynomial kernel with padding. + * Calculates output = (gain*input + offset)^exponent + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param exponent + * @param gain + * @param offset + */ +template +RAFT_KERNEL polynomial_kernel( + math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); + } +} + +/** Epiloge function for tanh kernel without padding. + * Calculates output = tanh(gain*input + offset) + * @param inout device vector, size [len] + * @param len length of the input vector + * @param gain + * @param offset + */ +template +RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = tanh(gain * inout[tid] + offset); + } +} + +/** Epiloge function for tanh kernel without padding. 
+ * Calculates output = tanh(gain*input + offset) + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param gain + * @param offset + */ +template +RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); + } +} + +/** Epiloge function for rbf kernel using expansion. + * + * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); + * + * Intended usage + * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk + * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X + * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y + * + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param norm_x l2-norm of X's rows + * @param norm_y l2-norm of Y's rows + * @param gain + */ +template +RAFT_KERNEL rbf_kernel_expanded( + math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) { + math_t norm_y_val = norm_y[tidy]; + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = + exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); + } + } +} + +std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) +{ + dim3 block_shape = dim3(32, 4); + const 
int num_blocks_x = raft::ceildiv(n1, 32); + const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); + dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); + return std::make_tuple(grid_shape, block_shape); +} + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +void PolynomialKernel::applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) +{ + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, exponent, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + polynomial_kernel<<>>( + inout, ld, n1, n2, exponent, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. 
+ */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void PolynomialKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); +} + +/** + * Create a kernel matrix using tanh kernel function. + */ +template +void TanhKernel::applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) +{ + const int n_minor = is_row_major ? 
cols : rows; + if (ld == n_minor) { + tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. 
+ */ +template +void TanhKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void TanhKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); +} + +/** + * Create a kernel matrix using RBF kernel function. + */ +template +void RBFKernel::applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream) +{ + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; + math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + rbf_kernel_expanded<<>>( + inout, ld, n1, n2, norm_n1, norm_n2, gain); +} + +template +void RBFKernel::matrixRowNormL2(raft::resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(matrix); + int minor = is_row_major ? 
matrix.extent(1) : matrix.extent(0); + int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); + ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); + raft::linalg::rowNorm(target, + matrix.data_handle(), + matrix.extent(1), + matrix.extent(0), + raft::linalg::NormType::L2Norm, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +template +void RBFKernel::matrixRowNormL2(raft::resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target) +{ + auto matrix_structure = matrix.structure_view(); + raft::sparse::linalg::rowNormCsr(handle, + matrix_structure.get_indptr().data(), + matrix.get_elements().data(), + matrix_structure.get_nnz(), + matrix_structure.get_n_rows(), + target, + raft::linalg::NormType::L2Norm); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
+ */ +template +void RBFKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.extent(0), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
+ */ +template +void RBFKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
+ */ +template +void RBFKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. 
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void RBFKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + int minor1 = is_row_major ? n_cols : n1; + int minor2 = is_row_major ? n_cols : n2; + int minor_out = is_row_major ? 
n2 : n1; + ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + + math_t gain = this->gain; + using index_t = int64_t; + + rbf_fin_op fin_op{gain}; + + raft::resources handle; + raft::resource::set_cuda_stream(handle, stream); + + cuvs::distance::distance(handle, + const_cast(x1), + const_cast(x2), + out, + n1, + n2, + n_cols, + NULL, + 0, + fin_op, + is_row_major); +} + +template class PolynomialKernel; +template class PolynomialKernel; +template class TanhKernel; +template class TanhKernel; +template class RBFKernel; +template class RBFKernel; + +}; // end namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cuh b/cpp/src/distance/detail/kernels/kernel_matrices.cuh deleted file mode 100644 index bff5bda92..000000000 --- a/cpp/src/distance/detail/kernels/kernel_matrices.cuh +++ /dev/null @@ -1,777 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" - -#include "../detail/kernels/rbf_fin_op.cuh" -#include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -/** Epiloge function for polynomial kernel without padding. 
- * Calculates output = (gain*in + offset)^exponent - * @param inout device vector in column major format, size [len] - * @param len array length - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel_nopad( - math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = pow(gain * inout[tid] + offset, exponent); - } -} - -/** Epiloge function for polynomial kernel with padding. - * Calculates output = (gain*input + offset)^exponent - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel( - math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector, size [len] - * @param len length of the input vector - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = tanh(gain * inout[tid] + offset); - } -} - -/** Epiloge function for tanh kernel without padding. 
- * Calculates output = tanh(gain*input + offset) - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); - } -} - -/** Epiloge function for rbf kernel using expansion. - * - * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); - * - * Intended usage - * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk - * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X - * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y - * - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param norm_x l2-norm of X's rows - * @param norm_y l2-norm of Y's rows - * @param gain - */ -template -RAFT_KERNEL rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) { - math_t norm_y_val = norm_y[tidy]; - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = - exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); - } - } -} - -namespace { -std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) -{ - dim3 block_shape = dim3(32, 
4); - const int num_blocks_x = raft::ceildiv(n1, 32); - const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); - dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); - return std::make_tuple(grid_shape, block_shape); -} -} // namespace - -/** - * Create a kernel matrix using polynomial kernel function. - */ -template -class PolynomialKernel : public GramMatrixBase { - exp_t exponent; - math_t gain; - math_t offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, exponent, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - polynomial_kernel<<>>( - inout, ld, n1, n2, exponent, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a polynomial kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = (gain* + offset)^exponent - * - * @tparam math_t floating point type - * @tparam exp_t type of exponent - * @param exponent - * @param gain - * @param offset - */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset) - : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) - { - } - - [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. 
- * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. 
- * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using tanh kernel function. - */ -template -class TanhKernel : public GramMatrixBase { - math_t gain, offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a tanh kernel object. 
- * It evaluates the kernel matrix using the following formula: - * K_ij = tanh(gain* + offset) - * - * @tparam math_t floating point type - * @param gain - * @param offset - */ - TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} - - [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. 
- */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using RBF kernel function. - */ -template -class RBFKernel : public GramMatrixBase { - math_t gain; - - void applyKernel(math_t* inout, - int ld, - int rows, - int cols, - math_t* norm_x1, - math_t* norm_x2, - bool is_row_major, - cudaStream_t stream) - { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - rbf_kernel_expanded<<>>( - inout, ld, n1, n2, norm_n1, norm_n2, gain); - } - - public: - /** - * Constructs a RBF kernel object. 
- * It evaluates the kernel matrix using the following formula: - * K_ij = exp(-gain*|x1_i- x2_k|^2) - * - * @tparam math_t floating point type - * @param gain - */ - RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} - - [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain) - { - } - - void matrixRowNormL2(raft::resources const& handle, - dense_input_matrix_view_t matrix, - math_t* target) - { - bool is_row_major = GramMatrixBase::get_is_row_major(matrix); - int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); - int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); - ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); - raft::linalg::rowNorm(target, - matrix.data_handle(), - matrix.extent(1), - matrix.extent(0), - raft::linalg::NormType::L2Norm, - is_row_major, - resource::get_cuda_stream(handle)); - } - - void matrixRowNormL2(raft::resources const& handle, - csr_input_matrix_view_t matrix, - math_t* target) - { - auto matrix_structure = matrix.structure_view(); - raft::sparse::linalg::rowNormCsr(handle, - matrix_structure.get_indptr().data(), - matrix.get_elements().data(), - matrix_structure.get_nnz(), - matrix_structure.get_n_rows(), - target, - raft::linalg::NormType::L2Norm); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
- */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.extent(0), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
- */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. 
- */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; - int minor_out = is_row_major ? 
n2 : n1; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - - math_t gain = this->gain; - using index_t = int64_t; - - rbf_fin_op fin_op{gain}; - - raft::resources handle; - resource::set_cuda_stream(handle, stream); - - cuvs::distance::distance(handle, - const_cast(x1), - const_cast(x2), - out, - n1, - n2, - n_cols, - NULL, - 0, - fin_op, - is_row_major); - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh index 73588baea..53022368d 100644 --- a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh +++ b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh @@ -28,7 +28,7 @@ #include // raft::exp #include // HD -namespace cuvs::distance::kernels::detail { +namespace cuvs::distance::kernels { /** @brief: Final op for Gram matrix with RBF kernel. 
* @@ -48,4 +48,4 @@ struct rbf_fin_op { } }; // struct rbf_fin_op -} // namespace cuvs::distance::kernels::detail +} // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh index edfd7cf5f..49497ab3a 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh @@ -118,9 +118,7 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::russel_rao_distance_op, int); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo( - cuvs::distance::detail::ops::l2_unexp_distance_op, - int64_t, - cuvs::distance::kernels::detail::rbf_fin_op); + cuvs::distance::detail::ops::l2_unexp_distance_op, int64_t, cuvs::distance::kernels::rbf_fin_op); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::l2_exp_distance_op, int64_t); diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu index 3c8f25109..a2e12b6df 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu @@ -50,7 +50,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -58,7 +58,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -66,7 +66,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( half, float, float, - 
cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh index e623f76ba..a692a62a3 100644 --- a/cpp/src/distance/distance-ext.cuh +++ b/cpp/src/distance/distance-ext.cuh @@ -273,13 +273,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu index c1d39f360..47e72460f 100644 --- a/cpp/src/distance/distance.cu +++ b/cpp/src/distance/distance.cu @@ -139,13 +139,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/embed/spectral.cu b/cpp/src/embed/spectral.cu new file mode 100644 index 000000000..c3d4e3fc7 --- /dev/null +++ b/cpp/src/embed/spectral.cu @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../sparse/cluster/detail/spectral.cuh" +#include +#include +#include + +namespace cuvs::embed::spectral { + +/** + * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings + * (lowest n_components eigenvectors), using Lanczos min cut algorithm. + * @param rows source vertices of knn graph (size nnz) + * @param cols destination vertices of knn graph (size nnz) + * @param vals edge weights connecting vertices of knn graph (size nnz) + * @param nnz size of rows/cols/vals + * @param n number of samples in X + * @param n_neighbors the number of neighbors to query for knn graph construction + * @param n_components the number of components to project the X into + * @param out output array for embedding (size n*n_components) + */ +void fit(const raft::resources& handle, + raft::device_coo_matrix_view knn_graph, + int n_components, + raft::device_matrix_view out, + unsigned long long seed) +{ + cuvs::sparse::cluster::spectral::detail::fit_embedding( + handle, + knn_graph.structure_view().get_rows().data(), + knn_graph.structure_view().get_cols().data(), + knn_graph.get_elements().data(), + knn_graph.structure_view().get_nnz(), + knn_graph.structure_view().get_n_rows(), + n_components, + out.data_handle(), + seed); +} +}; // namespace cuvs::embed::spectral diff --git a/cpp/src/sparse/cluster/cluster_solvers.cuh b/cpp/src/sparse/cluster/cluster_solvers.cuh new file mode 100644 index 000000000..7b4cf6ab3 --- /dev/null +++ b/cpp/src/sparse/cluster/cluster_solvers.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c)
2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CLUSTER_SOLVERS_H +#define __CLUSTER_SOLVERS_H + +#pragma once + +#include +#include +#include + +#include // for std::pair + +namespace cuvs { +namespace spectral { + +using namespace raft::spectral::matrix; + +// aggregate of control params for Eigen Solver: +// +template +struct cluster_solver_config_t { + size_type_t n_clusters; + size_type_t maxIter; + + value_type_t tol; + + unsigned long long seed{123456}; +}; + +template +struct kmeans_solver_t { + explicit kmeans_solver_t( + cluster_solver_config_t const& config) + : config_(config) + { + } + + std::pair solve(raft::resources const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { + RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); + RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); + value_type_t residual{}; + index_type_t iters{}; + cuvs::cluster::kmeans::params km_params; + km_params.n_clusters = config_.n_clusters; + km_params.tol = config_.tol; + km_params.max_iter = config_.maxIter; + km_params.rng_state.seed = config_.seed; + + auto X = raft::make_device_matrix_view(obs, n_obs_vecs, dim); + auto labels = raft::make_device_vector_view(codes, n_obs_vecs); + auto centroids = + raft::make_device_matrix(handle, config_.n_clusters, dim); + auto weight = raft::make_device_vector(handle, n_obs_vecs); + 
thrust::fill(raft::resource::get_thrust_policy(handle), + weight.data_handle(), + weight.data_handle() + n_obs_vecs, + 1); + + auto sw = std::make_optional((raft::device_vector_view)weight.view()); + cuvs::cluster::kmeans::fit_predict(handle, + km_params, + X, + sw, + centroids.view(), + labels, + raft::make_host_scalar_view(&residual), + raft::make_host_scalar_view(&iters)); + return std::make_pair(residual, iters); + } + + auto const& get_config(void) const { return config_; } + + private: + cluster_solver_config_t config_; +}; + +} // namespace spectral +} // namespace cuvs + +#endif \ No newline at end of file diff --git a/cpp/src/sparse/cluster/detail/spectral.cuh b/cpp/src/sparse/cluster/detail/spectral.cuh new file mode 100644 index 000000000..571d92bf5 --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral.cuh @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../cluster_solvers.cuh" +#include "../eigen_solvers.cuh" +#include "../partition.cuh" +#include +#include +#include +#include +#include + +#include + +namespace cuvs::sparse::cluster::spectral::detail { + +template +void fit_embedding(raft::resources const& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ + auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_uvector src_offsets(n + 1, stream); + rmm::device_uvector dst_cols(nnz, stream); + rmm::device_uvector dst_vals(nnz, stream); + raft::sparse::convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); + + rmm::device_uvector eigVals(n_components + 1, stream); + rmm::device_uvector eigVecs(n * (n_components + 1), stream); + rmm::device_uvector labels(n, stream); + + raft::resource::sync_stream(handle, stream); + + /** + * Raft spectral clustering + */ + using index_type = int; + using value_type = T; + + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); + + raft::spectral::matrix::sparse_matrix_t const r_csr_m{ + handle, ro, ci, vs, n, nnz}; + + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using + + cuvs::spectral::eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol}; + + cfg.seed = seed; + + cuvs::spectral::lanczos_solver_t eig_solver{cfg}; + + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): + // + struct no_op_cluster_solver_t { + using index_type_t = index_type; + using size_type_t = index_type; + using value_type_t = value_type; + + std::pair solve(raft::resources const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + 
index_type_t* __restrict__ codes) const + { + return std::make_pair(0, 0); + } + }; + + cuvs::spectral::partition(handle, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); + + raft::copy(out, eigVecs.data() + n, n * n_components, stream); + + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // namespace cuvs::sparse::cluster::spectral::detail \ No newline at end of file diff --git a/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp new file mode 100644 index 000000000..a42ad2dc1 --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. 
This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return Tuple of (eigensolver iterations, cluster solver residual, cluster solver iterations). + */ +template +std::tuple modularity_maximization( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B,
eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + auto dataset_view = raft::make_device_matrix_view(eigVecs, nEigVecs, n); + raft::linalg::row_normalize( + handle, raft::make_const_mdspan(dataset_view), dataset_view, raft::linalg::L2Norm); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. 
+ * @param modularity On exit, modularity + */ +template +void analyzeModularity(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto stream = raft::resource::get_cuda_stream(handle); + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Bx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(); +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/partition.hpp b/cpp/src/sparse/cluster/detail/spectral/partition.hpp new file mode 100644 index 000000000..77e83c17d --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/partition.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include + +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. 
+ * @return statistics: number of eigensolver iterations, cluster solver residual, number of cluster solver iterations. + */ +template +std::tuple partition( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + // cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute smallest eigenvalues and eigenvectors + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + 
* Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param csr_m Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @note No value is returned; results are written to edgeCut and cost. + */ +template +void analyzePartition(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + weight_t partEdgesCut, clustersize; + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Lx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh new file mode 100644 index 000000000..1d2e58e2a --- /dev/null +++ 
b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs { +namespace spectral { + +template +void transform_eigen_matrix(raft::resources const& handle, + edge_t n, + vertex_t nEigVecs, + weight_t* eigVecs) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + RAFT_CHECK_CUDA(stream); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + 
RAFT_CHECK_CUDA(stream); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + RAFT_CHECK_CUDA(stream); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + raft::spectral::matrix::vector_t work(handle, nEigVecs * n); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); + } +} + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const index_type_t i; + + public: + equal_to_i_op(index_type_t _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? 
(value_type_t)1.0 : (value_type_t)0.0; + } +}; +} // namespace + +// Construct indicator vector for ith partition +// +template +bool construct_indicator(raft::resources const& handle, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, + vertex_t const* __restrict__ clusters, + raft::spectral::matrix::vector_t& part_i, + raft::spectral::matrix::vector_t& Bx, + raft::spectral::matrix::laplacian_matrix_t const& B) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); + RAFT_CHECK_CUDA(stream); + + // Compute size of ith partition + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { return false; } + + // Compute part stats + B.mv(1, part_i.raw(), 0, Bx.raw()); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + + return true; +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/eigen_solvers.cuh b/cpp/src/sparse/cluster/eigen_solvers.cuh new file mode 100644 index 000000000..1b2501d68 --- /dev/null +++ b/cpp/src/sparse/cluster/eigen_solvers.cuh @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __EIGEN_SOLVERS_H +#define __EIGEN_SOLVERS_H + +#pragma once + +#include +#include + +namespace cuvs { +namespace spectral { + +// aggregate of control params for Eigen Solver: +// +template +struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize{false}; + unsigned long long seed{ + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations +}; + +template +struct lanczos_solver_t { + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } + + index_type_t solve_smallest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + index_type_t solve_largest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* 
__restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + auto const& get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; + +} // namespace spectral +} // namespace cuvs + +#endif diff --git a/cpp/src/sparse/cluster/modularity_maximization.cuh b/cpp/src/sparse/cluster/modularity_maximization.cuh new file mode 100644 index 000000000..71cba6927 --- /dev/null +++ b/cpp/src/sparse/cluster/modularity_maximization.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MODULARITY_MAXIMIZATION_H +#define __MODULARITY_MAXIMIZATION_H + +#pragma once + +#include "detail/spectral/modularity_maximization.hpp" + +#include + +namespace cuvs { +namespace spectral { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. 
This + * partition is found by clustering the eigenvectors associated with the largest eigenvalues + * of the graph's modularity matrix (spectral modularity maximization). + * + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param eigen_solver Eigensolver implementation + * @param cluster_solver Cluster solver implementation + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param eigVals Output eigenvalue array pointer on device + * @param eigVecs Output eigenvector array pointer on device + * @return statistics: number of eigensolver iterations, cluster solver residual, number of cluster solver iterations. + */ +template +std::tuple modularity_maximization( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + return cuvs::spectral::detail:: + modularity_maximization( + handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. 
+ * @param modularity On exit, modularity + */ +template +void analyzeModularity(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + cuvs::spectral::detail::analyzeModularity( + handle, csr_m, nClusters, clusters, modularity); +} + +} // namespace spectral +} // namespace cuvs + +#endif \ No newline at end of file diff --git a/cpp/src/sparse/cluster/partition.cuh b/cpp/src/sparse/cluster/partition.cuh new file mode 100644 index 000000000..df78a8a2d --- /dev/null +++ b/cpp/src/sparse/cluster/partition.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PARTITION_H +#define __PARTITION_H + +#pragma once + +#include "detail/spectral/partition.hpp" + +#include + +namespace cuvs { +namespace spectral { + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. 
This + * partition attempts to minimize the cost function: + * Cost = \f$\sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param eigen_solver Eigensolver implementation + * @param cluster_solver Cluster solver implementation + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param eigVals Output eigenvalue array pointer on device + * @param eigVecs Output eigenvector array pointer on device + * @return statistics: number of eigensolver iterations, cluster solver residual, number of cluster solver iterations. + */ +template +std::tuple partition( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + return cuvs::spectral::detail::partition( + handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \f$\sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param handle raft handle for managing expensive resources + * @param csr_m Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. 
+ */ +template +void analyzePartition(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ + cuvs::spectral::detail::analyzePartition( + handle, csr_m, nClusters, clusters, edgeCut, cost); +} + +} // namespace spectral +} // namespace cuvs + +#endif \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0ecac6ec2..9224e88d8 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -218,6 +218,7 @@ if(BUILD_TESTS) distance/dist_l_inf.cu distance/dist_lp_unexp.cu distance/dist_russell_rao.cu + distance/gram.cu distance/masked_nn.cu distance/sparse_distance.cu sparse/neighbors/cross_component_nn.cu @@ -227,6 +228,11 @@ if(BUILD_TESTS) 100 ) + ConfigureTest( + NAME SPARSE_TEST PATH sparse/cluster/cluster_solvers.cu sparse/cluster/eigen_solvers.cu + sparse/cluster/spectral.cu GPUS 1 PERCENT 100 + ) + ConfigureTest( NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100 ) diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu new file mode 100644 index 000000000..89b1525ea --- /dev/null +++ b/cpp/test/distance/gram.cu @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.cuh" +#include "gram_base.cuh" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::distance::kernels { + +struct GramMatrixInputs { + int n1; // feature vectors in matrix 1 + int n2; // feature vectors in matrix 2 + int n_cols; // number of elements in a feature vector + bool is_row_major; + KernelParams kernel; + int ld1; // leading dimension of matrix 1; 0 means derive it from the shape + int ld2; // leading dimension of matrix 2; 0 means derive it from the shape + int ld_out; // leading dimension of the output; 0 means derive it from the shape + // We will generate random input using the dimensions given here. + // The reference output is calculated by a custom kernel. +}; + +std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) +{ + std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; + os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/" + << (p.is_row_major ? "RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_" + << p.ld1 << "x" << p.ld2 << "x" << p.ld_out; + return os; +} + +const std::vector inputs = { + {42, 137, 2, false, {KernelType::LINEAR}}, + {42, 137, 2, true, {KernelType::LINEAR}}, + {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, + {3, 4, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, true, {KernelType::RBF, 0, 0.5}}, + // Distance kernel does not support LD parameter yet. 
+ //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49}, + // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143}, +}; + +template +class GramMatrixTest : public ::testing::TestWithParam { + protected: + GramMatrixTest() + : params(GetParam()), + handle(), + x1(0, raft::resource::get_cuda_stream(handle)), + x2(0, raft::resource::get_cuda_stream(handle)), + gram(0, raft::resource::get_cuda_stream(handle)), + gram_host(0) + { + auto stream = raft::resource::get_cuda_stream(handle); + + if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } + if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } + if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } + // Derive the size of the output from the offset of the last element. + size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; + x1.resize(size, stream); + size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; + x2.resize(size, stream); + size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1; + + gram.resize(size, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream)); + gram_host.resize(gram.size()); + std::fill(gram_host.begin(), gram_host.end(), 0); + + raft::random::RngState rng(42137ULL); + raft::random::uniform(handle, rng, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, rng, x2.data(), x2.size(), math_t(0), math_t(1)); + } + + ~GramMatrixTest() override {} + + void runTest() + { + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel)); + + auto x1_span = + params.is_row_major + ? 
raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); + + (*kernel)(handle, x1_span, x2_span, out_span); + + auto stream = raft::resource::get_cuda_stream(handle); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + handle); + + ASSERT_TRUE(cuvs::devArrMatchHost( + gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream)); + } + + GramMatrixInputs params; + raft::resources handle; + + rmm::device_uvector x1; + rmm::device_uvector x2; + rmm::device_uvector gram; + + std::vector gram_host; +}; + +typedef GramMatrixTest GramMatrixTestFloat; +typedef GramMatrixTest GramMatrixTestDouble; + +TEST_P(GramMatrixTestFloat, Gram) { runTest(); } + +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs)); +}; // namespace cuvs::distance::kernels \ No newline at end of file diff --git a/cpp/test/distance/gram_base.cuh b/cpp/test/distance/gram_base.cuh new file mode 100644 index 000000000..326cdb4f8 --- /dev/null +++ b/cpp/test/distance/gram_base.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace kernels { + +// Get the linear offset of element [i,k] given leading dimension ld (row- or column-major). +HDI int get_offset(int i, int k, int ld, bool is_row_major) +{ + return is_row_major ? i * ld + k : i + k * ld; +} + +// Calculate the reference Gram matrix on the host: copies x1/x2 to host, then fills gram_host. +template +void naiveGramMatrixKernel(int n1, + int n2, + int n_cols, + const rmm::device_uvector& x1, + const rmm::device_uvector& x2, + math_t* gram_host, + int ld1, + int ld2, + int ld_out, + bool is_row_major, + KernelParams kernel, + cudaStream_t stream, + const raft::resources& handle) +{ + std::vector x1_host(x1.size()); + raft::update_host(x1_host.data(), x1.data(), x1.size(), stream); + std::vector x2_host(x2.size()); + raft::update_host(x2_host.data(), x2.data(), x2.size(), stream); + raft::resource::sync_stream(handle, stream); + + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + math_t d = 0; // accumulate in math_t so a double instantiation keeps full precision + for (int k = 0; k < n_cols; k++) { + if (kernel.kernel == KernelType::RBF) { + math_t diff = x1_host[get_offset(i, k, ld1, is_row_major)] - + x2_host[get_offset(j, k, ld2, is_row_major)]; + d += diff * diff; + } else { + d += x1_host[get_offset(i, k, ld1, is_row_major)] * + x2_host[get_offset(j, k, ld2, is_row_major)]; + } + } + int idx = get_offset(i, j, ld_out, is_row_major); + math_t v = 0; + switch (kernel.kernel) { + case (KernelType::LINEAR): gram_host[idx] = d; break; + case (KernelType::POLYNOMIAL): + v = kernel.gamma * d + kernel.coef0; + gram_host[idx] = 
std::pow(v, kernel.degree); + break; + case (KernelType::TANH): gram_host[idx] = std::tanh(kernel.gamma * d + kernel.coef0); break; + case (KernelType::RBF): gram_host[idx] = std::exp(-kernel.gamma * d); break; + } + } + } +} + +} // namespace kernels +} // namespace distance +} // namespace cuvs \ No newline at end of file diff --git a/cpp/test/sparse/cluster/cluster_solvers.cu b/cpp/test/sparse/cluster/cluster_solvers.cu new file mode 100644 index 000000000..c0b6c1a78 --- /dev/null +++ b/cpp/test/sparse/cluster/cluster_solvers.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../../../src/sparse/cluster/cluster_solvers.cuh" +#include "../../../src/sparse/cluster/eigen_solvers.cuh" +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace spectral { + +TEST(Raft, ClusterSolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + + index_type maxiter{100}; + value_type tol{1.0e-10}; + unsigned long long seed{100110021003}; + + auto stream = raft::resource::get_cuda_stream(h); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + + // nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; + index_type* codes{nullptr}; + + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = raft::resource::get_cuda_stream(h); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + + EXPECT_ANY_THROW(cuvs::spectral::modularity_maximization( + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type 
modularity{0}; + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/cluster/eigen_solvers.cu b/cpp/test/sparse/cluster/eigen_solvers.cu new file mode 100644 index 000000000..8de0b49e7 --- /dev/null +++ b/cpp/test/sparse/cluster/eigen_solvers.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../../src/sparse/cluster/eigen_solvers.cuh" +#include "../../../src/sparse/cluster/partition.cuh" +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuvs { +namespace spectral { + +TEST(Raft, EigenSolvers) +{ + raft::common::nvtx::range fun_scope("test::EigenSolvers"); + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + std::uint64_t 
seed{100110021003}; + + eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + + lanczos_solver_t eig_solver{cfg}; + + EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + + EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); +} + +TEST(Raft, SpectralSolvers) +{ + raft::common::nvtx::range fun_scope("test::SpectralSolvers"); + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h) + + ); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + EXPECT_ANY_THROW( + spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type edgeCut{0}; + value_type cost{0}; + EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/cluster/spectral.cu b/cpp/test/sparse/cluster/spectral.cu new file mode 100644 index 000000000..7d0cdef9d --- /dev/null +++ b/cpp/test/sparse/cluster/spectral.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../test_utils.cuh" + +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include "../../../src/sparse/cluster/partition.cuh" +#include + +#include + +#include +#include + +namespace cuvs { +namespace cluster { + +/** + * Warning: There appears to be a CUDA 12.2 bug in cusparse that causes an + * alignment issue. We've fixed the bug in our code through a workaround + * (see raft/sparse/linalg/spmm.hpp for fix). This test is meant to fail + * in the case where the fix is accidentally reverted, so that it doesn't + * break any downstream libraries that depend on RAFT + */ +TEST(Raft, Spectral) +{ + raft::handle_t handle; + + std::vector h_offsets({0, 2, 4, 7, 10, 12, 14}); + std::vector h_indices({1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4}); + std::vector h_values( + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + std::vector expected_clustering({1, 1, 1, 0, 0, 0}); + + int32_t n_clusters{2}; + int32_t n_eigenvectors{2}; + int32_t evs_max_it{100}; + int32_t kmean_max_it{100}; + int32_t restartIter_lanczos = 15 + n_eigenvectors; + float evs_tol{0.001}; + float kmean_tol{0.001}; + unsigned long long seed1{1234567}; + unsigned long long seed2{12345678}; + bool reorthog{false}; + + rmm::device_uvector offsets(h_offsets.size(), handle.get_stream()); + rmm::device_uvector indices(h_indices.size(), handle.get_stream()); + rmm::device_uvector values(h_indices.size(), handle.get_stream()); + rmm::device_uvector clustering(expected_clustering.size(), handle.get_stream()); + rmm::device_uvector 
eigenvalues(n_eigenvectors, handle.get_stream()); + rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), + handle.get_stream()); + + rmm::device_uvector exp_dev(expected_clustering.size(), handle.get_stream()); + + raft::update_device( + exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); + + raft::update_device(offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream()); + raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); + raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); + + raft::spectral::matrix::sparse_matrix_t const matrix{ + handle, + offsets.data(), + indices.data(), + values.data(), + static_cast(offsets.size() - 1), + static_cast(indices.size())}; + + cuvs::spectral::eigen_solver_config_t eig_cfg{ + n_eigenvectors, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed1}; + cuvs::spectral::lanczos_solver_t eig_solver{eig_cfg}; + + cuvs::spectral::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed2}; + cuvs::spectral::kmeans_solver_t cluster_solver{clust_cfg}; + + cuvs::spectral::partition(handle, + matrix, + eig_solver, + cluster_solver, + clustering.data(), + eigenvalues.data(), + eigenvectors.data()); + + ASSERT_TRUE(devArrMatch(expected_clustering.data(), + exp_dev.data(), + exp_dev.size(), + 1, + cuvs::Compare(), + handle.get_stream())); +} + +} // namespace cluster +} // namespace cuvs \ No newline at end of file diff --git a/cpp/test/sparse/cluster/spectral_matrix.cu b/cpp/test/sparse/cluster/spectral_matrix.cu new file mode 100644 index 000000000..37a4202b8 --- /dev/null +++ b/cpp/test/sparse/cluster/spectral_matrix.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace spectral { +namespace matrix { +namespace { +template +struct csr_view_t { + index_type* offsets; + index_type* indices; + value_type* edge_data; + index_type number_of_vertices; + index_type number_of_edges; +}; +} // namespace +TEST(Raft, SpectralMatrices) +{ + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0}; + + int const sz = 10; + vector_t d_v{h, sz}; + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{h, csr_v}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + ASSERT_EQ(nullptr, sm2.row_offsets_); + + auto stream = resource::get_cuda_stream(h); + + auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args + + auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t lm2{h, sm2}; }; + EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args + + auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args + + auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t mm2{h, sm2}; }; + EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr 
ptr args +} + +} // namespace matrix +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu new file mode 100644 index 000000000..d7af30a1c --- /dev/null +++ b/cpp/test/sparse/gram.cu @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../distance/gram_base.cuh" +#include "../test_utils.cuh" + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::distance::kernels::sparse { + +/** + * Structure to describe structure of the input matrices: + * - DENSE: dense, dense + * - MIX: CSR, dense + * - CSR: CSR, CSR + */ +enum SparseType { DENSE, MIX, CSR }; + +struct GramMatrixInputs { + int n1; // feature vectors in matrix 1 + int n2; // featuer vectors in matrix 2 + int n_cols; // number of elements in a feature vector + bool is_row_major; + SparseType sparse_input; + KernelParams kernel; + int ld1; + int ld2; + int ld_out; + // We will generate random input using the dimensions given here. + // The reference output is calculated by a custom kernel. +}; + +std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) +{ + std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; + os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/" + << (p.is_row_major ? 
"RowMajor/" : "ColMajor/") + << (p.sparse_input == SparseType::DENSE + ? "DenseDense/" + : (p.sparse_input == SparseType::MIX ? "CsrDense/" : "CsrCsr/")) + << kernel_names[p.kernel.kernel] << "/ld_" << p.ld1 << "x" << p.ld2 << "x" << p.ld_out; + return os; +} + +/*struct KernelParams { + // Kernel function parameters + KernelType kernel; //!< Type of the kernel function + int degree; //!< Degree of polynomial kernel (ignored by others) + double gamma; //!< multiplier in the + double coef0; //!< additive constant in poly and tanh kernels +};*/ + +// const KernelParams linear_kernel_params{.kernel=KernelType::LINEAR}; + +// {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, {KernelType::TANH, 0, 0.5, 2.4}, {KernelType::RBF, 0, 0.5} +const std::vector inputs = raft::util::itertools::product( + {42}, + {137}, + {2}, + {true, false}, + {SparseType::DENSE, SparseType::MIX, SparseType::CSR}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}, + KernelParams{KernelType::RBF, 0, 0.5}}); + +// (ld_1, ld_2, ld_out) not supported by RBF and CSR +const std::vector inputs_ld = raft::util::itertools::product( + {137}, + {42}, + {2}, + {true, false}, + {SparseType::DENSE, SparseType::MIX}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}}, + {159}, + {73}, + {144}); + +// (ld_1, ld_2) are supported by CSR +const std::vector inputs_ld_csr = + raft::util::itertools::product( + {42}, + {137}, + {2}, + {true, false}, + {SparseType::CSR, SparseType::MIX}, + {KernelParams{KernelType::LINEAR}, + KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4}, + KernelParams{KernelType::TANH, 0, 0.5, 2.4}}, + {64}, + {155}, + {0}); + +template +class GramMatrixTest : public ::testing::TestWithParam { + protected: + GramMatrixTest() + : params(GetParam()), + stream(raft::resource::get_cuda_stream(handle)), + x1(0, stream), + x2(0, stream), + 
x1_csr_indptr(0, stream), + x1_csr_indices(0, stream), + x1_csr_data(0, stream), + x2_csr_indptr(0, stream), + x2_csr_indices(0, stream), + x2_csr_data(0, stream), + gram(0, stream), + gram_host(0) + { + if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } + if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } + if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } + // Derive the size of the output from the offset of the last element. + size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; + x1.resize(size, stream); + size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; + x2.resize(size, stream); + size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1; + + gram.resize(size, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream)); + gram_host.resize(gram.size()); + std::fill(gram_host.begin(), gram_host.end(), 0); + + raft::random::RngState r(42137ULL); + raft::random::uniform(handle, r, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, r, x2.data(), x2.size(), math_t(0), math_t(1)); + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + } + + ~GramMatrixTest() override {} + + int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data) + { + int nnz = 0; + double eps = 1e-6; + int n_cols = params.n_cols; + bool is_row_major = params.is_row_major; + size_t dense_size = get_offset(n_rows - 1, n_cols - 1, ld, is_row_major) + 1; + + std::vector dense_host(dense_size); + raft::update_host(dense_host.data(), dense, dense_size, stream); + raft::resource::sync_stream(handle, stream); + + std::vector indptr_host(n_rows + 1); + std::vector indices_host(n_rows * n_cols); + std::vector data_host(n_rows * n_cols); + + // create csr matrix from dense (with threshold) + for 
(int i = 0; i < n_rows; ++i) { + indptr_host[i] = nnz; + for (int j = 0; j < n_cols; ++j) { + math_t value = dense_host[get_offset(i, j, ld, is_row_major)]; + if (value > eps) { + indices_host[nnz] = j; + data_host[nnz] = value; + nnz++; + } + } + } + indptr_host[n_rows] = nnz; + + // fill back dense matrix from CSR + std::fill(dense_host.data(), dense_host.data() + dense_size, 0); + for (int i = 0; i < n_rows; ++i) { + for (int idx = indptr_host[i]; idx < indptr_host[i + 1]; ++idx) { + dense_host[get_offset(i, indices_host[idx], ld, is_row_major)] = data_host[idx]; + } + } + + raft::update_device(dense, dense_host.data(), dense_size, stream); + raft::update_device(indptr, indptr_host.data(), n_rows + 1, stream); + raft::update_device(indices, indices_host.data(), nnz, stream); + raft::update_device(data, data_host.data(), nnz, stream); + raft::resource::sync_stream(handle, stream); + return nnz; + } + + void runTest() + { + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel)); + + auto x1_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? 
raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); + + if (params.sparse_input == SparseType::DENSE) { + (*kernel)(handle, x1_span, x2_span, out_span); + } else { + x1_csr_indptr.reserve(params.n1 + 1, stream); + x1_csr_indices.reserve(params.n1 * params.n_cols, stream); + x1_csr_data.reserve(params.n1 * params.n_cols, stream); + int x1_nnz = prepareCsr(x1.data(), + params.n1, + params.ld1, + x1_csr_indptr.data(), + x1_csr_indices.data(), + x1_csr_data.data()); + + auto x1_csr_structure = raft::make_device_compressed_structure_view( + x1_csr_indptr.data(), x1_csr_indices.data(), params.n1, params.n_cols, x1_nnz); + auto x1_csr = raft::device_csr_matrix_view( + raft::device_span(x1_csr_data.data(), x1_csr_structure.get_nnz()), + x1_csr_structure); + + if (params.sparse_input == SparseType::MIX) { + (*kernel)(handle, x1_csr, x2_span, out_span); + } else { + x2_csr_indptr.reserve(params.n2 + 1, stream); + x2_csr_indices.reserve(params.n2 * params.n_cols, stream); + x2_csr_data.reserve(params.n2 * params.n_cols, stream); + int x2_nnz = prepareCsr(x2.data(), + params.n2, + params.ld2, + x2_csr_indptr.data(), + x2_csr_indices.data(), + x2_csr_data.data()); + + auto x2_csr_structure = raft::make_device_compressed_structure_view( + x2_csr_indptr.data(), x2_csr_indices.data(), params.n2, params.n_cols, x2_nnz); + auto x2_csr = raft::device_csr_matrix_view( + raft::device_span(x2_csr_data.data(), x2_csr_structure.get_nnz()), + x2_csr_structure); + + (*kernel)(handle, x1_csr, x2_csr, out_span); + } + } + // Something in gram is executing not on the 'stream' and therefore + // a full device sync is required + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + 
handle); + raft::resource::sync_stream(handle, stream); + + ASSERT_TRUE(cuvs::devArrMatchHost( + gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream)); + } + + raft::resources handle; + cudaStream_t stream = 0; + GramMatrixInputs params; + + rmm::device_uvector x1; + rmm::device_uvector x2; + + rmm::device_uvector x1_csr_indptr; + rmm::device_uvector x1_csr_indices; + rmm::device_uvector x1_csr_data; + rmm::device_uvector x2_csr_indptr; + rmm::device_uvector x2_csr_indices; + rmm::device_uvector x2_csr_data; + + rmm::device_uvector gram; + std::vector gram_host; +}; + +typedef GramMatrixTest GramMatrixTestFloatStandard; +typedef GramMatrixTest GramMatrixTestFloatLd; +typedef GramMatrixTest GramMatrixTestFloatLdCsr; + +TEST_P(GramMatrixTestFloatStandard, Gram) { runTest(); } +TEST_P(GramMatrixTestFloatLd, Gram) { runTest(); } +TEST_P(GramMatrixTestFloatLdCsr, Gram) { runTest(); } + +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatStandard, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatLd, ::testing::ValuesIn(inputs_ld)); +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, + GramMatrixTestFloatLdCsr, + ::testing::ValuesIn(inputs_ld_csr)); +}; // namespace cuvs::distance::kernels::sparse \ No newline at end of file From fa8838a3a00b17c6b8284e094d22382d80f87247 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Fri, 6 Dec 2024 09:42:21 -0600 Subject: [PATCH 46/47] Modify cuvs-bench to be able to generate ground truth in CPU systems (#466) PR allows calculating ground truth for cuvs-bench on CPU systems. Current version uses a simple NumPy brute force, perhaps we should consider using faiss? cc @cjnolet @divyegala Authors: - Dante Gama Dessavre (https://github.com/dantegd) - Corey J. Nolet (https://github.com/cjnolet) - Divye Gala (https://github.com/divyegala) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cuvs/pull/466 --- .../bench_ann_cuda-118_arch-aarch64.yaml | 3 + .../bench_ann_cuda-118_arch-x86_64.yaml | 3 + .../bench_ann_cuda-125_arch-aarch64.yaml | 3 + .../bench_ann_cuda-125_arch-x86_64.yaml | 3 + conda/recipes/cuvs-bench-cpu/meta.yaml | 1 + conda/recipes/cuvs-bench/meta.yaml | 3 +- dependencies.yaml | 3 + .../generate_groundtruth/__main__.py | 204 +++++++++++++++--- .../cuvs_bench/cuvs_bench/run/data_export.py | 73 +++---- python/cuvs_bench/pyproject.toml | 1 + 10 files changed, 220 insertions(+), 77 deletions(-) diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 1e602ccf1..59d471bda 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -18,6 +18,8 @@ dependencies: - cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 @@ -32,6 +34,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- libcuvs==24.12.*,>=0.0.0a0 - librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index b060e78c2..31a416eb5 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -18,6 +18,8 @@ dependencies: - cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 @@ -32,6 +34,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- libcuvs==24.12.*,>=0.0.0a0 - librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 diff 
--git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 485122273..3efe9ebde 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -19,6 +19,8 @@ dependencies: - cuda-profiler-api - cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 @@ -29,6 +31,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev +- libcuvs==24.12.*,>=0.0.0a0 - librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index d5f48dadb..7fbd77368 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -19,6 +19,8 @@ dependencies: - cuda-profiler-api - cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 +- cupy>=12.0.0 +- cuvs==24.12.*,>=0.0.0a0 - cxx-compiler - cython>=3.0.0 - dlpack>=0.8,<1.0 @@ -29,6 +31,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev +- libcuvs==24.12.*,>=0.0.0a0 - librmm==24.12.*,>=0.0.0a0 - matplotlib - nccl>=2.19 diff --git a/conda/recipes/cuvs-bench-cpu/meta.yaml b/conda/recipes/cuvs-bench-cpu/meta.yaml index 02c11346f..016df56be 100644 --- a/conda/recipes/cuvs-bench-cpu/meta.yaml +++ b/conda/recipes/cuvs-bench-cpu/meta.yaml @@ -59,6 +59,7 @@ requirements: - glog {{ glog_version }} - h5py {{ h5py_version }} - matplotlib + - numpy >=1.23,<3.0a0 - pandas - pyyaml - python diff --git a/conda/recipes/cuvs-bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml index 3e81edc58..0681a1038 100644 --- a/conda/recipes/cuvs-bench/meta.yaml +++ b/conda/recipes/cuvs-bench/meta.yaml @@ -88,10 +88,11 @@ requirements: - cudatoolkit {% else %} - cuda-cudart + - cupy>=12.0.0 - libcublas {% endif %} - glog {{ 
glog_version }} - - libcuvs {{ version }} + - cuvs {{ version }} - h5py {{ h5py_version }} - matplotlib - pandas diff --git a/dependencies.yaml b/dependencies.yaml index 80a7d2024..98cac5300 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -39,6 +39,7 @@ files: - bench - bench_python - rapids_build_setuptools + - cupy test_cpp: output: none includes: @@ -475,11 +476,13 @@ dependencies: - h5py>=3.8.0 - benchmark>=1.8.2 - openblas + - libcuvs==24.12.*,>=0.0.0a0 bench_python: common: - output_types: [conda, pyproject, requirements] packages: - click + - cuvs==24.12.*,>=0.0.0a0 - matplotlib - pandas - pyyaml diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py index dbee6cd36..88ec55dfa 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -15,70 +15,206 @@ # limitations under the License. # import argparse +import importlib import os import sys +import warnings -import cupy as cp -import numpy as np -import rmm -from pylibraft.common import DeviceResources -from rmm.allocators.cupy import rmm_cupy_allocator +from .utils import memmap_bin_file, suffix_from_dtype, write_bin -from cuvs.neighbors.brute_force import build, search -from .utils import memmap_bin_file, suffix_from_dtype, write_bin +def import_with_fallback(primary_lib, secondary_lib=None, alias=None): + """ + Attempt to import a primary library, with an optional fallback to a + secondary library. + Optionally assigns the imported module to a global alias. + + Parameters + ---------- + primary_lib : str + Name of the primary library to import. + secondary_lib : str, optional + Name of the secondary library to use as a fallback. If `None`, + no fallback is attempted. + alias : str, optional + Alias to assign the imported module globally. 
+ + Returns + ------- + module or None + The imported module if successful; otherwise, `None`. + + Examples + -------- + >>> xp = import_with_fallback('cupy', 'numpy') + >>> mod = import_with_fallback('nonexistent_lib') + >>> if mod is None: + ... print("Library not found.") + """ + try: + module = importlib.import_module(primary_lib) + except ImportError: + if secondary_lib is not None: + try: + module = importlib.import_module(secondary_lib) + except ImportError: + module = None + else: + module = None + if alias and module is not None: + globals()[alias] = module + return module + + +xp = import_with_fallback("cupy", "numpy") +rmm = import_with_fallback("rmm") +gpu_system = False + +def force_fallback_to_numpy(): + global xp, gpu_system + xp = import_with_fallback("numpy") + gpu_system = False + warnings.warn( + "Consider using a GPU-based system to greatly accelerate " + " generating groundtruths using cuVS." + ) + + +if rmm is not None: + gpu_system = True + try: + from pylibraft.common import DeviceResources + from rmm.allocators.cupy import rmm_cupy_allocator -def generate_random_queries(n_queries, n_features, dtype=np.float32): + from cuvs.neighbors.brute_force import build, search + except ImportError: + # RMM is available, cupy is available, but cuVS is not + force_fallback_to_numpy() +else: + # No RMM, no cuVS, but cupy is available + force_fallback_to_numpy() + + +def generate_random_queries(n_queries, n_features, dtype=xp.float32): print("Generating random queries") - if np.issubdtype(dtype, np.integer): - queries = cp.random.randint( + if xp.issubdtype(dtype, xp.integer): + queries = xp.random.randint( 0, 255, size=(n_queries, n_features), dtype=dtype ) else: - queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) + queries = xp.random.uniform(size=(n_queries, n_features)).astype(dtype) return queries def choose_random_queries(dataset, n_queries): print("Choosing random vector from dataset as query vectors") - query_idx = 
np.random.choice( + query_idx = xp.random.choice( dataset.shape[0], size=(n_queries,), replace=False ) return dataset[query_idx, :] +def cpu_search(dataset, queries, k, metric="squeclidean"): + """ + Find the k nearest neighbors for each query point in the dataset using the + specified metric. + + Parameters + ---------- + dataset : numpy.ndarray + An array of shape (n_samples, n_features) representing the dataset. + queries : numpy.ndarray + An array of shape (n_queries, n_features) representing the query + points. + k : int + The number of nearest neighbors to find. + metric : str, optional + The distance metric to use. Can be 'squeclidean' or 'inner_product'. + Default is 'squeclidean'. + + Returns + ------- + distances : numpy.ndarray + An array of shape (n_queries, k) containing the distances + (for 'squeclidean') or similarities + (for 'inner_product') to the k nearest neighbors for each query. + indices : numpy.ndarray + An array of shape (n_queries, k) containing the indices of the + k nearest neighbors in the dataset for each query. + + """ + if metric == "squeclidean": + diff = queries[:, xp.newaxis, :] - dataset[xp.newaxis, :, :] + dist_sq = xp.sum(diff**2, axis=2) # Shape: (n_queries, n_samples) + + indices = xp.argpartition(dist_sq, kth=k - 1, axis=1)[:, :k] + distances = xp.take_along_axis(dist_sq, indices, axis=1) + + sorted_idx = xp.argsort(distances, axis=1) + distances = xp.take_along_axis(distances, sorted_idx, axis=1) + indices = xp.take_along_axis(indices, sorted_idx, axis=1) + + elif metric == "inner_product": + similarities = xp.dot( + queries, dataset.T + ) # Shape: (n_queries, n_samples) + + neg_similarities = -similarities + indices = xp.argpartition(neg_similarities, kth=k - 1, axis=1)[:, :k] + distances = xp.take_along_axis(similarities, indices, axis=1) + + sorted_idx = xp.argsort(-distances, axis=1) + + else: + raise ValueError( + "Unsupported metric in cuvs-bench-cpu. 
" + "Use 'squeclidean' or 'inner_product' or use the GPU package" + "to use any distance supported by cuVS." + ) + + distances = xp.take_along_axis(distances, sorted_idx, axis=1) + indices = xp.take_along_axis(indices, sorted_idx, axis=1) + + return distances, indices + + def calc_truth(dataset, queries, k, metric="sqeuclidean"): - resources = DeviceResources() n_samples = dataset.shape[0] n = 500000 # batch size for processing neighbors i = 0 indices = None distances = None - queries = cp.asarray(queries, dtype=cp.float32) + queries = xp.asarray(queries, dtype=xp.float32) + + if gpu_system: + resources = DeviceResources() while i < n_samples: print("Step {0}/{1}:".format(i // n, n_samples // n)) n_batch = n if i + n <= n_samples else n_samples - i - X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) + X = xp.asarray(dataset[i : i + n_batch, :], xp.float32) - index = build(X, metric=metric, resources=resources) - D, Ind = search(index, queries, k, resources=resources) - resources.sync() + if gpu_system: + index = build(X, metric=metric, resources=resources) + D, Ind = search(index, queries, k, resources=resources) + resources.sync() + else: + D, Ind = cpu_search(X, queries, metric=metric) - D, Ind = cp.asarray(D), cp.asarray(Ind) + D, Ind = xp.asarray(D), xp.asarray(Ind) Ind += i # shift neighbor index by offset i if distances is None: distances = D indices = Ind else: - distances = cp.concatenate([distances, D], axis=1) - indices = cp.concatenate([indices, Ind], axis=1) - idx = cp.argsort(distances, axis=1)[:, :k] - distances = cp.take_along_axis(distances, idx, axis=1) - indices = cp.take_along_axis(indices, idx, axis=1) + distances = xp.concatenate([distances, D], axis=1) + indices = xp.concatenate([indices, Ind], axis=1) + idx = xp.argsort(distances, axis=1)[:, :k] + distances = xp.take_along_axis(distances, idx, axis=1) + indices = xp.take_along_axis(indices, idx, axis=1) i += n_batch @@ -86,11 +222,15 @@ def calc_truth(dataset, queries, k, 
metric="sqeuclidean"): def main(): - pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 - ) - rmm.mr.set_current_device_resource(pool) - cp.cuda.set_allocator(rmm_cupy_allocator) + if gpu_system and xp.__name__ == "cupy": + pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 + ) + rmm.mr.set_current_device_resource(pool) + xp.cuda.set_allocator(rmm_cupy_allocator) + else: + # RMM is available, but cupy is not + force_fallback_to_numpy() parser = argparse.ArgumentParser( prog="generate_groundtruth", @@ -197,7 +337,7 @@ def main(): "Dataset size {:6.1f} GB, shape {}, dtype {}".format( dataset.size * dataset.dtype.itemsize / 1e9, dataset.shape, - np.dtype(dtype), + xp.dtype(dtype), ) ) @@ -230,11 +370,11 @@ def main(): write_bin( os.path.join(args.output, "groundtruth.neighbors.ibin"), - indices.astype(np.uint32), + indices.astype(xp.uint32), ) write_bin( os.path.join(args.output, "groundtruth.distances.fbin"), - distances.astype(np.float32), + distances.astype(xp.float32), ) diff --git a/python/cuvs_bench/cuvs_bench/run/data_export.py b/python/cuvs_bench/cuvs_bench/run/data_export.py index 997dab500..1d0ac40a0 100644 --- a/python/cuvs_bench/cuvs_bench/run/data_export.py +++ b/python/cuvs_bench/cuvs_bench/run/data_export.py @@ -17,7 +17,6 @@ import json import os import traceback -import warnings import pandas as pd @@ -170,44 +169,6 @@ def convert_json_to_csv_build(dataset, dataset_path): traceback.print_exc() -def append_build_data(write, build_file): - """ - Append build data to the search DataFrame. - - Parameters - ---------- - write : pandas.DataFrame - The DataFrame containing the search data to which build - data will be appended. - build_file : str - The file path to the build CSV file. 
- """ - if os.path.exists(build_file): - build_df = pd.read_csv(build_file) - write_ncols = len(write.columns) - # Initialize columns for build data - build_columns = [ - "build time", - "build threads", - "build cpu_time", - "build GPU", - ] - write = write.assign(**{col: None for col in build_columns}) - # Append additional columns if available - for col_name in build_df.columns[6:]: - write[col_name] = None - # Match build rows with search rows by index_name - for s_index, search_row in write.iterrows(): - for b_index, build_row in build_df.iterrows(): - if search_row["index_name"] == build_row["index_name"]: - write.iloc[s_index, write_ncols:] = build_row[2:].values - break - else: - warnings.warn( - f"Build CSV not found for {build_file}, build params not appended." - ) - - def convert_json_to_csv_search(dataset, dataset_path): """ Convert search JSON files to CSV format. @@ -232,7 +193,7 @@ def convert_json_to_csv_search(dataset, dataset_path): ) algo_name = clean_algo_name(algo_name) df["name"] = df["name"].str.split("/").str[0] - write_data = pd.DataFrame( + write = pd.DataFrame( { "algo_name": [algo_name] * len(df), "index_name": df["name"], @@ -242,11 +203,35 @@ def convert_json_to_csv_search(dataset, dataset_path): } ) # Append build data - append_build_data(write_data, build_file) + for name in df: + if name not in skip_search_cols: + write[name] = df[name] + if os.path.exists(build_file): + build_df = pd.read_csv(build_file) + write_ncols = len(write.columns) + write["build time"] = None + write["build threads"] = None + write["build cpu_time"] = None + write["build GPU"] = None + + for col_idx in range(6, len(build_df.columns)): + col_name = build_df.columns[col_idx] + write[col_name] = None + + for s_index, search_row in write.iterrows(): + for b_index, build_row in build_df.iterrows(): + if search_row["index_name"] == build_row["index_name"]: + write.iloc[s_index, write_ncols] = build_df.iloc[ + b_index, 2 + ] + write.iloc[ + s_index, write_ncols + 
1 : + ] = build_df.iloc[b_index, 3:] + break # Write search data and compute frontiers - write_data.to_csv(file.replace(".json", ",raw.csv"), index=False) - write_frontier(file, write_data, "throughput") - write_frontier(file, write_data, "latency") + write.to_csv(file.replace(".json", ",raw.csv"), index=False) + write_frontier(file, write, "throughput") + write_frontier(file, write, "latency") except Exception as e: print(f"Error processing search file {file}: {e}. Skipping...") traceback.print_exc() diff --git a/python/cuvs_bench/pyproject.toml b/python/cuvs_bench/pyproject.toml index 41ebad116..5b17f7228 100644 --- a/python/cuvs_bench/pyproject.toml +++ b/python/cuvs_bench/pyproject.toml @@ -19,6 +19,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ "click", + "cuvs==24.12.*,>=0.0.0a0", "matplotlib", "pandas", "pyyaml", From cf2885c9d0b8a5d839378939a29154a4d165fefe Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 11 Dec 2024 13:11:32 -0500 Subject: [PATCH 47/47] Update Changelog [skip ci] --- CHANGELOG.md | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ce4a14c3..ed9429d55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,67 @@ +# cuvs 24.12.00 (11 Dec 2024) + +## 🚨 Breaking Changes + +- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala) +- Use dashes in cuvs-bench package name. 
([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice) + +## 🐛 Bug Fixes + +- Skip IVF-PQ packing test for lists with not enough data ([#512](https://github.com/rapidsai/cuvs/pull/512)) [@achirkin](https://github.com/achirkin) +- [BUG] Fix CAGRA filter ([#489](https://github.com/rapidsai/cuvs/pull/489)) [@enp1s0](https://github.com/enp1s0) +- Add `kIsSingleSource` to `PairwiseDistanceEpilogueElementwise` ([#485](https://github.com/rapidsai/cuvs/pull/485)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix include errors, header, and unsafe locks in iface.hpp ([#467](https://github.com/rapidsai/cuvs/pull/467)) [@achirkin](https://github.com/achirkin) +- Fix an OOB error in device-side cuvs::neighbors::refine and CAGRA kern_prune ([#460](https://github.com/rapidsai/cuvs/pull/460)) [@achirkin](https://github.com/achirkin) +- Put a ceiling on cuda-python ([#445](https://github.com/rapidsai/cuvs/pull/445)) [@bdice](https://github.com/bdice) +- Enable NVTX in cuvs-cagra-search component ([#439](https://github.com/rapidsai/cuvs/pull/439)) [@achirkin](https://github.com/achirkin) +- BUG: CAGRA multi-cta illegal access with bad queries ([#438](https://github.com/rapidsai/cuvs/pull/438)) [@achirkin](https://github.com/achirkin) +- Fix index overflow in edge cases of CAGRA graph optimize ([#435](https://github.com/rapidsai/cuvs/pull/435)) [@achirkin](https://github.com/achirkin) +- Fix correct call to brute force in generate groundtruth of cuvs-bench ([#427](https://github.com/rapidsai/cuvs/pull/427)) [@dantegd](https://github.com/dantegd) +- Use Python for sccache hit rate computation. 
([#420](https://github.com/rapidsai/cuvs/pull/420)) [@bdice](https://github.com/bdice) +- Add `click` package to `cuvs-bench` conda recipe ([#408](https://github.com/rapidsai/cuvs/pull/408)) [@divyegala](https://github.com/divyegala) +- Fix NVTX annotations ([#400](https://github.com/rapidsai/cuvs/pull/400)) [@achirkin](https://github.com/achirkin) + +## 📖 Documentation + +- [Doc] Fix CAGRA search sample code ([#484](https://github.com/rapidsai/cuvs/pull/484)) [@enp1s0](https://github.com/enp1s0) +- Fix broken link in README.md references ([#473](https://github.com/rapidsai/cuvs/pull/473)) [@Azurethi](https://github.com/Azurethi) +- Adding tech stack to docs ([#448](https://github.com/rapidsai/cuvs/pull/448)) [@cjnolet](https://github.com/cjnolet) +- Fix Question Retrieval notebook ([#352](https://github.com/rapidsai/cuvs/pull/352)) [@lowener](https://github.com/lowener) + +## 🚀 New Features + +- Add C++ API scalar quantization ([#494](https://github.com/rapidsai/cuvs/pull/494)) [@mfoerste4](https://github.com/mfoerste4) +- HNSW CPU Hierarchy ([#465](https://github.com/rapidsai/cuvs/pull/465)) [@divyegala](https://github.com/divyegala) +- Add serialization API to brute-force ([#461](https://github.com/rapidsai/cuvs/pull/461)) [@lowener](https://github.com/lowener) +- Add Question Retrieval notebook using Milvus ([#451](https://github.com/rapidsai/cuvs/pull/451)) [@lowener](https://github.com/lowener) +- Migrate feature diff for NN Descent from RAFT to cuVS ([#421](https://github.com/rapidsai/cuvs/pull/421)) [@divyegala](https://github.com/divyegala) +- Add --no-lap-sync cmd option to ann-bench ([#405](https://github.com/rapidsai/cuvs/pull/405)) [@achirkin](https://github.com/achirkin) +- Add `InnerProduct` and `CosineExpanded` metric support in NN Descent ([#177](https://github.com/rapidsai/cuvs/pull/177)) [@divyegala](https://github.com/divyegala) + +## 🛠️ Improvements + +- Update cuvs to match raft's cutlass changes 
([#516](https://github.com/rapidsai/cuvs/pull/516)) [@vyasr](https://github.com/vyasr) +- add a README for wheels ([#504](https://github.com/rapidsai/cuvs/pull/504)) [@jameslamb](https://github.com/jameslamb) +- Move check_input_array from pylibraft ([#474](https://github.com/rapidsai/cuvs/pull/474)) [@benfred](https://github.com/benfred) +- use different wheel-size thresholds based on CUDA version ([#469](https://github.com/rapidsai/cuvs/pull/469)) [@jameslamb](https://github.com/jameslamb) +- Modify cuvs-bench to be able to generate ground truth in CPU systems ([#466](https://github.com/rapidsai/cuvs/pull/466)) [@dantegd](https://github.com/dantegd) +- enforce wheel size limits, README formatting in CI ([#464](https://github.com/rapidsai/cuvs/pull/464)) [@jameslamb](https://github.com/jameslamb) +- Moving spectral embedding and kernel gramm APIs to cuVS ([#463](https://github.com/rapidsai/cuvs/pull/463)) [@cjnolet](https://github.com/cjnolet) +- Migrate sparse knn and distances code from raft ([#457](https://github.com/rapidsai/cuvs/pull/457)) [@benfred](https://github.com/benfred) +- Don't presume pointers location infers usability. ([#441](https://github.com/rapidsai/cuvs/pull/441)) [@robertmaynard](https://github.com/robertmaynard) +- call `enable_testing` in root CMakeLists.txt ([#437](https://github.com/rapidsai/cuvs/pull/437)) [@robertmaynard](https://github.com/robertmaynard) +- CAGRA tech debt: distance descriptor and workspace memory ([#436](https://github.com/rapidsai/cuvs/pull/436)) [@achirkin](https://github.com/achirkin) +- Add ci run_ scripts needed for build infra ([#434](https://github.com/rapidsai/cuvs/pull/434)) [@robertmaynard](https://github.com/robertmaynard) +- Use environment variables in cache hit rate computation. ([#422](https://github.com/rapidsai/cuvs/pull/422)) [@bdice](https://github.com/bdice) +- Use dashes in cuvs-bench package name. 
([#417](https://github.com/rapidsai/cuvs/pull/417)) [@bdice](https://github.com/bdice) +- We need to enable the c_api by default ([#416](https://github.com/rapidsai/cuvs/pull/416)) [@robertmaynard](https://github.com/robertmaynard) +- print sccache stats in builds ([#413](https://github.com/rapidsai/cuvs/pull/413)) [@jameslamb](https://github.com/jameslamb) +- make conda installs in CI stricter ([#406](https://github.com/rapidsai/cuvs/pull/406)) [@jameslamb](https://github.com/jameslamb) +- Ivf c example ([#404](https://github.com/rapidsai/cuvs/pull/404)) [@abner-ma](https://github.com/abner-ma) +- Prune workflows based on changed files ([#392](https://github.com/rapidsai/cuvs/pull/392)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- [WIP] Add pinned memory resource to C API ([#311](https://github.com/rapidsai/cuvs/pull/311)) [@ajit283](https://github.com/ajit283) +- Dynamic Batching ([#261](https://github.com/rapidsai/cuvs/pull/261)) [@achirkin](https://github.com/achirkin) + # cuvs 24.10.00 (9 Oct 2024) ## 🐛 Bug Fixes