diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e18e82df0..78648235f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -88,7 +88,7 @@ jobs: with: build_type: pull-request enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-python-build: needs: conda-cpp-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f60c0a34..27dc99a11 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} enable_check_symbols: true - symbol_exclusions: (void (thrust::|cub::)|raft_cutlass) + symbol_exclusions: (void (thrust::|cub::)) conda-cpp-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 diff --git a/.gitignore b/.gitignore index 97eab287d..da6eb07f6 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ compile_commands.json .clangd/ # serialized ann indexes +brute_force_index cagra_index ivf_flat_index ivf_pq_index diff --git a/README.md b/README.md index c1b74a9e8..23759f598 100755 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ Finally, faster vector search enables interactions between dense vectors and gra Below are some common use-cases for vector search + - ### Semantic search - Generative AI & Retrieval augmented generation (RAG) - Recommender systems @@ -68,6 +69,14 @@ There are several benefits to using cuVS and GPUs for vector search, including In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale. +## cuVS Technology Stack + +cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +![cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU](img/tech_stack.png "cuVS Technology Stack") + + + ## Installing cuVS cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python) and [pip](https://pip.pypa.io/en/stable/). Different packages are available for the different languages supported by cuVS: @@ -233,7 +242,7 @@ If you are interested in contributing to the cuVS library, please read our [Cont For the interested reader, many of the accelerated implementations in cuVS are also based on research papers which can provide a lot more background. We also ask you to please cite the corresponding algorithms by referencing them in your own research. 
- [CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search](https://arxiv.org/abs/2308.15136) -- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062>) +- [Top-K Algorithms on GPU: A Comprehensive Study and New Methods](https://dl.acm.org/doi/10.1145/3581784.3607062) - [Fast K-NN Graph Construction by GPU Based NN-Descent](https://dl.acm.org/doi/abs/10.1145/3459637.3482344?casa_token=O_nan1B1F5cAAAAA:QHWDEhh0wmd6UUTLY9_Gv6c3XI-5DXM9mXVaUXOYeStlpxTPmV3nKvABRfoivZAaQ3n8FWyrkWw>) - [cuSLINK: Single-linkage Agglomerative Clustering on the GPU](https://arxiv.org/abs/2306.16354) - [GPU Semiring Primitives for Sparse Neighborhood Methods](https://arxiv.org/abs/2104.06357) diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh index e03da9f19..444657cc0 100755 --- a/ci/build_wheel_cuvs.sh +++ b/ci/build_wheel_cuvs.sh @@ -3,6 +3,8 @@ set -euo pipefail +package_dir="python/cuvs" + case "${RAPIDS_CUDA_VERSION}" in 12.*) EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON" @@ -15,4 +17,5 @@ esac # Set up skbuild options. Enable sccache in skbuild config options export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}" -ci/build_wheel.sh cuvs python/cuvs +ci/build_wheel.sh cuvs ${package_dir} +ci/validate_wheel.sh ${package_dir} final_dist diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 000000000..f2b235765 --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir=$1 +wheel_dir_relative_path=$2 + +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" + +# some packages are much larger on CUDA 11 than on CUDA 12 +if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '1.4G' + ) +else + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '950M' + ) +fi + +cd "${package_dir}" + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "${PYDISTCHECK_ARGS[@]}" \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index aa12b4ed6..80bfb0c24 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 494ec394d..07937726c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cupy>=12.0.0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index f4f03ccee..b7fd6fcfa 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - 
cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index a295e93f4..83a457465 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index a73839457..21cb98180 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index a529b27e2..effa88ced 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cmake>=3.26.4,!=3.30.0 - cuda-nvtx=11.8 - cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 407fb6058..0c5043ac2 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index 2ce3b5d7e..c63f205b0 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml index e7e2daf0c..560c95feb 100644 --- a/conda/recipes/cuvs/meta.yaml +++ b/conda/recipes/cuvs/meta.yaml @@ -26,6 +26,7 @@ build: - {{ compiler('cuda') }} - cuda-cudart-dev {% endif %} + - cuda-python requirements: build: @@ -42,10 +43,10 @@ requirements: - {{ stdlib("c") }} host: {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 - cudatoolkit {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,<=12.6.0 - cuda-cudart-dev {% endif %} - cuda-version ={{ cuda_version }} @@ -60,13 +61,14 @@ requirements: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - pylibraft {{ minor_version }} - libcuvs {{ version }} - python x.x - - cuda-python - numpy >=1.23,<3.0a0 tests: 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c493af488..eb2e7c7a4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,9 @@ if(BUILD_SHARED_LIBS) src/distance/detail/fused_distance_nn.cu src/distance/distance.cu src/distance/pairwise_distance.cu + src/distance/sparse_distance.cu src/neighbors/brute_force.cu + src/neighbors/brute_force_serialize.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_half.cu src/neighbors/cagra_build_int8.cu @@ -436,6 +438,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/nn_descent.cu src/neighbors/nn_descent_float.cu src/neighbors/nn_descent_half.cu + src/neighbors/nn_descent_index.cpp src/neighbors/nn_descent_int8.cu src/neighbors/nn_descent_uint8.cu src/neighbors/reachability.cu @@ -448,6 +451,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/refine/detail/refine_host_int8_t_float.cpp src/neighbors/refine/detail/refine_host_uint8_t_float.cpp src/neighbors/sample_filter.cu + src/neighbors/sparse_brute_force.cu src/neighbors/vamana_build_float.cu src/neighbors/vamana_build_uint8.cu src/neighbors/vamana_build_int8.cu diff --git a/cpp/include/cuvs/distance/distance.hpp b/cpp/include/cuvs/distance/distance.hpp index def72641e..42c574e58 100644 --- a/cpp/include/cuvs/distance/distance.hpp +++ b/cpp/include/cuvs/distance/distance.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -331,6 +332,86 @@ void pairwise_distance( cuvs::distance::DistanceType metric, float metric_arg = 2.0f); +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... + * + * auto out = raft::make_device_matrix(handle, x_nrows, y_nrows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + +/** + * @brief Compute sparse pairwise distances between x and y, using the provided + * input configuration and distance function. + * + * @code{.cpp} + * #include + * #include + * #include + * + * int x_n_rows = 100000; + * int y_n_rows = 50000; + * int n_cols = 10000; + * + * raft::device_resources handle; + * auto x = raft::make_device_csr_matrix(handle, x_n_rows, n_cols); + * auto y = raft::make_device_csr_matrix(handle, y_n_rows, n_cols); + * + * ... + * // populate data + * ... 
+ * + * auto out = raft::make_device_matrix(handle, x_nrows, y_nrows); + * auto metric = cuvs::distance::DistanceType::L2Expanded; + * raft::sparse::distance::pairwise_distance(handle, x.view(), y.view(), out, metric); + * @endcode + * + * @param[in] handle raft::resources + * @param[in] x raft::device_csr_matrix_view + * @param[in] y raft::device_csr_matrix_view + * @param[out] dist raft::device_matrix_view dense matrix + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f); + /** @} */ // end group pairwise_distance_runtime }; // namespace cuvs::distance diff --git a/cpp/include/cuvs/neighbors/brute_force.h b/cpp/include/cuvs/neighbors/brute_force.h index c9e172f62..33b92f11b 100644 --- a/cpp/include/cuvs/neighbors/brute_force.h +++ b/cpp/include/cuvs/neighbors/brute_force.h @@ -166,6 +166,66 @@ cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, * @} */ +/** + * @defgroup bruteforce_c_serialize BRUTEFORCE C-API serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsBruteforceBuild` + * cuvsBruteForceSerialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the file name for saving the index + * @param[in] index BRUTEFORCE index + * + */ +cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.c} + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // Deserialize an index previously built with `cuvsBruteforceBuild` + * cuvsBruteForceIndex_t index; + * cuvsBruteForceIndexCreate(&index); + * cuvsBruteForceDeserialize(res, "/path/to/index", index); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the name of the file that stores the index + * @param[out] index BRUTEFORCE index loaded disk + */ +cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index); + +/** + * @} + */ #ifdef __cplusplus } #endif diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp index 428fa592a..d040e03db 100644 --- a/cpp/include/cuvs/neighbors/brute_force.hpp +++ b/cpp/include/cuvs/neighbors/brute_force.hpp @@ -18,6 +18,7 @@ #include "common.hpp" #include +#include #include #include #include @@ -47,6 +48,14 @@ struct index : cuvs::neighbors::index { index& operator=(index&&) = default; ~index() = default; + /** + * @brief Construct an empty index. + * + * Constructs an empty index. 
This index will either need to be trained with `build` + * or loaded from a saved copy with `deserialize` + */ + index(raft::resources const& handle); + /** Construct a brute force index from dataset * * Constructs a brute force index from a dataset. This lets us precompute norms for @@ -375,4 +384,342 @@ void search(raft::resources const& handle, * @} */ +/** + * @defgroup sparse_bruteforce_cpp_index Sparse Brute Force index + * @{ + */ +/** + * @brief Sparse Brute Force index. + * + * @tparam T Data element type + * @tparam IdxT Index element type + */ +template +struct sparse_index { + public: + sparse_index(const sparse_index&) = delete; + sparse_index(sparse_index&&) = default; + sparse_index& operator=(const sparse_index&) = delete; + sparse_index& operator=(sparse_index&&) = default; + ~sparse_index() = default; + + /** Construct a sparse brute force sparse_index from dataset */ + sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg); + + /** Distance metric used for retrieval */ + cuvs::distance::DistanceType metric() const noexcept { return metric_; } + + /** Metric argument */ + T metric_arg() const noexcept { return metric_arg_; } + + raft::device_csr_matrix_view dataset() const noexcept + { + return dataset_; + } + + private: + raft::device_csr_matrix_view dataset_; + cuvs::distance::DistanceType metric_; + T metric_arg_; +}; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_build Sparse Brute Force index build + * @{ + */ + +/* + * @brief Build the Sparse index from the dataset + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // create and fill the index from a CSR dataset + * auto index = brute_force::build(handle, dataset, metric); + * @endcode + * + * @param[in] handle + * @param[in] dataset A sparse CSR matrix in device memory to search against + * @param[in] metric cuvs::distance::DistanceType + * @param[in] metric_arg metric argument + * + * @return the constructed Sparse brute-force index + */ +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded, + float metric_arg = 0) -> cuvs::neighbors::brute_force::sparse_index; +/** + * @} + */ + +/** + * @defgroup sparse_bruteforce_cpp_index_search Sparse Brute Force index search + * @{ + */ +struct sparse_search_params { + int batch_size_index = 2 << 14; + int batch_size_query = 2 << 14; +}; + +/* + * @brief Search the sparse bruteforce index for nearest neighbors + * + * @param[in] handle + * @param[in] index Sparse brute-force constructed index + * @param[in] queries a sparse CSR matrix on the device to query + * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k] + */ +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view dataset, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); +/** + * @} + */ + +/** + * @defgroup bruteforce_cpp_index_serialize Bruteforce index serialize functions + * @{ + */ +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); +/** + * Save the index to file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, filename, index); + * @endcode + * + * @tparam T data element type + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index brute force index + * @param[in] include_dataset whether to include the dataset in the serialized + * output + * + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Write the index to an output stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an output stream + * std::ostream os(std::cout.rdbuf()); + * // create an index with `auto index = cuvs::neighbors::brute_force::build(...);` + * cuvs::neighbors::brute_force::serialize(handle, os, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] os output stream + * @param[in] index brute force index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + */ +void serialize(raft::resources const& handle, + std::ostream& os, + const cuvs::neighbors::brute_force::index& index, + bool include_dataset = true); + +/** + * Load index from file. 
+ * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from file. + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, filename, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = half; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * Load index from input stream + * The serialization format can be subject to changes, therefore loading + * an index saved with a previous version of cuvs is not guaranteed + * to work. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create an input stream + * std::istream is(std::cin.rdbuf()); + * using T = float; // data element type + * brute_force::index index(handle); + * cuvs::neighbors::brute_force::deserialize(handle, is, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] is input stream + * @param[out] index brute force index + * + */ +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index); +/** + * @} + */ + } // namespace cuvs::neighbors::brute_force diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index e48050756..5ceb3010e 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -363,7 +363,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * In the above example, we have passed a host dataset to build. The returned index will own a * device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a @@ -530,7 +530,7 @@ struct index : cuvs::neighbors::index { * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -567,7 +567,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -604,7 +604,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -640,7 +640,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -676,7 +676,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -713,7 +713,7 @@ auto build(raft::resources const& res, * 
// search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -750,7 +750,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res @@ -787,7 +787,7 @@ auto build(raft::resources const& res, * // search K nearest neighbours * auto neighbors = raft::make_device_matrix(res, n_queries, k); * auto distances = raft::make_device_matrix(res, n_queries, k); - * cagra::search(res, search_params, index, queries, neighbors, distances); + * cagra::search(res, search_params, index, queries, neighbors.view(), distances.view()); * @endcode * * @param[in] res diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 347ccf889..9cd8192b5 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -55,15 +55,16 @@ struct index_params : cuvs::neighbors::index_params { size_t intermediate_graph_degree = 128; // Degree of input graph for pruning. size_t max_iterations = 20; // Number of nn-descent iterations. float termination_threshold = 0.0001; // Termination threshold of nn-descent. + bool return_distances = true; // return distances if true + size_t n_clusters = 1; // defaults to not using any batching /** @brief Construct NN descent parameters for a specific kNN graph degree * * @param graph_degree output graph degree + * @param metric distance metric to use */ - index_params(size_t graph_degree = 64) - : graph_degree(graph_degree), intermediate_graph_degree(1.5 * graph_degree) - { - } + index_params(size_t graph_degree = 64, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded); }; /** @@ -100,14 +101,25 @@ struct index : cuvs::neighbors::index { * @param res raft::resources is an object mangaging resources * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph + * @param return_distances whether to return distances + * @param metric distance metric to use */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols) + index(raft::resources const& res, + int64_t n_rows, + int64_t n_cols, + bool return_distances = false, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(n_rows, n_cols)}, - graph_view_{graph_.view()} + graph_view_{graph_.view()}, + return_distances_{return_distances} { + if (return_distances) { + distances_ = raft::make_device_matrix(res_, n_rows, n_cols); + distances_view_ = distances_.value().view(); + } } /** @@ -119,14 +131,22 @@ struct index : cuvs::neighbors::index { * * @param res raft::resources is an object mangaging resources * @param graph_view raft::host_matrix_view for storing knn-graph + * @param distances_view optional raft::device_matrix_view for storing + * distances + * @param metric distance metric to use */ 
index(raft::resources const& res, - raft::host_matrix_view graph_view) + raft::host_matrix_view graph_view, + std::optional> distances_view = + std::nullopt, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(0, 0)}, - graph_view_{graph_view} + graph_view_{graph_view}, + distances_view_{distances_view}, + return_distances_{distances_view.has_value()} { } @@ -155,6 +175,13 @@ struct index : cuvs::neighbors::index { return graph_view_; } + /** neighborhood graph distances [size, graph-degree] */ + [[nodiscard]] inline auto distances() noexcept + -> std::optional> + { + return distances_view_; + } + // Don't allow copying the index for performance reasons (try avoiding copying data) index(const index&) = delete; index(index&&) = default; @@ -166,8 +193,11 @@ struct index : cuvs::neighbors::index { raft::resources const& res_; cuvs::distance::DistanceType metric_; raft::host_matrix graph_; // graph to return for non-int IdxT + std::optional> distances_; raft::host_matrix_view graph_view_; // view of graph for user provided matrix + std::optional> distances_view_; + bool return_distances_; }; /** @} */ @@ -200,12 +230,15 @@ struct index : cuvs::neighbors::index { * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -232,12 +265,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -262,12 +298,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -294,12 +333,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional 
raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -324,12 +366,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -356,12 +401,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in device memory @@ -386,14 +434,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located * in device memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::device_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; - -/** @} */ + raft::device_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Build nn-descent Index with dataset in host memory @@ -420,12 +469,15 @@ auto build(raft::resources const& res, * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located * in host memory + * @param[in] graph optional raft::host_matrix_view for owning + * the output graph * @return index index containing all-neighbors knn graph in host memory */ auto build(raft::resources const& res, index_params const& params, - raft::host_matrix_view dataset) - -> cuvs::neighbors::nn_descent::index; + raft::host_matrix_view dataset, + std::optional> graph = + std::nullopt) -> cuvs::neighbors::nn_descent::index; /** * @brief Test if we have enough GPU memory to run NN descent algorithm. 
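For orientation, here is a minimal sketch of how the extended `nn_descent` API above might be used, assuming the usual `float` dataset / `uint32_t` index instantiation; the exact template arguments are elided in the hunks above, so treat the types as illustrative rather than canonical, and note that `build_knn_graph_example` is a hypothetical helper, not part of the library:

```cpp
#include <cuvs/neighbors/nn_descent.hpp>

#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/host_mdarray.hpp>

#include <optional>

void build_knn_graph_example(raft::device_resources const& res,
                             raft::device_matrix_view<const float, int64_t> dataset)
{
  using namespace cuvs::neighbors;

  // New constructor form: graph degree plus an explicit distance metric.
  nn_descent::index_params params(/*graph_degree=*/64,
                                  cuvs::distance::DistanceType::L2Expanded);
  params.return_distances = true;  // also materialize the knn-graph distances
  params.n_clusters       = 1;     // no batching

  // Optionally pre-allocate the output graph on the host and hand it to build(),
  // so the returned index writes into caller-owned memory instead of its own.
  auto graph = raft::make_host_matrix<uint32_t, int64_t>(
    dataset.extent(0), static_cast<int64_t>(params.graph_degree));

  auto idx = nn_descent::build(res, params, dataset, std::make_optional(graph.view()));

  // With return_distances=true the index can also expose the graph distances.
  if (idx.distances().has_value()) {
    auto dists = idx.distances().value();  // [n_rows, graph_degree] in device memory
    (void)dists;
  }
}
```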
diff --git a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h index f9955334d..f4a7feaba 100644 --- a/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h +++ b/cpp/src/distance/detail/pairwise_distance_epilogue_elementwise.h @@ -61,6 +61,7 @@ class PairwiseDistanceEpilogueElementwise { using ElementT = ElementT_; static int const kElementsPerAccess = ElementsPerAccess; static int const kCount = kElementsPerAccess; + static bool const kIsSingleSource = true; using DistanceOp = DistanceOp_; using FinalOp = FinalOp_; diff --git a/cpp/src/distance/detail/sparse/bin_distance.cuh b/cpp/src/distance/detail/sparse/bin_distance.cuh new file mode 100644 index 000000000..1a63a8eb9 --- /dev/null +++ b/cpp/src/distance/detail/sparse/bin_distance.cuh @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +// @TODO: Move this into sparse prims (coo_norm) +template +RAFT_KERNEL compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { + // We do conditional here only because it's + // possible there could be some stray zeros in + // the sparse structure and removing them would be + // more expensive. 
+ atomicAdd(&out[coo_rows[i]], data[i] == 1.0); + } +} + +template +RAFT_KERNEL compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; + C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); +} + +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_binary_warp_kernel<<>>( + C, Q_norms, R_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ + rmm::device_uvector Q_norms(m, stream); + rmm::device_uvector R_norms(n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + + compute_binary_row_norm_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_binary_row_norm_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); +} + +/** + * Jaccard distance using the expanded form: + * 1 - (sum(x_k * y_k) / ((sum(x_k) + sum(y_k)) - sum(x_k * y_k)) + */ +template +class jaccard_expanded_distances_t : public distances_t { + public: + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); + } + + ~jaccard_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Dice distance using the expanded form: + * 1 - ((2 * sum(x_k * y_k)) / (sum(x_k) + sum(y_k))) + */ +template +class dice_expanded_distances_t : public distances_t { + public: + 
explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); + } + + ~dice_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/common.hpp b/cpp/src/distance/detail/sparse/common.hpp new file mode 100644 index 000000000..803dabe56 --- /dev/null +++ b/cpp/src/distance/detail/sparse/common.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +struct distances_config_t { + distances_config_t(raft::resources const& handle_) : handle(handle_) {} + + // left side + value_idx a_nrows; + value_idx a_ncols; + value_idx a_nnz; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; + + // right side + value_idx b_nrows; + value_idx b_ncols; + value_idx b_nnz; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; + + raft::resources const& handle; +}; + +template +class distances_t { + public: + virtual void compute(value_t* out) {} + virtual ~distances_t() = default; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv.cuh b/cpp/src/distance/detail/sparse/coo_spmv.cuh new file mode 100644 index 000000000..181b531f7 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv_strategies/dense_smem_strategy.cuh" +#include "coo_spmv_strategies/hash_strategy.cuh" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Performs generalized sparse-matrix-sparse-matrix multiplication via a + * sparse-matrix-sparse-vector layout `out=A*B` where generalized product() + * and sum() operations can be used in place of the standard sum and product: + * + * out_ij = sum_k(product(A_ik, B_ik)) The sum goes through values of + * k=0..n_cols-1 where B_kj is nonzero. + * + * The product and sum operations shall form a semiring algebra with the + * following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 1} is a product monoid with identity element 1 + * 3. Multiplication by 0 annihilates x. e.g. product(x, 0) = 0 + * + * Each vector of A is loaded into shared memory in dense form and the + * non-zeros of B load balanced across the threads of each block. + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n in row-major + * format. 
+ * @param[in] config_ distance config object + * @param[in] coo_rows_b coo row array for B + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + uint64_t n = (uint64_t)sizeof(value_t) * (uint64_t)config_.a_nrows * (uint64_t)config_.b_nrows; + RAFT_CUDA_TRY(cudaMemsetAsync(out_dists, 0, n, raft::resource::get_cuda_stream(config_.handle))); + + int max_cols = max_cols_per_block(); + + if (max_cols > config_.a_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + } +}; + +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); +}; + +/** + * Used for computing distances where the reduction (e.g. product()) function + * requires an implicit union (product(x, 0) = x) to capture the difference A-B. + * This is necessary in some applications because the standard semiring algebra + * endowed with the default multiplication product monoid will only + * compute the intersection & B-A. + * + * This particular function is meant to accompany the function + * `balanced_coo_pairwise_generalized_spmv` and executes the product operation + * on only those columns that exist in B and not A. + * + * The product and sum operations shall enable the computation of a + * non-annihilating semiring algebra with the following properties: + * 1. {+, 0} is a commutative sum reduction monoid with identity element 0 + * 2. {*, 0} is a product monoid with identity element 0 + * 3. Multiplication by 0 does not annihilate x. e.g. product(x, 0) = x + * + * Manattan distance sum(abs(x_k-y_k)) is a great example of when this type of + * execution pattern is necessary. 
+ * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam threads_per_block block size + * @tparam product_f semiring product() function + * @tparam accum_f semiring sum() function + * @tparam write_f atomic semiring sum() function + * @param[out] out_dists dense array of out distances of size m * n + * @param[in] config_ distance config object + * @param[in] coo_rows_a coo row array for A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + * @param[in] chunk_size number of nonzeros of B to process for each row of A + * this value was found through profiling and represents a reasonable + * setting for both large and small densities + */ +template +inline void balanced_coo_pairwise_generalized_spmv_rev( + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + // try dense first + int max_cols = max_cols_per_block(); + + if (max_cols > config_.b_ncols) { + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } else { + hash_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh new file mode 100644 index 000000000..1f4b19af4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ + return __ffs(peer_group) - 1; +} + +/** + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. 
+ * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +RAFT_KERNEL balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + typedef cub::WarpReduce warp_reduce; + + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; + + // chunk starting offset + value_idx ind_offset = cur_chunk_offset * chunk_size * tpb; + // how many total cols will be processed by this block (should be <= chunk_size * n_threads) + value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); + + int tid = threadIdx.x; + int warp_id = tid / raft::warp_size(); + + // compute id relative to current warp + unsigned int lane_id = tid & (raft::warp_size() - 1); + value_idx ind = ind_offset + threadIdx.x; + + extern __shared__ char smem[]; + + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); + + auto inserter = strategy.init_insert(A, dim); + + __syncthreads(); + + value_idx start_offset_a, stop_offset_a; + bool first_a_chunk, last_a_chunk; + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); + + // Convert current row vector in A to dense + for (int i = tid; i <= (stop_offset_a - 
start_offset_a); i += blockDim.x) { + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); + } + + __syncthreads(); + + auto finder = strategy.init_find(A, dim); + + if (cur_row_a > m || cur_chunk_offset > n_blocks_per_row) return; + if (ind >= nnz_b) return; + + value_idx start_index_a = 0, stop_index_a = b_ncols - 1; + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); + + value_idx cur_row_b = -1; + value_t c = 0.0; + + auto warp_red = warp_reduce(*(temp_storage + warp_id)); + + if (tid < active_chunk_size) { + cur_row_b = rowsB[ind]; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } + } + } + + // loop through chunks in parallel, reducing when a new row is + // encountered by each thread + for (int i = tid; i < active_chunk_size; i += blockDim.x) { + value_idx ind_next = ind + blockDim.x; + value_idx next_row_b = -1; + + if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; + + bool diff_rows = next_row_b != cur_row_b; + + if (__any_sync(0xffffffff, diff_rows)) { + // grab the threads currently participating in loops. + // because any other threads should have returned already. + unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + + // thread with lowest lane id among peers writes out + if (is_leader && v != 0.0) { + // this conditional should be uniform, since rev is constant + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; + write_func(out + idx, v); + } + + c = 0.0; + } + + if (next_row_b != -1) { + ind = ind_next; + + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + if (in_bounds) { + value_t a_col = strategy.find(finder, index_b); + + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } + } + + cur_row_b = next_row_b; + } + } +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh new file mode 100644 index 000000000..457b25eea --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/base_strategy.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../common.hpp" +#include "../coo_spmv_kernel.cuh" +#include "../utils.cuh" +#include "coo_mask_row_iterators.cuh" + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class coo_spmv_strategy { + public: + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { + smem = raft::getSharedMemPerBlock(); + } + + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); + } + + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + RAFT_CUDA_TRY(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); + + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); + } + + protected: + int smem; + const distances_config_t& config; +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh new file mode 100644 index 000000000..a9040e1d8 --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../common.hpp" +#include "../utils.cuh" + +#include // raft::ceildiv + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class mask_row_it { + public: + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + if (mask_row_idx != NULL) { + return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; + } else { + return blockIdx.x / n_blocks_nnz_b; + } + } + + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_offset = full_indptr[row_idx]; + stop_offset = full_indptr[row_idx + 1] - 1; + } + + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + // do nothing; + } + + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return true; + } + + const value_idx *full_indptr, &n_rows; + value_idx* mask_row_idx; +}; + +template +RAFT_KERNEL fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < n_rows) { + auto start = n_chunks_per_row[tid]; + auto end = n_chunks_per_row[tid + 1]; + +#pragma unroll + for (int i = start; i < end; i++) { + chunk_indices[i] = tid; + } + } +} + +template +class chunked_mask_row_it : public mask_row_it { + public: + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, + const cudaStream_t stream_) + : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), + row_chunk_size(row_chunk_size_), + n_chunks_per_row(n_chunks_per_row_), + chunk_indices(chunk_indices_), + stream(stream_) + { + } + + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto policy = rmm::exec_policy(stream); + + constexpr value_idx first_element = 0; + n_chunks_per_row.set_element_async(0, first_element, stream); + n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); + + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); + + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); + + fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); + } + + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { + return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; + } + + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + 
{ + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; + auto relative_chunk = chunk_index - prev_n_chunks; + first_a_chunk = relative_chunk == 0; + + start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; + stop_offset = start_offset + row_chunk_size; + + auto final_stop_offset = this->full_indptr[row_idx + 1]; + + last_a_chunk = stop_offset >= final_stop_offset; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + } + + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { + start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + } + + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { + return (index_b >= start_index_a && index_b <= stop_index_a); + } + + inline static value_idx total_row_blocks = 0; + const cudaStream_t stream; + const value_idx *n_chunks_per_row, *chunk_indices; + value_idx row_chunk_size; + + struct n_chunks_per_row_functor { + public: + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } + + __host__ __device__ value_idx operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + return raft::ceildiv(degree, (value_idx)row_chunk_size); + } + + const value_idx* indptr; + value_idx row_chunk_size; + }; + + private: + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { + auto n_threads = std::min(n_rows, 256); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + + chunk_indices.resize(total_row_blocks, stream); + + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); + } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh new file mode 100644 index 000000000..baa913a6c --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/dense_smem_strategy.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "base_strategy.cuh" + +#include // raft::ceildiv + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class dense_smem_strategy : public coo_spmv_strategy { + public: + using smem_type = value_t*; + using insert_type = smem_type; + using find_type = smem_type; + + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } + + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); + } + + template + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); + } + + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { + cache[k] = 0.0; + } + return cache; + } + + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + cache[key] = value; + } + + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return cache; + } + + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } +}; + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh new file mode 100644 index 000000000..cf212076b --- /dev/null +++ b/cpp/src/distance/detail/sparse/coo_spmv_strategies/hash_strategy.cuh @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "base_strategy.cuh" + +#include +#include + +#include +#include +#include + +// this is needed by cuco as key, value must be bitwise comparable. 
+// compilers don't declare float/double as bitwise comparable +// but that is too strict +// for example, the following is true (or 0): +// float a = 5; +// float b = 5; +// memcmp(&a, &b, sizeof(float)); +CUCO_DECLARE_BITWISE_COMPARABLE(float); +CUCO_DECLARE_BITWISE_COMPARABLE(double); + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class hash_strategy : public coo_spmv_strategy { + public: + using insert_type = typename cuco::legacy:: + static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; + using find_type = + typename cuco::legacy::static_map::device_view; + + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) + : coo_spmv_strategy(config_), + capacity_threshold(capacity_threshold_), + map_size(map_size_) + { + } + + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { + auto policy = raft::resource::get_thrust_policy(this->config.handle); + + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + std::get<0>(n_rows_divided) = less - mask_indptr.data(); + + auto more = thrust::copy_if( + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); + std::get<1>(n_rows_divided) = more - less; + } + + template + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.a_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + 
n_more_blocks, + n_blocks_per_row); + } + } + + template + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); + rmm::device_uvector mask_indptr( + this->config.b_nrows, raft::resource::get_cuda_stream(this->config.handle)); + std::tuple n_rows_divided; + + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + raft::resource::get_cuda_stream(this->config.handle)); + + auto less_rows = std::get<0>(n_rows_divided); + if (less_rows > 0) { + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); + + auto n_less_blocks = less_rows * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); + } + + auto more_rows = std::get<1>(n_rows_divided); + if (more_rows > 0) { + rmm::device_uvector n_chunks_per_row( + more_rows + 1, raft::resource::get_cuda_stream(this->config.handle)); + rmm::device_uvector chunk_indices( + 0, raft::resource::get_cuda_stream(this->config.handle)); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + raft::resource::get_cuda_stream(this->config.handle)); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + raft::resource::get_cuda_stream(this->config.handle)); + + auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); + } + } + + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { + return insert_type::make_from_uninitialized_slots(cooperative_groups::this_thread_block(), + cache, + cache_size, + cuco::empty_key{value_idx{-1}}, + cuco::empty_value{value_t{0}}); + } + + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { + auto success = cache.insert(cuco::pair(key, value)); + } + + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { + return find_type( + cache, cache_size, cuco::empty_key{value_idx{-1}}, cuco::empty_value{value_t{0}}); + } + + __device__ inline value_t find(find_type cache, const value_idx& key) + { + auto a_pair = cache.find(key); + + value_t a_col = 0.0; + if (a_pair != cache.end()) { a_col = a_pair->second; } + return a_col; + } + + struct fits_in_hash_table { + public: + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } + + __host__ __device__ bool operator()(const value_idx& i) + { + auto degree = indptr[i + 1] - indptr[i]; + + return degree >= degree_l && degree < degree_r; + } + + private: + const value_idx* indptr; + const value_idx degree_l, degree_r; + }; + + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(typename insert_type::slot_type); + } + + private: + float capacity_threshold; + int map_size; +}; + +} // namespace sparse +} // 
namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/sparse/ip_distance.cuh b/cpp/src/distance/detail/sparse/ip_distance.cuh new file mode 100644 index 000000000..3a11d4e99 --- /dev/null +++ b/cpp/src/distance/detail/sparse/ip_distance.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" +#include "coo_spmv.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +class ip_distances_t : public distances_t { + public: + /** + * Computes simple sparse inner product distances as sum(x_k * y_k) + * @param[in] config specifies inputs, outputs, and sizes + */ + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, raft::resource::get_cuda_stream(config.handle)) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + } + + /** + * Performs pairwise distance computation and computes output distances + * @param out_distances dense output matrix (size a_nrows * b_nrows) + */ + void compute(value_t* out_distances) + { + /** + * Compute pairwise distances and return dense matrix in row-major format + */ + balanced_coo_pairwise_generalized_spmv(out_distances, + *config_, + coo_rows_b.data(), + raft::mul_op(), + raft::add_op(), + raft::atomic_add_op()); + } + + value_idx* b_rows_coo() { return coo_rows_b.data(); } + + value_t* b_data_coo() { return config_->b_data; } + + private: + const distances_config_t* config_; + rmm::device_uvector coo_rows_b; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/l2_distance.cuh b/cpp/src/distance/detail/sparse/l2_distance.cuh new file mode 100644 index 000000000..40e7070fc --- /dev/null +++ b/cpp/src/distance/detail/sparse/l2_distance.cuh @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include "common.hpp" +#include "ip_distance.cuh" +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +// @TODO: Move this into sparse prims (coo_norm) +template +RAFT_KERNEL compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } +} + +template +RAFT_KERNEL compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ + value_idx i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } +} + +template +RAFT_KERNEL compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + + // e.g. Euclidean expansion func = -2.0 * dot + q_norm + r_norm + value_t val = expansion_func(dot, Q_sq_norms[i], R_sq_norms[j]); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +template +RAFT_KERNEL compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ + std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; + + if (i >= n_rows || j >= n_cols) return; + + value_t dot = C[(size_t)i * n_cols + j]; + value_t Q_l1 = Q_norms[i]; + value_t R_l1 = R_norms[j]; + + value_t Q_l2 = Q_sq_norms[i]; + value_t R_l2 = R_sq_norms[j]; + + value_t numer = n * dot - (Q_l1 * R_l1); + value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); + value_t R_denom = n * R_l2 - (R_l1 * R_l1); + + value_t val = 1 - (numer / raft::sqrt(Q_denom * R_denom)); + + // correct for small instabilities + C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); +} + +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_euclidean_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); +} + +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( 
+ R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); +} + +template +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ + int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); + compute_correlation_warp_kernel<<>>( + C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); +} + +template +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, + cudaStream_t stream) +{ + // sum_sq for std dev + rmm::device_uvector Q_sq_norms(m, stream); + rmm::device_uvector R_sq_norms(n, stream); + + // sum for mean + rmm::device_uvector Q_norms(m, stream); + rmm::device_uvector R_norms(n, stream); + + RAFT_CUDA_TRY(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + + RAFT_CUDA_TRY(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + RAFT_CUDA_TRY(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + + compute_row_norm_kernel<<>>( + Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_norm_kernel<<>>( + R_sq_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_row_sum_kernel<<>>( + Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); + compute_row_sum_kernel<<>>( + R_norms.data(), R_coo_rows, R_data, R_nnz); + + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); +} + +/** + * L2 distance using the expanded form: sum(x_k^2) + sum(y_k^2) - 2 * sum(x_k * y_k) + * The expanded form is more efficient for sparse data. + */ +template +class l2_expanded_distances_t : public distances_t { + public: + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); + } + + ~l2_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * L2 sqrt distance performing the sqrt operation after the distance computation. + * The expanded form is more efficient for sparse data.
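For reference, the standard identity behind this expanded form, which is why the expansion_func passed to compute_l2 above reduces to -2 * dot + q_norm + r_norm:

  sum((x_k - y_k)^2) = sum(x_k^2) - 2 * sum(x_k * y_k) + sum(y_k^2)
                     = q_norm - 2 * dot + r_norm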
+ */ +template +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { + public: + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } + + void compute(value_t* out_dists) override + { + l2_expanded_distances_t::compute(out_dists); + // Sqrt Post-processing + raft::linalg::unaryOp( + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, + [] __device__(value_t input) { + int neg = input < 0 ? -1 : 1; + return raft::sqrt(abs(input) * neg); + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } + + ~l2_sqrt_expanded_distances_t() = default; +}; + +template +class correlation_expanded_distances_t : public distances_t { + public: + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~correlation_expanded_distances_t() = default; + + protected: + const distances_config_t* config_; + ip_distances_t ip_dists; +}; + +/** + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k^2)) * + * sqrt(sum(y_k^2)))). The expanded form is more efficient for sparse data. + */ +template +class cosine_expanded_distances_t : public distances_t { + public: + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); + + rmm::device_uvector search_coo_rows( + config_->a_nnz, raft::resource::get_cuda_stream(config_->handle)); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + raft::resource::get_cuda_stream(config_->handle), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = raft::sqrt(q_norm) * raft::sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); + } + + ~cosine_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +/** + * Hellinger distance using the expanded form: sqrt(1 - sum(sqrt(x_k) * sqrt(y_k))) + * The expanded form is more efficient for sparse data.
+ * + * This distance computation modifies A and B by computing a sqrt + * and then performing a `pow(x, 2)` to convert it back. Because of this, + * it is possible that the values in A and B might differ slightly + * after this is invoked. + */ +template +class hellinger_expanded_distances_t : public distances_t { + public: + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, raft::resource::get_cuda_stream(config.handle)) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return raft::sqrt(a) * raft::sqrt(b); }, + raft::add_op(), + raft::atomic_add_op()); + + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + bool rectifier = (1 - input) > 0; + return raft::sqrt(rectifier * (1 - input)); + }, + raft::resource::get_cuda_stream(config_->handle)); + } + + ~hellinger_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; +}; + +template +class russelrao_expanded_distances_t : public distances_t { + public: + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), + workspace(0, raft::resource::get_cuda_stream(config.handle)), + ip_dists(config) + { + } + + void compute(value_t* out_dists) + { + ip_dists.compute(out_dists); + + value_t n_cols = config_->a_ncols; + value_t n_cols_inv = 1.0 / n_cols; + raft::linalg::unaryOp( + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, + [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, + raft::resource::get_cuda_stream(config_->handle)); + + auto exec_policy = rmm::exec_policy(raft::resource::get_cuda_stream(config_->handle)); + auto diags = thrust::counting_iterator(0); + value_idx b_nrows = config_->b_nrows; + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); + } + + ~russelrao_expanded_distances_t() = default; + + private: + const distances_config_t* config_; + rmm::device_uvector workspace; + ip_distances_t ip_dists; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/lp_distance.cuh b/cpp/src/distance/detail/sparse/lp_distance.cuh new file mode 100644 index 000000000..18e7b04e4 --- /dev/null +++ b/cpp/src/distance/detail/sparse/lp_distance.cuh @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); + + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv_rev( + out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); +} + +/** + * Computes L1 distances for sparse input. This does not have + * an equivalent expanded form, so it is only executed in + * an unexpanded form. + * @tparam value_idx + * @tparam value_t + */ +template +class l1_unexpanded_distances_t : public distances_t { + public: + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class l2_unexpanded_distances_t : public distances_t { + public: + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::sqdiff_op(), raft::add_op(), raft::atomic_add_op()); + } + + protected: + const distances_config_t* config_; +}; + +template +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { + public: + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } + + void compute(value_t* out_dists) + { + l2_unexpanded_distances_t::compute(out_dists); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + // Sqrt Post-processing + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [] __device__(value_t input) { + int neg = input < 0 ? 
-1 : 1; + return raft::sqrt(abs(input) * neg); + }, + raft::resource::get_cuda_stream(this->config_->handle)); + } +}; + +template +class linf_unexpanded_distances_t : public distances_t { + public: + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::absdiff_op(), raft::max_op(), raft::atomic_max_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class canberra_unexpanded_distances_t : public distances_t { + public: + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t d = fabs(a) + fabs(b); + + // deal with potential for 0 in denominator by + // forcing 1/0 instead + return ((d != 0) * fabs(a - b)) / (d + (d == 0)); + }, + raft::add_op(), + raft::atomic_add_op()); + } + + private: + const distances_config_t* config_; +}; + +template +class lp_unexpanded_distances_t : public distances_t { + public: + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + raft::compose_op(raft::pow_const_op(p), raft::sub_op()), + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + value_t one_over_p = value_t{1} / p; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::pow_const_op(one_over_p), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; + value_t p; +}; + +template +class hamming_unexpanded_distances_t : public distances_t { + public: + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, config_, raft::notequal_op(), raft::add_op(), raft::atomic_add_op()); + + uint64_t n = (uint64_t)config_->a_nrows * (uint64_t)config_->b_nrows; + value_t n_cols = 1.0 / config_->a_ncols; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(n_cols), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class jensen_shannon_unexpanded_distances_t : public distances_t { + public: + explicit jensen_shannon_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + unexpanded_lp_distances( + out_dists, + config_, + [] __device__(value_t a, value_t b) { + value_t m = 0.5f * (a + b); + bool a_zero = a == 0; + bool b_zero = b == 0; + + value_t x = (!a_zero * m) / (a_zero + a); + value_t y = (!b_zero * m) / (b_zero + b); + + bool x_zero = x == 0; + bool y_zero = y == 0; + + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); + }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp( + out_dists, + out_dists, + n, + [=] __device__(value_t input) { return raft::sqrt(0.5 * input); }, + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +template +class kl_divergence_unexpanded_distances_t : public 
distances_t { + public: + explicit kl_divergence_unexpanded_distances_t( + const distances_config_t& config) + : config_(&config) + { + } + + void compute(value_t* out_dists) + { + rmm::device_uvector coo_rows(std::max(config_->b_nnz, config_->a_nnz), + raft::resource::get_cuda_stream(config_->handle)); + + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, + raft::resource::get_cuda_stream(config_->handle)); + + balanced_coo_pairwise_generalized_spmv( + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + raft::add_op(), + raft::atomic_add_op()); + + uint64_t n = (uint64_t)this->config_->a_nrows * (uint64_t)this->config_->b_nrows; + raft::linalg::unaryOp(out_dists, + out_dists, + n, + raft::mul_const_op(0.5), + raft::resource::get_cuda_stream(config_->handle)); + } + + private: + const distances_config_t* config_; +}; + +} // END namespace sparse +} // END namespace detail +} // END namespace distance +} // END namespace cuvs diff --git a/cpp/src/distance/detail/sparse/utils.cuh b/cpp/src/distance/detail/sparse/utils.cuh new file mode 100644 index 000000000..dc7ae6df6 --- /dev/null +++ b/cpp/src/distance/detail/sparse/utils.cuh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { +namespace sparse { + +/** + * Computes the maximum number of columns that can be stored + * in shared memory in dense form with the given block size + * and precision. 
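As an illustrative calculation (assumed numbers; the actual budget is queried at runtime via raft::getSharedMemPerBlock()): with 48 KiB of shared memory per block, tpb = 1024, a warp size of 32 and value_t = float, the cub reduction scratch occupies (1024 / 32) * 4 = 128 bytes, leaving (49152 - 128) / 4 = 12256 columns that can be cached densely.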
+ * @return the maximum number of columns that can be stored in smem + */ +template +inline int max_cols_per_block() +{ + // max cols = (total smem available - cub reduction smem) + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + sizeof(value_t); +} + +template +RAFT_KERNEL faster_dot_on_csr_kernel(dot_t* __restrict__ dot, + const value_idx* __restrict__ indptr, + const value_idx* __restrict__ cols, + const value_t* __restrict__ A, + const value_t* __restrict__ B, + const value_idx nnz, + const value_idx n_rows, + const value_idx dim) +{ + auto vec_id = threadIdx.x; + auto lane_id = threadIdx.x & 0x1f; + + extern __shared__ char smem[]; + value_t* s_A = (value_t*)smem; + value_idx cur_row = -1; + + for (int row = blockIdx.x; row < n_rows; row += gridDim.x) { + for (int dot_id = blockIdx.y + indptr[row]; dot_id < indptr[row + 1]; dot_id += gridDim.y) { + if (dot_id >= nnz) { return; } + const value_idx col = cols[dot_id] * dim; + const value_t* __restrict__ B_col = B + col; + + if (threadIdx.x == 0) { dot[dot_id] = 0.0; } + __syncthreads(); + + if (cur_row != row) { + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + s_A[k] = A[row * dim + k]; + } + cur_row = row; + } + + dot_t l_dot_ = 0.0; + for (value_idx k = vec_id; k < dim; k += blockDim.x) { + asm("prefetch.global.L2 [%0];" ::"l"(B_col + k + blockDim.x)); + if constexpr ((std::is_same_v && std::is_same_v)) { + l_dot_ += __half2float(s_A[k]) * __half2float(__ldcg(B_col + k)); + } else { + l_dot_ += s_A[k] * __ldcg(B_col + k); + } + } + + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + dot_t warp_sum = WarpReduce(temp_storage).Sum(l_dot_); + + if (lane_id == 0) { atomicAdd_block(dot + dot_id, warp_sum); } + } + } +} + +template +void faster_dot_on_csr(raft::resources const& handle, + dot_t* dot, + const value_idx nnz, + const value_idx* indptr, + const value_idx* cols, + const value_t* A, + const value_t* B, + const value_idx n_rows, + const value_idx dim) +{ + if (nnz == 0 || n_rows == 0) return; + + auto stream = raft::resource::get_cuda_stream(handle); + + constexpr value_idx MAX_ROW_PER_ITER = 500; + int dev_id, sm_count, blocks_per_sm; + + const int smem_size = dim * sizeof(value_t); + cudaGetDevice(&dev_id); + cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); + + if (dim < 128) { + constexpr int tpb = 64; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + + } else if (dim < 256) { + constexpr int tpb = 128; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else if (dim < 512) { + constexpr int tpb = 256; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / 
block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } else { + constexpr int tpb = 512; + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks_per_sm, faster_dot_on_csr_kernel, tpb, smem_size); + auto block_x = std::min(n_rows, MAX_ROW_PER_ITER); + auto block_y = + (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x; + dim3 blocks(block_x, block_y, 1); + + faster_dot_on_csr_kernel + <<>>(dot, indptr, cols, A, B, nnz, n_rows, dim); + } + + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace sparse +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/sparse_distance.cu b/cpp/src/distance/sparse_distance.cu new file mode 100644 index 000000000..338c4e908 --- /dev/null +++ b/cpp/src/distance/sparse_distance.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "sparse_distance.cuh" + +namespace cuvs { +namespace distance { + +template +void pairwise_distance( + raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg = 2.0f) +{ + auto x_structure = x.structure_view(); + auto y_structure = y.structure_view(); + + RAFT_EXPECTS(x_structure.get_n_cols() == y_structure.get_n_cols(), + "Number of columns must be equal"); + + RAFT_EXPECTS(dist.extent(0) == x_structure.get_n_rows(), + "Number of rows in output must be equal to " + "number of rows in X"); + RAFT_EXPECTS(dist.extent(1) == y_structure.get_n_rows(), + "Number of columns in output must be equal to " + "number of rows in Y"); + + detail::sparse::distances_config_t input_config(handle); + input_config.a_nrows = x_structure.get_n_rows(); + input_config.a_ncols = x_structure.get_n_cols(); + input_config.a_nnz = x_structure.get_nnz(); + input_config.a_indptr = const_cast(x_structure.get_indptr().data()); + input_config.a_indices = const_cast(x_structure.get_indices().data()); + input_config.a_data = const_cast(x.get_elements().data()); + + input_config.b_nrows = y_structure.get_n_rows(); + input_config.b_ncols = y_structure.get_n_cols(); + input_config.b_nnz = y_structure.get_nnz(); + input_config.b_indptr = const_cast(y_structure.get_indptr().data()); + input_config.b_indices = const_cast(y_structure.get_indices().data()); + input_config.b_data = const_cast(y.get_elements().data()); + + pairwiseDistance(dist.data_handle(), input_config, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} + +void pairwise_distance(raft::resources const& handle, + raft::device_csr_matrix_view x, + 
raft::device_csr_matrix_view y, + raft::device_matrix_view dist, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + pairwise_distance(handle, x, y, dist, metric, metric_arg); +} +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/sparse_distance.cuh b/cpp/src/distance/sparse_distance.cuh new file mode 100644 index 000000000..0d6dc0e6f --- /dev/null +++ b/cpp/src/distance/sparse_distance.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/sparse/bin_distance.cuh" +#include "detail/sparse/common.hpp" +#include "detail/sparse/ip_distance.cuh" +#include "detail/sparse/l2_distance.cuh" +#include "detail/sparse/lp_distance.cuh" + +#include + +#include + +#include + +namespace cuvs { +namespace distance { +/** + * Compute pairwise distances between A and B, using the provided + * input configuration and distance function. + * + * @tparam value_idx index type + * @tparam value_t value type + * @param[out] out dense output array (size A.nrows * B.nrows) + * @param[in] input_config input argument configuration + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwiseDistance(value_t* out, + detail::sparse::distances_config_t input_config, + cuvs::distance::DistanceType metric, + float metric_arg) +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + detail::sparse::l2_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtExpanded: + detail::sparse::l2_sqrt_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::InnerProduct: + detail::sparse::ip_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2Unexpanded: + detail::sparse::l2_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L2SqrtUnexpanded: + detail::sparse::l2_sqrt_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::L1: + detail::sparse::l1_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::LpUnexpanded: + detail::sparse::lp_unexpanded_distances_t(input_config, metric_arg) + .compute(out); + break; + case cuvs::distance::DistanceType::Linf: + detail::sparse::linf_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::Canberra: + detail::sparse::canberra_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::JaccardExpanded: + detail::sparse::jaccard_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CosineExpanded: + detail::sparse::cosine_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::HellingerExpanded: + 
detail::sparse::hellinger_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::DiceExpanded: + detail::sparse::dice_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::CorrelationExpanded: + detail::sparse::correlation_expanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::RusselRaoExpanded: + detail::sparse::russelrao_expanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::HammingUnexpanded: + detail::sparse::hamming_unexpanded_distances_t(input_config).compute(out); + break; + case cuvs::distance::DistanceType::JensenShannon: + detail::sparse::jensen_shannon_unexpanded_distances_t(input_config) + .compute(out); + break; + case cuvs::distance::DistanceType::KLDivergence: + detail::sparse::kl_divergence_unexpanded_distances_t(input_config) + .compute(out); + break; + + default: THROW("Unsupported distance: %d", metric); + } +} +}; // namespace distance +}; // namespace cuvs diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu index b0f87e9ac..d534676e3 100644 --- a/cpp/src/neighbors/brute_force.cu +++ b/cpp/src/neighbors/brute_force.cu @@ -21,6 +21,21 @@ #include namespace cuvs::neighbors::brute_force { + +template +index::index(raft::resources const& res) + // this constructor is just for a temporary index, for use in the deserialization + // api. all the parameters here will get replaced with loaded values - that aren't + // necessarily known ahead of time before deserialization. + // TODO: do we even need a handle here - could just construct one? + : cuvs::neighbors::index(), + metric_(cuvs::distance::DistanceType::L2Expanded), + dataset_(raft::make_device_matrix(res, 0, 0)), + norms_(std::nullopt), + metric_arg_(0) +{ +} + template index::index(raft::resources const& res, raft::host_matrix_view dataset, diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp index eda79aa31..f1a8c995d 100644 --- a/cpp/src/neighbors/brute_force_c.cpp +++ b/cpp/src/neighbors/brute_force_c.cpp @@ -17,10 +17,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -91,6 +93,22 @@ void _search(cuvsResources_t res, } } +template +void _serialize(cuvsResources_t res, const char* filename, cuvsBruteForceIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + cuvs::neighbors::brute_force::serialize(*res_ptr, std::string(filename), *index_ptr); +} + +template +void* _deserialize(cuvsResources_t res, const char* filename) +{ + auto res_ptr = reinterpret_cast(res); + auto index = new cuvs::neighbors::brute_force::index(*res_ptr); + cuvs::neighbors::brute_force::deserialize(*res_ptr, std::string(filename), index); + return index; +} } // namespace extern "C" cuvsError_t cuvsBruteForceIndexCreate(cuvsBruteForceIndex_t* index) @@ -129,7 +147,7 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res, if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) { index->addr = reinterpret_cast(_build(res, dataset_tensor, metric, metric_arg)); - index->dtype.code = kDLFloat; + index->dtype = dataset.dtype; } else { RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", dataset.dtype.code, @@ -174,3 +192,38 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res, } }); } + +extern "C" cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char* filename, + 
cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + // read the numpy dtype from the beginning of the file + std::ifstream is(filename, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", filename); } + char dtype_string[4]; + is.read(dtype_string, 4); + auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4)); + + index->dtype.bits = dtype.itemsize * 8; + if (dtype.kind == 'f' && dtype.itemsize == 4) { + index->dtype.code = kDLFloat; + index->addr = reinterpret_cast(_deserialize(res, filename)); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} + +extern "C" cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char* filename, + cuvsBruteForceIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { + _serialize(res, filename, *index); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} \ No newline at end of file diff --git a/cpp/src/neighbors/brute_force_serialize.cu b/cpp/src/neighbors/brute_force_serialize.cu new file mode 100644 index 000000000..1b5b5111e --- /dev/null +++ b/cpp/src/neighbors/brute_force_serialize.cu @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::brute_force { + +int constexpr serialization_version = 0; + +template +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset = true) +{ + RAFT_LOG_DEBUG( + "Saving brute force index, size %zu, dim %u", static_cast(index.size()), index.dim()); + + auto dtype_string = raft::detail::numpy_serializer::get_numpy_dtype().to_string(); + dtype_string.resize(4); + os << dtype_string; + + raft::serialize_scalar(handle, os, serialization_version); + raft::serialize_scalar(handle, os, index.size()); + raft::serialize_scalar(handle, os, index.dim()); + raft::serialize_scalar(handle, os, index.metric()); + raft::serialize_scalar(handle, os, index.metric_arg()); + raft::serialize_scalar(handle, os, include_dataset); + if (include_dataset) { raft::serialize_mdspan(handle, os, index.dataset()); } + auto has_norms = index.has_norms(); + raft::serialize_scalar(handle, os, has_norms); + if (has_norms) { raft::serialize_mdspan(handle, os, index.norms()); } + raft::resource::sync_stream(handle); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + const std::string& filename, + const index& index, + bool include_dataset) +{ + auto os = std::ofstream{filename, std::ios::out | std::ios::binary}; + RAFT_EXPECTS(os, "Cannot open file %s", filename.c_str()); + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +void serialize(raft::resources const& handle, + std::ostream& os, + const index& index, + bool include_dataset) +{ + serialize(handle, os, index, include_dataset); +} + +template +auto deserialize(raft::resources const& handle, std::istream& is) +{ + auto dtype_string = std::array{}; + is.read(dtype_string.data(), 4); + + auto ver = raft::deserialize_scalar(handle, is); + if (ver != serialization_version) { + RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver); + } + std::int64_t rows = raft::deserialize_scalar(handle, is); + std::int64_t dim = raft::deserialize_scalar(handle, is); + auto metric = raft::deserialize_scalar(handle, is); + auto metric_arg = raft::deserialize_scalar(handle, is); + + auto dataset_storage = raft::make_host_matrix(std::int64_t{}, std::int64_t{}); + auto include_dataset = raft::deserialize_scalar(handle, is); + if (include_dataset) { + dataset_storage = raft::make_host_matrix(rows, dim); + raft::deserialize_mdspan(handle, is, dataset_storage.view()); + } + + auto has_norms = raft::deserialize_scalar(handle, is); + auto norms_storage = has_norms ? std::optional{raft::make_host_vector(rows)} + : std::optional>{}; + // TODO(wphicks): Use mdbuffer here when available + auto norms_storage_dev = + has_norms ? 
std::optional{raft::make_device_vector(handle, rows)} + : std::optional>{}; + if (has_norms) { + raft::deserialize_mdspan(handle, is, norms_storage->view()); + raft::copy(handle, norms_storage_dev->view(), norms_storage->view()); + } + + auto result = index(handle, + raft::make_const_mdspan(dataset_storage.view()), + std::move(norms_storage_dev), + metric, + metric_arg); + raft::resource::sync_stream(handle); + + return result; +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::brute_force::index* index) +{ + auto is = std::ifstream{filename, std::ios::in | std::ios::binary}; + RAFT_EXPECTS(is, "Cannot open file %s", filename.c_str()); + + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +void deserialize(raft::resources const& handle, + std::istream& is, + cuvs::neighbors::brute_force::index* index) +{ + *index = deserialize(handle, is); +} + +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp index 6985ff094..326a89665 100644 --- a/cpp/src/neighbors/cagra_c.cpp +++ b/cpp/src/neighbors/cagra_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 9e4d453e3..b7fec724b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -33,8 +33,7 @@ #include #include -// TODO: Fixme- this needs to be migrated -#include "../../nn_descent.cuh" +#include // TODO: This shouldn't be calling spatial/knn APIs #include "../ann_utils.cuh" @@ -356,8 +355,8 @@ void build_knn_graph( raft::host_matrix_view knn_graph, cuvs::neighbors::nn_descent::index_params build_params) { - auto nn_descent_idx = cuvs::neighbors::nn_descent::index(res, knn_graph); - cuvs::neighbors::nn_descent::build(res, build_params, dataset, nn_descent_idx); + std::optional> graph_view = knn_graph; + auto nn_descent_idx = cuvs::neighbors::nn_descent::build(res, build_params, dataset, graph_view); using internal_IdxT = typename std::make_unsigned::type; using g_accessor = typename decltype(nn_descent_idx.graph())::accessor_type; @@ -437,11 +436,11 @@ index build( auto knn_build_params = params.graph_build_params; if (std::holds_alternative(params.graph_build_params)) { // Heuristic to decide default build algo and its params. 
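The hunk just below drops the L2Expanded-only restriction from this heuristic and forwards the chosen metric into the NN-descent parameters. As a point of reference, here is a minimal sketch (not part of the patch; parameter values are illustrative, and a raft::resources res plus a device dataset view are assumed to be in scope) of pinning the graph build algorithm explicitly instead of relying on the heuristic:

// Illustrative only: request a metric-aware NN-descent graph build up front
// rather than letting the heuristic choose between NN-descent and IVF-PQ.
cuvs::neighbors::cagra::index_params params;
params.metric                    = cuvs::distance::DistanceType::InnerProduct;
params.intermediate_graph_degree = 128;
params.graph_degree              = 64;
params.graph_build_params        = cuvs::neighbors::cagra::graph_build_params::nn_descent_params(
  params.intermediate_graph_degree, params.metric);
auto index = cuvs::neighbors::cagra::build(res, params, dataset);
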
- if (params.metric == cuvs::distance::DistanceType::L2Expanded && - cuvs::neighbors::nn_descent::has_enough_device_memory( + if (cuvs::neighbors::nn_descent::has_enough_device_memory( res, dataset.extents(), sizeof(IdxT))) { RAFT_LOG_DEBUG("NN descent solver"); - knn_build_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + knn_build_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); @@ -454,9 +453,6 @@ index build( std::get(knn_build_params); build_knn_graph(res, dataset, knn_graph->view(), ivf_pq_params); } else { - RAFT_EXPECTS( - params.metric == cuvs::distance::DistanceType::L2Expanded, - "L2Expanded is the only distance metrics supported for CAGRA build with nn_descent"); auto nn_descent_params = std::get(knn_build_params); @@ -467,10 +463,12 @@ index build( "nn-descent graph_degree.", nn_descent_params.graph_degree, intermediate_degree); - nn_descent_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + nn_descent_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } // Use nn-descent to build CAGRA knn graph + nn_descent_params.return_distances = false; build_knn_graph(res, dataset, knn_graph->view(), nn_descent_params); } diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index 95c158675..5778d85a6 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -151,7 +151,7 @@ void search_main(raft::resources const& res, if (auto* strided_dset = dynamic_cast*>(&index.data()); strided_dset != nullptr) { // Search using a plain (strided) row-major dataset - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *strided_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); @@ -161,7 +161,7 @@ void search_main(raft::resources const& res, RAFT_FAIL("FP32 VPQ dataset support is coming soon"); } else if (auto* vpq_dset = dynamic_cast*>(&index.data()); vpq_dset != nullptr) { - auto& desc = dataset_descriptor_init_with_cache( + auto desc = dataset_descriptor_init_with_cache( res, params, *vpq_dset, index.metric()); search_main_core( res, params, desc, graph_internal, queries, neighbors, distances, sample_filter); diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp index 297eb1f55..7eb798459 100644 --- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp +++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp @@ -31,8 +31,10 @@ #include #include +#include #include #include +#include #include #include @@ -232,52 +234,77 @@ struct alignas(device::LOAD_128BIT_T) dataset_descriptor_base_t { */ template struct dataset_descriptor_host { - using dev_descriptor_t = dataset_descriptor_base_t; - using dd_ptr_t = std::shared_ptr; - using init_f = - std::tuple, size_t>; + using dev_descriptor_t = dataset_descriptor_base_t; uint32_t smem_ws_size_in_bytes = 0; uint32_t team_size = 0; + struct state { + using ready_t = std::tuple; + using init_f = + std::tuple, size_t>; + + std::mutex mutex; + std::atomic ready; // Not sure if std::holds_alternative is thread-safe + std::variant value; + + template + state(InitF init, 
size_t size) : ready{false}, value{std::make_tuple(init, size)} + { + } + + ~state() noexcept + { + if (std::holds_alternative(value)) { + auto& [ptr, stream] = std::get(value); + RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(ptr, stream)); + } + } + + void eval(rmm::cuda_stream_view stream) + { + std::lock_guard lock(mutex); + if (std::holds_alternative(value)) { + auto& [fun, size] = std::get(value); + dev_descriptor_t* ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&ptr, size, stream)); + fun(ptr, stream); + value = std::make_tuple(ptr, stream); + ready.store(true, std::memory_order_release); + } + } + + auto get(rmm::cuda_stream_view stream) -> dev_descriptor_t* + { + if (!ready.load(std::memory_order_acquire)) { eval(stream); } + return std::get<0>(std::get(value)); + } + }; + template dataset_descriptor_host(const DescriptorImpl& dd_host, InitF init) - : value_{std::make_tuple(init, sizeof(DescriptorImpl))}, + : value_{std::make_shared(init, sizeof(DescriptorImpl))}, smem_ws_size_in_bytes{dd_host.smem_ws_size_in_bytes()}, team_size{dd_host.team_size()} { } + dataset_descriptor_host() = default; + /** * Return the device pointer, possibly evaluating it in the given thread. */ [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) const -> const dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } + [[nodiscard]] auto dev_ptr(rmm::cuda_stream_view stream) -> dev_descriptor_t* { - if (std::holds_alternative(value_)) { value_ = eval(std::get(value_), stream); } - return std::get(value_).get(); + return value_->get(stream); } private: - mutable std::variant value_; - - static auto eval(init_f init, rmm::cuda_stream_view stream) -> dd_ptr_t - { - using raft::RAFT_NAME; - auto& [fun, size] = init; - dd_ptr_t dev_ptr{ - [stream, s = size]() { - dev_descriptor_t* p; - RAFT_CUDA_TRY(cudaMallocAsync(&p, s, stream)); - return p; - }(), - [stream](dev_descriptor_t* p) { RAFT_CUDA_TRY_NO_THROW(cudaFreeAsync(p, stream)); }}; - fun(dev_ptr.get(), stream); - return dev_ptr; - } + mutable std::shared_ptr value_; }; /** diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index abc907da5..e6e7ff64f 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -135,11 +135,9 @@ template struct store { /** Number of descriptors to cache. 
*/ static constexpr size_t kDefaultSize = 100; - raft::cache::lru, - std::shared_ptr>> - value{kDefaultSize}; + raft::cache:: + lru, dataset_descriptor_host> + value{kDefaultSize}; }; } // namespace descriptor_cache @@ -159,20 +157,18 @@ auto dataset_descriptor_init_with_cache(const raft::resources& res, const cagra::search_params& params, const DatasetT& dataset, cuvs::distance::DistanceType metric) - -> const dataset_descriptor_host& + -> dataset_descriptor_host { - using desc_t = dataset_descriptor_host; - auto key = descriptor_cache::make_key(params, dataset, metric); + auto key = descriptor_cache::make_key(params, dataset, metric); auto& cache = raft::resource::get_custom_resource>(res) ->value; - std::shared_ptr desc{nullptr}; + dataset_descriptor_host desc; if (!cache.get(key, &desc)) { - desc = std::make_shared( - std::move(dataset_descriptor_init(params, dataset, metric))); + desc = dataset_descriptor_init(params, dataset, metric); cache.set(key, desc); } - return *desc; + return desc; } }; // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 4253cb781..daeac82b9 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -156,6 +156,7 @@ __global__ void kern_prune(const IdxT* const knn_graph, // [graph_chunk_size, g // count number of detours (A->D->B) for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) { const uint64_t iD = knn_graph[kAD + (graph_degree * iA)]; + if (iD >= graph_size) { continue; } for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) { const uint64_t iB_candidate = knn_graph[kDB + ((uint64_t)graph_degree * iD)]; for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) { diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh index 0003f2495..ecfd856f1 100644 --- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh @@ -93,10 +93,10 @@ struct search : public search_plan_impl intermediate_indices; - rmm::device_uvector intermediate_distances; + lightweight_uvector intermediate_indices; + lightweight_uvector intermediate_distances; size_t topk_workspace_size; - rmm::device_uvector topk_workspace; + lightweight_uvector topk_workspace; search(raft::resources const& res, search_params params, @@ -105,9 +105,9 @@ struct search : public search_plan_impl<<<1, 1, 0, cuda_stream>>>(host_ptr, dev_ptr); } +template +auto get_value(const T* const dev_ptr, cudaStream_t stream) -> T +{ + T value; + RAFT_CUDA_TRY(cudaMemcpyAsync(&value, dev_ptr, sizeof(value), cudaMemcpyDefault, stream)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + return value; +} + // MAX_DATASET_DIM : must equal to or greater than dataset_dim template RAFT_KERNEL random_pickup_kernel( @@ -609,18 +618,18 @@ struct search : search_plan_impl { using base_type::num_seeds; size_t result_buffer_allocation_size; - rmm::device_uvector result_indices; // results_indices_buffer - rmm::device_uvector result_distances; // result_distances_buffer - rmm::device_uvector parent_node_list; - rmm::device_uvector topk_hint; - rmm::device_scalar terminate_flag; // dev_terminate_flag, host_terminate_flag.; - rmm::device_uvector topk_workspace; + lightweight_uvector result_indices; // results_indices_buffer + lightweight_uvector result_distances; // result_distances_buffer + lightweight_uvector parent_node_list; + 
lightweight_uvector topk_hint; + lightweight_uvector terminate_flag; // dev_terminate_flag, host_terminate_flag.; + lightweight_uvector topk_workspace; // temporary storage for _find_topk - rmm::device_uvector input_keys_storage; - rmm::device_uvector output_keys_storage; - rmm::device_uvector input_values_storage; - rmm::device_uvector output_values_storage; + lightweight_uvector input_keys_storage; + lightweight_uvector output_keys_storage; + lightweight_uvector input_values_storage; + lightweight_uvector output_values_storage; search(raft::resources const& res, search_params params, @@ -629,16 +638,16 @@ struct search : search_plan_impl { int64_t graph_degree, uint32_t topk) : base_type(res, params, dataset_desc, dim, graph_degree, topk), - result_indices(0, raft::resource::get_cuda_stream(res)), - result_distances(0, raft::resource::get_cuda_stream(res)), - parent_node_list(0, raft::resource::get_cuda_stream(res)), - topk_hint(0, raft::resource::get_cuda_stream(res)), - topk_workspace(0, raft::resource::get_cuda_stream(res)), - terminate_flag(raft::resource::get_cuda_stream(res)), - input_keys_storage(0, raft::resource::get_cuda_stream(res)), - output_keys_storage(0, raft::resource::get_cuda_stream(res)), - input_values_storage(0, raft::resource::get_cuda_stream(res)), - output_values_storage(0, raft::resource::get_cuda_stream(res)) + result_indices(res), + result_distances(res), + parent_node_list(res), + topk_hint(res), + topk_workspace(res), + terminate_flag(res), + input_keys_storage(res), + output_keys_storage(res), + input_values_storage(res), + output_values_storage(res) { set_params(res); } @@ -662,7 +671,7 @@ struct search : search_plan_impl { itopk_size, max_queries, result_buffer_size, utils::get_cuda_data_type()); RAFT_LOG_DEBUG("# topk_workspace_size: %lu", topk_workspace_size); topk_workspace.resize(topk_workspace_size, raft::resource::get_cuda_stream(res)); - + terminate_flag.resize(1, raft::resource::get_cuda_stream(res)); hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res)); } @@ -847,7 +856,7 @@ struct search : search_plan_impl { stream); // termination (2) - if (iter + 1 >= min_iterations && terminate_flag.value(stream)) { + if (iter + 1 >= min_iterations && get_value(terminate_flag.data(), stream)) { iter++; break; } diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index f23b96631..99254aa50 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -151,7 +151,7 @@ struct search_plan_impl : public search_plan_impl_base { lightweight_uvector hashmap; lightweight_uvector num_executed_iterations; // device or managed? 
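In the member change just below, dataset_desc goes from a const reference to a by-value member: after the compute_distance.hpp rework above, dataset_descriptor_host is a small copyable handle whose device-side descriptor sits behind a shared_ptr and is allocated lazily on first use, so each search plan can safely own its own copy. A simplified standalone sketch of that idiom (hypothetical names, not the real cuVS types):

#include <atomic>
#include <cstddef>
#include <memory>
#include <mutex>
#include <cuda_runtime.h>

// Copyable host handle over a lazily allocated device blob; all copies share
// the same device allocation, so holding the handle by value is cheap.
struct lazy_device_blob {
  struct state {
    std::mutex mutex;
    std::atomic<bool> ready{false};
    void* dev_ptr{nullptr};
    std::size_t size{0};
    explicit state(std::size_t s) : size(s) {}
    ~state() { if (dev_ptr != nullptr) { cudaFree(dev_ptr); } }
    void* get(cudaStream_t stream) {
      if (!ready.load(std::memory_order_acquire)) {
        std::lock_guard<std::mutex> lock(mutex);
        if (!ready.load(std::memory_order_relaxed)) {
          cudaMallocAsync(&dev_ptr, size, stream);  // first caller pays for the allocation
          ready.store(true, std::memory_order_release);
        }
      }
      return dev_ptr;
    }
  };
  std::shared_ptr<state> value;
  explicit lazy_device_blob(std::size_t size) : value{std::make_shared<state>(size)} {}
  void* dev_ptr(cudaStream_t stream) const { return value->get(stream); }
};
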
lightweight_uvector dev_seed; - const dataset_descriptor_host& dataset_desc; + dataset_descriptor_host dataset_desc; search_plan_impl(raft::resources const& res, search_params params, diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 8c5767c50..c62a52540 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -16,42 +16,42 @@ #pragma once -#include - #include "ann_utils.cuh" #include "cagra/device_common.hpp" + +#include +#include + #include +#include #include #include +#include +#include +#include #include #include - +#include +#include #include // raft::util::arch::SM_* #include #include #include #include -#include +#include + #include -#include -#include -#include -#include -#include #include #include #include +#include #include #include namespace cuvs::neighbors::nn_descent::detail { -static const std::string RAFT_NAME = "raft"; -using pinned_memory_resource = thrust::universal_host_pinned_memory_resource; -template -using pinned_memory_allocator = thrust::mr::stateless_resource_allocator; using DistData_t = float; constexpr int DEGREE_ON_DEVICE{32}; @@ -216,6 +216,8 @@ struct BuildConfig { // If internal_node_degree == 0, the value of node_degree will be assigned to it size_t max_iterations{50}; float termination_threshold{0.0001}; + size_t output_graph_degree{32}; + cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded}; }; template @@ -300,6 +302,7 @@ class BloomFilter { template struct GnndGraph { + raft::resources const& res; static constexpr int segment_size = 32; InternalID_t* h_graph; @@ -310,16 +313,17 @@ struct GnndGraph { raft::host_matrix h_dists; - thrust::host_vector> h_graph_new; - thrust::host_vector> h_list_sizes_new; + raft::pinned_matrix h_graph_new; + raft::pinned_vector h_list_sizes_new; - thrust::host_vector> h_graph_old; - thrust::host_vector> h_list_sizes_old; + raft::pinned_matrix h_graph_old; + raft::pinned_vector h_list_sizes_old; BloomFilter bloom_filter; GnndGraph(const GnndGraph&) = delete; GnndGraph& operator=(const GnndGraph&) = delete; - GnndGraph(const size_t nrow, + GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples); @@ -344,9 +348,14 @@ class GNND { GNND(const GNND&) = delete; GNND& operator=(const GNND&) = delete; - void build(Data_t* data, const Index_t nrow, Index_t* output_graph); + void build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances); ~GNND() = default; using ID_t = InternalID_t; + void reset(raft::resources const& res); private: void add_reverse_edges(Index_t* graph_ptr, @@ -371,15 +380,14 @@ class GNND { raft::device_matrix graph_buffer_; raft::device_matrix dists_buffer_; - // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827 - thrust::host_vector> graph_host_buffer_; - thrust::host_vector> dists_host_buffer_; + raft::pinned_matrix graph_host_buffer_; + raft::pinned_matrix dists_host_buffer_; raft::device_vector d_locks_; - thrust::host_vector> h_rev_graph_new_; - thrust::host_vector> h_graph_old_; - thrust::host_vector> h_rev_graph_old_; + raft::pinned_matrix h_rev_graph_new_; + raft::pinned_matrix h_graph_old_; + raft::pinned_matrix h_rev_graph_old_; // int2.x is the number of forward edges, int2.y is the number of reverse edges raft::device_vector d_list_sizes_new_; @@ -448,11 +456,13 @@ __device__ __forceinline__ void 
load_vec(Data_t* vec_buffer, // TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827 /** Calculate L2 norm, and cast data to __half */ template -RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, - __half* output_data, - int dim, - DistData_t* l2_norms, - size_t list_offset = 0) +RAFT_KERNEL preprocess_data_kernel( + const Data_t* input_data, + __half* output_data, + int dim, + DistData_t* l2_norms, + size_t list_offset = 0, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) { extern __shared__ char buffer[]; __shared__ float l2_norm; @@ -462,26 +472,32 @@ RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, load_vec(s_vec, input_data + blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size()); if (threadIdx.x == 0) { l2_norm = 0; } __syncthreads(); - int lane_id = threadIdx.x % raft::warp_size(); - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + lane_id; - float part_dist = 0; - if (idx < dim) { - part_dist = s_vec[idx]; - part_dist = part_dist * part_dist; - } - __syncwarp(); - for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { - part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + + if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::CosineExpanded) { + int lane_id = threadIdx.x % raft::warp_size(); + for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { + int idx = step * raft::warp_size() + lane_id; + float part_dist = 0; + if (idx < dim) { + part_dist = s_vec[idx]; + part_dist = part_dist * part_dist; + } + __syncwarp(); + for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { + part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + } + if (lane_id == 0) { l2_norm += part_dist; } + __syncwarp(); } - if (lane_id == 0) { l2_norm += part_dist; } - __syncwarp(); } for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { int idx = step * raft::warp_size() + threadIdx.x; if (idx < dim) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { + output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { output_data[list_id * dim + idx] = (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm); } else { @@ -709,7 +725,8 @@ __launch_bounds__(BLOCK_SIZE, 4) DistData_t* dists, int graph_width, int* locks, - DistData_t* l2_norms) + DistData_t* l2_norms, + cuvs::distance::DistanceType metric) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -821,8 +838,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -900,8 +919,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < 
list_old_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -971,19 +992,21 @@ int insert_to_ordered_list(InternalID_t* list, } // namespace template -GnndGraph::GnndGraph(const size_t nrow, +GnndGraph::GnndGraph(raft::resources const& res, + const size_t nrow, const size_t node_degree, const size_t internal_node_degree, const size_t num_samples) - : nrow(nrow), + : res(res), + nrow(nrow), node_degree(node_degree), num_samples(num_samples), bloom_filter(nrow, internal_node_degree / segment_size, 3), h_dists{raft::make_host_matrix(nrow, node_degree)}, - h_graph_new(nrow * num_samples), - h_list_sizes_new(nrow), - h_graph_old(nrow * num_samples), - h_list_sizes_old{nrow} + h_graph_new{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_new{raft::make_pinned_vector(res, nrow)}, + h_graph_old{raft::make_pinned_matrix(res, nrow, num_samples)}, + h_list_sizes_old{raft::make_pinned_vector(res, nrow)} { // node_degree must be a multiple of segment_size; assert(node_degree % segment_size == 0); @@ -1001,9 +1024,9 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - auto list_new = h_graph_new.data() + i * num_samples; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + auto list_new = h_graph_new.data_handle() + i * num_samples; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; for (size_t j = 0; j < width; j++) { auto new_neighb_id = new_neighbors[i * width + j].id(); @@ -1011,8 +1034,8 @@ void GnndGraph::sample_graph_new(InternalID_t* new_neighbors, if (bloom_filter.check(i, new_neighb_id)) { continue; } bloom_filter.add(i, new_neighb_id); new_neighbors[i * width + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = new_neighb_id; - if (h_list_sizes_new[i].x == num_samples) break; + list_new[h_list_sizes_new.data_handle()[i].x++] = new_neighb_id; + if (h_list_sizes_new.data_handle()[i].x == num_samples) break; } } } @@ -1051,31 +1074,37 @@ void GnndGraph::sample_graph(bool sample_new) { #pragma omp parallel for for (size_t i = 0; i < nrow; i++) { - h_list_sizes_old[i].x = 0; - h_list_sizes_old[i].y = 0; - h_list_sizes_new[i].x = 0; - h_list_sizes_new[i].y = 0; + h_list_sizes_old.data_handle()[i].x = 0; + h_list_sizes_old.data_handle()[i].y = 0; + h_list_sizes_new.data_handle()[i].x = 0; + h_list_sizes_new.data_handle()[i].y = 0; auto list = h_graph + i * node_degree; - auto list_old = h_graph_old.data() + i * num_samples; - auto list_new = h_graph_new.data() + i * num_samples; + auto list_old = h_graph_old.data_handle() + i * num_samples; + auto list_new = h_graph_new.data_handle() + i * num_samples; for (int j = 0; j < segment_size; j++) { for (int k = 0; k < num_segments; k++) { auto neighbor = list[k * segment_size + j]; if ((size_t)neighbor.id() >= nrow) continue; if (!neighbor.is_new()) { - if (h_list_sizes_old[i].x < num_samples) { - list_old[h_list_sizes_old[i].x++] = neighbor.id(); + if (h_list_sizes_old.data_handle()[i].x < num_samples) { + list_old[h_list_sizes_old.data_handle()[i].x++] = neighbor.id(); } } else if (sample_new) { - if (h_list_sizes_new[i].x < num_samples) { 
+ if (h_list_sizes_new.data_handle()[i].x < num_samples) { list[k * segment_size + j].mark_old(); - list_new[h_list_sizes_new[i].x++] = neighbor.id(); + list_new[h_list_sizes_new.data_handle()[i].x++] = neighbor.id(); } } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + break; + } + } + if (h_list_sizes_old.data_handle()[i].x == num_samples && + h_list_sizes_new.data_handle()[i].x == num_samples) { + break; } - if (h_list_sizes_old[i].x == num_samples && h_list_sizes_new[i].x == num_samples) { break; } } } } @@ -1137,7 +1166,8 @@ template GNND::GNND(raft::resources const& res, const BuildConfig& build_config) : res(res), build_config_(build_config), - graph_(build_config.max_dataset_size, + graph_(res, + build_config.max_dataset_size, align32::roundUp(build_config.node_degree), align32::roundUp(build_config.internal_node_degree ? build_config.internal_node_degree : build_config.node_degree), @@ -1146,33 +1176,48 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build ndim_(build_config.dataset_dim), d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>( res, nrow_, build_config.dataset_dim)}, - l2_norms_{raft::make_device_vector(res, nrow_)}, + l2_norms_{raft::make_device_vector(res, 0)}, graph_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, - graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE), - dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE), + graph_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + dists_host_buffer_{ + raft::make_pinned_matrix(res, nrow_, DEGREE_ON_DEVICE)}, d_locks_{raft::make_device_vector(res, nrow_)}, - h_rev_graph_new_(nrow_ * NUM_SAMPLES), - h_graph_old_(nrow_ * NUM_SAMPLES), - h_rev_graph_old_(nrow_ * NUM_SAMPLES), + h_rev_graph_new_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, + h_graph_old_( + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)), + h_rev_graph_old_{ + raft::make_pinned_matrix(res, nrow_, NUM_SAMPLES)}, d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); - thrust::fill(thrust::device, - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, - reinterpret_cast(graph_buffer_.data_handle()), - reinterpret_cast(graph_buffer_.data_handle()) + graph_buffer_.size(), - std::numeric_limits::max()); - thrust::fill(thrust::device, d_locks_.data_handle(), d_locks_.data_handle() + d_locks_.size(), 0); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); + + if (build_config.metric == cuvs::distance::DistanceType::L2Expanded) { + l2_norms_ = raft::make_device_vector(res, nrow_); + } }; +template +void GNND::reset(raft::resources const& res) +{ + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + auto graph_buffer_view = raft::make_device_matrix_view( + reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); + raft::matrix::fill(res, graph_buffer_view, 
std::numeric_limits::max()); + raft::matrix::fill(res, d_locks_.view(), 0); +} + template void GNND::add_reverse_edges(Index_t* graph_ptr, Index_t* h_rev_graph_ptr, @@ -1189,34 +1234,36 @@ void GNND::add_reverse_edges(Index_t* graph_ptr, template void GNND::local_join(cudaStream_t stream) { - thrust::fill(thrust::device.on(stream), - dists_buffer_.data_handle(), - dists_buffer_.data_handle() + dists_buffer_.size(), - std::numeric_limits::max()); - local_join_kernel<<>>( - thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), - d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_.data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle()); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + local_join_kernel<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_.data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric); } template -void GNND::build(Data_t* data, const Index_t nrow, Index_t* output_graph) +void GNND::build(Data_t* data, + const Index_t nrow, + Index_t* output_graph, + bool return_distances, + DistData_t* output_distances) { using input_t = typename std::remove_const::type; cudaStream_t stream = raft::resource::get_cuda_stream(res); nrow_ = nrow; + graph_.nrow = nrow; graph_.h_graph = (InternalID_t*)output_graph; cudaPointerAttributes data_ptr_attr; @@ -1226,24 +1273,19 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out cuvs::spatial::knn::detail::utils::batch_load_iterator vec_batches{ data, static_cast(nrow_), build_config_.dataset_dim, batch_size, stream}; for (auto const& batch : vec_batches) { - preprocess_data_kernel<<(raft::warp_size())) * - raft::warp_size(), - stream>>>(batch.data(), - d_data_.data_handle(), - build_config_.dataset_dim, - l2_norms_.data_handle(), - batch.offset()); + preprocess_data_kernel<<< + batch.size(), + raft::warp_size(), + sizeof(Data_t) * ceildiv(build_config_.dataset_dim, static_cast(raft::warp_size())) * + raft::warp_size(), + stream>>>(batch.data(), + d_data_.data_handle(), + build_config_.dataset_dim, + l2_norms_.data_handle(), + batch.offset(), + build_config_.metric); } - thrust::fill(thrust::device.on(stream), - (Index_t*)graph_buffer_.data_handle(), - (Index_t*)graph_buffer_.data_handle() + graph_buffer_.size(), - std::numeric_limits::max()); - graph_.clear(); graph_.init_random_graph(); graph_.sample_graph(true); @@ -1251,8 +1293,8 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out auto update_and_sample = [&](bool update_graph) { if (update_graph) { update_counter_ = 0; - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); if (update_counter_ < build_config_.termination_threshold * nrow_ * @@ -1265,15 +1307,15 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out for (size_t it 
= 0; it < build_config_.max_iterations; it++) { raft::copy(d_list_sizes_new_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_new.data()), + graph_.h_list_sizes_new.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); - raft::copy(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(graph_.h_graph_old.data()), + raft::copy(h_graph_old_.data_handle(), + graph_.h_graph_old.data_handle(), nrow_ * NUM_SAMPLES, raft::resource::get_cuda_stream(res)); raft::copy(d_list_sizes_old_.data_handle(), - thrust::raw_pointer_cast(graph_.h_list_sizes_old.data()), + graph_.h_list_sizes_old.data_handle(), nrow_, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); @@ -1286,13 +1328,13 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // contains some information for local_join. static_assert(DEGREE_ON_DEVICE * sizeof(*(dists_buffer_.data_handle())) >= NUM_SAMPLES * sizeof(*(graph_buffer_.data_handle()))); - add_reverse_edges(thrust::raw_pointer_cast(graph_.h_graph_new.data()), - thrust::raw_pointer_cast(h_rev_graph_new_.data()), + add_reverse_edges(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_new_.data_handle(), stream); - add_reverse_edges(thrust::raw_pointer_cast(h_graph_old_.data()), - thrust::raw_pointer_cast(h_rev_graph_old_.data()), + add_reverse_edges(h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), (Index_t*)dists_buffer_.data_handle(), d_list_sizes_old_.data_handle(), stream); @@ -1316,21 +1358,21 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out update_and_sample_thread.join(); if (update_counter_ == -1) { break; } - raft::copy(thrust::raw_pointer_cast(graph_host_buffer_.data()), + raft::copy(graph_host_buffer_.data_handle(), graph_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); raft::resource::sync_stream(res); - raft::copy(thrust::raw_pointer_cast(dists_host_buffer_.data()), + raft::copy(dists_host_buffer_.data_handle(), dists_buffer_.data_handle(), nrow_ * DEGREE_ON_DEVICE, raft::resource::get_cuda_stream(res)); - graph_.sample_graph_new(thrust::raw_pointer_cast(graph_host_buffer_.data()), DEGREE_ON_DEVICE); + graph_.sample_graph_new(graph_host_buffer_.data_handle(), DEGREE_ON_DEVICE); } - graph_.update_graph(thrust::raw_pointer_cast(graph_host_buffer_.data()), - thrust::raw_pointer_cast(dists_host_buffer_.data()), + graph_.update_graph(graph_host_buffer_.data_handle(), + dists_host_buffer_.data_handle(), DEGREE_ON_DEVICE, update_counter_); raft::resource::sync_stream(res); @@ -1338,6 +1380,27 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out // Reuse graph_.h_dists as the buffer for shrink the lists in graph static_assert(sizeof(decltype(*(graph_.h_dists.data_handle()))) >= sizeof(Index_t)); + + if (return_distances) { + auto graph_d_dists = raft::make_device_matrix( + res, nrow_, build_config_.node_degree); + raft::copy(graph_d_dists.data_handle(), + graph_.h_dists.data_handle(), + nrow_ * build_config_.node_degree, + raft::resource::get_cuda_stream(res)); + + auto output_dist_view = raft::make_device_matrix_view( + output_distances, nrow_, build_config_.output_graph_degree); + + raft::matrix::slice_coordinates coords{static_cast(0), + static_cast(0), + static_cast(nrow_), + static_cast(build_config_.output_graph_degree)}; + raft::matrix::slice( + res, raft::make_const_mdspan(graph_d_dists.view()), output_dist_view, coords); + 
raft::resource::sync_stream(res); + } + Index_t* graph_shrink_buffer = (Index_t*)graph_.h_dists.data_handle(); #pragma omp parallel for @@ -1376,6 +1439,11 @@ void build(raft::resources const& res, RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", std::numeric_limits::max() - 1); + auto allowed_metrics = params.metric == cuvs::distance::DistanceType::L2Expanded || + params.metric == cuvs::distance::DistanceType::CosineExpanded || + params.metric == cuvs::distance::DistanceType::InnerProduct; + RAFT_EXPECTS(allowed_metrics && idx.metric() == params.metric, + "The metric for NN Descent should be L2Expanded, CosineExpanded or InnerProduct"); size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1410,10 +1478,25 @@ void build(raft::resources const& res, .node_degree = extended_graph_degree, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, - .termination_threshold = params.termination_threshold}; + .termination_threshold = params.termination_threshold, + .output_graph_degree = params.graph_degree, + .metric = params.metric}; GNND nnd(res, build_config); - nnd.build(dataset.data_handle(), dataset.extent(0), int_graph.data_handle()); + + if (idx.distances().has_value() || !params.return_distances) { + nnd.build(dataset.data_handle(), + dataset.extent(0), + int_graph.data_handle(), + params.return_distances, + idx.distances() + .value_or(raft::make_device_matrix(res, 0, 0).view()) + .data_handle()); + } else { + RAFT_EXPECTS(!params.return_distances, + "Distance view not allocated. Using return_distances set to true requires " + "distance view to be allocated."); + } #pragma omp parallel for for (size_t i = 0; i < static_cast(dataset.extent(0)); i++) { @@ -1445,11 +1528,15 @@ index build( graph_degree = intermediate_degree; } - index idx{res, dataset.extent(0), static_cast(graph_degree)}; + index idx{res, + dataset.extent(0), + static_cast(graph_degree), + params.return_distances, + params.metric}; build(res, params, dataset, idx); return idx; } -} // namespace cuvs::neighbors::nn_descent::detail +} // namespace cuvs::neighbors::nn_descent::detail diff --git a/cpp/src/neighbors/detail/nn_descent_batch.cuh b/cpp/src/neighbors/detail/nn_descent_batch.cuh new file mode 100644 index 000000000..842dbe788 --- /dev/null +++ b/cpp/src/neighbors/detail/nn_descent_batch.cuh @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#undef RAFT_EXPLICIT_INSTANTIATE_ONLY + +#include "nn_descent.cuh" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace cuvs::neighbors::nn_descent::detail::experimental { + +// +// Run balanced kmeans on a subsample of the dataset to get centroids +// +template , memory_type::host>> +void get_balanced_kmeans_centroids( + raft::resources const& res, + cuvs::distance::DistanceType metric, + mdspan, row_major, Accessor> dataset, + raft::device_matrix_view centroids) +{ + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + size_t n_clusters = centroids.extent(0); + size_t num_subsamples = + std::min(static_cast(num_rows / n_clusters), static_cast(num_rows * 0.1)); + + auto d_subsample_dataset = + raft::make_device_matrix(res, num_subsamples, num_cols); + raft::matrix::sample_rows( + res, raft::random::RngState{0}, dataset, d_subsample_dataset.view()); + + cuvs::cluster::kmeans::balanced_params kmeans_params; + kmeans_params.metric = metric; + + auto d_subsample_dataset_const_view = + raft::make_device_matrix_view( + d_subsample_dataset.data_handle(), num_subsamples, num_cols); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + cuvs::cluster::kmeans::fit(res, kmeans_params, d_subsample_dataset_const_view, centroids_view); +} + +// +// Get the top k closest centroid indices for each data point +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_global_nearest_k( + raft::resources const& res, + size_t k, + size_t num_rows, + size_t n_clusters, + const T* dataset, + raft::host_matrix_view global_nearest_cluster, + raft::device_matrix_view centroids, + cuvs::distance::DistanceType metric) +{ + size_t num_cols = centroids.extent(1); + auto centroids_view = raft::make_device_matrix_view( + centroids.data_handle(), n_clusters, num_cols); + + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, dataset)); + float* ptr = reinterpret_cast(attr.devicePointer); + + size_t num_batches = n_clusters; + size_t batch_size = (num_rows + n_clusters) / n_clusters; + if (ptr == nullptr) { // data on host + + auto d_dataset_batch = + raft::make_device_matrix(res, batch_size, num_cols); + + auto nearest_clusters_idx = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, batch_size, k); + + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + raft::copy(d_dataset_batch.data_handle(), + dataset + i * batch_size * num_cols, + batch_size_ * num_cols, + resource::get_cuda_stream(res)); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, centroids_view, norms_view, metric); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + raft::make_const_mdspan(d_dataset_batch.view()), + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle(), + nearest_clusters_idx.data_handle() + nearest_clusters_idx.size(), + nearest_clusters_idxt.data_handle()); + 
raft::copy(global_nearest_cluster.data_handle() + i * batch_size * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } else { // data on device + auto nearest_clusters_idx = + raft::make_device_matrix(res, num_rows, k); + auto nearest_clusters_dist = + raft::make_device_matrix(res, num_rows, k); + + std::optional> norms_view; + cuvs::neighbors::brute_force::index brute_force_index( + res, centroids_view, norms_view, metric); + auto dataset_view = + raft::make_device_matrix_view(dataset, num_rows, num_cols); + cuvs::neighbors::brute_force::search(res, + brute_force_index, + dataset_view, + nearest_clusters_idx.view(), + nearest_clusters_dist.view()); + + auto nearest_clusters_idxt = + raft::make_device_matrix(res, batch_size, k); + for (size_t i = 0; i < num_batches; i++) { + size_t batch_size_ = batch_size; + + if (i == num_batches - 1) { batch_size_ = num_rows - batch_size * i; } + thrust::copy(raft::resource::get_thrust_policy(res), + nearest_clusters_idx.data_handle() + i * batch_size_ * k, + nearest_clusters_idx.data_handle() + (i + 1) * batch_size_ * k, + nearest_clusters_idxt.data_handle()); + raft::copy(global_nearest_cluster.data_handle() + i * batch_size_ * k, + nearest_clusters_idxt.data_handle(), + batch_size_ * k, + resource::get_cuda_stream(res)); + } + } +} + +// +// global_nearest_cluster [num_rows X k=2] : top 2 closest clusters for each data point +// inverted_indices [num_rows x k vector] : sparse vector for data indices for each cluster +// cluster_size [n_cluster] : cluster size for each cluster +// offset [n_cluster] : offset in inverted_indices for each cluster +// Loads the data in batches onto device if data is on host for memory efficiency +// +template +void get_inverted_indices(raft::resources const& res, + size_t n_clusters, + size_t& max_cluster_size, + size_t& min_cluster_size, + raft::host_matrix_view global_nearest_cluster, + raft::host_vector_view inverted_indices, + raft::host_vector_view cluster_size, + raft::host_vector_view offset) +{ + // build sparse inverted indices and get number of data points for each cluster + size_t num_rows = global_nearest_cluster.extent(0); + size_t k = global_nearest_cluster.extent(1); + + auto local_offset = raft::make_host_vector(n_clusters); + + max_cluster_size = 0; + min_cluster_size = std::numeric_limits::max(); + + std::fill(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters, 0); + std::fill(local_offset.data_handle(), local_offset.data_handle() + n_clusters, 0); + + // TODO: this part isn't really a bottleneck but maybe worth trying omp parallel + // for with atomic add + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + cluster_size(cluster_id) += 1; + } + } + + offset(0) = 0; + for (size_t i = 1; i < n_clusters; i++) { + offset(i) = offset(i - 1) + cluster_size(i - 1); + } + for (size_t i = 0; i < num_rows; i++) { + for (size_t j = 0; j < k; j++) { + IdxT cluster_id = global_nearest_cluster(i, j); + inverted_indices(offset(cluster_id) + local_offset(cluster_id)) = i; + local_offset(cluster_id) += 1; + } + } + + max_cluster_size = static_cast( + *std::max_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); + min_cluster_size = static_cast( + *std::min_element(cluster_size.data_handle(), cluster_size.data_handle() + n_clusters)); +} + +template +struct KeyValuePair { + KeyType key; + ValueType value; +}; + +template +struct CustomKeyComparator { + 
__device__ bool operator()(const KeyValuePair& a, + const KeyValuePair& b) const + { + if (a.key == b.key) { return a.value < b.value; } + return a.key < b.key; + } +}; + +template +RAFT_KERNEL merge_subgraphs(IdxT* cluster_data_indices, + size_t graph_degree, + size_t num_cluster_in_batch, + float* global_distances, + float* batch_distances, + IdxT* global_indices, + IdxT* batch_indices) +{ + size_t batch_row = blockIdx.x; + typedef cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD> + BlockMergeSortType; + __shared__ typename cub::BlockMergeSort, BLOCK_SIZE, ITEMS_PER_THREAD>:: + TempStorage tmpSmem; + + extern __shared__ char sharedMem[]; + float* blockKeys = reinterpret_cast(sharedMem); + IdxT* blockValues = reinterpret_cast(&sharedMem[graph_degree * 2 * sizeof(float)]); + int16_t* uniqueMask = + reinterpret_cast(&sharedMem[graph_degree * 2 * (sizeof(float) + sizeof(IdxT))]); + + if (batch_row < num_cluster_in_batch) { + // load batch or global depending on threadIdx + size_t global_row = cluster_data_indices[batch_row]; + + KeyValuePair threadKeyValuePair[ITEMS_PER_THREAD]; + + size_t halfway = BLOCK_SIZE / 2; + size_t do_global = threadIdx.x < halfway; + + float* distances; + IdxT* indices; + + if (do_global) { + distances = global_distances; + indices = global_indices; + } else { + distances = batch_distances; + indices = batch_indices; + } + + size_t idxBase = (threadIdx.x * do_global + (threadIdx.x - halfway) * (1lu - do_global)) * + static_cast(ITEMS_PER_THREAD); + size_t arrIdxBase = (global_row * do_global + batch_row * (1lu - do_global)) * graph_degree; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < graph_degree) { + threadKeyValuePair[i].key = distances[arrIdxBase + colId]; + threadKeyValuePair[i].value = indices[arrIdxBase + colId]; + } else { + threadKeyValuePair[i].key = std::numeric_limits::max(); + threadKeyValuePair[i].value = std::numeric_limits::max(); + } + } + + __syncthreads(); + + BlockMergeSortType(tmpSmem).Sort(threadKeyValuePair, CustomKeyComparator{}); + + // load sorted result into shared memory to get unique values + idxBase = threadIdx.x * ITEMS_PER_THREAD; + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId < 2 * graph_degree) { + blockKeys[colId] = threadKeyValuePair[i].key; + blockValues[colId] = threadKeyValuePair[i].value; + } + } + + __syncthreads(); + + // get unique mask + if (threadIdx.x == 0) { uniqueMask[0] = 1; } + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + uniqueMask[colId] = static_cast(blockValues[colId] != blockValues[colId - 1]); + } + } + + __syncthreads(); + + // prefix sum + if (threadIdx.x == 0) { + for (int i = 1; i < 2 * graph_degree; i++) { + uniqueMask[i] += uniqueMask[i - 1]; + } + } + + __syncthreads(); + // load unique values to global memory + if (threadIdx.x == 0) { + global_distances[global_row * graph_degree] = blockKeys[0]; + global_indices[global_row * graph_degree] = blockValues[0]; + } + + for (int i = 0; i < ITEMS_PER_THREAD; i++) { + size_t colId = idxBase + i; + if (colId > 0 && colId < 2 * graph_degree) { + bool is_unique = uniqueMask[colId] != uniqueMask[colId - 1]; + int16_t global_colId = uniqueMask[colId] - 1; + if (is_unique && static_cast(global_colId) < graph_degree) { + global_distances[global_row * graph_degree + global_colId] = blockKeys[colId]; + global_indices[global_row * graph_degree + global_colId] = blockValues[colId]; + } + } + } + } +} + +// 
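+// merge_subgraphs (above) keeps, for each data point, the graph_degree closest
+// unique neighbors out of the union of its current global list and the list
+// produced for the cluster batch: one block per row merge-sorts the combined
+// 2 * graph_degree candidates, marks duplicates with a prefix-summed unique
+// mask, and writes the head of the de-duplicated result back to the global graph.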
+// builds knn graph using NN Descent and merge with global graph +// +template , memory_type::host>> +void build_and_merge(raft::resources const& res, + const index_params& params, + size_t num_data_in_cluster, + size_t graph_degree, + size_t int_graph_node_degree, + T* cluster_data, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_d, + float* global_distances_d, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + GNND& nnd) +{ + nnd.build(cluster_data, num_data_in_cluster, int_graph, true, batch_distances_d); + + // remap indices +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < graph_degree; j++) { + size_t local_idx = int_graph[i * int_graph_node_degree + j]; + batch_indices_h[i * graph_degree + j] = inverted_indices[local_idx]; + } + } + + raft::copy(batch_indices_d, + batch_indices_h, + num_data_in_cluster * graph_degree, + raft::resource::get_cuda_stream(res)); + + size_t num_elems = graph_degree * 2; + size_t sharedMemSize = num_elems * (sizeof(float) + sizeof(IdxT) + sizeof(int16_t)); + + if (num_elems <= 128) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 512) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 1024) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else if (num_elems <= 2048) { + merge_subgraphs + <<>>( + cluster_data_indices, + graph_degree, + num_data_in_cluster, + global_distances_d, + batch_distances_d, + global_indices_d, + batch_indices_d); + } else { + // this is as far as we can get due to the shared mem usage of cub::BlockMergeSort + RAFT_FAIL("The degree of knn is too large (%lu). It must be smaller than 1024", graph_degree); + } + raft::resource::sync_stream(res); +} + +// +// For each cluster, gather the data samples that belong to that cluster, and +// call build_and_merge +// +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::host_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_host_matrix(max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on host. 
Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + +#pragma omp parallel for + for (size_t i = 0; i < num_data_in_cluster; i++) { + for (size_t j = 0; j < num_cols; j++) { + size_t global_row = (inverted_indices + offset)[i]; + cluster_data_matrix(i, j) = dataset(global_row, j); + } + } + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_matrix.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template +void cluster_nnd(raft::resources const& res, + const index_params& params, + size_t graph_degree, + size_t extended_graph_degree, + size_t max_cluster_size, + raft::device_matrix_view dataset, + IdxT* offsets, + IdxT* cluster_size, + IdxT* cluster_data_indices, + int* int_graph, + IdxT* inverted_indices, + IdxT* global_indices_h, + float* global_distances_h, + IdxT* batch_indices_h, + IdxT* batch_indices_d, + float* batch_distances_d, + const BuildConfig& build_config) +{ + size_t num_rows = dataset.extent(0); + size_t num_cols = dataset.extent(1); + + GNND nnd(res, build_config); + + auto cluster_data_matrix = + raft::make_device_matrix(res, max_cluster_size, num_cols); + + for (size_t cluster_id = 0; cluster_id < params.n_clusters; cluster_id++) { + RAFT_LOG_DEBUG( + "# Data on device. Running clusters: %lu / %lu", cluster_id + 1, params.n_clusters); + size_t num_data_in_cluster = cluster_size[cluster_id]; + size_t offset = offsets[cluster_id]; + + auto cluster_data_view = raft::make_device_matrix_view( + cluster_data_matrix.data_handle(), num_data_in_cluster, num_cols); + auto cluster_data_indices_view = raft::make_device_vector_view( + cluster_data_indices + offset, num_data_in_cluster); + + auto dataset_IdxT = + raft::make_device_matrix_view(dataset.data_handle(), num_rows, num_cols); + raft::matrix::gather(res, dataset_IdxT, cluster_data_indices_view, cluster_data_view); + + build_and_merge(res, + params, + num_data_in_cluster, + graph_degree, + extended_graph_degree, + cluster_data_view.data_handle(), + cluster_data_indices + offset, + int_graph, + inverted_indices + offset, + global_indices_h, + global_distances_h, + batch_indices_h, + batch_indices_d, + batch_distances_d, + nnd); + nnd.reset(res); + } +} + +template , memory_type::host>> +void batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset, + index& global_idx) +{ + size_t graph_degree = params.graph_degree; + size_t intermediate_degree = params.intermediate_graph_degree; + + size_t num_rows = static_cast(dataset.extent(0)); + size_t num_cols = static_cast(dataset.extent(1)); + + auto centroids = + raft::make_device_matrix(res, params.n_clusters, num_cols); + get_balanced_kmeans_centroids(res, params.metric, dataset, centroids.view()); + + size_t k = 2; + auto global_nearest_cluster = raft::make_host_matrix(num_rows, k); + get_global_nearest_k(res, + k, + num_rows, + params.n_clusters, + dataset.data_handle(), + global_nearest_cluster.view(), + centroids.view(), + params.metric); + + auto inverted_indices = raft::make_host_vector(num_rows * k); + auto cluster_size = raft::make_host_vector(params.n_clusters); + auto offset = raft::make_host_vector(params.n_clusters); + + size_t max_cluster_size, min_cluster_size; + 
get_inverted_indices(res, + params.n_clusters, + max_cluster_size, + min_cluster_size, + global_nearest_cluster.view(), + inverted_indices.view(), + cluster_size.view(), + offset.view()); + + if (intermediate_degree >= min_cluster_size) { + RAFT_LOG_WARN( + "Intermediate graph degree cannot be larger than minimum cluster size, reducing it to %lu", + dataset.extent(0)); + intermediate_degree = min_cluster_size - 1; + } + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + size_t extended_graph_degree = + align32::roundUp(static_cast(graph_degree * (graph_degree <= 32 ? 1.0 : 1.3))); + size_t extended_intermediate_degree = align32::roundUp( + static_cast(intermediate_degree * (intermediate_degree <= 32 ? 1.0 : 1.3))); + + auto int_graph = raft::make_host_matrix( + max_cluster_size, static_cast(extended_graph_degree)); + + BuildConfig build_config{.max_dataset_size = max_cluster_size, + .dataset_dim = num_cols, + .node_degree = extended_graph_degree, + .internal_node_degree = extended_intermediate_degree, + .max_iterations = params.max_iterations, + .termination_threshold = params.termination_threshold, + .output_graph_degree = graph_degree}; + + auto global_indices_h = raft::make_managed_matrix(res, num_rows, graph_degree); + auto global_distances_h = raft::make_managed_matrix(res, num_rows, graph_degree); + + std::fill(global_indices_h.data_handle(), + global_indices_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + std::fill(global_distances_h.data_handle(), + global_distances_h.data_handle() + num_rows * graph_degree, + std::numeric_limits::max()); + + auto batch_indices_h = + raft::make_host_matrix(max_cluster_size, graph_degree); + auto batch_indices_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + auto batch_distances_d = + raft::make_device_matrix(res, max_cluster_size, graph_degree); + + auto cluster_data_indices = raft::make_device_vector(res, num_rows * k); + raft::copy(cluster_data_indices.data_handle(), + inverted_indices.data_handle(), + num_rows * k, + resource::get_cuda_stream(res)); + + cluster_nnd(res, + params, + graph_degree, + extended_graph_degree, + max_cluster_size, + dataset, + offset.data_handle(), + cluster_size.data_handle(), + cluster_data_indices.data_handle(), + int_graph.data_handle(), + inverted_indices.data_handle(), + global_indices_h.data_handle(), + global_distances_h.data_handle(), + batch_indices_h.data_handle(), + batch_indices_d.data_handle(), + batch_distances_d.data_handle(), + build_config); + + raft::copy(global_idx.graph().data_handle(), + global_indices_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + if (params.return_distances && global_idx.distances().has_value()) { + raft::copy(global_idx.distances().value().data_handle(), + global_distances_h.data_handle(), + num_rows * graph_degree, + raft::resource::get_cuda_stream(res)); + } +} + +template , memory_type::host>> +index batch_build(raft::resources const& res, + const index_params& params, + mdspan, row_major, Accessor> dataset) +{ + size_t intermediate_degree = params.intermediate_graph_degree; + size_t graph_degree = params.graph_degree; + + if (intermediate_degree < graph_degree) { + RAFT_LOG_WARN( + "Graph degree (%lu) cannot be larger than intermediate graph degree (%lu), reducing " + "graph_degree.", + 
graph_degree, + intermediate_degree); + graph_degree = intermediate_degree; + } + + index idx{ + res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; + + batch_build(res, params, dataset, idx); + + return idx; +} + +} // namespace cuvs::neighbors::nn_descent::detail::experimental diff --git a/cpp/src/neighbors/detail/sparse_knn.cuh b/cpp/src/neighbors/detail/sparse_knn.cuh new file mode 100644 index 000000000..9c8e971b9 --- /dev/null +++ b/cpp/src/neighbors/detail/sparse_knn.cuh @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "../../distance/sparse_distance.cuh" +#include "knn_merge_parts.cuh" +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::detail { + +template +struct csr_batcher_t { + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) + : batch_start_(0), + batch_stop_(0), + batch_rows_(0), + total_rows_(n_rows), + batch_size_(batch_size), + csr_indptr_(csr_indptr), + csr_indices_(csr_indices), + csr_data_(csr_data), + batch_csr_start_offset_(0), + batch_csr_stop_offset_(0) + { + } + + void set_batch(int batch_num) + { + batch_start_ = batch_num * batch_size_; + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing + + batch_rows_ = (batch_stop_ - batch_start_) + 1; + } + + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); + + return batch_csr_stop_offset_ - batch_csr_start_offset_; + } + + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); + } + + value_idx batch_rows() const { return batch_rows_; } + + value_idx batch_start() const { return batch_start_; } + + value_idx batch_stop() const { return batch_stop_; } + + private: + value_idx batch_size_; + value_idx batch_start_; + value_idx batch_stop_; + value_idx batch_rows_; + + value_idx total_rows_; + + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* csr_data_; + + value_idx batch_csr_start_offset_; + value_idx batch_csr_stop_offset_; +}; + +template +class sparse_knn_t { + public: + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int 
n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + raft::resources const& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + cuvs::distance::DistanceType metric_ = cuvs::distance::DistanceType::L2Expanded, + float metricArg_ = 0) + : idxIndptr(idxIndptr_), + idxIndices(idxIndices_), + idxData(idxData_), + idxNNZ(idxNNZ_), + n_idx_rows(n_idx_rows_), + n_idx_cols(n_idx_cols_), + queryIndptr(queryIndptr_), + queryIndices(queryIndices_), + queryData(queryData_), + queryNNZ(queryNNZ_), + n_query_rows(n_query_rows_), + n_query_cols(n_query_cols_), + output_indices(output_indices_), + output_dists(output_dists_), + k(k_), + handle(handle_), + batch_size_index(batch_size_index_), + batch_size_query(batch_size_query_), + metric(metric_), + metricArg(metricArg_) + { + } + + void run() + { + using namespace raft::sparse; + + int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); + csr_batcher_t query_batcher( + batch_size_query, n_query_rows, queryIndptr, queryIndices, queryData); + + size_t rows_processed = 0; + + for (int i = 0; i < n_batches_query; i++) { + /** + * Compute index batch info + */ + query_batcher.set_batch(i); + + /** + * Slice CSR to rows in batch + */ + + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + + value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( + query_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + rmm::device_uvector query_batch_indices(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector query_batch_data(n_query_batch_nnz, + raft::resource::get_cuda_stream(handle)); + + query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), + query_batch_data.data(), + raft::resource::get_cuda_stream(handle)); + + // A 3-partition temporary merge space to scale the batching. 
2 parts for subsequent + // batches and 1 space for the results of the merge, which get copied back to the top + rmm::device_uvector merge_buffer_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector merge_buffer_dists(0, raft::resource::get_cuda_stream(handle)); + + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; + + int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); + csr_batcher_t idx_batcher( + batch_size_index, n_idx_rows, idxIndptr, idxIndices, idxData); + + for (int j = 0; j < n_batches_idx; j++) { + idx_batcher.set_batch(j); + + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, + raft::resource::get_cuda_stream(handle)); + + /** + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_indices(0, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector idx_batch_data(0, raft::resource::get_cuda_stream(handle)); + + value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( + idx_batch_indptr.data(), raft::resource::get_cuda_stream(handle)); + + idx_batch_indices.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + idx_batch_data.resize(idx_batch_nnz, raft::resource::get_cuda_stream(handle)); + + idx_batcher.get_batch_csr_indices_data( + idx_batch_indices.data(), idx_batch_data.data(), raft::resource::get_cuda_stream(handle)); + + /** + * Compute distances + */ + uint64_t dense_size = + (uint64_t)idx_batcher.batch_rows() * (uint64_t)query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, + raft::resource::get_cuda_stream(handle)); + + RAFT_CUDA_TRY(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); + + // Build batch indices array + rmm::device_uvector batch_indices(batch_dists.size(), + raft::resource::get_cuda_stream(handle)); + + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + iota_fill( + batch_indices.data(), batch_rows, batch_cols, raft::resource::get_cuda_stream(handle)); + + /** + * Perform k-selection on batch & merge with other k-selections + */ + size_t merge_buffer_offset = batch_rows * k; + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, + indices_merge_buffer_ptr); + + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + + // Merge results of difference batches if necessary + if (idx_batcher.batch_start() > 0) { + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + 
dists_merge_buffer_tmp_ptr, + indices_merge_buffer_tmp_ptr); + } + + // copy merged output back into merge buffer partition for next iteration + raft::copy_async(merge_buffer_indices.data(), + indices_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(merge_buffer_dists.data(), + dists_merge_buffer_tmp_ptr, + batch_rows * k, + raft::resource::get_cuda_stream(handle)); + } + + // Copy final merged batch to output array + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + raft::resource::get_cuda_stream(handle)); + + rows_processed += query_batcher.batch_rows(); + } + } + + private: + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + rmm::device_uvector trans(id_ranges.size(), raft::resource::get_cuda_stream(handle)); + raft::update_device( + trans.data(), id_ranges.data(), id_ranges.size(), raft::resource::get_cuda_stream(handle)); + + // combine merge buffers only if there's more than 1 partition to combine + cuvs::neighbors::detail::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + raft::resource::get_cuda_stream(handle), + trans.data()); + } + + void perform_k_selection(csr_batcher_t idx_batcher, + csr_batcher_t query_batcher, + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { + // populate batch indices array + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + + // build translation buffer to shift resulting indices by the batch + std::vector id_ranges; + id_ranges.push_back(0); + id_ranges.push_back(idx_batcher.batch_start()); + + // in the case where the number of idx rows in the batch is < k, we + // want to adjust k. 
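+    // The current index batch's top-k lands in a merge-buffer partition; results
+    // from earlier index batches are folded in afterwards by merge_batches /
+    // knn_merge_parts, which also shifts the batch-local column ids by
+    // idx_batcher.batch_start() to turn them into global index row ids.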
+ value_idx n_neighbors = std::min(static_cast(k), batch_cols); + + bool ascending = cuvs::distance::is_min_close(metric); + + // kernel to slice first (min) k cols and copy into batched merge buffer + cuvs::selection::select_k( + handle, + raft::make_device_matrix_view(batch_dists, batch_rows, batch_cols), + raft::make_device_matrix_view( + batch_indices, batch_rows, batch_cols), + raft::make_device_matrix_view(out_dists, batch_rows, n_neighbors), + raft::make_device_matrix_view(out_indices, batch_rows, n_neighbors), + ascending, + true); + } + + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { + /** + * Compute distances + */ + cuvs::distance::detail::sparse::distances_config_t dist_config(handle); + dist_config.b_nrows = idx_batcher.batch_rows(); + dist_config.b_ncols = n_idx_cols; + dist_config.b_nnz = idx_batch_nnz; + + dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indices = idx_batch_indices; + dist_config.b_data = idx_batch_data; + + dist_config.a_nrows = query_batcher.batch_rows(); + dist_config.a_ncols = n_query_cols; + dist_config.a_nnz = query_batch_nnz; + + dist_config.a_indptr = query_batch_indptr; + dist_config.a_indices = query_batch_indices; + dist_config.a_data = query_batch_data; + + cuvs::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); + } + + const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; + value_idx* output_indices; + const value_t *idxData, *queryData; + value_t* output_dists; + + size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; + + cuvs::distance::DistanceType metric; + + float metricArg; + + int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; + + raft::resources const& handle; +}; + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp index a329db429..9b3da75a4 100644 --- a/cpp/src/neighbors/iface/iface.hpp +++ b/cpp/src/neighbors/iface/iface.hpp @@ -1,4 +1,20 @@ -#include +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once #include #include @@ -6,6 +22,9 @@ #include #include +#include +#include + namespace cuvs::neighbors { using namespace raft; @@ -16,7 +35,7 @@ void build(const raft::device_resources& handle, const cuvs::neighbors::index_params* index_params, raft::mdspan, row_major, Accessor> index_dataset) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = cuvs::neighbors::ivf_flat::build( @@ -32,8 +51,6 @@ void build(const raft::device_resources& handle, interface.index_.emplace(std::move(idx)); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -44,7 +61,7 @@ void extend( std::optional, layout_c_contiguous, Accessor2>> new_indices) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { auto idx = @@ -58,8 +75,6 @@ void extend( RAFT_FAIL("CAGRA does not implement the extend method"); } resource::sync_stream(handle); - - interface.mutex_->unlock(); } template @@ -70,7 +85,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view neighbors, raft::device_matrix_view distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { cuvs::neighbors::ivf_flat::search( handle, @@ -94,9 +109,7 @@ void search(const raft::device_resources& handle, neighbors, distances); } - resource::sync_stream(handle); - - // interface.mutex_->unlock(); + // resource::sync_stream(handle); } // for MG ANN only @@ -108,7 +121,7 @@ void search(const raft::device_resources& handle, raft::device_matrix_view d_neighbors, raft::device_matrix_view d_distances) { - // interface.mutex_->lock(); + // std::lock_guard(*interface.mutex_); int64_t n_rows = h_queries.extent(0); int64_t n_dims = h_queries.extent(1); @@ -120,8 +133,6 @@ void search(const raft::device_resources& handle, auto d_query_view = raft::make_const_mdspan(d_queries.view()); search(handle, interface, search_params, d_query_view, d_neighbors, d_distances); - - // interface.mutex_->unlock(); } template @@ -129,7 +140,7 @@ void serialize(const raft::device_resources& handle, const cuvs::neighbors::iface& interface, std::ostream& os) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::serialize(handle, os, interface.index_.value()); @@ -138,8 +149,6 @@ void serialize(const raft::device_resources& handle, } else if constexpr (std::is_same>::value) { cagra::serialize(handle, os, interface.index_.value(), true); } - - interface.mutex_->unlock(); } template @@ -147,7 +156,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, std::istream& is) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); if constexpr (std::is_same>::value) { ivf_flat::index idx(handle); @@ -162,8 +171,6 @@ void deserialize(const raft::device_resources& handle, cagra::deserialize(handle, is, &idx); interface.index_.emplace(std::move(idx)); } - - interface.mutex_->unlock(); } template @@ -171,7 +178,7 @@ void deserialize(const raft::device_resources& handle, cuvs::neighbors::iface& interface, const std::string& filename) { - interface.mutex_->lock(); + std::lock_guard(*interface.mutex_); std::ifstream is(filename, std::ios::in | std::ios::binary); if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } @@ -191,8 +198,6 @@ void deserialize(const raft::device_resources& handle, } is.close(); - - interface.mutex_->unlock(); } -}; // namespace 
cuvs::neighbors \ No newline at end of file +}; // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh index fb110d810..d6ffc1218 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -132,6 +132,10 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, { const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; if (i >= n_rows) { return; } + auto source_ix = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + // In the context of refinement, some indices may be invalid (the generating NN algorithm does + // not return enough valid items). Do not add the item to the index in this case. + if (source_ix == ivf::kInvalidRecord || source_ix == raft::upper_bound()) { return; } auto list_id = labels[i]; auto inlist_id = atomicAdd(list_sizes_ptr + list_id, 1); @@ -139,7 +143,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, auto* list_data = list_data_ptrs[list_id]; // Record the source vector id in the index - list_index[inlist_id] = source_ixs == nullptr ? i + batch_offset : source_ixs[i]; + list_index[inlist_id] = source_ix; // The data is written in interleaved groups of `index::kGroupSize` vectors using interleaved_group = raft::Pow2; @@ -151,7 +155,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, // Point to the source vector if constexpr (gather_src) { - source_vecs += source_ixs[i] * dim; + source_vecs += source_ix * dim; } else { source_vecs += i * dim; } diff --git a/cpp/src/neighbors/ivf_flat_c.cpp b/cpp/src/neighbors/ivf_flat_c.cpp old mode 100755 new mode 100644 index c14c1edc0..2acc6b678 --- a/cpp/src/neighbors/ivf_flat_c.cpp +++ b/cpp/src/neighbors/ivf_flat_c.cpp @@ -29,6 +29,8 @@ #include #include +#include + namespace { template diff --git a/cpp/src/neighbors/mg/mg.cuh b/cpp/src/neighbors/mg/mg.cuh index d3f635bc4..e9cdc30f6 100644 --- a/cpp/src/neighbors/mg/mg.cuh +++ b/cpp/src/neighbors/mg/mg.cuh @@ -25,6 +25,8 @@ #include #include +#include + namespace cuvs::neighbors { using namespace raft; diff --git a/cpp/src/neighbors/nn_descent.cuh b/cpp/src/neighbors/nn_descent.cuh index 582da72c1..ed91dac91 100644 --- a/cpp/src/neighbors/nn_descent.cuh +++ b/cpp/src/neighbors/nn_descent.cuh @@ -17,9 +17,14 @@ #pragma once #include "detail/nn_descent.cuh" +#include "detail/nn_descent_batch.cuh" + +#include +#include #include #include +#include #include namespace cuvs::neighbors::nn_descent { @@ -61,7 +66,15 @@ auto build(raft::resources const& res, index_params const& params, raft::device_matrix_view dataset) -> index { - return detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -100,7 +113,15 @@ void build(raft::resources const& res, raft::device_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @@ -135,7 +156,15 @@ auto build(raft::resources const& res, index_params const& params, raft::host_matrix_view dataset) -> index { - return 
detail::build(res, params, dataset); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + return detail::experimental::batch_build(res, params, dataset); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + return detail::build(res, params, dataset); + } } /** @@ -174,7 +203,15 @@ void build(raft::resources const& res, raft::host_matrix_view dataset, index& idx) { - detail::build(res, params, dataset, idx); + if (params.n_clusters > 1) { + if constexpr (std::is_same_v) { + detail::experimental::batch_build(res, params, dataset, idx); + } else { + RAFT_FAIL("Batched nn-descent is only supported for float precision"); + } + } else { + detail::build(res, params, dataset, idx); + } } /** @} */ // end group nn-descent diff --git a/cpp/src/neighbors/nn_descent_float.cu b/cpp/src/neighbors/nn_descent_float.cu index c6d356671..fa85db127 100644 --- a/cpp/src/neighbors/nn_descent_float.cu +++ b/cpp/src/neighbors/nn_descent_float.cu @@ -19,21 +19,38 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + }; \ + } \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(float, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_half.cu b/cpp/src/neighbors/nn_descent_half.cu index 587993031..2ee45d435 100644 --- a/cpp/src/neighbors/nn_descent_half.cu +++ b/cpp/src/neighbors/nn_descent_half.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - 
raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(half, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_index.cpp b/cpp/src/neighbors/nn_descent_index.cpp new file mode 100644 index 000000000..25d5b6af8 --- /dev/null +++ b/cpp/src/neighbors/nn_descent_index.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace cuvs::neighbors::nn_descent { + +index_params::index_params(size_t graph_degree, cuvs::distance::DistanceType metric) +{ + this->graph_degree = graph_degree; + this->intermediate_graph_degree = 1.5 * graph_degree; + this->metric = metric; +} +} // namespace cuvs::neighbors::nn_descent \ No newline at end of file diff --git a/cpp/src/neighbors/nn_descent_int8.cu b/cpp/src/neighbors/nn_descent_int8.cu index 813a01746..e150f511b 100644 --- a/cpp/src/neighbors/nn_descent_int8.cu +++ b/cpp/src/neighbors/nn_descent_int8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(int8_t, uint32_t); diff --git a/cpp/src/neighbors/nn_descent_uint8.cu b/cpp/src/neighbors/nn_descent_uint8.cu index 9d73dd90f..d8657777b 100644 --- a/cpp/src/neighbors/nn_descent_uint8.cu +++ b/cpp/src/neighbors/nn_descent_uint8.cu @@ -19,21 +19,39 @@ namespace cuvs::neighbors::nn_descent { -#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::device_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ - }; \ - \ - auto build(raft::resources const& handle, \ - const cuvs::neighbors::nn_descent::index_params& params, \ - raft::host_matrix_view dataset) \ - ->cuvs::neighbors::nn_descent::index \ - { \ - return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ +#define CUVS_INST_NN_DESCENT_BUILD(T, IdxT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::device_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if 
(!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ + }; \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::nn_descent::index_params& params, \ + raft::host_matrix_view dataset, \ + std::optional> graph) \ + ->cuvs::neighbors::nn_descent::index \ + { \ + if (!graph.has_value()) { \ + return cuvs::neighbors::nn_descent::build(handle, params, dataset); \ + } else { \ + std::optional> distances = \ + std::nullopt; \ + cuvs::neighbors::nn_descent::index idx{handle, graph.value(), distances}; \ + cuvs::neighbors::nn_descent::build(handle, params, dataset, idx); \ + return idx; \ + } \ }; CUVS_INST_NN_DESCENT_BUILD(uint8_t, uint32_t); diff --git a/cpp/src/neighbors/sparse_brute_force.cu b/cpp/src/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..e277961ec --- /dev/null +++ b/cpp/src/neighbors/sparse_brute_force.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "detail/sparse_knn.cuh" + +namespace cuvs::neighbors::brute_force { +template +sparse_index::sparse_index(raft::resources const& res, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + T metric_arg) + : dataset_(dataset), metric_(metric), metric_arg_(metric_arg) +{ +} + +auto build(raft::resources const& handle, + raft::device_csr_matrix_view dataset, + cuvs::distance::DistanceType metric, + float metric_arg) -> cuvs::neighbors::brute_force::sparse_index +{ + return sparse_index(handle, dataset, metric, metric_arg); +} + +void search(raft::resources const& handle, + const sparse_search_params& params, + const sparse_index& index, + raft::device_csr_matrix_view query, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + auto idx_structure = index.dataset().structure_view(); + auto query_structure = query.structure_view(); + int k = neighbors.extent(1); + + detail::sparse_knn_t(idx_structure.get_indptr().data(), + idx_structure.get_indices().data(), + index.dataset().get_elements().data(), + idx_structure.get_nnz(), + idx_structure.get_n_rows(), + idx_structure.get_n_cols(), + query_structure.get_indptr().data(), + query_structure.get_indices().data(), + query.get_elements().data(), + query_structure.get_nnz(), + query_structure.get_n_rows(), + query_structure.get_n_cols(), + neighbors.data_handle(), + distances.data_handle(), + k, + handle, + params.batch_size_index, + params.batch_size_query, + index.metric(), + index.metric_arg()) + .run(); +} +} // namespace cuvs::neighbors::brute_force diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 1ed8466b3..286d721d7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -94,7 +94,7 @@ 
endfunction() if(BUILD_TESTS) ConfigureTest( NAME NEIGHBORS_TEST PATH neighbors/brute_force.cu neighbors/brute_force_prefiltered.cu - neighbors/refine.cu GPUS 1 PERCENT 100 + neighbors/sparse_brute_force.cu neighbors/refine.cu GPUS 1 PERCENT 100 ) ConfigureTest( @@ -137,6 +137,7 @@ if(BUILD_TESTS) NAME NEIGHBORS_ANN_CAGRA_TEST PATH + neighbors/ann_cagra/bug_extreme_inputs_oob.cu neighbors/ann_cagra/bug_multi_cta_crash.cu neighbors/ann_cagra/test_float_uint32_t.cu neighbors/ann_cagra/test_half_uint32_t.cu @@ -205,6 +206,7 @@ if(BUILD_TESTS) distance/dist_lp_unexp.cu distance/dist_russell_rao.cu distance/masked_nn.cu + distance/sparse_distance.cu sparse/neighbors/cross_component_nn.cu GPUS 1 diff --git a/cpp/test/distance/sparse_distance.cu b/cpp/test/distance/sparse_distance.cu new file mode 100644 index 000000000..f95487414 --- /dev/null +++ b/cpp/test/distance/sparse_distance.cu @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseDistanceInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + + cuvs::distance::DistanceType metric; + + float metric_arg = 0.0; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) +{ + return os; +} + +template +class SparseDistanceTest + : public ::testing::TestWithParam> { + public: + SparseDistanceTest() + : params(::testing::TestWithParam>::GetParam()), + indptr(0, resource::get_cuda_stream(handle)), + indices(0, resource::get_cuda_stream(handle)), + data(0, resource::get_cuda_stream(handle)), + out_dists(0, resource::get_cuda_stream(handle)), + out_dists_ref(0, resource::get_cuda_stream(handle)) + { + } + + void SetUp() override + { + make_data(); + + int out_size = static_cast(params.indptr_h.size() - 1) * + static_cast(params.indptr_h.size() - 1); + + out_dists.resize(out_size, resource::get_cuda_stream(handle)); + + auto out = raft::make_device_matrix_view( + out_dists.data(), + static_cast(params.indptr_h.size() - 1), + static_cast(params.indptr_h.size() - 1)); + + auto x_structure = raft::make_device_compressed_structure_view( + indptr.data(), + indices.data(), + static_cast(params.indptr_h.size() - 1), + params.n_cols, + static_cast(params.indices_h.size())); + auto x = raft::make_device_csr_matrix_view(data.data(), x_structure); + + cuvs::distance::pairwise_distance(handle, x, x, out, params.metric, params.metric_arg); + + RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); + } + + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref.data(), + out_dists.data(), + params.out_dists_ref_h.size(), + CompareApprox(1e-3))); + } + + protected: + void make_data() + { + std::vector indptr_h = 
params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + auto stream = resource::get_cuda_stream(handle); + indptr.resize(indptr_h.size(), stream); + indices.resize(indices_h.size(), stream); + data.resize(data_h.size(), stream); + + update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); + update_device(indices.data(), indices_h.data(), indices_h.size(), stream); + update_device(data.data(), data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + + out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream); + + update_device(out_dists_ref.data(), + out_dists_ref_h.data(), + out_dists_ref_h.size(), + resource::get_cuda_stream(handle)); + } + + raft::resources handle; + + // input data + rmm::device_uvector indptr, indices; + rmm::device_uvector data; + + // output data + rmm::device_uvector out_dists, out_dists_ref; + + SparseDistanceInputs params; +}; + +const std::vector> inputs_i32_f = { + {5, + {0, 0, 1, 2}, + + {1, 2}, + {0.5, 0.5}, + {0, 1, 1, 1, 0, 1, 1, 1, 0}, + cuvs::distance::DistanceType::CosineExpanded, + 0.0}, + {5, + {0, 0, 1, 2}, + + {1, 2}, + {1.0, 1.0}, + {0, 1, 1, 1, 0, 1, 1, 1, 0}, + cuvs::distance::DistanceType::JaccardExpanded, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Expanded, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, + cuvs::distance::DistanceType::InnerProduct, + 0.0}, + {2, + {0, 2, 4, 6, 8}, + {0, 1, 0, 1, 0, 1, 0, 1}, // indices + {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, + { + // dense output + 0.0, + 4.0, + 3026.0, + 226.0, + 4.0, + 0.0, + 2930.0, + 234.0, + 3026.0, + 2930.0, + 0.0, + 1832.0, + 226.0, + 234.0, + 1832.0, + 0.0, + }, + cuvs::distance::DistanceType::L2Unexpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 
0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + 0.61547536, 0.68185144, 1., 0.}, + cuvs::distance::DistanceType::CosineExpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {0.0, + 0.42857142857142855, + 0.7142857142857143, + 0.75, + 0.2857142857142857, + 0.75, + 0.7142857142857143, + 0.5, + 1.0, + 0.6666666666666666, + 0.42857142857142855, + 0.0, + 0.75, + 0.625, + 0.375, + 0.42857142857142855, + 0.75, + 0.375, + 0.75, + 0.7142857142857143, + 0.7142857142857143, + 0.75, + 0.0, + 0.7142857142857143, + 0.42857142857142855, + 0.7142857142857143, + 0.6666666666666666, + 0.625, + 0.6666666666666666, + 1.0, + 0.75, + 0.625, + 0.7142857142857143, + 0.0, + 0.5, + 0.5714285714285714, + 1.0, + 0.8, + 0.5, + 0.6666666666666666, + 0.2857142857142857, + 0.375, + 0.42857142857142855, + 0.5, + 0.0, + 0.6666666666666666, + 0.7777777777777778, + 0.4444444444444444, + 0.7777777777777778, + 0.75, + 0.75, + 0.42857142857142855, + 0.7142857142857143, + 0.5714285714285714, + 0.6666666666666666, + 0.0, + 0.7142857142857143, + 0.5, + 0.5, + 0.8571428571428571, + 0.7142857142857143, + 0.75, + 0.6666666666666666, + 1.0, + 0.7777777777777778, + 0.7142857142857143, + 0.0, + 0.42857142857142855, + 0.8571428571428571, + 0.8333333333333334, + 0.5, + 0.375, + 0.625, + 0.8, + 0.4444444444444444, + 0.5, + 0.42857142857142855, + 0.0, + 0.7777777777777778, + 0.75, + 1.0, + 0.75, + 0.6666666666666666, + 0.5, + 0.7777777777777778, + 0.5, + 0.8571428571428571, + 0.7777777777777778, + 0.0, + 1.0, + 0.6666666666666666, + 0.7142857142857143, + 1.0, + 0.6666666666666666, + 0.75, + 0.8571428571428571, + 0.8333333333333334, + 0.75, + 1.0, + 0.0}, + cuvs::distance::DistanceType::JaccardExpanded, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 
5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + cuvs::distance::DistanceType::Canberra, + 0.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 
0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + cuvs::distance::DistanceType::LpUnexpanded, + 2.0}, + + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + cuvs::distance::DistanceType::Linf, + 0.0}, + + {15, + {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 
0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, + {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, + 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, + 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, + 1.00000000e+00, 7.33151506e-01, 1.00000000e+00, 9.86880955e-01, + 9.19154851e-01, 5.38849774e-01, 1.00000000e+00, 8.98332369e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 8.03303970e-01, + 6.64465915e-01, 8.69374690e-01, 1.00000000e+00, 1.00000000e+00, + 1.00000000e+00, 1.00000000e+00, 9.24116813e-01, 7.33151506e-01, + 8.03303970e-01, 0.00000000e+00, 8.16225843e-01, 9.39818306e-01, + 7.27700415e-01, 7.30155528e-01, 8.89451011e-01, 8.05419635e-01, + 9.90039274e-01, 1.00000000e+00, 6.64465915e-01, 8.16225843e-01, + 0.00000000e+00, 6.38804490e-01, 1.00000000e+00, 1.00000000e+00, + 9.52559809e-01, 9.53789212e-01, 7.97613546e-01, 9.86880955e-01, + 8.69374690e-01, 9.39818306e-01, 6.38804490e-01, 0.0, + 1.00000000e+00, 9.72569112e-01, 8.24907516e-01, 8.07933016e-01, + 8.91271059e-01, 9.19154851e-01, 1.00000000e+00, 7.27700415e-01, + 1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 7.63596268e-01, + 8.40131263e-01, 7.40428532e-01, 1.00000000e+00, 5.38849774e-01, + 1.00000000e+00, 7.30155528e-01, 1.00000000e+00, 9.72569112e-01, + 7.63596268e-01, 0.00000000e+00, 1.00000000e+00, 7.95485011e-01, + 6.64669302e-01, 1.00000000e+00, 1.00000000e+00, 8.89451011e-01, + 9.52559809e-01, 8.24907516e-01, 8.40131263e-01, 1.00000000e+00, + 0.00000000e+00, 8.51370877e-01, 8.59439512e-01, 8.98332369e-01, + 1.00000000e+00, 8.05419635e-01, 9.53789212e-01, 8.07933016e-01, + 7.40428532e-01, 7.95485011e-01, 8.51370877e-01, 1.49011612e-08}, + // Dataset is L1 normalized into pdfs + cuvs::distance::DistanceType::HellingerExpanded, + 0.0}, + + {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + cuvs::distance::DistanceType::L1, + 0.0}, + {5, + {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, + {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, + 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, + {// dense output + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 
1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 1.88812175, 1.92660889, 0.24992619, 0.}, + cuvs::distance::DistanceType::CorrelationExpanded, + 0.0}, + {5, + {0, 1, 2, 4, 4, 5, 6, 7, 9, 9, 10}, + {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + cuvs::distance::DistanceType::RusselRaoExpanded, + 0.0}, + {5, + {0, 1, 1, 3, 3, 4, 4, 6, 9, 10, 10}, + {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, + {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, + {// dense output + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + cuvs::distance::DistanceType::HammingUnexpanded, + 0.0}, + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 0.83255, 0.83255, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {2, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 0.5, 0.5}, + {0, 0.4645014, 0.4645014, 0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + {3, + {0, 1, 2}, + {0, 0}, + {1.0, 1.0}, + {0.0, 0.0, 0.0, 0.0}, + cuvs::distance::DistanceType::JensenShannon, + 0.0}, + + {3, + {0, 1, 2}, + {0, 1}, + {1.0, 1.0}, + {0.0, 1.0, 1.0, 0.0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + {3, + {0, 1, 3}, + {0, 0, 1}, + {1.0, 1.0, 1.0}, + {0, 0.333333, 0.333333, 0}, + cuvs::distance::DistanceType::DiceExpanded, + 0.0}, + +}; + +typedef SparseDistanceTest SparseDistanceTestF; +TEST_P(SparseDistanceTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, + ::testing::ValuesIn(inputs_i32_f)); + +} // end namespace distance +} // end namespace cuvs diff --git a/cpp/test/neighbors/ann_brute_force.cuh b/cpp/test/neighbors/ann_brute_force.cuh index c2afa4e8b..03d6e820c 100644 --- a/cpp/test/neighbors/ann_brute_force.cuh +++ b/cpp/test/neighbors/ann_brute_force.cuh @@ -114,12 +114,28 @@ class AnnBruteForceTest : public ::testing::TestWithParam(handle_); + brute_force::deserialize(handle_, std::string{"brute_force_index"}, &index_loaded); + 
brute_force::search(handle_, - idx, + index_loaded, search_queries_view, indices_out_view, dists_out_view, cuvs::neighbors::filtering::none_sample_filter{}); + raft::resource::sync_stream(handle_); + + ASSERT_TRUE(cuvs::neighbors::devArrMatchKnnPair(indices_naive_dev.data(), + indices_bruteforce_dev.data(), + distances_naive_dev.data(), + distances_bruteforce_dev.data(), + ps.num_queries, + ps.k, + 0.001f, + stream_, + true)); } } diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 37d42dd1d..660246c67 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -361,8 +361,8 @@ class AnnCagraTest : public ::testing::TestWithParam { // not used for knn_graph building. switch (ps.build_algo) { case graph_build_algo::IVF_PQ: - index_params.graph_build_params = - graph_build_params::ivf_pq_params(raft::matrix_extent(ps.n_rows, ps.dim)); + index_params.graph_build_params = graph_build_params::ivf_pq_params( + raft::matrix_extent(ps.n_rows, ps.dim), index_params.metric); if (ps.ivf_pq_search_refine_ratio) { std::get( index_params.graph_build_params) @@ -370,8 +370,8 @@ class AnnCagraTest : public ::testing::TestWithParam { } break; case graph_build_algo::NN_DESCENT: { - index_params.graph_build_params = - graph_build_params::nn_descent_params(index_params.intermediate_graph_degree); + index_params.graph_build_params = graph_build_params::nn_descent_params( + index_params.intermediate_graph_degree, index_params.metric); break; } case graph_build_algo::AUTO: @@ -389,7 +389,7 @@ class AnnCagraTest : public ::testing::TestWithParam { (const DataT*)database.data(), ps.n_rows, ps.dim); { - cagra::index index(handle_); + cagra::index index(handle_, index_params.metric); if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); diff --git a/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu new file mode 100644 index 000000000..e21a54e9e --- /dev/null +++ b/cpp/test/neighbors/ann_cagra/bug_extreme_inputs_oob.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +namespace cuvs::neighbors::cagra { + +class cagra_extreme_inputs_oob_test : public ::testing::Test { + public: + using data_type = float; + + protected: + void run() + { + cagra::index_params ix_ps; + graph_build_params::ivf_pq_params gb_params{}; + gb_params.refinement_rate = 2; + ix_ps.graph_build_params = gb_params; + ix_ps.graph_degree = 64; + ix_ps.intermediate_graph_degree = 128; + + [[maybe_unused]] auto ix = cagra::build(res, ix_ps, raft::make_const_mdspan(dataset->view())); + raft::resource::sync_stream(res); + } + + void SetUp() override + { + dataset.emplace(raft::make_device_matrix(res, n_samples, n_dim)); + raft::random::RngState r(1234ULL); + raft::random::normal( + res, r, dataset->data_handle(), n_samples * n_dim, data_type(0), data_type(1e20)); + raft::resource::sync_stream(res); + } + + void TearDown() override + { + dataset.reset(); + raft::resource::sync_stream(res); + } + + private: + raft::resources res; + std::optional> dataset = std::nullopt; + + constexpr static int64_t n_samples = 100000; + constexpr static int64_t n_dim = 200; + constexpr static cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; +}; + +TEST_F(cagra_extreme_inputs_oob_test, cagra_extreme_inputs_oob_test) { this->run(); } + +} // namespace cuvs::neighbors::cagra diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 8cc46b2f7..23d84ca98 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index bce0f9899..09861a219 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -18,11 +18,16 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include #include + #include +#include #include +#include #include "naive_knn.cuh" +#include #include @@ -42,6 +47,15 @@ struct AnnNNDescentInputs { double min_recall; }; +struct AnnNNDescentBatchInputs { + std::pair recall_cluster; + int n_rows; + int dim; + int graph_degree; + cuvs::distance::DistanceType metric; + bool host_dataset; +}; + inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& p) { os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree @@ -50,6 +64,14 @@ inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentInputs& return os; } +inline ::std::ostream& operator<<(::std::ostream& os, const AnnNNDescentBatchInputs& p) +{ + os << "dataset shape=" << p.n_rows << "x" << p.dim << ", graph_degree=" << p.graph_degree + << ", metric=" << static_cast(p.metric) << (p.host_dataset ? 
", host" : ", device") + << ", clusters=" << p.recall_cluster.second << std::endl; + return os; +} + template class AnnNNDescentTest : public ::testing::TestWithParam { public: @@ -65,7 +87,9 @@ class AnnNNDescentTest : public ::testing::TestWithParam { { size_t queries_size = ps.n_rows * ps.graph_degree; std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); { rmm::device_uvector distances_naive_dev(queries_size, stream_); @@ -81,16 +105,17 @@ class AnnNNDescentTest : public ::testing::TestWithParam { ps.graph_degree, ps.metric); raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); } - { { - cuvs::neighbors::nn_descent::index_params index_params; + nn_descent::index_params index_params; index_params.metric = ps.metric; index_params.graph_degree = ps.graph_degree; index_params.intermediate_graph_degree = 2 * ps.graph_degree; index_params.max_iterations = 100; + index_params.return_distances = true; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -99,24 +124,171 @@ class AnnNNDescentTest : public ::testing::TestWithParam { if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + raft::resource::sync_stream(handle_); auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); - auto index = - cuvs::neighbors::nn_descent::build(handle_, index_params, database_host_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + } else { - auto index = cuvs::neighbors::nn_descent::build(handle_, index_params, database_view); - raft::update_host( + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } }; } raft::resource::sync_stream(handle_); } + if (ps.metric == cuvs::distance::DistanceType::InnerProduct) { + std::transform( + distances_naive.begin(), distances_naive.end(), distances_naive.begin(), [](auto x) { + return -x; + }); + } + double min_recall = ps.min_recall; - EXPECT_TRUE(eval_recall( - indices_naive, indices_NNDescent, ps.n_rows, ps.graph_degree, 0.001, min_recall)); + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.001, + min_recall)); + } + } + + void SetUp() override + { + database.resize(((size_t)ps.n_rows) * ps.dim, stream_); + raft::random::RngState r(1234ULL); + if constexpr (std::is_same{}) { + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + } else if constexpr (std::is_same{}) { + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(-5), 
DataT(5)); + } else { + raft::random::uniformInt(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0), DataT(5)); + } + raft::resource::sync_stream(handle_); + } + + void TearDown() override + { + raft::resource::sync_stream(handle_); + database.resize(0, stream_); + } + + private: + raft::resources handle_; + rmm::cuda_stream_view stream_; + AnnNNDescentInputs ps; + rmm::device_uvector database; +}; + +template +class AnnNNDescentBatchTest : public ::testing::TestWithParam { + public: + AnnNNDescentBatchTest() + : stream_(raft::resource::get_cuda_stream(handle_)), + ps(::testing::TestWithParam::GetParam()), + database(0, stream_) + { + } + + void testNNDescentBatch() + { + size_t queries_size = ps.n_rows * ps.graph_degree; + std::vector indices_NNDescent(queries_size); + std::vector distances_NNDescent(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + database.data(), + database.data(), + ps.n_rows, + ps.n_rows, + ps.dim, + ps.graph_degree, + ps.metric); + raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + { + { + nn_descent::index_params index_params; + index_params.metric = ps.metric; + index_params.graph_degree = ps.graph_degree; + index_params.intermediate_graph_degree = 2 * ps.graph_degree; + index_params.max_iterations = 10; + index_params.return_distances = true; + index_params.n_clusters = ps.recall_cluster.second; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.n_rows, ps.dim); + + { + if (ps.host_dataset) { + auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); + raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + auto database_host_view = raft::make_host_matrix_view( + (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); + auto index = nn_descent::build(handle_, index_params, database_host_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + + } else { + auto index = nn_descent::build(handle_, index_params, database_view); + raft::copy( + indices_NNDescent.data(), index.graph().data_handle(), queries_size, stream_); + if (index.distances().has_value()) { + raft::copy(distances_NNDescent.data(), + index.distances().value().data_handle(), + queries_size, + stream_); + } + }; + } + raft::resource::sync_stream(handle_); + } + double min_recall = ps.recall_cluster.first; + EXPECT_TRUE(eval_neighbours(indices_naive, + indices_NNDescent, + distances_naive, + distances_NNDescent, + ps.n_rows, + ps.graph_degree, + 0.01, + min_recall, + true, + static_cast(ps.graph_degree * 0.1))); } } @@ -142,16 +314,29 @@ class AnnNNDescentTest : public ::testing::TestWithParam { private: raft::resources handle_; rmm::cuda_stream_view stream_; - AnnNNDescentInputs ps; + AnnNNDescentBatchInputs ps; rmm::device_uvector database; }; -const std::vector inputs = raft::util::itertools::product( - {1000, 2000}, // n_rows - {3, 5, 7, 8, 17, 64, 128, 137, 192, 
256, 512, 619, 1024}, // dim - {32, 64}, // graph_degree - {cuvs::distance::DistanceType::L2Expanded}, - {false, true}, - {0.90}); +const std::vector inputs = + raft::util::itertools::product({2000, 4000}, // n_rows + {4, 16, 64, 256, 1024}, // dim + {32, 64}, // graph_degree + {cuvs::distance::DistanceType::L2Expanded, + cuvs::distance::DistanceType::InnerProduct, + cuvs::distance::DistanceType::CosineExpanded}, + {false, true}, + {0.90}); + +// TODO: Investigate why this test is failing. Reference issue: +// https://github.com/rapidsai/raft/issues/2450 +const std::vector inputsBatch = + raft::util::itertools::product( + {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters + {4000, 5000}, // n_rows + {192, 512}, // dim + {32, 64}, // graph_degree + {cuvs::distance::DistanceType::L2Expanded}, + {false, true}); -} // namespace cuvs::neighbors::nn_descent +} // namespace cuvs::neighbors::nn_descent diff --git a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu index 64c0e0291..7a24f96a1 100644 --- a/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu +++ b/cpp/test/neighbors/ann_nn_descent/test_float_uint32_t.cu @@ -23,6 +23,12 @@ namespace cuvs::neighbors::nn_descent { typedef AnnNNDescentTest AnnNNDescentTestF_U32; TEST_P(AnnNNDescentTestF_U32, AnnNNDescent) { this->testNNDescent(); } +// typedef AnnNNDescentBatchTest AnnNNDescentBatchTestF_U32; +// TEST_P(AnnNNDescentBatchTestF_U32, AnnNNDescentBatch) { this->testNNDescentBatch(); } + INSTANTIATE_TEST_CASE_P(AnnNNDescentTest, AnnNNDescentTestF_U32, ::testing::ValuesIn(inputs)); +// INSTANTIATE_TEST_CASE_P(AnnNNDescentBatchTest, +// AnnNNDescentBatchTestF_U32, +// ::testing::ValuesIn(inputsBatch)); } // namespace cuvs::neighbors::nn_descent diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index b08e1d725..94bccade2 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include // raft::make_device_matrix #include @@ -165,9 +166,14 @@ auto calc_recall(const std::vector& expected_idx, /** check uniqueness of indices */ template -auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t cols) +auto check_unique_indices(const std::vector& actual_idx, + size_t rows, + size_t cols, + size_t max_duplicates = 0) { size_t max_count; + size_t dup_count = 0lu; + + std::set unique_indices; for (size_t i = 0; i < rows; ++i) { unique_indices.clear(); @@ -180,8 +186,11 @@ auto check_unique_indices(const std::vector& actual_idx, size_t rows, size_t } else if (unique_indices.find(act_idx) == unique_indices.end()) { unique_indices.insert(act_idx); } else { - return testing::AssertionFailure() - << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! "; + dup_count++; + if (dup_count > max_duplicates) { + return testing::AssertionFailure() + << "Duplicated index " << act_idx << " at k " << k << " for query " << i << "! 
"; + } } } } @@ -264,7 +273,8 @@ auto eval_neighbours(const std::vector& expected_idx, size_t cols, double eps, double min_recall, - bool test_unique = true) -> testing::AssertionResult + bool test_unique = true, + size_t max_duplicates = 0) -> testing::AssertionResult { auto [actual_recall, match_count, total_count] = calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); @@ -284,7 +294,7 @@ auto eval_neighbours(const std::vector& expected_idx, << min_recall << "); eps = " << eps << ". "; } if (test_unique) - return check_unique_indices(actual_idx, rows, cols); + return check_unique_indices(actual_idx, rows, cols, max_duplicates); else return testing::AssertionSuccess(); } diff --git a/cpp/test/neighbors/sparse_brute_force.cu b/cpp/test/neighbors/sparse_brute_force.cu new file mode 100644 index 000000000..cb68989d4 --- /dev/null +++ b/cpp/test/neighbors/sparse_brute_force.cu @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" + +#include +#include +#include + +#include +#include + +namespace cuvs { +namespace neighbors { + +using namespace raft; +using namespace raft::sparse; + +template +struct SparseKNNInputs { + value_idx n_cols; + + std::vector indptr_h; + std::vector indices_h; + std::vector data_h; + + std::vector out_dists_ref_h; + std::vector out_indices_ref_h; + + int k; + + int batch_size_index = 2; + int batch_size_query = 2; + + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2SqrtExpanded; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ + return os; +} + +template +class SparseKNNTest : public ::testing::TestWithParam> { + public: + SparseKNNTest() + : params(::testing::TestWithParam>::GetParam()), + indptr(0, resource::get_cuda_stream(handle)), + indices(0, resource::get_cuda_stream(handle)), + data(0, resource::get_cuda_stream(handle)), + out_indices(0, resource::get_cuda_stream(handle)), + out_dists(0, resource::get_cuda_stream(handle)), + out_indices_ref(0, resource::get_cuda_stream(handle)), + out_dists_ref(0, resource::get_cuda_stream(handle)) + { + } + + protected: + void SetUp() override + { + n_rows = params.indptr_h.size() - 1; + nnz = params.indices_h.size(); + k = params.k; + + make_data(); + + auto index_structure = + raft::make_device_compressed_structure_view( + indptr.data(), indices.data(), n_rows, params.n_cols, nnz); + auto index_csr = raft::make_device_csr_matrix_view(data.data(), index_structure); + + auto index = cuvs::neighbors::brute_force::build(handle, index_csr, params.metric); + + cuvs::neighbors::brute_force::sparse_search_params search_params; + search_params.batch_size_index = params.batch_size_index; + search_params.batch_size_query = params.batch_size_query; + + cuvs::neighbors::brute_force::search( + handle, + search_params, + index, + index_csr, + raft::make_device_matrix_view(out_indices.data(), n_rows, k), + 
raft::make_device_matrix_view(out_dists.data(), n_rows, k)); + + RAFT_CUDA_TRY(cudaStreamSynchronize(resource::get_cuda_stream(handle))); + } + + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); + } + + protected: + void make_data() + { + std::vector indptr_h = params.indptr_h; + std::vector indices_h = params.indices_h; + std::vector data_h = params.data_h; + + auto stream = resource::get_cuda_stream(handle); + indptr.resize(indptr_h.size(), stream); + indices.resize(indices_h.size(), stream); + data.resize(data_h.size(), stream); + + update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); + update_device(indices.data(), indices_h.data(), indices_h.size(), stream); + update_device(data.data(), data_h.data(), data_h.size(), stream); + + std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_indices_ref_h = params.out_indices_ref_h; + + out_indices_ref.resize(out_indices_ref_h.size(), stream); + out_dists_ref.resize(out_dists_ref_h.size(), stream); + + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); + + out_dists.resize(n_rows * k, stream); + out_indices.resize(n_rows * k, stream); + } + + raft::resources handle; + + int n_rows, nnz, k; + + // input data + rmm::device_uvector indptr, indices; + rmm::device_uvector data; + + // output data + rmm::device_uvector out_indices; + rmm::device_uvector out_dists; + + rmm::device_uvector out_indices_ref; + rmm::device_uvector out_dists_ref; + + SparseKNNInputs params; +}; + +const std::vector> inputs_i32_f = { + {9, // ncols + {0, 2, 4, 6, 8}, // indptr + {0, 4, 0, 3, 0, 2, 0, 8}, // indices + {0.0f, 1.0f, 5.0f, 6.0f, 5.0f, 6.0f, 0.0f, 1.0f}, // data + {0, 1.41421, 0, 7.87401, 0, 7.87401, 0, 1.41421}, // dists + {0, 3, 1, 0, 2, 0, 3, 0}, // inds + 2, + 2, + 2, + cuvs::distance::DistanceType::L2SqrtExpanded}}; +typedef SparseKNNTest SparseKNNTestF; +TEST_P(SparseKNNTestF, Result) { compare(); } +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); + +}; // end namespace neighbors +}; // end namespace cuvs diff --git a/dependencies.yaml b/dependencies.yaml index 48f023ccc..f60ed59d3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -213,11 +213,11 @@ dependencies: - matrix: cuda: "12.*" packages: - - &cuda_python12 cuda-python>=12.0,<13.0a0 + - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: cuda: "11.*" packages: - - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3 - matrix: packages: - &cuda_python cuda-python diff --git a/docs/source/c_api/neighbors_bruteforce_c.rst b/docs/source/c_api/neighbors_bruteforce_c.rst index af0356eee..a12175209 100644 --- a/docs/source/c_api/neighbors_bruteforce_c.rst +++ b/docs/source/c_api/neighbors_bruteforce_c.rst @@ -32,3 +32,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. 
doxygengroup:: bruteforce_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst index 4d83cd3e3..988e5b6f3 100644 --- a/docs/source/c_api/neighbors_hnsw_c.rst +++ b/docs/source/c_api/neighbors_hnsw_c.rst @@ -29,13 +29,13 @@ Index Index search ------------ -.. doxygengroup:: cagra_c_index_search +.. doxygengroup:: hnsw_c_index_search :project: cuvs :members: :content-only: Index serialize ------------- +--------------- .. doxygengroup:: hnsw_c_index_serialize :project: cuvs diff --git a/docs/source/c_api/neighbors_ivf_flat_c.rst b/docs/source/c_api/neighbors_ivf_flat_c.rst index 9e1ccc0d1..1254d70ef 100644 --- a/docs/source/c_api/neighbors_ivf_flat_c.rst +++ b/docs/source/c_api/neighbors_ivf_flat_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_flat_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/c_api/neighbors_ivf_pq_c.rst b/docs/source/c_api/neighbors_ivf_pq_c.rst index 070719609..260057b8c 100644 --- a/docs/source/c_api/neighbors_ivf_pq_c.rst +++ b/docs/source/c_api/neighbors_ivf_pq_c.rst @@ -48,3 +48,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_pq_c_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_bruteforce.rst b/docs/source/cpp_api/neighbors_bruteforce.rst index 3adcb01c5..f75e26b3c 100644 --- a/docs/source/cpp_api/neighbors_bruteforce.rst +++ b/docs/source/cpp_api/neighbors_bruteforce.rst @@ -34,3 +34,11 @@ Index search :project: cuvs :members: :content-only: + +Index serialize +--------------- + +.. doxygengroup:: bruteforce_cpp_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/docs/source/index.rst b/docs/source/index.rst index 647061ae5..286836c18 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,19 +1,8 @@ cuVS: Vector Search and Clustering on the GPU ============================================= - Welcome to cuVS, the premier library for GPU-accelerated vector search and clustering! cuVS provides several core building blocks for constructing new algorithms, as well as end-to-end vector search and clustering algorithms for use either standalone or through a growing list of :doc:`integrations `. -There are several benefits to using cuVS and GPUs for vector search, including - -#. Fast index build -#. Latency critical and high throughput search -#. Parameter tuning -#. Cost savings -#. Interoperability (build on GPU, deploy on CPU) -#. Multiple language support -#. Building blocks for composing new or accelerating existing algorithms - Useful Resources ################ @@ -26,6 +15,67 @@ Useful Resources - `Issue tracker `_: Report issues or request features. + +What is cuVS? +############# + +cuVS contains state-of-the-art implementations of several algorithms for running approximate and exact nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering. 
+ +Vector search is an information retrieval method that has been growing in popularity over the past few years, partly because of the rising importance of multimedia embeddings created from unstructured data and the need to perform semantic search on the embeddings to find items which are semantically similar to each other. + +Vector search is also used in *data mining and machine learning* tasks and comprises an important step in many *clustering* and *visualization* algorithms like `UMAP `_, `t-SNE `_, K-means, and `HDBSCAN `_. + +Finally, faster vector search enables interactions between dense vectors and graphs. Converting a pile of dense vectors into nearest neighbors graphs unlocks the entire world of graph analysis algorithms, such as those found in `GraphBLAS `_ and `cuGraph `_. + +Below are some common use-cases for vector search + +Semantic search +~~~~~~~~~~~~~~~ +- Generative AI & Retrieval augmented generation (RAG) +- Recommender systems +- Computer vision +- Image search +- Text search +- Audio search +- Molecular search +- Model training + + +Data mining +~~~~~~~~~~~ +- Clustering algorithms +- Visualization algorithms +- Sampling algorithms +- Class balancing +- Ensemble methods +- k-NN graph construction + +Why cuVS? +######### + +There are several benefits to using cuVS and GPUs for vector search, including + +1. Fast index build +2. Latency critical and high throughput search +3. Parameter tuning +4. Cost savings +5. Interoperability (build on GPU, deploy on CPU) +6. Multiple language support +7. Building blocks for composing new or accelerating existing algorithms + +In addition to the items above, cuVS shoulders the responsibility of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a delightful development experience, guaranteeing that any libraries, databases, or applications built on top of it will always be receiving the best performance and scale. + +cuVS Technology Stack +##################### + +cuVS is built on top of the RAPIDS RAFT library of high performance machine learning primitives and provides all the necessary routines for vector search and clustering on the GPU. + +.. image:: ../../img/tech_stack.png + :width: 600 + :alt: cuVS is built on top of low-level CUDA libraries and provides many important routines that enable vector search and clustering on the GPU + + + + Contents ######## diff --git a/docs/source/python_api/neighbors_brute_force.rst b/docs/source/python_api/neighbors_brute_force.rst index 5fdc3658f..d756a6c80 100644 --- a/docs/source/python_api/neighbors_brute_force.rst +++ b/docs/source/python_api/neighbors_brute_force.rst @@ -20,3 +20,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.brute_force.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.brute_force.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.brute_force.load diff --git a/docs/source/python_api/neighbors_cagra.rst b/docs/source/python_api/neighbors_cagra.rst index 09b2e2694..e7155efb8 100644 --- a/docs/source/python_api/neighbors_cagra.rst +++ b/docs/source/python_api/neighbors_cagra.rst @@ -34,3 +34,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.cagra.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.cagra.save + +Index load +########## + +..
autofunction:: cuvs.neighbors.cagra.load diff --git a/docs/source/python_api/neighbors_hnsw.rst b/docs/source/python_api/neighbors_hnsw.rst index 9922805b3..64fe5493b 100644 --- a/docs/source/python_api/neighbors_hnsw.rst +++ b/docs/source/python_api/neighbors_hnsw.rst @@ -28,3 +28,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.hnsw.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.hnsw.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.hnsw.load diff --git a/docs/source/python_api/neighbors_ivf_flat.rst b/docs/source/python_api/neighbors_ivf_flat.rst index 5514e5e43..f2c21e68a 100644 --- a/docs/source/python_api/neighbors_ivf_flat.rst +++ b/docs/source/python_api/neighbors_ivf_flat.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_flat.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.ivf_flat.load diff --git a/docs/source/python_api/neighbors_ivf_pq.rst b/docs/source/python_api/neighbors_ivf_pq.rst index e3625ba67..57668fbc3 100644 --- a/docs/source/python_api/neighbors_ivf_pq.rst +++ b/docs/source/python_api/neighbors_ivf_pq.rst @@ -32,3 +32,13 @@ Index search ############ .. autofunction:: cuvs.neighbors.ivf_pq.search + +Index save +########## + +.. autofunction:: cuvs.neighbors.ivf_pq.save + +Index load +########## + +.. autofunction:: cuvs.neighbors.ivf_pq.load diff --git a/examples/cpp/src/common.cuh b/examples/cpp/src/common.cuh index 1c93dec0e..8e109a764 100644 --- a/examples/cpp/src/common.cuh +++ b/examples/cpp/src/common.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -28,6 +30,8 @@ #include #include +#include + // Fill dataset and queries with synthetic data. void generate_dataset(raft::device_resources const &dev_resources, raft::device_matrix_view dataset, diff --git a/img/tech_stack.png b/img/tech_stack.png new file mode 100644 index 000000000..2b3eeedba Binary files /dev/null and b/img/tech_stack.png differ diff --git a/python/cuvs/cuvs/neighbors/brute_force/__init__.py b/python/cuvs/cuvs/neighbors/brute_force/__init__.py index b88c4b464..6aa0e4bb2 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/__init__.py +++ b/python/cuvs/cuvs/neighbors/brute_force/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. 
-from .brute_force import Index, build, search +from .brute_force import Index, build, load, save, search -__all__ = ["Index", "build", "search"] +__all__ = ["Index", "build", "search", "save", "load"] diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd index 183827916..f1fc14ba7 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pxd @@ -47,3 +47,11 @@ cdef extern from "cuvs/neighbors/brute_force.h" nogil: DLManagedTensor* neighbors, DLManagedTensor* distances, cuvsFilter filter) except + + + cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + + + cuvsError_t cuvsBruteForceDeserialize(cuvsResources_t res, + const char * filename, + cuvsBruteForceIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx index 559302ccc..9d43bfb29 100644 --- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx +++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx @@ -24,6 +24,7 @@ from cuvs.common.resources import auto_sync_resources from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t from libcpp cimport bool +from libcpp.string cimport string from cuvs.common cimport cydlpack from cuvs.distance_type cimport cuvsDistanceType @@ -31,9 +32,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from cuvs.common.c_api cimport cuvsResources_t @@ -256,3 +257,88 @@ def search(Index index, )) return (distances, neighbors) + + +@auto_sync_resources +def save(filename, Index index, bool include_dataset=True, resources=None): + """ + Saves the index to a file. + + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + Parameters + ---------- + filename : string + Name of the file. + index : Index + Trained Brute Force index. + {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef string c_filename = filename.encode('utf-8') + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cuvsBruteForceSerialize(res, + c_filename.c_str(), + index.index)) + + +@auto_sync_resources +def load(filename, resources=None): + """ + Loads index from file. + + The serialization format can be subject to changes, therefore loading + an index saved with a previous version of cuvs is not guaranteed + to work. + + + Parameters + ---------- + filename : string + Name of the file. 
+ {resources_docstring} + + Returns + ------- + index : Index + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import brute_force + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = brute_force.build(dataset) + >>> # Serialize and deserialize the brute_force index built + >>> brute_force.save("my_index.bin", index) + >>> index_loaded = brute_force.load("my_index.bin") + """ + cdef Index idx = Index() + cdef cuvsResources_t res = resources.get_c_obj() + cdef string c_filename = filename.encode('utf-8') + + check_cuvs(cuvsBruteForceDeserialize( + res, + c_filename.c_str(), + idx.index + )) + idx.trained = True + return idx diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index 95209dbeb..752aef741 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -32,7 +32,8 @@ from cuvs.common cimport cydlpack from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array + +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/common.py b/python/cuvs/cuvs/neighbors/common.py new file mode 100644 index 000000000..c14b9f8c9 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/common.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def _check_input_array(cai, exp_dt, exp_rows=None, exp_cols=None): + if cai.dtype not in exp_dt: + raise TypeError("dtype %s not supported" % cai.dtype) + + if not cai.c_contiguous: + raise ValueError("Row major input is expected") + + if exp_cols is not None and cai.shape[1] != exp_cols: + raise ValueError( + "Incorrect number of columns, expected {} got {}".format( + exp_cols, cai.shape[1] + ) + ) + + if exp_rows is not None and cai.shape[0] != exp_rows: + raise ValueError( + "Incorrect number of rows, expected {} , got {}".format( + exp_rows, cai.shape[0] + ) + ) diff --git a/python/cuvs/cuvs/neighbors/filters/filters.pyx b/python/cuvs/cuvs/neighbors/filters/filters.pyx index 3a81cb786..9bc2a905c 100644 --- a/python/cuvs/cuvs/neighbors/filters/filters.pyx +++ b/python/cuvs/cuvs/neighbors/filters/filters.pyx @@ -20,11 +20,11 @@ import numpy as np from libc.stdint cimport uintptr_t from cuvs.common cimport cydlpack +from cuvs.neighbors.common import _check_input_array from .filters cimport BITMAP, NO_FILTER, cuvsFilter from pylibraft.common.cai_wrapper import wrap_array -from pylibraft.neighbors.common import _check_input_array cdef class Prefilter: diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index 018fcfef9..bcfaf167e 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -21,6 +21,7 @@ from libcpp.string cimport string from cuvs.common.exceptions import check_cuvs from cuvs.common.resources import auto_sync_resources +from cuvs.neighbors.common import _check_input_array from cuvs.common cimport cydlpack @@ -36,7 +37,6 @@ import uuid from pylibraft.common import auto_convert_output from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array cdef class SearchParams: diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx index 25b9b2aee..7a169e1a0 100644 --- a/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_flat/ivf_flat.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx index 3add1df75..531302ee6 100644 --- a/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/cuvs/cuvs/neighbors/ivf_pq/ivf_pq.pyx @@ -31,9 +31,9 @@ from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES +from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, diff --git a/python/cuvs/cuvs/neighbors/refine.pyx b/python/cuvs/cuvs/neighbors/refine.pyx index 0eccc4108..b7aa35dca 100644 --- a/python/cuvs/cuvs/neighbors/refine.pyx +++ b/python/cuvs/cuvs/neighbors/refine.pyx @@ -31,13 +31,13 @@ from 
cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -from pylibraft.neighbors.common import _check_input_array from cuvs.distance import DISTANCE_TYPES from cuvs.common.c_api cimport cuvsResources_t from cuvs.common.exceptions import check_cuvs +from cuvs.neighbors.common import _check_input_array @auto_sync_resources diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py index 92b88f013..56e132c23 100644 --- a/python/cuvs/cuvs/test/test_cagra.py +++ b/python/cuvs/cuvs/test/test_cagra.py @@ -122,8 +122,9 @@ def run_cagra_build_search_test( @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("array_type", ["device", "host"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) +@pytest.mark.parametrize("metric", ["euclidean"]) def test_cagra_dataset_dtype_host_device( - dtype, array_type, inplace, build_algo + dtype, array_type, inplace, build_algo, metric ): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only sqeuclidean metric here. @@ -132,6 +133,7 @@ def test_cagra_dataset_dtype_host_device( inplace=inplace, array_type=array_type, build_algo=build_algo, + metric=metric, ) diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py index 0ae97266b..20a35401e 100644 --- a/python/cuvs/cuvs/test/test_hnsw.py +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -23,7 +23,7 @@ def run_hnsw_build_search_test( - n_rows=1000, + n_rows=10000, n_cols=10, n_queries=100, k=10, @@ -41,8 +41,6 @@ def run_hnsw_build_search_test( pytest.skip( "inner_product metric is not supported for int8/uint8 data" ) - if build_algo == "nn_descent": - pytest.skip("inner_product metric is not supported for nn_descent") build_params = cagra.IndexParams( metric=metric, @@ -83,7 +81,7 @@ def run_hnsw_build_search_test( @pytest.mark.parametrize("k", [10, 20]) @pytest.mark.parametrize("ef", [30, 40]) @pytest.mark.parametrize("num_threads", [2, 4]) -@pytest.mark.parametrize("metric", ["sqeuclidean"]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) def test_hnsw(dtype, k, ef, num_threads, metric, build_algo): # Note that inner_product tests use normalized input which we cannot diff --git a/python/cuvs/cuvs/test/test_serialization.py b/python/cuvs/cuvs/test/test_serialization.py index 4ffccf121..1f4a54e87 100644 --- a/python/cuvs/cuvs/test/test_serialization.py +++ b/python/cuvs/cuvs/test/test_serialization.py @@ -17,7 +17,7 @@ import pytest from pylibraft.common import device_ndarray -from cuvs.neighbors import cagra, ivf_flat, ivf_pq +from cuvs.neighbors import brute_force, cagra, ivf_flat, ivf_pq from cuvs.test.ann_utils import generate_data @@ -35,6 +35,10 @@ def test_save_load_ivf_pq(): run_save_load(ivf_pq, np.float32) +def test_save_load_brute_force(): + run_save_load(brute_force, np.float32) + + def run_save_load(ann_module, dtype): n_rows = 10000 n_cols = 50 @@ -43,8 +47,11 @@ def run_save_load(ann_module, dtype): dataset = generate_data((n_rows, n_cols), dtype) dataset_device = device_ndarray(dataset) - build_params = ann_module.IndexParams() - index = ann_module.build(build_params, dataset_device) + if ann_module == brute_force: + index = ann_module.build(dataset_device) + else: + build_params = 
ann_module.IndexParams() + index = ann_module.build(build_params, dataset_device) assert index.trained filename = "my_index.bin" @@ -54,20 +61,29 @@ def run_save_load(ann_module, dtype): queries = generate_data((n_queries, n_cols), dtype) queries_device = device_ndarray(queries) - search_params = ann_module.SearchParams() k = 10 - - distance_dev, neighbors_dev = ann_module.search( - search_params, index, queries_device, k - ) + if ann_module == brute_force: + distance_dev, neighbors_dev = ann_module.search( + index, queries_device, k + ) + else: + search_params = ann_module.SearchParams() + distance_dev, neighbors_dev = ann_module.search( + search_params, index, queries_device, k + ) neighbors = neighbors_dev.copy_to_host() dist = distance_dev.copy_to_host() del index - distance_dev, neighbors_dev = ann_module.search( - search_params, loaded_index, queries_device, k - ) + if ann_module == brute_force: + distance_dev, neighbors_dev = ann_module.search( + loaded_index, queries_device, k + ) + else: + distance_dev, neighbors_dev = ann_module.search( + search_params, loaded_index, queries_device, k + ) neighbors2 = neighbors_dev.copy_to_host() dist2 = distance_dev.copy_to_host() diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index bf62f5adf..92e4993c7 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -133,7 +133,14 @@ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" +[tool.pydistcheck] +select = [ + # NOTE: size threshold is managed via CLI args in CI scripts + "distro-too-large-compressed", +] + [tool.pytest.ini_options] filterwarnings = [ "error", + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning" ]
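
For reference, a minimal usage sketch of the brute-force save/load round trip exercised by the Python changes above, mirroring the docstring examples in brute_force.pyx (the file name, dataset shape, and k are illustrative values only):

    import cupy as cp
    from cuvs.neighbors import brute_force

    n_samples, n_features = 50000, 50
    dataset = cp.random.random_sample((n_samples, n_features), dtype=cp.float32)

    # Build a brute-force index and serialize it to disk.
    index = brute_force.build(dataset)
    brute_force.save("my_index.bin", index)

    # Deserialize the index and search it. Unlike the other ANN modules,
    # brute_force.search takes the index, queries, and k directly (no SearchParams).
    index_loaded = brute_force.load("my_index.bin")
    queries = cp.random.random_sample((100, n_features), dtype=cp.float32)
    distances, neighbors = brute_force.search(index_loaded, queries, 10)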