diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh index 5910a5c59..f2b235765 100755 --- a/ci/validate_wheel.sh +++ b/ci/validate_wheel.sh @@ -6,12 +6,26 @@ set -euo pipefail package_dir=$1 wheel_dir_relative_path=$2 +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" + +# some packages are much larger on CUDA 11 than on CUDA 12 +if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '1.4G' + ) +else + PYDISTCHECK_ARGS=( + --max-allowed-size-compressed '950M' + ) +fi + cd "${package_dir}" rapids-logger "validate packages with 'pydistcheck'" pydistcheck \ --inspect \ + "${PYDISTCHECK_ARGS[@]}" \ "$(echo ${wheel_dir_relative_path}/*.whl)" rapids-logger "validate packages with 'twine'" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c493af488..81b82aa7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -436,6 +436,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/nn_descent.cu src/neighbors/nn_descent_float.cu src/neighbors/nn_descent_half.cu + src/neighbors/nn_descent_index.cpp src/neighbors/nn_descent_int8.cu src/neighbors/nn_descent_uint8.cu src/neighbors/reachability.cu diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index bd41d1ff7..9cd8192b5 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -61,11 +61,10 @@ struct index_params : cuvs::neighbors::index_params { /** @brief Construct NN descent parameters for a specific kNN graph degree * * @param graph_degree output graph degree + * @param metric distance metric to use */ - index_params(size_t graph_degree = 64) - : graph_degree(graph_degree), intermediate_graph_degree(1.5 * graph_degree) - { - } + index_params(size_t graph_degree = 64, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded); }; /** @@ -103,11 +102,16 @@ struct index : cuvs::neighbors::index { * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph * @param return_distances whether to return distances + * @param metric distance metric to use */ - index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false) + index(raft::resources const& res, + int64_t n_rows, + int64_t n_cols, + bool return_distances = false, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(n_rows, n_cols)}, graph_view_{graph_.view()}, return_distances_{return_distances} @@ -129,14 +133,16 @@ struct index : cuvs::neighbors::index { * @param graph_view raft::host_matrix_view for storing knn-graph * @param distances_view optional raft::device_matrix_view for storing * distances + * @param metric distance metric to use */ index(raft::resources const& res, raft::host_matrix_view graph_view, std::optional> distances_view = - std::nullopt) + std::nullopt, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) : cuvs::neighbors::index(), res_{res}, - metric_{cuvs::distance::DistanceType::L2Expanded}, + metric_{metric}, graph_{raft::make_host_matrix(0, 0)}, graph_view_{graph_view}, distances_view_{distances_view}, @@ -473,8 +479,6 @@ auto build(raft::resources const& res, std::optional> graph = std::nullopt) -> cuvs::neighbors::nn_descent::index; -/** @} */ - /** * @brief Test if we have enough GPU memory to run NN descent algorithm. * diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 6209ff819..b7fec724b 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -436,11 +436,11 @@ index build( auto knn_build_params = params.graph_build_params; if (std::holds_alternative(params.graph_build_params)) { // Heuristic to decide default build algo and its params. - if (params.metric == cuvs::distance::DistanceType::L2Expanded && - cuvs::neighbors::nn_descent::has_enough_device_memory( + if (cuvs::neighbors::nn_descent::has_enough_device_memory( res, dataset.extents(), sizeof(IdxT))) { RAFT_LOG_DEBUG("NN descent solver"); - knn_build_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + knn_build_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } else { RAFT_LOG_DEBUG("Selecting IVF-PQ solver"); knn_build_params = cagra::graph_build_params::ivf_pq_params(dataset.extents(), params.metric); @@ -453,9 +453,6 @@ index build( std::get(knn_build_params); build_knn_graph(res, dataset, knn_graph->view(), ivf_pq_params); } else { - RAFT_EXPECTS( - params.metric == cuvs::distance::DistanceType::L2Expanded, - "L2Expanded is the only distance metrics supported for CAGRA build with nn_descent"); auto nn_descent_params = std::get(knn_build_params); @@ -466,7 +463,8 @@ index build( "nn-descent graph_degree.", nn_descent_params.graph_degree, intermediate_degree); - nn_descent_params = cagra::graph_build_params::nn_descent_params(intermediate_degree); + nn_descent_params = + cagra::graph_build_params::nn_descent_params(intermediate_degree, params.metric); } // Use nn-descent to build CAGRA knn graph diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index 883d82d76..c62a52540 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -19,6 +19,7 @@ #include "ann_utils.cuh" #include "cagra/device_common.hpp" +#include #include #include @@ -216,6 +217,7 @@ struct BuildConfig { size_t max_iterations{50}; float termination_threshold{0.0001}; size_t output_graph_degree{32}; + cuvs::distance::DistanceType metric{cuvs::distance::DistanceType::L2Expanded}; }; template @@ -454,11 +456,13 @@ __device__ __forceinline__ void load_vec(Data_t* vec_buffer, // TODO: Replace with RAFT utilities https://github.com/rapidsai/raft/issues/1827 /** Calculate L2 norm, and cast data to __half */ template -RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, - __half* output_data, - int dim, - DistData_t* l2_norms, - size_t list_offset = 0) +RAFT_KERNEL preprocess_data_kernel( + const Data_t* input_data, + __half* output_data, + int dim, + DistData_t* l2_norms, + size_t list_offset = 0, + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded) { extern __shared__ char buffer[]; __shared__ float l2_norm; @@ -468,26 +472,32 @@ RAFT_KERNEL preprocess_data_kernel(const Data_t* input_data, load_vec(s_vec, input_data + blockIdx.x * dim, dim, dim, threadIdx.x % raft::warp_size()); if (threadIdx.x == 0) { l2_norm = 0; } __syncthreads(); - int lane_id = threadIdx.x % raft::warp_size(); - for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { - int idx = step * raft::warp_size() + lane_id; - float part_dist = 0; - if (idx < dim) { - part_dist = s_vec[idx]; - part_dist = part_dist * part_dist; - } - __syncwarp(); - for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { - part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + + if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::CosineExpanded) { + int lane_id = threadIdx.x % raft::warp_size(); + for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { + int idx = step * raft::warp_size() + lane_id; + float part_dist = 0; + if (idx < dim) { + part_dist = s_vec[idx]; + part_dist = part_dist * part_dist; + } + __syncwarp(); + for (int offset = raft::warp_size() >> 1; offset >= 1; offset >>= 1) { + part_dist += __shfl_down_sync(raft::warp_full_mask(), part_dist, offset); + } + if (lane_id == 0) { l2_norm += part_dist; } + __syncwarp(); } - if (lane_id == 0) { l2_norm += part_dist; } - __syncwarp(); } for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { int idx = step * raft::warp_size() + threadIdx.x; if (idx < dim) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { + output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { output_data[list_id * dim + idx] = (float)input_data[(size_t)blockIdx.x * dim + idx] / sqrt(l2_norm); } else { @@ -715,7 +725,8 @@ __launch_bounds__(BLOCK_SIZE, 4) DistData_t* dists, int graph_width, int* locks, - DistData_t* l2_norms) + DistData_t* l2_norms, + cuvs::distance::DistanceType metric) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -827,8 +838,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_new_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[new_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -906,8 +919,10 @@ __launch_bounds__(BLOCK_SIZE, 4) for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { if (i % SKEWED_MAX_NUM_BI_SAMPLES < list_old_size && i / SKEWED_MAX_NUM_BI_SAMPLES < list_new_size) { - if (l2_norms == nullptr) { + if (metric == cuvs::distance::DistanceType::InnerProduct) { s_distances[i] = -s_distances[i]; + } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { + s_distances[i] = 1.0 - s_distances[i]; } else { s_distances[i] = l2_norms[old_neighbors[i % SKEWED_MAX_NUM_BI_SAMPLES]] + l2_norms[new_neighbors[i / SKEWED_MAX_NUM_BI_SAMPLES]] - @@ -1161,7 +1176,7 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build ndim_(build_config.dataset_dim), d_data_{raft::make_device_matrix<__half, size_t, raft::row_major>( res, nrow_, build_config.dataset_dim)}, - l2_norms_{raft::make_device_vector(res, nrow_)}, + l2_norms_{raft::make_device_vector(res, 0)}, graph_buffer_{ raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ @@ -1181,11 +1196,16 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); + raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); auto graph_buffer_view = raft::make_device_matrix_view( reinterpret_cast(graph_buffer_.data_handle()), nrow_, DEGREE_ON_DEVICE); raft::matrix::fill(res, graph_buffer_view, std::numeric_limits::max()); raft::matrix::fill(res, d_locks_.view(), 0); + + if (build_config.metric == cuvs::distance::DistanceType::L2Expanded) { + l2_norms_ = raft::make_device_vector(res, nrow_); + } }; template @@ -1228,7 +1248,8 @@ void GNND::local_join(cudaStream_t stream) dists_buffer_.data_handle(), DEGREE_ON_DEVICE, d_locks_.data_handle(), - l2_norms_.data_handle()); + l2_norms_.data_handle(), + build_config_.metric); } template @@ -1261,7 +1282,8 @@ void GNND::build(Data_t* data, d_data_.data_handle(), build_config_.dataset_dim, l2_norms_.data_handle(), - batch.offset()); + batch.offset(), + build_config_.metric); } graph_.clear(); @@ -1417,6 +1439,11 @@ void build(raft::resources const& res, RAFT_EXPECTS(dataset.extent(0) < std::numeric_limits::max() - 1, "The dataset size for GNND should be less than %d", std::numeric_limits::max() - 1); + auto allowed_metrics = params.metric == cuvs::distance::DistanceType::L2Expanded || + params.metric == cuvs::distance::DistanceType::CosineExpanded || + params.metric == cuvs::distance::DistanceType::InnerProduct; + RAFT_EXPECTS(allowed_metrics && idx.metric() == params.metric, + "The metric for NN Descent should be L2Expanded, CosineExpanded or InnerProduct"); size_t intermediate_degree = params.intermediate_graph_degree; size_t graph_degree = params.graph_degree; @@ -1452,7 +1479,8 @@ void build(raft::resources const& res, .internal_node_degree = extended_intermediate_degree, .max_iterations = params.max_iterations, .termination_threshold = params.termination_threshold, - .output_graph_degree = params.graph_degree}; + .output_graph_degree = params.graph_degree, + .metric = params.metric}; GNND nnd(res, build_config); @@ -1500,8 +1528,11 @@ index build( graph_degree = intermediate_degree; } - index idx{ - res, dataset.extent(0), static_cast(graph_degree), params.return_distances}; + index idx{res, + dataset.extent(0), + static_cast(graph_degree), + params.return_distances, + params.metric}; build(res, params, dataset, idx); diff --git a/cpp/src/neighbors/nn_descent_index.cpp b/cpp/src/neighbors/nn_descent_index.cpp new file mode 100644 index 000000000..25d5b6af8 --- /dev/null +++ b/cpp/src/neighbors/nn_descent_index.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace cuvs::neighbors::nn_descent { + +index_params::index_params(size_t graph_degree, cuvs::distance::DistanceType metric) +{ + this->graph_degree = graph_degree; + this->intermediate_graph_degree = 1.5 * graph_degree; + this->metric = metric; +} +} // namespace cuvs::neighbors::nn_descent \ No newline at end of file diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 37d42dd1d..660246c67 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -361,8 +361,8 @@ class AnnCagraTest : public ::testing::TestWithParam { // not used for knn_graph building. switch (ps.build_algo) { case graph_build_algo::IVF_PQ: - index_params.graph_build_params = - graph_build_params::ivf_pq_params(raft::matrix_extent(ps.n_rows, ps.dim)); + index_params.graph_build_params = graph_build_params::ivf_pq_params( + raft::matrix_extent(ps.n_rows, ps.dim), index_params.metric); if (ps.ivf_pq_search_refine_ratio) { std::get( index_params.graph_build_params) @@ -370,8 +370,8 @@ class AnnCagraTest : public ::testing::TestWithParam { } break; case graph_build_algo::NN_DESCENT: { - index_params.graph_build_params = - graph_build_params::nn_descent_params(index_params.intermediate_graph_degree); + index_params.graph_build_params = graph_build_params::nn_descent_params( + index_params.intermediate_graph_degree, index_params.metric); break; } case graph_build_algo::AUTO: @@ -389,7 +389,7 @@ class AnnCagraTest : public ::testing::TestWithParam { (const DataT*)database.data(), ps.n_rows, ps.dim); { - cagra::index index(handle_); + cagra::index index(handle_, index_params.metric); if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 7d2575c2b..09861a219 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -27,6 +27,7 @@ #include #include "naive_knn.cuh" +#include #include @@ -107,7 +108,6 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); raft::resource::sync_stream(handle_); } - { { nn_descent::index_params index_params; @@ -124,6 +124,7 @@ class AnnNNDescentTest : public ::testing::TestWithParam { if (ps.host_dataset) { auto database_host = raft::make_host_matrix(ps.n_rows, ps.dim); raft::copy(database_host.data_handle(), database.data(), database.size(), stream_); + raft::resource::sync_stream(handle_); auto database_host_view = raft::make_host_matrix_view( (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim); auto index = nn_descent::build(handle_, index_params, database_host_view); @@ -151,6 +152,13 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::resource::sync_stream(handle_); } + if (ps.metric == cuvs::distance::DistanceType::InnerProduct) { + std::transform( + distances_naive.begin(), distances_naive.end(), distances_naive.begin(), [](auto x) { + return -x; + }); + } + double min_recall = ps.min_recall; EXPECT_TRUE(eval_neighbours(indices_naive, indices_NNDescent, @@ -169,9 +177,11 @@ class AnnNNDescentTest : public ::testing::TestWithParam { raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); - } else { + } else if constexpr (std::is_same{}) { raft::random::uniformInt( - handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(-5), DataT(5)); + } else { + raft::random::uniformInt(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0), DataT(5)); } raft::resource::sync_stream(handle_); } @@ -308,13 +318,15 @@ class AnnNNDescentBatchTest : public ::testing::TestWithParam database; }; -const std::vector inputs = raft::util::itertools::product( - {1000, 2000}, // n_rows - {3, 5, 7, 8, 17, 64, 128, 137, 192, 256, 512, 619, 1024}, // dim - {32, 64}, // graph_degree - {cuvs::distance::DistanceType::L2Expanded}, - {false, true}, - {0.90}); +const std::vector inputs = + raft::util::itertools::product({2000, 4000}, // n_rows + {4, 16, 64, 256, 1024}, // dim + {32, 64}, // graph_degree + {cuvs::distance::DistanceType::L2Expanded, + cuvs::distance::DistanceType::InnerProduct, + cuvs::distance::DistanceType::CosineExpanded}, + {false, true}, + {0.90}); // TODO : Investigate why this test is failing Reference issue https // : // github.com/rapidsai/raft/issues/2450 diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py index 92b88f013..56e132c23 100644 --- a/python/cuvs/cuvs/test/test_cagra.py +++ b/python/cuvs/cuvs/test/test_cagra.py @@ -122,8 +122,9 @@ def run_cagra_build_search_test( @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("array_type", ["device", "host"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) +@pytest.mark.parametrize("metric", ["euclidean"]) def test_cagra_dataset_dtype_host_device( - dtype, array_type, inplace, build_algo + dtype, array_type, inplace, build_algo, metric ): # Note that inner_product tests use normalized input which we cannot # represent in int8, therefore we test only sqeuclidean metric here. @@ -132,6 +133,7 @@ def test_cagra_dataset_dtype_host_device( inplace=inplace, array_type=array_type, build_algo=build_algo, + metric=metric, ) diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py index 8bd2e8b76..20a35401e 100644 --- a/python/cuvs/cuvs/test/test_hnsw.py +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -41,8 +41,6 @@ def run_hnsw_build_search_test( pytest.skip( "inner_product metric is not supported for int8/uint8 data" ) - if build_algo == "nn_descent": - pytest.skip("inner_product metric is not supported for nn_descent") build_params = cagra.IndexParams( metric=metric, @@ -83,7 +81,7 @@ def run_hnsw_build_search_test( @pytest.mark.parametrize("k", [10, 20]) @pytest.mark.parametrize("ef", [30, 40]) @pytest.mark.parametrize("num_threads", [2, 4]) -@pytest.mark.parametrize("metric", ["sqeuclidean"]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) def test_hnsw(dtype, k, ef, num_threads, metric, build_algo): # Note that inner_product tests use normalized input which we cannot diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index d40026776..92e4993c7 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -135,12 +135,10 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" [tool.pydistcheck] select = [ + # NOTE: size threshold is managed via CLI args in CI scripts "distro-too-large-compressed", ] -# detect when package size grows significantly -max_allowed_size_compressed = '1.4G' - [tool.pytest.ini_options] filterwarnings = [ "error",