From 660a2caa64f864e38e0e7bd19df86556d25aa7db Mon Sep 17 00:00:00 2001 From: Tarang Jain <40517122+tarang-jain@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:29:55 -0800 Subject: [PATCH] Additional Distances for CAGRA C and Python API (#546) Add InnerProduct metric to CAGRA C and Python API + updates to CAGRA pytests. Closes https://github.com/rapidsai/cuvs/issues/545 Authors: - Tarang Jain (https://github.com/tarang-jain) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/cuvs/pull/546 --- cpp/include/cuvs/neighbors/cagra.h | 3 +++ cpp/src/neighbors/cagra_c.cpp | 6 ++++-- python/cuvs/cuvs/neighbors/cagra/cagra.pxd | 2 ++ python/cuvs/cuvs/neighbors/cagra/cagra.pyx | 20 ++++++++++++-------- python/cuvs/cuvs/test/test_cagra.py | 8 +++++--- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.h b/cpp/include/cuvs/neighbors/cagra.h index 14331ebbc..f7f58a19c 100644 --- a/cpp/include/cuvs/neighbors/cagra.h +++ b/cpp/include/cuvs/neighbors/cagra.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -87,6 +88,8 @@ typedef struct cuvsCagraCompressionParams* cuvsCagraCompressionParams_t; * */ struct cuvsCagraIndexParams { + /** Distance type. */ + cuvsDistanceType metric; /** Degree of input graph for pruning. */ size_t intermediate_graph_degree; /** Degree of output graph. */ diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp index 326a89665..02b7a566e 100644 --- a/cpp/src/neighbors/cagra_c.cpp +++ b/cpp/src/neighbors/cagra_c.cpp @@ -41,7 +41,8 @@ void* _build(cuvsResources_t res, cuvsCagraIndexParams params, DLManagedTensor* auto res_ptr = reinterpret_cast(res); auto index = new cuvs::neighbors::cagra::index(*res_ptr); - auto index_params = cuvs::neighbors::cagra::index_params(); + auto index_params = cuvs::neighbors::cagra::index_params(); + index_params.metric = static_cast((int)params.metric), index_params.intermediate_graph_degree = params.intermediate_graph_degree; index_params.graph_degree = params.graph_degree; @@ -252,7 +253,8 @@ extern "C" cuvsError_t cuvsCagraSearch(cuvsResources_t res, extern "C" cuvsError_t cuvsCagraIndexParamsCreate(cuvsCagraIndexParams_t* params) { return cuvs::core::translate_exceptions([=] { - *params = new cuvsCagraIndexParams{.intermediate_graph_degree = 128, + *params = new cuvsCagraIndexParams{.metric = L2Expanded, + .intermediate_graph_degree = 128, .graph_degree = 64, .build_algo = IVF_PQ, .nn_descent_niter = 20}; diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd index bba5a91a8..a0f811480 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd @@ -28,6 +28,7 @@ from libcpp cimport bool from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor +from cuvs.distance_type cimport cuvsDistanceType cdef extern from "cuvs/neighbors/cagra.h" nogil: @@ -47,6 +48,7 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: ctypedef cuvsCagraCompressionParams* cuvsCagraCompressionParams_t ctypedef struct cuvsCagraIndexParams: + cuvsDistanceType metric size_t intermediate_graph_degree size_t graph_degree cuvsCagraGraphBuildAlgo build_algo diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index 752aef741..fd55905cf 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -28,11 +28,13 @@ from libcpp cimport bool, cast from libcpp.string cimport string from cuvs.common cimport cydlpack +from cuvs.distance_type cimport cuvsDistanceType from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible +from cuvs.distance import DISTANCE_TYPES from cuvs.neighbors.common import _check_input_array from libc.stdint cimport ( @@ -131,9 +133,11 @@ cdef class IndexParams: Parameters ---------- metric : string denoting the metric type, default="sqeuclidean" - Valid values for metric: ["sqeuclidean"], where + Valid values for metric: ["sqeuclidean", "inner_product"], where - sqeuclidean is the euclidean distance without the square root operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2 + - inner_product distance is defined as + distance(a, b) = \\sum_i a_i * b_i. intermediate_graph_degree : int, default = 128 graph_degree : int, default = 64 @@ -151,6 +155,7 @@ cdef class IndexParams: """ cdef cuvsCagraIndexParams* params + cdef object _metric # hold on to a reference to the compression, to keep from being GC'ed cdef public object compression @@ -170,10 +175,8 @@ cdef class IndexParams: nn_descent_niter=20, compression=None): - # todo (dgd): enable once other metrics are present - # and exposed in cuVS C API - # self.params.metric = _get_metric(metric) - # self.params.metric_arg = 0 + self._metric = metric + self.params.metric = DISTANCE_TYPES[metric] self.params.intermediate_graph_degree = intermediate_graph_degree self.params.graph_degree = graph_degree if build_algo == "ivf_pq": @@ -186,9 +189,9 @@ cdef class IndexParams: self.params.compression = \ compression.get_handle() - # @property - # def metric(self): - # return self.params.metric + @property + def metric(self): + return self._metric @property def intermediate_graph_degree(self): @@ -247,6 +250,7 @@ def build(IndexParams index_params, dataset, resources=None): The following distance metrics are supported: - L2 + - InnerProduct Parameters ---------- diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py index 56e132c23..d3b03a5d0 100644 --- a/python/cuvs/cuvs/test/test_cagra.py +++ b/python/cuvs/cuvs/test/test_cagra.py @@ -29,7 +29,7 @@ def run_cagra_build_search_test( n_queries=100, k=10, dtype=np.float32, - metric="euclidean", + metric="sqeuclidean", intermediate_graph_degree=128, graph_degree=64, build_algo="ivf_pq", @@ -42,6 +42,8 @@ def run_cagra_build_search_test( ): dataset = generate_data((n_rows, n_cols), dtype) if metric == "inner_product": + if dtype in [np.int8, np.uint8]: + pytest.skip("skip normalization for int8/uint8 data") dataset = normalize(dataset, norm="l2", axis=1) dataset_device = device_ndarray(dataset) @@ -122,7 +124,7 @@ def run_cagra_build_search_test( @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("array_type", ["device", "host"]) @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"]) -@pytest.mark.parametrize("metric", ["euclidean"]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) def test_cagra_dataset_dtype_host_device( dtype, array_type, inplace, build_algo, metric ): @@ -145,7 +147,7 @@ def test_cagra_dataset_dtype_host_device( "graph_degree": 32, "add_data_on_build": True, "k": 1, - "metric": "euclidean", + "metric": "sqeuclidean", "build_algo": "ivf_pq", }, {