From 12480cf795196cf487dda25aa77599ac5683627f Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Sat, 9 Sep 2023 20:53:56 -0700 Subject: [PATCH 1/4] Cagra memory optimizations (#1790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When trying to build a CAGRA index with 500M embeddings, we were running out of memory - even when using managed memory. This PR contains some changes to reduce the memory usage: * For certain large matrices, don't make 2nd copies on the device or host if the memory is already accessible via UVM/ATS/HMM. For instance, we were taking a copy of the intermediate graph from host to device memory - and in certain cases (500M dataset, intermediate_graph_degree=128) the intermediate graph was 256GB alone. * Don’t create a separate ‘pruned_graph’ host matrix in the `optimize` call, and just use host memory passed in by the caller * Free the intermediate graph before creating the index Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1790 --- cpp/include/raft/neighbors/cagra.cuh | 10 +- .../neighbors/detail/cagra/graph_core.cuh | 77 ++++++--------- .../raft/neighbors/detail/cagra/utils.hpp | 95 +++++++++++++++++++ 3 files changed, 130 insertions(+), 52 deletions(-) diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh index 6bb7beca55..903d0571dc 100644 --- a/cpp/include/raft/neighbors/cagra.cuh +++ b/cpp/include/raft/neighbors/cagra.cuh @@ -256,13 +256,17 @@ index build(raft::resources const& res, graph_degree = intermediate_degree; } - auto knn_graph = raft::make_host_matrix(dataset.extent(0), intermediate_degree); + std::optional> knn_graph( + raft::make_host_matrix(dataset.extent(0), intermediate_degree)); - build_knn_graph(res, dataset, knn_graph.view()); + build_knn_graph(res, dataset, knn_graph->view()); auto cagra_graph = raft::make_host_matrix(dataset.extent(0), graph_degree); - optimize(res, knn_graph.view(), cagra_graph.view()); + optimize(res, knn_graph->view(), cagra_graph.view()); + + // free intermediate graph before trying to create the index + knn_graph.reset(); // Construct an index from dataset and optimized knn graph. 
return index(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view())); diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh index 0558d7ea39..18d451be60 100644 --- a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh @@ -334,18 +334,13 @@ void optimize(raft::resources const& res, auto output_graph_ptr = new_graph.data_handle(); const IdxT graph_size = new_graph.extent(0); - auto pruned_graph = raft::make_host_matrix(graph_size, output_graph_degree); - { // // Prune kNN graph // - auto d_input_graph = - raft::make_device_matrix(res, graph_size, input_graph_degree); - - auto detour_count = raft::make_host_matrix(graph_size, input_graph_degree); auto d_detour_count = raft::make_device_matrix(res, graph_size, input_graph_degree); + RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(), 0xff, graph_size * input_graph_degree * sizeof(uint8_t), @@ -376,24 +371,13 @@ void optimize(raft::resources const& res, const double time_prune_start = cur_time(); RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r"); - raft::copy(d_input_graph.data_handle(), - input_graph_ptr, - graph_size * input_graph_degree, - resource::get_cuda_stream(res)); - void (*kernel_prune)(const IdxT* const, - const uint32_t, - const uint32_t, - const uint32_t, - const uint32_t, - const uint32_t, - uint8_t* const, - uint32_t* const, - uint64_t* const); + // Copy input_graph_ptr over to device if necessary + device_matrix_view_from_host d_input_graph( + res, + raft::make_host_matrix_view(input_graph_ptr, graph_size, input_graph_degree)); constexpr int MAX_DEGREE = 1024; - if (input_graph_degree <= MAX_DEGREE) { - kernel_prune = kern_prune; - } else { + if (input_graph_degree > MAX_DEGREE) { RAFT_FAIL( "The degree of input knn graph is too large (%u). 
" "It must be equal to or smaller than %d.", @@ -410,16 +394,17 @@ void optimize(raft::resources const& res, dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, resource::get_cuda_stream(res))); for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) { - kernel_prune<<>>( - d_input_graph.data_handle(), - graph_size, - input_graph_degree, - output_graph_degree, - batch_size, - i_batch, - d_detour_count.data_handle(), - d_num_no_detour_edges.data_handle(), - dev_stats.data_handle()); + kern_prune + <<>>( + d_input_graph.data_handle(), + graph_size, + input_graph_degree, + output_graph_degree, + batch_size, + i_batch, + d_detour_count.data_handle(), + d_num_no_detour_edges.data_handle(), + dev_stats.data_handle()); resource::sync_stream(res); RAFT_LOG_DEBUG( "# Pruning kNN Graph on GPUs (%.1lf %%)\r", @@ -428,10 +413,7 @@ void optimize(raft::resources const& res, resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - raft::copy(detour_count.data_handle(), - d_detour_count.data_handle(), - graph_size * input_graph_degree, - resource::get_cuda_stream(res)); + host_matrix_view_from_device detour_count(res, d_detour_count.view()); raft::copy( host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res)); @@ -447,7 +429,7 @@ void optimize(raft::resources const& res, if (max_detour < num_detour) { max_detour = num_detour; /* stats */ } for (uint64_t k = 0; k < input_graph_degree; k++) { if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; } - pruned_graph.data_handle()[pk + (output_graph_degree * i)] = + output_graph_ptr[pk + (output_graph_degree * i)] = input_graph_ptr[k + (input_graph_degree * i)]; pk += 1; if (pk >= output_graph_degree) break; @@ -478,8 +460,7 @@ void optimize(raft::resources const& res, // const double time_make_start = cur_time(); - auto d_rev_graph = - raft::make_device_matrix(res, graph_size, output_graph_degree); + device_matrix_view_from_host d_rev_graph(res, rev_graph.view()); 
RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(), 0xff, graph_size * output_graph_degree * sizeof(IdxT), @@ -497,7 +478,7 @@ void optimize(raft::resources const& res, for (uint64_t k = 0; k < output_graph_degree; k++) { #pragma omp parallel for for (uint64_t i = 0; i < graph_size; i++) { - dest_nodes.data_handle()[i] = pruned_graph.data_handle()[k + (output_graph_degree * i)]; + dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)]; } resource::sync_stream(res); @@ -520,10 +501,12 @@ void optimize(raft::resources const& res, resource::sync_stream(res); RAFT_LOG_DEBUG("\n"); - raft::copy(rev_graph.data_handle(), - d_rev_graph.data_handle(), - graph_size * output_graph_degree, - resource::get_cuda_stream(res)); + if (d_rev_graph.allocated_memory()) { + raft::copy(rev_graph.data_handle(), + d_rev_graph.data_handle(), + graph_size * output_graph_degree, + resource::get_cuda_stream(res)); + } raft::copy(rev_graph_count.data_handle(), d_rev_graph_count.data_handle(), graph_size, @@ -542,10 +525,6 @@ void optimize(raft::resources const& res, const uint64_t num_protected_edges = output_graph_degree / 2; RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges); - memcpy(output_graph_ptr, - pruned_graph.data_handle(), - sizeof(IdxT) * graph_size * output_graph_degree); - constexpr int _omp_chunk = 1024; #pragma omp parallel for schedule(dynamic, _omp_chunk) for (uint64_t j = 0; j < graph_size; j++) { @@ -578,7 +557,7 @@ void optimize(raft::resources const& res, #pragma omp parallel for reduction(+ : num_replaced_edges) for (uint64_t i = 0; i < graph_size; i++) { for (uint64_t k = 0; k < output_graph_degree; k++) { - const uint64_t j = pruned_graph.data_handle()[k + (output_graph_degree * i)]; + const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)]; const uint64_t pos = pos_in_array(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree); if (pos == output_graph_degree) { num_replaced_edges += 1; } diff --git 
a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp index 22c7a60647..22cbe6bbac 100644 --- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp +++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include namespace raft::neighbors::cagra::detail { @@ -150,4 +152,97 @@ struct gen_index_msb_1_mask { }; } // namespace utils +/** + * Utility to sync memory from a host_matrix_view to a device_matrix_view + * + * In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the + * device, and no extra allocations need to be performed. This class checks + * if the host_matrix_view is already accessible on the device, and only creates device + * memory and copies over if necessary. In memory limited situations this is preferable + * to having both a host and device copy + * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer + * have been merged, we should remove this class and switch over to using mdbuffer for this + */ +template +class device_matrix_view_from_host { + public: + device_matrix_view_from_host(raft::resources const& res, host_matrix_view host_view) + : host_view_(host_view) + { + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle())); + device_ptr = reinterpret_cast(attr.devicePointer); + if (device_ptr == NULL) { + // allocate memory and copy over + device_mem_.emplace( + raft::make_device_matrix(res, host_view.extent(0), host_view.extent(1))); + raft::copy(device_mem_->data_handle(), + host_view.data_handle(), + host_view.extent(0) * host_view.extent(1), + resource::get_cuda_stream(res)); + device_ptr = device_mem_->data_handle(); + } + } + + device_matrix_view view() + { + return make_device_matrix_view(device_ptr, host_view_.extent(0), host_view_.extent(1)); + } + + T* data_handle() { return device_ptr; } + + bool allocated_memory() 
const { return device_mem_.has_value(); } + + private: + std::optional> device_mem_; + host_matrix_view host_view_; + T* device_ptr; +}; + +/** + * Utility to sync memory from a device_matrix_view to a host_matrix_view + * + * In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the + * host, and no extra allocations need to be performed. This class checks + * if the device_matrix_view is already accessible on the host, and only creates host + * memory and copies over if necessary. In memory limited situations this is preferable + * to having both a host and device copy + * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer + * have been merged, we should remove this class and switch over to using mdbuffer for this + */ +template +class host_matrix_view_from_device { + public: + host_matrix_view_from_device(raft::resources const& res, device_matrix_view device_view) + : device_view_(device_view) + { + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, device_view.data_handle())); + host_ptr = reinterpret_cast(attr.hostPointer); + if (host_ptr == NULL) { + // allocate memory and copy over + host_mem_.emplace( + raft::make_host_matrix(device_view.extent(0), device_view.extent(1))); + raft::copy(host_mem_->data_handle(), + device_view.data_handle(), + device_view.extent(0) * device_view.extent(1), + resource::get_cuda_stream(res)); + host_ptr = host_mem_->data_handle(); + } + } + + host_matrix_view view() + { + return make_host_matrix_view(host_ptr, device_view_.extent(0), device_view_.extent(1)); + } + + T* data_handle() { return host_ptr; } + + bool allocated_memory() const { return host_mem_.has_value(); } + + private: + std::optional> host_mem_; + device_matrix_view device_view_; + T* host_ptr; +}; } // namespace raft::neighbors::cagra::detail From c59c9d194a7d255d1be3dd7083a3098b4a2a1672 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 11 Sep 2023 14:56:28 -0400 Subject: [PATCH 2/4] Various fixes to reproducible benchmarks (#1800) Authors: - Corey J. Nolet (https://github.com/cjnolet) - Artem M. Chirkin (https://github.com/achirkin) - Divye Gala (https://github.com/divyegala) - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/1800 --- cpp/bench/ann/src/ggnn/ggnn_benchmark.cu | 3 +-- cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh | 22 ++++----------------- cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h | 6 ++++-- docs/source/ann_benchmarks_param_tuning.md | 2 +- 4 files changed, 10 insertions(+), 23 deletions(-) diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu index 99481c2921..3b2e97062f 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu +++ b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu @@ -33,8 +33,7 @@ template void parse_build_param(const nlohmann::json& conf, typename raft::bench::ann::Ggnn::BuildParam& param) { - param.dataset_size = conf.at("dataset_size"); - param.k = conf.at("k"); + param.k = conf.at("k"); if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); } if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); } diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index 74c7cddc3c..664ec511dd 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -38,8 +38,6 @@ class Ggnn : public ANN { int num_layers{4}; // L float tau{0.5}; int refine_iterations{2}; - - size_t dataset_size; int k; // GGNN requires to know k during building }; @@ -182,12 +180,6 @@ GgnnImpl::GgnnImpl(Metric metric, } if (dim != D) { throw std::runtime_error("mis-matched dim"); } - - int device; - RAFT_CUDA_TRY(cudaGetDevice(&device)); - - ggnn_ = std::make_unique( - device, build_param_.dataset_size, build_param_.num_layers, true, 
build_param_.tau); } template @@ -195,11 +187,10 @@ void GgnnImpl::build(const T* dataset, size_t nrow, cudaStream_t stream) { - if (nrow != build_param_.dataset_size) { - throw std::runtime_error( - "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + - " , but nrow = " + std::to_string(nrow)); - } + int device; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + ggnn_ = std::make_unique( + device, nrow, build_param_.num_layers, true, build_param_.tau); ggnn_->set_base_data(dataset); ggnn_->set_stream(stream); @@ -212,11 +203,6 @@ void GgnnImpl::build(const T* dataset, template void GgnnImpl::set_search_dataset(const T* dataset, size_t nrow) { - if (nrow != build_param_.dataset_size) { - throw std::runtime_error( - "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) + - " , but nrow = " + std::to_string(nrow)); - } ggnn_->set_base_data(dataset); } diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index 5cd33ef94d..4d7b993aa1 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -31,6 +31,8 @@ #include #include +#include + #include "../common/ann_types.hpp" #include @@ -164,13 +166,13 @@ class HnswLib : public ANN { struct BuildParam { int M; int ef_construction; - int num_threads{1}; + int num_threads = omp_get_num_procs(); }; using typename ANN::AnnSearchParam; struct SearchParam : public AnnSearchParam { int ef; - int num_threads{1}; + int num_threads = omp_get_num_procs(); }; HnswLib(Metric metric, int dim, const BuildParam& param); diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md index 712d22f0aa..dd6090c5e2 100644 --- a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -99,4 +99,4 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of | `ef` | `search_param` | Y | Positive Integer >0 | | Size 
of the dynamic list for the nearest neighbors used for search. Higher value leads to more accurate but slower search. Cannot be lower than `k`. | | `numThreads` | `search_params` | N | Positive Integer >0 | 1 | Number of threads to use for queries. | -Please refer to [HNSW algorithm parameters guide](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) from `hnswlib` to learn more about these arguments. +Please refer to [HNSW algorithm parameters guide](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) from `hnswlib` to learn more about these arguments. \ No newline at end of file From 6a5aa102e1491b1869a66a2f88cb518034429cea Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 12 Sep 2023 00:53:49 +0200 Subject: [PATCH 3/4] Add best deep-100M configs for IVF-PQ to ANN benchmarks (#1807) Authors: - Tamas Bela Feher (https://github.com/tfeher) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1807 --- .../raft-ann-bench/run/conf/deep-100M.json | 271 +++++++----------- 1 file changed, 99 insertions(+), 172 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json index 6bef94c070..bc77b522a8 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json @@ -248,184 +248,111 @@ {"max_batch":10000, "max_k":10, "nprobe":1000} ] }, - - { - "name": "raft_ivf_pq.dimpq128-cluster1024", - "algo": "raft_ivf_pq", - "build_param": {"nlist": 1024, "pq_dim": 128, "ratio": 1, "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024", - "search_params": [ - {"nprobe": 10, "internalDistanceDtype": "half", "smemLutDtype": "half"}, - {"nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "half"}, - {"nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "half"}, - {"nprobe": 200, 
"internalDistanceDtype": "half", "smemLutDtype": "half"}, - {"nprobe": 500, "internalDistanceDtype": "half", "smemLutDtype": "half"}, - {"nprobe": 1024, "internalDistanceDtype": "half", "smemLutDtype": "half"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024" - }, - { - "name": "raft_ivf_pq.dimpq128-cluster1024-float-float", - "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 128, - "ratio": 1, - "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-float", - "search_params": [ - {"nprobe": 1, "internalDistanceDtype": "float", "smemLutDtype": "float"}, - {"nprobe": 5, "internalDistanceDtype": "float", "smemLutDtype": "float"}, - {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "float"}, - {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "float"}, - {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "float"}, - {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "float"}, - {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "float"}, - {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "float"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-float" - }, - { - "name": "raft_ivf_pq.dimpq128-cluster1024-float-half", - "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 128, - "ratio": 1, - "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-half", - "search_params": [ - {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 1024, 
"internalDistanceDtype": "float", "smemLutDtype": "half"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-half" - }, - { - "name": "raft_ivf_pq.dimpq128-cluster1024-float-fp8", - "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 128, - "ratio": 1, - "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-fp8", - "search_params": [ - {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-float-fp8" - }, - { - "name": "raft_ivf_pq.dimpq64-cluster1024-float-fp8", - "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 64, - "ratio": 1, - "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-fp8", - "search_params": [ - {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-fp8" - }, - { - "name": "raft_ivf_pq.dimpq64-cluster1024-float-half", - "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 64, - "ratio": 1, - 
"niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-half", - "search_params": [ - {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "half"}, - {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "half"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq64-cluster1024-float-half" - }, - { - "name": "raft_ivf_pq.dimpq32-cluster1024-float-fp8", - "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 32, - "ratio": 1, - "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq32-cluster1024-float-fp8", - "search_params": [ - {"nprobe": 10, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq32-cluster1024-float-fp8" - }, - { - "name": "raft_ivf_pq.dimpq16-cluster1024-float-fp8", +{ + "name": "raft_ivf_pq.d96b5n50K", "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 16, - "ratio": 1, - "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq16-cluster1024-float-fp8", + "build_param": {"nlist": 50000, "pq_dim": 96, "pq_bits": 5, "ratio": 10, "niter": 25}, + "file": "deep-100M/raft_ivf_pq/d96b5n50K", "search_params": [ - {"nprobe": 10, 
"internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 500, "internalDistanceDtype": "float", "smemLutDtype": "fp8"}, - {"nprobe": 1024, "internalDistanceDtype": "float", "smemLutDtype": "fp8"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq16-cluster1024-float-fp8" + { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 2 }, + { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half", 
"refine_ratio": 2 }, + { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 5000, 
"internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 2 }, + { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 }, + { "nprobe": 5000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 2 } + ] }, { - "name": "raft_ivf_pq.dimpq128-cluster1024-half-float", + "name": "raft_ivf_pq.d64b5n50K", "algo": "raft_ivf_pq", - "build_param": { - "nlist": 1024, - "pq_dim": 128, - "ratio": 1, - "niter": 25 - }, - "file": "index/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-half-float", + "build_param": {"nlist": 50000, "pq_dim": 64, "pq_bits": 5, "ratio": 10, "niter": 25}, + "file": "deep-100M/raft_ivf_pq/d64b5n50K", "search_params": [ - {"nprobe": 10, "internalDistanceDtype": "half", "smemLutDtype": "float"}, - {"nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "float"}, - {"nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "float"}, - {"nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "float"}, - {"nprobe": 500, "internalDistanceDtype": "half", "smemLutDtype": "float"}, - {"nprobe": 1024, "internalDistanceDtype": "half", "smemLutDtype": "float"} - ], - "search_result_file": "result/deep-image-96-angular/raft_ivf_pq/dimpq128-cluster1024-half-float" + { "nprobe": 20, "internalDistanceDtype": 
"float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "float", "refine_ratio": 4 }, + { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 20, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 30, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 40, "internalDistanceDtype": "float", "smemLutDtype": "fp8", 
"refine_ratio": 4 }, + { "nprobe": 50, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 100, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 200, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 1000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 2000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 5000, "internalDistanceDtype": "float", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 200, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 5000, "internalDistanceDtype": "half", "smemLutDtype": "half", "refine_ratio": 4 }, + { "nprobe": 20, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 30, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 40, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 50, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 100, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 200, "internalDistanceDtype": 
"half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 1000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 2000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 }, + { "nprobe": 5000, "internalDistanceDtype": "half", "smemLutDtype": "fp8", "refine_ratio": 4 } + ] }, { "name": "raft_ivf_pq.dimpq512-cluster1024-float-float", From c38b19f6c846db57babddb39b32b0482c55bda53 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 11 Sep 2023 19:06:30 -0400 Subject: [PATCH 4/4] Adding rmm to `raft-ann-bench` dependencies (#1815) Authors: - Corey J. Nolet (https://github.com/cjnolet) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/raft/pull/1815 --- conda/recipes/raft-ann-bench/meta.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml index b817968379..91d0fdb729 100644 --- a/conda/recipes/raft-ann-bench/meta.yaml +++ b/conda/recipes/raft-ann-bench/meta.yaml @@ -78,6 +78,8 @@ requirements: - h5py {{ h5py_version }} - benchmark - matplotlib + # rmm is needed to determine if package is gpu-enabled + - rmm ={{ minor_version }} - python - pandas - pyyaml