Merge branch 'branch-23.10' into brute_force_index

rapidsai · Sep 11, 2023 · 79e9a83 · 79e9a83
2 parents 6a64682 + c38b19f
commit 79e9a83
Show file tree

Hide file tree

Showing 9 changed files with 241 additions and 247 deletions.
diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml
@@ -78,6 +78,8 @@ requirements:
     - h5py {{ h5py_version }}
     - benchmark
     - matplotlib
+    # rmm is needed to determine if package is gpu-enabled
+    - rmm ={{ minor_version }}
     - python
     - pandas
     - pyyaml

diff --git a/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu b/cpp/bench/ann/src/ggnn/ggnn_benchmark.cu
@@ -33,8 +33,7 @@ template <typename T>
 void parse_build_param(const nlohmann::json& conf,
                        typename raft::bench::ann::Ggnn<T>::BuildParam& param)
 {
-  param.dataset_size = conf.at("dataset_size");
-  param.k            = conf.at("k");
+  param.k = conf.at("k");
 
   if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); }
   if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); }

diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
@@ -38,8 +38,6 @@ class Ggnn : public ANN<T> {
     int num_layers{4};     // L
     float tau{0.5};
     int refine_iterations{2};
-
-    size_t dataset_size;
     int k;  // GGNN requires to know k during building
   };
 
@@ -182,24 +180,17 @@ GgnnImpl<T, measure, D, KBuild, KQuery, S>::GgnnImpl(Metric metric,
   }
 
   if (dim != D) { throw std::runtime_error("mis-matched dim"); }
-
-  int device;
-  RAFT_CUDA_TRY(cudaGetDevice(&device));
-
-  ggnn_ = std::make_unique<GGNNGPUInstance>(
-    device, build_param_.dataset_size, build_param_.num_layers, true, build_param_.tau);
 }
 
 template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
 void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
                                                        size_t nrow,
                                                        cudaStream_t stream)
 {
-  if (nrow != build_param_.dataset_size) {
-    throw std::runtime_error(
-      "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
-      " , but nrow = " + std::to_string(nrow));
-  }
+  int device;
+  RAFT_CUDA_TRY(cudaGetDevice(&device));
+  ggnn_ = std::make_unique<GGNNGPUInstance>(
+    device, nrow, build_param_.num_layers, true, build_param_.tau);
 
   ggnn_->set_base_data(dataset);
   ggnn_->set_stream(stream);
@@ -212,11 +203,6 @@ void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
 template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
 void GgnnImpl<T, measure, D, KBuild, KQuery, S>::set_search_dataset(const T* dataset, size_t nrow)
 {
-  if (nrow != build_param_.dataset_size) {
-    throw std::runtime_error(
-      "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
-      " , but nrow = " + std::to_string(nrow));
-  }
   ggnn_->set_base_data(dataset);
 }
 

diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -31,6 +31,8 @@
 #include <utility>
 #include <vector>
 
+#include <omp.h>
+
 #include "../common/ann_types.hpp"
 #include <hnswlib.h>
 
@@ -164,13 +166,13 @@ class HnswLib : public ANN<T> {
   struct BuildParam {
     int M;
     int ef_construction;
-    int num_threads{1};
+    int num_threads = omp_get_num_procs();
   };
 
   using typename ANN<T>::AnnSearchParam;
   struct SearchParam : public AnnSearchParam {
     int ef;
-    int num_threads{1};
+    int num_threads = omp_get_num_procs();
   };
 
   HnswLib(Metric metric, int dim, const BuildParam& param);

diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
@@ -256,13 +256,17 @@ index<T, IdxT> build(raft::resources const& res,
     graph_degree = intermediate_degree;
   }
 
-  auto knn_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree);
+  std::optional<raft::host_matrix<IdxT, int64_t>> knn_graph(
+    raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), intermediate_degree));
 
-  build_knn_graph(res, dataset, knn_graph.view());
+  build_knn_graph(res, dataset, knn_graph->view());
 
   auto cagra_graph = raft::make_host_matrix<IdxT, int64_t>(dataset.extent(0), graph_degree);
 
-  optimize<IdxT>(res, knn_graph.view(), cagra_graph.view());
+  optimize<IdxT>(res, knn_graph->view(), cagra_graph.view());
+
+  // free intermediate graph before trying to create the index
+  knn_graph.reset();
 
   // Construct an index from dataset and optimized knn graph.
   return index<T, IdxT>(res, params.metric, dataset, raft::make_const_mdspan(cagra_graph.view()));

diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -334,18 +334,13 @@ void optimize(raft::resources const& res,
   auto output_graph_ptr              = new_graph.data_handle();
   const IdxT graph_size              = new_graph.extent(0);
 
-  auto pruned_graph = raft::make_host_matrix<IdxT, int64_t>(graph_size, output_graph_degree);
-
   {
     //
     // Prune kNN graph
     //
-    auto d_input_graph =
-      raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
-
-    auto detour_count = raft::make_host_matrix<uint8_t, int64_t>(graph_size, input_graph_degree);
     auto d_detour_count =
       raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
+
     RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
                                   0xff,
                                   graph_size * input_graph_degree * sizeof(uint8_t),
@@ -376,24 +371,13 @@ void optimize(raft::resources const& res,
     const double time_prune_start = cur_time();
     RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
 
-    raft::copy(d_input_graph.data_handle(),
-               input_graph_ptr,
-               graph_size * input_graph_degree,
-               resource::get_cuda_stream(res));
-    void (*kernel_prune)(const IdxT* const,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         const uint32_t,
-                         uint8_t* const,
-                         uint32_t* const,
-                         uint64_t* const);
+    // Copy input_graph_ptr over to device if necessary
+    device_matrix_view_from_host d_input_graph(
+      res,
+      raft::make_host_matrix_view<IdxT, int64_t>(input_graph_ptr, graph_size, input_graph_degree));
 
     constexpr int MAX_DEGREE = 1024;
-    if (input_graph_degree <= MAX_DEGREE) {
-      kernel_prune = kern_prune<MAX_DEGREE, IdxT>;
-    } else {
+    if (input_graph_degree > MAX_DEGREE) {
       RAFT_FAIL(
         "The degree of input knn graph is too large (%u). "
         "It must be equal to or smaller than %d.",
@@ -410,16 +394,17 @@ void optimize(raft::resources const& res,
       dev_stats.data_handle(), 0, sizeof(uint64_t) * 2, resource::get_cuda_stream(res)));
 
     for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
-      kernel_prune<<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
-        d_input_graph.data_handle(),
-        graph_size,
-        input_graph_degree,
-        output_graph_degree,
-        batch_size,
-        i_batch,
-        d_detour_count.data_handle(),
-        d_num_no_detour_edges.data_handle(),
-        dev_stats.data_handle());
+      kern_prune<MAX_DEGREE, IdxT>
+        <<<blocks_prune, threads_prune, 0, resource::get_cuda_stream(res)>>>(
+          d_input_graph.data_handle(),
+          graph_size,
+          input_graph_degree,
+          output_graph_degree,
+          batch_size,
+          i_batch,
+          d_detour_count.data_handle(),
+          d_num_no_detour_edges.data_handle(),
+          dev_stats.data_handle());
       resource::sync_stream(res);
       RAFT_LOG_DEBUG(
         "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
@@ -428,10 +413,7 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    raft::copy(detour_count.data_handle(),
-               d_detour_count.data_handle(),
-               graph_size * input_graph_degree,
-               resource::get_cuda_stream(res));
+    host_matrix_view_from_device<uint8_t, int64_t> detour_count(res, d_detour_count.view());
 
     raft::copy(
       host_stats.data_handle(), dev_stats.data_handle(), 2, resource::get_cuda_stream(res));
@@ -447,7 +429,7 @@ void optimize(raft::resources const& res,
         if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
         for (uint64_t k = 0; k < input_graph_degree; k++) {
           if (detour_count.data_handle()[k + (input_graph_degree * i)] != num_detour) { continue; }
-          pruned_graph.data_handle()[pk + (output_graph_degree * i)] =
+          output_graph_ptr[pk + (output_graph_degree * i)] =
             input_graph_ptr[k + (input_graph_degree * i)];
           pk += 1;
           if (pk >= output_graph_degree) break;
@@ -478,8 +460,7 @@ void optimize(raft::resources const& res,
     //
     const double time_make_start = cur_time();
 
-    auto d_rev_graph =
-      raft::make_device_matrix<IdxT, int64_t>(res, graph_size, output_graph_degree);
+    device_matrix_view_from_host<IdxT, int64_t> d_rev_graph(res, rev_graph.view());
     RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph.data_handle(),
                                   0xff,
                                   graph_size * output_graph_degree * sizeof(IdxT),
@@ -497,7 +478,7 @@ void optimize(raft::resources const& res,
     for (uint64_t k = 0; k < output_graph_degree; k++) {
 #pragma omp parallel for
       for (uint64_t i = 0; i < graph_size; i++) {
-        dest_nodes.data_handle()[i] = pruned_graph.data_handle()[k + (output_graph_degree * i)];
+        dest_nodes.data_handle()[i] = output_graph_ptr[k + (output_graph_degree * i)];
       }
       resource::sync_stream(res);
 
@@ -520,10 +501,12 @@ void optimize(raft::resources const& res,
     resource::sync_stream(res);
     RAFT_LOG_DEBUG("\n");
 
-    raft::copy(rev_graph.data_handle(),
-               d_rev_graph.data_handle(),
-               graph_size * output_graph_degree,
-               resource::get_cuda_stream(res));
+    if (d_rev_graph.allocated_memory()) {
+      raft::copy(rev_graph.data_handle(),
+                 d_rev_graph.data_handle(),
+                 graph_size * output_graph_degree,
+                 resource::get_cuda_stream(res));
+    }
     raft::copy(rev_graph_count.data_handle(),
                d_rev_graph_count.data_handle(),
                graph_size,
@@ -542,10 +525,6 @@ void optimize(raft::resources const& res,
     const uint64_t num_protected_edges = output_graph_degree / 2;
     RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges);
 
-    memcpy(output_graph_ptr,
-           pruned_graph.data_handle(),
-           sizeof(IdxT) * graph_size * output_graph_degree);
-
     constexpr int _omp_chunk = 1024;
 #pragma omp parallel for schedule(dynamic, _omp_chunk)
     for (uint64_t j = 0; j < graph_size; j++) {
@@ -578,7 +557,7 @@ void optimize(raft::resources const& res,
 #pragma omp parallel for reduction(+ : num_replaced_edges)
     for (uint64_t i = 0; i < graph_size; i++) {
       for (uint64_t k = 0; k < output_graph_degree; k++) {
-        const uint64_t j = pruned_graph.data_handle()[k + (output_graph_degree * i)];
+        const uint64_t j = output_graph_ptr[k + (output_graph_degree * i)];
         const uint64_t pos =
           pos_in_array<IdxT>(j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
         if (pos == output_graph_degree) { num_replaced_edges += 1; }

diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -20,6 +20,8 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
 #include <type_traits>
 
 namespace raft::neighbors::cagra::detail {
@@ -150,4 +152,97 @@ struct gen_index_msb_1_mask {
 };
 }  // namespace utils
 
+/**
+ * Utility to sync memory from a host_matrix_view to a device_matrix_view
+ *
+ * In certain situations (UVM/HMM/ATS) host memory might be directly accessible on the
+ * device, and no extra allocations need to be performed. This class checks
+ * if the host_matrix_view is already accessible on the device, and only creates device
+ * memory and copies over if necessary. In memory limited situations this is preferable
+ * to having both a host and device copy
+ * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
+ * have been merged, we should remove this class and switch over to using mdbuffer for this
+ */
+template <typename T, typename IdxT>
+class device_matrix_view_from_host {
+ public:
+  device_matrix_view_from_host(raft::resources const& res, host_matrix_view<T, IdxT> host_view)
+    : host_view_(host_view)
+  {
+    cudaPointerAttributes attr;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
+    device_ptr = reinterpret_cast<T*>(attr.devicePointer);
+    if (device_ptr == NULL) {
+      // allocate memory and copy over
+      device_mem_.emplace(
+        raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
+      raft::copy(device_mem_->data_handle(),
+                 host_view.data_handle(),
+                 host_view.extent(0) * host_view.extent(1),
+                 resource::get_cuda_stream(res));
+      device_ptr = device_mem_->data_handle();
+    }
+  }
+
+  device_matrix_view<T, IdxT> view()
+  {
+    return make_device_matrix_view<T, IdxT>(device_ptr, host_view_.extent(0), host_view_.extent(1));
+  }
+
+  T* data_handle() { return device_ptr; }
+
+  bool allocated_memory() const { return device_mem_.has_value(); }
+
+ private:
+  std::optional<device_matrix<T, IdxT>> device_mem_;
+  host_matrix_view<T, IdxT> host_view_;
+  T* device_ptr;
+};
+
+/**
+ * Utility to sync memory from a device_matrix_view to a host_matrix_view
+ *
+ * In certain situations (UVM/HMM/ATS) device memory might be directly accessible on the
+ * host, and no extra allocations need to be performed. This class checks
+ * if the device_matrix_view is already accessible on the host, and only creates host
+ * memory and copies over if necessary. In memory limited situations this is preferable
+ * to having both a host and device copy
+ * TODO: once the mdbuffer changes here https://github.com/wphicks/raft/blob/fea-mdbuffer
+ * have been merged, we should remove this class and switch over to using mdbuffer for this
+ */
+template <typename T, typename IdxT>
+class host_matrix_view_from_device {
+ public:
+  host_matrix_view_from_device(raft::resources const& res, device_matrix_view<T, IdxT> device_view)
+    : device_view_(device_view)
+  {
+    cudaPointerAttributes attr;
+    RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, device_view.data_handle()));
+    host_ptr = reinterpret_cast<T*>(attr.hostPointer);
+    if (host_ptr == NULL) {
+      // allocate memory and copy over
+      host_mem_.emplace(
+        raft::make_host_matrix<T, IdxT>(device_view.extent(0), device_view.extent(1)));
+      raft::copy(host_mem_->data_handle(),
+                 device_view.data_handle(),
+                 device_view.extent(0) * device_view.extent(1),
+                 resource::get_cuda_stream(res));
+      host_ptr = host_mem_->data_handle();
+    }
+  }
+
+  host_matrix_view<T, IdxT> view()
+  {
+    return make_host_matrix_view<T, IdxT>(host_ptr, device_view_.extent(0), device_view_.extent(1));
+  }
+
+  T* data_handle() { return host_ptr; }
+
+  bool allocated_memory() const { return host_mem_.has_value(); }
+
+ private:
+  std::optional<host_matrix<T, IdxT>> host_mem_;
+  device_matrix_view<T, IdxT> device_view_;
+  T* host_ptr;
+};
 }  // namespace raft::neighbors::cagra::detail
diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
@@ -99,4 +99,4 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | `ef`             | `search_param`  | Y        | Positive Integer >0                  |         | Size of the dynamic list for the nearest neighbors used for search. Higher value leads to more accurate but slower search. Cannot be lower than `k`.                                                                                                                                              |
 | `numThreads`     | `search_params` | N        | Positive Integer >0                  | 1       | Number of threads to use for queries.                                                                                                                                                                                                                                                             |
 
-Please refer to [HNSW algorithm parameters guide](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) from `hnswlib` to learn more about these arguments.
+Please refer to [HNSW algorithm parameters guide] from `hnswlib` to learn more about these arguments.