[WIP] Persistent CAGRA kernel #2316

Closed · wants to merge 46 commits
Changes from all commits
46 commits
f8d25c1
Sync via barriers
achirkin Apr 11, 2024
cd012c4
Waiting on CPU side.
achirkin Apr 16, 2024
a1a091c
Use simple atomics for synchronization.
achirkin Apr 16, 2024
e6ad7b6
Added launcher_t - a helper state machine struct to interleave work q…
achirkin Apr 17, 2024
0137dd4
Initialize the kernel runner in a separate thread and improve thread …
achirkin Apr 17, 2024
25dda44
Added small memory sync optimizations
achirkin Apr 18, 2024
1d60ab5
Change the benchmark neighbors time size_t -> uint32_t and make bench…
achirkin Apr 25, 2024
efd7966
Slightly increase occupancy to improve QPS
achirkin Apr 19, 2024
e7c35df
Align input and sync variables with cache lines to avoid cache conflicts
achirkin Apr 19, 2024
7089ed8
Optimize the waiting for the input inside the kernel.
achirkin Apr 23, 2024
259e5ec
cagra wrapper: avoid constructing rmm uvectors when not needed
achirkin Apr 24, 2024
63f996a
Avoid any calls to RMM in IO threads.
achirkin Apr 24, 2024
3d1011d
Use atomics on the persistent runner (shared_ptr) to reduce the numbe…
achirkin Apr 24, 2024
8b19907
Remove the shared state and the mutex lock from NVTX helpers
achirkin Apr 24, 2024
4d2b8d5
Split the sync queue in two: job descriptors and idle worker handles
achirkin Apr 24, 2024
83355ab
Add the third-party atomic_queue headers for easier testing
achirkin Apr 24, 2024
f177a81
Tweak the CPU waiting behavior to avoid busy-spinning
achirkin Apr 25, 2024
db5b002
Add a single-threaded deque for pending_reads to reduce the cpu/cache…
achirkin Apr 25, 2024
c748160
ann_bench: minimize chances for GPU sync between benchmark cases
achirkin Apr 26, 2024
d51729c
Fix OOB bugs revealed on GH
achirkin Apr 26, 2024
9dd3d32
Add a thread-local weak_ptr for the runner to further reduce possible…
achirkin Apr 26, 2024
4361a5e
Keep result buffers between runs to avoid blocking the persistent ker…
achirkin Apr 29, 2024
e96cc0f
Avoid an extra layer of atomics on the persistent runner (shared_ptr)
achirkin Apr 29, 2024
c86dfcf
Reducing congestions: avoid too many writes to the last_touch/heartbe…
achirkin Apr 29, 2024
aaba912
Make a custom implementation of the shared resource queue to optimize…
achirkin May 2, 2024
8a4ff2e
Add expectation-based sleep to the waiting loop
achirkin May 3, 2024
732072d
Make the gpu worker report reading the handle is done earlier.
achirkin May 7, 2024
7450f6f
Move the last_touch initialization into the constructor of the contai…
achirkin May 7, 2024
8920dfc
Modify the resource queue to never loop on head/tail counters
achirkin May 7, 2024
ba78957
Replace yield() with a smarter, work-aware pause() to ease the CPU us…
achirkin May 7, 2024
304a864
Expose thread_block_size parameter
achirkin May 13, 2024
0879955
Make the 'persistent' parameter in the search_params
achirkin May 14, 2024
affdcb2
Update the parameter parser to use the 'persistent' flag in the searc…
achirkin May 14, 2024
56195f5
Merge remote-tracking branch 'rapidsai/branch-24.06' into fea-persist…
achirkin May 15, 2024
a48d8f8
Fix the uses_stream() not adapted to the previous change introducing …
achirkin May 15, 2024
6a1e5f1
Merge branch 'branch-24.06' into fea-persistent-cagra
achirkin May 15, 2024
c408dae
Merge remote-tracking branch 'rapidsai/branch-24.06' into fea-persist…
achirkin May 15, 2024
cf26a2b
Merge branch 'branch-24.06' into fea-persistent-cagra
achirkin May 16, 2024
6079cc9
Recover the uses_stream() function in the cagra_wrapper after the cod…
achirkin May 16, 2024
8dd4714
Merge branch 'branch-24.06' into fea-persistent-cagra
achirkin May 17, 2024
fc2ac99
Merge branch 'branch-24.08' into fea-persistent-cagra
achirkin Jun 13, 2024
9580ac6
Merge branch 'branch-24.08' into fea-persistent-cagra
achirkin Jun 13, 2024
74d47c2
Restart the persistent kernel if launch parameters changes
achirkin Jun 24, 2024
608e61d
Merge remote-tracking branch 'rapidsai/branch-24.08' into fea-persist…
achirkin Jun 24, 2024
22a77c2
Allow 'pinned' value for the benchmark queries and adjust host refine…
achirkin Jun 24, 2024
c62644d
Merge branch 'branch-24.08' into fea-persistent-cagra
achirkin Jul 8, 2024
5 changes: 4 additions & 1 deletion cpp/bench/ann/src/common/ann_types.hpp
@@ -33,6 +33,7 @@ enum Objective {
enum class MemoryType {
Host,
HostMmap,
HostPinned,
Device,
};

@@ -58,6 +59,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
return MemoryType::Host;
} else if (memory_type == "mmap") {
return MemoryType::HostMmap;
} else if (memory_type == "pinned") {
return MemoryType::HostPinned;
} else if (memory_type == "device") {
return MemoryType::Device;
} else {
@@ -73,7 +76,7 @@ struct AlgoProperty {

class AnnBase {
public:
using index_type = uint32_t;
Contributor:

Do we want to change this for all benchmarks?

Contributor (author):

That was a temporary change for testing; I've removed it in the cuVS PR (cuVS doesn't support uint32_t indexes in the refinement step).


inline AnnBase(Metric metric, int dim) : metric_(metric), dim_(dim) {}
virtual ~AnnBase() noexcept = default;
56 changes: 51 additions & 5 deletions cpp/bench/ann/src/common/dataset.hpp
@@ -283,15 +283,56 @@ class Dataset {
{
switch (memory_type) {
case MemoryType::Device: return query_set_on_gpu();
case MemoryType::Host: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (query_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
query_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::HostPinned: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (!query_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), query_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
query_set_pinned_ = true;
}
#endif
return r;
}
default: return nullptr;
}
}

auto base_set(MemoryType memory_type) const -> const T*
{
switch (memory_type) {
case MemoryType::Device: return base_set_on_gpu();
case MemoryType::Host: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (base_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
base_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::HostPinned: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (!base_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), base_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
base_set_pinned_ = true;
}
#endif
return r;
}
case MemoryType::HostMmap: return mapped_base_set();
default: return nullptr;
}
@@ -312,18 +353,23 @@
mutable T* d_query_set_ = nullptr;
mutable T* mapped_base_set_ = nullptr;
mutable int32_t* gt_set_ = nullptr;

mutable bool base_set_pinned_ = false;
mutable bool query_set_pinned_ = false;
};

template <typename T>
Dataset<T>::~Dataset()
{
#ifndef BUILD_CPU_ONLY
if (d_base_set_) { cudaFree(d_base_set_); }
if (d_query_set_) { cudaFree(d_query_set_); }
if (base_set_pinned_) { cudaHostUnregister(base_set_); }
if (query_set_pinned_) { cudaHostUnregister(query_set_); }
#endif
delete[] base_set_;
delete[] query_set_;
delete[] gt_set_;
}

template <typename T>
33 changes: 31 additions & 2 deletions cpp/bench/ann/src/common/util.hpp
@@ -197,10 +197,12 @@ struct result_buffer {
explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream}
{
if (size_ == 0) { return; }
#ifndef BUILD_CPU_ONLY
cudaMallocAsync(&data_device_, size_, stream_);
cudaMallocHost(&data_host_, size_);
cudaStreamSynchronize(stream_);
#else
data_host_ = malloc(size_);
#endif
}
result_buffer() = delete;
@@ -213,9 +215,11 @@
if (size_ == 0) { return; }
#ifndef BUILD_CPU_ONLY
cudaFreeAsync(data_device_, stream_);
cudaFreeHost(data_host_);
cudaStreamSynchronize(stream_);
#else
free(data_host_);
#endif
}

[[nodiscard]] auto size() const noexcept { return size_; }
@@ -278,6 +282,31 @@ inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer&
return rb;
}

namespace detail {
inline std::vector<std::unique_ptr<result_buffer>> global_tmp_buffer_pool(0);
inline std::mutex gtp_mutex;
} // namespace detail

/**
* Global temporary buffer pool for use by algorithms.
* In contrast to `get_result_buffer_from_global_pool`, the content of these buffers is never
* initialized.
*/
inline auto get_tmp_buffer_from_global_pool(size_t size) -> result_buffer&
{
auto stream = get_stream_from_global_pool();
auto& rb = [stream, size]() -> result_buffer& {
std::lock_guard guard(detail::gtp_mutex);
if (static_cast<int>(detail::global_tmp_buffer_pool.size()) < benchmark_n_threads) {
detail::global_tmp_buffer_pool.resize(benchmark_n_threads);
}
auto& rb = detail::global_tmp_buffer_pool[benchmark_thread_id];
if (!rb || rb->size() < size) { rb = std::make_unique<result_buffer>(size, stream); }
return *rb;
}();
return rb;
}

/**
* Delete all streams and memory allocations in the global pool.
* It's called at the end of the `main` function - before global/static variables and cuda context
4 changes: 4 additions & 0 deletions cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h
@@ -249,6 +249,10 @@ void parse_search_param(const nlohmann::json& conf,
if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); }
if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); }
if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); }
if (conf.contains("persistent")) { param.p.persistent = conf.at("persistent"); }
if (conf.contains("thread_block_size")) {
param.p.thread_block_size = conf.at("thread_block_size");
}
if (conf.contains("algo")) {
if (conf.at("algo") == "single_cta") {
param.p.algo = raft::neighbors::experimental::cagra::search_algo::SINGLE_CTA;
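With the parser changes above, a benchmark search configuration can opt into the persistent kernel. A hypothetical config entry (the surrounding fields and values are illustrative; only `persistent` and `thread_block_size` are the keys added by this PR):

```json
{
  "itopk": 64,
  "search_width": 1,
  "persistent": true,
  "thread_block_size": 256,
  "algo": "single_cta"
}
```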
56 changes: 38 additions & 18 deletions cpp/bench/ann/src/raft/raft_ann_bench_utils.h
@@ -228,27 +228,47 @@ void refine_helper(const raft::resources& res,
} else {
auto dataset_host = raft::make_host_matrix_view<const data_type, extents_type>(
dataset.data_handle(), dataset.extent(0), dataset.extent(1));
if (raft::get_device_for_address(queries.data_handle()) >= 0) {
// Queries & results are on the device

auto queries_host = raft::make_host_matrix<data_type, extents_type>(batch_size, dim);
auto candidates_host = raft::make_host_matrix<index_type, extents_type>(batch_size, k0);
auto neighbors_host = raft::make_host_matrix<index_type, extents_type>(batch_size, k);
auto distances_host = raft::make_host_matrix<float, extents_type>(batch_size, k);

auto stream = resource::get_cuda_stream(res);
raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream);
raft::copy(
candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream);

raft::resource::sync_stream(res); // wait for the queries and candidates
raft::neighbors::refine<index_type, data_type, float, extents_type>(res,
dataset_host,
queries_host.view(),
candidates_host.view(),
neighbors_host.view(),
distances_host.view(),
metric);

raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream);
raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);

} else {
// Queries & results are on the host - no device sync / copy needed

auto queries_host = raft::make_host_matrix_view<const data_type, extents_type>(
queries.data_handle(), batch_size, dim);
auto candidates_host = raft::make_host_matrix_view<const index_type, extents_type>(
candidates.data_handle(), batch_size, k0);
auto neighbors_host =
raft::make_host_matrix_view<index_type, extents_type>(neighbors, batch_size, k);
auto distances_host =
raft::make_host_matrix_view<float, extents_type>(distances, batch_size, k);

raft::neighbors::refine<index_type, data_type, float, extents_type>(
res, dataset_host, queries_host, candidates_host, neighbors_host, distances_host, metric);
}
}
}

44 changes: 36 additions & 8 deletions cpp/bench/ann/src/raft/raft_cagra_wrapper.h
@@ -117,6 +117,15 @@ class RaftCagra : public ANN<T>, public AnnGPU {
return handle_.get_sync_stream();
}

[[nodiscard]] auto uses_stream() const noexcept -> bool override
{
// If the algorithm uses the persistent kernel, the CPU has to synchronize by the end of
// computing the result, which guarantees the benchmark CUDA stream is empty by the end of the
// execution. Hence we tell the benchmark not to waste time on recording & synchronizing the
// event.
return !search_params_.persistent;
}

// to enable dataset access from GPU memory
AlgoProperty get_preference() const override
{
@@ -326,14 +335,33 @@ void RaftCagra<T, IdxT>::search(
} else {
auto queries_v =
raft::make_device_matrix_view<const T, AnnBase::index_type>(queries, batch_size, dimension_);

auto& tmp_buf = get_tmp_buffer_from_global_pool((sizeof(float) + sizeof(AnnBase::index_type)) *
batch_size * k0);
auto mem_type =
raft::get_device_for_address(neighbors) >= 0 ? MemoryType::Device : MemoryType::HostPinned;

auto candidate_ixs = raft::make_device_matrix_view<AnnBase::index_type, AnnBase::index_type>(
reinterpret_cast<AnnBase::index_type*>(tmp_buf.data(mem_type)), batch_size, k0);
auto candidate_dists = reinterpret_cast<float*>(candidate_ixs.data_handle() + batch_size * k0);

search_base(queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists);

if (mem_type == MemoryType::HostPinned && uses_stream()) {
// If the algorithm uses a stream to synchronize (non-persistent kernel) but the data is in
// pinned host memory, we need to synchronize before the refinement operation to make sure
// the data is available on the host.
raft::resource::sync_stream(res);
}

refine_helper(res,
*input_dataset_v_,
queries_v,
raft::make_const_mdspan(candidate_ixs),
k,
neighbors,
distances,
index_->metric());
}
}
} // namespace raft::bench::ann
2 changes: 2 additions & 0 deletions cpp/include/raft/neighbors/cagra_types.hpp
@@ -124,6 +124,8 @@ struct search_params : ann::search_params {
uint32_t num_random_samplings = 1;
/** Bit mask used for initial random seed node selection. */
uint64_t rand_xor_mask = 0x128394;
/** Whether to use the persistent version of the kernel (only SINGLE_CTA is supported a.t.m.) */
bool persistent = false;
Contributor:

Is this the only parameter that needs to be controlled? What about the temporary buffer size (queue for queries, or max number of threads)?

Contributor (author):

The buffer sizes are compile-time constants in the current design. I'd like to move some of the new constants to the parameters, but that conflicts with the implicit mechanics of running the kernel: they are not search parameters but "runner" parameters, and should not change across calls to the kernel.
So this brings up again the question of whether we want an implicit or an explicit kernel runner.

};

static_assert(std::is_aggregate_v<index_params>);