Merge branch 'branch-24.10' into cagra_fp16
achirkin committed Sep 27, 2024
2 parents: c6ae3b4 + b93b8f6, commit ef3314c
Showing 249 changed files with 7,205 additions and 8,086 deletions.
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -37,7 +37,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -37,7 +37,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -34,7 +34,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -34,7 +34,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.9.9
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -35,7 +35,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-aarch64=11.8
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -35,7 +35,6 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-64=11.8
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -32,7 +32,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- openblas
1 change: 0 additions & 1 deletion conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -32,7 +32,6 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.9.9
- ninja
- nlohmann_json>=3.11.2
- openblas
3 changes: 0 additions & 3 deletions conda/recipes/libcuvs/conda_build_config.yaml
@@ -19,9 +19,6 @@ c_stdlib_version:
cmake_version:
- ">=3.26.4,!=3.30.0"

nccl_version:
- ">=2.9.9"

h5py_version:
- ">=3.8.0"

199 changes: 94 additions & 105 deletions cpp/CMakeLists.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions cpp/bench/ann/src/common/ann_types.hpp
@@ -35,6 +35,7 @@ enum class Mode {
enum class MemoryType {
kHost,
kHostMmap,
kHostPinned,
kDevice,
};

@@ -60,6 +61,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
return MemoryType::kHost;
} else if (memory_type == "mmap") {
return MemoryType::kHostMmap;
} else if (memory_type == "pinned") {
return MemoryType::kHostPinned;
} else if (memory_type == "device") {
return MemoryType::kDevice;
} else {
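
The new kHostPinned option lets a benchmark request page-locked ("pinned") host memory for its data. Below is a minimal, self-contained sketch of the resulting string-to-enum mapping; it mirrors the names in ann_types.hpp, but the error branch and the main() driver are illustrative assumptions rather than the benchmark's actual code.

// Hypothetical stand-alone sketch of the string -> MemoryType mapping with the new
// "pinned" option; names follow ann_types.hpp, but this is not the real header.
#include <iostream>
#include <stdexcept>
#include <string>

enum class MemoryType { kHost, kHostMmap, kHostPinned, kDevice };

inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
{
  if (memory_type == "host") { return MemoryType::kHost; }
  if (memory_type == "mmap") { return MemoryType::kHostMmap; }
  if (memory_type == "pinned") { return MemoryType::kHostPinned; }  // added in this commit
  if (memory_type == "device") { return MemoryType::kDevice; }
  throw std::invalid_argument("unknown memory type: " + memory_type);  // assumed error handling
}

int main()
{
  std::cout << (parse_memory_type("pinned") == MemoryType::kHostPinned) << "\n";  // prints 1
  return 0;
}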
56 changes: 51 additions & 5 deletions cpp/bench/ann/src/common/dataset.hpp
@@ -286,15 +286,56 @@
{
switch (memory_type) {
case MemoryType::kDevice: return query_set_on_gpu();
default: return query_set();
case MemoryType::kHost: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (query_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
query_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::kHostPinned: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (!query_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), query_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
query_set_pinned_ = true;
}
#endif
return r;
}
default: return nullptr;
}
}

auto base_set(MemoryType memory_type) const -> const T*
{
switch (memory_type) {
case MemoryType::kDevice: return base_set_on_gpu();
case MemoryType::kHost: return base_set();
case MemoryType::kHost: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (base_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
base_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::kHostPinned: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (!base_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), base_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
base_set_pinned_ = true;
}
#endif
return r;
}
case MemoryType::kHostMmap: return mapped_base_set();
default: return nullptr;
}
@@ -315,18 +356,23 @@ class dataset {
mutable T* d_query_set_ = nullptr;
mutable T* mapped_base_set_ = nullptr;
mutable int32_t* gt_set_ = nullptr;

mutable bool base_set_pinned_ = false;
mutable bool query_set_pinned_ = false;
};

template <typename T>
dataset<T>::~dataset()
{
delete[] base_set_;
delete[] query_set_;
delete[] gt_set_;
#ifndef BUILD_CPU_ONLY
if (d_base_set_) { cudaFree(d_base_set_); }
if (d_query_set_) { cudaFree(d_query_set_); }
if (base_set_pinned_) { cudaHostUnregister(base_set_); }
if (query_set_pinned_) { cudaHostUnregister(query_set_); }
#endif
delete[] base_set_;
delete[] query_set_;
delete[] gt_set_;
}

template <typename T>
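
The dataset changes above pin already-allocated host arrays in place on demand: cudaHostRegister when kHostPinned is requested, cudaHostUnregister when plain host memory is requested again or the dataset is destroyed. The stand-alone sketch below shows that pin-in-place pattern on its own, with error handling trimmed down; it is an illustration under those assumptions, not code from the benchmark.

// Minimal sketch of pinning an existing pageable allocation in place (assumes a CUDA
// toolkit is available; most error checks are omitted for brevity).
#include <cstddef>
#include <cstdio>
#include <vector>

#include <cuda_runtime.h>

int main()
{
  constexpr std::size_t n = 1 << 20;
  std::vector<float> host_data(n, 1.0f);  // ordinary pageable host memory

  // Page-lock the existing allocation so H2D copies can use the faster DMA path.
  if (cudaHostRegister(host_data.data(), n * sizeof(float), cudaHostRegisterDefault) !=
      cudaSuccess) {
    std::fprintf(stderr, "cudaHostRegister failed\n");
    return 1;
  }

  float* device_data = nullptr;
  cudaMalloc(&device_data, n * sizeof(float));
  cudaMemcpy(device_data, host_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  // Unpin before the vector is destroyed, mirroring the cleanup in ~dataset().
  cudaHostUnregister(host_data.data());
  cudaFree(device_data);
  return 0;
}

Registering the existing allocation avoids reallocating and copying the dataset while still giving transfers the bandwidth of page-locked memory.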
133 changes: 98 additions & 35 deletions cpp/bench/ann/src/common/util.hpp
@@ -198,42 +198,71 @@ inline auto get_stream_from_global_pool() -> cudaStream_t
#endif
}

struct result_buffer {
explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream}
/** The workspace buffer for use thread-locally. */
struct ws_buffer {
explicit ws_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} {}
ws_buffer() = delete;
ws_buffer(ws_buffer&&) = delete;
auto operator=(ws_buffer&&) -> ws_buffer& = delete;
ws_buffer(const ws_buffer&) = delete;
auto operator=(const ws_buffer&) -> ws_buffer& = delete;
~ws_buffer() noexcept
{
if (size_ == 0) { return; }
data_host_ = malloc(size_);
#ifndef BUILD_CPU_ONLY
cudaMallocAsync(&data_device_, size_, stream_);
cudaStreamSynchronize(stream_);
#endif
}
result_buffer() = delete;
result_buffer(result_buffer&&) = delete;
auto operator=(result_buffer&&) -> result_buffer& = delete;
result_buffer(const result_buffer&) = delete;
auto operator=(const result_buffer&) -> result_buffer& = delete;
~result_buffer() noexcept
{
if (size_ == 0) { return; }
#ifndef BUILD_CPU_ONLY
cudaFreeAsync(data_device_, stream_);
cudaStreamSynchronize(stream_);
if (data_device_ != nullptr) {
cudaFreeAsync(data_device_, stream_);
cudaStreamSynchronize(stream_);
}
if (data_host_ != nullptr) { cudaFreeHost(data_host_); }
#else
if (data_host_ != nullptr) { free(data_host_); }
#endif
free(data_host_);
}

[[nodiscard]] auto size() const noexcept { return size_; }
[[nodiscard]] auto data(MemoryType loc) const noexcept
[[nodiscard]] auto data(MemoryType loc) const noexcept -> void*
{
if (size_ == 0) { return nullptr; }
switch (loc) {
case MemoryType::kDevice: return data_device_;
default: return data_host_;
#ifndef BUILD_CPU_ONLY
case MemoryType::kDevice: {
if (data_device_ == nullptr) {
cudaMallocAsync(&data_device_, size_, stream_);
cudaStreamSynchronize(stream_);
needs_cleanup_device_ = false;
} else if (needs_cleanup_device_) {
cudaMemsetAsync(data_device_, 0, size_, stream_);
cudaStreamSynchronize(stream_);
needs_cleanup_device_ = false;
}
return data_device_;
}
#endif
default: {
if (data_host_ == nullptr) {
#ifndef BUILD_CPU_ONLY
cudaMallocHost(&data_host_, size_);
#else
data_host_ = malloc(size_);
#endif
needs_cleanup_host_ = false;
} else if (needs_cleanup_host_) {
memset(data_host_, 0, size_);
needs_cleanup_host_ = false;
}
return data_host_;
}
}
}

void transfer_data(MemoryType dst, MemoryType src)
{
// The destination is overwritten and thus does not need cleanup
if (dst == MemoryType::kDevice) {
needs_cleanup_device_ = false;
} else {
needs_cleanup_host_ = false;
}
auto dst_ptr = data(dst);
auto src_ptr = data(src);
if (dst_ptr == src_ptr) { return; }
@@ -243,15 +272,25 @@ struct result_buffer {
#endif
}

/** Mark the buffer for reuse - it needs to be cleared to make sure the previous results are not
* leaked to the new iteration. */
void reuse()
{
needs_cleanup_host_ = true;
needs_cleanup_device_ = true;
}

private:
size_t size_{0};
cudaStream_t stream_ = nullptr;
void* data_host_ = nullptr;
void* data_device_ = nullptr;
cudaStream_t stream_ = nullptr;
mutable void* data_host_ = nullptr;
mutable void* data_device_ = nullptr;
mutable bool needs_cleanup_host_ = false;
mutable bool needs_cleanup_device_ = false;
};

namespace detail {
inline std::vector<std::unique_ptr<result_buffer>> global_result_buffer_pool(0);
inline std::vector<std::unique_ptr<ws_buffer>> global_result_buffer_pool(0);
inline std::mutex grp_mutex;
} // namespace detail

@@ -262,24 +301,47 @@ inline std::mutex grp_mutex;
* This reduces the setup overhead and number of times the context is being blocked
* (this is relevant if there is a persistent kernel running across multiples benchmark cases).
*/
inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer&
inline auto get_result_buffer_from_global_pool(size_t size) -> ws_buffer&
{
auto stream = get_stream_from_global_pool();
auto& rb = [stream, size]() -> result_buffer& {
auto& rb = [stream, size]() -> ws_buffer& {
std::lock_guard guard(detail::grp_mutex);
if (static_cast<int>(detail::global_result_buffer_pool.size()) < benchmark_n_threads) {
detail::global_result_buffer_pool.resize(benchmark_n_threads);
}
auto& rb = detail::global_result_buffer_pool[benchmark_thread_id];
if (!rb || rb->size() < size) { rb = std::make_unique<result_buffer>(size, stream); }
if (!rb || rb->size() < size) {
rb = std::make_unique<ws_buffer>(size, stream);
} else {
rb->reuse();
}
return *rb;
}();
return rb;
}

memset(rb.data(MemoryType::kHost), 0, size);
#ifndef BUILD_CPU_ONLY
cudaMemsetAsync(rb.data(MemoryType::kDevice), 0, size, stream);
cudaStreamSynchronize(stream);
#endif
namespace detail {
inline std::vector<std::unique_ptr<ws_buffer>> global_tmp_buffer_pool(0);
inline std::mutex gtp_mutex;
} // namespace detail

/**
* Global temporary buffer pool for use by algorithms.
* In contrast to `get_result_buffer_from_global_pool`, the content of these buffers is never
* initialized.
*/
inline auto get_tmp_buffer_from_global_pool(size_t size) -> ws_buffer&
{
auto stream = get_stream_from_global_pool();
auto& rb = [stream, size]() -> ws_buffer& {
std::lock_guard guard(detail::gtp_mutex);
if (static_cast<int>(detail::global_tmp_buffer_pool.size()) < benchmark_n_threads) {
detail::global_tmp_buffer_pool.resize(benchmark_n_threads);
}
auto& rb = detail::global_tmp_buffer_pool[benchmark_thread_id];
if (!rb || rb->size() < size) { rb = std::make_unique<ws_buffer>(size, stream); }
return *rb;
}();
return rb;
}

@@ -293,6 +355,7 @@ inline void reset_global_device_resources()
{
#ifndef BUILD_CPU_ONLY
std::lock_guard guard(detail::gsp_mutex);
detail::global_tmp_buffer_pool.resize(0);
detail::global_result_buffer_pool.resize(0);
detail::global_stream_pool.resize(0);
#endif
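
The rewritten ws_buffer allocates its host and device storage lazily on first access and, after reuse(), clears a buffer only when it is next requested, so untouched buffers cost nothing between benchmark iterations. A simplified, host-only sketch of that lazy-allocate-and-clear idiom follows; the class name and the malloc/memset implementation here are assumptions for illustration, not the benchmark's CUDA-aware version.

// Host-only illustration of lazy allocation plus deferred clearing: memory is allocated
// on first access, and reuse() only marks the buffer dirty so the memset happens lazily.
#include <cstdlib>
#include <cstring>

class lazy_buffer {
 public:
  explicit lazy_buffer(std::size_t size) : size_{size} {}
  ~lazy_buffer() { std::free(data_); }
  lazy_buffer(const lazy_buffer&)            = delete;
  lazy_buffer& operator=(const lazy_buffer&) = delete;

  // Allocate on first use; clear only if a previous user marked the buffer dirty.
  void* data()
  {
    if (data_ == nullptr) {
      data_  = std::malloc(size_);
      dirty_ = false;
    } else if (dirty_) {
      std::memset(data_, 0, size_);
      dirty_ = false;
    }
    return data_;
  }

  // Cheap hand-off between iterations: defer the clearing until data() is called again.
  void reuse() { dirty_ = true; }

 private:
  std::size_t size_{0};
  void* data_ = nullptr;
  bool dirty_ = false;
};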
10 changes: 10 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -247,6 +247,16 @@ void parse_search_param(const nlohmann::json& conf,
if (conf.contains("itopk")) { param.p.itopk_size = conf.at("itopk"); }
if (conf.contains("search_width")) { param.p.search_width = conf.at("search_width"); }
if (conf.contains("max_iterations")) { param.p.max_iterations = conf.at("max_iterations"); }
if (conf.contains("persistent")) { param.p.persistent = conf.at("persistent"); }
if (conf.contains("persistent_lifetime")) {
param.p.persistent_lifetime = conf.at("persistent_lifetime");
}
if (conf.contains("persistent_device_usage")) {
param.p.persistent_device_usage = conf.at("persistent_device_usage");
}
if (conf.contains("thread_block_size")) {
param.p.thread_block_size = conf.at("thread_block_size");
}
if (conf.contains("algo")) {
if (conf.at("algo") == "single_cta") {
param.p.algo = cuvs::neighbors::cagra::search_algo::SINGLE_CTA;
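
The parser above now forwards four additional CAGRA search options from the benchmark's JSON config: persistent, persistent_lifetime, persistent_device_usage, and thread_block_size. The sketch below shows such a config being read with nlohmann::json in the same contains/at style; the surrounding struct, its field types, and the example values are assumptions for illustration only.

// Sketch of reading the newly parsed search keys; key names come from the diff above,
// while the struct and defaults are hypothetical.
#include <iostream>

#include <nlohmann/json.hpp>

struct cagra_search_params_sketch {
  bool persistent               = false;
  float persistent_lifetime     = 0.0f;
  float persistent_device_usage = 0.0f;
  int thread_block_size         = 0;
};

int main()
{
  auto conf = nlohmann::json::parse(R"({
    "persistent": true,
    "persistent_lifetime": 2.0,
    "persistent_device_usage": 1.0,
    "thread_block_size": 256
  })");

  cagra_search_params_sketch p;
  if (conf.contains("persistent")) { p.persistent = conf.at("persistent").get<bool>(); }
  if (conf.contains("persistent_lifetime")) {
    p.persistent_lifetime = conf.at("persistent_lifetime").get<float>();
  }
  if (conf.contains("persistent_device_usage")) {
    p.persistent_device_usage = conf.at("persistent_device_usage").get<float>();
  }
  if (conf.contains("thread_block_size")) {
    p.thread_block_size = conf.at("thread_block_size").get<int>();
  }
  std::cout << "persistent=" << p.persistent << " thread_block_size=" << p.thread_block_size
            << "\n";
  return 0;
}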
(Diffs for the remaining changed files are not shown.)
