Merge branch 'branch-24.10' into cagra_build_binary_size
cjnolet authored Sep 27, 2024
2 parents d4d3f20 + d9eec69 commit 5409721
Showing 76 changed files with 3,460 additions and 320 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -79,3 +79,6 @@ cagra_index
ivf_flat_index
ivf_pq_index

# cuvs_bench
datasets/
/*.json
9 changes: 7 additions & 2 deletions build.sh
@@ -275,7 +275,7 @@ if hasArg tests || (( ${NUMARGS} == 0 )); then
fi

if hasArg bench-ann || (( ${NUMARGS} == 0 )); then
BUILD_ANN_BENCH=ON
BUILD_CUVS_BENCH=ON
CMAKE_TARGET="${CMAKE_TARGET};${ANN_BENCH_TARGETS}"
fi

@@ -351,7 +351,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has
-DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \
-DBUILD_TESTS=${BUILD_TESTS} \
-DBUILD_C_TESTS=${BUILD_TESTS} \
-DBUILD_ANN_BENCH=${BUILD_ANN_BENCH} \
-DBUILD_CUVS_BENCH=${BUILD_CUVS_BENCH} \
-DBUILD_CPU_ONLY=${BUILD_CPU_ONLY} \
-DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \
${CACHE_ARGS} \
@@ -419,6 +419,11 @@ if (( ${NUMARGS} == 0 )) || hasArg python; then
python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs
fi

# Build and (optionally) install the cuvs_bench Python package
if (( ${NUMARGS} == 0 )) || hasArg bench-ann; then
python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs_bench
fi

# Build the cuvs Rust bindings
if (( ${NUMARGS} == 0 )) || hasArg rust; then
cd ${REPODIR}/rust
3 changes: 3 additions & 0 deletions conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -42,5 +42,8 @@ dependencies:
- pandas
- pylibraft==24.10.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
- sysroot_linux-aarch64==2.17
- wheel
name: bench_ann_cuda-118_arch-aarch64
3 changes: 3 additions & 0 deletions conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -42,5 +42,8 @@ dependencies:
- pandas
- pylibraft==24.10.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
- sysroot_linux-64==2.17
- wheel
name: bench_ann_cuda-118_arch-x86_64
3 changes: 3 additions & 0 deletions conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -38,5 +38,8 @@ dependencies:
- pandas
- pylibraft==24.10.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
- sysroot_linux-aarch64==2.17
- wheel
name: bench_ann_cuda-125_arch-aarch64
3 changes: 3 additions & 0 deletions conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -38,5 +38,8 @@ dependencies:
- pandas
- pylibraft==24.10.*,>=0.0.0a0
- pyyaml
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- setuptools
- sysroot_linux-64==2.17
- wheel
name: bench_ann_cuda-125_arch-x86_64
2 changes: 1 addition & 1 deletion conda/recipes/libcuvs/build_libcuvs_tests.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

./build.sh tests bench-ann --allgpuarch --no-nvtx --build-metrics=tests_bench --incl-cache-stats
./build.sh tests --allgpuarch --no-nvtx --build-metrics=tests --incl-cache-stats
cmake --install cpp/build --component testing
8 changes: 4 additions & 4 deletions cpp/CMakeLists.txt
@@ -55,7 +55,7 @@ option(BUILD_SHARED_LIBS "Build cuvs shared libraries" ON)
option(BUILD_TESTS "Build cuvs unit-tests" ON)
option(BUILD_C_LIBRARY "Build cuVS C API library" OFF)
option(BUILD_C_TESTS "Build cuVS C API tests" OFF)
option(BUILD_ANN_BENCH "Build cuVS ann benchmarks" OFF)
option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF)
option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON)
option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
option(CUDA_ENABLE_LINEINFO
@@ -96,7 +96,7 @@ include(CMakeDependentOption)

message(VERBOSE "cuVS: Build cuVS unit-tests: ${BUILD_TESTS}")
message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}")
message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_ANN_BENCH}")
message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_CUVS_BENCH}")
message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
message(VERBOSE "cuVS: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS})
message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
@@ -188,7 +188,7 @@ endif()

include(cmake/thirdparty/get_cutlass.cmake)

if(BUILD_ANN_BENCH)
if(BUILD_CUVS_BENCH)
include(${rapids-cmake-dir}/cpm/gbench.cmake)
rapids_cpm_gbench(BUILD_STATIC)
endif()
@@ -651,6 +651,6 @@ endif()
# ##################################################################################################
# * build ann benchmark executable -----------------------------------------------

if(BUILD_ANN_BENCH)
if(BUILD_CUVS_BENCH)
add_subdirectory(bench/ann/)
endif()
52 changes: 13 additions & 39 deletions cpp/bench/ann/CMakeLists.txt
@@ -199,30 +199,19 @@ if(NOT TARGET CUVS_ANN_BENCH_ALL)
endif()

if(CUVS_ANN_BENCH_USE_HNSWLIB)
ConfigureAnnBench(
NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
)
ConfigureAnnBench(NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib)

endif()

if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ)
ConfigureAnnBench(
NAME CUVS_IVF_PQ
PATH
src/cuvs/cuvs_benchmark.cu
src/cuvs/cuvs_ivf_pq.cu
LINKS cuvs
NAME CUVS_IVF_PQ PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_pq.cu LINKS cuvs
)
endif()

if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT)
ConfigureAnnBench(
NAME CUVS_IVF_FLAT
PATH
src/cuvs/cuvs_benchmark.cu
src/cuvs/cuvs_ivf_flat.cu
LINKS
cuvs
NAME CUVS_IVF_FLAT PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_flat.cu LINKS cuvs
)
endif()

@@ -232,12 +221,8 @@ endif()

if(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE)
ConfigureAnnBench(
NAME
CUVS_KNN_BRUTE_FORCE
PATH
$<$<BOOL:${CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE}>:src/cuvs/cuvs_brute_force_knn.cu>
LINKS
cuvs
NAME CUVS_KNN_BRUTE_FORCE PATH
$<$<BOOL:${CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE}>:src/cuvs/cuvs_brute_force_knn.cu> LINKS cuvs
)
endif()

@@ -258,45 +243,39 @@ endif()

if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
ConfigureAnnBench(
NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs
hnswlib::hnswlib
NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib
)
endif()

message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}")
message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT)
ConfigureAnnBench(
NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${CUVS_FAISS_TARGETS}
NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
)
endif()

if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT)
ConfigureAnnBench(
NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${CUVS_FAISS_TARGETS}
NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
)
endif()

if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
ConfigureAnnBench(
NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
${CUVS_FAISS_TARGETS}
NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS}
)
endif()

if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT AND CUVS_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS
${CUVS_FAISS_TARGETS}
NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
)
endif()

if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND CUVS_FAISS_ENABLE_GPU)
ConfigureAnnBench(
NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS
${CUVS_FAISS_TARGETS}
NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS}
)
endif()

@@ -322,13 +301,8 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)

target_link_libraries(
ANN_BENCH
PRIVATE raft::raft
nlohmann_json::nlohmann_json
benchmark::benchmark
dl
fmt::fmt-header-only
spdlog::spdlog_header_only
$<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only
spdlog::spdlog_header_only $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
)
set_target_properties(
ANN_BENCH
3 changes: 3 additions & 0 deletions cpp/bench/ann/src/common/ann_types.hpp
@@ -35,6 +35,7 @@ enum class Mode {
enum class MemoryType {
kHost,
kHostMmap,
kHostPinned,
kDevice,
};

@@ -60,6 +61,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
return MemoryType::kHost;
} else if (memory_type == "mmap") {
return MemoryType::kHostMmap;
} else if (memory_type == "pinned") {
return MemoryType::kHostPinned;
} else if (memory_type == "device") {
return MemoryType::kDevice;
} else {
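The hunk above adds a third host-side memory type, kHostPinned, selected with the "pinned" string. Below is a minimal, self-contained sketch of what page-locking an existing host allocation via cudaHostRegister looks like in isolation; the buffer name and sizes are illustrative only and are not taken from the benchmark code.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  // Ordinary pageable host allocation, as a dataset loader would produce.
  std::vector<float> host_buf(1 << 20, 1.0f);
  const size_t bytes = host_buf.size() * sizeof(float);

  // Page-lock the existing allocation in place; copies from a registered
  // buffer can use DMA without an extra staging copy.
  if (cudaHostRegister(host_buf.data(), bytes, cudaHostRegisterDefault) != cudaSuccess) {
    std::printf("cudaHostRegister failed\n");
    return 1;
  }

  float* d_buf = nullptr;
  cudaMalloc(&d_buf, bytes);
  cudaMemcpy(d_buf, host_buf.data(), bytes, cudaMemcpyHostToDevice);

  // Unregister before the host allocation goes away, mirroring the cleanup
  // order used in the dataset destructor further down.
  cudaFree(d_buf);
  cudaHostUnregister(host_buf.data());
  return 0;
}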
56 changes: 51 additions & 5 deletions cpp/bench/ann/src/common/dataset.hpp
@@ -286,15 +286,56 @@ class dataset {
{
switch (memory_type) {
case MemoryType::kDevice: return query_set_on_gpu();
default: return query_set();
case MemoryType::kHost: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (query_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
query_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::kHostPinned: {
auto r = query_set();
#ifndef BUILD_CPU_ONLY
if (!query_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), query_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
query_set_pinned_ = true;
}
#endif
return r;
}
default: return nullptr;
}
}

auto base_set(MemoryType memory_type) const -> const T*
{
switch (memory_type) {
case MemoryType::kDevice: return base_set_on_gpu();
case MemoryType::kHost: return base_set();
case MemoryType::kHost: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (base_set_pinned_) {
cudaHostUnregister(const_cast<T*>(r));
base_set_pinned_ = false;
}
#endif
return r;
}
case MemoryType::kHostPinned: {
auto r = base_set();
#ifndef BUILD_CPU_ONLY
if (!base_set_pinned_) {
cudaHostRegister(
const_cast<T*>(r), base_set_size() * dim() * sizeof(T), cudaHostRegisterDefault);
base_set_pinned_ = true;
}
#endif
return r;
}
case MemoryType::kHostMmap: return mapped_base_set();
default: return nullptr;
}
@@ -315,18 +356,23 @@ class dataset {
mutable T* d_query_set_ = nullptr;
mutable T* mapped_base_set_ = nullptr;
mutable int32_t* gt_set_ = nullptr;

mutable bool base_set_pinned_ = false;
mutable bool query_set_pinned_ = false;
};

template <typename T>
dataset<T>::~dataset()
{
delete[] base_set_;
delete[] query_set_;
delete[] gt_set_;
#ifndef BUILD_CPU_ONLY
if (d_base_set_) { cudaFree(d_base_set_); }
if (d_query_set_) { cudaFree(d_query_set_); }
if (base_set_pinned_) { cudaHostUnregister(base_set_); }
if (query_set_pinned_) { cudaHostUnregister(query_set_); }
#endif
delete[] base_set_;
delete[] query_set_;
delete[] gt_set_;
}

template <typename T>
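For reference, here is a condensed, hypothetical sketch of the lazy pin/unpin pattern the dataset accessors and destructor above follow: the buffer stays a plain host allocation, is registered only when a pinned view is first requested, is unregistered when the plain host view is requested again, and is always unregistered before being freed. The pinned_buffer class name and interface are illustrative only and are not part of this change.

#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
class pinned_buffer {
 public:
  explicit pinned_buffer(std::size_t n) : n_(n), data_(new T[n]) {}

  ~pinned_buffer() {
    if (pinned_) { cudaHostUnregister(data_); }  // unregister before delete[]
    delete[] data_;
  }

  // Plain host view: drop the registration if it is currently active.
  auto host_view() -> T* {
    if (pinned_) {
      cudaHostUnregister(data_);
      pinned_ = false;
    }
    return data_;
  }

  // Pinned view: register on first use, so at most one registration is active.
  auto pinned_view() -> T* {
    if (!pinned_) {
      cudaHostRegister(data_, n_ * sizeof(T), cudaHostRegisterDefault);
      pinned_ = true;
    }
    return data_;
  }

 private:
  std::size_t n_;
  T* data_;
  bool pinned_ = false;
};

int main() {
  pinned_buffer<float> queries(1 << 20);
  float* pinned = queries.pinned_view();  // page-locked for faster H2D copies
  pinned[0] = 1.0f;
  (void)queries.host_view();              // back to ordinary pageable memory
  return 0;
}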
