Merge branch 'branch-23.10' into remove-block_size-from-CAGRA-2
enp1s0 authored Sep 20, 2023
2 parents f8253b8 + 6bbcf1f commit a383916
Showing 107 changed files with 586 additions and 489 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -43,7 +43,7 @@ repos:
additional_dependencies: [toml]
args: ["--config=pyproject.toml"]
- repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v16.0.1
+    rev: v16.0.6
hooks:
- id: clang-format
types_or: [c, c++, cuda]
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
dependencies:
- breathe
- c-compiler
- - clang-tools=16.0.1
- - clang=16.0.1
+ - clang-tools=16.0.6
+ - clang=16.0.6
- cmake>=3.26.4
- cuda-profiler-api=11.8.86
- cuda-python>=11.7.1,<12.0a0
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
dependencies:
- breathe
- c-compiler
- - clang-tools=16.0.1
- - clang=16.0.1
+ - clang-tools=16.0.6
+ - clang=16.0.6
- cmake>=3.26.4
- cuda-cudart-dev
- cuda-profiler-api
4 changes: 2 additions & 2 deletions conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
dependencies:
- benchmark>=1.8.2
- c-compiler
- - clang-tools=16.0.1
- - clang=16.0.1
+ - clang-tools=16.0.6
+ - clang=16.0.6
- cmake>=3.26.4
- cuda-profiler-api=11.8.86
- cuda-version=11.8
2 changes: 2 additions & 0 deletions conda/recipes/raft-ann-bench/meta.yaml
@@ -78,6 +78,8 @@ requirements:
- h5py {{ h5py_version }}
- benchmark
- matplotlib
+ # rmm is needed to determine if package is gpu-enabled
+ - rmm ={{ minor_version }}
- python
- pandas
- pyyaml
5 changes: 3 additions & 2 deletions cpp/bench/ann/src/common/benchmark.hpp
@@ -211,9 +211,10 @@ void bench_search(::benchmark::State& state,
try {
algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type),
dataset->base_set_size());
- } catch (const std::exception&) {
+ } catch (const std::exception& ex) {
state.SkipWithError("The algorithm '" + index.name +
"' requires the base set, but it's not available.");
"' requires the base set, but it's not available. " +
"Exception: " + std::string(ex.what()));
return;
}
}
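For illustration, the pattern introduced above — catching by const reference and appending ex.what() to the skip message — can be reproduced standalone. This is a minimal hypothetical sketch, not the benchmark's actual surrounding code:

#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for an algorithm step that cannot serve the request.
void set_search_dataset_or_throw() { throw std::runtime_error("base set not loaded"); }

int main()
{
  std::string index_name = "example-index";
  try {
    set_search_dataset_or_throw();
  } catch (const std::exception& ex) {
    // Binding the exception gives access to ex.what(), turning a generic
    // "not available" message into one that names the actual failure.
    std::cerr << "The algorithm '" + index_name +
                   "' requires the base set, but it's not available. " +
                   "Exception: " + std::string(ex.what())
              << std::endl;
  }
  return 0;
}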
3 changes: 1 addition & 2 deletions cpp/bench/ann/src/ggnn/ggnn_benchmark.cu
@@ -33,8 +33,7 @@ template <typename T>
void parse_build_param(const nlohmann::json& conf,
typename raft::bench::ann::Ggnn<T>::BuildParam& param)
{
- param.dataset_size = conf.at("dataset_size");
- param.k            = conf.at("k");
+ param.k = conf.at("k");

if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); }
if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); }
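parse_build_param reads the required "k" with at(), which throws if the key is missing, and guards optional keys behind contains(). A tiny standalone illustration of that nlohmann::json idiom (the defaults here are illustrative, not the wrapper's):

#include <cstdio>
#include <nlohmann/json.hpp>

struct BuildParam {
  int k{0};
  int k_build{24};       // illustrative default
  int segment_size{32};  // illustrative default
};

int main()
{
  // "k" is required (at() throws json::out_of_range if absent);
  // the others are optional and keep their defaults when missing.
  auto conf = nlohmann::json::parse(R"({"k": 10, "k_build": 64})");

  BuildParam param;
  param.k = conf.at("k");
  if (conf.contains("k_build")) { param.k_build = conf.at("k_build"); }
  if (conf.contains("segment_size")) { param.segment_size = conf.at("segment_size"); }

  std::printf("k=%d k_build=%d segment_size=%d\n", param.k, param.k_build, param.segment_size);
  return 0;
}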
22 changes: 4 additions & 18 deletions cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
@@ -38,8 +38,6 @@ class Ggnn : public ANN<T> {
int num_layers{4}; // L
float tau{0.5};
int refine_iterations{2};
-
- size_t dataset_size;
int k; // GGNN requires to know k during building
};

@@ -182,24 +180,17 @@ GgnnImpl<T, measure, D, KBuild, KQuery, S>::GgnnImpl(Metric metric,
}

if (dim != D) { throw std::runtime_error("mis-matched dim"); }
-
- int device;
- RAFT_CUDA_TRY(cudaGetDevice(&device));
-
- ggnn_ = std::make_unique<GGNNGPUInstance>(
-   device, build_param_.dataset_size, build_param_.num_layers, true, build_param_.tau);
}

template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
size_t nrow,
cudaStream_t stream)
{
- if (nrow != build_param_.dataset_size) {
-   throw std::runtime_error(
-     "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
-     " , but nrow = " + std::to_string(nrow));
- }
+ int device;
+ RAFT_CUDA_TRY(cudaGetDevice(&device));
+ ggnn_ = std::make_unique<GGNNGPUInstance>(
+   device, nrow, build_param_.num_layers, true, build_param_.tau);

ggnn_->set_base_data(dataset);
ggnn_->set_stream(stream);
@@ -212,11 +203,6 @@ void GgnnImpl<T, measure, D, KBuild, KQuery, S>::build(const T* dataset,
template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
void GgnnImpl<T, measure, D, KBuild, KQuery, S>::set_search_dataset(const T* dataset, size_t nrow)
{
- if (nrow != build_param_.dataset_size) {
-   throw std::runtime_error(
-     "build_param_.dataset_size = " + std::to_string(build_param_.dataset_size) +
-     " , but nrow = " + std::to_string(nrow));
- }
ggnn_->set_base_data(dataset);
}

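The net effect of the hunks above: the GGNN GPU instance is no longer sized from a user-supplied dataset_size at wrapper construction, but lazily from the actual row count passed to build(), which also removes both nrow-mismatch checks. A minimal sketch of that deferred-construction pattern, with a hypothetical GpuIndex standing in for GGNNGPUInstance:

#include <cstddef>
#include <memory>
#include <stdexcept>

// Hypothetical stand-in for GGNNGPUInstance: its size is fixed at construction.
struct GpuIndex {
  explicit GpuIndex(std::size_t n_rows) : n_rows(n_rows) {}
  std::size_t n_rows;
};

class GgnnLike {
 public:
  // Before the change: the constructor needed dataset_size up front and
  // build()/set_search_dataset() re-validated nrow against it.
  // After: construction waits until the real row count is known.
  void build(const float* dataset, std::size_t nrow)
  {
    index_ = std::make_unique<GpuIndex>(nrow);  // sized from the data itself
    (void)dataset;                              // set base data, build graph, ...
  }

  void set_search_dataset(const float*, std::size_t)
  {
    // No size check needed anymore: the index already matches the data.
    if (!index_) { throw std::runtime_error("build() must run first"); }
  }

 private:
  std::unique_ptr<GpuIndex> index_;
};

int main()
{
  GgnnLike ann;
  ann.build(nullptr, 1000);  // no separate dataset_size parameter required
  ann.set_search_dataset(nullptr, 1000);
  return 0;
}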
6 changes: 4 additions & 2 deletions cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -31,6 +31,8 @@
#include <utility>
#include <vector>

+ #include <omp.h>
+
#include "../common/ann_types.hpp"
#include <hnswlib.h>

@@ -164,13 +166,13 @@ class HnswLib : public ANN<T> {
struct BuildParam {
int M;
int ef_construction;
- int num_threads{1};
+ int num_threads = omp_get_num_procs();
};

using typename ANN<T>::AnnSearchParam;
struct SearchParam : public AnnSearchParam {
int ef;
- int num_threads{1};
+ int num_threads = omp_get_num_procs();
};

HnswLib(Metric metric, int dim, const BuildParam& param);
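The old default of one thread made unconfigured hnswlib runs needlessly slow; omp_get_num_procs() (from <omp.h>, included above) returns the number of available processors at the moment the param struct is constructed. A small standalone sketch of the idiom (compile with -fopenmp or equivalent; the other field values are illustrative):

#include <cstdio>
#include <omp.h>

struct BuildParam {
  int M{16};                              // illustrative values, not the wrapper's defaults
  int ef_construction{200};
  int num_threads = omp_get_num_procs();  // evaluated whenever a BuildParam is constructed
};

int main()
{
  BuildParam p;  // p.num_threads now defaults to every available core
  std::printf("building with %d threads\n", p.num_threads);
  return 0;
}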
96 changes: 79 additions & 17 deletions cpp/bench/prims/matrix/select_k.cu
@@ -19,6 +19,7 @@
#include <common/benchmark.hpp>

#include <raft/core/device_resources.hpp>
+ #include <raft/core/nvtx.hpp>
#include <raft/random/rng.cuh>
#include <raft/sparse/detail/utils.h>
#include <raft/util/cudart_utils.hpp>
@@ -38,6 +39,19 @@
namespace raft::matrix {
using namespace raft::bench; // NOLINT

+ template <typename KeyT>
+ struct replace_with_mask {
+   KeyT replacement;
+   int64_t line_length;
+   int64_t spared_inputs;
+   constexpr auto inline operator()(int64_t offset, KeyT x, uint8_t mask) -> KeyT
+   {
+     auto i = offset % line_length;
+     // don't replace all the inputs, spare a few elements at the beginning of the input
+     return (mask && i >= spared_inputs) ? replacement : x;
+   }
+ };
+
template <typename KeyT, typename IdxT, select::Algo Algo>
struct selection : public fixture {
explicit selection(const select::params& p)
@@ -67,6 +81,21 @@ struct selection : public fixture {
}
}
raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), min_value, max_value);
+ if (p.frac_infinities > 0.0) {
+   rmm::device_uvector<uint8_t> mask_buf(p.batch_size * p.len, stream);
+   auto mask = make_device_vector_view<uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+   raft::random::bernoulli(handle, state, mask, p.frac_infinities);
+   KeyT bound = p.select_min ? raft::upper_bound<KeyT>() : raft::lower_bound<KeyT>();
+   auto mask_in =
+     make_device_vector_view<const uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+   auto dists_in = make_device_vector_view<const KeyT>(in_dists_.data(), in_dists_.size());
+   auto dists_out = make_device_vector_view<KeyT>(in_dists_.data(), in_dists_.size());
+   raft::linalg::map_offset(handle,
+                            dists_out,
+                            replace_with_mask<KeyT>{bound, int64_t(p.len), int64_t(p.k / 2)},
+                            dists_in,
+                            mask_in);
+ }
}

void run_benchmark(::benchmark::State& state) override // NOLINT
@@ -75,8 +104,12 @@
std::ostringstream label_stream;
label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k;
if (params_.use_same_leading_bits) { label_stream << "#same-leading-bits"; }
+ if (params_.frac_infinities > 0) { label_stream << "#infs-" << params_.frac_infinities; }
state.SetLabel(label_stream.str());
- loop_on_state(state, [this]() {
+ common::nvtx::range case_scope("%s - %s", state.name().c_str(), label_stream.str().c_str());
+ int iter = 0;
+ loop_on_state(state, [&iter, this]() {
+   common::nvtx::range lap_scope("lap-%d", iter++);
select::select_k_impl<KeyT, IdxT>(handle,
Algo,
in_dists_.data(),
@@ -149,6 +182,35 @@ const std::vector<select::params> kInputs{
{10, 1000000, 64, true, false, true},
{10, 1000000, 128, true, false, true},
{10, 1000000, 256, true, false, true},
+
+ {10, 1000000, 1, true, false, false, true, 0.1},
+ {10, 1000000, 16, true, false, false, true, 0.1},
+ {10, 1000000, 64, true, false, false, true, 0.1},
+ {10, 1000000, 128, true, false, false, true, 0.1},
+ {10, 1000000, 256, true, false, false, true, 0.1},
+
+ {10, 1000000, 1, true, false, false, true, 0.9},
+ {10, 1000000, 16, true, false, false, true, 0.9},
+ {10, 1000000, 64, true, false, false, true, 0.9},
+ {10, 1000000, 128, true, false, false, true, 0.9},
+ {10, 1000000, 256, true, false, false, true, 0.9},
+ {1000, 10000, 1, true, false, false, true, 0.9},
+ {1000, 10000, 16, true, false, false, true, 0.9},
+ {1000, 10000, 64, true, false, false, true, 0.9},
+ {1000, 10000, 128, true, false, false, true, 0.9},
+ {1000, 10000, 256, true, false, false, true, 0.9},
+
+ {10, 1000000, 1, true, false, false, true, 1.0},
+ {10, 1000000, 16, true, false, false, true, 1.0},
+ {10, 1000000, 64, true, false, false, true, 1.0},
+ {10, 1000000, 128, true, false, false, true, 1.0},
+ {10, 1000000, 256, true, false, false, true, 1.0},
+ {1000, 10000, 1, true, false, false, true, 1.0},
+ {1000, 10000, 16, true, false, false, true, 1.0},
+ {1000, 10000, 64, true, false, false, true, 1.0},
+ {1000, 10000, 128, true, false, false, true, 1.0},
+ {1000, 10000, 256, true, false, false, true, 1.0},
+ {1000, 10000, 256, true, false, false, true, 0.999},
};

#define SELECTION_REGISTER(KeyT, IdxT, A) \
@@ -157,28 +219,28 @@
RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \
}

- SELECTION_REGISTER(float, uint32_t, kPublicApi); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kRadix8bits); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kRadix11bits); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kWarpAuto); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kWarpImmediate); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kWarpFiltered); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kWarpDistributed); // NOLINT
- SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm); // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kPublicApi);            // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kRadix8bits);           // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kRadix11bits);          // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass); // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kWarpAuto);             // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kWarpImmediate);        // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kWarpFiltered);         // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kWarpDistributed);      // NOLINT
+ SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm);   // NOLINT

SELECTION_REGISTER(double, uint32_t, kRadix8bits); // NOLINT
SELECTION_REGISTER(double, uint32_t, kRadix11bits); // NOLINT
SELECTION_REGISTER(double, uint32_t, kRadix11bitsExtraPass); // NOLINT
SELECTION_REGISTER(double, uint32_t, kWarpAuto); // NOLINT

- SELECTION_REGISTER(double, int64_t, kRadix8bits); // NOLINT
- SELECTION_REGISTER(double, int64_t, kRadix11bits); // NOLINT
- SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT
- SELECTION_REGISTER(double, int64_t, kWarpImmediate); // NOLINT
- SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT
- SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT
- SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT
+ SELECTION_REGISTER(double, int64_t, kRadix8bits);           // NOLINT
+ SELECTION_REGISTER(double, int64_t, kRadix11bits);          // NOLINT
+ SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass); // NOLINT
+ SELECTION_REGISTER(double, int64_t, kWarpImmediate);        // NOLINT
+ SELECTION_REGISTER(double, int64_t, kWarpFiltered);         // NOLINT
+ SELECTION_REGISTER(double, int64_t, kWarpDistributed);      // NOLINT
+ SELECTION_REGISTER(double, int64_t, kWarpDistributedShm);   // NOLINT

// For learning a heuristic of which selection algorithm to use, we
// have a couple of additional constraints when generating the dataset:
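The new frac_infinities inputs measure how select_k behaves when a random fraction of the distances is saturated to the type's bound, while replace_with_mask spares the first k/2 slots of each row so every selection still has finite candidates. A host-only sketch of the same logic in plain C++, with std::bernoulli_distribution and a loop standing in for raft::random::bernoulli and raft::linalg::map_offset (sketch only, not the benchmark's device code):

#include <cstdint>
#include <limits>
#include <random>
#include <vector>

// Same predicate as the benchmark's replace_with_mask functor.
template <typename KeyT>
KeyT replace_with_mask(std::int64_t offset, KeyT x, std::uint8_t mask,
                       KeyT replacement, std::int64_t line_length,
                       std::int64_t spared_inputs)
{
  auto i = offset % line_length;  // position within the row
  // don't replace all the inputs, spare a few elements at the beginning
  return (mask && i >= spared_inputs) ? replacement : x;
}

int main()
{
  const std::int64_t batch_size = 2, len = 8, k = 4;
  const double frac_infinities  = 0.9;
  const float bound = std::numeric_limits<float>::infinity();  // select_min case

  std::vector<float> dists(batch_size * len, 1.0f);
  std::mt19937 rng(42);
  std::bernoulli_distribution coin(frac_infinities);

  for (std::int64_t off = 0; off < batch_size * len; ++off) {
    dists[off] = replace_with_mask<float>(off, dists[off], coin(rng) ? 1 : 0,
                                          bound, len, k / 2);
  }
  // dists now mimics one generated benchmark input: mostly +inf,
  // but the first k/2 = 2 entries of each row stay finite.
  return 0;
}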
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -438,7 +438,7 @@ __global__ void __launch_bounds__((WarpSize * BlockDimY))
adjust_centers_kernel(MathT* centers, // [n_clusters, dim]
IdxT n_clusters,
IdxT dim,
- const T* dataset, // [n_rows, dim]
+ const T* dataset,             // [n_rows, dim]
IdxT n_rows,
const LabelT* labels, // [n_rows]
const CounterT* cluster_sizes, // [n_clusters]
2 changes: 1 addition & 1 deletion cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
@@ -75,7 +75,7 @@ namespace numpy_serializer {

#if RAFT_SYSTEM_LITTLE_ENDIAN == 1
#define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_LITTLE_ENDIAN_CHAR
- #else // RAFT_SYSTEM_LITTLE_ENDIAN == 1
+ #else   // RAFT_SYSTEM_LITTLE_ENDIAN == 1
#define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_BIG_ENDIAN_CHAR
#endif // RAFT_SYSTEM_LITTLE_ENDIAN == 1

2 changes: 1 addition & 1 deletion cpp/include/raft/core/detail/nvtx.hpp
@@ -193,7 +193,7 @@ inline void pop_range()

} // namespace raft::common::nvtx::detail

- #else // NVTX_ENABLED
+ #else   // NVTX_ENABLED

namespace raft::common::nvtx::detail {

4 changes: 2 additions & 2 deletions cpp/include/raft/core/kvp.hpp
@@ -32,8 +32,8 @@ struct KeyValuePair {
typedef _Key Key; ///< Key data type
typedef _Value Value; ///< Value data type

- Key key; ///< Item key
- Value value; ///< Item value
+ Key key;      ///< Item key
+ Value value;  ///< Item value

/// Constructor
RAFT_INLINE_FUNCTION KeyValuePair() {}
2 changes: 1 addition & 1 deletion cpp/include/raft/core/resource/resource_types.hpp
@@ -42,7 +42,7 @@ enum resource_type {
THRUST_POLICY, // thrust execution policy
WORKSPACE_RESOURCE, // rmm device memory resource

- LAST_KEY // reserved for the last key
+ LAST_KEY  // reserved for the last key
};

/**
@@ -397,7 +397,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
TensorTileIterator
tensor_iterator, ///< Threadblock tile iterator for additional tensor operand
MatrixCoord const&
- problem_size = ///< Problem size needed to guard against out-of-bounds accesses
+ problem_size =  ///< Problem size needed to guard against out-of-bounds accesses
MatrixCoord(Shape::kM, Shape::kN),
MatrixCoord const&
threadblock_offset = ///< Threadblock's initial offset within the problem size space
@@ -418,7 +418,7 @@
broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns
ElementVector const* broadcast_ptr, ///< Broadcast vector
MatrixCoord const&
- problem_size, ///< Problem size needed to guard against out-of-bounds accesses
+ problem_size,  ///< Problem size needed to guard against out-of-bounds accesses
MatrixCoord const&
threadblock_offset ///< Threadblock's initial offset within the problem size space
)
@@ -88,8 +88,8 @@ namespace threadblock {
///
/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
///
- template <typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
- typename Element_, ///< Element data type
+ template <typename ThreadMap_,  ///< Thread map (conept: OutputTileThreadMap)
+           typename Element_,    ///< Element data type
typename Layout_,
bool ScatterD = false, ///< Scatter D operand or not
bool UseCUDAStore = false>
10 changes: 5 additions & 5 deletions cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -16,11 +16,11 @@

#pragma once

- #include <cstddef> // size_t
- #include <limits> // std::numeric_limits
- #include <raft/core/kvp.hpp> // raft::KeyValuePair
- #include <raft/core/operators.hpp> // raft::identity_op
- #include <raft/distance/detail/distance_ops/l2_exp.cuh> // ops::l2_exp_distance_op
+ #include <cstddef>                  // size_t
+ #include <limits>                   // std::numeric_limits
+ #include <raft/core/kvp.hpp>        // raft::KeyValuePair
+ #include <raft/core/operators.hpp>  // raft::identity_op
+ #include <raft/distance/detail/distance_ops/l2_exp.cuh>  // ops::l2_exp_distance_op
#include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
#include <raft/distance/detail/pairwise_distance_base.cuh> // PairwiseDistances
#include <raft/linalg/contractions.cuh> // Policy
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/masked_distance_base.cuh
@@ -217,7 +217,7 @@ struct MaskedDistances : public BaseClass {
} // tile_idx_n
} // idx_g
rowEpilog_op(tile_idx_m);
- } // tile_idx_m
+ }    // tile_idx_m
}

private: