Merge branch 'branch-23.10' into brute_force_index

rapidsai · Sep 25, 2023 · e3041a5 · e3041a5
2 parents 79e9a83 + cb24d99
commit e3041a5
Show file tree

Hide file tree

Showing 167 changed files with 7,504 additions and 2,342 deletions.
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -62,7 +62,7 @@ jobs:
       arch: "amd64"
       branch: ${{ inputs.branch }}
       build_type: ${{ inputs.build_type || 'branch' }}
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci-conda:latest"
       date: ${{ inputs.date }}
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -62,7 +62,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
   wheel-build-pylibraft:
     needs: checks

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -43,7 +43,7 @@ repos:
                 additional_dependencies: [toml]
                 args: ["--config=pyproject.toml"]
       - repo: https://github.com/pre-commit/mirrors-clang-format
-        rev: v16.0.1
+        rev: v16.0.6
         hooks:
               - id: clang-format
                 types_or: [c, c++, cuda]

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
@@ -47,10 +47,6 @@ sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cma
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pylibraft/__init__.py
 sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/raft_dask/__init__.py
 
-# Python pyproject.toml updates
-sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pylibraft/pyproject.toml
-sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/raft-dask/pyproject.toml
-
 # Wheel testing script
 sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_raft_dask.sh
 
@@ -74,6 +70,7 @@ for FILE in python/*/pyproject.toml; do
   for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE}
   done
+  sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" "${FILE}"
   sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_SHORT_TAG_PEP440}.*\"/g" ${FILE}
 done
 

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
 dependencies:
 - breathe
 - c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
+- clang-tools=16.0.6
+- clang=16.0.6
 - cmake>=3.26.4
 - cuda-profiler-api=11.8.86
 - cuda-python>=11.7.1,<12.0a0

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
 dependencies:
 - breathe
 - c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
+- clang-tools=16.0.6
+- clang=16.0.6
 - cmake>=3.26.4
 - cuda-cudart-dev
 - cuda-profiler-api

diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,8 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.1
-- clang=16.0.1
+- clang-tools=16.0.6
+- clang=16.0.6
 - cmake>=3.26.4
 - cuda-profiler-api=11.8.86
 - cuda-version=11.8

diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml
@@ -78,11 +78,11 @@ requirements:
     - h5py {{ h5py_version }}
     - benchmark
     - matplotlib
-    # rmm is needed to determine if package is gpu-enabled
-    - rmm ={{ minor_version }}
     - python
     - pandas
     - pyyaml
+    # rmm is needed to determine if package is gpu-enabled
+    - rmm ={{ minor_version }}
 
   run:
     - python
@@ -104,6 +104,8 @@ requirements:
     - python
     - pandas
     - pyyaml
+    # rmm is needed to determine if package is gpu-enabled
+    - rmm ={{ minor_version }}
 about:
   home: https://rapids.ai/
   license: Apache-2.0

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
@@ -211,9 +211,10 @@ void bench_search(::benchmark::State& state,
     try {
       algo->set_search_dataset(dataset->base_set(algo_property.dataset_memory_type),
                                dataset->base_set_size());
-    } catch (const std::exception&) {
+    } catch (const std::exception& ex) {
       state.SkipWithError("The algorithm '" + index.name +
-                          "' requires the base set, but it's not available.");
+                          "' requires the base set, but it's not available. " +
+                          "Exception: " + std::string(ex.what()));
       return;
     }
   }

diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu
@@ -19,6 +19,7 @@
 #include <common/benchmark.hpp>
 
 #include <raft/core/device_resources.hpp>
+#include <raft/core/nvtx.hpp>
 #include <raft/random/rng.cuh>
 #include <raft/sparse/detail/utils.h>
 #include <raft/util/cudart_utils.hpp>
@@ -38,6 +39,19 @@
 namespace raft::matrix {
 using namespace raft::bench;  // NOLINT
 
+template <typename KeyT>
+struct replace_with_mask {  // functor: swaps masked elements for a fixed value, sparing line heads
+  KeyT replacement;       ///< value returned in place of a masked element
+  int64_t line_length;    ///< divisor that maps a flat offset to a position within its line
+  int64_t spared_inputs;  ///< positions below this count within a line are never replaced
+  constexpr auto inline operator()(int64_t offset, KeyT x, uint8_t mask) -> KeyT
+  {
+    auto i = offset % line_length;  // position of this element within its line
+    // don't replace all the inputs, spare a few elements at the beginning of the input
+    return (mask && i >= spared_inputs) ? replacement : x;  // otherwise pass x through unchanged
+  }
+};
+
 template <typename KeyT, typename IdxT, select::Algo Algo>
 struct selection : public fixture {
   explicit selection(const select::params& p)
@@ -67,6 +81,21 @@ struct selection : public fixture {
       }
     }
     raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), min_value, max_value);
+    if (p.frac_infinities > 0.0) {
+      rmm::device_uvector<uint8_t> mask_buf(p.batch_size * p.len, stream);
+      auto mask = make_device_vector_view<uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+      raft::random::bernoulli(handle, state, mask, p.frac_infinities);
+      KeyT bound = p.select_min ? raft::upper_bound<KeyT>() : raft::lower_bound<KeyT>();
+      auto mask_in =
+        make_device_vector_view<const uint8_t, size_t>(mask_buf.data(), mask_buf.size());
+      auto dists_in  = make_device_vector_view<const KeyT>(in_dists_.data(), in_dists_.size());
+      auto dists_out = make_device_vector_view<KeyT>(in_dists_.data(), in_dists_.size());
+      raft::linalg::map_offset(handle,
+                               dists_out,
+                               replace_with_mask<KeyT>{bound, int64_t(p.len), int64_t(p.k / 2)},
+                               dists_in,
+                               mask_in);
+    }
   }
 
   void run_benchmark(::benchmark::State& state) override  // NOLINT
@@ -75,8 +104,12 @@ struct selection : public fixture {
       std::ostringstream label_stream;
       label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k;
       if (params_.use_same_leading_bits) { label_stream << "#same-leading-bits"; }
+      if (params_.frac_infinities > 0) { label_stream << "#infs-" << params_.frac_infinities; }
       state.SetLabel(label_stream.str());
-      loop_on_state(state, [this]() {
+      common::nvtx::range case_scope("%s - %s", state.name().c_str(), label_stream.str().c_str());
+      int iter = 0;
+      loop_on_state(state, [&iter, this]() {
+        common::nvtx::range lap_scope("lap-", iter++);
         select::select_k_impl<KeyT, IdxT>(handle,
                                           Algo,
                                           in_dists_.data(),
@@ -149,6 +182,35 @@ const std::vector<select::params> kInputs{
   {10, 1000000, 64, true, false, true},
   {10, 1000000, 128, true, false, true},
   {10, 1000000, 256, true, false, true},
+
+  {10, 1000000, 1, true, false, false, true, 0.1},
+  {10, 1000000, 16, true, false, false, true, 0.1},
+  {10, 1000000, 64, true, false, false, true, 0.1},
+  {10, 1000000, 128, true, false, false, true, 0.1},
+  {10, 1000000, 256, true, false, false, true, 0.1},
+
+  {10, 1000000, 1, true, false, false, true, 0.9},
+  {10, 1000000, 16, true, false, false, true, 0.9},
+  {10, 1000000, 64, true, false, false, true, 0.9},
+  {10, 1000000, 128, true, false, false, true, 0.9},
+  {10, 1000000, 256, true, false, false, true, 0.9},
+  {1000, 10000, 1, true, false, false, true, 0.9},
+  {1000, 10000, 16, true, false, false, true, 0.9},
+  {1000, 10000, 64, true, false, false, true, 0.9},
+  {1000, 10000, 128, true, false, false, true, 0.9},
+  {1000, 10000, 256, true, false, false, true, 0.9},
+
+  {10, 1000000, 1, true, false, false, true, 1.0},
+  {10, 1000000, 16, true, false, false, true, 1.0},
+  {10, 1000000, 64, true, false, false, true, 1.0},
+  {10, 1000000, 128, true, false, false, true, 1.0},
+  {10, 1000000, 256, true, false, false, true, 1.0},
+  {1000, 10000, 1, true, false, false, true, 1.0},
+  {1000, 10000, 16, true, false, false, true, 1.0},
+  {1000, 10000, 64, true, false, false, true, 1.0},
+  {1000, 10000, 128, true, false, false, true, 1.0},
+  {1000, 10000, 256, true, false, false, true, 1.0},
+  {1000, 10000, 256, true, false, false, true, 0.999},
 };
 
 #define SELECTION_REGISTER(KeyT, IdxT, A)                        \
@@ -157,28 +219,28 @@ const std::vector<select::params> kInputs{
   RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \
   }
 
-SELECTION_REGISTER(float, uint32_t, kPublicApi);              // NOLINT
-SELECTION_REGISTER(float, uint32_t, kRadix8bits);             // NOLINT
-SELECTION_REGISTER(float, uint32_t, kRadix11bits);            // NOLINT
-SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass);   // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpAuto);               // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpImmediate);          // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpFiltered);           // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpDistributed);        // NOLINT
-SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm);     // NOLINT
+SELECTION_REGISTER(float, uint32_t, kPublicApi);             // NOLINT
+SELECTION_REGISTER(float, uint32_t, kRadix8bits);            // NOLINT
+SELECTION_REGISTER(float, uint32_t, kRadix11bits);           // NOLINT
+SELECTION_REGISTER(float, uint32_t, kRadix11bitsExtraPass);  // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpAuto);              // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpImmediate);         // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpFiltered);          // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpDistributed);       // NOLINT
+SELECTION_REGISTER(float, uint32_t, kWarpDistributedShm);    // NOLINT
 
 SELECTION_REGISTER(double, uint32_t, kRadix8bits);            // NOLINT
 SELECTION_REGISTER(double, uint32_t, kRadix11bits);           // NOLINT
 SELECTION_REGISTER(double, uint32_t, kRadix11bitsExtraPass);  // NOLINT
 SELECTION_REGISTER(double, uint32_t, kWarpAuto);              // NOLINT
 
-SELECTION_REGISTER(double, int64_t, kRadix8bits);             // NOLINT
-SELECTION_REGISTER(double, int64_t, kRadix11bits);            // NOLINT
-SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass);   // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpImmediate);          // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpFiltered);           // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpDistributed);        // NOLINT
-SELECTION_REGISTER(double, int64_t, kWarpDistributedShm);     // NOLINT
+SELECTION_REGISTER(double, int64_t, kRadix8bits);            // NOLINT
+SELECTION_REGISTER(double, int64_t, kRadix11bits);           // NOLINT
+SELECTION_REGISTER(double, int64_t, kRadix11bitsExtraPass);  // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpImmediate);         // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpFiltered);          // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpDistributed);       // NOLINT
+SELECTION_REGISTER(double, int64_t, kWarpDistributedShm);    // NOLINT
 
 // For learning a heuristic of which selection algorithm to use, we
 // have a couple of additional constraints when generating the dataset:

diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -438,7 +438,7 @@ __global__ void __launch_bounds__((WarpSize * BlockDimY))
   adjust_centers_kernel(MathT* centers,  // [n_clusters, dim]
                         IdxT n_clusters,
                         IdxT dim,
-                        const T* dataset,               // [n_rows, dim]
+                        const T* dataset,  // [n_rows, dim]
                         IdxT n_rows,
                         const LabelT* labels,           // [n_rows]
                         const CounterT* cluster_sizes,  // [n_clusters]

diff --git a/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp b/cpp/include/raft/core/detail/mdspan_numpy_serializer.hpp
@@ -75,7 +75,7 @@ namespace numpy_serializer {
 
 #if RAFT_SYSTEM_LITTLE_ENDIAN == 1
 #define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_LITTLE_ENDIAN_CHAR
-#else   // RAFT_SYSTEM_LITTLE_ENDIAN == 1
+#else  // RAFT_SYSTEM_LITTLE_ENDIAN == 1
 #define RAFT_NUMPY_HOST_ENDIAN_CHAR RAFT_NUMPY_BIG_ENDIAN_CHAR
 #endif  // RAFT_SYSTEM_LITTLE_ENDIAN == 1
 

diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp
@@ -193,7 +193,7 @@ inline void pop_range()
 
 }  // namespace raft::common::nvtx::detail
 
-#else   // NVTX_ENABLED
+#else  // NVTX_ENABLED
 
 namespace raft::common::nvtx::detail {
 

diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp
@@ -32,8 +32,8 @@ struct KeyValuePair {
   typedef _Key Key;      ///< Key data type
   typedef _Value Value;  ///< Value data type
 
-  Key key;               ///< Item key
-  Value value;           ///< Item value
+  Key key;      ///< Item key
+  Value value;  ///< Item value
 
   /// Constructor
   RAFT_INLINE_FUNCTION KeyValuePair() {}

diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp
@@ -42,7 +42,7 @@ enum resource_type {
   THRUST_POLICY,           // thrust execution policy
   WORKSPACE_RESOURCE,      // rmm device memory resource
 
-  LAST_KEY                 // reserved for the last key
+  LAST_KEY  // reserved for the last key
 };
 
 /**

diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h b/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
@@ -397,7 +397,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
     TensorTileIterator
       tensor_iterator,  ///< Threadblock tile iterator for additional tensor operand
     MatrixCoord const&
-      problem_size =    ///< Problem size needed to guard against out-of-bounds accesses
+      problem_size =  ///< Problem size needed to guard against out-of-bounds accesses
     MatrixCoord(Shape::kM, Shape::kN),
     MatrixCoord const&
       threadblock_offset =  ///< Threadblock's initial offset within the problem size space
@@ -418,7 +418,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
       broadcast_fragment,  ///< Fragment containing the accumulated partial reduction over columns
     ElementVector const* broadcast_ptr,  ///< Broadcast vector
     MatrixCoord const&
-      problem_size,       ///< Problem size needed to guard against out-of-bounds accesses
+      problem_size,  ///< Problem size needed to guard against out-of-bounds accesses
     MatrixCoord const&
       threadblock_offset  ///< Threadblock's initial offset within the problem size space
   )

diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h b/cpp/include/raft/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h
@@ -88,8 +88,8 @@ namespace threadblock {
 ///
 /// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
 ///
-template <typename ThreadMap_,        ///< Thread map (conept: OutputTileThreadMap)
-          typename Element_,          ///< Element data type
+template <typename ThreadMap_,  ///< Thread map (concept: OutputTileThreadMap)
+          typename Element_,    ///< Element data type
           typename Layout_,
           bool ScatterD     = false,  ///< Scatter D operand or not
           bool UseCUDAStore = false>

diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include <cstddef>                                          // size_t
-#include <limits>                                           // std::numeric_limits
-#include <raft/core/kvp.hpp>                                // raft::KeyValuePair
-#include <raft/core/operators.hpp>                          // raft::identity_op
-#include <raft/distance/detail/distance_ops/l2_exp.cuh>     // ops::l2_exp_distance_op
+#include <cstddef>                                       // size_t
+#include <limits>                                        // std::numeric_limits
+#include <raft/core/kvp.hpp>                             // raft::KeyValuePair
+#include <raft/core/operators.hpp>                       // raft::identity_op
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>  // ops::l2_exp_distance_op
 #include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
 #include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
 #include <raft/linalg/contractions.cuh>                     // Policy

diff --git a/cpp/include/raft/distance/detail/masked_distance_base.cuh b/cpp/include/raft/distance/detail/masked_distance_base.cuh
@@ -217,7 +217,7 @@ struct MaskedDistances : public BaseClass {
         }  // tile_idx_n
       }    // idx_g
       rowEpilog_op(tile_idx_m);
-    }      // tile_idx_m
+    }  // tile_idx_m
   }
 
  private:

diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -18,7 +18,7 @@
 #include <raft/util/cuda_dev_essentials.cuh>  // ceildiv
 #include <raft/util/cuda_rt_essentials.hpp>   // RAFT_CUDA_TRY
 
-#include <cstddef>                            // size_t
+#include <cstddef>  // size_t
 
 namespace raft {
 namespace distance {

diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -45,7 +45,7 @@ void pairwise_matrix_dispatch(OpT distance_op,
                               cudaStream_t stream,
                               bool is_row_major) RAFT_EXPLICIT;
 
-};      // namespace raft::distance::detail
+};  // namespace raft::distance::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 

diff --git a/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h b/cpp/include/raft/distance/detail/predicated_tile_iterator_normvec.h
@@ -57,8 +57,8 @@ namespace threadblock {
 ///
 /// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
 ///
-template <typename ThreadMap_,        ///< Thread map (conept: OutputTileThreadMap)
-          typename Element_,          ///< Element data type
+template <typename ThreadMap_,  ///< Thread map (concept: OutputTileThreadMap)
+          typename Element_,    ///< Element data type
           typename Layout_,
           bool ScatterD     = false,  ///< Scatter D operand or not
           bool UseCUDAStore = false>