Merge pull request #2232 from rapidsai/branch-24.04

Forward-merge branch-24.04 into branch-24.06 [skip ci]
rapidsai · Mar 19, 2024 · 1bcb564 · 1bcb564
2 parents bc2513e + 0b9692b
commit 1bcb564
Show file tree

Hide file tree

Showing 41 changed files with 3,067 additions and 179 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -256,7 +256,7 @@ endif()
 
 if(RAFT_NVTX)
   # This enables NVTX within the project with no option to disable it downstream.
-  target_link_libraries(raft INTERFACE CUDA::nvToolsExt)
+  target_link_libraries(raft INTERFACE CUDA::nvtx3)
   target_compile_definitions(raft INTERFACE NVTX_ENABLED)
 else()
   # Allow enable NVTX downstream if not set here. This creates a new option at build/install time,
@@ -324,6 +324,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
     src/distance/distance.cu
     src/distance/fused_l2_nn.cu
+    src/distance/fused_distance_nn.cu
     src/linalg/detail/coalesced_reduction.cu
     src/matrix/detail/select_k_double_int64_t.cu
     src/matrix/detail/select_k_double_uint32_t.cu
@@ -422,6 +423,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/raft_runtime/cluster/update_centroids.cuh
     src/raft_runtime/cluster/update_centroids_double.cu
     src/raft_runtime/cluster/update_centroids_float.cu
+    src/raft_runtime/distance/fused_distance_min_arg.cu
     src/raft_runtime/distance/fused_l2_min_arg.cu
     src/raft_runtime/distance/pairwise_distance.cu
     src/raft_runtime/matrix/select_k_float_int64_t.cu

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
@@ -122,21 +122,13 @@ if(BUILD_PRIMS_BENCH)
   )
 
   ConfigureBench(
-    NAME
-    MATRIX_BENCH
-    PATH
-    bench/prims/matrix/argmin.cu
-    bench/prims/matrix/gather.cu
-    bench/prims/matrix/select_k.cu
-    bench/prims/matrix/main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
+    NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu
+    bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureBench(
     NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
-    bench/prims/random/rng.cu bench/prims/main.cpp
+   bench/prims/random/rng.cu bench/prims/random/subsample.cu bench/prims/main.cpp
   )
 
   ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp)

diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu
@@ -16,34 +16,48 @@
 
 #include <common/benchmark.hpp>
 
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/matrix/gather.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/itertools.hpp>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 
 namespace raft::bench::matrix {
 
 template <typename IdxT>
 struct GatherParams {
   IdxT rows, cols, map_length;
+  bool host;
 };
 
 template <typename IdxT>
 inline auto operator<<(std::ostream& os, const GatherParams<IdxT>& p) -> std::ostream&
 {
-  os << p.rows << "#" << p.cols << "#" << p.map_length;
+  os << p.rows << "#" << p.cols << "#" << p.map_length << (p.host ? "#host" : "#device");
   return os;
 }
 
 template <typename T, typename MapT, typename IdxT, bool Conditional = false>
 struct Gather : public fixture {
   Gather(const GatherParams<IdxT>& p)
-    : params(p), matrix(this->handle), map(this->handle), out(this->handle), stencil(this->handle)
+    : params(p),
+      old_mr(rmm::mr::get_current_device_resource()),
+      pool_mr(rmm::mr::get_current_device_resource(), 2 * (1ULL << 30)),
+      matrix(this->handle),
+      map(this->handle),
+      out(this->handle),
+      stencil(this->handle),
+      matrix_h(this->handle)
   {
+    rmm::mr::set_current_device_resource(&pool_mr);
   }
 
+  ~Gather() { rmm::mr::set_current_device_resource(old_mr); }
+
   void allocate_data(const ::benchmark::State& state) override
   {
     matrix  = raft::make_device_matrix<T, IdxT>(handle, params.rows, params.cols);
@@ -59,6 +73,11 @@ struct Gather : public fixture {
     if constexpr (Conditional) {
       raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1));
     }
+
+    if (params.host) {
+      matrix_h = raft::make_host_matrix<T, IdxT>(handle, params.rows, params.cols);
+      raft::copy(matrix_h.data_handle(), matrix.data_handle(), matrix.size(), stream);
+    }
     resource::sync_stream(handle, stream);
   }
 
@@ -77,14 +96,22 @@ struct Gather : public fixture {
         raft::matrix::gather_if(
           handle, matrix_const_view, out.view(), map_const_view, stencil_const_view, pred_op);
       } else {
-        raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view());
+        if (params.host) {
+          raft::matrix::detail::gather(
+            handle, make_const_mdspan(matrix_h.view()), map_const_view, out.view());
+        } else {
+          raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view());
+        }
       }
     });
   }
 
  private:
   GatherParams<IdxT> params;
+  rmm::mr::device_memory_resource* old_mr;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr;
   raft::device_matrix<T, IdxT> matrix, out;
+  raft::host_matrix<T, IdxT> matrix_h;
   raft::device_vector<T, IdxT> stencil;
   raft::device_vector<MapT, IdxT> map;
 };  // struct Gather
@@ -100,4 +127,9 @@ RAFT_BENCH_REGISTER((Gather<float, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((Gather<double, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((GatherIf<float, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((GatherIf<double, uint32_t, int64_t>), "", gather_inputs_i64);
+
+auto inputs_host = raft::util::itertools::product<GatherParams<int64_t>>(
+  {10000000}, {100}, {1000, 1000000, 10000000}, {true});
+RAFT_BENCH_REGISTER((Gather<float, uint32_t, int64_t>), "Host", inputs_host);
+
 }  // namespace raft::bench::matrix
diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/random/permute.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/random/sample_without_replacement.cuh>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <cub/cub.cuh>
+
+namespace raft::bench::random {
+
+struct sample_inputs {
+  int n_samples;
+  int n_train;
+  int method;
+};  // struct sample_inputs
+
+inline auto operator<<(std::ostream& os, const sample_inputs& p) -> std::ostream&
+{
+  os << p.n_samples << "#" << p.n_train << "#" << p.method;
+  return os;
+}
+
+// Sample with replacement. We use this as a baseline.
+template <typename IdxT>
+auto bernoulli_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
+  -> raft::device_vector<IdxT, IdxT>
+{
+  RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
+
+  auto indices = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
+  raft::random::RngState state(123456ULL);
+  raft::random::uniformInt(
+    res, state, indices.data_handle(), n_subsamples, IdxT(0), IdxT(n_samples));
+  return indices;
+}
+
+template <typename T>
+struct sample : public fixture {
+  sample(const sample_inputs& p)
+    : params(p),
+      old_mr(rmm::mr::get_current_device_resource()),
+      pool_mr(rmm::mr::get_current_device_resource(), 2 * GiB),
+      in(make_device_vector<T, int64_t>(res, p.n_samples)),
+      out(make_device_vector<T, int64_t>(res, p.n_train))
+  {
+    rmm::mr::set_current_device_resource(&pool_mr);
+    raft::random::RngState r(123456ULL);
+  }
+
+  ~sample() { rmm::mr::set_current_device_resource(old_mr); }
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    raft::random::RngState r(123456ULL);
+    loop_on_state(state, [this, &r]() {
+      if (params.method == 1) {
+        this->out =
+          bernoulli_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
+      } else if (params.method == 2) {
+        this->out = raft::random::excess_subsample<T, int64_t>(
+          this->res, r, this->params.n_samples, this->params.n_train);
+      }
+    });
+  }
+
+ private:
+  float GiB = 1073741824.0f;
+  raft::device_resources res;
+  rmm::mr::device_memory_resource* old_mr;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr;
+  sample_inputs params;
+  raft::device_vector<T, int64_t> out, in;
+};  // struct sample
+
+const std::vector<sample_inputs> input_vecs = {{100000000, 10000000, 1},
+                                               {100000000, 50000000, 1},
+                                               {100000000, 100000000, 1},
+                                               {100000000, 10000000, 2},
+                                               {100000000, 50000000, 2},
+                                               {100000000, 100000000, 2}};
+
+RAFT_BENCH_REGISTER(sample<int64_t>, "", input_vecs);
+
+}  // namespace raft::bench::random
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn.cuh b/cpp/include/raft/distance/detail/fused_distance_nn.cuh
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/kvp.hpp>                             // raft::KeyValuePair
+#include <raft/core/operators.hpp>                       // raft::identity_op
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>  // ops::l2_exp_distance_op
+#include <raft/distance/detail/fused_distance_nn/cutlass_base.cuh>
+#include <raft/distance/detail/fused_distance_nn/fused_cosine_nn.cuh>
+#include <raft/distance/detail/fused_distance_nn/fused_l2_nn.cuh>
+#include <raft/distance/detail/fused_distance_nn/helper_structs.cuh>
+#include <raft/distance/detail/fused_distance_nn/simt_kernel.cuh>
+#include <raft/distance/detail/pairwise_distance_base.cuh>  // PairwiseDistances
+#include <raft/distance/distance_types.hpp>
+#include <raft/linalg/contractions.cuh>  // Policy
+#include <raft/util/arch.cuh>            // raft::util::arch::SM_*
+#include <raft/util/cuda_utils.cuh>      // raft::ceildiv, raft::shfl
+
+#include <cstddef>  // size_t
+#include <limits>   // std::numeric_limits
+
+namespace raft {
+namespace distance {
+
+namespace detail {
+
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename ReduceOpT,
+          typename KVPReduceOpT>
+void fusedDistanceNNImpl(OutT* min,
+                         const DataT* x,
+                         const DataT* y,
+                         const DataT* xn,
+                         const DataT* yn,
+                         IdxT m,
+                         IdxT n,
+                         IdxT k,
+                         int* workspace,
+                         ReduceOpT redOp,
+                         KVPReduceOpT pairRedOp,
+                         bool sqrt,
+                         bool initOutBuffer,
+                         bool isRowMajor,
+                         raft::distance::DistanceType metric,
+                         float metric_arg,
+                         cudaStream_t stream)
+{
+  // The kernel policy is determined by fusedDistanceNN.
+  typedef Policy P;
+
+  dim3 blk(P::Nthreads);
+  auto nblks            = raft::ceildiv<int>(m, P::Nthreads);
+  constexpr auto maxVal = std::numeric_limits<DataT>::max();
+  typedef KeyValuePair<IdxT, DataT> KVPair;
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
+  if (initOutBuffer) {
+    initKernel<DataT, OutT, IdxT, ReduceOpT>
+      <<<nblks, P::Nthreads, 0, stream>>>(min, m, maxVal, redOp);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+
+  switch (metric) {
+    case raft::distance::DistanceType::CosineExpanded:
+      fusedCosineNN<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
+        min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, stream);
+      break;
+    case raft::distance::DistanceType::L2SqrtExpanded:
+    case raft::distance::DistanceType::L2Expanded:
+      // initOutBuffer is take care by fusedDistanceNNImpl() so we set it false to fusedL2NNImpl.
+      fusedL2NNImpl<DataT, OutT, IdxT, P, ReduceOpT, KVPReduceOpT>(
+        min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, false, stream);
+      break;
+    default: assert("only cosine/l2 metric is supported with fusedDistanceNN\n"); break;
+  }
+}
+
+}  // namespace detail
+}  // namespace distance
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h b/cpp/include/raft/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h
@@ -611,6 +611,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase<Shape_,
         ++tensor_iterator;
       }
     }
+    tensor_iterator.dumpToGmem();
   }
 
   /// Helper to invoke the output functor over each vector of output
-Original file line number
+Diff line change
@@ Expand Up @@
             ++tensor_iterator;
           }
         }
+        tensor_iterator.dumpToGmem();
       }
       /// Helper to invoke the output functor over each vector of output
@@ Expand Down @@