rapidsai · rapids-bot · Mar 19, 2024 · Feb 5, 2024 · Feb 5, 2024 · Mar 11, 2024
@@ -136,7 +136,7 @@ if(BUILD_PRIMS_BENCH)
 
   ConfigureBench(
     NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
-    bench/prims/random/rng.cu bench/prims/main.cpp
+   bench/prims/random/rng.cu bench/prims/random/subsample.cu bench/prims/main.cpp
   )
 
   ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp)

@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/random/permute.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/random/sample_without_replacement.cuh>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <cub/cub.cuh>
+
+namespace raft::bench::random {
+
+struct sample_inputs {
+  int n_samples;
+  int n_train;
+  int method;
+};  // struct sample_inputs
+
+inline auto operator<<(std::ostream& os, const sample_inputs& p) -> std::ostream&
+{
+  os << p.n_samples << "#" << p.n_train << "#" << p.method;
+  return os;
+}
+
+// Sample with replacement. We use this as a baseline.
+template <typename IdxT>
+auto bernoulli_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
+  -> raft::device_vector<IdxT, IdxT>
+{
+  RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
+
+  auto indices = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
+  raft::random::RngState state(123456ULL);
+  raft::random::uniformInt(
+    res, state, indices.data_handle(), n_subsamples, IdxT(0), IdxT(n_samples));
+  return indices;
+}
+
+template <typename T>
+struct sample : public fixture {
+  sample(const sample_inputs& p)
+    : params(p),
+      old_mr(rmm::mr::get_current_device_resource()),
+      pool_mr(rmm::mr::get_current_device_resource(), 2 * GiB),
+      in(make_device_vector<T, int64_t>(res, p.n_samples)),
+      out(make_device_vector<T, int64_t>(res, p.n_train))
+  {
+    rmm::mr::set_current_device_resource(&pool_mr);
+    raft::random::RngState r(123456ULL);
+  }
+
+  ~sample() { rmm::mr::set_current_device_resource(old_mr); }
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    raft::random::RngState r(123456ULL);
+    loop_on_state(state, [this, &r]() {
+      if (params.method == 1) {
+        this->out =
+          bernoulli_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
+      } else if (params.method == 2) {
+        this->out = raft::random::excess_subsample<T, int64_t>(
+          this->res, r, this->params.n_samples, this->params.n_train);
+      }
+    });
+  }
+
+ private:
+  float GiB = 1073741824.0f;
+  raft::device_resources res;
+  rmm::mr::device_memory_resource* old_mr;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr;
+  sample_inputs params;
+  raft::device_vector<T, int64_t> out, in;
+};  // struct sample
+
+const std::vector<sample_inputs> input_vecs = {{100000000, 10000000, 1},
+                                               {100000000, 50000000, 1},
+                                               {100000000, 100000000, 1},
+                                               {100000000, 10000000, 2},
+                                               {100000000, 50000000, 2},
+                                               {100000000, 100000000, 2}};
+
+RAFT_BENCH_REGISTER(sample<int64_t>, "", input_vecs);
+
+}  // namespace raft::bench::random
@@ -16,7 +16,15 @@
 
 #pragma once
 
+#include <raft/common/nvtx.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/pinned_mdarray.hpp>
+#include <raft/core/pinned_mdspan.hpp>
+#include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <functional>
@@ -336,6 +344,70 @@ void gather_if(const InputIteratorT in,
   gatherImpl(in, D, N, map, stencil, map_length, out, pred_op, transform_op, stream);
 }
 
+template <typename T, typename IdxT = int64_t>
+void gather_buff(host_matrix_view<const T, IdxT> dataset,
+                 host_vector_view<const IdxT, IdxT> indices,
+                 IdxT offset,
+                 pinned_matrix_view<T, IdxT> buff)
+{
+  raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather_host_buff");
+  IdxT batch_size = std::min<IdxT>(buff.extent(0), indices.extent(0) - offset);
+
+#pragma omp for
+  for (IdxT i = 0; i < batch_size; i++) {
+    IdxT in_idx = indices(offset + i);
+    for (IdxT k = 0; k < buff.extent(1); k++) {
+      buff(i, k) = dataset(in_idx, k);
+    }
+  }
+}
+
+template <typename T, typename IdxT>
+void gather(raft::resources const& res,
+            host_matrix_view<const T, IdxT> dataset,
+            device_vector_view<const IdxT, IdxT> indices,
+            raft::device_matrix_view<T, IdxT> output)
+{
+  raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather");
+  IdxT n_dim        = output.extent(1);
+  IdxT n_train      = output.extent(0);
+  auto indices_host = raft::make_host_vector<IdxT, IdxT>(n_train);
+  raft::copy(
+    indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res));
+  resource::sync_stream(res);
+
+  const size_t max_batch_size = 32768;
+  // Gather the vector on the host in tmp buffers. We use two buffers to overlap H2D sync
+  // and gathering the data.
+  raft::common::nvtx::push_range("gather::alloc_buffers");
+  auto out_tmp1 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
+  auto out_tmp2 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
+  auto view1    = out_tmp1.view();
+  auto view2    = out_tmp2.view();
+  raft::common::nvtx::pop_range();
+
+  gather_buff(dataset, make_const_mdspan(indices_host.view()), (IdxT)0, view1);
+#pragma omp parallel
+  for (IdxT device_offset = 0; device_offset < n_train; device_offset += max_batch_size) {
+    IdxT batch_size = std::min<IdxT>(max_batch_size, n_train - device_offset);
+#pragma omp master
+    raft::copy(output.data_handle() + device_offset * n_dim,
+               view1.data_handle(),
+               batch_size * n_dim,
+               resource::get_cuda_stream(res));
+    // Start gathering the next batch on the host.
+    IdxT host_offset = device_offset + batch_size;
+    batch_size       = std::min<IdxT>(max_batch_size, n_train - host_offset);
+    if (batch_size > 0) {
+      gather_buff(dataset, make_const_mdspan(indices_host.view()), host_offset, view2);
+    }
+#pragma omp master
+    resource::sync_stream(res);
+#pragma omp barrier
+    std::swap(view1, view2);
+  }
+}
+
 }  // namespace detail
 }  // namespace matrix
 }  // namespace raft
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::matrix::detail {
+
+/** Select rows randomly from input and copy to output. */
+template <typename T, typename IdxT = int64_t>
+void sample_rows(raft::resources const& res,
+                 const T* input,
+                 IdxT n_rows_input,
+                 raft::device_matrix_view<T, IdxT> output,
+                 random::RngState random_state)
+{
+  IdxT n_dim     = output.extent(1);
+  IdxT n_samples = output.extent(0);
+
+  raft::device_vector<IdxT, IdxT> train_indices =
+    raft::random::excess_subsample<IdxT, int64_t>(res, random_state, n_rows_input, n_samples);
+
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, input));
+  T* ptr = reinterpret_cast<T*>(attr.devicePointer);
+  if (ptr != nullptr) {
+    raft::matrix::gather(res,
+                         raft::make_device_matrix_view<const T, IdxT>(ptr, n_rows_input, n_dim),
+                         raft::make_const_mdspan(train_indices.view()),
+                         output);
+  } else {
+    auto dataset = raft::make_host_matrix_view<const T, IdxT>(input, n_rows_input, n_dim);
+    raft::matrix::detail::gather(res, dataset, make_const_mdspan(train_indices.view()), output);
+  }
+}
+}  // namespace raft::matrix::detail
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/detail/sample_rows.cuh>
+#include <raft/random/rng.cuh>
+
+namespace raft::matrix {
+
+/** Select rows randomly from input and copy to output. */
+template <typename T, typename IdxT = int64_t, typename accessor>
+void sample_rows(raft::resources const& res,
+                 random::RngState random_state,
+                 mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
+                 raft::device_matrix_view<T, IdxT> output)
+{
+  detail::sample_rows(res, input, n_rows_input, output, random_state);
+
+  detail::sample_rows(res, dataset.data_handle(), dataset.extent(0), output, random_state);
+}
+
+/** Subsample the dataset to create a training set*/
+template <typename T, typename IdxT = int64_t, typename accessor>
+raft::device_matrix<T, IdxT> sample_rows(
+  raft::resources const& res,
+  random::RngState random_state,
+  mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
+  IdxT n_samples)
+{
+  auto output = raft::make_device_matrix<T, IdxT>(res, n_samples, dataset.extent(1));
+  detail::sample_rows(res, random_state, dataset, output.view());
+  return output;
+}
+
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/linalg/map.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/integer_utils.hpp>