Merge branch 'branch-23.10' into progressive_chunked_reader

rapidsai · Sep 27, 2023 · ab95c94 · ab95c94
2 parents d37e6a7 + b789d4c
commit ab95c94
Show file tree

Hide file tree

Showing 54 changed files with 2,594 additions and 356 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -323,6 +323,7 @@ add_library(
   src/groupby/sort/group_collect.cu
   src/groupby/sort/group_correlation.cu
   src/groupby/sort/group_count.cu
+  src/groupby/sort/group_histogram.cu
   src/groupby/sort/group_m2.cu
   src/groupby/sort/group_max.cu
   src/groupby/sort/group_min.cu
@@ -471,6 +472,7 @@ add_library(
   src/reductions/all.cu
   src/reductions/any.cu
   src/reductions/collect_ops.cu
+  src/reductions/histogram.cu
   src/reductions/max.cu
   src/reductions/mean.cu
   src/reductions/min.cu
@@ -618,6 +620,7 @@ add_library(
   src/text/subword/subword_tokenize.cu
   src/text/subword/wordpiece_tokenizer.cu
   src/text/tokenize.cu
+  src/text/vocabulary_tokenize.cu
   src/transform/bools_to_mask.cu
   src/transform/compute_column.cu
   src/transform/encode.cu

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -173,7 +173,7 @@ ConfigureBench(ITERATOR_BENCH iterator/iterator.cu)
 # ##################################################################################################
 # * search benchmark ------------------------------------------------------------------------------
 ConfigureBench(SEARCH_BENCH search/search.cpp)
-ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp)
+ConfigureNVBench(SEARCH_NVBENCH search/contains_scalar.cpp search/contains_table.cpp)
 
 # ##################################################################################################
 # * sort benchmark --------------------------------------------------------------------------------

diff --git a/cpp/benchmarks/search/contains.cpp → cpp/benchmarks/search/contains_scalar.cpp b/cpp/benchmarks/search/contains.cpp → cpp/benchmarks/search/contains_scalar.cpp
diff --git a/cpp/benchmarks/search/contains_table.cpp b/cpp/benchmarks/search/contains_table.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/detail/search.hpp>
+#include <cudf/lists/list_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+auto constexpr num_unique_elements = 1000;
+
+template <typename Type>
+static void nvbench_contains_table(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const size               = state.get_int64("table_size");
+  auto const dtype              = cudf::type_to_id<Type>();
+  double const null_probability = state.get_float64("null_probability");
+
+  auto builder = data_profile_builder().null_probability(null_probability);
+  if (dtype == cudf::type_id::LIST) {
+    builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements)
+      .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_unique_elements)
+      .list_depth(1);
+  } else {
+    builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements);
+  }
+
+  auto const haystack = create_random_table(
+    {dtype}, table_size_bytes{static_cast<size_t>(size)}, data_profile{builder}, 0);
+  auto const needles = create_random_table(
+    {dtype}, table_size_bytes{static_cast<size_t>(size)}, data_profile{builder}, 1);
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const stream_view = rmm::cuda_stream_view{launch.get_stream()};
+    [[maybe_unused]] auto const result =
+      cudf::detail::contains(haystack->view(),
+                             needles->view(),
+                             cudf::null_equality::EQUAL,
+                             cudf::nan_equality::ALL_EQUAL,
+                             stream_view,
+                             rmm::mr::get_current_device_resource());
+  });
+
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(nvbench_contains_table,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
+  .set_name("contains_table")
+  .set_type_axes_names({"type"})
+  .add_float64_axis("null_probability", {0.0, 0.1})
+  .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000});
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,7 +116,9 @@ class aggregation {
     COVARIANCE,      ///< covariance between two sets of elements
     CORRELATION,     ///< correlation between two sets of elements
     TDIGEST,         ///< create a tdigest from a set of input values
-    MERGE_TDIGEST    ///< create a tdigest by merging multiple tdigests together
+    MERGE_TDIGEST,   ///< create a tdigest by merging multiple tdigests together
+    HISTOGRAM,       ///< compute frequency of each element
+    MERGE_HISTOGRAM  ///< merge partial values of HISTOGRAM aggregation,
   };
 
   aggregation() = delete;
@@ -288,6 +290,11 @@ std::unique_ptr<Base> make_any_aggregation();
 template <typename Base = aggregation>
 std::unique_ptr<Base> make_all_aggregation();
 
+/// Factory to create a HISTOGRAM aggregation
+/// @return A HISTOGRAM aggregation object
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_histogram_aggregation();
+
 /// Factory to create a SUM_OF_SQUARES aggregation
 /// @return A SUM_OF_SQUARES aggregation object
 template <typename Base = aggregation>
@@ -610,6 +617,17 @@ std::unique_ptr<Base> make_merge_sets_aggregation(
 template <typename Base = aggregation>
 std::unique_ptr<Base> make_merge_m2_aggregation();
 
+/**
+ * @brief Factory to create a MERGE_HISTOGRAM aggregation
+ *
+ * Merges the results of `HISTOGRAM` aggregations on independent sets into a new `HISTOGRAM` value
+ * equivalent to if a single `HISTOGRAM` aggregation was done across all of the sets at once.
+ *
+ * @return A MERGE_HISTOGRAM aggregation object
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_merge_histogram_aggregation();
+
 /**
  * @brief Factory to create a COVARIANCE aggregation
  *

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -45,6 +45,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class max_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class count_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class histogram_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class any_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
@@ -89,6 +91,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class merge_sets_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class merge_m2_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(
+    data_type col_type, class merge_histogram_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class covariance_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
@@ -108,6 +112,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class min_aggregation const& agg);
   virtual void visit(class max_aggregation const& agg);
   virtual void visit(class count_aggregation const& agg);
+  virtual void visit(class histogram_aggregation const& agg);
   virtual void visit(class any_aggregation const& agg);
   virtual void visit(class all_aggregation const& agg);
   virtual void visit(class sum_of_squares_aggregation const& agg);
@@ -130,6 +135,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class merge_lists_aggregation const& agg);
   virtual void visit(class merge_sets_aggregation const& agg);
   virtual void visit(class merge_m2_aggregation const& agg);
+  virtual void visit(class merge_histogram_aggregation const& agg);
   virtual void visit(class covariance_aggregation const& agg);
   virtual void visit(class correlation_aggregation const& agg);
   virtual void visit(class tdigest_aggregation const& agg);
@@ -251,6 +257,25 @@ class count_aggregation final : public rolling_aggregation,
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived class for specifying a histogram aggregation
+ */
+class histogram_aggregation final : public groupby_aggregation, public reduce_aggregation {
+ public:
+  histogram_aggregation() : aggregation(HISTOGRAM) {}
+
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<histogram_aggregation>(*this);
+  }
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Derived class for specifying an any aggregation
  */
@@ -972,6 +997,25 @@ class merge_m2_aggregation final : public groupby_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived aggregation class for specifying MERGE_HISTOGRAM aggregation
+ */
+class merge_histogram_aggregation final : public groupby_aggregation, public reduce_aggregation {
+ public:
+  explicit merge_histogram_aggregation() : aggregation{MERGE_HISTOGRAM} {}
+
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<merge_histogram_aggregation>(*this);
+  }
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Derived aggregation class for specifying COVARIANCE aggregation
  */
@@ -1148,6 +1192,12 @@ struct target_type_impl<Source, aggregation::COUNT_ALL> {
   using type = size_type;
 };
 
+// Use list for HISTOGRAM
+template <typename SourceType>
+struct target_type_impl<SourceType, aggregation::HISTOGRAM> {
+  using type = list_view;
+};
+
 // Computing ANY of any type, use bool accumulator
 template <typename Source>
 struct target_type_impl<Source, aggregation::ANY> {
@@ -1326,6 +1376,12 @@ struct target_type_impl<SourceType, aggregation::MERGE_M2> {
   using type = struct_view;
 };
 
+// Use list for MERGE_HISTOGRAM
+template <typename SourceType>
+struct target_type_impl<SourceType, aggregation::MERGE_HISTOGRAM> {
+  using type = list_view;
+};
+
 // Always use double for COVARIANCE
 template <typename SourceType>
 struct target_type_impl<SourceType, aggregation::COVARIANCE> {
@@ -1417,6 +1473,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::COUNT_VALID>(std::forward<Ts>(args)...);
     case aggregation::COUNT_ALL:
       return f.template operator()<aggregation::COUNT_ALL>(std::forward<Ts>(args)...);
+    case aggregation::HISTOGRAM:
+      return f.template operator()<aggregation::HISTOGRAM>(std::forward<Ts>(args)...);
     case aggregation::ANY:
       return f.template operator()<aggregation::ANY>(std::forward<Ts>(args)...);
     case aggregation::ALL:
@@ -1460,6 +1518,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::MERGE_SETS>(std::forward<Ts>(args)...);
     case aggregation::MERGE_M2:
       return f.template operator()<aggregation::MERGE_M2>(std::forward<Ts>(args)...);
+    case aggregation::MERGE_HISTOGRAM:
+      return f.template operator()<aggregation::MERGE_HISTOGRAM>(std::forward<Ts>(args)...);
     case aggregation::COVARIANCE:
       return f.template operator()<aggregation::COVARIANCE>(std::forward<Ts>(args)...);
     case aggregation::CORRELATION:

diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
@@ -14,12 +14,15 @@
  * limitations under the License.
  */
 
+#include <cudf/hashing/detail/hash_allocator.cuh>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/mr/device/polymorphic_allocator.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -29,6 +32,7 @@
 
 namespace cudf::detail {
 
+using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
 using hash_map_type =
   cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;
 

diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp
@@ -81,6 +81,8 @@ std::unique_ptr<column> contains(column_view const& haystack,
  * output   = { false, true, true }
  * @endcode
  *
+ * @throws cudf::logic_error If column types of haystack and needles don't match
+ *
  * @param haystack The table containing the search space
  * @param needles A table of rows whose existence to check in the search space
  * @param compare_nulls Control whether nulls should be compared as equal or not

diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <memory>
+#include <optional>
+
+namespace cudf::reduction::detail {
+
+/**
+ * @brief Compute the frequency for each distinct row in the input table.
+ *
+ * @param input The input table to compute histogram
+ * @param partial_counts An optional column containing count for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate memory of the returned objects
+ * @return A pair of array contains the (stable-order) indices of the distinct rows in the input
+ * table, and their corresponding distinct counts
+ */
+[[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>>
+compute_row_frequencies(table_view const& input,
+                        std::optional<column_view> const& partial_counts,
+                        rmm::cuda_stream_view stream,
+                        rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Create an empty histogram column.
+ *
+ * A histogram column is a structs column `STRUCT<T, int64_t>` where T is type of the input
+ * values.
+ *
+ * @returns An empty histogram column
+ */
+[[nodiscard]] std::unique_ptr<column> make_empty_histogram_like(column_view const& values);
+
+}  // namespace cudf::reduction::detail