Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.04' into feat/pylibcudf_scan_reduce
Browse files Browse the repository at this point in the history
  • Loading branch information
vyasr committed Feb 6, 2024
2 parents c3b414a + c7e3dc5 commit 68a0a76
Show file tree
Hide file tree
Showing 45 changed files with 804 additions and 638 deletions.
2 changes: 1 addition & 1 deletion ci/check_style.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

set -euo pipefail

Expand Down
2 changes: 1 addition & 1 deletion cpp/examples/fetch_dependencies.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down
2 changes: 1 addition & 1 deletion cpp/libcudf_kafka/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down
77 changes: 36 additions & 41 deletions cpp/src/text/bpe/byte_pair_encoding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,12 @@
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

Expand Down Expand Up @@ -76,17 +79,17 @@ constexpr int block_size = 512;
template <typename MapRefType>
struct bpe_unpairable_offsets_fn {
cudf::device_span<char const> d_chars;
cudf::size_type offset;
int64_t offset;
MapRefType const d_map;
__device__ cudf::size_type operator()(cudf::size_type idx)
__device__ int64_t operator()(int64_t idx)
{
if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { return 0; }

auto const itr = d_chars.data() + idx;
auto const end = d_chars.end();
auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr));
auto const next = itr + lhs.size_bytes();
auto output = 0;
auto output = 0L;
if (next < end) {
auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next));
// see if both halves exist anywhere in the table, if not these are unpairable
Expand Down Expand Up @@ -123,6 +126,7 @@ struct bpe_unpairable_offsets_fn {
*/
template <typename MapRefType>
CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings,
char const* d_input_chars,
MapRefType const d_map,
int8_t* d_spaces_data, // working memory
cudf::size_type* d_ranks_data, // more working memory
Expand All @@ -134,10 +138,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings,
static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id() / block_size);
auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);

auto const d_str = d_strings.element<cudf::string_view>(str_idx);
auto const offsets =
d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
auto const d_str = d_strings.element<cudf::string_view>(str_idx);
auto const offset = thrust::distance(d_input_chars, d_str.data());

auto const d_spaces = d_spaces_data + offset;
auto const end_spaces = d_spaces + d_str.size_bytes();
Expand Down Expand Up @@ -292,6 +294,7 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings,
* @param d_sizes Output sizes of each row
*/
CUDF_KERNEL void bpe_finalize(cudf::column_device_view const d_strings,
char const* d_input_chars,
int8_t* d_spaces_data, // where separators are inserted
cudf::size_type* d_sizes // output sizes of encoded strings
)
Expand All @@ -311,9 +314,7 @@ CUDF_KERNEL void bpe_finalize(cudf::column_device_view const d_strings,
return;
}

auto const offsets =
d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
auto const offset = thrust::distance(d_input_chars, d_str.data());

auto const d_spaces = d_spaces_data + offset;
auto const end_spaces = d_spaces + d_str.size_bytes();
Expand Down Expand Up @@ -352,27 +353,22 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const

auto const d_strings = cudf::column_device_view::create(input.parent(), stream);

auto const first_offset = (input.offset() == 0) ? 0
: cudf::detail::get_value<cudf::size_type>(
auto const first_offset = (input.offset() == 0) ? 0L
: cudf::strings::detail::get_offset_value(
input.offsets(), input.offset(), stream);
auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
? input.chars_size(stream)
: cudf::detail::get_value<cudf::size_type>(
? static_cast<int64_t>(input.chars_size(stream))
: cudf::strings::detail::get_offset_value(
input.offsets(), input.size() + input.offset(), stream);
auto const chars_size = last_offset - first_offset;
auto const d_input_chars = input.chars_begin(stream) + first_offset;

auto const offset_data_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
auto offsets = cudf::make_numeric_column(
offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_offsets = offsets->mutable_view().data<cudf::size_type>();

rmm::device_uvector<int8_t> d_spaces(chars_size, stream); // identifies non-merged pairs
// used for various purposes below: unpairable-offsets, pair ranks, separator insert positions
rmm::device_uvector<cudf::size_type> d_working(chars_size, stream);
rmm::device_uvector<int64_t> d_working(chars_size, stream);

auto const chars_begin = thrust::counting_iterator<cudf::size_type>(0);
auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
auto const chars_begin = thrust::counting_iterator<int64_t>(0);
auto const chars_end = thrust::counting_iterator<int64_t>(chars_size);

{
// this kernel locates unpairable sections of strings to create artificial string row
Expand All @@ -383,14 +379,16 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
auto up_fn = bpe_unpairable_offsets_fn<decltype(mp_map)>{d_chars_span, first_offset, mp_map};
thrust::transform(rmm::exec_policy_nosync(stream), chars_begin, chars_end, d_up_offsets, up_fn);
auto const up_end = // remove all but the unpairable offsets
thrust::remove(rmm::exec_policy_nosync(stream), d_up_offsets, d_up_offsets + chars_size, 0);
thrust::remove(rmm::exec_policy_nosync(stream), d_up_offsets, d_up_offsets + chars_size, 0L);
auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables

// new string boundaries created by combining unpairable offsets with the existing offsets
auto tmp_offsets = rmm::device_uvector<cudf::size_type>(unpairables + input.size() + 1, stream);
auto tmp_offsets = rmm::device_uvector<int64_t>(unpairables + input.size() + 1, stream);
auto input_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
thrust::merge(rmm::exec_policy_nosync(stream),
input.offsets_begin(),
input.offsets_end(),
input_offsets,
input_offsets + input.size() + 1,
d_up_offsets,
up_end,
tmp_offsets.begin());
Expand All @@ -402,31 +400,28 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
tmp_offsets.resize(offsets_total, stream);

// temp column created with the merged offsets and the original chars data
auto const col_offsets =
cudf::column_view(cudf::device_span<cudf::size_type const>(tmp_offsets));
auto const tmp_size = offsets_total - 1;
auto const tmp_input = cudf::column_view(
auto const col_offsets = cudf::column_view(cudf::device_span<int64_t const>(tmp_offsets));
auto const tmp_size = offsets_total - 1;
auto const tmp_input = cudf::column_view(
input.parent().type(), tmp_size, input.chars_begin(stream), nullptr, 0, 0, {col_offsets});
auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);

// launch the byte-pair-encoding kernel on the temp column
rmm::device_uvector<int8_t> d_rerank(chars_size, stream); // more working memory;
auto const d_ranks = d_working.data(); // store pair ranks here
rmm::device_uvector<cudf::size_type> d_ranks(chars_size, stream);
auto const pair_map = get_bpe_merge_pairs_impl(merge_pairs)->get_merge_pairs_ref();
bpe_parallel_fn<decltype(pair_map)><<<tmp_size, block_size, 0, stream.value()>>>(
*d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data());
*d_tmp_strings, d_input_chars, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data());
}

// compute the output sizes and store them in the d_offsets vector
// compute the output sizes
auto output_sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
bpe_finalize<<<input.size(), block_size, 0, stream.value()>>>(
*d_strings, d_spaces.data(), d_offsets);
*d_strings, d_input_chars, d_spaces.data(), output_sizes.data());

// convert sizes to offsets in-place
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds the column size limit",
std::overflow_error);
auto [offsets, bytes] = cudf::strings::detail::make_offsets_child_column(
output_sizes.begin(), output_sizes.end(), stream, mr);

// build the output: inserting separators to the input character data
rmm::device_uvector<char> chars(bytes, stream, mr);
Expand All @@ -436,8 +431,8 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) {
return d_spaces[idx] > 0; // separator to be inserted here
};
auto const copy_end = thrust::copy_if(
rmm::exec_policy_nosync(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero);
auto const copy_end =
cudf::detail::copy_if_safe(chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero, stream);

// this will insert the single-byte separator into positions specified in d_inserts
auto const sep_char = thrust::constant_iterator<char>(separator.to_string(stream)[0]);
Expand Down
86 changes: 34 additions & 52 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
Expand All @@ -37,12 +38,9 @@
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <cuda/functional>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand All @@ -60,10 +58,10 @@ namespace {
* member.
*/
struct string_tokens_positions_fn {
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::size_type const* d_token_offsets; // offsets into the d_token_positions for each string
position_pair* d_token_positions; // token positions in each string
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::detail::input_offsetalator d_token_offsets; // offsets of d_token_positions for each string
position_pair* d_token_positions; // token positions in each string

__device__ void operator()(cudf::size_type idx)
{
Expand Down Expand Up @@ -95,12 +93,12 @@ struct ngram_builder_fn {
cudf::column_device_view const d_strings; // strings to generate ngrams from
cudf::string_view const d_separator; // separator to place between them 'grams
cudf::size_type const ngrams; // ngram number to generate (2=bi-gram, 3=tri-gram)
cudf::size_type const* d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::size_type const* d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here
cudf::detail::input_offsetalator d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::detail::input_offsetalator d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here

__device__ cudf::size_type operator()(cudf::size_type idx)
{
Expand Down Expand Up @@ -165,16 +163,12 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// first, get the number of tokens per string to get the token-offsets
// Ex. token-counts = [3,2]; token-offsets = [0,3,5]
rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
auto d_token_offsets = token_offsets.data();
thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_token_offsets + 1,
strings_tokenizer{d_strings, d_delimiter},
thrust::plus<cudf::size_type>());
token_offsets.set_element_to_zero_async(0, stream);
auto const total_tokens = token_offsets.back_element(stream); // Ex. 5 tokens
auto const count_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{d_strings, d_delimiter});
auto [token_offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
count_itr, count_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_token_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());

// get the token positions (in bytes) per string
// Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
Expand All @@ -188,21 +182,17 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// compute the number of ngrams per string to get the total number of ngrams to generate
// Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
auto d_ngram_offsets = ngram_offsets.data();
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_ngram_offsets + 1,
auto const ngram_counts = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[d_token_offsets, ngrams] __device__(cudf::size_type idx) {
auto token_count = d_token_offsets[idx + 1] - d_token_offsets[idx];
auto token_count =
static_cast<cudf::size_type>(d_token_offsets[idx + 1] - d_token_offsets[idx]);
return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
}),
thrust::plus{});
ngram_offsets.set_element_to_zero_async(0, stream);
auto const total_ngrams = ngram_offsets.back_element(stream);
}));
auto [ngram_offsets, total_ngrams] = cudf::detail::make_offsets_child_column(
ngram_counts, ngram_counts + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_ngram_offsets = ngram_offsets->view().begin<cudf::size_type>();

// Compute the total size of the ngrams for each string (not for each ngram)
// Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
Expand All @@ -212,21 +202,14 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
// ngrams for each string.
// Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
// is built in memory like this: "a_bbbb_ccc"
rmm::device_uvector<cudf::size_type> chars_offsets(strings_count + 1, stream);
// First compute the output sizes for each string (this not the final output result)
thrust::transform(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
chars_offsets.begin(),
ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
// Convert the sizes to offsets
auto const output_chars_size = cudf::detail::sizes_to_offsets(
chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
CUDF_EXPECTS(
output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds the column size limit",
std::overflow_error);

// First compute the output sizes for each string (this not the final output result)
auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
0, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
auto [chars_offsets, output_chars_size] = cudf::strings::detail::make_offsets_child_column(
sizes_itr, sizes_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_chars_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(chars_offsets->view());

// This will contain the size in bytes of each ngram to generate
rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
Expand All @@ -245,14 +228,13 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
ngrams,
d_token_offsets,
d_token_positions,
chars_offsets.data(),
d_chars_offsets,
d_chars,
d_ngram_offsets,
ngram_sizes.data()});
// build the offsets column -- converting the ngram sizes into offsets
auto offsets_column = std::get<0>(
cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr));
offsets_column->set_null_count(0);
// create the output strings column
return make_strings_column(
total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
Expand Down
Loading

0 comments on commit 68a0a76

Please sign in to comment.