Commit

Merge branch 'branch-24.10' into test-drop-python-3.9
bdice authored Aug 13, 2024
2 parents 0b43c1a + 3801f81 commit 4f44e6f
Showing 10 changed files with 94 additions and 111 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/pandas-tests.yaml
@@ -19,7 +19,8 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
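The new `matrix_filter` drops the hard-coded `PY_VER == "3.9"` / CUDA 12.5 selection: it keeps only amd64 entries, groups them by CUDA major version, and from each group takes the entry with the highest (Python version, CUDA version) pair. A rough Python sketch of that jq logic, run against a made-up matrix (the entries below are illustrative, not the real workflow matrix):

```python
# Sketch of the jq filter's selection logic; the matrix below is hypothetical.
from itertools import groupby

matrix = [
    {"ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0"},
    {"ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.2.2"},
    {"ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.5.1"},
    {"ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.5.1"},
]

def ver(s):
    # "12.5.1" -> (12, 5, 1) so versions compare numerically, not lexically
    return tuple(int(part) for part in s.split("."))

amd64 = sorted(
    (e for e in matrix if e["ARCH"] == "amd64"),
    key=lambda e: ver(e["CUDA_VER"])[0],  # group_by needs sorted input
)
selected = [
    max(group, key=lambda e: (ver(e["PY_VER"]), ver(e["CUDA_VER"])))
    for _, group in groupby(amd64, key=lambda e: ver(e["CUDA_VER"])[0])
]
print(selected)
# One entry per CUDA major version, each with the newest Python/CUDA in that
# group, so dropping Python 3.9 needs no further edits to this filter.
```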
4 changes: 3 additions & 1 deletion .github/workflows/pr.yaml
@@ -187,6 +187,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
script: ci/cudf_pandas_scripts/run_tests.sh
@@ -196,7 +197,8 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
# Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
77 changes: 0 additions & 77 deletions cpp/benchmarks/iterator/iterator.cu
@@ -30,7 +30,6 @@
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/pair.h>
#include <thrust/reduce.h>

#include <random>
@@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state)
sizeof(TypeParam));
}

// operator+ defined for pair iterator reduction
template <typename T>
__device__ thrust::pair<T, bool> operator+(thrust::pair<T, bool> lhs, thrust::pair<T, bool> rhs)
{
return thrust::pair<T, bool>{lhs.first * lhs.second + rhs.first * rhs.second,
lhs.second + rhs.second};
}
// -----------------------------------------------------------------------------
template <typename T, bool has_null>
void pair_iterator_bench_cub(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
int num_items = col.size();
auto begin = d_col->pair_begin<T, has_null>();
reduce_by_cub(result.begin(), begin, num_items, init);
}

template <typename T, bool has_null>
void pair_iterator_bench_thrust(cudf::column_view& col,
rmm::device_uvector<thrust::pair<T, bool>>& result)
{
thrust::pair<T, bool> init{0, false};
auto d_col = cudf::column_device_view::create(col);
auto d_in = d_col->pair_begin<T, has_null>();
auto d_end = d_in + col.size();
thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{});
}

template <class TypeParam, bool cub_or_thrust>
void BM_pair_iterator(benchmark::State& state)
{
cudf::size_type const column_size{(cudf::size_type)state.range(0)};
using T = TypeParam;
auto num_gen = thrust::counting_iterator<cudf::size_type>(0);
auto null_gen =
thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; });

cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_F(num_gen, num_gen + column_size);
cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_T(
num_gen, num_gen + column_size, null_gen);
cudf::column_view hasnull_F = wrap_hasnull_F;
cudf::column_view hasnull_T = wrap_hasnull_T;

// Initialize dev_result to false
auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<thrust::pair<T, bool>>(
1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
pair_iterator_bench_cub<T, false>(hasnull_T,
dev_result); // driven by pair iterator with nulls
} else {
pair_iterator_bench_thrust<T, false>(hasnull_T,
dev_result); // driven by pair iterator with nulls
}
}
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
sizeof(TypeParam));
}

#define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
@@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true);
ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false);
ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true);
ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false);

#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \
BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
{ \
BM_pair_iterator<type, cub_or_thrust>(state); \
} \
BENCHMARK_REGISTER_F(Iterator, name) \
->RangeMultiplier(10) \
->Range(1000, 10000000) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true);
PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false);
21 changes: 8 additions & 13 deletions cpp/include/cudf/io/text/byte_range_info.hpp
@@ -16,7 +16,6 @@

#pragma once

#include <cudf/utilities/error.hpp>
#include <cudf/utilities/export.hpp>

#include <cstdint>
@@ -40,53 +39,49 @@ class byte_range_info {
int64_t _size{}; ///< size in bytes

public:
constexpr byte_range_info() = default;
byte_range_info() = default;
/**
* @brief Constructs a byte_range_info object
*
* @param offset offset in bytes
* @param size size in bytes
*/
constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
{
CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
CUDF_EXPECTS(size >= 0, "size must be non-negative");
}
byte_range_info(int64_t offset, int64_t size);

/**
* @brief Copy constructor
*
* @param other byte_range_info object to copy
*/
constexpr byte_range_info(byte_range_info const& other) noexcept = default;
byte_range_info(byte_range_info const& other) noexcept = default;
/**
* @brief Copy assignment operator
*
* @param other byte_range_info object to copy
* @return this object after copying
*/
constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default;
byte_range_info& operator=(byte_range_info const& other) noexcept = default;

/**
* @brief Get the offset in bytes
*
* @return Offset in bytes
*/
[[nodiscard]] constexpr int64_t offset() { return _offset; }
[[nodiscard]] int64_t offset() const { return _offset; }

/**
* @brief Get the size in bytes
*
* @return Size in bytes
*/
[[nodiscard]] constexpr int64_t size() { return _size; }
[[nodiscard]] int64_t size() const { return _size; }

/**
* @brief Returns whether the span is empty.
*
* @return true iff the span is empty, i.e. `size() == 0`
* @return true iff the range is empty, i.e. `size() == 0`
*/
[[nodiscard]] constexpr bool empty() { return size() == 0; }
[[nodiscard]] bool is_empty() const { return size() == 0; }
};

/**
7 changes: 7 additions & 0 deletions cpp/src/io/text/byte_range_info.cpp
@@ -16,13 +16,20 @@

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/text/byte_range_info.hpp>
#include <cudf/utilities/error.hpp>

#include <limits>

namespace cudf {
namespace io {
namespace text {

byte_range_info::byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
{
CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
CUDF_EXPECTS(size >= 0, "size must be non-negative");
}

byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits<int64_t>::max()}; }

std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
2 changes: 1 addition & 1 deletion cpp/src/io/text/multibyte_split.cu
@@ -310,7 +310,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
{
CUDF_FUNC_RANGE();

if (byte_range.empty()) { return make_empty_column(type_id::STRING); }
if (byte_range.is_empty()) { return make_empty_column(type_id::STRING); }

auto device_delim = cudf::string_scalar(delimiter, true, stream, mr);

24 changes: 10 additions & 14 deletions cpp/src/strings/split/split.cuh
@@ -142,7 +142,7 @@ struct base_split_tokenizer {

// max_tokens already included in token counts
if (d_tokens.size() == 1) {
d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()};
d_tokens[0] = string_index_pair{(d_str.empty() ? "" : d_str.data()), d_str.size_bytes()};
return;
}

@@ -357,24 +357,20 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
auto const chars_bytes =
get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
get_offset_value(input.offsets(), input.offset(), stream);
if (chars_bytes == 0) {
auto offsets = cudf::make_column_from_scalar(
numeric_scalar<int32_t>(0, true, stream), strings_count + 1, stream, mr);
auto tokens = rmm::device_uvector<string_index_pair>(0, stream);
return std::pair{std::move(offsets), std::move(tokens)};
}
auto const d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());

// count the number of delimiters in the entire column
rmm::device_scalar<int64_t> d_count(0, stream);
constexpr int64_t block_size = 512;
constexpr size_type bytes_per_thread = 4;
auto const num_blocks = util::div_rounding_up_safe(
util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
<<<num_blocks, block_size, 0, stream.value()>>>(
tokenizer, d_offsets, chars_bytes, d_count.data());
if (chars_bytes > 0) {
constexpr int64_t block_size = 512;
constexpr size_type bytes_per_thread = 4;
auto const num_blocks = util::div_rounding_up_safe(
util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
<<<num_blocks, block_size, 0, stream.value()>>>(
tokenizer, d_offsets, chars_bytes, d_count.data());
}

// Create a vector of every delimiter position in the chars column.
// These may include overlapping or otherwise out-of-bounds delimiters which
4 changes: 4 additions & 0 deletions cpp/src/strings/split/split_re.cu
@@ -71,6 +71,10 @@ struct token_reader_fn {
auto const token_offset = d_token_offsets[idx];
auto const token_count = d_token_offsets[idx + 1] - token_offset;
auto const d_result = d_tokens + token_offset; // store tokens here
if (nchars == 0) {
d_result[0] = string_index_pair{"", 0};
return;
}

int64_t token_idx = 0;
auto itr = d_str.begin();
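The `nchars == 0` branch added to `token_reader_fn` writes a single empty token for a zero-length string instead of producing no token. That matches how regex splitting of an empty string behaves in Python, which pandas (and therefore cudf.pandas) follows; a quick illustration, not taken from the commit:

```python
import re

# Splitting an empty string still yields one (empty) field ...
assert re.split(r"[ _]", "") == [""]
assert "".split("_") == [""]
# ... except for whitespace splitting, which yields no fields at all.
assert "".split() == []
```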
47 changes: 43 additions & 4 deletions cpp/tests/strings/split_tests.cpp
@@ -307,24 +307,46 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
}

TEST_F(StringsSplitTest, SplitRecordAllEmpty)
TEST_F(StringsSplitTest, SplitAllEmpty)
{
auto input = cudf::test::strings_column_wrapper({"", "", "", ""});
auto sv = cudf::strings_column_view(input);
auto empty = cudf::string_scalar("");
auto delimiter = cudf::string_scalar("s");

auto result = cudf::strings::split(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
result = cudf::strings::rsplit(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);

// the empty delimiter hits the whitespace special case where nothing matches, returning an all-null column
auto expected = cudf::test::strings_column_wrapper({"", "", "", ""}, {0, 0, 0, 0});
result = cudf::strings::split(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected);
result = cudf::strings::rsplit(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected);
}

TEST_F(StringsSplitTest, SplitRecordAllEmpty)
{
auto input = cudf::test::strings_column_wrapper({"", "", "", ""});
auto sv = cudf::strings_column_view(input);
auto empty = cudf::string_scalar("");
auto delimiter = cudf::string_scalar("s");

using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
LCW expected({LCW{}, LCW{}, LCW{}, LCW{}});
LCW expected({LCW{""}, LCW{""}, LCW{""}, LCW{""}});
LCW expected_empty({LCW{}, LCW{}, LCW{}, LCW{}});

auto result = cudf::strings::split_record(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
result = cudf::strings::split_record(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty);

result = cudf::strings::rsplit_record(sv, delimiter);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
result = cudf::strings::rsplit_record(sv, empty);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty);
}

TEST_F(StringsSplitTest, MultiByteDelimiters)
@@ -575,6 +597,23 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary)
}
}

TEST_F(StringsSplitTest, SplitRegexAllEmpty)
{
auto input = cudf::test::strings_column_wrapper({"", "", "", ""});
auto sv = cudf::strings_column_view(input);
auto prog = cudf::strings::regex_program::create("[ _]");

auto result = cudf::strings::split_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
result = cudf::strings::rsplit_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);

auto rec_result = cudf::strings::split_record_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input);
rec_result = cudf::strings::rsplit_record_re(sv, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input);
}

TEST_F(StringsSplitTest, RSplitRecord)
{
std::vector<char const*> h_strings{
16 changes: 16 additions & 0 deletions python/cudf/cudf/tests/test_string.py
@@ -978,6 +978,22 @@ def test_string_split_re(data, pat, n, expand):
assert_eq(expect, got)


@pytest.mark.parametrize("pat", [None, "\\s+"])
@pytest.mark.parametrize("regex", [False, True])
@pytest.mark.parametrize("expand", [False, True])
def test_string_split_all_empty(pat, regex, expand):
ps = pd.Series(["", "", "", ""], dtype="str")
gs = cudf.Series(["", "", "", ""], dtype="str")

expect = ps.str.split(pat=pat, expand=expand, regex=regex)
got = gs.str.split(pat=pat, expand=expand, regex=regex)

if isinstance(got, cudf.DataFrame):
assert_eq(expect, got, check_column_type=False)
else:
assert_eq(expect, got)


@pytest.mark.parametrize(
"str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
)
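For reference, the pandas behavior the new test parameterizations exercise: splitting an all-empty string column with an explicit pattern (plain or regex) yields one empty token per row, while the default whitespace split yields no tokens. A small pandas-only sketch (outputs shown as comments):

```python
import pandas as pd

s = pd.Series(["", "", "", ""], dtype="str")

s.str.split("s").tolist()                 # [[''], [''], [''], ['']]
s.str.split("\\s+", regex=True).tolist()  # [[''], [''], [''], ['']]
s.str.split().tolist()                    # [[], [], [], []]
```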
