From 7178bf2eb34334db909a151926d8112c441b3b09 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:45:44 -0400 Subject: [PATCH 1/4] Rework cudf::io::text::byte_range_info class member functions (#16518) Adds `const` declarations to appropriate member functions in class `cudf::io::text::byte_range_info` and moves the ctor implementation to .cpp file. This helps with using the `byte_range_info` objects in `const` variables and inside of `const` functions. Found while working on #15983 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16518 --- cpp/include/cudf/io/text/byte_range_info.hpp | 21 ++++++++------------ cpp/src/io/text/byte_range_info.cpp | 7 +++++++ cpp/src/io/text/multibyte_split.cu | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 7e9256be1d3..5f3c91dc99c 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include @@ -40,53 +39,49 @@ class byte_range_info { int64_t _size{}; ///< size in bytes public: - constexpr byte_range_info() = default; + byte_range_info() = default; /** * @brief Constructs a byte_range_info object * * @param offset offset in bytes * @param size size in bytes */ - constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) - { - CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); - CUDF_EXPECTS(size >= 0, "size must be non-negative"); - } + byte_range_info(int64_t offset, int64_t size); /** * @brief Copy constructor * * @param other byte_range_info object to copy */ - constexpr byte_range_info(byte_range_info const& other) noexcept = default; + byte_range_info(byte_range_info const& other) noexcept = default; /** * @brief Copy assignment operator * * @param other byte_range_info object to copy * @return this object after copying */ - constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; + byte_range_info& operator=(byte_range_info const& other) noexcept = default; /** * @brief Get the offset in bytes * * @return Offset in bytes */ - [[nodiscard]] constexpr int64_t offset() { return _offset; } + [[nodiscard]] int64_t offset() const { return _offset; } /** * @brief Get the size in bytes * * @return Size in bytes */ - [[nodiscard]] constexpr int64_t size() { return _size; } + [[nodiscard]] int64_t size() const { return _size; } /** * @brief Returns whether the span is empty. * - * @return true iff the span is empty, i.e. `size() == 0` + * @return true iff the range is empty, i.e. `size() == 0` */ - [[nodiscard]] constexpr bool empty() { return size() == 0; } + [[nodiscard]] bool is_empty() const { return size() == 0; } }; /** diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp index 6a7836ed4e1..fe811739b97 100644 --- a/cpp/src/io/text/byte_range_info.cpp +++ b/cpp/src/io/text/byte_range_info.cpp @@ -16,6 +16,7 @@ #include #include +#include #include @@ -23,6 +24,12 @@ namespace cudf { namespace io { namespace text { +byte_range_info::byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) +{ + CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); + CUDF_EXPECTS(size >= 0, "size must be non-negative"); +} + byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits::max()}; } std::vector create_byte_range_infos_consecutive(int64_t total_bytes, diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 97729a091fb..e3435a24b18 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -310,7 +310,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source { CUDF_FUNC_RANGE(); - if (byte_range.empty()) { return make_empty_column(type_id::STRING); } + if (byte_range.is_empty()) { return make_empty_column(type_id::STRING); } auto device_delim = cudf::string_scalar(delimiter, true, stream, mr); From 419fb99fa9ac471ae00ebe7787543b8e9cc154b5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:52:30 -0400 Subject: [PATCH 2/4] Fix all-empty input column for strings split APIs (#16466) Fixes specialized behavior for all empty input column on the strings split APIs. Verifying behavior with Pandas `str.split( pat, expand, regex )` `pat=None -- whitespace` `expand=False -- record APIs` `regex=True -- re APIs` - [x] `split` - [x] `split` - whitespace - [x] `rsplit` - [x] `rsplit` - whitespace - [x] `split_record` - [x] `split_record` - whitespace - [x] `rsplit_record` - [x] `rsplit_record` - whitespace - [x] `split_re` - [x] `rsplit_re` - [x] `split_record_re` - [x] `rsplit_record_re` Closes #16453 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16466 --- cpp/src/strings/split/split.cuh | 24 ++++++-------- cpp/src/strings/split/split_re.cu | 4 +++ cpp/tests/strings/split_tests.cpp | 47 ++++++++++++++++++++++++--- python/cudf/cudf/tests/test_string.py | 16 +++++++++ 4 files changed, 73 insertions(+), 18 deletions(-) diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 4d7096c02ca..af70367678e 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -142,7 +142,7 @@ struct base_split_tokenizer { // max_tokens already included in token counts if (d_tokens.size() == 1) { - d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + d_tokens[0] = string_index_pair{(d_str.empty() ? "" : d_str.data()), d_str.size_bytes()}; return; } @@ -357,24 +357,20 @@ std::pair, rmm::device_uvector> split auto const chars_bytes = get_offset_value(input.offsets(), input.offset() + strings_count, stream) - get_offset_value(input.offsets(), input.offset(), stream); - if (chars_bytes == 0) { - auto offsets = cudf::make_column_from_scalar( - numeric_scalar(0, true, stream), strings_count + 1, stream, mr); - auto tokens = rmm::device_uvector(0, stream); - return std::pair{std::move(offsets), std::move(tokens)}; - } auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column rmm::device_scalar d_count(0, stream); - constexpr int64_t block_size = 512; - constexpr size_type bytes_per_thread = 4; - auto const num_blocks = util::div_rounding_up_safe( - util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); - count_delimiters_kernel - <<>>( - tokenizer, d_offsets, chars_bytes, d_count.data()); + if (chars_bytes > 0) { + constexpr int64_t block_size = 512; + constexpr size_type bytes_per_thread = 4; + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_delimiters_kernel + <<>>( + tokenizer, d_offsets, chars_bytes, d_count.data()); + } // Create a vector of every delimiter position in the chars column. // These may include overlapping or otherwise out-of-bounds delimiters which diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index d72ec1085b5..e0aacf07ef0 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -71,6 +71,10 @@ struct token_reader_fn { auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; auto const d_result = d_tokens + token_offset; // store tokens here + if (nchars == 0) { + d_result[0] = string_index_pair{"", 0}; + return; + } int64_t token_idx = 0; auto itr = d_str.begin(); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 4c020cb4c29..7ece08b19f2 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -307,24 +307,46 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordAllEmpty) +TEST_F(StringsSplitTest, SplitAllEmpty) { auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); auto sv = cudf::strings_column_view(input); + auto empty = cudf::string_scalar(""); auto delimiter = cudf::string_scalar("s"); + + auto result = cudf::strings::split(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + // whitespace hits a special case where nothing matches returns an all-null column + auto expected = cudf::test::strings_column_wrapper({"", "", "", ""}, {0, 0, 0, 0}); + result = cudf::strings::split(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); + result = cudf::strings::rsplit(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); +} + +TEST_F(StringsSplitTest, SplitRecordAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); auto empty = cudf::string_scalar(""); + auto delimiter = cudf::string_scalar("s"); using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{}, LCW{}, LCW{}, LCW{}}); + LCW expected({LCW{""}, LCW{""}, LCW{""}, LCW{""}}); + LCW expected_empty({LCW{}, LCW{}, LCW{}, LCW{}}); + auto result = cudf::strings::split_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::split_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); result = cudf::strings::rsplit_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::rsplit_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); } TEST_F(StringsSplitTest, MultiByteDelimiters) @@ -575,6 +597,23 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) } } +TEST_F(StringsSplitTest, SplitRegexAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); + auto prog = cudf::strings::regex_program::create("[ _]"); + + auto result = cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + auto rec_result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); + rec_result = cudf::strings::rsplit_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a2a3e874c91..30880f074c0 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -978,6 +978,22 @@ def test_string_split_re(data, pat, n, expand): assert_eq(expect, got) +@pytest.mark.parametrize("pat", [None, "\\s+"]) +@pytest.mark.parametrize("regex", [False, True]) +@pytest.mark.parametrize("expand", [False, True]) +def test_string_split_all_empty(pat, regex, expand): + ps = pd.Series(["", "", "", ""], dtype="str") + gs = cudf.Series(["", "", "", ""], dtype="str") + + expect = ps.str.split(pat=pat, expand=expand, regex=regex) + got = gs.str.split(pat=pat, expand=expand, regex=regex) + + if isinstance(got, cudf.DataFrame): + assert_eq(expect, got, check_column_type=False) + else: + assert_eq(expect, got) + + @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) From 3a791cb8a83ca2cf446a910cb94d5a4e3edf2b9f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 13 Aug 2024 08:56:43 -0400 Subject: [PATCH 3/4] Remove unneeded pair-iterator benchmark (#16511) Removes the pair-iterator benchmark logic. The remaining benchmarks use the null-replacement-iterator which uses the libcudf pair-iterator internally. There is no need for benchmarking this unique iterator pattern that is not used by libcudf. The `cpp/benchmarks/iterator/iterator.cu` failed to compile with gcc 12 because the sum-reduce function cannot resolve adding `thrust::pair` objects together likely due to some recent changes in CCCL. Regardless, adding `thrust::pair` objects is not something we need to benchmark. The existing benchmark benchmarks libcudf's usage of the internal pair-iterator correctly. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16511 --- cpp/benchmarks/iterator/iterator.cu | 77 ----------------------------- 1 file changed, 77 deletions(-) diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index ada7a9bd73d..fd0cebb12ea 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state) sizeof(TypeParam)); } -// operator+ defined for pair iterator reduction -template -__device__ thrust::pair operator+(thrust::pair lhs, thrust::pair rhs) -{ - return thrust::pair{lhs.first * lhs.second + rhs.first * rhs.second, - lhs.second + rhs.second}; -} -// ----------------------------------------------------------------------------- -template -void pair_iterator_bench_cub(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - int num_items = col.size(); - auto begin = d_col->pair_begin(); - reduce_by_cub(result.begin(), begin, num_items, init); -} - -template -void pair_iterator_bench_thrust(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - auto d_in = d_col->pair_begin(); - auto d_end = d_in + col.size(); - thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{}); -} - -template -void BM_pair_iterator(benchmark::State& state) -{ - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - using T = TypeParam; - auto num_gen = thrust::counting_iterator(0); - auto null_gen = - thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; }); - - cudf::test::fixed_width_column_wrapper wrap_hasnull_F(num_gen, num_gen + column_size); - cudf::test::fixed_width_column_wrapper wrap_hasnull_T( - num_gen, num_gen + column_size, null_gen); - cudf::column_view hasnull_F = wrap_hasnull_F; - cudf::column_view hasnull_T = wrap_hasnull_T; - - // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( - 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - if (cub_or_thrust) { - pair_iterator_bench_cub(hasnull_T, - dev_result); // driven by pair iterator with nulls - } else { - pair_iterator_bench_thrust(hasnull_T, - dev_result); // driven by pair iterator with nulls - } - } - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - sizeof(TypeParam)); -} - #define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \ BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ { \ @@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true); ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false); ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true); ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false); - -#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \ - BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ - { \ - BM_pair_iterator(state); \ - } \ - BENCHMARK_REGISTER_F(Iterator, name) \ - ->RangeMultiplier(10) \ - ->Range(1000, 10000000) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true); -PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false); From 3801f811ab7713e4cb9cc3bb34d282f8a04e71e4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 13 Aug 2024 12:40:40 -0500 Subject: [PATCH 4/4] Remove hardcoded versions from workflows. (#16540) This PR removes hardcoded Python versions from CI workflows. It is a prerequisite for dropping Python 3.9. See https://github.com/rapidsai/build-planning/issues/88. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16540 --- .github/workflows/pandas-tests.yaml | 3 ++- .github/workflows/pr.yaml | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index cf0c2b377dd..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -19,7 +19,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c2e7f64f952..ea8a1762b2c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -187,6 +187,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/run_tests.sh @@ -196,7 +197,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.