diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 5fc2f0cd569..39431e4e58b 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -19,7 +19,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@drop-python-3.9 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1dc75031a38..e8fff1f2d19 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -187,6 +187,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@drop-python-3.9 with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/run_tests.sh @@ -196,7 +197,8 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@drop-python-3.9 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index ada7a9bd73d..fd0cebb12ea 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state) sizeof(TypeParam)); } -// operator+ defined for pair iterator reduction -template -__device__ thrust::pair operator+(thrust::pair lhs, thrust::pair rhs) -{ - return thrust::pair{lhs.first * lhs.second + rhs.first * rhs.second, - lhs.second + rhs.second}; -} -// ----------------------------------------------------------------------------- -template -void pair_iterator_bench_cub(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - int num_items = col.size(); - auto begin = d_col->pair_begin(); - reduce_by_cub(result.begin(), begin, num_items, init); -} - -template -void pair_iterator_bench_thrust(cudf::column_view& col, - rmm::device_uvector>& result) -{ - thrust::pair init{0, false}; - auto d_col = cudf::column_device_view::create(col); - auto d_in = d_col->pair_begin(); - auto d_end = d_in + col.size(); - thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{}); -} - -template -void BM_pair_iterator(benchmark::State& state) -{ - cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - using T = TypeParam; - auto num_gen = thrust::counting_iterator(0); - auto null_gen = - thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; }); - - cudf::test::fixed_width_column_wrapper wrap_hasnull_F(num_gen, num_gen + column_size); - cudf::test::fixed_width_column_wrapper wrap_hasnull_T( - num_gen, num_gen + column_size, null_gen); - cudf::column_view hasnull_F = wrap_hasnull_F; - cudf::column_view hasnull_T = wrap_hasnull_T; - - // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( - 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - if (cub_or_thrust) { - pair_iterator_bench_cub(hasnull_T, - dev_result); // driven by pair iterator with nulls - } else { - pair_iterator_bench_thrust(hasnull_T, - dev_result); // driven by pair iterator with nulls - } - } - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - sizeof(TypeParam)); -} - #define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \ BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ { \ @@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true); ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false); ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true); ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false); - -#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust) \ - BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \ - { \ - BM_pair_iterator(state); \ - } \ - BENCHMARK_REGISTER_F(Iterator, name) \ - ->RangeMultiplier(10) \ - ->Range(1000, 10000000) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true); -PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false); diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 7e9256be1d3..5f3c91dc99c 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include @@ -40,53 +39,49 @@ class byte_range_info { int64_t _size{}; ///< size in bytes public: - constexpr byte_range_info() = default; + byte_range_info() = default; /** * @brief Constructs a byte_range_info object * * @param offset offset in bytes * @param size size in bytes */ - constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) - { - CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); - CUDF_EXPECTS(size >= 0, "size must be non-negative"); - } + byte_range_info(int64_t offset, int64_t size); /** * @brief Copy constructor * * @param other byte_range_info object to copy */ - constexpr byte_range_info(byte_range_info const& other) noexcept = default; + byte_range_info(byte_range_info const& other) noexcept = default; /** * @brief Copy assignment operator * * @param other byte_range_info object to copy * @return this object after copying */ - constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; + byte_range_info& operator=(byte_range_info const& other) noexcept = default; /** * @brief Get the offset in bytes * * @return Offset in bytes */ - [[nodiscard]] constexpr int64_t offset() { return _offset; } + [[nodiscard]] int64_t offset() const { return _offset; } /** * @brief Get the size in bytes * * @return Size in bytes */ - [[nodiscard]] constexpr int64_t size() { return _size; } + [[nodiscard]] int64_t size() const { return _size; } /** * @brief Returns whether the span is empty. * - * @return true iff the span is empty, i.e. `size() == 0` + * @return true iff the range is empty, i.e. `size() == 0` */ - [[nodiscard]] constexpr bool empty() { return size() == 0; } + [[nodiscard]] bool is_empty() const { return size() == 0; } }; /** diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp index 6a7836ed4e1..fe811739b97 100644 --- a/cpp/src/io/text/byte_range_info.cpp +++ b/cpp/src/io/text/byte_range_info.cpp @@ -16,6 +16,7 @@ #include #include +#include #include @@ -23,6 +24,12 @@ namespace cudf { namespace io { namespace text { +byte_range_info::byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) +{ + CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); + CUDF_EXPECTS(size >= 0, "size must be non-negative"); +} + byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits::max()}; } std::vector create_byte_range_infos_consecutive(int64_t total_bytes, diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 97729a091fb..e3435a24b18 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -310,7 +310,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source { CUDF_FUNC_RANGE(); - if (byte_range.empty()) { return make_empty_column(type_id::STRING); } + if (byte_range.is_empty()) { return make_empty_column(type_id::STRING); } auto device_delim = cudf::string_scalar(delimiter, true, stream, mr); diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 4d7096c02ca..af70367678e 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -142,7 +142,7 @@ struct base_split_tokenizer { // max_tokens already included in token counts if (d_tokens.size() == 1) { - d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + d_tokens[0] = string_index_pair{(d_str.empty() ? "" : d_str.data()), d_str.size_bytes()}; return; } @@ -357,24 +357,20 @@ std::pair, rmm::device_uvector> split auto const chars_bytes = get_offset_value(input.offsets(), input.offset() + strings_count, stream) - get_offset_value(input.offsets(), input.offset(), stream); - if (chars_bytes == 0) { - auto offsets = cudf::make_column_from_scalar( - numeric_scalar(0, true, stream), strings_count + 1, stream, mr); - auto tokens = rmm::device_uvector(0, stream); - return std::pair{std::move(offsets), std::move(tokens)}; - } auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column rmm::device_scalar d_count(0, stream); - constexpr int64_t block_size = 512; - constexpr size_type bytes_per_thread = 4; - auto const num_blocks = util::div_rounding_up_safe( - util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); - count_delimiters_kernel - <<>>( - tokenizer, d_offsets, chars_bytes, d_count.data()); + if (chars_bytes > 0) { + constexpr int64_t block_size = 512; + constexpr size_type bytes_per_thread = 4; + auto const num_blocks = util::div_rounding_up_safe( + util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); + count_delimiters_kernel + <<>>( + tokenizer, d_offsets, chars_bytes, d_count.data()); + } // Create a vector of every delimiter position in the chars column. // These may include overlapping or otherwise out-of-bounds delimiters which diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index d72ec1085b5..e0aacf07ef0 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -71,6 +71,10 @@ struct token_reader_fn { auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; auto const d_result = d_tokens + token_offset; // store tokens here + if (nchars == 0) { + d_result[0] = string_index_pair{"", 0}; + return; + } int64_t token_idx = 0; auto itr = d_str.begin(); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 4c020cb4c29..7ece08b19f2 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -307,24 +307,46 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordAllEmpty) +TEST_F(StringsSplitTest, SplitAllEmpty) { auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); auto sv = cudf::strings_column_view(input); + auto empty = cudf::string_scalar(""); auto delimiter = cudf::string_scalar("s"); + + auto result = cudf::strings::split(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + // whitespace hits a special case where nothing matches returns an all-null column + auto expected = cudf::test::strings_column_wrapper({"", "", "", ""}, {0, 0, 0, 0}); + result = cudf::strings::split(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); + result = cudf::strings::rsplit(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected); +} + +TEST_F(StringsSplitTest, SplitRecordAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); auto empty = cudf::string_scalar(""); + auto delimiter = cudf::string_scalar("s"); using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{}, LCW{}, LCW{}, LCW{}}); + LCW expected({LCW{""}, LCW{""}, LCW{""}, LCW{""}}); + LCW expected_empty({LCW{}, LCW{}, LCW{}, LCW{}}); + auto result = cudf::strings::split_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::split_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); result = cudf::strings::rsplit_record(sv, delimiter); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); result = cudf::strings::rsplit_record(sv, empty); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty); } TEST_F(StringsSplitTest, MultiByteDelimiters) @@ -575,6 +597,23 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) } } +TEST_F(StringsSplitTest, SplitRegexAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); + auto prog = cudf::strings::regex_program::create("[ _]"); + + auto result = cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + result = cudf::strings::rsplit_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input); + + auto rec_result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); + rec_result = cudf::strings::rsplit_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input); +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a2a3e874c91..30880f074c0 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -978,6 +978,22 @@ def test_string_split_re(data, pat, n, expand): assert_eq(expect, got) +@pytest.mark.parametrize("pat", [None, "\\s+"]) +@pytest.mark.parametrize("regex", [False, True]) +@pytest.mark.parametrize("expand", [False, True]) +def test_string_split_all_empty(pat, regex, expand): + ps = pd.Series(["", "", "", ""], dtype="str") + gs = cudf.Series(["", "", "", ""], dtype="str") + + expect = ps.str.split(pat=pat, expand=expand, regex=regex) + got = gs.str.split(pat=pat, expand=expand, regex=regex) + + if isinstance(got, cudf.DataFrame): + assert_eq(expect, got, check_column_type=False) + else: + assert_eq(expect, got) + + @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] )