From 1fd3db8b662c61b4fb04e4be07cf6ac737cef8a1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:36:26 -0400 Subject: [PATCH] Use experimental make_strings_children for strings replace/filter/translate (#15586) Updates strings replace functions to use the new experimental `make_strings_children` which supports building large strings. Reference #15579 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/15586 --- cpp/src/strings/char_types/char_types.cu | 13 +++++++------ cpp/src/strings/filter_chars.cu | 11 ++++++----- cpp/src/strings/replace/multi.cu | 15 +++++++++------ cpp/src/strings/replace/replace.cu | 11 ++++++----- cpp/src/strings/replace/replace_slice.cu | 11 ++++++----- cpp/src/strings/translate.cu | 11 ++++++----- 6 files changed, 40 insertions(+), 32 deletions(-) diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 28068cf7e78..7716cf0cc29 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -130,8 +130,9 @@ struct filter_chars_fn { string_character_types const types_to_remove; string_character_types const types_to_keep; string_view const d_replacement; ///< optional replacement for removed characters - int32_t* d_offsets{}; ///< size of the output string stored here during first pass - char* d_chars{}; ///< this is null only during the first pass + size_type* d_sizes{}; + char* d_chars{}; + cudf::detail::input_offsetalator d_offsets; /** * @brief Returns true if the given character should be replaced. @@ -150,7 +151,7 @@ struct filter_chars_fn { __device__ void operator()(size_type idx) { if (d_column.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } auto const d_str = d_column.element(idx); @@ -165,7 +166,7 @@ struct filter_chars_fn { nbytes += d_newchar.size_bytes() - char_size; if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_newchar); } - if (!out_ptr) d_offsets[idx] = nbytes; + if (!out_ptr) { d_sizes[idx] = nbytes; } } }; @@ -202,7 +203,7 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str // this utility calls filterer to build the offsets and chars columns auto [offsets_column, chars] = - cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return make_strings_column(strings_count, diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 32717dac78d..4705ae519cd 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -57,8 +57,9 @@ struct filter_fn { rmm::device_uvector::iterator table_begin; rmm::device_uvector::iterator table_end; string_view const d_replacement; - int32_t* d_offsets{}; + size_type* d_sizes{}; char* d_chars{}; + cudf::detail::input_offsetalator d_offsets; /** * @brief Return true if this character should be removed. @@ -87,7 +88,7 @@ struct filter_fn { __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } auto const d_str = d_strings.element(idx); @@ -104,7 +105,7 @@ struct filter_fn { else nbytes += d_newchar.size_bytes() - char_size; } - if (!out_ptr) d_offsets[idx] = nbytes; + if (!out_ptr) { d_sizes[idx] = nbytes; } } }; @@ -141,7 +142,7 @@ std::unique_ptr filter_characters( // this utility calls the strip_fn to build the offsets and chars columns filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement}; auto [offsets_column, chars] = - cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); + cudf::strings::detail::experimental::make_strings_children(ffn, strings.size(), stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 2eb03bd10a4..9abcca7a5e6 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -404,13 +404,14 @@ struct replace_multi_fn { column_device_view const d_strings; column_device_view const d_targets; column_device_view const d_repls; - int32_t* d_offsets{}; + size_type* d_sizes{}; char* d_chars{}; + cudf::detail::input_offsetalator d_offsets; __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) { d_offsets[idx] = 0; } + if (!d_chars) { d_sizes[idx] = 0; } return; } auto const d_str = d_strings.element(idx); @@ -443,9 +444,11 @@ struct replace_multi_fn { ++spos; } if (out_ptr) // copy remainder + { memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); - else - d_offsets[idx] = bytes; + } else { + d_sizes[idx] = bytes; + } } }; @@ -459,7 +462,7 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input auto d_targets = column_device_view::create(targets.parent(), stream); auto d_replacements = column_device_view::create(repls.parent(), stream); - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children( replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr); return make_strings_column(input.size(), diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 857bc7fb41c..df8526fa942 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include @@ -345,13 +345,14 @@ struct replace_fn { string_view d_target; string_view d_replacement; cudf::size_type maxrepl; - cudf::size_type* d_offsets{}; + cudf::size_type* d_sizes{}; char* d_chars{}; + cudf::detail::input_offsetalator d_offsets; __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) { d_offsets[idx] = 0; } + if (!d_chars) { d_sizes[idx] = 0; } return; } auto const d_str = d_strings.element(idx); @@ -384,7 +385,7 @@ struct replace_fn { if (out_ptr) { // copy remainder memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); } else { - d_offsets[idx] = bytes; + d_sizes[idx] = bytes; } } }; @@ -398,7 +399,7 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input { auto d_strings = column_device_view::create(input.parent(), stream); - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children( replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr); return make_strings_column(input.size(), diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu index 90540b39189..54e84dfe504 100644 --- a/cpp/src/strings/replace/replace_slice.cu +++ b/cpp/src/strings/replace/replace_slice.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -45,13 +45,14 @@ struct replace_slice_fn { string_view const d_repl; size_type const start; size_type const stop; - size_type* d_offsets{}; + size_type* d_sizes{}; char* d_chars{}; + cudf::detail::input_offsetalator d_offsets; __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) { d_offsets[idx] = 0; } + if (!d_chars) { d_sizes[idx] = 0; } return; } auto const d_str = d_strings.element(idx); @@ -69,7 +70,7 @@ struct replace_slice_fn { in_ptr + end, d_str.size_bytes() - end); } else { - d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin); + d_sizes[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin); } } }; @@ -94,7 +95,7 @@ std::unique_ptr replace_slice(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children( replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr); return make_strings_column(input.size(), diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index fcf55429e09..75bc46d30c4 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -52,13 +52,14 @@ struct translate_fn { column_device_view const d_strings; rmm::device_uvector::iterator table_begin; rmm::device_uvector::iterator table_end; - int32_t* d_offsets{}; + size_type* d_sizes{}; char* d_chars{}; + cudf::detail::input_offsetalator d_offsets; __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } string_view const d_str = d_strings.element(idx); @@ -80,7 +81,7 @@ struct translate_fn { } if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr); } - if (!d_chars) d_offsets[idx] = bytes; + if (!d_chars) { d_sizes[idx] = bytes; } } }; @@ -111,7 +112,7 @@ std::unique_ptr translate(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); - auto [offsets_column, chars] = make_strings_children( + auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children( translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr); return make_strings_column(strings.size(),