Skip to content

Commit

Permalink
Use experimental make_strings_children for strings replace/filter/translate (#15586)
Browse files Browse the repository at this point in the history

Updates the strings replace, filter, and translate functions to use the new experimental `make_strings_children`, which supports building large strings.

Reference #15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #15586
  • Loading branch information
davidwendt authored Apr 30, 2024
1 parent f4ec1a4 commit 1fd3db8
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 32 deletions.
13 changes: 7 additions & 6 deletions cpp/src/strings/char_types/char_types.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/char_types/char_types.hpp>
#include <cudf/strings/detail/char_tables.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utf8.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
Expand Down Expand Up @@ -130,8 +130,9 @@ struct filter_chars_fn {
string_character_types const types_to_remove;
string_character_types const types_to_keep;
string_view const d_replacement; ///< optional replacement for removed characters
int32_t* d_offsets{}; ///< size of the output string stored here during first pass
char* d_chars{}; ///< this is null only during the first pass
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Returns true if the given character should be replaced.
Expand All @@ -150,7 +151,7 @@ struct filter_chars_fn {
__device__ void operator()(size_type idx)
{
if (d_column.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_column.element<string_view>(idx);
Expand All @@ -165,7 +166,7 @@ struct filter_chars_fn {
nbytes += d_newchar.size_bytes() - char_size;
if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_newchar);
}
if (!out_ptr) d_offsets[idx] = nbytes;
if (!out_ptr) { d_sizes[idx] = nbytes; }
}
};

Expand Down Expand Up @@ -202,7 +203,7 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str

// this utility calls filterer to build the offsets and chars columns
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);

// return new strings column
return make_strings_column(strings_count,
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/strings/filter_chars.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -57,8 +57,9 @@ struct filter_fn {
rmm::device_uvector<char_range>::iterator table_begin;
rmm::device_uvector<char_range>::iterator table_end;
string_view const d_replacement;
int32_t* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

/**
* @brief Return true if this character should be removed.
Expand Down Expand Up @@ -87,7 +88,7 @@ struct filter_fn {
__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_strings.element<string_view>(idx);
Expand All @@ -104,7 +105,7 @@ struct filter_fn {
else
nbytes += d_newchar.size_bytes() - char_size;
}
if (!out_ptr) d_offsets[idx] = nbytes;
if (!out_ptr) { d_sizes[idx] = nbytes; }
}
};

Expand Down Expand Up @@ -141,7 +142,7 @@ std::unique_ptr<column> filter_characters(
// this utility calls the filter_fn to build the offsets and chars columns
filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement};
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr);
cudf::strings::detail::experimental::make_strings_children(ffn, strings.size(), stream, mr);

return make_strings_column(strings_count,
std::move(offsets_column),
Expand Down
15 changes: 9 additions & 6 deletions cpp/src/strings/replace/multi.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/replace.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/replace.hpp>
Expand Down Expand Up @@ -404,13 +404,14 @@ struct replace_multi_fn {
column_device_view const d_strings;
column_device_view const d_targets;
column_device_view const d_repls;
int32_t* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) { d_offsets[idx] = 0; }
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_strings.element<string_view>(idx);
Expand Down Expand Up @@ -443,9 +444,11 @@ struct replace_multi_fn {
++spos;
}
if (out_ptr) // copy remainder
{
memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
else
d_offsets[idx] = bytes;
} else {
d_sizes[idx] = bytes;
}
}
};

Expand All @@ -459,7 +462,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
auto d_targets = column_device_view::create(targets.parent(), stream);
auto d_replacements = column_device_view::create(repls.parent(), stream);

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr);

return make_strings_column(input.size(),
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/strings/replace/replace.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/strings/detail/replace.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/replace.hpp>
Expand Down Expand Up @@ -345,13 +345,14 @@ struct replace_fn {
string_view d_target;
string_view d_replacement;
cudf::size_type maxrepl;
cudf::size_type* d_offsets{};
cudf::size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) { d_offsets[idx] = 0; }
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_strings.element<string_view>(idx);
Expand Down Expand Up @@ -384,7 +385,7 @@ struct replace_fn {
if (out_ptr) { // copy remainder
memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
} else {
d_offsets[idx] = bytes;
d_sizes[idx] = bytes;
}
}
};
Expand All @@ -398,7 +399,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
{
auto d_strings = column_device_view::create(input.parent(), stream);

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr);

return make_strings_column(input.size(),
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/strings/replace/replace_slice.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/replace.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/replace.hpp>
#include <cudf/strings/string_view.cuh>
Expand All @@ -45,13 +45,14 @@ struct replace_slice_fn {
string_view const d_repl;
size_type const start;
size_type const stop;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) { d_offsets[idx] = 0; }
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_strings.element<string_view>(idx);
Expand All @@ -69,7 +70,7 @@ struct replace_slice_fn {
in_ptr + end,
d_str.size_bytes() - end);
} else {
d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
d_sizes[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
}
}
};
Expand All @@ -94,7 +95,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& input,
auto d_strings = column_device_view::create(input.parent(), stream);

// this utility calls the given functor to build the offsets and chars columns
auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr);

return make_strings_column(input.size(),
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/strings/translate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/translate.hpp>
Expand Down Expand Up @@ -52,13 +52,14 @@ struct translate_fn {
column_device_view const d_strings;
rmm::device_uvector<translate_table>::iterator table_begin;
rmm::device_uvector<translate_table>::iterator table_end;
int32_t* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
string_view const d_str = d_strings.element<string_view>(idx);
Expand All @@ -80,7 +81,7 @@ struct translate_fn {
}
if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
}
if (!d_chars) d_offsets[idx] = bytes;
if (!d_chars) { d_sizes[idx] = bytes; }
}
};

Expand Down Expand Up @@ -111,7 +112,7 @@ std::unique_ptr<column> translate(strings_column_view const& strings,

auto d_strings = column_device_view::create(strings.parent(), stream);

auto [offsets_column, chars] = make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr);

return make_strings_column(strings.size(),
Expand Down

0 comments on commit 1fd3db8

Please sign in to comment.