From 0a3dbf79ccf8900cb9ec2e4cc061e951e745b539 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 14 May 2024 19:31:40 -0400 Subject: [PATCH 1/4] Improve performance for long strings for nvtext::replace/filter functions --- cpp/src/text/replace.cu | 182 ++++++++++++++++++++++++++++--- cpp/tests/text/replace_tests.cpp | 23 ++++ 2 files changed, 188 insertions(+), 17 deletions(-) diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index f95b53a3ac8..d2e451e031d 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -34,10 +35,13 @@ #include #include +#include +#include #include #include #include #include +#include namespace nvtext { namespace detail { @@ -164,6 +168,73 @@ struct replace_tokens_fn : base_token_replacer_fn { } }; +struct replace_tokens_ls : replace_tokens_fn { + cudf::size_type const* d_indices; + cudf::size_type* d_output_sizes; + replace_tokens_ls(cudf::column_device_view const& d_strings, + cudf::string_view const& d_delimiter, + strings_iterator d_targets_begin, + strings_iterator d_targets_end, + cudf::column_device_view const& d_replacements, + cudf::size_type* d_indices, + cudf::size_type* d_output_sizes) + : replace_tokens_fn{d_strings, d_delimiter, d_targets_begin, d_targets_end, d_replacements}, + d_indices{d_indices}, + d_output_sizes{d_output_sizes} + { + } + + __device__ void operator()(cudf::size_type idx) + { + process_string( + idx, [this] __device__(cudf::string_view const& token) { return token_replacement(token); }); + if (!d_chars) { + // accumulate sub-row sizes into output row size + auto const size = d_sizes[idx]; + if (size > 0) { + auto out_idx = d_indices[idx] - 1; // adjust for upper_bound + cuda::atomic_ref ref{ + *(d_output_sizes + out_idx)}; + ref.fetch_add(size, cuda::std::memory_order_relaxed); + } + } + } +}; + +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; +constexpr cudf::size_type LS_SUB_BLOCK_SIZE 
= 64; + +struct sub_offset_fn { + char const* d_input_chars; + int64_t first_offset; + int64_t last_offset; + cudf::string_view const d_delimiter; + + __device__ int64_t operator()(int64_t idx) const + { + auto end = d_input_chars + last_offset; + auto position = (idx + 1) * LS_SUB_BLOCK_SIZE; + auto begin = d_input_chars + first_offset + position; + while ((begin < end) && + cudf::strings::detail::is_utf8_continuation_char(static_cast(*begin))) { + ++begin; + ++position; + } + if (begin >= end) { return 0; } // or last_offset + // keep delimiter search within this sub-block + end = d_input_chars + std::min(last_offset, (idx + 2) * LS_SUB_BLOCK_SIZE + first_offset); + auto tokenizer = characters_tokenizer(cudf::string_view{}, d_delimiter); + while (begin < end) { + auto chr = cudf::char_utf8{}; + auto chr_size = cudf::strings::detail::to_char_utf8(begin, chr); + if (tokenizer.is_delimiter(chr)) { break; } + begin += chr_size; + position += chr_size; + } + return (begin < end) ? position + first_offset : 0; + } +}; + /** * @brief Functor to filter tokens in each string. 
* @@ -200,7 +271,7 @@ struct remove_small_tokens_fn : base_token_replacer_fn { // detail APIs -std::unique_ptr replace_tokens(cudf::strings_column_view const& strings, +std::unique_ptr replace_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter, @@ -214,32 +285,109 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st "Parameter targets and replacements must be the same size"); CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); - cudf::size_type const strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + cudf::size_type const strings_count = input.size(); + if (strings_count == 0) { + return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + } - auto strings_column = cudf::column_device_view::create(strings.parent(), stream); + auto strings_column = cudf::column_device_view::create(input.parent(), stream); auto targets_column = cudf::column_device_view::create(targets.parent(), stream); auto replacements_column = cudf::column_device_view::create(replacements.parent(), stream); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); - replace_tokens_fn replacer{*strings_column, - d_delimiter, - targets_column->begin(), - targets_column->end(), - *replacements_column}; - // copy null mask from input column - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); + auto const first_offset = (input.offset() == 0) ? 
0L + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + + if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + replace_tokens_fn replacer{*strings_column, + d_delimiter, + targets_column->begin(), + targets_column->end(), + *replacements_column}; + // this utility calls replacer to build the offsets and chars columns + auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children( + replacer, strings_count, stream, mr); + // return new strings column + return cudf::make_strings_column(strings_count, + std::move(offsets_column), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } - // this utility calls replacer to build the offsets and chars columns - auto [offsets_column, chars] = - cudf::strings::detail::experimental::make_strings_children(replacer, strings_count, stream, mr); + auto const input_chars = input.chars_begin(stream); + auto input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - // return new strings column + auto sub_count = chars_size / LS_SUB_BLOCK_SIZE; + auto tmp_offsets = rmm::device_uvector(sub_count + input.size() + 1, stream); + { + rmm::device_uvector sub_offsets(sub_count, stream); + auto const count_itr = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy_nosync(stream), + count_itr, + count_itr + sub_count, + sub_offsets.data(), + sub_offset_fn{input_chars, first_offset, last_offset}); + + auto rend = + thrust::remove(rmm::exec_policy_nosync(stream), sub_offsets.begin(), sub_offsets.end(), 0L); + sub_count = thrust::distance(sub_offsets.begin(), rend); + sub_offsets.resize(sub_count, stream); + + // merge them with input offsets + 
thrust::merge(rmm::exec_policy_nosync(stream), + input_offsets, + input_offsets + input.size() + 1, + sub_offsets.begin(), + sub_offsets.end(), + tmp_offsets.begin()); + tmp_offsets.resize(sub_count + input.size() + 1, stream); + stream.synchronize(); // protect against destruction of sub_offsets + } + + auto const tmp_size = static_cast(tmp_offsets.size()); + auto tmp_strings = cudf::column_view( + cudf::data_type{cudf::type_id::STRING}, + tmp_size - 1, + input_chars, + nullptr, + 0, + 0, + {cudf::column_view( + cudf::data_type{cudf::type_id::INT64}, tmp_size, tmp_offsets.data(), nullptr, 0)}); + auto d_strings = cudf::column_device_view::create(tmp_strings, stream); + + auto indices = rmm::device_uvector(tmp_offsets.size(), stream); + thrust::upper_bound(rmm::exec_policy_nosync(stream), + input_offsets, + input_offsets + input.size() + 1, + tmp_offsets.begin(), + tmp_offsets.end(), + indices.begin()); + + auto d_sizes = rmm::device_uvector(input.size(), stream); + thrust::fill(rmm::exec_policy_nosync(stream), d_sizes.begin(), d_sizes.end(), 0); + replace_tokens_ls replacer{*d_strings, + d_delimiter, + targets_column->begin(), + targets_column->end(), + *replacements_column, + indices.data(), + d_sizes.data()}; + auto chars = std::get<1>(cudf::strings::detail::experimental::make_strings_children( + replacer, tmp_strings.size(), stream, mr)); + auto offsets_column = std::get<0>( + cudf::strings::detail::make_offsets_child_column(d_sizes.begin(), d_sizes.end(), stream, mr)); return cudf::make_strings_column(strings_count, std::move(offsets_column), chars.release(), - strings.null_count(), - std::move(null_mask)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, diff --git a/cpp/tests/text/replace_tests.cpp b/cpp/tests/text/replace_tests.cpp index 8c58c6bcaca..de59ee45e38 100644 --- a/cpp/tests/text/replace_tests.cpp +++ b/cpp/tests/text/replace_tests.cpp @@ 
-17,6 +17,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,28 @@ TEST_F(TextReplaceTest, ReplaceTokensEmptyTest) EXPECT_EQ(results->has_nulls(), false); } +TEST_F(TextReplaceTest, ReplaceTokensLongStrings) +{ + cudf::test::strings_column_wrapper input{ + "pellentesque ut euismod semo phaselus tristiut libero ut dui congusem non pellentesque nunc ", + "pellentesque ut euismod se phaselus tristiut libero ut dui congusem non pellentesque ", + "pellentesque ut euismod phaselus tristiut libero ut dui congusem non pellentesque nun ", + "pellentesque ut euismod seem phaselus tristiut libero ut dui congusem non pellentesque un "}; + cudf::test::strings_column_wrapper targets({"ut", "pellentesque"}); + cudf::test::strings_column_wrapper repls({"___", "é"}); + + auto expected = cudf::test::strings_column_wrapper{ + "é ___ euismod semo phaselus tristiut libero ___ dui congusem non é nunc ", + "é ___ euismod se phaselus tristiut libero ___ dui congusem non é ", + "é ___ euismod phaselus tristiut libero ___ dui congusem non é nun ", + "é ___ euismod seem phaselus tristiut libero ___ dui congusem non é un "}; + + auto results = nvtext::replace_tokens(cudf::strings_column_view(input), + cudf::strings_column_view(targets), + cudf::strings_column_view(repls)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextReplaceTest, ReplaceTokensErrorTest) { auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); From f48936a34d9df81b9b7fe9bea09f0bff1380471e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 16 May 2024 18:03:33 -0400 Subject: [PATCH 2/4] refactor code for filter-tokens --- cpp/src/text/replace.cu | 233 ++++++++++++++++++---------------------- 1 file changed, 104 insertions(+), 129 deletions(-) diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 2cb7211e546..bca30e14f9a 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -29,7 +29,6 @@ #include #include -#include #include 
#include @@ -50,11 +49,13 @@ namespace { using replace_result = thrust::pair; struct base_token_replacer_fn { - cudf::column_device_view const d_strings; ///< strings to tokenize - cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing - cudf::size_type* d_sizes{}; ///< for output string size - char* d_chars{}; ///< output buffer - cudf::detail::input_offsetalator d_offsets; + cudf::column_device_view d_strings; ///< strings to tokenize + cudf::string_view const d_delimiter; ///< delimiter characters for tokenizing + cudf::size_type* d_sizes{}; ///< for output string size + char* d_chars{}; ///< output buffer + cudf::detail::input_offsetalator d_offsets; ///< offsets for output buffer + cudf::size_type const* d_indices{}; ///< indices for long strings + cudf::size_type* d_output_sizes{}; ///< output sizes for long strings /** * @brief Tokenizes each string and calls the provided `replacer` function @@ -65,7 +66,7 @@ struct base_token_replacer_fn { * @param replacer Function to call for each token to determined its replacement */ template - __device__ void process_string(cudf::size_type idx, ReplaceFn replacer) + __device__ void process_string(cudf::size_type idx, ReplaceFn replacer) const { if (d_strings.is_null(idx)) { if (!d_chars) { d_sizes[idx] = 0; } @@ -104,6 +105,13 @@ struct base_token_replacer_fn { memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos); } else { d_sizes[idx] = nbytes; + // handles output size calculation for long strings + if (nbytes > 0 && d_indices) { + auto out_idx = d_indices[idx] - 1; // adjust for upper_bound + cuda::atomic_ref ref{ + *(d_output_sizes + out_idx)}; + ref.fetch_add(nbytes, cuda::std::memory_order_relaxed); + } } } }; @@ -123,7 +131,7 @@ using strings_iterator = cudf::column_device_view::const_iterator 0) { - auto out_idx = d_indices[idx] - 1; // adjust for upper_bound - cuda::atomic_ref ref{ - *(d_output_sizes + out_idx)}; - ref.fetch_add(size, cuda::std::memory_order_relaxed); - } - } 
- } -}; - +// For determining long strings processing constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; -constexpr cudf::size_type LS_SUB_BLOCK_SIZE = 64; +// For computing sub-block sizes of long strings +constexpr cudf::size_type LS_SUB_BLOCK_SIZE = 64; +/** + * @brief Locate delimiters to produce sub-offsets in the input device array + * + * The sub-offsets provide additional tokenize boundaries within longer strings. + */ struct sub_offset_fn { char const* d_input_chars; int64_t first_offset; @@ -214,24 +196,24 @@ struct sub_offset_fn { { auto end = d_input_chars + last_offset; auto position = (idx + 1) * LS_SUB_BLOCK_SIZE; - auto begin = d_input_chars + first_offset + position; - while ((begin < end) && - cudf::strings::detail::is_utf8_continuation_char(static_cast(*begin))) { - ++begin; + auto itr = d_input_chars + first_offset + position; + while ((itr < end) && + cudf::strings::detail::is_utf8_continuation_char(static_cast(*itr))) { + ++itr; ++position; } - if (begin >= end) { return 0; } // or last_offset + if (itr >= end) { return 0; } // 0s will be filtered out // keep delimiter search within this sub-block end = d_input_chars + std::min(last_offset, (idx + 2) * LS_SUB_BLOCK_SIZE + first_offset); auto tokenizer = characters_tokenizer(cudf::string_view{}, d_delimiter); - while (begin < end) { + while (itr < end) { auto chr = cudf::char_utf8{}; - auto chr_size = cudf::strings::detail::to_char_utf8(begin, chr); + auto chr_size = cudf::strings::detail::to_char_utf8(itr, chr); if (tokenizer.is_delimiter(chr)) { break; } - begin += chr_size; + itr += chr_size; position += chr_size; } - return (begin < end) ? position + first_offset : 0; + return (itr < end) ? 
position + first_offset : 0; } }; @@ -258,43 +240,24 @@ struct remove_small_tokens_fn : base_token_replacer_fn { { } - __device__ void operator()(cudf::size_type idx) + __device__ replace_result token_replacement(cudf::string_view token) const { - auto replacer = [this] __device__(cudf::string_view const& token) { - return replace_result{token.length() < min_token_length, d_replacement}; - }; - process_string(idx, replacer); + return replace_result{token.length() < min_token_length, d_replacement}; } -}; -} // namespace - -// detail APIs + __device__ void operator()(cudf::size_type idx) const + { + process_string( + idx, [this] __device__(cudf::string_view const& token) { return token_replacement(token); }); + } +}; -std::unique_ptr replace_tokens(cudf::strings_column_view const& input, - cudf::strings_column_view const& targets, - cudf::strings_column_view const& replacements, - cudf::string_scalar const& delimiter, +template +std::unique_ptr replace_helper(ReplacerFn replacer, + cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(!targets.has_nulls(), "Parameter targets must not have nulls"); - CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have nulls"); - if (replacements.size() != 1) - CUDF_EXPECTS(replacements.size() == targets.size(), - "Parameter targets and replacements must be the same size"); - CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); - - cudf::size_type const strings_count = input.size(); - if (strings_count == 0) { - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - } - - auto strings_column = cudf::column_device_view::create(input.parent(), stream); - auto targets_column = cudf::column_device_view::create(targets.parent(), stream); - auto replacements_column = cudf::column_device_view::create(replacements.parent(), stream); - cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); - 
auto const first_offset = (input.offset() == 0) ? 0L : cudf::strings::detail::get_offset_value( input.offsets(), input.offset(), stream); @@ -303,16 +266,11 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& in auto const chars_size = last_offset - first_offset; if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { - replace_tokens_fn replacer{*strings_column, - d_delimiter, - targets_column->begin(), - targets_column->end(), - *replacements_column}; // this utility calls replacer to build the offsets and chars columns auto [offsets_column, chars] = - cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); + cudf::strings::detail::make_strings_children(replacer, input.size(), stream, mr); // return new strings column - return cudf::make_strings_column(strings_count, + return cudf::make_strings_column(input.size(), std::move(offsets_column), chars.release(), input.null_count(), @@ -320,9 +278,10 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& in } auto const input_chars = input.chars_begin(stream); - auto input_offsets = + auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + // divide up long strings into shorter strings by finding new sub-offsets at delimiters auto sub_count = chars_size / LS_SUB_BLOCK_SIZE; auto tmp_offsets = rmm::device_uvector(sub_count + input.size() + 1, stream); { @@ -350,18 +309,15 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& in stream.synchronize(); // protect against destruction of sub_offsets } - auto const tmp_size = static_cast(tmp_offsets.size()); - auto tmp_strings = cudf::column_view( - cudf::data_type{cudf::type_id::STRING}, - tmp_size - 1, - input_chars, - nullptr, - 0, - 0, - {cudf::column_view( - cudf::data_type{cudf::type_id::INT64}, tmp_size, tmp_offsets.data(), nullptr, 0)}); - auto d_strings = cudf::column_device_view::create(tmp_strings, stream); 
+ // cobble together a column_view of type STRING + auto const tmp_size = static_cast(tmp_offsets.size()) - 1; + auto const children = std::vector({cudf::column_view( + cudf::data_type{cudf::type_id::INT64}, tmp_size + 1, tmp_offsets.data(), nullptr, 0)}); + auto const tmp_strings = cudf::column_view( + cudf::data_type{cudf::type_id::STRING}, tmp_size, input_chars, nullptr, 0, 0, children); + auto const d_tmp_strings = cudf::column_device_view::create(tmp_strings, stream); + // compute indices of actual output rows auto indices = rmm::device_uvector(tmp_offsets.size(), stream); thrust::upper_bound(rmm::exec_policy_nosync(stream), input_offsets, @@ -372,25 +328,56 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& in auto d_sizes = rmm::device_uvector(input.size(), stream); thrust::fill(rmm::exec_policy_nosync(stream), d_sizes.begin(), d_sizes.end(), 0); - replace_tokens_ls replacer{*d_strings, - d_delimiter, - targets_column->begin(), - targets_column->end(), - *replacements_column, - indices.data(), - d_sizes.data()}; + + replacer.d_strings = *d_tmp_strings; + replacer.d_indices = indices.data(); + replacer.d_output_sizes = d_sizes.data(); + auto chars = std::get<1>( cudf::strings::detail::make_strings_children(replacer, tmp_strings.size(), stream, mr)); auto offsets_column = std::get<0>( cudf::strings::detail::make_offsets_child_column(d_sizes.begin(), d_sizes.end(), stream, mr)); - return cudf::make_strings_column(strings_count, + return cudf::make_strings_column(input.size(), std::move(offsets_column), chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } +} // namespace + +// detail APIs + +std::unique_ptr replace_tokens(cudf::strings_column_view const& input, + cudf::strings_column_view const& targets, + cudf::strings_column_view const& replacements, + cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!targets.has_nulls(), 
"Parameter targets must not have nulls"); + CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have nulls"); + if (replacements.size() != 1) + CUDF_EXPECTS(replacements.size() == targets.size(), + "Parameter targets and replacements must be the same size"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); + + if (input.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const d_targets = cudf::column_device_view::create(targets.parent(), stream); + auto const d_replacements = cudf::column_device_view::create(replacements.parent(), stream); + auto const d_delimiter = cudf::string_view(delimiter.data(), delimiter.size()); + + replace_tokens_fn replacer{*d_strings, + d_delimiter, + d_targets->begin(), + d_targets->end(), + *d_replacements}; + + return replace_helper(replacer, input, stream, mr); +} -std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, +std::unique_ptr filter_tokens(cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement, cudf::string_scalar const& delimiter, @@ -400,27 +387,15 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); - cudf::size_type const strings_count = strings.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - - auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - cudf::string_view d_replacement(replacement.data(), replacement.size()); - cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); - remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement}; + if (input.is_empty()) { return 
cudf::make_empty_column(cudf::type_id::STRING); } - // copy null mask from input column - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const d_replacement = cudf::string_view(replacement.data(), replacement.size()); + auto const d_delimiter = cudf::string_view(delimiter.data(), delimiter.size()); - // this utility calls filterer to build the offsets and chars columns - auto [offsets_column, chars] = - cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + remove_small_tokens_fn filterer{*d_strings, d_delimiter, min_token_length, d_replacement}; - // return new strings column - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - chars.release(), - strings.null_count(), - std::move(null_mask)); + return replace_helper(filterer, input, stream, mr); } } // namespace detail From 870868539244a43f146a4b2fff04dfdf22d7d949 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 21 May 2024 14:34:36 -0400 Subject: [PATCH 3/4] remove unneeded include --- cpp/tests/text/replace_tests.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/text/replace_tests.cpp b/cpp/tests/text/replace_tests.cpp index de59ee45e38..faced4a14d3 100644 --- a/cpp/tests/text/replace_tests.cpp +++ b/cpp/tests/text/replace_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include From eb86b1fc35fa16e27bc5909ee144868f2440935f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 22 May 2024 10:42:01 -0400 Subject: [PATCH 4/4] fixup headers; removed unneeded var --- cpp/src/text/replace.cu | 46 +++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index bca30e14f9a..81c787caf86 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -194,26 +194,25 @@ struct sub_offset_fn 
{ __device__ int64_t operator()(int64_t idx) const { - auto end = d_input_chars + last_offset; - auto position = (idx + 1) * LS_SUB_BLOCK_SIZE; - auto itr = d_input_chars + first_offset + position; + // keep delimiter search within this sub-block + auto const end = + d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset); + // starting point of this sub-block + auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE); while ((itr < end) && cudf::strings::detail::is_utf8_continuation_char(static_cast(*itr))) { ++itr; - ++position; } if (itr >= end) { return 0; } // 0s will be filtered out - // keep delimiter search within this sub-block - end = d_input_chars + std::min(last_offset, (idx + 2) * LS_SUB_BLOCK_SIZE + first_offset); + // now check for a delimiter in this block auto tokenizer = characters_tokenizer(cudf::string_view{}, d_delimiter); while (itr < end) { auto chr = cudf::char_utf8{}; auto chr_size = cudf::strings::detail::to_char_utf8(itr, chr); if (tokenizer.is_delimiter(chr)) { break; } itr += chr_size; - position += chr_size; } - return (itr < end) ? position + first_offset : 0; + return (itr < end) ? thrust::distance(d_input_chars, itr) : 0L; } }; @@ -252,6 +251,19 @@ struct remove_small_tokens_fn : base_token_replacer_fn { } }; +/** + * @brief Common code for replace and filter + * + * Builds the output strings column using the given replace functor. 
+ * + * @tparam ReplacerFn Functor called for replacing tokens + * + * @param replacer Functor for determining matching token and its replacement + * @param input Strings column to tokenize and replace + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with replaced strings + */ template std::unique_ptr replace_helper(ReplacerFn replacer, cudf::strings_column_view const& input, @@ -277,6 +289,10 @@ std::unique_ptr replace_helper(ReplacerFn replacer, cudf::detail::copy_bitmask(input.parent(), stream, mr)); } + // Long strings logic builds a new fake strings column with the same data but additional offsets + // thus converting the input to a larger column of smaller strings. + // This can be processed in parallel more efficiently than long strings in general. + auto const input_chars = input.chars_begin(stream); auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); @@ -292,24 +308,23 @@ std::unique_ptr replace_helper(ReplacerFn replacer, count_itr + sub_count, sub_offsets.data(), sub_offset_fn{input_chars, first_offset, last_offset}); - - auto rend = + // remove 0s -- where sub-offset could not be computed + auto const remove_end = thrust::remove(rmm::exec_policy_nosync(stream), sub_offsets.begin(), sub_offsets.end(), 0L); - sub_count = thrust::distance(sub_offsets.begin(), rend); - sub_offsets.resize(sub_count, stream); + sub_count = thrust::distance(sub_offsets.begin(), remove_end); // merge them with input offsets thrust::merge(rmm::exec_policy_nosync(stream), input_offsets, input_offsets + input.size() + 1, sub_offsets.begin(), - sub_offsets.end(), + sub_offsets.begin() + sub_count, tmp_offsets.begin()); tmp_offsets.resize(sub_count + input.size() + 1, stream); stream.synchronize(); // protect against destruction of sub_offsets } - // cobble 
together a column_view of type STRING + // cobble together a column_view of type STRING using the original data and the tmp offsets auto const tmp_size = static_cast(tmp_offsets.size()) - 1; auto const children = std::vector({cudf::column_view( cudf::data_type{cudf::type_id::INT64}, tmp_size + 1, tmp_offsets.data(), nullptr, 0)}); @@ -317,7 +332,7 @@ std::unique_ptr replace_helper(ReplacerFn replacer, cudf::data_type{cudf::type_id::STRING}, tmp_size, input_chars, nullptr, 0, 0, children); auto const d_tmp_strings = cudf::column_device_view::create(tmp_strings, stream); - // compute indices of actual output rows + // compute indices to the actual output rows auto indices = rmm::device_uvector(tmp_offsets.size(), stream); thrust::upper_bound(rmm::exec_policy_nosync(stream), input_offsets, @@ -326,6 +341,7 @@ std::unique_ptr replace_helper(ReplacerFn replacer, tmp_offsets.end(), indices.begin()); + // initialize the output row sizes auto d_sizes = rmm::device_uvector(input.size(), stream); thrust::fill(rmm::exec_policy_nosync(stream), d_sizes.begin(), d_sizes.end(), 0);