From 5d311ea76ddc8bdbb357b6afdf64dfce6ece39a7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 5 Oct 2023 13:21:31 -0400 Subject: [PATCH] Fix strings replace for adjacent, identical multi-byte UTF-8 character targets (#14235) Fixes bug that can occur when replacing all occurrences in a string using a multi-byte UTF-8 target when the target matches sequentially in the same string -- some characters were missed. Specialized gtest is also added. Found while working on #13891 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14235 --- cpp/src/strings/replace/replace.cu | 2 +- cpp/tests/strings/replace_tests.cpp | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index a622d1a742d..acc1502f4d6 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -97,7 +97,7 @@ struct replace_row_parallel_fn { } else { bytes += d_repl.size_bytes() - d_target.size_bytes(); } - position = d_str.find(d_target, position + d_target.size_bytes()); + position = d_str.find(d_target, position + d_target.length()); --max_n; } if (out_ptr) // copy whats left (or right depending on your point of view) diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index f143983aded..f04bb832f09 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget) +{ + auto input = cudf::test::strings_column_wrapper({"ééééééé", "eéeéeée", "eeeeeee"}); + auto strings_view = cudf::strings_column_view(input); + // replace all occurrences of 'é' with 'e' + cudf::test::strings_column_wrapper expected({"eeeeeee", "eeeeeee", "eeeeeee"}); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + auto target = cudf::string_scalar("é", true, stream); + auto repl = cudf::string_scalar("e", true, stream); + auto results = cudf::strings::replace(strings_view, target, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceTest, ReplaceSlice) { std::vector h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""};