From b81b7da89605458625ca298b91f4a3c1719dc05b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 9 Sep 2024 16:01:56 -0400 Subject: [PATCH 1/2] Fix slice_strings wide strings logic with multibyte characters --- cpp/src/strings/slice.cu | 8 +++++--- cpp/tests/strings/slice_tests.cpp | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index d8324a9b08e..c0a94ebe64d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -122,26 +122,28 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings, break; } size_type const cc = (itr < end) && is_begin_utf8_char(*itr); - size_type const bc = (itr < end); + size_type const bc = (itr < end) ? bytes_in_utf8_byte(*itr) : 0; char_count += cg::reduce(warp, cc, cg::plus()); byte_count += cg::reduce(warp, bc, cg::plus()); itr += cudf::detail::warp_size; } + __syncwarp(); + if (warp.thread_rank() == 0) { if (start >= char_count) { d_output[str_idx] = string_index_pair{"", 0}; return; } - // we are just below start/stop and must now increment up to it from here + // we are just below start/stop and must now increment up to them from here auto first_byte = start_counts.second; if (start_counts.first < start) { auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte); first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first)); } - stop = max(stop, char_count); + stop = min(stop, char_count); auto last_byte = stop_counts.second; if (stop_counts.first < stop) { auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte); diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp index 52e439bd93f..efea6d0e011 100644 --- a/cpp/tests/strings/slice_tests.cpp +++ b/cpp/tests/strings/slice_tests.cpp @@ -268,6 +268,23 @@ TEST_F(StringsSliceTest, MaxPositions) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsSliceTest, MultiByteChars) +{ + auto input = cudf::test::strings_column_wrapper({ + // clang-format off + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "the following code snippet demonstrates how to use search for values in an ordered range " + "it returns the last position where value could be inserted without the ééééé ordering ", + "algorithms execution is parallelized as determined by an execution policy; this is a 12345" + "continuation of previous row to make sure string boundaries are honored 012345678901234567" + "01234567890é34567890012345678901234567890" + // clang-format on + }); + + auto results = cudf::strings::slice_strings(cudf::strings_column_view(input), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); +} + TEST_F(StringsSliceTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; From 8e7c30015f04072e26ea92cd3211916761b91ebc Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 11 Sep 2024 09:41:24 -0400 Subject: [PATCH 2/2] add comments about special characters --- cpp/tests/strings/slice_tests.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp index efea6d0e011..7f7fd9d521b 100644 --- a/cpp/tests/strings/slice_tests.cpp +++ b/cpp/tests/strings/slice_tests.cpp @@ -274,9 +274,11 @@ TEST_F(StringsSliceTest, MultiByteChars) // clang-format off "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " "the following code snippet demonstrates how to use search for values in an ordered range " + // this placement tests proper multi-byte chars handling ------vvvvv "it returns the last position where value could be inserted without the ééééé ordering ", "algorithms execution is parallelized as determined by an execution policy; this is a 12345" "continuation of previous row to make sure string boundaries are honored 012345678901234567" + // v--- this one also "01234567890é34567890012345678901234567890" // clang-format on });