diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index a7db972d39f..cd4d3ca964b 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -75,5 +75,5 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 77c014301ba..c1688d20791 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -237,13 +238,16 @@ CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter, auto const d_str = d_strings.element(str_idx); auto const str_ptr = d_str.data(); + // each thread processes 4 bytes size_type size = 0; - for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) { - auto const chr = str_ptr[i]; - if (is_utf8_continuation_char(chr)) { continue; } - char_utf8 u8 = 0; - to_char_utf8(str_ptr + i, u8); - size += converter.process_character(u8); + for (auto i = lane_idx * 4; i < d_str.size_bytes(); i += cudf::detail::warp_size * 4) { + for (auto j = i; (j < (i + 4)) && (j < d_str.size_bytes()); j++) { + auto const chr = str_ptr[j]; + if (is_utf8_continuation_char(chr)) { continue; } + char_utf8 u8 = 0; + to_char_utf8(str_ptr + j, u8); + size += converter.process_character(u8); + } } // this is slightly faster than using the cub::warp_reduce if (size > 0) { @@ -260,6 +264,41 @@ struct ascii_converter_fn { __device__ char operator()(char chr) { return converter.process_ascii(chr); } }; +constexpr int64_t block_size = 512; +constexpr int64_t bytes_per_thread = 8; + +/** + * @brief Checks the chars data for any multibyte characters + * + * The output count is not accurate but it is only checked for > 0. + */ +CUDF_KERNEL void has_multibytes_kernel(char const* d_input_chars, + int64_t first_offset, + int64_t last_offset, + int64_t* d_output) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + // read only every 2nd byte; all bytes in a multibyte char have high bit set + auto const byte_idx = (static_cast(idx) * bytes_per_thread) + first_offset; + auto const lane_idx = static_cast(threadIdx.x); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + // each thread processes 8 bytes (only 4 need to be checked) + int64_t mb_count = 0; + for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < last_offset); i += 2) { + u_char const chr = static_cast(d_input_chars[i]); + mb_count += ((chr & 0x80) > 0); + } + auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cub::Sum()); + + if ((lane_idx == 0) && (mb_total > 0)) { + cuda::atomic_ref ref{*d_output}; + ref.fetch_add(mb_total, cuda::std::memory_order_relaxed); + } +} + /** * @brief Utility method for converting upper and lower case characters * in a strings column @@ -289,7 +328,8 @@ std::unique_ptr convert_case(strings_column_view const& input, input.offsets(), input.offset(), stream); auto const last_offset = cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); - auto const chars_size = last_offset - first_offset; + auto const chars_size = last_offset - first_offset; + auto const input_chars = input.chars_begin(stream); convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special}; upper_lower_fn converter{ccfn, *d_strings}; @@ -306,16 +346,15 @@ std::unique_ptr convert_case(strings_column_view const& input, // Check if the input contains any multi-byte characters. // This check incurs ~20% performance hit for smaller strings and so we only use it - // after the threshold check above. The check makes very little impact for larger strings + // after the threshold check above. The check makes very little impact for long strings // but results in a large performance gain when the input contains only single-byte characters. - // The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016 - bool const multi_byte_chars = - thrust::count_if(rmm::exec_policy(stream), - input.chars_begin(stream), - input.chars_end(stream), - cuda::proclaim_return_type( - [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0; - if (!multi_byte_chars) { + rmm::device_scalar mb_count(0, stream); + // cudf::detail::grid_1d is limited to size_type elements + auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size); + // we only need to check every other byte since either will contain high bit + has_multibytes_kernel<<>>( + input_chars, first_offset, last_offset, mb_count.data()); + if (mb_count.value(stream) == 0) { // optimization for ASCII-only case: copy the input column and inplace replace each character auto result = std::make_unique(input.parent(), stream, mr); auto d_chars = result->mutable_view().head(); @@ -329,21 +368,21 @@ std::unique_ptr convert_case(strings_column_view const& input, // note: tried to use segmented-reduce approach instead here and it was consistently slower auto [offsets, bytes] = [&] { rmm::device_uvector sizes(input.size(), stream); - constexpr int block_size = 512; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; - count_bytes_kernel<<>>( + // cudf::detail::grid_1d is limited to size_type threads + auto const num_blocks = util::div_rounding_up_safe( + static_cast(input.size()) * cudf::detail::warp_size, block_size); + count_bytes_kernel<<>>( ccfn, *d_strings, sizes.data()); // convert sizes to offsets return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr); }(); // build sub-offsets - auto const input_chars = input.chars_begin(stream); - auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE; - auto tmp_offsets = rmm::device_uvector(sub_count + input.size() + 1, stream); + auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE; + auto tmp_offsets = rmm::device_uvector(sub_count + input.size() + 1, stream); { - rmm::device_uvector sub_offsets(sub_count, stream); - auto const count_itr = thrust::make_counting_iterator(0); + rmm::device_uvector sub_offsets(sub_count, stream); + auto const count_itr = thrust::make_counting_iterator(0); thrust::transform(rmm::exec_policy_nosync(stream), count_itr, count_itr + sub_count, @@ -359,6 +398,7 @@ std::unique_ptr convert_case(strings_column_view const& input, sub_offsets.begin(), sub_offsets.end(), tmp_offsets.begin()); + stream.synchronize(); // protect against destruction of sub_offsets } // run case conversion over the new sub-strings diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 5daacbdc2fa..7622e39e735 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -265,15 +265,15 @@ std::unique_ptr concatenate(host_span columns, // Use a heuristic to guess when the fused kernel will be faster than memcpy if (use_fused_kernel_heuristic(has_nulls, total_bytes, columns.size())) { // Use single kernel launch to copy chars columns - constexpr size_type block_size{256}; - cudf::detail::grid_1d config(total_bytes, block_size); - auto const kernel = fused_concatenate_string_chars_kernel; - kernel<<>>( - d_views, - d_partition_offsets.data(), - static_cast(columns.size()), - total_bytes, - d_new_chars); + constexpr size_t block_size{256}; + // cudf::detail::grid_1d limited to size_type elements + auto const num_blocks = util::div_rounding_up_safe(total_bytes, block_size); + auto const kernel = fused_concatenate_string_chars_kernel; + kernel<<>>(d_views, + d_partition_offsets.data(), + static_cast(columns.size()), + total_bytes, + d_new_chars); } else { // Memcpy each input chars column (more efficient for very large strings) for (auto column = columns.begin(); column != columns.end(); ++column) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e779e1d1410..d3eae8011d8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -572,9 +572,10 @@ ConfigureTest( # * large strings test ---------------------------------------------------------------------------- ConfigureTest( LARGE_STRINGS_TEST + large_strings/concatenate_tests.cpp + large_strings/case_tests.cpp large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp - large_strings/concatenate_tests.cpp large_strings/parquet_tests.cpp large_strings/reshape_tests.cpp GPUS 1 diff --git a/cpp/tests/large_strings/case_tests.cpp b/cpp/tests/large_strings/case_tests.cpp new file mode 100644 index 00000000000..e56d984421a --- /dev/null +++ b/cpp/tests/large_strings/case_tests.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "large_strings_fixture.hpp" + +#include + +#include +#include +#include +#include + +#include + +struct CaseTest : public cudf::test::StringsLargeTest {}; + +TEST_F(CaseTest, ToLower) +{ + auto const wide = this->wide_column(); + auto input = cudf::concatenate(std::vector(120000, wide)); // 230MB + auto expected = cudf::strings::to_lower(cudf::strings_column_view(input->view())); + + int const multiplier = 12; + std::vector input_cols(multiplier, input->view()); + std::vector splits; + std::generate_n(std::back_inserter(splits), multiplier - 1, [&input, n = 1]() mutable { + return input->view().size() * (n++); + }); + + auto large_input = cudf::concatenate(input_cols); // 2700MB > 2GB + auto const sv = cudf::strings_column_view(large_input->view()); + auto result = cudf::strings::to_lower(sv); + + // verify results in sections + auto sliced = cudf::split(result->view(), splits); + for (auto c : sliced) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected->view()); + } +} diff --git a/cpp/tests/large_strings/concatenate_tests.cpp b/cpp/tests/large_strings/concatenate_tests.cpp index aa445bf761b..89be2c307bf 100644 --- a/cpp/tests/large_strings/concatenate_tests.cpp +++ b/cpp/tests/large_strings/concatenate_tests.cpp @@ -63,3 +63,16 @@ TEST_F(ConcatenateTest, ConcatenateVertical) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input); } } + +TEST_F(ConcatenateTest, ManyColumns) +{ + auto input = this->wide_column(); + auto view = cudf::column_view(input); + int const multiplier = 1200000; + std::vector input_cols(multiplier, view); // 2500MB > 2GB + // this tests a unique path through the code + auto result = cudf::concatenate(input_cols); + auto sv = cudf::strings_column_view(result->view()); + EXPECT_EQ(sv.size(), view.size() * multiplier); + EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64}); +}