From c0b30fe407019b0f5aa3c456124a453eb0d0d9cc Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 4 Sep 2024 17:03:40 +0800 Subject: [PATCH] Fix comments Optimize warp parallel --- cpp/benchmarks/string/find.cpp | 11 -- cpp/include/cudf/strings/find.hpp | 3 + cpp/src/strings/search/find.cu | 183 ++++++++++++++++-------------- 3 files changed, 100 insertions(+), 97 deletions(-) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 6c19771b737..8c9c4d60f0c 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -74,7 +74,6 @@ static void bench_find_string(nvbench::state& state) constexpr bool combine = false; // test true/false bool has_same_target_first_char = false; // test true/false constexpr int iters = 10; // test 4/10 - bool check_result = false; std::vector match_targets({" abc", "W43", @@ -102,16 +101,6 @@ static void bench_find_string(nvbench::state& state) cudf::strings::contains(input, cudf::string_scalar(multi_targets[i]))); contains_cvs.emplace_back(contains_results.back()->view()); } - - if (check_result) { - cudf::test::strings_column_wrapper multi_targets_column(multi_targets.begin(), - multi_targets.end()); - auto tab = - cudf::strings::multi_contains(input, cudf::strings_column_view(multi_targets_column)); - for (int i = 0; i < tab->num_columns(); i++) { - cudf::test::detail::expect_columns_equal(contains_cvs[i], tab->get_column(i).view()); - } - } }); } else { // combine state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index f86203cb46a..cadec213d70 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -153,11 +153,14 @@ std::unique_ptr contains( * * Any null string entries return corresponding null entries in the output columns. * e.g.: + * @code * input: "a", "b", "c" * targets: "a", "c" * output is a table with two boolean columns: * column_0: true, false, false * column_1: false, false, true + * @endcode + * * @param input Strings instance for this operation * @param targets UTF-8 encoded strings to search for in each string in `input` * @param stream CUDA stream used for device memory operations and kernel launches diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index ce4c2673861..48aa39c747a 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -418,45 +418,94 @@ std::unique_ptr contains_warp_parallel(strings_column_view const& input, return results; } -CUDF_KERNEL void multi_contains_warp_parallel_multi_scalars_fn(column_device_view const d_strings, - column_device_view const d_targets, - cudf::device_span d_results) +/** + * Each string uses a warp(32 threads) to handle all the targets. + * Each thread uses num_targets bools shared memory to store temp result for each lane. + */ +CUDF_KERNEL void multi_contains_warp_parallel_multi_scalars_fn( + column_device_view const d_strings, + column_device_view const d_targets, + cudf::device_span const d_target_first_bytes, + column_device_view const d_target_indexes_for_first_bytes, + cudf::device_span d_results) { auto const num_targets = d_targets.size(); auto const num_rows = d_strings.size(); - auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; - + auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx >= (num_rows * cudf::detail::warp_size)) { return; } auto const lane_idx = idx % cudf::detail::warp_size; auto const str_idx = idx / cudf::detail::warp_size; if (d_strings.is_null(str_idx)) { return; } // bitmask will set result to null. - // get the string for this warp auto const d_str = d_strings.element(str_idx); - for (size_t target_idx = 0; target_idx < num_targets; target_idx++) { - // Identify the target. + /** + * size of shared_bools = Min(targets_size * block_size, target_group * block_size) + * each thread uses targets_size bools + */ + extern __shared__ bool shared_bools[]; + + // initialize temp result: + // set true if target is empty, set false otherwise + for (int target_idx = 0; target_idx < num_targets; target_idx++) { auto const d_target = d_targets.element(target_idx); + shared_bools[threadIdx.x * num_targets + target_idx] = d_target.size_bytes() == 0; + } - // each thread of the warp will check just part of the string - auto found = false; - if (d_target.empty()) { - found = true; - } else { - for (auto i = static_cast(lane_idx); - !found && ((i + d_target.size_bytes()) <= d_str.size_bytes()); - i += cudf::detail::warp_size) { - // check the target matches this part of the d_str data - if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; } + for (size_type str_byte_idx = lane_idx; str_byte_idx < d_str.size_bytes(); + str_byte_idx += cudf::detail::warp_size) { + // 1. check the first chars using binary search on first char set + char c = *(d_str.data() + str_byte_idx); + auto first_byte_ptr = + thrust::lower_bound(thrust::seq, d_target_first_bytes.begin(), d_target_first_bytes.end(), c); + if (not(first_byte_ptr != d_target_first_bytes.end() && *first_byte_ptr == c)) { + // first char is not matched for all targets, already set result as found + continue; + } + + // 2. check the 2nd chars + int first_char_index_in_list = first_byte_ptr - d_target_first_bytes.begin(); + // get possible targets + auto const possible_targets_list = + cudf::list_device_view{d_target_indexes_for_first_bytes, first_char_index_in_list}; + for (auto list_idx = 0; list_idx < possible_targets_list.size(); + ++list_idx) { // iterate possible targets + auto target_idx = possible_targets_list.element(list_idx); + int temp_result_idx = threadIdx.x * num_targets + target_idx; + if (!shared_bools[temp_result_idx]) { // not found before + auto const d_target = d_targets.element(target_idx); + if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { + // first char already checked, only need to check the [2nd, end) chars if has. + bool found = true; + for (auto i = 1; i < d_target.size_bytes(); i++) { + if (*(d_str.data() + str_byte_idx + i) != *(d_target.data() + i)) { + found = false; + break; + } + } + if (found) { shared_bools[temp_result_idx] = true; } + } + } + } + } + + // wait all lanes are done in a warp + __syncwarp(); + + if (lane_idx == 0) { + for (int target_idx = 0; target_idx < num_targets; target_idx++) { + bool found = false; + for (int lane_idx = 0; lane_idx < cudf::detail::warp_size; lane_idx++) { + bool temp_idx = (str_idx * cudf::detail::warp_size + lane_idx) * num_targets + target_idx; + if (shared_bools[temp_idx]) { + found = true; + break; + } } + d_results[target_idx][str_idx] = found; } - __syncwarp(); - auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); - if (lane_idx == 0) { d_results[target_idx][str_idx] = result; } } } @@ -483,7 +532,7 @@ CUDF_KERNEL void multi_contains_using_indexes_fn( for (auto str_byte_idx = 0; str_byte_idx < d_str.size_bytes(); ++str_byte_idx) { // iterate the start index in the string - // binary search in the target first char set. + // 1. check the first chars using binary search on first char set char c = *(d_str.data() + str_byte_idx); auto first_byte_ptr = thrust::lower_bound(thrust::seq, d_target_first_bytes.begin(), d_target_first_bytes.end(), c); @@ -499,8 +548,9 @@ CUDF_KERNEL void multi_contains_using_indexes_fn( auto const possible_targets_list = cudf::list_device_view{d_target_indexes_for_first_bytes, first_char_index_in_list}; - for (auto i = 0; i < possible_targets_list.size(); ++i) { // iterate possible targets - auto target_idx = possible_targets_list.element(i); + for (auto list_idx = 0; list_idx < possible_targets_list.size(); + ++list_idx) { // iterate possible targets + auto target_idx = possible_targets_list.element(list_idx); if (!d_results[target_idx][str_idx]) { // not found before auto const d_target = d_targets.element(target_idx); if (d_str.size_bytes() - str_byte_idx >= d_target.size_bytes()) { @@ -540,11 +590,11 @@ CUDF_KERNEL void multi_contains_using_indexes_fn( * if char in string is 'a', then only need to try ["ac", "ad", "af"] targets. * */ -std::vector> multi_contains_using_indexes( - strings_column_view const& input, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector> multi_contains(bool warp_parallel, + strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_targets = static_cast(targets.size()); CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); @@ -644,59 +694,20 @@ std::vector> multi_contains_using_indexes( constexpr int block_size = 256; cudf::detail::grid_1d grid{input.size(), block_size}; - multi_contains_using_indexes_fn<<>>( - *d_strings, *d_targets, d_first_bytes, *d_list_column, device_results_list); - return results_list; -} - -/** - * Execute multi contains for long strings - */ -std::vector> multi_contains_using_warp_parallel( - strings_column_view const& input, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_targets = static_cast(targets.size()); - CUDF_EXPECTS(not targets.is_empty(), "Must specify at least one target string."); - - // Create output columns. - auto const results_iter = - thrust::make_transform_iterator(thrust::counting_iterator(0), [&](int i) { - return make_numeric_column(data_type{type_id::BOOL8}, - input.size(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count(), - stream, - mr); - }); - auto results_list = - std::vector>(results_iter, results_iter + targets.size()); - auto device_results_list = [&] { - auto host_results_pointer_iter = - thrust::make_transform_iterator(results_list.begin(), [](auto const& results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + results_list.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); - }(); - - constexpr int block_size = 256; - auto const d_strings = column_device_view::create(input.parent(), stream); - auto const d_targets = column_device_view::create(targets.parent(), stream); - - // launch warp per string; one warp handles multi-targets for the same string. - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; - multi_contains_warp_parallel_multi_scalars_fn<<>>( - *d_strings, *d_targets, device_results_list); + if (warp_parallel) { + int shared_mem_size = block_size * targets.size(); + multi_contains_warp_parallel_multi_scalars_fn<<>>( + *d_strings, *d_targets, d_first_bytes, *d_list_column, device_results_list); + } else { + multi_contains_using_indexes_fn<<>>( + *d_strings, *d_targets, d_first_bytes, *d_list_column, device_results_list); + } return results_list; } @@ -859,10 +870,10 @@ std::unique_ptr multi_contains(strings_column_view const& input, ((input.chars_size(stream) / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { // Large strings. // use warp parallel when the average string width is greater than the threshold - return multi_contains_using_warp_parallel(input, targets, stream, mr); + return multi_contains(/**warp parallel**/ true, input, targets, stream, mr); } else { // Small strings. Searching for multiple targets in one thread seems to work fastest. - return multi_contains_using_indexes(input, targets, stream, mr); + return multi_contains(/**warp parallel**/ false, input, targets, stream, mr); } }(); return std::make_unique
(std::move(result_columns));