Skip to content

Commit

Permalink
Remove unneeded output size parameter from internal count_matches utility (#16531)
Browse files Browse the repository at this point in the history

Removes the `output_size` parameter from the internal `cudf::strings::detail::count_matches` utility, since the output size is expected to equal the size of the input column passed as the first parameter (`d_strings`). This also removes an unnecessary `assert()` call. The parameter became unnecessary as part of the large strings work.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: #16531
  • Loading branch information
davidwendt authored Aug 14, 2024
1 parent 1f0d0c9 commit c20d6b3
Show file tree
Hide file tree
Showing 6 changed files with 9 additions and 14 deletions.
2 changes: 1 addition & 1 deletion cpp/src/strings/contains.cu
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ std::unique_ptr<column> count_re(strings_column_view const& input,

auto const d_strings = column_device_view::create(input.parent(), stream);

auto result = count_matches(*d_strings, *d_prog, input.size(), stream, mr);
auto result = count_matches(*d_strings, *d_prog, stream, mr);
if (input.has_nulls()) {
result->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr),
input.null_count());
Expand Down
9 changes: 3 additions & 6 deletions cpp/src/strings/count_matches.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,15 @@ struct count_fn {

std::unique_ptr<column> count_matches(column_device_view const& d_strings,
reprog_device& d_prog,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
assert(output_size >= d_strings.size() and "Unexpected output size");

auto results = make_numeric_column(
data_type{type_to_id<size_type>()}, output_size, mask_state::UNALLOCATED, stream, mr);
data_type{type_to_id<size_type>()}, d_strings.size(), mask_state::UNALLOCATED, stream, mr);

if (d_strings.size() == 0) return results;
if (d_strings.size() == 0) { return results; }

auto d_results = results->mutable_view().data<int32_t>();
auto d_results = results->mutable_view().data<cudf::size_type>();

launch_transform_kernel(count_fn{d_strings}, d_prog, d_results, d_strings.size(), stream);

Expand Down
2 changes: 0 additions & 2 deletions cpp/src/strings/count_matches.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,12 @@ class reprog_device;
*
* @param d_strings Device view of the input strings column.
* @param d_prog Regex instance to evaluate on each string.
* @param output_size Number of rows for the output column.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return Integer column of match counts
*/
std::unique_ptr<column> count_matches(column_device_view const& d_strings,
reprog_device& d_prog,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/extract/extract_all.cu
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ std::unique_ptr<column> extract_all_record(strings_column_view const& input,

// Get the match counts for each string.
// This column will become the output lists child offsets column.
auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
auto counts = count_matches(*d_strings, *d_prog, stream, mr);
auto d_counts = counts->mutable_view().data<size_type>();

// Compute null output rows
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/search/findall.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ std::unique_ptr<column> findall(strings_column_view const& input,
auto d_prog = regex_device_builder::create_prog_device(prog, stream);

// Create lists offsets column
auto const sizes = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
auto const sizes = count_matches(*d_strings, *d_prog, stream, mr);
auto [offsets, total_matches] = cudf::detail::make_offsets_child_column(
sizes->view().begin<size_type>(), sizes->view().end<size_type>(), stream, mr);
auto const d_offsets = offsets->view().data<size_type>();
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/strings/split/split_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,8 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
auto d_strings = column_device_view::create(input.parent(), stream);

// count the number of delimiters matched in each string
auto const counts = count_matches(
*d_strings, *d_prog, strings_count, stream, rmm::mr::get_current_device_resource());
auto const counts =
count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());

// get the split tokens from the input column; this also converts the counts into offsets
auto [tokens, offsets] =
Expand Down Expand Up @@ -275,7 +275,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
auto d_strings = column_device_view::create(input.parent(), stream);

// count the number of delimiters matched in each string
auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
auto counts = count_matches(*d_strings, *d_prog, stream, mr);

// get the split tokens from the input column; this also converts the counts into offsets
auto [tokens, offsets] =
Expand Down

0 comments on commit c20d6b3

Please sign in to comment.