Skip to content

Commit

Permalink
Expose stream parameter in public strings split/partition APIs (#14247)
Browse files Browse the repository at this point in the history
Follow-on to PR #13997, which did not include all the split APIs or a stream test.
Add stream parameter to public APIs:

- `cudf::strings::partition()`
- `cudf::strings::rpartition()`
- `cudf::strings::split_re()`
- `cudf::strings::rsplit_re()`
- `cudf::strings::split_record_re()`
- `cudf::strings::rsplit_record_re()`

Also clean up some of the Doxygen comments.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14247
  • Loading branch information
davidwendt authored Oct 11, 2023
1 parent 301dce1 commit aa598bc
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 22 deletions.
22 changes: 13 additions & 9 deletions cpp/include/cudf/strings/split/partition.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,15 +51,17 @@ namespace strings {
* r[2] is ["cd","g_h"]
* @endcode
*
* @param strings Strings instance for this operation.
* @param input Strings instance for this operation
* @param delimiter UTF-8 encoded string indicating where to split each string.
* Default of empty string indicates split on whitespace.
* @param mr Device memory resource used to allocate the returned table's device memory.
* @return New table of strings columns.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
* @return New table of strings columns
*/
std::unique_ptr<table> partition(
strings_column_view const& strings,
strings_column_view const& input,
string_scalar const& delimiter = string_scalar(""),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -83,15 +85,17 @@ std::unique_ptr<table> partition(
* r[2] is ["cd","h"]
* @endcode
*
* @param strings Strings instance for this operation.
* @param input Strings instance for this operation
* @param delimiter UTF-8 encoded string indicating where to split each string.
* Default of empty string indicates split on whitespace.
* @param mr Device memory resource used to allocate the returned table's device memory.
* @return New strings columns.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table's device memory
* @return New table of strings columns
*/
std::unique_ptr<table> rpartition(
strings_column_view const& strings,
strings_column_view const& input,
string_scalar const& delimiter = string_scalar(""),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
16 changes: 12 additions & 4 deletions cpp/include/cudf/strings/split/split_re.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,15 @@ struct regex_program;
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned result's device memory
* @return A table of columns of strings
*/
std::unique_ptr<table> split_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -125,17 +127,19 @@ std::unique_ptr<table> split_re(
*
* @throw cudf::logic_error if `pattern` is empty.
*
* @param input A column of string elements to be split.
* @param input A column of string elements to be split
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param mr Device memory resource used to allocate the returned result's device memory.
* @return A table of columns of strings.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned result's device memory
* @return A table of columns of strings
*/
std::unique_ptr<table> rsplit_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -185,13 +189,15 @@ std::unique_ptr<table> rsplit_re(
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned result's device memory
* @return Lists column of strings.
* @return Lists column of strings
*/
std::unique_ptr<column> split_record_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -243,13 +249,15 @@ std::unique_ptr<column> split_record_re(
* @param prog Regex program instance
* @param maxsplit Maximum number of splits to perform.
* Default of -1 indicates all possible splits on each string.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned result's device memory
* @return Lists column of strings
*/
std::unique_ptr<column> rsplit_record_re(
strings_column_view const& input,
regex_program const& prog,
size_type maxsplit = -1,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
10 changes: 6 additions & 4 deletions cpp/src/strings/split/partition.cu
Original file line number Diff line number Diff line change
Expand Up @@ -239,20 +239,22 @@ std::unique_ptr<table> rpartition(strings_column_view const& strings,

// external APIs

std::unique_ptr<table> partition(strings_column_view const& strings,
std::unique_ptr<table> partition(strings_column_view const& input,
string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::partition(strings, delimiter, cudf::get_default_stream(), mr);
return detail::partition(input, delimiter, stream, mr);
}

std::unique_ptr<table> rpartition(strings_column_view const& strings,
std::unique_ptr<table> rpartition(strings_column_view const& input,
string_scalar const& delimiter,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::rpartition(strings, delimiter, cudf::get_default_stream(), mr);
return detail::rpartition(input, delimiter, stream, mr);
}

} // namespace strings
Expand Down
12 changes: 8 additions & 4 deletions cpp/src/strings/split/split_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -340,37 +340,41 @@ std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
std::unique_ptr<table> split_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
return detail::split_re(input, prog, maxsplit, stream, mr);
}

std::unique_ptr<column> split_record_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
return detail::split_record_re(input, prog, maxsplit, stream, mr);
}

std::unique_ptr<table> rsplit_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
return detail::rsplit_re(input, prog, maxsplit, stream, mr);
}

std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
regex_program const& prog,
size_type maxsplit,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
return detail::rsplit_record_re(input, prog, maxsplit, stream, mr);
}

} // namespace strings
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
ConfigureTest(
STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp
streams/strings/strings_tests.cpp STREAM_MODE testing
streams/strings/split_test.cpp streams/strings/strings_tests.cpp STREAM_MODE testing
)
ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
Expand Down
49 changes: 49 additions & 0 deletions cpp/tests/streams/strings/split_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/split/partition.hpp>
#include <cudf/strings/split/split.hpp>
#include <cudf/strings/split/split_re.hpp>

#include <string>

// Fixture for the strings split/partition tests in this file; adds nothing to cudf::test::BaseFixture.
class StringsSplitTest : public cudf::test::BaseFixture {};

// Calls each public strings split/partition API with an explicitly supplied stream.
// The returned tables/columns are discarded; this test does not compare any results.
TEST_F(StringsSplitTest, SplitPartition)
{
  auto input      = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings", ""});
  auto const view = cudf::strings_column_view(input);

  // One stream handle, passed to every call below instead of re-querying it each time.
  auto const stream = cudf::test::get_default_stream();

  // Delimiter-based APIs: the scalar itself is also constructed on the supplied stream.
  auto const delimiter = cudf::string_scalar("é", true, stream);
  cudf::strings::split(view, delimiter, -1, stream);
  cudf::strings::rsplit(view, delimiter, -1, stream);
  cudf::strings::split_record(view, delimiter, -1, stream);
  cudf::strings::rsplit_record(view, delimiter, -1, stream);
  cudf::strings::partition(view, delimiter, stream);
  cudf::strings::rpartition(view, delimiter, stream);

  // Regex-based APIs: all four share one compiled program for the pattern "\s".
  std::string const pattern{"\\s"};
  auto const prog = cudf::strings::regex_program::create(pattern);
  cudf::strings::split_re(view, *prog, -1, stream);
  cudf::strings::split_record_re(view, *prog, -1, stream);
  cudf::strings::rsplit_re(view, *prog, -1, stream);
  cudf::strings::rsplit_record_re(view, *prog, -1, stream);
}

0 comments on commit aa598bc

Please sign in to comment.