Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add remaining string.char_types APIs to pylibcudf #16788

Merged
merged 8 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cpp/include/cudf/strings/char_types/char_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace strings {
*/

/**
* @brief Returns a boolean column identifying strings entries in which all
* @brief Returns a boolean column identifying string entries where all
* characters are of the type specified.
*
* The output row entry will be set to false if the corresponding string element
Expand Down Expand Up @@ -105,7 +105,8 @@ std::unique_ptr<column> all_characters_of_type(
* `types_to_remove` will be filtered.
* @param mr Device memory resource used to allocate the returned column's device memory
* @param stream CUDA stream used for device memory operations and kernel launches
* @return New column of boolean results for each string
* @return New strings column with the specified characters filtered out and replaced with the
* specified replacement string.
Matt711 marked this conversation as resolved.
Show resolved Hide resolved
*/
std::unique_ptr<column> filter_characters_of_type(
strings_column_view const& input,
Expand Down
178 changes: 58 additions & 120 deletions python/cudf/cudf/_lib/strings/char_types.pyx
Original file line number Diff line number Diff line change
@@ -1,50 +1,28 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.


from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.char_types cimport (
all_characters_of_type as cpp_all_characters_of_type,
filter_characters_of_type as cpp_filter_characters_of_type,
string_character_types,
)

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf.strings import char_types


@acquire_spill_lock()
def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
    """
    Returns a Column of strings filtered by alphanumeric character type.

    Parameters
    ----------
    source_strings : Column
        Strings column to filter.
    py_repl : object
        Scalar wrapper whose ``device_value.c_value`` is forwarded as the
        replacement passed to ``filter_characters_of_type``.
    keep : bool, default True
        If True, ALPHANUM is passed as the types to keep (and ALL_TYPES as
        the types to remove). If False, the two arguments are swapped, so
        ALPHANUM is passed as the types to remove instead.

    Returns
    -------
    Column
        Column built from the pylibcudf result.
    """
    # Exactly one of the two type arguments is ALL_TYPES; `keep` selects
    # which side receives ALPHANUM.
    plc_column = char_types.filter_characters_of_type(
        source_strings.to_pylibcudf(mode="read"),
        char_types.StringCharacterTypes.ALL_TYPES if keep
        else char_types.StringCharacterTypes.ALPHANUM,
        py_repl.device_value.c_value,
        char_types.StringCharacterTypes.ALPHANUM if keep
        else char_types.StringCharacterTypes.ALL_TYPES
    )
    return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -54,17 +32,12 @@ def is_decimal(Column source_strings):
that contain only decimal characters -- those that can be used
to extract base10 numbers.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.DECIMAL,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.DECIMAL,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -75,17 +48,12 @@ def is_alnum(Column source_strings):

Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal()
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.ALPHANUM,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.ALPHANUM,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -94,17 +62,12 @@ def is_alpha(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only alphabetic characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.ALPHA,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.ALPHA,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -113,17 +76,12 @@ def is_digit(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only decimal and digit characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.DIGIT,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.DIGIT,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -133,17 +91,12 @@ def is_numeric(Column source_strings):
that contain only numeric characters. These include digit and
numeric characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.NUMERIC,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.NUMERIC,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -152,17 +105,12 @@ def is_upper(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only upper-case characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.UPPER,
string_character_types.CASE_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.UPPER,
char_types.StringCharacterTypes.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -171,17 +119,12 @@ def is_lower(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only lower-case characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.LOWER,
string_character_types.CASE_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.LOWER,
char_types.StringCharacterTypes.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -190,14 +133,9 @@ def is_space(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contains all characters which are spaces only.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.SPACE,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.SPACE,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)
3 changes: 0 additions & 3 deletions python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \
CASE_TYPES
ALL_TYPES

cdef extern from "cudf/strings/char_types/char_types.hpp" \
namespace "cudf::strings" nogil:

cdef unique_ptr[column] all_characters_of_type(
column_view source_strings,
string_character_types types,
Expand Down
16 changes: 16 additions & 0 deletions python/pylibcudf/pylibcudf/strings/char_types.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.char_types cimport string_character_types
from pylibcudf.scalar cimport Scalar


# Declaration only; the implementation lives in char_types.pyx.
cpdef Column all_characters_of_type(
    Column source_strings,
    string_character_types types,
    string_character_types verify_types
)

# Declaration only; the implementation lives in char_types.pyx.
cpdef Column filter_characters_of_type(
    Column source_strings,
    string_character_types types_to_remove,
    Scalar replacement,
    string_character_types types_to_keep
)
89 changes: 89 additions & 0 deletions python/pylibcudf/pylibcudf/strings/char_types.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,93 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings cimport char_types as cpp_char_types
from pylibcudf.scalar cimport Scalar

from cython.operator import dereference
from pylibcudf.libcudf.strings.char_types import \
string_character_types as StringCharacterTypes # no-cython-lint


cpdef Column all_characters_of_type(
    Column source_strings,
    string_character_types types,
    string_character_types verify_types
):
    """
    Identifies strings where all characters match the specified type.

    Thin wrapper that forwards its arguments to libcudf's
    ``all_characters_of_type``.

    Parameters
    ----------
    source_strings : Column
        Strings instance for this operation
    types : StringCharacterTypes
        The character types to check in each string
    verify_types : StringCharacterTypes
        Only verify against these character types.

    Returns
    -------
    Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_result

    # The libcudf call needs no Python objects, so the GIL is released
    # for its duration.
    with nogil:
        c_result = move(
            cpp_char_types.all_characters_of_type(
                source_strings.view(),
                types,
                verify_types,
            )
        )

    # Hand ownership of the libcudf column over to the returned Column.
    return Column.from_libcudf(move(c_result))

cpdef Column filter_characters_of_type(
    Column source_strings,
    string_character_types types_to_remove,
    Scalar replacement,
    string_character_types types_to_keep
):
    """
    Filter specific character types from a column of strings.

    Thin wrapper that forwards its arguments to libcudf's
    ``filter_characters_of_type``.

    Parameters
    ----------
    source_strings : Column
        Strings instance for this operation
    types_to_remove : StringCharacterTypes
        The character types to filter out of each string.
    replacement : Scalar
        The replacement string to use in place of removed characters.
        Its underlying scalar is reinterpreted as a libcudf
        ``string_scalar`` without a runtime type check, so it must
        wrap a string scalar.
    types_to_keep : StringCharacterTypes
        Passing `ALL_TYPES` means all characters of `types_to_remove`
        will be filtered.

    Returns
    -------
    Column
        New column with the specified characters filtered out and
        replaced with the specified replacement string.
    """
    # Extract the raw scalar pointer while the GIL is still held; the
    # nogil section below must not touch Python objects.
    cdef const string_scalar* c_replacement = <const string_scalar*>(
        replacement.c_obj.get()
    )
    cdef unique_ptr[column] c_result

    with nogil:
        c_result = move(
            cpp_char_types.filter_characters_of_type(
                source_strings.view(),
                types_to_remove,
                dereference(c_replacement),
                types_to_keep,
            )
        )

    # Hand ownership of the libcudf column over to the returned Column.
    return Column.from_libcudf(move(c_result))
Loading
Loading