From 0059b38a20fe0b7c3bdbcfd224feb58caf4b763f Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 25 Sep 2024 12:58:14 -0700 Subject: [PATCH] address review --- .../cudf/strings/char_types/char_types.hpp | 5 +- python/cudf/cudf/_lib/strings/char_types.pyx | 60 +++++++++---------- .../pylibcudf/strings/char_types.pyx | 5 +- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 3ebe5cb53e9..735a2550e6e 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -30,7 +30,7 @@ namespace strings { */ /** - * @brief Returns a boolean column identifying strings entries in which all + * @brief Returns a boolean column identifying string entries where all * characters are of the type specified. * * The output row entry will be set to false if the corresponding string element @@ -105,7 +105,8 @@ std::unique_ptr all_characters_of_type( * `types_to_remove` will be filtered. * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches - * @return New column of boolean results for each string + * @return New strings column with the specified characters filtered out and replaced with + * the specified replacement string. 
*/ std::unique_ptr filter_characters_of_type( strings_column_view const& input, diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 30d426d2edc..a57ce29eb45 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -6,7 +6,7 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -import pylibcudf as plc +from pylibcudf.strings import char_types @acquire_spill_lock() @@ -14,13 +14,13 @@ def filter_alphanum(Column source_strings, object py_repl, bool keep=True): """ Returns a Column of strings keeping only alphanumeric character types. """ - plc_column = plc.strings.char_types.filter_characters_of_type( + plc_column = char_types.filter_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.ALL_TYPES if keep - else plc.strings.char_types.StringCharacterTypes.ALPHANUM, + char_types.StringCharacterTypes.ALL_TYPES if keep + else char_types.StringCharacterTypes.ALPHANUM, py_repl.device_value.c_value, - plc.strings.char_types.StringCharacterTypes.ALPHANUM if keep - else plc.strings.char_types.StringCharacterTypes.ALL_TYPES + char_types.StringCharacterTypes.ALPHANUM if keep + else char_types.StringCharacterTypes.ALL_TYPES ) return Column.from_pylibcudf(plc_column) @@ -32,10 +32,10 @@ def is_decimal(Column source_strings): that contain only decimal characters -- those that can be used to extract base10 numbers. 
""" - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.DECIMAL, - plc.strings.char_types.StringCharacterTypes.ALL_TYPES + char_types.StringCharacterTypes.DECIMAL, + char_types.StringCharacterTypes.ALL_TYPES ) return Column.from_pylibcudf(plc_column) @@ -48,10 +48,10 @@ def is_alnum(Column source_strings): Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() """ - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.ALPHANUM, - plc.strings.char_types.StringCharacterTypes.ALL_TYPES + char_types.StringCharacterTypes.ALPHANUM, + char_types.StringCharacterTypes.ALL_TYPES ) return Column.from_pylibcudf(plc_column) @@ -62,10 +62,10 @@ def is_alpha(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only alphabetic characters. """ - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.ALPHA, - plc.strings.char_types.StringCharacterTypes.ALL_TYPES + char_types.StringCharacterTypes.ALPHA, + char_types.StringCharacterTypes.ALL_TYPES ) return Column.from_pylibcudf(plc_column) @@ -76,10 +76,10 @@ def is_digit(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only decimal and digit characters. 
""" - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.DIGIT, - plc.strings.char_types.StringCharacterTypes.ALL_TYPES + char_types.StringCharacterTypes.DIGIT, + char_types.StringCharacterTypes.ALL_TYPES ) return Column.from_pylibcudf(plc_column) @@ -91,10 +91,10 @@ def is_numeric(Column source_strings): that contain only numeric characters. These include digit and numeric characters. """ - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.NUMERIC, - plc.strings.char_types.StringCharacterTypes.ALL_TYPES + char_types.StringCharacterTypes.NUMERIC, + char_types.StringCharacterTypes.ALL_TYPES ) return Column.from_pylibcudf(plc_column) @@ -105,10 +105,10 @@ def is_upper(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only upper-case characters. """ - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.UPPER, - plc.strings.char_types.StringCharacterTypes.CASE_TYPES + char_types.StringCharacterTypes.UPPER, + char_types.StringCharacterTypes.CASE_TYPES ) return Column.from_pylibcudf(plc_column) @@ -119,10 +119,10 @@ def is_lower(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only lower-case characters. 
""" - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.LOWER, - plc.strings.char_types.StringCharacterTypes.CASE_TYPES + char_types.StringCharacterTypes.LOWER, + char_types.StringCharacterTypes.CASE_TYPES ) return Column.from_pylibcudf(plc_column) @@ -133,9 +133,9 @@ def is_space(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contains all characters which are spaces only. """ - plc_column = plc.strings.char_types.all_characters_of_type( + plc_column = char_types.all_characters_of_type( source_strings.to_pylibcudf(mode="read"), - plc.strings.char_types.StringCharacterTypes.SPACE, - plc.strings.char_types.StringCharacterTypes.ALL_TYPES + char_types.StringCharacterTypes.SPACE, + char_types.StringCharacterTypes.ALL_TYPES ) return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index f8d09e8fd2b..6a24d79bc4b 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -19,7 +19,7 @@ cpdef Column all_characters_of_type( string_character_types verify_types ): """ - Filter specific character types from a column of strings. + Identifies strings where all characters match the specified type. Parameters ---------- @@ -72,7 +72,8 @@ cpdef Column filter_characters_of_type( Returns ------- Column - New column of boolean results for each string + New column with the specified characters filtered out and + replaced with the specified replacement string. """ cdef const string_scalar* c_replacement = ( replacement.c_obj.get()