diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx index fe8c17c9e31..df81b3942b4 100644 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ b/python/cudf/cudf/_lib/strings/attributes.pyx @@ -2,19 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.attributes cimport ( - code_points as cpp_code_points, - count_bytes as cpp_count_bytes, - count_characters as cpp_count_characters, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def count_characters(Column source_strings): @@ -22,13 +13,10 @@ def count_characters(Column source_strings): Returns an integer numeric column containing the length of each string in characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_count_characters(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.count_characters( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -37,13 +25,10 @@ def count_bytes(Column source_strings): Returns an integer numeric column containing the number of bytes of each string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_count_bytes(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.count_bytes( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -52,10 +37,7 @@ def code_points(Column source_strings): Creates a numeric column with code point values (integers) for each character of each string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_code_points(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.code_points( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 77f20b0b917..142bc124ca2 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -13,8 +13,21 @@ # ============================================================================= set(cython_sources - capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx - regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx + attributes.pyx + capitalize.pyx + case.pyx + char_types.pyx + contains.pyx + extract.pyx + find.pyx + findall.pyx + regex_flags.pyx + regex_program.pyx + repeat.pyx + replace.pyx + side_type.pyx + slice.pyx + strip.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 91d884b294b..d8afccc7336 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . cimport ( + attributes, capitalize, case, char_types, @@ -16,3 +17,21 @@ from . cimport ( strip, ) from .side_type cimport side_type + +__all__ = [ + "attributes", + "capitalize", + "case", + "char_types", + "contains", + "convert", + "extract", + "find", + "findall", + "regex_flags", + "regex_program", + "replace", + "slice", + "strip", + "side_type", +] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index b4856784390..22452812e42 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import ( + attributes, capitalize, case, char_types, @@ -17,3 +18,21 @@ strip, ) from .side_type import SideType + +__all__ = [ + "attributes", + "capitalize", + "case", + "char_types", + "contains", + "convert", + "extract", + "find", + "findall", + "regex_flags", + "regex_program", + "replace", + "slice", + "strip", + "SideType", +] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/strings/attributes.pxd new file mode 100644 index 00000000000..27398766924 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column count_characters(Column source_strings) + +cpdef Column count_bytes(Column source_strings) + +cpdef Column code_points(Column source_strings) diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx new file mode 100644 index 00000000000..36bee7bd1d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport attributes as cpp_attributes + + +cpdef Column count_characters(Column source_strings): + """ + Returns a column containing character lengths of each string + in the given column. + + Parameters + ---------- + source_strings : Column + Column of strings. + + Returns + ------- + Column + New column with lengths for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_attributes.count_characters(source_strings.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column count_bytes(Column source_strings): + """ + Returns a column containing byte lengths of each string + in the given column. + + Parameters + ---------- + source_strings : Column + Column of strings. + + Returns + ------- + Column + New column with the number of bytes for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_attributes.count_bytes(source_strings.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column code_points(Column source_strings): + """ + Creates a numeric column with code point values (integers) + for each character of each string. + + Parameters + ---------- + source_strings : Column + Column of strings. + + Returns + ------- + Column + New column with code point integer values for each character + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_attributes.code_points(source_strings.view())) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py new file mode 100644 index 00000000000..a1820def0b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture() +def str_data(): + pa_data = pa.array(["A", None]) + return pa_data, plc.interop.from_arrow(pa_data) + + +def test_count_characters(str_data): + result = plc.strings.attributes.count_characters(str_data[1]) + expected = pc.utf8_length(str_data[0]) + assert_column_eq(expected, result) + + +def test_count_bytes(str_data): + result = plc.strings.attributes.count_characters(str_data[1]) + expected = pc.binary_length(str_data[0]) + assert_column_eq(expected, result) + + +def test_code_points(str_data): + result = plc.strings.attributes.code_points(str_data[1]) + exp_data = [ord(str_data[0].to_pylist()[0])] + expected = pa.chunked_array([exp_data], type=pa.int32()) + assert_column_eq(expected, result)