Skip to content

Commit

Permalink
Add string.attributes APIs to pylibcudf (#16785)
Browse files Browse the repository at this point in the history
Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16785
  • Loading branch information
mroeschke authored Sep 25, 2024
1 parent 0425963 commit c7f6a22
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 34 deletions.
46 changes: 14 additions & 32 deletions python/cudf/cudf/_lib/strings/attributes.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,21 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.attributes cimport (
code_points as cpp_code_points,
count_bytes as cpp_count_bytes,
count_characters as cpp_count_characters,
)

from cudf._lib.column cimport Column

import pylibcudf as plc


@acquire_spill_lock()
def count_characters(Column source_strings):
"""
Returns an integer numeric column containing the
length of each string in characters.

Parameters
----------
source_strings : Column
Column of strings to measure.

Returns
-------
Column
Integer column with the character length of each string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_count_characters(source_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.attributes.count_characters(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -37,13 +25,10 @@ def count_bytes(Column source_strings):
Returns an integer numeric column containing the
number of bytes of each string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_count_bytes(source_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.attributes.count_bytes(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -52,10 +37,7 @@ def code_points(Column source_strings):
Creates a numeric column with code point values (integers)
for each character of each string.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_code_points(source_view))

return Column.from_unique_ptr(move(c_result))
plc_column = plc.strings.attributes.code_points(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)
17 changes: 15 additions & 2 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,21 @@
# =============================================================================

set(cython_sources
capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx
regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx
attributes.pyx
capitalize.pyx
case.pyx
char_types.pyx
contains.pyx
extract.pyx
find.pyx
findall.pyx
regex_flags.pyx
regex_program.pyx
repeat.pyx
replace.pyx
side_type.pyx
slice.pyx
strip.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
19 changes: 19 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport (
attributes,
capitalize,
case,
char_types,
Expand All @@ -16,3 +17,21 @@ from . cimport (
strip,
)
from .side_type cimport side_type

# Names exported by the strings subpackage. "repeat" is included because
# repeat.pyx is built as part of this package alongside the other modules.
__all__ = [
    "attributes",
    "capitalize",
    "case",
    "char_types",
    "contains",
    "convert",
    "extract",
    "find",
    "findall",
    "regex_flags",
    "regex_program",
    "repeat",
    "replace",
    "slice",
    "strip",
    "side_type",
]
19 changes: 19 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import (
attributes,
capitalize,
case,
char_types,
Expand All @@ -17,3 +18,21 @@
strip,
)
from .side_type import SideType

# Names exported by the strings subpackage. "repeat" is included because
# repeat.pyx is built as part of this package alongside the other modules,
# so star-imports should expose it as well.
__all__ = [
    "attributes",
    "capitalize",
    "case",
    "char_types",
    "contains",
    "convert",
    "extract",
    "find",
    "findall",
    "regex_flags",
    "regex_program",
    "repeat",
    "replace",
    "slice",
    "strip",
    "SideType",
]
10 changes: 10 additions & 0 deletions python/pylibcudf/pylibcudf/strings/attributes.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column


# Character length of each string in the given column.
cpdef Column count_characters(Column source_strings)

# Byte length of each string in the given column.
cpdef Column count_bytes(Column source_strings)

# Code point values (integers) for each character of each string.
cpdef Column code_points(Column source_strings)
76 changes: 76 additions & 0 deletions python/pylibcudf/pylibcudf/strings/attributes.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.strings cimport attributes as cpp_attributes


cpdef Column count_characters(Column source_strings):
    """
    Compute the length, in characters, of every string in a column.

    Parameters
    ----------
    source_strings : Column
        Column of strings.

    Returns
    -------
    Column
        New column with lengths for each string
    """
    cdef unique_ptr[column] c_lengths

    # Call into libcudf with the GIL released.
    with nogil:
        c_lengths = move(
            cpp_attributes.count_characters(source_strings.view())
        )

    return Column.from_libcudf(move(c_lengths))


cpdef Column count_bytes(Column source_strings):
    """
    Compute the length, in bytes, of every string in a column.

    Parameters
    ----------
    source_strings : Column
        Column of strings.

    Returns
    -------
    Column
        New column with the number of bytes for each string
    """
    cdef unique_ptr[column] c_byte_counts

    # Call into libcudf with the GIL released.
    with nogil:
        c_byte_counts = move(
            cpp_attributes.count_bytes(source_strings.view())
        )

    return Column.from_libcudf(move(c_byte_counts))


cpdef Column code_points(Column source_strings):
    """
    Build a numeric column of code point values (integers), one for
    each character of each string.

    Parameters
    ----------
    source_strings : Column
        Column of strings.

    Returns
    -------
    Column
        New column with code point integer values for each character
    """
    cdef unique_ptr[column] c_code_points

    # Call into libcudf with the GIL released.
    with nogil:
        c_code_points = move(
            cpp_attributes.code_points(source_strings.view())
        )

    return Column.from_libcudf(move(c_code_points))
32 changes: 32 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_attributes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pyarrow.compute as pc
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture()
def str_data():
    """Return a (pyarrow array, pylibcudf column) pair for ["A", None]."""
    arrow_strings = pa.array(["A", None])
    plc_strings = plc.interop.from_arrow(arrow_strings)
    return arrow_strings, plc_strings


def test_count_characters(str_data):
    """count_characters must agree with pyarrow's utf8_length."""
    arrow_strings, plc_strings = str_data
    result = plc.strings.attributes.count_characters(plc_strings)
    expected = pc.utf8_length(arrow_strings)
    assert_column_eq(expected, result)


def test_count_bytes(str_data):
    """count_bytes must agree with pyarrow's binary_length."""
    # Exercise count_bytes itself; calling count_characters here would only
    # pass by coincidence for single-byte strings such as "A".
    result = plc.strings.attributes.count_bytes(str_data[1])
    expected = pc.binary_length(str_data[0])
    assert_column_eq(expected, result)


def test_code_points(str_data):
    """code_points must yield the code point of each character."""
    arrow_strings, plc_strings = str_data
    result = plc.strings.attributes.code_points(plc_strings)
    # Only the first entry is a real string; the second entry is null.
    first_string = arrow_strings.to_pylist()[0]
    expected = pa.chunked_array([[ord(first_string)]], type=pa.int32())
    assert_column_eq(expected, result)

0 comments on commit c7f6a22

Please sign in to comment.