Skip to content

Commit

Permalink
Migrate string capitalize APIs to pylibcudf (#15503)
Browse files Browse the repository at this point in the history
This PR creates the `pylibcudf.strings.capitalize` namespace and migrates the cuDF Cython code to use it. Depends on #15489.

Part of #15162

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15503
  • Loading branch information
brandon-b-miller authored May 29, 2024
1 parent 27220d6 commit bdafa73
Show file tree
Hide file tree
Showing 17 changed files with 217 additions and 49 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.p
)

# Library target that every module built from this directory links against.
set(linked_libraries cudf::cudf)

# Pass the sources collected in cython_sources to rapids-cmake, one keyword
# argument per line.
rapids_cython_create_modules(
  CXX
  SOURCE_FILES "${cython_sources}"
  LINKED_LIBRARIES "${linked_libraries}"
  ASSOCIATED_TARGETS cudf
  MODULE_PREFIX cpp
)

# The strings/ subdirectory has its own CMakeLists.txt.
add_subdirectory(strings)
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar


# Declaration of cudf::make_string_scalar from "cudf/scalar/scalar_factories.hpp".
# It takes a std::string and returns an owning pointer to a libcudf scalar;
# C++ exceptions are translated to Python exceptions (`except +`).
cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
    cdef unique_ptr[scalar] make_string_scalar(
        const string & _string
    ) except +
23 changes: 23 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Library target the modules in this directory link against.
set(linked_libraries cudf::cudf)

# Cython sources that live in this directory.
set(cython_sources char_types.pyx)

# Pass the sources to rapids-cmake, one keyword argument per line.
rapids_cython_create_modules(
  CXX
  SOURCE_FILES "${cython_sources}"
  LINKED_LIBRARIES "${linked_libraries}"
  ASSOCIATED_TARGETS cudf
  MODULE_PREFIX cpp_strings
)
12 changes: 10 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,22 @@ from libcpp.memory cimport unique_ptr

from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
string_character_types,
)


# Cython declarations for the functions bound to "cudf/strings/capitalize.hpp"
# in namespace cudf::strings. Each one is declared `nogil` and translates C++
# exceptions to Python exceptions (`except +`).
# NOTE(review): this is a diff hunk whose +/- markers were lost, so `capitalize`
# and `title` each show two parameter lists -- presumably the removed
# single-argument form followed by its replacement, which adds `delimiters` and
# `sequence_type` respectively. Confirm against the committed file.
cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil:
cdef unique_ptr[column] capitalize(
const column_view & strings) except +
const column_view & strings,
const string_scalar & delimiters
) except +

cdef unique_ptr[column] title(
const column_view & strings) except +
const column_view & strings,
string_character_types sequence_type
) except +

cdef unique_ptr[column] is_title(
const column_view & strings) except +
6 changes: 6 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view


# Cython declarations bound to "cudf/strings/case.hpp" (namespace cudf::strings).
# NOTE(review): `capitalize` and `is_title` are also declared against
# "cudf/strings/capitalize.hpp" in this change set. Confirm that case.hpp really
# provides them; if it does not, these two declarations would fail at C++
# compile time as soon as something uses them.
cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil:
cdef unique_ptr[column] capitalize(
const column_view & input) except +

cdef unique_ptr[column] is_title(
const column_view & input) except +

cdef unique_ptr[column] to_lower(
const column_view & strings) except +

Expand Down
23 changes: 12 additions & 11 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from libcpp.memory cimport unique_ptr

from cudf._lib.pylibcudf.libcudf.column.column cimport column
Expand All @@ -10,17 +11,17 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
# Binds the enum cudf::strings::string_character_types declared in
# "cudf/strings/char_types/char_types.hpp".
# NOTE(review): this is a diff hunk whose +/- markers were lost. The
# `ctypedef enum` with explicit C++ names and the `cpdef enum class` with a
# uint32_t underlying type list the same ten members; presumably the former is
# the removed version and the latter its replacement. Only one of them should
# remain in the file.
cdef extern from "cudf/strings/char_types/char_types.hpp" \
namespace "cudf::strings" nogil:

ctypedef enum string_character_types:
DECIMAL 'cudf::strings::string_character_types::DECIMAL'
NUMERIC 'cudf::strings::string_character_types::NUMERIC'
DIGIT 'cudf::strings::string_character_types::DIGIT'
ALPHA 'cudf::strings::string_character_types::ALPHA'
SPACE 'cudf::strings::string_character_types::SPACE'
UPPER 'cudf::strings::string_character_types::UPPER'
LOWER 'cudf::strings::string_character_types::LOWER'
ALPHANUM 'cudf::strings::string_character_types::ALPHANUM'
CASE_TYPES 'cudf::strings::string_character_types::CASE_TYPES'
ALL_TYPES 'cudf::strings::string_character_types::ALL_TYPES'
cpdef enum class string_character_types(uint32_t):
DECIMAL
NUMERIC
DIGIT
ALPHA
SPACE
UPPER
LOWER
ALPHANUM
CASE_TYPES
ALL_TYPES

cdef extern from "cudf/strings/char_types/char_types.hpp" \
namespace "cudf::strings" nogil:
Expand Down
Empty file.
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# the License.
# =============================================================================

# List of Cython sources for this directory.
# NOTE(review): two consecutive assignments appear because this is a diff hunk
# whose +/- markers were lost. Presumably the shorter list is the removed line
# and the longer one (adding capitalize.pyx and char_types.pyx) is its
# replacement.
set(cython_sources case.pyx find.pyx)
set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx)

# Library target the modules link against.
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport case, find
from . cimport capitalize, case, char_types, find
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import case, find
from . import capitalize, case, char_types, find
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
    string_character_types,
)
from cudf._lib.pylibcudf.scalar cimport Scalar


# C-level signatures of the functions implemented in capitalize.pyx. Optional
# arguments are marked with `=*`; their default values live in the .pyx.
cpdef Column capitalize(Column input, Scalar delimiters=*)
# `sequence_type` is declared here as well so that modules cimport-ing `title`
# can pass it; the definition in capitalize.pyx already accepts this argument.
cpdef Column title(
    Column input,
    string_character_types sequence_type=*
)
cpdef Column is_title(Column input)
62 changes: 62 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from cudf._lib.pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize
from cudf._lib.pylibcudf.scalar cimport Scalar
from cudf._lib.pylibcudf.strings.char_types cimport string_character_types

from cython.operator import dereference


cpdef Column capitalize(
    Column input,
    Scalar delimiters=None
    # TODO: default scalar values
    # https://github.com/rapidsai/cudf/issues/15505
):
    """Call libcudf's ``cudf::strings::capitalize`` on ``input``.

    Parameters
    ----------
    input : Column
        Column whose view is handed to libcudf.
    delimiters : Scalar, optional
        Scalar forwarded to libcudf as the ``delimiters`` argument. When it is
        ``None``, a scalar holding the empty string is created and used instead.

    Returns
    -------
    Column
        New column wrapping the result returned by libcudf.
    """
    cdef unique_ptr[column] c_output
    cdef const string_scalar* c_delimiters

    if delimiters is None:
        delimiters = Scalar.from_libcudf(cpp_make_string_scalar("".encode()))

    # libcudf expects a string_scalar reference, so reinterpret the owned scalar.
    c_delimiters = <const string_scalar*>(delimiters.c_obj.get())

    with nogil:
        c_output = cpp_capitalize.capitalize(
            input.view(), dereference(c_delimiters)
        )

    return Column.from_libcudf(move(c_output))


cpdef Column title(
    Column input,
    string_character_types sequence_type=string_character_types.ALPHA
):
    """Call libcudf's ``cudf::strings::title`` on ``input``.

    Parameters
    ----------
    input : Column
        Column whose view is handed to libcudf.
    sequence_type : string_character_types, default ALPHA
        Character-type value forwarded unchanged to libcudf.

    Returns
    -------
    Column
        New column wrapping the result returned by libcudf.
    """
    cdef unique_ptr[column] c_output

    with nogil:
        c_output = cpp_capitalize.title(
            input.view(),
            sequence_type
        )

    return Column.from_libcudf(move(c_output))


cpdef Column is_title(Column input):
    """Call libcudf's ``cudf::strings::is_title`` on ``input``.

    Parameters
    ----------
    input : Column
        Column whose view is handed to libcudf.

    Returns
    -------
    Column
        New column wrapping the result returned by libcudf.
    """
    cdef unique_ptr[column] c_output

    with nogil:
        c_output = cpp_capitalize.is_title(
            input.view()
        )

    return Column.from_libcudf(move(c_output))
5 changes: 5 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
string_character_types,
)
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.libcudf.strings.char_types import \
string_character_types as StringCharacterTypes # no-cython-lint
48 changes: 17 additions & 31 deletions python/cudf/cudf/_lib/strings/capitalize.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,33 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.strings.capitalize cimport (
capitalize as cpp_capitalize,
is_title as cpp_is_title,
title as cpp_title,
)

import cudf._lib.pylibcudf as plc


# cuDF-level `capitalize` for `source_strings`, decorated with
# acquire_spill_lock().
# NOTE(review): this is a diff hunk whose +/- markers were lost. The cdef/nogil
# lines ending in Column.from_unique_ptr(...) are presumably the removed direct
# libcudf call, and the Column.from_pylibcudf(...) return that delegates to
# plc.strings.capitalize.capitalize is presumably its replacement.
@acquire_spill_lock()
def capitalize(Column source_strings):
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_capitalize(source_view))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.capitalize.capitalize(
source_strings.to_pylibcudf(mode="read")
)
)


# cuDF-level `title` for `source_strings`, decorated with acquire_spill_lock().
# NOTE(review): this is a diff hunk whose +/- markers were lost. The cdef/nogil
# lines ending in Column.from_unique_ptr(...) are presumably the removed direct
# libcudf call, and the Column.from_pylibcudf(...) return that delegates to
# plc.strings.capitalize.title is presumably its replacement.
@acquire_spill_lock()
def title(Column source_strings):
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_title(source_view))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.capitalize.title(
source_strings.to_pylibcudf(mode="read")
)
)


# cuDF-level `is_title` for `source_strings`, decorated with
# acquire_spill_lock().
# NOTE(review): this is a diff hunk whose +/- markers were lost. The cdef/nogil
# lines ending in Column.from_unique_ptr(...) are presumably the removed direct
# libcudf call, and the Column.from_pylibcudf(...) return that delegates to
# plc.strings.capitalize.is_title is presumably its replacement.
@acquire_spill_lock()
def is_title(Column source_strings):
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_is_title(source_view))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(
plc.strings.capitalize.is_title(
source_strings.to_pylibcudf(mode="read")
)
)
1 change: 0 additions & 1 deletion python/cudf/cudf/pylibcudf_tests/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None:
plc_pa = plc_pa.combine_chunks()
if isinstance(pa_array, pa.ChunkedArray):
pa_array = pa_array.combine_chunks()

assert plc_pa.equals(pa_array)


Expand Down
54 changes: 54 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture(scope="module")
def pa_data():
    """Module-scoped pyarrow array of strings shared by the capitalize tests.

    The values mix lower-, upper- and mixed-case words, an empty string,
    punctuation, non-ASCII characters and one null entry.
    """
    return pa.array(
        [
            "leopard",
            "Golden Eagle",
            "SNAKE",
            "",
            "!A",
            "hello World",
            "A B C",
            "#",
            "AƻB",
            "Ⓑⓖ",
            "Art of War",
            "The quick bRoWn fox juMps over the laze DOG",
            '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"',
            "accénted",
            None,
        ]
    )


@pytest.fixture(scope="module")
def plc_data(pa_data):
    """Module-scoped result of passing ``pa_data`` to ``plc.interop.from_arrow``."""
    plc_column = plc.interop.from_arrow(pa_data)
    return plc_column


def test_capitalize(plc_data, pa_data):
    """Compare pylibcudf ``capitalize`` (default delimiters) with pyarrow's ``utf8_capitalize``."""
    # Import the submodule explicitly: the `pa.compute` attribute is only
    # guaranteed to exist once something has imported pyarrow.compute.
    import pyarrow.compute as pc

    got = plc.strings.capitalize.capitalize(plc_data)
    expected = pc.utf8_capitalize(pa_data)
    assert_column_eq(got, expected)


def test_title(plc_data, pa_data):
    """Compare pylibcudf ``title`` (sequence type CASE_TYPES) with pyarrow's ``utf8_title``."""
    # Import the submodule explicitly: the `pa.compute` attribute is only
    # guaranteed to exist once something has imported pyarrow.compute.
    import pyarrow.compute as pc

    got = plc.strings.capitalize.title(
        plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES
    )
    expected = pc.utf8_title(pa_data)
    assert_column_eq(got, expected)


def test_is_title(plc_data, pa_data):
    """Compare pylibcudf ``is_title`` with pyarrow's ``utf8_is_title``."""
    # Import the submodule explicitly: the `pa.compute` attribute is only
    # guaranteed to exist once something has imported pyarrow.compute.
    import pyarrow.compute as pc

    got = plc.strings.capitalize.is_title(plc_data)
    expected = pc.utf8_is_title(pa_data)
    assert_column_eq(got, expected)

0 comments on commit bdafa73

Please sign in to comment.