Skip to content

Commit

Permalink
Add strings.combine APIs to pylibcudf (#16790)
Browse files Browse the repository at this point in the history
Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Matthew Murray (https://github.com/Matt711)

URL: #16790
  • Loading branch information
mroeschke authored Oct 17, 2024
1 parent 5f863a5 commit 3683e46
Show file tree
Hide file tree
Showing 12 changed files with 397 additions and 111 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
combine
=======

.. automodule:: pylibcudf.strings.combine
    :members:
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ strings

capitalize
char_types
combine
contains
extract
find
Expand Down
130 changes: 29 additions & 101 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,11 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cudf._lib.column cimport Column

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.combine cimport (
concatenate as cpp_concatenate,
join_list_elements as cpp_join_list_elements,
join_strings as cpp_join_strings,
output_if_empty_list,
separator_on_nulls,
)
from pylibcudf.libcudf.table.table_view cimport table_view
import pylibcudf as plc

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport table_view_from_columns
import cudf


@acquire_spill_lock()
Expand All @@ -31,26 +18,12 @@ def concatenate(list source_strings,
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""
cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef table_view source_view = table_view_from_columns(source_strings)

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.concatenate(
plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]),
sep.device_value.c_value,
na_rep.device_value.c_value,
)

with nogil:
c_result = move(cpp_concatenate(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -62,27 +35,12 @@ def join(Column source_strings,
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""

cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.join_strings(
source_strings.to_pylibcudf(mode="read"),
sep.device_value.c_value,
na_rep.device_value.c_value,
)

with nogil:
c_result = move(cpp_join_strings(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -96,29 +54,15 @@ def join_lists_with_scalar(
between each string in lists and `<NA>`/`None` values
are replaced by `py_narep`
"""

cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.join_list_elements(
source_strings.to_pylibcudf(mode="read"),
py_separator.device_value.c_value,
py_narep.device_value.c_value,
cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value,
plc.strings.combine.SeparatorOnNulls.YES,
plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
)

with nogil:
c_result = move(cpp_join_list_elements(
source_view,
scalar_separator[0],
scalar_narep[0],
separator_on_nulls.YES,
output_if_empty_list.NULL_ELEMENT
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -135,28 +79,12 @@ def join_lists_with_column(
`<NA>`/`None` values in `separator_strings` are replaced
by `py_separator_narep`
"""

cdef DeviceScalar source_narep = py_source_narep.device_value
cdef DeviceScalar separator_narep = py_separator_narep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view separator_view = separator_strings.view()

cdef const string_scalar* scalar_source_narep = \
<const string_scalar*>(source_narep.get_raw_ptr())
cdef const string_scalar* scalar_separator_narep = <const string_scalar*>(
separator_narep.get_raw_ptr()
plc_column = plc.strings.combine.join_list_elements(
source_strings.to_pylibcudf(mode="read"),
separator_strings.to_pylibcudf(mode="read"),
py_separator_narep.device_value.c_value,
py_source_narep.device_value.c_value,
plc.strings.combine.SeparatorOnNulls.YES,
plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
)

with nogil:
c_result = move(cpp_join_list_elements(
source_view,
separator_view,
scalar_separator_narep[0],
scalar_source_narep[0],
separator_on_nulls.YES,
output_if_empty_list.NULL_ELEMENT
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)
set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx)

set(linked_libraries cudf::cudf)

Expand Down
27 changes: 18 additions & 9 deletions python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport int
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
Expand All @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view

cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:

ctypedef enum separator_on_nulls:
YES 'cudf::strings::separator_on_nulls::YES'
NO 'cudf::strings::separator_on_nulls::NO'
cpdef enum class separator_on_nulls(int):
YES
NO

ctypedef enum output_if_empty_list:
EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING'
NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT'
cpdef enum class output_if_empty_list(int):
EMPTY_STRING
NULL_ELEMENT

cdef unique_ptr[column] concatenate(
table_view source_strings,
table_view strings_columns,
string_scalar separator,
string_scalar narep) except +
string_scalar narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] concatenate(
table_view strings_columns,
column_view separators,
string_scalar separator_narep,
string_scalar col_narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] join_strings(
column_view source_strings,
column_view input,
string_scalar separator,
string_scalar narep) except +

Expand Down
Empty file.
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set(cython_sources
case.pyx
char_types.pyx
contains.pyx
combine.pyx
extract.pyx
find.pyx
find_multiple.pyx
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from . cimport (
capitalize,
case,
char_types,
combine,
contains,
convert,
extract,
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
capitalize,
case,
char_types,
combine,
contains,
convert,
extract,
Expand Down
33 changes: 33 additions & 0 deletions python/pylibcudf/pylibcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.combine cimport (
output_if_empty_list,
separator_on_nulls,
)
from pylibcudf.scalar cimport Scalar
from pylibcudf.table cimport Table

# Fused type so that a single `separator` parameter can be declared as
# either a Column or a Scalar in the function signatures of this file.
ctypedef fused ColumnOrScalar:
Column
Scalar

# Declaration of the pylibcudf `concatenate` wrapper; it returns a Column.
# `separator` uses the fused ColumnOrScalar type, which lines up with the two
# libcudf `concatenate` overloads (a string_scalar separator, or a
# column_view of separators).
# `=*` marks arguments whose default values are supplied by the
# implementation file rather than by this declaration.
# NOTE(review): how `narep` and `col_narep` map onto each libcudf overload is
# not visible from this declaration -- confirm in combine.pyx.
cpdef Column concatenate(
Table strings_columns,
ColumnOrScalar separator,
Scalar narep=*,
Scalar col_narep=*,
separator_on_nulls separate_nulls=*,
)

# Declaration of the pylibcudf `join_strings` wrapper; it returns a Column.
# The parameter order (input column, separator scalar, narep scalar) mirrors
# the libcudf `join_strings` declaration. All three arguments are required.
# NOTE(review): presumably `narep` is the replacement for null entries, as
# described by the cudf-level `join` docstring -- confirm against libcudf.
cpdef Column join_strings(Column input, Scalar separator, Scalar narep)


# Declaration of the pylibcudf `join_list_elements` wrapper; it returns a
# Column. `separator` may be a Column or a Scalar (fused ColumnOrScalar).
# Every argument is required: there are no `=*` defaults, so callers must
# pass both narep scalars and both policy enums explicitly.
# NOTE(review): with a Column separator, `separator_narep` presumably replaces
# null separators and `string_narep` null list elements; which of the two is
# used with a Scalar separator is not visible here -- confirm in combine.pyx.
cpdef Column join_list_elements(
Column source_strings,
ColumnOrScalar separator,
Scalar separator_narep,
Scalar string_narep,
separator_on_nulls separate_nulls,
output_if_empty_list empty_list_policy,
)
Loading

0 comments on commit 3683e46

Please sign in to comment.