Skip to content

Commit

Permalink
Add strings.combine APIs to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Sep 10, 2024
1 parent 5192b88 commit 54f7a3e
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 32 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
combine
=======

.. automodule:: pylibcudf.strings.combine
    :members:
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ strings
capitalize
char_types
contains
combine
find
regex_flags
regex_program
Expand Down
28 changes: 7 additions & 21 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.combine cimport (
concatenate as cpp_concatenate,
join_list_elements as cpp_join_list_elements,
join_strings as cpp_join_strings,
output_if_empty_list,
separator_on_nulls,
)
Expand All @@ -21,6 +20,8 @@ from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport table_view_from_columns

import pylibcudf as plc


@acquire_spill_lock()
def concatenate(list source_strings,
Expand Down Expand Up @@ -62,27 +63,12 @@ def join(Column source_strings,
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""

cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
cdef const string_scalar* scalar_narep = <const string_scalar*>(
narep.get_raw_ptr()
plc_column = plc.strings.combine.join_strings(
source_strings.to_pylibcudf(mode="read"),
sep.device_value.c_value,
na_rep.device_value.c_value,
)

with nogil:
c_result = move(cpp_join_strings(
source_view,
scalar_separator[0],
scalar_narep[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand Down
27 changes: 18 additions & 9 deletions python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport int
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
Expand All @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view

cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:

ctypedef enum separator_on_nulls:
YES 'cudf::strings::separator_on_nulls::YES'
NO 'cudf::strings::separator_on_nulls::NO'
cpdef enum class separator_on_nulls(int):
YES
NO

ctypedef enum output_if_empty_list:
EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING'
NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT'
cpdef enum class output_if_empty_list(int):
EMPTY_STRING
NULL_ELEMENT

cdef unique_ptr[column] concatenate(
table_view source_strings,
table_view strings_columns,
string_scalar separator,
string_scalar narep) except +
string_scalar narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] concatenate(
table_view strings_columns,
column_view separators,
string_scalar separator_narep,
string_scalar col_narep,
separator_on_nulls separate_nulls) except +

cdef unique_ptr[column] join_strings(
column_view source_strings,
column_view input,
string_scalar separator,
string_scalar narep) except +

Expand Down
4 changes: 2 additions & 2 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# the License.
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx replace.pyx slice.pyx
set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx combine.pyx find.pyx
regex_flags.pyx regex_program.pyx replace.pyx slice.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ from . cimport (
capitalize,
case,
char_types,
combine,
contains,
find,
regex_flags,
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
capitalize,
case,
char_types,
combine,
contains,
find,
regex_flags,
Expand Down
25 changes: 25 additions & 0 deletions python/pylibcudf/pylibcudf/strings/combine.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.combine cimport (
output_if_empty_list,
separator_on_nulls,
)
from pylibcudf.scalar cimport Scalar
from pylibcudf.table cimport Table

# Fused type used for the `separator` argument of `concatenate`, which may be
# either a Column (separator(s) for a given row) or a single Scalar.
ctypedef fused ColumnOrScalar:
Column
Scalar

# Row-wise concatenation of the strings columns in `strings_columns`.
# `separator` is fused: a Column or a Scalar selects which libcudf
# `concatenate` overload is called.
cpdef Column concatenate(
Table strings_columns,
ColumnOrScalar separator,
Scalar narep,
Scalar col_narep,
separator_on_nulls separate_nulls,
)

# Joins the strings of a single column, using `separator` between them and
# `narep` in place of null strings.
cpdef Column join_strings(Column input, Scalar separator, Scalar narep)

# NOTE(review): no definition of join_list_elements is visible in the
# accompanying combine.pyx; confirm it is implemented there, since a cpdef
# declared here needs a matching definition.
cpdef Column join_list_elements(Column source_strings)
124 changes: 124 additions & 0 deletions python/pylibcudf/pylibcudf/strings/combine.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from cython.operator cimport dereference
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings cimport combine as cpp_combine
from pylibcudf.scalar cimport Scalar
from pylibcudf.table cimport Table

from pylibcudf.libcudf.strings.combine import \
output_if_empty_list as OutputIfEmptyList # no-cython-lint
from pylibcudf.libcudf.strings.combine import \
separator_on_nulls as SeparatorOnNulls # no-cython-lint


cpdef Column concatenate(
    Table strings_columns,
    ColumnOrScalar separator,
    Scalar narep,
    Scalar col_narep,
    separator_on_nulls separate_nulls,
):
    """
    Concatenate the strings columns of a table row by row into a single
    strings column, placing a separator between the values of each row.

    For details, see :cpp:func:`cudf::strings::concatenate`.

    Parameters
    ----------
    strings_columns : Table
        Strings for this operation

    separator : Column or Scalar
        Separator(s) for a given row. A Column supplies one separator per
        row; a Scalar supplies one separator used for every row.

    narep : Scalar
        When ``separator`` is a Column, the string to replace a null
        separator for a given row (libcudf's ``separator_narep``).
        When ``separator`` is a Scalar, forwarded as libcudf's ``narep``
        argument of the scalar-separator overload.

    col_narep : Scalar
        String that should be used in place of any null strings
        found in any column. Only read when ``separator`` is a Column;
        ignored when ``separator`` is a Scalar.

    separate_nulls : SeparatorOnNulls
        If YES, then the separator is included for null rows.

    Returns
    -------
    Column
        New column with concatenated results
    """
    # All C-level declarations must live at function scope: Cython rejects
    # ``cdef`` statements nested inside ``if``/``elif`` blocks.
    cdef unique_ptr[column] c_result
    cdef const string_scalar* c_separator
    cdef const string_scalar* c_col_narep
    cdef const string_scalar* c_narep = <const string_scalar*>(
        narep.c_obj.get()
    )

    if ColumnOrScalar is Column:
        # Take the replacement from ``col_narep`` itself (not ``narep``), and
        # only in this branch, since the scalar overload never uses it.
        c_col_narep = <const string_scalar*>(col_narep.c_obj.get())
        with nogil:
            c_result = move(
                cpp_combine.concatenate(
                    strings_columns.view(),
                    separator.view(),
                    dereference(c_narep),
                    dereference(c_col_narep),
                    separate_nulls
                )
            )
    elif ColumnOrScalar is Scalar:
        c_separator = <const string_scalar*>(separator.c_obj.get())
        with nogil:
            c_result = move(
                cpp_combine.concatenate(
                    strings_columns.view(),
                    dereference(c_separator),
                    dereference(c_narep),
                    separate_nulls
                )
            )
    else:
        raise ValueError("separator must be a Column or a Scalar")
    return Column.from_libcudf(move(c_result))


cpdef Column join_strings(Column input, Scalar separator, Scalar narep):
    """
    Concatenates all strings in the column into one new string delimited
    by an optional separator string.

    For details, see :cpp:func:`cudf::strings::join_strings`.

    Parameters
    ----------
    input : Column
        Strings column whose elements are joined

    separator : Scalar
        String placed between each pair of joined strings

    narep : Scalar
        String to replace any null strings found.

    Returns
    -------
    Column
        New column containing one string
    """
    cdef unique_ptr[column] c_result
    cdef const string_scalar* c_separator = <const string_scalar*>(
        separator.c_obj.get()
    )
    cdef const string_scalar* c_narep = <const string_scalar*>(
        narep.c_obj.get()
    )
    with nogil:
        c_result = move(
            cpp_combine.join_strings(
                input.view(),
                # libcudf takes string_scalar values, not pointers, so the
                # casted pointers must be dereferenced at the call site.
                dereference(c_separator),
                dereference(c_narep),
            )
        )

    return Column.from_libcudf(move(c_result))

0 comments on commit 54f7a3e

Please sign in to comment.