diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst new file mode 100644 index 00000000000..38a46641200 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst @@ -0,0 +1,6 @@ +======= +combine +======= + +.. automodule:: pylibcudf.strings.combine + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..b54265f3877 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -7,6 +7,7 @@ strings capitalize char_types contains + combine find regex_flags regex_program diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 76cc13db0da..80dacc5fc3e 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -11,7 +11,6 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.strings.combine cimport ( concatenate as cpp_concatenate, join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, output_if_empty_list, separator_on_nulls, ) @@ -21,6 +20,8 @@ from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport table_view_from_columns +import pylibcudf as plc + @acquire_spill_lock() def concatenate(list source_strings, @@ -62,27 +63,12 @@ def join(Column source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const 
string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_strings( + source_strings.to_pylibcudf(mode="read"), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index e4c9fa5817a..e659993b834 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: - ctypedef enum separator_on_nulls: - YES 'cudf::strings::separator_on_nulls::YES' - NO 'cudf::strings::separator_on_nulls::NO' + cpdef enum class separator_on_nulls(int): + YES + NO - ctypedef enum output_if_empty_list: - EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING' - NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT' + cpdef enum class output_if_empty_list(int): + EMPTY_STRING + NULL_ELEMENT cdef unique_ptr[column] concatenate( - table_view source_strings, + table_view strings_columns, string_scalar separator, - string_scalar narep) except + + string_scalar narep, + separator_on_nulls separate_nulls) except + + + cdef unique_ptr[column] concatenate( + table_view strings_columns, + column_view separators, + string_scalar separator_narep, + string_scalar col_narep, + separator_on_nulls separate_nulls) except + cdef unique_ptr[column] 
join_strings( - column_view source_strings, + column_view input, string_scalar separator, string_scalar narep) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index b499a127541..7a18f3bbc48 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx slice.pyx +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx combine.pyx find.pyx + regex_flags.pyx regex_program.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d1f632d6d8e..8f72d7b6dd7 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( capitalize, case, char_types, + combine, contains, find, regex_flags, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ef102aff2af..0776391f87f 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -4,6 +4,7 @@ capitalize, case, char_types, + combine, contains, find, regex_flags, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd new file mode 100644 index 00000000000..06491b03278 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.strings.combine cimport (
+    output_if_empty_list,
+    separator_on_nulls,
+)
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column concatenate(
+    Table strings_columns,
+    ColumnOrScalar separator,
+    Scalar narep,
+    Scalar col_narep,
+    separator_on_nulls separate_nulls,
+)
+
+cpdef Column join_strings(Column input, Scalar separator, Scalar narep)
+
+cpdef Column join_list_elements(Column source_strings)
diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx
new file mode 100644
index 00000000000..5d8f39052e7
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/combine.pyx
@@ -0,0 +1,143 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings cimport combine as cpp_combine
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+from pylibcudf.libcudf.strings.combine import \
+    output_if_empty_list as OutputIfEmptyList  # no-cython-lint
+from pylibcudf.libcudf.strings.combine import \
+    separator_on_nulls as SeparatorOnNulls  # no-cython-lint
+
+
+cpdef Column concatenate(
+    Table strings_columns,
+    ColumnOrScalar separator,
+    Scalar narep,
+    Scalar col_narep,
+    separator_on_nulls separate_nulls,
+):
+    """
+    Concatenate the strings columns of a table row by row into a single
+    strings column, with a separator between the pieces of each row.
+
+    Parameters
+    ----------
+    strings_columns : Table
+        Strings for this operation
+
+    separator : Column or Scalar
+        Separator(s) for a given row
+
+    narep : Scalar
+        When separator is a Column, the string to replace a null
+        separator for a given row. When separator is a Scalar, the
+        replacement string passed to libcudf as its narep argument.
+
+    col_narep : Scalar
+        String that should be used in place of any null strings
+        found in any column. Ignored when separator is a Scalar.
+
+    separate_nulls : SeparatorOnNulls
+        If YES, then the separator is included for null rows.
+
+    Returns
+    -------
+    Column
+        New column with concatenated results
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_separator
+    cdef const string_scalar* c_col_narep
+    cdef const string_scalar* c_narep = <const string_scalar*>(
+        narep.c_obj.get()
+    )
+
+    if ColumnOrScalar is Column:
+        # col_narep belongs to the per-row separator overload only, so it
+        # is read in this branch (and must come from col_narep, not narep).
+        c_col_narep = <const string_scalar*>(col_narep.c_obj.get())
+        with nogil:
+            c_result = move(
+                cpp_combine.concatenate(
+                    strings_columns.view(),
+                    separator.view(),
+                    dereference(c_narep),
+                    dereference(c_col_narep),
+                    separate_nulls
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        c_separator = <const string_scalar*>(separator.c_obj.get())
+        with nogil:
+            c_result = move(
+                cpp_combine.concatenate(
+                    strings_columns.view(),
+                    dereference(c_separator),
+                    dereference(c_narep),
+                    separate_nulls
+                )
+            )
+    else:
+        raise ValueError("separator must be a Column or a Scalar")
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column join_strings(Column input, Scalar separator, Scalar narep):
+    """
+    Concatenates all strings in the column into one new string delimited
+    by an optional separator string.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column whose rows are joined
+
+    separator : Scalar
+        String placed between the joined rows
+
+    narep : Scalar
+        String to replace any null strings found.
+
+    Returns
+    -------
+    Column
+        New column containing one string
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_separator = <const string_scalar*>(
+        separator.c_obj.get()
+    )
+    cdef const string_scalar* c_narep = <const string_scalar*>(
+        narep.c_obj.get()
+    )
+    with nogil:
+        c_result = move(
+            cpp_combine.join_strings(
+                input.view(),
+                dereference(c_separator),
+                dereference(c_narep),
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column join_list_elements(Column source_strings):
+    """
+    Placeholder for the function declared in combine.pxd.
+
+    Raises
+    ------
+    NotImplementedError
+        Always; no libcudf call is made here yet.
+    """
+    # TODO: bind cpp_combine.join_list_elements and replace this stub.
+    raise NotImplementedError(
+        "join_list_elements is not implemented yet"
+    )