From 54f7a3e9d671c16bb778a13816b396e058fb583b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:30:19 -0700 Subject: [PATCH 1/7] Add strings.combine APIs to pylibcudf --- .../api_docs/pylibcudf/strings/combine.rst | 6 + .../api_docs/pylibcudf/strings/index.rst | 1 + python/cudf/cudf/_lib/strings/combine.pyx | 28 +--- .../pylibcudf/libcudf/strings/combine.pxd | 27 ++-- .../pylibcudf/strings/CMakeLists.txt | 4 +- .../pylibcudf/pylibcudf/strings/__init__.pxd | 1 + .../pylibcudf/pylibcudf/strings/__init__.py | 1 + .../pylibcudf/pylibcudf/strings/combine.pxd | 25 ++++ .../pylibcudf/pylibcudf/strings/combine.pyx | 124 ++++++++++++++++++ 9 files changed, 185 insertions(+), 32 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst create mode 100644 python/pylibcudf/pylibcudf/strings/combine.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/combine.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst new file mode 100644 index 00000000000..38a46641200 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst @@ -0,0 +1,6 @@ +======= +combine +======= + +.. automodule:: pylibcudf.strings.combine + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..b54265f3877 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -7,6 +7,7 @@ strings capitalize char_types contains + combine find regex_flags regex_program diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 76cc13db0da..80dacc5fc3e 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -11,7 +11,6 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.strings.combine cimport ( concatenate as cpp_concatenate, join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, output_if_empty_list, separator_on_nulls, ) @@ -21,6 +20,8 @@ from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport table_view_from_columns +import pylibcudf as plc + @acquire_spill_lock() def concatenate(list source_strings, @@ -62,27 +63,12 @@ def join(Column source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_strings( + source_strings.to_pylibcudf(mode="read"), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index e4c9fa5817a..e659993b834 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: - ctypedef enum separator_on_nulls: - YES 'cudf::strings::separator_on_nulls::YES' - NO 'cudf::strings::separator_on_nulls::NO' + cpdef enum class separator_on_nulls(int): + YES + NO - ctypedef enum output_if_empty_list: - EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING' - NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT' + cpdef enum class output_if_empty_list(int): + EMPTY_STRING + NULL_ELEMENT cdef unique_ptr[column] concatenate( - table_view source_strings, + table_view strings_columns, string_scalar separator, - string_scalar narep) except + + string_scalar narep, + separator_on_nulls separate_nulls) except + + + cdef unique_ptr[column] concatenate( + table_view strings_columns, + column_view separators, + string_scalar separator_narep, + string_scalar col_narep, + separator_on_nulls separate_nulls) except + cdef unique_ptr[column] join_strings( - column_view source_strings, + column_view input, string_scalar separator, string_scalar narep) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index b499a127541..7a18f3bbc48 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx slice.pyx +set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx combine.pyx find.pyx + regex_flags.pyx regex_program.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d1f632d6d8e..8f72d7b6dd7 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( capitalize, case, char_types, + combine, contains, find, regex_flags, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ef102aff2af..0776391f87f 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -4,6 +4,7 @@ capitalize, case, char_types, + combine, contains, find, regex_flags, diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd new file mode 100644 index 00000000000..06491b03278 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.combine cimport ( + output_if_empty_list, + separator_on_nulls, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep, + Scalar col_narep, + separator_on_nulls separate_nulls, +) + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep) + +cpdef Column join_list_elements(Column source_strings) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx new file mode 100644 index 00000000000..5d8f39052e7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -0,0 +1,124 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport combine as cpp_combine +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from pylibcudf.libcudf.strings.combine import \ + output_if_empty_list as OutputIfEmptyList # no-cython-lint +from pylibcudf.libcudf.strings.combine import \ + separator_on_nulls as SeparatorOnNulls # no-cython-lint + + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep, + Scalar col_narep, + separator_on_nulls separate_nulls, +): + """ + Concatenates all strings in the column into one new string delimited + by an optional separator string. + + Parameters + ---------- + strings_columns : Table + Strings for this operation + + separator : Column or Scalar + Separator(s) for a given row + + narep : Scalar + String to replace a null separator for a given row. + + col_narep : Scalar + String that should be used in place of any null strings found in any column. + Ignored when separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows. + + Returns + ------- + Column + New column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_narep = ( + narep.c_obj.get() + ) + cdef const string_scalar* c_col_narep = ( + narep.c_obj.get() + ) + if ColumnOrScalar is Column: + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + separator.view(), + dereference(c_narep), + dereference(c_col_narep), + separate_nulls + ) + ) + elif ColumnOrScalar is Scalar: + cdef const string_scalar* c_separator = ( + separator.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + dereference(c_separator), + dereference(c_narep), + separate_nulls + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep): + """ + Concatenates all strings in the column into one new string delimited + by an optional separator string. + + Parameters + ---------- + input : Column + List of strings columns to concatenate + + separator : Scalar + Strings column that provides the separator for a given row + + narep : Scalar + String to replace any null strings found. + + Returns + ------- + Column + New column containing one string + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator = ( + separator.c_obj.get() + ) + cdef const string_scalar* c_narep = ( + narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.join_strings( + input.view(), + c_separator, + c_narep, + ) + ) + + return Column.from_libcudf(move(c_result)) From cb4e5c2323b4141a094fb710830bc44a99eba93a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:13:27 -0700 Subject: [PATCH 2/7] finalize join_list_elements, fix some compiliation --- python/cudf/cudf/_lib/strings/combine.pyx | 106 +++++------------- .../pylibcudf/pylibcudf/strings/combine.pxd | 10 +- .../pylibcudf/pylibcudf/strings/combine.pyx | 92 ++++++++++++++- 3 files changed, 121 insertions(+), 87 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 80dacc5fc3e..6e24a757a2f 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -2,26 +2,12 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.combine cimport ( - concatenate as cpp_concatenate, - join_list_elements as cpp_join_list_elements, - output_if_empty_list, - separator_on_nulls, -) -from pylibcudf.libcudf.table.table_view cimport table_view - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns import pylibcudf as plc +import cudf + @acquire_spill_lock() def concatenate(list source_strings, @@ -32,26 +18,14 @@ def concatenate(list source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_columns(source_strings) - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.concatenate( + plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), + sep.device_value.c_value, + na_rep.device_value.c_value, + cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, + plc.strings.combine.SeparatorOnNulls.YES, ) - - with nogil: - c_result = move(cpp_concatenate( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -82,29 +56,15 @@ def join_lists_with_scalar( between each string in lists and ``/`None` values are replaced by `py_narep` """ - - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + py_separator.device_value.c_value, + py_narep.device_value.c_value, + cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - scalar_separator[0], - scalar_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -121,28 +81,12 @@ def join_lists_with_column( ``/`None` values in `separator_strings` are replaced by `py_separator_narep` """ - - cdef DeviceScalar source_narep = py_source_narep.device_value - cdef DeviceScalar separator_narep = py_separator_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view separator_view = separator_strings.view() - - cdef const string_scalar* scalar_source_narep = \ - (source_narep.get_raw_ptr()) - cdef const string_scalar* scalar_separator_narep = ( - separator_narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + separator_strings.to_pylibcudf(mode="read"), + py_source_narep.device_value.c_value, + py_separator_narep.device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - separator_view, - scalar_separator_narep[0], - scalar_source_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd index 06491b03278..41bff7f92ee 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -22,4 +22,12 @@ cpdef Column concatenate( cpdef Column join_strings(Column input, Scalar separator, Scalar narep) -cpdef Column join_list_elements(Column source_strings) + +cpdef Column join_list_elements( + Column source_strings, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index 5d8f39052e7..ff27cce77ae 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -8,6 +8,7 @@ from pylibcudf.libcudf.strings cimport combine as cpp_combine from pylibcudf.scalar cimport Scalar from pylibcudf.table cimport Table +from cython.operator import dereference from pylibcudf.libcudf.strings.combine import \ output_if_empty_list as OutputIfEmptyList # no-cython-lint from pylibcudf.libcudf.strings.combine import \ @@ -55,6 +56,8 @@ cpdef Column concatenate( cdef const string_scalar* c_col_narep = ( narep.c_obj.get() ) + cdef const string_scalar* c_separator + if ColumnOrScalar is Column: with nogil: c_result = move( @@ -67,9 +70,7 @@ cpdef Column concatenate( ) ) elif ColumnOrScalar is Scalar: - cdef const string_scalar* c_separator = ( - separator.c_obj.get() - ) + c_separator = (separator.c_obj.get()) with nogil: c_result = move( cpp_combine.concatenate( @@ -116,9 +117,90 @@ cpdef Column join_strings(Column input, Scalar separator, Scalar narep): c_result = move( cpp_combine.join_strings( input.view(), - c_separator, - c_narep, + dereference(c_separator), + dereference(c_narep), ) ) return Column.from_libcudf(move(c_result)) + + +cpdef Column join_list_elements( + Column lists_strings_column, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +): + """ + Given a lists column of strings (each row is a list of strings), + concatenates the strings within each row and returns a single strings + column result. + + Parameters + ---------- + lists_strings_column : Column + Column containing lists of strings to concatenate + + separator : Column or Scalar + String(s) that should inserted between each string from each row. + + separator_narep : Scalar + String that should be used to replace a null separator. + + string_narep : Scalar + String to replace null strings in any non-null list row. + Ignored if separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows + if `narep` is valid + + empty_list_policy : OutputIfEmptyList + If set to EMPTY_STRING, any input row that is an empty + list will result in an empty string. Otherwise, it will + result in a null. + + + Returns + ------- + Column + New strings column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator_narep = ( + separator_narep.c_obj.get() + ) + cdef const string_scalar* c_string_narep = ( + string_narep.c_obj.get() + ) + cdef const string_scalar* c_separator + + if ColumnOrScalar is Column: + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + separator.view(), + dereference(c_separator_narep), + dereference(c_string_narep), + separate_nulls, + empty_list_policy, + ) + ) + elif ColumnOrScalar is Scalar: + c_separator = (separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + dereference(c_separator), + dereference(c_separator_narep), + separate_nulls, + empty_list_policy, + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) From 1ab6f5cbaba0239b1b9759899e8fca4a8f021410 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:17:55 -0700 Subject: [PATCH 3/7] Add pylibcudf tests --- .../pylibcudf/libcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/libcudf/strings/combine.pyx | 0 .../pylibcudf/tests/test_string_combine.py | 54 +++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_combine.py diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index bd6e2e0af02..562c33073fc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx) +set(cython_sources char_types.pyx combine.pyx regex_flags.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py new file mode 100644 index 00000000000..7656704c277 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc + + +def test_concatenate(): + arr = pa.array(["a", "b"]) + sep = pa.scalar("") + plc_table = plc.interop.from_arrow(pa.table({"a": arr, "b": arr})) + plc_result = plc.strings.combine.concatenate( + plc_table, + plc.interop.from_arrow(sep), + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar("")), + plc.strings.combine.SeparatorOnNulls.YES, + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array( + pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep) + ) + assert result.equals(expected) + + +def test_join_strings(): + pa_arr = pa.array(list("abc")) + sep = pa.scalar("") + plc_result = plc.strings.combine.join_strings( + plc.interop.from_arrow(pa_arr), + plc.interop.from_arrow(sep), + plc.interop.from_arrow(pa.scalar("")), + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array([["abc"]]) + assert result.equals(expected) + + +def test_join_list_elements(): + pa_arr = pa.array([["a", "a"], ["b", "b"]]) + sep = pa.scalar("") + plc_result = plc.strings.combine.join_list_elements( + plc.interop.from_arrow(pa_arr), + plc.interop.from_arrow(sep), + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar("")), + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array( + pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep) + ) + assert result.equals(expected) From 39d4bcd11de876e9425323fefac786069b9e51e5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:17:31 -0700 Subject: [PATCH 4/7] Swap arguments --- python/cudf/cudf/_lib/strings/combine.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 6e24a757a2f..7b8696c1351 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -59,8 +59,8 @@ def join_lists_with_scalar( plc_column = plc.strings.combine.join_list_elements( source_strings.to_pylibcudf(mode="read"), py_separator.device_value.c_value, - py_narep.device_value.c_value, cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, + py_narep.device_value.c_value, plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -84,8 +84,8 @@ def join_lists_with_column( plc_column = plc.strings.combine.join_list_elements( source_strings.to_pylibcudf(mode="read"), separator_strings.to_pylibcudf(mode="read"), - py_source_narep.device_value.c_value, py_separator_narep.device_value.c_value, + py_source_narep.device_value.c_value, plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) From 0761a20d61a79e0fb2845fb5f68d4127d162fd0b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:06:46 -0700 Subject: [PATCH 5/7] Only one needed swapping --- python/cudf/cudf/_lib/strings/combine.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 7b8696c1351..9a6dc9ac54e 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -59,8 +59,8 @@ def join_lists_with_scalar( plc_column = plc.strings.combine.join_list_elements( source_strings.to_pylibcudf(mode="read"), py_separator.device_value.c_value, - cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, py_narep.device_value.c_value, + cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) From bdb33923f6a93434bb4b0b3625d2734c5d8d9735 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:33:45 -0700 Subject: [PATCH 6/7] Address reviews --- python/cudf/cudf/_lib/strings/combine.pyx | 2 - .../pylibcudf/pylibcudf/strings/combine.pxd | 6 +- .../pylibcudf/pylibcudf/strings/combine.pyx | 35 ++++++--- .../pylibcudf/tests/test_string_combine.py | 75 +++++++++++++------ 4 files changed, 81 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 9a6dc9ac54e..0f7b27d85d7 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -22,8 +22,6 @@ def concatenate(list source_strings, plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), sep.device_value.c_value, na_rep.device_value.c_value, - cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, - plc.strings.combine.SeparatorOnNulls.YES, ) return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd index 41bff7f92ee..ea22f626973 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -15,9 +15,9 @@ ctypedef fused ColumnOrScalar: cpdef Column concatenate( Table strings_columns, ColumnOrScalar separator, - Scalar narep, - Scalar col_narep, - separator_on_nulls separate_nulls, + Scalar narep=*, + Scalar col_narep=*, + separator_on_nulls separate_nulls=*, ) cpdef Column join_strings(Column input, Scalar separator, Scalar narep) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index ff27cce77ae..be9c8c0a63e 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -4,6 +4,9 @@ from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) from pylibcudf.libcudf.strings cimport combine as cpp_combine from pylibcudf.scalar cimport Scalar from pylibcudf.table cimport Table @@ -18,12 +21,12 @@ from pylibcudf.libcudf.strings.combine import \ cpdef Column concatenate( Table strings_columns, ColumnOrScalar separator, - Scalar narep, - Scalar col_narep, - separator_on_nulls separate_nulls, + Scalar narep=None, + Scalar col_narep=None, + separator_on_nulls separate_nulls=separator_on_nulls.YES, ): """ - Concatenates all strings in the column into one new string delimited + Concatenate all strings in the column into one new string delimited by an optional separator string. Parameters @@ -39,7 +42,7 @@ cpdef Column concatenate( col_narep : Scalar String that should be used in place of any null strings found in any column. - Ignored when separator is a Scalar. + An exception is raised when separator is a Scalar. separate_nulls : SeparatorOnNulls If YES, then the separator is included for null rows. @@ -50,15 +53,25 @@ cpdef Column concatenate( New column with concatenated results """ cdef unique_ptr[column] c_result + cdef const string_scalar* c_col_narep + cdef const string_scalar* c_separator + + if narep is None: + narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) cdef const string_scalar* c_narep = ( narep.c_obj.get() ) - cdef const string_scalar* c_col_narep = ( - narep.c_obj.get() - ) - cdef const string_scalar* c_separator if ColumnOrScalar is Column: + if col_narep is None: + col_narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + c_col_narep = ( + col_narep.c_obj.get() + ) with nogil: c_result = move( cpp_combine.concatenate( @@ -70,6 +83,10 @@ cpdef Column concatenate( ) ) elif ColumnOrScalar is Scalar: + if col_narep is not None: + raise ValueError( + "col_narep cannot be specified when separator is a Scalar" + ) c_separator = (separator.c_obj.get()) with nogil: c_result = move( diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py index 7656704c277..4a7007a0d6b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_combine.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -3,43 +3,75 @@ import pyarrow as pa import pyarrow.compute as pc import pylibcudf as plc +import pytest +from utils import assert_column_eq -def test_concatenate(): - arr = pa.array(["a", "b"]) - sep = pa.scalar("") - plc_table = plc.interop.from_arrow(pa.table({"a": arr, "b": arr})) - plc_result = plc.strings.combine.concatenate( +def test_concatenate_scalar_seperator(): + plc_table = plc.interop.from_arrow( + pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]}) + ) + sep = plc.interop.from_arrow(pa.scalar("-")) + result = plc.strings.combine.concatenate( plc_table, - plc.interop.from_arrow(sep), - plc.interop.from_arrow(pa.scalar("")), - plc.interop.from_arrow(pa.scalar("")), - plc.strings.combine.SeparatorOnNulls.YES, + sep, ) - result = plc.interop.to_arrow(plc_result) - expected = pa.chunked_array( - pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep) + expected = pa.array(["a-a", "-b", "c-"]) + assert_column_eq(result, expected) + + result = plc.strings.combine.concatenate( + plc_table, sep, narep=plc.interop.from_arrow(pa.scalar("!")) ) - assert result.equals(expected) + expected = pa.array(["a-a", "!-b", "c-!"]) + assert_column_eq(result, expected) + + with pytest.raises(ValueError): + plc.strings.combine.concatenate( + plc_table, + sep, + narep=plc.interop.from_arrow(pa.scalar("!")), + col_narep=plc.interop.from_arrow(pa.scalar("?")), + ) + + +def test_concatenate_column_seperator(): + plc_table = plc.interop.from_arrow( + pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]}) + ) + sep = plc.interop.from_arrow(pa.array(["-", "?", ","])) + result = plc.strings.combine.concatenate( + plc_table, + sep, + ) + expected = pa.array(["a-a", "?b", "c,"]) + assert_column_eq(result, expected) + + result = plc.strings.combine.concatenate( + plc_table, + plc.interop.from_arrow(pa.array([None, "?", ","])), + narep=plc.interop.from_arrow(pa.scalar("1")), + col_narep=plc.interop.from_arrow(pa.scalar("*")), + ) + expected = pa.array(["a1a", "*?b", "c,*"]) + assert_column_eq(result, expected) def test_join_strings(): pa_arr = pa.array(list("abc")) sep = pa.scalar("") - plc_result = plc.strings.combine.join_strings( + result = plc.strings.combine.join_strings( plc.interop.from_arrow(pa_arr), plc.interop.from_arrow(sep), plc.interop.from_arrow(pa.scalar("")), ) - result = plc.interop.to_arrow(plc_result) - expected = pa.chunked_array([["abc"]]) - assert result.equals(expected) + expected = pa.array(["abc"]) + assert_column_eq(result, expected) def test_join_list_elements(): pa_arr = pa.array([["a", "a"], ["b", "b"]]) sep = pa.scalar("") - plc_result = plc.strings.combine.join_list_elements( + result = plc.strings.combine.join_list_elements( plc.interop.from_arrow(pa_arr), plc.interop.from_arrow(sep), plc.interop.from_arrow(pa.scalar("")), @@ -47,8 +79,5 @@ def test_join_list_elements(): plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - result = plc.interop.to_arrow(plc_result) - expected = pa.chunked_array( - pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep) - ) - assert result.equals(expected) + expected = pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep) + assert_column_eq(result, expected) From c1194d41cb2d541285a49782e06e71a4e961f540 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 16 Oct 2024 18:06:59 -0700 Subject: [PATCH 7/7] address review --- .../source/user_guide/api_docs/pylibcudf/strings/index.rst | 2 +- python/pylibcudf/pylibcudf/strings/combine.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index fab46ec8b8d..c8c0016126d 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,8 +6,8 @@ strings capitalize char_types - contains combine + contains extract find find_multiple diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index be9c8c0a63e..f17d5265ab4 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -26,8 +26,8 @@ cpdef Column concatenate( separator_on_nulls separate_nulls=separator_on_nulls.YES, ): """ - Concatenate all strings in the column into one new string delimited - by an optional separator string. + Concatenate all columns in the table horizontally into one new string + delimited by an optional separator string. Parameters ----------