From 69b0f661ff2fc4c12bb0fe696e556f6b3224b381 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Oct 2024 08:38:11 -1000 Subject: [PATCH] Add string.convert.convert_lists APIs to pylibcudf (#16997) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16997 --- .../strings/convert/convert_booleans.rst | 6 ++ .../strings/convert/convert_datetime.rst | 6 ++ .../strings/convert/convert_durations.rst | 6 ++ .../strings/convert/convert_fixed_point.rst | 6 ++ .../strings/convert/convert_floats.rst | 6 ++ .../strings/convert/convert_ipv4.rst | 6 ++ .../strings/convert/convert_lists.rst | 6 ++ .../strings/convert/convert_urls.rst | 6 ++ .../pylibcudf/strings/convert/index.rst | 14 ++++ .../api_docs/pylibcudf/strings/index.rst | 6 ++ .../_lib/strings/convert/convert_lists.pyx | 32 ++------- .../libcudf/strings/convert/convert_lists.pxd | 2 +- .../pylibcudf/strings/convert/CMakeLists.txt | 5 +- .../pylibcudf/strings/convert/__init__.pxd | 1 + .../pylibcudf/strings/convert/__init__.py | 1 + .../strings/convert/convert_fixed_point.pyx | 6 +- .../strings/convert/convert_lists.pxd | 11 +++ .../strings/convert/convert_lists.pyx | 72 +++++++++++++++++++ .../tests/test_string_convert_lists.py | 21 ++++++ 19 files changed, 187 insertions(+), 32 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst create mode 100644 
docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst new file mode 100644 index 00000000000..de62221456f --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst @@ -0,0 +1,6 @@ +================ +convert_booleans +================ + +.. automodule:: pylibcudf.strings.convert.convert_booleans + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst new file mode 100644 index 00000000000..fc5d5204ab3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst @@ -0,0 +1,6 @@ +================ +convert_datetime +================ + +.. 
automodule:: pylibcudf.strings.convert.convert_datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst new file mode 100644 index 00000000000..e80b0c15a61 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst @@ -0,0 +1,6 @@ +================= +convert_durations +================= + +.. automodule:: pylibcudf.strings.convert.convert_durations + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst new file mode 100644 index 00000000000..16d971a6849 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst @@ -0,0 +1,6 @@ +=================== +convert_fixed_point +=================== + +.. automodule:: pylibcudf.strings.convert.convert_fixed_point + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst new file mode 100644 index 00000000000..9ae4004cea9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst @@ -0,0 +1,6 @@ +============== +convert_floats +============== + +.. automodule:: pylibcudf.strings.convert.convert_floats + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst new file mode 100644 index 00000000000..4ead8677a69 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst @@ -0,0 +1,6 @@ +============ +convert_ipv4 +============ + +.. 
automodule:: pylibcudf.strings.convert.convert_ipv4 + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst new file mode 100644 index 00000000000..33a719a42e1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst @@ -0,0 +1,6 @@ +============= +convert_lists +============= + +.. automodule:: pylibcudf.strings.convert.convert_lists + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst new file mode 100644 index 00000000000..f20d95e0cdd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst @@ -0,0 +1,6 @@ +============ +convert_urls +============ + +.. automodule:: pylibcudf.strings.convert.convert_urls + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst new file mode 100644 index 00000000000..fa05cb7d786 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -0,0 +1,14 @@ +convert +======= + +.. toctree:: + :maxdepth: 1 + + convert_booleans + convert_datetime + convert_durations + convert_fixed_point + convert_floats + convert_ipv4 + convert_lists + convert_urls diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 48dc8a13c3e..65dc5d2d1c3 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -21,3 +21,9 @@ strings split strip wrap + +.. 
toctree:: + :maxdepth: 2 + :caption: Subpackages + + convert/index.rst diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx index 73aebf8ab35..3a2cb4bd5c7 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -1,23 +1,13 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_lists cimport ( - format_list_column as cpp_format_list_column, -) - from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.scalar cimport DeviceScalar - @acquire_spill_lock() def format_list_column(Column source_list, Column separators): @@ -34,19 +24,9 @@ def format_list_column(Column source_list, Column separators): ------- Formatted strings column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_list.view() - cdef column_view separators_view = separators.view() - # Use 'None' as null-replacement string - cdef DeviceScalar str_na_rep = as_device_scalar("None") - cdef const string_scalar* string_scalar_na_rep = <const string_scalar*>( - str_na_rep.get_raw_ptr()) - - with nogil: - c_result = move(cpp_format_list_column( - source_view, string_scalar_na_rep[0], separators_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_lists.format_list_column( + source_list.to_pylibcudf(mode="read"), + as_device_scalar("None").c_value, + separators.to_pylibcudf(mode="read"), ) + return Column.from_pylibcudf(plc_column) diff --git 
a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 109111568d8..6e1ecd30539 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -9,6 +9,6 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] format_list_column( - column_view input_col, + column_view input, string_scalar na_rep, column_view separators) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 7b228c06a18..846070870b1 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. # ============================================================================= -set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx - convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx +set(cython_sources + convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx + convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index be6145384ad..799532d72c6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -6,5 +6,6 @@ from . 
cimport ( convert_fixed_point, convert_floats, convert_ipv4, + convert_lists, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index 7c94387282b..deb2d8ab74b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -6,5 +6,6 @@ convert_fixed_point, convert_floats, convert_ipv4, + convert_lists, convert_urls, ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 40dadf6f967..60a8fca8baf 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -15,7 +15,7 @@ cpdef Column to_fixed_point(Column input, DataType output_type): Returns a new fixed-point column parsing decimal values from the provided strings column. - For details, see :cpp:details:`cudf::strings::to_fixed_point` + For details, see :cpp:func:`cudf::strings::to_fixed_point` Parameters ---------- @@ -47,7 +47,7 @@ cpdef Column from_fixed_point(Column input): Returns a new strings column converting the fixed-point values into a strings column. - For details, see :cpp:details:`cudf::strings::from_fixed_point` + For details, see :cpp:func:`cudf::strings::from_fixed_point` Parameters ---------- @@ -75,7 +75,7 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None): Returns a boolean column identifying strings in which all characters are valid for conversion to fixed-point. 
- For details, see :cpp:details:`cudf::strings::is_fixed_point` + For details, see :cpp:func:`cudf::strings::is_fixed_point` Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..1ba4272afa2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=*, + Column separators=* +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..3fbc08a9ab5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.column_factories cimport make_empty_column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.convert cimport ( + convert_lists as cpp_convert_lists, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.types cimport type_id + +from cython.operator import dereference + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=None, + Column separators=None +): + """ + Convert a list column of strings into a formatted strings column. + + For details, see :cpp:func:`cudf::strings::format_list_column` + + Parameters + ---------- + input : Column + Lists column to format + + na_rep : Scalar + Replacement string for null elements. 
+ Default, empty string + + separators : Column + Strings to use for enclosing list components and separating elements. + Default, ``,``, ``[``, ``]`` + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + + if na_rep is None: + na_rep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* c_na_rep = <const string_scalar*>( + na_rep.c_obj.get() + ) + + if separators is None: + separators = make_empty_column(type_id.STRING) + + with nogil: + c_result = move( + cpp_convert_lists.format_list_column( + input.view(), + dereference(c_na_rep), + separators.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py new file mode 100644 index 00000000000..8591732b39e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) +@pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) +def test_format_list_column(na_rep, separators): + arr = pa.array([["1", "A"], None]) + result = plc.strings.convert.convert_lists.format_list_column( + plc.interop.from_arrow(arr), + na_rep if na_rep is None else plc.interop.from_arrow(na_rep), + separators + if separators is None + else plc.interop.from_arrow(separators), + ) + expected = pa.array(["[1,A]", ""]) + assert_column_eq(result, expected)