Skip to content

Commit

Permalink
Add string.convert.convert_lists APIs to pylibcudf (#16997)
Browse files Browse the repository at this point in the history
Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16997
  • Loading branch information
mroeschke authored Oct 10, 2024
1 parent 7173b52 commit 69b0f66
Show file tree
Hide file tree
Showing 19 changed files with 187 additions and 32 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
================
convert_booleans
================

.. automodule:: pylibcudf.strings.convert.convert_booleans
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
================
convert_datetime
================

.. automodule:: pylibcudf.strings.convert.convert_datetime
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=================
convert_durations
=================

.. automodule:: pylibcudf.strings.convert.convert_durations
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
===================
convert_fixed_point
===================

.. automodule:: pylibcudf.strings.convert.convert_fixed_point
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
==============
convert_floats
==============

.. automodule:: pylibcudf.strings.convert.convert_floats
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
============
convert_ipv4
============

.. automodule:: pylibcudf.strings.convert.convert_ipv4
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=============
convert_lists
=============

.. automodule:: pylibcudf.strings.convert.convert_lists
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
============
convert_urls
============

.. automodule:: pylibcudf.strings.convert.convert_urls
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
convert
=======

.. toctree::
    :maxdepth: 1

    convert_booleans
    convert_datetime
    convert_durations
    convert_fixed_point
    convert_floats
    convert_ipv4
    convert_lists
    convert_urls
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,9 @@ strings
split
strip
wrap

.. toctree::
    :maxdepth: 2
    :caption: Subpackages

    convert/index.rst
32 changes: 6 additions & 26 deletions python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
Original file line number Diff line number Diff line change
@@ -1,23 +1,13 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
import pylibcudf as plc

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.convert.convert_lists cimport (
format_list_column as cpp_format_list_column,
)

from cudf._lib.column cimport Column

from cudf._lib.scalar import as_device_scalar

from cudf._lib.scalar cimport DeviceScalar


@acquire_spill_lock()
def format_list_column(Column source_list, Column separators):
Expand All @@ -34,19 +24,9 @@ def format_list_column(Column source_list, Column separators):
-------
Formatted strings column
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_list.view()
cdef column_view separators_view = separators.view()
# Use 'None' as null-replacement string
cdef DeviceScalar str_na_rep = as_device_scalar("None")
cdef const string_scalar* string_scalar_na_rep = <const string_scalar*>(
str_na_rep.get_raw_ptr())

with nogil:
c_result = move(cpp_format_list_column(
source_view, string_scalar_na_rep[0], separators_view
))

return Column.from_unique_ptr(
move(c_result)
plc_column = plc.strings.convert.convert_lists.format_list_column(
source_list.to_pylibcudf(mode="read"),
as_device_scalar("None").c_value,
separators.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(plc_column)
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \
"cudf::strings" nogil:

cdef unique_ptr[column] format_list_column(
column_view input_col,
column_view input,
string_scalar na_rep,
column_view separators) except +
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
# the License.
# =============================================================================

set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx
convert_fixed_point.pyx convert_floats.pyx convert_ipv4.pyx convert_urls.pyx
set(cython_sources
convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx
convert_floats.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ from . cimport (
convert_fixed_point,
convert_floats,
convert_ipv4,
convert_lists,
convert_urls,
)
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@
convert_fixed_point,
convert_floats,
convert_ipv4,
convert_lists,
convert_urls,
)
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ cpdef Column to_fixed_point(Column input, DataType output_type):
Returns a new fixed-point column parsing decimal values from the
provided strings column.
For details, see :cpp:details:`cudf::strings::to_fixed_point`
For details, see :cpp:func:`cudf::strings::to_fixed_point`
Parameters
----------
Expand Down Expand Up @@ -47,7 +47,7 @@ cpdef Column from_fixed_point(Column input):
Returns a new strings column converting the fixed-point values
into a strings column.
For details, see :cpp:details:`cudf::strings::from_fixed_point`
For details, see :cpp:func:`cudf::strings::from_fixed_point`
Parameters
----------
Expand Down Expand Up @@ -75,7 +75,7 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None):
Returns a boolean column identifying strings in which all
characters are valid for conversion to fixed-point.
For details, see :cpp:details:`cudf::strings::is_fixed_point`
For details, see :cpp:func:`cudf::strings::is_fixed_point`
Parameters
----------
Expand Down
11 changes: 11 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.scalar cimport Scalar


# Declaration of ``format_list_column`` so other Cython modules can cimport it.
# ``=*`` marks ``na_rep`` and ``separators`` as optional; their default values
# are supplied by the matching definition in convert_lists.pyx.
cpdef Column format_list_column(
Column input,
Scalar na_rep=*,
Column separators=*
)
72 changes: 72 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.column_factories cimport make_empty_column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from pylibcudf.libcudf.strings.convert cimport (
convert_lists as cpp_convert_lists,
)
from pylibcudf.scalar cimport Scalar
from pylibcudf.types cimport type_id

from cython.operator import dereference


cpdef Column format_list_column(
    Column input,
    Scalar na_rep=None,
    Column separators=None
):
    """
    Convert a list column of strings into a formatted strings column.

    For details, see :cpp:func:`cudf::strings::format_list_column`

    Parameters
    ----------
    input : Column
        Lists column to format
    na_rep : Scalar
        Replacement string for null elements.
        Default, empty string
    separators : Column
        Strings to use for enclosing list components and separating elements.
        Default, ``,``, ``[``, ``]``

    Returns
    -------
    Column
        New strings column
    """
    cdef unique_ptr[column] c_result

    if na_rep is None:
        # No replacement supplied: fall back to an empty string scalar.
        na_rep = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )

    # The libcudf entry point takes the replacement as a ``string_scalar``,
    # so view the scalar held by ``na_rep.c_obj`` through that type.
    cdef const string_scalar* c_na_rep = <const string_scalar*>(
        na_rep.c_obj.get()
    )

    if separators is None:
        # An empty strings column stands in for "no separators supplied".
        separators = make_empty_column(type_id.STRING)

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        c_result = move(
            cpp_convert_lists.format_list_column(
                input.view(),
                dereference(c_na_rep),
                separators.view()
            )
        )

    return Column.from_libcudf(move(c_result))
21 changes: 21 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.mark.parametrize("na_rep", [None, pa.scalar("")])
@pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])])
def test_format_list_column(na_rep, separators):
    """Check the formatted output for explicit and omitted optional arguments.

    Each optional argument is passed either as ``None`` or as the pylibcudf
    conversion of the given pyarrow value; every combination is expected to
    produce the same strings.
    """
    # Convert only the arguments that were actually supplied.
    plc_na_rep = None if na_rep is None else plc.interop.from_arrow(na_rep)
    plc_separators = (
        None if separators is None else plc.interop.from_arrow(separators)
    )

    # One populated list row and one null row.
    source = pa.array([["1", "A"], None])
    got = plc.strings.convert.convert_lists.format_list_column(
        plc.interop.from_arrow(source),
        plc_na_rep,
        plc_separators,
    )

    assert_column_eq(got, pa.array(["[1,A]", ""]))

0 comments on commit 69b0f66

Please sign in to comment.