From e9b5b538d515219ea36ec62f31ff78424e1fcf89 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 19 Sep 2024 07:36:55 -1000 Subject: [PATCH] Add string.repeats API to pylibcudf (#16834) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16834 --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/repeat.rst | 6 +++ python/cudf/cudf/_lib/strings/repeat.pyx | 40 +++++---------- .../pylibcudf/libcudf/strings/repeat.pxd | 8 +-- .../pylibcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/strings/__init__.py | 1 + python/pylibcudf/pylibcudf/strings/repeat.pxd | 10 ++++ python/pylibcudf/pylibcudf/strings/repeat.pyx | 51 +++++++++++++++++++ .../pylibcudf/tests/test_string_repeat.py | 20 ++++++++ 9 files changed, 106 insertions(+), 33 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst create mode 100644 python/pylibcudf/pylibcudf/strings/repeat.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/repeat.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_repeat.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..1200ecba5d9 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -10,5 +10,6 @@ strings find regex_flags regex_program + repeat replace slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst new file mode 100644 index 00000000000..0041fe4c3da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst @@ -0,0 +1,6 @@ +====== +repeat +====== + +.. automodule:: pylibcudf.strings.repeat + :members: diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx index 42fcfa5d94e..43649d4defe 100644 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -1,17 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def repeat_scalar(Column source_strings, @@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repeats_view = repeats.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index 410ff58f299..59262820411 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -10,9 +10,9 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] repeat_strings( - column_view strings, - size_type repeat) except + + column_view input, + size_type repeat_times) except + cdef unique_ptr[column] repeat_strings( - column_view strings, - column_view repeats) except + + column_view input, + column_view repeat_times) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index b499a127541..457e462e3cf 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx slice.pyx + regex_program.pyx repeat.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ef102aff2af..250cefedf55 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -8,6 +8,7 @@ find, regex_flags, regex_program, + repeat, replace, slice, ) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd new file mode 100644 index 00000000000..bc70926b6fa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnorSizeType: + Column + size_type + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx new file mode 100644 index 00000000000..5f627218f6e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport repeat as cpp_repeat +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): + """ + Repeat each string in the given strings column by the numbers + of times given in another numeric column. + + For details, see :cpp:func:`cudf::strings::repeat`. + + Parameters + ---------- + input : Column + The column containing strings to repeat. + repeat_times : Column or int + Number(s) of times that the corresponding input strings + for each row are repeated. + + Returns + ------- + Column + New column containing the repeated strings. + """ + cdef unique_ptr[column] c_result + + if ColumnorSizeType is Column: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() + ) + ) + elif ColumnorSizeType is size_type: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times + ) + ) + else: + raise ValueError("repeat_times must be size_type or integer") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py new file mode 100644 index 00000000000..18b5d8bf4d0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest + + +@pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) +def test_repeat_strings(repeats): + arr = pa.array(["1", None]) + plc_result = plc.strings.repeat.repeat_strings( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(repeats) + if not isinstance(repeats, int) + else repeats, + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.binary_repeat(arr, repeats)) + assert result.equals(expected)