From 51c2dd6f05f9c9d07f6e07b0119906e1ea32fc2d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 19 Sep 2024 07:38:48 -1000 Subject: [PATCH] Add string.contains APIs to pylibcudf (#16814) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16814 --- python/cudf/cudf/_lib/strings/contains.pyx | 80 ++--------- .../pylibcudf/libcudf/strings/contains.pxd | 7 +- .../pylibcudf/pylibcudf/strings/contains.pxd | 14 ++ .../pylibcudf/pylibcudf/strings/contains.pyx | 130 +++++++++++++++++- .../pylibcudf/tests/test_string_contains.py | 37 +++++ 5 files changed, 199 insertions(+), 69 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 82f5e06c547..03b4887f200 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,27 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.contains cimport ( - count_re as cpp_count_re, - like as cpp_like, - matches_re as cpp_matches_re, -) -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar from pylibcudf.strings import contains from pylibcudf.strings.regex_program import RegexProgram @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with count of occurrences of `reg_ex` in each string of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_count_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with each value True if the string matches `reg_ex` regular expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_matches_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape): Returns a Column with each value True if the string matches the `py_pattern` like expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef DeviceScalar pattern = py_pattern.device_value - cdef DeviceScalar escape = py_escape.device_value - - cdef const string_scalar* scalar_ptn = ( - pattern.get_raw_ptr() - ) - cdef const string_scalar* scalar_esc = ( - escape.get_raw_ptr() + plc_column = contains.like( + source_strings.to_pylibcudf(mode="read"), + py_pattern.device_value.c_value, + py_escape.device_value.c_value, ) - - with nogil: - c_result = move(cpp_like( - source_view, - scalar_ptn[0], - scalar_esc[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd index c2fb5f0dce4..eac0f748257 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd @@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] like( column_view source_strings, string_scalar pattern, - string_scalar escape) except + + string_scalar escape_character) except + + + cdef unique_ptr[column] like( + column_view source_strings, + column_view patterns, + string_scalar escape_character) except + diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd index 2cd4891a0ea..6146a1119d6 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/strings/contains.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_program cimport RegexProgram +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Column contains_re(Column input, RegexProgram prog) + +cpdef Column count_re(Column input, RegexProgram prog) + +cpdef Column matches_re(Column input, RegexProgram prog) + +cpdef Column like( + Column input, + ColumnOrScalar pattern, + Scalar escape_character = * +) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 1a2446f6e2c..82bd1fbea32 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -1,8 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cython.operator import dereference + from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram @@ -32,9 +38,131 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = cpp_contains.contains_re( + result = move(cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column count_re( + Column input, + RegexProgram prog +): + """Returns the number of times the given regex_program's pattern + matches in each string. + + For details, see :cpp:func:`cudf::strings::count_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of match counts for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.count_re( input.view(), prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column matches_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which + matching the given regex_program object but only at + the beginning the string. + + For details, see :cpp:func:`cudf::strings::matches_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.matches_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None): + """ + Returns a boolean column identifying rows which + match the given like pattern. + + For details, see :cpp:func:`cudf::strings::like`. + + Parameters + ---------- + input : Column + The input strings + pattern : Column or Scalar + Like patterns to match within each string + escape_character : Scalar + Optional character specifies the escape prefix. + Default is no escape character. + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + cdef unique_ptr[column] result + + if escape_character is None: + escape_character = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) ) + cdef const string_scalar* c_escape_character = ( + escape_character.c_obj.get() + ) + cdef const string_scalar* c_pattern + + if ColumnOrScalar is Column: + with nogil: + result = move(cpp_contains.like( + input.view(), + pattern.view(), + dereference(c_escape_character) + )) + elif ColumnOrScalar is Scalar: + c_pattern = (pattern.c_obj.get()) + with nogil: + result = move(cpp_contains.like( + input.view(), + dereference(c_pattern), + dereference(c_escape_character) + )) + else: + raise ValueError("pattern must be a Column or a Scalar") + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4f88e09183f..4e4dd7cbb00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat): pa_target_col, pa_target_scalar.as_py() ) assert_column_eq(got, expected) + + +def test_count_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["A1a2A3a4", "A1A2A3", None]) + result = plc.strings.contains.count_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.count_substring_regex(arr, pattern) + assert_column_eq(result, expected) + + +def test_match_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["1a2b", "b1a2", None]) + result = plc.strings.contains.matches_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.match_substring_regex(arr, f"^{pattern}") + assert_column_eq(result, expected) + + +def test_like(): + pattern = "%a" + arr = pa.array(["1a2aa3aaa"]) + result = plc.strings.contains.like( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(pa.array([pattern])), + ) + expected = pc.match_like(arr, pattern) + assert_column_eq(result, expected)