Add string.contains APIs to pylibcudf (#16814)

Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16814
rapidsai · Sep 19, 2024 · 51c2dd6 · 51c2dd6
1 parent e9b5b53
commit 51c2dd6
Show file tree

Hide file tree

Showing 5 changed files with 199 additions and 69 deletions.
diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx
@@ -1,27 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cython.operator cimport dereference
 from libc.stdint cimport uint32_t
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.contains cimport (
-    count_re as cpp_count_re,
-    like as cpp_like,
-    matches_re as cpp_matches_re,
-)
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
 
 from pylibcudf.strings import contains
 from pylibcudf.strings.regex_program import RegexProgram
@@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags):
     Returns a Column with count of occurrences of `reg_ex` in
     each string of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string reg_ex_string = <string>str(reg_ex).encode()
-    cdef regex_flags c_flags = <regex_flags>flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(reg_ex_string, c_flags))
-        c_result = move(cpp_count_re(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    prog = RegexProgram.create(str(reg_ex), flags)
+    return Column.from_pylibcudf(
+        contains.count_re(source_strings.to_pylibcudf(mode="read"), prog)
+    )
 
 
 @acquire_spill_lock()
@@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags):
     Returns a Column with each value True if the string matches `reg_ex`
     regular expression with each record of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string reg_ex_string = <string>str(reg_ex).encode()
-    cdef regex_flags c_flags = <regex_flags>flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(reg_ex_string, c_flags))
-        c_result = move(cpp_matches_re(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    prog = RegexProgram.create(str(reg_ex), flags)
+    return Column.from_pylibcudf(
+        contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog)
+    )
 
 
 @acquire_spill_lock()
@@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape):
     Returns a Column with each value True if the string matches the
     `py_pattern` like expression with each record of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef DeviceScalar pattern = py_pattern.device_value
-    cdef DeviceScalar escape = py_escape.device_value
-
-    cdef const string_scalar* scalar_ptn = <const string_scalar*>(
-        pattern.get_raw_ptr()
-    )
-    cdef const string_scalar* scalar_esc = <const string_scalar*>(
-        escape.get_raw_ptr()
+    plc_column = contains.like(
+        source_strings.to_pylibcudf(mode="read"),
+        py_pattern.device_value.c_value,
+        py_escape.device_value.c_value,
     )
-
-    with nogil:
-        c_result = move(cpp_like(
-            source_view,
-            scalar_ptn[0],
-            scalar_esc[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
@@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
     cdef unique_ptr[column] like(
         column_view source_strings,
         string_scalar pattern,
-        string_scalar escape) except +
+        string_scalar escape_character) except +
+
+    cdef unique_ptr[column] like(
+        column_view source_strings,
+        column_view patterns,
+        string_scalar escape_character) except +
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd
@@ -1,7 +1,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
 from pylibcudf.strings.regex_program cimport RegexProgram
 
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
 
 cpdef Column contains_re(Column input, RegexProgram prog)
+
+cpdef Column count_re(Column input, RegexProgram prog)
+
+cpdef Column matches_re(Column input, RegexProgram prog)
+
+cpdef Column like(
+    Column input,
+    ColumnOrScalar pattern,
+    Scalar escape_character = *
+)
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx
@@ -1,8 +1,14 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
+from cython.operator import dereference
+
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
 from pylibcudf.libcudf.strings cimport contains as cpp_contains
 from pylibcudf.strings.regex_program cimport RegexProgram
 
@@ -32,9 +38,131 @@ cpdef Column contains_re(
     cdef unique_ptr[column] result
 
     with nogil:
-        result = cpp_contains.contains_re(
+        result = move(cpp_contains.contains_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column count_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns the number of times the given regex_program's pattern
+    matches in each string.
+
+    For details, see :cpp:func:`cudf::strings::count_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of match counts for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.count_re(
             input.view(),
             prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column matches_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns a boolean column identifying rows which
+    match the given regex_program object but only at
+    the beginning of the string.
+
+    For details, see :cpp:func:`cudf::strings::matches_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.matches_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None):
+    """
+    Returns a boolean column identifying rows which
+    match the given like pattern.
+
+    For details, see :cpp:func:`cudf::strings::like`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    pattern : Column or Scalar
+        Like patterns to match within each string
+    escape_character : Scalar
+        Optional character that specifies the escape prefix.
+        Default is no escape character.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+    cdef unique_ptr[column] result
+
+    if escape_character is None:
+        escape_character = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
         )
 
+    cdef const string_scalar* c_escape_character = <const string_scalar*>(
+        escape_character.c_obj.get()
+    )
+    cdef const string_scalar* c_pattern
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                pattern.view(),
+                dereference(c_escape_character)
+            ))
+    elif ColumnOrScalar is Scalar:
+        c_pattern = <const string_scalar*>(pattern.c_obj.get())
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                dereference(c_pattern),
+                dereference(c_escape_character)
+            ))
+    else:
+        raise ValueError("pattern must be a Column or a Scalar")
+
     return Column.from_libcudf(move(result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat):
         pa_target_col, pa_target_scalar.as_py()
     )
     assert_column_eq(got, expected)
+
+
+def test_count_re():
+    pattern = "[1-9][a-z]"
+    arr = pa.array(["A1a2A3a4", "A1A2A3", None])
+    result = plc.strings.contains.count_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    expected = pc.count_substring_regex(arr, pattern)
+    assert_column_eq(result, expected)
+
+
+def test_match_re():
+    pattern = "[1-9][a-z]"
+    arr = pa.array(["1a2b", "b1a2", None])
+    result = plc.strings.contains.matches_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    expected = pc.match_substring_regex(arr, f"^{pattern}")
+    assert_column_eq(result, expected)
+
+
+def test_like():
+    pattern = "%a"
+    arr = pa.array(["1a2aa3aaa"])
+    result = plc.strings.contains.like(
+        plc.interop.from_arrow(arr),
+        plc.interop.from_arrow(pa.array([pattern])),
+    )
+    expected = pc.match_like(arr, pattern)
+    assert_column_eq(result, expected)