Migrate NVText Normalizing APIs to Pylibcudf (#17072)

Part of #15162. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: #17072
rapidsai · Oct 17, 2024 · ce93c36 · ce93c36
1 parent 00feb82
commit ce93c36
Show file tree

Hide file tree

Showing 10 changed files with 155 additions and 34 deletions.
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -9,3 +9,4 @@ nvtext
     jaccard
     minhash
     ngrams_tokenize
+    normalize
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst
@@ -0,0 +1,6 @@
+=========
+normalize
+=========
+
+.. automodule:: pylibcudf.nvtext.normalize
+   :members:
diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
@@ -14,10 +14,11 @@ def ngrams_tokenize(
     object py_delimiter,
     object py_separator
 ):
-    result = nvtext.ngrams_tokenize.ngrams_tokenize(
-        input.to_pylibcudf(mode="read"),
-        ngrams,
-        py_delimiter.device_value.c_value,
-        py_separator.device_value.c_value
+    return Column.from_pylibcudf(
+        nvtext.ngrams_tokenize.ngrams_tokenize(
+            input.to_pylibcudf(mode="read"),
+            ngrams,
+            py_delimiter.device_value.c_value,
+            py_separator.device_value.c_value
+        )
     )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx
@@ -3,36 +3,24 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.normalize cimport (
-    normalize_characters as cpp_normalize_characters,
-    normalize_spaces as cpp_normalize_spaces,
-)
 
 from cudf._lib.column cimport Column
 
-
-@acquire_spill_lock()
-def normalize_spaces(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_normalize_spaces(c_strings))
-
-    return Column.from_unique_ptr(move(c_result))
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
-def normalize_characters(Column strings, bool do_lower=True):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
+def normalize_spaces(Column input):
+    result = nvtext.normalize.normalize_spaces(
+        input.to_pylibcudf(mode="read")
+    )
+    return Column.from_pylibcudf(result)
 
-    with nogil:
-        c_result = move(cpp_normalize_characters(c_strings, do_lower))
 
-    return Column.from_unique_ptr(move(c_result))
+@acquire_spill_lock()
+def normalize_characters(Column input, bool do_lower=True):
+    result = nvtext.normalize.normalize_characters(
+        input.to_pylibcudf(mode="read"),
+        do_lower,
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-                   ngrams_tokenize.pyx
+                   ngrams_tokenize.pyx normalize.pyx
 )
 
 set(linked_libraries cudf::cudf)

diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -6,12 +6,14 @@ from . cimport (
     jaccard,
     minhash,
     ngrams_tokenize,
+    normalize,
 )
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
     "minhash",
-    "ngrams_tokenize"
+    "ngrams_tokenize",
+    "normalize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,11 +1,19 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams, jaccard, minhash, ngrams_tokenize
+from . import (
+    edit_distance,
+    generate_ngrams,
+    jaccard,
+    minhash,
+    ngrams_tokenize,
+    normalize,
+)
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
     "minhash",
     "ngrams_tokenize",
+    "normalize",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from pylibcudf.column cimport Column
+
+
+cpdef Column normalize_spaces(Column input)
+
+cpdef Column normalize_characters(Column input, bool do_lower_case)
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.normalize cimport (
+    normalize_characters as cpp_normalize_characters,
+    normalize_spaces as cpp_normalize_spaces,
+)
+
+
+cpdef Column normalize_spaces(Column input):
+    """
+    Returns a new strings column by normalizing the whitespace in
+    each string in the input column.
+
+    For details, see :cpp:func:`normalize_spaces`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+
+    Returns
+    -------
+    Column
+        New strings column of normalized strings.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_normalize_spaces(input.view())
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column normalize_characters(Column input, bool do_lower_case):
+    """
+    Normalizes the characters in each string for tokenizing.
+
+    For details, see :cpp:func:`normalize_characters`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    do_lower_case : bool
+        If true, upper-case characters are converted to lower-case
+        and accents are stripped from those characters. If false,
+        accented and upper-case characters are not transformed.
+
+    Returns
+    -------
+    Column
+        Normalized strings column
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_normalize_characters(input.view(), do_lower_case)
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def norm_spaces_input_data():
+    arr = ["a b", "  c  d\n", "e \t f "]
+    return pa.array(arr)
+
+
+@pytest.fixture(scope="module")
+def norm_chars_input_data():
+    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+    return pa.array(arr)
+
+
+def test_normalize_spaces(norm_spaces_input_data):
+    result = plc.nvtext.normalize.normalize_spaces(
+        plc.interop.from_arrow(norm_spaces_input_data)
+    )
+    expected = pa.array(["a b", "c d", "e f"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalize_characters(norm_chars_input_data, do_lower):
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        do_lower,
+    )
+    expected = pa.array(
+        ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+    )
+    if not do_lower:
+        expected = pa.array(
+            ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+        )
+    assert_column_eq(result, expected)
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,3 +9,4 @@ nvtext @@
         jaccard
         minhash
         ngrams_tokenize
+        normalize