diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 58303356336..3a79c869971 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -9,3 +9,4 @@ nvtext jaccard minhash ngrams_tokenize + normalize diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst new file mode 100644 index 00000000000..e496f6a45da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst @@ -0,0 +1,6 @@ +========= +normalize +========= + +.. automodule:: pylibcudf.nvtext.normalize + :members: diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index 6521116eafe..c125d92a24e 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -14,10 +14,11 @@ def ngrams_tokenize( object py_delimiter, object py_separator ): - result = nvtext.ngrams_tokenize.ngrams_tokenize( - input.to_pylibcudf(mode="read"), - ngrams, - py_delimiter.device_value.c_value, - py_separator.device_value.c_value + return Column.from_pylibcudf( + nvtext.ngrams_tokenize.ngrams_tokenize( + input.to_pylibcudf(mode="read"), + ngrams, + py_delimiter.device_value.c_value, + py_separator.device_value.c_value + ) ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 5e86a9ce959..633bc902db1 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -3,36 +3,24 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from 
pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) from cudf._lib.column cimport Column - -@acquire_spill_lock() -def normalize_spaces(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_normalize_spaces(c_strings)) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def normalize_characters(Column strings, bool do_lower=True): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result +def normalize_spaces(Column input): + result = nvtext.normalize.normalize_spaces( + input.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move(cpp_normalize_characters(c_strings, do_lower)) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def normalize_characters(Column input, bool do_lower=True): + result = nvtext.normalize.normalize_characters( + input.to_pylibcudf(mode="read"), + do_lower, + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 94df9bbbebb..e01ca3fbdd3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx + ngrams_tokenize.pyx normalize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index b6659827688..08dbec84090 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd 
@@ -6,6 +6,7 @@ from . cimport ( jaccard, minhash, ngrams_tokenize, + normalize, ) __all__ = [ @@ -13,5 +14,6 @@ __all__ = [ "generate_ngrams", "jaccard", "minhash", - "ngrams_tokenize" + "ngrams_tokenize", + "normalize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index f74633a3521..6dccf3dd9cf 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,6 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import edit_distance, generate_ngrams, jaccard, minhash, ngrams_tokenize +from . import ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, +) __all__ = [ "edit_distance", @@ -8,4 +15,5 @@ "jaccard", "minhash", "ngrams_tokenize", + "normalize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd new file mode 100644 index 00000000000..90676145afa --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column + + +cpdef Column normalize_spaces(Column input) + +cpdef Column normalize_characters(Column input, bool do_lower_case) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx new file mode 100644 index 00000000000..637d900b659 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.normalize cimport ( + normalize_characters as cpp_normalize_characters, + normalize_spaces as cpp_normalize_spaces, +) + + +cpdef Column normalize_spaces(Column input): + """ + Returns a new strings column by normalizing the whitespace in + each string in the input column. + + For details, see :cpp:func:`normalize_spaces` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New strings column of normalized strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_spaces(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, bool do_lower_case): + """ + Normalizes strings characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + do_lower_case : bool + If true, upper-case characters are converted to lower-case + and accents are stripped from those characters. If false, + accented and upper-case characters are not transformed. + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_characters(input.view(), do_lower_case) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py new file mode 100644 index 00000000000..fe28b83c09a --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def norm_spaces_input_data(): + arr = ["a b", " c d\n", "e \t f "] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def norm_chars_input_data(): + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + return pa.array(arr) + + +def test_normalize_spaces(norm_spaces_input_data): + result = plc.nvtext.normalize.normalize_spaces( + plc.interop.from_arrow(norm_spaces_input_data) + ) + expected = pa.array(["a b", "c d", "e f"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalize_characters(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + do_lower, + ) + expected = pa.array( + ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + ) + if not do_lower: + expected = pa.array( + ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + ) + assert_column_eq(result, expected)