Skip to content

Commit

Permalink
Migrate nvtext generate_ngrams APIs to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 committed Oct 6, 2024
1 parent fcff2b6 commit 7a0e19f
Show file tree
Hide file tree
Showing 9 changed files with 209 additions and 60 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
===============
generate_ngrams
===============

.. automodule:: pylibcudf.nvtext.generate_ngrams
    :members:
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ nvtext
:maxdepth: 1

edit_distance
generate_ngrams
76 changes: 19 additions & 57 deletions python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,75 +2,37 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
generate_character_ngrams as cpp_generate_character_ngrams,
generate_ngrams as cpp_generate_ngrams,
hash_character_ngrams as cpp_hash_character_ngrams,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar as plc_Scalar

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf import nvtext


@acquire_spill_lock()
def generate_ngrams(Column strings, int ngrams, object py_separator):

cdef DeviceScalar separator = py_separator.device_value

cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef const string_scalar* c_separator = <const string_scalar*>separator\
.get_raw_ptr()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_generate_ngrams(
c_strings,
c_ngrams,
c_separator[0]
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.generate_ngrams.generate_ngrams(
strings.to_pylibcudf(mode="read"),
<size_type> ngrams,
<plc_Scalar> py_separator.device_value.c_value
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def generate_character_ngrams(Column strings, int ngrams):
cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_generate_character_ngrams(
c_strings,
c_ngrams
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.generate_ngrams.generate_character_ngrams(
strings.to_pylibcudf(mode="read"),
<size_type> ngrams
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def hash_character_ngrams(Column strings, int ngrams):
cdef column_view c_strings = strings.view()
cdef size_type c_ngrams = ngrams
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_hash_character_ngrams(
c_strings,
c_ngrams
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.generate_ngrams.hash_character_ngrams(
strings.to_pylibcudf(mode="read"),
<size_type> ngrams
)
return Column.from_pylibcudf(result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance
from . cimport edit_distance, generate_ngrams

__all__ = [
"edit_distance",
"generate_ngrams",
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance
from . import edit_distance, generate_ngrams

__all__ = [
"edit_distance",
"generate_ngrams",
]
12 changes: 12 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


# Cython-level declarations of the nvtext ngram entry points so that other
# Cython modules can cimport them. Each function takes a strings Column and
# an ngram size and returns a new Column. A trailing ``=*`` marks the
# argument as optional here; the default value itself is supplied by the
# matching definition in generate_ngrams.pyx.
cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator)

cpdef Column generate_character_ngrams(Column input, size_type ngrams=*)

cpdef Column hash_character_ngrams(Column input, size_type ngrams=*)
111 changes: 111 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
generate_character_ngrams as cpp_generate_character_ngrams,
generate_ngrams as cpp_generate_ngrams,
hash_character_ngrams as cpp_hash_character_ngrams,
)
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar


cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator):
    """
    Returns a single column of strings by generating ngrams from a strings column.

    For details, see :cpp:func:`generate_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type
        The ngram number to generate
    separator : Scalar
        The string to use for separating ngram tokens

    Returns
    -------
    Column
        New strings column of tokens
    """
    cdef column_view c_strings = input.view()
    # NOTE(review): this is an unchecked C-level cast of the scalar's
    # underlying pointer; presumably callers must pass a string Scalar --
    # confirm whether a type check belongs here.
    cdef const string_scalar* c_separator = <const string_scalar*>separator.c_obj.get()
    cdef unique_ptr[column] c_result

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        c_result = move(
            cpp_generate_ngrams(
                c_strings,
                ngrams,
                c_separator[0]
            )
        )
    return Column.from_libcudf(move(c_result))


cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2):
    """
    Returns a lists column of ngrams of characters within each string.

    For details, see :cpp:func:`generate_character_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type, default 2
        The ngram number to generate

    Returns
    -------
    Column
        Lists column of strings
    """
    cdef column_view c_strings = input.view()
    cdef unique_ptr[column] c_result

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        c_result = move(
            cpp_generate_character_ngrams(
                c_strings,
                ngrams,
            )
        )
    return Column.from_libcudf(move(c_result))

cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
    """
    Returns a lists column of hash values of the characters in each string.

    For details, see :cpp:func:`hash_character_ngrams`

    Parameters
    ----------
    input : Column
        Input strings
    ngrams : size_type, default 2
        The ngram number to generate

    Returns
    -------
    Column
        Lists column of hash values
    """
    cdef column_view c_strings = input.view()
    cdef unique_ptr[column] c_result

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        c_result = move(
            cpp_hash_character_ngrams(
                c_strings,
                ngrams,
            )
        )
    return Column.from_libcudf(move(c_result))
55 changes: 55 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_col():
    """Module-scoped pyarrow strings array shared by the ngram tests."""
    return pa.array(["ab", "cde", "fgh"])


@pytest.mark.parametrize("ngram", [2, 3])
@pytest.mark.parametrize("sep", ["_", "**", ","])
def test_generate_ngrams(input_col, ngram, sep):
    """Token ngrams of the input strings are joined with ``sep``."""
    plc_strings = plc.interop.from_arrow(input_col)
    plc_separator = plc.interop.from_arrow(pa.scalar(sep))
    result = plc.nvtext.generate_ngrams.generate_ngrams(
        plc_strings,
        ngram,
        plc_separator,
    )

    # Three input strings give two bigrams or a single trigram.
    if ngram == 3:
        expected = pa.array([f"ab{sep}cde{sep}fgh"])
    else:
        expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"])
    assert_column_eq(result, expected)


@pytest.mark.parametrize("ngram", [2, 3])
def test_generate_character_ngrams(input_col, ngram):
    """Each string yields a list of its character ngrams."""
    plc_strings = plc.interop.from_arrow(input_col)
    result = plc.nvtext.generate_ngrams.generate_character_ngrams(
        plc_strings,
        ngram,
    )

    # "ab" is too short for a trigram, so its row is an empty list.
    if ngram == 3:
        expected = pa.array([[], ["cde"], ["fgh"]])
    else:
        expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]])
    assert_column_eq(result, expected)


@pytest.mark.parametrize("ngram", [2, 3])
def test_hash_character_ngrams(input_col, ngram):
    """Each row holds one hash per character ngram of the input string."""
    plc_strings = plc.interop.from_arrow(input_col)
    result = plc.nvtext.generate_ngrams.hash_character_ngrams(
        plc_strings,
        ngram,
    )
    pa_result = plc.interop.to_arrow(result)

    # Hash values are not pinned here; only the per-row counts are checked.
    expected_lengths = (1, 2, 2) if ngram == 2 else (0, 1, 1)
    for row, expected_len in enumerate(expected_lengths):
        assert len(pa_result[row]) == expected_len

0 comments on commit 7a0e19f

Please sign in to comment.