Skip to content

Commit

Permalink
Migrate nvtext jaccard API to pylibcudf (#17007)
Browse files Browse the repository at this point in the history
A part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17007
  • Loading branch information
Matt711 authored Oct 8, 2024
1 parent 553d8ec commit 618a93f
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ nvtext

edit_distance
generate_ngrams
jaccard
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
jaccard
=======

.. automodule:: pylibcudf.nvtext.jaccard
    :members:
33 changes: 8 additions & 25 deletions python/cudf/cudf/_lib/nvtext/jaccard.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,16 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.jaccard cimport (
jaccard_index as cpp_jaccard_index,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column

from pylibcudf import nvtext


@acquire_spill_lock()
def jaccard_index(Column input1, Column input2, int width):
cdef column_view c_input1 = input1.view()
cdef column_view c_input2 = input2.view()
cdef size_type c_width = width
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_jaccard_index(
c_input1,
c_input2,
c_width
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.jaccard.jaccard_index(
input1.to_pylibcudf(mode="read"),
input2.to_pylibcudf(mode="read"),
width,
)
return Column.from_pylibcudf(result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance, generate_ngrams
from . cimport edit_distance, generate_ngrams, jaccard

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance, generate_ngrams
from . import edit_distance, generate_ngrams, jaccard

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
]
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/jaccard.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type


cpdef Column jaccard_index(Column input1, Column input2, size_type width)
47 changes: 47 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/jaccard.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.jaccard cimport (
jaccard_index as cpp_jaccard_index,
)
from pylibcudf.libcudf.types cimport size_type


cpdef Column jaccard_index(Column input1, Column input2, size_type width):
    """
    Compute the Jaccard similarity between corresponding rows of two
    strings columns.

    For details, see :cpp:func:`jaccard_index`

    Parameters
    ----------
    input1 : Column
        Input strings column
    input2 : Column
        Input strings column
    width : size_type
        The ngram number to generate

    Returns
    -------
    Column
        Index calculation values
    """
    cdef unique_ptr[column] owned_result
    cdef column_view lhs_view = input1.view()
    cdef column_view rhs_view = input2.view()

    # Only C-level values are passed to libcudf, so the GIL is released
    # for the duration of the call.
    with nogil:
        owned_result = move(cpp_jaccard_index(lhs_view, rhs_view, width))

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(owned_result))
37 changes: 37 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture(scope="module")
def input_data():
    """Return a pair of equal-length pyarrow string arrays to compare row by row."""
    left_strings = pa.array(
        ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"]
    )
    right_strings = pa.array(
        ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"]
    )
    return left_strings, right_strings


@pytest.mark.parametrize("width", [2, 3])
def test_jaccard_index(input_data, width):
    """Check pylibcudf's jaccard_index against a pure-Python reference.

    The reference splits each string into its set of character n-grams of
    length ``width`` and divides the intersection size by the union size.
    """

    def ngram_set(text):
        # Every contiguous substring of length `width` in `text`.
        return {
            text[start : start + width]
            for start in range(len(text) - width + 1)
        }

    def reference_index(left, right):
        left_grams = ngram_set(left)
        right_grams = ngram_set(right)
        return len(left_grams & right_grams) / len(left_grams | right_grams)

    lhs, rhs = input_data

    expected = pa.array(
        [
            reference_index(a.as_py(), b.as_py())
            for a, b in zip(lhs, rhs)
        ],
        type=pa.float32(),
    )
    got = plc.nvtext.jaccard.jaccard_index(
        plc.interop.from_arrow(lhs), plc.interop.from_arrow(rhs), width
    )

    assert_column_eq(got, expected)

0 comments on commit 618a93f

Please sign in to comment.