Skip to content

Commit

Permalink
[WIP] Migrate nvtext/edit_distance APIs to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 committed Sep 30, 2024
1 parent 9b2f892 commit 86e1f5f
Show file tree
Hide file tree
Showing 13 changed files with 161 additions and 25 deletions.
2 changes: 1 addition & 1 deletion cpp/include/nvtext/edit_distance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext {
* @param targets Strings to compute edit distance against `input`
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings columns of with replaced strings
* @return New INT32 column of edit distance values
*/
std::unique_ptr<cudf::column> edit_distance(
cudf::strings_column_view const& input,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=============
edit_distance
=============

.. automodule:: pylibcudf.nvtext.edit_distance
    :members:
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
nvtext
======

.. toctree::
    :maxdepth: 1

    edit_distance
34 changes: 10 additions & 24 deletions python/cudf/cudf/_lib/nvtext/edit_distance.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,23 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.edit_distance cimport (
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix,
)
from pylibcudf cimport nvtext

from cudf._lib.column cimport Column


@acquire_spill_lock()
def edit_distance(Column strings, Column targets):
    """
    Returns the edit distance between individual strings in two strings columns

    Thin wrapper that delegates to
    ``pylibcudf.nvtext.edit_distance.edit_distance``.

    Parameters
    ----------
    strings : Column
        Input strings
    targets : Column
        Strings to compute edit distance against

    Returns
    -------
    Column
        New column of edit distance values
    """
    # Delegate to pylibcudf instead of calling libcudf directly; the stale
    # column_view / unique_ptr path that used to precede this call returned
    # early and made the delegation unreachable.
    result = nvtext.edit_distance.edit_distance(
        strings.to_pylibcudf(mode="read"),
        targets.to_pylibcudf(mode="read")
    )
    return Column.from_pylibcudf(result)


@acquire_spill_lock()
def edit_distance_matrix(Column strings):
    """
    Returns the edit distance between all strings in the input strings column

    Thin wrapper that delegates to
    ``pylibcudf.nvtext.edit_distance.edit_distance_matrix``.

    Parameters
    ----------
    strings : Column
        Input strings

    Returns
    -------
    Column
        New column of edit distance values
    """
    # Delegate to pylibcudf instead of calling libcudf directly; the stale
    # column_view / unique_ptr path that used to precede this call returned
    # early and made the delegation unreachable.
    result = nvtext.edit_distance.edit_distance_matrix(
        strings.to_pylibcudf(mode="read")
    )
    return Column.from_pylibcudf(result)
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow)
add_subdirectory(libcudf)
add_subdirectory(strings)
add_subdirectory(io)
add_subdirectory(nvtext)
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from . cimport (
lists,
merge,
null_mask,
nvtext,
partitioning,
quantiles,
reduce,
Expand Down Expand Up @@ -78,4 +79,5 @@ __all__ = [
"transpose",
"types",
"unary",
"nvtext",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
lists,
merge,
null_mask,
nvtext,
partitioning,
quantiles,
reduce,
Expand Down Expand Up @@ -92,4 +93,5 @@
"transpose",
"types",
"unary",
"nvtext",
]
22 changes: 22 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Cython sources of the nvtext subpackage that are built into extension modules.
set(cython_sources edit_distance.pyx)

# Libraries each generated module is linked against.
set(linked_libraries cudf::cudf)
# Create one C++ (CXX) extension module per source file. MODULE_PREFIX is
# presumably what keeps the generated target names unique across pylibcudf
# subpackages -- confirm against the rapids-cmake documentation.
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf
)
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance

__all__ = [
"edit_distance",
]
7 changes: 7 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance

__all__ = [
"edit_distance",
]
8 changes: 8 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column


# Edit distance between individual strings of `input` and `targets`.
cpdef Column edit_distance(Column input, Column targets)

# Edit distance between all strings in `input`.
cpdef Column edit_distance_matrix(Column input)
63 changes: 63 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.edit_distance cimport (
edit_distance as cpp_edit_distance,
edit_distance_matrix as cpp_edit_distance_matrix,
)


cpdef Column edit_distance(Column input, Column targets):
    """
    Compute the edit distance between individual strings in two strings columns.

    For details, see :cpp:func:`edit_distance`

    Parameters
    ----------
    input : Column
        Input strings
    targets : Column
        Strings to compute edit distance against

    Returns
    -------
    Column
        New column of edit distance values
    """
    cdef unique_ptr[column] distances
    cdef column_view input_view = input.view()
    cdef column_view targets_view = targets.view()

    # Only C++ objects are touched here, so the GIL is released for the call.
    with nogil:
        distances = move(cpp_edit_distance(input_view, targets_view))

    return Column.from_libcudf(move(distances))


cpdef Column edit_distance_matrix(Column input):
    """
    Returns the edit distance between all strings in the input strings column

    For details, see :cpp:func:`edit_distance_matrix`

    Parameters
    ----------
    input : Column
        Input strings

    Returns
    -------
    Column
        New lists column of edit distance values; each row holds the
        distances from one input string to every string in ``input``
    """
    cdef column_view c_strings = input.view()
    cdef unique_ptr[column] c_result

    # Only C++ objects are touched here, so the GIL is released for the call.
    with nogil:
        c_result = move(cpp_edit_distance_matrix(c_strings))

    return Column.from_libcudf(move(c_result))
27 changes: 27 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
from utils import assert_column_eq


def test_edit_distance():
    """Check the edit distance between two strings columns, row by row."""
    input_arr = pa.array(["hallo", "goodbye", "world"])
    targets = pa.array(["hello", "", "world"])
    result = plc.nvtext.edit_distance.edit_distance(
        plc.interop.from_arrow(input_arr),
        plc.interop.from_arrow(targets),
    )
    expected = pa.array([1, 7, 0], type=pa.int32())
    # Same (expected, result) argument order as test_edit_distance_matrix.
    assert_column_eq(expected, result)


def test_edit_distance_matrix():
    """Check the edit distances between all strings of a single column."""
    strings = pa.array(["hallo", "goodbye", "world"])
    expected = pa.array(
        [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32())
    )

    plc_strings = plc.interop.from_arrow(strings)
    result = plc.nvtext.edit_distance.edit_distance_matrix(plc_strings)

    assert_column_eq(expected, result)

0 comments on commit 86e1f5f

Please sign in to comment.