Skip to content

Commit

Permalink
Migrate Min Hashing APIs to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 committed Oct 8, 2024
1 parent 860835d commit 5e5cd03
Show file tree
Hide file tree
Showing 11 changed files with 203 additions and 87 deletions.
99 changes: 26 additions & 73 deletions python/cudf/cudf/_lib/nvtext/minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,93 +2,46 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.minhash cimport (
minhash as cpp_minhash,
minhash64 as cpp_minhash64,
word_minhash as cpp_word_minhash,
word_minhash64 as cpp_word_minhash64,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column

from pylibcudf import nvtext


@acquire_spill_lock()
def minhash(Column strings, Column seeds, int width):
# NOTE(review): this span is a diff hunk whose +/- markers and indentation
# were lost in extraction. Two alternative bodies follow; the first looks
# like the pre-commit code and the second like its replacement -- confirm
# against the repository before treating either as current.

# Body 1 (presumably removed): build column views, call libcudf's
# cpp_minhash with the GIL released, and wrap the owned result.
cdef column_view c_strings = strings.view()
cdef size_type c_width = width
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_minhash(
c_strings,
c_seeds,
c_width
)
)

return Column.from_unique_ptr(move(c_result))
# Body 2 (presumably added): convert both columns to pylibcudf in "read"
# mode, delegate to nvtext.minhash.minhash, and convert the result back.
result = nvtext.minhash.minhash(
strings.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
width,
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def minhash64(Column strings, Column seeds, int width):
# NOTE(review): this span is a diff hunk whose +/- markers and indentation
# were lost in extraction. Two alternative bodies follow; the first looks
# like the pre-commit code and the second like its replacement -- confirm
# against the repository before treating either as current.

# Body 1 (presumably removed): build column views, call libcudf's
# cpp_minhash64 with the GIL released, and wrap the owned result.
cdef column_view c_strings = strings.view()
cdef size_type c_width = width
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_minhash64(
c_strings,
c_seeds,
c_width
)
)

return Column.from_unique_ptr(move(c_result))
# Body 2 (presumably added): convert both columns to pylibcudf in "read"
# mode, delegate to nvtext.minhash.minhash64, and convert the result back.
result = nvtext.minhash.minhash64(
strings.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
width,
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def word_minhash(Column input, Column seeds):
# NOTE(review): this span is a diff hunk whose +/- markers and indentation
# were lost in extraction. Two alternative bodies follow; the first looks
# like the pre-commit code and the second like its replacement -- confirm
# against the repository before treating either as current.

# Body 1 (presumably removed): call libcudf's cpp_word_minhash on the
# input and seeds views with the GIL released; no width is involved.
cdef column_view c_input = input.view()
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_word_minhash(
c_input,
c_seeds
)
)

return Column.from_unique_ptr(move(c_result))
# Body 2 (presumably added): delegates to nvtext.minhash.minhash.
# NOTE(review): this calls the same pylibcudf function as `minhash` above,
# with a literal width of 4, whereas body 1 called cpp_word_minhash -- verify
# that this preserves the word-based behaviour callers of word_minhash expect.
result = nvtext.minhash.minhash(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
4,
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def word_minhash64(Column input, Column seeds):
# NOTE(review): this span is a diff hunk whose +/- markers and indentation
# were lost in extraction. Two alternative bodies follow; the first looks
# like the pre-commit code and the second like its replacement -- confirm
# against the repository before treating either as current.

# Body 1 (presumably removed): call libcudf's cpp_word_minhash64 on the
# input and seeds views with the GIL released; no width is involved.
cdef column_view c_input = input.view()
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_word_minhash64(
c_input,
c_seeds
)
)

return Column.from_unique_ptr(move(c_result))
# Body 2 (presumably added): delegates to nvtext.minhash.minhash64.
# NOTE(review): this calls the same pylibcudf function as `minhash64` above,
# with a literal width of 4, whereas body 1 called cpp_word_minhash64 --
# verify that this preserves the word-based behaviour callers expect.
result = nvtext.minhash.minhash64(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
4,
)
return Column.from_pylibcudf(result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
from pylibcudf.libcudf.column.column cimport column, column_view
from pylibcudf.libcudf.table.table cimport table, table_view
from pylibcudf.libcudf.utilities.host_span cimport host_span
from pylibcudf.libcudf.utilities.span cimport host_span

from rmm._lib.device_buffer cimport device_buffer

Expand Down
1 change: 0 additions & 1 deletion python/pylibcudf/pylibcudf/libcudf/groupby.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport (
size_type,
sorted,
)
from pylibcudf.libcudf.utilities.host_span cimport host_span

# workaround for https://github.com/cython/cython/issues/3885
ctypedef const scalar constscalar
Expand Down
21 changes: 13 additions & 8 deletions python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
Original file line number Diff line number Diff line change
@@ -1,31 +1,36 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.libcudf.utilities.span cimport device_span


cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
# Cython declarations for functions in the `nvtext` namespace of
# "nvtext/minhash.hpp". Each returns an owning unique_ptr[column] and is
# declared `except +`.
# NOTE(review): this is a diff hunk whose +/- markers were lost, so replaced
# and replacement lines are interleaved below (e.g. a `seeds` column-view
# parameter next to a `seed` numeric_scalar parameter, and word_minhash /
# word_minhash64 headers followed directly by minhash64 headers). Confirm the
# final overload set against the repository.

cdef unique_ptr[column] minhash(
const column_view &strings,
const column_view &seeds,
const numeric_scalar[uint32_t] seed,
const size_type width,
) except +

cdef unique_ptr[column] minhash64(
cdef unique_ptr[column] minhash(
const column_view &strings,
const column_view &seeds,
const size_type width,
) except +

cdef unique_ptr[column] word_minhash(
const column_view &input,
const column_view &seeds
cdef unique_ptr[column] minhash64(
const column_view &strings,
const column_view &seeds,
const size_type width,
) except +

cdef unique_ptr[column] word_minhash64(
const column_view &input,
const column_view &seeds
cdef unique_ptr[column] minhash64(
const column_view &strings,
const numeric_scalar[uint64_t] seed,
const size_type width,
) except +
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.vector cimport vector

Expand All @@ -7,3 +7,6 @@ cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil:
cdef cppclass host_span[T]:
# Templated span class declared from "cudf/utilities/span.hpp"
# (namespace "cudf"), per the enclosing extern block.
# Default constructor.
host_span() except +
# Constructor taking a vector[T].
host_span(vector[T]) except +
cdef cppclass device_span[T]:
# Templated span class declared from "cudf/utilities/span.hpp"
# (namespace "cudf"), per the enclosing extern block.
# Default constructor.
# NOTE(review): unlike the other constructors declared in this block it
# carries no `except +` -- confirm that is intentional.
device_span()
# Constructor taking another device_span.
device_span(device_span other) except +
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance, generate_ngrams, jaccard
from . cimport edit_distance, generate_ngrams, jaccard, minhash

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash"
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance, generate_ngrams, jaccard
from . import edit_distance, generate_ngrams, jaccard, minhash

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash",
]
14 changes: 14 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/minhash.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar

ctypedef fused ColumnOrScalar:
Column
Scalar

cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)

cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
115 changes: 115 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/minhash.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.nvtext.minhash cimport (
minhash as cpp_minhash,
minhash64 as cpp_minhash64,
)
from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar

from cython.operator import dereference


cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
"""
Returns the minhash values for each string per seed.
This function uses MurmurHash3_x86_32 for the hash algorithm.
For details, see :cpp:func:`cudf::nvtext::minhash`.
Parameters
----------
input : Column
Strings column to compute minhash
seeds : Column or Scalar
Seed value(s) used for the hash algorithm.
width : size_type
Character width used for apply substrings;
Default is 4 characters.
Returns
-------
Column
List column of minhash values for each string per seed
"""
cdef unique_ptr[column] c_result
cdef numeric_scalar[uint32_t]* cpp_seed

if ColumnOrScalar is Column:
with nogil:
c_result = move(
cpp_minhash(
input.view(),
seeds.view(),
width
)
)
elif ColumnOrScalar is Scalar:
cpp_seed = <numeric_scalar[uint32_t]*>seeds.c_obj.get()
with nogil:
c_result = move(
cpp_minhash(
input.view(),
dereference(cpp_seed),
width
)
)
else:
raise ValueError("seeds must be a Column or Scalar")

return Column.from_libcudf(move(c_result))

cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
"""
Returns the minhash values for each string per seed.
This function uses MurmurHash3_x64_128 for the hash algorithm.
For details, see :cpp:func:`cudf::nvtext::minhash64`.
Parameters
----------
input : Column
Strings column to compute minhash
seeds : Column or Scalar
Seed value(s) used for the hash algorithm.
width : size_type
Character width used for apply substrings;
Default is 4 characters.
Returns
-------
Column
List column of minhash values for each string per seed
"""
cdef unique_ptr[column] c_result
cdef numeric_scalar[uint64_t]* cpp_seed

if ColumnOrScalar is Column:
with nogil:
c_result = move(
cpp_minhash64(
input.view(),
seeds.view(),
width
)
)
elif ColumnOrScalar is Scalar:
cpp_seed = <numeric_scalar[uint64_t]*>seeds.c_obj.get()
with nogil:
c_result = move(
cpp_minhash64(
input.view(),
dereference(cpp_seed),
width
)
)
else:
raise ValueError("seeds must be a Column or Scalar")

return Column.from_libcudf(move(c_result))
25 changes: 25 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest


@pytest.fixture(scope="module")
def input_data():
    """Return the (strings, uint32 seeds) pyarrow arrays used by the tests."""
    return (
        pa.array(["foo", "bar", "foo foo", "bar bar"]),
        pa.array([2, 3, 4, 5], pa.uint32()),
    )


@pytest.mark.parametrize("width", [5, 12])
def test_minhash(input_data, width):
    """Check the row count, per-row length and type of ``minhash`` output.

    Parameters
    ----------
    input_data : tuple
        ``(input_arr, seeds)`` pyarrow arrays from the ``input_data`` fixture.
    width : int
        Character width forwarded to ``plc.nvtext.minhash.minhash``.
    """
    input_arr, seeds = input_data
    result = plc.nvtext.minhash.minhash(
        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
    )
    pa_result = plc.interop.to_arrow(result)
    # Assert the row count explicitly: zipping the result with the input
    # would silently truncate and hide a mismatch.
    assert len(pa_result) == len(input_arr)
    # Every row must hold one hash value per seed.
    assert all(len(got) == len(seeds) for got in pa_result)
    assert pa_result.type == pa.list_(
        pa.field("element", pa.uint32(), nullable=False)
    )

0 comments on commit 5e5cd03

Please sign in to comment.