Skip to content

Commit

Permalink
Migrate Min Hashing APIs to pylibcudf (#17021)
Browse files Browse the repository at this point in the history
Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17021
  • Loading branch information
Matt711 authored Oct 11, 2024
1 parent fea87cb commit c8a56a5
Show file tree
Hide file tree
Showing 13 changed files with 285 additions and 80 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ nvtext
edit_distance
generate_ngrams
jaccard
minhash
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
minhash
=======

.. automodule:: pylibcudf.nvtext.minhash
   :members:
101 changes: 26 additions & 75 deletions python/cudf/cudf/_lib/nvtext/minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,93 +2,44 @@

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.nvtext.minhash cimport (
minhash as cpp_minhash,
minhash64 as cpp_minhash64,
word_minhash as cpp_word_minhash,
word_minhash64 as cpp_word_minhash64,
)
from pylibcudf.libcudf.types cimport size_type

from cudf._lib.column cimport Column


@acquire_spill_lock()
def minhash(Column strings, Column seeds, int width):

cdef column_view c_strings = strings.view()
cdef size_type c_width = width
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_minhash(
c_strings,
c_seeds,
c_width
)
)

return Column.from_unique_ptr(move(c_result))
from pylibcudf import nvtext


@acquire_spill_lock()
def minhash64(Column strings, Column seeds, int width):

cdef column_view c_strings = strings.view()
cdef size_type c_width = width
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result
def minhash(Column input, Column seeds, int width=4):
result = nvtext.minhash.minhash(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
width,
)
return Column.from_pylibcudf(result)

with nogil:
c_result = move(
cpp_minhash64(
c_strings,
c_seeds,
c_width
)
)

return Column.from_unique_ptr(move(c_result))
@acquire_spill_lock()
def minhash64(Column input, Column seeds, int width=4):
result = nvtext.minhash.minhash64(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
width,
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def word_minhash(Column input, Column seeds):

cdef column_view c_input = input.view()
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_word_minhash(
c_input,
c_seeds
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.minhash.word_minhash(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(result)


@acquire_spill_lock()
def word_minhash64(Column input, Column seeds):

cdef column_view c_input = input.view()
cdef column_view c_seeds = seeds.view()
cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_word_minhash64(
c_input,
c_seeds
)
)

return Column.from_unique_ptr(move(c_result))
result = nvtext.minhash.word_minhash64(
input.to_pylibcudf(mode="read"),
seeds.to_pylibcudf(mode="read"),
)
return Column.from_pylibcudf(result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
from pylibcudf.libcudf.column.column cimport column, column_view
from pylibcudf.libcudf.table.table cimport table, table_view
from pylibcudf.libcudf.utilities.host_span cimport host_span
from pylibcudf.libcudf.utilities.span cimport host_span

from rmm.librmm.device_buffer cimport device_buffer

Expand Down
1 change: 0 additions & 1 deletion python/pylibcudf/pylibcudf/libcudf/groupby.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport (
size_type,
sorted,
)
from pylibcudf.libcudf.utilities.host_span cimport host_span

# workaround for https://github.com/cython/cython/issues/3885
ctypedef const scalar constscalar
Expand Down
14 changes: 14 additions & 0 deletions python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from pylibcudf.libcudf.types cimport size_type


cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:

cdef unique_ptr[column] minhash(
const column_view &strings,
const numeric_scalar[uint32_t] seed,
const size_type width,
) except +

cdef unique_ptr[column] minhash(
const column_view &strings,
const column_view &seeds,
Expand All @@ -20,6 +28,12 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
const size_type width,
) except +

cdef unique_ptr[column] minhash64(
const column_view &strings,
const numeric_scalar[uint64_t] seed,
const size_type width,
) except +

cdef unique_ptr[column] word_minhash(
const column_view &input,
const column_view &seeds
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport edit_distance, generate_ngrams, jaccard
from . cimport edit_distance, generate_ngrams, jaccard, minhash

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash"
]
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/nvtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import edit_distance, generate_ngrams, jaccard
from . import edit_distance, generate_ngrams, jaccard, minhash

__all__ = [
"edit_distance",
"generate_ngrams",
"jaccard",
"minhash",
]
18 changes: 18 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/minhash.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar

# Fused type that lets the ``seeds`` argument of ``minhash`` and
# ``minhash64`` be either a Column or a Scalar.
ctypedef fused ColumnOrScalar:
    Column
    Scalar

# ``width=*`` declares an optional argument; the default value itself is
# given in the matching definition in minhash.pyx.
cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)

cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)

# The word_minhash variants take a Column of seeds only (no Scalar form).
cpdef Column word_minhash(Column input, Column seeds)

cpdef Column word_minhash64(Column input, Column seeds)
160 changes: 160 additions & 0 deletions python/pylibcudf/pylibcudf/nvtext/minhash.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t, uint64_t
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.nvtext.minhash cimport (
minhash as cpp_minhash,
minhash64 as cpp_minhash64,
word_minhash as cpp_word_minhash,
word_minhash64 as cpp_word_minhash64,
)
from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar

from cython.operator import dereference


cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
    """
    Returns the minhash values for each string per seed.

    This function uses MurmurHash3_x86_32 for the hash algorithm.

    For details, see :cpp:func:`minhash`.

    Parameters
    ----------
    input : Column
        Strings column to compute minhash
    seeds : Column or Scalar
        Seed value(s) used for the hash algorithm.
    width : size_type
        Character width of the substrings used for hashing;
        Default is 4 characters.

    Returns
    -------
    Column
        List column of minhash values for each string per seed

    Raises
    ------
    TypeError
        If ``seeds`` is neither a Column nor a Scalar.
    """
    cdef unique_ptr[column] c_result

    if not isinstance(seeds, (Column, Scalar)):
        raise TypeError("Must pass a Column or Scalar")

    # ``ColumnOrScalar is Column`` picks the libcudf overload: either the
    # column view of the seeds or a single numeric_scalar seed.
    # NOTE(review): the Scalar branch casts the underlying scalar to
    # numeric_scalar[uint32_t] without checking its dtype here -- confirm
    # that callers only pass uint32 scalars.
    with nogil:
        c_result = move(
            cpp_minhash(
                input.view(),
                seeds.view() if ColumnOrScalar is Column else
                dereference(<numeric_scalar[uint32_t]*>seeds.c_obj.get()),
                width
            )
        )

    return Column.from_libcudf(move(c_result))

cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
    """
    Returns the minhash values for each string per seed.

    This function uses MurmurHash3_x64_128 for the hash algorithm.

    For details, see :cpp:func:`minhash64`.

    Parameters
    ----------
    input : Column
        Strings column to compute minhash
    seeds : Column or Scalar
        Seed value(s) used for the hash algorithm.
    width : size_type
        Character width of the substrings used for hashing;
        Default is 4 characters.

    Returns
    -------
    Column
        List column of minhash values for each string per seed

    Raises
    ------
    TypeError
        If ``seeds`` is neither a Column nor a Scalar.
    """
    cdef unique_ptr[column] c_result

    if not isinstance(seeds, (Column, Scalar)):
        raise TypeError("Must pass a Column or Scalar")

    # ``ColumnOrScalar is Column`` picks the libcudf overload: either the
    # column view of the seeds or a single numeric_scalar seed.
    # NOTE(review): the Scalar branch casts the underlying scalar to
    # numeric_scalar[uint64_t] without checking its dtype here -- confirm
    # that callers only pass uint64 scalars.
    with nogil:
        c_result = move(
            cpp_minhash64(
                input.view(),
                seeds.view() if ColumnOrScalar is Column else
                dereference(<numeric_scalar[uint64_t]*>seeds.c_obj.get()),
                width
            )
        )

    return Column.from_libcudf(move(c_result))

cpdef Column word_minhash(Column input, Column seeds):
    """
    Returns the minhash values for each row of strings per seed.

    This function uses MurmurHash3_x86_32 for the hash algorithm.

    For details, see :cpp:func:`word_minhash`.

    Parameters
    ----------
    input : Column
        Lists column of strings to compute minhash
    seeds : Column
        Seed values used for the hash algorithm.

    Returns
    -------
    Column
        List column of minhash values for each string per seed
    """
    cdef unique_ptr[column] c_result

    # The libcudf call runs with the GIL released.
    with nogil:
        c_result = move(
            cpp_word_minhash(
                input.view(),
                seeds.view()
            )
        )

    return Column.from_libcudf(move(c_result))

cpdef Column word_minhash64(Column input, Column seeds):
    """
    Returns the minhash values for each row of strings per seed.

    This function uses MurmurHash3_x64_128 for the hash algorithm though
    only the first 64-bits of the hash are used in computing the output.

    For details, see :cpp:func:`word_minhash64`.

    Parameters
    ----------
    input : Column
        Lists column of strings to compute minhash
    seeds : Column
        Seed values used for the hash algorithm.

    Returns
    -------
    Column
        List column of minhash values for each string per seed
    """
    cdef unique_ptr[column] c_result

    # The libcudf call runs with the GIL released.
    with nogil:
        c_result = move(
            cpp_word_minhash64(
                input.view(),
                seeds.view()
            )
        )

    return Column.from_libcudf(move(c_result))
Loading

0 comments on commit c8a56a5

Please sign in to comment.