-
Notifications
You must be signed in to change notification settings - Fork 912
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Migrate Min Hashing APIs to pylibcudf
- Loading branch information
Showing
11 changed files
with
203 additions
and
87 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,36 @@ | ||
# Copyright (c) 2023-2024, NVIDIA CORPORATION. | ||
|
||
from libc.stdint cimport uint32_t, uint64_t | ||
from libcpp.memory cimport unique_ptr | ||
from pylibcudf.libcudf.column.column cimport column | ||
from pylibcudf.libcudf.column.column_view cimport column_view | ||
from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar | ||
from pylibcudf.libcudf.types cimport size_type | ||
from pylibcudf.libcudf.utilities.span cimport device_span | ||
|
||
|
||
cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: | ||
|
||
cdef unique_ptr[column] minhash( | ||
const column_view &strings, | ||
const column_view &seeds, | ||
const numeric_scalar[uint32_t] seed, | ||
const size_type width, | ||
) except + | ||
|
||
cdef unique_ptr[column] minhash64( | ||
cdef unique_ptr[column] minhash( | ||
const column_view &strings, | ||
const column_view &seeds, | ||
const size_type width, | ||
) except + | ||
|
||
cdef unique_ptr[column] word_minhash( | ||
const column_view &input, | ||
const column_view &seeds | ||
cdef unique_ptr[column] minhash64( | ||
const column_view &strings, | ||
const column_view &seeds, | ||
const size_type width, | ||
) except + | ||
|
||
cdef unique_ptr[column] word_minhash64( | ||
const column_view &input, | ||
const column_view &seeds | ||
cdef unique_ptr[column] minhash64( | ||
const column_view &strings, | ||
const numeric_scalar[uint64_t] seed, | ||
const size_type width, | ||
) except + |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,10 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . cimport edit_distance, generate_ngrams, jaccard | ||
from . cimport edit_distance, generate_ngrams, jaccard, minhash | ||
|
||
__all__ = [ | ||
"edit_distance", | ||
"generate_ngrams", | ||
"jaccard", | ||
"minhash" | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,10 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from . import edit_distance, generate_ngrams, jaccard | ||
from . import edit_distance, generate_ngrams, jaccard, minhash | ||
|
||
__all__ = [ | ||
"edit_distance", | ||
"generate_ngrams", | ||
"jaccard", | ||
"minhash", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from libc.stdint cimport uint32_t, uint64_t | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.types cimport size_type | ||
from pylibcudf.scalar cimport Scalar | ||
|
||
ctypedef fused ColumnOrScalar: | ||
Column | ||
Scalar | ||
|
||
cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) | ||
|
||
cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
from libc.stdint cimport uint32_t, uint64_t | ||
from libcpp.memory cimport unique_ptr | ||
from libcpp.utility cimport move | ||
from pylibcudf.column cimport Column | ||
from pylibcudf.libcudf.column.column cimport column | ||
from pylibcudf.libcudf.nvtext.minhash cimport ( | ||
minhash as cpp_minhash, | ||
minhash64 as cpp_minhash64, | ||
) | ||
from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar | ||
from pylibcudf.libcudf.types cimport size_type | ||
from pylibcudf.scalar cimport Scalar | ||
|
||
from cython.operator import dereference | ||
|
||
|
||
cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
    """
    Compute the minhash values of each string for every seed.

    The hash algorithm used is MurmurHash3_x86_32.

    For details, see :cpp:func:`nvtext::minhash`.

    Parameters
    ----------
    input : Column
        Strings column to compute minhash
    seeds : Column or Scalar
        Seed value(s) used for the hash algorithm.
    width : size_type
        Character width used for the substrings;
        Default is 4 characters.

    Returns
    -------
    Column
        List column of minhash values for each string per seed
    """
    cdef unique_ptr[column] c_output
    cdef numeric_scalar[uint32_t]* c_seed

    if ColumnOrScalar is Column:
        # Column of seeds: forwarded to libcudf as a column_view.
        with nogil:
            c_output = move(
                cpp_minhash(input.view(), seeds.view(), width)
            )
    elif ColumnOrScalar is Scalar:
        # The scalar's underlying libcudf object is reinterpreted as a
        # uint32 numeric scalar; no dtype validation happens in this block.
        # NOTE(review): presumably callers must pass a UINT32 scalar - confirm.
        c_seed = <numeric_scalar[uint32_t]*>seeds.c_obj.get()
        with nogil:
            c_output = move(
                cpp_minhash(input.view(), dereference(c_seed), width)
            )
    else:
        # NOTE(review): with a fused type this branch looks unreachable;
        # kept as a defensive guard.
        raise ValueError("seeds must be a Column or Scalar")

    return Column.from_libcudf(move(c_output))
|
||
cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
    """
    Compute the 64-bit minhash values of each string for every seed.

    The hash algorithm used is MurmurHash3_x64_128.

    For details, see :cpp:func:`nvtext::minhash64`.

    Parameters
    ----------
    input : Column
        Strings column to compute minhash
    seeds : Column or Scalar
        Seed value(s) used for the hash algorithm.
    width : size_type
        Character width used for the substrings;
        Default is 4 characters.

    Returns
    -------
    Column
        List column of minhash values for each string per seed
    """
    cdef unique_ptr[column] c_output
    cdef numeric_scalar[uint64_t]* c_seed

    if ColumnOrScalar is Column:
        # Column of seeds: forwarded to libcudf as a column_view.
        with nogil:
            c_output = move(
                cpp_minhash64(input.view(), seeds.view(), width)
            )
    elif ColumnOrScalar is Scalar:
        # The scalar's underlying libcudf object is reinterpreted as a
        # uint64 numeric scalar; no dtype validation happens in this block.
        # NOTE(review): presumably callers must pass a UINT64 scalar - confirm.
        c_seed = <numeric_scalar[uint64_t]*>seeds.c_obj.get()
        with nogil:
            c_output = move(
                cpp_minhash64(input.view(), dereference(c_seed), width)
            )
    else:
        # NOTE(review): with a fused type this branch looks unreachable;
        # kept as a defensive guard.
        raise ValueError("seeds must be a Column or Scalar")

    return Column.from_libcudf(move(c_output))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
|
||
import pyarrow as pa | ||
import pylibcudf as plc | ||
import pytest | ||
|
||
|
||
@pytest.fixture(scope="module")
def input_data():
    """Provide the strings array and the uint32 seeds array for the tests."""
    return (
        pa.array(["foo", "bar", "foo foo", "bar bar"]),
        pa.array([2, 3, 4, 5], pa.uint32()),
    )
|
||
|
||
@pytest.mark.parametrize("width", [5, 12])
def test_minhash(input_data, width):
    """Check the row count, per-row length and type of ``minhash`` output.

    The result is expected to hold one list row per input string, each
    row containing one uint32 hash value per seed.
    """
    input_arr, seeds = input_data
    result = plc.nvtext.minhash.minhash(
        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
    )
    pa_result = plc.interop.to_arrow(result)
    # Compare the row count explicitly: zipping the result with the input
    # would silently truncate and hide a missing or extra output row.
    assert len(pa_result) == len(input_arr)
    # Each row holds one hash value per seed.
    assert all(len(got) == len(seeds) for got in pa_result)
    assert pa_result.type == pa.list_(
        pa.field("element", pa.uint32(), nullable=False)
    )