diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 59cb8d51440..c2886709402 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -2,93 +2,44 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.minhash cimport (
-    minhash as cpp_minhash,
-    minhash64 as cpp_minhash64,
-    word_minhash as cpp_word_minhash,
-    word_minhash64 as cpp_word_minhash64,
-)
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 @acquire_spill_lock()
 def minhash(Column strings, Column seeds, int width):
-
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_width = width
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_minhash(
-                c_strings,
-                c_seeds,
-                c_width
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.minhash(
+        strings.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+        width,
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def minhash64(Column strings, Column seeds, int width):
-
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_width = width
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_minhash64(
-                c_strings,
-                c_seeds,
-                c_width
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.minhash64(
+        strings.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+        width,
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def word_minhash(Column input, Column seeds):
-
-    cdef column_view c_input = input.view()
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_word_minhash(
-                c_input,
-                c_seeds
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.word_minhash(
+        input.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def word_minhash64(Column input, Column seeds):
-
-    cdef column_view c_input = input.view()
-    cdef column_view c_seeds = seeds.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_word_minhash64(
-                c_input,
-                c_seeds
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.minhash.word_minhash64(
+        input.to_pylibcudf(mode="read"),
+        seeds.to_pylibcudf(mode="read"),
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
index 92f5a185a54..81c889e3b3d 100644
--- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
@@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 from pylibcudf.libcudf.column.column cimport column, column_view
 from pylibcudf.libcudf.table.table cimport table, table_view
-from pylibcudf.libcudf.utilities.host_span cimport host_span
+from pylibcudf.libcudf.utilities.span cimport host_span
 
 from rmm._lib.device_buffer cimport device_buffer
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
index 848462131fe..17ea33a2066 100644
--- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
@@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport (
     size_type,
     sorted,
 )
-from pylibcudf.libcudf.utilities.host_span cimport host_span
 
 # workaround for https://github.com/cython/cython/issues/3885
 ctypedef const scalar constscalar
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index f2dd22f43aa..b4d4733e962 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -1,31 +1,46 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.utilities.span cimport device_span
 
 
 cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] minhash(
         const column_view &strings,
-        const column_view &seeds,
+        const numeric_scalar[uint32_t] seed,
         const size_type width,
     ) except +
 
-    cdef unique_ptr[column] minhash64(
+    cdef unique_ptr[column] minhash(
         const column_view &strings,
         const column_view &seeds,
         const size_type width,
     ) except +
 
+    cdef unique_ptr[column] minhash64(
+        const column_view &strings,
+        const column_view &seeds,
+        const size_type width,
+    ) except +
+
+    cdef unique_ptr[column] minhash64(
+        const column_view &strings,
+        const numeric_scalar[uint64_t] seed,
+        const size_type width,
+    ) except +
+
     cdef unique_ptr[column] word_minhash(
         const column_view &input,
         const column_view &seeds
     ) except +
 
     cdef unique_ptr[column] word_minhash64(
         const column_view &input,
         const column_view &seeds
     ) except +
diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
similarity index 57%
rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
index 7e591e96373..36876972a92 100644
--- a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.vector cimport vector
 
@@ -7,3 +7,6 @@ cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil:
     cdef cppclass host_span[T]:
         host_span() except +
         host_span(vector[T]) except +
+    cdef cppclass device_span[T]:
+        device_span()
+        device_span(device_span other) except +
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index 9913e1fbadb..7fd65beeeb0 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 5f1762b1e3d..9eed1da1ab5 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance, generate_ngrams, jaccard
+from . cimport edit_distance, generate_ngrams, jaccard, minhash
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
+    "minhash",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index 1c0ddb1e5a4..a3a2363f7ef 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance, generate_ngrams, jaccard
+from . import edit_distance, generate_ngrams, jaccard, minhash
 
 __all__ = [
     "edit_distance",
     "generate_ngrams",
     "jaccard",
+    "minhash",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
new file mode 100644
index 00000000000..be0912d2d47
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t, uint64_t
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
+
+cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
+
+cpdef Column word_minhash(Column input, Column seeds)
+
+cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
new file mode 100644
index 00000000000..3434ef3d7da
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -0,0 +1,183 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.minhash cimport (
+    minhash as cpp_minhash,
+    minhash64 as cpp_minhash64,
+    word_minhash as cpp_word_minhash,
+    word_minhash64 as cpp_word_minhash64,
+)
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+
+
+cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
+    """
+    Returns the minhash values for each string per seed.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`nvtext::minhash`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seeds : Column or Scalar
+        Seed value(s) used for the hash algorithm.
+    width : size_type
+        Character width used for apply substrings;
+        Default is 4 characters.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+    cdef numeric_scalar[uint32_t]* cpp_seed
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            c_result = move(
+                cpp_minhash(
+                    input.view(),
+                    seeds.view(),
+                    width
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        # Explicit downcast to the typed scalar the libcudf overload takes.
+        # NOTE: assumes a uint32 scalar; the dtype is not validated here.
+        cpp_seed = <numeric_scalar[uint32_t]*>seeds.c_obj.get()
+        with nogil:
+            c_result = move(
+                cpp_minhash(
+                    input.view(),
+                    dereference(cpp_seed),
+                    width
+                )
+            )
+    else:
+        raise ValueError("seeds must be a Column or Scalar")
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
+    """
+    Returns the minhash values for each string per seed.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`nvtext::minhash64`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seeds : Column or Scalar
+        Seed value(s) used for the hash algorithm.
+    width : size_type
+        Character width used for apply substrings;
+        Default is 4 characters.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+    cdef numeric_scalar[uint64_t]* cpp_seed
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            c_result = move(
+                cpp_minhash64(
+                    input.view(),
+                    seeds.view(),
+                    width
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        # Explicit downcast to the typed scalar the libcudf overload takes.
+        # NOTE: assumes a uint64 scalar; the dtype is not validated here.
+        cpp_seed = <numeric_scalar[uint64_t]*>seeds.c_obj.get()
+        with nogil:
+            c_result = move(
+                cpp_minhash64(
+                    input.view(),
+                    dereference(cpp_seed),
+                    width
+                )
+            )
+    else:
+        raise ValueError("seeds must be a Column or Scalar")
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column word_minhash(Column input, Column seeds):
+    """
+    Returns the minhash values for each row of strings per seed.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`nvtext::word_minhash`.
+
+    Parameters
+    ----------
+    input : Column
+        Lists column of strings to compute minhash
+    seeds : Column
+        Seed values used for the hash algorithm.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each row per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash(
+                input.view(),
+                seeds.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column word_minhash64(Column input, Column seeds):
+    """
+    Returns the minhash values for each row of strings per seed.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`nvtext::word_minhash64`.
+
+    Parameters
+    ----------
+    input : Column
+        Lists column of strings to compute minhash
+    seeds : Column
+        Seed values used for the hash algorithm.
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each row per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash64(
+                input.view(),
+                seeds.view()
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
new file mode 100644
index 00000000000..f1bfe70ac05
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+
+
+@pytest.fixture(scope="module")
+def input_data():
+    input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"])
+    seeds = pa.array([2, 3, 4, 5], pa.uint32())
+    return input_arr, seeds
+
+
+@pytest.mark.parametrize("width", [5, 12])
+def test_minhash(input_data, width):
+    input_arr, seeds = input_data
+    result = plc.nvtext.minhash.minhash(
+        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
+    )
+    pa_result = plc.interop.to_arrow(result)
+    assert len(pa_result) == len(input_arr)
+    assert all(len(got) == len(seeds) for got in pa_result)
+    assert pa_result.type == pa.list_(
+        pa.field("element", pa.uint32(), nullable=False)
+    )
+
+
+def test_word_minhash():
+    input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]])
+    seeds = pa.array([2, 3, 4, 5], pa.uint32())
+    result = plc.nvtext.minhash.word_minhash(
+        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds)
+    )
+    pa_result = plc.interop.to_arrow(result)
+    assert len(pa_result) == len(input_arr)
+    assert all(len(got) == len(seeds) for got in pa_result)
+    assert pa_result.type == pa.list_(
+        pa.field("element", pa.uint32(), nullable=False)
+    )