From c3399eeda35edd6b6cf64be4643258743ee30189 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:45:32 -0800 Subject: [PATCH] Remove cudf._lib.hash in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/column.pyi | 9 +++++ python/cudf/cudf/_lib/hash.pyx | 47 -------------------------- python/cudf/cudf/core/dataframe.py | 23 ++++++++++--- python/cudf/cudf/core/indexed_frame.py | 47 ++++++++++++++++++++++---- 6 files changed, 67 insertions(+), 61 deletions(-) delete mode 100644 python/cudf/cudf/_lib/hash.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 41a7db2285a..458ba5bfce4 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -22,7 +22,6 @@ set(cython_sources datetime.pyx filling.pyx groupby.pyx - hash.pyx interop.pyx join.pyx json.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 57df6899a22..0707d93975d 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -9,7 +9,6 @@ datetime, filling, groupby, - hash, interop, join, json, diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bb38488eefb..bdd90be45b8 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,12 @@ from __future__ import annotations +from typing import Literal + from typing_extensions import Self +import pylibcudf as plc + from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -71,3 +75,8 @@ class Column: # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... + @staticmethod + def from_pylibcudf( + col: plc.Column, data_ptr_exposed: bool = False + ) -> ColumnBase: ... + def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ... diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx deleted file mode 100644 index 89309b36371..00000000000 --- a/python/cudf/cudf/_lib/hash.pyx +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.table cimport Table - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def hash_partition(list source_columns, list columns_to_hash, - int num_partitions): - plc_table, offsets = plc.partitioning.hash_partition( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), - columns_to_hash, - num_partitions - ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets - - -@acquire_spill_lock() -def hash(list source_columns, str method, int seed=0): - cdef Table ctbl = Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ) - if method == "murmur3": - return Column.from_pylibcudf(plc.hashing.murmurhash3_x86_32(ctbl, seed)) - elif method == "xxhash64": - return Column.from_pylibcudf(plc.hashing.xxhash_64(ctbl, seed)) - elif method == "md5": - return Column.from_pylibcudf(plc.hashing.md5(ctbl)) - elif method == "sha1": - return Column.from_pylibcudf(plc.hashing.sha1(ctbl)) - elif method == "sha224": - return Column.from_pylibcudf(plc.hashing.sha224(ctbl)) - elif method == "sha256": - return Column.from_pylibcudf(plc.hashing.sha256(ctbl)) - elif method == "sha384": - return Column.from_pylibcudf(plc.hashing.sha384(ctbl)) - elif method == "sha512": - return Column.from_pylibcudf(plc.hashing.sha512(ctbl)) - else: - raise ValueError( - f"Unsupported hashing algorithm {method}." - ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bf1c39b23da..9be5aabb4e2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -26,6 +26,8 @@ from pandas.io.formats.printing import pprint_thing from typing_extensions import Self, assert_never +import pylibcudf as plc + import cudf import cudf.core.common from cudf import _lib as libcudf @@ -43,6 +45,7 @@ from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -4962,7 +4965,9 @@ def apply_chunks( ) @_performance_tracking - def partition_by_hash(self, columns, nparts, keep_index=True): + def partition_by_hash( + self, columns, nparts: int, keep_index: bool = True + ) -> list[DataFrame]: """Partition the dataframe by the hashed value of data in *columns*. Parameters @@ -4986,13 +4991,21 @@ def partition_by_hash(self, columns, nparts, keep_index=True): else: cols = [*self._columns] - output_columns, offsets = libcudf.hash.hash_partition( - cols, key_indices, nparts - ) + with acquire_spill_lock(): + plc_table, offsets = plc.partitioning.hash_partition( + plc.Table([col.to_pylibcudf(mode="read") for col in cols]), + key_indices, + nparts, + ) + output_columns = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + outdf = self._from_columns_like_self( output_columns, self._column_names, - self._index_names if keep_index else None, + self._index_names if keep_index else None, # type: ignore[arg-type] ) # Slice into partitions. Notice, `hash_partition` returns the start # offset of each partition thus we skip the first offset diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e031f2a4e8e..9130779c3e9 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -21,7 +21,7 @@ import pandas as pd from typing_extensions import Self -import pylibcudf +import pylibcudf as plc import cudf import cudf._lib as libcudf @@ -2817,7 +2817,20 @@ def memory_usage(self, index=True, deep=False): """ raise NotImplementedError - def hash_values(self, method="murmur3", seed=None): + def hash_values( + self, + method: Literal[ + "murmur3", + "xxhash64", + "md5", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + ] = "murmur3", + seed: int | None = None, + ) -> cudf.Series: """Compute the hash of values in this column. Parameters @@ -2894,11 +2907,31 @@ def hash_values(self, method="murmur3", seed=None): "Provided seed value has no effect for the hash method " f"`{method}`. Only {seed_hash_methods} support seeds." ) - # Note that both Series and DataFrame return Series objects from this - # calculation, necessitating the unfortunate circular reference to the - # child class here. + with acquire_spill_lock(): + plc_table = plc.Table( + [c.to_pylibcudf(mode="read") for c in self._columns] + ) + if method == "murmur3": + plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash64": + plc_column = plc.hashing.xxhash_64(plc_table, seed) + elif method == "md5": + plc_column = plc.hashing.md5(plc_table) + elif method == "sha1": + plc_column = plc.hashing.sha1(plc_table) + elif method == "sha224": + plc_column = plc.hashing.sha224(plc_table) + elif method == "sha256": + plc_column = plc.hashing.sha256(plc_table) + elif method == "sha384": + plc_column = plc.hashing.sha384(plc_table) + elif method == "sha512": + plc_column = plc.hashing.sha512(plc_table) + else: + raise ValueError(f"Unsupported hashing algorithm {method}.") + result = libcudf.column.Column.from_pylibcudf(plc_column) return cudf.Series._from_column( - libcudf.hash.hash([*self._columns], method, seed), + result, index=self.index, ) @@ -6270,7 +6303,7 @@ def rank( if method not in {"average", "min", "max", "first", "dense"}: raise KeyError(method) - method_enum = pylibcudf.aggregation.RankMethod[method.upper()] + method_enum = plc.aggregation.RankMethod[method.upper()] if na_option not in {"keep", "top", "bottom"}: raise ValueError( "na_option must be one of 'keep', 'top', or 'bottom'"