Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cudf._lib.hash in favor of inlining pylibcudf #17345

Merged
merged 1 commit into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ set(cython_sources
datetime.pyx
filling.pyx
groupby.pyx
hash.pyx
interop.pyx
join.pyx
json.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
datetime,
filling,
groupby,
hash,
interop,
join,
json,
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/column.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

from __future__ import annotations

from typing import Literal

from typing_extensions import Self

import pylibcudf as plc

from cudf._typing import Dtype, DtypeObj, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
Expand Down Expand Up @@ -71,3 +75,8 @@ class Column:
# TODO: The val parameter should be Scalar, not ScalarLike
@staticmethod
def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
@staticmethod
def from_pylibcudf(
col: plc.Column, data_ptr_exposed: bool = False
) -> ColumnBase: ...
def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ...
47 changes: 0 additions & 47 deletions python/cudf/cudf/_lib/hash.pyx

This file was deleted.

23 changes: 18 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from pandas.io.formats.printing import pprint_thing
from typing_extensions import Self, assert_never

import pylibcudf as plc

import cudf
import cudf.core.common
from cudf import _lib as libcudf
Expand All @@ -43,6 +45,7 @@
from cudf.core import column, df_protocol, indexing_utils, reshape
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import (
CategoricalColumn,
ColumnBase,
Expand Down Expand Up @@ -4962,7 +4965,9 @@ def apply_chunks(
)

@_performance_tracking
def partition_by_hash(self, columns, nparts, keep_index=True):
def partition_by_hash(
self, columns, nparts: int, keep_index: bool = True
) -> list[DataFrame]:
"""Partition the dataframe by the hashed value of data in *columns*.

Parameters
Expand All @@ -4986,13 +4991,21 @@ def partition_by_hash(self, columns, nparts, keep_index=True):
else:
cols = [*self._columns]

output_columns, offsets = libcudf.hash.hash_partition(
cols, key_indices, nparts
)
with acquire_spill_lock():
plc_table, offsets = plc.partitioning.hash_partition(
plc.Table([col.to_pylibcudf(mode="read") for col in cols]),
key_indices,
nparts,
)
output_columns = [
libcudf.column.Column.from_pylibcudf(col)
for col in plc_table.columns()
]

outdf = self._from_columns_like_self(
output_columns,
self._column_names,
self._index_names if keep_index else None,
self._index_names if keep_index else None, # type: ignore[arg-type]
)
# Slice into partitions. Notice, `hash_partition` returns the start
# offset of each partition thus we skip the first offset
Expand Down
47 changes: 40 additions & 7 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import pandas as pd
from typing_extensions import Self

import pylibcudf
import pylibcudf as plc

import cudf
import cudf._lib as libcudf
Expand Down Expand Up @@ -2817,7 +2817,20 @@ def memory_usage(self, index=True, deep=False):
"""
raise NotImplementedError

def hash_values(self, method="murmur3", seed=None):
def hash_values(
self,
method: Literal[
"murmur3",
"xxhash64",
"md5",
"sha1",
"sha224",
"sha256",
"sha384",
"sha512",
] = "murmur3",
seed: int | None = None,
) -> cudf.Series:
"""Compute the hash of values in this column.

Parameters
Expand Down Expand Up @@ -2894,11 +2907,31 @@ def hash_values(self, method="murmur3", seed=None):
"Provided seed value has no effect for the hash method "
f"`{method}`. Only {seed_hash_methods} support seeds."
)
# Note that both Series and DataFrame return Series objects from this
# calculation, necessitating the unfortunate circular reference to the
# child class here.
with acquire_spill_lock():
plc_table = plc.Table(
[c.to_pylibcudf(mode="read") for c in self._columns]
)
if method == "murmur3":
plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed)
elif method == "xxhash64":
plc_column = plc.hashing.xxhash_64(plc_table, seed)
elif method == "md5":
plc_column = plc.hashing.md5(plc_table)
elif method == "sha1":
plc_column = plc.hashing.sha1(plc_table)
elif method == "sha224":
plc_column = plc.hashing.sha224(plc_table)
elif method == "sha256":
plc_column = plc.hashing.sha256(plc_table)
elif method == "sha384":
plc_column = plc.hashing.sha384(plc_table)
elif method == "sha512":
plc_column = plc.hashing.sha512(plc_table)
else:
raise ValueError(f"Unsupported hashing algorithm {method}.")
Comment on lines +2930 to +2931
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Could use assert_never(method), although in that case you don't get as nice an error message

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah didn't know about assert_never, but I think it would be good to have an informative error message here

result = libcudf.column.Column.from_pylibcudf(plc_column)
return cudf.Series._from_column(
libcudf.hash.hash([*self._columns], method, seed),
result,
index=self.index,
)

Expand Down Expand Up @@ -6270,7 +6303,7 @@ def rank(
if method not in {"average", "min", "max", "first", "dense"}:
raise KeyError(method)

method_enum = pylibcudf.aggregation.RankMethod[method.upper()]
method_enum = plc.aggregation.RankMethod[method.upper()]
if na_option not in {"keep", "top", "bottom"}:
raise ValueError(
"na_option must be one of 'keep', 'top', or 'bottom'"
Expand Down
Loading