Skip to content

Commit

Permalink
Remove cudf._lib.hash in favor of inlining pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Nov 15, 2024
1 parent 8664fad commit c3399ee
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 61 deletions.
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ set(cython_sources
datetime.pyx
filling.pyx
groupby.pyx
hash.pyx
interop.pyx
join.pyx
json.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
datetime,
filling,
groupby,
hash,
interop,
join,
json,
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/column.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

from __future__ import annotations

from typing import Literal

from typing_extensions import Self

import pylibcudf as plc

from cudf._typing import Dtype, DtypeObj, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase
Expand Down Expand Up @@ -71,3 +75,8 @@ class Column:
# TODO: The val parameter should be Scalar, not ScalarLike
@staticmethod
def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
@staticmethod
def from_pylibcudf(
col: plc.Column, data_ptr_exposed: bool = False
) -> ColumnBase: ...
def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ...
47 changes: 0 additions & 47 deletions python/cudf/cudf/_lib/hash.pyx

This file was deleted.

23 changes: 18 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from pandas.io.formats.printing import pprint_thing
from typing_extensions import Self, assert_never

import pylibcudf as plc

import cudf
import cudf.core.common
from cudf import _lib as libcudf
Expand All @@ -43,6 +45,7 @@
from cudf.core import column, df_protocol, indexing_utils, reshape
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import (
CategoricalColumn,
ColumnBase,
Expand Down Expand Up @@ -4962,7 +4965,9 @@ def apply_chunks(
)

@_performance_tracking
def partition_by_hash(self, columns, nparts, keep_index=True):
def partition_by_hash(
self, columns, nparts: int, keep_index: bool = True
) -> list[DataFrame]:
"""Partition the dataframe by the hashed value of data in *columns*.
Parameters
Expand All @@ -4986,13 +4991,21 @@ def partition_by_hash(self, columns, nparts, keep_index=True):
else:
cols = [*self._columns]

output_columns, offsets = libcudf.hash.hash_partition(
cols, key_indices, nparts
)
with acquire_spill_lock():
plc_table, offsets = plc.partitioning.hash_partition(
plc.Table([col.to_pylibcudf(mode="read") for col in cols]),
key_indices,
nparts,
)
output_columns = [
libcudf.column.Column.from_pylibcudf(col)
for col in plc_table.columns()
]

outdf = self._from_columns_like_self(
output_columns,
self._column_names,
self._index_names if keep_index else None,
self._index_names if keep_index else None, # type: ignore[arg-type]
)
# Slice into partitions. Notice, `hash_partition` returns the start
# offset of each partition thus we skip the first offset
Expand Down
47 changes: 40 additions & 7 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import pandas as pd
from typing_extensions import Self

import pylibcudf
import pylibcudf as plc

import cudf
import cudf._lib as libcudf
Expand Down Expand Up @@ -2817,7 +2817,20 @@ def memory_usage(self, index=True, deep=False):
"""
raise NotImplementedError

def hash_values(self, method="murmur3", seed=None):
def hash_values(
self,
method: Literal[
"murmur3",
"xxhash64",
"md5",
"sha1",
"sha224",
"sha256",
"sha384",
"sha512",
] = "murmur3",
seed: int | None = None,
) -> cudf.Series:
"""Compute the hash of values in this column.
Parameters
Expand Down Expand Up @@ -2894,11 +2907,31 @@ def hash_values(self, method="murmur3", seed=None):
"Provided seed value has no effect for the hash method "
f"`{method}`. Only {seed_hash_methods} support seeds."
)
# Note that both Series and DataFrame return Series objects from this
# calculation, necessitating the unfortunate circular reference to the
# child class here.
with acquire_spill_lock():
plc_table = plc.Table(
[c.to_pylibcudf(mode="read") for c in self._columns]
)
if method == "murmur3":
plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed)
elif method == "xxhash64":
plc_column = plc.hashing.xxhash_64(plc_table, seed)
elif method == "md5":
plc_column = plc.hashing.md5(plc_table)
elif method == "sha1":
plc_column = plc.hashing.sha1(plc_table)
elif method == "sha224":
plc_column = plc.hashing.sha224(plc_table)
elif method == "sha256":
plc_column = plc.hashing.sha256(plc_table)
elif method == "sha384":
plc_column = plc.hashing.sha384(plc_table)
elif method == "sha512":
plc_column = plc.hashing.sha512(plc_table)
else:
raise ValueError(f"Unsupported hashing algorithm {method}.")
result = libcudf.column.Column.from_pylibcudf(plc_column)
return cudf.Series._from_column(
libcudf.hash.hash([*self._columns], method, seed),
result,
index=self.index,
)

Expand Down Expand Up @@ -6270,7 +6303,7 @@ def rank(
if method not in {"average", "min", "max", "first", "dense"}:
raise KeyError(method)

method_enum = pylibcudf.aggregation.RankMethod[method.upper()]
method_enum = plc.aggregation.RankMethod[method.upper()]
if na_option not in {"keep", "top", "bottom"}:
raise ValueError(
"na_option must be one of 'keep', 'top', or 'bottom'"
Expand Down

0 comments on commit c3399ee

Please sign in to comment.