diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e59b948aba9..fbf0ed860a6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,6 +14,8 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby @@ -25,6 +27,7 @@ from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import GatherMap @@ -770,9 +773,22 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): join_keys = map(list, zip(*join_keys)) # By construction, left and right keys are related by # a permutation, so we can use an inner join. - left_order, right_order = libcudf.join.join( - *join_keys, how="inner" - ) + with acquire_spill_lock(): + plc_tables = [ + plc.Table( + [col.to_pylibcudf(mode="read") for col in cols] + ) + for cols in join_keys + ] + left_plc, right_plc = plc.join.inner_join( + plc_tables[0], + plc_tables[1], + plc.types.NullEquality.EQUAL, + ) + left_order = libcudf.column.Column.from_pylibcudf(left_plc) + right_order = libcudf.column.Column.from_pylibcudf( + right_plc + ) # left order is some permutation of the ordering we # want, and right order is a matching gather map for # the result table. Get the correct order by sorting diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1b90e9f9df0..0b433dbd353 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -16,6 +16,8 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf._lib.filling import sequence @@ -32,6 +34,7 @@ ) from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -1360,7 +1363,14 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): except ValueError: return _return_get_indexer_result(result.values) - scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") + with acquire_spill_lock(): + left_plc, right_plc = plc.join.inner_join( + plc.Table([lcol.to_pylibcudf(mode="read")]), + plc.Table([rcol.to_pylibcudf(mode="read")]), + plc.types.NullEquality.EQUAL, + ) + scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) + indices = libcudf.column.Column.from_pylibcudf(right_plc) result = libcudf.copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bfff62f0a89..19a53af018d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -14,6 +14,8 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf import cudf._lib as libcudf from cudf._lib.types import size_type_dtype @@ -22,6 +24,7 @@ from cudf.core import column from cudf.core._base_index import _return_get_indexer_result from cudf.core.algorithms import factorize +from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( @@ -1919,10 +1922,18 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) - scatter_map, indices = libcudf.join.join( - *join_keys, - how="inner", - ) + with acquire_spill_lock(): + plc_tables = [ + plc.Table([col.to_pylibcudf(mode="read") for col in cols]) + for cols in join_keys + ] + left_plc, right_plc = plc.join.inner_join( + plc_tables[0], + plc_tables[1], + plc.types.NullEquality.EQUAL, + ) + scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) + indices = libcudf.column.Column.from_pylibcudf(right_plc) result = libcudf.copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result)