diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8cec8af3c67..427ffcc8c12 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx sort.pyx - stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx +set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx stream_compaction.pyx + string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 001e5cbb676..26afdd62caf 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ copying, groupby, interop, - sort, stream_compaction, string_casting, strings_udf, diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx deleted file mode 100644 index eefe37d9880..00000000000 --- a/python/cudf/cudf/_lib/sort.pyx +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from itertools import repeat - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -@acquire_spill_lock() -def is_sorted( - list source_columns, object ascending=None, object null_position=None -): - """ - Checks whether the rows of a `table` are sorted in lexicographical order. - - Parameters - ---------- - source_columns : list of columns - columns to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order of - each column. 
If list-like, size of list-like must be len(columns). If - None, all columns expected sort order is set to ascending. False (0) - - descending, True (1) - ascending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of nulls - compared to other elements. If list-like, size of list-like must be - len(columns). If None, null order is set to before. False (0) - after, - True (1) - before. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. - """ - - if ascending is None: - column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns) - else: - if len(ascending) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(ascending)} for `ascending`" - ) - column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns) - for idx, val in enumerate(ascending): - if val: - column_order[idx] = pylibcudf.types.Order.ASCENDING - - if null_position is None: - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - else: - if len(null_position) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(null_position)} for `null_position`" - ) - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - for idx, val in enumerate(null_position): - if val: - null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE - - return pylibcudf.sorting.is_sorted( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - column_order, - null_precedence - ) - - -def ordering(column_order, null_precedence): - """ - Construct order and null order vectors - - Parameters - ---------- - column_order - Iterable of bool (True for ascending order, False for descending) - null_precedence - Iterable string for null positions ("first" for start, "last" for 
end) - - Both iterables must be the same length (not checked) - - Returns - ------- - pair of vectors (order, and null_order) - """ - c_column_order = [] - c_null_precedence = [] - for asc, null in zip(column_order, null_precedence): - c_column_order.append( - pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING - ) - if asc ^ (null == "first"): - c_null_precedence.append(pylibcudf.types.NullOrder.AFTER) - elif asc ^ (null == "last"): - c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE) - else: - raise ValueError(f"Invalid null precedence {null}") - return c_column_order, c_null_precedence - - -@acquire_spill_lock() -def order_by( - list columns_from_table, - object ascending, - str na_position, - *, - bool stable -): - """ - Get index to sort the table in ascending/descending order. - - Parameters - ---------- - columns_from_table : list[Column] - Columns from the table which will be sorted - ascending : sequence[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : str - Whether null values should show up at the "first" or "last" - position of **all** sorted column. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - Column of indices that sorts the table - """ - order = ordering(ascending, repeat(na_position)) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order") - - return Column.from_pylibcudf( - func( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in columns_from_table], - ), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort( - list values, - list column_order=None, - list null_precedence=None, -): - """ - Sort the table in ascending/descending order. 
- - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - return columns_from_pylibcudf_table( - pylibcudf.sorting.sort( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort_by_key( - list values, - list keys, - object ascending, - object na_position, - *, - bool stable, -): - """ - Sort a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - ascending : list[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : list[str] - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? 
(no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - order = ordering(ascending, na_position) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key") - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def segmented_sort_by_key( - list values, - list keys, - Column segment_offsets, - list column_order=None, - list null_precedence=None, - *, - bool stable, -): - """ - Sort segments of a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - offsets : Column - Segment offsets - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? 
(no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - func = getattr( - pylibcudf.sorting, - f"{'stable_' if stable else ''}segmented_sort_by_key" - ) - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - segment_offsets.to_pylibcudf(mode="read"), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def digitize(list source_columns, list bins, bool right=False): - """ - Return the indices of the bins to which each value in source_table belongs. - - Parameters - ---------- - source_columns : Input columns to be binned. - bins : List containing columns of bins - right : Indicating whether the intervals include the - right or the left bin edge. - """ - return Column.from_pylibcudf( - getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in bins] - ), - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - [pylibcudf.types.Order.ASCENDING]*len(bins), - [pylibcudf.types.NullOrder.BEFORE]*len(bins) - ) - ) - - -@acquire_spill_lock() -def rank_columns(list source_columns, rank_method method, str na_option, - bool ascending, bool pct - ): - """ - Compute numerical data ranks (1 through n) of each column in the dataframe - """ - column_order = ( - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ) - # ascending - # #top = na_is_smallest - # #bottom = na_is_largest - # #keep = na_is_largest - # descending - # #top = na_is_largest - # #bottom = na_is_smallest - # #keep = na_is_smallest - if ascending: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.BEFORE - else: - null_precedence = pylibcudf.types.NullOrder.AFTER - else: 
- if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - null_precedence = pylibcudf.types.NullOrder.BEFORE - c_null_handling = ( - pylibcudf.types.NullPolicy.EXCLUDE - if na_option == 'keep' - else pylibcudf.types.NullPolicy.INCLUDE - ) - - return [ - Column.from_pylibcudf( - pylibcudf.sorting.rank( - col.to_pylibcudf(mode="read"), - method, - column_order, - c_null_handling, - null_precedence, - pct, - ) - ) - for col in source_columns - ] diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py new file mode 100644 index 00000000000..69f9e7664b1 --- /dev/null +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -0,0 +1,205 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +import itertools +from typing import TYPE_CHECKING, Literal + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from collections.abc import Iterable + + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def is_sorted( + source_columns: list[ColumnBase], + ascending: list[bool] | None = None, + null_position: list[bool] | None = None, +) -> bool: + """ + Checks whether the rows of a `table` are sorted in lexicographical order. + + Parameters + ---------- + source_columns : list of columns + columns to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order of + each column. If list-like, size of list-like must be len(columns). If + None, all columns expected sort order is set to ascending. False (0) - + descending, True (1) - ascending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of nulls + compared to other elements. If list-like, size of list-like must be + len(columns). If None, null order is set to after. 
False (0) - after, + True (1) - before. + + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. + """ + if ascending is None: + column_order = [plc.types.Order.ASCENDING] * len(source_columns) + else: + if len(ascending) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(ascending)} for `ascending`" + ) + column_order = [ + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + for asc in ascending + ] + + if null_position is None: + null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) + else: + if len(null_position) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(null_position)} for `null_position`" + ) + null_precedence = [ + plc.types.NullOrder.BEFORE if null else plc.types.NullOrder.AFTER + for null in null_position + ] + + return plc.sorting.is_sorted( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + column_order, + null_precedence, + ) + + +def ordering( + column_order: list[bool], + null_precedence: Iterable[Literal["first", "last"]], +) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: + """ + Construct order and null order vectors + + Parameters + ---------- + column_order + Iterable of bool (True for ascending order, False for descending) + null_precedence + Iterable string for null positions ("first" for start, "last" for end) + + Both iterables must be the same length (not checked) + + Returns + ------- + pair of vectors (order, and null_order) + """ + c_column_order = [] + c_null_precedence = [] + for asc, null in zip(column_order, null_precedence): + c_column_order.append( + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + ) + if asc ^ (null == "first"): + c_null_precedence.append(plc.types.NullOrder.AFTER) + elif asc ^ (null == "last"): + 
c_null_precedence.append(plc.types.NullOrder.BEFORE) + else: + raise ValueError(f"Invalid null precedence {null}") + return c_column_order, c_null_precedence + + +@acquire_spill_lock() +def order_by( + columns_from_table: list[ColumnBase], + ascending: list[bool], + na_position: Literal["first", "last"], + *, + stable: bool, +): + """ + Get index to sort the table in ascending/descending order. + + Parameters + ---------- + columns_from_table : list[Column] + Columns from the table which will be sorted + ascending : sequence[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : str + Whether null values should show up at the "first" or "last" + position of **all** sorted columns. + stable : bool + Should the sort be stable? (no default) + + Returns + ------- + Column of indices that sorts the table + """ + order = ordering(ascending, itertools.repeat(na_position)) + func = ( + plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order + ) + return Column.from_pylibcudf( + func( + plc.Table( + [col.to_pylibcudf(mode="read") for col in columns_from_table], + ), + order[0], + order[1], + ) + ) + + +@acquire_spill_lock() +def sort_by_key( + values: list[ColumnBase], + keys: list[ColumnBase], + ascending: list[bool], + na_position: list[Literal["first", "last"]], + *, + stable: bool, +) -> list[ColumnBase]: + """ + Sort a table by given keys + + Parameters + ---------- + values : list[Column] + Columns of the table which will be sorted + keys : list[Column] + Columns making up the sort key + ascending : list[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : list[str] + Sequence of "first" or "last" values (no default) + indicating the position of null values when sorting the keys. 
+ stable : bool + Should the sort be stable? (no default) + + Returns + ------- + list[Column] + list of value columns sorted by keys + """ + order = ordering(ascending, na_position) + func = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) + return [ + Column.from_pylibcudf(col) + for col in func( + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + plc.Table([col.to_pylibcudf(mode="read") for col in keys]), + order[0], + order[1], + ).columns() + ] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 624a3ac95ed..cc07af0f669 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -42,7 +42,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import aggregation, unary +from cudf.core._internals import aggregation, sorting, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -996,13 +996,13 @@ def is_unique(self) -> bool: @cached_property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [True], None ) @cached_property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [False], None ) @@ -1026,15 +1026,20 @@ def contains(self, other: ColumnBase) -> ColumnBase: def sort_values( self: Self, ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ) -> Self: if (not ascending and self.is_monotonic_decreasing) or ( ascending and self.is_monotonic_increasing ): return self.copy() - return libcudf.sort.sort( - [self], column_order=[ascending], null_precedence=[na_position] - )[0] + 
order = sorting.ordering([ascending], [na_position]) + with acquire_spill_lock(): + plc_table = plc.sorting.sort( + plc.Table([self.to_pylibcudf(mode="read")]), + order[0], + order[1], + ) + return type(self).from_pylibcudf(plc_table.columns()[0]) # type: ignore[return-value] def distinct_count(self, dropna: bool = True) -> int: try: @@ -1204,7 +1209,7 @@ def argsort( as_column(range(len(self) - 1, -1, -1)), ) else: - return libcudf.sort.order_by( + return sorting.order_by( [self], [ascending], na_position, stable=True ) @@ -1511,7 +1516,7 @@ def _return_sentinel_column(): del right_rows # reorder `codes` so that its values correspond to the # values of `self`: - (codes,) = libcudf.sort.sort_by_key( + (codes,) = sorting.sort_by_key( codes, [left_gather_map], [True], ["last"], stable=True ) return codes.fillna(na_sentinel.value) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 28a2bd7fa6c..f099cef3331 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -718,6 +718,40 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: return super()._reduction_result_dtype(reduction_op) + @acquire_spill_lock() + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: + """Return the indices of the bins to which each value in column belongs. + + Parameters + ---------- + bins : np.ndarray + 1-D column-like object of bins with same type as `column`, should be + monotonically increasing. + right : bool + Indicates whether interval contains the right or left bin edge. + + Returns + ------- + A column containing the indices + """ + if self.dtype != bins.dtype: + raise ValueError( + "digitize() expects bins and input column have the same dtype." 
+ ) + + bin_col = as_column(bins, dtype=bins.dtype) + if bin_col.nullable: + raise ValueError("`bins` cannot contain null entries.") + + return type(self).from_pylibcudf( # type: ignore[return-value] + getattr(plc.search, "lower_bound" if right else "upper_bound")( + plc.Table([bin_col.to_pylibcudf(mode="read")]), + plc.Table([self.to_pylibcudf(mode="read")]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.BEFORE], + ) + ) + def _normalize_find_and_replace_input( input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list @@ -772,34 +806,3 @@ def _normalize_find_and_replace_input( if not normalized_column.can_cast_safely(input_column_dtype): return normalized_column return normalized_column.astype(input_column_dtype) - - -def digitize( - column: ColumnBase, bins: np.ndarray, right: bool = False -) -> ColumnBase: - """Return the indices of the bins to which each value in column belongs. - - Parameters - ---------- - column : Column - Input column. - bins : Column-like - 1-D column-like object of bins with same type as `column`, should be - monotonically increasing. - right : bool - Indicates whether interval contains the right or left bin edge. - - Returns - ------- - A column containing the indices - """ - if not column.dtype == bins.dtype: - raise ValueError( - "Digitize() expects bins and input column have the same dtype." 
- ) - - bin_col = as_column(bins, dtype=bins.dtype) - if bin_col.nullable: - raise ValueError("`bins` cannot contain null entries.") - - return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 7a39355dd50..aaf2239a71e 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -10,7 +10,7 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf +from cudf.core._internals import sorting from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.missing import NA @@ -144,7 +144,7 @@ def quantile( ) else: # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( + indices = sorting.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) with acquire_spill_lock(): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 00199cca828..4f40ba0bd92 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -22,6 +22,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core._internals.search import search_sorted from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock @@ -1476,7 +1477,7 @@ def _get_sorted_inds( else: ascending_lst = list(ascending) - return libcudf.sort.order_by( + return sorting.order_by( list(to_sort), ascending_lst, na_position, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a8d82f977d5..b772d35846d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -18,11 +18,11 @@ import cudf from cudf import _lib as libcudf from cudf._lib import groupby 
as libgroupby -from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -792,7 +792,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): # want, and right order is a matching gather map for # the result table. Get the correct order by sorting # the right gather map. - (right_order,) = libcudf.sort.sort_by_key( + (right_order,) = sorting.sort_by_key( [right_order], [left_order], [True], @@ -1248,15 +1248,20 @@ def sample( for off, size in zip(group_offsets, size_per_group): rs.shuffle(indices[off : off + size]) else: - rng = cp.random.default_rng(seed=random_state) - (indices,) = segmented_sort_by_key( - [as_column(indices)], - [as_column(rng.random(size=nrows))], - as_column(group_offsets), - [], - [], - stable=True, + keys = cp.random.default_rng(seed=random_state).random( + size=nrows ) + with acquire_spill_lock(): + plc_table = plc.sorting.stable_segmented_sort_by_key( + plc.Table( + [as_column(indices).to_pylibcudf(mode="read")] + ), + plc.Table([as_column(keys).to_pylibcudf(mode="read")]), + as_column(group_offsets).to_pylibcudf(mode="read"), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ) + indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? 
want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 81d954960e2..1a667e24bef 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6367,9 +6367,49 @@ def rank( elif source._num_columns != num_cols: dropped_cols = True - result_columns = libcudf.sort.rank_columns( - [*source._columns], method_enum, na_option, ascending, pct + column_order = ( + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING ) + # ascending + # #top = na_is_smallest + # #bottom = na_is_largest + # #keep = na_is_largest + # descending + # #top = na_is_largest + # #bottom = na_is_smallest + # #keep = na_is_smallest + if ascending: + if na_option == "top": + null_precedence = plc.types.NullOrder.BEFORE + else: + null_precedence = plc.types.NullOrder.AFTER + else: + if na_option == "top": + null_precedence = plc.types.NullOrder.AFTER + else: + null_precedence = plc.types.NullOrder.BEFORE + c_null_handling = ( + plc.types.NullPolicy.EXCLUDE + if na_option == "keep" + else plc.types.NullPolicy.INCLUDE + ) + + with acquire_spill_lock(): + result_columns = [ + libcudf.column.Column.from_pylibcudf( + plc.sorting.rank( + col.to_pylibcudf(mode="read"), + method_enum, + column_order, + c_null_handling, + null_precedence, + pct, + ) + ) + for col in source._columns + ] if dropped_cols: result = type(source)._from_data( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 5c224176730..e7ea91c1f21 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -9,6 +9,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.types import size_type_dtype +from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( @@ -256,7 +257,7 @@ def 
_gather_maps(self, left_cols, right_cols): for map_, n, null in zip(maps, lengths, nullify) ) ) - return libcudf.sort.sort_by_key( + return sorting.sort_by_key( list(maps), # If how is right, right map is primary sort key. key_order[:: -1 if self.how == "right" else 1], @@ -426,7 +427,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: else: to_sort = [*result._columns] index_names = None - result_columns = libcudf.sort.sort_by_key( + result_columns = sorting.sort_by_key( to_sort, by, [True] * len(by), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f5ee36f851c..a99e06e4a8e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -22,6 +22,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core._internals import sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -1677,7 +1678,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" ) - return libcudf.sort.is_sorted( + return sorting.is_sorted( [*self._columns], ascending=ascending, null_position=null_position ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 647e20fc16b..961e5e11bc0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3410,7 +3410,7 @@ def describe( ) @_performance_tracking - def digitize(self, bins, right=False): + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: """Return the indices of the bins to which each value belongs. 
Notes @@ -3441,9 +3441,8 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series._from_column( - cudf.core.column.numerical.digitize(self._column, bins, right), - name=self.name, + return type(self)._from_column( + self._column.digitize(bins, right), name=self.name ) @_performance_tracking