From c53f1287e08ebc7336c87750dfb0bb2570c19cc4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:11:55 -0800 Subject: [PATCH 1/5] Move cudf._lib.sort to cudf.core._internals --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/sort.pyx | 365 ------------------ python/cudf/cudf/core/_internals/sort.py | 200 ++++++++++ python/cudf/cudf/core/column/column.py | 25 +- python/cudf/cudf/core/column/numerical.py | 70 ++-- .../cudf/cudf/core/column/numerical_base.py | 3 +- python/cudf/cudf/core/frame.py | 3 +- python/cudf/cudf/core/groupby/groupby.py | 25 +- python/cudf/cudf/core/indexed_frame.py | 44 ++- python/cudf/cudf/core/join/join.py | 5 +- python/cudf/cudf/core/multiindex.py | 3 +- python/cudf/cudf/core/series.py | 7 +- 13 files changed, 320 insertions(+), 432 deletions(-) delete mode 100644 python/cudf/cudf/_lib/sort.pyx create mode 100644 python/cudf/cudf/core/_internals/sort.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index de483b3070d..6e8c1a98ab4 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -27,7 +27,6 @@ set(cython_sources reduce.pyx round.pyx scalar.pyx - sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index ee1bd13f2c4..f7112065969 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -14,7 +14,6 @@ parquet, reduce, round, - sort, stream_compaction, string_casting, strings, diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx deleted file mode 100644 index eefe37d9880..00000000000 --- a/python/cudf/cudf/_lib/sort.pyx +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from itertools import repeat - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -@acquire_spill_lock() -def is_sorted( - list source_columns, object ascending=None, object null_position=None -): - """ - Checks whether the rows of a `table` are sorted in lexicographical order. - - Parameters - ---------- - source_columns : list of columns - columns to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order of - each column. If list-like, size of list-like must be len(columns). If - None, all columns expected sort order is set to ascending. False (0) - - descending, True (1) - ascending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of nulls - compared to other elements. If list-like, size of list-like must be - len(columns). If None, null order is set to before. False (0) - after, - True (1) - before. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. 
- """ - - if ascending is None: - column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns) - else: - if len(ascending) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(ascending)} for `ascending`" - ) - column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns) - for idx, val in enumerate(ascending): - if val: - column_order[idx] = pylibcudf.types.Order.ASCENDING - - if null_position is None: - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - else: - if len(null_position) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(null_position)} for `null_position`" - ) - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - for idx, val in enumerate(null_position): - if val: - null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE - - return pylibcudf.sorting.is_sorted( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - column_order, - null_precedence - ) - - -def ordering(column_order, null_precedence): - """ - Construct order and null order vectors - - Parameters - ---------- - column_order - Iterable of bool (True for ascending order, False for descending) - null_precedence - Iterable string for null positions ("first" for start, "last" for end) - - Both iterables must be the same length (not checked) - - Returns - ------- - pair of vectors (order, and null_order) - """ - c_column_order = [] - c_null_precedence = [] - for asc, null in zip(column_order, null_precedence): - c_column_order.append( - pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING - ) - if asc ^ (null == "first"): - c_null_precedence.append(pylibcudf.types.NullOrder.AFTER) - elif asc ^ (null == "last"): - c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE) - else: - raise ValueError(f"Invalid null precedence {null}") - 
return c_column_order, c_null_precedence - - -@acquire_spill_lock() -def order_by( - list columns_from_table, - object ascending, - str na_position, - *, - bool stable -): - """ - Get index to sort the table in ascending/descending order. - - Parameters - ---------- - columns_from_table : list[Column] - Columns from the table which will be sorted - ascending : sequence[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : str - Whether null values should show up at the "first" or "last" - position of **all** sorted column. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - Column of indices that sorts the table - """ - order = ordering(ascending, repeat(na_position)) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order") - - return Column.from_pylibcudf( - func( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in columns_from_table], - ), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort( - list values, - list column_order=None, - list null_precedence=None, -): - """ - Sort the table in ascending/descending order. - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. 
- """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - return columns_from_pylibcudf_table( - pylibcudf.sorting.sort( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort_by_key( - list values, - list keys, - object ascending, - object na_position, - *, - bool stable, -): - """ - Sort a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - ascending : list[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : list[str] - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? 
(no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - order = ordering(ascending, na_position) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key") - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def segmented_sort_by_key( - list values, - list keys, - Column segment_offsets, - list column_order=None, - list null_precedence=None, - *, - bool stable, -): - """ - Sort segments of a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - offsets : Column - Segment offsets - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? 
(no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - func = getattr( - pylibcudf.sorting, - f"{'stable_' if stable else ''}segmented_sort_by_key" - ) - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - segment_offsets.to_pylibcudf(mode="read"), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def digitize(list source_columns, list bins, bool right=False): - """ - Return the indices of the bins to which each value in source_table belongs. - - Parameters - ---------- - source_columns : Input columns to be binned. - bins : List containing columns of bins - right : Indicating whether the intervals include the - right or the left bin edge. - """ - return Column.from_pylibcudf( - getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in bins] - ), - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - [pylibcudf.types.Order.ASCENDING]*len(bins), - [pylibcudf.types.NullOrder.BEFORE]*len(bins) - ) - ) - - -@acquire_spill_lock() -def rank_columns(list source_columns, rank_method method, str na_option, - bool ascending, bool pct - ): - """ - Compute numerical data ranks (1 through n) of each column in the dataframe - """ - column_order = ( - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ) - # ascending - # #top = na_is_smallest - # #bottom = na_is_largest - # #keep = na_is_largest - # descending - # #top = na_is_largest - # #bottom = na_is_smallest - # #keep = na_is_smallest - if ascending: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.BEFORE - else: - null_precedence = pylibcudf.types.NullOrder.AFTER - else: 
- if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - null_precedence = pylibcudf.types.NullOrder.BEFORE - c_null_handling = ( - pylibcudf.types.NullPolicy.EXCLUDE - if na_option == 'keep' - else pylibcudf.types.NullPolicy.INCLUDE - ) - - return [ - Column.from_pylibcudf( - pylibcudf.sorting.rank( - col.to_pylibcudf(mode="read"), - method, - column_order, - c_null_handling, - null_precedence, - pct, - ) - ) - for col in source_columns - ] diff --git a/python/cudf/cudf/core/_internals/sort.py b/python/cudf/cudf/core/_internals/sort.py new file mode 100644 index 00000000000..2e521dd7bef --- /dev/null +++ b/python/cudf/cudf/core/_internals/sort.py @@ -0,0 +1,200 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +import itertools +from typing import TYPE_CHECKING, Literal + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def is_sorted( + source_columns: list[ColumnBase], + ascending: list[bool] | None = None, + null_position: list[bool] | None = None, +) -> bool: + """ + Checks whether the rows of a `table` are sorted in lexicographical order. + + Parameters + ---------- + source_columns : list of columns + columns to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order of + each column. If list-like, size of list-like must be len(columns). If + None, all columns expected sort order is set to ascending. False (0) - + descending, True (1) - ascending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of nulls + compared to other elements. If list-like, size of list-like must be + len(columns). If None, null order is set to before. False (0) - after, + True (1) - before. 
+ + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. + """ + + if ascending is None: + column_order = [plc.types.Order.ASCENDING] * len(source_columns) + else: + if len(ascending) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(ascending)} for `ascending`" + ) + column_order = [plc.types.Order.DESCENDING] * len(source_columns) + for idx, val in enumerate(ascending): + if val: + column_order[idx] = plc.types.Order.ASCENDING + + if null_position is None: + null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) + else: + if len(null_position) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(null_position)} for `null_position`" + ) + null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) + for idx, val in enumerate(null_position): + if val: + null_precedence[idx] = plc.types.NullOrder.BEFORE + + return plc.sorting.is_sorted( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + column_order, + null_precedence, + ) + + +def ordering( + column_order: list[bool], null_precedence: list[Literal["first", "last"]] +) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: + """ + Construct order and null order vectors + + Parameters + ---------- + column_order + Iterable of bool (True for ascending order, False for descending) + null_precedence + Iterable string for null positions ("first" for start, "last" for end) + + Both iterables must be the same length (not checked) + + Returns + ------- + pair of vectors (order, and null_order) + """ + c_column_order = [] + c_null_precedence = [] + for asc, null in zip(column_order, null_precedence): + c_column_order.append( + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + ) + if asc ^ (null == "first"): + 
c_null_precedence.append(plc.types.NullOrder.AFTER) + elif asc ^ (null == "last"): + c_null_precedence.append(plc.types.NullOrder.BEFORE) + else: + raise ValueError(f"Invalid null precedence {null}") + return c_column_order, c_null_precedence + + +@acquire_spill_lock() +def order_by( + columns_from_table: list[ColumnBase], + ascending: list[bool], + na_position: Literal["first", "last"], + *, + stable: bool, +): + """ + Get index to sort the table in ascending/descending order. + + Parameters + ---------- + columns_from_table : list[Column] + Columns from the table which will be sorted + ascending : sequence[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : str + Whether null values should show up at the "first" or "last" + position of **all** sorted column. + stable : bool + Should the sort be stable? (no default) + + Returns + ------- + Column of indices that sorts the table + """ + order = ordering(ascending, list(itertools.repeat(na_position))) + func = getattr(plc.sorting, f"{'stable_' if stable else ''}sorted_order") + + return Column.from_pylibcudf( + func( + plc.Table( + [col.to_pylibcudf(mode="read") for col in columns_from_table], + ), + order[0], + order[1], + ) + ) + + +@acquire_spill_lock() +def sort_by_key( + values: list[ColumnBase], + keys: list[ColumnBase], + ascending: list[bool], + na_position: list[Literal["first", "last"]], + *, + stable: bool, +) -> list[ColumnBase]: + """ + Sort a table by given keys + + Parameters + ---------- + values : list[Column] + Columns of the table which will be sorted + keys : list[Column] + Columns making up the sort key + ascending : list[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : list[str] + Sequence of "first" or "last" values 
(default "first") + indicating the position of null values when sorting the keys. + stable : bool + Should the sort be stable? (no default) + + Returns + ------- + list[Column] + list of value columns sorted by keys + """ + order = ordering(ascending, na_position) + func = getattr(plc.sorting, f"{'stable_' if stable else ''}sort_by_key") + return [ + Column.from_pylibcudf(col) + for col in func( + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + plc.Table([col.to_pylibcudf(mode="read") for col in keys]), + order[0], + order[1], + ).columns() + ] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d1938f47d66..ae56d924a2a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -43,7 +43,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import unary +from cudf.core._internals import sort, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -984,13 +984,13 @@ def is_unique(self) -> bool: @cached_property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sort.is_sorted( [self], [True], None ) @cached_property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sort.is_sorted( [self], [False], None ) @@ -1014,15 +1014,20 @@ def contains(self, other: ColumnBase) -> ColumnBase: def sort_values( self: Self, ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ) -> Self: if (not ascending and self.is_monotonic_decreasing) or ( ascending and self.is_monotonic_increasing ): return self.copy() - return libcudf.sort.sort( - [self], column_order=[ascending], 
null_precedence=[na_position] - )[0] + order = sort.ordering([ascending], [na_position]) + with acquire_spill_lock(): + plc_table = plc.sorting.sort( + plc.Table([self.to_pylibcudf(mode="read")]), + order[0], + order[1], + ) + return type(self).from_pylibcudf(plc_table.columns()[0]) # type: ignore[return-value] def distinct_count(self, dropna: bool = True) -> int: try: @@ -1192,9 +1197,7 @@ def argsort( as_column(range(len(self) - 1, -1, -1)), ) else: - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) + return sort.order_by([self], [ascending], na_position, stable=True) def __arrow_array__(self, type=None): raise TypeError( @@ -1500,7 +1503,7 @@ def _return_sentinel_column(): del right_rows # reorder `codes` so that its values correspond to the # values of `self`: - (codes,) = libcudf.sort.sort_by_key( + (codes,) = sort.sort_by_key( codes, [left_gather_map], [True], ["last"], stable=True ) return codes.fillna(na_sentinel.value) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c8f859596b2..a87bd0c8802 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -9,12 +9,13 @@ import pandas as pd from typing_extensions import Self -import pylibcudf +import pylibcudf as plc import cudf from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import unary +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand @@ -184,7 +185,7 @@ def unary_operator(self, unaryop: str | Callable) -> ColumnBase: unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) - unaryop = pylibcudf.unary.UnaryOperator[unaryop] + unaryop = plc.unary.UnaryOperator[unaryop] return unary.unary_operation(self, unaryop) def __invert__(self): @@ -722,6 +723,40 
@@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: return super()._reduction_result_dtype(reduction_op) + @acquire_spill_lock() + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: + """Return the indices of the bins to which each value in column belongs. + + Parameters + ---------- + bins : np.ndarray + 1-D column-like object of bins with same type as `column`, should be + monotonically increasing. + right : bool + Indicates whether interval contains the right or left bin edge. + + Returns + ------- + A column containing the indices + """ + if self.dtype != bins.dtype: + raise ValueError( + "digitize() expects bins and input column have the same dtype." + ) + + bin_col = as_column(bins, dtype=bins.dtype) + if bin_col.nullable: + raise ValueError("`bins` cannot contain null entries.") + + return type(self).from_pylibcudf( # type: ignore[return-value] + getattr(plc.search, "lower_bound" if right else "upper_bound")( + plc.Table([bin_col.to_pylibcudf(mode="read")]), + plc.Table([self.to_pylibcudf(mode="read")]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.BEFORE], + ) + ) + def _normalize_find_and_replace_input( input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list @@ -776,34 +811,3 @@ def _normalize_find_and_replace_input( if not normalized_column.can_cast_safely(input_column_dtype): return normalized_column return normalized_column.astype(input_column_dtype) - - -def digitize( - column: ColumnBase, bins: np.ndarray, right: bool = False -) -> ColumnBase: - """Return the indices of the bins to which each value in column belongs. - - Parameters - ---------- - column : Column - Input column. - bins : Column-like - 1-D column-like object of bins with same type as `column`, should be - monotonically increasing. - right : bool - Indicates whether interval contains the right or left bin edge. 
- - Returns - ------- - A column containing the indices - """ - if not column.dtype == bins.dtype: - raise ValueError( - "Digitize() expects bins and input column have the same dtype." - ) - - bin_col = as_column(bins, dtype=bins.dtype) - if bin_col.nullable: - raise ValueError("`bins` cannot contain null entries.") - - return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 6d639337401..bbc8b997929 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -11,6 +11,7 @@ import cudf from cudf import _lib as libcudf +from cudf.core._internals import sort from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase from cudf.core.missing import NA @@ -144,7 +145,7 @@ def quantile( ) else: # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( + indices = sort.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) with acquire_spill_lock(): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0c0f271fe6f..058d5628266 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -23,6 +23,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sort from cudf.core._internals.search import search_sorted from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -1433,7 +1434,7 @@ def _get_sorted_inds( else: ascending_lst = list(ascending) - return libcudf.sort.order_by( + return sort.order_by( list(to_sort), ascending_lst, na_position, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e977f037b79..b2708118976 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ 
b/python/cudf/cudf/core/groupby/groupby.py @@ -19,11 +19,11 @@ import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby -from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sort from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -791,7 +791,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): # want, and right order is a matching gather map for # the result table. Get the correct order by sorting # the right gather map. - (right_order,) = libcudf.sort.sort_by_key( + (right_order,) = sort.sort_by_key( [right_order], [left_order], [True], @@ -1247,15 +1247,20 @@ def sample( for off, size in zip(group_offsets, size_per_group): rs.shuffle(indices[off : off + size]) else: - rng = cp.random.default_rng(seed=random_state) - (indices,) = segmented_sort_by_key( - [as_column(indices)], - [as_column(rng.random(size=nrows))], - as_column(group_offsets), - [], - [], - stable=True, + keys = cp.random.default_rng(seed=random_state).random( + size=nrows ) + with acquire_spill_lock(): + plc_table = plc.sorting.stable_segmented_sort_by_key( + plc.Table( + [as_column(indices).to_pylibcudf(mode="read")] + ), + plc.Table([as_column(keys).to_pylibcudf(mode="read")]), + as_column(group_offsets).to_pylibcudf(mode="read"), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ) + indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? 
want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2f8c2587937..350d2d307ca 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6362,9 +6362,49 @@ def rank( elif source._num_columns != num_cols: dropped_cols = True - result_columns = libcudf.sort.rank_columns( - [*source._columns], method_enum, na_option, ascending, pct + column_order = ( + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING ) + # ascending + # #top = na_is_smallest + # #bottom = na_is_largest + # #keep = na_is_largest + # descending + # #top = na_is_largest + # #bottom = na_is_smallest + # #keep = na_is_smallest + if ascending: + if na_option == "top": + null_precedence = plc.types.NullOrder.BEFORE + else: + null_precedence = plc.types.NullOrder.AFTER + else: + if na_option == "top": + null_precedence = plc.types.NullOrder.AFTER + else: + null_precedence = plc.types.NullOrder.BEFORE + c_null_handling = ( + plc.types.NullPolicy.EXCLUDE + if na_option == "keep" + else plc.types.NullPolicy.INCLUDE + ) + + with acquire_spill_lock(): + result_columns = [ + libcudf.column.Column.from_pylibcudf( + plc.sorting.rank( + col.to_pylibcudf(mode="read"), + method_enum, + column_order, + c_null_handling, + null_precedence, + pct, + ) + ) + for col in source._columns + ] if dropped_cols: result = type(source)._from_data( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 5c224176730..77b9f376e35 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -9,6 +9,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.types import size_type_dtype +from cudf.core._internals import sort from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( @@ -256,7 +257,7 @@ def 
_gather_maps(self, left_cols, right_cols): for map_, n, null in zip(maps, lengths, nullify) ) ) - return libcudf.sort.sort_by_key( + return sort.sort_by_key( list(maps), # If how is right, right map is primary sort key. key_order[:: -1 if self.how == "right" else 1], @@ -426,7 +427,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: else: to_sort = [*result._columns] index_names = None - result_columns = libcudf.sort.sort_by_key( + result_columns = sort.sort_by_key( to_sort, by, [True] * len(by), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 19a53af018d..5dd67fac6c1 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -23,6 +23,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core._internals import sort from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -1678,7 +1679,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" ) - return libcudf.sort.is_sorted( + return sort.is_sorted( [*self._columns], ascending=ascending, null_position=null_position ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 95ea22b5ad5..b5e023c698f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3401,7 +3401,7 @@ def describe( ) @_performance_tracking - def digitize(self, bins, right=False): + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: """Return the indices of the bins to which each value belongs. 
Notes @@ -3432,9 +3432,8 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series._from_column( - cudf.core.column.numerical.digitize(self._column, bins, right), - name=self.name, + return type(self)._from_column( + self._column.digitize(bins, right), name=self.name ) @_performance_tracking From 926e30cf5476ddd9dae92c8f8fdf60e26e8ffd5f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:08:44 -0800 Subject: [PATCH 2/5] Simplify some implementations --- .../cudf/core/_internals/{sort.py => sorting.py} | 10 ++++++---- python/cudf/cudf/core/column/column.py | 14 ++++++++------ python/cudf/cudf/core/column/numerical_base.py | 4 ++-- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/core/groupby/groupby.py | 4 ++-- python/cudf/cudf/core/join/join.py | 6 +++--- python/cudf/cudf/core/multiindex.py | 4 ++-- 7 files changed, 25 insertions(+), 21 deletions(-) rename python/cudf/cudf/core/_internals/{sort.py => sorting.py} (96%) diff --git a/python/cudf/cudf/core/_internals/sort.py b/python/cudf/cudf/core/_internals/sorting.py similarity index 96% rename from python/cudf/cudf/core/_internals/sort.py rename to python/cudf/cudf/core/_internals/sorting.py index 2e521dd7bef..63df69f23f7 100644 --- a/python/cudf/cudf/core/_internals/sort.py +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -43,7 +43,6 @@ def is_sorted( Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise.
""" - if ascending is None: column_order = [plc.types.Order.ASCENDING] * len(source_columns) else: @@ -141,8 +140,9 @@ def order_by( Column of indices that sorts the table """ order = ordering(ascending, list(itertools.repeat(na_position))) - func = getattr(plc.sorting, f"{'stable_' if stable else ''}sorted_order") - + func = ( + plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order + ) return Column.from_pylibcudf( func( plc.Table( @@ -188,7 +188,9 @@ def sort_by_key( list of value columns sorted by keys """ order = ordering(ascending, na_position) - func = getattr(plc.sorting, f"{'stable_' if stable else ''}sort_by_key") + func = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) return [ Column.from_pylibcudf(col) for col in func( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 935c42bc8c2..bb973fd27d3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -43,7 +43,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import sort, unary +from cudf.core._internals import sorting, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -994,13 +994,13 @@ def is_unique(self) -> bool: @cached_property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls(include_nan=True) and sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [True], None ) @cached_property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls(include_nan=True) and sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [False], None ) @@ -1030,7 +1030,7 @@ def sort_values( ascending and self.is_monotonic_increasing ): return self.copy() - order = sort.ordering([ascending], [na_position]) + order = 
sorting.ordering([ascending], [na_position]) with acquire_spill_lock(): plc_table = plc.sorting.sort( plc.Table([self.to_pylibcudf(mode="read")]), @@ -1207,7 +1207,9 @@ def argsort( as_column(range(len(self) - 1, -1, -1)), ) else: - return sort.order_by([self], [ascending], na_position, stable=True) + return sorting.order_by( + [self], [ascending], na_position, stable=True + ) def __arrow_array__(self, type=None): raise TypeError( @@ -1512,7 +1514,7 @@ def _return_sentinel_column(): del right_rows # reorder `codes` so that its values correspond to the # values of `self`: - (codes,) = sort.sort_by_key( + (codes,) = sorting.sort_by_key( codes, [left_gather_map], [True], ["last"], stable=True ) return codes.fillna(na_sentinel.value) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 56e22ac0d35..be6cbd2d252 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -11,7 +11,7 @@ import cudf from cudf import _lib as libcudf -from cudf.core._internals import sort +from cudf.core._internals import sorting from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.missing import NA @@ -145,7 +145,7 @@ def quantile( ) else: # get sorted indices and exclude nulls - indices = sort.order_by( + indices = sorting.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) with acquire_spill_lock(): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fca6d84c52d..ce6d7571886 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -23,7 +23,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import sort +from cudf.core._internals import sorting from cudf.core._internals.search import search_sorted from 
cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -1434,7 +1434,7 @@ def _get_sorted_inds( else: ascending_lst = list(ascending) - return sort.order_by( + return sorting.order_by( list(to_sort), ascending_lst, na_position, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 56cbbb58991..f2eb397b943 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -23,7 +23,7 @@ from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import sort +from cudf.core._internals import sorting from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -794,7 +794,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): # want, and right order is a matching gather map for # the result table. Get the correct order by sorting # the right gather map. - (right_order,) = sort.sort_by_key( + (right_order,) = sorting.sort_by_key( [right_order], [left_order], [True], diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 77b9f376e35..e7ea91c1f21 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -9,7 +9,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.types import size_type_dtype -from cudf.core._internals import sort +from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( @@ -257,7 +257,7 @@ def _gather_maps(self, left_cols, right_cols): for map_, n, null in zip(maps, lengths, nullify) ) ) - return sort.sort_by_key( + return sorting.sort_by_key( list(maps), # If how is right, right map is primary sort key. 
key_order[:: -1 if self.how == "right" else 1], @@ -427,7 +427,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: else: to_sort = [*result._columns] index_names = None - result_columns = sort.sort_by_key( + result_columns = sorting.sort_by_key( to_sort, by, [True] * len(by), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 10ca5479496..5fb7b4e12f1 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -23,7 +23,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result -from cudf.core._internals import sort +from cudf.core._internals import sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -1679,7 +1679,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" ) - return sort.is_sorted( + return sorting.is_sorted( [*self._columns], ascending=ascending, null_position=null_position ) From 1fa1611bf65ec3aa31316685b0cd098b45f83aab Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:16:15 -0800 Subject: [PATCH 3/5] Remove list call --- python/cudf/cudf/core/_internals/sorting.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py index 63df69f23f7..b5f57558a57 100644 --- a/python/cudf/cudf/core/_internals/sorting.py +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -10,6 +10,8 @@ from cudf.core.buffer import acquire_spill_lock if TYPE_CHECKING: + from collections.abc import Iterable + from cudf.core.column import ColumnBase @@ -43,6 +45,7 @@ def is_sorted( Returns True, if 
sorted as expected by ``ascending`` and ``null_position``, False otherwise. """ + breakpoint() if ascending is None: column_order = [plc.types.Order.ASCENDING] * len(source_columns) else: @@ -77,7 +80,8 @@ def is_sorted( def ordering( - column_order: list[bool], null_precedence: list[Literal["first", "last"]] + column_order: list[bool], + null_precedence: Iterable[Literal["first", "last"]], ) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: """ Construct order and null order vectors @@ -139,7 +143,7 @@ def order_by( ------- Column of indices that sorts the table """ - order = ordering(ascending, list(itertools.repeat(na_position))) + order = ordering(ascending, itertools.repeat(na_position)) func = ( plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order ) @@ -187,6 +191,7 @@ def sort_by_key( list[Column] list of value columns sorted by keys """ + breakpoint() order = ordering(ascending, na_position) func = ( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key From 18413d7d494542b9f5c65c936950a3dbb1fbd478 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 12:06:46 -0800 Subject: [PATCH 4/5] Remove breakpoints --- python/cudf/cudf/core/_internals/sorting.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py index b5f57558a57..23c95fcea85 100644 --- a/python/cudf/cudf/core/_internals/sorting.py +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -45,7 +45,6 @@ def is_sorted( Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. 
""" - breakpoint() if ascending is None: column_order = [plc.types.Order.ASCENDING] * len(source_columns) else: @@ -191,7 +190,6 @@ def sort_by_key( list[Column] list of value columns sorted by keys """ - breakpoint() order = ordering(ascending, na_position) func = ( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key From cbc9afcf13e70d754c3da65ef903b47a157fc3c7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:10:23 -0800 Subject: [PATCH 5/5] Use list comp --- python/cudf/cudf/core/_internals/sorting.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py index 23c95fcea85..69f9e7664b1 100644 --- a/python/cudf/cudf/core/_internals/sorting.py +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -53,10 +53,10 @@ def is_sorted( f"Expected a list-like of length {len(source_columns)}, " f"got length {len(ascending)} for `ascending`" ) - column_order = [plc.types.Order.DESCENDING] * len(source_columns) - for idx, val in enumerate(ascending): - if val: - column_order[idx] = plc.types.Order.ASCENDING + column_order = [ + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + for asc in ascending + ] if null_position is None: null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) @@ -66,10 +66,10 @@ def is_sorted( f"Expected a list-like of length {len(source_columns)}, " f"got length {len(null_position)} for `null_position`" ) - null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) - for idx, val in enumerate(null_position): - if val: - null_precedence[idx] = plc.types.NullOrder.BEFORE + null_precedence = [ + plc.types.NullOrder.BEFORE if null else plc.types.NullOrder.AFTER + for null in null_position + ] return plc.sorting.is_sorted( plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),