From 30011c58ed2444f0a6ba9f80c17766e591a610a1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 16 Aug 2024 07:19:54 -1000 Subject: [PATCH] Clean up reshaping ops (#16553) Uses some more "idiomatic" cudf patterns such as * Checking `isinstance(column.dtype, ...)` instead of `isinstance(column, ...)` (to avoid importing the column objects) * Using `DataFrame._from_data(dict)` instead of creating an empty `DataFrame` and adding columns one by one Also avoids some column materialization in `DataFrame.columns = `: * For `RangeIndex`, avoid materializing to a column to get a distinct count * For `MultiIndex`, avoid creating a `cudf.MultiIndex` with columns as it's converted to a CPU object to get column labels for the `ColumnAccessor` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16553 --- python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/core/reshape.py | 141 ++++++++++++++++------------- 2 files changed, 82 insertions(+), 67 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3d805881c5a..6ee3d69441f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2654,8 +2654,12 @@ def columns(self, columns): elif isinstance(columns, (cudf.BaseIndex, ColumnBase, Series)): level_names = (getattr(columns, "name", None),) rangeindex = isinstance(columns, cudf.RangeIndex) - columns = as_column(columns) - if columns.distinct_count(dropna=False) != len(columns): + if rangeindex: + unique_count = len(columns) + else: + columns = as_column(columns) + unique_count = columns.distinct_count(dropna=False) + if unique_count != len(columns): raise ValueError("Duplicate column names are not allowed") pd_columns = pd.Index(columns.to_pandas()) label_dtype = pd_columns.dtype diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 703a239bea2..3d205957126 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -3,7 +3,7 @@ import itertools import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -14,7 +14,7 @@ from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like -from cudf.core.column.categorical import CategoricalColumn +from cudf.core.column_accessor import ColumnAccessor from cudf.utils.dtypes import min_unsigned_type if TYPE_CHECKING: @@ -101,7 +101,9 @@ def _get_combined_index(indexes, intersect: bool = False, sort=None): return index -def _normalize_series_and_dataframe(objs, axis): +def _normalize_series_and_dataframe( + objs: list[cudf.Series | cudf.DataFrame], axis: Literal[0, 1] +) -> None: """Convert any cudf.Series objects in objs to DataFrames in place.""" # Default to naming series by a numerical id if they are not named. sr_name = 0 @@ -335,7 +337,7 @@ def concat( result = obj.to_frame() else: result = obj.copy(deep=True) - result.columns = pd.RangeIndex(len(result._data)) + result.columns = cudf.RangeIndex(len(result._data)) else: result = type(obj)._from_data( data=obj._data.copy(deep=True), @@ -350,7 +352,7 @@ def concat( result = obj.copy(deep=True) if keys_objs is not None and isinstance(result, cudf.DataFrame): k = keys_objs[0] - result.columns = cudf.MultiIndex.from_tuples( + result.columns = pd.MultiIndex.from_tuples( [ (k, *c) if isinstance(c, tuple) else (k, c) for c in result._column_names @@ -369,7 +371,6 @@ def concat( raise TypeError( "Can only concatenate Series and DataFrame objects when axis=1" ) - df = cudf.DataFrame() _normalize_series_and_dataframe(objs, axis=axis) any_empty = any(obj.empty for obj in objs) @@ -393,18 +394,23 @@ def concat( objs = [obj for obj in objs if obj.shape != (0, 0)] if len(objs) == 0: - return df + # TODO: https://github.com/rapidsai/cudf/issues/16550 + return cudf.DataFrame() # Don't need to align indices of all `objs` since we # would anyway return an empty dataframe below if not empty_inner: objs = _align_objs(objs, how=join, sort=sort) - df.index = objs[0].index + result_index = objs[0].index + else: + result_index = None + result_data = {} + result_columns = None if keys_objs is None: for o in objs: for name, col in o._data.items(): - if name in df._data: + if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " f"doesn't support having multiple columns with " @@ -414,11 +420,11 @@ def concat( # if join is inner and it contains an empty df # we return an empty df, hence creating an empty # column with dtype metadata retained. - df[name] = cudf.core.column.column_empty_like( + result_data[name] = cudf.core.column.column_empty_like( col, newsize=0 ) else: - df[name] = col + result_data[name] = col result_columns = ( objs[0] @@ -451,21 +457,21 @@ def concat( else: col_label = (k, name) if empty_inner: - df[col_label] = cudf.core.column.column_empty_like( - col, newsize=0 + result_data[col_label] = ( + cudf.core.column.column_empty_like(col, newsize=0) ) else: - df[col_label] = col + result_data[col_label] = col - if keys_objs is None: - df.columns = result_columns.unique() - if ignore_index: - df.columns = cudf.RangeIndex(len(result_columns.unique())) - elif ignore_index: - # with ignore_index the column names change to numbers - df.columns = cudf.RangeIndex(len(result_columns)) + df = cudf.DataFrame._from_data( + ColumnAccessor(result_data, verify=False), index=result_index + ) + if ignore_index: + df.columns = cudf.RangeIndex(df._num_columns) + elif result_columns is not None: + df.columns = result_columns elif not only_series: - df.columns = cudf.MultiIndex.from_tuples(df._column_names) + df.columns = pd.MultiIndex.from_tuples(df._column_names) if empty_inner: # if join is inner and it contains an empty df @@ -486,6 +492,7 @@ def concat( if len(objs) == 0: # If objs is empty, that indicates all of # objs are empty dataframes. + # TODO: https://github.com/rapidsai/cudf/issues/16550 return cudf.DataFrame() elif len(objs) == 1: obj = objs[0] @@ -519,7 +526,7 @@ def concat( elif typ is cudf.MultiIndex: return cudf.MultiIndex._concat(objs) elif issubclass(typ, cudf.Index): - return cudf.core.index.Index._concat(objs) + return cudf.Index._concat(objs) else: raise TypeError(f"cannot concatenate object of type {typ}") @@ -632,18 +639,19 @@ def melt( value_vars = [c for c in frame._column_names if c not in unique_id] # Error for unimplemented support for datatype - dtypes = [frame[col].dtype for col in id_vars + value_vars] - if any(isinstance(typ, cudf.CategoricalDtype) for typ in dtypes): + if any( + isinstance(frame[col].dtype, cudf.CategoricalDtype) + for col in id_vars + value_vars + ): raise NotImplementedError( "Categorical columns are not yet supported for function" ) # Check dtype homogeneity in value_var # Because heterogeneous concat is unimplemented - dtypes = [frame[col].dtype for col in value_vars] - if len(dtypes) > 0: - dtype = dtypes[0] - if any(t != dtype for t in dtypes): + if len(value_vars) > 1: + dtype = frame[value_vars[0]].dtype + if any(frame[col].dtype != dtype for col in value_vars): raise ValueError("all cols in value_vars must have the same dtype") # overlap @@ -969,37 +977,39 @@ def _pivot(df, index, columns): index_labels, index_idx = index._encode() column_labels = columns_labels.to_pandas().to_flat_index() - def as_tuple(x): - return x if isinstance(x, tuple) else (x,) - result = {} - for v in df: - names = [as_tuple(v) + as_tuple(name) for name in column_labels] + if len(index_labels) != 0 and len(columns_labels) != 0: + + def as_tuple(x): + return x if isinstance(x, tuple) else (x,) + nrows = len(index_labels) - ncols = len(names) - num_elements = nrows * ncols - if num_elements > 0: - col = df._data[v] + for col_label, col in df._data.items(): + names = [ + as_tuple(col_label) + as_tuple(name) for name in column_labels + ] + new_size = nrows * len(names) scatter_map = (columns_idx * np.int32(nrows)) + index_idx - target = cudf.DataFrame._from_data( - { - None: cudf.core.column.column_empty_like( - col, masked=True, newsize=nrows * ncols - ) - } + target_col = cudf.core.column.column_empty_like( + col, masked=True, newsize=new_size ) - target._data[None][scatter_map] = col - result_frames = target._split(range(nrows, nrows * ncols, nrows)) + target_col[scatter_map] = col + target = cudf.Index._from_column(target_col) result.update( { - name: next(iter(f._columns)) - for name, f in zip(names, result_frames) + name: idx._column + for name, idx in zip( + names, target._split(range(nrows, new_size, nrows)) + ) } ) # the result of pivot always has a multicolumn - ca = cudf.core.column_accessor.ColumnAccessor( - result, multiindex=True, level_names=(None,) + columns._data.names + ca = ColumnAccessor( + result, + multiindex=True, + level_names=(None,) + columns._data.names, + verify=False, ) return cudf.DataFrame._from_data( ca, index=cudf.Index(index_labels, name=index.name) @@ -1070,19 +1080,20 @@ def pivot(data, columns=None, index=no_default, values=no_default): if index is no_default: index = df.index else: - index = cudf.core.index.Index(df.loc[:, index]) + index = cudf.Index(df.loc[:, index]) columns = cudf.Index(df.loc[:, columns]) # Create a DataFrame composed of columns from both # columns and index - columns_index = {} - columns_index = { - i: col - for i, col in enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - } - columns_index = cudf.DataFrame(columns_index) + ca = ColumnAccessor( + dict( + enumerate( + itertools.chain(index._data.columns, columns._data.columns) + ) + ), + verify=False, + ) + columns_index = cudf.DataFrame._from_data(ca) # Check that each row is unique: if len(columns_index) != len(columns_index.drop_duplicates()): @@ -1225,13 +1236,13 @@ def unstack(df, level, fill_value=None, sort: bool = True): return result -def _get_unique(column, dummy_na): +def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase: """ Returns unique values in a column, if dummy_na is False, nan's are also dropped. """ - if isinstance(column, cudf.core.column.CategoricalColumn): - unique = column.categories + if isinstance(column.dtype, cudf.CategoricalDtype): + unique = column.categories # type: ignore[attr-defined] else: unique = column.unique().sort_values() if not dummy_na: @@ -1251,11 +1262,11 @@ def _one_hot_encode_column( `prefix`, separated with category name with `prefix_sep`. The encoding columns maybe coerced into `dtype`. """ - if isinstance(column, CategoricalColumn): + if isinstance(column.dtype, cudf.CategoricalDtype): if column.size == column.null_count: column = column_empty_like(categories, newsize=column.size) else: - column = column._get_decategorized_column() + column = column._get_decategorized_column() # type: ignore[attr-defined] if column.size * categories.size >= np.iinfo(size_type_dtype).max: raise ValueError( @@ -1536,7 +1547,7 @@ def pivot_table( table_columns = tuple( map(lambda column: column[1:], table._data.names) ) - table.columns = cudf.MultiIndex.from_tuples( + table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names )