diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index de44f392eef..3eb82f767c3 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1984,14 +1984,15 @@ def drop_duplicates( # This utilizes the fact that all `Index` is also a `Frame`. # Except RangeIndex. - return self._from_columns_like_self( - drop_duplicates( - list(self._columns), - keys=range(len(self._data)), - keep=keep, - nulls_are_equal=nulls_are_equal, - ), - self._column_names, + return self._from_data( + self._data._from_columns_like_self( + drop_duplicates( + list(self._columns), + keys=range(len(self._data)), + keep=keep, + nulls_are_equal=nulls_are_equal, + ), + ) ) def duplicated(self, keep="first"): @@ -2071,13 +2072,14 @@ def dropna(self, how="any"): for col in self._columns ] - return self._from_columns_like_self( - drop_nulls( - data_columns, - how=how, - keys=range(len(data_columns)), - ), - self._column_names, + return self._from_data( + self._data._from_columns_like_self( + drop_nulls( + data_columns, + how=how, + keys=range(len(data_columns)), + ), + ) ) def _gather(self, gather_map, nullify=False, check_bounds=True): @@ -2098,9 +2100,10 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): ): raise IndexError("Gather map index is out of bounds.") - return self._from_columns_like_self( - gather(list(self._columns), gather_map, nullify=nullify), - self._column_names, + return self._from_data( + self._data._from_columns_like_self( + gather(list(self._columns), gather_map, nullify=nullify), + ) ) def take(self, indices, axis=0, allow_fill=True, fill_value=None): @@ -2147,9 +2150,10 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - return self._from_columns_like_self( - apply_boolean_mask(list(self._columns), boolean_mask), - column_names=self._column_names, + return self._from_data( + 
self._data._from_columns_like_self( + apply_boolean_mask(list(self._columns), boolean_mask), + ) ) def repeat(self, repeats, axis=None): diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 33085bede78..b99475ad8d6 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,6 +3,7 @@ from __future__ import annotations import itertools +import sys from collections import abc from functools import cached_property, reduce from typing import ( @@ -174,6 +175,31 @@ def __repr__(self) -> str: ) return f"{type_info}\n{column_info}" + def _from_columns_like_self( + self, columns: abc.Iterable[ColumnBase], verify: bool = True + ): + """ + Return a new ColumnAccessor with columns and the properties of self. + """ + if sys.version_info >= (3, 10): + data = zip(self.names, columns, strict=True) + else: + columns = list(columns) + if len(columns) != len(self.names): + raise ValueError( + f"The number of columns ({len(columns)}) must match " + f"the number of existing column labels ({len(self.names)})." + ) + data = zip(self.names, columns) + return type(self)( + data=dict(data), + multiindex=self.multiindex, + level_names=self.level_names, + rangeindex=self.rangeindex, + label_dtype=self.label_dtype, + verify=verify, + ) + @property def level_names(self) -> Tuple[Any, ...]: if self._level_names is None or len(self._level_names) == 0: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2a4f93c1716..6481a1b7c20 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3036,8 +3036,11 @@ def where(self, cond, other=None, inplace=False): # First process the condition. 
if isinstance(cond, Series): - cond = self._from_data_like_self( - {name: cond._column for name in self._column_names}, + cond = self._from_data( + self._data._from_columns_like_self( + itertools.repeat(cond._column, len(self._column_names)), + verify=False, + ) ) elif hasattr(cond, "__cuda_array_interface__"): cond = DataFrame( @@ -3078,7 +3081,7 @@ def where(self, cond, other=None, inplace=False): should be equal to number of columns of self""" ) - out = {} + out = [] for (name, col), other_col in zip(self._data.items(), other_cols): col, other_col = _check_and_cast_columns_with_other( source_col=col, @@ -3091,16 +3094,17 @@ def where(self, cond, other=None, inplace=False): col, other_col, cond_col ) - out[name] = _make_categorical_like(result, self._data[name]) + out.append(_make_categorical_like(result, self._data[name])) else: out_mask = cudf._lib.null_mask.create_null_mask( len(col), state=cudf._lib.null_mask.MaskState.ALL_NULL, ) - out[name] = col.set_mask(out_mask) + out.append(col.set_mask(out_mask)) return self._mimic_inplace( - self._from_data_like_self(out), inplace=inplace + self._from_data(self._data._from_columns_like_self(out)), + inplace=inplace, ) @docutils.doc_apply( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 809bdb4e6d1..1c1e2bb0c5c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -133,10 +133,6 @@ def _from_data(cls, data: MutableMapping) -> Self: Frame.__init__(obj, data) return obj - @_cudf_nvtx_annotate - def _from_data_like_self(self, data: MutableMapping) -> Self: - return self._from_data(data) - @_cudf_nvtx_annotate def _from_columns_like_self( self, @@ -813,16 +809,17 @@ def _quantile_table( libcudf.types.NullOrder[key] for key in null_precedence ] - return self._from_columns_like_self( - libcudf.quantiles.quantile_table( - [*self._columns], - q, - interpolation, - is_sorted, - column_order, - null_precedence, - ), - column_names=self._column_names, + return 
self._from_data( + self._data._from_columns_like_self( + libcudf.quantiles.quantile_table( + [*self._columns], + q, + interpolation, + is_sorted, + column_order, + null_precedence, + ), + ) ) @classmethod @@ -1120,7 +1117,9 @@ def isna(self): array([False, False, True, True, False, False]) """ data_columns = (col.isnull() for col in self._columns) - return self._from_data_like_self(zip(self._column_names, data_columns)) + return self._from_data( + self._data._from_columns_like_self(data_columns) + ) # Alias for isna isnull = isna @@ -1199,7 +1198,9 @@ def notna(self): array([ True, True, False, False, True, True]) """ data_columns = (col.notnull() for col in self._columns) - return self._from_data_like_self(zip(self._column_names, data_columns)) + return self._from_data( + self._data._from_columns_like_self(data_columns) + ) # Alias for notna notnull = notna @@ -1487,26 +1488,26 @@ def _split(self, splits): """Split a frame with split points in ``splits``. Returns a list of Frames of length `len(splits) + 1`. 
""" + frame_splits = libcudf.copying.columns_split( + [*self._data.columns], splits + ) return [ - self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ - split_idx - ], - self._column_names, - ) - for split_idx in range(len(splits) + 1) + self._from_data(self._data._from_columns_like_self(split)) + for split in frame_splits ] @_cudf_nvtx_annotate def _encode(self): columns, indices = libcudf.transform.table_encode([*self._columns]) - keys = self._from_columns_like_self(columns) + keys = self._from_data(self._data._from_columns_like_self(columns)) return keys, indices @_cudf_nvtx_annotate def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) - return self._from_data_like_self(zip(self._column_names, data_columns)) + return self._from_data( + self._data._from_columns_like_self((data_columns)) + ) @classmethod @_cudf_nvtx_annotate @@ -1637,13 +1638,15 @@ def _apply_cupy_ufunc_to_operands( @_cudf_nvtx_annotate def __neg__(self): """Negate for integral dtypes, logical NOT for bools.""" - return self._from_data_like_self( - { - name: col.unary_operator("not") - if is_bool_dtype(col.dtype) - else -1 * col - for name, col in self._data.items() - } + return self._from_data( + self._data._from_columns_like_self( + ( + col.unary_operator("not") + if col.dtype.kind == "b" + else -1 * col + for col in self._data.columns + ) + ) ) @_cudf_nvtx_annotate @@ -1908,11 +1911,10 @@ def __copy__(self): @_cudf_nvtx_annotate def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" - return self._from_data_like_self( - { - name: _apply_inverse_column(col) - for name, col in self._data.items() - } + return self._from_data( + self._data._from_columns_like_self( + (_apply_inverse_column(col) for col in self._data.columns) + ) ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bd9dc1ae3da..20227b2c730 100644 --- 
a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1068,7 +1068,7 @@ def _binaryop( binop_result = self._colwise_binop(operands, op) if isinstance(other, cudf.Series): - ret = other._from_data_like_self(binop_result) + ret = other._from_data(binop_result) other_name = other.name else: ret = _index_from_data(binop_result) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ca9d5590044..35247ee949d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -298,12 +298,6 @@ def _from_data( out._index = RangeIndex(out._data.nrows) if index is None else index return out - @_cudf_nvtx_annotate - def _from_data_like_self(self, data: MutableMapping): - out = self._from_data(data, self._index) - out._data._level_names = self._data._level_names - return out - @_cudf_nvtx_annotate def _from_columns_like_self( self, @@ -1906,13 +1900,13 @@ def nans_to_nulls(self): 1 3.14 2 """ - result_data = {} - for name, col in self._data.items(): - try: - result_data[name] = col.nans_to_nulls() - except AttributeError: - result_data[name] = col.copy() - return self._from_data_like_self(result_data) + result = ( + col.nans_to_nulls() + if isinstance(col, cudf.core.column.NumericalColumn) + else col.copy() + for col in self._data.columns + ) + return self._from_data(self._data._from_columns_like_self(result)) def _copy_type_metadata( self, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 019daacddba..f9bfdec452e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -2088,6 +2088,8 @@ def _split_columns_by_levels(self, levels): return data_columns, index_columns, data_names, index_names def repeat(self, repeats, axis=None): - return self._from_columns_like_self( - Frame._repeat([*self._columns], repeats, axis), self._column_names + return self._from_data( + self._data._from_columns_like_self( + 
Frame._repeat([*self._columns], repeats, axis) + ) ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 275dc664175..94db1f29fc5 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3654,7 +3654,7 @@ def pct_change( def where(self, cond, other=None, inplace=False): result_col = super().where(cond, other, inplace) return self._mimic_inplace( - self._from_data_like_self({self.name: result_col}), + self._from_data(self._data._from_columns_like_self([result_col])), inplace=inplace, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ead1ab2da6c..079f40c4faf 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10956,3 +10956,24 @@ def test_squeeze(axis, data): result = df.squeeze(axis=axis) expected = df.to_pandas().squeeze(axis=axis) assert_eq(result, expected) + + +@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)]) +@pytest.mark.parametrize( + "operation", + [ + lambda df: df.where(df < 2, 2), + lambda df: df.quantile(q=[0.5, 0.7], method="table"), + lambda df: df.isna(), + lambda df: df.notna(), + lambda df: abs(df), + lambda df: -df, + lambda df: ~df, + lambda df: df.nans_to_nulls(), + ], +) +def test_op_preserves_column_metadata(column, operation): + df = cudf.DataFrame([1], columns=cudf.Index(column)) + result = operation(df).columns + expected = pd.Index(column) + pd.testing.assert_index_equal(result, expected, exact=True)