Preserve column metadata during more DataFrame operations
mroeschke committed Mar 27, 2024
1 parent 7c69e66 commit a80e593
Showing 9 changed files with 134 additions and 81 deletions.
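
The changes below route a number of DataFrame and Index operations through a new ColumnAccessor._from_columns_like_self helper so that results keep the column index's metadata (MultiIndex structure, level names, RangeIndex columns, and the label dtype) rather than only the column labels. A small sketch of the user-visible effect, mirroring the test added at the end of this commit:

import numpy as np
import pandas as pd
import cudf

# Column labels stored in an int8 Index.
df = cudf.DataFrame([1], columns=cudf.Index(np.array([1], dtype=np.int8)))

# Element-wise operations such as isna() now keep that int8 column index
# instead of rebuilding it with a default dtype.
result = df.isna().columns
pd.testing.assert_index_equal(result, pd.Index(np.array([1], dtype=np.int8)), exact=True)
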
46 changes: 25 additions & 21 deletions python/cudf/cudf/core/_base_index.py
@@ -1984,14 +1984,15 @@ def drop_duplicates(

# This utilizes the fact that all `Index` is also a `Frame`.
# Except RangeIndex.
return self._from_columns_like_self(
drop_duplicates(
list(self._columns),
keys=range(len(self._data)),
keep=keep,
nulls_are_equal=nulls_are_equal,
),
self._column_names,
return self._from_data(
self._data._from_columns_like_self(
drop_duplicates(
list(self._columns),
keys=range(len(self._data)),
keep=keep,
nulls_are_equal=nulls_are_equal,
),
)
)

def duplicated(self, keep="first"):
@@ -2071,13 +2072,14 @@ def dropna(self, how="any"):
for col in self._columns
]

return self._from_columns_like_self(
drop_nulls(
data_columns,
how=how,
keys=range(len(data_columns)),
),
self._column_names,
return self._from_data(
self._data._from_columns_like_self(
drop_nulls(
data_columns,
how=how,
keys=range(len(data_columns)),
),
)
)

def _gather(self, gather_map, nullify=False, check_bounds=True):
@@ -2098,9 +2100,10 @@ def _gather(self, gather_map, nullify=False, check_bounds=True):
):
raise IndexError("Gather map index is out of bounds.")

return self._from_columns_like_self(
gather(list(self._columns), gather_map, nullify=nullify),
self._column_names,
return self._from_data(
self._data._from_columns_like_self(
gather(list(self._columns), gather_map, nullify=nullify),
)
)

def take(self, indices, axis=0, allow_fill=True, fill_value=None):
@@ -2147,9 +2150,10 @@ def _apply_boolean_mask(self, boolean_mask):
if not is_bool_dtype(boolean_mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

return self._from_columns_like_self(
apply_boolean_mask(list(self._columns), boolean_mask),
column_names=self._column_names,
return self._from_data(
self._data._from_columns_like_self(
apply_boolean_mask(list(self._columns), boolean_mask),
)
)

def repeat(self, repeats, axis=None):
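
All four hunks in this file follow the same refactor, which recurs throughout the commit: instead of rebuilding the result via Frame._from_columns_like_self(new_columns, self._column_names), which only carries the column labels forward, the result is rebuilt from the existing ColumnAccessor, which also carries multiindex, level_names, rangeindex and label_dtype (see the new ColumnAccessor._from_columns_like_self in the next file). A hypothetical, stripped-down sketch of the new call shape; rebuild_like and new_columns are illustrative names, not part of the diff:

def rebuild_like(frame, new_columns):
    # The accessor pairs new_columns with its existing labels and copies
    # multiindex/level_names/rangeindex/label_dtype onto the result.
    return frame._from_data(frame._data._from_columns_like_self(new_columns))
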
26 changes: 26 additions & 0 deletions python/cudf/cudf/core/column_accessor.py
@@ -3,6 +3,7 @@
from __future__ import annotations

import itertools
import sys
from collections import abc
from functools import cached_property, reduce
from typing import (
@@ -174,6 +175,31 @@ def __repr__(self) -> str:
)
return f"{type_info}\n{column_info}"

def _from_columns_like_self(
self, columns: abc.Iterable[ColumnBase], verify: bool = True
):
"""
Return a new ColumnAccessor with columns and the properties of self.
"""
if sys.version_info.major >= 3 and sys.version_info.minor >= 10:
data = zip(self.names, columns, strict=True)
else:
columns = list(columns)
if len(columns) != len(self.names):
raise ValueError(
f"The number of columns ({len(columns)}) must match "
f"the number of existing column labels ({len(self.names)})."
)
data = zip(self.names, columns)
return type(self)(
data=dict(data),
multiindex=self.multiindex,
level_names=self.level_names,
rangeindex=self.rangeindex,
label_dtype=self.label_dtype,
verify=verify,
)

@property
def level_names(self) -> Tuple[Any, ...]:
if self._level_names is None or len(self._level_names) == 0:
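
A rough usage sketch of the new accessor method (this leans on cuDF internals and is not part of the diff; the constructor arguments it forwards are the ones visible in the hunk above):

from cudf.core.column import as_column
from cudf.core.column_accessor import ColumnAccessor

ca = ColumnAccessor({"a": as_column([1, 2]), "b": as_column([3, 4])})

# Replace the columns while keeping the accessor's labels and metadata.
new_ca = ca._from_columns_like_self([as_column([5, 6]), as_column([7, 8])])
print(new_ca.names)  # ('a', 'b')

# Passing the wrong number of columns fails: zip(strict=True) on Python 3.10+,
# or the explicit length check on older interpreters, raises ValueError.
try:
    ca._from_columns_like_self([as_column([5, 6])])
except ValueError as err:
    print(err)
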
16 changes: 10 additions & 6 deletions python/cudf/cudf/core/dataframe.py
@@ -3036,8 +3036,11 @@ def where(self, cond, other=None, inplace=False):

# First process the condition.
if isinstance(cond, Series):
cond = self._from_data_like_self(
{name: cond._column for name in self._column_names},
cond = self._from_data(
self._data._from_columns_like_self(
itertools.repeat(cond._column, len(self._column_names)),
verify=False,
)
)
elif hasattr(cond, "__cuda_array_interface__"):
cond = DataFrame(
@@ -3078,7 +3081,7 @@ def where(self, cond, other=None, inplace=False):
should be equal to number of columns of self"""
)

out = {}
out = []
for (name, col), other_col in zip(self._data.items(), other_cols):
col, other_col = _check_and_cast_columns_with_other(
source_col=col,
@@ -3091,16 +3094,17 @@ def where(self, cond, other=None, inplace=False):
col, other_col, cond_col
)

out[name] = _make_categorical_like(result, self._data[name])
out.append(_make_categorical_like(result, self._data[name]))
else:
out_mask = cudf._lib.null_mask.create_null_mask(
len(col),
state=cudf._lib.null_mask.MaskState.ALL_NULL,
)
out[name] = col.set_mask(out_mask)
out.append(col.set_mask(out_mask))

return self._mimic_inplace(
self._from_data_like_self(out), inplace=inplace
self._from_data(self._data._from_columns_like_self(out)),
inplace=inplace,
)

@docutils.doc_apply(
76 changes: 39 additions & 37 deletions python/cudf/cudf/core/frame.py
@@ -133,10 +133,6 @@ def _from_data(cls, data: MutableMapping) -> Self:
Frame.__init__(obj, data)
return obj

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping) -> Self:
return self._from_data(data)

@_cudf_nvtx_annotate
def _from_columns_like_self(
self,
@@ -813,16 +809,17 @@ def _quantile_table(
libcudf.types.NullOrder[key] for key in null_precedence
]

return self._from_columns_like_self(
libcudf.quantiles.quantile_table(
[*self._columns],
q,
interpolation,
is_sorted,
column_order,
null_precedence,
),
column_names=self._column_names,
return self._from_data(
self._data._from_columns_like_self(
libcudf.quantiles.quantile_table(
[*self._columns],
q,
interpolation,
is_sorted,
column_order,
null_precedence,
),
)
)

@classmethod
@@ -1120,7 +1117,9 @@ def isna(self):
array([False, False, True, True, False, False])
"""
data_columns = (col.isnull() for col in self._columns)
return self._from_data_like_self(zip(self._column_names, data_columns))
return self._from_data(
self._data._from_columns_like_self(data_columns)
)

# Alias for isna
isnull = isna
@@ -1199,7 +1198,9 @@ def notna(self):
array([ True, True, False, False, True, True])
"""
data_columns = (col.notnull() for col in self._columns)
return self._from_data_like_self(zip(self._column_names, data_columns))
return self._from_data(
self._data._from_columns_like_self(data_columns)
)

# Alias for notna
notnull = notna
@@ -1487,26 +1488,26 @@ def _split(self, splits):
"""Split a frame with split points in ``splits``. Returns a list of
Frames of length `len(splits) + 1`.
"""
frame_splits = libcudf.copying.columns_split(
[*self._data.columns], splits
)
return [
self._from_columns_like_self(
libcudf.copying.columns_split([*self._data.columns], splits)[
split_idx
],
self._column_names,
)
for split_idx in range(len(splits) + 1)
self._from_data(self._data._from_columns_like_self(split))
for split in frame_splits
]

@_cudf_nvtx_annotate
def _encode(self):
columns, indices = libcudf.transform.table_encode([*self._columns])
keys = self._from_columns_like_self(columns)
keys = self._from_data(self._data._from_columns_like_self(columns))
return keys, indices

@_cudf_nvtx_annotate
def _unaryop(self, op):
data_columns = (col.unary_operator(op) for col in self._columns)
return self._from_data_like_self(zip(self._column_names, data_columns))
return self._from_data(
self._data._from_columns_like_self(data_columns)
)

@classmethod
@_cudf_nvtx_annotate
@@ -1637,13 +1638,15 @@ def _apply_cupy_ufunc_to_operands(
@_cudf_nvtx_annotate
def __neg__(self):
"""Negate for integral dtypes, logical NOT for bools."""
return self._from_data_like_self(
{
name: col.unary_operator("not")
if is_bool_dtype(col.dtype)
else -1 * col
for name, col in self._data.items()
}
return self._from_data(
self._data._from_columns_like_self(
(
col.unary_operator("not")
if col.dtype.kind == "b"
else -1 * col
for col in self._data.columns
)
)
)

@_cudf_nvtx_annotate
@@ -1908,11 +1911,10 @@ def __copy__(self):
@_cudf_nvtx_annotate
def __invert__(self):
"""Bitwise invert (~) for integral dtypes, logical NOT for bools."""
return self._from_data_like_self(
{
name: _apply_inverse_column(col)
for name, col in self._data.items()
}
return self._from_data(
self._data._from_columns_like_self(
(_apply_inverse_column(col) for col in self._data.columns)
)
)

@_cudf_nvtx_annotate
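
Two incidental cleanups ride along in this file: _split now calls libcudf.copying.columns_split once and iterates the result instead of re-splitting the table for every output slice, and __neg__ tests boolean columns with col.dtype.kind == "b" rather than is_bool_dtype. For reference, the dtype-kind idiom (a quick, self-contained check):

import numpy as np

# numpy encodes the dtype family as a one-character kind code:
# "b" is boolean, "i"/"u" are signed/unsigned integers, "f" is float.
assert np.dtype("bool").kind == "b"
assert np.dtype("int8").kind == "i"
assert np.dtype("float64").kind == "f"
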
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/index.py
@@ -1068,7 +1068,7 @@ def _binaryop(
binop_result = self._colwise_binop(operands, op)

if isinstance(other, cudf.Series):
ret = other._from_data_like_self(binop_result)
ret = other._from_data(binop_result)
other_name = other.name
else:
ret = _index_from_data(binop_result)
20 changes: 7 additions & 13 deletions python/cudf/cudf/core/indexed_frame.py
@@ -298,12 +298,6 @@ def _from_data(
out._index = RangeIndex(out._data.nrows) if index is None else index
return out

@_cudf_nvtx_annotate
def _from_data_like_self(self, data: MutableMapping):
out = self._from_data(data, self._index)
out._data._level_names = self._data._level_names
return out

@_cudf_nvtx_annotate
def _from_columns_like_self(
self,
@@ -1906,13 +1900,13 @@ def nans_to_nulls(self):
1 <NA> 3.14
2 <NA> <NA>
"""
result_data = {}
for name, col in self._data.items():
try:
result_data[name] = col.nans_to_nulls()
except AttributeError:
result_data[name] = col.copy()
return self._from_data_like_self(result_data)
result = (
col.nans_to_nulls()
if isinstance(col, cudf.core.column.NumericalColumn)
else col.copy()
for col in self._data.columns
)
return self._from_data(self._data._from_columns_like_self(result))

def _copy_type_metadata(
self,
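
The nans_to_nulls rewrite above swaps the try/except-on-AttributeError dispatch for an explicit NumericalColumn check and routes the result through the ColumnAccessor. A short sketch of the public method (not part of the diff; the printed reprs are illustrative):

import cudf

df = cudf.DataFrame({"a": [1.0, float("nan")], "b": ["x", "y"]})
out = df.nans_to_nulls()

# NaN in the float column becomes <NA>; non-numerical columns are copied
# as-is, and the result keeps df's column-index metadata.
print(out)
print(out.columns)
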
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/multiindex.py
@@ -2088,6 +2088,8 @@ def _split_columns_by_levels(self, levels):
return data_columns, index_columns, data_names, index_names

def repeat(self, repeats, axis=None):
return self._from_columns_like_self(
Frame._repeat([*self._columns], repeats, axis), self._column_names
return self._from_data(
self._data._from_columns_like_self(
Frame._repeat([*self._columns], repeats, axis)
)
)
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
@@ -3654,7 +3654,7 @@
def where(self, cond, other=None, inplace=False):
result_col = super().where(cond, other, inplace)
return self._mimic_inplace(
self._from_data_like_self({self.name: result_col}),
self._from_data(self._data._from_columns_like_self([result_col])),
inplace=inplace,
)

21 changes: 21 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
@@ -10956,3 +10956,24 @@ def test_squeeze(axis, data):
result = df.squeeze(axis=axis)
expected = df.to_pandas().squeeze(axis=axis)
assert_eq(result, expected)


@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
@pytest.mark.parametrize(
"operation",
[
lambda df: df.where(df < 2, 2),
lambda df: df.quantile(q=[0.5, 0.7], method="table"),
lambda df: df.isna(),
lambda df: df.notna(),
lambda df: abs(df),
lambda df: -df,
lambda df: ~df,
lambda df: df.nans_to_nulls(),
],
)
def test_op_preserves_column_metadata(column, operation):
df = cudf.DataFrame([1], columns=cudf.Index(column))
result = operation(df).columns
expected = pd.Index(column)
pd.testing.assert_index_equal(result, expected, exact=True)
