Fix DataFrame reductions with median returning scalar instead of Series (#16527)

xref #16507

This turned into a little bit of a refactor that also fixes the following:

* `cudf.DataFrame.from_pandas` not preserving the `pandas.DataFrame.columns.dtype`
* `cudf.DataFrame.<reduction>(axis=0)` not preserving the `.columns` properties in the resulting `.index` (both illustrated in the sketch below)
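
A rough sketch of the intended post-fix behavior, pieced together from the description above and the new tests in this commit (the exact outputs shown are assumptions, not verified here):

```python
import pandas as pd
import cudf

# Axis-0 reductions, including median, should return a Series indexed by
# the column labels, matching pandas (previously median could come back
# as a scalar).
df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
print(df.median())                  # expected: a -> 2.0, b -> 5.0

# from_pandas should carry over the dtype of the pandas column Index...
pdf = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8"))
gdf = cudf.DataFrame.from_pandas(pdf)
print(gdf.columns.dtype)            # expected: int8

# ...and axis-0 reductions should preserve those column properties in the
# index of the result.
print(gdf.sum(axis=0).index.dtype)  # expected: int8
```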

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16527
mroeschke authored Aug 16, 2024
1 parent 10cdd5f commit cb843db
Showing 5 changed files with 90 additions and 110 deletions.
3 changes: 3 additions & 0 deletions python/cudf/cudf/core/column_accessor.py
@@ -352,6 +352,9 @@ def insert(
new_values = self.columns[:loc] + (value,) + self.columns[loc:]
self._data = self._data.__class__(zip(new_keys, new_values))
self._clear_cache(old_ncols, old_ncols + 1)
if old_ncols == 0:
# The type(name) may no longer match the prior label_dtype
self.label_dtype = None

def copy(self, deep=False) -> ColumnAccessor:
"""
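For context on the `column_accessor.py` hunk above: once a column is inserted into a previously empty accessor, a cached `label_dtype` may no longer describe the labels, so it is dropped. A plain-pandas analogy of the problem being guarded against (an illustration only, not cudf internals):

```python
import pandas as pd

# An empty frame can carry a typed column Index...
pdf = pd.DataFrame(columns=pd.Index([], dtype="int8"))
print(pdf.columns.dtype)  # int8

# ...but inserting a label of a different type changes the column Index
# dtype, so a label dtype cached beforehand would now be stale.
pdf["a"] = 1.0
print(pdf.columns.dtype)  # object
```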
120 changes: 43 additions & 77 deletions python/cudf/cudf/core/dataframe.py
@@ -5497,14 +5497,9 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
)

if isinstance(dataframe, pd.DataFrame):
if not dataframe.columns.is_unique:
raise ValueError("Duplicate column names are not allowed")

data = {
col_name: column.as_column(
col_value.array, nan_as_null=nan_as_null
)
for col_name, col_value in dataframe.items()
i: column.as_column(col_value.array, nan_as_null=nan_as_null)
for i, (_, col_value) in enumerate(dataframe.items())
}
if isinstance(dataframe.index, pd.MultiIndex):
index = cudf.MultiIndex.from_pandas(
@@ -5515,14 +5510,8 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
dataframe.index, nan_as_null=nan_as_null
)
df = cls._from_data(data, index)
df._data._level_names = tuple(dataframe.columns.names)

if isinstance(dataframe.columns, pd.RangeIndex):
df._data.rangeindex = True
# Set columns only if it is a MultiIndex
elif isinstance(dataframe.columns, pd.MultiIndex):
df.columns = dataframe.columns

# Checks duplicate columns and sets column metadata
df.columns = dataframe.columns
return df
elif hasattr(dataframe, "__dataframe__"):
# TODO: Probably should be handled in the constructor as
@@ -6382,8 +6371,11 @@ def _reduce(
source = self

if axis is None:
assert PANDAS_LT_300, "Replace if/else with just axis=2"
# TODO(pandas3.0): Remove if/else for just axis = 2
if op in {"sum", "product", "std", "var"}:
# Do not remove until pandas 2.0 support is added.
# pandas only raises FutureWarning for these ops
# though it applies for all reductions
warnings.warn(
f"In a future version, {type(self).__name__}"
f".{op}(axis=None) will return a scalar {op} over "
@@ -6402,9 +6394,7 @@ def _reduce(

if numeric_only:
numeric_cols = (
name
for name in self._data.names
if is_numeric_dtype(self._data[name].dtype)
name for name, dtype in self._dtypes if is_numeric_dtype(dtype)
)
source = self._get_columns_by_label(numeric_cols)
if source.empty:
@@ -6414,62 +6404,41 @@ def _reduce(
else source.index,
dtype="float64",
)
if axis in {0, 2}:
if axis == 2 and op in ("kurtosis", "kurt", "skew"):
# TODO: concat + op can probably be done in the general case
# for axis == 2.
# https://github.com/rapidsai/cudf/issues/14930
return getattr(concat_columns(source._data.columns), op)(
**kwargs
)
try:
result = [
getattr(source._data[col], op)(**kwargs)
for col in source._data.names
]
except AttributeError:
numeric_ops = (
"mean",
"min",
"max",
"sum",
"product",
"prod",
"std",
"var",
"kurtosis",
"kurt",
"skew",
)

if op in numeric_ops:
if (
axis == 2
and op in {"kurtosis", "skew"}
and self._num_rows < 4
and self._num_columns > 1
):
# Total number of elements may satisfy the min number of values
# to compute skew/kurtosis
return getattr(concat_columns(source._columns), op)(**kwargs)
elif axis == 1:
return source._apply_cupy_method_axis_1(op, **kwargs)
else:
axis_0_results = []
for col_label, col in source._data.items():
try:
axis_0_results.append(getattr(col, op)(**kwargs))
except AttributeError as err:
if numeric_only:
try:
result = [
getattr(source._data[col], op)(**kwargs)
for col in source._data.names
]
except AttributeError:
raise NotImplementedError(
f"Not all column dtypes support op {op}"
)
elif any(
not is_numeric_dtype(self._data[name].dtype)
for name in self._data.names
):
raise NotImplementedError(
f"Column {col_label} with type {col.dtype} does not support {op}"
) from err
elif not is_numeric_dtype(col.dtype):
raise TypeError(
"Non numeric columns passed with "
"`numeric_only=False`, pass `numeric_only=True` "
f"to perform DataFrame.{op}"
)
else:
raise
) from err
else:
raise
if axis == 2:
return getattr(as_column(result, nan_as_null=False), op)(
**kwargs
)
return getattr(
as_column(axis_0_results, nan_as_null=False), op
)(**kwargs)
else:
source_dtypes = [c.dtype for c in source._data.columns]
source_dtypes = [dtype for _, dtype in source._dtypes]
common_dtype = find_common_type(source_dtypes)
if (
is_object_dtype(common_dtype)
Expand All @@ -6483,17 +6452,14 @@ def _reduce(
"Columns must all have the same dtype to "
f"perform {op=} with {axis=}"
)
pd_index = source._data.to_pandas_index()
if source._data.multiindex:
idx = MultiIndex.from_tuples(
source._data.names, names=source._data.level_names
)
idx = MultiIndex.from_pandas(pd_index)
else:
idx = cudf.Index(source._data.names)
return Series._from_column(as_column(result), index=idx)
elif axis == 1:
return source._apply_cupy_method_axis_1(op, **kwargs)
else:
raise ValueError(f"Invalid value of {axis=} received for {op}")
idx = cudf.Index.from_pandas(pd_index)
return Series._from_column(
as_column(axis_0_results), index=idx
)

@_performance_tracking
def _scan(
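The net effect of the refactored `_reduce` axis-0 path above: each column is reduced on its own, and the result index is rebuilt from the columns' pandas Index via `MultiIndex.from_pandas`/`Index.from_pandas`, which is what keeps the column label dtype in the result. A standalone sketch of the same idea (the helper name `reduce_axis_0` is illustrative, not cudf's internal API):

```python
import pandas as pd
import cudf

def reduce_axis_0(df: cudf.DataFrame, op: str) -> cudf.Series:
    # Reduce column by column, then rebuild the index from the column
    # labels so their dtype (or MultiIndex structure) survives.
    values = [getattr(df[label], op)() for label in df.columns]
    pd_index = df.columns  # a pandas Index carrying the label dtype
    if isinstance(pd_index, pd.MultiIndex):
        idx = cudf.MultiIndex.from_pandas(pd_index)
    else:
        idx = cudf.Index.from_pandas(pd_index)
    return cudf.Series(values, index=idx)

pdf = pd.DataFrame([[1, 2]], columns=pd.Index([0, 1], dtype="int8"))
gdf = cudf.DataFrame.from_pandas(pdf)
print(reduce_axis_0(gdf, "sum").index.dtype)  # expected: int8
```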
36 changes: 3 additions & 33 deletions python/cudf/cudf/core/indexed_frame.py
@@ -1386,11 +1386,6 @@ def sum(
a 10
b 34
dtype: int64
.. pandas-compat::
:meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum`
Parameters currently not supported are `level`, `numeric_only`.
"""
return self._reduce(
"sum",
@@ -1447,11 +1442,6 @@ def product(
a 24
b 5040
dtype: int64
.. pandas-compat::
:meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product`
Parameters currently not supported are level`, `numeric_only`.
"""

return self._reduce(
@@ -1508,7 +1498,9 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs):
**kwargs,
)

def median(self, axis=None, skipna=True, numeric_only=None, **kwargs):
def median(
self, axis=no_default, skipna=True, numeric_only=None, **kwargs
):
"""
Return the median of the values for the requested axis.
@@ -1542,11 +1534,6 @@ def median(self, axis=None, skipna=True, numeric_only=None, **kwargs):
dtype: int64
>>> ser.median()
17.0
.. pandas-compat::
:meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median`
Parameters currently not supported are `level` and `numeric_only`.
"""
return self._reduce(
"median",
@@ -1598,12 +1585,6 @@ def std(
a 1.290994
b 1.290994
dtype: float64
.. pandas-compat::
:meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std`
Parameters currently not supported are `level` and
`numeric_only`
"""

return self._reduce(
@@ -1657,12 +1638,6 @@ def var(
a 1.666667
b 1.666667
dtype: float64
.. pandas-compat::
:meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var`
Parameters currently not supported are `level` and
`numeric_only`
"""
return self._reduce(
"var",
@@ -1713,11 +1688,6 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
a -1.2
b -1.2
dtype: float64
.. pandas-compat::
:meth:`pandas.DataFrame.kurtosis`
Parameters currently not supported are `level` and `numeric_only`
"""
if axis not in (0, "index", None, no_default):
raise NotImplementedError("Only axis=0 is currently supported.")
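The `median` signature change above (`axis=no_default` instead of `axis=None` as the default) is what routes a plain `df.median()` through the same axis handling as the other reductions, so it now returns a per-column Series. A short usage sketch (the `axis=None` result is an assumption based on the axis-None branch in `_reduce`):

```python
import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# Default call: per-column medians as a Series, matching pandas.
print(df.median())
# a    2.0
# b    5.0
# dtype: float64

# An explicit axis=None goes through the axis-None branch in _reduce and
# reduces over every element.
print(df.median(axis=None))  # expected: 3.5
```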
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
@@ -11114,3 +11114,9 @@ def test_bool_raises():
lfunc_args_and_kwargs=[[cudf.DataFrame()]],
rfunc_args_and_kwargs=[[pd.DataFrame()]],
)


def test_from_pandas_preserve_column_dtype():
df = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8"))
result = cudf.DataFrame.from_pandas(df)
pd.testing.assert_index_equal(result.columns, df.columns, exact=True)
35 changes: 35 additions & 0 deletions python/cudf/cudf/tests/test_reductions.py
@@ -358,6 +358,30 @@ def test_reductions_axis_none_warning(op):
assert_eq(expected, actual, check_dtype=False)


@pytest.mark.parametrize(
"op",
[
"sum",
"product",
"std",
"var",
"kurt",
"kurtosis",
"skew",
"min",
"max",
"mean",
"median",
],
)
def test_dataframe_reduction_no_args(op):
df = cudf.DataFrame({"a": range(10), "b": range(10)})
pdf = df.to_pandas()
result = getattr(df, op)()
expected = getattr(pdf, op)()
assert_eq(result, expected)


def test_reduction_column_multiindex():
idx = cudf.MultiIndex.from_tuples(
[("a", 1), ("a", 2)], names=["foo", "bar"]
@@ -374,3 +398,14 @@ def test_dtype_deprecated(op):
with pytest.warns(FutureWarning):
result = getattr(ser, op)(dtype=np.dtype(np.int8))
assert isinstance(result, np.int8)


@pytest.mark.parametrize(
"columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")]
)
def test_dataframe_axis_0_preserve_column_type_in_index(columns):
pd_df = pd.DataFrame([[1, 2]], columns=columns)
cudf_df = cudf.DataFrame.from_pandas(pd_df)
result = cudf_df.sum(axis=0)
expected = pd_df.sum(axis=0)
assert_eq(result, expected, check_index_type=True)
