Skip to content

Commit

Permalink
Align misc DataFrame and MultiIndex methods with pandas 2.x (#16402)
Browse files Browse the repository at this point in the history
The API changes in this PR mostly add implementations or add missing keyword arguments (although some of the new arguments may not be implemented yet). The APIs affected are:

* `DataFrame.insert`
* `DataFrame.melt`
* `DataFrame.merge`
* `DataFrame.quantile`
* `DataFrame.cov`
* `DataFrame.corr`
* `DataFrame.median`
* `DataFrame.rolling`
* `DataFrame.resample`
* `DataFrame.dropna`
* `MultiIndex.from_tuples`
* `MultiIndex.from_frame`
* `MultiIndex.from_product`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16402
  • Loading branch information
mroeschke authored Jul 29, 2024
1 parent 6e7624d commit 3579605
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 76 deletions.
106 changes: 75 additions & 31 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3215,26 +3215,37 @@ def reset_index(
)

@_performance_tracking
def insert(self, loc, name, value, nan_as_null=no_default):
def insert(
self,
loc,
column,
value,
allow_duplicates: bool = False,
nan_as_null=no_default,
):
"""Add a column to DataFrame at the index specified by loc.
Parameters
----------
loc : int
location to insert by index, cannot be greater then num columns + 1
name : number or string
name or label of column to be inserted
column : number or string
column or label of column to be inserted
value : Series or array-like
nan_as_null : bool, Default None
If ``None``/``True``, converts ``np.nan`` values to
``null`` values.
If ``False``, leaves ``np.nan`` values as is.
"""
if allow_duplicates is not False:
raise NotImplementedError(
"allow_duplicates is currently not implemented."
)
if nan_as_null is no_default:
nan_as_null = not cudf.get_option("mode.pandas_compatible")
return self._insert(
loc=loc,
name=name,
name=column,
value=value,
nan_as_null=nan_as_null,
ignore_index=False,
Expand Down Expand Up @@ -4097,7 +4108,15 @@ def transpose(self):
T = property(transpose, doc=transpose.__doc__)

@_performance_tracking
def melt(self, **kwargs):
def melt(
self,
id_vars=None,
value_vars=None,
var_name=None,
value_name="value",
col_level=None,
ignore_index: bool = True,
):
"""Unpivots a DataFrame from wide format to long format,
optionally leaving identifier variables set.
Expand All @@ -4124,23 +4143,30 @@ def melt(self, **kwargs):
"""
from cudf.core.reshape import melt

return melt(self, **kwargs)
return melt(
self,
id_vars=id_vars,
value_vars=value_vars,
var_name=var_name,
value_name=value_name,
col_level=col_level,
ignore_index=ignore_index,
)

@_performance_tracking
def merge(
self,
right,
how="inner",
on=None,
left_on=None,
right_on=None,
left_index=False,
right_index=False,
how="inner",
sort=False,
lsuffix=None,
rsuffix=None,
indicator=False,
suffixes=("_x", "_y"),
indicator=False,
validate=None,
):
"""Merge GPU DataFrame objects by performing a database-style join
operation by columns or indexes.
Expand Down Expand Up @@ -4241,17 +4267,8 @@ def merge(
raise NotImplementedError(
"Only indicator=False is currently supported"
)

if lsuffix or rsuffix:
raise ValueError(
"The lsuffix and rsuffix keywords have been replaced with the "
"``suffixes=`` keyword. "
"Please provide the following instead: \n\n"
" suffixes=('%s', '%s')"
% (lsuffix or "_x", rsuffix or "_y")
)
else:
lsuffix, rsuffix = suffixes
if validate is not None:
raise NotImplementedError("validate is currently not supported.")

lhs, rhs = self, right
merge_cls = Merge
Expand Down Expand Up @@ -5952,9 +5969,9 @@ def quantile(
axis=0,
numeric_only=True,
interpolation=None,
method="single",
columns=None,
exact=True,
method="single",
):
"""
Return values at the given quantile.
Expand All @@ -5980,14 +5997,14 @@ def quantile(
* higher: `j`.
* nearest: `i` or `j` whichever is nearest.
* midpoint: (`i` + `j`) / 2.
columns : list of str
List of column names to include.
exact : boolean
Whether to use approximate or exact quantile algorithm.
method : {'single', 'table'}, default `'single'`
Whether to compute quantiles per-column ('single') or over all
columns ('table'). When 'table', the only allowed interpolation
methods are 'nearest', 'lower', and 'higher'.
columns : list of str
List of column names to include.
exact : boolean
Whether to use approximate or exact quantile algorithm.
Returns
-------
Expand Down Expand Up @@ -7309,25 +7326,47 @@ def unnamed_group_generator():
return result

@_performance_tracking
def cov(self, **kwargs):
def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False):
"""Compute the covariance matrix of a DataFrame.
Parameters
----------
**kwargs
Keyword arguments to be passed to cupy.cov
min_periods : int, optional
Minimum number of observations required per pair of columns to
have a valid result.
Currently not supported.
ddof : int, default 1
Delta degrees of freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.
numeric_only : bool, default False
Include only `float`, `int` or `boolean` data.
Currently not supported.
Returns
-------
cov : DataFrame
"""
cov = cupy.cov(self.values, rowvar=False)
if min_periods is not None:
raise NotImplementedError(
"min_periods is currently not supported."
)

if numeric_only is not False:
raise NotImplementedError(
"numeric_only is currently not supported."
)

cov = cupy.cov(self.values, ddof=ddof, rowvar=False)
cols = self._data.to_pandas_index()
df = DataFrame(cupy.asfortranarray(cov)).set_index(cols)
df._set_columns_like(self._data)
return df

def corr(self, method="pearson", min_periods=None):
def corr(
self, method="pearson", min_periods=None, numeric_only: bool = False
):
"""Compute the correlation matrix of a DataFrame.
Parameters
Expand Down Expand Up @@ -7357,6 +7396,11 @@ def corr(self, method="pearson", min_periods=None):
if min_periods is not None:
raise NotImplementedError("Unsupported argument 'min_periods'")

if numeric_only is not False:
raise NotImplementedError(
"numeric_only is currently not supported."
)

corr = cupy.corrcoef(values, rowvar=False)
cols = self._data.to_pandas_index()
df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
Expand Down
81 changes: 52 additions & 29 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1495,9 +1495,7 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs):
**kwargs,
)

def median(
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
):
def median(self, axis=None, skipna=True, numeric_only=None, **kwargs):
"""
Return the median of the values for the requested axis.
Expand Down Expand Up @@ -1857,15 +1855,28 @@ def mask(
@_performance_tracking
@copy_docstring(Rolling)
def rolling(
self, window, min_periods=None, center=False, axis=0, win_type=None
self,
window,
min_periods=None,
center: bool = False,
win_type: str | None = None,
on=None,
axis=0,
closed: str | None = None,
step: int | None = None,
method: str = "single",
):
return Rolling(
self,
window,
min_periods=min_periods,
center=center,
axis=axis,
on=on,
win_type=win_type,
closed=closed,
step=step,
method=method,
)

@copy_docstring(ExponentialMovingWindow)
Expand All @@ -1880,6 +1891,7 @@ def ewm(
ignore_na: bool = False,
axis: int = 0,
times: str | np.ndarray | None = None,
method: Literal["single", "table"] = "single",
):
return ExponentialMovingWindow(
self,
Expand All @@ -1892,6 +1904,7 @@ def ewm(
ignore_na=ignore_na,
axis=axis,
times=times,
method=method,
)

@_performance_tracking
Expand Down Expand Up @@ -3943,16 +3956,15 @@ def resample(
self,
rule,
axis=0,
closed=None,
label=None,
convention="start",
closed: Literal["right", "left"] | None = None,
label: Literal["right", "left"] | None = None,
convention: Literal["start", "end", "s", "e"] = "start",
kind=None,
loffset=None,
base=None,
on=None,
level=None,
origin="start_day",
offset=None,
group_keys: bool = False,
):
"""
Convert the frequency of ("resample") the given time series data.
Expand Down Expand Up @@ -4090,26 +4102,27 @@ def resample(
"deprecated and will be removed in a future version. ",
FutureWarning,
)
if (axis, convention, kind, loffset, base, origin, offset) != (
0,
"start",
None,
None,
None,
"start_day",
None,
):
raise NotImplementedError(
"The following arguments are not "
"currently supported by resample:\n\n"
"- axis\n"
"- convention\n"
"- kind\n"
"- loffset\n"
"- base\n"
"- origin\n"
"- offset"
raise NotImplementedError("kind is currently not supported.")
if axis != 0:
warnings.warn(
"The 'axis' keyword in is "
"deprecated and will be removed in a future version. ",
FutureWarning,
)
raise NotImplementedError("axis is currently not supported.")
if convention != "start":
warnings.warn(
"The 'convention' keyword in is "
"deprecated and will be removed in a future version. ",
FutureWarning,
)
raise NotImplementedError("convention is currently not supported.")
if origin != "start_day":
raise NotImplementedError("origin is currently not supported.")
if offset is not None:
raise NotImplementedError("offset is currently not supported.")
if group_keys is not False:
raise NotImplementedError("group_keys is currently not supported.")
by = cudf.Grouper(
key=on, freq=rule, closed=closed, label=label, level=level
)
Expand All @@ -4120,7 +4133,13 @@ def resample(
)

def dropna(
self, axis=0, how="any", thresh=None, subset=None, inplace=False
self,
axis=0,
how="any",
thresh=None,
subset=None,
inplace=False,
ignore_index: bool = False,
):
"""
Drop rows (or columns) containing nulls from a Column.
Expand All @@ -4144,6 +4163,8 @@ def dropna(
columns, subset is a list of rows to consider.
inplace : bool, default False
If True, do operation inplace and return None.
ignore_index : bool, default ``False``
If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
Returns
-------
Expand Down Expand Up @@ -4220,6 +4241,8 @@ def dropna(
"""
if axis == 0:
result = self._drop_na_rows(how=how, subset=subset, thresh=thresh)
if ignore_index:
result.index = RangeIndex(len(result))
else:
result = self._drop_na_columns(
how=how, subset=subset, thresh=thresh
Expand Down
Loading

0 comments on commit 3579605

Please sign in to comment.