Skip to content

Commit

Permalink
Align public utility function signatures with pandas 2.x (#16565)
Browse files Browse the repository at this point in the history
The following function signatures have breaking changes:

* `concat`
* `get_dummies`
* `date_range`

Additionally deprecates the `cats` argument in `get_dummies` (it doesn't exist in pandas and is not tested), and fixes a bug in `interval_range` where `name` was not being respected.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16565
  • Loading branch information
mroeschke authored Aug 15, 2024
1 parent 19846b6 commit 89863a3
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 29 deletions.
2 changes: 2 additions & 0 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
"DatetimeIndex",
"Decimal32Dtype",
"Decimal64Dtype",
"Decimal128Dtype",
"Grouper",
"Index",
"IntervalDtype",
Expand Down Expand Up @@ -126,6 +127,7 @@
"isclose",
"melt",
"merge",
"option_context",
"pivot",
"pivot_table",
"read_avro",
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3350,14 +3350,14 @@ def interval_range(
if len(right_col) == 0 or len(left_col) == 0:
dtype = IntervalDtype("int64", closed)
data = column.column_empty_like_same_mask(left_col, dtype)
return IntervalIndex(data, closed=closed)
return IntervalIndex(data, closed=closed, name=name)

interval_col = IntervalColumn(
dtype=IntervalDtype(left_col.dtype, closed),
size=len(left_col),
children=(left_col, right_col),
)
return IntervalIndex(interval_col, closed=closed)
return IntervalIndex(interval_col, closed=closed, name=name)


class IntervalIndex(Index):
Expand Down
74 changes: 55 additions & 19 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,17 @@ def _normalize_series_and_dataframe(objs, axis):
objs[idx] = obj.to_frame(name=name)


def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
def concat(
objs,
axis=0,
join="outer",
ignore_index=False,
keys=None,
levels=None,
names=None,
verify_integrity=False,
sort=None,
):
"""Concatenate DataFrames, Series, or Indices row-wise.
Parameters
Expand All @@ -132,6 +142,21 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
ignore_index : bool, default False
Set True to ignore the index of the *objs* and provide a
default range index instead.
keys : sequence, default None
If multiple levels passed, should contain tuples. Construct
hierarchical index using the passed keys as the outermost level.
Currently not supported.
levels : list of sequences, default None
Specific levels (unique values) to use for constructing a
MultiIndex. Otherwise they will be inferred from the keys.
Currently not supported.
names : list, default None
Names for the levels in the resulting hierarchical index.
Currently not supported.
verify_integrity : bool, default False
Check whether the new concatenated axis contains duplicates. This can
be very expensive relative to the actual data concatenation.
Currently not supported.
sort : bool, default False
Sort non-concatenation axis if it is not already aligned.
Expand Down Expand Up @@ -243,6 +268,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
0 a 1 c 3
1 b 2 d 4
"""
if keys is not None:
raise NotImplementedError("keys is currently not supported")
if levels is not None:
raise NotImplementedError("levels is currently not supported")
if names is not None:
raise NotImplementedError("names is currently not supported")
# TODO: Do we really need to have different error messages for an empty
# list and a list of None?
if not objs:
Expand All @@ -260,15 +291,15 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
f"Can only concatenate dictionary input along axis=1, not {axis}"
)
objs = {k: obj for k, obj in objs.items() if obj is not None}
keys = list(objs)
keys_objs = list(objs)
objs = list(objs.values())
if any(isinstance(o, cudf.BaseIndex) for o in objs):
raise TypeError(
"cannot concatenate a dictionary containing indices"
)
else:
objs = [obj for obj in objs if obj is not None]
keys = None
keys_objs = None

if not objs:
raise ValueError("All objects passed were None")
Expand Down Expand Up @@ -317,8 +348,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
result = obj.to_frame()
else:
result = obj.copy(deep=True)
if keys is not None and isinstance(result, cudf.DataFrame):
k = keys[0]
if keys_objs is not None and isinstance(result, cudf.DataFrame):
k = keys_objs[0]
result.columns = cudf.MultiIndex.from_tuples(
[
(k, *c) if isinstance(c, tuple) else (k, c)
Expand Down Expand Up @@ -370,7 +401,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
objs = _align_objs(objs, how=join, sort=sort)
df.index = objs[0].index

if keys is None:
if keys_objs is None:
for o in objs:
for name, col in o._data.items():
if name in df._data:
Expand Down Expand Up @@ -408,9 +439,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
"label types in cuDF at this time. You must convert "
"the labels to the same type."
)
for k, o in zip(keys, objs):
for k, o in zip(keys_objs, objs):
for name, col in o._data.items():
# if only series, then only keep keys as column labels
# if only series, then only keep keys_objs as column labels
# if the existing column is multiindex, prepend it
# to handle cases where dfs and srs are concatenated
if only_series:
Expand All @@ -426,7 +457,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
else:
df[col_label] = col

if keys is None:
if keys_objs is None:
df.columns = result_columns.unique()
if ignore_index:
df.columns = cudf.RangeIndex(len(result_columns.unique()))
Expand Down Expand Up @@ -666,7 +697,7 @@ def _tile(A, reps):


def get_dummies(
df,
data,
prefix=None,
prefix_sep="_",
dummy_na=False,
Expand All @@ -681,7 +712,7 @@ def get_dummies(
Parameters
----------
df : array-like, Series, or DataFrame
data : array-like, Series, or DataFrame
Data of which to get dummy indicators.
prefix : str, dict, or sequence, optional
Prefix to append. Either a str (to apply a constant prefix), dict
Expand Down Expand Up @@ -759,17 +790,22 @@ def get_dummies(

if cats is None:
cats = {}
else:
warnings.warn(
"cats is deprecated and will be removed in a future version.",
FutureWarning,
)
if sparse:
raise NotImplementedError("sparse is not supported yet")

if drop_first:
raise NotImplementedError("drop_first is not supported yet")

if isinstance(df, cudf.DataFrame):
if isinstance(data, cudf.DataFrame):
encode_fallback_dtypes = ["object", "category"]

if columns is None or len(columns) == 0:
columns = df.select_dtypes(
columns = data.select_dtypes(
include=encode_fallback_dtypes
)._column_names

Expand All @@ -796,33 +832,33 @@ def get_dummies(
# If we have no columns to encode, we need to drop
# fallback columns(if any)
if len(columns) == 0:
return df.select_dtypes(exclude=encode_fallback_dtypes)
return data.select_dtypes(exclude=encode_fallback_dtypes)
else:
result_data = {
col_name: col
for col_name, col in df._data.items()
for col_name, col in data._data.items()
if col_name not in columns
}

for name in columns:
if name not in cats:
unique = _get_unique(
column=df._data[name], dummy_na=dummy_na
column=data._data[name], dummy_na=dummy_na
)
else:
unique = as_column(cats[name])

col_enc_data = _one_hot_encode_column(
column=df._data[name],
column=data._data[name],
categories=unique,
prefix=prefix_map.get(name, prefix),
prefix_sep=prefix_sep_map.get(name, prefix_sep),
dtype=dtype,
)
result_data.update(col_enc_data)
return cudf.DataFrame._from_data(result_data, index=df.index)
return cudf.DataFrame._from_data(result_data, index=data.index)
else:
ser = cudf.Series(df)
ser = cudf.Series(data)
unique = _get_unique(column=ser._column, dummy_na=dummy_na)
data = _one_hot_encode_column(
column=ser._column,
Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@ def date_range(
tz=None,
normalize: bool = False,
name=None,
closed: Literal["left", "right", "both", "neither"] = "both",
inclusive: Literal["left", "right", "both", "neither"] = "both",
*,
unit: str | None = None,
):
Expand Down Expand Up @@ -823,7 +823,7 @@ def date_range(
name : str, default None
Name of the resulting DatetimeIndex
closed : {"left", "right", "both", "neither"}, default "both"
inclusive : {"left", "right", "both", "neither"}, default "both"
Whether to set each bound as closed or open.
Currently only "both" is supported
Expand All @@ -839,7 +839,7 @@ def date_range(
-----
Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three
must be specified. If `freq` is omitted, the resulting DatetimeIndex will
have periods linearly spaced elements between start and end (closed on both
have periods linearly spaced elements between start and end (inclusive on both
sides).
cudf supports `freq` specified with either fixed-frequency offset
Expand All @@ -866,8 +866,8 @@ def date_range(
'2026-04-23 08:00:00'],
dtype='datetime64[ns]')
"""
if closed != "both":
raise NotImplementedError(f"{closed=} is currently unsupported.")
if inclusive != "both":
raise NotImplementedError(f"{inclusive=} is currently unsupported.")
if unit is not None:
raise NotImplementedError(f"{unit=} is currently unsupported.")
if normalize is not False:
Expand Down Expand Up @@ -961,7 +961,7 @@ def date_range(
periods = 0
else:
# If end == start, periods == 0 and we return exactly 1 timestamp (start).
# Otherwise, since closed="both", we ensure the end point is included.
# Otherwise, since inclusive="both", we ensure the end point is included.
periods += 1

# We compute `end_estim` (the estimated upper bound of the date
Expand Down
9 changes: 7 additions & 2 deletions python/cudf/cudf/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from cudf.core.column import ColumnBase


def to_numeric(arg, errors="raise", downcast=None):
def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
"""
Convert argument into numerical types.
Expand Down Expand Up @@ -48,6 +48,8 @@ def to_numeric(arg, errors="raise", downcast=None):
Note that downcast behavior is decoupled from parsing. Errors
encountered during downcast is raised regardless of ``errors``
parameter.
dtype_backend : None
Not implemented.
Returns
-------
Expand Down Expand Up @@ -93,7 +95,10 @@ def to_numeric(arg, errors="raise", downcast=None):
For example ``[1, 'a']``. A ``TypeError`` will be raised when such
input is received, regardless of ``errors`` parameter.
"""

if dtype_backend is not None:
raise NotImplementedError(
"dtype_backend is not currently implemented."
)
if errors not in {"raise", "ignore", "coerce"}:
raise ValueError("invalid error value specified")
elif errors == "ignore":
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/indexes/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,3 +401,9 @@ def test_from_tuples():
result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a")
expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a")
assert_eq(result, expected)


def test_interval_range_name():
    """Check that ``interval_range`` carries ``name`` through like pandas."""
    # Use one shared set of arguments so both libraries build the same range.
    range_kwargs = {"start": 0, "periods": 5, "freq": 2, "name": "foo"}
    expected = pd.interval_range(**range_kwargs)
    result = cudf.interval_range(**range_kwargs)
    assert_eq(result, expected)
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_onehot.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,9 @@ def test_get_dummies_array_like_with_nan():
actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")

assert_eq(expected, actual)


def test_get_dummies_cats_deprecated():
    """Check that passing ``cats`` to ``get_dummies`` emits a FutureWarning."""
    frame = cudf.DataFrame(range(3))
    # Explicit categories for column 0; supplying them triggers the warning.
    categories = {0: [0, 1, 2]}
    with pytest.warns(FutureWarning):
        cudf.get_dummies(frame, cats=categories)

0 comments on commit 89863a3

Please sign in to comment.