From 89863a3b791250a2285b90d2c13f51f009638f44 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 14 Aug 2024 17:22:31 -1000 Subject: [PATCH] Align public utility function signatures with pandas 2.x (#16565) The following function signatures have a breaking change * `concat` * `get_dummies` * `date_range` Additionally deprecates the `cats` argument in `get_dummies` (it doesn't exist in pandas and is not tested), and fixes a bug in `interval_range` where `name` was not being respected Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16565 --- python/cudf/cudf/__init__.py | 2 + python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/reshape.py | 74 ++++++++++++++----- python/cudf/cudf/core/tools/datetimes.py | 12 +-- python/cudf/cudf/core/tools/numeric.py | 9 ++- .../cudf/cudf/tests/indexes/test_interval.py | 6 ++ python/cudf/cudf/tests/test_onehot.py | 6 ++ 7 files changed, 84 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index e14815a1b0d..77ae0791b81 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -97,6 +97,7 @@ "DatetimeIndex", "Decimal32Dtype", "Decimal64Dtype", + "Decimal128Dtype", "Grouper", "Index", "IntervalDtype", @@ -126,6 +127,7 @@ "isclose", "melt", "merge", + "option_context", "pivot", "pivot_table", "read_avro", diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c55f86d48e1..d02633a97fa 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3350,14 +3350,14 @@ def interval_range( if len(right_col) == 0 or len(left_col) == 0: dtype = IntervalDtype("int64", closed) data = column.column_empty_like_same_mask(left_col, dtype) - return IntervalIndex(data, closed=closed) + return IntervalIndex(data, closed=closed, name=name) interval_col = 
IntervalColumn( dtype=IntervalDtype(left_col.dtype, closed), size=len(left_col), children=(left_col, right_col), ) - return IntervalIndex(interval_col, closed=closed) + return IntervalIndex(interval_col, closed=closed, name=name) class IntervalIndex(Index): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 52a55760d4a..df471692702 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -118,7 +118,17 @@ def _normalize_series_and_dataframe(objs, axis): objs[idx] = obj.to_frame(name=name) -def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): +def concat( + objs, + axis=0, + join="outer", + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + sort=None, +): """Concatenate DataFrames, Series, or Indices row-wise. Parameters @@ -132,6 +142,21 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ignore_index : bool, default False Set True to ignore the index of the *objs* and provide a default range index instead. + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level. + Currently not supported. + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys. + Currently not supported. + names : list, default None + Names for the levels in the resulting hierarchical index. + Currently not supported. + verify_integrity : bool, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation. + Currently not supported. sort : bool, default False Sort non-concatenation axis if it is not already aligned. 
@@ -243,6 +268,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): 0 a 1 c 3 1 b 2 d 4 """ + if keys is not None: + raise NotImplementedError("keys is currently not supported") + if levels is not None: + raise NotImplementedError("levels is currently not supported") + if names is not None: + raise NotImplementedError("names is currently not supported") # TODO: Do we really need to have different error messages for an empty # list and a list of None? if not objs: @@ -260,7 +291,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): f"Can only concatenate dictionary input along axis=1, not {axis}" ) objs = {k: obj for k, obj in objs.items() if obj is not None} - keys = list(objs) + keys_objs = list(objs) objs = list(objs.values()) if any(isinstance(o, cudf.BaseIndex) for o in objs): raise TypeError( @@ -268,7 +299,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): ) else: objs = [obj for obj in objs if obj is not None] - keys = None + keys_objs = None if not objs: raise ValueError("All objects passed were None") @@ -317,8 +348,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): result = obj.to_frame() else: result = obj.copy(deep=True) - if keys is not None and isinstance(result, cudf.DataFrame): - k = keys[0] + if keys_objs is not None and isinstance(result, cudf.DataFrame): + k = keys_objs[0] result.columns = cudf.MultiIndex.from_tuples( [ (k, *c) if isinstance(c, tuple) else (k, c) @@ -370,7 +401,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): objs = _align_objs(objs, how=join, sort=sort) df.index = objs[0].index - if keys is None: + if keys_objs is None: for o in objs: for name, col in o._data.items(): if name in df._data: @@ -408,9 +439,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): "label types in cuDF at this time. You must convert " "the labels to the same type." 
) - for k, o in zip(keys, objs): + for k, o in zip(keys_objs, objs): for name, col in o._data.items(): - # if only series, then only keep keys as column labels + # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated if only_series: @@ -426,7 +457,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): else: df[col_label] = col - if keys is None: + if keys_objs is None: df.columns = result_columns.unique() if ignore_index: df.columns = cudf.RangeIndex(len(result_columns.unique())) @@ -666,7 +697,7 @@ def _tile(A, reps): def get_dummies( - df, + data, prefix=None, prefix_sep="_", dummy_na=False, @@ -681,7 +712,7 @@ def get_dummies( Parameters ---------- - df : array-like, Series, or DataFrame + data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, dict, or sequence, optional Prefix to append. Either a str (to apply a constant prefix), dict @@ -759,17 +790,22 @@ def get_dummies( if cats is None: cats = {} + else: + warnings.warn( + "cats is deprecated and will be removed in a future version.", + FutureWarning, + ) if sparse: raise NotImplementedError("sparse is not supported yet") if drop_first: raise NotImplementedError("drop_first is not supported yet") - if isinstance(df, cudf.DataFrame): + if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes( + columns = data.select_dtypes( include=encode_fallback_dtypes )._column_names @@ -796,33 +832,33 @@ def get_dummies( # If we have no columns to encode, we need to drop # fallback columns(if any) if len(columns) == 0: - return df.select_dtypes(exclude=encode_fallback_dtypes) + return data.select_dtypes(exclude=encode_fallback_dtypes) else: result_data = { col_name: col - for col_name, col in df._data.items() + for col_name, col in data._data.items() if 
col_name not in columns } for name in columns: if name not in cats: unique = _get_unique( - column=df._data[name], dummy_na=dummy_na + column=data._data[name], dummy_na=dummy_na ) else: unique = as_column(cats[name]) col_enc_data = _one_hot_encode_column( - column=df._data[name], + column=data._data[name], categories=unique, prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, ) result_data.update(col_enc_data) - return cudf.DataFrame._from_data(result_data, index=df.index) + return cudf.DataFrame._from_data(result_data, index=data.index) else: - ser = cudf.Series(df) + ser = cudf.Series(data) unique = _get_unique(column=ser._column, dummy_na=dummy_na) data = _one_hot_encode_column( column=ser._column, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index a92bf420147..7197560b5a4 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -785,7 +785,7 @@ def date_range( tz=None, normalize: bool = False, name=None, - closed: Literal["left", "right", "both", "neither"] = "both", + inclusive: Literal["left", "right", "both", "neither"] = "both", *, unit: str | None = None, ): @@ -823,7 +823,7 @@ def date_range( name : str, default None Name of the resulting DatetimeIndex - closed : {"left", "right", "both", "neither"}, default "both" + inclusive : {"left", "right", "both", "neither"}, default "both" Whether to set each bound as closed or open. Currently only "both" is supported @@ -839,7 +839,7 @@ def date_range( ----- Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three must be specified. If `freq` is omitted, the resulting DatetimeIndex will - have periods linearly spaced elements between start and end (closed on both + have periods linearly spaced elements between start and end (inclusive on both sides). 
cudf supports `freq` specified with either fixed-frequency offset @@ -866,8 +866,8 @@ def date_range( '2026-04-23 08:00:00'], dtype='datetime64[ns]') """ - if closed != "both": - raise NotImplementedError(f"{closed=} is currently unsupported.") + if inclusive != "both": + raise NotImplementedError(f"{inclusive=} is currently unsupported.") if unit is not None: raise NotImplementedError(f"{unit=} is currently unsupported.") if normalize is not False: @@ -961,7 +961,7 @@ def date_range( periods = 0 else: # If end == start, periods == 0 and we return exactly 1 timestamp (start). - # Otherwise, since closed="both", we ensure the end point is included. + # Otherwise, since inclusive="both", we ensure the end point is included. periods += 1 # We compute `end_estim` (the estimated upper bound of the date diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 8b95f6f6a04..6cecf3fa170 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -20,7 +20,7 @@ from cudf.core.column import ColumnBase -def to_numeric(arg, errors="raise", downcast=None): +def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): """ Convert argument into numerical types. @@ -48,6 +48,8 @@ def to_numeric(arg, errors="raise", downcast=None): Note that downcast behavior is decoupled from parsing. Errors encountered during downcast is raised regardless of ``errors`` parameter. + dtype_backend : None + Not implemented. Returns ------- @@ -93,7 +95,10 @@ def to_numeric(arg, errors="raise", downcast=None): For example ``[1, 'a']``. A ``TypeError`` will be raised when such input is received, regardless of ``errors`` parameter. """ - + if dtype_backend is not None: + raise NotImplementedError( + "dtype_backend is not currently implemented." 
+ ) if errors not in {"raise", "ignore", "coerce"}: raise ValueError("invalid error value specified") elif errors == "ignore": diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 3b3a9f96543..a567c27f584 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -401,3 +401,9 @@ def test_from_tuples(): result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a") expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a") assert_eq(result, expected) + + +def test_interval_range_name(): + expected = pd.interval_range(start=0, periods=5, freq=2, name="foo") + result = cudf.interval_range(start=0, periods=5, freq=2, name="foo") + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 154e1e19072..cc17dc46e0a 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -155,3 +155,9 @@ def test_get_dummies_array_like_with_nan(): actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_") assert_eq(expected, actual) + + +def test_get_dummies_cats_deprecated(): + df = cudf.DataFrame(range(3)) + with pytest.warns(FutureWarning): + cudf.get_dummies(df, cats={0: [0, 1, 2]})