Align public utility function signatures with pandas 2.x (#16565)

The following function signatures have a breaking change: `concat`, `get_dummies`, and `date_range`. Additionally, this deprecates the `cats` argument in `get_dummies` (it doesn't exist in pandas and is not tested), and fixes a bug in `interval_range` where `name` was not being respected. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #16565
rapidsai · Aug 15, 2024 · 89863a3 · 89863a3
1 parent 19846b6
commit 89863a3
Show file tree

Hide file tree

Showing 7 changed files with 84 additions and 29 deletions.
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
@@ -97,6 +97,7 @@
     "DatetimeIndex",
     "Decimal32Dtype",
     "Decimal64Dtype",
+    "Decimal128Dtype",
     "Grouper",
     "Index",
     "IntervalDtype",
@@ -126,6 +127,7 @@
     "isclose",
     "melt",
     "merge",
+    "option_context",
     "pivot",
     "pivot_table",
     "read_avro",

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -3350,14 +3350,14 @@ def interval_range(
     if len(right_col) == 0 or len(left_col) == 0:
         dtype = IntervalDtype("int64", closed)
         data = column.column_empty_like_same_mask(left_col, dtype)
-        return IntervalIndex(data, closed=closed)
+        return IntervalIndex(data, closed=closed, name=name)
 
     interval_col = IntervalColumn(
         dtype=IntervalDtype(left_col.dtype, closed),
         size=len(left_col),
         children=(left_col, right_col),
     )
-    return IntervalIndex(interval_col, closed=closed)
+    return IntervalIndex(interval_col, closed=closed, name=name)
 
 
 class IntervalIndex(Index):

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
@@ -118,7 +118,17 @@ def _normalize_series_and_dataframe(objs, axis):
             objs[idx] = obj.to_frame(name=name)
 
 
-def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
+def concat(
+    objs,
+    axis=0,
+    join="outer",
+    ignore_index=False,
+    keys=None,
+    levels=None,
+    names=None,
+    verify_integrity=False,
+    sort=None,
+):
     """Concatenate DataFrames, Series, or Indices row-wise.
 
     Parameters
@@ -132,6 +142,21 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
     ignore_index : bool, default False
         Set True to ignore the index of the *objs* and provide a
         default range index instead.
+    keys : sequence, default None
+        If multiple levels passed, should contain tuples. Construct
+        hierarchical index using the passed keys as the outermost level.
+        Currently not supported.
+    levels : list of sequences, default None
+        Specific levels (unique values) to use for constructing a
+        MultiIndex. Otherwise they will be inferred from the keys.
+        Currently not supported.
+    names : list, default None
+        Names for the levels in the resulting hierarchical index.
+        Currently not supported.
+    verify_integrity : bool, default False
+        Check whether the new concatenated axis contains duplicates. This can
+        be very expensive relative to the actual data concatenation.
+        Currently not supported.
     sort : bool, default False
         Sort non-concatenation axis if it is not already aligned.
 
@@ -243,6 +268,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
     0      a       1       c       3
     1      b       2       d       4
     """
+    if keys is not None:
+        raise NotImplementedError("keys is currently not supported")
+    if levels is not None:
+        raise NotImplementedError("levels is currently not supported")
+    if names is not None:
+        raise NotImplementedError("names is currently not supported")
     # TODO: Do we really need to have different error messages for an empty
     # list and a list of None?
     if not objs:
@@ -260,15 +291,15 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 f"Can only concatenate dictionary input along axis=1, not {axis}"
             )
         objs = {k: obj for k, obj in objs.items() if obj is not None}
-        keys = list(objs)
+        keys_objs = list(objs)
         objs = list(objs.values())
         if any(isinstance(o, cudf.BaseIndex) for o in objs):
             raise TypeError(
                 "cannot concatenate a dictionary containing indices"
             )
     else:
         objs = [obj for obj in objs if obj is not None]
-        keys = None
+        keys_objs = None
 
     if not objs:
         raise ValueError("All objects passed were None")
@@ -317,8 +348,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 result = obj.to_frame()
             else:
                 result = obj.copy(deep=True)
-            if keys is not None and isinstance(result, cudf.DataFrame):
-                k = keys[0]
+            if keys_objs is not None and isinstance(result, cudf.DataFrame):
+                k = keys_objs[0]
                 result.columns = cudf.MultiIndex.from_tuples(
                     [
                         (k, *c) if isinstance(c, tuple) else (k, c)
@@ -370,7 +401,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
             objs = _align_objs(objs, how=join, sort=sort)
             df.index = objs[0].index
 
-        if keys is None:
+        if keys_objs is None:
             for o in objs:
                 for name, col in o._data.items():
                     if name in df._data:
@@ -408,9 +439,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                     "label types in cuDF at this time. You must convert "
                     "the labels to the same type."
                 )
-            for k, o in zip(keys, objs):
+            for k, o in zip(keys_objs, objs):
                 for name, col in o._data.items():
-                    # if only series, then only keep keys as column labels
+                    # if only series, then only keep keys_objs as column labels
                     # if the existing column is multiindex, prepend it
                     # to handle cases where dfs and srs are concatenated
                     if only_series:
@@ -426,7 +457,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                     else:
                         df[col_label] = col
 
-        if keys is None:
+        if keys_objs is None:
             df.columns = result_columns.unique()
             if ignore_index:
                 df.columns = cudf.RangeIndex(len(result_columns.unique()))
@@ -666,7 +697,7 @@ def _tile(A, reps):
 
 
 def get_dummies(
-    df,
+    data,
     prefix=None,
     prefix_sep="_",
     dummy_na=False,
@@ -681,7 +712,7 @@ def get_dummies(
 
     Parameters
     ----------
-    df : array-like, Series, or DataFrame
+    data : array-like, Series, or DataFrame
         Data of which to get dummy indicators.
     prefix : str, dict, or sequence, optional
         Prefix to append. Either a str (to apply a constant prefix), dict
@@ -759,17 +790,22 @@ def get_dummies(
 
     if cats is None:
         cats = {}
+    else:
+        warnings.warn(
+            "cats is deprecated and will be removed in a future version.",
+            FutureWarning,
+        )
     if sparse:
         raise NotImplementedError("sparse is not supported yet")
 
     if drop_first:
         raise NotImplementedError("drop_first is not supported yet")
 
-    if isinstance(df, cudf.DataFrame):
+    if isinstance(data, cudf.DataFrame):
         encode_fallback_dtypes = ["object", "category"]
 
         if columns is None or len(columns) == 0:
-            columns = df.select_dtypes(
+            columns = data.select_dtypes(
                 include=encode_fallback_dtypes
             )._column_names
 
@@ -796,33 +832,33 @@ def get_dummies(
         # If we have no columns to encode, we need to drop
         # fallback columns(if any)
         if len(columns) == 0:
-            return df.select_dtypes(exclude=encode_fallback_dtypes)
+            return data.select_dtypes(exclude=encode_fallback_dtypes)
         else:
             result_data = {
                 col_name: col
-                for col_name, col in df._data.items()
+                for col_name, col in data._data.items()
                 if col_name not in columns
             }
 
             for name in columns:
                 if name not in cats:
                     unique = _get_unique(
-                        column=df._data[name], dummy_na=dummy_na
+                        column=data._data[name], dummy_na=dummy_na
                     )
                 else:
                     unique = as_column(cats[name])
 
                 col_enc_data = _one_hot_encode_column(
-                    column=df._data[name],
+                    column=data._data[name],
                     categories=unique,
                     prefix=prefix_map.get(name, prefix),
                     prefix_sep=prefix_sep_map.get(name, prefix_sep),
                     dtype=dtype,
                 )
                 result_data.update(col_enc_data)
-            return cudf.DataFrame._from_data(result_data, index=df.index)
+            return cudf.DataFrame._from_data(result_data, index=data.index)
     else:
-        ser = cudf.Series(df)
+        ser = cudf.Series(data)
         unique = _get_unique(column=ser._column, dummy_na=dummy_na)
         data = _one_hot_encode_column(
             column=ser._column,

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
@@ -785,7 +785,7 @@ def date_range(
     tz=None,
     normalize: bool = False,
     name=None,
-    closed: Literal["left", "right", "both", "neither"] = "both",
+    inclusive: Literal["left", "right", "both", "neither"] = "both",
     *,
     unit: str | None = None,
 ):
@@ -823,7 +823,7 @@ def date_range(
     name : str, default None
         Name of the resulting DatetimeIndex
 
-    closed : {"left", "right", "both", "neither"}, default "both"
+    inclusive : {"left", "right", "both", "neither"}, default "both"
         Whether to set each bound as closed or open.
         Currently only "both" is supported
 
@@ -839,7 +839,7 @@ def date_range(
     -----
     Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three
     must be specified. If `freq` is omitted, the resulting DatetimeIndex will
-    have periods linearly spaced elements between start and end (closed on both
+    have periods linearly spaced elements between start and end (inclusive on both
     sides).
 
     cudf supports `freq` specified with either fixed-frequency offset
@@ -866,8 +866,8 @@ def date_range(
                 '2026-04-23 08:00:00'],
                 dtype='datetime64[ns]')
     """
-    if closed != "both":
-        raise NotImplementedError(f"{closed=} is currently unsupported.")
+    if inclusive != "both":
+        raise NotImplementedError(f"{inclusive=} is currently unsupported.")
     if unit is not None:
         raise NotImplementedError(f"{unit=} is currently unsupported.")
     if normalize is not False:
@@ -961,7 +961,7 @@ def date_range(
             periods = 0
         else:
             # If end == start, periods == 0 and we return exactly 1 timestamp (start).
-            # Otherwise, since closed="both", we ensure the end point is included.
+            # Otherwise, since inclusive="both", we ensure the end point is included.
             periods += 1
 
     # We compute `end_estim` (the estimated upper bound of the date

diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
@@ -20,7 +20,7 @@
     from cudf.core.column import ColumnBase
 
 
-def to_numeric(arg, errors="raise", downcast=None):
+def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
     """
     Convert argument into numerical types.
 
@@ -48,6 +48,8 @@ def to_numeric(arg, errors="raise", downcast=None):
         Note that downcast behavior is decoupled from parsing. Errors
         encountered during downcast is raised regardless of ``errors``
         parameter.
+    dtype_backend : None
+        Not implemented.
 
     Returns
     -------
@@ -93,7 +95,10 @@ def to_numeric(arg, errors="raise", downcast=None):
         For example ``[1, 'a']``. A ``TypeError`` will be raised when such
         input is received, regardless of ``errors`` parameter.
     """
-
+    if dtype_backend is not None:
+        raise NotImplementedError(
+            "dtype_backend is not currently implemented."
+        )
     if errors not in {"raise", "ignore", "coerce"}:
         raise ValueError("invalid error value specified")
     elif errors == "ignore":

diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -401,3 +401,9 @@ def test_from_tuples():
     result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a")
     expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a")
     assert_eq(result, expected)
+
+
+def test_interval_range_name():
+    expected = pd.interval_range(start=0, periods=5, freq=2, name="foo")
+    result = cudf.interval_range(start=0, periods=5, freq=2, name="foo")
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
@@ -155,3 +155,9 @@ def test_get_dummies_array_like_with_nan():
     actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")
 
     assert_eq(expected, actual)
+
+
+def test_get_dummies_cats_deprecated():
+    df = cudf.DataFrame(range(3))
+    with pytest.warns(FutureWarning):
+        cudf.get_dummies(df, cats={0: [0, 1, 2]})