diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2f6e864b51c..c0bd9ec6eee 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,6 +19,7 @@ drop_nulls, ) from cudf._lib.types import size_type_dtype +from cudf.api.extensions import no_default from cudf.api.types import ( is_bool_dtype, is_integer, @@ -701,21 +702,65 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default): """Create a DataFrame with a column containing this Index Parameters ---------- index : boolean, default True Set the index of the returned DataFrame as the original Index - name : str, default None - Name to be used for the column + name : object, defaults to index.name + The passed name should substitute for the index name (if it has + one). + Returns ------- DataFrame - cudf DataFrame - """ - if name is not None: + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves " + "the Index's name or uses a default name of 0. This " + "behaviour is deprecated, and in the future `None` " + "will be used as the name of the " + "resulting DataFrame column.", + FutureWarning, + ) + name = no_default + if name is not no_default: col_name = name elif self.name is None: col_name = 0 diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc6726879c1..21380bb841c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._compat import PANDAS_GE_150 @@ -1015,7 +1016,12 @@ def __getitem__(self, index): elif isinstance(index, slice): start, stop, step = index.indices(len(self)) index = column.arange(start, stop, step) - result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + result = MultiIndex.from_frame( + self.to_frame(index=False, name=range(0, self.nlevels)).take( + index + ), + names=self.names, + ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: @@ -1026,24 +1032,95 @@ def __getitem__(self, index): result._codes = self._codes.take(index) if self._levels is not None: result._levels = self._levels - result.names = self.names return result @_cudf_nvtx_annotate - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default, allow_duplicates=False): + """ + Create a DataFrame with the levels of the MultiIndex as columns. + + Column ordering is determined by the DataFrame constructor with data as + a dict. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original MultiIndex. + name : list / sequence of str, optional + The passed names should substitute index level names. + allow_duplicates : bool, optional default False + Allow duplicate column labels to be created. Note + that this parameter is non-functional because + duplicates column labels aren't supported in cudf. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf + >>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d + """ # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further # modifications of the resulting DataFrame will affect the MultiIndex. - df = cudf.DataFrame._from_data(data=self._data) - if index: - df = df.set_index(self) - if name is not None: + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves the " + "Index's name or uses a default name of 0. This behaviour " + "is deprecated, and in the future `None` will be used " + "as the name of the resulting DataFrame column.", + FutureWarning, + ) + name = no_default + + if name is not no_default: if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " "number of levels on index." ) - df.columns = name + column_names = name + else: + column_names = self.names + all_none_names = None + if not ( + all_none_names := all(x is None for x in column_names) + ) and len(column_names) != len(set(column_names)): + raise ValueError("Duplicate column names are not allowed") + df = cudf.DataFrame._from_data( + data=self._data, + columns=column_names + if name is not no_default and not all_none_names + else None, + ) + + if index: + df = df.set_index(self) + return df @_cudf_nvtx_annotate @@ -1504,7 +1581,9 @@ def droplevel(self, level=-1): @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): - result = self.to_frame(index=False).to_pandas(nullable=nullable) + result = self.to_frame( + index=False, name=list(range(self.nlevels)) + ).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod @@ -1623,7 +1702,7 @@ def _clean_nulls_from_index(self): Convert all na values(if any) in MultiIndex object to `` as a preprocessing step to `__repr__` methods. """ - index_df = self.to_frame(index=False) + index_df = self.to_frame(index=False, name=list(range(self.nlevels))) return MultiIndex.from_frame( index_df._clean_nulls_from_dataframe(index_df), names=self.names ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 6fb615c22e0..b3791cddce3 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 from cudf.core.index import ( @@ -2777,3 +2778,21 @@ def test_index_empty_from_pandas(request, dtype): gidx = cudf.from_pandas(pidx) assert_eq(pidx, gidx) + + +@pytest.mark.parametrize( + "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] +) +@pytest.mark.parametrize("data_name", [None, 1, "abc"]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) +def test_index_to_frame(data, data_name, index, name): + pidx = pd.Index(data, name=data_name) + gidx = cudf.from_pandas(pidx) + + with expect_warning_if(name is None): + expected = pidx.to_frame(index=index, name=name) + with expect_warning_if(name is None): + actual = gidx.to_frame(index=index, name=name) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 3c843ace0a8..fb2b0c07efb 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -16,6 +16,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_GE_200 from cudf.core.column import as_column from cudf.core.index import as_index @@ -1926,3 +1927,85 @@ def test_multiindex_to_series_error(): midx = cudf.MultiIndex.from_tuples([("a", "b")]) with pytest.raises(NotImplementedError): midx.to_series() + + +@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "a", "a"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]] +) +@pytest.mark.parametrize("allow_duplicates", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +def test_multiindex_to_frame_allow_duplicates( + pidx, name, allow_duplicates, index +): + gidx = cudf.from_pandas(pidx) + + if ( + ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + ) + and not allow_duplicates + and (name is None or name is no_default) + ): + assert_exceptions_equal( + pidx.to_frame, + gidx.to_frame, + lfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + ) + else: + if ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + and not isinstance(name, list) + ) or (isinstance(name, list) and len(name) != len(set(name))): + # cudf doesn't have the ability to construct dataframes + # with duplicate column names + with expect_warning_if(name is None): + with pytest.raises(ValueError): + gidx.to_frame( + index=index, + name=name, + allow_duplicates=allow_duplicates, + ) + else: + with expect_warning_if(name is None): + expected = pidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + with expect_warning_if(name is None): + actual = gidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + + assert_eq(expected, actual)