Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix naming issues with Index.to_frame and MultiIndex.to_frame APIs #14105

Merged
merged 9 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 51 additions & 6 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
drop_nulls,
)
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import (
is_bool_dtype,
is_integer,
Expand Down Expand Up @@ -701,21 +702,65 @@ def fillna(self, value, downcast=None):

return super().fillna(value=value)

def to_frame(self, index=True, name=None):
def to_frame(self, index=True, name=no_default):
"""Create a DataFrame with a column containing this Index

Parameters
----------
index : boolean, default True
Set the index of the returned DataFrame as the original Index
name : str, default None
Name to be used for the column
name : object, defaults to index.name
The passed name should substitute for the index name (if it has
one).

Returns
-------
DataFrame
cudf DataFrame
"""
if name is not None:
DataFrame containing the original Index data.

See Also
--------
Index.to_series : Convert an Index to a Series.
Series.to_frame : Convert Series to DataFrame.

Examples
--------
>>> import cudf
>>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal')
>>> idx.to_frame()
animal
animal
Ant Ant
Bear Bear
Cow Cow

By default, the original Index is reused. To enforce a new Index:

>>> idx.to_frame(index=False)
animal
0 Ant
1 Bear
2 Cow

To override the name of the resulting column, specify `name`:

>>> idx.to_frame(index=False, name='zoo')
zoo
0 Ant
1 Bear
2 Cow
"""
if name is None:
warnings.warn(
"Explicitly passing `name=None` currently preserves "
"the Index's name or uses a default name of 0. This "
"behaviour is deprecated, and in the future `None` "
"will be used as the name of the "
"resulting DataFrame column.",
FutureWarning,
)
name = no_default
if name is not no_default:
col_name = name
elif self.name is None:
col_name = 0
Expand Down
99 changes: 89 additions & 10 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import cudf
from cudf import _lib as libcudf
from cudf._typing import DataFrameOrSeries
from cudf.api.extensions import no_default
from cudf.api.types import is_integer, is_list_like, is_object_dtype
from cudf.core import column
from cudf.core._compat import PANDAS_GE_150
Expand Down Expand Up @@ -1015,7 +1016,12 @@ def __getitem__(self, index):
elif isinstance(index, slice):
start, stop, step = index.indices(len(self))
index = column.arange(start, stop, step)
result = MultiIndex.from_frame(self.to_frame(index=False).take(index))
result = MultiIndex.from_frame(
self.to_frame(index=False, name=range(0, self.nlevels)).take(
index
),
names=self.names,
)

# we are indexing into a single row of the MultiIndex,
# return that row as a tuple:
Expand All @@ -1026,24 +1032,95 @@ def __getitem__(self, index):
result._codes = self._codes.take(index)
if self._levels is not None:
result._levels = self._levels
result.names = self.names
return result

@_cudf_nvtx_annotate
def to_frame(self, index=True, name=None):
def to_frame(self, index=True, name=no_default, allow_duplicates=False):
"""
Create a DataFrame with the levels of the MultiIndex as columns.

Column ordering is determined by the DataFrame constructor with data as
a dict.

Parameters
----------
index : bool, default True
Set the index of the returned DataFrame as the original MultiIndex.
name : list / sequence of str, optional
The passed names should substitute index level names.
allow_duplicates : bool, optional, default False
Allow duplicate column labels to be created. Note
that this parameter is non-functional because
duplicate column labels aren't supported in cudf.

Returns
-------
DataFrame

Examples
--------
>>> import cudf
>>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
>>> mi
MultiIndex([('a', 'c'),
('b', 'd')],
)

>>> df = mi.to_frame()
>>> df
0 1
a c a c
b d b d

>>> df = mi.to_frame(index=False)
>>> df
0 1
0 a c
1 b d

>>> df = mi.to_frame(name=['x', 'y'])
>>> df
x y
a c a c
b d b d
"""
# TODO: Currently this function makes a shallow copy, which is
# incorrect. We want to make a deep copy, otherwise further
# modifications of the resulting DataFrame will affect the MultiIndex.
df = cudf.DataFrame._from_data(data=self._data)
if index:
df = df.set_index(self)
if name is not None:
if name is None:
warnings.warn(
"Explicitly passing `name=None` currently preserves the "
"Index's name or uses a default name of 0. This behaviour "
"is deprecated, and in the future `None` will be used "
"as the name of the resulting DataFrame column.",
FutureWarning,
)
name = no_default

if name is not no_default:
if len(name) != len(self.levels):
raise ValueError(
"'name' should have the same length as "
"number of levels on index."
)
df.columns = name
column_names = name
else:
column_names = self.names
all_none_names = None
if not (
all_none_names := all(x is None for x in column_names)
) and len(column_names) != len(set(column_names)):
raise ValueError("Duplicate column names are not allowed")
df = cudf.DataFrame._from_data(
data=self._data,
columns=column_names
if name is not no_default and not all_none_names
else None,
)

if index:
df = df.set_index(self)

return df

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -1504,7 +1581,9 @@ def droplevel(self, level=-1):

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False, **kwargs):
result = self.to_frame(index=False).to_pandas(nullable=nullable)
result = self.to_frame(
index=False, name=list(range(self.nlevels))
).to_pandas(nullable=nullable)
return pd.MultiIndex.from_frame(result, names=self.names)

@classmethod
Expand Down Expand Up @@ -1623,7 +1702,7 @@ def _clean_nulls_from_index(self):
Convert all na values(if any) in MultiIndex object
to `<NA>` as a preprocessing step to `__repr__` methods.
"""
index_df = self.to_frame(index=False)
index_df = self.to_frame(index=False, name=list(range(self.nlevels)))
return MultiIndex.from_frame(
index_df._clean_nulls_from_dataframe(index_df), names=self.names
)
Expand Down
19 changes: 19 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pytest

import cudf
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype
from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200
from cudf.core.index import (
Expand Down Expand Up @@ -2777,3 +2778,21 @@ def test_index_empty_from_pandas(request, dtype):
gidx = cudf.from_pandas(pidx)

assert_eq(pidx, gidx)


@pytest.mark.parametrize(
    "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
)
@pytest.mark.parametrize("data_name", [None, 1, "abc"])
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize("name", [None, no_default, 1, "abc"])
def test_index_to_frame(data, data_name, index, name):
    """Compare ``Index.to_frame`` between pandas and cudf.

    A pandas ``Index`` is built from ``data`` with ``data_name`` as its
    name, converted to cudf, and ``to_frame`` is called on both with the
    same ``index`` and ``name`` arguments. The two resulting frames must
    compare equal.

    Both calls are wrapped in ``expect_warning_if(name is None)``, i.e. a
    warning is expected from each library exactly when ``name=None`` is
    passed explicitly.
    """
    # An explicit ``name=None`` is the only case expected to warn.
    warns_on_explicit_none = name is None
    to_frame_kwargs = {"index": index, "name": name}

    pandas_index = pd.Index(data, name=data_name)
    cudf_index = cudf.from_pandas(pandas_index)

    with expect_warning_if(warns_on_explicit_none):
        expected = pandas_index.to_frame(**to_frame_kwargs)
    with expect_warning_if(warns_on_explicit_none):
        actual = cudf_index.to_frame(**to_frame_kwargs)

    assert_eq(expected, actual)
83 changes: 83 additions & 0 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import pytest

import cudf
from cudf.api.extensions import no_default
from cudf.core._compat import PANDAS_GE_200
from cudf.core.column import as_column
from cudf.core.index import as_index
Expand Down Expand Up @@ -1926,3 +1927,85 @@ def test_multiindex_to_series_error():
midx = cudf.MultiIndex.from_tuples([("a", "b")])
with pytest.raises(NotImplementedError):
midx.to_series()


@pytest.mark.parametrize(
    "pidx",
    [
        pd.MultiIndex.from_arrays(
            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
            names=["a", "b", "c"],
        ),
        pd.MultiIndex.from_arrays(
            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
            names=["a", "a", "a"],
        ),
        pd.MultiIndex.from_arrays(
            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
        ),
    ],
)
@pytest.mark.parametrize(
    "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]]
)
@pytest.mark.parametrize("allow_duplicates", [True, False])
@pytest.mark.parametrize("index", [True, False])
def test_multiindex_to_frame_allow_duplicates(
    pidx, name, allow_duplicates, index
):
    """Compare ``MultiIndex.to_frame`` between pandas and cudf.

    Three outcomes are checked, depending on the level names of ``pidx``
    and on the ``name`` / ``allow_duplicates`` arguments:

    1. The level names contain duplicates (and are not all ``None``),
       duplicates are not allowed, and no replacement names were passed:
       pandas and cudf must raise matching exceptions.
    2. The resulting column labels would contain duplicates (either from
       the level names when no list of names is passed, or from the passed
       list itself): cudf must raise ``ValueError``.
    3. Otherwise both libraries must produce equal frames.

    In outcomes 2 and 3 a warning is expected exactly when ``name=None``
    is passed explicitly.
    """
    gidx = cudf.from_pandas(pidx)

    def to_frame_kwargs():
        # Build a fresh dict per call so no two callees share one object.
        return {
            "index": index,
            "name": name,
            "allow_duplicates": allow_duplicates,
        }

    level_names = pidx.names
    has_duplicate_level_names = len(level_names) != len(
        set(level_names)
    ) and not all(x is None for x in level_names)
    name_is_list = isinstance(name, list)
    has_duplicate_passed_names = name_is_list and len(name) != len(set(name))
    name_not_supplied = name is None or name is no_default
    warns_on_explicit_none = name is None

    if (
        has_duplicate_level_names
        and not allow_duplicates
        and name_not_supplied
    ):
        assert_exceptions_equal(
            pidx.to_frame,
            gidx.to_frame,
            lfunc_args_and_kwargs=([], to_frame_kwargs()),
            rfunc_args_and_kwargs=([], to_frame_kwargs()),
        )
        return

    if (
        has_duplicate_level_names and not name_is_list
    ) or has_duplicate_passed_names:
        # cudf doesn't have the ability to construct dataframes
        # with duplicate column names
        with expect_warning_if(warns_on_explicit_none):
            with pytest.raises(ValueError):
                gidx.to_frame(**to_frame_kwargs())
        return

    with expect_warning_if(warns_on_explicit_none):
        expected = pidx.to_frame(**to_frame_kwargs())
    with expect_warning_if(warns_on_explicit_none):
        actual = gidx.to_frame(**to_frame_kwargs())

    assert_eq(expected, actual)