Skip to content

Commit

Permalink
Delay materializing RangeIndex in .reset_index (#15588)
Browse files Browse the repository at this point in the history
Before, a `RangeIndex` would be materialized to a `Column` even if it wasn't used (`drop=True`). Now, it's only materialized if the index will be added as a new column in the DataFrame.

Also caught a validation bug where an `invalid` number of levels would not raise an error

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15588
  • Loading branch information
mroeschke authored May 8, 2024
1 parent b09e794 commit c0c38eb
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 54 deletions.
23 changes: 15 additions & 8 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pickle
import warnings
from collections.abc import Generator
from functools import cached_property
from typing import Any, Literal, Set, Tuple

Expand Down Expand Up @@ -2190,14 +2191,20 @@ def repeat(self, repeats, axis=None):
"""
raise NotImplementedError

def _split_columns_by_levels(self, levels):
if isinstance(levels, int) and levels > 0:
raise ValueError(f"Out of bound level: {levels}")
return (
[self._data[self.name]],
[],
["index" if self.name is None else self.name],
[],
def _new_index_for_reset_index(
self, levels: tuple | None, name
) -> None | BaseIndex:
"""Return the new index after .reset_index"""
# None is caught later to return RangeIndex
return None

def _columns_for_reset_index(
self, levels: tuple | None
) -> Generator[tuple[Any, ColumnBase], None, None]:
"""Return the columns and column names for .reset_index"""
yield (
"index" if self.name is None else self.name,
next(iter(self._columns)),
)

def _split(self, splits):
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import operator
import pickle
import warnings
from collections.abc import Generator
from functools import cache, cached_property
from numbers import Number
from typing import (
Expand Down Expand Up @@ -970,6 +971,13 @@ def __abs__(self) -> Self | Index:
else:
return abs(self._as_int_index())

def _columns_for_reset_index(
self, levels: tuple | None
) -> Generator[tuple[Any, ColumnBase], None, None]:
"""Return the columns and column names for .reset_index"""
# We need to explicitly materialize the RangeIndex to a column
yield "index" if self.name is None else self.name, as_column(self)

@_warn_no_dask_cudf
def __dask_tokenize__(self):
return (type(self), self.start, self.stop, self.step)
Expand Down
35 changes: 14 additions & 21 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4340,34 +4340,27 @@ def take(self, indices, axis=0):

def _reset_index(self, level, drop, col_level=0, col_fill=""):
"""Shared path for DataFrame.reset_index and Series.reset_index."""
if level is not None and not isinstance(level, (tuple, list)):
level = (level,)
if level is not None:
if (
isinstance(level, int)
and level > 0
and not isinstance(self.index, MultiIndex)
):
raise IndexError(
f"Too many levels: Index has only 1 level, not {level + 1}"
)
if not isinstance(level, (tuple, list)):
level = (level,)
_check_duplicate_level_names(level, self._index.names)

# Split the columns in the index into data and index columns
(
data_columns,
index_columns,
data_names,
index_names,
) = self._index._split_columns_by_levels(level)
if index_columns:
index = _index_from_data(
dict(enumerate(index_columns)),
name=self._index.name,
)
if isinstance(index, MultiIndex):
index.names = index_names
else:
index.name = index_names[0]
else:
index = self.index._new_index_for_reset_index(level, self.index.name)
if index is None:
index = RangeIndex(len(self))

if drop:
return self._data, index

new_column_data = {}
for name, col in zip(data_names, data_columns):
for name, col in self.index._columns_for_reset_index(level):
if name == "index" and "index" in self._data:
name = "level_0"
name = (
Expand Down
74 changes: 49 additions & 25 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pickle
import warnings
from collections import abc
from collections.abc import Generator
from functools import cached_property
from numbers import Integral
from typing import Any, List, MutableMapping, Tuple, Union
Expand Down Expand Up @@ -2052,41 +2053,64 @@ def _copy_type_metadata(
return res

@_cudf_nvtx_annotate
def _split_columns_by_levels(self, levels):
def _split_columns_by_levels(
self, levels: tuple, *, in_levels: bool
) -> Generator[tuple[Any, column.ColumnBase], None, None]:
# This function assumes that for levels with duplicate names, they are
# specified by indices, not name by ``levels``. E.g. [None, None] can
# only be specified by 0, 1, not "None".

if levels is None:
return (
list(self._data.columns),
[],
[
f"level_{i}" if name is None else name
for i, name in enumerate(self.names)
],
[],
)

# Normalize named levels into indices
level_names = list(self.names)
level_indices = {
lv if isinstance(lv, int) else level_names.index(lv)
for lv in levels
}

# Split the columns
data_columns, index_columns = [], []
data_names, index_names = [], []
for i, (name, col) in enumerate(zip(self.names, self._data.columns)):
if i in level_indices:
if in_levels and i in level_indices:
name = f"level_{i}" if name is None else name
data_columns.append(col)
data_names.append(name)
else:
index_columns.append(col)
index_names.append(name)
return data_columns, index_columns, data_names, index_names
yield name, col
elif not in_levels and i not in level_indices:
yield name, col

@_cudf_nvtx_annotate
def _new_index_for_reset_index(
self, levels: tuple | None, name
) -> None | BaseIndex:
"""Return the new index after .reset_index"""
if levels is None:
return None

index_columns, index_names = [], []
for name, col in self._split_columns_by_levels(
levels, in_levels=False
):
index_columns.append(col)
index_names.append(name)

if not index_columns:
# None is caught later to return RangeIndex
return None

index = cudf.core.index._index_from_data(
dict(enumerate(index_columns)),
name=name,
)
if isinstance(index, type(self)):
index.names = index_names
else:
index.name = index_names[0]
return index

def _columns_for_reset_index(
self, levels: tuple | None
) -> Generator[tuple[Any, column.ColumnBase], None, None]:
"""Return the columns and column names for .reset_index"""
if levels is None:
for i, (col, name) in enumerate(
zip(self._data.columns, self.names)
):
yield f"level_{i}" if name is None else name, col
else:
yield from self._split_columns_by_levels(levels, in_levels=True)

def repeat(self, repeats, axis=None):
return self._from_data(
Expand Down
8 changes: 8 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3208,6 +3208,14 @@ def test_reset_index_unnamed(
assert_eq(expect, got)


def test_reset_index_invalid_level():
with pytest.raises(IndexError):
cudf.DataFrame([1]).reset_index(level=2)

with pytest.raises(IndexError):
pd.DataFrame([1]).reset_index(level=2)


@pytest.mark.parametrize(
"data",
[
Expand Down

0 comments on commit c0c38eb

Please sign in to comment.