Skip to content

Commit

Permalink
Remove internal usage of core.index.as_index in favor of cudf.Index (#…
Browse files Browse the repository at this point in the history
…15851)

`cudf.Index.__init__` essentially calls `as_index` immediately internally. To keep the two from potentially diverging, internal code should prefer the public `cudf.Index` constructor, which ensures that the public behavior is also what is used internally.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15851
  • Loading branch information
mroeschke authored Jun 5, 2024
1 parent fe74129 commit 22ef063
Show file tree
Hide file tree
Showing 20 changed files with 116 additions and 140 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask
from cudf.core.index import RangeIndex, as_index
from cudf.core.index import Index, RangeIndex
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.options import get_option
Expand Down Expand Up @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

return labels, cats.values if return_cupy_array else as_index(cats)
return labels, cats.values if return_cupy_array else Index(cats)


def _linear_interpolation(column, index=None):
Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/column/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ def _return_or_inplace(
else:
return cudf.Series(new_col, name=self._parent.name)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.core.index.as_index(
new_col, name=self._parent.name
)
return cudf.Index(new_col, name=self._parent.name)
else:
return self._parent._mimic_inplace(new_col, inplace=False)
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4391,7 +4391,7 @@ def code_points(self) -> SeriesOrIndex:
if isinstance(self._parent, cudf.Series):
return cudf.Series(new_col, name=self._parent.name)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.core.index.as_index(new_col, name=self._parent.name)
return cudf.Index(new_col, name=self._parent.name)
else:
return new_col

Expand Down Expand Up @@ -4706,7 +4706,7 @@ def character_tokenize(self) -> SeriesOrIndex:
index = self._parent.index.repeat(lengths)
return cudf.Series(result_col, name=self._parent.name, index=index)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.core.index.as_index(result_col, name=self._parent.name)
return cudf.Index(result_col, name=self._parent.name)
else:
return result_col

Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from collections import abc

Expand Down Expand Up @@ -292,7 +292,7 @@ def cut(
)

# we return a categorical index, as we don't have a Categorical method
categorical_index = cudf.core.index.as_index(col)
categorical_index = cudf.Index(col)

if isinstance(orig_x, (pd.Series, cudf.Series)):
# if we have a series input we return a series output
Expand Down
36 changes: 17 additions & 19 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,7 @@ def __init__(
data = data.reindex(index)
index = data.index
else:
index = as_index(index)
index = cudf.Index(index)
else:
index = data.index

Expand Down Expand Up @@ -761,7 +761,7 @@ def __init__(
if index is None:
self._index = RangeIndex(0)
else:
self._index = as_index(index)
self._index = cudf.Index(index)
if columns is not None:
rangeindex = isinstance(
columns, (range, pd.RangeIndex, cudf.RangeIndex)
Expand Down Expand Up @@ -875,7 +875,7 @@ def _init_from_series_list(self, data, columns, index):
# When `index` is `None`, the final index of
# resulting dataframe will be union of
# all Series's names.
final_index = as_index(_get_union_of_series_names(data))
final_index = cudf.Index(_get_union_of_series_names(data))
else:
# When an `index` is passed, the final index of
# resulting dataframe will be whatever
Expand Down Expand Up @@ -919,7 +919,7 @@ def _init_from_series_list(self, data, columns, index):
f"not match length of index ({index_length})"
)

final_index = as_index(index)
final_index = cudf.Index(index)

series_lengths = list(map(len, data))
data = numeric_normalize_types(*data)
Expand All @@ -943,7 +943,7 @@ def _init_from_series_list(self, data, columns, index):
# Setting `final_columns` to self._index so
# that the resulting `transpose` will have
# columns set to `final_columns`
self._index = as_index(final_columns)
self._index = cudf.Index(final_columns)

transpose = self.T
else:
Expand Down Expand Up @@ -987,9 +987,9 @@ def _init_from_list_like(self, data, index=None, columns=None):
if index is None:
index = RangeIndex(start=0, stop=len(data))
else:
index = as_index(index)
index = cudf.Index(index)

self._index = as_index(index)
self._index = cudf.Index(index)
# list-of-dicts case
if len(data) > 0 and isinstance(data[0], dict):
data = DataFrame.from_pandas(pd.DataFrame(data))
Expand Down Expand Up @@ -1095,7 +1095,7 @@ def _init_from_dict_like(

self._index = RangeIndex(0, num_rows)
else:
self._index = as_index(index)
self._index = cudf.Index(index)

if len(data):
self._data.multiindex = True
Expand Down Expand Up @@ -1410,7 +1410,7 @@ def __setitem__(self, arg, value):
new_columns, verify=False
)
if isinstance(value, (pd.Series, Series)):
self._index = as_index(value.index)
self._index = cudf.Index(value.index)
elif len(value) > 0:
self._index = RangeIndex(length)
return
Expand Down Expand Up @@ -1728,7 +1728,7 @@ def _concat(
for cols in columns:
table_index = None
if 1 == first_data_column_position:
table_index = cudf.core.index.as_index(cols[0])
table_index = cudf.Index(cols[0])
elif first_data_column_position > 1:
table_index = DataFrame._from_data(
data=dict(
Expand Down Expand Up @@ -1780,9 +1780,7 @@ def _concat(
if not isinstance(out.index, MultiIndex) and isinstance(
out.index.dtype, cudf.CategoricalDtype
):
out = out.set_index(
cudf.core.index.as_index(out.index._values)
)
out = out.set_index(cudf.Index(out.index._values))
for name, col in out._data.items():
out._data[name] = col._with_type_metadata(
tables[0]._data[name].dtype
Expand Down Expand Up @@ -2828,7 +2826,7 @@ def reindex(
if columns is None:
df = self
else:
columns = as_index(columns)
columns = cudf.Index(columns)
intersection = self._data.to_pandas_index().intersection(
columns.to_pandas()
)
Expand Down Expand Up @@ -3245,7 +3243,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
if len(self) == 0:
if isinstance(value, (pd.Series, Series)):
if not ignore_index:
self.index = as_index(value.index)
self.index = cudf.Index(value.index)
elif (length := len(value)) > 0:
if num_cols != 0:
ca = self._data._from_columns_like_self(
Expand Down Expand Up @@ -5654,7 +5652,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
}

if not is_scalar(index):
new_index = as_index(index)
new_index = cudf.Index(index)
else:
new_index = None

Expand Down Expand Up @@ -5738,7 +5736,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
}

if index is not None:
index = as_index(index)
index = cudf.Index(index)

if isinstance(columns, (pd.Index, cudf.Index)):
level_names = tuple(columns.names)
Expand Down Expand Up @@ -6171,7 +6169,7 @@ def count(self, axis=0, numeric_only=False):
for col in self._data.names
]
},
as_index(self._data.names),
cudf.Index(self._data.names),
)

_SUPPORT_AXIS_LOOKUP = {
Expand Down Expand Up @@ -6298,7 +6296,7 @@ def _reduce(
source._data.names, names=source._data.level_names
)
else:
idx = as_index(source._data.names)
idx = cudf.Index(source._data.names)
return Series._from_data({None: as_column(result)}, idx)
elif axis == 1:
return source._apply_cupy_method_axis_1(op, **kwargs)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ def categories(self) -> "cudf.core.index.Index":
Index(['b', 'a'], dtype='object')
"""
if self._categories is None:
return cudf.core.index.as_index(
return cudf.Index(
cudf.core.column.column_empty(0, dtype="object", masked=False)
)
return cudf.core.index.as_index(self._categories, copy=False)
return cudf.Index(self._categories, copy=False)

@property
def type(self):
Expand Down
6 changes: 2 additions & 4 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2800,15 +2800,13 @@ def keys(self):
nkeys = len(self._key_columns)

if nkeys == 0:
return cudf.core.index.as_index([], name=None)
return cudf.Index([], name=None)
elif nkeys > 1:
return cudf.MultiIndex._from_data(
dict(zip(range(nkeys), self._key_columns))
)._set_names(self.names)
else:
return cudf.core.index.as_index(
self._key_columns[0], name=self.names[0]
)
return cudf.Index(self._key_columns[0], name=self.names[0])

@property
def values(self) -> cudf.core.frame.Frame:
Expand Down
30 changes: 11 additions & 19 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,7 +1169,7 @@ def _concat(cls, objs):
result = _concat_range_index(non_empties)
else:
data = concat_columns([o._values for o in non_empties])
result = as_index(data)
result = Index(data)

names = {obj.name for obj in objs}
if len(names) == 1:
Expand Down Expand Up @@ -1437,7 +1437,7 @@ def __repr__(self):
def __getitem__(self, index):
res = self._get_elements_from_column(index)
if isinstance(res, ColumnBase):
res = as_index(res, name=self.name)
res = Index(res, name=self.name)
return res

@property # type: ignore
Expand Down Expand Up @@ -1958,7 +1958,7 @@ def microsecond(self):
>>> datetime_index.microsecond
Index([0, 1, 2], dtype='int32')
""" # noqa: E501
return as_index(
return Index(
(
# Need to manually promote column to int32 because
# pandas-matching binop behaviour requires that this
Expand Down Expand Up @@ -2209,7 +2209,7 @@ def _get_dt_field(self, field):
mask=out_column.base_mask,
offset=out_column.offset,
)
return as_index(out_column, name=self.name)
return Index(out_column, name=self.name)

def _is_boolean(self):
return False
Expand Down Expand Up @@ -2522,29 +2522,23 @@ def days(self):
Number of days for each element.
"""
# Need to specifically return `int64` to avoid overflow.
return as_index(
arbitrary=self._values.days, name=self.name, dtype="int64"
)
return Index(self._values.days, name=self.name, dtype="int64")

@property # type: ignore
@_cudf_nvtx_annotate
def seconds(self):
"""
Number of seconds (>= 0 and less than 1 day) for each element.
"""
return as_index(
arbitrary=self._values.seconds, name=self.name, dtype="int32"
)
return Index(self._values.seconds, name=self.name, dtype="int32")

@property # type: ignore
@_cudf_nvtx_annotate
def microseconds(self):
"""
Number of microseconds (>= 0 and less than 1 second) for each element.
"""
return as_index(
arbitrary=self._values.microseconds, name=self.name, dtype="int32"
)
return Index(self._values.microseconds, name=self.name, dtype="int32")

@property # type: ignore
@_cudf_nvtx_annotate
Expand All @@ -2553,9 +2547,7 @@ def nanoseconds(self):
Number of nanoseconds (>= 0 and less than 1 microsecond) for each
element.
"""
return as_index(
arbitrary=self._values.nanoseconds, name=self.name, dtype="int32"
)
return Index(self._values.nanoseconds, name=self.name, dtype="int32")

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -2693,7 +2685,7 @@ def codes(self):
"""
The category codes of this categorical.
"""
return as_index(self._values.codes)
return Index(self._values.codes)

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -3137,15 +3129,15 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:
elif step is None:
# First non-empty index had only one element
if obj.start == start:
result = as_index(concat_columns([x._values for x in indexes]))
result = Index(concat_columns([x._values for x in indexes]))
return result
step = obj.start - start

non_consecutive = (step != obj.step and len(obj) > 1) or (
next_ is not None and obj.start != next_
)
if non_consecutive:
result = as_index(concat_columns([x._values for x in indexes]))
result = Index(concat_columns([x._values for x in indexes]))
return result
if step is not None:
next_ = obj[-1] + step
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3638,7 +3638,7 @@ def _align_to_index(
sort: bool = True,
allow_non_unique: bool = False,
) -> Self:
index = cudf.core.index.as_index(index)
index = cudf.Index(index)

if self.index.equals(index):
return self
Expand Down Expand Up @@ -3713,7 +3713,7 @@ def _reindex(
raise ValueError(
"cannot reindex on an axis with duplicate labels"
)
index = cudf.core.index.as_index(
index = cudf.Index(
index, name=getattr(index, "name", self.index.name)
)

Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
BaseIndex,
_get_indexer_basic,
_lexsorted_equal_range,
as_index,
)
from cudf.core.join._join_helpers import _match_join_keys
from cudf.utils.dtypes import is_column_like
Expand Down Expand Up @@ -824,7 +823,7 @@ def _index_and_downcast(self, result, index, index_key):
# it into an Index and name the final index values according
# to that column's name.
*_, last_column = index._data.columns
out_index = as_index(last_column)
out_index = cudf.Index(last_column)
out_index.name = index.names[-1]
index = out_index
elif out_index._num_columns > 1:
Expand Down Expand Up @@ -1082,7 +1081,9 @@ def get_level_values(self, level):
raise KeyError(f"Level not found: '{level}'")
else:
level_idx = colnames.index(level)
level_values = as_index(self._data[level], name=self.names[level_idx])
level_values = cudf.Index(
self._data[level], name=self.names[level_idx]
)
return level_values

def _is_numeric(self):
Expand Down
Loading

0 comments on commit 22ef063

Please sign in to comment.