Skip to content

Commit

Permalink
Disallow cudf.Index accepting column in favor of ._from_column (#16549)
Browse files (browse the repository at this point in the history)
Similar to #16454, this PR disallows the public `cudf.Index` constructor from accepting a private `ColumnBase` object, in favor of `Index._from_column` (which was added in the linked PR).

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16549
  • Loading branch information
mroeschke authored Aug 15, 2024
1 parent 0253e97 commit 19846b6
Show file tree
Hide file tree
Showing 22 changed files with 232 additions and 154 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ cdef object _process_metadata(object df,
if len(filtered_idx) > 0:
idx = cudf.concat(filtered_idx)
else:
idx = cudf.Index(cudf.core.column.column_empty(0))
idx = cudf.Index._from_column(cudf.core.column.column_empty(0))
else:
start = range_index_meta["start"] + skip_rows
stop = range_index_meta["stop"]
Expand All @@ -240,7 +240,7 @@ cdef object _process_metadata(object df,
index_data = df[index_col]
actual_index_names = list(index_col_names.values())
if len(index_data._data) == 1:
idx = cudf.Index(
idx = cudf.Index._from_column(
index_data._data.columns[0],
name=actual_index_names[0]
)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ cpdef generate_pandas_metadata(table, index):
materialize_index = False
if index is not False:
for level, name in enumerate(table._index.names):
if isinstance(table._index, cudf.core.multiindex.MultiIndex):
if isinstance(table._index, cudf.MultiIndex):
idx = table.index.get_level_values(level)
else:
idx = table.index

if isinstance(idx, cudf.core.index.RangeIndex):
if isinstance(idx, cudf.RangeIndex):
if index is None:
descr = {
"kind": "range",
Expand All @@ -110,7 +110,7 @@ cpdef generate_pandas_metadata(table, index):
else:
materialize_index = True
# When `index=True`, RangeIndex needs to be materialized.
materialized_idx = cudf.Index(idx._values, name=idx.name)
materialized_idx = idx._as_int_index()
descr = _index_level_name(
index_name=materialized_idx.name,
level=level,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ def _union_categoricals(
new_categories=sorted_categories
)

return cudf.Index(result_col)
return cudf.CategoricalIndex._from_column(result_col)


def is_bool_dtype(arr_or_dtype):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1979,7 +1979,7 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default):
name=index.name,
)
else:
return cudf.Index(
return cudf.Index._from_column(
column.as_column(index, nan_as_null=nan_as_null),
name=index.name,
)
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numpy as np

from cudf.core.column import as_column
from cudf.core.index import RangeIndex, ensure_index
from cudf.core.index import Index, RangeIndex
from cudf.core.scalar import Scalar
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column
Expand Down Expand Up @@ -112,7 +112,9 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

return labels, cats.values if return_cupy_array else ensure_index(cats)
return labels, cats.values if return_cupy_array else Index._from_column(
cats
)


def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase:
Expand Down
8 changes: 5 additions & 3 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,11 +601,13 @@ def __setitem__(self, key, value):
to_add_categories = 0
else:
if cudf.api.types.is_scalar(value):
arr = [value]
arr = column.as_column(value, length=1, nan_as_null=False)
else:
arr = value
arr = column.as_column(value, nan_as_null=False)
to_add_categories = len(
cudf.Index(arr, nan_as_null=False).difference(self.categories)
cudf.Index._from_column(arr).difference(
cudf.Index._from_column(self.categories)
)
)

if to_add_categories > 0:
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,10 @@ def __contains__(self, item: ScalarLike) -> bool:
def time_unit(self) -> str:
return np.datetime_data(self.dtype)[0]

@property
def quarter(self) -> ColumnBase:
return libcudf.datetime.extract_quarter(self)

@property
def year(self) -> ColumnBase:
return self.get_dt_field("year")
Expand Down Expand Up @@ -308,14 +312,18 @@ def is_quarter_start(self) -> ColumnBase:
@property
def is_year_end(self) -> ColumnBase:
day_of_year = self.day_of_year
leap_dates = libcudf.datetime.is_leap_year(self)
leap_dates = self.is_leap_year

leap = day_of_year == cudf.Scalar(366)
non_leap = day_of_year == cudf.Scalar(365)
return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna(
False
)

@property
def is_leap_year(self) -> ColumnBase:
return libcudf.datetime.is_leap_year(self)

@property
def is_year_start(self) -> ColumnBase:
return (self.day_of_year == 1).fillna(False)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/column/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def _return_or_inplace(
"""
if inplace:
self._parent._mimic_inplace(
self._parent.__class__._from_data(
{self._parent.name: new_col}
type(self._parent)._from_column(
new_col, name=self._parent.name
),
inplace=True,
)
Expand All @@ -92,6 +92,6 @@ def _return_or_inplace(
index=self._parent.index if retain_index else None,
)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.Index(new_col, name=self._parent.name)
return cudf.Index._from_column(new_col, name=self._parent.name)
else:
return self._parent._mimic_inplace(new_col, inplace=False)
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4693,7 +4693,7 @@ def character_tokenize(self) -> SeriesOrIndex:
result_col, name=self._parent.name, index=index
)
elif isinstance(self._parent, cudf.BaseIndex):
return cudf.Index(result_col, name=self._parent.name)
return cudf.Index._from_column(result_col, name=self._parent.name)
else:
return result_col

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def cut(
)

# we return a categorical index, as we don't have a Categorical method
categorical_index = cudf.CategoricalIndex._from_data({None: col})
categorical_index = cudf.CategoricalIndex._from_column(col)

if isinstance(orig_x, (pd.Series, cudf.Series)):
# if we have a series input we return a series output
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def _getitem_tuple_arg(self, arg):
range(len(tmp_arg[0]))
)
},
index=cudf.Index(tmp_arg[0]),
index=cudf.Index._from_column(tmp_arg[0]),
)
columns_df[cantor_name] = column.as_column(
range(len(columns_df))
Expand Down Expand Up @@ -1758,7 +1758,7 @@ def _concat(
for cols in columns:
table_index = None
if 1 == first_data_column_position:
table_index = cudf.Index(cols[0])
table_index = cudf.Index._from_column(cols[0])
elif first_data_column_position > 1:
table_index = cudf.MultiIndex._from_data(
data=dict(
Expand Down Expand Up @@ -1810,7 +1810,7 @@ def _concat(
if not isinstance(out.index, MultiIndex) and isinstance(
out.index.dtype, cudf.CategoricalDtype
):
out = out.set_index(cudf.Index(out.index._values))
out = out.set_index(out.index)
for name, col in out._data.items():
out._data[name] = col._with_type_metadata(
tables[0]._data[name].dtype
Expand Down Expand Up @@ -3007,7 +3007,7 @@ def set_index(
and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex))
):
# Don't turn single level MultiIndex into an Index
idx = cudf.Index(data_to_add[0], name=names[0])
idx = cudf.Index._from_column(data_to_add[0], name=names[0])
else:
idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
idx.names = names
Expand Down
14 changes: 9 additions & 5 deletions python/cudf/cudf/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None:
self._ordered = ordered

@property
def categories(self) -> "cudf.core.index.Index":
def categories(self) -> cudf.Index:
"""
An ``Index`` containing the unique categories allowed.
Expand All @@ -194,10 +194,12 @@ def categories(self) -> "cudf.core.index.Index":
Index(['b', 'a'], dtype='object')
"""
if self._categories is None:
return cudf.Index(
cudf.core.column.column_empty(0, dtype="object", masked=False)
col = cudf.core.column.column_empty(
0, dtype="object", masked=False
)
return cudf.Index(self._categories, copy=False)
else:
col = self._categories
return cudf.Index._from_column(col)

@property
def type(self):
Expand Down Expand Up @@ -259,7 +261,9 @@ def to_pandas(self) -> pd.CategoricalDtype:
categories = self._categories.to_pandas()
return pd.CategoricalDtype(categories=categories, ordered=self.ordered)

def _init_categories(self, categories: Any):
def _init_categories(
self, categories: Any
) -> cudf.core.column.ColumnBase | None:
if categories is None:
return categories
if len(categories) == 0 and not isinstance(
Expand Down
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]:
if len(group_keys) > 1:
index = cudf.MultiIndex.from_arrays(group_keys)
else:
(group_keys,) = group_keys
index = cudf.Index(group_keys)
index = cudf.Index._from_column(group_keys[0])
return dict(
zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
)
Expand Down Expand Up @@ -2583,7 +2582,7 @@ def _mimic_pandas_order(
# corresponding output rows in pandas, to do that here
# expand the result by reindexing.
ri = cudf.RangeIndex(0, len(self.obj))
result.index = cudf.Index(ordering)
result.index = cudf.Index._from_column(ordering)
# This reorders and expands
result = result.reindex(ri)
else:
Expand Down Expand Up @@ -3154,7 +3153,9 @@ def keys(self):
dict(zip(range(nkeys), self._key_columns))
)._set_names(self.names)
else:
return cudf.Index(self._key_columns[0], name=self.names[0])
return cudf.Index._from_column(
self._key_columns[0], name=self.names[0]
)

@property
def values(self) -> cudf.core.frame.Frame:
Expand Down
Loading

0 comments on commit 19846b6

Please sign in to comment.