Access Frame attributes instead of ColumnAccessor attributes when available (#16652)

There are some places where a public object like `DataFrame` or `Index` accesses a `ColumnAccessor` attribute even though an equivalent attribute is available on a shared base class (like `Frame`).

In an effort to access the `ColumnAccessor` less, this change replaces usages of `._data.<attribute>` with the corresponding `Frame`-specific attribute.
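The pattern throughout the diff is to lean on attributes that `Frame` (and its subclasses) expose as thin wrappers over the `ColumnAccessor`. Below is a minimal, illustrative sketch of that idea; `MiniFrame` and its plain-dict backing are invented for the example and are not cudf's implementation, only the attribute names mirror the ones used in this commit.

# Hypothetical stand-in for a Frame-like object; only the attribute names
# (_columns, _column_names, _num_columns, _column_labels_and_values)
# mirror those used in the diff below.
class MiniFrame:
    def __init__(self, data: dict):
        self._data = data  # stand-in for the ColumnAccessor (label -> column)

    @property
    def _column_names(self) -> tuple:
        return tuple(self._data)

    @property
    def _columns(self) -> tuple:
        return tuple(self._data.values())

    @property
    def _num_columns(self) -> int:
        return len(self._data)

    @property
    def _column_labels_and_values(self):
        return self._data.items()

frame = MiniFrame({"a": [1, 2], "b": [3, 4]})
cols = frame._columns                     # instead of frame._data.columns
names = frame._column_names               # instead of frame._data.names
ncols = frame._num_columns                # instead of len(frame._data)
pairs = frame._column_labels_and_values   # instead of frame._data.items()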

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16652
mroeschke authored Sep 19, 2024
1 parent 8e1345f commit d63ca6a
Showing 24 changed files with 223 additions and 210 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/concat.pyx
@@ -23,9 +23,9 @@ def concat_columns(object columns):
 def concat_tables(object tables, bool ignore_index=False):
     plc_tables = []
     for table in tables:
-        cols = table._data.columns
+        cols = table._columns
         if not ignore_index:
-            cols = table._index._data.columns + cols
+            cols = table._index._columns + cols
         plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols]))

     return data_from_pylibcudf_table(
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/copying.pyx
@@ -384,7 +384,7 @@ cdef class _CPackedColumns:

         p.column_names = input_table._column_names
         p.column_dtypes = {}
-        for name, col in input_table._data.items():
+        for name, col in input_table._column_labels_and_values:
             if isinstance(col.dtype, cudf.core.dtypes._BaseDtype):
                 p.column_dtypes[name] = col.dtype

2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/csv.pyx
@@ -273,7 +273,7 @@ def read_csv(
         elif isinstance(dtype, abc.Collection):
             for index, col_dtype in enumerate(dtype):
                 if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
-                    col_name = df._data.names[index]
+                    col_name = df._column_names[index]
                     df._data[col_name] = df._data[col_name].astype(col_dtype)

    if names is not None and len(names) and isinstance(names[0], int):
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/io/utils.pyx
@@ -179,7 +179,7 @@ cdef update_struct_field_names(
 ):
     # Deprecated, remove in favor of add_col_struct_names
     # when a reader is ported to pylibcudf
-    for i, (name, col) in enumerate(table._data.items()):
+    for i, (name, col) in enumerate(table._column_labels_and_values):
         table._data[name] = update_column_struct_field_names(
             col, schema_info[i]
         )
12 changes: 6 additions & 6 deletions python/cudf/cudf/_lib/parquet.pyx
@@ -235,24 +235,24 @@ cdef object _process_metadata(object df,
             df._index = idx
         elif set(index_col).issubset(names):
             index_data = df[index_col]
-            actual_index_names = list(index_col_names.values())
-            if len(index_data._data) == 1:
+            actual_index_names = iter(index_col_names.values())
+            if index_data._num_columns == 1:
                 idx = cudf.Index._from_column(
-                    index_data._data.columns[0],
-                    name=actual_index_names[0]
+                    index_data._columns[0],
+                    name=next(actual_index_names)
                 )
             else:
                 idx = cudf.MultiIndex.from_frame(
                     index_data,
-                    names=actual_index_names
+                    names=list(actual_index_names)
                 )
             df.drop(columns=index_col, inplace=True)
             df._index = idx
         else:
             if use_pandas_metadata:
                 df.index.names = index_col

-    if len(df._data.names) == 0 and column_index_type is not None:
+    if df._num_columns == 0 and column_index_type is not None:
         df._data.label_dtype = cudf.dtype(column_index_type)

     return df
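One behavioural detail in the parquet hunk above: `actual_index_names` becomes an iterator, so the single-column branch consumes exactly one name via `next()`, while the multi-column branch materializes the still-unconsumed names with `list()`. A tiny standalone illustration of that pattern, with made-up values rather than real reader state:

# Illustrative only; the dict and column count are invented.
index_col_names = {"level_0": "a", "level_1": "b"}
actual_index_names = iter(index_col_names.values())

num_index_columns = 1  # pretend index_data._num_columns == 1
if num_index_columns == 1:
    name = next(actual_index_names)    # "a"
else:
    names = list(actual_index_names)   # ["a", "b"]; nothing was consumed by next()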
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/utils.pyx
@@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*:
         If True, don't include the index in the columns.
     """
     return table_view_from_columns(
-        tbl._index._data.columns + tbl._data.columns
+        tbl._index._columns + tbl._columns
         if not ignore_index and tbl._index is not None
-        else tbl._data.columns
+        else tbl._columns
     )


@@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index):
    index_descriptors = []
    columns_to_convert = list(table._columns)
    # Columns
-    for name, col in table._data.items():
+    for name, col in table._column_labels_and_values:
        if cudf.get_option("mode.pandas_compatible"):
            # in pandas-compat mode, non-string column names are stringified.
            col_names.append(str(name))
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
@@ -1951,7 +1951,7 @@ def drop_duplicates(
         return self._from_columns_like_self(
             drop_duplicates(
                 list(self._columns),
-                keys=range(len(self._data)),
+                keys=range(len(self._columns)),
                 keep=keep,
                 nulls_are_equal=nulls_are_equal,
             ),
24 changes: 12 additions & 12 deletions python/cudf/cudf/core/column_accessor.py
@@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None:
         self.set_by_label(key, value)

     def __delitem__(self, key: abc.Hashable) -> None:
-        old_ncols = len(self._data)
+        old_ncols = len(self)
         del self._data[key]
-        new_ncols = len(self._data)
+        new_ncols = len(self)
         self._clear_cache(old_ncols, new_ncols)

     def __len__(self) -> int:
@@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]:

     @property
     def nlevels(self) -> int:
-        if len(self._data) == 0:
+        if len(self) == 0:
             return 0
         if not self.multiindex:
             return 1
@@ -226,7 +226,7 @@ def name(self) -> abc.Hashable:

     @cached_property
     def nrows(self) -> int:
-        if len(self._data) == 0:
+        if len(self) == 0:
             return 0
         else:
             return len(next(iter(self.values())))
@@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
         Parameters
         ----------
         old_ncols: int
-            len(self._data) before self._data was modified
+            len(self) before self._data was modified
         new_ncols: int
-            len(self._data) after self._data was modified
+            len(self) after self._data was modified
         """
         cached_properties = ("columns", "names", "_grouped_data")
         for attr in cached_properties:
@@ -335,7 +335,7 @@ def insert(
         if name in self._data:
             raise ValueError(f"Cannot insert '{name}', already exists")

-        old_ncols = len(self._data)
+        old_ncols = len(self)
         if loc == -1:
             loc = old_ncols
         elif not (0 <= loc <= old_ncols):
@@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple:
         tuple
         """
         if isinstance(index, slice):
-            start, stop, step = index.indices(len(self._data))
+            start, stop, step = index.indices(len(self))
             return self.names[start:stop:step]
         elif pd.api.types.is_integer(index):
             return (self.names[index],)
@@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None:
         if len(self) > 0 and len(value) != self.nrows:
             raise ValueError("All columns must be of equal length")

-        old_ncols = len(self._data)
+        old_ncols = len(self)
         self._data[key] = value
-        new_ncols = len(self._data)
+        new_ncols = len(self)
         self._clear_cache(old_ncols, new_ncols)

     def _select_by_label_list_like(self, key: tuple) -> Self:
@@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None:
         if level < 0:
             level += self.nlevels

-        old_ncols = len(self._data)
+        old_ncols = len(self)
         self._data = {
             _remove_key_level(key, level): value  # type: ignore[arg-type]
             for key, value in self._data.items()
         }
-        new_ncols = len(self._data)
+        new_ncols = len(self)
         self._level_names = (
             self._level_names[:level] + self._level_names[level + 1 :]
         )
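As context for the `len(self)` spelling in this file: `ColumnAccessor.__len__` (visible as unchanged context in the first hunk) already reports the number of columns, so it can stand in for `len(self._data)` everywhere the underlying mapping was being measured. A rough, simplified sketch of that delegation and of the `_clear_cache(old_ncols, new_ncols)` pattern follows; it is an invented stand-in, not the real class.

from functools import cached_property

class MiniAccessor:
    """Simplified stand-in for a ColumnAccessor-like mapping; not cudf code."""

    def __init__(self, data: dict):
        self._data = data  # label -> column

    def __len__(self) -> int:
        # Number of columns, so len(self) can replace len(self._data).
        return len(self._data)

    @cached_property
    def names(self) -> tuple:
        return tuple(self._data)

    def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
        # Invalidate cached properties that depend on the set of columns;
        # a real implementation can use old_ncols/new_ncols to decide what to drop.
        self.__dict__.pop("names", None)

    def __delitem__(self, key) -> None:
        old_ncols = len(self)
        del self._data[key]
        new_ncols = len(self)
        self._clear_cache(old_ncols, new_ncols)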
