Skip to content

Commit

Permalink
Remove quadratic runtime due to accessing Frame._dtypes in loop (#14028)
Browse files Browse the repository at this point in the history
Frame._dtypes maps column names to dtypes, however, it is a property that is computed on-demand. Consequently, a seemingly innocuous dict lookup is actually O(N). When used in a loop over columns, this makes an O(N) loop into an O(N^2) one.

This mostly bites on IO when reading data with many thousands of columns. To fix this, manually move access of Frame._dtypes outside of any loop over columns.

A more systematic way might be to make this a cached property, but the cache invalidation is rather hard to reason about.

- Closes #14005

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: #14028
  • Loading branch information
wence- authored Sep 1, 2023
1 parent b705c81 commit d1fb671
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 12 deletions.
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,9 +313,10 @@ def dtypes(self):
3 object int64
"""
index = self.grouping.keys.unique().sort_values().to_pandas()
obj_dtypes = self.obj._dtypes
return pd.DataFrame(
{
name: [self.obj._dtypes[name]] * len(index)
name: [obj_dtypes[name]] * len(index)
for name in self.grouping.values._column_names
},
index=index,
Expand Down
6 changes: 1 addition & 5 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,11 +822,7 @@ def replace(
) = _get_replacement_values_for_columns(
to_replace=to_replace,
value=value,
# TODO: This should be replaced with `DataFrame._dtypes` once
# that is moved up to `Frame`.
columns_dtype_map={
col: self._data[col].dtype for col in self._data
},
columns_dtype_map=self._dtypes,
)

for name, col in self._data.items():
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,12 @@ def read_csv(
if dtype is None or isinstance(dtype, abc.Mapping):
# There exists some dtypes in the result columns that is inferred.
# Find them and map them to the default dtypes.
dtype = {} if dtype is None else dtype
specified_dtypes = {} if dtype is None else dtype
df_dtypes = df._dtypes
unspecified_dtypes = {
name: df._dtypes[name]
name: df_dtypes[name]
for name in df._column_names
if name not in dtype
if name not in specified_dtypes
}
default_dtypes = {}

Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,12 @@ def read_json(
if dtype is True or isinstance(dtype, abc.Mapping):
# There exists some dtypes in the result columns that is inferred.
# Find them and map them to the default dtypes.
dtype = {} if dtype is True else dtype
specified_dtypes = {} if dtype is True else dtype
df_dtypes = df._dtypes
unspecified_dtypes = {
name: df._dtypes[name]
name: df_dtypes[name]
for name in df._column_names
if name not in dtype
if name not in specified_dtypes
}
default_dtypes = {}

Expand Down

0 comments on commit d1fb671

Please sign in to comment.