Remove quadratic runtime due to accessing Frame._dtypes in loop (#14028)

Frame._dtypes maps column names to dtypes, however, it is a property that is computed on-demand. Consequently, a seemingly innocuous dict lookup is actually O(N). When used in a loop over columns, this makes an O(N) loop into an O(N^2) one. This mostly bites on IO when reading data with many thousands of columns. To fix this, manually move access of Frame._dtypes outside of any loop over columns. A more systematic way might be to make this a cached property, but the cache invalidation is rather hard to reason about. - Closes #14005 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: #14028
rapidsai · Sep 1, 2023 · d1fb671 · d1fb671
1 parent b705c81
commit d1fb671
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 12 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -313,9 +313,10 @@ def dtypes(self):
         3  object  int64
         """
         index = self.grouping.keys.unique().sort_values().to_pandas()
+        obj_dtypes = self.obj._dtypes
         return pd.DataFrame(
             {
-                name: [self.obj._dtypes[name]] * len(index)
+                name: [obj_dtypes[name]] * len(index)
                 for name in self.grouping.values._column_names
             },
             index=index,

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -822,11 +822,7 @@ def replace(
             ) = _get_replacement_values_for_columns(
                 to_replace=to_replace,
                 value=value,
-                # TODO: This should be replaced with `DataFrame._dtypes` once
-                # that is moved up to `Frame`.
-                columns_dtype_map={
-                    col: self._data[col].dtype for col in self._data
-                },
+                columns_dtype_map=self._dtypes,
             )
 
             for name, col in self._data.items():

diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
@@ -123,11 +123,12 @@ def read_csv(
     if dtype is None or isinstance(dtype, abc.Mapping):
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
-        dtype = {} if dtype is None else dtype
+        specified_dtypes = {} if dtype is None else dtype
+        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df._dtypes[name]
+            name: df_dtypes[name]
             for name in df._column_names
-            if name not in dtype
+            if name not in specified_dtypes
         }
         default_dtypes = {}
 

diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
@@ -158,11 +158,12 @@ def read_json(
     if dtype is True or isinstance(dtype, abc.Mapping):
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
-        dtype = {} if dtype is True else dtype
+        specified_dtypes = {} if dtype is True else dtype
+        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df._dtypes[name]
+            name: df_dtypes[name]
             for name in df._column_names
-            if name not in dtype
+            if name not in specified_dtypes
         }
         default_dtypes = {}