From e6e67615c248d4992d0bf2ce5a47b09534cd4c82 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 17 May 2024 19:52:26 -0400 Subject: [PATCH] Eagerly populate the class dict for cudf.pandas proxy types (#14534) Rather than dynamically looking up class attributes (and methods), this PR makes it so that we eagerly populate the class with all known methods and attributes (by inspecting the "slow" class). This solves a number of problems: - it makes `getattr` trivially inexpensive (no dynamic `__getattr__` for each attribute access) - it ensures the _same_ object is returned every time you do, e.g., `DataFrame.max` - it makes tab completion fast because the attributes don't have to be computed each time - it no longer exposes attributes that are specific to cuDF - for example `Series.list` - it allows subclassing of proxy types to work better. For example, derived types can now call `super().` to access attributes of base types Authors: - Ashwin Srinath (https://github.com/shwina) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14534 --- docs/cudf/source/cudf_pandas/faq.md | 12 - python/cudf/cudf/pandas/_wrappers/common.py | 8 +- python/cudf/cudf/pandas/_wrappers/numpy.py | 4 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 117 ++++++- python/cudf/cudf/pandas/fast_slow_proxy.py | 297 +++++++++++------- python/cudf/cudf/pandas/profiler.py | 7 +- .../cudf/pandas/scripts/run-pandas-tests.sh | 2 +- .../cudf_pandas_tests/test_cudf_pandas.py | 32 ++ .../cudf/cudf_pandas_tests/test_profiler.py | 4 +- 9 files changed, 326 insertions(+), 157 deletions(-) diff --git a/docs/cudf/source/cudf_pandas/faq.md 
b/docs/cudf/source/cudf_pandas/faq.md index dde7afb1360..55976740105 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -151,15 +151,3 @@ for testing or benchmarking purposes. To do so, set the ```bash CUDF_PANDAS_FALLBACK_MODE=1 python -m cudf.pandas some_script.py ``` - -## Slow tab completion in IPython? - -You may experience slow tab completion when inspecting the -methods/attributes of large dataframes. We expect this issue to be -resolved in an upcoming release. In the mean time, you may execute the -following command in IPython before loading `cudf.pandas` to work -around the issue: - -``` -%config IPCompleter.jedi_compute_type_timeout=0 -``` diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py index 1669882631b..468c5687c15 100644 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ b/python/cudf/cudf/pandas/_wrappers/common.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -17,9 +17,9 @@ def array_method(self: _FastSlowProxy, *args, **kwargs): def array_function_method(self, func, types, args, kwargs): try: - return _FastSlowAttribute("__array_function__").__get__(self)( - func, types, args, kwargs - ) + return _FastSlowAttribute("__array_function__").__get__( + self, type(self) + )(func, types, args, kwargs) except Exception: # if something went wrong with __array_function__ we # attempt to call the function directly on the slow diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index 9955550ef90..94298872213 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -10,6 +10,7 @@ import numpy.core.multiarray from ..fast_slow_proxy import ( + _FastSlowAttribute, make_final_proxy_type, make_intermediate_proxy_type, ) @@ -122,6 +123,7 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): "__iter__": custom_iter, # Special wrapping to handle scalar values "_fsproxy_wrap": classmethod(wrap_ndarray), + "base": _FastSlowAttribute("base", private=True), }, ) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index de92cce8ebb..29aaaac245d 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -107,14 +107,16 @@ class _AccessorAttr: """ def __init__(self, typ): - self.__typ = typ + self._typ = typ + + def __set_name__(self, owner, name): + self._name = name def __get__(self, obj, cls=None): if obj is None: - return self.__typ + return self._typ else: - # allow __getattr__ to handle this - raise AttributeError() + return _FastSlowAttribute(self._name).__get__(obj, type(obj)) def Timestamp_Timedelta__new__(cls, *args, **kwargs): @@ -214,6 +216,7 @@ def _DataFrame__dir__(self): "__dir__": _DataFrame__dir__, "_constructor": _FastSlowAttribute("_constructor"), "_constructor_sliced": _FastSlowAttribute("_constructor_sliced"), + "_accessors": set(), }, ) @@ -236,6 +239,7 @@ def _DataFrame__dir__(self): "cat": _AccessorAttr(_CategoricalAccessor), "_constructor": _FastSlowAttribute("_constructor"), "_constructor_expanddim": _FastSlowAttribute("_constructor_expanddim"), + "_accessors": set(), }, ) @@ -273,6 +277,9 @@ def Index__new__(cls, *args, **kwargs): "__new__": Index__new__, "_constructor": _FastSlowAttribute("_constructor"), "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), + "_accessors": set(), + "_data": _FastSlowAttribute("_data", private=True), + 
"_mask": _FastSlowAttribute("_mask", private=True), }, ) @@ -337,7 +344,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + }, ) DatetimeArray = make_final_proxy_type( @@ -346,6 +357,10 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.DatetimeArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), + additional_attributes={ + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + }, ) DatetimeTZDtype = make_final_proxy_type( @@ -364,7 +379,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + }, ) NumpyExtensionArray = make_final_proxy_type( @@ -385,6 +404,10 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.TimedeltaArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), + additional_attributes={ + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + }, ) PeriodIndex = make_final_proxy_type( @@ -394,7 +417,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + }, ) PeriodArray = make_final_proxy_type( @@ -403,6 +430,11 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.PeriodArray, fast_to_slow=_Unusable(), 
slow_to_fast=_Unusable(), + additional_attributes={ + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), + }, ) PeriodDtype = make_final_proxy_type( @@ -464,6 +496,10 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.StringArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), + additional_attributes={ + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + }, ) StringDtype = make_final_proxy_type( @@ -472,7 +508,10 @@ def Index__new__(cls, *args, **kwargs): pd.StringDtype, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, + additional_attributes={ + "__hash__": _FastSlowAttribute("__hash__"), + "storage": _FastSlowAttribute("storage"), + }, ) BooleanArray = make_final_proxy_type( @@ -482,7 +521,9 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), }, ) @@ -502,7 +543,9 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), }, ) @@ -586,7 +629,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "_data": _FastSlowAttribute("_data", private=True), + "_mask": 
_FastSlowAttribute("_mask", private=True), + }, ) IntervalArray = make_final_proxy_type( @@ -595,6 +642,10 @@ def Index__new__(cls, *args, **kwargs): pd.arrays.IntervalArray, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), + additional_attributes={ + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), + }, ) IntervalDtype = make_final_proxy_type( @@ -622,7 +673,9 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__") + "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), + "_data": _FastSlowAttribute("_data", private=True), + "_mask": _FastSlowAttribute("_mask", private=True), }, ) @@ -798,6 +851,14 @@ def Index__new__(cls, *args, **kwargs): pd_Styler, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), + additional_attributes={ + "css": _FastSlowAttribute("css"), + "ctx": _FastSlowAttribute("ctx"), + "index": _FastSlowAttribute("index"), + "data": _FastSlowAttribute("data"), + "_display_funcs": _FastSlowAttribute("_display_funcs"), + "table_styles": _FastSlowAttribute("table_styles"), + }, ) except ImportError: # Styler requires Jinja to be installed @@ -813,7 +874,7 @@ def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None): return local_dict, global_dict -@register_proxy_func(pd.eval) +@register_proxy_func(pd.core.computation.eval.eval) @nvtx.annotate( "CUDF_PANDAS_EVAL", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], @@ -843,6 +904,24 @@ def _eval( ) +_orig_df_eval_method = DataFrame.eval + + +@register_proxy_func(pd.core.accessor.register_dataframe_accessor) +def _register_dataframe_accessor(name): + return pd.core.accessor._register_accessor(name, DataFrame) + + +@register_proxy_func(pd.core.accessor.register_series_accessor) +def _register_series_accessor(name): + return pd.core.accessor._register_accessor(name, Series) + + 
+@register_proxy_func(pd.core.accessor.register_index_accessor) +def _register_index_accessor(name): + return pd.core.accessor._register_accessor(name, Index) + + @nvtx.annotate( "CUDF_PANDAS_DATAFRAME_EVAL", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], @@ -853,11 +932,14 @@ def _df_eval_method(self, *args, local_dict=None, global_dict=None, **kwargs): local_dict, global_dict = _get_eval_locals_and_globals( level, local_dict, global_dict ) - return super(type(self), self).__getattr__("eval")( - *args, local_dict=local_dict, global_dict=global_dict, **kwargs + return _orig_df_eval_method( + self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs ) +_orig_query_eval_method = DataFrame.query + + @nvtx.annotate( "CUDF_PANDAS_DATAFRAME_QUERY", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], @@ -870,8 +952,8 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): local_dict, global_dict = _get_eval_locals_and_globals( level, local_dict, global_dict ) - return super(type(self), self).__getattr__("query")( - *args, local_dict=local_dict, global_dict=global_dict, **kwargs + return _orig_query_eval_method( + self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs ) @@ -1277,6 +1359,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) + MonthBegin = make_final_proxy_type( "MonthBegin", _Unusable, diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index e5c86d2318e..94caec1ce6c 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -83,6 +83,9 @@ def __getattribute__(self, name: str) -> Any: return super().__getattribute__(name) raise TypeError("Unusable type. Falling back to the slow object") + def __repr__(self) -> str: + raise AttributeError("Unusable type. 
Falling back to the slow object") + class _PickleConstructor: """A pickleable object to support construction in __reduce__. @@ -231,6 +234,13 @@ def _fsproxy_state(self) -> _State: elif v is not _DELETE: cls_dict[k] = v + for slow_name in dir(slow_type): + if slow_name in cls_dict or slow_name.startswith("__"): + continue + else: + cls_dict[slow_name] = _FastSlowAttribute( + slow_name, private=slow_name.startswith("_") + ) if meta_class is None: meta_class = _FastSlowProxyMeta else: @@ -329,11 +339,26 @@ def _fsproxy_fast_to_slow(self): "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow, "_fsproxy_state": _fsproxy_state, } - for method in _SPECIAL_METHODS: if getattr(slow_type, method, False): cls_dict[method] = _FastSlowAttribute(method) + for slow_name in dir(slow_type): + if slow_name in cls_dict or slow_name.startswith("__"): + continue + else: + cls_dict[slow_name] = _FastSlowAttribute( + slow_name, private=slow_name.startswith("_") + ) + + for slow_name in getattr(slow_type, "_attributes", []): + if slow_name in cls_dict: + continue + else: + cls_dict[slow_name] = _FastSlowAttribute( + slow_name, private=slow_name.startswith("_") + ) + cls = types.new_class( name, (_IntermediateProxy,), @@ -411,62 +436,16 @@ def _raise_attribute_error(obj, name): raise AttributeError(f"'{obj}' object has no attribute '{name}'") -class _FastSlowAttribute: - """ - A descriptor type used to define attributes of fast-slow proxies. 
- """ - - def __init__(self, name: str): - self._name = name - - def __get__(self, obj, owner=None) -> Any: - if obj is None: - # class attribute - obj = owner - - if not ( - isinstance(obj, _FastSlowProxy) - or issubclass(type(obj), _FastSlowProxyMeta) - ): - # we only want to look up attributes on the underlying - # fast/slow objects for instances of _FastSlowProxy or - # subtypes of _FastSlowProxyMeta: - _raise_attribute_error(owner if owner else obj, self._name) - - result, _ = _fast_slow_function_call(getattr, obj, self._name) - - if isinstance(result, functools.cached_property): - # TODO: temporary workaround until dask is able - # to correctly inspect cached_property objects. - # GH: 264 - result = property(result.func) - - if isinstance(result, (_MethodProxy, property)): - from .module_accelerator import disable_module_accelerator - - type_ = owner if owner else type(obj) - slow_result_type = getattr(type_._fsproxy_slow, self._name) - with disable_module_accelerator(): - result.__doc__ = inspect.getdoc( # type: ignore - slow_result_type - ) - - if isinstance(result, _MethodProxy): - # Note that this will produce the wrong result for bound - # methods because dir for the method won't be the same as for - # the pure unbound function, but the alternative is - # materializing the slow object when we don't really want to. - result._fsproxy_slow_dir = dir(slow_result_type) # type: ignore - - return result - - class _FastSlowProxyMeta(type): """ Metaclass used to dynamically find class attributes and classmethods of fast-slow proxy types. 
""" + _fsproxy_slow_dir: list + _fsproxy_slow_type: type + _fsproxy_fast_type: type + @property def _fsproxy_slow(self) -> type: return self._fsproxy_slow_type @@ -483,15 +462,6 @@ def __dir__(self): except AttributeError: return type.__dir__(self) - def __getattr__(self, name: str) -> Any: - if name.startswith("_fsproxy") or name.startswith("__"): - # an AttributeError was raised when trying to evaluate - # an internal attribute, we just need to propagate this - _raise_attribute_error(self.__class__.__name__, name) - - attr = _FastSlowAttribute(name) - return attr.__get__(None, owner=self) - def __subclasscheck__(self, __subclass: type) -> bool: if super().__subclasscheck__(__subclass): return True @@ -565,56 +535,13 @@ def __dir__(self): except AttributeError: return object.__dir__(self) - def __getattr__(self, name: str) -> Any: - if name.startswith("_fsproxy"): - # an AttributeError was raised when trying to evaluate - # an internal attribute, we just need to propagate this - _raise_attribute_error(self.__class__.__name__, name) - if name in { - "_ipython_canary_method_should_not_exist_", - "_ipython_display_", - "_repr_mimebundle_", - # Workaround for https://github.com/numpy/numpy/issues/5350 - # see GH:216 for details - "__array_struct__", - }: - # IPython always looks for these names in its display - # logic. See #GH:70 and #GH:172 for more details but the - # gist is that not raising an AttributeError immediately - # results in slow display in IPython (since the fast - # object will be copied to the slow one to look for - # attributes there which then also won't exist). - # This is somewhat delicate to the order in which IPython - # implements special display fallbacks. 
- _raise_attribute_error(self.__class__.__name__, name) - if name.startswith("_"): - # private attributes always come from `._fsproxy_slow`: - obj = getattr(self._fsproxy_slow, name) - if name.startswith("__array"): - # TODO: numpy methods raise when given proxy ndarray objects - # https://numpy.org/doc/stable/reference/arrays.classes.html#special-attributes-and-methods # noqa:E501 - return obj - - if not _is_function_or_method(obj): - return _maybe_wrap_result( - obj, getattr, self._fsproxy_slow, name - ) - - @functools.wraps(obj) - def _wrapped_private_slow(*args, **kwargs): - slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) - result = obj(*slow_args, **slow_kwargs) - return _maybe_wrap_result(result, obj, *args, **kwargs) - - return _wrapped_private_slow - attr = _FastSlowAttribute(name) - return attr.__get__(self) - def __setattr__(self, name, value): if name.startswith("_"): object.__setattr__(self, name, value) return - return _FastSlowAttribute("__setattr__").__get__(self)(name, value) + return _FastSlowAttribute("__setattr__").__get__(self, type(self))( + name, value + ) class _FinalProxy(_FastSlowProxy): @@ -790,17 +717,162 @@ class _FunctionProxy(_CallableProxyMixin): __name__: str - def __init__(self, fast: Callable | _Unusable, slow: Callable): + def __init__( + self, + fast: Callable | _Unusable, + slow: Callable, + *, + assigned=None, + updated=None, + ): self._fsproxy_fast = fast self._fsproxy_slow = slow - functools.update_wrapper(self, slow) + if assigned is None: + assigned = functools.WRAPPER_ASSIGNMENTS + if updated is None: + updated = functools.WRAPPER_UPDATES + functools.update_wrapper( + self, + slow, + assigned=assigned, + updated=updated, + ) + def __reduce__(self): + """ + In conjunction with `__setstate__`, this effectively enables + proxy types to be pickled and unpickled by pickling and unpickling + the underlying wrapped types. 
+ """ + # Need a local import to avoid circular import issues + from .module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + pickled_fast = pickle.dumps(self._fsproxy_fast) + pickled_slow = pickle.dumps(self._fsproxy_slow) + return ( + _PickleConstructor(type(self)), + (), + (pickled_fast, pickled_slow), + ) -class _MethodProxy(_CallableProxyMixin, _IntermediateProxy): + def __setstate__(self, state): + # Need a local import to avoid circular import issues + from .module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + unpickled_fast = pickle.loads(state[0]) + unpickled_slow = pickle.loads(state[1]) + self._fsproxy_fast = unpickled_fast + self._fsproxy_slow = unpickled_slow + + +def is_bound_method(obj): + return inspect.ismethod(obj) and not inspect.isfunction(obj) + + +def is_function(obj): + return inspect.isfunction(obj) or isinstance(obj, types.FunctionType) + + +class _FastSlowAttribute: """ - Methods of fast-slow proxies are of type _MethodProxy. + A descriptor type used to define attributes of fast-slow proxies. 
""" + _attr: Any + + def __init__(self, name: str, *, private: bool = False): + self._name = name + self._private = private + self._attr = None + self._doc = None + self._dir = None + + def __get__(self, instance, owner) -> Any: + from .module_accelerator import disable_module_accelerator + + if self._attr is None: + if self._private: + fast_attr = _Unusable() + else: + fast_attr = getattr( + owner._fsproxy_fast, self._name, _Unusable() + ) + + try: + slow_attr = getattr(owner._fsproxy_slow, self._name) + except AttributeError as e: + if instance is not None: + return _maybe_wrap_result( + getattr(instance._fsproxy_slow, self._name), + None, # type: ignore + ) + else: + raise e + + if _is_function_or_method(slow_attr): + self._attr = _MethodProxy(fast_attr, slow_attr) + else: + # for anything else, use a fast-slow attribute: + self._attr, _ = _fast_slow_function_call( + getattr, owner, self._name + ) + + if isinstance( + self._attr, (property, functools.cached_property) + ): + with disable_module_accelerator(): + self._attr.__doc__ = inspect.getdoc(slow_attr) + + if instance is not None: + if isinstance(self._attr, _MethodProxy): + if is_bound_method(self._attr._fsproxy_slow): + return self._attr + else: + return types.MethodType(self._attr, instance) + else: + if self._private: + return _maybe_wrap_result( + getattr(instance._fsproxy_slow, self._name), + None, # type: ignore + ) + return _fast_slow_function_call(getattr, instance, self._name)[ + 0 + ] + return self._attr + + +class _MethodProxy(_FunctionProxy): + def __init__(self, fast, slow): + super().__init__( + fast, + slow, + updated=functools.WRAPPER_UPDATES, + assigned=( + tuple(filter(lambda x: x != "__name__", _WRAPPER_ASSIGNMENTS)) + ), + ) + + def __dir__(self): + return self._fsproxy_slow.__dir__() + + @property + def __doc__(self): + return self._fsproxy_slow.__doc__ + + @property + def __name__(self): + return self._fsproxy_slow.__name__ + + @__name__.setter + def __name__(self, value): + try: + 
setattr(self._fsproxy_fast, "__name__", value) + except AttributeError: + pass + setattr(self._fsproxy_slow, "__name__", value) + def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any: """ @@ -981,10 +1053,6 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: return type(result)(wrapped) elif isinstance(result, Iterator): return (_maybe_wrap_result(r, lambda x: x, r) for r in result) - elif _is_function_or_method(result): - return _MethodProxy._fsproxy_wrap( - result, method_chain=(func, args, kwargs) - ) else: return result @@ -1081,6 +1149,7 @@ def _replace_closurevars( "__and__", "__bool__", "__call__", + "__getattr__", "__complex__", "__contains__", "__copy__", diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index 0124d411e3b..0dbd333ce4f 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -127,12 +127,7 @@ def get_namespaced_function_name( ], ): if isinstance(func_obj, _MethodProxy): - # Extract classname from method object - type_name = type(func_obj._fsproxy_wrapped.__self__).__name__ - # Explicitly ask for __name__ on _fsproxy_wrapped to avoid - # getting a private attribute and forcing a slow-path copy - func_name = func_obj._fsproxy_wrapped.__name__ - return ".".join([type_name, func_name]) + return func_obj._fsproxy_slow.__qualname__ elif isinstance(func_obj, _FunctionProxy) or issubclass( func_obj, (_FinalProxy, _IntermediateProxy) ): diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 6eb28104120..cd9f90d50fe 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -138,7 +138,7 @@ and not test_eof_states" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ - -k 
"$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS" \ + -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ ${PYTEST_IGNORES} \ "$@" || [ $? = 1 ] # Exit success if exit code was 1 (permit test failures but not other errors) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 9fb0891fa52..e3d4f878ad5 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -379,6 +379,8 @@ def test_pickle_round_trip(dataframe): def test_excel_round_trip(dataframe): + pytest.importorskip("openpyxl") + pdf, df = dataframe excel_pdf = BytesIO() excel_cudf_pandas = BytesIO() @@ -1211,6 +1213,24 @@ def test_func_namespace(): assert xpd.concat is xpd.core.reshape.concat.concat +def test_register_accessor(): + @xpd.api.extensions.register_dataframe_accessor("xyz") + class XYZ: + def __init__(self, obj): + self._obj = obj + + @property + def foo(self): + return "spam" + + # the accessor must be registered with the proxy type, + # not the underlying fast or slow type + assert "xyz" in xpd.DataFrame.__dict__ + + df = xpd.DataFrame() + assert df.xyz.foo == "spam" + + def test_pickle_groupby(dataframe): pdf, df = dataframe pgb = pdf.groupby("a") @@ -1232,6 +1252,18 @@ def test_isinstance_base_offset(): assert isinstance(offset, xpd.tseries.offsets.BaseOffset) +def test_super_attribute_lookup(): + # test that we can use super() to access attributes + # of the base class when subclassing proxy types + + class Foo(xpd.Series): + def max_times_two(self): + return super().max() * 2 + + s = Foo([1, 2, 3]) + assert s.max_times_two() == 6 + + def 
test_floordiv_array_vs_df(): xarray = xpd.Series([1, 2, 3], dtype="datetime64[ns]").array parray = pd.Series([1, 2, 3], dtype="datetime64[ns]").array diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index 359a2a2c515..588398265f2 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -33,11 +33,11 @@ def test_profiler(): "Timestamp", "DataFrame", "DataFrame.groupby", - "DataFrameGroupBy.sum", + "GroupBy.sum", "DataFrame.sum", "Series.__getitem__", "Timedelta", - "Timestamp.__add__", + "_Timestamp.__add__", } for name, func in per_function_stats.items(): assert (