From ecc451d6ff0934ac88ead3875d092d0841ecc0c1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 9 Aug 2024 14:13:06 -0700 Subject: [PATCH] REF (string): de-duplicate str_map_nan_semantics (#59464) REF: de-duplicate str_map_nan_semantics --- pandas/core/arrays/string_.py | 9 ++++--- pandas/core/arrays/string_arrow.py | 42 ------------------------------ 2 files changed, 5 insertions(+), 46 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 94d49c4fdf6e6..2ba7c9fccbfce 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -395,7 +395,7 @@ def _str_map( return constructor(result, mask) else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) def _str_map_str_or_object( self, @@ -404,7 +404,6 @@ def _str_map_str_or_object( arr: np.ndarray, f, mask: npt.NDArray[np.bool_], - convert: bool, ): # _str_map helper for case where dtype is either string dtype or object if is_string_dtype(dtype) and not is_object_dtype(dtype): @@ -438,7 +437,6 @@ def _str_map_nan_semantics( mask = isna(self) arr = np.asarray(self) - convert = convert and not np.all(mask) if is_integer_dtype(dtype) or is_bool_dtype(dtype): na_value_is_na = isna(na_value) @@ -457,6 +455,9 @@ def _str_map_nan_semantics( dtype=np.dtype(cast(type, dtype)), ) if na_value_is_na and mask.any(): + # TODO: we could alternatively do this check before map_infer_mask + # and adjust the dtype/na_value we pass there. Which is more + # performant? if is_integer_dtype(dtype): result = result.astype("float64") else: @@ -465,7 +466,7 @@ def _str_map_nan_semantics( return result else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) + return self._str_map_str_or_object(dtype, na_value, arr, f, mask) # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fdb0259230c7f..cc37995969f0a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,7 +6,6 @@ from typing import ( TYPE_CHECKING, Union, - cast, ) import numpy as np @@ -23,8 +22,6 @@ ) from pandas.core.dtypes.common import ( - is_bool_dtype, - is_integer_dtype, is_scalar, pandas_dtype, ) @@ -281,45 +278,6 @@ def astype(self, dtype, copy: bool = True): _str_map = BaseStringArray._str_map - def _str_map_nan_semantics( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True - ): - if dtype is None: - dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - if is_integer_dtype(dtype): - na_value = np.nan - else: - na_value = False - - dtype = np.dtype(cast(type, dtype)) - if mask.any(): - # numpy int/bool dtypes cannot hold NaNs so we must convert to - # float64 for int (to match maybe_convert_objects) or - # object for bool (again to match maybe_convert_objects) - if is_integer_dtype(dtype): - dtype = np.dtype("float64") - else: - dtype = np.dtype(object) - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=dtype, - ) - return result - - else: - return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ):