From 9b01de4f8c6ed0c7947e7b9cbdab9e0279290360 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Fri, 6 Dec 2024 19:53:09 -0800 Subject: [PATCH 01/22] BUG: Preserve timezone in numpy_dtype for ArrowDtype --- pandas/core/dtypes/dtypes.py | 14 +++++--------- pandas/tests/dtypes/test_dtypes.py | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1dd1b12d6ae95..82671dd3dd23e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2276,21 +2276,17 @@ def name(self) -> str: # type: ignore[override] def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" if pa.types.is_timestamp(self.pyarrow_dtype): - # pa.timestamp(unit).to_pandas_dtype() returns ns units - # regardless of the pyarrow timestamp units. - # This can be removed if/when pyarrow addresses it: - # https://github.com/apache/arrow/issues/34462 + # Preserve timezone information if present + if self.pyarrow_dtype.tz is not None: + # Use PyArrow's to_pandas_dtype method for timezone-aware types + return self.pyarrow_dtype.to_pandas_dtype() + # Fall back to naive datetime64 for timezone-naive timestamps return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") if pa.types.is_duration(self.pyarrow_dtype): - # pa.duration(unit).to_pandas_dtype() returns ns units - # regardless of the pyarrow duration units - # This can be removed if/when pyarrow addresses it: - # https://github.com/apache/arrow/issues/34462 return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( self.pyarrow_dtype ): - # pa.string().to_pandas_dtype() = object which we don't want return np.dtype(str) try: return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index b7e37ff270e60..b7c1af8d0fdfe 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -24,6 +24,7 @@ DatetimeTZDtype, IntervalDtype, PeriodDtype, + ArrowDtype, ) import pandas as pd @@ -1103,6 +1104,32 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) +class TestArrowDtype(Base): + @pytest.fixture + def dtype(self): + """Fixture for ArrowDtype.""" + import pyarrow as pa + return ArrowDtype(pa.timestamp("ns", tz="UTC")) + + def test_numpy_dtype_preserves_timezone(self, dtype): + # Test timezone-aware timestamp + assert dtype.numpy_dtype == dtype.pyarrow_dtype.to_pandas_dtype() + + def test_numpy_dtype_naive_timestamp(self): + import pyarrow as pa + arrow_type = pa.timestamp("ns") + dtype = ArrowDtype(arrow_type) + assert dtype.numpy_dtype == pa.timestamp("ns").to_pandas_dtype() + + @pytest.mark.parametrize("tz", ["UTC", "America/New_York", None]) + def test_numpy_dtype_with_varied_timezones(self, tz): + import pyarrow as pa + arrow_type = pa.timestamp("ns", tz=tz) + dtype = ArrowDtype(arrow_type) + if tz: + assert dtype.numpy_dtype == arrow_type.to_pandas_dtype() + else: + assert dtype.numpy_dtype == pa.timestamp("ns").to_pandas_dtype() @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] From 0e261ee9bb6c098d9760ec07bef2f32951d7fd41 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Fri, 6 Dec 2024 20:11:00 -0800 Subject: [PATCH 02/22] added entry in latest docs/source/whatsnew --- doc/source/whatsnew/v3.0.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab5746eca1b18..d56621cdf53f0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -614,6 +614,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) +- Bug in :class:`ArrowDtype` where `.convert_dtypes(dtype_backend="pyarrow")` stripped timezone information from timezone-aware PyArrow timestamps, resulting in a loss of timezone data. This has been fixed to ensure timezone information is preserved during conversions. (:issue:`60237`) - Bug in :class:`DataFrame` raising ``ValueError`` when ``dtype`` is ``timedelta64`` and ``data`` is a list containing ``None`` (:issue:`60064`) - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) @@ -637,7 +638,7 @@ Timedelta Timezones ^^^^^^^^^ -- +- Fixed an issue where `.convert_dtypes(dtype_backend="pyarrow")` stripped timezone information from timezone-aware PyArrow timestamps. Timezone data is now correctly preserved during conversions. (:issue:`60237`) - Numeric From 46099a0e388ee93c87e1cf49476c948bebe5d195 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Fri, 6 Dec 2024 20:43:37 -0800 Subject: [PATCH 03/22] fixed tests to fail gracefully if pyarrow is not installed in ci/cd --- pandas/tests/dtypes/test_dtypes.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index b7c1af8d0fdfe..e2e69fa3df009 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1104,26 +1104,29 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) +import pytest + class TestArrowDtype(Base): @pytest.fixture def dtype(self): """Fixture for ArrowDtype.""" - import pyarrow as pa + pa = pytest.importorskip("pyarrow") return ArrowDtype(pa.timestamp("ns", tz="UTC")) def test_numpy_dtype_preserves_timezone(self, dtype): + pa = pytest.importorskip("pyarrow") # Test timezone-aware timestamp assert dtype.numpy_dtype == dtype.pyarrow_dtype.to_pandas_dtype() def test_numpy_dtype_naive_timestamp(self): - import pyarrow as pa + pa = pytest.importorskip("pyarrow") arrow_type = pa.timestamp("ns") dtype = ArrowDtype(arrow_type) assert dtype.numpy_dtype == pa.timestamp("ns").to_pandas_dtype() @pytest.mark.parametrize("tz", ["UTC", "America/New_York", None]) def test_numpy_dtype_with_varied_timezones(self, tz): - import pyarrow as pa + pa = pytest.importorskip("pyarrow") arrow_type = pa.timestamp("ns", tz=tz) dtype = ArrowDtype(arrow_type) if tz: From ff6e892348932720d46a8672b605e86ffaf587f2 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Fri, 6 Dec 2024 21:01:16 -0800 Subject: [PATCH 04/22] removing test as ArrowDtype doe not have reset cache methods --- pandas/tests/dtypes/test_dtypes.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index e2e69fa3df009..5a9756fd3e21d 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1104,36 +1104,6 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) -import pytest - -class TestArrowDtype(Base): - @pytest.fixture - def dtype(self): - """Fixture for ArrowDtype.""" - pa = pytest.importorskip("pyarrow") - return ArrowDtype(pa.timestamp("ns", tz="UTC")) - - def test_numpy_dtype_preserves_timezone(self, dtype): - pa = pytest.importorskip("pyarrow") - # Test timezone-aware timestamp - assert dtype.numpy_dtype == dtype.pyarrow_dtype.to_pandas_dtype() - - def test_numpy_dtype_naive_timestamp(self): - pa = pytest.importorskip("pyarrow") - arrow_type = pa.timestamp("ns") - dtype = ArrowDtype(arrow_type) - assert dtype.numpy_dtype == pa.timestamp("ns").to_pandas_dtype() - - @pytest.mark.parametrize("tz", ["UTC", "America/New_York", None]) - def test_numpy_dtype_with_varied_timezones(self, tz): - pa = pytest.importorskip("pyarrow") - arrow_type = pa.timestamp("ns", tz=tz) - dtype = ArrowDtype(arrow_type) - if tz: - assert dtype.numpy_dtype == arrow_type.to_pandas_dtype() - else: - assert dtype.numpy_dtype == pa.timestamp("ns").to_pandas_dtype() - @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] ) From 82623f2ae8164e5807992778ff08d7cb4aaf8c6e Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Fri, 6 Dec 2024 21:05:24 -0800 Subject: [PATCH 05/22] fixing whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d56621cdf53f0..ab92711f8a4b9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -638,7 +638,7 @@ Timedelta Timezones ^^^^^^^^^ -- Fixed an issue where `.convert_dtypes(dtype_backend="pyarrow")` stripped timezone information from timezone-aware PyArrow timestamps. Timezone data is now correctly preserved during conversions. (:issue:`60237`) +- Fixed an issue where ``.convert_dtypes(dtype_backend="pyarrow")`` stripped timezone information from timezone-aware PyArrow timestamps. Timezone data is now correctly preserved during conversions. (:issue:`60237`) - Numeric From b86e696052afcb5a1fde676438eb3169dcb7f2e5 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Fri, 6 Dec 2024 21:08:45 -0800 Subject: [PATCH 06/22] fixing whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab92711f8a4b9..5d8f507c27bb0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -614,7 +614,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) -- Bug in :class:`ArrowDtype` where `.convert_dtypes(dtype_backend="pyarrow")` stripped timezone information from timezone-aware PyArrow timestamps, resulting in a loss of timezone data. This has been fixed to ensure timezone information is preserved during conversions. (:issue:`60237`) +- Bug in :class:`ArrowDtype` where ``.convert_dtypes(dtype_backend="pyarrow")`` stripped timezone information from timezone-aware PyArrow timestamps, resulting in a loss of timezone data. This has been fixed to ensure timezone information is preserved during conversions. (:issue:`60237`) - Bug in :class:`DataFrame` raising ``ValueError`` when ``dtype`` is ``timedelta64`` and ``data`` is a list containing ``None`` (:issue:`60064`) - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) From c1f47359096e6db4d8d40ecef07117bfa86db0df Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Sun, 8 Dec 2024 19:29:08 -0800 Subject: [PATCH 07/22] handling cases where np_dtype is DatetimeTZDtype --- pandas/core/dtypes/dtypes.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 82671dd3dd23e..f8181c4a453b1 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2276,11 +2276,9 @@ def name(self) -> str: # type: ignore[override] def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" if pa.types.is_timestamp(self.pyarrow_dtype): - # Preserve timezone information if present if self.pyarrow_dtype.tz is not None: - # Use PyArrow's to_pandas_dtype method for timezone-aware types - return self.pyarrow_dtype.to_pandas_dtype() - # Fall back to naive datetime64 for timezone-naive timestamps + # Handle timezone-aware timestamps + return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") if pa.types.is_duration(self.pyarrow_dtype): return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") @@ -2289,10 +2287,15 @@ def numpy_dtype(self) -> np.dtype: ): return np.dtype(str) try: - return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) + np_dtype = self.pyarrow_dtype.to_pandas_dtype() + if isinstance(np_dtype, DatetimeTZDtype): + # Convert timezone-aware to naive datetime64 + return np.dtype(f"datetime64[{np_dtype.unit}]") + return np.dtype(np_dtype) except (NotImplementedError, TypeError): return np.dtype(object) + @cache_readonly def kind(self) -> str: if pa.types.is_timestamp(self.pyarrow_dtype): From 720ca433144d8cb73e997733d2fdc760ad2249bb Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Sun, 8 Dec 2024 20:36:16 -0800 Subject: [PATCH 08/22] fixing linting.formatting --- pandas/tests/dtypes/test_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 5a9756fd3e21d..42b0a07cb1c95 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -20,11 +20,11 @@ is_string_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, - ArrowDtype, ) import pandas as pd From 8c074f5e93b59d9dafeaad5d0eb34f3dfccf6e4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 05:07:19 +0000 Subject: [PATCH 09/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/dtypes/dtypes.py | 1 - pandas/tests/dtypes/test_dtypes.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f8181c4a453b1..3ee2b30e886fb 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2295,7 +2295,6 @@ def numpy_dtype(self) -> np.dtype: except (NotImplementedError, TypeError): return np.dtype(object) - @cache_readonly def kind(self) -> str: if pa.types.is_timestamp(self.pyarrow_dtype): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 42b0a07cb1c95..b7e37ff270e60 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -20,7 +20,6 @@ is_string_dtype, ) from pandas.core.dtypes.dtypes import ( - ArrowDtype, CategoricalDtype, DatetimeTZDtype, IntervalDtype, @@ -1104,6 +1103,7 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) + @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] ) From d2e53fc46ab711ba949944e80b7944663fa0cb13 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Sun, 8 Dec 2024 22:11:07 -0800 Subject: [PATCH 10/22] handling macos13 platform specific error --- pandas/core/dtypes/dtypes.py | 43 ++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f8181c4a453b1..b542430947a58 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2274,26 +2274,31 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: - """Return an instance of the related numpy dtype""" - if pa.types.is_timestamp(self.pyarrow_dtype): - if self.pyarrow_dtype.tz is not None: - # Handle timezone-aware timestamps - return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") + """Return an instance of the related numpy dtype.""" + if pa.types.is_timestamp(self.pyarrow_dtype): + if self.pyarrow_dtype.tz is not None: return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") - if pa.types.is_duration(self.pyarrow_dtype): - return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( - self.pyarrow_dtype - ): - return np.dtype(str) - try: - np_dtype = self.pyarrow_dtype.to_pandas_dtype() - if isinstance(np_dtype, DatetimeTZDtype): - # Convert timezone-aware to naive datetime64 - return np.dtype(f"datetime64[{np_dtype.unit}]") - return np.dtype(np_dtype) - except (NotImplementedError, TypeError): - return np.dtype(object) + return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") + if pa.types.is_duration(self.pyarrow_dtype): + return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): + return np.dtype(str) + try: + np_dtype = self.pyarrow_dtype.to_pandas_dtype() + + if isinstance(np_dtype, object): + if hasattr(np_dtype, "categories") and isinstance(np_dtype.categories, pd.IntervalIndex): + return np.dtype(object) + + if isinstance(np_dtype, DatetimeTZDtype): + return np.dtype(f"datetime64[{np_dtype.unit}]") + + return np.dtype(np_dtype) + except (NotImplementedError, TypeError): + return np.dtype(object) + @cache_readonly From a814a26c7c828f0651be68b77db352843b1ba395 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Sun, 8 Dec 2024 22:14:04 -0800 Subject: [PATCH 11/22] handling macos13 platform specific error --- pandas/core/dtypes/dtypes.py | 48 +++++++++++++++++------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index b542430947a58..7f246ec1b519a 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2274,32 +2274,30 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: - """Return an instance of the related numpy dtype.""" - if pa.types.is_timestamp(self.pyarrow_dtype): - if self.pyarrow_dtype.tz is not None: + """Return an instance of the related numpy dtype.""" + if pa.types.is_timestamp(self.pyarrow_dtype): + if self.pyarrow_dtype.tz is not None: + return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") - return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") - if pa.types.is_duration(self.pyarrow_dtype): - return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( - self.pyarrow_dtype - ): - return np.dtype(str) - try: - np_dtype = self.pyarrow_dtype.to_pandas_dtype() - - if isinstance(np_dtype, object): - if hasattr(np_dtype, "categories") and isinstance(np_dtype.categories, pd.IntervalIndex): - return np.dtype(object) - - if isinstance(np_dtype, DatetimeTZDtype): - return np.dtype(f"datetime64[{np_dtype.unit}]") - - return np.dtype(np_dtype) - except (NotImplementedError, TypeError): - return np.dtype(object) - - + if pa.types.is_duration(self.pyarrow_dtype): + return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): + return np.dtype(str) + try: + np_dtype = self.pyarrow_dtype.to_pandas_dtype() + + if isinstance(np_dtype, object): + if hasattr(np_dtype, "categories") and isinstance(np_dtype.categories, pd.IntervalIndex): + return np.dtype(object) + + if isinstance(np_dtype, DatetimeTZDtype): + return np.dtype(f"datetime64[{np_dtype.unit}]") + + return np.dtype(np_dtype) + except (NotImplementedError, TypeError): + return np.dtype(object) @cache_readonly def kind(self) -> str: From a7604c977820590b869b4946a3fd6335c4579e85 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Sun, 8 Dec 2024 22:22:29 -0800 Subject: [PATCH 12/22] fixing macos13 platofrm issue --- pandas/core/dtypes/dtypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 7f246ec1b519a..be1b3fc42133a 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2289,7 +2289,8 @@ def numpy_dtype(self) -> np.dtype: np_dtype = self.pyarrow_dtype.to_pandas_dtype() if isinstance(np_dtype, object): - if hasattr(np_dtype, "categories") and isinstance(np_dtype.categories, pd.IntervalIndex): + from pandas.core.indexes.interval import IntervalIndex + if hasattr(np_dtype, "categories") and isinstance(np_dtype.categories, IntervalIndex): return np.dtype(object) if isinstance(np_dtype, DatetimeTZDtype): From c5bfdf876b467a48df7f731765c0155950d67477 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 06:41:41 +0000 Subject: [PATCH 13/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/dtypes/dtypes.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index be1b3fc42133a..824fc2b6f522e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2287,15 +2287,18 @@ def numpy_dtype(self) -> np.dtype: return np.dtype(str) try: np_dtype = self.pyarrow_dtype.to_pandas_dtype() - + if isinstance(np_dtype, object): from pandas.core.indexes.interval import IntervalIndex - if hasattr(np_dtype, "categories") and isinstance(np_dtype.categories, IntervalIndex): + + if hasattr(np_dtype, "categories") and isinstance( + np_dtype.categories, IntervalIndex + ): return np.dtype(object) - + if isinstance(np_dtype, DatetimeTZDtype): return np.dtype(f"datetime64[{np_dtype.unit}]") - + return np.dtype(np_dtype) except (NotImplementedError, TypeError): return np.dtype(object) From 7b09def455adc9d86b55232f4c7ac18bf9c97d39 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Mon, 9 Dec 2024 20:15:02 -0800 Subject: [PATCH 14/22] fixing unintentional sideffects of trying to handle dtypes on case by case basis. added test --- pandas/core/dtypes/dtypes.py | 23 +++++------------------ pandas/tests/dtypes/test_dtypes.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index be1b3fc42133a..02e37f42a0e24 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2275,29 +2275,16 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype.""" - if pa.types.is_timestamp(self.pyarrow_dtype): - if self.pyarrow_dtype.tz is not None: - return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") - return np.dtype(f"datetime64[{self.pyarrow_dtype.unit}]") - if pa.types.is_duration(self.pyarrow_dtype): - return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( - self.pyarrow_dtype - ): + # For string-like arrow dtypes, pa.string().to_pandas_dtype() = object + # so we handle them explicitly. + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string(self.pyarrow_dtype): return np.dtype(str) + try: np_dtype = self.pyarrow_dtype.to_pandas_dtype() - - if isinstance(np_dtype, object): - from pandas.core.indexes.interval import IntervalIndex - if hasattr(np_dtype, "categories") and isinstance(np_dtype.categories, IntervalIndex): - return np.dtype(object) - - if isinstance(np_dtype, DatetimeTZDtype): - return np.dtype(f"datetime64[{np_dtype.unit}]") - return np.dtype(np_dtype) except (NotImplementedError, TypeError): + # Fallback if something unexpected happens return np.dtype(object) @cache_readonly diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index b7e37ff270e60..c54bfa2086a29 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1103,6 +1103,24 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) +class TestArrowDtype: + @pytest.mark.parametrize( + "tz", + ["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"] + ) + def test_pyarrow_timestamp_tz_preserved(self, tz): + pytest.importorskip("pyarrow") + s = pd.Series( + pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz), + dtype=f"timestamp[ns, tz={tz}][pyarrow]" + ) + + result = s.convert_dtypes(dtype_backend="pyarrow") + assert result.dtype == s.dtype, f"Expected {s.dtype}, got {result.dtype}" + + assert str(result.iloc[0].tzinfo) == str(s.iloc[0].tzinfo) + tm.assert_series_equal(result, s) + @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] From 4eee761a5f24774602e875ab8a2812cb1d695903 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Mon, 9 Dec 2024 20:20:19 -0800 Subject: [PATCH 15/22] fixing syntax error --- pandas/core/dtypes/dtypes.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 832559c92cee6..02e37f42a0e24 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2282,21 +2282,6 @@ def numpy_dtype(self) -> np.dtype: try: np_dtype = self.pyarrow_dtype.to_pandas_dtype() -<<<<<<< HEAD -======= - - if isinstance(np_dtype, object): - from pandas.core.indexes.interval import IntervalIndex - - if hasattr(np_dtype, "categories") and isinstance( - np_dtype.categories, IntervalIndex - ): - return np.dtype(object) - - if isinstance(np_dtype, DatetimeTZDtype): - return np.dtype(f"datetime64[{np_dtype.unit}]") - ->>>>>>> c5bfdf876b467a48df7f731765c0155950d67477 return np.dtype(np_dtype) except (NotImplementedError, TypeError): # Fallback if something unexpected happens From 55f9d8b33bec53faf251d6cb6665b3a20fad3aac Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Mon, 9 Dec 2024 20:36:26 -0800 Subject: [PATCH 16/22] fixing linting errors --- pandas/core/dtypes/dtypes.py | 5 ++++- pandas/tests/dtypes/test_dtypes.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 02e37f42a0e24..f2b63ec3c4a8e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2277,7 +2277,10 @@ def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype.""" # For string-like arrow dtypes, pa.string().to_pandas_dtype() = object # so we handle them explicitly. - if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string(self.pyarrow_dtype): + if ( + pa.types.is_string(self.pyarrow_dtype) + or pa.types.is_large_string(self.pyarrow_dtype) + ): return np.dtype(str) try: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index c54bfa2086a29..21b985e046c39 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1110,7 +1110,7 @@ class TestArrowDtype: ) def test_pyarrow_timestamp_tz_preserved(self, tz): pytest.importorskip("pyarrow") - s = pd.Series( + s = Series( pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz), dtype=f"timestamp[ns, tz={tz}][pyarrow]" ) From 3f3c383fa08cefa4ed19a0900fea3e130f077124 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Dec 2024 04:43:07 +0000 Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/computation/expressions.py | 6 +++++- pandas/core/computation/pytables.py | 4 +++- pandas/core/dtypes/dtypes.py | 5 ++--- pandas/tests/dtypes/test_dtypes.py | 6 +++--- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index a2c3a706ae29c..07fd42c0ac95b 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -257,7 +257,11 @@ def where(cond, left_op, right_op, use_numexpr: bool = True): Whether to try to use numexpr. """ assert _where is not None - return _where(cond, left_op, right_op) if use_numexpr else _where_standard(cond, left_op, right_op) + return ( + _where(cond, left_op, right_op) + if use_numexpr + else _where_standard(cond, left_op, right_op) + ) def set_test_mode(v: bool = True) -> None: diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 4a75acce46632..166c9d47294cd 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -274,7 +274,9 @@ def stringify(value): # string quoting return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column") + raise TypeError( + f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column" + ) def convert_values(self) -> None: pass diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f2b63ec3c4a8e..5539944fc6742 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2277,9 +2277,8 @@ def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype.""" # For string-like arrow dtypes, pa.string().to_pandas_dtype() = object # so we handle them explicitly. - if ( - pa.types.is_string(self.pyarrow_dtype) - or pa.types.is_large_string(self.pyarrow_dtype) + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype ): return np.dtype(str) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 21b985e046c39..2d084c2560998 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1103,16 +1103,16 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) + class TestArrowDtype: @pytest.mark.parametrize( - "tz", - ["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"] + "tz", ["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"] ) def test_pyarrow_timestamp_tz_preserved(self, tz): pytest.importorskip("pyarrow") s = Series( pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz), - dtype=f"timestamp[ns, tz={tz}][pyarrow]" + dtype=f"timestamp[ns, tz={tz}][pyarrow]", ) result = s.convert_dtypes(dtype_backend="pyarrow") From 599579edd5dc725b12cffd700b0cb0bc683e716f Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Mon, 9 Dec 2024 20:47:20 -0800 Subject: [PATCH 18/22] fixing linting error --- pandas/core/computation/expressions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index a2c3a706ae29c..f2d843edceaac 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -108,7 +108,7 @@ def _evaluate_numexpr(op, op_str, left_op, right_op): try: result = ne.evaluate( f"left_value {op_str} right_value", - local_dict={"left_value": left_value, "right_value": right_op}, + local_dict={"left_value": left_value, "right_value": right_value}, casting="safe", ) except TypeError: From 6ba19429003fcdfe15ac8a67ac697fb339125336 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Tue, 10 Dec 2024 19:59:59 -0800 Subject: [PATCH 19/22] handling tz aware ts explicitly, preventing re-deriving of ArrowDtype. Moving test to appropriate file. --- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 27 ++++++++++++++++++++------- pandas/tests/dtypes/test_dtypes.py | 19 ------------------- pandas/tests/extension/test_arrow.py | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 02b9291da9b31..b63145f6b8ad5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1113,7 +1113,7 @@ def convert_dtypes( else: inferred_dtype = input_array.dtype - if dtype_backend == "pyarrow": + if dtype_backend == "pyarrow" and not isinstance(inferred_dtype, ArrowDtype): from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5539944fc6742..788e1cf2bb3ca 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2275,18 +2275,31 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype.""" - # For string-like arrow dtypes, pa.string().to_pandas_dtype() = object - # so we handle them explicitly. - if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( - self.pyarrow_dtype - ): + pa_type = self.pyarrow_dtype + + # handle tz-aware timestamps + if pa.types.is_timestamp(pa_type): + if pa_type.tz is not None: + # preserve tz by NOT calling numpy_dtype for this dtype. + return np.dtype("datetime64[ns]") + else: + # For tz-naive timestamps, just return the corresponding unit + return np.dtype(f"datetime64[{pa_type.unit}]") + + if pa.types.is_duration(pa_type): + return np.dtype(f"timedelta64[{pa_type.unit}]") + + if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): return np.dtype(str) try: - np_dtype = self.pyarrow_dtype.to_pandas_dtype() + np_dtype = pa_type.to_pandas_dtype() + if isinstance(np_dtype, DatetimeTZDtype): + # In theory we shouldn't get here for tz-aware arrow timestamps + # if we've handled them above. This is a fallback. + return np.dtype("datetime64[ns]") return np.dtype(np_dtype) except (NotImplementedError, TypeError): - # Fallback if something unexpected happens return np.dtype(object) @cache_readonly diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 2d084c2560998..319c28fa360d6 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1103,25 +1103,6 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) - -class TestArrowDtype: - @pytest.mark.parametrize( - "tz", ["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"] - ) - def test_pyarrow_timestamp_tz_preserved(self, tz): - pytest.importorskip("pyarrow") - s = Series( - pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz), - dtype=f"timestamp[ns, tz={tz}][pyarrow]", - ) - - result = s.convert_dtypes(dtype_backend="pyarrow") - assert result.dtype == s.dtype, f"Expected {s.dtype}, got {result.dtype}" - - assert str(result.iloc[0].tzinfo) == str(s.iloc[0].tzinfo) - tm.assert_series_equal(result, s) - - @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c6ac6368f2770..6e968d052d1c8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -51,6 +51,7 @@ import pandas as pd import pandas._testing as tm +from pandas import Series from pandas.api.extensions import no_default from pandas.api.types import ( is_bool_dtype, @@ -3505,3 +3506,19 @@ def test_map_numeric_na_action(): result = ser.map(lambda x: 42, na_action="ignore") expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "tz", ["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"] +) +def test_pyarrow_timestamp_tz_preserved(tz): + s = Series( + pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz), + dtype=f"timestamp[ns, tz={tz}][pyarrow]" + ) + + result = s.convert_dtypes(dtype_backend="pyarrow") + assert result.dtype == s.dtype, f"Expected {s.dtype}, got {result.dtype}" + + assert str(result.iloc[0].tzinfo) == str(s.iloc[0].tzinfo) + tm.assert_series_equal(result, s) \ No newline at end of file From e0062fa66ce6b25fa06dee1f447d3ed0651031ad Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Dec 2024 04:11:11 +0000 Subject: [PATCH 20/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/dtypes/test_dtypes.py | 1 + pandas/tests/extension/test_arrow.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 319c28fa360d6..b7e37ff270e60 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1103,6 +1103,7 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) + @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6e968d052d1c8..00016926340f2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -50,8 +50,8 @@ ) import pandas as pd -import pandas._testing as tm from pandas import Series +import pandas._testing as tm from pandas.api.extensions import no_default from pandas.api.types import ( is_bool_dtype, @@ -3514,11 +3514,11 @@ def test_map_numeric_na_action(): def test_pyarrow_timestamp_tz_preserved(tz): s = Series( pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz), - dtype=f"timestamp[ns, tz={tz}][pyarrow]" + dtype=f"timestamp[ns, tz={tz}][pyarrow]", ) result = s.convert_dtypes(dtype_backend="pyarrow") assert result.dtype == s.dtype, f"Expected {s.dtype}, got {result.dtype}" assert str(result.iloc[0].tzinfo) == str(s.iloc[0].tzinfo) - tm.assert_series_equal(result, s) \ No newline at end of file + tm.assert_series_equal(result, s) From b18dea2a7ce22ea911d78a8b6c99af291fe58536 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Tue, 10 Dec 2024 20:20:48 -0800 Subject: [PATCH 21/22] fixing linting error --- pandas/tests/extension/test_arrow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6e968d052d1c8..a7479968e78d0 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -51,7 +51,6 @@ import pandas as pd import pandas._testing as tm -from pandas import Series from pandas.api.extensions import no_default from pandas.api.types import ( is_bool_dtype, @@ -3512,7 +3511,7 @@ def test_map_numeric_na_action(): "tz", ["UTC", "America/New_York", "Europe/London", "Asia/Tokyo"] ) def test_pyarrow_timestamp_tz_preserved(tz): - s = Series( + s = pd.Series( pd.to_datetime(range(5), unit="h", utc=True).tz_convert(tz), dtype=f"timestamp[ns, tz={tz}][pyarrow]" ) From 5d3f5a14a1739c52a6a62d877cfc596d4ab855b9 Mon Sep 17 00:00:00 2001 From: Koookadooo Date: Tue, 10 Dec 2024 20:27:38 -0800 Subject: [PATCH 22/22] fixing linting error --- pandas/tests/extension/test_arrow.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index b7a9b6a1d9d6c..1942f481684cb 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -50,10 +50,6 @@ ) import pandas as pd -<<<<<<< HEAD -======= -from pandas import Series ->>>>>>> e0062fa66ce6b25fa06dee1f447d3ed0651031ad import pandas._testing as tm from pandas.api.extensions import no_default from pandas.api.types import (