Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Preserve timezone in numpy_dtype for ArrowDtype #60514

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9b01de4
BUG: Preserve timezone in numpy_dtype for ArrowDtype
Koookadooo Dec 7, 2024
0e261ee
added entry in latest docs/source/whatsnew
Koookadooo Dec 7, 2024
46099a0
fixed tests to fail gracefully if pyarrow is not installed in ci/cd
Koookadooo Dec 7, 2024
ff6e892
removing test as ArrowDtype does not have reset cache methods
Koookadooo Dec 7, 2024
82623f2
fixing whatsnew
Koookadooo Dec 7, 2024
b86e696
fixing whatsnew
Koookadooo Dec 7, 2024
c1f4735
handling cases where np_dtype is DatetimeTZDtype
Koookadooo Dec 9, 2024
720ca43
fixing linting/formatting
Koookadooo Dec 9, 2024
8c074f5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 9, 2024
d2e53fc
handling macos13 platform specific error
Koookadooo Dec 9, 2024
a814a26
handling macos13 platform specific error
Koookadooo Dec 9, 2024
0525509
Merge branch 'bugfix-arrowdtypes-timezone' of https://github.com/Kooo…
Koookadooo Dec 9, 2024
a7604c9
fixing macos13 platform issue
Koookadooo Dec 9, 2024
c5bfdf8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 9, 2024
7b09def
fixing unintentional side effects of trying to handle dtypes on case by…
Koookadooo Dec 10, 2024
02b291c
fixing merge conflict
Koookadooo Dec 10, 2024
4eee761
fixing syntax error
Koookadooo Dec 10, 2024
c6e6c51
Merge branch 'main' into bugfix-arrowdtypes-timezone
Koookadooo Dec 10, 2024
55f9d8b
fixing linting errors
Koookadooo Dec 10, 2024
0dd8367
Merge branch 'bugfix-arrowdtypes-timezone' of https://github.com/Kooo…
Koookadooo Dec 10, 2024
3f3c383
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 10, 2024
599579e
fixing linting error
Koookadooo Dec 10, 2024
f1a0ede
Merge branch 'bugfix-arrowdtypes-timezone' of https://github.com/Kooo…
Koookadooo Dec 10, 2024
6ba1942
handling tz aware ts explicitly, preventing re-deriving of ArrowDtype…
Koookadooo Dec 11, 2024
e0062fa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 11, 2024
b18dea2
fixing linting error
Koookadooo Dec 11, 2024
400a2a0
fixing linting errors
Koookadooo Dec 11, 2024
5d3f5a1
fixing linting error
Koookadooo Dec 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,7 @@ Categorical
Datetimelike
^^^^^^^^^^^^
- Bug in :attr:`is_year_start` where a :class:`DatetimeIndex` constructed via :func:`date_range` with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`)
- Bug in :class:`ArrowDtype` where ``.convert_dtypes(dtype_backend="pyarrow")`` stripped the timezone from timezone-aware PyArrow timestamps; the timezone is now preserved during conversion (:issue:`60237`)
- Bug in :class:`DataFrame` raising ``ValueError`` when ``dtype`` is ``timedelta64`` and ``data`` is a list containing ``None`` (:issue:`60064`)
- Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
Expand All @@ -638,7 +639,7 @@ Timedelta

Timezones
^^^^^^^^^
-
- Bug where ``.convert_dtypes(dtype_backend="pyarrow")`` stripped the timezone from timezone-aware PyArrow timestamps; the timezone is now preserved during conversion (:issue:`60237`)
-

Numeric
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/computation/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def _evaluate_numexpr(op, op_str, left_op, right_op):
try:
result = ne.evaluate(
f"left_value {op_str} right_value",
local_dict={"left_value": left_value, "right_value": right_op},
local_dict={"left_value": left_value, "right_value": right_value},
casting="safe",
)
except TypeError:
Expand Down Expand Up @@ -257,7 +257,11 @@ def where(cond, left_op, right_op, use_numexpr: bool = True):
Whether to try to use numexpr.
"""
assert _where is not None
return _where(cond, left_op, right_op) if use_numexpr else _where_standard(cond, left_op, right_op)
return (
_where(cond, left_op, right_op)
if use_numexpr
else _where_standard(cond, left_op, right_op)
)


def set_test_mode(v: bool = True) -> None:
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/computation/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,9 @@ def stringify(value):
# string quoting
return TermValue(conv_val, stringify(conv_val), "string")
else:
raise TypeError(f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column")
raise TypeError(
f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column"
)

def convert_values(self) -> None:
pass
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,7 +1113,7 @@ def convert_dtypes(
else:
inferred_dtype = input_array.dtype

if dtype_backend == "pyarrow":
if dtype_backend == "pyarrow" and not isinstance(inferred_dtype, ArrowDtype):
from pandas.core.arrays.arrow.array import to_pyarrow_type
from pandas.core.arrays.string_ import StringDtype

Expand Down
41 changes: 23 additions & 18 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2274,26 +2274,31 @@ def name(self) -> str: # type: ignore[override]

@cache_readonly
def numpy_dtype(self) -> np.dtype:
    """
    Return an instance of the related numpy dtype.

    Returns
    -------
    np.dtype
        * ``datetime64[unit]`` for pyarrow timestamps, tz-naive or tz-aware.
        * ``timedelta64[unit]`` for pyarrow durations.
        * ``np.dtype(str)`` for pyarrow ``string`` / ``large_string``.
        * Otherwise ``np.dtype(pyarrow_dtype.to_pandas_dtype())``, falling
          back to ``np.dtype(object)`` when that raises
          ``NotImplementedError`` or ``TypeError``.

    Notes
    -----
    A numpy dtype has no slot for a timezone, so for tz-aware timestamps
    only the unit is reflected here; the timezone remains available on
    ``self.pyarrow_dtype.tz``. The unit is always taken from the pyarrow
    type so that e.g. a ``us`` timestamp is not silently reported as ``ns``.
    """
    pa_type = self.pyarrow_dtype

    if pa.types.is_timestamp(pa_type):
        # pa.timestamp(unit).to_pandas_dtype() returns ns units
        # regardless of the pyarrow timestamp units, so build the dtype
        # from the pyarrow unit directly (for tz-aware types as well).
        # This can be removed if/when pyarrow addresses it:
        # https://github.com/apache/arrow/issues/34462
        return np.dtype(f"datetime64[{pa_type.unit}]")

    if pa.types.is_duration(pa_type):
        # pa.duration(unit).to_pandas_dtype() returns ns units
        # regardless of the pyarrow duration units
        # This can be removed if/when pyarrow addresses it:
        # https://github.com/apache/arrow/issues/34462
        return np.dtype(f"timedelta64[{pa_type.unit}]")

    if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
        # pa.string().to_pandas_dtype() = object which we don't want
        return np.dtype(str)

    try:
        return np.dtype(pa_type.to_pandas_dtype())
    except (NotImplementedError, TypeError):
        # No numpy equivalent for this pyarrow type
        return np.dtype(object)

Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3505,3 +3505,19 @@ def test_map_numeric_na_action():
result = ser.map(lambda x: 42, na_action="ignore")
expected = pd.Series([42.0, 42.0, np.nan], dtype="float64")
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "tz",
    [
        "UTC",
        "America/New_York",
        "Europe/London",
        "Asia/Tokyo",
    ],
)
def test_pyarrow_timestamp_tz_preserved(tz):
    """
    Check that ``convert_dtypes(dtype_backend="pyarrow")`` keeps the dtype,
    the per-element tzinfo and the values of a tz-aware pyarrow timestamp
    Series unchanged.
    """
    # Five hourly timestamps, localized to UTC and then shifted to ``tz``
    hourly = pd.to_datetime(range(5), unit="h", utc=True)
    localized = hourly.tz_convert(tz)
    s = pd.Series(localized, dtype=f"timestamp[ns, tz={tz}][pyarrow]")

    result = s.convert_dtypes(dtype_backend="pyarrow")

    # The pyarrow-backed dtype (including its timezone) must survive as-is
    assert result.dtype == s.dtype, f"Expected {s.dtype}, got {result.dtype}"

    # Scalars pulled out of the result carry the same timezone as the input
    expected_tzinfo = str(s.iloc[0].tzinfo)
    assert str(result.iloc[0].tzinfo) == expected_tzinfo

    tm.assert_series_equal(result, s)
Loading