Skip to content

Commit

Permalink
backwards compatible refactor of time coding, preparing for relaxing …
Browse files Browse the repository at this point in the history
…nanosecond-restriction
  • Loading branch information
kmuehlbauer committed Dec 18, 2024
1 parent da78bcc commit b55b903
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 36 deletions.
112 changes: 79 additions & 33 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
from xarray.core.duck_array_ops import asarray, ravel, reshape
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pdcompat import nanosecond_precision_timestamp, timestamp_as_unit
from xarray.core.utils import attempt_import, emit_user_level_warning
from xarray.core.variable import Variable
from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
Expand All @@ -36,7 +36,11 @@
except ImportError:
cftime = None

from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray
from xarray.core.types import (
CFCalendar,
NPDatetimeUnitOptions,
T_DuckArray,
)

T_Name = Union[Hashable, None]

Expand Down Expand Up @@ -197,15 +201,18 @@ def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp:
return date


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
def _unpack_time_unit_and_ref_date(
units: str,
) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]:
# same us _unpack_netcdf_time_units but finalizes ref_date for
# processing in encode_cf_datetime
time_units, _ref_date = _unpack_netcdf_time_units(units)
time_unit, _ref_date = _unpack_netcdf_time_units(units)
time_unit = _netcdf_to_numpy_timeunit(time_unit)
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(_ref_date)
ref_date = _maybe_strip_tz_from_timestamp(ref_date)
return time_units, ref_date
return time_unit, ref_date


def _decode_cf_datetime_dtype(
Expand Down Expand Up @@ -252,6 +259,30 @@ def _decode_datetime_with_cftime(
return np.array([], dtype=object)


def _check_date_for_units_since_refdate(
date, unit: str, ref_date: pd.Timestamp
) -> pd.Timestamp:
# check for out-of-bounds floats and raise
if date > np.iinfo("int64").max or date < np.iinfo("int64").min:
raise OutOfBoundsTimedelta(
f"Value {date} can't be represented as Datetime/Timedelta."
)
delta = date * np.timedelta64(1, unit)
if not np.isnan(delta):
# this will raise on dtype overflow for integer dtypes
if date.dtype.kind in "iu" and not np.int64(delta) == date:
raise OutOfBoundsTimedelta(
"DType overflow in Datetime/Timedelta calculation."
)
# this will raise on overflow if ref_date + delta
# can't be represented in the current ref_date resolution
return timestamp_as_unit(ref_date + delta, ref_date.unit)
else:
# if date is exactly NaT (np.iinfo("int64").min) return refdate
# to make follow-up checks work
return ref_date


def _decode_datetime_with_pandas(
flat_num_dates: np.ndarray, units: str, calendar: str
) -> np.ndarray:
Expand All @@ -270,12 +301,8 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind == "u":
flat_num_dates = flat_num_dates.astype(np.uint64)

time_units, ref_date_str = _unpack_netcdf_time_units(units)
time_units = _netcdf_to_numpy_timeunit(time_units)
try:
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(ref_date_str)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
except ValueError as err:
# ValueError is raised by pd.Timestamp for non-ISO timestamp
# strings, in which case we fall back to using cftime
Expand All @@ -285,8 +312,12 @@ def _decode_datetime_with_pandas(
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
if flat_num_dates.size > 0:
# avoid size 0 datetimes GH1329
pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date
_check_date_for_units_since_refdate(
flat_num_dates.min(), time_unit, ref_date
)
_check_date_for_units_since_refdate(
flat_num_dates.max(), time_unit, ref_date
)

# To avoid integer overflow when converting to nanosecond units for integer
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
Expand All @@ -299,20 +330,24 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind in "f":
flat_num_dates = flat_num_dates.astype(np.float64)

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
# properly handle NaN/NaT to prevent casting NaN to int
# keep NaT/nan mask
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)
# in case we need to change the unit, we fix the numbers here
# this should be safe, as errors would have been raised above
ns_time_unit = _NS_PER_TIME_DELTA[time_unit]
ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit]
if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_ref_date_unit):
flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit)
time_unit = ref_date.unit

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
# ensures that we do not encounter integer overflow at any point in the
# process without raising OutOfBoundsDatetime.
return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
# Cast input ordinals to integers and properly handle NaN/NaT
# to prevent casting NaN to int
flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_int[nan] = np.iinfo(np.int64).min
flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64)

# cast to timedelta64[time_unit] and add to ref_date
return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]")


def decode_cf_datetime(
Expand Down Expand Up @@ -344,10 +379,16 @@ def decode_cf_datetime(
dates = _decode_datetime_with_cftime(
flat_num_dates.astype(float), units, calendar
)
# retrieve cftype
cftype = type(dates[np.nanargmin(num_dates)])
# "ns" boarders
# between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807']
lower = cftype(1677, 9, 21, 0, 12, 43, 145224)
upper = cftype(2262, 4, 11, 23, 47, 16, 854775)

if (
dates[np.nanargmin(num_dates)].year < 1678
or dates[np.nanargmax(num_dates)].year >= 2262
dates[np.nanargmin(num_dates)] < lower
or dates[np.nanargmax(num_dates)] > upper
):
if _is_standard_calendar(calendar):
warnings.warn(
Expand Down Expand Up @@ -768,8 +809,8 @@ def _eagerly_encode_cf_datetime(
raise OutOfBoundsDatetime
assert dates.dtype == "datetime64[ns]"

time_units, ref_date = _unpack_time_units_and_ref_date(units)
time_delta = _time_units_to_timedelta64(time_units)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
time_delta = np.timedelta64(1, time_unit)

# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
Expand All @@ -778,16 +819,17 @@ def _eagerly_encode_cf_datetime(
time_deltas = dates_as_index - ref_date

# retrieve needed units to faithfully encode to int64
needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units)
needed_units = _numpy_to_netcdf_timeunit(needed_unit)
if data_units != units:
# this accounts for differences in the reference times
ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
data_delta = _time_units_to_timedelta64(needed_units)
data_delta = np.timedelta64(1, needed_unit)
if (ref_delta % data_delta) > np.timedelta64(0, "ns"):
needed_units = _infer_time_units_from_diff(ref_delta)

# needed time delta to encode faithfully to int64
needed_time_delta = _time_units_to_timedelta64(needed_units)
needed_time_delta = _unit_timedelta_numpy(needed_units)

floor_division = np.issubdtype(dtype, np.integer) or dtype is None
if time_delta > needed_time_delta:
Expand All @@ -800,6 +842,7 @@ def _eagerly_encode_cf_datetime(
f"Set encoding['dtype'] to floating point dtype to silence this warning."
)
elif np.issubdtype(dtype, np.integer) and allow_units_modification:
floor_division = True
new_units = f"{needed_units} since {format_timestamp(ref_date)}"
emit_user_level_warning(
f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
Expand All @@ -809,9 +852,12 @@ def _eagerly_encode_cf_datetime(
)
units = new_units
time_delta = needed_time_delta
floor_division = True

num = _division(time_deltas, time_delta, floor_division)
# get resolution of TimedeltaIndex and align time_delta
# todo: check, if this works in any case
num = _division(
time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division
)
num = reshape(num.values, dates.shape)

except (OutOfBoundsDatetime, OverflowError, ValueError):
Expand Down
17 changes: 16 additions & 1 deletion xarray/core/pdcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@
from __future__ import annotations

from enum import Enum
from typing import Literal
from typing import Literal, cast

import pandas as pd
from packaging.version import Version

from xarray.core.types import PDDatetimeUnitOptions


def count_not_none(*args) -> int:
"""Compute the number of non-None arguments.
Expand Down Expand Up @@ -73,6 +75,19 @@ def __repr__(self) -> str:
NoDefault = Literal[_NoDefault.no_default] # For typing following pandas


def timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp:
# compatibility function for pandas issue
# where "as_unit" is not defined for pandas.Timestamp
# in pandas versions < 2.2
# can be removed minimum pandas version is >= 2.2
unit = cast(PDDatetimeUnitOptions, unit)
if hasattr(date, "as_unit"):
date = date.as_unit(unit)
elif hasattr(date, "_as_unit"):
date = date._as_unit(unit)
return date


def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
"""Return a nanosecond-precision Timestamp object.
Expand Down
4 changes: 2 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def test_decode_cf_datetime_overflow() -> None:
units = "days since 2000-01-01 00:00:00"

# date after 2262 and before 1678
days = (-117608, 95795)
expected = (datetime(1677, 12, 31), datetime(2262, 4, 12))
days = (-117710, 95795)
expected = (datetime(1677, 9, 20), datetime(2262, 4, 12))

for i, day in enumerate(days):
with warnings.catch_warnings():
Expand Down

0 comments on commit b55b903

Please sign in to comment.