diff --git a/doc/api.rst b/doc/api.rst
index 342ae08e1a4..f731ac1c59a 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -1096,6 +1096,17 @@ DataTree methods
 .. Missing:
 .. ``open_mfdatatree``
 
+Encoding/Decoding
+=================
+
+Coder objects
+-------------
+
+.. autosummary::
+   :toctree: generated/
+
+   coders.CFDatetimeCoder
+
 Coordinates objects
 ===================
 
diff --git a/doc/internals/index.rst b/doc/internals/index.rst
index b2a37900338..4c00376a7b4 100644
--- a/doc/internals/index.rst
+++ b/doc/internals/index.rst
@@ -26,3 +26,4 @@ The pages in this section are intended for:
    how-to-add-new-backend
    how-to-create-custom-index
    zarr-encoding-spec
+   time-coding
diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst
new file mode 100644
index 00000000000..2ad3f11b4d2
--- /dev/null
+++ b/doc/internals/time-coding.rst
@@ -0,0 +1,442 @@
+.. ipython:: python
+    :suppress:
+
+    import numpy as np
+    import pandas as pd
+    import xarray as xr
+
+    np.random.seed(123456)
+    np.set_printoptions(threshold=20)
+    int64_max = np.iinfo("int64").max
+    int64_min = np.iinfo("int64").min + 1
+    uint64_max = np.iinfo("uint64").max
+
+.. _internals.timecoding:
+
+Time Coding
+===========
+
+This page gives an overview of how xarray encodes and decodes times, and which conventions and functions are used.
+
+Pandas functionality
+--------------------
+
+to_datetime
+~~~~~~~~~~~
+
+The function :py:func:`pandas.to_datetime` is used within xarray for inferring units and for testing purposes.
+
+In normal operation :py:func:`pandas.to_datetime` returns a :py:class:`pandas.Timestamp` (for scalar input) or a :py:class:`pandas.DatetimeIndex` (for array-like input), which are datetime64 values with the resolution inherited from the source. If no resolution can be inherited, ``'ns'`` is assumed. This implies that the maximum usable time range for those cases is roughly +/- 292 years centered around the epoch. To account for that, xarray carefully checks the units/resolution in the encoding and decoding steps.
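+
+A small sketch of this behavior (the exact dtypes assume pandas >= 2.0): datetime64 input keeps its unit, while string input falls back to nanosecond resolution:
+
+.. ipython:: python
+
+    print(pd.to_datetime(np.array(["2000-01-01"], dtype="datetime64[s]")).dtype)
+    print(pd.to_datetime(["2000-01-01"]).dtype)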
+
+When the arguments are numeric (not strings), "unit" can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``.
+
+.. ipython:: python
+
+    f"Maximum datetime range: ({pd.to_datetime(int64_min, unit='ns')}, {pd.to_datetime(int64_max, unit='ns')})"
+
+For input values which can't be represented in nanosecond resolution a :py:class:`pandas.OutOfBoundsDatetime` exception is raised:
+
+.. ipython:: python
+
+    try:
+        dtime = pd.to_datetime(int64_max, unit="us")
+    except Exception as err:
+        print(err)
+    try:
+        dtime = pd.to_datetime(uint64_max, unit="ns")
+        print("Wrong:", dtime)
+        dtime = pd.to_datetime([uint64_max], unit="ns")
+    except Exception as err:
+        print(err)
+
+Numpy datetime64 values can be extracted with :py:meth:`pandas.Timestamp.to_numpy` and :py:meth:`pandas.DatetimeIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timestamp.as_unit` and :py:meth:`pandas.DatetimeIndex.as_unit` respectively.
+
+``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as an argument. That means we are able to represent datetimes with second, millisecond, microsecond or nanosecond resolution.
+
+.. ipython:: python
+
+    time = pd.to_datetime(np.datetime64(0, "D"))
+    print("Datetime:", time, np.asarray([time.to_numpy()]).dtype)
+    print("Datetime as_unit('s'):", time.as_unit("s"))
+    print("Datetime to_numpy():", time.as_unit("s").to_numpy())
+    time = pd.to_datetime(np.array([-1000, 1, 2], dtype="datetime64[Y]"))
+    print("DatetimeIndex:", time)
+    print("DatetimeIndex as_unit('s'):", time.as_unit("s"))
+    print("DatetimeIndex to_numpy():", time.as_unit("s").to_numpy())
+
+.. warning::
+    Input data with a resolution higher than ``'ns'`` (e.g. ``'ps'``, ``'fs'``, ``'as'``) is truncated (not rounded) at the ``'ns'`` level. This is currently broken for ``'ps'`` input, which is interpreted as ``'ns'``.
+
+    .. ipython:: python
+
+        try:
+            print("Good:", pd.to_datetime([np.datetime64(1901901901901, "as")]))
+            print("Good:", pd.to_datetime([np.datetime64(1901901901901, "fs")]))
+            print(" Bad:", pd.to_datetime([np.datetime64(1901901901901, "ps")]))
+            print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ns")]))
+            print("Good:", pd.to_datetime([np.datetime64(1901901901901, "us")]))
+            print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ms")]))
+            print(
+                "Good:", pd.to_datetime(np.array([np.datetime64(1901901901901, "s")]))
+            )
+            print("Bad:", pd.to_datetime([np.datetime64(1901901901901, "s")]))
+        except Exception as err:
+            print("Raise:", err)
+
+.. warning::
+    Care has to be taken, as some configurations of input data will raise. The following shows that it is safe to use :py:func:`pandas.to_datetime` when providing a :py:class:`numpy.datetime64` scalar or a numpy array as input.
+
+    .. ipython:: python
+
+        print(
+            "Works:",
+            np.datetime64(1901901901901, "s"),
+            pd.to_datetime(np.datetime64(1901901901901, "s")),
+        )
+        print(
+            "Works:",
+            np.array([np.datetime64(1901901901901, "s")]),
+            pd.to_datetime(np.array([np.datetime64(1901901901901, "s")])),
+        )
+        try:
+            pd.to_datetime([np.datetime64(1901901901901, "s")])
+        except Exception as err:
+            print("Raises:", err)
+        try:
+            pd.to_datetime(1901901901901, unit="s")
+        except Exception as err:
+            print("Raises:", err)
+        try:
+            pd.to_datetime([1901901901901], unit="s")
+        except Exception as err:
+            print("Raises:", err)
+        try:
+            pd.to_datetime(np.array([1901901901901]), unit="s")
+        except Exception as err:
+            print("Raises:", err)
+
+to_timedelta
+~~~~~~~~~~~~
+
+The function :py:func:`pandas.to_timedelta` is used within xarray for inferring units and for testing purposes.
+
+In normal operation :py:func:`pandas.to_timedelta` returns a :py:class:`pandas.Timedelta` (for scalar input) or a :py:class:`pandas.TimedeltaIndex` (for array-like input), which are timedelta64 values with ``ns`` resolution internally. This implies that the usable timedelta covers only roughly 585 years. To account for that, we work around that limitation in the encoding and decoding steps.
+
+.. ipython:: python
+
+    f"Maximum timedelta range: ({pd.to_timedelta(int64_min, unit='ns')}, {pd.to_timedelta(int64_max, unit='ns')})"
+
+For input values which can't be represented in nanosecond resolution a :py:class:`pandas.OutOfBoundsTimedelta` exception is raised:
+
+.. ipython:: python
+
+    try:
+        delta = pd.to_timedelta(int64_max, unit="us")
+    except Exception as err:
+        print("First:", err)
+    try:
+        delta = pd.to_timedelta(uint64_max, unit="ns")
+    except Exception as err:
+        print("Second:", err)
+
+When the arguments are numeric (not strings), "unit" can be anything from ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``.
+
+Numpy timedelta64 values can be extracted with :py:meth:`pandas.Timedelta.to_numpy` and :py:meth:`pandas.TimedeltaIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timedelta.as_unit` and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively.
+
+``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as an argument. That means we are able to represent timedeltas with second, millisecond, microsecond or nanosecond resolution.
+
+.. ipython:: python
+
+    delta = pd.to_timedelta(1, unit="D")
+    print("Timedelta:", delta)
+    print("Timedelta as_unit('s'):", delta.as_unit("s"))
+    print("Timedelta to_numpy():", delta.as_unit("s").to_numpy())
+    delta = pd.to_timedelta([0, 1, 2], unit="D")
+    print("TimedeltaIndex:", delta)
+    print("TimedeltaIndex as_unit('s'):", delta.as_unit("s"))
+    print("TimedeltaIndex to_numpy():", delta.as_unit("s").to_numpy())
+
+.. note::
+    For the functionality in xarray the output resolution is converted from ``'ns'`` to the lowest needed resolution.
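+
+As a quick sketch of that behavior, the internal helper ``xr.coding.times.decode_cf_timedelta`` (an implementation detail, not public API) decodes ``days`` into the lowest fitting resolution ``'s'`` instead of ``'ns'``:
+
+.. ipython:: python
+
+    xr.coding.times.decode_cf_timedelta(np.array([20, 40, 60]), units="days")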
+
+.. warning::
+    Care has to be taken, as some configurations of input data will raise. The following shows that it is safe to use :py:func:`pandas.to_timedelta` when providing a :py:class:`numpy.timedelta64` scalar or a numpy array as input.
+
+    .. ipython:: python
+
+        print(
+            "Works:",
+            np.timedelta64(1901901901901, "s"),
+            pd.to_timedelta(np.timedelta64(1901901901901, "s")),
+        )
+        print(
+            "Works:",
+            np.array([np.timedelta64(1901901901901, "s")]),
+            pd.to_timedelta(np.array([np.timedelta64(1901901901901, "s")])),
+        )
+        try:
+            pd.to_timedelta([np.timedelta64(1901901901901, "s")])
+        except Exception as err:
+            print("Raises:", err)
+        try:
+            pd.to_timedelta(1901901901901, unit="s")
+        except Exception as err:
+            print("Raises:", err)
+        try:
+            pd.to_timedelta([1901901901901], unit="s")
+        except Exception as err:
+            print("Raises:", err)
+        try:
+            pd.to_timedelta(np.array([1901901901901]), unit="s")
+        except Exception as err:
+            print("Raises:", err)
+
+Timestamp
+~~~~~~~~~
+
+:py:class:`pandas.Timestamp` is used within xarray to wrap strings of CF reference times and :py:class:`datetime.datetime` objects.
+
+When the arguments are numeric (not strings), "unit" can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``.
+
+In normal operation :py:class:`pandas.Timestamp` holds the timestamp in the provided resolution, but only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is truncated to ``'ns'``.
+
+The same conversion rules apply here as for :py:func:`pandas.to_timedelta` (see above). Depending on the internal resolution, Timestamps can be represented in the range:
+
+.. ipython:: python
+
+    for unit in ["s", "ms", "us", "ns"]:
+        print(
+            f"unit: {unit!r} time range ({pd.Timestamp(int64_min, unit=unit)}, {pd.Timestamp(int64_max, unit=unit)})"
+        )
+
+Since the resolution restriction has been relaxed, this extends the range to several hundred thousand centuries with microsecond representation. ``NaT`` is represented as ``np.iinfo("int64").min`` in all of the different representations.
+
+.. warning::
+    When initialized with a datetime string, this is only defined from ``-9999-01-01`` to ``9999-12-31``.
+
+    .. ipython:: python
+
+        try:
+            print("Works:", pd.Timestamp("-9999-01-01 00:00:00"))
+            print("Works, too:", pd.Timestamp("9999-12-31 23:59:59"))
+            print(pd.Timestamp("10000-01-01 00:00:00"))
+        except Exception as err:
+            print("Errors:", err)
+
+.. note::
+    :py:class:`pandas.Timestamp` is currently the only way to correctly import time reference strings. It handles non-ISO formatted strings, keeps the resolution of the strings (``'s'``, ``'ms'`` etc.) and imports time zones. When initialized with :py:class:`numpy.datetime64` instead of a string, it even overcomes the above limitation of the possible time range.
+
+    .. ipython:: python
+
+        try:
+            print("Handles non-ISO:", pd.Timestamp("92-1-8 151542"))
+            print(
+                "Keeps resolution 1:",
+                pd.Timestamp("1992-10-08 15:15:42"),
+                pd.Timestamp("1992-10-08 15:15:42").unit,
+            )
+            print(
+                "Keeps resolution 2:",
+                pd.Timestamp("1992-10-08 15:15:42.5"),
+                pd.Timestamp("1992-10-08 15:15:42.5").unit,
+            )
+            print(
+                "Keeps timezone:",
+                pd.Timestamp("1992-10-08 15:15:42.5 -6:00"),
+                pd.Timestamp("1992-10-08 15:15:42.5 -6:00").unit,
+            )
+            print(
+                "Extends timerange:",
+                pd.Timestamp(np.datetime64("-10000-10-08 15:15:42.5001")),
+                pd.Timestamp(np.datetime64("-10000-10-08 15:15:42.5001")).unit,
+            )
+        except Exception as err:
+            print("Errors:", err)
+
+DatetimeIndex
+~~~~~~~~~~~~~
+
+:py:class:`pandas.DatetimeIndex` is used to wrap numpy datetime64 values or other datetime-likes when encoding. The resolution of the DatetimeIndex depends on the input, but can only be one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is truncated to ``'ns'``.
+:py:class:`pandas.DatetimeIndex` will raise :py:class:`pandas.OutOfBoundsDatetime` if the input can't be represented in the given resolution.
+
+.. note::
+    For xarray we assume that all :py:class:`numpy.datetime64` values provided to :py:class:`pandas.DatetimeIndex` are up to spec. This is especially true when those values have been decoded upfront. If the data is provided by users, they should handle any issues beforehand.
+
+.. ipython:: python
+
+    try:
+        print(
+            "Works:",
+            pd.DatetimeIndex(
+                np.array(["1992-01-08", "1992-01-09"], dtype="datetime64[D]")
+            ),
+        )
+        print(
+            "Works:",
+            pd.DatetimeIndex(
+                np.array(
+                    ["1992-01-08 15:15:42", "1992-01-09 15:15:42"],
+                    dtype="datetime64[s]",
+                )
+            ),
+        )
+        print(
+            "Works:",
+            pd.DatetimeIndex(
+                np.array(
+                    ["1992-01-08 15:15:42.5", "1992-01-09 15:15:42.0"],
+                    dtype="datetime64[ms]",
+                )
+            ),
+        )
+        print(
+            "Works:",
+            pd.DatetimeIndex(
+                np.array(
+                    ["1970-01-01 00:00:00.401501601701801901", "1970-01-01 00:00:00"],
+                    dtype="datetime64[as]",
+                )
+            ),
+        )
+        print(
+            "Works:",
+            pd.DatetimeIndex(
+                np.array(
+                    ["-10000-01-01 00:00:00.401501", "1970-01-01 00:00:00"],
+                    dtype="datetime64[us]",
+                )
+            ),
+        )
+    except Exception as err:
+        print("Errors:", err)
+
+CF Conventions Time Handling
+----------------------------
+
+Xarray tries to adhere to the latest version of the `CF Conventions`_. Relevant here are the section on `Time Coordinate`_ and the `Calendar`_ subsection.
+
+.. _CF Conventions: https://cfconventions.org
+.. _Time Coordinate: https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#time-coordinate
+.. _Calendar: https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#calendar
+
+CF time decoding
+~~~~~~~~~~~~~~~~
+
+Decoding of ``values`` with a time unit specification like ``seconds since 1992-10-8 15:15:42.5 -6:00`` into datetimes (using the CF convention) is a multistage process.
+
+1. If we have a non-standard calendar (e.g. ``noleap``), decoding is done with the ``cftime`` package (which is not covered in this section). For the ``standard``/``gregorian`` calendar as well as ``proleptic_gregorian``, the above outlined pandas functionality is used.
+
+2. The ``standard``/``gregorian`` calendar and ``proleptic_gregorian`` are equivalent for any dates and reference times >= ``1582-10-15``. First the reference time is checked and any timezone information is stripped off. In a second step, the minimum and maximum ``values`` are checked to see whether they can be represented in the current reference time resolution; at the same time, integer overflow would be caught. For the ``standard``/``gregorian`` calendar the dates are additionally checked to be >= ``1582-10-15``. If anything fails, the decoding is done with ``cftime``.
+
+3. As the unit (here ``seconds``) and the resolution of the reference time ``1992-10-8 15:15:42.5 -6:00`` (here ``milliseconds``) might be different, they have to be aligned to the higher resolution (retrieving a new unit). Users may also specify their wanted target resolution by setting the kwarg ``time_unit`` to one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` (default ``'ns'``), which is included in the alignment process. The alignment itself is done by multiplying the ``values`` by the ratio of nanoseconds per time unit and nanoseconds per reference time unit (a small numeric sketch of this follows the list). To not break consistency for ``NaT``, a mask is kept and re-introduced after the multiplication.
+
+4. Times encoded as floating point values are checked for fractional parts and the resolution is enhanced in an iterative process until a fitting resolution (or ``'ns'``) is found. A ``SerializationWarning`` is issued to make the user aware of the possibly problematic encoding.
+
+5. Finally, the ``values`` (``int64``) are cast to ``datetime64[unit]`` (using the above retrieved unit) and added to the reference time :py:class:`pandas.Timestamp`.
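+
+To make step 3 concrete, here is a tiny numeric sketch (plain numpy with hypothetical numbers, mirroring the internal multiplication) of aligning values given in ``seconds`` to a reference date with ``millisecond`` resolution:
+
+.. ipython:: python
+
+    ns_per_s, ns_per_ms = 10**9, 10**6
+    values = np.array([1, 2, 3], dtype="int64")
+    values * np.int64(ns_per_s / ns_per_ms)  # values now expressed in milliseconds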
+
+The following examples show how these steps work out for different units and resolutions:
+
+.. ipython:: python
+
+    calendar = "proleptic_gregorian"
+    values = np.array([-1000 * 365, 0, 1000 * 365], dtype="int64")
+    units = "days since 2000-01-01 00:00:00.000001"
+    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
+    print(dt)
+    assert dt.dtype == "datetime64[us]"
+
+    units = "microseconds since 2000-01-01 00:00:00"
+    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
+    print(dt)
+    assert dt.dtype == "datetime64[us]"
+
+    values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64")
+    units = "days since 2000-01-01 00:00:00.001"
+    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
+    print(dt)
+    assert dt.dtype == "datetime64[ms]"
+
+    values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64")
+    units = "hours since 2000-01-01"
+    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
+    print(dt)
+    assert dt.dtype == "datetime64[s]"
+
+    values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64")
+    units = "hours since 2000-01-01 00:00:00 03:30"
+    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
+    print(dt)
+    assert dt.dtype == "datetime64[s]"
+
+    values = np.array([-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64")
+    units = "days since 0001-01-01 00:00:00"
+    dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s")
+    print(dt)
+    assert dt.dtype == "datetime64[s]"
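+
+Note the fallback behavior: for the ``standard`` calendar, dates before the reform date ``1582-10-15`` can't be decoded with pandas and fall back to ``cftime``. A minimal sketch, assuming the optional ``cftime`` package is installed:
+
+.. ipython:: python
+    :okwarning:
+
+    values = np.array([-1000 * 365], dtype="int64")
+    units = "days since 2000-01-01"
+    dt = xr.coding.times.decode_cf_datetime(values, units, "standard", time_unit="s")
+    print(dt)
+    assert dt.dtype == "object"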
+
+CF time encoding
+~~~~~~~~~~~~~~~~
+
+For encoding, the process is more or less a reversal of the above, but we have to make some decisions on default values.
+
+1. Infer ``data_units`` from the given ``dates``.
+2. Infer ``units`` (either clean up the given ``units`` or use ``data_units``).
+3. Infer the calendar name from the given ``dates``.
+4. If we have a non-standard calendar or object dates (CFTime), encode with ``cftime``.
+5. Retrieve ``time_units`` and ``ref_date`` from ``units``.
+6. Check ``ref_date`` >= ``1582-10-15``, otherwise fall back to ``cftime``.
+7. Wrap ``dates`` with :py:class:`pandas.DatetimeIndex`.
+8. Subtracting ``ref_date`` (:py:class:`pandas.Timestamp`) from the above :py:class:`pandas.DatetimeIndex` returns a :py:class:`pandas.TimedeltaIndex`.
+9. Align the resolution of the :py:class:`pandas.TimedeltaIndex` with the resolution of ``time_units``.
+10. Retrieve the needed ``units`` and ``delta`` to faithfully encode into int64.
+11. Divide ``time_deltas`` by ``delta``, using floor division (integer) or normal division (float).
+12. Return the result.
+
+.. ipython:: python
+    :okwarning:
+
+    calendar = "proleptic_gregorian"
+    dates = np.array(
+        [
+            "-2000-01-01T00:00:00",
+            "0000-01-01T00:00:00",
+            "0002-01-01T00:00:00",
+            "2000-01-01T00:00:00",
+        ],
+        dtype="datetime64[s]",
+    )
+    orig_values = np.array(
+        [-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64"
+    )
+    units = "days since 0001-01-01 00:00:00"
+    values, _, _ = xr.coding.times.encode_cf_datetime(
+        dates, units, calendar, dtype=np.dtype("int64")
+    )
+    print(values)
+    np.testing.assert_array_equal(values, orig_values)
+
+    dates = np.array(
+        [
+            "-2000-01-01T01:00:00",
+            "0000-01-01T00:00:00",
+            "0002-01-01T00:00:00",
+            "2000-01-01T00:00:00",
+        ],
+        dtype="datetime64[s]",
+    )
+    orig_values = np.array(
+        [-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64"
+    )
+    units = "days since 0001-01-01 00:00:00"
+    values, units, _ = xr.coding.times.encode_cf_datetime(
+        dates, units, calendar, dtype=np.dtype("int64")
+    )
+    print(values, units)
+
+Default Time Unit
+~~~~~~~~~~~~~~~~~
+
+The current default time unit of xarray is ``'ns'``. When the keyword argument ``time_unit`` is set to ``'s'`` (the lowest resolution pandas allows), datetimes will be converted to at least ``'s'`` resolution, if possible. The same holds true for ``'ms'`` and ``'us'``.
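+
+To see the effect of the ``time_unit`` keyword end to end, compare default decoding with ``'s'``-resolution decoding (this mirrors the user-guide example added in this PR):
+
+.. ipython:: python
+
+    attrs = {"units": "hours since 2000-01-01"}
+    ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)})
+    print(xr.decode_cf(ds).time.dtype)
+    coder = xr.coders.CFDatetimeCoder(time_unit="s")
+    print(xr.decode_cf(ds, decode_times=coder).time.dtype)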
diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst
index 8ec5dfea6c1..9233791249e 100644
--- a/doc/user-guide/time-series.rst
+++ b/doc/user-guide/time-series.rst
@@ -21,9 +21,9 @@ core functionality.
 Creating datetime64 data
 ------------------------
 
-Xarray uses the numpy dtypes ``datetime64[ns]`` and ``timedelta64[ns]`` to
-represent datetime data, which offer vectorized (if sometimes buggy) operations
-with numpy and smooth integration with pandas.
+Xarray uses the numpy dtypes ``datetime64[unit]`` and ``timedelta64[unit]``
+(where unit is one of ``"s"``, ``"ms"``, ``"us"`` and ``"ns"``) to represent datetime
+data, which offer vectorized operations with numpy and smooth integration with pandas.
 
 To convert to or create regular arrays of ``datetime64`` data, we recommend
 using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`:
@@ -31,10 +31,21 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`:
 .. ipython:: python
 
     pd.to_datetime(["2000-01-01", "2000-02-02"])
+    pd.DatetimeIndex(
+        ["2000-01-01 00:00:00", "2000-02-02 00:00:00"], dtype="datetime64[s]"
+    )
     pd.date_range("2000-01-01", periods=365)
+    pd.date_range("2000-01-01", periods=365, unit="s")
+
+.. note::
+    Care has to be taken to create the output with the wanted resolution.
+    For :py:func:`pandas.date_range` the ``unit``-kwarg has to be specified,
+    and for :py:func:`pandas.to_datetime` selecting the resolution isn't
+    possible at all. In that case :py:class:`pandas.DatetimeIndex` can be
+    used directly.
 
 Alternatively, you can supply arrays of Python ``datetime`` objects. These get
-converted automatically when used as arguments in xarray objects:
+converted automatically when used as arguments in xarray objects (with us-resolution):
 
 .. ipython:: python
 
@@ -51,7 +62,7 @@ attribute like ``'days since 2000-01-01'``).
 .. note::
 
    When decoding/encoding datetimes for non-standard calendars or for dates
-   before year 1678 or after year 2262, xarray uses the `cftime`_ library.
+   before `1582-10-15 <https://en.wikipedia.org/wiki/Gregorian_calendar>`__, xarray uses the `cftime`_ library by default.
    It was previously packaged with the ``netcdf4-python`` package under the
    name ``netcdftime`` but is now distributed separately. ``cftime`` is an
    :ref:`optional dependency` of xarray.
@@ -66,17 +77,15 @@ You can manual decode arrays in this form by passing a dataset to
 
     attrs = {"units": "hours since 2000-01-01"}
     ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)})
+    # Default decoding to 'ns'-resolution
     xr.decode_cf(ds)
+    # Decoding to 's'-resolution
+    coder = xr.coders.CFDatetimeCoder(time_unit="s")
+    xr.decode_cf(ds, decode_times=coder)
 
-One unfortunate limitation of using ``datetime64[ns]`` is that it limits the
-native representation of dates to those that fall between the years 1678 and
-2262. When a netCDF file contains dates outside of these bounds, dates will be
-returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex`
-will be used for indexing. :py:class:`~xarray.CFTimeIndex` enables a subset of
-the indexing functionality of a :py:class:`pandas.DatetimeIndex` and is only
-fully compatible with the standalone version of ``cftime`` (not the version
-packaged with earlier versions ``netCDF4``). See :ref:`CFTimeIndex` for more
-information.
+From xarray TODO: version the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, a range that widens significantly at lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing.
+:py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex`.
+See :ref:`CFTimeIndex` for more information.
 
 Datetime indexing
 -----------------
diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst
index 5cc7b2e5af9..6a56e3030f0 100644
--- a/doc/user-guide/weather-climate.rst
+++ b/doc/user-guide/weather-climate.rst
@@ -10,7 +10,7 @@ Weather and climate data
 
     import xarray as xr
 
-Xarray can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include :ref:`automatic labelling of plots` with descriptive names and units if proper metadata is present and support for non-standard calendars used in climate science through the ``cftime`` module(Explained in the :ref:`CFTimeIndex` section). There are also a number of :ref:`geosciences-focused projects that build on xarray`.
+Xarray can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include :ref:`automatic labelling of plots` with descriptive names and units if proper metadata is present and support for non-standard calendars used in climate science through the ``cftime`` module (explained in the :ref:`CFTimeIndex` section). There are also a number of :ref:`geosciences-focused projects that build on xarray`.
 
 .. _Climate and Forecast (CF) conventions: https://cfconventions.org
 
@@ -64,8 +64,7 @@ Through the standalone ``cftime`` library and a custom subclass of
 :py:class:`pandas.Index`, xarray supports a subset of the indexing
 functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for
 dates from non-standard calendars commonly used in climate science or dates
-using a standard calendar, but outside the `nanosecond-precision range`_
-(approximately between years 1678 and 2262).
+using a standard calendar, but outside the `precision range`_ and dates prior to `1582-10-15 <https://en.wikipedia.org/wiki/Gregorian_calendar>`__.
 
 .. note::
 
@@ -75,18 +74,14 @@ using a standard calendar, but outside the `nanosecond-precision range`_
    any of the following are true:
 
      - The dates are from a non-standard calendar
-     - Any dates are outside the nanosecond-precision range.
+     - Any dates are outside the nanosecond-precision range (prior to xarray version 2024.11)
+     - Any dates are outside the time span limited by the resolution (from xarray version 2024.11)
 
    Otherwise pandas-compatible dates from a standard calendar will be
-   represented with the ``np.datetime64[ns]`` data type, enabling the use of a
-   :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[ns]``
-   and their full set of associated features.
+   represented with the ``np.datetime64[unit]`` data type (where ``unit`` can be one of ``"s"``, ``"ms"``, ``"us"``, ``"ns"``), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features.
 
    As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime
-   values. For the time being, xarray still automatically casts datetime values
-   to nanosecond-precision for backwards compatibility with older pandas
-   versions; however, this is something we would like to relax going forward.
-   See :issue:`7493` for more discussion.
+   values. From xarray version 2024.11 on, these relaxed non-nanosecond precision datetime values are used.
 
 For example, you can create a DataArray indexed by a time coordinate with
 dates from a no-leap calendar and a
@@ -115,7 +110,7 @@ instance, we can create the same dates and DataArray we created above using:
 Mirroring pandas' method with the same name, :py:meth:`~xarray.infer_freq` allows one to
 infer the sampling frequency of a :py:class:`~xarray.CFTimeIndex` or a 1-D
 :py:class:`~xarray.DataArray` containing cftime objects. It also works transparently with
-``np.datetime64[ns]`` and ``np.timedelta64[ns]`` data.
+``np.datetime64`` and ``np.timedelta64`` data (with "s", "ms", "us" or "ns" resolution).
 
 .. ipython:: python
 
@@ -137,7 +132,7 @@ Conversion between non-standard calendar and
 to/from pandas DatetimeIndexes is
 facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as
 :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime``
 argument controls which datetime backend is used in the output. The default (``None``) is to
-use ``pandas`` when possible, i.e. when the calendar is standard and dates are within 1678 and 2262.
+use ``pandas`` when possible, i.e. when the calendar is ``standard``/``gregorian`` and dates start on or after `1582-10-15 <https://en.wikipedia.org/wiki/Gregorian_calendar>`__. There is no such restriction when converting to the ``proleptic_gregorian`` calendar.
 
 .. ipython:: python
 
@@ -241,6 +236,6 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports:
 
     da.resample(time="81min", closed="right", label="right", offset="3min").mean()
 
-.. _nanosecond-precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
+.. _precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
 .. _ISO 8601 standard: https://en.wikipedia.org/wiki/ISO_8601
 .. _partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#partial-string-indexing
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index dec80590c11..7e7ea305300 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -27,6 +27,9 @@ New Features
 - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`).
   By `Kai Mühlbauer `_.
+- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`).
+  By `Kai Mühlbauer `_.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
@@ -39,6 +42,10 @@ Breaking changes
 Deprecations
 ~~~~~~~~~~~~
 
+- Time decoding related kwarg ``use_cftime`` is deprecated. Use keyword argument
+  ``decode_times=CFDatetimeCoder(use_cftime=True)`` in the respective functions
+  instead (:pull:`9618`).
+  By `Kai Mühlbauer `_.
 - Finalize deprecation of ``closed`` parameters of :py:func:`cftime_range` and
   :py:func:`date_range` (:pull:`9882`).
   By `Kai Mühlbauer `_.
diff --git a/xarray/__init__.py b/xarray/__init__.py
index 622c927b468..8af936ed27a 100644
--- a/xarray/__init__.py
+++ b/xarray/__init__.py
@@ -1,6 +1,6 @@
 from importlib.metadata import version as _version
 
-from xarray import groupers, testing, tutorial, ufuncs
+from xarray import coders, groupers, testing, tutorial, ufuncs
 from xarray.backends.api import (
     load_dataarray,
     load_dataset,
@@ -66,6 +66,7 @@
 # `mypy --strict` running in projects that import xarray.
 __all__ = (  # noqa: RUF022
     # Sub-packages
+    "coders",
     "groupers",
     "testing",
     "tutorial",
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 785ab3913ef..12abb655e14 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -33,6 +33,7 @@
     _normalize_path,
 )
 from xarray.backends.locks import _get_scheduler
+from xarray.coders import CFDatetimeCoder
 from xarray.core import indexing
 from xarray.core.combine import (
     _infer_concat_order_from_positions,
@@ -481,7 +482,10 @@ def open_dataset(
     cache: bool | None = None,
     decode_cf: bool | None = None,
     mask_and_scale: bool | Mapping[str, bool] | None = None,
-    decode_times: bool | Mapping[str, bool] | None = None,
+    decode_times: bool
+    | CFDatetimeCoder
+    | Mapping[str, bool | CFDatetimeCoder]
+    | None = None,
     decode_timedelta: bool | Mapping[str, bool] | None = None,
     use_cftime: bool | Mapping[str, bool] | None = None,
     concat_characters: bool | Mapping[str, bool] | None = None,
@@ -543,9 +547,9 @@ def open_dataset(
         be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``,
         to toggle this feature per-variable individually.
         This keyword may not be supported by all the backends.
- decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -569,6 +573,8 @@ def open_dataset( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -698,7 +704,10 @@ def open_dataarray( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | None = None, - decode_times: bool | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | None = None, use_cftime: bool | None = None, concat_characters: bool | None = None, @@ -761,9 +770,11 @@ def open_dataarray( `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. This keyword may not be supported by all the backends. - decode_times : bool, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. decode_timedelta : bool, optional If True, decode variables and coordinates with time units in @@ -781,6 +792,8 @@ def open_dataarray( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -903,7 +916,10 @@ def open_datatree( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -961,9 +977,9 @@ def open_datatree( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. 
- decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -987,6 +1003,8 @@ def open_datatree( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -1118,7 +1136,10 @@ def open_groups( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -1180,9 +1201,9 @@ def open_groups( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -1206,6 +1227,8 @@ def open_groups( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and diff --git a/xarray/coders.py b/xarray/coders.py new file mode 100644 index 00000000000..238ac714780 --- /dev/null +++ b/xarray/coders.py @@ -0,0 +1,10 @@ +""" +This module provides coder objects that encapsulate the +"encoding/decoding" process. 
+""" + +from xarray.coding.times import CFDatetimeCoder + +__all__ = [ + "CFDatetimeCoder", +] diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 2cd8eccd6f3..64261d4d777 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -63,7 +63,7 @@ from xarray.core.common import _contains_datetime_like_objects, is_np_datetime_like from xarray.core.pdcompat import ( count_not_none, - nanosecond_precision_timestamp, + default_precision_timestamp, ) from xarray.core.utils import attempt_import, emit_user_level_warning @@ -80,14 +80,6 @@ T_FreqStr = TypeVar("T_FreqStr", str, None) -def _nanosecond_precision_timestamp(*args, **kwargs): - # As of pandas version 3.0, pd.to_datetime(Timestamp(...)) will try to - # infer the appropriate datetime precision. Until xarray supports - # non-nanosecond precision times, we will use this constructor wrapper to - # explicitly create nanosecond-precision Timestamp objects. - return pd.Timestamp(*args, **kwargs).as_unit("ns") - - def get_date_type(calendar, use_cftime=True): """Return the cftime date type for a given calendar name.""" if TYPE_CHECKING: @@ -96,7 +88,7 @@ def get_date_type(calendar, use_cftime=True): cftime = attempt_import("cftime") if _is_standard_calendar(calendar) and not use_cftime: - return _nanosecond_precision_timestamp + return default_precision_timestamp calendars = { "noleap": cftime.DatetimeNoLeap, @@ -1426,10 +1418,8 @@ def date_range_like(source, calendar, use_cftime=None): if is_np_datetime_like(source.dtype): # We want to use datetime fields (datetime64 object don't have them) source_calendar = "standard" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - source_start = nanosecond_precision_timestamp(source_start) - source_end = nanosecond_precision_timestamp(source_end) + source_start = default_precision_timestamp(source_start) + source_end = default_precision_timestamp(source_end) else: if isinstance(source, CFTimeIndex): source_calendar = source.calendar diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 0494952fc9c..f599b06e36c 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -646,13 +646,14 @@ def to_datetimeindex(self, unsafe=False): CFTimeIndex([2000-01-01 00:00:00, 2000-01-02 00:00:00], dtype='object', length=2, calendar='standard', freq=None) >>> times.to_datetimeindex() - DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[us]', freq=None) """ if not self._data.size: return pd.DatetimeIndex([]) - nptimes = cftime_to_nptime(self) + # transform to us-resolution is needed for DatetimeIndex + nptimes = cftime_to_nptime(self).astype("=M8[us]") calendar = infer_calendar_name(self) if calendar not in _STANDARD_CALENDARS and not unsafe: warnings.warn( diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 4622298e152..e79806bbd61 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -2,10 +2,10 @@ import re import warnings -from collections.abc import Callable, Hashable +from collections.abc import Callable, Hashable, Iterator from datetime import datetime, timedelta from functools import partial -from typing import TYPE_CHECKING, Literal, Union, cast +from typing import TYPE_CHECKING, Union, cast import numpy as np import pandas as pd @@ -24,7 +24,7 @@ from xarray.core.common import contains_cftime_datetimes, 
is_np_datetime_like
 from xarray.core.duck_array_ops import asarray, ravel, reshape
 from xarray.core.formatting import first_n_items, format_timestamp, last_item
-from xarray.core.pdcompat import nanosecond_precision_timestamp
+from xarray.core.pdcompat import _timestamp_as_unit, default_precision_timestamp
 from xarray.core.utils import attempt_import, emit_user_level_warning
 from xarray.core.variable import Variable
 from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
@@ -36,7 +36,12 @@
 except ImportError:
     cftime = None
 
-from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray
+from xarray.core.types import (
+    CFCalendar,
+    NPDatetimeUnitOptions,
+    PDDatetimeUnitOptions,
+    T_DuckArray,
+)
 
 T_Name = Union[Hashable, None]
 
@@ -98,6 +103,13 @@ def _is_numpy_compatible_time_range(times):
     tmin = times.min()
     tmax = times.max()
     try:
+        # before relaxing the nanosecond constraint
+        # this raised OutOfBoundsDatetime for
+        # times < 1678 and times > 2262
+        # this isn't the case anymore for other resolutions like "s"
+        # now, we raise for dates before 1582-10-15
+        _check_date_is_after_shift(tmin, "standard")
+        _check_date_is_after_shift(tmax, "standard")
         convert_time_or_go_back(tmin, pd.Timestamp)
         convert_time_or_go_back(tmax, pd.Timestamp)
     except pd.errors.OutOfBoundsDatetime:
@@ -189,22 +201,32 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
     return delta_units, ref_date
 
 
-def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
-    # same us _unpack_netcdf_time_units but finalizes ref_date for
-    # processing in encode_cf_datetime
-    time_units, _ref_date = _unpack_netcdf_time_units(units)
-    # TODO: the strict enforcement of nanosecond precision Timestamps can be
-    # relaxed when addressing GitHub issue #7493.
-    ref_date = nanosecond_precision_timestamp(_ref_date)
+def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp:
     # If the ref_date Timestamp is timezone-aware, convert to UTC and
     # make it timezone-naive (GH 2649).
-    if ref_date.tz is not None:
-        ref_date = ref_date.tz_convert(None)
-    return time_units, ref_date
+    if date.tz is not None:
+        date = date.tz_convert("UTC").tz_convert(None)
+    return date
+
+
+def _unpack_time_unit_and_ref_date(
+    units: str,
+) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]:
+    # same as _unpack_netcdf_time_units but finalizes time_unit and ref_date
+    # for processing in encode_cf_datetime
+    time_unit, _ref_date = _unpack_netcdf_time_units(units)
+    time_unit = _netcdf_to_numpy_timeunit(time_unit)
+    ref_date = pd.Timestamp(_ref_date)
+    ref_date = _maybe_strip_tz_from_timestamp(ref_date)
+    return time_unit, ref_date
 
 
 def _decode_cf_datetime_dtype(
-    data, units: str, calendar: str | None, use_cftime: bool | None
+    data,
+    units: str,
+    calendar: str | None,
+    use_cftime: bool | None,
+    time_unit: PDDatetimeUnitOptions = "ns",
 ) -> np.dtype:
     # Verify that at least the first and last date can be decoded
     # successfully.
Otherwise, tracebacks end up swallowed by @@ -215,7 +237,9 @@ def _decode_cf_datetime_dtype( ) try: - result = decode_cf_datetime(example_value, units, calendar, use_cftime) + result = decode_cf_datetime( + example_value, units, calendar, use_cftime, time_unit + ) except Exception as err: calendar_msg = ( "the default calendar" if calendar is None else f"calendar {calendar!r}" @@ -247,8 +271,71 @@ def _decode_datetime_with_cftime( return np.array([], dtype=object) +def _check_date_for_units_since_refdate( + date, unit: str, ref_date: pd.Timestamp +) -> pd.Timestamp: + # check for out-of-bounds floats and raise + if date > np.iinfo("int64").max or date < np.iinfo("int64").min: + raise OutOfBoundsTimedelta( + f"Value {date} can't be represented as Datetime/Timedelta." + ) + delta = date * np.timedelta64(1, unit) + if not np.isnan(delta): + # this will raise on dtype overflow for integer dtypes + if date.dtype.kind in "iu" and not np.int64(delta) == date: + raise OutOfBoundsTimedelta( + "DType overflow in Datetime/Timedelta calculation." + ) + # this will raise on overflow if ref_date + delta + # can't be represented in the current ref_date resolution + return _timestamp_as_unit(ref_date + delta, ref_date.unit) + else: + # if date is exactly NaT (np.iinfo("int64").min) return refdate + # to make follow-up checks work + return ref_date + + +def _align_reference_date_and_unit(ref_date: pd.Timestamp, unit: str) -> pd.Timestamp: + # align to the highest needed resolution of ref_date or unit + if np.timedelta64(1, ref_date.unit) > np.timedelta64(1, unit): + # this will raise accordingly + # if data can't be represented in the higher resolution + return _timestamp_as_unit(ref_date, unit) + return ref_date + + +def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: + # if we have gregorian/standard we need to raise + # if we are outside the well-defined date range + # proleptic_gregorian and standard/gregorian are only equivalent + # if reference date and date range is >= 1582-10-15 + if calendar != "proleptic_gregorian": + if date < type(date)(1582, 10, 15): + raise OutOfBoundsDatetime( + f"Dates before 1582-10-15 cannot be decoded " + f"with pandas using {calendar!r} calendar." + ) + + +def _check_higher_resolution( + flat_num_dates: np.ndarray, + iter_unit: Iterator[PDDatetimeUnitOptions], +) -> tuple[np.ndarray, PDDatetimeUnitOptions]: + """Iterate until fitting resolution found.""" + new_time_unit: PDDatetimeUnitOptions = next(iter_unit) + if (np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns": + flat_num_dates, new_time_unit = _check_higher_resolution( + flat_num_dates * 1000, + iter_unit=iter_unit, + ) + return flat_num_dates, new_time_unit + + def _decode_datetime_with_pandas( - flat_num_dates: np.ndarray, units: str, calendar: str + flat_num_dates: np.ndarray, + units: str, + calendar: str, + time_resolution: PDDatetimeUnitOptions = "ns", ) -> np.ndarray: if not _is_standard_calendar(calendar): raise OutOfBoundsDatetime( @@ -265,23 +352,29 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind == "u": flat_num_dates = flat_num_dates.astype(np.uint64) - time_units, ref_date_str = _unpack_netcdf_time_units(units) - time_units = _netcdf_to_numpy_timeunit(time_units) try: - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. 
- ref_date = nanosecond_precision_timestamp(ref_date_str) + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + ref_date = _align_reference_date_and_unit(ref_date, time_unit) + # here the highest wanted resolution is set + ref_date = _align_reference_date_and_unit(ref_date, time_resolution) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime from err + _check_date_is_after_shift(ref_date, calendar) + with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: # avoid size 0 datetimes GH1329 - pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date - pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date + dec_min = _check_date_for_units_since_refdate( + flat_num_dates.min(), time_unit, ref_date + ) + _check_date_for_units_since_refdate( + flat_num_dates.max(), time_unit, ref_date + ) + _check_date_is_after_shift(dec_min, calendar) # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer and unsigned integer dtype @@ -294,24 +387,51 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind in "f": flat_num_dates = flat_num_dates.astype(np.float64) - # Cast input ordinals to integers of nanoseconds because pd.to_timedelta - # works much faster when dealing with integers (GH 1399). - # properly handle NaN/NaT to prevent casting NaN to int + # keep NaT/nan mask nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min) - flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units] - flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64) - flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min - flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64) - # Use pd.to_timedelta to safely cast integer values to timedeltas, - # and add those to a Timestamp to safely produce a DatetimeIndex. This - # ensures that we do not encounter integer overflow at any point in the - # process without raising OutOfBoundsDatetime. - return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values + # in case we need to change the unit, we fix the numbers here + # this should be safe, as errors would have been raised above + ns_time_unit = _NS_PER_TIME_DELTA[time_unit] + ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit] + if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_ref_date_unit): + flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) + time_unit = ref_date.unit + + # estimate fitting resolution for floating point values + # this iterates until all floats are fractionless or time_unit == "ns" + if flat_num_dates.dtype.kind == "f" and time_unit != "ns": + res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] + iter_unit = iter(res[res.index(cast(PDDatetimeUnitOptions, time_unit)) :]) + flat_num_dates, new_time_unit = _check_higher_resolution( + flat_num_dates, iter_unit + ) + if time_unit != new_time_unit: + msg = ( + f"Can't decode floating point datetime to {time_unit!r} without " + f"precision loss, decoding to {new_time_unit!r} instead. " + f"To silence this warning use time_unit={new_time_unit!r} in call to " + f"decoding function." 
+ ) + emit_user_level_warning(msg, SerializationWarning) + time_unit = new_time_unit + + # Cast input ordinals to integers and properly handle NaN/NaT + # to prevent casting NaN to int + flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64) + flat_num_dates_int[nan] = np.iinfo(np.int64).min + flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64) + + # cast to timedelta64[time_unit] and add to ref_date + return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]") def decode_cf_datetime( - num_dates, units: str, calendar: str | None = None, use_cftime: bool | None = None + num_dates, + units: str, + calendar: str | None = None, + use_cftime: bool | None = None, + time_unit: PDDatetimeUnitOptions = "ns", ) -> np.ndarray: """Given an array of numeric dates in netCDF format, convert it into a numpy array of date time objects. @@ -334,15 +454,26 @@ def decode_cf_datetime( if use_cftime is None: try: - dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar) + dates = _decode_datetime_with_pandas( + flat_num_dates, units, calendar, time_unit + ) except (KeyError, OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError): dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar ) - + # retrieve cftype + cftype = type(dates[np.nanargmin(num_dates)]) + # create first day of gregorian calendar in current cf calendar type + border = cftype(1582, 10, 15) + # "ns" boarders + # between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807'] + lower = cftype(1677, 9, 21, 0, 12, 43, 145224) + upper = cftype(2262, 4, 11, 23, 47, 16, 854775) + + # todo: check if test for minimum date is enough if ( - dates[np.nanargmin(num_dates)].year < 1678 - or dates[np.nanargmax(num_dates)].year >= 2262 + dates[np.nanargmin(num_dates)] < border + or dates[np.nanargmax(num_dates)] < border ): if _is_standard_calendar(calendar): warnings.warn( @@ -353,36 +484,66 @@ def decode_cf_datetime( SerializationWarning, stacklevel=3, ) + elif time_unit == "ns" and ( + ( + dates[np.nanargmin(num_dates)] < lower + or dates[np.nanargmin(num_dates)] > upper + ) + or ( + dates[np.nanargmax(num_dates)] < lower + or dates[np.nanargmax(num_dates)] > upper + ) + ): + warnings.warn( + "Unable to decode time axis into full " + "numpy.datetime64 objects, continuing using " + "cftime.datetime objects instead, reason: dates out " + "of range", + SerializationWarning, + stacklevel=3, + ) else: if _is_standard_calendar(calendar): - dates = cftime_to_nptime(dates) + dates = cftime_to_nptime(dates, time_unit=time_unit) elif use_cftime: dates = _decode_datetime_with_cftime(flat_num_dates, units, calendar) else: - dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar) + dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar, time_unit) return reshape(dates, num_dates.shape) def to_timedelta_unboxed(value, **kwargs): + # todo: check, if the procedure here is correct result = pd.to_timedelta(value, **kwargs).to_numpy() - assert result.dtype == "timedelta64[ns]" + unique_timedeltas = np.unique(result[pd.notnull(result)]) + unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) + if unit not in {"s", "ms", "us", "ns"}: + unit = "s" + result = result.astype(f"timedelta64[{unit}]") + assert np.issubdtype(result.dtype, "timedelta64") return result def to_datetime_unboxed(value, **kwargs): result = pd.to_datetime(value, **kwargs).to_numpy() - assert result.dtype == "datetime64[ns]" + assert np.issubdtype(result.dtype, 
"datetime64") return result def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: + # todo: check, if this works as intended """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64[ns] array. + numpy timedelta64 ["s", "ms", "us", "ns"] array. """ num_timedeltas = np.asarray(num_timedeltas) - units = _netcdf_to_numpy_timeunit(units) - result = to_timedelta_unboxed(ravel(num_timedeltas), unit=units) + unit = _netcdf_to_numpy_timeunit(units) + as_unit = unit + if unit not in {"s", "ms", "us", "ns"}: + as_unit = "s" + result = ( + pd.to_timedelta(ravel(num_timedeltas), unit=unit).as_unit(as_unit).to_numpy() + ) return reshape(result, num_timedeltas.shape) @@ -392,10 +553,11 @@ def _unit_timedelta_cftime(units: str) -> timedelta: def _unit_timedelta_numpy(units: str) -> np.timedelta64: numpy_units = _netcdf_to_numpy_timeunit(units) - return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns") + return np.timedelta64(1, numpy_units) def _infer_time_units_from_diff(unique_timedeltas) -> str: + # todo: check, if this function works correctly wrt np.timedelta64 unit_timedelta: Callable[[str], timedelta] | Callable[[str], np.timedelta64] zero_timedelta: timedelta | np.timedelta64 if unique_timedeltas.dtype == np.dtype("O"): @@ -412,10 +574,6 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str: return "seconds" -def _time_units_to_timedelta64(units: str) -> np.timedelta64: - return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]") - - def infer_calendar_name(dates) -> CFCalendar: """Given an array of datetimes, infer the CF calendar name""" if is_np_datetime_like(dates.dtype): @@ -442,13 +600,11 @@ def infer_datetime_units(dates) -> str: unique time deltas in `dates`) """ dates = ravel(np.asarray(dates)) - if np.asarray(dates).dtype == "datetime64[ns]": + if np.issubdtype(np.asarray(dates).dtype, "datetime64"): dates = to_datetime_unboxed(dates) dates = dates[pd.notnull(dates)] reference_date = dates[0] if len(dates) > 0 else "1970-01-01" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - reference_date = nanosecond_precision_timestamp(reference_date) + reference_date = pd.Timestamp(reference_date) else: reference_date = dates[0] if len(dates) > 0 else "1970-01-01" reference_date = format_cftime_datetime(reference_date) @@ -474,26 +630,22 @@ def infer_timedelta_units(deltas) -> str: return _infer_time_units_from_diff(unique_timedeltas) -def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: +def cftime_to_nptime( + times, raise_on_invalid: bool = True, time_unit: PDDatetimeUnitOptions = "ns" +) -> np.ndarray: """Given an array of cftime.datetime objects, return an array of numpy.datetime64 objects of the same size If raise_on_invalid is True (default), invalid dates trigger a ValueError. Otherwise, the invalid element is replaced by np.NaT.""" times = np.asarray(times) - # TODO: the strict enforcement of nanosecond precision datetime values can - # be relaxed when addressing GitHub issue #7493. - new = np.empty(times.shape, dtype="M8[ns]") - dt: pd.Timestamp | Literal["NaT"] - for i, t in np.ndenumerate(times): + new = [] + dt: np.datetime64 + for _i, t in np.ndenumerate(times): try: - # Use pandas.Timestamp in place of datetime.datetime, because - # NumPy casts it safely it np.datetime64[ns] for dates outside - # 1678 to 2262 (this is not currently the case for - # datetime.datetime). 
- dt = nanosecond_precision_timestamp( - t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond - ) + # We expect either "us" resolution or "s" resolution depending on + # whether 'microseconds' are defined for the input or not. + dt = np.datetime64(t.isoformat()).astype(f"=M8[{time_unit}]") except ValueError as e: if raise_on_invalid: raise ValueError( @@ -501,9 +653,9 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: f"standard calendar. Reason: {e}." ) from e else: - dt = "NaT" - new[i] = np.datetime64(dt) - return new + dt = np.datetime64("NaT") + new.append(dt) + return np.asarray(new).reshape(times.shape) def convert_times(times, date_type, raise_on_invalid: bool = True) -> np.ndarray: @@ -548,10 +700,8 @@ def convert_time_or_go_back(date, date_type): This is meant to convert end-of-month dates into a new calendar. """ - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. if date_type == pd.Timestamp: - date_type = nanosecond_precision_timestamp + date_type = default_precision_timestamp try: return date_type( date.year, @@ -746,9 +896,7 @@ def _eagerly_encode_cf_datetime( allow_units_modification: bool = True, ) -> tuple[T_DuckArray, str, str]: dates = asarray(dates) - data_units = infer_datetime_units(dates) - if units is None: units = data_units else: @@ -761,28 +909,32 @@ def _eagerly_encode_cf_datetime( if not _is_standard_calendar(calendar) or dates.dtype.kind == "O": # parse with cftime instead raise OutOfBoundsDatetime - assert dates.dtype == "datetime64[ns]" + assert np.issubdtype(dates.dtype, "datetime64") - time_units, ref_date = _unpack_time_units_and_ref_date(units) - time_delta = _time_units_to_timedelta64(time_units) + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + # calendar equivalence only for days after the reform + _check_date_is_after_shift(ref_date, calendar) + time_delta = np.timedelta64(1, time_unit) # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if the ref_date is too far away from # dates to be encoded (GH 2272). + # DatetimeIndex will convert to units of ["s", "ms", "us", "ns"] dates_as_index = pd.DatetimeIndex(ravel(dates)) time_deltas = dates_as_index - ref_date # retrieve needed units to faithfully encode to int64 - needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units) + needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units) + needed_units = _numpy_to_netcdf_timeunit(needed_unit) if data_units != units: # this accounts for differences in the reference times ref_delta = abs(data_ref_date - ref_date).to_timedelta64() - data_delta = _time_units_to_timedelta64(needed_units) + data_delta = np.timedelta64(1, needed_unit) if (ref_delta % data_delta) > np.timedelta64(0, "ns"): needed_units = _infer_time_units_from_diff(ref_delta) # needed time delta to encode faithfully to int64 - needed_time_delta = _time_units_to_timedelta64(needed_units) + needed_time_delta = _unit_timedelta_numpy(needed_units) floor_division = np.issubdtype(dtype, np.integer) or dtype is None if time_delta > needed_time_delta: @@ -795,6 +947,7 @@ def _eagerly_encode_cf_datetime( f"Set encoding['dtype'] to floating point dtype to silence this warning." 
)
 elif np.issubdtype(dtype, np.integer) and allow_units_modification:
+ floor_division = True
 new_units = f"{needed_units} since {format_timestamp(ref_date)}"
 emit_user_level_warning(
 f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
@@ -804,9 +957,12 @@
 )
 units = new_units
 time_delta = needed_time_delta
- floor_division = True

- num = _division(time_deltas, time_delta, floor_division)
+ # get resolution of TimedeltaIndex and align time_delta
+ # todo: check if this works in any case
+ num = _division(
+ time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division
+ )
 num = reshape(num.values, dates.shape)

 except (OutOfBoundsDatetime, OverflowError, ValueError):
@@ -816,6 +972,7 @@
 num = cast_to_int_if_safe(num)

 if dtype is not None:
+ # todo: check if this is really needed for all dtypes
 num = _cast_to_dtype_if_safe(num, dtype)

 return num, units, calendar
@@ -890,12 +1047,15 @@ def _eagerly_encode_cf_timedelta(
 allow_units_modification: bool = True,
 ) -> tuple[T_DuckArray, str]:
 data_units = infer_timedelta_units(timedeltas)
-
 if units is None:
 units = data_units

- time_delta = _time_units_to_timedelta64(units)
+ time_delta = _unit_timedelta_numpy(units)
 time_deltas = pd.TimedeltaIndex(ravel(timedeltas))
+ # get resolution of TimedeltaIndex and align time_delta
+ deltas_unit = time_deltas.unit
+ # todo: check if this works in any case
+ time_delta = time_delta.astype(f"=m8[{deltas_unit}]")

 # retrieve needed units to faithfully encode to int64
 needed_units = data_units
@@ -903,7 +1063,7 @@
 needed_units = _infer_time_units_from_diff(np.unique(time_deltas.dropna()))

 # needed time delta to encode faithfully to int64
- needed_time_delta = _time_units_to_timedelta64(needed_units)
+ needed_time_delta = _unit_timedelta_numpy(needed_units)

 floor_division = np.issubdtype(dtype, np.integer) or dtype is None
 if time_delta > needed_time_delta:
@@ -924,12 +1084,14 @@
 )
 units = needed_units
 time_delta = needed_time_delta
+ time_delta = time_delta.astype(f"=m8[{deltas_unit}]")
 floor_division = True

 num = _division(time_deltas, time_delta, floor_division)
 num = reshape(num.values, timedeltas.shape)

 if dtype is not None:
+ # todo: check if this is needed for all dtypes
 num = _cast_to_dtype_if_safe(num, dtype)

 return num, units
@@ -974,8 +1136,13 @@ def _lazily_encode_cf_timedelta(


 class CFDatetimeCoder(VariableCoder):
- def __init__(self, use_cftime: bool | None = None) -> None:
+ def __init__(
+ self,
+ use_cftime: bool | None = None,
+ time_unit: PDDatetimeUnitOptions = "ns",
+ ) -> None:
 self.use_cftime = use_cftime
+ self.time_unit = time_unit

 def encode(self, variable: Variable, name: T_Name = None) -> Variable:
 if np.issubdtype(
@@ -1002,12 +1169,15 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

 units = pop_to(attrs, encoding, "units")
 calendar = pop_to(attrs, encoding, "calendar")
- dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
+ dtype = _decode_cf_datetime_dtype(
+ data, units, calendar, self.use_cftime, self.time_unit
+ )
 transform = partial(
 decode_cf_datetime,
 units=units,
 calendar=calendar,
 use_cftime=self.use_cftime,
+ time_unit=self.time_unit,
 )

 data = lazy_elemwise_func(data, transform, dtype)
@@ -1037,6 +1207,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

 units = pop_to(attrs, encoding, "units")
 transform = 
partial(decode_cf_timedelta, units=units)
+ # todo: check if we can relax this one here, too
 dtype = np.dtype("timedelta64[ns]")
 data = lazy_elemwise_func(data, transform, dtype=dtype)
diff --git a/xarray/conventions.py b/xarray/conventions.py
index 57407a15f51..042a7f14032 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -7,6 +7,7 @@

 import numpy as np

+from xarray.coders import CFDatetimeCoder
 from xarray.coding import strings, times, variables
 from xarray.coding.variables import SerializationWarning, pop_to
 from xarray.core import indexing
@@ -88,7 +89,7 @@ def encode_cf_variable(
 ensure_not_multiindex(var, name=name)

 for coder in [
- times.CFDatetimeCoder(),
+ CFDatetimeCoder(),
 times.CFTimedeltaCoder(),
 variables.CFScaleOffsetCoder(),
 variables.CFMaskCoder(),
@@ -109,7 +110,7 @@ def decode_cf_variable(
 var: Variable,
 concat_characters: bool = True,
 mask_and_scale: bool = True,
- decode_times: bool = True,
+ decode_times: bool | CFDatetimeCoder = True,
 decode_endianness: bool = True,
 stack_char_dim: bool = True,
 use_cftime: bool | None = None,
@@ -136,7 +137,7 @@ def decode_cf_variable(
 Lazily scale (using scale_factor and add_offset) and mask
 (using _FillValue). If the _Unsigned attribute is present
 treat integer arrays as unsigned.
- decode_times : bool
+ decode_times : bool or CFDatetimeCoder
 Decode cf times ("hours since 2000-01-01") to np.datetime64.
 decode_endianness : bool
 Decode arrays from non-native to native endianness.
@@ -154,6 +155,8 @@ def decode_cf_variable(
 represented using ``np.datetime64[ns]`` objects. If False, always
 decode times to ``np.datetime64[ns]`` objects; if this is not possible
 raise an error.
+ Usage of ``use_cftime`` as a kwarg is deprecated; please initialize a
+ ``CFDatetimeCoder`` with ``use_cftime`` and pass it via ``decode_times``.

 Returns
 -------
@@ -167,7 +170,7 @@ def decode_cf_variable(
 original_dtype = var.dtype

 if decode_timedelta is None:
- decode_timedelta = decode_times
+ decode_timedelta = bool(decode_times)

 if concat_characters:
 if stack_char_dim:
@@ -191,7 +194,28 @@ def decode_cf_variable(
 if decode_timedelta:
 var = times.CFTimedeltaCoder().decode(var, name=name)
 if decode_times:
- var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)
+ # remove checks after end of deprecation cycle
+ if not isinstance(decode_times, CFDatetimeCoder):
+ if use_cftime is not None:
+ from warnings import warn
+
+ warn(
+ "Usage of 'use_cftime' as a kwarg is deprecated. "
+ "Please initialize a CFDatetimeCoder with 'use_cftime' "
+ "and pass it via the 'decode_times' kwarg.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ decode_times = CFDatetimeCoder(use_cftime=use_cftime)
+ else:
+ if use_cftime is not None:
+ raise TypeError(
+ "Usage of 'use_cftime' as a kwarg is not allowed "
+ "if 'decode_times' is initialized with a "
+ "CFDatetimeCoder. Please pass 'use_cftime' "
+ "when initializing the CFDatetimeCoder instead."
+ )
+ var = decode_times.decode(var, name=name)

 if decode_endianness and not var.dtype.isnative:
 var = variables.EndianCoder().decode(var)
@@ -302,7 +326,7 @@ def decode_cf_variables(
 attributes: T_Attrs,
 concat_characters: bool | Mapping[str, bool] = True,
 mask_and_scale: bool | Mapping[str, bool] = True,
- decode_times: bool | Mapping[str, bool] = True,
+ decode_times: bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] = True,
 decode_coords: bool | Literal["coordinates", "all"] = True,
 drop_variables: T_DropVariables = None,
 use_cftime: bool | Mapping[str, bool] | None = None,
@@ -439,7 +463,7 @@ def decode_cf(
 obj: T_DatasetOrAbstractstore,
 concat_characters: bool = True,
 mask_and_scale: bool = True,
- decode_times: bool = True,
+ decode_times: bool | CFDatetimeCoder = True,
 decode_coords: bool | Literal["coordinates", "all"] = True,
 drop_variables: T_DropVariables = None,
 use_cftime: bool | None = None,
@@ -458,7 +482,7 @@ def decode_cf(
 mask_and_scale : bool, optional
 Lazily scale (using scale_factor and add_offset) and mask
 (using _FillValue).
- decode_times : bool, optional
+ decode_times : bool or CFDatetimeCoder, optional
 Decode cf times (e.g., integers since "hours since 2000-01-01") to
 np.datetime64.
 decode_coords : bool or {"coordinates", "all"}, optional
@@ -483,6 +507,8 @@ def decode_cf(
 represented using ``np.datetime64[ns]`` objects. If False, always
 decode times to ``np.datetime64[ns]`` objects; if this is not possible
 raise an error.
+ Usage of ``use_cftime`` as a kwarg is deprecated; please initialize a
+ ``CFDatetimeCoder`` with ``use_cftime`` and pass it via ``decode_times``.
 decode_timedelta : bool, optional
 If True, decode variables and coordinates with time units in
 {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
@@ -536,7 +562,7 @@ def cf_decoder(
 attributes: T_Attrs,
 concat_characters: bool = True,
 mask_and_scale: bool = True,
- decode_times: bool = True,
+ decode_times: bool | CFDatetimeCoder = True,
 ) -> tuple[T_Variables, T_Attrs]:
 """
 Decode a set of CF encoded variables and attributes.
@@ -553,7 +579,7 @@ def cf_decoder(
 mask_and_scale : bool
 Lazily scale (using scale_factor and add_offset) and mask
 (using _FillValue).
- decode_times : bool
+ decode_times : bool or CFDatetimeCoder
 Decode cf times ("hours since 2000-01-01") to np.datetime64.

 Returns
diff --git a/xarray/convert.py b/xarray/convert.py
index 14df7cadb9b..29d8f9650e3 100644
--- a/xarray/convert.py
+++ b/xarray/convert.py
@@ -4,7 +4,8 @@

 import numpy as np

-from xarray.coding.times import CFDatetimeCoder, CFTimedeltaCoder
+from xarray.coders import CFDatetimeCoder
+from xarray.coding.times import CFTimedeltaCoder
 from xarray.conventions import decode_cf
 from xarray.core import duck_array_ops
 from xarray.core.dataarray import DataArray
diff --git a/xarray/core/common.py b/xarray/core/common.py
index 3a70c9ec585..8c220aab423 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -622,7 +622,7 @@ def assign_coords(
 lon (x, y) float64 32B 260.2 260.7 260.2 260.8
 lat (x, y) float64 32B 42.25 42.21 42.63 42.59
 * time (time) datetime64[ns] 32B 2014-09-06 ... 2014-09-09
- reference_time datetime64[ns] 8B 2014-09-05
+ reference_time datetime64[s] 8B 2014-09-05
 Dimensions without coordinates: x, y
 Data variables:
 temperature (x, y, time) float64 128B 20.0 20.8 21.6 ... 30.4 31.2 32.0
@@ -636,7 +636,7 @@ def assign_coords(
 lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23
 lat (x, y) float64 32B 42.25 42.21 42.63 42.59
 * time (time) datetime64[ns] 32B 2014-09-06 ... 
2014-09-09
- reference_time datetime64[ns] 8B 2014-09-05
+ reference_time datetime64[s] 8B 2014-09-05
 Dimensions without coordinates: x, y
 Data variables:
 temperature (x, y, time) float64 128B 20.0 20.8 21.6 ... 30.4 31.2 32.0
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index d287564cfe5..cd0428e73ca 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -394,7 +394,7 @@ class DataArray(
 lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23
 lat (x, y) float64 32B 42.25 42.21 42.63 42.59
 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08
- reference_time datetime64[ns] 8B 2014-09-05
+ reference_time datetime64[s] 8B 2014-09-05
 Dimensions without coordinates: x, y
 Attributes:
 description: Ambient temperature.
@@ -409,7 +409,7 @@ class DataArray(
 lon float64 8B -99.32
 lat float64 8B 42.21
 time datetime64[ns] 8B 2014-09-08
- reference_time datetime64[ns] 8B 2014-09-05
+ reference_time datetime64[s] 8B 2014-09-05
 Attributes:
 description: Ambient temperature.
 units: degC
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index d4a23ac275a..9a9f7fbf847 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -673,7 +673,7 @@ class Dataset(
 lat (loc) float64 16B 42.25 42.21
 * instrument (instrument) Self:
 lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23
 lat (x, y) float64 32B 42.25 42.21 42.63 42.59
 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08
- reference_time datetime64[ns] 8B 2014-09-05
+ reference_time datetime64[s] 8B 2014-09-05
 Dimensions without coordinates: x, y
 Data variables:
 precipitation (x, y, time) float64 96B 5.68 9.256 0.7104 ... 4.615 7.805
@@ -8927,7 +8927,7 @@ def filter_by_attrs(self, **kwargs) -> Self:
 lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23
 lat (x, y) float64 32B 42.25 42.21 42.63 42.59
 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08
- reference_time datetime64[ns] 8B 2014-09-05
+ reference_time datetime64[s] 8B 2014-09-05
 Dimensions without coordinates: x, y
 Data variables:
 temperature (x, y, time) float64 96B 29.11 18.2 22.83 ... 16.15 26.63
diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py
index ae4febd6beb..271f4cadcb3 100644
--- a/xarray/core/pdcompat.py
+++ b/xarray/core/pdcompat.py
@@ -36,10 +36,11 @@
 from __future__ import annotations

 from enum import Enum
-from typing import Literal
+from typing import Literal, cast

 import pandas as pd
-from packaging.version import Version
+
+from xarray.core.types import PDDatetimeUnitOptions


 def count_not_none(*args) -> int:
@@ -73,13 +74,25 @@ def __repr__(self) -> str:
 NoDefault = Literal[_NoDefault.no_default] # For typing following pandas


-def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
- """Return a nanosecond-precision Timestamp object.
+def _timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp:
+ # compatibility function for a pandas issue
+ # where "as_unit" is not defined for pandas.Timestamp
+ # in pandas versions < 2.2
+ # can be removed when the minimum pandas version is >= 2.2
+ unit = cast(PDDatetimeUnitOptions, unit)
+ if hasattr(date, "as_unit"):
+ date = date.as_unit(unit)
+ elif hasattr(date, "_as_unit"):
+ date = date._as_unit(unit)
+ return date


+def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
+ """Return a Timestamp object with the default precision.

- Note this function should no longer be needed after addressing GitHub issue
- #7493.
+ Xarray default is "ns". 
""" - if Version(pd.__version__) >= Version("2.0.0"): - return pd.Timestamp(*args, **kwargs).as_unit("ns") - else: - return pd.Timestamp(*args, **kwargs) + dt = pd.Timestamp(*args, **kwargs) + if dt.unit != "ns": + dt = _timestamp_as_unit(dt, "ns") + return dt diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9d56555f31b..53c1fd76f7b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -6,6 +6,7 @@ import numbers import warnings from collections.abc import Callable, Hashable, Mapping, Sequence +from datetime import datetime from functools import partial from types import EllipsisType from typing import TYPE_CHECKING, Any, NoReturn, cast @@ -78,13 +79,11 @@ from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint -NON_NANOSECOND_WARNING = ( - "Converting non-nanosecond precision {case} values to nanosecond precision. " - "This behavior can eventually be relaxed in xarray, as it is an artifact from " - "pandas which is now beginning to support non-nanosecond precision values. " - "This warning is caused by passing non-nanosecond np.datetime64 or " +NON_DEFAULTPRECISION_WARNING = ( + "Converting non-default precision {case} values to default precision. " + "This warning is caused by passing non-default np.datetime64 or " "np.timedelta64 values to the DataArray or Variable constructor; it can be " - "silenced by converting the values to nanosecond precision ahead of time." + "silenced by converting the values to default precision {res!r} ahead of time." ) @@ -205,51 +204,28 @@ def _maybe_wrap_data(data): return data -def _as_nanosecond_precision(data): - dtype = data.dtype - non_ns_datetime64 = ( - dtype.kind == "M" - and isinstance(dtype, np.dtype) - and dtype != np.dtype("datetime64[ns]") - ) - non_ns_datetime_tz_dtype = ( - isinstance(dtype, pd.DatetimeTZDtype) and dtype.unit != "ns" - ) - if non_ns_datetime64 or non_ns_datetime_tz_dtype: - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="datetime")) - if isinstance(dtype, pd.DatetimeTZDtype): - nanosecond_precision_dtype = pd.DatetimeTZDtype("ns", dtype.tz) - else: - nanosecond_precision_dtype = "datetime64[ns]" - return duck_array_ops.astype(data, nanosecond_precision_dtype) - elif dtype.kind == "m" and dtype != np.dtype("timedelta64[ns]"): - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="timedelta")) - return duck_array_ops.astype(data, "timedelta64[ns]") - else: - return data - - def _possibly_convert_objects(values): - """Convert arrays of datetime.datetime and datetime.timedelta objects into - datetime64 and timedelta64, according to the pandas convention. + """Convert object arrays into datetime64 and timedelta64 according + to the pandas convention. * datetime.datetime * datetime.timedelta * pd.Timestamp * pd.Timedelta - - For the time being, convert any non-nanosecond precision DatetimeIndex or - TimedeltaIndex objects to nanosecond precision. While pandas is relaxing this - in version 2.0.0, in xarray we will need to make sure we are ready to handle - non-nanosecond precision datetimes or timedeltas in our code before allowing - such values to pass through unchanged. Converting to nanosecond precision - through pandas.Series objects ensures that datetimes and timedeltas are - within the valid date range for ns precision, as pandas will raise an error - if they are not. 
""" as_series = pd.Series(values.ravel(), copy=False) - if as_series.dtype.kind in "mM": - as_series = _as_nanosecond_precision(as_series) + # When receiving objects which pd.Series can't resolve by its own + # we try astype-conversion to "ns"-resolution for datetimes and pd.Timestamp. + if ( + values.dtype.kind == "O" + and as_series.dtype.kind == "O" + and as_series.size > 0 + and ( + isinstance(as_series[0], datetime | pd.Timestamp) + or pd.api.types.is_datetime64_dtype(as_series[0]) + ) + ): + as_series = as_series.astype("=M8[ns]") result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: # GH8843, pandas copy-on-write mode creates read-only arrays by default @@ -260,28 +236,13 @@ def _possibly_convert_objects(values): return result -def _possibly_convert_datetime_or_timedelta_index(data): - """For the time being, convert any non-nanosecond precision DatetimeIndex or - TimedeltaIndex objects to nanosecond precision. While pandas is relaxing - this in version 2.0.0, in xarray we will need to make sure we are ready to - handle non-nanosecond precision datetimes or timedeltas in our code - before allowing such values to pass through unchanged.""" - if isinstance(data, PandasIndexingAdapter): - if isinstance(data.array, pd.DatetimeIndex | pd.TimedeltaIndex): - data = PandasIndexingAdapter(_as_nanosecond_precision(data.array)) - elif isinstance(data, pd.DatetimeIndex | pd.TimedeltaIndex): - data = _as_nanosecond_precision(data) - return data - - def as_compatible_data( data: T_DuckArray | ArrayLike, fastpath: bool = False ) -> T_DuckArray: """Prepare and wrap data to put in a Variable. - If data does not have the necessary attributes, convert it to ndarray. - - If data has dtype=datetime64, ensure that it has ns precision. If it's a - pandas.Timestamp, convert it to datetime64. + - If it's a pandas.Timestamp, convert it to datetime64. - If data is already a pandas or xarray object (other than an Index), just use the values. 
@@ -301,7 +262,6 @@ def as_compatible_data(
 return cast("T_DuckArray", data._variable._data)

 def convert_non_numpy_type(data):
- data = _possibly_convert_datetime_or_timedelta_index(data)
 return cast("T_DuckArray", _maybe_wrap_data(data))

 if isinstance(data, NON_NUMPY_SUPPORTED_ARRAY_TYPES):
@@ -361,10 +321,13 @@ def _as_array_or_item(data):
 """
 data = np.asarray(data)
 if data.ndim == 0:
- if data.dtype.kind == "M":
- data = np.datetime64(data, "ns")
- elif data.dtype.kind == "m":
- data = np.timedelta64(data, "ns")
+ kind = data.dtype.kind
+ if kind in "mM":
+ unit, _ = np.datetime_data(data.dtype)
+ if kind == "M":
+ data = np.datetime64(data, unit)
+ elif kind == "m":
+ data = np.timedelta64(data, unit)
 return data

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 1f2eedcd8f0..48a5e8c4b66 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -318,7 +318,14 @@ def create_test_data(
 f'Not enough letters for filling this dimension size ({_dims["dim3"]})'
 )
 obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]]))
- obj["time"] = ("time", pd.date_range("2000-01-01", periods=20))
+ obj["time"] = (
+ "time",
+ pd.date_range(
+ "2000-01-01",
+ periods=20,
+ unit="ns",
+ ),
+ )
 for v, dims in sorted(_vars.items()):
 data = rs.normal(size=tuple(_dims[d] for d in dims))
 obj[v] = (dims, data)
diff --git a/xarray/tests/conftest.py b/xarray/tests/conftest.py
index 97de58c4af2..c3f1ccbfe3c 100644
--- a/xarray/tests/conftest.py
+++ b/xarray/tests/conftest.py
@@ -220,3 +220,8 @@ def simple_datatree(create_test_datatree):
 Returns a DataTree.
 """
 return create_test_datatree()
+
+
+@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"])
+def time_unit(request):
+ return request.param
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index ff254225321..8983b9810f3 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -48,6 +48,7 @@
 )
 from xarray.backends.pydap_ import PydapDataStore
 from xarray.backends.scipy_ import ScipyBackendEntrypoint
+from xarray.coders import CFDatetimeCoder
 from xarray.coding.cftime_offsets import cftime_range
 from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype
 from xarray.coding.variables import SerializationWarning
@@ -611,6 +612,12 @@ def test_roundtrip_cftime_datetime_data(self) -> None:
 warnings.filterwarnings("ignore", "Unable to decode time axis")

 with self.roundtrip(expected, save_kwargs=kwargs) as actual:
+ # proleptic gregorian will be decoded into numpy datetime64,
+ # so adjust the expectations accordingly
+ if actual.t.dtype.kind == "M":
+ dtype = f"datetime64[{np.datetime_data(actual.t)[0]}]"
+ expected_decoded_t = expected_decoded_t.astype(dtype)
+ expected_decoded_t0 = expected_decoded_t0.astype(dtype)
 abs_diff = abs(actual.t.values - expected_decoded_t)
 assert (abs_diff <= np.timedelta64(1, "s")).all()
 assert (
@@ -625,7 +632,8 @@ def test_roundtrip_cftime_datetime_data(self) -> None:
 assert actual.t.encoding["calendar"] == expected_calendar

 def test_roundtrip_timedelta_data(self) -> None:
- time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]) # type: ignore[arg-type, unused-ignore]
+ # todo: check if the default unit "s" is enough
+ time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore]
 expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]})
 with self.roundtrip(expected) as actual:
 assert_identical(expected, actual)
@@ -1621,8 +1629,7 @@ def test_open_encodings(self) -> None: 
ds.variables["time"][:] = np.arange(10) + 4

 expected = Dataset()
-
- time = pd.date_range("1999-01-05", periods=10)
+ time = pd.date_range("1999-01-05", periods=10, unit="ns")
 encoding = {"units": units, "dtype": np.dtype("int32")}
 expected["time"] = ("time", time, {}, encoding)

@@ -3206,7 +3213,10 @@ def test_open_zarr_use_cftime(self) -> None:
 ds.to_zarr(store_target, **self.version_kwargs)
 ds_a = xr.open_zarr(store_target, **self.version_kwargs)
 assert_identical(ds, ds_a)
- ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs)
+ decoder = CFDatetimeCoder(use_cftime=True)
+ ds_b = xr.open_zarr(
+ store_target, decode_times=decoder, **self.version_kwargs
+ )
 assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable)

 def test_write_read_select_write(self) -> None:
@@ -5567,11 +5577,12 @@ def test_use_cftime_standard_calendar_default_in_range(calendar) -> None:

 @requires_cftime
 @requires_scipy_or_netCDF4
-@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS)
-@pytest.mark.parametrize("units_year", [1500, 2500])
+@pytest.mark.parametrize("calendar", ["standard", "gregorian"])
+@pytest.mark.parametrize("units_year", [1500, 1582])
 def test_use_cftime_standard_calendar_default_out_of_range(
 calendar, units_year
 ) -> None:
+ # todo: check whether we still need to test for two dates
 import cftime

 x = [0, 1]
@@ -5622,7 +5633,8 @@ def test_use_cftime_true(calendar, units_year) -> None:
 with create_tmp_file() as tmp_file:
 original.to_netcdf(tmp_file)
 with warnings.catch_warnings(record=True) as record:
- with open_dataset(tmp_file, use_cftime=True) as ds:
+ decoder = CFDatetimeCoder(use_cftime=True)
+ with open_dataset(tmp_file, decode_times=decoder) as ds:
 assert_identical(expected_x, ds.x)
 assert_identical(expected_time, ds.time)
 _assert_no_dates_out_of_range_warning(record)
@@ -5660,9 +5672,10 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None:

 @requires_scipy_or_netCDF4
-@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS)
-@pytest.mark.parametrize("units_year", [1500, 2500])
+@pytest.mark.parametrize("calendar", ["standard", "gregorian"])
+@pytest.mark.parametrize("units_year", [1500, 1582])
 def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -> None:
+ # todo: check whether we still need to check for two dates
 x = [0, 1]
 time = [0, 720]
 units = f"days since {units_year}-01-01"
@@ -5674,7 +5687,8 @@ def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -
 with create_tmp_file() as tmp_file:
 original.to_netcdf(tmp_file)
 with pytest.raises((OutOfBoundsDatetime, ValueError)):
- open_dataset(tmp_file, use_cftime=False)
+ decoder = CFDatetimeCoder(use_cftime=False)
+ open_dataset(tmp_file, decode_times=decoder)


 @requires_scipy_or_netCDF4
@@ -5692,7 +5706,8 @@ def test_use_cftime_false_nonstandard_calendar(calendar, units_year) -> None:
 with create_tmp_file() as tmp_file:
 original.to_netcdf(tmp_file)
 with pytest.raises((OutOfBoundsDatetime, ValueError)):
- open_dataset(tmp_file, use_cftime=False)
+ decoder = CFDatetimeCoder(use_cftime=False)
+ open_dataset(tmp_file, decode_times=decoder)


 @pytest.mark.parametrize("engine", ["netcdf4", "scipy"])
@@ -5765,7 +5780,9 @@ def test_open_fsspec() -> None:
 mm = m.get_mapper("out1.zarr")
 ds.to_zarr(mm) # old interface
 ds0 = ds.copy()
- ds0["time"] = ds.time + pd.to_timedelta("1 day")
+ # pd.to_timedelta returns ns-precision, but the example data is in second precision,
+ # so we need to fix this
+ ds0["time"] = ds.time + pd.to_timedelta("1 day").as_unit("s") 
mm = m.get_mapper("out2.zarr")
 ds0.to_zarr(mm) # old interface

diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py
index 1ab6c611aac..4db73048548 100644
--- a/xarray/tests/test_cftime_offsets.py
+++ b/xarray/tests/test_cftime_offsets.py
@@ -1480,7 +1480,7 @@ def test_date_range_like_same_calendar():
 assert src is out


-@pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+@pytest.mark.filterwarnings("ignore:Converting non-default")
 def test_date_range_like_errors():
 src = date_range("1899-02-03", periods=20, freq="D", use_cftime=False)
 src = src[np.arange(20) != 10] # Remove 1 day so the frequency is not inferable.
diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py
index d1fccc52a9a..2a0b0328542 100644
--- a/xarray/tests/test_cftimeindex.py
+++ b/xarray/tests/test_cftimeindex.py
@@ -1201,7 +1201,7 @@ def test_strftime_of_cftime_array(calendar):
 @pytest.mark.parametrize("unsafe", [False, True])
 def test_to_datetimeindex(calendar, unsafe):
 index = xr.cftime_range("2000", periods=5, calendar=calendar)
- expected = pd.date_range("2000", periods=5)
+ expected = pd.date_range("2000", periods=5, unit="us")

 if calendar in _NON_STANDARD_CALENDARS and not unsafe:
 with pytest.warns(RuntimeWarning, match="non-standard"):
@@ -1218,7 +1218,11 @@ def test_to_datetimeindex(calendar, unsafe):
 @pytest.mark.parametrize("calendar", _ALL_CALENDARS)
 def test_to_datetimeindex_out_of_range(calendar):
 index = xr.cftime_range("0001", periods=5, calendar=calendar)
- with pytest.raises(ValueError, match="0001"):
+ # todo: needs discussion, do we need this test?
+ if calendar in _NON_STANDARD_CALENDARS:
+ with pytest.warns(RuntimeWarning, match="non-standard"):
+ index.to_datetimeindex()
+ else:
 index.to_datetimeindex()


@@ -1242,7 +1246,8 @@ def test_multiindex():
 @pytest.mark.parametrize("freq", ["3663s", "33min", "2h"])
 @pytest.mark.parametrize("method", ["floor", "ceil", "round"])
 def test_rounding_methods_against_datetimeindex(freq, method):
- expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s")
+ # todo: check if setting to "us" is enough
+ expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="us")
 expected = getattr(expected, method)(freq)
 result = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777s")
 result = getattr(result, method)(freq).to_datetimeindex()
diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py
index 9a51ca40d07..decd870d77c 100644
--- a/xarray/tests/test_coding_times.py
+++ b/xarray/tests/test_coding_times.py
@@ -19,9 +19,9 @@
 date_range,
 decode_cf,
 )
+from xarray.coders import CFDatetimeCoder
 from xarray.coding.times import _STANDARD_CALENDARS as _STANDARD_CALENDARS_UNSORTED
 from xarray.coding.times import (
- CFDatetimeCoder,
 _encode_datetime_with_cftime,
 _netcdf_to_numpy_timeunit,
 _numpy_to_netcdf_timeunit,
@@ -39,6 +39,7 @@
 from xarray.coding.variables import SerializationWarning
 from xarray.conventions import _update_bounds_attributes, cf_encoder
 from xarray.core.common import contains_cftime_datetimes
+from xarray.core.types import PDDatetimeUnitOptions
 from xarray.core.utils import is_duck_dask_array
 from xarray.testing import assert_equal, assert_identical
 from xarray.tests import (
@@ -123,20 +124,21 @@ def _all_cftime_date_types():
 @pytest.mark.filterwarnings("ignore:Ambiguous reference date string")
 @pytest.mark.filterwarnings("ignore:Times can't be serialized faithfully")
 @pytest.mark.parametrize(["num_dates", "units", 
"calendar"], _CF_DATETIME_TESTS) -def test_cf_datetime(num_dates, units, calendar) -> None: +def test_cf_datetime( + num_dates, units, calendar, time_unit: PDDatetimeUnitOptions +) -> None: import cftime expected = cftime.num2date( num_dates, units, calendar, only_use_cftime_datetimes=True ) - min_y = np.ravel(np.atleast_1d(expected))[np.nanargmin(num_dates)].year - max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)].year - if min_y >= 1678 and max_y < 2262: - expected = cftime_to_nptime(expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(num_dates, units, calendar) + actual = decode_cf_datetime(num_dates, units, calendar, time_unit=time_unit) + + if actual.dtype.kind != "O": + expected = cftime_to_nptime(expected) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -146,6 +148,7 @@ def test_cf_datetime(num_dates, units, calendar) -> None: # https://github.com/Unidata/netcdf4-python/issues/355 assert (abs_diff <= np.timedelta64(1, "s")).all() encoded1, _, _ = encode_cf_datetime(actual, units, calendar) + assert_duckarray_allclose(num_dates, encoded1) if hasattr(num_dates, "ndim") and num_dates.ndim == 1 and "1000" not in units: @@ -157,7 +160,7 @@ def test_cf_datetime(num_dates, units, calendar) -> None: @requires_cftime -def test_decode_cf_datetime_overflow() -> None: +def test_decode_cf_datetime_overflow(time_unit: PDDatetimeUnitOptions) -> None: # checks for # https://github.com/pydata/pandas/issues/14068 # https://github.com/pydata/xarray/issues/975 @@ -167,13 +170,13 @@ def test_decode_cf_datetime_overflow() -> None: units = "days since 2000-01-01 00:00:00" # date after 2262 and before 1678 - days = (-117608, 95795) - expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) + days = (-117710, 95795) + expected = (datetime(1677, 9, 20), datetime(2262, 4, 12)) for i, day in enumerate(days): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - result = decode_cf_datetime(day, units) + result = decode_cf_datetime(day, units, time_unit=time_unit) assert result == expected[i] @@ -207,17 +210,22 @@ def test_decode_cf_datetime_non_iso_strings() -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: +def test_decode_standard_calendar_inside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: import cftime units = "days since 0001-01-01" - times = pd.date_range("2001-04-01-00", end="2001-04-30-23", freq="h") + times = pd.date_range( + "2001-04-01-00", end="2001-04-30-23", unit=time_unit, freq="h" + ) + # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values - expected_dtype = np.dtype("M8[ns]") - - actual = decode_cf_datetime(time, units, calendar=calendar) - assert actual.dtype == expected_dtype + # for cftime we get "us" resolution + # ns resolution is handled by cftime, too (OutOfBounds) + actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: @@ -250,7 +258,9 @@ def test_decode_non_standard_calendar_inside_timestamp_range(calendar) -> None: 
@requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_dates_outside_timestamp_range(calendar) -> None: +def test_decode_dates_outside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: from datetime import datetime import cftime @@ -262,30 +272,37 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: expected = cftime.num2date( time, units, calendar=calendar, only_use_cftime_datetimes=True ) + if calendar == "proleptic_gregorian" and time_unit != "ns": + expected = cftime_to_nptime(expected) expected_date_type = type(expected[0]) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(time, units, calendar=calendar) + actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) assert all(isinstance(value, expected_date_type) for value in actual) abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: # https://github.com/Unidata/netcdf4-python/issues/355 - assert (abs_diff <= np.timedelta64(1, "s")).all() + assert (abs_diff <= np.timedelta64(1, "us")).all() @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) +@pytest.mark.parametrize("num_time", [735368, [735368], [[735368]]]) def test_decode_standard_calendar_single_element_inside_timestamp_range( calendar, + time_unit: PDDatetimeUnitOptions, + num_time, ) -> None: units = "days since 0001-01-01" - for num_time in [735368, [735368], [[735368]]]: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(num_time, units, calendar=calendar) - assert actual.dtype == np.dtype("M8[ns]") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unable to decode time axis") + actual = decode_cf_datetime( + num_time, units, calendar=calendar, time_unit=time_unit + ) + + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") @requires_cftime @@ -323,6 +340,7 @@ def test_decode_single_element_outside_timestamp_range(calendar) -> None: @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) def test_decode_standard_calendar_multidim_time_inside_timestamp_range( calendar, + time_unit: PDDatetimeUnitOptions, ) -> None: import cftime @@ -338,8 +356,10 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( expected1 = times1.values expected2 = times2.values - actual = decode_cf_datetime(mdim_time, units, calendar=calendar) - assert actual.dtype == np.dtype("M8[ns]") + actual = decode_cf_datetime( + mdim_time, units, calendar=calendar, time_unit=time_unit + ) + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -393,7 +413,9 @@ def test_decode_nonstandard_calendar_multidim_time_inside_timestamp_range( @requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: +def test_decode_multidim_time_outside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: from datetime import datetime import cftime @@ -410,11 +432,22 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: expected1 = cftime.num2date(time1, units, calendar, only_use_cftime_datetimes=True) expected2 = cftime.num2date(time2, units, calendar, only_use_cftime_datetimes=True) + if calendar == 
"proleptic_gregorian" and time_unit != "ns": + expected1 = cftime_to_nptime(expected1) + expected2 = cftime_to_nptime(expected2) + with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(mdim_time, units, calendar=calendar) + actual = decode_cf_datetime( + mdim_time, units, calendar=calendar, time_unit=time_unit + ) - assert actual.dtype == np.dtype("O") + dtype: np.dtype + dtype = np.dtype("O") + if calendar == "proleptic_gregorian" and time_unit != "ns": + dtype = np.dtype(f"=M8[{time_unit}]") + + assert actual.dtype == dtype abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -503,13 +536,13 @@ def test_cf_datetime_nan(num_dates, units, expected_list) -> None: @requires_cftime -def test_decoded_cf_datetime_array_2d() -> None: +def test_decoded_cf_datetime_array_2d(time_unit: PDDatetimeUnitOptions) -> None: # regression test for GH1229 variable = Variable( ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} ) - result = CFDatetimeCoder().decode(variable) - assert result.dtype == "datetime64[ns]" + result = CFDatetimeCoder(time_unit=time_unit).decode(variable) + assert result.dtype == f"datetime64[{time_unit}]" expected = pd.date_range("2000-01-01", periods=4).values.reshape(2, 2) assert_array_equal(np.asarray(result), expected) @@ -591,8 +624,9 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected) -> None: ], ) def test_cf_timedelta(timedeltas, units, numbers) -> None: + # todo: check, if this test is OK if timedeltas == "NaT": - timedeltas = np.timedelta64("NaT", "ns") + timedeltas = np.timedelta64("NaT", "s") else: timedeltas = to_timedelta_unboxed(timedeltas) numbers = np.array(numbers) @@ -608,9 +642,10 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: assert_array_equal(expected, actual) assert expected.dtype == actual.dtype - expected = np.timedelta64("NaT", "ns") + expected = np.timedelta64("NaT", "s") actual = decode_cf_timedelta(np.array(np.nan), "days") assert_array_equal(expected, actual) + assert expected.dtype == actual.dtype def test_cf_timedelta_2d() -> None: @@ -628,10 +663,10 @@ def test_cf_timedelta_2d() -> None: @pytest.mark.parametrize( ["deltas", "expected"], [ - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] + (pd.to_timedelta(["1 day", "2 days"]), "days"), + (pd.to_timedelta(["1h", "1 day 1 hour"]), "hours"), + (pd.to_timedelta(["1m", "2m", np.nan]), "minutes"), + (pd.to_timedelta(["1m3s", "1m4s"]), "seconds"), ], ) def test_infer_timedelta_units(deltas, expected) -> None: @@ -656,7 +691,7 @@ def test_format_cftime_datetime(date_args, expected) -> None: @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_cf(calendar) -> None: +def test_decode_cf(calendar, time_unit: PDDatetimeUnitOptions) -> None: days = [1.0, 2.0, 3.0] # TODO: GH5690 — do we want to allow this type for `coords`? 
da = DataArray(days, coords=[days], dims=["time"], name="test") @@ -670,15 +705,15 @@ def test_decode_cf(calendar) -> None: with pytest.raises(ValueError): ds = decode_cf(ds) else: - ds = decode_cf(ds) + ds = decode_cf(ds, decode_times=CFDatetimeCoder(time_unit=time_unit)) if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype("M8[ns]") + assert ds.test.dtype == np.dtype(f"=M8[{time_unit}]") -def test_decode_cf_time_bounds() -> None: +def test_decode_cf_time_bounds(time_unit: PDDatetimeUnitOptions) -> None: da = DataArray( np.arange(6, dtype="int64").reshape((3, 2)), coords={"time": [1, 2, 3]}, @@ -699,8 +734,8 @@ def test_decode_cf_time_bounds() -> None: "units": "days since 2001-01", "calendar": "standard", } - dsc = decode_cf(ds) - assert dsc.time_bnds.dtype == np.dtype("M8[ns]") + dsc = decode_cf(ds, decode_times=CFDatetimeCoder(time_unit=time_unit)) + assert dsc.time_bnds.dtype == np.dtype(f"=M8[{time_unit}]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -917,8 +952,8 @@ def test_use_cftime_default_standard_calendar_in_range(calendar) -> None: @requires_cftime -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1580]) def test_use_cftime_default_standard_calendar_out_of_range( calendar, units_year ) -> None: @@ -938,7 +973,9 @@ def test_use_cftime_default_standard_calendar_out_of_range( @requires_cftime @pytest.mark.parametrize("calendar", _NON_STANDARD_CALENDARS) @pytest.mark.parametrize("units_year", [1500, 2000, 2500]) -def test_use_cftime_default_non_standard_calendar(calendar, units_year) -> None: +def test_use_cftime_default_non_standard_calendar( + calendar, units_year, time_unit +) -> None: from cftime import num2date numerical_dates = [0, 1] @@ -947,9 +984,18 @@ def test_use_cftime_default_non_standard_calendar(calendar, units_year) -> None: numerical_dates, units, calendar, only_use_cftime_datetimes=True ) - with assert_no_warnings(): - result = decode_cf_datetime(numerical_dates, units, calendar) - np.testing.assert_array_equal(result, expected) + if time_unit == "ns" and units_year == 2500: + with pytest.warns(SerializationWarning, match="Unable to decode time axis"): + result = decode_cf_datetime( + numerical_dates, units, calendar, time_unit=time_unit + ) + else: + with assert_no_warnings(): + result = decode_cf_datetime( + numerical_dates, units, calendar, time_unit=time_unit + ) + + np.testing.assert_array_equal(result, expected) @requires_cftime @@ -980,8 +1026,8 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1582]) def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -> None: numerical_dates = [0, 1] units = f"days since {units_year}-01-01" @@ -1052,14 +1098,18 @@ def test_encode_cf_datetime_defaults_to_correct_dtype( @pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys()) -def test_encode_decode_roundtrip_datetime64(freq) -> None: +def test_encode_decode_roundtrip_datetime64( + freq, time_unit: PDDatetimeUnitOptions +) -> None: # See GH 4045. 
Prior to GH 4684 this test would fail for frequencies of # "s", "ms", "us", and "ns". initial_time = pd.date_range("1678-01-01", periods=1) times = initial_time.append(pd.date_range("1968", periods=2, freq=freq)) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded) + decoded = conventions.decode_cf_variable( + "time", encoded, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_equal(variable, decoded) @@ -1072,7 +1122,8 @@ def test_encode_decode_roundtrip_cftime(freq) -> None: ) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded, use_cftime=True) + decoder = CFDatetimeCoder(use_cftime=True) + decoded = conventions.decode_cf_variable("time", encoded, decode_times=decoder) assert_equal(variable, decoded) @@ -1099,13 +1150,17 @@ def test__encode_datetime_with_cftime() -> None: @pytest.mark.parametrize("calendar", ["gregorian", "Gregorian", "GREGORIAN"]) -def test_decode_encode_roundtrip_with_non_lowercase_letters(calendar) -> None: +def test_decode_encode_roundtrip_with_non_lowercase_letters( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: # See GH 5093. times = [0, 1] units = "days since 2000-01-01" attrs = {"calendar": calendar, "units": units} variable = Variable(["time"], times, attrs) - decoded = conventions.decode_cf_variable("time", variable) + decoded = conventions.decode_cf_variable( + "time", variable, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) encoded = conventions.encode_cf_variable(decoded) # Previously this would erroneously be an array of cftime.datetime @@ -1182,7 +1237,7 @@ def test_decode_0size_datetime(use_cftime): if use_cftime and not has_cftime: pytest.skip() - dtype = object if use_cftime else "M8[ns]" + dtype = object if use_cftime else "=M8[ns]" expected = np.array([], dtype=dtype) actual = decode_cf_datetime( np.zeros(shape=0, dtype=np.int64), @@ -1209,6 +1264,47 @@ def test_decode_float_datetime(): np.testing.assert_equal(actual, expected) +@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) +def test_decode_float_datetime_with_decimals( + time_unit: PDDatetimeUnitOptions, +) -> None: + # test resolution enhancement for floats + values = np.array([0, 0.125, 0.25, 0.375, 0.75, 1.0], dtype="float32") + expected = np.array( + [ + "2000-01-01T00:00:00.000", + "2000-01-01T00:00:00.125", + "2000-01-01T00:00:00.250", + "2000-01-01T00:00:00.375", + "2000-01-01T00:00:00.750", + "2000-01-01T00:00:01.000", + ], + dtype=f"=M8[{time_unit}]", + ) + + units = "seconds since 2000-01-01" + calendar = "standard" + actual = decode_cf_datetime(values, units, calendar, time_unit=time_unit) + assert actual.dtype == expected.dtype + np.testing.assert_equal(actual, expected) + + +@pytest.mark.parametrize( + "time_unit, num", [("s", 0.123), ("ms", 0.1234), ("us", 0.1234567)] +) +def test_coding_float_datetime_warning( + time_unit: PDDatetimeUnitOptions, num: float +) -> None: + units = "seconds since 2000-01-01" + calendar = "standard" + values = np.array([num], dtype="float32") + with pytest.warns( + SerializationWarning, + match=f"Can't decode floating point datetime to {time_unit!r}", + ): + decode_cf_datetime(values, units, calendar, time_unit=time_unit) + + @requires_cftime def test_scalar_unit() -> None: # test that a scalar units (often NaN when using to_netcdf) does not raise an error @@ -1255,6 +1351,7 @@ def test_roundtrip_datetime64_nanosecond_precision( 
dtype: np.typing.DTypeLike, fill_value: int | float | None, use_encoding: bool, + time_unit: PDDatetimeUnitOptions, ) -> None: # test for GH7817 time = np.datetime64(timestr, timeunit) @@ -1266,7 +1363,7 @@ def test_roundtrip_datetime64_nanosecond_precision( encoding = {} var = Variable(["time"], times, encoding=encoding) - assert var.dtype == np.dtype("=M8[ns]") + assert var.dtype == np.dtype(f"=M8[{timeunit}]") encoded_var = conventions.encode_cf_variable(var) assert ( @@ -1275,9 +1372,16 @@ def test_roundtrip_datetime64_nanosecond_precision( ) assert encoded_var.attrs["calendar"] == "proleptic_gregorian" assert encoded_var.data.dtype == dtype + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) - decoded_var = conventions.decode_cf_variable("foo", encoded_var) - assert decoded_var.dtype == np.dtype("=M8[ns]") + result_unit = ( + timeunit + if np.timedelta64(1, timeunit) <= np.timedelta64(1, time_unit) + else time_unit + ) + assert decoded_var.dtype == np.dtype(f"=M8[{result_unit}]") assert ( decoded_var.encoding["units"] == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" @@ -1287,7 +1391,9 @@ def test_roundtrip_datetime64_nanosecond_precision( assert_identical(var, decoded_var) -def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: +def test_roundtrip_datetime64_nanosecond_precision_warning( + time_unit: PDDatetimeUnitOptions, +) -> None: # test warning if times can't be serialized faithfully times = [ np.datetime64("1970-01-01T00:01:00", "ns"), @@ -1319,7 +1425,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == new_units assert encoded_var.attrs["_FillValue"] == 20 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) encoding = dict(dtype="float64", _FillValue=20, units=units) @@ -1331,7 +1439,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == units assert encoded_var.attrs["_FillValue"] == 20.0 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) encoding = dict(dtype="int64", _FillValue=20, units=new_units) @@ -1343,7 +1453,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == new_units assert encoded_var.attrs["_FillValue"] == 20 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) @@ -1352,7 +1464,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: [(np.int64, 20), (np.int64, np.iinfo(np.int64).min), (np.float64, 1e30)], ) def test_roundtrip_timedelta64_nanosecond_precision( - dtype: np.typing.DTypeLike, fill_value: int | float + dtype: np.typing.DTypeLike, + fill_value: int | float, + time_unit: PDDatetimeUnitOptions, ) -> None: # test for GH7942 one_day = np.timedelta64(1, "ns") @@ -1365,7 +1479,9 @@ def test_roundtrip_timedelta64_nanosecond_precision( var = Variable(["time"], timedelta_values, encoding=encoding) encoded_var = conventions.encode_cf_variable(var) - 
decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_identical(var, decoded_var) @@ -1560,6 +1676,7 @@ def test_encode_cf_datetime_casting_value_error(use_cftime, use_dask) -> None: with pytest.warns(UserWarning, match="Times can't be serialized"): encoded = conventions.encode_cf_variable(variable) assert encoded.attrs["units"] == "hours since 2000-01-01" + decoded = conventions.decode_cf_variable("name", encoded) assert_equal(variable, decoded) else: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index c3caab4e125..c5b817d3401 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -317,7 +317,7 @@ def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, expected) -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_concat_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 495d760c534..2886691ce32 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -18,6 +18,7 @@ ) from xarray.backends.common import WritableCFDataStore from xarray.backends.memory import InMemoryDataStore +from xarray.coders import CFDatetimeCoder from xarray.conventions import decode_cf from xarray.testing import assert_identical from xarray.tests import ( @@ -213,7 +214,7 @@ def test_deterministic_coords_encoding(self) -> None: vars, attrs = conventions.encode_dataset_coordinates(ds) assert attrs["coordinates"] == "bar baz" - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_attrs(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -231,7 +232,7 @@ def test_emit_coordinates_attribute_in_attrs(self) -> None: assert enc["b"].attrs.get("coordinates") == "t" assert "coordinates" not in enc["b"].encoding - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_encoding(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -437,7 +438,8 @@ def test_invalid_timedelta_units_do_not_decode(self, decode_times) -> None: assert_identical(expected, decode_cf(ds, decode_times=decode_times)) @requires_cftime - def test_dataset_repr_with_netcdf4_datetimes(self) -> None: + @pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) + def test_dataset_repr_with_netcdf4_datetimes(self, time_unit) -> None: # regression test for #347 attrs = {"units": "days since 0001-01-01", "calendar": "noleap"} with warnings.catch_warnings(): @@ -446,8 +448,11 @@ def test_dataset_repr_with_netcdf4_datetimes(self) -> None: assert "(time) object" in repr(ds) attrs = {"units": "days since 1900-01-01"} - ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)})) - assert "(time) datetime64[ns]" in repr(ds) + ds = decode_cf( + Dataset({"time": ("time", [0, 1], attrs)}), + decode_times=CFDatetimeCoder(time_unit=time_unit), + ) + assert f"(time) datetime64[{time_unit}]" in repr(ds) @requires_cftime def test_decode_cf_datetime_transition_to_invalid(self) -> None: @@ -506,7 +511,8 @@ def test_decode_dask_times(self) -> 
None:
             conventions.decode_cf(original).chunk(),
         )
 
-    def test_decode_cf_time_kwargs(self) -> None:
+    @pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"])
+    def test_decode_cf_time_kwargs(self, time_unit) -> None:
         ds = Dataset.from_dict(
             {
                 "coords": {
@@ -528,15 +534,21 @@ def test_decode_cf_time_kwargs(self) -> None:
             }
         )
 
-        dsc = conventions.decode_cf(ds)
+        dsc = conventions.decode_cf(
+            ds, decode_times=CFDatetimeCoder(time_unit=time_unit)
+        )
         assert dsc.timedelta.dtype == np.dtype("m8[ns]")
-        assert dsc.time.dtype == np.dtype("M8[ns]")
+        assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]")
         dsc = conventions.decode_cf(ds, decode_times=False)
         assert dsc.timedelta.dtype == np.dtype("int64")
         assert dsc.time.dtype == np.dtype("int64")
-        dsc = conventions.decode_cf(ds, decode_times=True, decode_timedelta=False)
+        dsc = conventions.decode_cf(
+            ds,
+            decode_times=CFDatetimeCoder(time_unit=time_unit),
+            decode_timedelta=False,
+        )
         assert dsc.timedelta.dtype == np.dtype("int64")
-        assert dsc.time.dtype == np.dtype("M8[ns]")
+        assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]")
         dsc = conventions.decode_cf(ds, decode_times=False, decode_timedelta=True)
         assert dsc.timedelta.dtype == np.dtype("m8[ns]")
         assert dsc.time.dtype == np.dtype("int64")
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index ea5186e59d0..fb5c5f8c25d 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -30,7 +30,7 @@
     broadcast,
     set_options,
 )
-from xarray.coding.times import CFDatetimeCoder
+from xarray.coders import CFDatetimeCoder
 from xarray.core import dtypes
 from xarray.core.common import full_like
 from xarray.core.coordinates import Coordinates
@@ -3661,7 +3661,7 @@ def test_to_and_from_dict(
         actual_no_data = da.to_dict(data=False, encoding=encoding)
         assert expected_no_data == actual_no_data
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_to_and_from_dict_with_time_dim(self) -> None:
         x = np.random.randn(10, 3)
         t = pd.date_range("20130101", periods=10)
@@ -3670,7 +3670,7 @@ def test_to_and_from_dict_with_time_dim(self) -> None:
         roundtripped = DataArray.from_dict(da.to_dict())
         assert_identical(da, roundtripped)
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_to_and_from_dict_with_nan_nat(self) -> None:
         y = np.random.randn(10, 3)
         y[2] = np.nan
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index d92b26fcee5..257c61ae60f 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -105,24 +105,25 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]:
     lon = [0, 1, 2]
     nt1 = 3
     nt2 = 2
-    time1 = pd.date_range("2000-01-01", periods=nt1)
-    time2 = pd.date_range("2000-02-01", periods=nt2)
+    # todo: check if all changes below are correct
+    time1 = pd.date_range("2000-01-01", periods=nt1).as_unit("ns")
+    time2 = pd.date_range("2000-02-01", periods=nt2).as_unit("ns")
 
     string_var = np.array(["a", "bc", "def"], dtype=object)
     string_var_to_append = np.array(["asdf", "asdfg"], dtype=object)
     string_var_fixed_length = np.array(["aa", "bb", "cc"], dtype="|S2")
     string_var_fixed_length_to_append = np.array(["dd", "ee"], dtype="|S2")
     unicode_var = np.array(["áó", "áó", "áó"])
     datetime_var = np.array(
-        ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[s]"
+        ["2019-01-01", "2019-01-02", "2019-01-03"], 
dtype="datetime64[ns]" ) datetime_var_to_append = np.array( - ["2019-01-04", "2019-01-05"], dtype="datetime64[s]" + ["2019-01-04", "2019-01-05"], dtype="datetime64[ns]" ) bool_var = np.array([True, False, True], dtype=bool) bool_var_to_append = np.array([False, True], dtype=bool) with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Converting non-nanosecond") + warnings.filterwarnings("ignore", "Converting non-default") ds = xr.Dataset( data_vars={ "da": xr.DataArray( @@ -289,7 +290,7 @@ def test_repr(self) -> None: Coordinates: * dim2 (dim2) float64 72B 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) {} 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' - * time (time) datetime64[ns] 160B 2000-01-01 2000-01-02 ... 2000-01-20 + * time (time) datetime64[{}] 160B 2000-01-01 2000-01-02 ... 2000-01-20 numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 Data variables: @@ -297,7 +298,10 @@ def test_repr(self) -> None: var2 (dim1, dim2) float64 576B 0.953 1.52 1.704 ... 0.1347 -0.6423 var3 (dim3, dim1) float64 640B 0.4107 0.9941 0.1665 ... 0.716 1.555 Attributes: - foo: bar""".format(data["dim3"].dtype) + foo: bar""".format( + data["dim3"].dtype, + "ns", + ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) @@ -496,7 +500,7 @@ def test_constructor_1d(self) -> None: actual = Dataset({"x": [5, 6, 7, 8, 9]}) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_constructor_0d(self) -> None: expected = Dataset({"x": ([], 1)}) for arg in [1, np.array(1), expected["x"]]: @@ -3546,9 +3550,9 @@ def test_expand_dims_create_index_from_iterable(self): def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 - with pytest.warns(UserWarning, match="non-nanosecond precision"): - ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "s")]}) - assert ds.time.dtype == np.dtype("datetime64[ns]") + # todo: test still needed? 
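+        # note: with non-nanosecond support the second ("s") resolution of the
+        # input is preserved instead of being cast to "ns", so the former
+        # "non-nanosecond precision" UserWarning is no longer expected here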
+ ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "s")]}) + assert ds.time.dtype == np.dtype("datetime64[s]") def test_set_index(self) -> None: expected = create_test_multiindex() @@ -6067,7 +6071,7 @@ def test_dataset_math_auto_align(self) -> None: expected = ds + other.reindex_like(ds) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_dataset_math_errors(self) -> None: ds = self.make_example_math_dataset() @@ -7207,7 +7211,7 @@ def test_differentiate(dask, edge_order) -> None: da.differentiate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_datetime(dask) -> None: rs = np.random.default_rng(42) @@ -7402,7 +7406,7 @@ def test_cumulative_integrate(dask) -> None: da.cumulative_integrate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("which_datetime", ["np", "cftime"]) def test_trapezoid_datetime(dask, which_datetime) -> None: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index e4383dd58a9..512b3e8523d 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -636,7 +636,7 @@ def test_groupby_repr_datetime(obj) -> None: @pytest.mark.filterwarnings("ignore:No index created for dimension id:UserWarning") -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning") @pytest.mark.parametrize("shuffle", [True, False]) @pytest.mark.parametrize( @@ -2198,9 +2198,9 @@ def test_upsample_interpolate(self) -> None: assert_allclose(expected, actual, rtol=1e-16) @requires_scipy - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_upsample_interpolate_bug_2197(self) -> None: - dates = pd.date_range("2007-02-01", "2007-03-01", freq="D") + dates = pd.date_range("2007-02-01", "2007-03-01", freq="D", unit="s") da = xr.DataArray(np.arange(len(dates)), [("time", dates)]) result = da.resample(time="ME").interpolate("linear") expected_times = np.array( diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index 86532a26f65..108b6dc13c2 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -711,7 +711,7 @@ def test_interp_like() -> None: pytest.param("2000-01-01T12:00", 0.5, marks=pytest.mark.xfail), ], ) -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime(x_new, expected) -> None: da = xr.DataArray( np.arange(24), diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 1e07459061f..97d0441cf81 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -2962,7 +2962,7 @@ def test_datetime_plot1d(self) -> None: # mpl.dates.AutoDateLocator passes and no other subclasses: assert type(ax.xaxis.get_major_locator()) is mpl.dates.AutoDateLocator - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime_plot2d(self) -> None: # Test that 
matplotlib-native datetime works:
         da = DataArray(
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index f4f353eda7d..e3c55081d6a 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -37,7 +37,6 @@
     assert_identical,
     assert_no_warnings,
     has_dask_ge_2024_11_0,
-    has_pandas_3,
     raise_if_dask_computes,
     requires_bottleneck,
     requires_cupy,
@@ -201,24 +200,25 @@ def test_index_0d_string(self):
         x = self.cls(["x"], [value])
         self._assertIndexedLikeNDArray(x, value, dtype)
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_index_0d_datetime(self):
         d = datetime(2000, 1, 1)
         x = self.cls(["x"], [d])
         self._assertIndexedLikeNDArray(x, np.datetime64(d))
 
         x = self.cls(["x"], [np.datetime64(d)])
-        self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]")
+        self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[us]")
 
         x = self.cls(["x"], pd.DatetimeIndex([d]))
         self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]")
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_index_0d_timedelta64(self):
         td = timedelta(hours=1)
-
-        x = self.cls(["x"], [np.timedelta64(td)])
-        self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]")
+        # todo: discussion needed
+        td64 = np.timedelta64(td, "ns")
+        x = self.cls(["x"], [td64])
+        self._assertIndexedLikeNDArray(x, td64, np.dtype("timedelta64[ns]"))
 
         x = self.cls(["x"], pd.to_timedelta([td]))
         self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]")
@@ -254,7 +254,7 @@ def test_0d_object_array_with_list(self):
         assert_array_equal(x[0].data, listarray.squeeze())
         assert_array_equal(x.squeeze().data, listarray.squeeze())
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_index_and_concat_datetime(self):
         # regression test for #125
         date_range = pd.date_range("2011-09-01", periods=10)
@@ -275,52 +275,60 @@ def test_0d_time_data(self):
         expected = np.datetime64("2000-01-01", "ns")
         assert x[0].values == expected
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_datetime64_conversion(self):
-        times = pd.date_range("2000-01-01", periods=3)
-        for values in [
-            times,
-            times.values,
-            times.values.astype("datetime64[s]"),
-            times.to_pydatetime(),
-        ]:
-            v = self.cls(["t"], values)
-            assert v.dtype == np.dtype("datetime64[ns]")
-            assert_array_equal(v.values, times.values)
-            assert v.values.dtype == np.dtype("datetime64[ns]")
-
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_timedelta64_conversion(self):
-        times = pd.timedelta_range(start=0, periods=3)
-        for values in [
-            times,
-            times.values,
-            times.values.astype("timedelta64[s]"),
-            times.to_pytimedelta(),
-        ]:
-            v = self.cls(["t"], values)
-            assert v.dtype == np.dtype("timedelta64[ns]")
-            assert_array_equal(v.values, times.values)
-            assert v.values.dtype == np.dtype("timedelta64[ns]")
+    dt64_data = pd.date_range("2000-01-01", periods=3)
+
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (dt64_data, "ns"),
+            (dt64_data.values, "ns"),
+            (dt64_data.values.astype("datetime64[s]"), "s"),
+            (dt64_data.to_pydatetime(), "ns"),
+        ],
+    )
+    def test_datetime64_conversion(self, values, unit):
+        # todo: check if this test is OK
+        v = self.cls(["t"], values)
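+        # note: the resulting variable is expected to adopt the resolution of
+        # its input: plain numpy datetime64 arrays keep their own unit (e.g.
+        # "s"), while pandas indexes and datetime.datetime objects convert at
+        # "ns" (see the parametrization above)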
+        assert v.dtype == np.dtype(f"datetime64[{unit}]")
+        assert_array_equal(v.values, self.dt64_data.values)
+        assert v.values.dtype == np.dtype(f"datetime64[{unit}]")
+
+    td64_data = pd.timedelta_range(start=0, periods=3)
+
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (td64_data, "ns"),
+            (td64_data.values, "ns"),
+            (td64_data.values.astype("timedelta64[s]"), "s"),
+            (td64_data.to_pytimedelta(), "ns"),
+        ],
+    )
+    def test_timedelta64_conversion(self, values, unit):
+        # todo: check if this test is OK
+        v = self.cls(["t"], values)
+        assert v.dtype == np.dtype(f"timedelta64[{unit}]")
+        assert_array_equal(v.values, self.td64_data.values)
+        assert v.values.dtype == np.dtype(f"timedelta64[{unit}]")
 
     def test_object_conversion(self):
         data = np.arange(5).astype(str).astype(object)
         actual = self.cls("x", data)
         assert actual.dtype == data.dtype
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_datetime64_valid_range(self):
+        # todo: test still needed?
         data = np.datetime64("1250-01-01", "us")
-        pderror = pd.errors.OutOfBoundsDatetime
-        with pytest.raises(pderror, match=r"Out of bounds nanosecond"):
-            self.cls(["t"], [data])
+        self.cls(["t"], [data])
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_timedelta64_valid_range(self):
+        # todo: test still needed?
         data = np.timedelta64("200000", "D")
-        pderror = pd.errors.OutOfBoundsTimedelta
-        with pytest.raises(pderror, match=r"Cannot convert"):
-            self.cls(["t"], [data])
+        self.cls(["t"], [data])
 
     def test_pandas_data(self):
         v = self.cls(["x"], pd.Series([0, 1, 2], index=[3, 2, 1]))
@@ -1073,31 +1081,38 @@ def test_numpy_same_methods(self):
         v = IndexVariable("x", np.arange(5))
         assert 2 == v.searchsorted(2)
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_datetime64_conversion_scalar(self):
-        expected = np.datetime64("2000-01-01", "ns")
-        for values in [
-            np.datetime64("2000-01-01"),
-            pd.Timestamp("2000-01-01T00"),
-            datetime(2000, 1, 1),
-        ]:
-            v = Variable([], values)
-            assert v.dtype == np.dtype("datetime64[ns]")
-            assert v.values == expected
-            assert v.values.dtype == np.dtype("datetime64[ns]")
-
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_timedelta64_conversion_scalar(self):
-        expected = np.timedelta64(24 * 60 * 60 * 10**9, "ns")
-        for values in [
-            np.timedelta64(1, "D"),
-            pd.Timedelta("1 day"),
-            timedelta(days=1),
-        ]:
-            v = Variable([], values)
-            assert v.dtype == np.dtype("timedelta64[ns]")
-            assert v.values == expected
-            assert v.values.dtype == np.dtype("timedelta64[ns]")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (np.datetime64("2000-01-01"), "s"),
+            (pd.Timestamp("2000-01-01T00"), "ns"),
+            (datetime(2000, 1, 1), "ns"),
+        ],
+    )
+    def test_datetime64_conversion_scalar(self, values, unit):
+        # todo: check if this test is OK
+        v = Variable([], values)
+        assert v.dtype == np.dtype(f"datetime64[{unit}]")
+        assert np.issubdtype(v.values, "datetime64")
+        assert v.values.dtype == np.dtype(f"datetime64[{unit}]")
+
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
+    @pytest.mark.parametrize(
+        "values, unit",
+        [
+            (np.timedelta64(1, "D"), "s"),
+            (pd.Timedelta("1 day"), "ns"),
+            (timedelta(days=1), "ns"),
+        ],
+    )
+    def test_timedelta64_conversion_scalar(self, values, unit):
+        # todo: discussion needed
+        # todo: check if this test is OK
+        v = Variable([], values)
+        assert v.dtype == np.dtype(f"timedelta64[{unit}]")
+        assert np.issubdtype(v.values, "timedelta64")
+        assert v.values.dtype == np.dtype(f"timedelta64[{unit}]")
 
     def test_0d_str(self):
         v = Variable([], "foo")
@@ -1109,16 +1124,20 @@ def test_0d_str(self):
         assert v.values == "foo".encode("ascii")
 
     def test_0d_datetime(self):
+        # todo: check if this test is OK
        v = Variable([], pd.Timestamp("2000-01-01"))
         assert v.dtype == np.dtype("datetime64[ns]")
-        assert v.values == np.datetime64("2000-01-01", "ns")
+        assert v.values == np.datetime64("2000-01-01", "s")
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
-    def test_0d_timedelta(self):
-        for td in [pd.to_timedelta("1s"), np.timedelta64(1, "s")]:
-            v = Variable([], td)
-            assert v.dtype == np.dtype("timedelta64[ns]")
-            assert v.values == np.timedelta64(10**9, "ns")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
+    @pytest.mark.parametrize(
+        "values, unit", [(pd.to_timedelta("1s"), "ns"), (np.timedelta64(1, "s"), "s")]
+    )
+    def test_0d_timedelta(self, values, unit):
+        # todo: check if this test is OK
+        v = Variable([], values)
+        assert v.dtype == np.dtype(f"timedelta64[{unit}]")
+        assert v.values == np.timedelta64(10**9, "ns")
 
     def test_equals_and_identical(self):
         d = np.random.rand(10, 3)
@@ -1558,7 +1577,7 @@ def test_transpose(self):
             v.transpose(..., "not_a_dim", missing_dims="warn")
         assert_identical(expected_ell, actual)
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_transpose_0d(self):
         for value in [
             3.5,
@@ -2634,19 +2653,20 @@ def test_masked_array(self):
             assert_array_equal(expected, actual)
         assert actual.dtype == expected.dtype
 
-    @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
+    @pytest.mark.filterwarnings("ignore:Converting non-default")
     def test_datetime(self):
+        # todo: check if this test is OK
         expected = np.datetime64("2000-01-01")
         actual = as_compatible_data(expected)
         assert expected == actual
         assert np.ndarray is type(actual)
-        assert np.dtype("datetime64[ns]") == actual.dtype
+        assert np.dtype("datetime64[s]") == actual.dtype
 
         expected = np.array([np.datetime64("2000-01-01")])
         actual = as_compatible_data(expected)
         assert np.asarray(expected) == actual
         assert np.ndarray is type(actual)
-        assert np.dtype("datetime64[ns]") == actual.dtype
+        assert np.dtype("datetime64[s]") == actual.dtype
 
         expected = np.array([np.datetime64("2000-01-01", "ns")])
         actual = as_compatible_data(expected)
@@ -2655,13 +2675,14 @@ def test_datetime(self):
         assert np.dtype("datetime64[ns]") == actual.dtype
         assert expected is source_ndarray(np.asarray(actual))
 
-        expected = np.datetime64("2000-01-01", "ns")
+        expected = np.datetime64("2000-01-01", "us")
         actual = as_compatible_data(datetime(2000, 1, 1))
         assert np.asarray(expected) == actual
         assert np.ndarray is type(actual)
         assert np.dtype("datetime64[ns]") == actual.dtype
 
     def test_tz_datetime(self) -> None:
+        # todo: check if this test is OK
         tz = pytz.timezone("America/New_York")
         times_ns = pd.date_range("2000", periods=1, tz=tz)
 
@@ -2670,7 +2691,7 @@ def test_tz_datetime(self) -> None:
             warnings.simplefilter("ignore")
             actual: T_DuckArray = as_compatible_data(times_s)
         assert actual.array == times_s
-        assert actual.array.dtype == pd.DatetimeTZDtype("ns", tz)
+        assert actual.array.dtype == pd.DatetimeTZDtype("s", tz)  # type: ignore[arg-type]
 
         series = pd.Series(times_s)
         with warnings.catch_warnings():
@@ -2678,7 +2699,7 @@ def test_tz_datetime(self) -> None:
             actual2: T_DuckArray = as_compatible_data(series)
         np.testing.assert_array_equal(actual2, np.asarray(series.values))
-        assert actual2.dtype == np.dtype("datetime64[ns]")
+        assert actual2.dtype == np.dtype("datetime64[s]")
 
     def test_full_like(self) -> None:
         # For more thorough tests, see test_variable.py
@@ -2974,37 +2995,32 @@ def test_from_pint_wrapping_dask(self, Var):
 
 
 @pytest.mark.parametrize(
-    ("values", "warns"),
+    ("values", "unit"),
     [
-        (np.datetime64("2000-01-01", "ns"), False),
-        (np.datetime64("2000-01-01", "s"), True),
-        (np.array([np.datetime64("2000-01-01", "ns")]), False),
-        (np.array([np.datetime64("2000-01-01", "s")]), True),
-        (pd.date_range("2000", periods=1), False),
-        (datetime(2000, 1, 1), has_pandas_3),
-        (np.array([datetime(2000, 1, 1)]), has_pandas_3),
-        (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), False),
+        (np.datetime64("2000-01-01", "ns"), "ns"),
+        (np.datetime64("2000-01-01", "s"), "s"),
+        (np.array([np.datetime64("2000-01-01", "ns")]), "ns"),
+        (np.array([np.datetime64("2000-01-01", "s")]), "s"),
+        (pd.date_range("2000", periods=1), "ns"),
+        (datetime(2000, 1, 1), "ns"),
+        (np.array([datetime(2000, 1, 1)]), "ns"),
+        (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), "ns"),
         (
             pd.Series(
                 pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York"))
             ),
-            False,
+            "ns",
         ),
     ],
     ids=lambda x: f"{x}",
 )
-def test_datetime_conversion_warning(values, warns) -> None:
+def test_datetime_conversion_warning(values, unit) -> None:
+    # todo: needs discussion
+    # todo: check if this test is OK
     dims = ["time"] if isinstance(values, np.ndarray | pd.Index | pd.Series) else []
-    if warns:
-        with pytest.warns(UserWarning, match="non-nanosecond precision datetime"):
-            var = Variable(dims, values)
-    else:
-        with warnings.catch_warnings():
-            warnings.simplefilter("error")
-            var = Variable(dims, values)
-
+    var = Variable(dims, values)
     if var.dtype.kind == "M":
-        assert var.dtype == np.dtype("datetime64[ns]")
+        assert var.dtype == np.dtype(f"datetime64[{unit}]")
     else:
         # The only case where a non-datetime64 dtype can occur currently is in
         # the case that the variable is backed by a timezone-aware
@@ -3042,65 +3058,34 @@ def test_datetime_conversion_warning(values, warns) -> None:
 def test_pandas_two_only_datetime_conversion_warnings(
     data: pd.DatetimeIndex | pd.Series, dtype: str | pd.DatetimeTZDtype
 ) -> None:
-    with pytest.warns(UserWarning, match="non-nanosecond precision datetime"):
-        var = Variable(["time"], data.astype(dtype))  # type: ignore[arg-type]
+    # todo: check if this test is OK
+    var = Variable(["time"], data.astype(dtype))  # type: ignore[arg-type]
 
     if var.dtype.kind == "M":
-        assert var.dtype == np.dtype("datetime64[ns]")
+        assert var.dtype == np.dtype("datetime64[s]")
     else:
         # The only case where a non-datetime64 dtype can occur currently is in
         # the case that the variable is backed by a timezone-aware
         # DatetimeIndex, and thus is hidden within the PandasIndexingAdapter class.
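+    # note: for the tz-aware case the "s" resolution of the underlying
+    # DatetimeTZDtype is expected to be preserved as well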
assert isinstance(var._data, PandasIndexingAdapter) - assert var._data.array.dtype == pd.DatetimeTZDtype("ns", tz_ny) + assert var._data.array.dtype == pd.DatetimeTZDtype("s", tz_ny) @pytest.mark.parametrize( - ("values", "warns"), + ("values", "unit"), [ - (np.timedelta64(10, "ns"), False), - (np.timedelta64(10, "s"), True), - (np.array([np.timedelta64(10, "ns")]), False), - (np.array([np.timedelta64(10, "s")]), True), - (pd.timedelta_range("1", periods=1), False), - (timedelta(days=1), False), - (np.array([timedelta(days=1)]), False), + (np.timedelta64(10, "ns"), "ns"), + (np.timedelta64(10, "s"), "s"), + (np.array([np.timedelta64(10, "ns")]), "ns"), + (np.array([np.timedelta64(10, "s")]), "s"), + (pd.timedelta_range("1", periods=1), "ns"), + (timedelta(days=1), "ns"), + (np.array([timedelta(days=1)]), "ns"), + (pd.timedelta_range("1", periods=1).astype("timedelta64[s]"), "s"), ], ids=lambda x: f"{x}", ) -def test_timedelta_conversion_warning(values, warns) -> None: +def test_timedelta_conversion_warning(values, unit) -> None: dims = ["time"] if isinstance(values, np.ndarray | pd.Index) else [] - if warns: - with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"): - var = Variable(dims, values) - else: - with warnings.catch_warnings(): - warnings.simplefilter("error") - var = Variable(dims, values) - - assert var.dtype == np.dtype("timedelta64[ns]") - - -def test_pandas_two_only_timedelta_conversion_warning() -> None: - # Note this test relies on a pandas feature that is only present in pandas - # 2.0.0 and above, and so for now cannot be parametrized. - data = pd.timedelta_range("1", periods=1).astype("timedelta64[s]") - with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"): - var = Variable(["time"], data) - - assert var.dtype == np.dtype("timedelta64[ns]") - - -@pytest.mark.parametrize( - ("index", "dtype"), - [ - (pd.date_range("2000", periods=1), "datetime64"), - (pd.timedelta_range("1", periods=1), "timedelta64"), - ], - ids=lambda x: f"{x}", -) -def test_pandas_indexing_adapter_non_nanosecond_conversion(index, dtype) -> None: - data = PandasIndexingAdapter(index.astype(f"{dtype}[s]")) - with pytest.warns(UserWarning, match="non-nanosecond precision"): - var = Variable(["time"], data) - assert var.dtype == np.dtype(f"{dtype}[ns]") + var = Variable(dims, values) + assert var.dtype == np.dtype(f"timedelta64[{unit}]")
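
Note on the decoded resolution asserted throughout these tests: when decoding
with decode_times=CFDatetimeCoder(time_unit=...), the expected datetime64 unit
is the finer-grained of the requested time_unit and the on-disk encoding unit,
so that decoding never silently drops precision. A minimal sketch of that
selection rule, mirroring the result_unit logic in
test_roundtrip_datetime64_nanosecond_precision above (the helper name
expected_decode_unit is illustrative only, not part of xarray's API):

    import numpy as np

    def expected_decode_unit(encoding_unit: str, time_unit: str) -> str:
        # keep the finer of the two units: a requested time_unit of "s" is
        # promoted to "us" when the data is encoded as "microseconds since ...",
        # while coarser encodings decode to the requested unit as-is
        if np.timedelta64(1, encoding_unit) <= np.timedelta64(1, time_unit):
            return encoding_unit
        return time_unit

    assert expected_decode_unit("us", "s") == "us"  # promoted, no precision loss
    assert expected_decode_unit("s", "ms") == "ms"  # requested unit is fine enough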