From e6f2208c634b4924043c5720589887934137848c Mon Sep 17 00:00:00 2001 From: madtoinou <32447896+madtoinou@users.noreply.github.com> Date: Sat, 28 Oct 2023 15:55:19 +0200 Subject: [PATCH] Fix/operand error with encoders (#2034) * fix: create a temporary Datetime index when series frequency represents an ambiguous timedelta value to extract the start time index * feat: updated changelog * fix: fixed corner case, generate the shortest temporary datetimeindex possible * feat: added tests to cover the cases where the series freq cannot be converted to Timedelta --- CHANGELOG.md | 7 ++++- .../test_create_lagged_training_data.py | 19 ++++++++----- darts/utils/data/tabularization.py | 27 ++++++++++++++----- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 781378a521..2f1f79038c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,10 +9,15 @@ but cannot always guarantee backwards compatibility. Changes that may **break co [Full Changelog](https://github.com/unit8co/darts/compare/0.26.0...master) ### For users of the library: +**Improved** - Improvements to `TorchForecastingModel`: - Added callback `darts.utils.callbacks.TFMProgressBar` to customize at which model stages to display the progress bar. [#2020](https://github.com/unit8co/darts/pull/2020) by [Dennis Bader](https://github.com/dennisbader). - Improvements to documentation: - - Adapted the example notebooks to properly apply data transformers and avoid look-ahead bias. [#2020](https://github.com/unit8co/darts/pull/2020) by [Samriddhi Singh](https://github.com/SimTheGreat). + - Adapted the example notebooks to properly apply data transformers and avoid look-ahead bias. [#2020](https://github.com/unit8co/darts/pull/2020) by [Samriddhi Singh](https://github.com/SimTheGreat). + +**Fixed** +- Fixed a bug when trying to divide `pd.Timedelta` by a `pd.Offset` with an ambiguous conversion to `pd.Timedelta` when using encoders. 
[#2034](https://github.com/unit8co/darts/pull/2034) by [Antoine Madrona](https://github.com/madtoinou). + ### For developers of the library: ## [0.26.0](https://github.com/unit8co/darts/tree/0.26.0) (2023-09-16) diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index 98f515e545..9afe53d3f1 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -1132,37 +1132,44 @@ def test_lagged_training_data_extend_past_and_future_covariates_range_idx(self): assert np.allclose(expected_X, X[:, :, 0]) assert np.allclose(expected_y, y[:, :, 0]) - def test_lagged_training_data_extend_past_and_future_covariates_datetime_idx(self): + @pytest.mark.parametrize("freq", ["D", "MS", "Y"]) + def test_lagged_training_data_extend_past_and_future_covariates_datetime_idx( + self, freq + ): """ Tests that `create_lagged_training_data` correctly handles case where features and labels can be created for a time that is *not* contained in `past_covariates` and/or `future_covariates`. This particular test checks this behaviour by using - datetime index timeseries. + datetime index timeseries and three different frequencies: daily, month start and + year end. More specifically, we define the series and lags such that a training example can be generated for time `target.end_time()`, even though this time isn't contained in neither `past` nor `future`. 
""" - # Can create feature for time `t = '1/11/2000'`, but this time isn't in `past` or `future`: + # Can create feature for time `t = target.end_time()`, but this time isn't in `past` or `future`: target = linear_timeseries( start=pd.Timestamp("1/1/2000"), - end=pd.Timestamp("1/11/2000"), start_value=1, end_value=2, + length=11, + freq=freq, ) lags = [-1] past = linear_timeseries( start=pd.Timestamp("1/1/2000"), - end=pd.Timestamp("1/9/2000"), start_value=2, end_value=3, + length=9, + freq=freq, ) lags_past = [-2] future = linear_timeseries( start=pd.Timestamp("1/1/2000"), - end=pd.Timestamp("1/7/2000"), start_value=3, end_value=4, + length=7, + freq=freq, ) lags_future = [-4] # Only want to check very last generated observation: diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 835d793196..be28af04f1 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -883,12 +883,27 @@ def _create_lagged_data_by_moving_window( # for all feature times - these values will become labels. # If `start_time` not included in `time_index_i`, can 'manually' calculate # what its index *would* be if `time_index_i` were extended to include that time: - if not is_target_series and (time_index_i[-1] <= start_time): - start_time_idx = ( - len(time_index_i) - - 1 - + (start_time - time_index_i[-1]) // series_i.freq - ) + if not is_target_series and (time_index_i[-1] < start_time): + # Series frequency represents a non-ambiguous timedelta value (not ‘M’, ‘Y’ or ‘y’) + if pd.to_timedelta(series_i.freq, errors="coerce") is not pd.NaT: + start_time_idx = ( + len(time_index_i) + - 1 + + (start_time - time_index_i[-1]) // series_i.freq + ) + else: + # Create a temporary DatetimeIndex to extract the actual start index. 
+ start_time_idx = ( + len(time_index_i) + - 1 + + len( + pd.date_range( + start=time_index_i[-1] + series_i.freq, + end=start_time, + freq=series_i.freq, + ) + ) + ) elif not is_target_series and (time_index_i[0] >= start_time): start_time_idx = max_lag_i # If `start_time` *is* included in `time_index_i`, need to binary search `time_index_i`