From e6f2208c634b4924043c5720589887934137848c Mon Sep 17 00:00:00 2001
From: madtoinou <32447896+madtoinou@users.noreply.github.com>
Date: Sat, 28 Oct 2023 15:55:19 +0200
Subject: [PATCH] Fix/operand error with encoders (#2034)

* fix: create a temporary DatetimeIndex to extract the start time index when the series frequency represents an ambiguous timedelta value

* feat: updated changelog

* fix: fixed corner case, generate the shortest temporary datetimeindex possible

* feat: added tests to cover the cases where the series freq cannot be converted to Timedelta
---
 CHANGELOG.md                                  |  7 ++++-
 .../test_create_lagged_training_data.py       | 19 ++++++++-----
 darts/utils/data/tabularization.py            | 27 ++++++++++++++-----
 3 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 781378a521..2f1f79038c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,10 +9,15 @@ but cannot always guarantee backwards compatibility. Changes that may **break co
 [Full Changelog](https://github.com/unit8co/darts/compare/0.26.0...master)
 
 ### For users of the library:
+**Improved**
 - Improvements to `TorchForecastingModel`:
   - Added callback `darts.utils.callbacks.TFMProgressBar` to customize at which model stages to display the progress bar. [#2020](https://github.com/unit8co/darts/pull/2020) by [Dennis Bader](https://github.com/dennisbader).
 - Improvements to documentation:
-  - Adapted the example notebooks to properly apply data transformers and avoid look-ahead bias. [#2020](https://github.com/unit8co/darts/pull/2020) by [Samriddhi Singh](https://github.com/SimTheGreat). 
+  - Adapted the example notebooks to properly apply data transformers and avoid look-ahead bias. [#2020](https://github.com/unit8co/darts/pull/2020) by [Samriddhi Singh](https://github.com/SimTheGreat).
+
+**Fixed**
+- Fixed a bug when using encoders with series whose frequency has an ambiguous conversion to `pd.Timedelta` (e.g. monthly or yearly frequencies), which caused an operand error when dividing a `pd.Timedelta` by the frequency's `pd.DateOffset`. [#2034](https://github.com/unit8co/darts/pull/2034) by [Antoine Madrona](https://github.com/madtoinou).
+
 ### For developers of the library:
 
 ## [0.26.0](https://github.com/unit8co/darts/tree/0.26.0) (2023-09-16)
diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py
index 98f515e545..9afe53d3f1 100644
--- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py
+++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py
@@ -1132,37 +1132,44 @@ def test_lagged_training_data_extend_past_and_future_covariates_range_idx(self):
             assert np.allclose(expected_X, X[:, :, 0])
             assert np.allclose(expected_y, y[:, :, 0])
 
-    def test_lagged_training_data_extend_past_and_future_covariates_datetime_idx(self):
+    @pytest.mark.parametrize("freq", ["D", "MS", "Y"])
+    def test_lagged_training_data_extend_past_and_future_covariates_datetime_idx(
+        self, freq
+    ):
         """
         Tests that `create_lagged_training_data` correctly handles case where features
         and labels can be created for a time that is *not* contained in `past_covariates`
         and/or `future_covariates`. This particular test checks this behaviour by using
-        datetime index timeseries.
+        datetime index timeseries and three different frequencies: daily, month start and
+        year end.
 
         More specifically, we define the series and lags such that a training example can
         be generated for time `target.end_time()`, even though this time isn't contained in
         neither `past` nor `future`.
         """
-        # Can create feature for time `t = '1/11/2000'`, but this time isn't in `past` or `future`:
+        # Can create feature for time `t = target.end_time()`, but this time isn't in `past` or `future`:
         target = linear_timeseries(
             start=pd.Timestamp("1/1/2000"),
-            end=pd.Timestamp("1/11/2000"),
             start_value=1,
             end_value=2,
+            length=11,
+            freq=freq,
         )
         lags = [-1]
         past = linear_timeseries(
             start=pd.Timestamp("1/1/2000"),
-            end=pd.Timestamp("1/9/2000"),
             start_value=2,
             end_value=3,
+            length=9,
+            freq=freq,
         )
         lags_past = [-2]
         future = linear_timeseries(
             start=pd.Timestamp("1/1/2000"),
-            end=pd.Timestamp("1/7/2000"),
             start_value=3,
             end_value=4,
+            length=7,
+            freq=freq,
         )
         lags_future = [-4]
         # Only want to check very last generated observation:
diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py
index 835d793196..be28af04f1 100644
--- a/darts/utils/data/tabularization.py
+++ b/darts/utils/data/tabularization.py
@@ -883,12 +883,27 @@ def _create_lagged_data_by_moving_window(
             # for all feature times - these values will become labels.
             # If `start_time` not included in `time_index_i`, can 'manually' calculate
             # what its index *would* be if `time_index_i` were extended to include that time:
-            if not is_target_series and (time_index_i[-1] <= start_time):
-                start_time_idx = (
-                    len(time_index_i)
-                    - 1
-                    + (start_time - time_index_i[-1]) // series_i.freq
-                )
+            if not is_target_series and (time_index_i[-1] < start_time):
+                # Series frequency represents a non-ambiguous timedelta value (e.g. not 'M', 'MS' or 'Y')
+                if pd.to_timedelta(series_i.freq, errors="coerce") is not pd.NaT:
+                    start_time_idx = (
+                        len(time_index_i)
+                        - 1
+                        + (start_time - time_index_i[-1]) // series_i.freq
+                    )
+                else:
+                    # Create a temporary DatetimeIndex to extract the actual start index.
+                    start_time_idx = (
+                        len(time_index_i)
+                        - 1
+                        + len(
+                            pd.date_range(
+                                start=time_index_i[-1] + series_i.freq,
+                                end=start_time,
+                                freq=series_i.freq,
+                            )
+                        )
+                    )
             elif not is_target_series and (time_index_i[0] >= start_time):
                 start_time_idx = max_lag_i
             # If `start_time` *is* included in `time_index_i`, need to binary search `time_index_i`