From 5740da5deffe8ee7472f8c127332ad10c7826add Mon Sep 17 00:00:00 2001 From: AlessiopSymplectic <160468679+AlessiopSymplectic@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:16:16 +0200 Subject: [PATCH] Fix/bug time series.append and prepend (#2522) --- CHANGELOG.md | 4 + darts/tests/test_timeseries.py | 128 ++++++++++++++++++++++++++------ darts/tests/utils/test_utils.py | 50 ++++++++++++- darts/timeseries.py | 73 +++++++++++------- darts/utils/utils.py | 9 ++- 5 files changed, 212 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3584fffb32..02718b8b2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,9 @@ but cannot always guarantee backwards compatibility. Changes that may **break co - Added `IQRDetector`, that allows to detect anomalies using the interquartile range algorithm. [#2441] by [Igor Urbanik](https://github.com/u8-igor). - Added hyperparameters controlling the hidden layer sizes for the feature encoders in `TiDEModel`. [#2408](https://github.com/unit8co/darts/issues/2408) by [eschibli](https://github.com/eschibli). +- Added hyperparameter `activation` to `BlockRNNModel` to specify the activation function in case of a multi-layer output network. [#2408](https://github.com/unit8co/darts/issues/2408) by [eschibli](https://github.com/eschibli). - Added support for broadcasting to TimeSeries on component and sample level. [#2476](https://https://github.com/unit8co/darts/pull/2476) by [Joel L.](https://github.com/Joelius300). +- Helper function `darts.utils.utils.generate_index()` now accepts datetime strings as `start` and `end` parameters to generate the pandas DatetimeIndex. [#2522](https://github.com/unit8co/darts/pull/2522) by [Dennis Bader](https://github.com/dennisbader). - Various improvements in the documentation: - Made README's forecasting model support table more colorblind-friendly. 
[#2433](https://github.com/unit8co/darts/pull/2433) - Updated the Ray Tune Hyperparameter Optimization example in the [user guide](https://unit8co.github.io/darts/userguide/hyperparameter_optimization.html) to work with the latest `ray` versions (`>=2.31.0`). [#2459](https://github.com/unit8co/darts/pull/2459) by [He Weilin](https://github.com/cnhwl). @@ -22,6 +24,8 @@ but cannot always guarantee backwards compatibility. Changes that may **break co **Fixed** +- Fixed a bug where passing an empty array to `TimeSeries.prepend/append_values()` raised an error. [#2522](https://github.com/unit8co/darts/pull/2522) by [Alessio Pellegrini](https://github.com/AlessiopSymplectic). +- Fixed a bug with `TimeSeries.prepend/append_values()`, where the name of the (time) index was lost. [#2522](https://github.com/unit8co/darts/pull/2522) by [Alessio Pellegrini](https://github.com/AlessiopSymplectic). - Fixed a bug when using `from_group_dataframe()` with a `time_col` of type integer, where the resulting time index was wrongly converted to a DatetimeIndex. [#2512](https://github.com/unit8co/darts/pull/2512) by [Alessio Pellegrini](https://github.com/AlessiopSymplectic) - Fixed a bug when using `historical_forecasts()` with a pre-trained `RegressionModel` that has no target lags `lags=None` but uses static covariates. [#2426](https://github.com/unit8co/darts/pull/2426) by [Dennis Bader](https://github.com/dennisbader). - Fixed a bug with `xgboost>=2.1.0`, where multi output regression was not properly handled. [#2426](https://github.com/unit8co/darts/pull/2426) by [Dennis Bader](https://github.com/dennisbader). 
diff --git a/darts/tests/test_timeseries.py b/darts/tests/test_timeseries.py index bd5e1b1562..0fc6f577fb 100644 --- a/darts/tests/test_timeseries.py +++ b/darts/tests/test_timeseries.py @@ -11,7 +11,7 @@ from darts import TimeSeries, concatenate from darts.utils.timeseries_generation import constant_timeseries, linear_timeseries -from darts.utils.utils import freqs, generate_index +from darts.utils.utils import expand_arr, freqs, generate_index class TestTimeSeries: @@ -762,6 +762,9 @@ def helper_test_prepend_values(test_case, test_series: TimeSeries): assert test_series.time_index.equals(prepended_sq.time_index) assert prepended_sq.components.equals(test_series.components) + # component and sample dimension should match + assert prepended._xa.shape[1:] == test_series._xa.shape[1:] + def test_slice(self): TestTimeSeries.helper_test_slice(self, self.series1) @@ -797,18 +800,112 @@ def test_append(self): assert appended.time_index.equals(expected_idx) assert appended.components.equals(series_1.components) - def test_append_values(self): - TestTimeSeries.helper_test_append_values(self, self.series1) - # Check `append_values` deals with `RangeIndex` series correctly: - series = linear_timeseries(start=1, length=5, freq=2) - appended = series.append_values(np.ones((2, 1, 1))) - expected_vals = np.concatenate( - [series.all_values(), np.ones((2, 1, 1))], axis=0 + @pytest.mark.parametrize( + "config", + itertools.product( + [ + ( # univariate array + np.array([0, 1, 2]).reshape((3, 1, 1)), + np.array([0, 1]).reshape((2, 1, 1)), + ), + ( # multivariate array + np.array([0, 1, 2, 3, 4, 5]).reshape((3, 2, 1)), + np.array([0, 1, 2, 3]).reshape((2, 2, 1)), + ), + ( # empty array + np.array([0, 1, 2]).reshape((3, 1, 1)), + np.array([]).reshape((0, 1, 1)), + ), + ( + # wrong number of components + np.array([0, 1, 2]).reshape((3, 1, 1)), + np.array([0, 1, 2, 3]).reshape((2, 2, 1)), + ), + ( + # wrong number of samples + np.array([0, 1, 2]).reshape((3, 1, 1)), + np.array([0, 1, 
2, 3]).reshape((2, 1, 2)), + ), + ( # univariate list with times + np.array([0, 1, 2]).reshape((3, 1, 1)), + [0, 1], + ), + ( # univariate list with times and components + np.array([0, 1, 2]).reshape((3, 1, 1)), + [[0], [1]], + ), + ( # univariate list with times, components and samples + np.array([0, 1, 2]).reshape((3, 1, 1)), + [[[0]], [[1]]], + ), + ( # multivariate list with times and components + np.array([0, 1, 2, 3]).reshape((2, 2, 1)), + [[1, 2], [3, 4]], + ), + ( # list with wrong number of components + np.array([0, 1, 2]).reshape((3, 1, 1)), + [[1, 2], [3, 4]], + ), + ( # list with wrong number of samples + np.array([0, 1, 2]).reshape((3, 1, 1)), + [[[0, 1]], [[1, 2]]], + ), + ( # multivar input but list has wrong shape + np.array([0, 1, 2, 3]).reshape((2, 2, 1)), + [1, 2], + ), + ], + [True, False], + ["append_values", "prepend_values"], + ), + ) + def test_append_and_prepend_values(self, config): + (series_vals, vals), is_datetime, method = config + start = "20240101" if is_datetime else 1 + series_idx = generate_index( + start=start, length=len(series_vals), name="some_name" ) - expected_idx = pd.RangeIndex(start=1, stop=15, step=2) + series = TimeSeries.from_times_and_values( + times=series_idx, + values=series_vals, + ) + + # expand if it's a list + vals_arr = np.array(vals) if isinstance(vals, list) else vals + vals_arr = expand_arr(vals_arr, ndim=3) + + ts_method = getattr(TimeSeries, method) + + if vals_arr.shape[1:] != series_vals.shape[1:]: + with pytest.raises(ValueError) as exc: + _ = ts_method(series, vals) + assert str(exc.value).startswith( + "The (expanded) values must have the same number of components and samples" + ) + return + + appended = ts_method(series, vals) + + if method == "append_values": + expected_vals = np.concatenate([series_vals, vals_arr], axis=0) + expected_idx = generate_index( + start=series.start_time(), + length=len(series_vals) + len(vals), + freq=series.freq, + ) + else: + expected_vals = np.concatenate([vals_arr, 
series_vals], axis=0) + expected_idx = generate_index( + end=series.end_time(), + length=len(series_vals) + len(vals), + freq=series.freq, + ) + assert np.allclose(appended.all_values(), expected_vals) assert appended.time_index.equals(expected_idx) assert appended.components.equals(series.components) + assert appended._xa.shape[1:] == series._xa.shape[1:] + assert appended.time_index.name == series.time_index.name def test_prepend(self): TestTimeSeries.helper_test_prepend(self, self.series1) @@ -824,19 +921,6 @@ def test_prepend(self): assert prepended.time_index.equals(expected_idx) assert prepended.components.equals(series_1.components) - def test_prepend_values(self): - TestTimeSeries.helper_test_prepend_values(self, self.series1) - # Check `prepend_values` deals with `RangeIndex` series correctly: - series = linear_timeseries(start=1, length=5, freq=2) - prepended = series.prepend_values(np.ones((2, 1, 1))) - expected_vals = np.concatenate( - [np.ones((2, 1, 1)), series.all_values()], axis=0 - ) - expected_idx = pd.RangeIndex(start=-3, stop=11, step=2) - assert np.allclose(prepended.all_values(), expected_vals) - assert prepended.time_index.equals(expected_idx) - assert prepended.components.equals(series.components) - @pytest.mark.parametrize( "config", [ diff --git a/darts/tests/utils/test_utils.py b/darts/tests/utils/test_utils.py index 809bf84bf5..3b82c5f1b2 100644 --- a/darts/tests/utils/test_utils.py +++ b/darts/tests/utils/test_utils.py @@ -7,7 +7,7 @@ from darts.utils import _with_sanity_checks from darts.utils.missing_values import extract_subseries from darts.utils.ts_utils import retain_period_common_to_all -from darts.utils.utils import freqs, generate_index, n_steps_between +from darts.utils.utils import expand_arr, freqs, generate_index, n_steps_between class TestUtils: @@ -418,6 +418,25 @@ def test_generate_index_with_end_length(self, config): assert idx[0] == expected_start assert idx[-1] == expected_start + (n_steps - 1) * freq + 
@pytest.mark.parametrize( + "config", + [ + ("2000-01-01", None), + (None, "2000-01-03"), + ("2000-01-01", "2000-01-03"), + ], + ) + def test_generate_index_with_string(self, config): + """Test that index generation with strings as start or end gives same results as with pandas TimeStamps.""" + start, end = config + length = 3 if (start is None or end is None) else None + idx = generate_index(start=start, end=end, length=length) + + start_ts = pd.Timestamp(start) if start is not None else start + end_ts = pd.Timestamp(end) if end is not None else end + idx_expected = generate_index(start=start_ts, end=end_ts, length=length) + assert idx.equals(idx_expected) + @pytest.mark.parametrize( "config", [ @@ -539,3 +558,32 @@ def test_n_steps_between(self, config): assert n_steps == expected_n_steps n_steps_reversed = n_steps_between(end=start, start=end, freq=freq) assert n_steps_reversed == -expected_n_steps + + @pytest.mark.parametrize( + "config", + [ + (np.array([0, 1, 2]), (3, 1, 1)), + (np.array([[0], [1], [2]]), (3, 1, 1)), + (np.array([[[0]], [[1]], [[2]]]), (3, 1, 1)), + (np.array([[0, 1], [2, 3], [3, 4]]), (3, 2, 1)), + (np.array([[[0], [1]], [[1], [2]], [[3], [4]]]), (3, 2, 1)), + ( + np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[8, 9], [10, 11]]]), + (3, 2, 2), + ), + ], + ) + def test_expand_arr(self, config): + """tests array expansion to 3D.""" + arr, shape_expected = config + + if len(arr.shape) == 1: + arr_expected = arr[:, None, None] + elif len(arr.shape) == 2: + arr_expected = arr[:, :, None] + else: + arr_expected = arr + + arr = expand_arr(arr, ndim=3) + assert arr.shape == shape_expected + np.testing.assert_array_almost_equal(arr, arr_expected) diff --git a/darts/timeseries.py b/darts/timeseries.py index b14e37a4b9..f0c9fc006d 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -53,7 +53,11 @@ from darts.logging import get_logger, raise_if, raise_if_not, raise_log from darts.utils import _build_tqdm_iterator, _parallel_apply -from 
darts.utils.utils import expand_arr, generate_index, n_steps_between +from darts.utils.utils import ( + expand_arr, + generate_index, + n_steps_between, +) try: from typing import Literal @@ -2855,10 +2859,10 @@ def append(self, other: Self) -> Self: "Both series must have the same number of components.", logger, ) - if self._has_datetime_index: + if len(self) > 0 and len(other) > 0: raise_if_not( other.start_time() == self.end_time() + self.freq, - "Appended TimeSeries must start one time step after current one.", + "Appended TimeSeries must start one (time) step after current one.", logger, ) @@ -2892,17 +2896,28 @@ def append_values(self, values: np.ndarray) -> Self: TimeSeries A new TimeSeries with the new values appended """ - if self._has_datetime_index: - idx = pd.DatetimeIndex( - [self.end_time() + i * self._freq for i in range(1, len(values) + 1)], - freq=self._freq, - ) - else: - idx = pd.RangeIndex( - start=self.end_time() + self._freq, - stop=self.end_time() + (len(values) + 1) * self._freq, - step=self._freq, + if len(values) == 0: + return self.copy() + + values = np.array(values) if not isinstance(values, np.ndarray) else values + values = expand_arr(values, ndim=len(DIMS)) + if not values.shape[1:] == self._xa.values.shape[1:]: + raise_log( + ValueError( + f"The (expanded) values must have the same number of components and samples " + f"(second and third dims) as the series to append to. " + f"Received shape: {values.shape}, expected: {self._xa.values.shape}" + ), + logger=logger, ) + + idx = generate_index( + start=self.end_time() + self.freq, + length=len(values), + freq=self.freq, + name=self._time_index.name, + ) + return self.append( self.__class__.from_times_and_values( values=values, @@ -2951,22 +2966,28 @@ def prepend_values(self, values: np.ndarray) -> Self: TimeSeries A new TimeSeries with the new values prepended. 
""" + if len(values) == 0: + return self.copy() - if self._has_datetime_index: - idx = pd.DatetimeIndex( - [ - self.start_time() - i * self._freq - for i in reversed(range(1, len(values) + 1)) - ], - freq=self._freq, - ) - else: - idx = pd.RangeIndex( - self.start_time() - self.freq * len(values), - self.start_time(), - step=self.freq, + values = np.array(values) if not isinstance(values, np.ndarray) else values + values = expand_arr(values, ndim=len(DIMS)) + if not values.shape[1:] == self._xa.values.shape[1:]: + raise_log( + ValueError( + f"The (expanded) values must have the same number of components and samples " + f"(second and third dims) as the series to prepend to. " + f"Received shape: {values.shape}, expected: {self._xa.values.shape}" + ), + logger=logger, ) + idx = generate_index( + end=self.start_time() - self.freq, + length=len(values), + freq=self.freq, + name=self._time_index.name, + ) + return self.prepend( self.__class__.from_times_and_values( values=values, diff --git a/darts/utils/utils.py b/darts/utils/utils.py index 643c0655f1..125e1b8afc 100644 --- a/darts/utils/utils.py +++ b/darts/utils/utils.py @@ -429,8 +429,8 @@ def n_steps_between( def generate_index( - start: Optional[Union[pd.Timestamp, int]] = None, - end: Optional[Union[pd.Timestamp, int]] = None, + start: Optional[Union[pd.Timestamp, str, int]] = None, + end: Optional[Union[pd.Timestamp, str, int]] = None, length: Optional[int] = None, freq: Union[str, int, pd.DateOffset] = None, name: str = None, @@ -441,7 +441,7 @@ def generate_index( Parameters ---------- start - The start of the returned index. If a pandas Timestamp is passed, the index will be a pandas + The start of the returned index. If a pandas Timestamp or a date string is passed, the index will be a pandas DatetimeIndex. If an integer is passed, the index will be a pandas RangeIndex index. Works only with either `length` or `end`. 
end @@ -477,6 +477,9 @@ def generate_index( logger, ) + start = pd.Timestamp(start) if isinstance(start, str) else start + end = pd.Timestamp(end) if isinstance(end, str) else end + if isinstance(start, pd.Timestamp) or isinstance(end, pd.Timestamp): freq = "D" if freq is None else freq freq = pd.tseries.frequencies.to_offset(freq) if isinstance(freq, str) else freq