From b3498bf893d56d25340f26092c72fbbfae545028 Mon Sep 17 00:00:00 2001 From: madtoinou <32447896+madtoinou@users.noreply.github.com> Date: Thu, 14 Sep 2023 11:54:35 +0200 Subject: [PATCH] Feat/specify lags per component for RegressionModel (#1962) * feat: updated lags sanity checks to accept dictionary * fix: better management of corner cases during lags checks * fix: improved modularity * fix: simplified the logic a bit * feat: when generating lagged data, the values can be extracted using component-specific lags * feat: raise error if all the ts in target/past/future don't have the same number of components * feat: added support for component-specific lags in fit() and predict() * test: added tests and fix some bug accordingly * feat: component-wise lags support encoders, improved sanity checks * feat: possibility to declare default lags for all the not specified components, updated changelog * test: adding a test for the lagged data creation * fix: typo * fix: addressing review comments * Apply suggestions from code review Co-authored-by: Dennis Bader * refactor: lags argument are converted to dict before running the type check and processing of the values * refactor: lags argument are converted to dict before running the type check and processing of the values * doc: improved documentation of the component-specific lags in tabularization * test: adding a test for the multivariate scenario * test: checking the appropriate lags are extracted by the shap explainer * fix: shapexplainer extract the appropriate lags, updated the type hints * fix: passing covariates when trained on multiple series * fix: moved the series components consistency to create_lagged_data to limit iteration of the series * fix: improved the error message for components inconsistency, improve tests parametrization * fix: addressing reviewer comments * Apply suggestions from code review Co-authored-by: Dennis Bader * test: checking that the name of the features is correctly generated when 
using dict to define the lags * fix: linting * fix: updating the error msg * fix: bug when the number of lags is different across components * fix: future lags in test --------- Co-authored-by: Dennis Bader --- CHANGELOG.md | 3 +- darts/explainability/shap_explainer.py | 12 +- darts/models/forecasting/lgbm.py | 48 +- .../forecasting/linear_regression_model.py | 53 ++- darts/models/forecasting/random_forest.py | 48 +- darts/models/forecasting/regression_model.py | 443 +++++++++++++----- darts/models/forecasting/xgboost.py | 53 ++- .../explainability/test_shap_explainer.py | 50 +- .../forecasting/test_regression_models.py | 385 ++++++++++++++- .../test_create_lagged_prediction_data.py | 4 +- .../test_create_lagged_training_data.py | 4 +- .../tabularization/test_get_feature_times.py | 4 +- darts/utils/data/tabularization.py | 162 ++++--- 13 files changed, 999 insertions(+), 270 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd3bc36089..1bd97f9b39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,9 @@ but cannot always guarantee backwards compatibility. Changes that may **break co - `TimeSeries` with a `RangeIndex` starting in the negative start are now supported by `historical_forecasts`. [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou). - Added a new argument `start_format` to `historical_forecasts()`, `backtest()` and `gridsearch` that allows to use an integer `start` either as the index position or index value/label for `series` indexed with a `pd.RangeIndex`. [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou). - Added `RINorm` (Reversible Instance Norm) as an input normalization option for all `TorchForecastingModel` except `RNNModel`. Activate it with model creation parameter `use_reversible_instance_norm`. [#1969](https://github.com/unit8co/darts/pull/1969) by [Dennis Bader](https://github.com/dennisbader). 
-- Reduced the size of the Darts docker image `unit8/darts:latest`, and included all optional models as well as dev requirements. [#1878](https://github.com/unit8co/darts/pull/1878) by [Alex Colpitts](https://github.com/alexcolpitts96). +- Reduced the size of the Darts docker image `unit8/darts:latest`, and included all optional models as well as dev requirements. [#1878](https://github.com/unit8co/darts/pull/1878) by [Alex Colpitts](https://github.com/alexcolpitts96). - Added short examples in the docstring of all the models, including covariates usage and some model-specific parameters. [#1956](https://github.com/unit8co/darts/pull/1956) by [Antoine Madrona](https://github.com/madtoinou). +- All `RegressionModel`s now support component/column-specific lags for target, past, and future covariates series. [#1962](https://github.com/unit8co/darts/pull/1962) by [Antoine Madrona](https://github.com/madtoinou). **Fixed** - Fixed a bug in `TimeSeries.from_dataframe()` when using a pandas.DataFrame with `df.columns.name != None`. [#1938](https://github.com/unit8co/darts/pull/1938) by [Antoine Madrona](https://github.com/madtoinou). 
diff --git a/darts/explainability/shap_explainer.py b/darts/explainability/shap_explainer.py index 29be9d5e3d..143ea0d8b9 100644 --- a/darts/explainability/shap_explainer.py +++ b/darts/explainability/shap_explainer.py @@ -732,9 +732,9 @@ def _build_explainer_sklearn( def _create_regression_model_shap_X( self, - target_series, - past_covariates, - future_covariates, + target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]], + past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]], + future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]], n_samples=None, train=False, ) -> pd.DataFrame: @@ -746,9 +746,9 @@ def _create_regression_model_shap_X( """ - lags_list = self.model.lags.get("target") - lags_past_covariates_list = self.model.lags.get("past") - lags_future_covariates_list = self.model.lags.get("future") + lags_list = self.model._get_lags("target") + lags_past_covariates_list = self.model._get_lags("past") + lags_future_covariates_list = self.model._get_lags("future") X, indexes = create_lagged_prediction_data( target_series=target_series if lags_list else None, diff --git a/darts/models/forecasting/lgbm.py b/darts/models/forecasting/lgbm.py index 7aa2e4cd76..f5ca44e288 100644 --- a/darts/models/forecasting/lgbm.py +++ b/darts/models/forecasting/lgbm.py @@ -10,13 +10,15 @@ https://github.com/unit8co/darts/blob/master/INSTALL.md """ -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Union import lightgbm as lgb import numpy as np from darts.logging import get_logger from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, RegressionModelWithCategoricalCovariates, _LikelihoodMixin, ) @@ -28,13 +30,13 @@ class LightGBMModel(RegressionModelWithCategoricalCovariates, _LikelihoodMixin): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] 
= None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, - likelihood: str = None, - quantiles: List[float] = None, + likelihood: Optional[str] = None, + quantiles: Optional[List[float]] = None, random_state: Optional[int] = None, multi_models: Optional[bool] = True, use_static_covariates: bool = True, @@ -48,17 +50,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. 
+ If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If an tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. 
However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/models/forecasting/linear_regression_model.py b/darts/models/forecasting/linear_regression_model.py index 19fc4617b6..fc62c40c8d 100644 --- a/darts/models/forecasting/linear_regression_model.py +++ b/darts/models/forecasting/linear_regression_model.py @@ -5,14 +5,19 @@ A forecasting model using a linear regression of some of the target series' lags, as well as optionally some covariate series lags in order to obtain a forecast. """ -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Union import numpy as np from scipy.optimize import linprog from sklearn.linear_model import LinearRegression, PoissonRegressor, QuantileRegressor from darts.logging import get_logger -from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin +from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, + RegressionModel, + _LikelihoodMixin, +) from darts.timeseries import TimeSeries logger = get_logger(__name__) @@ -21,13 +26,13 @@ class LinearRegressionModel(RegressionModel, _LikelihoodMixin): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, - likelihood: str = None, - quantiles: List[float] = None, + likelihood: Optional[str] = None, + quantiles: Optional[List[float]] = None, random_state: Optional[int] = None, multi_models: Optional[bool] = True, use_static_covariates: bool = True, @@ -38,17 +43,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. 
If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. 
If an tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/models/forecasting/random_forest.py b/darts/models/forecasting/random_forest.py index d4e0b4e58e..a5d91448ed 100644 --- a/darts/models/forecasting/random_forest.py +++ b/darts/models/forecasting/random_forest.py @@ -14,12 +14,16 @@ ---------- .. 
[1] https://en.wikipedia.org/wiki/Random_forest """ -from typing import List, Optional, Tuple, Union +from typing import Optional from sklearn.ensemble import RandomForestRegressor from darts.logging import get_logger -from darts.models.forecasting.regression_model import RegressionModel +from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, + RegressionModel, +) logger = get_logger(__name__) @@ -27,9 +31,9 @@ class RandomForest(RegressionModel): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, n_estimators: Optional[int] = 100, @@ -43,17 +47,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. 
If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If an tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. 
Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 940ec4c2bc..55014c26d7 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -60,13 +60,18 @@ logger = get_logger(__name__) +LAGS_TYPE = Union[int, List[int], Dict[str, Union[int, List[int]]]] +FUTURE_LAGS_TYPE = Union[ + Tuple[int, int], List[int], Dict[str, Union[Tuple[int, int], List[int]]] +] + class RegressionModel(GlobalForecastingModel): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, model=None, @@ -79,17 +84,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise, a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. 
+ If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If a tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. 
`(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may @@ -165,6 +186,7 @@ def encode_year(idx): self.model = model self.lags: Dict[str, List[int]] = {} + self.component_lags: Dict[str, Dict[str, List[int]]] = {} self.input_dim = None self.multi_models = multi_models self._considers_static_covariates = use_static_covariates @@ -200,94 +222,158 @@ def encode_year(idx): "At least one of `lags`, `lags_future_covariates` or `lags_past_covariates` must be not None.", ) - lags_type_checks = [ - (lags, "lags"), - (lags_past_covariates, "lags_past_covariates"), - ] + # convert lags arguments to list of int + self.lags, self.component_lags = self._generate_lags( + lags=lags, + lags_past_covariates=lags_past_covariates, + lags_future_covariates=lags_future_covariates, + ) - for _lags, lags_name in lags_type_checks: - raise_if_not( - isinstance(_lags, (int, list)) or _lags is None, - f"`{lags_name}` must be of type int or list. 
Given: {type(_lags)}.", - ) - raise_if( - isinstance(_lags, bool), - f"`{lags_name}` must be of type int or list, not bool.", - ) + self.pred_dim = self.output_chunk_length if self.multi_models else 1 - raise_if_not( - isinstance(lags_future_covariates, (tuple, list)) - or lags_future_covariates is None, - f"`lags_future_covariates` must be of type tuple or list. Given: {type(lags_future_covariates)}.", - ) + def _generate_lags( + self, + lags: Optional[LAGS_TYPE], + lags_past_covariates: Optional[LAGS_TYPE], + lags_future_covariates: Optional[FUTURE_LAGS_TYPE], + ) -> Tuple[Dict[str, List[int]], Dict[str, Dict[str, List[int]]]]: + """ + Based on the type of the argument and the nature of the covariates, perform some sanity checks before + converting the lags to a list of integer. - if isinstance(lags_future_covariates, tuple): - raise_if_not( - len(lags_future_covariates) == 2 - and isinstance(lags_future_covariates[0], int) - and isinstance(lags_future_covariates[1], int), - "`lags_future_covariates` tuple must be of length 2, and must contain two integers", - ) - raise_if( - isinstance(lags_future_covariates[0], bool) - or isinstance(lags_future_covariates[1], bool), - "`lags_future_covariates` tuple must contain integers, not bool", - ) + If lags are provided as a dictionary, the lags values are contained in self.component_lags and the self.lags + attributes contain only the extreme values + If the lags are provided as integer, list, tuple or dictionary containing only the 'default_lags' keys, the lags + values are contained in the self.lags attribute and the self.component_lags is an empty dictionary. 
+ """ + processed_lags: Dict[str, List[int]] = dict() + processed_component_lags: Dict[str, Dict[str, List[int]]] = dict() + for lags_values, lags_name, lags_abbrev in zip( + [lags, lags_past_covariates, lags_future_covariates], + ["lags", "lags_past_covariates", "lags_future_covariates"], + ["target", "past", "future"], + ): + if lags_values is None: + continue - # set lags - if isinstance(lags, int): - raise_if_not(lags > 0, f"`lags` must be strictly positive. Given: {lags}.") - # selecting last `lags` lags, starting from position 1 (skipping current, pos 0, the one we want to predict) - self.lags["target"] = list(range(-lags, 0)) - elif isinstance(lags, list): - for lag in lags: - raise_if( - not isinstance(lag, int) or (lag >= 0), - f"Every element of `lags` must be a strictly negative integer. Given: {lags}.", + # converting to dictionary to run sanity checks + if not isinstance(lags_values, dict): + lags_values = {"default_lags": lags_values} + elif len(lags_values) == 0: + raise_log( + ValueError( + f"When passed as a dictionary, `{lags_name}` must contain at least one key." + ), + logger, ) - if lags: - self.lags["target"] = sorted(lags) - if isinstance(lags_past_covariates, int): - raise_if_not( - lags_past_covariates > 0, - f"`lags_past_covariates` must be an integer > 0. Given: {lags_past_covariates}.", - ) - self.lags["past"] = list(range(-lags_past_covariates, 0)) - elif isinstance(lags_past_covariates, list): - for lag in lags_past_covariates: - raise_if( - not isinstance(lag, int) or (lag >= 0), - f"Every element of `lags_covariates` must be an integer < 0. 
Given: {lags_past_covariates}.", - ) - if lags_past_covariates: - self.lags["past"] = sorted(lags_past_covariates) + invalid_type = False + supported_types = "" + min_lags = None + max_lags = None + tmp_components_lags: Dict[str, List[int]] = dict() + for comp_name, comp_lags in lags_values.items(): + if lags_name == "lags_future_covariates": + if isinstance(comp_lags, tuple): + raise_if_not( + len(comp_lags) == 2 + and isinstance(comp_lags[0], int) + and isinstance(comp_lags[1], int), + f"`{lags_name}` - `{comp_name}`: tuple must be of length 2, and must contain two integers", + logger, + ) - if isinstance(lags_future_covariates, tuple): - raise_if_not( - lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, - f"`lags_future_covariates` tuple must contain integers >= 0. Given: {lags_future_covariates}.", - ) - if ( - lags_future_covariates[0] is not None - and lags_future_covariates[1] is not None - ): - if not ( - lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0 - ): - self.lags["future"] = list( - range(-lags_future_covariates[0], lags_future_covariates[1]) + raise_if( + isinstance(comp_lags[0], bool) + or isinstance(comp_lags[1], bool), + f"`{lags_name}` - `{comp_name}`: tuple must contain integers, not bool", + logger, + ) + + raise_if_not( + comp_lags[0] >= 0 and comp_lags[1] >= 0, + f"`{lags_name}` - `{comp_name}`: tuple must contain positive integers. Given: {comp_lags}.", + logger, + ) + raise_if( + comp_lags[0] == 0 and comp_lags[1] == 0, + f"`{lags_name}` - `{comp_name}`: tuple cannot be (0, 0) as it corresponds to an empty " + f"list of lags.", + logger, + ) + tmp_components_lags[comp_name] = list( + range(-comp_lags[0], comp_lags[1]) + ) + elif isinstance(comp_lags, list): + for lag in comp_lags: + raise_if( + not isinstance(lag, int) or isinstance(lag, bool), + f"`{lags_name}` - `{comp_name}`: list must contain only integers. 
Given: {comp_lags}.", + logger, + ) + tmp_components_lags[comp_name] = sorted(comp_lags) + else: + invalid_type = True + supported_types = "tuple or a list" + else: + if isinstance(comp_lags, int): + raise_if_not( + comp_lags > 0, + f"`{lags_name}` - `{comp_name}`: integer must be strictly positive . Given: {comp_lags}.", + logger, + ) + tmp_components_lags[comp_name] = list(range(-comp_lags, 0)) + elif isinstance(comp_lags, list): + for lag in comp_lags: + raise_if( + not isinstance(lag, int) or (lag >= 0), + f"`{lags_name}` - `{comp_name}`: list must contain only strictly negative integers. " + f"Given: {comp_lags}.", + logger, + ) + tmp_components_lags[comp_name] = sorted(comp_lags) + else: + invalid_type = True + supported_types = "strictly positive integer or a list" + + if invalid_type: + raise_log( + ValueError( + f"`{lags_name}` - `{comp_name}`: must be either a {supported_types}. " + f"Gived : {type(comp_lags)}." + ), + logger, ) - elif isinstance(lags_future_covariates, list): - for lag in lags_future_covariates: - raise_if( - not isinstance(lag, int) or isinstance(lag, bool), - f"Every element of `lags_future_covariates` must be an integer. 
Given: {lags_future_covariates}.", - ) - if lags_future_covariates: - self.lags["future"] = sorted(lags_future_covariates) - self.pred_dim = self.output_chunk_length if self.multi_models else 1 + # extracting min and max lags values + if min_lags is None: + min_lags = tmp_components_lags[comp_name][0] + else: + min_lags = min(min_lags, tmp_components_lags[comp_name][0]) + + if max_lags is None: + max_lags = tmp_components_lags[comp_name][-1] + else: + max_lags = max(max_lags, tmp_components_lags[comp_name][-1]) + + # revert to shared lags logic when applicable + if list(tmp_components_lags.keys()) == ["default_lags"]: + processed_lags[lags_abbrev] = tmp_components_lags["default_lags"] + else: + processed_lags[lags_abbrev] = [min_lags, max_lags] + processed_component_lags[lags_abbrev] = tmp_components_lags + + return processed_lags, processed_component_lags + + def _get_lags(self, lags_type: str): + """ + If lags were specified in a component-wise manner, they are contained in self.component_lags and + the values in self.lags should be ignored as they correspond only to the extreme values. 
+ """ + if lags_type in self.component_lags: + return self.component_lags[lags_type] + else: + return self.lags.get(lags_type) @property def _model_encoder_settings( @@ -328,16 +414,12 @@ def extreme_lags( Optional[int], Optional[int], ]: - min_target_lag = self.lags.get("target")[0] if "target" in self.lags else None + min_target_lag = self.lags["target"][0] if "target" in self.lags else None max_target_lag = self.output_chunk_length - 1 - min_past_cov_lag = self.lags.get("past")[0] if "past" in self.lags else None - max_past_cov_lag = self.lags.get("past")[-1] if "past" in self.lags else None - min_future_cov_lag = ( - self.lags.get("future")[0] if "future" in self.lags else None - ) - max_future_cov_lag = ( - self.lags.get("future")[-1] if "future" in self.lags else None - ) + min_past_cov_lag = self.lags["past"][0] if "past" in self.lags else None + max_past_cov_lag = self.lags["past"][-1] if "past" in self.lags else None + min_future_cov_lag = self.lags["future"][0] if "future" in self.lags else None + max_future_cov_lag = self.lags["future"][-1] if "future" in self.lags else None return ( min_target_lag, max_target_lag, @@ -392,12 +474,12 @@ def _get_last_prediction_time(self, series, forecast_horizon, overlap_end): return last_valid_pred_time def _create_lagged_data( - self, target_series, past_covariates, future_covariates, max_samples_per_ts + self, + target_series: Sequence[TimeSeries], + past_covariates: Sequence[TimeSeries], + future_covariates: Sequence[TimeSeries], + max_samples_per_ts: int, ): - lags = self.lags.get("target") - lags_past_covariates = self.lags.get("past") - lags_future_covariates = self.lags.get("future") - ( features, labels, @@ -408,9 +490,9 @@ def _create_lagged_data( output_chunk_length=self.output_chunk_length, past_covariates=past_covariates, future_covariates=future_covariates, - lags=lags, - lags_past_covariates=lags_past_covariates, - lags_future_covariates=lags_future_covariates, + lags=self._get_lags("target"), + 
lags_past_covariates=self._get_lags("past"), + lags_future_covariates=self._get_lags("future"), uses_static_covariates=self.uses_static_covariates, last_static_covariates_shape=None, max_samples_per_ts=max_samples_per_ts, @@ -419,7 +501,26 @@ def _create_lagged_data( concatenate=False, ) + expected_nb_feat = ( + features[0].shape[1] + if isinstance(features, Sequence) + else features.shape[1] + ) for i, (X_i, y_i) in enumerate(zip(features, labels)): + # TODO: account for scenario where two wrong shapes can silently hide the problem + if expected_nb_feat != X_i.shape[1]: + shape_error_msg = [] + for ts, cov_name, arg_name in zip( + [target_series, past_covariates, future_covariates], + ["target", "past", "future"], + ["series", "past_covariates", "future_covariates"], + ): + if ts is not None and ts[i].width != self.input_dim[cov_name]: + shape_error_msg.append( + f"Expected {self.input_dim[cov_name]} components but received " + f"{ts[i].width} components at index {i} of `{arg_name}`." + ) + raise_log(ValueError("\n".join(shape_error_msg)), logger) features[i] = X_i[:, :, 0] labels[i] = y_i[:, :, 0] @@ -430,10 +531,10 @@ def _create_lagged_data( def _fit_model( self, - target_series, - past_covariates, - future_covariates, - max_samples_per_ts, + target_series: Sequence[TimeSeries], + past_covariates: Sequence[TimeSeries], + future_covariates: Sequence[TimeSeries], + max_samples_per_ts: int, **kwargs, ): """ @@ -458,9 +559,9 @@ def _fit_model( target_series=target_series, past_covariates=past_covariates, future_covariates=future_covariates, - lags=self.lags.get("target"), - lags_past_covariates=self.lags.get("past"), - lags_future_covariates=self.lags.get("future"), + lags=self._get_lags("target"), + lags_past_covariates=self._get_lags("past"), + lags_future_covariates=self._get_lags("future"), output_chunk_length=self.output_chunk_length, concatenate=False, use_static_covariates=self.uses_static_covariates, @@ -582,6 +683,52 @@ def fit( 
past_covariates=seq2series(past_covariates), future_covariates=seq2series(future_covariates), ) + variate2arg = { + "target": "lags", + "past": "lags_past_covariates", + "future": "lags_future_covariates", + } + + # if provided, component-wise lags must be defined for all the components of the first series + component_lags_error_msg = [] + for variate_type, variate in zip( + ["target", "past", "future"], [series, past_covariates, future_covariates] + ): + if variate_type not in self.component_lags: + continue + + # ignore the fallback lags entry + provided_components = set(self.component_lags[variate_type].keys()) + required_components = set(variate[0].components) + + wrong_components = list( + provided_components - {"default_lags"} - required_components + ) + missing_keys = list(required_components - provided_components) + # lags were specified for unrecognized components + if len(wrong_components) > 0: + component_lags_error_msg.append( + f"The `{variate2arg[variate_type]}` dictionary specifies lags for components that are not " + f"present in the series : {wrong_components}. They must be removed to avoid any ambiguity." + ) + elif len(missing_keys) > 0 and "default_lags" not in provided_components: + component_lags_error_msg.append( + f"The {variate2arg[variate_type]} dictionary is missing the lags for the following components " + f"present in the series: {missing_keys}. The key 'default_lags' can be used to provide lags for " + f"all the non-explicitely defined components." 
+ ) + else: + # reorder the components based on the input series, insert the default when necessary + self.component_lags[variate_type] = { + comp_name: self.component_lags[variate_type][comp_name] + if comp_name in self.component_lags[variate_type] + else self.component_lags[variate_type]["default_lags"] + for comp_name in variate[0].components + } + + # single error message for all the lags arguments + if len(component_lags_error_msg) > 0: + raise_log(ValueError("\n".join(component_lags_error_msg)), logger) self._fit_model( series, past_covariates, future_covariates, max_samples_per_ts, **kwargs @@ -783,23 +930,61 @@ def predict( series_matrix = np.concatenate( [series_matrix, predictions[-1]], axis=1 ) - np_X.append( - series_matrix[ - :, - [ - lag - (shift + last_step_shift) - for lag in self.lags["target"] - ], - ].reshape(len(series) * num_samples, -1) - ) - # retrieve covariate lags, enforce order (dict only preserves insertion order for python 3.6+) - for cov_type in ["past", "future"]: - if cov_type in covariate_matrices: + # component-wise lags + if "target" in self.component_lags: + tmp_X = [ + series_matrix[ + :, + [lag - (shift + last_step_shift) for lag in comp_lags], + comp_i, + ] + for comp_i, (comp, comp_lags) in enumerate( + self.component_lags["target"].items() + ) + ] + # values are grouped by component + np_X.append( + np.concatenate(tmp_X, axis=1).reshape( + len(series) * num_samples, -1 + ) + ) + else: + # values are grouped by lags np_X.append( - covariate_matrices[cov_type][ - :, relative_cov_lags[cov_type] + t_pred + series_matrix[ + :, + [ + lag - (shift + last_step_shift) + for lag in self.lags["target"] + ], ].reshape(len(series) * num_samples, -1) ) + # retrieve covariate lags, enforce order (dict only preserves insertion order for python 3.6+) + for cov_type in ["past", "future"]: + if cov_type in covariate_matrices: + # component-wise lags + if cov_type in self.component_lags: + tmp_X = [ + covariate_matrices[cov_type][ + :, + 
np.array(comp_lags) - self.lags[cov_type][0] + t_pred, + comp_i, + ] + for comp_i, (comp, comp_lags) in enumerate( + self.component_lags[cov_type].items() + ) + ] + np_X.append( + np.concatenate(tmp_X, axis=1).reshape( + len(series) * num_samples, -1 + ) + ) + else: + np_X.append( + covariate_matrices[cov_type][ + :, relative_cov_lags[cov_type] + t_pred + ].reshape(len(series) * num_samples, -1) + ) # concatenate retrieved lags X = np.concatenate(np_X, axis=1) diff --git a/darts/models/forecasting/xgboost.py b/darts/models/forecasting/xgboost.py index 962eddbeba..dd9268211e 100644 --- a/darts/models/forecasting/xgboost.py +++ b/darts/models/forecasting/xgboost.py @@ -8,13 +8,18 @@ """ from functools import partial -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Union import numpy as np import xgboost as xgb from darts.logging import get_logger -from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin +from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, + RegressionModel, + _LikelihoodMixin, +) from darts.timeseries import TimeSeries from darts.utils.utils import raise_if_not @@ -43,13 +48,13 @@ def xgb_quantile_loss(labels: np.ndarray, preds: np.ndarray, quantile: float): class XGBModel(RegressionModel, _LikelihoodMixin): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, - likelihood: str = None, - quantiles: List[float] = None, + likelihood: Optional[str] = None, + quantiles: Optional[List[float]] = None, random_state: Optional[int] = None, multi_models: Optional[bool] = True, use_static_covariates: bool = 
True, @@ -60,17 +65,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises an error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises an error if some + components are missing and the 'default_lags' key is not provided. 
lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If a tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, ..., future - 1)`, where `0` + corresponds to the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. Raises an error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. 
However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/tests/explainability/test_shap_explainer.py b/darts/tests/explainability/test_shap_explainer.py index b9919ba0ca..a5e950adb4 100644 --- a/darts/tests/explainability/test_shap_explainer.py +++ b/darts/tests/explainability/test_shap_explainer.py @@ -14,7 +14,7 @@ from darts import TimeSeries from darts.dataprocessing.transformers import Scaler from darts.explainability.explainability_result import ShapExplainabilityResult -from darts.explainability.shap_explainer import ShapExplainer +from darts.explainability.shap_explainer import MIN_BACKGROUND_SAMPLE, ShapExplainer from darts.models import ( CatBoostModel, ExponentialSmoothing, @@ -24,6 +24,7 @@ RegressionModel, XGBModel, ) +from darts.utils.timeseries_generation import linear_timeseries lgbm_available = not isinstance(LightGBMModel, NotImportedModule) cb_available = not isinstance(CatBoostModel, NotImportedModule) @@ -804,3 +805,50 @@ def test_shapley_multiple_series_with_different_static_covs(self): for explained_forecast in explanation_results.explained_forecasts: comps_out = explained_forecast[1]["price"].columns.tolist() assert comps_out[-1] == "type_statcov_target_price" + + def test_shap_regressor_component_specific_lags(self): + model = LinearRegressionModel( + lags={"price": [-3, -2], "power": [-1]}, + output_chunk_length=1, + ) + # multivariate ts as short as possible + min_ts_length = MIN_BACKGROUND_SAMPLE * np.abs(min(model.lags["target"])) + ts = linear_timeseries( + start_value=1, + end_value=min_ts_length, + length=min_ts_length, + column_name="price", + ).stack( + linear_timeseries( + start_value=102, + end_value=100 + 2 * min_ts_length, + length=min_ts_length, + column_name="power", + ) + ) + model.fit(ts) + shap_explain = ShapExplainer(model) + + # one column per lag, grouped by components + expected_columns = [ + "price_target_lag-3", + "price_target_lag-2", + "power_target_lag-1", + ] + expected_df = 
pd.DataFrame( + data=np.stack( + [np.arange(1, 29), np.arange(3, 31), np.arange(106, 161, 2)], axis=1 + ), + columns=expected_columns, + ) + + # check that the appropriate lags are extracted + assert all(shap_explain.explainers.background_X == expected_df) + assert model.lagged_feature_names == list(expected_df.columns) + + # check that explain() can be called + explanation_results = shap_explain.explain() + plt.close() + for comp in ts.components: + comps_out = explanation_results.explained_forecasts[1][comp].columns + assert all(comps_out == expected_columns) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 5601f8f2d4..9d5c369526 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -27,7 +27,6 @@ RegressionModel, XGBModel, ) -from darts.models.forecasting.forecasting_model import GlobalForecastingModel from darts.utils import timeseries_generation as tg from darts.utils.multioutput import MultiOutputRegressor @@ -416,7 +415,9 @@ def test_model_construction(self, config): # testing lags_past_covariates model_instance = model(lags=None, lags_past_covariates=3, multi_models=mode) assert model_instance.lags.get("past") == [-3, -2, -1] - # testing lags_future covariates + # lags_future covariates does not support SINGLE INT + + # TESTING TUPLE of int, only supported by lags_future_covariates model_instance = model( lags=None, lags_future_covariates=(3, 5), multi_models=mode ) @@ -431,6 +432,25 @@ def test_model_construction(self, config): model_instance = model(lags_past_covariates=values, multi_models=mode) assert model_instance.lags.get("past") == values # testing lags_future_covariates + values = [-5, -1, 5] + model_instance = model(lags_future_covariates=values, multi_models=mode) + assert model_instance.lags.get("future") == values + + # TESTING DICT, lags are specified component-wise + # 
model.lags contains the extreme across the components + values = {"comp0": [-4, -2], "comp1": [-5, -3]} + model_instance = model(lags=values, multi_models=mode) + assert model_instance.lags.get("target") == [-5, -2] + assert model_instance.component_lags.get("target") == values + # testing lags_past_covariates + model_instance = model(lags_past_covariates=values, multi_models=mode) + assert model_instance.lags.get("past") == [-5, -2] + assert model_instance.component_lags.get("past") == values + # testing lags_future_covariates + values = {"comp0": [-4, 2], "comp1": [-5, 3]} + model_instance = model(lags_future_covariates=values, multi_models=mode) + assert model_instance.lags.get("future") == [-5, 3] + assert model_instance.component_lags.get("future") == values with pytest.raises(ValueError): model(multi_models=mode) @@ -460,10 +480,15 @@ def test_model_construction(self, config): model(lags=5, lags_future_covariates=(1, True), multi_models=mode) with pytest.raises(ValueError): model(lags=5, lags_future_covariates=(1, 1.0), multi_models=mode) + with pytest.raises(ValueError): + model(lags=5, lags_future_covariates={}, multi_models=mode) + with pytest.raises(ValueError): + model(lags=None, lags_future_covariates={}, multi_models=mode) @pytest.mark.parametrize("mode", [True, False]) def test_training_data_creation(self, mode): - # testing _get_training_data function + """testing _get_training_data function""" + # lags defined using lists of integers model_instance = RegressionModel( lags=self.lags_1["target"], lags_past_covariates=self.lags_1["past"], @@ -512,6 +537,76 @@ def test_training_data_creation(self, mode): ] assert list(training_labels[0]) == [82, 182, 282] + # lags defined using dictionaries + # cannot use 'default_lags' because it's converted in `fit()`, before calling `_created_lagged_data` + model_instance = RegressionModel( + lags={"0-trgt-0": [-4, -3], "0-trgt-1": [-3, -2], "0-trgt-2": [-2, -1]}, + lags_past_covariates={"0-pcov-0": [-10], 
"0-pcov-1": [-7]}, + lags_future_covariates={"0-fcov-0": (2, 2)}, + multi_models=mode, + ) + + max_samples_per_ts = 3 + + # using only one series of each + training_samples, training_labels = model_instance._create_lagged_data( + target_series=self.target_series[0], + past_covariates=self.past_covariates[0], + future_covariates=self.future_covariates[0], + max_samples_per_ts=max_samples_per_ts, + ) + + # checking number of dimensions + assert len(training_samples.shape) == 2 # samples, features + assert len(training_labels.shape) == 2 # samples, components (multivariate) + assert training_samples.shape[0] == training_labels.shape[0] + assert training_samples.shape[0] == max_samples_per_ts + assert ( + training_samples.shape[1] + == 6 # [-4, -3], [-3, -2], [-2, -1] + + 2 # [-10], [-7] + + 4 # [-2, -1, 0, 1] + ) + + # check last sample + assert list(training_labels[0]) == [97, 197, 297] + # lags are grouped by components instead of lags + assert list(training_samples[0, :]) == [ + 93, + 94, + 194, + 195, + 295, + 296, # comp_i = comp_0 + i*100 + 10087, + 10190, # past cov; target + 10'000 + 20095, + 20096, + 20097, + 20098, # future cov; target + 20'000 + ] + + # checking the name of the lagged features + model_instance.fit( + series=self.target_series[0], + past_covariates=self.past_covariates[0], + future_covariates=self.future_covariates[0], + ) + assert model_instance.lagged_feature_names == [ + "0-trgt-0_target_lag-4", + "0-trgt-0_target_lag-3", + "0-trgt-1_target_lag-3", + "0-trgt-1_target_lag-2", + "0-trgt-2_target_lag-2", + "0-trgt-2_target_lag-1", + "0-pcov-0_pastcov_lag-10", + "0-pcov-1_pastcov_lag-7", + "0-fcov-0_futcov_lag-2", + "0-fcov-0_futcov_lag-1", + "0-fcov-0_futcov_lag0", + "0-fcov-0_futcov_lag1", + ] + @pytest.mark.parametrize("mode", [True, False]) def test_prediction_data_creation(self, mode): # assigning correct names to variables @@ -940,11 +1035,37 @@ def test_models_runnability(self, config): def test_fit(self, config): # test fitting both 
on univariate and multivariate timeseries model, mode, series = config + # auto-regression but past_covariates does not extend enough in the future with pytest.raises(ValueError): model_instance = model(lags=4, lags_past_covariates=4, multi_models=mode) model_instance.fit(series=series, past_covariates=self.sine_multivariate1) model_instance.predict(n=10) + # inconsistent number of components in series Sequence[TimeSeries] + training_series = [series.stack(series + 10), series] + with pytest.raises(ValueError) as err: + model_instance = model(lags=4, multi_models=mode) + model_instance.fit(series=training_series) + assert ( + str(err.value) + == f"Expected {training_series[0].width} components but received {training_series[1].width} " + f"components at index 1 of `series`." + ) + + # inconsistent number of components in past_covariates Sequence[TimeSeries] + training_past_covs = [series, series.stack(series * 2)] + with pytest.raises(ValueError) as err: + model_instance = model(lags=4, lags_past_covariates=2, multi_models=mode) + model_instance.fit( + series=[series, series + 10], + past_covariates=training_past_covs, + ) + assert ( + str(err.value) + == f"Expected {training_past_covs[0].width} components but received {training_past_covs[1].width} " + f"components at index 1 of `past_covariates`." 
+ ) + model_instance = model(lags=12, multi_models=mode) model_instance.fit(series=series) assert model_instance.lags.get("past") is None @@ -1529,6 +1650,239 @@ def test_integer_indexed_series(self, mode): # the time axis returned by the second model should be as expected assert all(preds[1].time_index == pd.RangeIndex(start=50, stop=70, step=2)) + @pytest.mark.parametrize( + "config", + itertools.product( + [ + ({"lags": [-3, -2, -1]}, {"lags": {"gaussian": 3}}), + ({"lags": 3}, {"lags": {"gaussian": 3, "sine": 3}}), + ( + {"lags_past_covariates": 2}, + {"lags_past_covariates": {"lin_past": 2}}, + ), + ( + {"lags": 5, "lags_future_covariates": [-2, 3]}, + { + "lags": { + "gaussian": [-5, -4, -3, -2, -1], + "sine": [-5, -4, -3, -2, -1], + }, + "lags_future_covariates": { + "lin_future": [-2, 3], + "sine_future": [-2, 3], + }, + }, + ), + ( + {"lags": 5, "lags_future_covariates": [-2, 3]}, + { + "lags": { + "gaussian": [-5, -4, -3, -2, -1], + "sine": [-5, -4, -3, -2, -1], + }, + "lags_future_covariates": { + "sine_future": [-2, 3], + "default_lags": [-2, 3], + }, + }, + ), + ], + [True, False], + ), + ) + def test_component_specific_lags_forecasts(self, config): + """Verify that the same lags, defined using int/list or dictionnaries yield the same results""" + (list_lags, dict_lags), multiple_series = config + multivar_target = "lags" in dict_lags and len(dict_lags["lags"]) > 1 + multivar_future_cov = ( + "lags_future_covariates" in dict_lags + and len(dict_lags["lags_future_covariates"]) > 1 + ) + + # create series based on the model parameters + series = tg.gaussian_timeseries(length=20, column_name="gaussian") + if multivar_target: + series = series.stack(tg.sine_timeseries(length=20, column_name="sine")) + + future_cov = tg.linear_timeseries(length=30, column_name="lin_future") + if multivar_future_cov: + future_cov = future_cov.stack( + tg.sine_timeseries(length=30, column_name="sine_future") + ) + + past_cov = tg.linear_timeseries(length=30, 
column_name="lin_past") + + if multiple_series: + # second series have different component names + series = [ + series, + series.with_columns_renamed( + ["gaussian", "sine"][: series.width], + ["other", "names"][: series.width], + ) + + 10, + ] + past_cov = [past_cov, past_cov] + future_cov = [future_cov, future_cov] + + # the lags are identical across the components for each series + model = LinearRegressionModel(**list_lags) + model.fit( + series=series, + past_covariates=past_cov if model.supports_past_covariates else None, + future_covariates=future_cov if model.supports_future_covariates else None, + ) + + # the lags are specified for each component, individually + model2 = LinearRegressionModel(**dict_lags) + model2.fit( + series=series, + past_covariates=past_cov if model2.supports_past_covariates else None, + future_covariates=future_cov if model2.supports_future_covariates else None, + ) + + # n == output_chunk_length + pred = model.predict( + 1, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + pred2 = model2.predict( + 1, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model2.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model2.supports_future_covariates + else None, + ) + np.testing.assert_array_almost_equal(pred.values(), pred2.values()) + assert pred.time_index.equals(pred2.time_index) + + # n > output_chunk_length + pred = model.predict( + 3, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + pred2 = model2.predict( + 3, + 
series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model2.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model2.supports_future_covariates + else None, + ) + np.testing.assert_array_almost_equal(pred.values(), pred2.values()) + assert pred.time_index.equals(pred2.time_index) + + @pytest.mark.parametrize( + "config", + itertools.product( + [ + {"lags": {"gaussian": [-1, -3], "sine": [-2, -4, -6]}}, + {"lags_past_covariates": {"default_lags": 2}}, + { + "lags": { + "gaussian": [-5, -2, -1], + "sine": [-2, -1], + }, + "lags_future_covariates": { + "lin_future": (1, 4), + "default_lags": (2, 2), + }, + }, + { + "lags": { + "default_lags": [-5, -4], + }, + "lags_future_covariates": { + "sine_future": (1, 1), + "default_lags": [-2, 0, 1, 2], + }, + }, + ], + [True, False], + ), + ) + def test_component_specific_lags(self, config): + """Checking various combination of component-specific lags""" + (dict_lags, multiple_series) = config + multivar_target = "lags" in dict_lags and len(dict_lags["lags"]) > 1 + multivar_future_cov = ( + "lags_future_covariates" in dict_lags + and len(dict_lags["lags_future_covariates"]) > 1 + ) + + # create series based on the model parameters + series = tg.gaussian_timeseries(length=20, column_name="gaussian") + if multivar_target: + series = series.stack(tg.sine_timeseries(length=20, column_name="sine")) + + future_cov = tg.linear_timeseries(length=30, column_name="lin_future") + if multivar_future_cov: + future_cov = future_cov.stack( + tg.sine_timeseries(length=30, column_name="sine_future") + ) + + past_cov = tg.linear_timeseries(length=30, column_name="lin_past") + + if multiple_series: + # second series have different component names + series = [ + series, + series.with_columns_renamed( + ["gaussian", "sine"][: series.width], + ["other", "names"][: series.width], + ) + + 10, + ] + past_cov = [past_cov, past_cov] + future_cov = 
[future_cov, future_cov] + + model = LinearRegressionModel(**dict_lags, output_chunk_length=4) + model.fit( + series=series, + past_covariates=past_cov if model.supports_past_covariates else None, + future_covariates=future_cov if model.supports_future_covariates else None, + ) + # n < output_chunk_length + model.predict( + 1, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + + # n > output_chunk_length + model.predict( + 7, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + @pytest.mark.parametrize( "config", itertools.product( @@ -2262,29 +2616,34 @@ def test_fit_predict_determinism(self, config): @pytest.mark.parametrize( "config", itertools.product(models_cls_kwargs_errs, [True, False]) ) - def test_probabilistic_forecast_accuracy(self, config): + def test_probabilistic_forecast_accuracy_univariate(self, config): (model_cls, model_kwargs, err), mode = config model_kwargs["multi_models"] = mode + model = model_cls(**model_kwargs) self.helper_test_probabilistic_forecast_accuracy( - model_cls, - model_kwargs, + model, err, self.constant_ts, self.constant_noisy_ts, ) - if issubclass(model_cls, GlobalForecastingModel): + + @pytest.mark.slow + @pytest.mark.parametrize( + "config", itertools.product(models_cls_kwargs_errs, [True, False]) + ) + def test_probabilistic_forecast_accuracy_multivariate(self, config): + (model_cls, model_kwargs, err), mode = config + model_kwargs["multi_models"] = mode + model = model_cls(**model_kwargs) + if model.supports_multivariate: self.helper_test_probabilistic_forecast_accuracy( - model_cls, - model_kwargs, + model, err, 
self.constant_multivar_ts, self.constant_noisy_multivar_ts, ) - def helper_test_probabilistic_forecast_accuracy( - self, model_cls, model_kwargs, err, ts, noisy_ts - ): - model = model_cls(**model_kwargs) + def helper_test_probabilistic_forecast_accuracy(self, model, err, ts, noisy_ts): model.fit(noisy_ts[:100]) pred = model.predict(n=100, num_samples=100) diff --git a/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py b/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py index 3c46330022..4bff71fbe9 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py @@ -1396,7 +1396,7 @@ def test_lagged_prediction_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags` must be a `Sequence` containing only `int` values less than 0." + "`lags` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # Test invalid `lags_past_covariates` values: with pytest.raises(ValueError) as err: @@ -1407,7 +1407,7 @@ def test_lagged_prediction_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags_past_covariates` must be a `Sequence` containing only `int` values less than 0." + "`lags_past_covariates` must be a `Sequence` or `Dict` containing only `int` values less than 0." 
) == str(err.value) # This should *not* throw an error: create_lagged_prediction_data( diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index b17a3f862c..98f515e545 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -1695,7 +1695,7 @@ def test_lagged_training_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags` must be a `Sequence` containing only `int` values less than 0." + "`lags` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # Test invalid `lags_past_covariates` values: with pytest.raises(ValueError) as err: @@ -1708,7 +1708,7 @@ def test_lagged_training_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags_past_covariates` must be a `Sequence` containing only `int` values less than 0." + "`lags_past_covariates` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # Test invalid `lags_future_covariates` values: create_lagged_training_data( diff --git a/darts/tests/utils/tabularization/test_get_feature_times.py b/darts/tests/utils/tabularization/test_get_feature_times.py index 6402fc2d32..e63a8e4057 100644 --- a/darts/tests/utils/tabularization/test_get_feature_times.py +++ b/darts/tests/utils/tabularization/test_get_feature_times.py @@ -1055,7 +1055,7 @@ def test_feature_times_invalid_lag_values_error(self): with pytest.raises(ValueError) as err: _get_feature_times(target_series=series, lags=[0], is_training=False) assert ( - "`lags` must be a `Sequence` containing only `int` values less than 0." + "`lags` must be a `Sequence` or `Dict` containing only `int` values less than 0." 
) == str(err.value) # `lags_past_covariates` not <= -1: with pytest.raises(ValueError) as err: @@ -1063,7 +1063,7 @@ def test_feature_times_invalid_lag_values_error(self): past_covariates=series, lags_past_covariates=[0], is_training=False ) assert ( - "`lags_past_covariates` must be a `Sequence` containing only `int` values less than 0." + "`lags_past_covariates` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # `lags_future_covariates` can be positive, negative, and/or zero - no error should be thrown: _get_feature_times( diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 74c1c65ea7..835d793196 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -1,13 +1,15 @@ import warnings from functools import reduce from math import inf -from typing import List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union try: from typing import Literal except ImportError: from typing_extensions import Literal +from itertools import chain + import numpy as np import pandas as pd from numpy.lib.stride_tricks import as_strided @@ -25,9 +27,9 @@ def create_lagged_data( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, output_chunk_length: int = 1, uses_static_covariates: bool = True, last_static_covariates_shape: Optional[Tuple[int, int]] = None, @@ -152,15 
+154,18 @@ def create_lagged_data( Optionally, the lags of the target series to be used as (auto-regressive) features. If not specified, auto-regressive features will *not* be added to `X`. Each lag value is assumed to be negative (e.g. `lags = [-3, -1]` will extract `target_series` values which are 3 timesteps and 1 timestep away from - the current value). + the current value). If the lags are provided as a dictionary, the lags values are specific to each + component in the target series. lags_past_covariates Optionally, the lags of `past_covariates` to be used as features. Like `lags`, each lag value is assumed to - be less than or equal to -1. + be less than or equal to -1. If the lags are provided as a dictionary, the lags values are specific to each + component in the past covariates series. lags_future_covariates Optionally, the lags of `future_covariates` to be used as features. Unlike `lags` and `lags_past_covariates`, `lags_future_covariates` values can be positive (i.e. use values *after* time `t` to predict target at time `t`), zero (i.e. use values *at* time `t` to predict target at time `t`), and/or - negative (i.e. use values *before* time `t` to predict target at time `t`). + negative (i.e. use values *before* time `t` to predict target at time `t`). If the lags are provided as + a dictionary, the lags values are specific to each component in the future covariates series. uses_static_covariates Whether the model uses/expects static covariates. If `True`, it enforces that static covariates must have identical shapes across all target series. 
@@ -329,9 +334,9 @@ def create_lagged_training_data( output_chunk_length: int, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, uses_static_covariates: bool = True, last_static_covariates_shape: Optional[Tuple[int, int]] = None, max_samples_per_ts: Optional[int] = None, @@ -370,15 +375,18 @@ def create_lagged_training_data( Optionally, the lags of the target series to be used as (auto-regressive) features. If not specified, auto-regressive features will *not* be added to `X`. Each lag value is assumed to be negative (e.g. `lags = [-3, -1]` will extract `target_series` values which are 3 timesteps and 1 timestep away from - the current value). + the current value). If the lags are provided as a dictionary, the lags values are specific to each + component in the target series. lags_past_covariates Optionally, the lags of `past_covariates` to be used as features. Like `lags`, each lag value is assumed to - be less than or equal to -1. + be less than or equal to -1. If the lags are provided as a dictionary, the lags values are specific to each + component in the past covariates series. lags_future_covariates Optionally, the lags of `future_covariates` to be used as features. Unlike `lags` and `lags_past_covariates`, `lags_future_covariates` values can be positive (i.e. use values *after* time `t` to predict target at time `t`), zero (i.e. use values *at* time `t` to predict target at time `t`), and/or negative (i.e. use values - *before* time `t` to predict target at time `t`). 
+ *before* time `t` to predict target at time `t`). If the lags are provided as a dictionary, the lags values + are specific to each component in the future covariates series. uses_static_covariates Whether the model uses/expects static covariates. If `True`, it enforces that static covariates must have identical shapes across all target series. @@ -467,9 +475,9 @@ def create_lagged_prediction_data( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, uses_static_covariates: bool = True, last_static_covariates_shape: Optional[Tuple[int, int]] = None, max_samples_per_ts: Optional[int] = None, @@ -500,15 +508,18 @@ def create_lagged_prediction_data( Optionally, the lags of the target series to be used as (auto-regressive) features. If not specified, auto-regressive features will *not* be added to `X`. Each lag value is assumed to be negative (e.g. `lags = [-3, -1]` will extract `target_series` values which are 3 timesteps and 1 timestep away from - the current value). + the current value). If the lags are provided as a dictionary, the lags values are specific to each + component in the target series. lags_past_covariates Optionally, the lags of `past_covariates` to be used as features. Like `lags`, each lag value is assumed to - be less than or equal to -1. + be less than or equal to -1. If the lags are provided as a dictionary, the lags values are specific to each + component in the past covariates series. 
lags_future_covariates Optionally, the lags of `future_covariates` to be used as features. Unlike `lags` and `lags_past_covariates`, `lags_future_covariates` values can be positive (i.e. use values *after* time `t` to predict target at time `t`), zero (i.e. use values *at* time `t` to predict target at time `t`), and/or negative (i.e. use - values *before* time `t` to predict target at time `t`). + values *before* time `t` to predict target at time `t`). If the lags are provided as a dictionary, the lags + values are specific to each component in the future covariates series. uses_static_covariates Whether the model uses/expects static covariates. If `True`, it enforces that static covariates must have identical shapes across all target series. @@ -582,11 +593,11 @@ def create_lagged_prediction_data( def add_static_covariates_to_lagged_data( - features: Union[np.array, Sequence[np.array]], + features: Union[np.ndarray, Sequence[np.ndarray]], target_series: Union[TimeSeries, Sequence[TimeSeries]], uses_static_covariates: bool = True, last_shape: Optional[Tuple[int, int]] = None, -) -> Union[np.array, Sequence[np.array]]: +) -> Union[np.ndarray, Sequence[np.ndarray]]: """ Add static covariates to the features' table for RegressionModels. 
If `uses_static_covariates=True`, all target series used in `fit()` and `predict()` must have static @@ -676,9 +687,9 @@ def create_lagged_component_names( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, output_chunk_length: int = 1, concatenate: bool = True, use_static_covariates: bool = False, @@ -687,11 +698,16 @@ def create_lagged_component_names( Helper function called to retrieve the name of the features and labels arrays created with `create_lagged_data()`. 
The order of the features is the following: - Along the `n_lagged_features` axis, `X` has the following structure (for `*_lags=[-2,-1]` and - `*_series.n_components = 2`): + Along the `n_lagged_features` axis, `X` has the following structure: lagged_target | lagged_past_covariates | lagged_future_covariates | static covariates - where each `lagged_*` has the following structure: + + For `*_lags=[-2,-1]` and `*_series.n_components = 2` (lags shared across all the components), + each `lagged_*` has the following structure (grouped by lags): comp0_*_lag-2 | comp1_*_lag-2 | comp0_*_lag_-1 | comp1_*_lag-1 + For `*_lags={'comp0':[-2, -1], 'comp1':[-5, -3]}` and `*_series.n_components = 2` (component- + specific lags), each `lagged_*` has the following structure (grouped by components): + comp0_*_lag-2 | comp0_*_lag-1 | comp1_*_lag-5 | comp1_*_lag-3 + and for static covariates (2 static covariates acting on 2 target components): cov0_*_target_comp0 | cov0_*_target_comp1 | cov1_*_target_comp0 | cov1_*_target_comp1 @@ -743,11 +759,17 @@ def create_lagged_component_names( continue components = get_single_series(variate).components.tolist() - lagged_feature_names += [ - f"{name}_{variate_type}_lag{lag}" - for lag in variate_lags - for name in components - ] + if isinstance(variate_lags, dict): + for name in components: + lagged_feature_names += [ + f"{name}_{variate_type}_lag{lag}" for lag in variate_lags[name] + ] + else: + lagged_feature_names += [ + f"{name}_{variate_type}_lag{lag}" + for lag in variate_lags + for name in components + ] if variate_type == "target" and lags: label_feature_names = [ @@ -774,13 +796,13 @@ def create_lagged_component_names( def _create_lagged_data_by_moving_window( - target_series: TimeSeries, + target_series: Optional[TimeSeries], output_chunk_length: int, past_covariates: Optional[TimeSeries], future_covariates: Optional[TimeSeries], - lags: Optional[Sequence[int]], - lags_past_covariates: Optional[Sequence[int]], - lags_future_covariates:
Optional[Sequence[int]], + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], max_samples_per_ts: Optional[int], multi_models: bool, check_inputs: bool, @@ -891,7 +913,14 @@ def _create_lagged_data_by_moving_window( # Within each window, the `-1` indexed value (i.e. the value at the very end of # the window) corresponds to time `t - min_lag_i`. The negative index of the time # `t + lag_i` within this window is, therefore, `-1 + lag_i + min_lag_i`: - lags_to_extract = np.array(lags_i, dtype=int) + min_lag_i - 1 + if isinstance(lags_i, list): + lags_to_extract = np.array(lags_i, dtype=int) + min_lag_i - 1 + else: + # Lags are grouped by component, extracted from the same window + lags_to_extract = [ + np.array(comp_lags, dtype=int) + min_lag_i - 1 + for comp_lags in lags_i.values() + ] lagged_vals = _extract_lagged_vals_from_windows(windows, lags_to_extract) X.append(lagged_vals) # Cache `start_time_idx` for label creation: @@ -928,7 +957,8 @@ def _create_lagged_data_by_moving_window( def _extract_lagged_vals_from_windows( - windows: np.ndarray, lags_to_extract: Optional[np.ndarray] = None + windows: np.ndarray, + lags_to_extract: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> np.ndarray: """ Helper function called by `_create_lagged_data_by_moving_window` that @@ -938,19 +968,34 @@ def _extract_lagged_vals_from_windows( is done such that the order of elements along axis 1 matches the pattern described in the docstring of `create_lagged_data`. - If `lags_to_extract` is specified, then only those values within each window that + If `lags_to_extract` is not specified, all of the values within each window are extracted. + If `lags_to_extract` is specified as an np.ndarray, then only those values within each window that are indexed by `lags_to_extract` will be returned.
In such cases, the shape of the returned lagged values is `(num_windows, num_components * lags_to_extract.size, num_series)`. For example, if `lags_to_extract = [-2]`, only the second-to-last values within each window will be extracted. - If `lags_to_extract` is not specified, all of the values within each window is extracted. + If `lags_to_extract` is specified as a list of np.ndarray, the values will be extracted using the + lags provided for each component. In such cases, the shape of the returned lagged values is + `(num_windows, sum([comp_lags.size for comp_lags in lags_to_extract]), num_series)`. For example, + if `lags_to_extract = [[-2, -1], [-1]]`, the second-to-last and last values of the first component + and the last values of the second component within each window will be extracted. """ # windows.shape = (num_windows, num_components, num_samples, window_len): - if lags_to_extract is not None: - windows = windows[:, :, :, lags_to_extract] - # windows.shape = (num_windows, window_len, num_components, num_samples): - windows = np.moveaxis(windows, (0, 3, 1, 2), (0, 1, 2, 3)) - # lagged_vals.shape = (num_windows, num_components*window_len, num_samples): - lagged_vals = windows.reshape((windows.shape[0], -1, windows.shape[-1])) + if isinstance(lags_to_extract, list): + # iterate over the components-specific lags + comp_windows = [ + windows[:, i, :, comp_lags_to_extract] + for i, comp_lags_to_extract in enumerate(lags_to_extract) + ] + # windows.shape = (sum(lags_len) across components, num_windows, num_samples): + windows = np.concatenate(comp_windows, axis=0) + lagged_vals = np.moveaxis(windows, (1, 0, 2), (0, 1, 2)) + else: + if lags_to_extract is not None: + windows = windows[:, :, :, lags_to_extract] + # windows.shape = (num_windows, window_len, num_components, num_samples): + windows = np.moveaxis(windows, (0, 3, 1, 2), (0, 1, 2, 3)) + # lagged_vals.shape = (num_windows, num_components*window_len, num_samples): + lagged_vals = 
windows.reshape((windows.shape[0], -1, windows.shape[-1])) return lagged_vals @@ -1081,9 +1126,9 @@ def _get_feature_times( target_series: Optional[TimeSeries] = None, past_covariates: Optional[TimeSeries] = None, future_covariates: Optional[TimeSeries] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, output_chunk_length: int = 1, is_training: bool = True, return_min_and_max_lags: bool = False, @@ -1198,6 +1243,9 @@ def _get_feature_times( Optionally, specifies whether the largest magnitude lag value for each series should also be returned along with the 'eligible' feature times + Note: if the lags are provided as a dictionary for the target series or any of the covariates series, the + component-specific lags are grouped into a single list to compute the corresponding feature time. 
+ Returns ------- feature_times @@ -1243,6 +1291,10 @@ def _get_feature_times( [target_series, past_covariates, future_covariates], [lags, lags_past_covariates, lags_future_covariates], ): + # union of the component-specific lags, unsorted + if isinstance(lags_i, dict): + lags_i = list(set(chain(*lags_i.values()))) + if check_inputs and (series_i is not None): _check_series_length( series_i, @@ -1591,9 +1643,9 @@ def _all_equal_freq(*series: Union[TimeSeries, None]) -> bool: def _check_lags( - lags: Sequence[int], - lags_past_covariates: Sequence[int], - lags_future_covariates: Sequence[int], + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], ) -> None: """ Throws `ValueError` if any `lag` values aren't negative OR if no lags have been specified. @@ -1606,9 +1658,13 @@ def _check_lags( if not lags_is_none[-1]: is_target_or_past = i < 2 max_lag = -1 if is_target_or_past else inf + + if isinstance(lags_i, dict): + lags_i = list(set(chain(*lags_i.values()))) + raise_if( any((lag > max_lag or not isinstance(lag, int)) for lag in lags_i), - f"`lags{suffix}` must be a `Sequence` containing only `int` values less than {max_lag + 1}.", + f"`lags{suffix}` must be a `Sequence` or `Dict` containing only `int` values less than {max_lag + 1}.", ) raise_if( all(lags_is_none),