From cfd381bc6c3e167099697c80099bd5407c1cb433 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Fri, 18 Aug 2023 11:41:02 +0200 Subject: [PATCH 01/30] feat: updated lags sanity checks to accept dictionnary --- darts/models/forecasting/regression_model.py | 189 ++++++++++++++----- 1 file changed, 146 insertions(+), 43 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index f51c26e902..cd0d797ee8 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -60,13 +60,18 @@ logger = get_logger(__name__) +LAGS_TYPE = Union[int, List[int], Dict[str, Union[int, List[int]]]] +FUTURE_LAGS_TYPE = Union[ + Tuple[int, int], List[int], Dict[str, Union[Tuple[int, int], List[int]]] +] + class RegressionModel(GlobalForecastingModel): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, model=None, @@ -80,7 +85,10 @@ def __init__( ---------- lags Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise, a list of integers with lags is required (each lag must be < 0). + are used (from -1 backward). Otherwise, a list of integers with lags (each lag must be < 0). + In order to specify component-wise lags, a dictionnary with the component name or index as key and the + lags value can be provided. The number of keys in the dictionnary must match the number of components in + the series. lags_past_covariates Number of lagged past_covariates values used to predict the next time step. 
If an integer is given the last `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers @@ -132,6 +140,7 @@ def __init__( self.model = model self.lags: Dict[str, List[int]] = {} + self.component_lags: Dict[str, Dict[str, List[int]]] = {} self.input_dim = None self.multi_models = multi_models self._considers_static_covariates = use_static_covariates @@ -174,18 +183,18 @@ def __init__( for _lags, lags_name in lags_type_checks: raise_if_not( - isinstance(_lags, (int, list)) or _lags is None, - f"`{lags_name}` must be of type int or list. Given: {type(_lags)}.", + isinstance(_lags, (int, list, dict)) or _lags is None, + f"`{lags_name}` must be of type int, list or dict. Given: {type(_lags)}.", ) raise_if( isinstance(_lags, bool), - f"`{lags_name}` must be of type int or list, not bool.", + f"`{lags_name}` must be of type int, list or dict, not bool.", ) raise_if_not( - isinstance(lags_future_covariates, (tuple, list)) + isinstance(lags_future_covariates, (tuple, list, dict)) or lags_future_covariates is None, - f"`lags_future_covariates` must be of type tuple or list. Given: {type(lags_future_covariates)}.", + f"`lags_future_covariates` must be of type tuple, list or dict. Given: {type(lags_future_covariates)}.", ) if isinstance(lags_future_covariates, tuple): @@ -202,57 +211,151 @@ def __init__( ) # set lags - if isinstance(lags, int): - raise_if_not(lags > 0, f"`lags` must be strictly positive. Given: {lags}.") + def _check_int_lags(lags: int, lags_name: str) -> Optional[List[int]]: + raise_if_not( + lags > 0, f"{lags_name} must be strictly positive. Given: {lags}." 
+ ) # selecting last `lags` lags, starting from position 1 (skipping current, pos 0, the one we want to predict) - self.lags["target"] = list(range(-lags, 0)) - elif isinstance(lags, list): + return list(range(-lags, 0)) + + def _check_list_lags(lags: list, lags_name: str) -> Optional[List[int]]: for lag in lags: raise_if( not isinstance(lag, int) or (lag >= 0), - f"Every element of `lags` must be a strictly negative integer. Given: {lags}.", + f"Every element of {lags_name} must be a strictly negative integer. Given: {lags}.", ) if lags: - self.lags["target"] = sorted(lags) + return sorted(lags) + + def _check_dict_lags( + lags: dict, lags_name: str + ) -> Optional[Tuple[List[int], Dict[str, List[int]]]]: + components_lags = dict() + min_lags = None + max_lags = None + # TODO: use component idx instead of component name for robustness? + for comp_idx, (comp_name, comp_lags) in enumerate(lags.items()): + if isinstance(comp_lags, int): + components_lags[comp_name] = _check_int_lags( + comp_lags, f"{lags_name} for component {comp_name}" + ) + elif isinstance(comp_lags, list): + components_lags[comp_name] = _check_list_lags( + comp_lags, f"{lags_name} for component {comp_name}" + ) + else: + raise_log( + ValueError( + f"when passed as a dictionnary, {lags_name} for component {comp_name} must be either a " + f"strictly positive integer or a list, received : {type(comp_lags)}." 
+ ), + logger, + ) + min_lags: int = min(components_lags[comp_name]) + max_lags: int = max(components_lags[comp_name]) + return [min_lags, max_lags], components_lags + + if isinstance(lags, int): + conv_lags = _check_int_lags(lags, "`lags`") + if conv_lags: + self.lags["target"] = conv_lags + elif isinstance(lags, list): + conv_lags = _check_list_lags(lags, "`lags`") + if conv_lags: + self.lags["target"] = conv_lags + elif isinstance(lags, dict): + conv_lags = _check_dict_lags(lags, "`lags`") + if conv_lags: + # dummy, used to compute the extreme lags + self.lags["target"] = conv_lags[0] + # actual lags + self.component_lags["target"] = conv_lags[1] if isinstance(lags_past_covariates, int): - raise_if_not( - lags_past_covariates > 0, - f"`lags_past_covariates` must be an integer > 0. Given: {lags_past_covariates}.", - ) - self.lags["past"] = list(range(-lags_past_covariates, 0)) + conv_lags = _check_int_lags(lags_past_covariates, "`lags_past_covariates`") + if conv_lags: + self.lags["past"] = conv_lags elif isinstance(lags_past_covariates, list): - for lag in lags_past_covariates: - raise_if( - not isinstance(lag, int) or (lag >= 0), - f"Every element of `lags_covariates` must be an integer < 0. 
Given: {lags_past_covariates}.", - ) - if lags_past_covariates: - self.lags["past"] = sorted(lags_past_covariates) - - if isinstance(lags_future_covariates, tuple): + conv_lags = _check_list_lags(lags_past_covariates, "`lags_past_covariates`") + if conv_lags: + self.lags["past"] = conv_lags + elif isinstance(lags_past_covariates, dict): + conv_lags = _check_dict_lags(lags_past_covariates, "`lags_past_covariates`") + if conv_lags: + # dummy, used to compute the extreme lags + self.lags["past"] = conv_lags[0] + # actual lags + self.component_lags["past"] = conv_lags[1] + + def _check_tuple_future_lags( + lags_future_covariates: Tuple[int, int], lags_name: str + ): raise_if_not( lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, - f"`lags_future_covariates` tuple must contain integers >= 0. Given: {lags_future_covariates}.", + f"{lags_name} tuple must contain integers >= 0. Given: {lags_future_covariates}.", ) - if ( - lags_future_covariates[0] is not None - and lags_future_covariates[1] is not None - ): - if not ( - lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0 - ): - self.lags["future"] = list( - range(-lags_future_covariates[0], lags_future_covariates[1]) - ) - elif isinstance(lags_future_covariates, list): + # TODO: check if it should return None or [] + if lags_future_covariates[0] + lags_future_covariates[1] == 0: + return None + else: + return list( + range(-lags_future_covariates[0], lags_future_covariates[1]) + ) + + def _check_list_future_lags(lags_future_covariates: List[int], lags_name: str): for lag in lags_future_covariates: raise_if( not isinstance(lag, int) or isinstance(lag, bool), - f"Every element of `lags_future_covariates` must be an integer. Given: {lags_future_covariates}.", + f"Every element of {lags_name} must be an integer. 
Given: {lags_future_covariates}.", ) if lags_future_covariates: - self.lags["future"] = sorted(lags_future_covariates) + return sorted(lags_future_covariates) + + def _check_dict_future_lags( + lags_future_covariates: Dict[str, Union[Tuple, List]] + ): + components_lags = dict() + # TODO: use component idx instead of component name for robustness? + for comp_idx, (comp_name, comp_lags) in enumerate( + lags_future_covariates.items() + ): + if isinstance(comp_lags, tuple): + components_lags[comp_name] = _check_tuple_future_lags( + comp_lags, f"`future_covariates_lags` for {comp_name}" + ) + elif isinstance(comp_lags, list): + components_lags[comp_name] = _check_list_future_lags( + comp_lags, f"`future_covariates_lags` for {comp_name}" + ) + else: + raise_log( + ValueError( + f"when passed as a dictionnary, `future_covariates_lags` for component {comp_name} must be " + f"either a strictly positive integer or a list, received : {type(comp_lags)}." + ), + logger, + ) + return components_lags + + if isinstance(lags_future_covariates, tuple): + conv_lags = _check_tuple_future_lags( + lags_future_covariates, "`future_covariates_lags`" + ) + if conv_lags: + self.lags["future"] = conv_lags + elif isinstance(lags_future_covariates, list): + conv_lags = _check_list_future_lags( + lags_future_covariates, "`future_covariates_lags`" + ) + if conv_lags: + self.lags["future"] = conv_lags + elif isinstance(lags_future_covariates, dict): + conv_lags = _check_dict_future_lags(lags_future_covariates) + if conv_lags: + # dummy, used to compute the extreme lags + self.lags["future"] = conv_lags[0] + # actual lags + self.component_lags["future"] = conv_lags[1] self.pred_dim = self.output_chunk_length if self.multi_models else 1 From b3ce1f1ee8cebd8303b94d2afbb4707008102ab7 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Fri, 18 Aug 2023 13:02:02 +0200 Subject: [PATCH 02/30] fix: better management of corner cases during lags checks --- darts/models/forecasting/regression_model.py | 71 
++++++++++++++------ 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index cd0d797ee8..218b203ec8 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -211,28 +211,34 @@ def __init__( ) # set lags - def _check_int_lags(lags: int, lags_name: str) -> Optional[List[int]]: + def _check_int_lags(lags: int, lags_name: str) -> List[int]: raise_if_not( lags > 0, f"{lags_name} must be strictly positive. Given: {lags}." ) # selecting last `lags` lags, starting from position 1 (skipping current, pos 0, the one we want to predict) return list(range(-lags, 0)) - def _check_list_lags(lags: list, lags_name: str) -> Optional[List[int]]: + def _check_list_lags(lags: list, lags_name: str) -> List[int]: for lag in lags: raise_if( not isinstance(lag, int) or (lag >= 0), f"Every element of {lags_name} must be a strictly negative integer. Given: {lags}.", ) - if lags: - return sorted(lags) + return sorted(lags) def _check_dict_lags( lags: dict, lags_name: str ) -> Optional[Tuple[List[int], Dict[str, List[int]]]]: - components_lags = dict() + + raise_if_not( + len(lags) > 0, + f"When passed as a dictionnary, {lags_name} must contain at least one key.", + logger, + ) + min_lags = None max_lags = None + components_lags = dict() # TODO: use component idx instead of component name for robustness? for comp_idx, (comp_name, comp_lags) in enumerate(lags.items()): if isinstance(comp_lags, int): @@ -246,13 +252,21 @@ def _check_dict_lags( else: raise_log( ValueError( - f"when passed as a dictionnary, {lags_name} for component {comp_name} must be either a " + f"When passed as a dictionnary, {lags_name} for component {comp_name} must be either a " f"strictly positive integer or a list, received : {type(comp_lags)}." 
), logger, ) - min_lags: int = min(components_lags[comp_name]) - max_lags: int = max(components_lags[comp_name]) + + if min_lags is None: + min_lags = components_lags[comp_name][0] + else: + min_lags = min(min_lags, components_lags[comp_name][0]) + + if max_lags is None: + max_lags = components_lags[comp_name][-1] + else: + max_lags = max(max_lags, components_lags[comp_name][-1]) return [min_lags, max_lags], components_lags if isinstance(lags, int): @@ -289,31 +303,38 @@ def _check_dict_lags( def _check_tuple_future_lags( lags_future_covariates: Tuple[int, int], lags_name: str - ): + ) -> List[int]: raise_if_not( lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, f"{lags_name} tuple must contain integers >= 0. Given: {lags_future_covariates}.", ) - # TODO: check if it should return None or [] - if lags_future_covariates[0] + lags_future_covariates[1] == 0: - return None - else: - return list( - range(-lags_future_covariates[0], lags_future_covariates[1]) - ) + raise_if( + lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0, + f"{lags_name} tuple cannot be (0,0).", + logger, + ) + return list(range(-lags_future_covariates[0], lags_future_covariates[1])) - def _check_list_future_lags(lags_future_covariates: List[int], lags_name: str): + def _check_list_future_lags( + lags_future_covariates: List[int], lags_name: str + ) -> List[int]: for lag in lags_future_covariates: raise_if( not isinstance(lag, int) or isinstance(lag, bool), f"Every element of {lags_name} must be an integer. 
Given: {lags_future_covariates}.", ) - if lags_future_covariates: - return sorted(lags_future_covariates) + return sorted(lags_future_covariates) def _check_dict_future_lags( lags_future_covariates: Dict[str, Union[Tuple, List]] ): + raise_if_not( + len(lags) > 0, + "When passed as a dictionnary, `lags_future_covariates` must contain at least one key.", + logger, + ) + min_lags = None + max_lags = None components_lags = dict() # TODO: use component idx instead of component name for robustness? for comp_idx, (comp_name, comp_lags) in enumerate( @@ -330,11 +351,21 @@ def _check_dict_future_lags( else: raise_log( ValueError( - f"when passed as a dictionnary, `future_covariates_lags` for component {comp_name} must be " + f"When passed as a dictionnary, `future_covariates_lags` for component {comp_name} must be " f"either a strictly positive integer or a list, received : {type(comp_lags)}." ), logger, ) + + if min_lags is None: + min_lags = components_lags[comp_name][0] + else: + min_lags = min(min_lags, components_lags[comp_name][0]) + + if max_lags is None: + max_lags = components_lags[comp_name][-1] + else: + max_lags = max(max_lags, components_lags[comp_name][-1]) return components_lags if isinstance(lags_future_covariates, tuple): From 2dde70fe8e95255e39a560aef5240ba92ab46ee6 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Fri, 18 Aug 2023 13:20:56 +0200 Subject: [PATCH 03/30] fix: improved modularity --- darts/models/forecasting/regression_model.py | 157 +++++++++---------- 1 file changed, 74 insertions(+), 83 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 218b203ec8..7d3e1a2b6d 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -210,12 +210,26 @@ def __init__( "`lags_future_covariates` tuple must contain integers, not bool", ) - # set lags + self._set_lags( + lags=lags, + lags_past_covariates=lags_past_covariates, + 
lags_future_covariates=lags_future_covariates, + ) + + self.pred_dim = self.output_chunk_length if self.multi_models else 1 + + def _set_lags( + self, + lags: Optional[LAGS_TYPE], + lags_past_covariates: Optional[LAGS_TYPE], + lags_future_covariates: Optional[FUTURE_LAGS_TYPE], + ): + """Based on the type of the argument and the nature of the covariates, convert the lags to a list.""" + def _check_int_lags(lags: int, lags_name: str) -> List[int]: raise_if_not( lags > 0, f"{lags_name} must be strictly positive. Given: {lags}." ) - # selecting last `lags` lags, starting from position 1 (skipping current, pos 0, the one we want to predict) return list(range(-lags, 0)) def _check_list_lags(lags: list, lags_name: str) -> List[int]: @@ -226,6 +240,30 @@ def _check_list_lags(lags: list, lags_name: str) -> List[int]: ) return sorted(lags) + def _check_tuple_future_lags( + lags_future_covariates: Tuple[int, int], lags_name: str + ) -> List[int]: + raise_if_not( + lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, + f"{lags_name} tuple must contain stricly positibe integers. Given: {lags_future_covariates}.", + ) + raise_if( + lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0, + f"{lags_name} tuple cannot be (0, 0) as it corresponds to an empty list of lags.", + logger, + ) + return list(range(-lags_future_covariates[0], lags_future_covariates[1])) + + def _check_list_future_lags( + lags_future_covariates: List[int], lags_name: str + ) -> List[int]: + for lag in lags_future_covariates: + raise_if( + not isinstance(lag, int) or isinstance(lag, bool), + f"Every element of {lags_name} must be an integer. 
Given: {lags_future_covariates}.", + ) + return sorted(lags_future_covariates) + def _check_dict_lags( lags: dict, lags_name: str ) -> Optional[Tuple[List[int], Dict[str, List[int]]]]: @@ -236,24 +274,43 @@ def _check_dict_lags( logger, ) + invalid_type = False + supported_types = "" min_lags = None max_lags = None components_lags = dict() # TODO: use component idx instead of component name for robustness? for comp_idx, (comp_name, comp_lags) in enumerate(lags.items()): - if isinstance(comp_lags, int): - components_lags[comp_name] = _check_int_lags( - comp_lags, f"{lags_name} for component {comp_name}" - ) - elif isinstance(comp_lags, list): - components_lags[comp_name] = _check_list_lags( - comp_lags, f"{lags_name} for component {comp_name}" - ) + if lags_name == "lags_future_covariates": + if isinstance(comp_lags, tuple): + components_lags[comp_name] = _check_tuple_future_lags( + comp_lags, f"{lags_name} for component {comp_name}" + ) + elif isinstance(comp_lags, list): + components_lags[comp_name] = _check_list_future_lags( + comp_lags, f"{lags_name} for component {comp_name}" + ) + else: + invalid_type = True + supported_types = "tuple or a list" else: + if isinstance(comp_lags, int): + components_lags[comp_name] = _check_int_lags( + comp_lags, f"{lags_name} for component {comp_name}" + ) + elif isinstance(comp_lags, list): + components_lags[comp_name] = _check_list_lags( + comp_lags, f"{lags_name} for component {comp_name}" + ) + else: + invalid_type = True + supported_types = "strictly positive integer or a list" + + if invalid_type: raise_log( ValueError( f"When passed as a dictionnary, {lags_name} for component {comp_name} must be either a " - f"strictly positive integer or a list, received : {type(comp_lags)}." + f"{supported_types}, received : {type(comp_lags)}." 
), logger, ) @@ -269,6 +326,7 @@ def _check_dict_lags( max_lags = max(max_lags, components_lags[comp_name][-1]) return [min_lags, max_lags], components_lags + # perform the type and sanity checks if isinstance(lags, int): conv_lags = _check_int_lags(lags, "`lags`") if conv_lags: @@ -301,95 +359,28 @@ def _check_dict_lags( # actual lags self.component_lags["past"] = conv_lags[1] - def _check_tuple_future_lags( - lags_future_covariates: Tuple[int, int], lags_name: str - ) -> List[int]: - raise_if_not( - lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, - f"{lags_name} tuple must contain integers >= 0. Given: {lags_future_covariates}.", - ) - raise_if( - lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0, - f"{lags_name} tuple cannot be (0,0).", - logger, - ) - return list(range(-lags_future_covariates[0], lags_future_covariates[1])) - - def _check_list_future_lags( - lags_future_covariates: List[int], lags_name: str - ) -> List[int]: - for lag in lags_future_covariates: - raise_if( - not isinstance(lag, int) or isinstance(lag, bool), - f"Every element of {lags_name} must be an integer. Given: {lags_future_covariates}.", - ) - return sorted(lags_future_covariates) - - def _check_dict_future_lags( - lags_future_covariates: Dict[str, Union[Tuple, List]] - ): - raise_if_not( - len(lags) > 0, - "When passed as a dictionnary, `lags_future_covariates` must contain at least one key.", - logger, - ) - min_lags = None - max_lags = None - components_lags = dict() - # TODO: use component idx instead of component name for robustness? 
- for comp_idx, (comp_name, comp_lags) in enumerate( - lags_future_covariates.items() - ): - if isinstance(comp_lags, tuple): - components_lags[comp_name] = _check_tuple_future_lags( - comp_lags, f"`future_covariates_lags` for {comp_name}" - ) - elif isinstance(comp_lags, list): - components_lags[comp_name] = _check_list_future_lags( - comp_lags, f"`future_covariates_lags` for {comp_name}" - ) - else: - raise_log( - ValueError( - f"When passed as a dictionnary, `future_covariates_lags` for component {comp_name} must be " - f"either a strictly positive integer or a list, received : {type(comp_lags)}." - ), - logger, - ) - - if min_lags is None: - min_lags = components_lags[comp_name][0] - else: - min_lags = min(min_lags, components_lags[comp_name][0]) - - if max_lags is None: - max_lags = components_lags[comp_name][-1] - else: - max_lags = max(max_lags, components_lags[comp_name][-1]) - return components_lags - if isinstance(lags_future_covariates, tuple): conv_lags = _check_tuple_future_lags( - lags_future_covariates, "`future_covariates_lags`" + lags_future_covariates, "`lags_future_covariates`" ) if conv_lags: self.lags["future"] = conv_lags elif isinstance(lags_future_covariates, list): conv_lags = _check_list_future_lags( - lags_future_covariates, "`future_covariates_lags`" + lags_future_covariates, "`lags_future_covariates`" ) if conv_lags: self.lags["future"] = conv_lags elif isinstance(lags_future_covariates, dict): - conv_lags = _check_dict_future_lags(lags_future_covariates) + conv_lags = _check_dict_lags( + lags_future_covariates, "`lags_future_covariates`" + ) if conv_lags: # dummy, used to compute the extreme lags self.lags["future"] = conv_lags[0] # actual lags self.component_lags["future"] = conv_lags[1] - self.pred_dim = self.output_chunk_length if self.multi_models else 1 - @property def _model_encoder_settings( self, From 65c82a70ea8aab068987d0f8e7b74b68fc990522 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Fri, 18 Aug 2023 15:53:06 +0200 Subject: 
[PATCH 04/30] fix: simplified the logic a bit --- darts/models/forecasting/regression_model.py | 48 ++++++++------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 7d3e1a2b6d..5de580173a 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -328,54 +328,46 @@ def _check_dict_lags( # perform the type and sanity checks if isinstance(lags, int): - conv_lags = _check_int_lags(lags, "`lags`") - if conv_lags: - self.lags["target"] = conv_lags + self.lags["target"] = _check_int_lags(lags, "`lags`") elif isinstance(lags, list): - conv_lags = _check_list_lags(lags, "`lags`") - if conv_lags: - self.lags["target"] = conv_lags + self.lags["target"] = _check_list_lags(lags, "`lags`") elif isinstance(lags, dict): conv_lags = _check_dict_lags(lags, "`lags`") - if conv_lags: + if conv_lags is not None: # dummy, used to compute the extreme lags self.lags["target"] = conv_lags[0] # actual lags self.component_lags["target"] = conv_lags[1] if isinstance(lags_past_covariates, int): - conv_lags = _check_int_lags(lags_past_covariates, "`lags_past_covariates`") - if conv_lags: - self.lags["past"] = conv_lags + self.lags["past"] = _check_int_lags( + lags_past_covariates, "`lags_past_covariates`" + ) elif isinstance(lags_past_covariates, list): - conv_lags = _check_list_lags(lags_past_covariates, "`lags_past_covariates`") - if conv_lags: - self.lags["past"] = conv_lags + self.lags["past"] = _check_list_lags( + lags_past_covariates, "`lags_past_covariates`" + ) elif isinstance(lags_past_covariates, dict): conv_lags = _check_dict_lags(lags_past_covariates, "`lags_past_covariates`") - if conv_lags: + if conv_lags is not None: # dummy, used to compute the extreme lags self.lags["past"] = conv_lags[0] # actual lags self.component_lags["past"] = conv_lags[1] if isinstance(lags_future_covariates, tuple): - conv_lags = 
_check_tuple_future_lags( + self.lags["future"] = _check_tuple_future_lags( lags_future_covariates, "`lags_future_covariates`" ) - if conv_lags: - self.lags["future"] = conv_lags elif isinstance(lags_future_covariates, list): - conv_lags = _check_list_future_lags( + self.lags["future"] = _check_list_future_lags( lags_future_covariates, "`lags_future_covariates`" ) - if conv_lags: - self.lags["future"] = conv_lags elif isinstance(lags_future_covariates, dict): conv_lags = _check_dict_lags( lags_future_covariates, "`lags_future_covariates`" ) - if conv_lags: + if conv_lags is not None: # dummy, used to compute the extreme lags self.lags["future"] = conv_lags[0] # actual lags @@ -420,16 +412,12 @@ def extreme_lags( Optional[int], Optional[int], ]: - min_target_lag = self.lags.get("target")[0] if "target" in self.lags else None + min_target_lag = self.lags["target"][0] if "target" in self.lags else None max_target_lag = self.output_chunk_length - 1 - min_past_cov_lag = self.lags.get("past")[0] if "past" in self.lags else None - max_past_cov_lag = self.lags.get("past")[-1] if "past" in self.lags else None - min_future_cov_lag = ( - self.lags.get("future")[0] if "future" in self.lags else None - ) - max_future_cov_lag = ( - self.lags.get("future")[-1] if "future" in self.lags else None - ) + min_past_cov_lag = self.lags["past"][0] if "past" in self.lags else None + max_past_cov_lag = self.lags["past"][-1] if "past" in self.lags else None + min_future_cov_lag = self.lags["future"][0] if "future" in self.lags else None + max_future_cov_lag = self.lags["future"][-1] if "future" in self.lags else None return ( min_target_lag, max_target_lag, From 9c5b312815aa77d5ef44ff2c7f30f9365467529d Mon Sep 17 00:00:00 2001 From: madtoinou Date: Fri, 18 Aug 2023 15:53:47 +0200 Subject: [PATCH 05/30] feat: when generating lagged data, the values can be extracted using component-specific lags --- darts/utils/data/tabularization.py | 39 ++++++++++++++++++++++-------- 1 file changed, 29 
insertions(+), 10 deletions(-) diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 74c1c65ea7..fe825543e1 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -891,7 +891,13 @@ def _create_lagged_data_by_moving_window( # Within each window, the `-1` indexed value (i.e. the value at the very end of # the window) corresponds to time `t - min_lag_i`. The negative index of the time # `t + lag_i` within this window is, therefore, `-1 + lag_i + min_lag_i`: - lags_to_extract = np.array(lags_i, dtype=int) + min_lag_i - 1 + if isinstance(lags_i, list): + lags_to_extract = np.array(lags_i, dtype=int) + min_lag_i - 1 + else: + lags_to_extract = [ + np.array(comp_lags, dtype=int) + min_lag_i - 1 + for comp_lags in lags_i + ] lagged_vals = _extract_lagged_vals_from_windows(windows, lags_to_extract) X.append(lagged_vals) # Cache `start_time_idx` for label creation: @@ -928,7 +934,8 @@ def _create_lagged_data_by_moving_window( def _extract_lagged_vals_from_windows( - windows: np.ndarray, lags_to_extract: Optional[np.ndarray] = None + windows: np.ndarray, + lags_to_extract: Optional[Union[np.ndarray, List[np.ndarray]]] = None, ) -> np.ndarray: """ Helper function called by `_create_lagged_data_by_moving_window` that @@ -938,19 +945,31 @@ def _extract_lagged_vals_from_windows( is done such that the order of elements along axis 1 matches the pattern described in the docstring of `create_lagged_data`. - If `lags_to_extract` is specified, then only those values within each window that + If `lags_to_extract` is not specified, all of the values within each window is extracted. + If `lags_to_extract` is specified as an np.ndarray, then only those values within each window that are indexed by `lags_to_extract` will be returned. In such cases, the shape of the returned lagged values is `(num_windows, num_components * lags_to_extract.size, num_series)`. 
For example, if `lags_to_extract = [-2]`, only the second-to-last values within each window will be extracted. - If `lags_to_extract` is not specified, all of the values within each window is extracted. + If `lags_to_extract` is specified as a list of np.ndarray, the values will be extracted using the + lags provided for each component. """ # windows.shape = (num_windows, num_components, num_samples, window_len): - if lags_to_extract is not None: - windows = windows[:, :, :, lags_to_extract] - # windows.shape = (num_windows, window_len, num_components, num_samples): - windows = np.moveaxis(windows, (0, 3, 1, 2), (0, 1, 2, 3)) - # lagged_vals.shape = (num_windows, num_components*window_len, num_samples): - lagged_vals = windows.reshape((windows.shape[0], -1, windows.shape[-1])) + if isinstance(lags_to_extract, list): + # iterate over the components-specific lags + comp_windows = [ + windows[:, i, :, comp_lags_to_extract] + for i, comp_lags_to_extract in enumerate(lags_to_extract) + ] + # windows.shape = (sum(lags_len) across components, num_windows, num_samples): + windows = np.concatenate(comp_windows, axis=0) + lagged_vals = np.moveaxis(windows, (1, 0, 2), (0, 1, 2)) + else: + if lags_to_extract is not None: + windows = windows[:, :, :, lags_to_extract] + # windows.shape = (num_windows, window_len, num_components, num_samples): + windows = np.moveaxis(windows, (0, 3, 1, 2), (0, 1, 2, 3)) + # lagged_vals.shape = (num_windows, num_components*window_len, num_samples): + lagged_vals = windows.reshape((windows.shape[0], -1, windows.shape[-1])) return lagged_vals From 753db5b97698ca6c88e1410d28b6f73566406680 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Fri, 18 Aug 2023 16:22:42 +0200 Subject: [PATCH 06/30] feat: raise error if all the ts in target/past/future don't have the same number of components --- darts/models/forecasting/forecasting_model.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/darts/models/forecasting/forecasting_model.py 
b/darts/models/forecasting/forecasting_model.py index 452d2368cd..a848674570 100644 --- a/darts/models/forecasting/forecasting_model.py +++ b/darts/models/forecasting/forecasting_model.py @@ -2078,6 +2078,20 @@ def fit( ): self.static_covariates = series.static_covariates else: + # check that all the ts within one group have the same number of components + for ts_sequence, cov_name in zip( + [series, past_covariates, future_covariates], + ["series", "past_covariates", "future_covariates"], + ): + raise_if( + ts_sequence is not None + and not all( + [ts_sequence[0].width == ts.width for ts in ts_sequence] + ), + f"All the series in `{cov_name}` should have the same number of components", + logger, + ) + if past_covariates is not None: self._expect_past_covariates = True if future_covariates is not None: From 0cdeee7e70aa90b66e647e51e9e3a1433739fda1 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 21 Aug 2023 15:37:01 +0200 Subject: [PATCH 07/30] feat: added support for component-specific lags in fit() and predict() --- darts/models/forecasting/regression_model.py | 122 +++++++++++++++---- darts/utils/data/tabularization.py | 47 +++++-- 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 5de580173a..727f45870f 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -140,7 +140,7 @@ def __init__( self.model = model self.lags: Dict[str, List[int]] = {} - self.component_lags: Dict[str, Dict[str, List[int]]] = {} + self.component_lags: Dict[str, Dict[str, Sequence[int]]] = {} self.input_dim = None self.multi_models = multi_models self._considers_static_covariates = use_static_covariates @@ -266,7 +266,7 @@ def _check_list_future_lags( def _check_dict_lags( lags: dict, lags_name: str - ) -> Optional[Tuple[List[int], Dict[str, List[int]]]]: + ) -> Optional[Tuple[List[int], Dict[str, Sequence[int]]]]: 
raise_if_not( len(lags) > 0, @@ -474,10 +474,10 @@ def _get_last_prediction_time(self, series, forecast_horizon, overlap_end): def _create_lagged_data( self, target_series, past_covariates, future_covariates, max_samples_per_ts ): - lags = self.lags.get("target") - lags_past_covariates = self.lags.get("past") - lags_future_covariates = self.lags.get("future") - + """ + If lags were specified component-wise manner, they are contained in self.component_lags and the values + in self.lags should be ignored. + """ ( features, labels, @@ -488,9 +488,15 @@ def _create_lagged_data( output_chunk_length=self.output_chunk_length, past_covariates=past_covariates, future_covariates=future_covariates, - lags=lags, - lags_past_covariates=lags_past_covariates, - lags_future_covariates=lags_future_covariates, + lags=self.component_lags["target"] + if "target" in self.component_lags + else self.lags.get("target"), + lags_past_covariates=self.component_lags["past"] + if "past" in self.component_lags + else self.lags.get("past"), + lags_future_covariates=self.component_lags["future"] + if "future" in self.component_lags + else self.lags.get("future"), uses_static_covariates=self.uses_static_covariates, last_static_covariates_shape=None, max_samples_per_ts=max_samples_per_ts, @@ -538,9 +544,15 @@ def _fit_model( target_series=target_series, past_covariates=past_covariates, future_covariates=future_covariates, - lags=self.lags.get("target"), - lags_past_covariates=self.lags.get("past"), - lags_future_covariates=self.lags.get("future"), + lags=self.component_lags["target"] + if "target" in self.component_lags + else self.lags.get("target"), + lags_past_covariates=self.component_lags["past"] + if "past" in self.component_lags + else self.lags.get("past"), + lags_future_covariates=self.component_lags["future"] + if "future" in self.component_lags + else self.lags.get("future"), output_chunk_length=self.output_chunk_length, concatenate=False, 
use_static_covariates=self.uses_static_covariates, @@ -663,6 +675,30 @@ def fit( future_covariates=seq2series(future_covariates), ) + # TODO: if the keys are string, check if they are indeed in the series? + # if provided, component-wise lags must be defined for all the components + if "target" in self.component_lags: + raise_if( + len(self.component_lags["target"]) != self.input_dim["target"], + f"The training series contain {self.input_dim['target']} components, " + f"{len(self.component_lags['target'])} lags were provided. These two values must exactly match.", + logger, + ) + if "past" in self.component_lags and "past" in self.input_dim: + raise_if( + len(self.component_lags["past"]) != self.input_dim["past"], + f"The past covariates series contain {self.input_dim['past']} components, " + f"{len(self.component_lags['past'])} lags were provided. These two values must exactly match.", + logger, + ) + if "future" in self.component_lags and "future" in self.input_dim: + raise_if( + len(self.component_lags["future"]) != self.input_dim["future"], + f"The future covariates series contain {self.input_dim['future']} components, " + f"{len(self.component_lags['future'])} lags were provided. 
These two values must exactly match.", + logger, + ) + self._fit_model( series, past_covariates, future_covariates, max_samples_per_ts, **kwargs ) @@ -863,23 +899,57 @@ def predict( series_matrix = np.concatenate( [series_matrix, predictions[-1]], axis=1 ) - np_X.append( - series_matrix[ - :, - [ - lag - (shift + last_step_shift) - for lag in self.lags["target"] - ], - ].reshape(len(series) * num_samples, -1) - ) - # retrieve covariate lags, enforce order (dict only preserves insertion order for python 3.6+) - for cov_type in ["past", "future"]: - if cov_type in covariate_matrices: + # component-wise lags + if "target" in self.component_lags: + tmp_X = [ + series_matrix[ + :, + [lag - (shift + last_step_shift) for lag in comp_lags], + comp_i, + ] + for comp_i, (comp, comp_lags) in enumerate( + self.component_lags["target"].items() + ) + ] + # values are grouped by component np_X.append( - covariate_matrices[cov_type][ - :, relative_cov_lags[cov_type] + t_pred + np.concatenate(tmp_X).reshape(len(series) * num_samples, -1) + ) + else: + # values are grouped by lags + np_X.append( + series_matrix[ + :, + [ + lag - (shift + last_step_shift) + for lag in self.lags["target"] + ], ].reshape(len(series) * num_samples, -1) ) + # retrieve covariate lags, enforce order (dict only preserves insertion order for python 3.6+) + for cov_type in ["past", "future"]: + if cov_type in covariate_matrices: + # component-wise lags + if cov_type in self.component_lags: + tmp_X = [ + covariate_matrices[cov_type][ + :, + np.array(comp_lags) - self.lags[cov_type][0] + t_pred, + comp_i, + ] + for comp_i, (comp, comp_lags) in enumerate( + self.component_lags[cov_type].items() + ) + ] + np_X.append( + np.concatenate(tmp_X).reshape(len(series) * num_samples, -1) + ) + else: + np_X.append( + covariate_matrices[cov_type][ + :, relative_cov_lags[cov_type] + t_pred + ].reshape(len(series) * num_samples, -1) + ) # concatenate retrieved lags X = np.concatenate(np_X, axis=1) diff --git 
a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index fe825543e1..d5249ee95f 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -1,13 +1,15 @@ import warnings from functools import reduce from math import inf -from typing import List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union try: from typing import Literal except ImportError: from typing_extensions import Literal +from itertools import chain + import numpy as np import pandas as pd from numpy.lib.stride_tricks import as_strided @@ -329,9 +331,13 @@ def create_lagged_training_data( output_chunk_length: int, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, Sequence[int]]]] = None, + lags_past_covariates: Optional[ + Union[Sequence[int], Dict[str, Sequence[int]]] + ] = None, + lags_future_covariates: Optional[ + Union[Sequence[int], Dict[str, Sequence[int]]] + ] = None, uses_static_covariates: bool = True, last_static_covariates_shape: Optional[Tuple[int, int]] = None, max_samples_per_ts: Optional[int] = None, @@ -676,9 +682,13 @@ def create_lagged_component_names( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, Sequence[int]]]] = None, + lags_past_covariates: Optional[ + Union[Sequence[int], Dict[str, Sequence[int]]] + ] = 
None, + lags_future_covariates: Optional[ + Union[Sequence[int], Dict[str, Sequence[int]]] + ] = None, output_chunk_length: int = 1, concatenate: bool = True, use_static_covariates: bool = False, @@ -743,11 +753,17 @@ def create_lagged_component_names( continue components = get_single_series(variate).components.tolist() - lagged_feature_names += [ - f"{name}_{variate_type}_lag{lag}" - for lag in variate_lags - for name in components - ] + if isinstance(variate_lags, dict): + for name in components: + lagged_feature_names += [ + f"{name}_{variate_type}_lag{lag}" for lag in variate_lags[name] + ] + else: + lagged_feature_names += [ + f"{name}_{variate_type}_lag{lag}" + for lag in variate_lags + for name in components + ] if variate_type == "target" and lags: label_feature_names = [ @@ -894,9 +910,10 @@ def _create_lagged_data_by_moving_window( if isinstance(lags_i, list): lags_to_extract = np.array(lags_i, dtype=int) + min_lag_i - 1 else: + # Lags are grouped by component, extracted from the same window lags_to_extract = [ np.array(comp_lags, dtype=int) + min_lag_i - 1 - for comp_lags in lags_i + for comp_lags in lags_i.values() ] lagged_vals = _extract_lagged_vals_from_windows(windows, lags_to_extract) X.append(lagged_vals) @@ -1262,6 +1279,10 @@ def _get_feature_times( [target_series, past_covariates, future_covariates], [lags, lags_past_covariates, lags_future_covariates], ): + # TODO: information is available in model.lags, not sure how to make the info get here + if isinstance(lags_i, dict): + lags_i = list(set(chain(*lags_i.values()))) + if check_inputs and (series_i is not None): _check_series_length( series_i, From f24ea84756923cebfcc0d54c948e3cc6e70d3565 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 21 Aug 2023 16:53:42 +0200 Subject: [PATCH 08/30] test: added tests and fix some bug accordingly --- darts/models/forecasting/regression_model.py | 40 ++++---- .../forecasting/test_regression_models.py | 98 ++++++++++++++++++- 2 files changed, 117 
insertions(+), 21 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 727f45870f..b0cdcecc11 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -228,7 +228,7 @@ def _set_lags( def _check_int_lags(lags: int, lags_name: str) -> List[int]: raise_if_not( - lags > 0, f"{lags_name} must be strictly positive. Given: {lags}." + lags > 0, f"`{lags_name}` must be strictly positive. Given: {lags}." ) return list(range(-lags, 0)) @@ -236,7 +236,7 @@ def _check_list_lags(lags: list, lags_name: str) -> List[int]: for lag in lags: raise_if( not isinstance(lag, int) or (lag >= 0), - f"Every element of {lags_name} must be a strictly negative integer. Given: {lags}.", + f"Every element of `{lags_name}` must be a strictly negative integer. Given: {lags}.", ) return sorted(lags) @@ -245,11 +245,11 @@ def _check_tuple_future_lags( ) -> List[int]: raise_if_not( lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, - f"{lags_name} tuple must contain stricly positibe integers. Given: {lags_future_covariates}.", + f"`{lags_name}` tuple must contain stricly positibe integers. Given: {lags_future_covariates}.", ) raise_if( lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0, - f"{lags_name} tuple cannot be (0, 0) as it corresponds to an empty list of lags.", + f"`{lags_name}` tuple cannot be (0, 0) as it corresponds to an empty list of lags.", logger, ) return list(range(-lags_future_covariates[0], lags_future_covariates[1])) @@ -260,7 +260,7 @@ def _check_list_future_lags( for lag in lags_future_covariates: raise_if( not isinstance(lag, int) or isinstance(lag, bool), - f"Every element of {lags_name} must be an integer. Given: {lags_future_covariates}.", + f"Every element of `{lags_name}` must be an integer. 
Given: {lags_future_covariates}.", ) return sorted(lags_future_covariates) @@ -270,7 +270,7 @@ def _check_dict_lags( raise_if_not( len(lags) > 0, - f"When passed as a dictionnary, {lags_name} must contain at least one key.", + f"When passed as a dictionnary, `{lags_name}` must contain at least one key.", logger, ) @@ -284,11 +284,11 @@ def _check_dict_lags( if lags_name == "lags_future_covariates": if isinstance(comp_lags, tuple): components_lags[comp_name] = _check_tuple_future_lags( - comp_lags, f"{lags_name} for component {comp_name}" + comp_lags, f"`{lags_name}` for component {comp_name}" ) elif isinstance(comp_lags, list): components_lags[comp_name] = _check_list_future_lags( - comp_lags, f"{lags_name} for component {comp_name}" + comp_lags, f"`{lags_name}` for component {comp_name}" ) else: invalid_type = True @@ -296,11 +296,11 @@ def _check_dict_lags( else: if isinstance(comp_lags, int): components_lags[comp_name] = _check_int_lags( - comp_lags, f"{lags_name} for component {comp_name}" + comp_lags, f"`{lags_name}` for component {comp_name}" ) elif isinstance(comp_lags, list): components_lags[comp_name] = _check_list_lags( - comp_lags, f"{lags_name} for component {comp_name}" + comp_lags, f"`{lags_name}` for component {comp_name}" ) else: invalid_type = True @@ -309,7 +309,7 @@ def _check_dict_lags( if invalid_type: raise_log( ValueError( - f"When passed as a dictionnary, {lags_name} for component {comp_name} must be either a " + f"When passed as a dictionnary, `{lags_name}` for component {comp_name} must be either a " f"{supported_types}, received : {type(comp_lags)}." 
), logger, @@ -328,11 +328,11 @@ def _check_dict_lags( # perform the type and sanity checks if isinstance(lags, int): - self.lags["target"] = _check_int_lags(lags, "`lags`") + self.lags["target"] = _check_int_lags(lags, "lags") elif isinstance(lags, list): - self.lags["target"] = _check_list_lags(lags, "`lags`") + self.lags["target"] = _check_list_lags(lags, "lags") elif isinstance(lags, dict): - conv_lags = _check_dict_lags(lags, "`lags`") + conv_lags = _check_dict_lags(lags, "lags") if conv_lags is not None: # dummy, used to compute the extreme lags self.lags["target"] = conv_lags[0] @@ -341,14 +341,14 @@ def _check_dict_lags( if isinstance(lags_past_covariates, int): self.lags["past"] = _check_int_lags( - lags_past_covariates, "`lags_past_covariates`" + lags_past_covariates, "lags_past_covariates" ) elif isinstance(lags_past_covariates, list): self.lags["past"] = _check_list_lags( - lags_past_covariates, "`lags_past_covariates`" + lags_past_covariates, "lags_past_covariates" ) elif isinstance(lags_past_covariates, dict): - conv_lags = _check_dict_lags(lags_past_covariates, "`lags_past_covariates`") + conv_lags = _check_dict_lags(lags_past_covariates, "lags_past_covariates") if conv_lags is not None: # dummy, used to compute the extreme lags self.lags["past"] = conv_lags[0] @@ -357,15 +357,15 @@ def _check_dict_lags( if isinstance(lags_future_covariates, tuple): self.lags["future"] = _check_tuple_future_lags( - lags_future_covariates, "`lags_future_covariates`" + lags_future_covariates, "lags_future_covariates" ) elif isinstance(lags_future_covariates, list): self.lags["future"] = _check_list_future_lags( - lags_future_covariates, "`lags_future_covariates`" + lags_future_covariates, "lags_future_covariates" ) elif isinstance(lags_future_covariates, dict): conv_lags = _check_dict_lags( - lags_future_covariates, "`lags_future_covariates`" + lags_future_covariates, "lags_future_covariates" ) if conv_lags is not None: # dummy, used to compute the extreme lags diff 
--git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 1fecded0f4..78c89c0167 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -420,7 +420,9 @@ def test_model_construction(self, config): # testing lags_past_covariates model_instance = model(lags=None, lags_past_covariates=3, multi_models=mode) assert model_instance.lags.get("past") == [-3, -2, -1] - # testing lags_future covariates + # lags_future covariates does not support SINGLE INT + + # TESTING TUPLE of int, only supported by lags_future_covariates model_instance = model( lags=None, lags_future_covariates=(3, 5), multi_models=mode ) @@ -435,6 +437,25 @@ def test_model_construction(self, config): model_instance = model(lags_past_covariates=values, multi_models=mode) assert model_instance.lags.get("past") == values # testing lags_future_covariates + values = [-5, -1, 5] + model_instance = model(lags_future_covariates=values, multi_models=mode) + assert model_instance.lags.get("future") == values + + # TESTING DICT, lags are specified component-wise + # model.lags contains the extreme across the components + values = {"comp0": [-4, -2], "comp1": [-5, -3]} + model_instance = model(lags=values, multi_models=mode) + assert model_instance.lags.get("target") == [-5, -2] + assert model_instance.component_lags.get("target") == values + # testing lags_past_covariates + model_instance = model(lags_past_covariates=values, multi_models=mode) + assert model_instance.lags.get("past") == [-5, -2] + assert model_instance.component_lags.get("past") == values + # testing lags_future_covariates + values = {"comp0": [-4, 2], "comp1": [-5, 3]} + model_instance = model(lags_future_covariates=values, multi_models=mode) + assert model_instance.lags.get("future") == [-5, 3] + assert model_instance.component_lags.get("future") == values with pytest.raises(ValueError): 
model(multi_models=mode) @@ -464,6 +485,10 @@ def test_model_construction(self, config): model(lags=5, lags_future_covariates=(1, True), multi_models=mode) with pytest.raises(ValueError): model(lags=5, lags_future_covariates=(1, 1.0), multi_models=mode) + with pytest.raises(ValueError): + model(lags=5, lags_future_covariates={}, multi_models=mode) + with pytest.raises(ValueError): + model(lags=None, lags_future_covariates={}, multi_models=mode) @pytest.mark.parametrize("mode", [True, False]) def test_training_data_creation(self, mode): @@ -1519,6 +1544,77 @@ def test_integer_indexed_series(self, mode): # the time axis returned by the second model should be as expected assert all(preds[1].time_index == pd.RangeIndex(start=50, stop=70, step=2)) + @pytest.mark.parametrize( + "config", + [ + ({"lags": [-3, -2, -1]}, {"lags": {"gaussian": 3}}), + ({"lags": 3}, {"lags": {"gaussian": 3, "sine": 3}}), + ({"lags_past_covariates": 2}, {"lags_past_covariates": {"lin_past": 2}}), + ( + {"lags": 5, "lags_future_covariates": [-2, 3]}, + { + "lags": { + "gaussian": [-5, -4, -3, -2, -1], + "sine": [-5, -4, -3, -2, -1], + }, + "lags_future_covariates": { + "lin_future": [-2, 3], + "sine_future": [-2, 3], + }, + }, + ), + ], + ) + def test_component_specific_lags(self, config): + """Verify that the same lags, defined using int/list or dictionnaries yield the same results""" + list_lags, dict_lags = config + multivar_target = "lags" in dict_lags and len(dict_lags["lags"]) > 1 + multivar_future_cov = ( + "lags_future_covariates" in dict_lags + and len(dict_lags["lags_future_covariates"]) > 1 + ) + + # create series based on the model parameters + series = tg.gaussian_timeseries(length=20, column_name="gaussian") + if multivar_target: + series = series.stack(tg.sine_timeseries(length=20, column_name="sine")) + + future_cov = tg.linear_timeseries(length=30, column_name="lin_future") + if multivar_future_cov: + future_cov = future_cov.stack( + tg.sine_timeseries(length=30, 
column_name="sine_future") + ) + + past_cov = tg.linear_timeseries(length=30, column_name="lin_past") + + # the lags are identical across the components for each series + model = LinearRegressionModel(**list_lags) + model.fit( + series=series, + past_covariates=past_cov if model.supports_past_covariates else None, + future_covariates=future_cov if model.supports_future_covariates else None, + ) + + # the lags are specified for each component, individually + model2 = LinearRegressionModel(**dict_lags) + model2.fit( + series=series, + past_covariates=past_cov if model2.supports_past_covariates else None, + future_covariates=future_cov if model2.supports_future_covariates else None, + ) + + # n == output_chunk_length + pred = model.predict(1) + pred2 = model2.predict(1) + np.testing.assert_array_almost_equal(pred.values(), pred2.values()) + assert pred.time_index.equals(pred2.time_index) + + # n > output_chunk_length + pred = model.predict(3) + pred2 = model2.predict(3) + np.testing.assert_array_almost_equal(pred.values(), pred2.values()) + assert pred.time_index.equals(pred2.time_index) + @pytest.mark.parametrize( "config", itertools.product( From 01b8409b024b83720b3401bc93bbc448a9f014a1 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 21 Aug 2023 17:35:48 +0200 Subject: [PATCH 09/30] feat: component-wise lags support encoders, improved sanity checks --- darts/models/forecasting/regression_model.py | 50 +++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index b0cdcecc11..c5abb800fb 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -675,29 +675,33 @@ def fit( future_covariates=seq2series(future_covariates), ) - # TODO: if the keys are string, check if they are indeed in the series? 
- # if provided, component-wise lags must be defined for all the components - if "target" in self.component_lags: - raise_if( - len(self.component_lags["target"]) != self.input_dim["target"], - f"The training series contain {self.input_dim['target']} components, " - f"{len(self.component_lags['target'])} lags were provided. These two values must exactly match.", - logger, - ) - if "past" in self.component_lags and "past" in self.input_dim: - raise_if( - len(self.component_lags["past"]) != self.input_dim["past"], - f"The past covariates series contain {self.input_dim['past']} components, " - f"{len(self.component_lags['past'])} lags were provided. These two values must exactly match.", - logger, - ) - if "future" in self.component_lags and "future" in self.input_dim: - raise_if( - len(self.component_lags["future"]) != self.input_dim["future"], - f"The future covariates series contain {self.input_dim['future']} components, " - f"{len(self.component_lags['future'])} lags were provided. These two values must exactly match.", - logger, - ) + # if provided, component-wise lags must be defined for all the components of the first series + for variate_type, variate in zip( + ["target", "past", "future"], [series, past_covariates, future_covariates] + ): + if variate_type in self.component_lags: + provided_components = set(self.component_lags[variate_type].keys()) + required_components = set(variate[0].components) + # lags were specified for unrecognized components + wrong_components = list(provided_components - required_components) + if len(wrong_components) > 0: + logger.warning( + f"Lags of components not present in the series ({wrong_components}) were ignored." + ) + + missing_keys = list(required_components - provided_components) + raise_if( + len(missing_keys) > 0, + f"The {variate_type} series contains {self.input_dim[variate_type]} components, lags were " + f"provided for {len(self.component_lags[variate_type])} of them. 
The lags for the " + f"following components must be provided: {missing_keys}.", + logger, + ) + # reorder the components based on the input series + self.component_lags[variate_type] = { + comp_name: self.component_lags[variate_type][comp_name] + for comp_name in variate[0].components + } self._fit_model( series, past_covariates, future_covariates, max_samples_per_ts, **kwargs From a671af875d47834de7856b218b782af52a73b37d Mon Sep 17 00:00:00 2001 From: madtoinou Date: Wed, 23 Aug 2023 09:26:45 +0200 Subject: [PATCH 10/30] feat: possibility to declare default lags for all the not specified components, updated changelog --- CHANGELOG.md | 1 + darts/models/forecasting/regression_model.py | 46 +++++++++++++------ .../forecasting/test_regression_models.py | 13 ++++++ 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7cdc9db33..046e99e8b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ but cannot always guarantee backwards compatibility. Changes that may **break co **Improved** - `TimeSeries` with a `RangeIndex` starting in the negative start are now supported by `historical_forecasts`. [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou). - Added a new argument `start_format` to `historical_forecasts()`, `backtest()` and `gridsearch` that allows to use an integer `start` either as the index position or index value/label for `series` indexed with a `pd.RangeIndex`. [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou). +- `RegressionModel` can now be created with different lags for each component of the target and past/future covariates series. [#1962](https://github.com/unit8co/darts/pull/1962) by [Antoine Madrona](https://github.com/madtoinou). **Fixed** - Fixed a bug in `TimeSeries.from_dataframe()` when using a pandas.DataFrame with `df.columns.name != None`. 
[#1938](https://github.com/unit8co/darts/pull/1938) by [Antoine Madrona](https://github.com/madtoinou). diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index c5abb800fb..5a8ad00b48 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -84,20 +84,31 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise, a list of integers with lags (each lag must be < 0). - In order to specify component-wise lags, a dictionnary with the component name or index as key and the - lags value can be provided. The number of keys in the dictionnary must match the number of components in - the series. + Lagged target values used to predict the next time step. + If an integer is given the last `lags` past lags are used (from -1 backward). + If a list of integers, each lag must be < 0. + If a dictionnary, the keys must be the components' name (first series when using multiple series) and + the values corresponds to the lags (integer or list of integers). The key 'default_lags' can be used to + provide fallback lags values for un-specified components. An error will be raised if some components are + missing and the 'default_lags' key is not present in the dictionnary. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Number of lagged past_covariates values used to predict the next time step. + If an integer is given the last `lags_past_covariates` past lags are used (inclusive, starting from lag -1). + If a list of integers, each lag must be < 0. 
+ If a dictionnary, the keys must be the components' name (first series when using multiple series) and + the values corresponds to the lags (integer or list of integers). The key 'default_lags' can be used to + provide fallback lags values for un-specified components. An error will be raised if some components are + missing and the 'default_lags' key is not present in the dictionnary. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If a tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Number of lagged future_covariates values used to predict the next time step. + If a tuple (past, future) is given the last `past` lags in the past are used (inclusive, starting from + lag -1) along with the first `future` future lags (starting from 0 - the prediction time - up to + `future - 1` included). + If a list of integer, the values will be used as is. + If a dictionnary, the keys must be the components' name (first series when using multiple series) and + the values corresponds to the lags (integer or list of integers). The key 'default_lags' can be used to + provide fallback lags values for un-specified components. An error will be raised if some components are + missing and the 'default_lags' key is not present in the dictionnary. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. 
However, setting `output_chunk_length` equal to the forecast horizon may @@ -680,10 +691,13 @@ def fit( ["target", "past", "future"], [series, past_covariates, future_covariates] ): if variate_type in self.component_lags: + # ignore the fallback lags entry provided_components = set(self.component_lags[variate_type].keys()) required_components = set(variate[0].components) # lags were specified for unrecognized components - wrong_components = list(provided_components - required_components) + wrong_components = list( + provided_components - {"default_lags"} - required_components + ) if len(wrong_components) > 0: logger.warning( f"Lags of components not present in the series ({wrong_components}) were ignored." @@ -691,15 +705,17 @@ def fit( missing_keys = list(required_components - provided_components) raise_if( - len(missing_keys) > 0, + len(missing_keys) > 0 and "default_lags" not in provided_components, f"The {variate_type} series contains {self.input_dim[variate_type]} components, lags were " f"provided for {len(self.component_lags[variate_type])} of them. 
The lags for the " f"following components must be provided: {missing_keys}.", logger, ) - # reorder the components based on the input series + # reorder the components based on the input series, insert the default when necessary self.component_lags[variate_type] = { comp_name: self.component_lags[variate_type][comp_name] + if comp_name in self.component_lags[variate_type] + else self.component_lags[variate_type]["default_lags"] for comp_name in variate[0].components } diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 78c89c0167..86d02c85c5 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1563,6 +1563,19 @@ def test_integer_indexed_series(self, mode): }, }, ), + ( + {"lags": 5, "lags_future_covariates": [-2, 3]}, + { + "lags": { + "gaussian": [-5, -4, -3, -2, -1], + "sine": [-5, -4, -3, -2, -1], + }, + "lags_future_covariates": { + "sine_future": [-2, 3], + "default_lags": [-2, 3], + }, + }, + ), ], ) def test_component_specific_lags(self, config): From 2aa96a4fa69e7d0f96da08151d18e7602ab7c965 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Wed, 23 Aug 2023 10:54:01 +0200 Subject: [PATCH 11/30] test: adding a test for the lagged data creation --- darts/models/forecasting/regression_model.py | 2 +- .../forecasting/test_regression_models.py | 52 ++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 5a8ad00b48..444c9c7859 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -256,7 +256,7 @@ def _check_tuple_future_lags( ) -> List[int]: raise_if_not( lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, - f"`{lags_name}` tuple must contain stricly positibe integers. 
Given: {lags_future_covariates}.", + f"`{lags_name}` tuple must contain stricly positive integers. Given: {lags_future_covariates}.", ) raise_if( lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0, diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 86d02c85c5..51d1256b00 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -492,7 +492,8 @@ def test_model_construction(self, config): @pytest.mark.parametrize("mode", [True, False]) def test_training_data_creation(self, mode): - # testing _get_training_data function + """testing _get_training_data function""" + # lags defined using lists of integers model_instance = RegressionModel( lags=self.lags_1["target"], lags_past_covariates=self.lags_1["past"], @@ -541,6 +542,55 @@ def test_training_data_creation(self, mode): ] assert list(training_labels[0]) == [82, 182, 282] + # lags defined using dictionnaries + # cannot use 'default_lags' because it's converted in `fit()`, before calling `_created_lagged_data` + model_instance = RegressionModel( + lags={"0-trgt-0": [-5, -4], "0-trgt-1": [-3, -2], "0-trgt-2": [-2, -1]}, + lags_past_covariates={"0-pcov-0": [-10], "0-pvoc-1": [-7]}, + lags_future_covariates={"0-fcov-0": (2, 2)}, + multi_models=mode, + ) + + max_samples_per_ts = 3 + + # using only one series of each + training_samples, training_labels = model_instance._create_lagged_data( + target_series=self.target_series[0], + past_covariates=self.past_covariates[0], + future_covariates=self.future_covariates[0], + max_samples_per_ts=max_samples_per_ts, + ) + + # checking number of dimensions + assert len(training_samples.shape) == 2 # samples, features + assert len(training_labels.shape) == 2 # samples, components (multivariate) + assert training_samples.shape[0] == training_labels.shape[0] + assert training_samples.shape[0] == max_samples_per_ts 
+ assert ( + training_samples.shape[1] + == 6 # [-4, -3], [-3, -2], [-2, -1] + + 2 # [-10], [-7] + + 4 # [-2, -1, 0, 1] + ) + + # check last sample + assert list(training_labels[0]) == [97, 197, 297] + # lags are grouped by components instead of lags + assert list(training_samples[0, :]) == [ + 92, + 93, + 194, + 195, + 295, + 296, # comp_i = comp_0 + i*100 + 10087, + 10190, # past cov; target + 10'000 + 20095, + 20096, + 20097, + 20098, # future cov; target + 20'000 + ] + @pytest.mark.parametrize("mode", [True, False]) def test_prediction_data_creation(self, mode): # assigning correct names to variables From c3133b27eab92b71a421fa55b0fbf1a8e30d8d5f Mon Sep 17 00:00:00 2001 From: madtoinou Date: Wed, 23 Aug 2023 10:55:09 +0200 Subject: [PATCH 12/30] fix: typo --- darts/tests/models/forecasting/test_regression_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 51d1256b00..eabee4447d 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -545,7 +545,7 @@ def test_training_data_creation(self, mode): # lags defined using dictionnaries # cannot use 'default_lags' because it's converted in `fit()`, before calling `_created_lagged_data` model_instance = RegressionModel( - lags={"0-trgt-0": [-5, -4], "0-trgt-1": [-3, -2], "0-trgt-2": [-2, -1]}, + lags={"0-trgt-0": [-4, -3], "0-trgt-1": [-3, -2], "0-trgt-2": [-2, -1]}, lags_past_covariates={"0-pcov-0": [-10], "0-pvoc-1": [-7]}, lags_future_covariates={"0-fcov-0": (2, 2)}, multi_models=mode, @@ -577,8 +577,8 @@ def test_training_data_creation(self, mode): assert list(training_labels[0]) == [97, 197, 297] # lags are grouped by components instead of lags assert list(training_samples[0, :]) == [ - 92, 93, + 94, 194, 195, 295, From 646b6716bcc8e9309e0ec70755959ad77fe38dd1 Mon Sep 17 00:00:00 2001 
From: madtoinou Date: Fri, 25 Aug 2023 16:48:13 +0200 Subject: [PATCH 13/30] fix: adressing review comments --- darts/models/forecasting/regression_model.py | 284 ++++++++++-------- .../forecasting/test_regression_models.py | 2 +- darts/utils/data/tabularization.py | 10 +- 3 files changed, 162 insertions(+), 134 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 444c9c7859..c55d7acca3 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -151,7 +151,7 @@ def __init__( self.model = model self.lags: Dict[str, List[int]] = {} - self.component_lags: Dict[str, Dict[str, Sequence[int]]] = {} + self.component_lags: Dict[str, Dict[str, List[int]]] = {} self.input_dim = None self.multi_models = multi_models self._considers_static_covariates = use_static_covariates @@ -187,63 +187,44 @@ def __init__( "At least one of `lags`, `lags_future_covariates` or `lags_past_covariates` must be not None.", ) - lags_type_checks = [ - (lags, "lags"), - (lags_past_covariates, "lags_past_covariates"), - ] - - for _lags, lags_name in lags_type_checks: - raise_if_not( - isinstance(_lags, (int, list, dict)) or _lags is None, - f"`{lags_name}` must be of type int, list or dict. Given: {type(_lags)}.", - ) - raise_if( - isinstance(_lags, bool), - f"`{lags_name}` must be of type int, list or dict, not bool.", - ) - - raise_if_not( - isinstance(lags_future_covariates, (tuple, list, dict)) - or lags_future_covariates is None, - f"`lags_future_covariates` must be of type tuple, list or dict. 
Given: {type(lags_future_covariates)}.", - ) - - if isinstance(lags_future_covariates, tuple): - raise_if_not( - len(lags_future_covariates) == 2 - and isinstance(lags_future_covariates[0], int) - and isinstance(lags_future_covariates[1], int), - "`lags_future_covariates` tuple must be of length 2, and must contain two integers", - ) - raise_if( - isinstance(lags_future_covariates[0], bool) - or isinstance(lags_future_covariates[1], bool), - "`lags_future_covariates` tuple must contain integers, not bool", - ) - - self._set_lags( + # convert lags arguments to list of int + processed_lags, processed_component_lags = self._generate_lags( lags=lags, lags_past_covariates=lags_past_covariates, lags_future_covariates=lags_future_covariates, ) + self.lags = processed_lags + self.component_lags = processed_component_lags + self.pred_dim = self.output_chunk_length if self.multi_models else 1 - def _set_lags( + def _generate_lags( self, lags: Optional[LAGS_TYPE], lags_past_covariates: Optional[LAGS_TYPE], lags_future_covariates: Optional[FUTURE_LAGS_TYPE], - ): - """Based on the type of the argument and the nature of the covariates, convert the lags to a list.""" + ) -> Tuple[Dict[str, List[int]], Dict[str, Dict[str, List[int]]]]: + """ + Based on the type of the argument and the nature of the covariates, perform some sanity checks before + converting the lags to a list of integer. + + If lags are provided as a dictionary, the lags values are contained in self.component_lags and the self.lags + attributes contain only the extreme values + If the lags are provided as integer, list, tuple or dictionary containing only the 'default_lags' keys, the lags + values are contained in the self.lags attribute and the self.component_lags is an empty dictionary. 
+ + `lags` and `lags_past_covariates` are processed using the same local functions, + `lags_future_covariates` is processed with different local functions + """ - def _check_int_lags(lags: int, lags_name: str) -> List[int]: + def _process_int_lags(lags: int, lags_name: str) -> List[int]: raise_if_not( lags > 0, f"`{lags_name}` must be strictly positive. Given: {lags}." ) return list(range(-lags, 0)) - def _check_list_lags(lags: list, lags_name: str) -> List[int]: + def _process_list_lags(lags: list, lags_name: str) -> List[int]: for lag in lags: raise_if( not isinstance(lag, int) or (lag >= 0), @@ -251,12 +232,27 @@ def _check_list_lags(lags: list, lags_name: str) -> List[int]: ) return sorted(lags) - def _check_tuple_future_lags( + def _process_tuple_future_lags( lags_future_covariates: Tuple[int, int], lags_name: str ) -> List[int]: + raise_if_not( + len(lags_future_covariates) == 2 + and isinstance(lags_future_covariates[0], int) + and isinstance(lags_future_covariates[1], int), + f"`{lags_name}` tuple must be of length 2, and must contain two integers", + logger, + ) + + raise_if( + isinstance(lags_future_covariates[0], bool) + or isinstance(lags_future_covariates[1], bool), + f"`{lags_name}` tuple must contain integers, not bool", + logger, + ) + raise_if_not( lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, - f"`{lags_name}` tuple must contain stricly positive integers. Given: {lags_future_covariates}.", + f"`{lags_name}` tuple must contain positive integers. 
Given: {lags_future_covariates}.", ) raise_if( lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0, @@ -265,7 +261,7 @@ def _check_tuple_future_lags( ) return list(range(-lags_future_covariates[0], lags_future_covariates[1])) - def _check_list_future_lags( + def _process_list_future_lags( lags_future_covariates: List[int], lags_name: str ) -> List[int]: for lag in lags_future_covariates: @@ -275,13 +271,13 @@ def _check_list_future_lags( ) return sorted(lags_future_covariates) - def _check_dict_lags( + def _process_dict_lags( lags: dict, lags_name: str - ) -> Optional[Tuple[List[int], Dict[str, Sequence[int]]]]: + ) -> Tuple[List[int], Dict[str, List[int]]]: raise_if_not( len(lags) > 0, - f"When passed as a dictionnary, `{lags_name}` must contain at least one key.", + f"When passed as a dictionary, `{lags_name}` must contain at least one key.", logger, ) @@ -289,16 +285,15 @@ def _check_dict_lags( supported_types = "" min_lags = None max_lags = None - components_lags = dict() - # TODO: use component idx instead of component name for robustness? 
- for comp_idx, (comp_name, comp_lags) in enumerate(lags.items()): + components_lags: Dict[str, List[int]] = dict() + for comp_name, comp_lags in lags.items(): if lags_name == "lags_future_covariates": if isinstance(comp_lags, tuple): - components_lags[comp_name] = _check_tuple_future_lags( + components_lags[comp_name] = _process_tuple_future_lags( comp_lags, f"`{lags_name}` for component {comp_name}" ) elif isinstance(comp_lags, list): - components_lags[comp_name] = _check_list_future_lags( + components_lags[comp_name] = _process_list_future_lags( comp_lags, f"`{lags_name}` for component {comp_name}" ) else: @@ -306,11 +301,11 @@ def _check_dict_lags( supported_types = "tuple or a list" else: if isinstance(comp_lags, int): - components_lags[comp_name] = _check_int_lags( + components_lags[comp_name] = _process_int_lags( comp_lags, f"`{lags_name}` for component {comp_name}" ) elif isinstance(comp_lags, list): - components_lags[comp_name] = _check_list_lags( + components_lags[comp_name] = _process_list_lags( comp_lags, f"`{lags_name}` for component {comp_name}" ) else: @@ -320,7 +315,7 @@ def _check_dict_lags( if invalid_type: raise_log( ValueError( - f"When passed as a dictionnary, `{lags_name}` for component {comp_name} must be either a " + f"When passed as a dictionary, `{lags_name}` for component {comp_name} must be either a " f"{supported_types}, received : {type(comp_lags)}." 
), logger, @@ -335,54 +330,94 @@ def _check_dict_lags( max_lags = components_lags[comp_name][-1] else: max_lags = max(max_lags, components_lags[comp_name][-1]) - return [min_lags, max_lags], components_lags + + # revert to lags shared across components logic + if list(components_lags.keys()) == ["default_lags"]: + return components_lags["default_lags"], {} + else: + return [min_lags, max_lags], components_lags # perform the type and sanity checks - if isinstance(lags, int): - self.lags["target"] = _check_int_lags(lags, "lags") + lags_type_error_msg = [] + processed_lags: Dict[str, List[int]] = dict() + processed_component_lags: Dict[str, Dict[str, List[int]]] = dict() + if lags is None: + pass + elif isinstance(lags, int): + processed_lags["target"] = _process_int_lags(lags, "lags") elif isinstance(lags, list): - self.lags["target"] = _check_list_lags(lags, "lags") + processed_lags["target"] = _process_list_lags(lags, "lags") elif isinstance(lags, dict): - conv_lags = _check_dict_lags(lags, "lags") - if conv_lags is not None: - # dummy, used to compute the extreme lags - self.lags["target"] = conv_lags[0] - # actual lags - self.component_lags["target"] = conv_lags[1] - - if isinstance(lags_past_covariates, int): - self.lags["past"] = _check_int_lags( + conv_lags = _process_dict_lags(lags, "lags") + # dummy, used to compute the extreme lags + processed_lags["target"] = conv_lags[0] + # actual lags + processed_component_lags["target"] = conv_lags[1] + else: + lags_type_error_msg.append( + f"`lags` must be of type int, list or dict." f"Given: {type(lags)}." 
+ ) + + if lags_past_covariates is None: + pass + elif isinstance(lags_past_covariates, int): + processed_lags["past"] = _process_int_lags( lags_past_covariates, "lags_past_covariates" ) elif isinstance(lags_past_covariates, list): - self.lags["past"] = _check_list_lags( + processed_lags["past"] = _process_list_lags( lags_past_covariates, "lags_past_covariates" ) elif isinstance(lags_past_covariates, dict): - conv_lags = _check_dict_lags(lags_past_covariates, "lags_past_covariates") - if conv_lags is not None: - # dummy, used to compute the extreme lags - self.lags["past"] = conv_lags[0] - # actual lags - self.component_lags["past"] = conv_lags[1] - - if isinstance(lags_future_covariates, tuple): - self.lags["future"] = _check_tuple_future_lags( + conv_lags = _process_dict_lags(lags_past_covariates, "lags_past_covariates") + # dummy, used to compute the extreme lags + processed_lags["past"] = conv_lags[0] + # actual lags + processed_component_lags["past"] = conv_lags[1] + else: + lags_type_error_msg.append( + f"`lags_past_covariates` must be of type int, list or dict." + f"Given: {type(lags_past_covariates)}." 
+ ) + + if lags_future_covariates is None: + pass + elif isinstance(lags_future_covariates, tuple): + processed_lags["future"] = _process_tuple_future_lags( lags_future_covariates, "lags_future_covariates" ) elif isinstance(lags_future_covariates, list): - self.lags["future"] = _check_list_future_lags( + processed_lags["future"] = _process_list_future_lags( lags_future_covariates, "lags_future_covariates" ) elif isinstance(lags_future_covariates, dict): - conv_lags = _check_dict_lags( + conv_lags = _process_dict_lags( lags_future_covariates, "lags_future_covariates" ) - if conv_lags is not None: - # dummy, used to compute the extreme lags - self.lags["future"] = conv_lags[0] - # actual lags - self.component_lags["future"] = conv_lags[1] + # dummy, used to compute the extreme lags + processed_lags["future"] = conv_lags[0] + # actual lags + processed_component_lags["future"] = conv_lags[1] + else: + lags_type_error_msg.append( + f"`lags_future_covariates` must be of type tuple, list or dict. " + f"Given: {type(lags_future_covariates)}." + ) + + # error message for all the invalid types + if len(lags_type_error_msg) > 0: + raise_log(ValueError("\n".join(lags_type_error_msg)), logger) + return processed_lags, processed_component_lags + + def _get_lags(self, lags_type: str): + """ + If lags were specified in a component-wise manner, they are contained in self.component_lags and + the values in self.lags should be ignored as they correspond just the extreme values. + """ + if lags_type in self.component_lags: + return self.component_lags[lags_type] + else: + return self.lags.get(lags_type) @property def _model_encoder_settings( @@ -485,10 +520,6 @@ def _get_last_prediction_time(self, series, forecast_horizon, overlap_end): def _create_lagged_data( self, target_series, past_covariates, future_covariates, max_samples_per_ts ): - """ - If lags were specified component-wise manner, they are contained in self.component_lags and the values - in self.lags should be ignored. 
- """ ( features, labels, @@ -499,15 +530,9 @@ def _create_lagged_data( output_chunk_length=self.output_chunk_length, past_covariates=past_covariates, future_covariates=future_covariates, - lags=self.component_lags["target"] - if "target" in self.component_lags - else self.lags.get("target"), - lags_past_covariates=self.component_lags["past"] - if "past" in self.component_lags - else self.lags.get("past"), - lags_future_covariates=self.component_lags["future"] - if "future" in self.component_lags - else self.lags.get("future"), + lags=self._get_lags("target"), + lags_past_covariates=self._get_lags("past"), + lags_future_covariates=self._get_lags("future"), uses_static_covariates=self.uses_static_covariates, last_static_covariates_shape=None, max_samples_per_ts=max_samples_per_ts, @@ -555,15 +580,9 @@ def _fit_model( target_series=target_series, past_covariates=past_covariates, future_covariates=future_covariates, - lags=self.component_lags["target"] - if "target" in self.component_lags - else self.lags.get("target"), - lags_past_covariates=self.component_lags["past"] - if "past" in self.component_lags - else self.lags.get("past"), - lags_future_covariates=self.component_lags["future"] - if "future" in self.component_lags - else self.lags.get("future"), + lags=self._get_lags("target"), + lags_past_covariates=self._get_lags("past"), + lags_future_covariates=self._get_lags("future"), output_chunk_length=self.output_chunk_length, concatenate=False, use_static_covariates=self.uses_static_covariates, @@ -685,32 +704,41 @@ def fit( past_covariates=seq2series(past_covariates), future_covariates=seq2series(future_covariates), ) + variate2arg = { + "target": "lags", + "past": "lags_past_covariates", + "future": "lags_future_covariates", + } # if provided, component-wise lags must be defined for all the components of the first series + component_lags_error_msg = [] for variate_type, variate in zip( ["target", "past", "future"], [series, past_covariates, future_covariates] ): 
- if variate_type in self.component_lags: - # ignore the fallback lags entry - provided_components = set(self.component_lags[variate_type].keys()) - required_components = set(variate[0].components) - # lags were specified for unrecognized components - wrong_components = list( - provided_components - {"default_lags"} - required_components - ) - if len(wrong_components) > 0: - logger.warning( - f"Lags of components not present in the series ({wrong_components}) were ignored." - ) + if variate_type not in self.component_lags: + continue - missing_keys = list(required_components - provided_components) - raise_if( - len(missing_keys) > 0 and "default_lags" not in provided_components, - f"The {variate_type} series contains {self.input_dim[variate_type]} components, lags were " - f"provided for {len(self.component_lags[variate_type])} of them. The lags for the " - f"following components must be provided: {missing_keys}.", - logger, + # ignore the fallback lags entry + provided_components = set(self.component_lags[variate_type].keys()) + required_components = set(variate[0].components) + + wrong_components = list( + provided_components - {"default_lags"} - required_components + ) + missing_keys = list(required_components - provided_components) + # lags were specified for unrecognized components + if len(wrong_components) > 0: + component_lags_error_msg.append( + f"The `{variate2arg[variate_type]}` dictionary specifies lags for components that are not " + f"present in the series : {wrong_components}. They must be removed to avoid any ambiguity." + ) + elif len(missing_keys) > 0 and "default_lags" not in provided_components: + component_lags_error_msg.append( + f"The {variate2arg[variate_type]} dictionary is missing the lags for the following components " + f"present in the series: {missing_keys}. The key 'default_lags' can be used to provide lags for " + f"all the non-explicitely defined components." 
) + else: # reorder the components based on the input series, insert the default when necessary self.component_lags[variate_type] = { comp_name: self.component_lags[variate_type][comp_name] @@ -719,6 +747,10 @@ def fit( for comp_name in variate[0].components } + # single error message for all the lags arguments + if len(component_lags_error_msg) > 0: + raise_log(ValueError("\n".join(component_lags_error_msg)), logger) + self._fit_model( series, past_covariates, future_covariates, max_samples_per_ts, **kwargs ) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index eabee4447d..9e8426bbe6 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -542,7 +542,7 @@ def test_training_data_creation(self, mode): ] assert list(training_labels[0]) == [82, 182, 282] - # lags defined using dictionnaries + # lags defined using dictionaries # cannot use 'default_lags' because it's converted in `fit()`, before calling `_created_lagged_data` model_instance = RegressionModel( lags={"0-trgt-0": [-4, -3], "0-trgt-1": [-3, -2], "0-trgt-2": [-2, -1]}, diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index d5249ee95f..9078515a2b 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -331,13 +331,9 @@ def create_lagged_training_data( output_chunk_length: int, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Union[Sequence[int], Dict[str, Sequence[int]]]] = None, - lags_past_covariates: Optional[ - Union[Sequence[int], Dict[str, Sequence[int]]] - ] = None, - lags_future_covariates: Optional[ - Union[Sequence[int], Dict[str, Sequence[int]]] - ] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: 
Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, uses_static_covariates: bool = True, last_static_covariates_shape: Optional[Tuple[int, int]] = None, max_samples_per_ts: Optional[int] = None, From 3221f867cb2fb40670a2eca6616f6396283ee5c9 Mon Sep 17 00:00:00 2001 From: madtoinou <32447896+madtoinou@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:52:03 +0200 Subject: [PATCH 14/30] Apply suggestions from code review Co-authored-by: Dennis Bader --- darts/models/forecasting/regression_model.py | 48 ++++++++++---------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index c55d7acca3..8b990ec369 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -84,31 +84,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. - If an integer is given the last `lags` past lags are used (from -1 backward). - If a list of integers, each lag must be < 0. - If a dictionnary, the keys must be the components' name (first series when using multiple series) and - the values corresponds to the lags (integer or list of integers). The key 'default_lags' can be used to - provide fallback lags values for un-specified components. An error will be raised if some components are - missing and the 'default_lags' key is not present in the dictionnary. + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. 
+ If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. - If an integer is given the last `lags_past_covariates` past lags are used (inclusive, starting from lag -1). - If a list of integers, each lag must be < 0. - If a dictionnary, the keys must be the components' name (first series when using multiple series) and - the values corresponds to the lags (integer or list of integers). The key 'default_lags' can be used to - provide fallback lags values for un-specified components. An error will be raised if some components are - missing and the 'default_lags' key is not present in the dictionnary. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. 
- If a tuple (past, future) is given the last `past` lags in the past are used (inclusive, starting from - lag -1) along with the first `future` future lags (starting from 0 - the prediction time - up to - `future - 1` included). - If a list of integer, the values will be used as is. - If a dictionnary, the keys must be the components' name (first series when using multiple series) and - the values corresponds to the lags (integer or list of integers). The key 'default_lags' can be used to - provide fallback lags values for un-specified components. An error will be raised if some components are - missing and the 'default_lags' key is not present in the dictionnary. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. 
However, setting `output_chunk_length` equal to the forecast horizon may From 3254db3f96a87f8052b3c4966bb2ff8c405eb15f Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 28 Aug 2023 10:12:00 +0200 Subject: [PATCH 15/30] refactor: lags argument are converted to dict before running the type check and processing of the values --- darts/models/forecasting/regression_model.py | 230 +++++++------------ 1 file changed, 81 insertions(+), 149 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 8b990ec369..a0c5ea4933 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -215,101 +215,102 @@ def _generate_lags( attributes contain only the extreme values If the lags are provided as integer, list, tuple or dictionary containing only the 'default_lags' keys, the lags values are contained in the self.lags attribute and the self.component_lags is an empty dictionary. - - `lags` and `lags_past_covariates` are processed using the same local functions, - `lags_future_covariates` is processed with different local functions """ + processed_lags: Dict[str, List[int]] = dict() + processed_component_lags: Dict[str, Dict[str, List[int]]] = dict() + for lags_values, lags_name, lags_abbrev in zip( + [lags, lags_past_covariates, lags_future_covariates], + ["lags", "lags_past_covariates", "lags_future_covariates"], + ["target", "past", "future"], + ): + if lags_values is None: + continue - def _process_int_lags(lags: int, lags_name: str) -> List[int]: - raise_if_not( - lags > 0, f"`{lags_name}` must be strictly positive. Given: {lags}." 
- ) - return list(range(-lags, 0)) - - def _process_list_lags(lags: list, lags_name: str) -> List[int]: - for lag in lags: + # check type of argument before converting to dictionary + if not isinstance(lags_values, dict): raise_if( - not isinstance(lag, int) or (lag >= 0), - f"Every element of `{lags_name}` must be a strictly negative integer. Given: {lags}.", + lags_name == "lags_future_covariates" + and not isinstance(lags_values, (tuple, list)), + f"`lags_future_covariates` must be of type tuple, list or dict." + f"Given: {type(lags_values)}.", ) - return sorted(lags) - - def _process_tuple_future_lags( - lags_future_covariates: Tuple[int, int], lags_name: str - ) -> List[int]: - raise_if_not( - len(lags_future_covariates) == 2 - and isinstance(lags_future_covariates[0], int) - and isinstance(lags_future_covariates[1], int), - f"`{lags_name}` tuple must be of length 2, and must contain two integers", - logger, - ) - - raise_if( - isinstance(lags_future_covariates[0], bool) - or isinstance(lags_future_covariates[1], bool), - f"`{lags_name}` tuple must contain integers, not bool", - logger, - ) - raise_if_not( - lags_future_covariates[0] >= 0 and lags_future_covariates[1] >= 0, - f"`{lags_name}` tuple must contain positive integers. Given: {lags_future_covariates}.", - ) - raise_if( - lags_future_covariates[0] == 0 and lags_future_covariates[1] == 0, - f"`{lags_name}` tuple cannot be (0, 0) as it corresponds to an empty list of lags.", - logger, - ) - return list(range(-lags_future_covariates[0], lags_future_covariates[1])) - - def _process_list_future_lags( - lags_future_covariates: List[int], lags_name: str - ) -> List[int]: - for lag in lags_future_covariates: raise_if( - not isinstance(lag, int) or isinstance(lag, bool), - f"Every element of `{lags_name}` must be an integer. Given: {lags_future_covariates}.", + lags_name in ["lags", "lags_past_covariates"] + and not isinstance(lags_values, (int, list)), + f"`{lags_name}` must be of type int, list or dict." 
+ f"Given: {type(lags_values)}.", ) - return sorted(lags_future_covariates) - def _process_dict_lags( - lags: dict, lags_name: str - ) -> Tuple[List[int], Dict[str, List[int]]]: + lags_values = {"default_lags": lags_values} - raise_if_not( - len(lags) > 0, - f"When passed as a dictionary, `{lags_name}` must contain at least one key.", - logger, - ) + elif len(lags_values) == 0: + raise_log( + ValueError( + f"When passed as a dictionary, `{lags_name}` must contain at least one key." + ), + logger, + ) invalid_type = False supported_types = "" min_lags = None max_lags = None - components_lags: Dict[str, List[int]] = dict() - for comp_name, comp_lags in lags.items(): + tmp_components_lags: Dict[str, List[int]] = dict() + for comp_name, comp_lags in lags_values.items(): if lags_name == "lags_future_covariates": if isinstance(comp_lags, tuple): - components_lags[comp_name] = _process_tuple_future_lags( - comp_lags, f"`{lags_name}` for component {comp_name}" + raise_if_not( + len(comp_lags) == 2 + and isinstance(comp_lags[0], int) + and isinstance(comp_lags[1], int), + f"`{lags_name}` tuple must be of length 2, and must contain two integers", + logger, ) - elif isinstance(comp_lags, list): - components_lags[comp_name] = _process_list_future_lags( - comp_lags, f"`{lags_name}` for component {comp_name}" + + raise_if( + isinstance(comp_lags[0], bool) + or isinstance(comp_lags[1], bool), + f"`{lags_name}` tuple must contain integers, not bool", + logger, + ) + + raise_if_not( + comp_lags[0] >= 0 and comp_lags[1] >= 0, + f"`{lags_name}` tuple must contain positive integers. 
Given: {comp_lags}.", + ) + raise_if( + comp_lags[0] == 0 and comp_lags[1] == 0, + f"`{lags_name}` tuple cannot be (0, 0) as it corresponds to an empty list of lags.", + logger, ) + tmp_components_lags[comp_name] = list( + range(-comp_lags[0], comp_lags[1]) + ) + elif isinstance(comp_lags, list): + for lag in comp_lags: + raise_if( + not isinstance(lag, int) or isinstance(lag, bool), + f"`{lags_name}` list must contain only integers. Given: {comp_lags}.", + ) + tmp_components_lags[comp_name] = sorted(comp_lags) else: invalid_type = True supported_types = "tuple or a list" else: if isinstance(comp_lags, int): - components_lags[comp_name] = _process_int_lags( - comp_lags, f"`{lags_name}` for component {comp_name}" + raise_if_not( + comp_lags > 0, + f"`{lags_name}` integer must be strictly positive . Given: {comp_lags}.", ) + tmp_components_lags[comp_name] = list(range(-comp_lags, 0)) elif isinstance(comp_lags, list): - components_lags[comp_name] = _process_list_lags( - comp_lags, f"`{lags_name}` for component {comp_name}" - ) + for lag in comp_lags: + raise_if( + not isinstance(lag, int) or (lag >= 0), + f"`{lags_name}` list must contain only strictly negative integers. Given: {comp_lags}.", + ) + tmp_components_lags[comp_name] = sorted(comp_lags) else: invalid_type = True supported_types = "strictly positive integer or a list" @@ -317,98 +318,29 @@ def _process_dict_lags( if invalid_type: raise_log( ValueError( - f"When passed as a dictionary, `{lags_name}` for component {comp_name} must be either a " + f"When passed in a dictionary, `{lags_name}` for component {comp_name} must be either a " f"{supported_types}, received : {type(comp_lags)}." 
), logger, ) if min_lags is None: - min_lags = components_lags[comp_name][0] + min_lags = tmp_components_lags[comp_name][0] else: - min_lags = min(min_lags, components_lags[comp_name][0]) + min_lags = min(min_lags, tmp_components_lags[comp_name][0]) if max_lags is None: - max_lags = components_lags[comp_name][-1] + max_lags = tmp_components_lags[comp_name][-1] else: - max_lags = max(max_lags, components_lags[comp_name][-1]) + max_lags = max(max_lags, tmp_components_lags[comp_name][-1]) - # revert to lags shared across components logic - if list(components_lags.keys()) == ["default_lags"]: - return components_lags["default_lags"], {} + # revert to shared lags logic when applicable + if list(tmp_components_lags.keys()) == ["default_lags"]: + processed_lags[lags_abbrev] = tmp_components_lags["default_lags"] else: - return [min_lags, max_lags], components_lags - - # perform the type and sanity checks - lags_type_error_msg = [] - processed_lags: Dict[str, List[int]] = dict() - processed_component_lags: Dict[str, Dict[str, List[int]]] = dict() - if lags is None: - pass - elif isinstance(lags, int): - processed_lags["target"] = _process_int_lags(lags, "lags") - elif isinstance(lags, list): - processed_lags["target"] = _process_list_lags(lags, "lags") - elif isinstance(lags, dict): - conv_lags = _process_dict_lags(lags, "lags") - # dummy, used to compute the extreme lags - processed_lags["target"] = conv_lags[0] - # actual lags - processed_component_lags["target"] = conv_lags[1] - else: - lags_type_error_msg.append( - f"`lags` must be of type int, list or dict." f"Given: {type(lags)}." 
- ) - - if lags_past_covariates is None: - pass - elif isinstance(lags_past_covariates, int): - processed_lags["past"] = _process_int_lags( - lags_past_covariates, "lags_past_covariates" - ) - elif isinstance(lags_past_covariates, list): - processed_lags["past"] = _process_list_lags( - lags_past_covariates, "lags_past_covariates" - ) - elif isinstance(lags_past_covariates, dict): - conv_lags = _process_dict_lags(lags_past_covariates, "lags_past_covariates") - # dummy, used to compute the extreme lags - processed_lags["past"] = conv_lags[0] - # actual lags - processed_component_lags["past"] = conv_lags[1] - else: - lags_type_error_msg.append( - f"`lags_past_covariates` must be of type int, list or dict." - f"Given: {type(lags_past_covariates)}." - ) - - if lags_future_covariates is None: - pass - elif isinstance(lags_future_covariates, tuple): - processed_lags["future"] = _process_tuple_future_lags( - lags_future_covariates, "lags_future_covariates" - ) - elif isinstance(lags_future_covariates, list): - processed_lags["future"] = _process_list_future_lags( - lags_future_covariates, "lags_future_covariates" - ) - elif isinstance(lags_future_covariates, dict): - conv_lags = _process_dict_lags( - lags_future_covariates, "lags_future_covariates" - ) - # dummy, used to compute the extreme lags - processed_lags["future"] = conv_lags[0] - # actual lags - processed_component_lags["future"] = conv_lags[1] - else: - lags_type_error_msg.append( - f"`lags_future_covariates` must be of type tuple, list or dict. " - f"Given: {type(lags_future_covariates)}." 
- ) + processed_lags[lags_abbrev] = [min_lags, max_lags] + processed_component_lags[lags_abbrev] = tmp_components_lags - # error message for all the invalid types - if len(lags_type_error_msg) > 0: - raise_log(ValueError("\n".join(lags_type_error_msg)), logger) return processed_lags, processed_component_lags def _get_lags(self, lags_type: str): From 269005e41d2bc0a3d79562abdb4e0a2f1be24a25 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 28 Aug 2023 10:12:34 +0200 Subject: [PATCH 16/30] refactor: lags argument are converted to dict before running the type check and processing of the values --- darts/models/forecasting/regression_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index a0c5ea4933..186688b419 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -324,6 +324,7 @@ def _generate_lags( logger, ) + # extracting min and max lags va if min_lags is None: min_lags = tmp_components_lags[comp_name][0] else: From bcd44555ccae60cc4eceb3586977b17bf7db3ba3 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 28 Aug 2023 11:27:55 +0200 Subject: [PATCH 17/30] doc: improved documentation of the component-specific lags in tabularization --- darts/utils/data/tabularization.py | 83 ++++++++++++++++++------------ 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 9078515a2b..404452ba74 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -27,9 +27,9 @@ def create_lagged_data( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = 
None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, output_chunk_length: int = 1, uses_static_covariates: bool = True, last_static_covariates_shape: Optional[Tuple[int, int]] = None, @@ -154,15 +154,18 @@ def create_lagged_data( Optionally, the lags of the target series to be used as (auto-regressive) features. If not specified, auto-regressive features will *not* be added to `X`. Each lag value is assumed to be negative (e.g. `lags = [-3, -1]` will extract `target_series` values which are 3 timesteps and 1 timestep away from - the current value). + the current value). If the lags are provided as a dictionary, the lags values are specific to each + component in the target series. lags_past_covariates Optionally, the lags of `past_covariates` to be used as features. Like `lags`, each lag value is assumed to - be less than or equal to -1. + be less than or equal to -1. If the lags are provided as a dictionary, the lags values are specific to each + component in the past covariates series. lags_future_covariates Optionally, the lags of `future_covariates` to be used as features. Unlike `lags` and `lags_past_covariates`, `lags_future_covariates` values can be positive (i.e. use values *after* time `t` to predict target at time `t`), zero (i.e. use values *at* time `t` to predict target at time `t`), and/or - negative (i.e. use values *before* time `t` to predict target at time `t`). + negative (i.e. use values *before* time `t` to predict target at time `t`). If the lags are provided as + a dictionary, the lags values are specific to each component in the future covariates series. uses_static_covariates Whether the model uses/expects static covariates. 
If `True`, it enforces that static covariates must have identical shapes across all target series. @@ -372,15 +375,18 @@ def create_lagged_training_data( Optionally, the lags of the target series to be used as (auto-regressive) features. If not specified, auto-regressive features will *not* be added to `X`. Each lag value is assumed to be negative (e.g. `lags = [-3, -1]` will extract `target_series` values which are 3 timesteps and 1 timestep away from - the current value). + the current value). If the lags are provided as a dictionary, the lags values are specific to each + component in the target series. lags_past_covariates Optionally, the lags of `past_covariates` to be used as features. Like `lags`, each lag value is assumed to - be less than or equal to -1. + be less than or equal to -1. If the lags are provided as a dictionary, the lags values are specific to each + component in the past covariates series. lags_future_covariates Optionally, the lags of `future_covariates` to be used as features. Unlike `lags` and `lags_past_covariates`, `lags_future_covariates` values can be positive (i.e. use values *after* time `t` to predict target at time `t`), zero (i.e. use values *at* time `t` to predict target at time `t`), and/or negative (i.e. use values - *before* time `t` to predict target at time `t`). + *before* time `t` to predict target at time `t`). If the lags are provided as a dictionary, the lags values + are specific to each component in the future covariates series. uses_static_covariates Whether the model uses/expects static covariates. If `True`, it enforces that static covariates must have identical shapes across all target series. 
@@ -584,11 +590,11 @@ def create_lagged_prediction_data( def add_static_covariates_to_lagged_data( - features: Union[np.array, Sequence[np.array]], + features: Union[np.ndarray, Sequence[np.ndarray]], target_series: Union[TimeSeries, Sequence[TimeSeries]], uses_static_covariates: bool = True, last_shape: Optional[Tuple[int, int]] = None, -) -> Union[np.array, Sequence[np.array]]: +) -> Union[np.ndarray, Sequence[np.ndarray]]: """ Add static covariates to the features' table for RegressionModels. If `uses_static_covariates=True`, all target series used in `fit()` and `predict()` must have static @@ -678,13 +684,9 @@ def create_lagged_component_names( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Union[Sequence[int], Dict[str, Sequence[int]]]] = None, - lags_past_covariates: Optional[ - Union[Sequence[int], Dict[str, Sequence[int]]] - ] = None, - lags_future_covariates: Optional[ - Union[Sequence[int], Dict[str, Sequence[int]]] - ] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, output_chunk_length: int = 1, concatenate: bool = True, use_static_covariates: bool = False, @@ -693,11 +695,16 @@ def create_lagged_component_names( Helper function called to retrieve the name of the features and labels arrays created with `create_lagged_data()`. 
The order of the features is the following: - Along the `n_lagged_features` axis, `X` has the following structure (for `*_lags=[-2,-1]` and - `*_series.n_components = 2`): + Along the `n_lagged_features` axis, `X` has the following structure: lagged_target | lagged_past_covariates | lagged_future_covariates | static covariates - where each `lagged_*` has the following structure: + + For `*_lags=[-2,-1]` and `*_series.n_components = 2` (lags shared across all the components), + each `lagged_*` has the following structure (grouped by lags): comp0_*_lag-2 | comp1_*_lag-2 | comp0_*_lag_-1 | comp1_*_lag-1 + For `*_lags={'comp0':[-2, -1], 'comp1':[-5, -3]}` and `*_series.n_components = 2` (component- + specific lags), each `lagged_*` has the following structure (grouped by components): + comp0_*_lag-2 | comp0_*_lag-1 | comp1_*_lag_-5 | comp1_*_lag-3 + and for static covariates (2 static covariates acting on 2 target components): cov0_*_target_comp0 | cov0_*_target_comp1 | cov1_*_target_comp0 | cov1_*_target_comp1 @@ -790,9 +797,9 @@ def _create_lagged_data_by_moving_window( output_chunk_length: int, past_covariates: Optional[TimeSeries], future_covariates: Optional[TimeSeries], - lags: Optional[Sequence[int]], - lags_past_covariates: Optional[Sequence[int]], - lags_future_covariates: Optional[Sequence[int]], + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], max_samples_per_ts: Optional[int], multi_models: bool, check_inputs: bool, @@ -964,7 +971,10 @@ def _extract_lagged_vals_from_windows( lagged values is `(num_windows, num_components * lags_to_extract.size, num_series)`. For example, if `lags_to_extract = [-2]`, only the second-to-last values within each window will be extracted. 
If `lags_to_extract` is specified as a list of np.ndarray, the values will be extracted using the - lags provided for each component. + lags provided for each component. In such cases, the shape of the returned lagged values is + `(num_windows, sum([comp_lags.size for comp_lags in lags_to_extract]), num_series)`. For example, + if `lags_to_extract = [[-2, -1], [-1]]`, the second-to-last and last values of the first component + and the last values of the second component within each window will be extracted. """ # windows.shape = (num_windows, num_components, num_samples, window_len): if isinstance(lags_to_extract, list): @@ -1113,9 +1123,9 @@ def _get_feature_times( target_series: Optional[TimeSeries] = None, past_covariates: Optional[TimeSeries] = None, future_covariates: Optional[TimeSeries] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, output_chunk_length: int = 1, is_training: bool = True, return_min_and_max_lags: bool = False, @@ -1230,6 +1240,9 @@ def _get_feature_times( Optionally, specifies whether the largest magnitude lag value for each series should also be returned along with the 'eligible' feature times + Note: if the lags are provided as a dictionary for the target series or any of the covariates series, the + component-specific lags are grouped into a single list to compute the corresponding feature time. 
+ Returns ------- feature_times @@ -1275,7 +1288,7 @@ def _get_feature_times( [target_series, past_covariates, future_covariates], [lags, lags_past_covariates, lags_future_covariates], ): - # TODO: information is available in model.lags, not sure how to make the info get here + # union of the component-specific lags, unsorted if isinstance(lags_i, dict): lags_i = list(set(chain(*lags_i.values()))) @@ -1627,9 +1640,9 @@ def _all_equal_freq(*series: Union[TimeSeries, None]) -> bool: def _check_lags( - lags: Sequence[int], - lags_past_covariates: Sequence[int], - lags_future_covariates: Sequence[int], + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]], ) -> None: """ Throws `ValueError` if any `lag` values aren't negative OR if no lags have been specified. @@ -1642,9 +1655,13 @@ def _check_lags( if not lags_is_none[-1]: is_target_or_past = i < 2 max_lag = -1 if is_target_or_past else inf + + if isinstance(lags_i, dict): + lags_i = list(set(chain(*lags_i.values()))) + raise_if( any((lag > max_lag or not isinstance(lag, int)) for lag in lags_i), - f"`lags{suffix}` must be a `Sequence` containing only `int` values less than {max_lag + 1}.", + f"`lags{suffix}` must be a `Sequence` or `Dict` containing only `int` values less than {max_lag + 1}.", ) raise_if( all(lags_is_none), From b859d9a7885f74ebc747e946b46f455a27384f28 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 28 Aug 2023 13:37:30 +0200 Subject: [PATCH 18/30] test: adding a test for the multivariate scenario --- .../forecasting/test_regression_models.py | 84 +++++++++++-------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 9e8426bbe6..8786ecbcc1 100644 --- 
a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1596,41 +1596,47 @@ def test_integer_indexed_series(self, mode): @pytest.mark.parametrize( "config", - [ - ({"lags": [-3, -2, -1]}, {"lags": {"gaussian": 3}}), - ({"lags": 3}, {"lags": {"gaussian": 3, "sine": 3}}), - ({"lags_past_covariates": 2}, {"lags_past_covariates": {"lin_past": 2}}), - ( - {"lags": 5, "lags_future_covariates": [-2, 3]}, - { - "lags": { - "gaussian": [-5, -4, -3, -2, -1], - "sine": [-5, -4, -3, -2, -1], - }, - "lags_future_covariates": { - "lin_future": [-2, 3], - "sine_future": [-2, 3], - }, - }, - ), - ( - {"lags": 5, "lags_future_covariates": [-2, 3]}, - { - "lags": { - "gaussian": [-5, -4, -3, -2, -1], - "sine": [-5, -4, -3, -2, -1], + itertools.product( + [ + ({"lags": [-3, -2, -1]}, {"lags": {"gaussian": 3}}), + ({"lags": 3}, {"lags": {"gaussian": 3, "sine": 3}}), + ( + {"lags_past_covariates": 2}, + {"lags_past_covariates": {"lin_past": 2}}, + ), + ( + {"lags": 5, "lags_future_covariates": [-2, 3]}, + { + "lags": { + "gaussian": [-5, -4, -3, -2, -1], + "sine": [-5, -4, -3, -2, -1], + }, + "lags_future_covariates": { + "lin_future": [-2, 3], + "sine_future": [-2, 3], + }, }, - "lags_future_covariates": { - "sine_future": [-2, 3], - "default_lags": [-2, 3], + ), + ( + {"lags": 5, "lags_future_covariates": [-2, 3]}, + { + "lags": { + "gaussian": [-5, -4, -3, -2, -1], + "sine": [-5, -4, -3, -2, -1], + }, + "lags_future_covariates": { + "sine_future": [-2, 3], + "default_lags": [-2, 3], + }, }, - }, - ), - ], + ), + ], + [True, False], + ), ) def test_component_specific_lags(self, config): """Verify that the same lags, defined using int/list or dictionnaries yield the same results""" - list_lags, dict_lags = config + (list_lags, dict_lags), multiple_series = config multivar_target = "lags" in dict_lags and len(dict_lags["lags"]) > 1 multivar_future_cov = ( "lags_future_covariates" in dict_lags @@ -1641,6 +1647,16 @@ 
def test_component_specific_lags(self, config): series = tg.gaussian_timeseries(length=20, column_name="gaussian") if multivar_target: series = series.stack(tg.sine_timeseries(length=20, column_name="sine")) + if multiple_series: + # second series have different component names + series = [ + series, + series.with_columns_renamed( + ["gaussian", "sine"][: series.width], + ["other", "names"][: series.width], + ) + + 10, + ] future_cov = tg.linear_timeseries(length=30, column_name="lin_future") if multivar_future_cov: @@ -1667,14 +1683,14 @@ def test_component_specific_lags(self, config): ) # n == output_chunk_length - pred = model.predict(1) - pred2 = model2.predict(1) + pred = model.predict(1, series=series[0] if multiple_series else None) + pred2 = model2.predict(1, series=series[0] if multiple_series else None) np.testing.assert_array_almost_equal(pred.values(), pred2.values()) assert pred.time_index.equals(pred2.time_index) # n > output_chunk_length - pred = model.predict(3) - pred2 = model2.predict(3) + pred = model.predict(3, series=series[0] if multiple_series else None) + pred2 = model2.predict(3, series=series[0] if multiple_series else None) np.testing.assert_array_almost_equal(pred.values(), pred2.values()) assert pred.time_index.equals(pred2.time_index) From c0121a549a217d8f10980b46d54211b3b66c0498 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Tue, 29 Aug 2023 15:48:31 +0200 Subject: [PATCH 19/30] test: checking the appriopriate lags are extracted by the shap explainer --- .../explainability/test_shap_explainer.py | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/darts/tests/explainability/test_shap_explainer.py b/darts/tests/explainability/test_shap_explainer.py index e526d1b81b..a953c79dd3 100644 --- a/darts/tests/explainability/test_shap_explainer.py +++ b/darts/tests/explainability/test_shap_explainer.py @@ -14,7 +14,7 @@ from darts import TimeSeries from darts.dataprocessing.transformers import Scaler from 
darts.explainability.explainability_result import ShapExplainabilityResult -from darts.explainability.shap_explainer import ShapExplainer +from darts.explainability.shap_explainer import MIN_BACKGROUND_SAMPLE, ShapExplainer from darts.models import ( CatBoostModel, ExponentialSmoothing, @@ -24,6 +24,7 @@ RegressionModel, XGBModel, ) +from darts.utils.timeseries_generation import linear_timeseries lgbm_available = not isinstance(LightGBMModel, NotImportedModule) cb_available = not isinstance(CatBoostModel, NotImportedModule) @@ -799,3 +800,38 @@ def test_shapley_multiple_series_with_different_static_covs(self): for explained_forecast in explanation_results.explained_forecasts: comps_out = explained_forecast[1]["price"].columns.tolist() assert comps_out[-1] == "type_statcov_target_price" + + def test_shap_regressor_component_specific_lags(self): + model = LinearRegressionModel( + lags={"price": [-3, -2], "power": [-1]}, + output_chunk_length=1, + ) + # multivariate ts as short as possible + min_ts_length = MIN_BACKGROUND_SAMPLE * np.abs(min(model.lags["target"])) + ts = linear_timeseries( + start_value=1, + end_value=min_ts_length, + length=min_ts_length, + column_name="price", + ).stack( + linear_timeseries( + start_value=102, + end_value=100 + 2 * min_ts_length, + length=min_ts_length, + column_name="power", + ) + ) + model.fit(ts) + shap_explain = ShapExplainer(model) + + # one column per lag, grouped by components + expected_df = pd.DataFrame( + data=np.stack( + [np.arange(1, 29), np.arange(3, 31), np.arange(106, 161, 2)], axis=1 + ), + columns=["price_target_lag-3", "price_target_lag-2", "power_target_lag-1"], + ) + + # check that the appropriate lags are extracted + assert all(shap_explain.explainers.background_X == expected_df) + assert model.lagged_feature_names == list(expected_df.columns) From d682f1303e11f0ab71ee52da21586cd412646ef0 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Tue, 29 Aug 2023 15:51:10 +0200 Subject: [PATCH 20/30] fix: shapexplainer 
extract the appropriate lags, updated the type hints --- darts/explainability/shap_explainer.py | 12 ++--- darts/models/forecasting/lgbm.py | 48 +++++++++++------ .../forecasting/linear_regression_model.py | 53 +++++++++++++------ darts/models/forecasting/random_forest.py | 48 ++++++++++++----- darts/models/forecasting/xgboost.py | 53 +++++++++++++------ darts/utils/data/tabularization.py | 17 +++--- 6 files changed, 157 insertions(+), 74 deletions(-) diff --git a/darts/explainability/shap_explainer.py b/darts/explainability/shap_explainer.py index 29be9d5e3d..143ea0d8b9 100644 --- a/darts/explainability/shap_explainer.py +++ b/darts/explainability/shap_explainer.py @@ -732,9 +732,9 @@ def _build_explainer_sklearn( def _create_regression_model_shap_X( self, - target_series, - past_covariates, - future_covariates, + target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]], + past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]], + future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]], n_samples=None, train=False, ) -> pd.DataFrame: @@ -746,9 +746,9 @@ def _create_regression_model_shap_X( """ - lags_list = self.model.lags.get("target") - lags_past_covariates_list = self.model.lags.get("past") - lags_future_covariates_list = self.model.lags.get("future") + lags_list = self.model._get_lags("target") + lags_past_covariates_list = self.model._get_lags("past") + lags_future_covariates_list = self.model._get_lags("future") X, indexes = create_lagged_prediction_data( target_series=target_series if lags_list else None, diff --git a/darts/models/forecasting/lgbm.py b/darts/models/forecasting/lgbm.py index 8f85fe3237..b4ef4c020a 100644 --- a/darts/models/forecasting/lgbm.py +++ b/darts/models/forecasting/lgbm.py @@ -10,13 +10,15 @@ https://github.com/unit8co/darts/blob/master/INSTALL.md """ -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Union import lightgbm as lgb import numpy as np 
from darts.logging import get_logger from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, RegressionModelWithCategoricalCovariates, _LikelihoodMixin, ) @@ -28,13 +30,13 @@ class LightGBMModel(RegressionModelWithCategoricalCovariates, _LikelihoodMixin): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, - likelihood: str = None, - quantiles: List[float] = None, + likelihood: Optional[str] = None, + quantiles: Optional[List[float]] = None, random_state: Optional[int] = None, multi_models: Optional[bool] = True, use_static_covariates: bool = True, @@ -48,17 +50,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. 
lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If an tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). 
The key + 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/models/forecasting/linear_regression_model.py b/darts/models/forecasting/linear_regression_model.py index 778619bae4..4e09a948ed 100644 --- a/darts/models/forecasting/linear_regression_model.py +++ b/darts/models/forecasting/linear_regression_model.py @@ -5,14 +5,19 @@ A forecasting model using a linear regression of some of the target series' lags, as well as optionally some covariate series lags in order to obtain a forecast. """ -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Union import numpy as np from scipy.optimize import linprog from sklearn.linear_model import LinearRegression, PoissonRegressor, QuantileRegressor from darts.logging import get_logger -from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin +from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, + RegressionModel, + _LikelihoodMixin, +) from darts.timeseries import TimeSeries logger = get_logger(__name__) @@ -21,13 +26,13 @@ class LinearRegressionModel(RegressionModel, _LikelihoodMixin): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, - likelihood: str = None, - quantiles: List[float] = None, + 
likelihood: Optional[str] = None, + quantiles: Optional[List[float]] = None, random_state: Optional[int] = None, multi_models: Optional[bool] = True, use_static_covariates: bool = True, @@ -38,17 +43,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). 
The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If an tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/models/forecasting/random_forest.py b/darts/models/forecasting/random_forest.py index 600f307302..fe9a4e4096 100644 --- a/darts/models/forecasting/random_forest.py +++ b/darts/models/forecasting/random_forest.py @@ -14,12 +14,16 @@ ---------- .. 
[1] https://en.wikipedia.org/wiki/Random_forest """ -from typing import List, Optional, Tuple, Union +from typing import Optional from sklearn.ensemble import RandomForestRegressor from darts.logging import get_logger -from darts.models.forecasting.regression_model import RegressionModel +from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, + RegressionModel, +) logger = get_logger(__name__) @@ -27,9 +31,9 @@ class RandomForest(RegressionModel): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, n_estimators: Optional[int] = 100, @@ -43,17 +47,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. 
If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. If an tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. 
Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/models/forecasting/xgboost.py b/darts/models/forecasting/xgboost.py index ef693f4723..302e190781 100644 --- a/darts/models/forecasting/xgboost.py +++ b/darts/models/forecasting/xgboost.py @@ -8,13 +8,18 @@ """ from functools import partial -from typing import List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Union import numpy as np import xgboost as xgb from darts.logging import get_logger -from darts.models.forecasting.regression_model import RegressionModel, _LikelihoodMixin +from darts.models.forecasting.regression_model import ( + FUTURE_LAGS_TYPE, + LAGS_TYPE, + RegressionModel, + _LikelihoodMixin, +) from darts.timeseries import TimeSeries from darts.utils.utils import raise_if_not @@ -43,13 +48,13 @@ def xgb_quantile_loss(labels: np.ndarray, preds: np.ndarray, quantile: float): class XGBModel(RegressionModel, _LikelihoodMixin): def __init__( self, - lags: Union[int, list] = None, - lags_past_covariates: Union[int, List[int]] = None, - lags_future_covariates: Union[Tuple[int, int], List[int]] = None, + lags: Optional[LAGS_TYPE] = None, + lags_past_covariates: Optional[LAGS_TYPE] = None, + lags_future_covariates: Optional[FUTURE_LAGS_TYPE] = None, output_chunk_length: int = 1, add_encoders: Optional[dict] = None, - likelihood: str = None, - quantiles: List[float] = None, + likelihood: Optional[str] = None, + quantiles: Optional[List[float]] = None, random_state: Optional[int] = None, multi_models: Optional[bool] = True, use_static_covariates: bool = True, @@ -60,17 +65,33 @@ def __init__( Parameters ---------- lags - Lagged target values used to predict the next time step. 
If an integer is given the last `lags` past lags - are used (from -1 backward). Otherwise a list of integers with lags is required (each lag must be < 0). + Lagged target `series` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags` past lags; e.g. `(-1, -2, ..., -lags)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `series` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_past_covariates - Number of lagged past_covariates values used to predict the next time step. If an integer is given the last - `lags_past_covariates` past lags are used (inclusive, starting from lag -1). Otherwise a list of integers - with lags < 0 is required. + Lagged `past_covariates` values used to predict the next time step/s. + If an integer, must be > 0. Uses the last `n=lags_past_covariates` past lags; e.g. `(-1, -2, ..., -lags)`, + where `0` corresponds to the first predicted time step of each sample. + If a list of integers, each value must be < 0. Uses only the specified values as lags. + If a dictionary, the keys correspond to the `past_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (integer or list of integers). The + key 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. lags_future_covariates - Number of lagged future_covariates values used to predict the next time step. 
If a tuple (past, future) is - given the last `past` lags in the past are used (inclusive, starting from lag -1) along with the first - `future` future lags (starting from 0 - the prediction time - up to `future - 1` included). Otherwise a list - of integers with lags is required. + Lagged `future_covariates` values used to predict the next time step/s. + If a tuple of `(past, future)`, both values must be > 0. Uses the last `n=past` past lags and `n=future` + future lags; e.g. `(-past, -(past - 1), ..., -1, 0, 1, .... future - 1)`, where `0` + corresponds the first predicted time step of each sample. + If a list of integers, uses only the specified values as lags. + If a dictionary, the keys correspond to the `future_covariates` component names (of the first series when + using multiple series) and the values correspond to the component lags (tuple or list of integers). The key + 'default_lags' can be used to provide default lags for un-specified components. Raises and error if some + components are missing and the 'default_lags' key is not provided. output_chunk_length Number of time steps predicted at once by the internal regression model. Does not have to equal the forecast horizon `n` used in `predict()`. 
However, setting `output_chunk_length` equal to the forecast horizon may diff --git a/darts/utils/data/tabularization.py b/darts/utils/data/tabularization.py index 404452ba74..835d793196 100644 --- a/darts/utils/data/tabularization.py +++ b/darts/utils/data/tabularization.py @@ -475,9 +475,9 @@ def create_lagged_prediction_data( target_series: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None, - lags: Optional[Sequence[int]] = None, - lags_past_covariates: Optional[Sequence[int]] = None, - lags_future_covariates: Optional[Sequence[int]] = None, + lags: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_past_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, + lags_future_covariates: Optional[Union[Sequence[int], Dict[str, List[int]]]] = None, uses_static_covariates: bool = True, last_static_covariates_shape: Optional[Tuple[int, int]] = None, max_samples_per_ts: Optional[int] = None, @@ -508,15 +508,18 @@ def create_lagged_prediction_data( Optionally, the lags of the target series to be used as (auto-regressive) features. If not specified, auto-regressive features will *not* be added to `X`. Each lag value is assumed to be negative (e.g. `lags = [-3, -1]` will extract `target_series` values which are 3 timesteps and 1 timestep away from - the current value). + the current value). If the lags are provided as a dictionary, the lags values are specific to each + component in the target series. lags_past_covariates Optionally, the lags of `past_covariates` to be used as features. Like `lags`, each lag value is assumed to - be less than or equal to -1. + be less than or equal to -1. If the lags are provided as a dictionary, the lags values are specific to each + component in the past covariates series. 
lags_future_covariates Optionally, the lags of `future_covariates` to be used as features. Unlike `lags` and `lags_past_covariates`, `lags_future_covariates` values can be positive (i.e. use values *after* time `t` to predict target at time `t`), zero (i.e. use values *at* time `t` to predict target at time `t`), and/or negative (i.e. use - values *before* time `t` to predict target at time `t`). + values *before* time `t` to predict target at time `t`). If the lags are provided as a dictionary, the lags + values are specific to each component in the future covariates series. uses_static_covariates Whether the model uses/expects static covariates. If `True`, it enforces that static covariates must have identical shapes across all target series. @@ -793,7 +796,7 @@ def create_lagged_component_names( def _create_lagged_data_by_moving_window( - target_series: TimeSeries, + target_series: Optional[TimeSeries], output_chunk_length: int, past_covariates: Optional[TimeSeries], future_covariates: Optional[TimeSeries], From 96f1a7f07e4dee03605b577ceaeb70ccc372bf26 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Thu, 31 Aug 2023 10:07:24 +0200 Subject: [PATCH 21/30] fix: passing covariates when trained on multiple series --- Dockerfile | 32 +++++----- .../forecasting/test_regression_models.py | 61 ++++++++++++++++--- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index b604f92713..bf07201462 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,24 @@ -FROM jupyter/base-notebook:python-3.9.5 +FROM ubuntu:latest -RUN conda update --all -y --quiet \ - && conda install -c conda-forge ipywidgets -y --quiet \ - && conda clean --all -f -y +# setup packages +RUN apt-get update -y +RUN apt-get install -y python3 python-is-python3 python3-pip default-jre +RUN pip install --upgrade pip -USER root +# install python requirements before copying the rest of the files +# this way we can cache the requirements and not have to reinstall them +COPY 
requirements/ /app/requirements/ +RUN pip install -r /app/requirements/dev-all.txt -# to build pystan -RUN apt-get update \ - && apt-get -y install build-essential \ - && apt-get clean && rm -rf /var/lib/apt/lists/* +# copy local files +COPY . /app -USER $NB_USER +# set work directory +WORKDIR /app -ADD . /home/jovyan/work +# install darts +RUN pip install -e . -WORKDIR /home/jovyan/work - -RUN pip install . +# assuming you are working out of your darts directory: +# docker build . -t darts-test:latest +# docker run -it -v $(pwd)/:/app/ darts-test:latest bash \ No newline at end of file diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 8786ecbcc1..1ef940a6c9 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1647,6 +1647,15 @@ def test_component_specific_lags(self, config): series = tg.gaussian_timeseries(length=20, column_name="gaussian") if multivar_target: series = series.stack(tg.sine_timeseries(length=20, column_name="sine")) + + future_cov = tg.linear_timeseries(length=30, column_name="lin_future") + if multivar_future_cov: + future_cov = future_cov.stack( + tg.sine_timeseries(length=30, column_name="sine_future") + ) + + past_cov = tg.linear_timeseries(length=30, column_name="lin_past") + if multiple_series: # second series have different component names series = [ @@ -1658,13 +1667,9 @@ def test_component_specific_lags(self, config): + 10, ] - future_cov = tg.linear_timeseries(length=30, column_name="lin_future") - if multivar_future_cov: - future_cov = future_cov.stack( - tg.sine_timeseries(length=30, column_name="sine_future") - ) + past_cov = [past_cov, past_cov] - past_cov = tg.linear_timeseries(length=30, column_name="lin_past") + future_cov = [future_cov, future_cov] # the lags are identical across the components for each series model = LinearRegressionModel(**list_lags) @@ -1683,14 
+1688,50 @@ def test_component_specific_lags(self, config): ) # n == output_chunk_length - pred = model.predict(1, series=series[0] if multiple_series else None) - pred2 = model2.predict(1, series=series[0] if multiple_series else None) + pred = model.predict( + 1, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + pred2 = model2.predict( + 1, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model2.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model2.supports_future_covariates + else None, + ) np.testing.assert_array_almost_equal(pred.values(), pred2.values()) assert pred.time_index.equals(pred2.time_index) # n > output_chunk_length - pred = model.predict(3, series=series[0] if multiple_series else None) - pred2 = model2.predict(3, series=series[0] if multiple_series else None) + pred = model.predict( + 3, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + pred2 = model2.predict( + 3, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model2.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model2.supports_future_covariates + else None, + ) np.testing.assert_array_almost_equal(pred.values(), pred2.values()) assert pred.time_index.equals(pred2.time_index) From d987141279b2188fecea11ec8702f69cc885a8d5 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Thu, 31 Aug 2023 14:17:41 +0200 Subject: [PATCH 22/30] fix: moved the series components consistency to 
create_lagged_data to limit iteration of the series --- darts/models/forecasting/forecasting_model.py | 14 ---------- darts/models/forecasting/regression_model.py | 26 +++++++++++++++---- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/darts/models/forecasting/forecasting_model.py b/darts/models/forecasting/forecasting_model.py index a848674570..452d2368cd 100644 --- a/darts/models/forecasting/forecasting_model.py +++ b/darts/models/forecasting/forecasting_model.py @@ -2078,20 +2078,6 @@ def fit( ): self.static_covariates = series.static_covariates else: - # check that all the ts within one group have the same number of components - for ts_sequence, cov_name in zip( - [series, past_covariates, future_covariates], - ["series", "past_covariates", "future_covariates"], - ): - raise_if( - ts_sequence is not None - and not all( - [ts_sequence[0].width == ts.width for ts in ts_sequence] - ), - f"All the series in `{cov_name}` should have the same number of components", - logger, - ) - if past_covariates is not None: self._expect_past_covariates = True if future_covariates is not None: diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 6b0c52842a..164817c2fa 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -456,7 +456,11 @@ def _get_last_prediction_time(self, series, forecast_horizon, overlap_end): return last_valid_pred_time def _create_lagged_data( - self, target_series, past_covariates, future_covariates, max_samples_per_ts + self, + target_series: Sequence[TimeSeries], + past_covariates: Sequence[TimeSeries], + future_covariates: Sequence[TimeSeries], + max_samples_per_ts: int, ): ( features, @@ -479,7 +483,19 @@ def _create_lagged_data( concatenate=False, ) + expected_nb_feat = ( + features[0].shape[1] + if isinstance(features, Sequence) + else features.shape[1] + ) for i, (X_i, y_i) in enumerate(zip(features, labels)): + # number 
of components inconsistency, cannot determine from which argument without iterating + raise_if( + expected_nb_feat != X_i.shape[1], + "When `series`, `past_covariates` or `future_covariates` is provided as a `Sequence[TimeSeries]`, " + "all the `TimeSeries` in the `Sequence` must have the same number of components.", + logger, + ) features[i] = X_i[:, :, 0] labels[i] = y_i[:, :, 0] @@ -490,10 +506,10 @@ def _create_lagged_data( def _fit_model( self, - target_series, - past_covariates, - future_covariates, - max_samples_per_ts, + target_series: Sequence[TimeSeries], + past_covariates: Sequence[TimeSeries], + future_covariates: Sequence[TimeSeries], + max_samples_per_ts: int, **kwargs, ): """ From 70467cf7fa4972f230bcb731a86bc8ff9bf011eb Mon Sep 17 00:00:00 2001 From: madtoinou Date: Thu, 31 Aug 2023 15:27:18 +0200 Subject: [PATCH 23/30] fix: improved the error message for components inconsistency, improve tests parametrization --- darts/models/forecasting/regression_model.py | 21 +++++--- .../forecasting/test_regression_models.py | 48 ++++++++++++++----- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 164817c2fa..00310d0ea9 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -489,13 +489,20 @@ def _create_lagged_data( else features.shape[1] ) for i, (X_i, y_i) in enumerate(zip(features, labels)): - # number of components inconsistency, cannot determine from which argument without iterating - raise_if( - expected_nb_feat != X_i.shape[1], - "When `series`, `past_covariates` or `future_covariates` is provided as a `Sequence[TimeSeries]`, " - "all the `TimeSeries` in the `Sequence` must have the same number of components.", - logger, - ) + # TODO: account for scenario where two wrong shapes can silently hide the problem + if expected_nb_feat != X_i.shape[1]: + shape_error_msg = [] + for ts, 
cov_name, arg_name in zip( + [target_series, past_covariates, future_covariates], + ["target", "past", "future"], + ["series", "past_covariates", "future_covariates"], + ): + if ts is not None and ts[i].width != self.input_dim[cov_name]: + shape_error_msg.append( + f"Expected {self.input_dim[cov_name]} components but received " + f"{target_series[i].width} components at index {i} of `{arg_name}`." + ) + raise_log(ValueError("\n".join(shape_error_msg)), logger) features[i] = X_i[:, :, 0] labels[i] = y_i[:, :, 0] diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 1ef940a6c9..c6446ae426 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -27,7 +27,6 @@ RegressionModel, XGBModel, ) -from darts.models.forecasting.forecasting_model import GlobalForecastingModel from darts.utils import timeseries_generation as tg from darts.utils.multioutput import MultiOutputRegressor @@ -1019,11 +1018,33 @@ def test_models_runnability(self, config): def test_fit(self, config): # test fitting both on univariate and multivariate timeseries model, mode, series = config + # auto-regression but past_covariates does not extend enough in the future with pytest.raises(ValueError): model_instance = model(lags=4, lags_past_covariates=4, multi_models=mode) model_instance.fit(series=series, past_covariates=self.sine_multivariate1) model_instance.predict(n=10) + # inconsistent number of components in series Sequence[TimeSeries] + with pytest.raises(ValueError) as err: + model_instance = model(lags=4, multi_models=mode) + model_instance.fit(series=[series.stack(series + 10), series]) + assert ( + str(err.value) + == "Expected 2 components but received 1 components at index 1 of `series`" + ) + + # inconsistent number of components in past_covariates Sequence[TimeSeries] + with pytest.raises(ValueError) as err: + model_instance = 
model(lags=4, lags_past_covariates=2, multi_models=mode) + model_instance.fit( + series=[series, series + 10], + past_covariates=[self.sine_univariate1, self.sine_multivariate1], + ) + assert ( + str(err.value) + == "Expected 1 components but received 2 components at index 1 of `past_covariates`" + ) + model_instance = model(lags=12, multi_models=mode) model_instance.fit(series=series) assert model_instance.lags.get("past") is None @@ -2468,29 +2489,34 @@ def test_fit_predict_determinism(self, config): @pytest.mark.parametrize( "config", itertools.product(models_cls_kwargs_errs, [True, False]) ) - def test_probabilistic_forecast_accuracy(self, config): + def test_probabilistic_forecast_accuracy_univariate(self, config): (model_cls, model_kwargs, err), mode = config model_kwargs["multi_models"] = mode + model = model_cls(**model_kwargs) self.helper_test_probabilistic_forecast_accuracy( - model_cls, - model_kwargs, + model, err, self.constant_ts, self.constant_noisy_ts, ) - if issubclass(model_cls, GlobalForecastingModel): + + @pytest.mark.slow + @pytest.mark.parametrize( + "config", itertools.product(models_cls_kwargs_errs, [True, False]) + ) + def test_probabilistic_forecast_accuracy_multivariate(self, config): + (model_cls, model_kwargs, err), mode = config + model_kwargs["multi_models"] = mode + model = model_cls(**model_kwargs) + if model.supports_multivariate: self.helper_test_probabilistic_forecast_accuracy( - model_cls, - model_kwargs, + model, err, self.constant_multivar_ts, self.constant_noisy_multivar_ts, ) - def helper_test_probabilistic_forecast_accuracy( - self, model_cls, model_kwargs, err, ts, noisy_ts - ): - model = model_cls(**model_kwargs) + def helper_test_probabilistic_forecast_accuracy(self, model, err, ts, noisy_ts): model.fit(noisy_ts[:100]) pred = model.predict(n=100, num_samples=100) From f2a9e0802bcd670310873ee3f26c089207f0ec63 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Fri, 1 Sep 2023 18:01:52 +0200 Subject: [PATCH 24/30] fix: 
addressing reviewer comments --- darts/models/forecasting/regression_model.py | 43 ++++++++----------- .../explainability/test_shap_explainer.py | 14 +++++- .../forecasting/test_regression_models.py | 24 ++++++----- 3 files changed, 44 insertions(+), 37 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 00310d0ea9..497de459fb 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -229,24 +229,9 @@ def _generate_lags( if lags_values is None: continue - # check type of argument before converting to dictionary + # converting to dictionary to run sanity checks if not isinstance(lags_values, dict): - raise_if( - lags_name == "lags_future_covariates" - and not isinstance(lags_values, (tuple, list)), - f"`lags_future_covariates` must be of type tuple, list or dict." - f"Given: {type(lags_values)}.", - ) - - raise_if( - lags_name in ["lags", "lags_past_covariates"] - and not isinstance(lags_values, (int, list)), - f"`{lags_name}` must be of type int, list or dict." - f"Given: {type(lags_values)}.", - ) - lags_values = {"default_lags": lags_values} - elif len(lags_values) == 0: raise_log( ValueError( @@ -267,24 +252,26 @@ def _generate_lags( len(comp_lags) == 2 and isinstance(comp_lags[0], int) and isinstance(comp_lags[1], int), - f"`{lags_name}` tuple must be of length 2, and must contain two integers", + f"`{lags_name}` - `{comp_name}`: tuple must be of length 2, and must contain two integers", logger, ) raise_if( isinstance(comp_lags[0], bool) or isinstance(comp_lags[1], bool), - f"`{lags_name}` tuple must contain integers, not bool", + f"`{lags_name}` - `{comp_name}`: tuple must contain integers, not bool", logger, ) raise_if_not( comp_lags[0] >= 0 and comp_lags[1] >= 0, - f"`{lags_name}` tuple must contain positive integers. Given: {comp_lags}.", + f"`{lags_name}` - `{comp_name}`: tuple must contain positive integers. 
Given: {comp_lags}.", + logger, ) raise_if( comp_lags[0] == 0 and comp_lags[1] == 0, - f"`{lags_name}` tuple cannot be (0, 0) as it corresponds to an empty list of lags.", + f"`{lags_name}` - `{comp_name}`: tuple cannot be (0, 0) as it corresponds to an empty " + f"list of lags.", logger, ) tmp_components_lags[comp_name] = list( @@ -294,7 +281,8 @@ def _generate_lags( for lag in comp_lags: raise_if( not isinstance(lag, int) or isinstance(lag, bool), - f"`{lags_name}` list must contain only integers. Given: {comp_lags}.", + f"`{lags_name}` - `{comp_name}`: list must contain only integers. Given: {comp_lags}.", + logger, ) tmp_components_lags[comp_name] = sorted(comp_lags) else: @@ -304,14 +292,17 @@ def _generate_lags( if isinstance(comp_lags, int): raise_if_not( comp_lags > 0, - f"`{lags_name}` integer must be strictly positive . Given: {comp_lags}.", + f"`{lags_name}` - `{comp_name}`: integer must be strictly positive . Given: {comp_lags}.", + logger, ) tmp_components_lags[comp_name] = list(range(-comp_lags, 0)) elif isinstance(comp_lags, list): for lag in comp_lags: raise_if( not isinstance(lag, int) or (lag >= 0), - f"`{lags_name}` list must contain only strictly negative integers. Given: {comp_lags}.", + f"`{lags_name}` - `{comp_name}`: list must contain only strictly negative integers. " + f"Given: {comp_lags}.", + logger, ) tmp_components_lags[comp_name] = sorted(comp_lags) else: @@ -321,8 +312,8 @@ def _generate_lags( if invalid_type: raise_log( ValueError( - f"When passed in a dictionary, `{lags_name}` for component {comp_name} must be either a " - f"{supported_types}, received : {type(comp_lags)}." + f"`{lags_name}` - `{comp_name}`: must be either a {supported_types}. " + f"Gived : {type(comp_lags)}." 
), logger, ) @@ -500,7 +491,7 @@ def _create_lagged_data( if ts is not None and ts[i].width != self.input_dim[cov_name]: shape_error_msg.append( f"Expected {self.input_dim[cov_name]} components but received " - f"{target_series[i].width} components at index {i} of `{arg_name}`." + f"{ts[i].width} components at index {i} of `{arg_name}`." ) raise_log(ValueError("\n".join(shape_error_msg)), logger) features[i] = X_i[:, :, 0] diff --git a/darts/tests/explainability/test_shap_explainer.py b/darts/tests/explainability/test_shap_explainer.py index 2ea6e61f97..a5e950adb4 100644 --- a/darts/tests/explainability/test_shap_explainer.py +++ b/darts/tests/explainability/test_shap_explainer.py @@ -830,13 +830,25 @@ def test_shap_regressor_component_specific_lags(self): shap_explain = ShapExplainer(model) # one column per lag, grouped by components + expected_columns = [ + "price_target_lag-3", + "price_target_lag-2", + "power_target_lag-1", + ] expected_df = pd.DataFrame( data=np.stack( [np.arange(1, 29), np.arange(3, 31), np.arange(106, 161, 2)], axis=1 ), - columns=["price_target_lag-3", "price_target_lag-2", "power_target_lag-1"], + columns=expected_columns, ) # check that the appropriate lags are extracted assert all(shap_explain.explainers.background_X == expected_df) assert model.lagged_feature_names == list(expected_df.columns) + + # check that explain() can be called + explanation_results = shap_explain.explain() + plt.close() + for comp in ts.components: + comps_out = explanation_results.explained_forecasts[1][comp].columns + assert all(comps_out == expected_columns) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index c6446ae426..09afd4a773 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1025,25 +1025,29 @@ def test_fit(self, config): model_instance.predict(n=10) # inconsistent number of components 
in series Sequence[TimeSeries] + training_series = [series.stack(series + 10), series] with pytest.raises(ValueError) as err: model_instance = model(lags=4, multi_models=mode) - model_instance.fit(series=[series.stack(series + 10), series]) - assert ( - str(err.value) - == "Expected 2 components but received 1 components at index 1 of `series`" - ) + model_instance.fit(series=training_series) + assert ( + str(err.value) + == f"Expected {training_series[0].width} components but received {training_series[1].width} " + f"components at index 1 of `series`." + ) # inconsistent number of components in past_covariates Sequence[TimeSeries] + training_past_covs = [series, series.stack(series * 2)] with pytest.raises(ValueError) as err: model_instance = model(lags=4, lags_past_covariates=2, multi_models=mode) model_instance.fit( series=[series, series + 10], - past_covariates=[self.sine_univariate1, self.sine_multivariate1], - ) - assert ( - str(err.value) - == "Expected 1 components but received 2 components at index 1 of `past_covariates`" + past_covariates=training_past_covs, ) + assert ( + str(err.value) + == f"Expected {training_past_covs[0].width} components but received {training_past_covs[1].width} " + f"components at index 1 of `past_covariates`." + ) model_instance = model(lags=12, multi_models=mode) model_instance.fit(series=series) From f0967f65b36425fd1d2a9901c755d09392bf3deb Mon Sep 17 00:00:00 2001 From: madtoinou <32447896+madtoinou@users.noreply.github.com> Date: Fri, 1 Sep 2023 18:04:20 +0200 Subject: [PATCH 25/30] Apply suggestions from code review Co-authored-by: Dennis Bader --- CHANGELOG.md | 2 +- darts/models/forecasting/regression_model.py | 4 +--- darts/tests/models/forecasting/test_regression_models.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 785147bdff..e9a5186fbd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ but cannot always guarantee backwards compatibility. 
Changes that may **break co - `TimeSeries` with a `RangeIndex` starting in the negative start are now supported by `historical_forecasts`. [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou). - Added a new argument `start_format` to `historical_forecasts()`, `backtest()` and `gridsearch` that allows to use an integer `start` either as the index position or index value/label for `series` indexed with a `pd.RangeIndex`. [#1866](https://github.com/unit8co/darts/pull/1866) by [Antoine Madrona](https://github.com/madtoinou). - Reduced the size of the Darts docker image `unit8/darts:latest`, and included all optional models as well as dev requirements. [#1878](https://github.com/unit8co/darts/pull/1878) by [Alex Colpitts](https://github.com/alexcolpitts96). -- `RegressionModel` can now be created with different lags for each component of the target and past/future covariates series. [#1962](https://github.com/unit8co/darts/pull/1962) by [Antoine Madrona](https://github.com/madtoinou). +- All `RegressionModel`s now support component/column-specific lags for target, past, and future covariates series. [#1962](https://github.com/unit8co/darts/pull/1962) by [Antoine Madrona](https://github.com/madtoinou). **Fixed** - Fixed a bug in `TimeSeries.from_dataframe()` when using a pandas.DataFrame with `df.columns.name != None`. [#1938](https://github.com/unit8co/darts/pull/1938) by [Antoine Madrona](https://github.com/madtoinou). 
diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 497de459fb..e765201a6b 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -193,14 +193,12 @@ def encode_year(idx): ) # convert lags arguments to list of int - processed_lags, processed_component_lags = self._generate_lags( + self.lags, self.component_lags = self._generate_lags( lags=lags, lags_past_covariates=lags_past_covariates, lags_future_covariates=lags_future_covariates, ) - self.lags = processed_lags - self.component_lags = processed_component_lags self.pred_dim = self.output_chunk_length if self.multi_models else 1 diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 09afd4a773..9149239dc4 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -545,7 +545,7 @@ def test_training_data_creation(self, mode): # cannot use 'default_lags' because it's converted in `fit()`, before calling `_created_lagged_data` model_instance = RegressionModel( lags={"0-trgt-0": [-4, -3], "0-trgt-1": [-3, -2], "0-trgt-2": [-2, -1]}, - lags_past_covariates={"0-pcov-0": [-10], "0-pvoc-1": [-7]}, + lags_past_covariates={"0-pcov-0": [-10], "0-pcov-1": [-7]}, lags_future_covariates={"0-fcov-0": (2, 2)}, multi_models=mode, ) From be536952423c1977f98aa42f06d0b5ce87f2ec11 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 4 Sep 2023 09:18:12 +0200 Subject: [PATCH 26/30] test: checking that the name of the features is correctly generated when using dict to define the lags --- .../forecasting/test_regression_models.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 9149239dc4..9ce0a4fa85 100644 --- 
a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -590,6 +590,27 @@ def test_training_data_creation(self, mode): 20098, # future cov; target + 20'000 ] + # checking the name of the lagged features + model_instance.fit( + series=self.target_series[0], + past_covariates=self.past_covariates[0], + future_covariates=self.future_covariates[0], + ) + assert model_instance.lagged_feature_names == [ + "0-trgt-0_target_lag-4", + "0-trgt-0_target_lag-3", + "0-trgt-1_target_lag-3", + "0-trgt-1_target_lag-2", + "0-trgt-2_target_lag-2", + "0-trgt-2_target_lag-1", + "0-pcov-0_pastcov_lag-10", + "0-pcov-1_pastcov_lag-7", + "0-fcov-0_futcov_lag-2", + "0-fcov-0_futcov_lag-1", + "0-fcov-0_futcov_lag0", + "0-fcov-0_futcov_lag1", + ] + @pytest.mark.parametrize("mode", [True, False]) def test_prediction_data_creation(self, mode): # assigning correct names to variables From 1b2bd4c481a9a3e438f796930ce79e6d330f9fe3 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 4 Sep 2023 09:19:57 +0200 Subject: [PATCH 27/30] fix: linting --- darts/models/forecasting/regression_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index e765201a6b..539f74bc41 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -199,7 +199,6 @@ def encode_year(idx): lags_future_covariates=lags_future_covariates, ) - self.pred_dim = self.output_chunk_length if self.multi_models else 1 def _generate_lags( From 1ea2c7f0a92885d78062f5e6f763efd0c688188f Mon Sep 17 00:00:00 2001 From: madtoinou Date: Mon, 4 Sep 2023 11:23:53 +0200 Subject: [PATCH 28/30] fix: updating the error msg --- .../tabularization/test_create_lagged_prediction_data.py | 4 ++-- .../utils/tabularization/test_create_lagged_training_data.py | 4 ++-- darts/tests/utils/tabularization/test_get_feature_times.py | 4 ++-- 3 files 
changed, 6 insertions(+), 6 deletions(-) diff --git a/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py b/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py index 3c46330022..4bff71fbe9 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_prediction_data.py @@ -1396,7 +1396,7 @@ def test_lagged_prediction_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags` must be a `Sequence` containing only `int` values less than 0." + "`lags` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # Test invalid `lags_past_covariates` values: with pytest.raises(ValueError) as err: @@ -1407,7 +1407,7 @@ def test_lagged_prediction_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags_past_covariates` must be a `Sequence` containing only `int` values less than 0." + "`lags_past_covariates` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # This should *not* throw an error: create_lagged_prediction_data( diff --git a/darts/tests/utils/tabularization/test_create_lagged_training_data.py b/darts/tests/utils/tabularization/test_create_lagged_training_data.py index b17a3f862c..98f515e545 100644 --- a/darts/tests/utils/tabularization/test_create_lagged_training_data.py +++ b/darts/tests/utils/tabularization/test_create_lagged_training_data.py @@ -1695,7 +1695,7 @@ def test_lagged_training_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags` must be a `Sequence` containing only `int` values less than 0." + "`lags` must be a `Sequence` or `Dict` containing only `int` values less than 0." 
) == str(err.value) # Test invalid `lags_past_covariates` values: with pytest.raises(ValueError) as err: @@ -1708,7 +1708,7 @@ def test_lagged_training_data_invalid_lag_values_error(self): use_moving_windows=use_moving_windows, ) assert ( - "`lags_past_covariates` must be a `Sequence` containing only `int` values less than 0." + "`lags_past_covariates` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # Test invalid `lags_future_covariates` values: create_lagged_training_data( diff --git a/darts/tests/utils/tabularization/test_get_feature_times.py b/darts/tests/utils/tabularization/test_get_feature_times.py index 6402fc2d32..e63a8e4057 100644 --- a/darts/tests/utils/tabularization/test_get_feature_times.py +++ b/darts/tests/utils/tabularization/test_get_feature_times.py @@ -1055,7 +1055,7 @@ def test_feature_times_invalid_lag_values_error(self): with pytest.raises(ValueError) as err: _get_feature_times(target_series=series, lags=[0], is_training=False) assert ( - "`lags` must be a `Sequence` containing only `int` values less than 0." + "`lags` must be a `Sequence` or `Dict` containing only `int` values less than 0." ) == str(err.value) # `lags_past_covariates` not <= -1: with pytest.raises(ValueError) as err: @@ -1063,7 +1063,7 @@ def test_feature_times_invalid_lag_values_error(self): past_covariates=series, lags_past_covariates=[0], is_training=False ) assert ( - "`lags_past_covariates` must be a `Sequence` containing only `int` values less than 0." + "`lags_past_covariates` must be a `Sequence` or `Dict` containing only `int` values less than 0." 
) == str(err.value) # `lags_future_covariates` can be positive, negative, and/or zero - no error should be thrown: _get_feature_times( From 970d8a3952a42d7d7b40c8ac293b7692d4d0d224 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Thu, 14 Sep 2023 10:19:15 +0200 Subject: [PATCH 29/30] fix: bug when the number of lags is different across components --- darts/models/forecasting/regression_model.py | 8 +- .../forecasting/test_regression_models.py | 98 ++++++++++++++++++- 2 files changed, 101 insertions(+), 5 deletions(-) diff --git a/darts/models/forecasting/regression_model.py b/darts/models/forecasting/regression_model.py index 539f74bc41..2d052eafbf 100644 --- a/darts/models/forecasting/regression_model.py +++ b/darts/models/forecasting/regression_model.py @@ -914,7 +914,9 @@ def predict( ] # values are grouped by component np_X.append( - np.concatenate(tmp_X).reshape(len(series) * num_samples, -1) + np.concatenate(tmp_X, axis=1).reshape( + len(series) * num_samples, -1 + ) ) else: # values are grouped by lags @@ -943,7 +945,9 @@ def predict( ) ] np_X.append( - np.concatenate(tmp_X).reshape(len(series) * num_samples, -1) + np.concatenate(tmp_X, axis=1).reshape( + len(series) * num_samples, -1 + ) ) else: np_X.append( diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 9ce0a4fa85..852fa4adec 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1680,7 +1680,7 @@ def test_integer_indexed_series(self, mode): [True, False], ), ) - def test_component_specific_lags(self, config): + def test_component_specific_lags_forecasts(self, config): """Verify that the same lags, defined using int/list or dictionnaries yield the same results""" (list_lags, dict_lags), multiple_series = config multivar_target = "lags" in dict_lags and len(dict_lags["lags"]) > 1 @@ -1712,9 +1712,7 @@ def test_component_specific_lags(self, 
config): ) + 10, ] - past_cov = [past_cov, past_cov] - future_cov = [future_cov, future_cov] # the lags are identical across the components for each series @@ -1781,6 +1779,100 @@ def test_component_specific_lags(self, config): np.testing.assert_array_almost_equal(pred.values(), pred2.values()) assert pred.time_index.equals(pred2.time_index) + @pytest.mark.parametrize( + "config", + itertools.product( + [ + {"lags": {"gaussian": [-1, -3], "sine": [-2, -4, -6]}}, + {"lags_past_covariates": {"default_lags": 2}}, + { + "lags": { + "gaussian": [-5, -2, -1], + "sine": [-2, -1], + }, + "lags_future_covariates": { + "lin_future": (1, 4), + "default_lags": (2, 2), + }, + }, + { + "lags": { + "default_lags": [-5, -4], + }, + "lags_future_covariates": { + "sine_future": (1, 1), + "default_lags": [-2, 4, 6, 7, 8], + }, + }, + ], + [True, False], + ), + ) + def test_component_specific_lags(self, config): + """Checking various combinations of component-specific lags""" + (dict_lags, multiple_series) = config + multivar_target = "lags" in dict_lags and len(dict_lags["lags"]) > 1 + multivar_future_cov = ( + "lags_future_covariates" in dict_lags + and len(dict_lags["lags_future_covariates"]) > 1 + ) + + # create series based on the model parameters + series = tg.gaussian_timeseries(length=20, column_name="gaussian") + if multivar_target: + series = series.stack(tg.sine_timeseries(length=20, column_name="sine")) + + future_cov = tg.linear_timeseries(length=30, column_name="lin_future") + if multivar_future_cov: + future_cov = future_cov.stack( + tg.sine_timeseries(length=30, column_name="sine_future") + ) + + past_cov = tg.linear_timeseries(length=30, column_name="lin_past") + + if multiple_series: + # second series has different component names + series = [ + series, + series.with_columns_renamed( + ["gaussian", "sine"][: series.width], + ["other", "names"][: series.width], + ) + + 10, + ] + past_cov = [past_cov, past_cov] + future_cov = [future_cov, future_cov] + + model = 
LinearRegressionModel(**dict_lags, output_chunk_length=4) + model.fit( + series=series, + past_covariates=past_cov if model.supports_past_covariates else None, + future_covariates=future_cov if model.supports_future_covariates else None, + ) + # n < output_chunk_length + model.predict( + 1, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + + # n > output_chunk_length + model.predict( + 7, + series=series[0] if multiple_series else None, + past_covariates=past_cov[0] + if multiple_series and model.supports_past_covariates + else None, + future_covariates=future_cov[0] + if multiple_series and model.supports_future_covariates + else None, + ) + @pytest.mark.parametrize( "config", itertools.product( From edf855461750d750cf2cb65cf5a96b5483a32984 Mon Sep 17 00:00:00 2001 From: madtoinou Date: Thu, 14 Sep 2023 10:30:04 +0200 Subject: [PATCH 30/30] fix: future lags in test --- darts/tests/models/forecasting/test_regression_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darts/tests/models/forecasting/test_regression_models.py b/darts/tests/models/forecasting/test_regression_models.py index 6e2566a627..9d5c369526 100644 --- a/darts/tests/models/forecasting/test_regression_models.py +++ b/darts/tests/models/forecasting/test_regression_models.py @@ -1811,7 +1811,7 @@ def test_component_specific_lags_forecasts(self, config): }, "lags_future_covariates": { "sine_future": (1, 1), - "default_lags": [-2, 4, 6, 7, 8], + "default_lags": [-2, 0, 1, 2], }, }, ],