From c00c301975a4455e4d30a8454360de1a568f5c2b Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Sun, 12 Feb 2023 11:32:49 -0300
Subject: [PATCH 01/31] update import

---
 lightwood/mixer/gluonts.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index ce264bef4..6083396cd 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -8,8 +8,7 @@
 
 from gluonts.dataset.pandas import PandasDataset
-from gluonts.model.deepar import DeepAREstimator  # @TODO: support for other estimators
-from gluonts.mx import Trainer
+from gluonts.mx import DeepAREstimator, Trainer  # @TODO: support for other estimators
 from gluonts.mx.trainer.callback import TrainingHistory
 from gluonts.mx.distribution.student_t import StudentTOutput

From aff0d908b49d5c0418fb257f5c9b25a7ea8b57d8 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Sun, 12 Feb 2023 14:28:29 -0300
Subject: [PATCH 02/31] faster, no indexing

---
 lightwood/data/encoded_ds.py                  |  6 +--
 lightwood/mixer/gluonts.py                    | 41 ++++++++--------
 lightwood/mixer/lightgbm_array.py             |  2 +-
 lightwood/mixer/neural_ts.py                  |  2 +-
 lightwood/mixer/nhits.py                      |  2 +-
 lightwood/mixer/sktime.py                     |  2 +-
 tests/integration/advanced/test_timeseries.py | 48 +++++++++----------
 7 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py
index 504335a07..b7f90993f 100644
--- a/lightwood/data/encoded_ds.py
+++ b/lightwood/data/encoded_ds.py
@@ -144,7 +144,7 @@ class ConcatedEncodedDs(EncodedDs):
     def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None:
         # @TODO: missing super() call here?
         self.encoded_ds_arr = encoded_ds_arr
-        self.encoded_ds_lenghts = [len(x) for x in self.encoded_ds_arr]
+        self.encoded_ds_lengths = [len(x) for x in self.encoded_ds_arr]
         self.encoders = self.encoded_ds_arr[0].encoders
         self.encoder_spans = self.encoded_ds_arr[0].encoder_spans
         self.target = self.encoded_ds_arr[0].target
@@ -155,13 +155,13 @@ def __len__(self):
         See `lightwood.data.encoded_ds.EncodedDs.__len__()`.
         """
         # @TODO: behavior here is not intuitive
-        return max(0, np.sum(self.encoded_ds_lenghts) - 2)
+        return max(0, np.sum(self.encoded_ds_lengths) - 2)
 
     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         See `lightwood.data.encoded_ds.EncodedDs.__getitem__()`.
         """
-        for ds_idx, length in enumerate(self.encoded_ds_lenghts):
+        for ds_idx, length in enumerate(self.encoded_ds_lengths):
             if idx - length < 0:
                 return self.encoded_ds_arr[ds_idx][idx]
             else:
diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 6083396cd..6d01bcc02 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -13,6 +13,7 @@
 from gluonts.mx.distribution.student_t import StudentTOutput
 
 from lightwood.helpers.log import log
+from lightwood.helpers.ts import get_group_matches
 from lightwood.mixer.base import BaseMixer
 from lightwood.api.types import PredictionArguments
 from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs
@@ -39,6 +40,8 @@ def __init__(
         """
         Wrapper around GluonTS probabilistic deep learning models. For now, only DeepAR is supported.
 
+        Due to inference speed, predictions are only generated for the last data point (as opposed to other mixers).
+
        :param stop_after: time budget in seconds.
        :param target: column to forecast.
        :param horizon: length of forecasted horizon.
@@ -128,11 +131,15 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
         """
         Calls the mixer to emit forecasts.
         """  # noqa
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
-        ydf = pd.DataFrame(0,  # zero-filled
-                           index=np.arange(length),
-                           columns=['prediction', 'lower', 'upper'],
-                           dtype=object)
+        mx.random.seed(self.seed)
+        np.random.seed(self.seed)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+
+        ydf = pd.DataFrame(index=np.arange(length), dtype=object)
+        init_arr = [0 for _ in range(self.ts_analysis['tss'].horizon)]
+        for col in ['prediction', 'lower', 'upper']:
+            ydf.at[:, col] = [init_arr for _ in range(len(ydf))]
+        ydf['index'] = ds.data_frame.index
 
         conf = args.fixed_confidence if args.fixed_confidence else 0.9
         ydf['confidence'] = conf
@@ -140,20 +147,16 @@
         gby = self.ts_analysis["tss"].group_by if self.ts_analysis["tss"].group_by else []
         groups = ds.data_frame[gby[0]].unique().tolist() if gby else None
 
-        for idx in range(length):
-            df = ds.data_frame.iloc[:idx] if idx != 0 else None
-            input_ds = self._make_initial_ds(df, groups=groups)
-            if not input_ds:
-                # edge case: new group
-                for col in ['prediction', 'lower', 'upper']:
-                    ydf.at[idx, col] = [0 for _ in range(self.ts_analysis["tss"].horizon)]
-            else:
-                mx.random.seed(self.seed)
-                np.random.seed(self.seed)
-                forecasts = list(self.model.predict(input_ds))[0]
-                ydf.at[idx, 'prediction'] = [entry for entry in forecasts.quantile(0.5)]
-                ydf.at[idx, 'lower'] = [entry for entry in forecasts.quantile(1 - conf)]
-                ydf.at[idx, 'upper'] = [entry for entry in forecasts.quantile(conf)]
+        df = ds.data_frame
+        ydf['__original_index'] = df['__mdb_original_index'].values
+        input_ds = self._make_initial_ds(df, groups=groups)  # TODO test with novel group
+        forecasts = list(self.model.predict(input_ds))
+        for group, group_forecast in zip(groups, forecasts):
+            _, subdf = get_group_matches(df, (group, ), gby)
+            idx = ydf[ydf['__original_index'] == max(subdf['__mdb_original_index'])].index.values[0]
+            ydf.at[idx, 'prediction'] = [entry for entry in group_forecast.quantile(0.5)]
+            ydf.at[idx, 'lower'] = [entry for entry in group_forecast.quantile(1 - conf)]
+            ydf.at[idx, 'upper'] = [entry for entry in group_forecast.quantile(conf)]
 
         return ydf
 
diff --git a/lightwood/mixer/lightgbm_array.py b/lightwood/mixer/lightgbm_array.py
index 89e1b9d8b..a52bc0c90 100644
--- a/lightwood/mixer/lightgbm_array.py
+++ b/lightwood/mixer/lightgbm_array.py
@@ -85,7 +85,7 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
             log.warning('This model does not output probability estimates')
 
         original_df = deepcopy(ds.data_frame)
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         ydf = pd.DataFrame(0,  # zero-filled
                            index=np.arange(length),
                            columns=[f'prediction_{i}' for i in range(self.horizon)])
diff --git a/lightwood/mixer/neural_ts.py b/lightwood/mixer/neural_ts.py
index f73b44dd0..ef34b53f7 100644
--- a/lightwood/mixer/neural_ts.py
+++ b/lightwood/mixer/neural_ts.py
@@ -140,7 +140,7 @@ def __call__(self, ds: EncodedDs,
         all_probs: List[List[float]] = []
         rev_map = {}
 
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         pred_cols = [f'prediction_{i}' for i in range(self.timeseries_settings.horizon)]
         ydf = pd.DataFrame(0,  # zero-filled
                            index=np.arange(length),
diff --git a/lightwood/mixer/nhits.py b/lightwood/mixer/nhits.py
index a60bdeb85..cc02c0a37 100644
--- a/lightwood/mixer/nhits.py
+++ b/lightwood/mixer/nhits.py
@@ -162,7 +162,7 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
         if args.predict_proba:
             log.warning('This mixer does not output probability estimates')
 
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         ydf = pd.DataFrame(0,  # zero-filled
                            index=np.arange(length),
                            columns=['prediction', 'lower', 'upper'],
diff --git a/lightwood/mixer/sktime.py b/lightwood/mixer/sktime.py
index 50ac7e066..6df73d8b4 100644
--- a/lightwood/mixer/sktime.py
+++ b/lightwood/mixer/sktime.py
@@ -240,7 +240,7 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
 
         df = deepcopy(ds.data_frame)
         df = df.rename_axis('__sktime_index').reset_index()
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         ydf = pd.DataFrame(0,  # zero-filled
                            index=df.index,
                            columns=['prediction'],
diff --git a/tests/integration/advanced/test_timeseries.py b/tests/integration/advanced/test_timeseries.py
index c0867b62f..2709e15ab 100644
--- a/tests/integration/advanced/test_timeseries.py
+++ b/tests/integration/advanced/test_timeseries.py
@@ -576,28 +576,26 @@ def test_12_gluonts(self):
 
         if GluonTSMixer is not None:
             data = pd.read_csv('tests/data/arrivals.csv')
-            for i, subdata in enumerate([data, data[data['Country'] == 'US']]):
-                order_by = 'T'
-                train_df, test_df = self.split_arrivals(subdata, grouped=False)
-                pdef = {'target': 'Traffic',
-                        'use_default_analysis': False,
-                        'timeseries_settings': {
-                            'order_by': order_by,
-                            'window': 4 * 5,
-                            'horizon': 4 * 2}}
-                if i == 0:
-                    pdef['timeseries_settings']['group_by'] = ['Country']
-                jai = json_ai_from_problem(train_df, ProblemDefinition.from_dict(pdef))
-                jai.model['args']['submodels'] = [{
-                    "module": "GluonTSMixer",
-                    "args": {}
-                }]
-                predictor = predictor_from_json_ai(jai)
-                predictor.learn(train_df)
-                predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
-
-                # adjust
-                adjust_n_epochs = 5
-                predictor.adjust(test_df, adjust_args={'n_epochs': adjust_n_epochs})
-                predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
-                assert predictor.mixers[0].n_epochs == adjust_n_epochs
+            order_by = 'T'
+            train_df, test_df = self.split_arrivals(data, grouped=True)
+            pdef = {'target': 'Traffic',
+                    'timeseries_settings': {
+                        'order_by': order_by,
+                        'group_by': ['Country'],
+                        'window': 4 * 5,
+                        'horizon': 4 * 2}}
+            jai = json_ai_from_problem(train_df, ProblemDefinition.from_dict(pdef))
+            jai.model['args']['submodels'] = [{
+                "module": "GluonTSMixer",
+                "args": {}
+            }]
+            predictor = predictor_from_json_ai(jai)
+            predictor.learn(train_df)
+            predictor.predict(test_df, args={'time_format': 'infer'})
+            predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
+
+            # adjust
+            adjust_n_epochs = 5
+            predictor.adjust(test_df, adjust_args={'n_epochs': adjust_n_epochs})
+            predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
+            assert predictor.mixers[0].n_epochs == adjust_n_epochs

From b090e6ae128cad4415daa47490505b14aac767c4 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 13 Feb 2023 21:58:18 -0300
Subject: [PATCH 03/31] fit_on_all for gluonTS

---
 lightwood/api/json_ai.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py
index 1b8c32378..38d44b5a9 100644
--- a/lightwood/api/json_ai.py
+++ b/lightwood/api/json_ai.py
@@ -622,7 +622,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
             mixers[i]["args"]["use_stl"] = mixers[i]["args"].get("use_stl", "False")
 
-        elif mixers[i]["module"] in ("NHitsMixer", "GluonTSMixer"):
+        elif mixers[i]["module"] in ("NHitsMixer", ):
             mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
             mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
@@ -630,12 +630,12 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             )
             problem_definition.fit_on_all = False  # takes too long otherwise
 
-        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer"):
+        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer", "GluonTSMixer"):
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
                 "ts_analysis", "$ts_analysis"
             )
-            if "horizon" not in mixers[i]["args"]:
-                mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
+            mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
+            mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
 
             # enforce fit_on_all if this mixer is specified
             problem_definition.fit_on_all = True

From a7f0bfc39981652e242f12b97bfffaee8bcfaf46 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 14 Feb 2023 01:37:55 -0300
Subject: [PATCH 04/31] add trains once option

---
 lightwood/api/json_ai.py   | 12 ++++++++----
 lightwood/mixer/base.py    |  4 +++-
 lightwood/mixer/gluonts.py |  1 +
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py
index 38d44b5a9..586b1f1c4 100644
--- a/lightwood/api/json_ai.py
+++ b/lightwood/api/json_ai.py
@@ -1025,7 +1025,12 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
     trained_mixers = []
     for mixer in self.mixers:
         try:
-            self.fit_mixer(mixer, encoded_train_data, encoded_dev_data)
+            if mixer.trains_once:
+                self.fit_mixer(mixer,
+                               ConcatedEncodedDs([encoded_train_data, encoded_dev_data]),
+                               encoded_test_data)
+            else:
+                self.fit_mixer(mixer, encoded_train_data, encoded_dev_data)
             trained_mixers.append(mixer)
         except Exception as e:
             log.warning(f'Exception: {{e}} when training mixer: {{mixer}}')
@@ -1107,7 +1112,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
 
     log.info('Updating the mixers')
 
     for mixer in self.mixers:
-        mixer.partial_fit(train_data, dev_data, adjust_args)
     """  # noqa
     adjust_body = align(adjust_body, 2)
@@ -1154,8 +1159,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
 
 # SET `json_ai.problem_definition.fit_on_all=False` TO TURN THIS BLOCK OFF.
 # Update the mixers with partial fit
-if self.problem_definition.fit_on_all:
-
+if self.problem_definition.fit_on_all and all([not m.trains_once for m in self.mixers]):
     log.info(f'[Learn phase 8/{n_phases}] - Adjustment on validation requested')
     self.adjust(enc_train_test["test"].data_frame,
                 ConcatedEncodedDs([enc_train_test["train"],
                                    enc_train_test["dev"]]).data_frame,
diff --git a/lightwood/mixer/base.py b/lightwood/mixer/base.py
index ba98f0865..a91bb6946 100644
--- a/lightwood/mixer/base.py
+++ b/lightwood/mixer/base.py
@@ -21,11 +21,12 @@ class BaseMixer:
     - stable: If set to `True`, this mixer should always work. Any mixer with `stable=False` can be expected to fail under some circumstances.
     - fit_data_len: Length of the training data.
     - supports_proba: For classification tasks, whether the mixer supports yielding per-class scores rather than only returning the predicted label.
-
+    - trains_once: If True, the mixer is trained once during learn, using all available input data (`train` and `dev` splits for training, `test` for validation). Otherwise, it trains once with the `train` split and `dev` for validation, and optionally (depending on the problem definition `fit_on_all` and mixer-wise `fit_on_dev` arguments) a second time after post-training analysis via partial_fit, with `train` and `dev` splits as training subset, and `test` split as validation. Should only be set to True for mixers that don't require post-training analysis, as otherwise actual validation data would be treated as a held-out portion, which is a mistake.
     """  # noqa
     stable: bool
     fit_data_len: int  # @TODO (Patricio): should this really be in `BaseMixer`?
     supports_proba: bool
+    trains_once: bool
 
     def __init__(self, stop_after: float):
         """
@@ -33,6 +34,7 @@ def __init__(self, stop_after: float):
         """
         self.stop_after = stop_after
         self.supports_proba = False
+        self.trains_once = False
 
     def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         """
diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 6d01bcc02..ce5ca90fd 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -70,6 +70,7 @@ def __init__(
         self.train_cache = None
         self.patience = early_stop_patience
         self.seed = seed
+        self.trains_once = True
 
         dist_module = importlib.import_module('.'.join(['gluonts.mx.distribution',
                                                         *distribution_output.split(".")[:-1]]))

From bfdbb0905c504fc8df65b9c9023393de6625170b Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 14 Feb 2023 01:45:04 -0300
Subject: [PATCH 05/31] fix mixer arg injection

---
 lightwood/api/json_ai.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py
index 586b1f1c4..df57af77a 100644
--- a/lightwood/api/json_ai.py
+++ b/lightwood/api/json_ai.py
@@ -622,7 +622,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
             mixers[i]["args"]["use_stl"] = mixers[i]["args"].get("use_stl", "False")
 
-        elif mixers[i]["module"] in ("NHitsMixer", ):
+        elif mixers[i]["module"] in ("NHitsMixer", "GluonTSMixer"):
             mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
             mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
@@ -630,12 +630,12 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             )
             problem_definition.fit_on_all = False  # takes too long otherwise
 
-        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer", "GluonTSMixer"):
+        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer"):
            mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
                 "ts_analysis", "$ts_analysis"
             )
-            mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
-            mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
+            if "horizon" not in mixers[i]["args"]:
+                mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
 
             # enforce fit_on_all if this mixer is specified
             problem_definition.fit_on_all = True

From c965c74c073a9b413f6004abdbfb537794f11edf Mon Sep 17 00:00:00 2001
From: Talaat Hasanin <105648065+TalaatHasanin@users.noreply.github.com>
Date: Fri, 17 Feb 2023 17:34:29 +0200
Subject: [PATCH 06/31] Update README.md

Add main module to the sample code
---
 README.md | 56 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index de6db694f..a2eccd463 100644
--- a/README.md
+++ b/README.md
@@ -65,40 +65,44 @@ from lightwood.api.high_level import (
     predictor_from_code,
 )
 
-# Load a pandas dataset
-df = pd.read_csv(
-    "https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv"
-)
 
-# Define the prediction task by naming the target column
-pdef = ProblemDefinition.from_dict(
-    {
-        "target": "Development Index",  # column you want to predict
-    }
-)
+
+def main():
+    # Load a pandas dataset
+    df = pd.read_csv("https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv")
+
+    # Define the prediction task by naming the target column
+    pdef = ProblemDefinition.from_dict(
+        {
+            "target": "Development Index",  # column you want to predict
+        }
+    )
+
+    # Generate JSON-AI code to model the problem
+    json_ai = json_ai_from_problem(df, problem_definition=pdef)
+
+    # OPTIONAL - see the JSON-AI syntax
+    # print(json_ai.to_json())
 
-# Generate JSON-AI code to model the problem
-json_ai = json_ai_from_problem(df, problem_definition=pdef)
+    # Generate python code
+    code = code_from_json_ai(json_ai)
 
-# OPTIONAL - see the JSON-AI syntax
-#print(json_ai.to_json())
+    # OPTIONAL - see generated code
+    # print(code)
 
-# Generate python code
-code = code_from_json_ai(json_ai)
+    # Create a predictor from python code
+    predictor = predictor_from_code(code)
 
-# OPTIONAL - see generated code
-#print(code)
+    # Train a model end-to-end from raw data to a finalized predictor
+    predictor.learn(df)
 
-# Create a predictor from python code
-predictor = predictor_from_code(code)
+    # Make the train/test splits and show predictions for a few examples
+    test_df = predictor.split(predictor.preprocess(df))["test"]
+    preds = predictor.predict(test_df).iloc[:10]
+    print(preds)
 
-# Train a model end-to-end from raw data to a finalized predictor
-predictor.learn(df)
 
-# Make the train/test splits and show predictions for a few examples
-test_df = predictor.split(predictor.preprocess(df))["test"]
-preds = predictor.predict(test_df).iloc[:10]
-print(preds)
+if __name__ == '__main__':
+    main()
 ```
 
 ### BYOM: Bring your own models

From f4607562abccbdeb5d87a5cc0bf944cadb2520db Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 20 Feb 2023 12:18:36 -0300
Subject: [PATCH 07/31] bump statsforecast==1.4.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 972251172..bffa494ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ scikit-learn >=1.0.0, <=1.0.2
 dataclasses_json >=0.5.4
 dill ==0.3.4
 sktime >=0.14.0,<0.15.0
-statsforecast ==0.7.0
+statsforecast ==1.4.0
 torch_optimizer ==0.1.0
 black >=21.9b0
 typing_extensions

From b4549a4f411a02dbeacb3eb35f69632de1918f3f Mon Sep 17 00:00:00 2001
From: Talaat Hasanin <105648065+TalaatHasanin@users.noreply.github.com>
Date: Mon, 20 Feb 2023 19:06:58 +0200
Subject: [PATCH 08/31] Add some comments

Add some comments to clarify the use of the main module
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a2eccd463..92d859ee7 100644
--- a/README.md
+++ b/README.md
@@ -100,8 +100,9 @@ def main():
     preds = predictor.predict(test_df).iloc[:10]
     print(preds)
 
-
+
+# The "main module" guard is needed for multiprocessing to work correctly on Windows
 if __name__ == '__main__':
+    # Load a pandas dataset, then define, create and train predictors
     main()
 ```

From 22c3566fa9921e5745672e9b8c20fec222828c93 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 20 Feb 2023 15:32:49 -0300
Subject: [PATCH 09/31] bump pytorch to 1.13

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bffa494ea..809c67639 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ nltk >=3,<3.6
 python-dateutil >=2.8.1
 pandas >=1.1.5
 schema >=0.6.8
-torch >=1.9.0
+torch >=1.13.0, <1.14.0
 requests >=2.0.0
 transformers
 optuna >=2.8.0,<2.10.0

From 7e45c792f6a09bdd7e61088ab98aed9fb0ae0c25 Mon Sep 17 00:00:00 2001
From: Talaat Hasanin <105648065+TalaatHasanin@users.noreply.github.com>
Date: Mon, 20 Feb 2023 21:20:20 +0200
Subject: [PATCH 10/31] Update README.md

---
 README.md | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 92d859ee7..9e272cebf 100644
--- a/README.md
+++ b/README.md
@@ -65,10 +65,10 @@ from lightwood.api.high_level import (
     predictor_from_code,
 )
 
-
-def main():
+if __name__ == '__main__':
     # Load a pandas dataset
-    df = pd.read_csv("https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv")
+    df = pd.read_csv("https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv"
+                     )
 
     # Define the prediction task by naming the target column
     pdef = ProblemDefinition.from_dict(
@@ -99,11 +99,6 @@ def main():
     test_df = predictor.split(predictor.preprocess(df))["test"]
     preds = predictor.predict(test_df).iloc[:10]
     print(preds)
-
-# The "main module" guard is needed for multiprocessing to work correctly on Windows
-if __name__ == '__main__':
-    # Load a pandas dataset, then define, create and train predictors
-    main()

From 3367cc9969f6ef2e302a8d71d3d8f18a9e11d721 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 00:05:58 -0300
Subject: [PATCH 11/31] feat: TabTransformer mixer

---
 lightwood/mixer/__init__.py       |   8 +-
 lightwood/mixer/neural.py         |   7 +-
 lightwood/mixer/tabtransformer.py | 188 ++++++++++++++++++++++++++++++
 requirements_extra.txt            |   1 +
 4 files changed, 201 insertions(+), 3 deletions(-)
 create mode 100644 lightwood/mixer/tabtransformer.py

diff --git a/lightwood/mixer/__init__.py b/lightwood/mixer/__init__.py
index 1347f3ee5..3d7c1c2fa 100644
--- a/lightwood/mixer/__init__.py
+++ b/lightwood/mixer/__init__.py
@@ -36,5 +36,11 @@
     LightGBM = None
     LightGBMArray = None
 
+try:
+    from lightwood.mixer.tabtransformer import TabTransformerMixer
+except Exception:
+    TabTransformerMixer = None
+
 __all__ = ['BaseMixer', 'Neural', 'NeuralTs', 'LightGBM', 'RandomForest', 'LightGBMArray', 'Unit', 'Regression',
-           'SkTime', 'QClassic', 'ProphetMixer', 'ETSMixer', 'ARIMAMixer', 'NHitsMixer', 'GluonTSMixer', 'XGBoostMixer']
+           'SkTime', 'QClassic', 'ProphetMixer', 'ETSMixer', 'ARIMAMixer', 'NHitsMixer', 'GluonTSMixer', 'XGBoostMixer',
+           'TabTransformerMixer']
diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py
index c285fc519..9e10a690b 100644
--- a/lightwood/mixer/neural.py
+++ b/lightwood/mixer/neural.py
@@ -63,7 +63,7 @@ def __init__(
         self.epochs_to_best = 0
         self.n_epochs = n_epochs
         self.fit_on_dev = fit_on_dev
-        self.net_class = DefaultNet if net == 'DefaultNet' else ArNet
+        self.net_name = net
         self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical]
         self.search_hyperparameters = search_hyperparameters
         self.stable = True
@@ -241,6 +241,8 @@ def _error(self, dev_dl, criterion) -> float:
         return np.mean(running_losses)
 
     def _init_net(self, ds: EncodedDs):
+        self.net_class = DefaultNet if self.net_name == 'DefaultNet' else ArNet
+
         net_kwargs = {'input_size': len(ds[0][0]),
                       'output_size': len(ds[0][1]),
                       'num_hidden': self.num_hidden,
@@ -275,7 +277,8 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         # Find learning rate
         # keep the weights
         self._init_net(train_data)
-        self.lr, self.model = self._find_lr(train_dl)
+        if not self.lr:
+            self.lr, self.model = self._find_lr(train_dl)
 
         # Keep on training
         optimizer = self._select_optimizer()
diff --git a/lightwood/mixer/tabtransformer.py b/lightwood/mixer/tabtransformer.py
new file mode 100644
index 000000000..eaf98ff0a
--- /dev/null
+++ b/lightwood/mixer/tabtransformer.py
@@ -0,0 +1,188 @@
+import time
+from copy import deepcopy
+from typing import Dict, List, Optional
+
+import torch
+# import torch.nn as nn
+import numpy as np
+import pandas as pd
+from tab_transformer_pytorch import FTTransformer
+
+# from type_infer.dtype import dtype
+from lightwood.helpers.log import log
+from lightwood.helpers.torch import LightwoodAutocast
+from lightwood.api.types import PredictionArguments
+from lightwood.helpers.device import get_device_from_name
+from lightwood.data.encoded_ds import EncodedDs
+from lightwood.encoder.base import BaseEncoder
+from lightwood.mixer.neural import Neural
+
+
+class TabTransformerMixer(Neural):
+    def __init__(
+            self,
+            stop_after: float,
+            target: str,
+            dtype_dict: Dict[str, str],
+            target_encoder: BaseEncoder,
+            fit_on_dev: bool,
+            search_hyperparameters: bool,
+            train_args: Optional[dict] = None
+    ):
+        """
+        This mixer trains a TabTransformer network (FT variant), using concatenated encoder outputs for each dataset feature as input, to predict the encoded target column representation as output.
+
+        Training logic is based on the Neural mixer; please refer to it for more details on each input parameter.
+ """ # noqa + self.train_args = train_args if train_args else {} + super().__init__( + stop_after, + target, + dtype_dict, + target_encoder, + 'FTTransformer', + False, # fit_on_dev + search_hyperparameters, + n_epochs=self.train_args.get('n_epochs', None) + ) + self.lr = self.train_args.get('lr') + self.stable = True # still experimental + + def _init_net(self, ds: EncodedDs): + self.net_class = FTTransformer + + self.model = FTTransformer( + categories=(), # unused here, as by the point it arrives to the mixer, everything is numerical # noqa + num_continuous=len(ds[0][0]), # ds.input_length, # TODO define based on DS + dim=self.train_args.get('dim', 32), + dim_out=self.train_args.get('dim_out', len(ds[0][1])), + depth=self.train_args.get('depth', 6), + heads=self.train_args.get('heads', 8), + attn_dropout=self.train_args.get('attn_dropout', 0.1), # post-attention dropout + ff_dropout=self.train_args.get('ff_dropout', 0.1), # feed forward dropout + mlp_hidden_mults=self.train_args.get('mlp_hidden_mults', (4, 2)), # relative multiples of each hidden dimension of the last mlp to logits # noqa + # mlp_act=self.train_args.get('mlp_act', nn.ReLU()), # TODO: import string from nn activations + ) + self.model.device = get_device_from_name('') + self.model.to(self.model.device) + + def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after): + epochs_to_best = 0 + best_dev_error = pow(2, 32) + running_errors = [] + best_model = self.model + + for epoch in range(1, return_model_after + 1): + self.model = self.model.train() + running_losses: List[float] = [] + for i, (X, Y) in enumerate(train_dl): + X = X.to(self.model.device) + Y = Y.to(self.model.device) + with LightwoodAutocast(): + optimizer.zero_grad() + Yh = self.model(torch.Tensor(), X) + loss = criterion(Yh, Y) + if LightwoodAutocast.active: + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + optimizer.step() + + running_losses.append(loss.item()) + if (time.time() - self.started) > stop_after: + break + + train_error = np.mean(running_losses) + epoch_error = self._error(dev_dl, criterion) + running_errors.append(epoch_error) + log.info(f'Loss @ epoch {epoch}: {epoch_error}') + + if np.isnan(train_error) or np.isnan( + running_errors[-1]) or np.isinf(train_error) or np.isinf( + running_errors[-1]): + break + + if best_dev_error > running_errors[-1]: + best_dev_error = running_errors[-1] + best_model = deepcopy(self.model) + epochs_to_best = epoch + + # manually set epoch limit + if self.n_epochs is not None: + if epoch > self.n_epochs: + break + + # automated early stopping + else: + if len(running_errors) >= 5: + delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)], + weights=[(1 / 2)**i for i in range(1, 5)]) + if delta_mean <= 0: + break + elif (time.time() - self.started) > stop_after: + break + elif running_errors[-1] < 0.0001 or train_error < 0.0001: + break + + if np.isnan(best_dev_error): + best_dev_error = pow(2, 32) + return best_model, epochs_to_best, best_dev_error + + def _error(self, dev_dl, criterion) -> float: + self.model = self.model.eval() + running_losses: List[float] = [] + with torch.no_grad(): + for X, Y in dev_dl: + X = X.to(self.model.device) + Y = Y.to(self.model.device) + Yh = self.model(torch.Tensor(), X) + running_losses.append(criterion(Yh, Y).item()) + return np.mean(running_losses) + + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + self._fit(train_data, 
dev_data) + + def __call__(self, ds: EncodedDs, + args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: + """ + Make predictions based on datasource with the same features as the ones used for fitting + + :param ds: Predictions are generate from it + :param arg: Any additional arguments used in predicting + + :returns: A dataframe cotaining the decoded predictions and (depending on the args) additional information such as the probabilites for each target class + """ # noqa + self.model = self.model.eval() + decoded_predictions: List[object] = [] + all_probs: List[List[float]] = [] + rev_map = {} + + with torch.no_grad(): + for idx, (X, Y) in enumerate(ds): + X = X.to(self.model.device) + Yh = self.model(torch.Tensor(), X.unsqueeze(0)) + Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh + + kwargs = {} + for dep in self.target_encoder.dependencies: + kwargs['dependency_data'] = {dep: ds.data_frame.iloc[idx][[dep]].values} + + if args.predict_proba and self.supports_proba: + decoded_prediction, probs, rev_map = self.target_encoder.decode_probabilities(Yh, **kwargs) + all_probs.append(probs) + else: + decoded_prediction = self.target_encoder.decode(Yh, **kwargs) + + decoded_predictions.extend(decoded_prediction) + + ydf = pd.DataFrame({'prediction': decoded_predictions}) + + if args.predict_proba and self.supports_proba: + raw_predictions = np.array(all_probs).squeeze(axis=1) + + for idx, label in enumerate(rev_map.values()): + ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] + + return ydf \ No newline at end of file diff --git a/requirements_extra.txt b/requirements_extra.txt index 9a5b8e5aa..2b6ad2579 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1 +1,2 @@ lightgbm >=3.3.0,<=3.3.3 +tab-transformer-pytorch \ No newline at end of file From ec95efb9d1664d18e97817b5fc5498f3de779726 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 01:09:17 -0300 Subject: [PATCH 12/31] lint: flake8 --- lightwood/mixer/tabtransformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightwood/mixer/tabtransformer.py b/lightwood/mixer/tabtransformer.py index eaf98ff0a..ebff2988a 100644 --- a/lightwood/mixer/tabtransformer.py +++ b/lightwood/mixer/tabtransformer.py @@ -6,7 +6,7 @@ # import torch.nn as nn import numpy as np import pandas as pd -from tab_transformer_pytorch import FTTransformer +from tab_transformer_pytorch import TabTransformer # from type_infer.dtype import dtype from lightwood.helpers.log import log @@ -49,9 +49,9 @@ def __init__( self.stable = True # still experimental def _init_net(self, ds: EncodedDs): - self.net_class = FTTransformer + self.net_class = TabTransformer - self.model = FTTransformer( + self.model = TabTransformer( categories=(), # unused here, as by the point it arrives to the mixer, everything is numerical # noqa num_continuous=len(ds[0][0]), # ds.input_length, # TODO define based on DS dim=self.train_args.get('dim', 32), @@ -185,4 +185,4 @@ def __call__(self, ds: EncodedDs, for idx, label in enumerate(rev_map.values()): ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] - return ydf \ No newline at end of file + return ydf From e78fbb03cc81b361646e10816cc8ce0e66f9558f Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 14:33:09 -0300 Subject: [PATCH 13/31] lower bound for tab-transformer-pytorch >= 0.2.1 --- requirements_extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_extra.txt b/requirements_extra.txt index 
2b6ad2579..db3751a04 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,2 +1,2 @@ lightgbm >=3.3.0,<=3.3.3 -tab-transformer-pytorch \ No newline at end of file +tab-transformer-pytorch >= 0.2.1 From 294b81dfcc8ddcbdfa4256ddacb4794383fee23a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 17:30:14 -0300 Subject: [PATCH 14/31] feat: move tabtransformer to core reqs; simplify implementation (no duplicate code); add tests --- lightwood/api/json_ai.py | 23 ++- lightwood/mixer/neural.py | 13 +- lightwood/mixer/tabtransformer.py | 147 ++---------------- requirements.txt | 1 + requirements_extra.txt | 1 - tests/unit_tests/mixer/test_tabtransformer.py | 52 +++++++ 6 files changed, 89 insertions(+), 148 deletions(-) create mode 100644 tests/unit_tests/mixer/test_tabtransformer.py diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index df57af77a..1f21007ed 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -571,22 +571,31 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI: mixers[i]["args"]["stop_after"] = mixers[i]["args"].get("stop_after", "$problem_definition.seconds_per_mixer") # specific - if mixers[i]["module"] in ("Neural", "NeuralTs"): + if mixers[i]["module"] in ("Neural", "NeuralTs", "TabTransformerMixer"): mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get( "target_encoder", "$encoders[self.target]" ) - mixers[i]["args"]["net"] = mixers[i]["args"].get( - "net", - '"DefaultNet"' - if not tss.is_timeseries or not tss.use_previous_target - else '"ArNet"', - ) + + if mixers[i]["module"] in ("Neural", "NeuralTs"): + mixers[i]["args"]["net"] = mixers[i]["args"].get( + "net", + '"DefaultNet"' + if not tss.is_timeseries or not tss.use_previous_target + else '"ArNet"', + ) + mixers[i]["args"]["search_hyperparameters"] = mixers[i]["args"].get("search_hyperparameters", True) + mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", True) + if mixers[i]["module"] == "NeuralTs": mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get( "timeseries_settings", "$problem_definition.timeseries_settings" ) mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis") + if mixers[i]["module"] == "TabTransformerMixer": + mixers[i]["args"]["search_hyperparameters"] = mixers[i]["args"].get("search_hyperparameters", False) + mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", False) + elif mixers[i]["module"] in ("LightGBM", "XGBoostMixer"): mixers[i]["args"]["input_cols"] = mixers[i]["args"].get( "input_cols", "$input_cols" diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 9e10a690b..90040a3aa 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -80,7 +80,7 @@ def _final_tuning(self, data): for X, Y in data: X = X.to(self.model.device) Y = Y.to(self.model.device) - Yh = self.model(X) + Yh = self._net_call(X) Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh Y = torch.unsqueeze(Y, 0) if len(Y.shape) < 2 else Y @@ -134,7 +134,7 @@ def _find_lr(self, dl): Y = Y.to(self.model.device) with LightwoodAutocast(): optimizer.zero_grad() - Yh = self.model(X) + Yh = self._net_call(X) loss = criterion(Yh, Y) if LightwoodAutocast.active: scaler.scale(loss).backward() @@ -179,7 +179,7 @@ def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, r Y = Y.to(self.model.device) with LightwoodAutocast(): optimizer.zero_grad() - Yh = self.model(X) + Yh = self._net_call(X) loss = criterion(Yh, Y) if 
LightwoodAutocast.active: scaler.scale(loss).backward() @@ -236,7 +236,7 @@ def _error(self, dev_dl, criterion) -> float: for X, Y in dev_dl: X = X.to(self.model.device) Y = Y.to(self.model.device) - Yh = self.model(X) + Yh = self._net_call(X) running_losses.append(criterion(Yh, Y).item()) return np.mean(running_losses) @@ -254,6 +254,9 @@ def _init_net(self, ds: EncodedDs): self.model = self.net_class(**net_kwargs) + def _net_call(self, x: torch.Tensor) -> torch.Tensor: + return self.model(x) + # @TODO: Compare partial fitting fully on and fully off on the benchmarks! # @TODO: Writeup on the methodology for partial fitting def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: @@ -336,7 +339,7 @@ def __call__(self, ds: EncodedDs, with torch.no_grad(): for idx, (X, Y) in enumerate(ds): X = X.to(self.model.device) - Yh = self.model(X) + Yh = self._net_call(X) Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh kwargs = {} diff --git a/lightwood/mixer/tabtransformer.py b/lightwood/mixer/tabtransformer.py index ebff2988a..96613d02a 100644 --- a/lightwood/mixer/tabtransformer.py +++ b/lightwood/mixer/tabtransformer.py @@ -1,17 +1,8 @@ -import time -from copy import deepcopy -from typing import Dict, List, Optional +from typing import Dict, Optional import torch -# import torch.nn as nn -import numpy as np -import pandas as pd from tab_transformer_pytorch import TabTransformer -# from type_infer.dtype import dtype -from lightwood.helpers.log import log -from lightwood.helpers.torch import LightwoodAutocast -from lightwood.api.types import PredictionArguments from lightwood.helpers.device import get_device_from_name from lightwood.data.encoded_ds import EncodedDs from lightwood.encoder.base import BaseEncoder @@ -46,143 +37,29 @@ def __init__( n_epochs=self.train_args.get('n_epochs', None) ) self.lr = self.train_args.get('lr') - self.stable = True # still experimental + self.stable = False # still experimental def _init_net(self, ds: EncodedDs): self.net_class = TabTransformer - self.model = TabTransformer( - categories=(), # unused here, as by the point it arrives to the mixer, everything is numerical # noqa - num_continuous=len(ds[0][0]), # ds.input_length, # TODO define based on DS + categories=(), # unused, everything is numerical by now + num_continuous=len(ds[0][0]), dim=self.train_args.get('dim', 32), dim_out=self.train_args.get('dim_out', len(ds[0][1])), depth=self.train_args.get('depth', 6), heads=self.train_args.get('heads', 8), - attn_dropout=self.train_args.get('attn_dropout', 0.1), # post-attention dropout - ff_dropout=self.train_args.get('ff_dropout', 0.1), # feed forward dropout - mlp_hidden_mults=self.train_args.get('mlp_hidden_mults', (4, 2)), # relative multiples of each hidden dimension of the last mlp to logits # noqa - # mlp_act=self.train_args.get('mlp_act', nn.ReLU()), # TODO: import string from nn activations + attn_dropout=self.train_args.get('attn_dropout', 0.1), # post-attention dropout + ff_dropout=self.train_args.get('ff_dropout', 0.1), # feed forward dropout + mlp_hidden_mults=self.train_args.get('mlp_hidden_mults', (4, 2)), # relative multiples of each hidden dimension of the last mlp to logits # noqa + # mlp_act=self.train_args.get('mlp_act', nn.ReLU()), # TODO: import str from nn activations ) self.model.device = get_device_from_name('') self.model.to(self.model.device) - def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after): - epochs_to_best = 0 - best_dev_error = pow(2, 32) - running_errors = [] - 
best_model = self.model - - for epoch in range(1, return_model_after + 1): - self.model = self.model.train() - running_losses: List[float] = [] - for i, (X, Y) in enumerate(train_dl): - X = X.to(self.model.device) - Y = Y.to(self.model.device) - with LightwoodAutocast(): - optimizer.zero_grad() - Yh = self.model(torch.Tensor(), X) - loss = criterion(Yh, Y) - if LightwoodAutocast.active: - scaler.scale(loss).backward() - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - optimizer.step() - - running_losses.append(loss.item()) - if (time.time() - self.started) > stop_after: - break - - train_error = np.mean(running_losses) - epoch_error = self._error(dev_dl, criterion) - running_errors.append(epoch_error) - log.info(f'Loss @ epoch {epoch}: {epoch_error}') - - if np.isnan(train_error) or np.isnan( - running_errors[-1]) or np.isinf(train_error) or np.isinf( - running_errors[-1]): - break - - if best_dev_error > running_errors[-1]: - best_dev_error = running_errors[-1] - best_model = deepcopy(self.model) - epochs_to_best = epoch - - # manually set epoch limit - if self.n_epochs is not None: - if epoch > self.n_epochs: - break - - # automated early stopping - else: - if len(running_errors) >= 5: - delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)], - weights=[(1 / 2)**i for i in range(1, 5)]) - if delta_mean <= 0: - break - elif (time.time() - self.started) > stop_after: - break - elif running_errors[-1] < 0.0001 or train_error < 0.0001: - break - - if np.isnan(best_dev_error): - best_dev_error = pow(2, 32) - return best_model, epochs_to_best, best_dev_error - - def _error(self, dev_dl, criterion) -> float: - self.model = self.model.eval() - running_losses: List[float] = [] - with torch.no_grad(): - for X, Y in dev_dl: - X = X.to(self.model.device) - Y = Y.to(self.model.device) - Yh = self.model(torch.Tensor(), X) - running_losses.append(criterion(Yh, Y).item()) - return np.mean(running_losses) + def _net_call(self, x: torch.Tensor) -> torch.Tensor: + x = torch.unsqueeze(x, 0) if len(x.shape) < 2 else x + return self.model(torch.Tensor(), x) def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + """ Skip the usual partial_fit call at the end. 
""" # noqa self._fit(train_data, dev_data) - - def __call__(self, ds: EncodedDs, - args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: - """ - Make predictions based on datasource with the same features as the ones used for fitting - - :param ds: Predictions are generate from it - :param arg: Any additional arguments used in predicting - - :returns: A dataframe cotaining the decoded predictions and (depending on the args) additional information such as the probabilites for each target class - """ # noqa - self.model = self.model.eval() - decoded_predictions: List[object] = [] - all_probs: List[List[float]] = [] - rev_map = {} - - with torch.no_grad(): - for idx, (X, Y) in enumerate(ds): - X = X.to(self.model.device) - Yh = self.model(torch.Tensor(), X.unsqueeze(0)) - Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh - - kwargs = {} - for dep in self.target_encoder.dependencies: - kwargs['dependency_data'] = {dep: ds.data_frame.iloc[idx][[dep]].values} - - if args.predict_proba and self.supports_proba: - decoded_prediction, probs, rev_map = self.target_encoder.decode_probabilities(Yh, **kwargs) - all_probs.append(probs) - else: - decoded_prediction = self.target_encoder.decode(Yh, **kwargs) - - decoded_predictions.extend(decoded_prediction) - - ydf = pd.DataFrame({'prediction': decoded_predictions}) - - if args.predict_proba and self.supports_proba: - raw_predictions = np.array(all_probs).squeeze(axis=1) - - for idx, label in enumerate(rev_map.values()): - ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] - - return ydf diff --git a/requirements.txt b/requirements.txt index bffa494ea..c275bd659 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ langid==1.1.6 pydateinfer==0.3.0 protobuf<3.21.0 xgboost>=1.6.0, <=1.8.0 +tab-transformer-pytorch >= 0.2.1 typing-inspect six regex diff --git a/requirements_extra.txt b/requirements_extra.txt index db3751a04..9a5b8e5aa 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,2 +1 @@ lightgbm >=3.3.0,<=3.3.3 -tab-transformer-pytorch >= 0.2.1 diff --git a/tests/unit_tests/mixer/test_tabtransformer.py b/tests/unit_tests/mixer/test_tabtransformer.py new file mode 100644 index 000000000..29ae7e40e --- /dev/null +++ b/tests/unit_tests/mixer/test_tabtransformer.py @@ -0,0 +1,52 @@ +import unittest +import numpy as np +import pandas as pd +from sklearn.metrics import balanced_accuracy_score +from lightwood.api.types import ProblemDefinition +from lightwood.api.high_level import json_ai_from_problem, predictor_from_json_ai, JsonAI, code_from_json_ai, predictor_from_code # noqa + + +np.random.seed(42) + + +class TestBasic(unittest.TestCase): + def get_submodels(self): + submodels = [ + { + 'module': 'TabTransformerMixer', + 'args': { + 'train_args': {'n_epochs': 5}, + } + }, + ] + return submodels + + def test_0_regression(self): + df = pd.read_csv('tests/data/concrete_strength.csv')[:500] + target = 'concrete_strength' + + pdef = ProblemDefinition.from_dict({'target': target}) + jai = json_ai_from_problem(df, pdef) + + jai.model['args']['submodels'] = self.get_submodels() + code = code_from_json_ai(jai) + predictor = predictor_from_code(code) + + predictor.learn(df) + predictor.predict(df) + + def test_1_binary(self): + df = pd.read_csv('tests/data/ionosphere.csv')[:100] + target = 'target' + + pdef = ProblemDefinition.from_dict({'target': target, 'unbias_target': False}) + jai = json_ai_from_problem(df, pdef) + jai.model['args']['submodels'] = self.get_submodels() + code = code_from_json_ai(jai) + 
predictor = predictor_from_code(code) + + predictor.learn(df) + predictions = predictor.predict(df) + + acc = balanced_accuracy_score(df[target], predictions['prediction']) + self.assertTrue(all([0 <= p <= 1 for p in predictions['confidence']])) From 10be2c5e3cbb954e040ec5162c8f02056e396ed6 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 17:35:42 -0300 Subject: [PATCH 15/31] lint: flake8 --- tests/unit_tests/mixer/test_tabtransformer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit_tests/mixer/test_tabtransformer.py b/tests/unit_tests/mixer/test_tabtransformer.py index 29ae7e40e..0fc74bb6a 100644 --- a/tests/unit_tests/mixer/test_tabtransformer.py +++ b/tests/unit_tests/mixer/test_tabtransformer.py @@ -1,7 +1,6 @@ import unittest import numpy as np import pandas as pd -from sklearn.metrics import balanced_accuracy_score from lightwood.api.types import ProblemDefinition from lightwood.api.high_level import json_ai_from_problem, predictor_from_json_ai, JsonAI, code_from_json_ai, predictor_from_code # noqa @@ -48,5 +47,4 @@ def test_1_binary(self): predictor.learn(df) predictions = predictor.predict(df) - acc = balanced_accuracy_score(df[target], predictions['prediction']) self.assertTrue(all([0 <= p <= 1 for p in predictions['confidence']])) From 494ccbfcaf0803f42527ac64261b28e18cbb9035 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 21:34:48 -0300 Subject: [PATCH 16/31] fix: 1096 --- lightwood/analysis/nc/calibrate.py | 23 ++++++++++++++++------- lightwood/data/timeseries_transform.py | 2 +- lightwood/helpers/ts.py | 5 ++++- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index 5357c0ad3..789bea7e3 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -105,16 +105,17 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: icp_df = deepcopy(ns.data) # setup prediction cache to avoid additional .predict() calls - pred_is_list = isinstance(ns.normal_predictions['prediction'], list) and \ - isinstance(ns.normal_predictions['prediction'][0], list) + try: + pred_is_list = isinstance(ns.normal_predictions['prediction'][0], list) + except TypeError: + pred_is_list = False + if ns.is_classification: if ns.predictor.supports_proba: icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values else: if ns.is_multi_ts: - icp.nc_function.model.prediction_cache = np.array( - [p[0] for p in ns.normal_predictions['prediction']]) - preds = icp.nc_function.model.prediction_cache + preds = np.array([p[0] for p in ns.normal_predictions['prediction']]) else: preds = ns.normal_predictions['prediction'] predicted_classes = pd.get_dummies(preds).values # inflate to one-hot enc @@ -198,8 +199,14 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: # save relevant predictions in the caches, then calibrate the ICP pred_cache = icp_df.pop(f'__predicted_{ns.target}').values - if ns.is_multi_ts: + if ns.is_multi_ts and ns.is_classification: + pred_cache = pd.get_dummies(np.array([p[0] for p in pred_cache])).values # TODO: don't use dummies if not all columns are present, use OHE instead + # el + elif ns.is_multi_ts: pred_cache = np.array([np.array(p) for p in pred_cache]) + # elif ns.is_classification: + # pred_cache = pd.get_dummies(pred_cache).values # inflate to one-hot enc + icps[tuple(group)].nc_function.model.prediction_cache = pred_cache icp_df, y = 
clean_df(icp_df, ns, output.get('label_encoders', None)) if icps[tuple(group)].nc_function.normalizer is not None: @@ -386,6 +393,8 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] for i in range(1, ns.tss.horizon)] icp.nc_function.model.prediction_cache = X[target_cols].values [X.pop(col) for col in target_cols] + elif is_multi_ts and is_categorical: + icp.nc_function.model.prediction_cache = pd.get_dummies(X.pop(ns.target_name)).values else: icp.nc_function.model.prediction_cache = X.pop(ns.target_name).values if icp.nc_function.normalizer: @@ -431,7 +440,7 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] all_ranges = np.array([icp.predict(X.values)]) all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) significances = get_categorical_conf(all_confs) - result.loc[X.index, 'significance'] = significances + result.loc[X.index, 'significance'] = significances.flatten() row_insights['confidence'] = result['significance'] diff --git a/lightwood/data/timeseries_transform.py b/lightwood/data/timeseries_transform.py index a57ee526a..2204e15d6 100644 --- a/lightwood/data/timeseries_transform.py +++ b/lightwood/data/timeseries_transform.py @@ -69,7 +69,7 @@ def transform_timeseries( subsets = [] for group in groups: if (tss.group_by and group != '__default') or not tss.group_by: - idxs, subset = get_group_matches(data, group, tss.group_by) + idxs, subset = get_group_matches(data, group, tss.group_by, copy=True) if subset.shape[0] > 0: if periods.get(group, periods['__default']) == 0 and subset.shape[0] > 1: raise Exception( diff --git a/lightwood/helpers/ts.py b/lightwood/helpers/ts.py index 0acaaab27..445492cf6 100644 --- a/lightwood/helpers/ts.py +++ b/lightwood/helpers/ts.py @@ -18,7 +18,8 @@ def get_ts_groups(df: pd.DataFrame, tss) -> list: def get_group_matches( data: Union[pd.Series, pd.DataFrame], combination: tuple, - group_columns: List[str] + group_columns: List[str], + copy: bool = False ) -> Tuple[list, pd.DataFrame]: """Given a particular group combination, return the data subset that belongs to it.""" @@ -34,6 +35,8 @@ def get_group_matches( for val, col in zip(combination, group_columns): subset = subset[subset[col] == val] if len(subset) > 0: + if copy: + subset = subset.copy() return list(subset.index), subset else: return [], pd.DataFrame() From c5508d279cb92f8bff2a8e4ba08f23f53bab1b87 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 21:36:33 -0300 Subject: [PATCH 17/31] cleanup --- lightwood/analysis/nc/calibrate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index 789bea7e3..8b8e43b95 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -201,11 +201,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: pred_cache = icp_df.pop(f'__predicted_{ns.target}').values if ns.is_multi_ts and ns.is_classification: pred_cache = pd.get_dummies(np.array([p[0] for p in pred_cache])).values # TODO: don't use dummies if not all columns are present, use OHE instead - # el elif ns.is_multi_ts: pred_cache = np.array([np.array(p) for p in pred_cache]) - # elif ns.is_classification: - # pred_cache = pd.get_dummies(pred_cache).values # inflate to one-hot enc icps[tuple(group)].nc_function.model.prediction_cache = pred_cache icp_df, y = clean_df(icp_df, ns, output.get('label_encoders', None)) @@ -528,6 +525,5 @@ def _ts_assign_confs(result, df, confs, 
significances, tss) -> pd.DataFrame: added_cols = [f'{base_col}_timestep_{t}' for t in range(1, tss.horizon)] cols = [base_col] + added_cols result.loc[df.index, base_col] = result.loc[df.index, cols].values.tolist() - # result[base_col] = result[cols].values.tolist() return result From d82e3428425474ff19631f651499707da2c0b3bc Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 21:58:28 -0300 Subject: [PATCH 18/31] rm get_dummies --- lightwood/analysis/nc/calibrate.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index 8b8e43b95..fb2e2a19f 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -118,7 +118,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: preds = np.array([p[0] for p in ns.normal_predictions['prediction']]) else: preds = ns.normal_predictions['prediction'] - predicted_classes = pd.get_dummies(preds).values # inflate to one-hot enc + predicted_classes = output['label_encoders'].transform(preds.reshape(-1, 1)) # inflate to OHE icp.nc_function.model.prediction_cache = predicted_classes elif ns.is_multi_ts or pred_is_list: @@ -200,7 +200,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: # save relevant predictions in the caches, then calibrate the ICP pred_cache = icp_df.pop(f'__predicted_{ns.target}').values if ns.is_multi_ts and ns.is_classification: - pred_cache = pd.get_dummies(np.array([p[0] for p in pred_cache])).values # TODO: don't use dummies if not all columns are present, use OHE instead + # output['label_encoders'].transform(preds.reshape(-1, 1)) + pred_cache = output['label_encoders'].transform([[p[0] for p in pred_cache]]) elif ns.is_multi_ts: pred_cache = np.array([np.array(p) for p in pred_cache]) @@ -338,7 +339,8 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] for icol, cat_col in enumerate(all_cat_cols): row_insights.loc[X.index, cat_col] = class_dists[:, icol] else: - class_dists = pd.get_dummies(preds).values + ohe_enc = ns.analysis['label_encoders'] + class_dists = ohe_enc.transform(np.array([p[0] for p in preds]).reshape(-1, 1)) base_icp.nc_function.model.prediction_cache = class_dists @@ -391,7 +393,10 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] icp.nc_function.model.prediction_cache = X[target_cols].values [X.pop(col) for col in target_cols] elif is_multi_ts and is_categorical: - icp.nc_function.model.prediction_cache = pd.get_dummies(X.pop(ns.target_name)).values + ohe_enc = ns.analysis['label_encoders'] + preds = X.pop(ns.target_name).values + pred_cache = ohe_enc.transform(np.array([p[0] for p in preds]).reshape(-1, 1)) + icp.nc_function.model.prediction_cache = pred_cache else: icp.nc_function.model.prediction_cache = X.pop(ns.target_name).values if icp.nc_function.normalizer: From d055585de0578ef11c518aff487971d34a34ce5e Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 22:28:01 -0300 Subject: [PATCH 19/31] fix catch --- lightwood/analysis/nc/calibrate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index fb2e2a19f..9c0d576f2 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -107,7 +107,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: # setup prediction cache to 
From d055585de0578ef11c518aff487971d34a34ce5e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 22:28:01 -0300
Subject: [PATCH 19/31] fix catch

---
 lightwood/analysis/nc/calibrate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index fb2e2a19f..9c0d576f2 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -107,7 +107,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         # setup prediction cache to avoid additional .predict() calls
         try:
             pred_is_list = isinstance(ns.normal_predictions['prediction'][0], list)
-        except TypeError:
+        except KeyError:
             pred_is_list = False

         if ns.is_classification:

From d1629d9c9830df758043940548893cffb2588566 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 22:57:04 -0300
Subject: [PATCH 20/31] fix: values.reshape

---
 lightwood/analysis/nc/calibrate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index 9c0d576f2..c1f3ff7ce 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -118,7 +118,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
                 preds = np.array([p[0] for p in ns.normal_predictions['prediction']])
             else:
                 preds = ns.normal_predictions['prediction']
-            predicted_classes = output['label_encoders'].transform(preds.reshape(-1, 1))  # inflate to OHE
+            predicted_classes = output['label_encoders'].transform(preds.values.reshape(-1, 1))  # inflate OHE
             icp.nc_function.model.prediction_cache = predicted_classes

         elif ns.is_multi_ts or pred_is_list:

From 736c3e90235de2906e5cb6f7b6c39e1fa01b297e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 23:42:18 -0300
Subject: [PATCH 21/31] fix: values.reshape

---
 lightwood/analysis/nc/calibrate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index c1f3ff7ce..4da8a93cd 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -117,8 +117,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
             if ns.is_multi_ts:
                 preds = np.array([p[0] for p in ns.normal_predictions['prediction']])
             else:
-                preds = ns.normal_predictions['prediction']
-            predicted_classes = output['label_encoders'].transform(preds.values.reshape(-1, 1))  # inflate OHE
+                preds = ns.normal_predictions['prediction'].values
+            predicted_classes = output['label_encoders'].transform(preds.reshape(-1, 1))  # inflate OHE
             icp.nc_function.model.prediction_cache = predicted_classes

         elif ns.is_multi_ts or pred_is_list:

From da3e3bdc0b7662087b9a05d73aa65ac1abcf4f2e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 00:04:58 -0300
Subject: [PATCH 22/31] fix mdb#4360

---
 lightwood/analysis/nc/calibrate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index 4da8a93cd..e58ec5d82 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -366,8 +366,8 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
                     result.loc[X.index, 'significance'] = significances

                 else:
-                    significances = get_categorical_conf(all_confs.squeeze())
-                    result.loc[X.index, 'significance'] = significances
+                    significances = get_categorical_conf(all_confs)
+                    result.loc[X.index, 'significance'] = significances.flatten()

             # grouped time series, we replace bounds in rows that have a trained ICP
             if ns.analysis['icp'].get('__mdb_groups', False):
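Patches 20 and 21 chase the same regression from two sides: preds arrives as a pandas Series, and Series lost its .reshape method in pandas 1.0 (reshaping only works on the underlying numpy array), so the cache has to go through .values first. Patch 22 is the matching fix on the output side, flattening the significance array on assignment rather than squeezing the input. A minimal sketch of the Series pitfall:

    import pandas as pd

    preds = pd.Series(['a', 'b', 'a'])
    # preds.reshape(-1, 1)                   # AttributeError on modern pandas: Series has no .reshape
    as_column = preds.values.reshape(-1, 1)  # the numpy detour works
    print(as_column.shape)                   # (3, 1), ready for encoder.transform()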
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 type_infer ==0.0.9
-dataprep_ml ==0.0.8
+dataprep_ml ==0.0.9
 mindsdb-evaluator ==0.0.6
 numpy
 nltk >=3,<3.6

From 67c97d6d49e1b4716eb80e243ba80a1da699c71a Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 02:03:09 -0300
Subject: [PATCH 24/31] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 30718a06d..0c943a690 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 type_infer ==0.0.9
-dataprep_ml ==0.0.9
+dataprep_ml ==0.0.8
 mindsdb-evaluator ==0.0.6
 numpy
 nltk >=3,<3.6

From c63223597da9d916e58919df9ed989977cf6052e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 02:09:53 -0300
Subject: [PATCH 25/31] checkpoint

---
 lightwood/mixer/gluonts.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index ce5ca90fd..fdd3cea42 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -90,13 +90,19 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         cat_ds = ConcatedEncodedDs([train_data, dev_data])
         fit_groups = list(cat_ds.data_frame[self.grouped_by[0]].unique()) if self.grouped_by != ['__default'] else None
         train_ds = self._make_initial_ds(cat_ds.data_frame, phase='train', groups=fit_groups)
+        batch_size = 32

         self.model_train_stats = TrainingHistory()
         self.estimator = DeepAREstimator(
             freq=train_ds.freq,
             prediction_length=self.horizon,
             distr_output=self.distribution,
-            trainer=Trainer(epochs=self.n_epochs, callbacks=[EarlyStop(patience=self.patience), self.model_train_stats])
+            lags_seq=[i + 1 for i in range(self.window)],
+            batch_size=batch_size,
+            trainer=Trainer(
+                epochs=self.n_epochs,
+                num_batches_per_epoch=max(1, len(train_ds) // batch_size),
+                callbacks=[EarlyStop(patience=self.patience), self.model_train_stats])
         )
         self.model = self.estimator.train(train_ds)
         self.prepared = True
@@ -162,10 +168,10 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
         return ydf

     def _make_initial_ds(self, df=None, phase='predict', groups=None):
-        oby = self.ts_analysis["tss"].order_by
+        oby_col_name = '__gluon_timestamp'
         gby = self.ts_analysis["tss"].group_by if self.ts_analysis["tss"].group_by else []
         freq = self.ts_analysis['sample_freqs']['__default']
-        keep_cols = [f'__mdb_original_{oby}', self.target] + [col for col in gby]
+        keep_cols = [self.target] + [col for col in gby]

         if groups is None and gby:
             groups = self.groups
@@ -208,7 +214,8 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
             gby = '__default_group'
             df[gby] = '__default_group'

-        ds = PandasDataset.from_long_dataframe(df, target=self.target, item_id=gby, freq=freq)
+        df[oby_col_name] = df.index
+        ds = PandasDataset.from_long_dataframe(df, target=self.target, item_id=gby, freq=freq, timestamp=oby_col_name)

         return ds

From 64952fd2d76fb9ae54261a77fe836212c237c354 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 12:43:05 -0300
Subject: [PATCH 26/31] fix batch size

---
 lightwood/mixer/gluonts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index fdd3cea42..1061241f6 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -101,7 +101,7 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
             trainer=Trainer(
                 epochs=self.n_epochs,
-                num_batches_per_epoch=max(1, len(train_ds) // batch_size),
+                num_batches_per_epoch=max(1, len(cat_ds.data_frame) // batch_size),
                 callbacks=[EarlyStop(patience=self.patience), self.model_train_stats])
         )
         self.model = self.estimator.train(train_ds)
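Patches 25 and 26 size the training loop from the data instead of relying on the GluonTS Trainer default for num_batches_per_epoch, which is a fixed constant independent of dataset size. The follow-up switch from len(train_ds) to len(cat_ds.data_frame) matters presumably because the length of a PandasDataset built from a long dataframe counts series (one per group), not rows. A toy calculation with illustrative sizes:

    batch_size = 32
    n_rows = 10_000   # total observations across all groups
    n_series = 8      # what len() of a grouped PandasDataset would report

    print(max(1, n_series // batch_size))  # 1: a single batch per epoch, training starves
    print(max(1, n_rows // batch_size))    # 312: proportional to the actual data volume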
From 817c93b2f6365e5aba66a9b4ae3739f13c537ffe Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 1 Mar 2023 02:05:58 -0300
Subject: [PATCH 27/31] add static real feature

---
 lightwood/mixer/gluonts.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 1061241f6..d44dae897 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -36,6 +36,8 @@ def __init__(
             early_stop_patience: int = 3,
             distribution_output: str = '',
             seed: int = 0,
+            static_features_cat: Optional[list[str]] = None,
+            static_features_real: Optional[list[str]] = None,
     ):
         """
         Wrapper around GluonTS probabilistic deep learning models. For now, only DeepAR is supported.
@@ -71,6 +73,8 @@ def __init__(
         self.patience = early_stop_patience
         self.seed = seed
         self.trains_once = True
+        self.static_features_cat = static_features_cat if static_features_cat else []
+        self.static_features_real = static_features_real if static_features_real else []

         dist_module = importlib.import_module('.'.join(['gluonts.mx.distribution',
                                                         *distribution_output.split(".")[:-1]]))
@@ -99,6 +103,8 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
             distr_output=self.distribution,
             lags_seq=[i + 1 for i in range(self.window)],
             batch_size=batch_size,
+            use_feat_static_cat=True if self.static_features_cat else False,
+            use_feat_static_real=True if self.static_features_real else False,
             trainer=Trainer(
                 epochs=self.n_epochs,
                 num_batches_per_epoch=max(1, len(cat_ds.data_frame) // batch_size),
@@ -171,7 +177,13 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
         oby_col_name = '__gluon_timestamp'
         gby = self.ts_analysis["tss"].group_by if self.ts_analysis["tss"].group_by else []
         freq = self.ts_analysis['sample_freqs']['__default']
-        keep_cols = [self.target] + [col for col in gby]
+        keep_cols = [self.target] + [col for col in gby] + self.static_features_cat + self.static_features_real
+
+        agg_map = {self.target: 'sum'}
+        for col in self.static_features_cat:
+            agg_map[col] = 'first'
+        for col in self.static_features_real:
+            agg_map[col] = 'mean'

         if groups is None and gby:
             groups = self.groups
@@ -207,15 +219,25 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
             return None

         if gby:
-            df = df.groupby(by=gby[0]).resample(freq).sum().reset_index(level=[0])  # @TODO: multiple group support and remove groups without enough data
+            df = df.groupby(by=gby[0]).resample(freq).agg(agg_map).reset_index(level=[0])
         else:
-            df = df.resample(freq).sum()
+            df = df.resample(freq).agg(agg_map)
             gby = '__default_group'
             df[gby] = '__default_group'

         df[oby_col_name] = df.index
-        ds = PandasDataset.from_long_dataframe(df, target=self.target, item_id=gby, freq=freq, timestamp=oby_col_name)
+        ds = PandasDataset.from_long_dataframe(
+            df,
+            target=self.target,
+            item_id=gby,
+            freq=freq,
+            timestamp=oby_col_name,
+            # feat_dynamic_real=None,
+            # feat_dynamic_cat=None,
+            feat_static_real=self.static_features_real if self.static_features_real else None,
+            feat_static_cat=self.static_features_cat if self.static_features_cat else None,
+        )

         return ds
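The agg_map introduced in patch 27 exists because resampling to a regular frequency must treat columns differently: the target is summed over each resampled bucket, while static features should pass through unchanged, hence 'first' for categoricals and 'mean' for reals. A self-contained sketch with hypothetical column names:

    import pandas as pd

    idx = pd.date_range('2023-01-01', periods=6, freq='D')
    df = pd.DataFrame({'target': [1, 2, 3, 4, 5, 6],
                       'store_type': ['A'] * 6,     # static categorical (illustrative)
                       'store_size': [120.0] * 6},  # static real (illustrative)
                      index=idx)

    agg_map = {'target': 'sum', 'store_type': 'first', 'store_size': 'mean'}
    print(df.resample('2D').agg(agg_map))  # target adds up; statics survive intact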
From 0ce12c1a36c0df4f8a0a468798267a9697a95b05 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 1 Mar 2023 02:39:00 -0300
Subject: [PATCH 28/31] add static cat features

---
 lightwood/mixer/gluonts.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index d44dae897..92972ce08 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pandas as pd
 import mxnet as mx
+from sklearn.preprocessing import OrdinalEncoder

 from gluonts.dataset.pandas import PandasDataset

@@ -73,6 +74,7 @@ def __init__(
         self.patience = early_stop_patience
         self.seed = seed
         self.trains_once = True
+        self.static_features_cat_encoders = {}
         self.static_features_cat = static_features_cat if static_features_cat else []
         self.static_features_real = static_features_real if static_features_real else []

@@ -92,6 +94,9 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         # prepare data
         cat_ds = ConcatedEncodedDs([train_data, dev_data])
+        for col in self.static_features_cat:
+            self.static_features_cat_encoders[col] = OrdinalEncoder().fit(cat_ds.data_frame[col].values.reshape(-1, 1))
+
         fit_groups = list(cat_ds.data_frame[self.grouped_by[0]].unique()) if self.grouped_by != ['__default'] else None
         train_ds = self._make_initial_ds(cat_ds.data_frame, phase='train', groups=fit_groups)
         batch_size = 32
@@ -227,14 +232,15 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
             df[gby] = '__default_group'

         df[oby_col_name] = df.index
+        for col in self.static_features_cat:
+            df[col] = self.static_features_cat_encoders[col].transform(df[col].values.reshape(-1,1))
+
         ds = PandasDataset.from_long_dataframe(
             df,
             target=self.target,
             item_id=gby,
             freq=freq,
             timestamp=oby_col_name,
-            # feat_dynamic_real=None,
-            # feat_dynamic_cat=None,
             feat_static_real=self.static_features_real if self.static_features_real else None,
             feat_static_cat=self.static_features_cat if self.static_features_cat else None,
         )

From ae18b5d96ae4cfdafe0b46ab64d8a6ec6c938259 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 1 Mar 2023 02:39:26 -0300
Subject: [PATCH 29/31] lint: flake8

---
 lightwood/mixer/gluonts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 92972ce08..ac7d2bc5f 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -233,7 +233,7 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
         df[oby_col_name] = df.index
         for col in self.static_features_cat:
-            df[col] = self.static_features_cat_encoders[col].transform(df[col].values.reshape(-1,1))
+            df[col] = self.static_features_cat_encoders[col].transform(df[col].values.reshape(-1, 1))

         ds = PandasDataset.from_long_dataframe(
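Patch 28's encoder bookkeeping follows from GluonTS expecting integer codes for static categorical features: one OrdinalEncoder per column is fit during fit(), and the very same encoder is reused at inference, so codes stay consistent between phases. A small sketch (the region values are illustrative, not from the repository):

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    train_vals = np.array(['north', 'south', 'north']).reshape(-1, 1)
    enc = OrdinalEncoder().fit(train_vals)        # what self.static_features_cat_encoders stores per column
    print(enc.transform(np.array([['south']])))  # [[1.]]: the same integer code at predict time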
From fc49017a063a0435db25d7cdaa12d5771014b832 Mon Sep 17 00:00:00 2001
From: Max Stepanov
Date: Mon, 13 Mar 2023 20:38:29 +0300
Subject: [PATCH 30/31] bound pandas requirement

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0c943a690..990f4de54 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ mindsdb-evaluator ==0.0.6
 numpy
 nltk >=3,<3.6
 python-dateutil >=2.8.1
-pandas >=1.1.5
+pandas >=1.1.5, <1.5.0
 schema >=0.6.8
 torch >=1.13.0, <1.14.0
 requests >=2.0.0

From 019df037d03b8c72545f3d3447f225ba7bb82fbe Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 13 Mar 2023 17:51:21 -0300
Subject: [PATCH 31/31] version bump: 23.3.2.0

---
 lightwood/__about__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/__about__.py b/lightwood/__about__.py
index e8fa521e5..ede584c93 100644
--- a/lightwood/__about__.py
+++ b/lightwood/__about__.py
@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '23.2.1.0'
+__version__ = '23.3.2.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "community@mindsdb.com"
 __author__ = 'MindsDB Inc'
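The pandas upper bound in patch 30 pins away from the 1.5 series, presumably to avoid API changes the pinned GluonTS code was not yet ready for. If a louder failure mode than pip resolution is wanted, a hypothetical runtime guard (not part of the repository) could assert the same range:

    import pandas as pd
    from packaging.version import Version

    assert Version('1.1.5') <= Version(pd.__version__) < Version('1.5.0'), \
        f'pandas {pd.__version__} is outside the range pinned in requirements.txt'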