From c00c301975a4455e4d30a8454360de1a568f5c2b Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Sun, 12 Feb 2023 11:32:49 -0300
Subject: [PATCH 01/31] update import

---
 lightwood/mixer/gluonts.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index ce264bef4..6083396cd 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -8,8 +8,7 @@
 
 from gluonts.dataset.pandas import PandasDataset
-from gluonts.model.deepar import DeepAREstimator  # @TODO: support for other estimators
-from gluonts.mx import Trainer
+from gluonts.mx import DeepAREstimator, Trainer  # @TODO: support for other estimators
 from gluonts.mx.trainer.callback import TrainingHistory
 from gluonts.mx.distribution.student_t import StudentTOutput

From aff0d908b49d5c0418fb257f5c9b25a7ea8b57d8 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Sun, 12 Feb 2023 14:28:29 -0300
Subject: [PATCH 02/31] faster, no indexing

---
 lightwood/data/encoded_ds.py                  |  6 +--
 lightwood/mixer/gluonts.py                    | 41 ++++++++--------
 lightwood/mixer/lightgbm_array.py             |  2 +-
 lightwood/mixer/neural_ts.py                  |  2 +-
 lightwood/mixer/nhits.py                      |  2 +-
 lightwood/mixer/sktime.py                     |  2 +-
 tests/integration/advanced/test_timeseries.py | 48 +++++++++----------
 7 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py
index 504335a07..b7f90993f 100644
--- a/lightwood/data/encoded_ds.py
+++ b/lightwood/data/encoded_ds.py
@@ -144,7 +144,7 @@ class ConcatedEncodedDs(EncodedDs):
     def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None:
         # @TODO: missing super() call here?
         self.encoded_ds_arr = encoded_ds_arr
-        self.encoded_ds_lenghts = [len(x) for x in self.encoded_ds_arr]
+        self.encoded_ds_lengths = [len(x) for x in self.encoded_ds_arr]
         self.encoders = self.encoded_ds_arr[0].encoders
         self.encoder_spans = self.encoded_ds_arr[0].encoder_spans
         self.target = self.encoded_ds_arr[0].target
@@ -155,13 +155,13 @@ def __len__(self):
         See `lightwood.data.encoded_ds.EncodedDs.__len__()`.
         """
         # @TODO: behavior here is not intuitive
-        return max(0, np.sum(self.encoded_ds_lenghts) - 2)
+        return max(0, np.sum(self.encoded_ds_lengths) - 2)
 
     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         See `lightwood.data.encoded_ds.EncodedDs.__getitem__()`.
         """
-        for ds_idx, length in enumerate(self.encoded_ds_lenghts):
+        for ds_idx, length in enumerate(self.encoded_ds_lengths):
             if idx - length < 0:
                 return self.encoded_ds_arr[ds_idx][idx]
             else:
diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 6083396cd..6d01bcc02 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -13,6 +13,7 @@
 from gluonts.mx.distribution.student_t import StudentTOutput
 
 from lightwood.helpers.log import log
+from lightwood.helpers.ts import get_group_matches
 from lightwood.mixer.base import BaseMixer
 from lightwood.api.types import PredictionArguments
 from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs
@@ -39,6 +40,8 @@ def __init__(
         """
         Wrapper around GluonTS probabilistic deep learning models. For now, only DeepAR is supported.
 
+        Due to inference speed, predictions are only generated for the last data point (as opposed to other mixers).
+
        :param stop_after: time budget in seconds.
        :param target: column to forecast.
        :param horizon: length of forecasted horizon.
@@ -128,11 +131,15 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
         """
         Calls the mixer to emit forecasts.
         """  # noqa
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
-        ydf = pd.DataFrame(0,  # zero-filled
-                           index=np.arange(length),
-                           columns=['prediction', 'lower', 'upper'],
-                           dtype=object)
+        mx.random.seed(self.seed)
+        np.random.seed(self.seed)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+
+        ydf = pd.DataFrame(index=np.arange(length), dtype=object)
+        init_arr = [0 for _ in range(self.ts_analysis['tss'].horizon)]
+        for col in ['prediction', 'lower', 'upper']:
+            ydf.at[:, col] = [init_arr for _ in range(len(ydf))]
+        ydf['index'] = ds.data_frame.index
 
         conf = args.fixed_confidence if args.fixed_confidence else 0.9
         ydf['confidence'] = conf
@@ -140,20 +147,16 @@
         gby = self.ts_analysis["tss"].group_by if self.ts_analysis["tss"].group_by else []
         groups = ds.data_frame[gby[0]].unique().tolist() if gby else None
 
-        for idx in range(length):
-            df = ds.data_frame.iloc[:idx] if idx != 0 else None
-            input_ds = self._make_initial_ds(df, groups=groups)
-            if not input_ds:
-                # edge case: new group
-                for col in ['prediction', 'lower', 'upper']:
-                    ydf.at[idx, col] = [0 for _ in range(self.ts_analysis["tss"].horizon)]
-            else:
-                mx.random.seed(self.seed)
-                np.random.seed(self.seed)
-                forecasts = list(self.model.predict(input_ds))[0]
-                ydf.at[idx, 'prediction'] = [entry for entry in forecasts.quantile(0.5)]
-                ydf.at[idx, 'lower'] = [entry for entry in forecasts.quantile(1 - conf)]
-                ydf.at[idx, 'upper'] = [entry for entry in forecasts.quantile(conf)]
+        df = ds.data_frame
+        ydf['__original_index'] = df['__mdb_original_index'].values
+        input_ds = self._make_initial_ds(df, groups=groups)  # TODO test with novel group
+        forecasts = list(self.model.predict(input_ds))
+        for group, group_forecast in zip(groups, forecasts):
+            _, subdf = get_group_matches(df, (group, ), gby)
+            idx = ydf[ydf['__original_index'] == max(subdf['__mdb_original_index'])].index.values[0]
+            ydf.at[idx, 'prediction'] = [entry for entry in group_forecast.quantile(0.5)]
+            ydf.at[idx, 'lower'] = [entry for entry in group_forecast.quantile(1 - conf)]
+            ydf.at[idx, 'upper'] = [entry for entry in group_forecast.quantile(conf)]
 
         return ydf
 
diff --git a/lightwood/mixer/lightgbm_array.py b/lightwood/mixer/lightgbm_array.py
index 89e1b9d8b..a52bc0c90 100644
--- a/lightwood/mixer/lightgbm_array.py
+++ b/lightwood/mixer/lightgbm_array.py
@@ -85,7 +85,7 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
             log.warning('This model does not output probability estimates')
 
         original_df = deepcopy(ds.data_frame)
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         ydf = pd.DataFrame(0,  # zero-filled
                            index=np.arange(length),
                            columns=[f'prediction_{i}' for i in range(self.horizon)])
diff --git a/lightwood/mixer/neural_ts.py b/lightwood/mixer/neural_ts.py
index f73b44dd0..ef34b53f7 100644
--- a/lightwood/mixer/neural_ts.py
+++ b/lightwood/mixer/neural_ts.py
@@ -140,7 +140,7 @@ def __call__(self, ds: EncodedDs,
         all_probs: List[List[float]] = []
         rev_map = {}
 
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         pred_cols = [f'prediction_{i}' for i in range(self.timeseries_settings.horizon)]
         ydf = pd.DataFrame(0,  # zero-filled
                            index=np.arange(length),
diff --git a/lightwood/mixer/nhits.py b/lightwood/mixer/nhits.py
index a60bdeb85..cc02c0a37 100644
--- a/lightwood/mixer/nhits.py
+++ b/lightwood/mixer/nhits.py
@@ -162,7 +162,7 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
         if args.predict_proba:
             log.warning('This mixer does not output probability estimates')
 
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         ydf = pd.DataFrame(0,  # zero-filled
                            index=np.arange(length),
                            columns=['prediction', 'lower', 'upper'],
diff --git a/lightwood/mixer/sktime.py b/lightwood/mixer/sktime.py
index 50ac7e066..6df73d8b4 100644
--- a/lightwood/mixer/sktime.py
+++ b/lightwood/mixer/sktime.py
@@ -240,7 +240,7 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
 
         df = deepcopy(ds.data_frame)
         df = df.rename_axis('__sktime_index').reset_index()
-        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        length = sum(ds.encoded_ds_lengths) if isinstance(ds, ConcatedEncodedDs) else len(ds)
         ydf = pd.DataFrame(0,  # zero-filled
                            index=df.index,
                            columns=['prediction'],
diff --git a/tests/integration/advanced/test_timeseries.py b/tests/integration/advanced/test_timeseries.py
index c0867b62f..2709e15ab 100644
--- a/tests/integration/advanced/test_timeseries.py
+++ b/tests/integration/advanced/test_timeseries.py
@@ -576,28 +576,26 @@ def test_12_gluonts(self):
 
         if GluonTSMixer is not None:
             data = pd.read_csv('tests/data/arrivals.csv')
-            for i, subdata in enumerate([data, data[data['Country'] == 'US']]):
-                order_by = 'T'
-                train_df, test_df = self.split_arrivals(subdata, grouped=False)
-                pdef = {'target': 'Traffic',
-                        'use_default_analysis': False,
-                        'timeseries_settings': {
-                            'order_by': order_by,
-                            'window': 4 * 5,
-                            'horizon': 4 * 2}}
-                if i == 0:
-                    pdef['timeseries_settings']['group_by'] = ['Country']
-                jai = json_ai_from_problem(train_df, ProblemDefinition.from_dict(pdef))
-                jai.model['args']['submodels'] = [{
-                    "module": "GluonTSMixer",
-                    "args": {}
-                }]
-                predictor = predictor_from_json_ai(jai)
-                predictor.learn(train_df)
-                predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
-
-                # adjust
-                adjust_n_epochs = 5
-                predictor.adjust(test_df, adjust_args={'n_epochs': adjust_n_epochs})
-                predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
-                assert predictor.mixers[0].n_epochs == adjust_n_epochs
+            order_by = 'T'
+            train_df, test_df = self.split_arrivals(data, grouped=True)
+            pdef = {'target': 'Traffic',
+                    'timeseries_settings': {
+                        'order_by': order_by,
+                        'group_by': ['Country'],
+                        'window': 4 * 5,
+                        'horizon': 4 * 2}}
+            jai = json_ai_from_problem(train_df, ProblemDefinition.from_dict(pdef))
+            jai.model['args']['submodels'] = [{
+                "module": "GluonTSMixer",
+                "args": {}
+            }]
+            predictor = predictor_from_json_ai(jai)
+            predictor.learn(train_df)
+            predictor.predict(test_df, args={'time_format': 'infer'})
+            predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
+
+            # adjust
+            adjust_n_epochs = 5
+            predictor.adjust(test_df, adjust_args={'n_epochs': adjust_n_epochs})
+            predictor.predict(test_df.iloc[[-1]], args={'time_format': 'infer'})
+            assert predictor.mixers[0].n_epochs == adjust_n_epochs

From b090e6ae128cad4415daa47490505b14aac767c4 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 13 Feb 2023 21:58:18 -0300
Subject: [PATCH 03/31] fit_on_all for gluonTS

---
 lightwood/api/json_ai.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py
index 1b8c32378..38d44b5a9 100644
--- a/lightwood/api/json_ai.py
+++ b/lightwood/api/json_ai.py
@@ -622,7 +622,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
             mixers[i]["args"]["use_stl"] = mixers[i]["args"].get("use_stl", "False")
 
-        elif mixers[i]["module"] in ("NHitsMixer", "GluonTSMixer"):
+        elif mixers[i]["module"] in ("NHitsMixer", ):
             mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
             mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
@@ -630,12 +630,12 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             )
             problem_definition.fit_on_all = False  # takes too long otherwise
 
-        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer"):
+        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer", "GluonTSMixer"):
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
                 "ts_analysis", "$ts_analysis"
             )
-            if "horizon" not in mixers[i]["args"]:
-                mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
+            mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
+            mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
 
             # enforce fit_on_all if this mixer is specified
             problem_definition.fit_on_all = True

From a7f0bfc39981652e242f12b97bfffaee8bcfaf46 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 14 Feb 2023 01:37:55 -0300
Subject: [PATCH 04/31] add trains once option

---
 lightwood/api/json_ai.py   | 12 ++++++++----
 lightwood/mixer/base.py    |  4 +++-
 lightwood/mixer/gluonts.py |  1 +
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py
index 38d44b5a9..586b1f1c4 100644
--- a/lightwood/api/json_ai.py
+++ b/lightwood/api/json_ai.py
@@ -1025,7 +1025,12 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
     trained_mixers = []
     for mixer in self.mixers:
         try:
-            self.fit_mixer(mixer, encoded_train_data, encoded_dev_data)
+            if mixer.trains_once:
+                self.fit_mixer(mixer,
+                               ConcatedEncodedDs([encoded_train_data, encoded_dev_data]),
+                               encoded_test_data)
+            else:
+                self.fit_mixer(mixer, encoded_train_data, encoded_dev_data)
             trained_mixers.append(mixer)
         except Exception as e:
             log.warning(f'Exception: {{e}} when training mixer: {{mixer}}')
@@ -1107,7 +1112,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
 
     log.info('Updating the mixers')
 
     for mixer in self.mixers:
-        mixer.partial_fit(train_data, dev_data, adjust_args)
     """  # noqa
     adjust_body = align(adjust_body, 2)
@@ -1154,8 +1159,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
 
 # SET `json_ai.problem_definition.fit_on_all=False` TO TURN THIS BLOCK OFF.
 # Update the mixers with partial fit
-if self.problem_definition.fit_on_all:
-
+if self.problem_definition.fit_on_all and all([not m.trains_once for m in self.mixers]):
     log.info(f'[Learn phase 8/{n_phases}] - Adjustment on validation requested')
     self.adjust(enc_train_test["test"].data_frame,
                 ConcatedEncodedDs([enc_train_test["train"],
                                    enc_train_test["dev"]]).data_frame,
diff --git a/lightwood/mixer/base.py b/lightwood/mixer/base.py
index ba98f0865..a91bb6946 100644
--- a/lightwood/mixer/base.py
+++ b/lightwood/mixer/base.py
@@ -21,11 +21,12 @@ class BaseMixer:
     - stable: If set to `True`, this mixer should always work. Any mixer with `stable=False` can be expected to fail under some circumstances.
     - fit_data_len: Length of the training data.
     - supports_proba: For classification tasks, whether the mixer supports yielding per-class scores rather than only returning the predicted label.
-
+    - trains_once: If True, the mixer is trained once during learn, using all available input data (`train` and `dev` splits for training, `test` for validation). Otherwise, it trains once with the `train` split and `dev` for validation, and optionally (depending on the problem definition `fit_on_all` and mixer-wise `fit_on_dev` arguments) a second time after post-training analysis via partial_fit, with `train` and `dev` splits as training subset, and `test` split as validation. Should only be set to True for mixers that don't require post-training analysis, as otherwise actual validation data would be treated as a held-out portion, which is a mistake.
     """  # noqa
     stable: bool
     fit_data_len: int  # @TODO (Patricio): should this really be in `BaseMixer`?
     supports_proba: bool
+    trains_once: bool
 
     def __init__(self, stop_after: float):
         """
@@ -33,6 +34,7 @@ def __init__(self, stop_after: float):
         """
         self.stop_after = stop_after
         self.supports_proba = False
+        self.trains_once = False
 
     def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         """
diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 6d01bcc02..ce5ca90fd 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -70,6 +70,7 @@ def __init__(
         self.train_cache = None
         self.patience = early_stop_patience
         self.seed = seed
+        self.trains_once = True
 
         dist_module = importlib.import_module('.'.join(['gluonts.mx.distribution',
                                                         *distribution_output.split(".")[:-1]]))

From bfdbb0905c504fc8df65b9c9023393de6625170b Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 14 Feb 2023 01:45:04 -0300
Subject: [PATCH 05/31] fix mixer arg injection

---
 lightwood/api/json_ai.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py
index 586b1f1c4..df57af77a 100644
--- a/lightwood/api/json_ai.py
+++ b/lightwood/api/json_ai.py
@@ -622,7 +622,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
             mixers[i]["args"]["use_stl"] = mixers[i]["args"].get("use_stl", "False")
 
-        elif mixers[i]["module"] in ("NHitsMixer", ):
+        elif mixers[i]["module"] in ("NHitsMixer", "GluonTSMixer"):
             mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
             mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
@@ -630,12 +630,12 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             )
             problem_definition.fit_on_all = False  # takes too long otherwise
 
-        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer", "GluonTSMixer"):
+        elif mixers[i]["module"] in ("SkTime", "ProphetMixer", "ETSMixer", "ARIMAMixer"):
            mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
                 "ts_analysis", "$ts_analysis"
             )
-            mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
-            mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
+            if "horizon" not in mixers[i]["args"]:
+                mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
 
             # enforce fit_on_all if this mixer is specified
             problem_definition.fit_on_all = True

From c965c74c073a9b413f6004abdbfb537794f11edf Mon Sep 17 00:00:00 2001
From: Talaat Hasanin <105648065+TalaatHasanin@users.noreply.github.com>
Date: Fri, 17 Feb 2023 17:34:29 +0200
Subject: [PATCH 06/31] Update README.md

Add main module to the sample code
---
 README.md | 56 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index de6db694f..a2eccd463 100644
--- a/README.md
+++ b/README.md
@@ -65,40 +65,44 @@ from lightwood.api.high_level import (
     predictor_from_code,
 )
 
-# Load a pandas dataset
-df = pd.read_csv(
-    "https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv"
-)
 
-# Define the prediction task by naming the target column
-pdef = ProblemDefinition.from_dict(
-    {
-        "target": "Development Index",  # column you want to predict
-    }
-)
+
+def main():
+    # Load a pandas dataset
+    df = pd.read_csv("https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv")
+
+    # Define the prediction task by naming the target column
+    pdef = ProblemDefinition.from_dict(
+        {
+            "target": "Development Index",  # column you want to predict
+        }
+    )
+
+    # Generate JSON-AI code to model the problem
+    json_ai = json_ai_from_problem(df, problem_definition=pdef)
+
+    # OPTIONAL - see the JSON-AI syntax
+    # print(json_ai.to_json())
 
-# Generate JSON-AI code to model the problem
-json_ai = json_ai_from_problem(df, problem_definition=pdef)
+    # Generate python code
+    code = code_from_json_ai(json_ai)
 
-# OPTIONAL - see the JSON-AI syntax
-#print(json_ai.to_json())
+    # OPTIONAL - see generated code
+    # print(code)
 
-# Generate python code
-code = code_from_json_ai(json_ai)
+    # Create a predictor from python code
+    predictor = predictor_from_code(code)
 
-# OPTIONAL - see generated code
-#print(code)
+    # Train a model end-to-end from raw data to a finalized predictor
+    predictor.learn(df)
 
-# Create a predictor from python code
-predictor = predictor_from_code(code)
+    # Make the train/test splits and show predictions for a few examples
+    test_df = predictor.split(predictor.preprocess(df))["test"]
+    preds = predictor.predict(test_df).iloc[:10]
+    print(preds)
 
-# Train a model end-to-end from raw data to a finalized predictor
-predictor.learn(df)
 
-# Make the train/test splits and show predictions for a few examples
-test_df = predictor.split(predictor.preprocess(df))["test"]
-preds = predictor.predict(test_df).iloc[:10]
-print(preds)
+if __name__ == '__main__':
+    main()
 ```
 
 ### BYOM: Bring your own models

From f4607562abccbdeb5d87a5cc0bf944cadb2520db Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 20 Feb 2023 12:18:36 -0300
Subject: [PATCH 07/31] bump statsforecast==1.4.0

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 972251172..bffa494ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ scikit-learn >=1.0.0, <=1.0.2
 dataclasses_json >=0.5.4
 dill ==0.3.4
 sktime >=0.14.0,<0.15.0
-statsforecast ==0.7.0
+statsforecast ==1.4.0
 torch_optimizer ==0.1.0
 black >=21.9b0
 typing_extensions

From b4549a4f411a02dbeacb3eb35f69632de1918f3f Mon Sep 17 00:00:00 2001
From: Talaat Hasanin <105648065+TalaatHasanin@users.noreply.github.com>
Date: Mon, 20 Feb 2023 19:06:58 +0200
Subject: [PATCH 08/31] Add some comments

Add some comments to clarify the use of the main module
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a2eccd463..92d859ee7 100644
--- a/README.md
+++ b/README.md
@@ -100,8 +100,9 @@ def main():
     preds = predictor.predict(test_df).iloc[:10]
     print(preds)
 
-
+
+# The "main module" guard is needed for multiprocessing to work correctly on Windows
 if __name__ == '__main__':
+    # Load a pandas dataset, then define, create and train predictors
     main()
 ```

From 22c3566fa9921e5745672e9b8c20fec222828c93 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 20 Feb 2023 15:32:49 -0300
Subject: [PATCH 09/31] bump pytorch to 1.13

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bffa494ea..809c67639 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ nltk >=3,<3.6
 python-dateutil >=2.8.1
 pandas >=1.1.5
 schema >=0.6.8
-torch >=1.9.0
+torch >=1.13.0, <1.14.0
 requests >=2.0.0
 transformers
 optuna >=2.8.0,<2.10.0

From 7e45c792f6a09bdd7e61088ab98aed9fb0ae0c25 Mon Sep 17 00:00:00 2001
From: Talaat Hasanin <105648065+TalaatHasanin@users.noreply.github.com>
Date: Mon, 20 Feb 2023 21:20:20 +0200
Subject: [PATCH 10/31] Update README.md

---
 README.md | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 92d859ee7..9e272cebf 100644
--- a/README.md
+++ b/README.md
@@ -65,10 +65,10 @@ from lightwood.api.high_level import (
     predictor_from_code,
 )
 
-
-def main():
+if __name__ == '__main__':
     # Load a pandas dataset
-    df = pd.read_csv("https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv")
+    df = pd.read_csv("https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/hdi/data.csv"
+                     )
 
     # Define the prediction task by naming the target column
     pdef = ProblemDefinition.from_dict(
@@ -99,11 +99,6 @@ def main():
     test_df = predictor.split(predictor.preprocess(df))["test"]
     preds = predictor.predict(test_df).iloc[:10]
     print(preds)
-
-# The "main module" guard is needed for multiprocessing to work correctly on Windows
-if __name__ == '__main__':
-    # Load a pandas dataset, then define, create and train predictors
-    main()

From 3367cc9969f6ef2e302a8d71d3d8f18a9e11d721 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 00:05:58 -0300
Subject: [PATCH 11/31] feat: TabTransformer mixer

---
 lightwood/mixer/__init__.py       |   8 +-
 lightwood/mixer/neural.py         |   7 +-
 lightwood/mixer/tabtransformer.py | 188 ++++++++++++++++++++++++++++++
 requirements_extra.txt            |   1 +
 4 files changed, 201 insertions(+), 3 deletions(-)
 create mode 100644 lightwood/mixer/tabtransformer.py

diff --git a/lightwood/mixer/__init__.py b/lightwood/mixer/__init__.py
index 1347f3ee5..3d7c1c2fa 100644
--- a/lightwood/mixer/__init__.py
+++ b/lightwood/mixer/__init__.py
@@ -36,5 +36,11 @@
     LightGBM = None
     LightGBMArray = None
 
+try:
+    from lightwood.mixer.tabtransformer import TabTransformerMixer
+except Exception:
+    TabTransformerMixer = None
+
 __all__ = ['BaseMixer', 'Neural', 'NeuralTs', 'LightGBM', 'RandomForest', 'LightGBMArray', 'Unit', 'Regression',
-           'SkTime', 'QClassic', 'ProphetMixer', 'ETSMixer', 'ARIMAMixer', 'NHitsMixer', 'GluonTSMixer', 'XGBoostMixer']
+           'SkTime', 'QClassic', 'ProphetMixer', 'ETSMixer', 'ARIMAMixer', 'NHitsMixer', 'GluonTSMixer', 'XGBoostMixer',
+           'TabTransformerMixer']
diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py
index c285fc519..9e10a690b 100644
--- a/lightwood/mixer/neural.py
+++ b/lightwood/mixer/neural.py
@@ -63,7 +63,7 @@ def __init__(
         self.epochs_to_best = 0
         self.n_epochs = n_epochs
         self.fit_on_dev = fit_on_dev
-        self.net_class = DefaultNet if net == 'DefaultNet' else ArNet
+        self.net_name = net
         self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical]
         self.search_hyperparameters = search_hyperparameters
         self.stable = True
@@ -241,6 +241,8 @@ def _error(self, dev_dl, criterion) -> float:
         return np.mean(running_losses)
 
     def _init_net(self, ds: EncodedDs):
+        self.net_class = DefaultNet if self.net_name == 'DefaultNet' else ArNet
+
         net_kwargs = {'input_size': len(ds[0][0]),
                       'output_size': len(ds[0][1]),
                       'num_hidden': self.num_hidden,
@@ -275,7 +277,8 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         # Find learning rate
         # keep the weights
         self._init_net(train_data)
-        self.lr, self.model = self._find_lr(train_dl)
+        if not self.lr:
+            self.lr, self.model = self._find_lr(train_dl)
 
         # Keep on training
         optimizer = self._select_optimizer()
diff --git a/lightwood/mixer/tabtransformer.py b/lightwood/mixer/tabtransformer.py
new file mode 100644
index 000000000..eaf98ff0a
--- /dev/null
+++ b/lightwood/mixer/tabtransformer.py
@@ -0,0 +1,188 @@
+import time
+from copy import deepcopy
+from typing import Dict, List, Optional
+
+import torch
+# import torch.nn as nn
+import numpy as np
+import pandas as pd
+from tab_transformer_pytorch import FTTransformer
+
+# from type_infer.dtype import dtype
+from lightwood.helpers.log import log
+from lightwood.helpers.torch import LightwoodAutocast
+from lightwood.api.types import PredictionArguments
+from lightwood.helpers.device import get_device_from_name
+from lightwood.data.encoded_ds import EncodedDs
+from lightwood.encoder.base import BaseEncoder
+from lightwood.mixer.neural import Neural
+
+
+class TabTransformerMixer(Neural):
+    def __init__(
+            self,
+            stop_after: float,
+            target: str,
+            dtype_dict: Dict[str, str],
+            target_encoder: BaseEncoder,
+            fit_on_dev: bool,
+            search_hyperparameters: bool,
+            train_args: Optional[dict] = None
+    ):
+        """
+        This mixer trains a TabTransformer network (FT variant), using concatenated encoder outputs for each dataset feature as input, to predict the encoded target column representation as output.
+
+        Training logic is based on the Neural mixer; please refer to it for more details on each input parameter.
+ """ # noqa + self.train_args = train_args if train_args else {} + super().__init__( + stop_after, + target, + dtype_dict, + target_encoder, + 'FTTransformer', + False, # fit_on_dev + search_hyperparameters, + n_epochs=self.train_args.get('n_epochs', None) + ) + self.lr = self.train_args.get('lr') + self.stable = True # still experimental + + def _init_net(self, ds: EncodedDs): + self.net_class = FTTransformer + + self.model = FTTransformer( + categories=(), # unused here, as by the point it arrives to the mixer, everything is numerical # noqa + num_continuous=len(ds[0][0]), # ds.input_length, # TODO define based on DS + dim=self.train_args.get('dim', 32), + dim_out=self.train_args.get('dim_out', len(ds[0][1])), + depth=self.train_args.get('depth', 6), + heads=self.train_args.get('heads', 8), + attn_dropout=self.train_args.get('attn_dropout', 0.1), # post-attention dropout + ff_dropout=self.train_args.get('ff_dropout', 0.1), # feed forward dropout + mlp_hidden_mults=self.train_args.get('mlp_hidden_mults', (4, 2)), # relative multiples of each hidden dimension of the last mlp to logits # noqa + # mlp_act=self.train_args.get('mlp_act', nn.ReLU()), # TODO: import string from nn activations + ) + self.model.device = get_device_from_name('') + self.model.to(self.model.device) + + def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after): + epochs_to_best = 0 + best_dev_error = pow(2, 32) + running_errors = [] + best_model = self.model + + for epoch in range(1, return_model_after + 1): + self.model = self.model.train() + running_losses: List[float] = [] + for i, (X, Y) in enumerate(train_dl): + X = X.to(self.model.device) + Y = Y.to(self.model.device) + with LightwoodAutocast(): + optimizer.zero_grad() + Yh = self.model(torch.Tensor(), X) + loss = criterion(Yh, Y) + if LightwoodAutocast.active: + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + optimizer.step() + + running_losses.append(loss.item()) + if (time.time() - self.started) > stop_after: + break + + train_error = np.mean(running_losses) + epoch_error = self._error(dev_dl, criterion) + running_errors.append(epoch_error) + log.info(f'Loss @ epoch {epoch}: {epoch_error}') + + if np.isnan(train_error) or np.isnan( + running_errors[-1]) or np.isinf(train_error) or np.isinf( + running_errors[-1]): + break + + if best_dev_error > running_errors[-1]: + best_dev_error = running_errors[-1] + best_model = deepcopy(self.model) + epochs_to_best = epoch + + # manually set epoch limit + if self.n_epochs is not None: + if epoch > self.n_epochs: + break + + # automated early stopping + else: + if len(running_errors) >= 5: + delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)], + weights=[(1 / 2)**i for i in range(1, 5)]) + if delta_mean <= 0: + break + elif (time.time() - self.started) > stop_after: + break + elif running_errors[-1] < 0.0001 or train_error < 0.0001: + break + + if np.isnan(best_dev_error): + best_dev_error = pow(2, 32) + return best_model, epochs_to_best, best_dev_error + + def _error(self, dev_dl, criterion) -> float: + self.model = self.model.eval() + running_losses: List[float] = [] + with torch.no_grad(): + for X, Y in dev_dl: + X = X.to(self.model.device) + Y = Y.to(self.model.device) + Yh = self.model(torch.Tensor(), X) + running_losses.append(criterion(Yh, Y).item()) + return np.mean(running_losses) + + def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + self._fit(train_data, 
dev_data) + + def __call__(self, ds: EncodedDs, + args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: + """ + Make predictions based on datasource with the same features as the ones used for fitting + + :param ds: Predictions are generate from it + :param arg: Any additional arguments used in predicting + + :returns: A dataframe cotaining the decoded predictions and (depending on the args) additional information such as the probabilites for each target class + """ # noqa + self.model = self.model.eval() + decoded_predictions: List[object] = [] + all_probs: List[List[float]] = [] + rev_map = {} + + with torch.no_grad(): + for idx, (X, Y) in enumerate(ds): + X = X.to(self.model.device) + Yh = self.model(torch.Tensor(), X.unsqueeze(0)) + Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh + + kwargs = {} + for dep in self.target_encoder.dependencies: + kwargs['dependency_data'] = {dep: ds.data_frame.iloc[idx][[dep]].values} + + if args.predict_proba and self.supports_proba: + decoded_prediction, probs, rev_map = self.target_encoder.decode_probabilities(Yh, **kwargs) + all_probs.append(probs) + else: + decoded_prediction = self.target_encoder.decode(Yh, **kwargs) + + decoded_predictions.extend(decoded_prediction) + + ydf = pd.DataFrame({'prediction': decoded_predictions}) + + if args.predict_proba and self.supports_proba: + raw_predictions = np.array(all_probs).squeeze(axis=1) + + for idx, label in enumerate(rev_map.values()): + ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] + + return ydf \ No newline at end of file diff --git a/requirements_extra.txt b/requirements_extra.txt index 9a5b8e5aa..2b6ad2579 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1 +1,2 @@ lightgbm >=3.3.0,<=3.3.3 +tab-transformer-pytorch \ No newline at end of file From ec95efb9d1664d18e97817b5fc5498f3de779726 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 01:09:17 -0300 Subject: [PATCH 12/31] lint: flake8 --- lightwood/mixer/tabtransformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightwood/mixer/tabtransformer.py b/lightwood/mixer/tabtransformer.py index eaf98ff0a..ebff2988a 100644 --- a/lightwood/mixer/tabtransformer.py +++ b/lightwood/mixer/tabtransformer.py @@ -6,7 +6,7 @@ # import torch.nn as nn import numpy as np import pandas as pd -from tab_transformer_pytorch import FTTransformer +from tab_transformer_pytorch import TabTransformer # from type_infer.dtype import dtype from lightwood.helpers.log import log @@ -49,9 +49,9 @@ def __init__( self.stable = True # still experimental def _init_net(self, ds: EncodedDs): - self.net_class = FTTransformer + self.net_class = TabTransformer - self.model = FTTransformer( + self.model = TabTransformer( categories=(), # unused here, as by the point it arrives to the mixer, everything is numerical # noqa num_continuous=len(ds[0][0]), # ds.input_length, # TODO define based on DS dim=self.train_args.get('dim', 32), @@ -185,4 +185,4 @@ def __call__(self, ds: EncodedDs, for idx, label in enumerate(rev_map.values()): ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] - return ydf \ No newline at end of file + return ydf From e78fbb03cc81b361646e10816cc8ce0e66f9558f Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 14:33:09 -0300 Subject: [PATCH 13/31] lower bound for tab-transformer-pytorch >= 0.2.1 --- requirements_extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_extra.txt b/requirements_extra.txt index 
2b6ad2579..db3751a04 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,2 +1,2 @@ lightgbm >=3.3.0,<=3.3.3 -tab-transformer-pytorch \ No newline at end of file +tab-transformer-pytorch >= 0.2.1 From 294b81dfcc8ddcbdfa4256ddacb4794383fee23a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 17:30:14 -0300 Subject: [PATCH 14/31] feat: move tabtransformer to core reqs; simplify implementation (no duplicate code); add tests --- lightwood/api/json_ai.py | 23 ++- lightwood/mixer/neural.py | 13 +- lightwood/mixer/tabtransformer.py | 147 ++---------------- requirements.txt | 1 + requirements_extra.txt | 1 - tests/unit_tests/mixer/test_tabtransformer.py | 52 +++++++ 6 files changed, 89 insertions(+), 148 deletions(-) create mode 100644 tests/unit_tests/mixer/test_tabtransformer.py diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index df57af77a..1f21007ed 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -571,22 +571,31 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI: mixers[i]["args"]["stop_after"] = mixers[i]["args"].get("stop_after", "$problem_definition.seconds_per_mixer") # specific - if mixers[i]["module"] in ("Neural", "NeuralTs"): + if mixers[i]["module"] in ("Neural", "NeuralTs", "TabTransformerMixer"): mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get( "target_encoder", "$encoders[self.target]" ) - mixers[i]["args"]["net"] = mixers[i]["args"].get( - "net", - '"DefaultNet"' - if not tss.is_timeseries or not tss.use_previous_target - else '"ArNet"', - ) + + if mixers[i]["module"] in ("Neural", "NeuralTs"): + mixers[i]["args"]["net"] = mixers[i]["args"].get( + "net", + '"DefaultNet"' + if not tss.is_timeseries or not tss.use_previous_target + else '"ArNet"', + ) + mixers[i]["args"]["search_hyperparameters"] = mixers[i]["args"].get("search_hyperparameters", True) + mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", True) + if mixers[i]["module"] == "NeuralTs": mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get( "timeseries_settings", "$problem_definition.timeseries_settings" ) mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis") + if mixers[i]["module"] == "TabTransformerMixer": + mixers[i]["args"]["search_hyperparameters"] = mixers[i]["args"].get("search_hyperparameters", False) + mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", False) + elif mixers[i]["module"] in ("LightGBM", "XGBoostMixer"): mixers[i]["args"]["input_cols"] = mixers[i]["args"].get( "input_cols", "$input_cols" diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 9e10a690b..90040a3aa 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -80,7 +80,7 @@ def _final_tuning(self, data): for X, Y in data: X = X.to(self.model.device) Y = Y.to(self.model.device) - Yh = self.model(X) + Yh = self._net_call(X) Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh Y = torch.unsqueeze(Y, 0) if len(Y.shape) < 2 else Y @@ -134,7 +134,7 @@ def _find_lr(self, dl): Y = Y.to(self.model.device) with LightwoodAutocast(): optimizer.zero_grad() - Yh = self.model(X) + Yh = self._net_call(X) loss = criterion(Yh, Y) if LightwoodAutocast.active: scaler.scale(loss).backward() @@ -179,7 +179,7 @@ def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, r Y = Y.to(self.model.device) with LightwoodAutocast(): optimizer.zero_grad() - Yh = self.model(X) + Yh = self._net_call(X) loss = criterion(Yh, Y) if 
LightwoodAutocast.active: scaler.scale(loss).backward() @@ -236,7 +236,7 @@ def _error(self, dev_dl, criterion) -> float: for X, Y in dev_dl: X = X.to(self.model.device) Y = Y.to(self.model.device) - Yh = self.model(X) + Yh = self._net_call(X) running_losses.append(criterion(Yh, Y).item()) return np.mean(running_losses) @@ -254,6 +254,9 @@ def _init_net(self, ds: EncodedDs): self.model = self.net_class(**net_kwargs) + def _net_call(self, x: torch.Tensor) -> torch.Tensor: + return self.model(x) + # @TODO: Compare partial fitting fully on and fully off on the benchmarks! # @TODO: Writeup on the methodology for partial fitting def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: @@ -336,7 +339,7 @@ def __call__(self, ds: EncodedDs, with torch.no_grad(): for idx, (X, Y) in enumerate(ds): X = X.to(self.model.device) - Yh = self.model(X) + Yh = self._net_call(X) Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh kwargs = {} diff --git a/lightwood/mixer/tabtransformer.py b/lightwood/mixer/tabtransformer.py index ebff2988a..96613d02a 100644 --- a/lightwood/mixer/tabtransformer.py +++ b/lightwood/mixer/tabtransformer.py @@ -1,17 +1,8 @@ -import time -from copy import deepcopy -from typing import Dict, List, Optional +from typing import Dict, Optional import torch -# import torch.nn as nn -import numpy as np -import pandas as pd from tab_transformer_pytorch import TabTransformer -# from type_infer.dtype import dtype -from lightwood.helpers.log import log -from lightwood.helpers.torch import LightwoodAutocast -from lightwood.api.types import PredictionArguments from lightwood.helpers.device import get_device_from_name from lightwood.data.encoded_ds import EncodedDs from lightwood.encoder.base import BaseEncoder @@ -46,143 +37,29 @@ def __init__( n_epochs=self.train_args.get('n_epochs', None) ) self.lr = self.train_args.get('lr') - self.stable = True # still experimental + self.stable = False # still experimental def _init_net(self, ds: EncodedDs): self.net_class = TabTransformer - self.model = TabTransformer( - categories=(), # unused here, as by the point it arrives to the mixer, everything is numerical # noqa - num_continuous=len(ds[0][0]), # ds.input_length, # TODO define based on DS + categories=(), # unused, everything is numerical by now + num_continuous=len(ds[0][0]), dim=self.train_args.get('dim', 32), dim_out=self.train_args.get('dim_out', len(ds[0][1])), depth=self.train_args.get('depth', 6), heads=self.train_args.get('heads', 8), - attn_dropout=self.train_args.get('attn_dropout', 0.1), # post-attention dropout - ff_dropout=self.train_args.get('ff_dropout', 0.1), # feed forward dropout - mlp_hidden_mults=self.train_args.get('mlp_hidden_mults', (4, 2)), # relative multiples of each hidden dimension of the last mlp to logits # noqa - # mlp_act=self.train_args.get('mlp_act', nn.ReLU()), # TODO: import string from nn activations + attn_dropout=self.train_args.get('attn_dropout', 0.1), # post-attention dropout + ff_dropout=self.train_args.get('ff_dropout', 0.1), # feed forward dropout + mlp_hidden_mults=self.train_args.get('mlp_hidden_mults', (4, 2)), # relative multiples of each hidden dimension of the last mlp to logits # noqa + # mlp_act=self.train_args.get('mlp_act', nn.ReLU()), # TODO: import str from nn activations ) self.model.device = get_device_from_name('') self.model.to(self.model.device) - def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after): - epochs_to_best = 0 - best_dev_error = pow(2, 32) - running_errors = [] - 
best_model = self.model - - for epoch in range(1, return_model_after + 1): - self.model = self.model.train() - running_losses: List[float] = [] - for i, (X, Y) in enumerate(train_dl): - X = X.to(self.model.device) - Y = Y.to(self.model.device) - with LightwoodAutocast(): - optimizer.zero_grad() - Yh = self.model(torch.Tensor(), X) - loss = criterion(Yh, Y) - if LightwoodAutocast.active: - scaler.scale(loss).backward() - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - optimizer.step() - - running_losses.append(loss.item()) - if (time.time() - self.started) > stop_after: - break - - train_error = np.mean(running_losses) - epoch_error = self._error(dev_dl, criterion) - running_errors.append(epoch_error) - log.info(f'Loss @ epoch {epoch}: {epoch_error}') - - if np.isnan(train_error) or np.isnan( - running_errors[-1]) or np.isinf(train_error) or np.isinf( - running_errors[-1]): - break - - if best_dev_error > running_errors[-1]: - best_dev_error = running_errors[-1] - best_model = deepcopy(self.model) - epochs_to_best = epoch - - # manually set epoch limit - if self.n_epochs is not None: - if epoch > self.n_epochs: - break - - # automated early stopping - else: - if len(running_errors) >= 5: - delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)], - weights=[(1 / 2)**i for i in range(1, 5)]) - if delta_mean <= 0: - break - elif (time.time() - self.started) > stop_after: - break - elif running_errors[-1] < 0.0001 or train_error < 0.0001: - break - - if np.isnan(best_dev_error): - best_dev_error = pow(2, 32) - return best_model, epochs_to_best, best_dev_error - - def _error(self, dev_dl, criterion) -> float: - self.model = self.model.eval() - running_losses: List[float] = [] - with torch.no_grad(): - for X, Y in dev_dl: - X = X.to(self.model.device) - Y = Y.to(self.model.device) - Yh = self.model(torch.Tensor(), X) - running_losses.append(criterion(Yh, Y).item()) - return np.mean(running_losses) + def _net_call(self, x: torch.Tensor) -> torch.Tensor: + x = torch.unsqueeze(x, 0) if len(x.shape) < 2 else x + return self.model(torch.Tensor(), x) def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + """ Skip the usual partial_fit call at the end. 
""" # noqa self._fit(train_data, dev_data) - - def __call__(self, ds: EncodedDs, - args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: - """ - Make predictions based on datasource with the same features as the ones used for fitting - - :param ds: Predictions are generate from it - :param arg: Any additional arguments used in predicting - - :returns: A dataframe cotaining the decoded predictions and (depending on the args) additional information such as the probabilites for each target class - """ # noqa - self.model = self.model.eval() - decoded_predictions: List[object] = [] - all_probs: List[List[float]] = [] - rev_map = {} - - with torch.no_grad(): - for idx, (X, Y) in enumerate(ds): - X = X.to(self.model.device) - Yh = self.model(torch.Tensor(), X.unsqueeze(0)) - Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh - - kwargs = {} - for dep in self.target_encoder.dependencies: - kwargs['dependency_data'] = {dep: ds.data_frame.iloc[idx][[dep]].values} - - if args.predict_proba and self.supports_proba: - decoded_prediction, probs, rev_map = self.target_encoder.decode_probabilities(Yh, **kwargs) - all_probs.append(probs) - else: - decoded_prediction = self.target_encoder.decode(Yh, **kwargs) - - decoded_predictions.extend(decoded_prediction) - - ydf = pd.DataFrame({'prediction': decoded_predictions}) - - if args.predict_proba and self.supports_proba: - raw_predictions = np.array(all_probs).squeeze(axis=1) - - for idx, label in enumerate(rev_map.values()): - ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] - - return ydf diff --git a/requirements.txt b/requirements.txt index bffa494ea..c275bd659 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ langid==1.1.6 pydateinfer==0.3.0 protobuf<3.21.0 xgboost>=1.6.0, <=1.8.0 +tab-transformer-pytorch >= 0.2.1 typing-inspect six regex diff --git a/requirements_extra.txt b/requirements_extra.txt index db3751a04..9a5b8e5aa 100644 --- a/requirements_extra.txt +++ b/requirements_extra.txt @@ -1,2 +1 @@ lightgbm >=3.3.0,<=3.3.3 -tab-transformer-pytorch >= 0.2.1 diff --git a/tests/unit_tests/mixer/test_tabtransformer.py b/tests/unit_tests/mixer/test_tabtransformer.py new file mode 100644 index 000000000..29ae7e40e --- /dev/null +++ b/tests/unit_tests/mixer/test_tabtransformer.py @@ -0,0 +1,52 @@ +import unittest +import numpy as np +import pandas as pd +from sklearn.metrics import balanced_accuracy_score +from lightwood.api.types import ProblemDefinition +from lightwood.api.high_level import json_ai_from_problem, predictor_from_json_ai, JsonAI, code_from_json_ai, predictor_from_code # noqa + + +np.random.seed(42) + + +class TestBasic(unittest.TestCase): + def get_submodels(self): + submodels = [ + { + 'module': 'TabTransformerMixer', + 'args': { + 'train_args': {'n_epochs': 5}, + } + }, + ] + return submodels + + def test_0_regression(self): + df = pd.read_csv('tests/data/concrete_strength.csv')[:500] + target = 'concrete_strength' + + pdef = ProblemDefinition.from_dict({'target': target}) + jai = json_ai_from_problem(df, pdef) + + jai.model['args']['submodels'] = self.get_submodels() + code = code_from_json_ai(jai) + predictor = predictor_from_code(code) + + predictor.learn(df) + predictor.predict(df) + + def test_1_binary(self): + df = pd.read_csv('tests/data/ionosphere.csv')[:100] + target = 'target' + + pdef = ProblemDefinition.from_dict({'target': target, 'unbias_target': False}) + jai = json_ai_from_problem(df, pdef) + jai.model['args']['submodels'] = self.get_submodels() + code = code_from_json_ai(jai) + 
predictor = predictor_from_code(code) + + predictor.learn(df) + predictions = predictor.predict(df) + + acc = balanced_accuracy_score(df[target], predictions['prediction']) + self.assertTrue(all([0 <= p <= 1 for p in predictions['confidence']])) From 10be2c5e3cbb954e040ec5162c8f02056e396ed6 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 17:35:42 -0300 Subject: [PATCH 15/31] lint: flake8 --- tests/unit_tests/mixer/test_tabtransformer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit_tests/mixer/test_tabtransformer.py b/tests/unit_tests/mixer/test_tabtransformer.py index 29ae7e40e..0fc74bb6a 100644 --- a/tests/unit_tests/mixer/test_tabtransformer.py +++ b/tests/unit_tests/mixer/test_tabtransformer.py @@ -1,7 +1,6 @@ import unittest import numpy as np import pandas as pd -from sklearn.metrics import balanced_accuracy_score from lightwood.api.types import ProblemDefinition from lightwood.api.high_level import json_ai_from_problem, predictor_from_json_ai, JsonAI, code_from_json_ai, predictor_from_code # noqa @@ -48,5 +47,4 @@ def test_1_binary(self): predictor.learn(df) predictions = predictor.predict(df) - acc = balanced_accuracy_score(df[target], predictions['prediction']) self.assertTrue(all([0 <= p <= 1 for p in predictions['confidence']])) From 494ccbfcaf0803f42527ac64261b28e18cbb9035 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 21:34:48 -0300 Subject: [PATCH 16/31] fix: 1096 --- lightwood/analysis/nc/calibrate.py | 23 ++++++++++++++++------- lightwood/data/timeseries_transform.py | 2 +- lightwood/helpers/ts.py | 5 ++++- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index 5357c0ad3..789bea7e3 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -105,16 +105,17 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: icp_df = deepcopy(ns.data) # setup prediction cache to avoid additional .predict() calls - pred_is_list = isinstance(ns.normal_predictions['prediction'], list) and \ - isinstance(ns.normal_predictions['prediction'][0], list) + try: + pred_is_list = isinstance(ns.normal_predictions['prediction'][0], list) + except TypeError: + pred_is_list = False + if ns.is_classification: if ns.predictor.supports_proba: icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values else: if ns.is_multi_ts: - icp.nc_function.model.prediction_cache = np.array( - [p[0] for p in ns.normal_predictions['prediction']]) - preds = icp.nc_function.model.prediction_cache + preds = np.array([p[0] for p in ns.normal_predictions['prediction']]) else: preds = ns.normal_predictions['prediction'] predicted_classes = pd.get_dummies(preds).values # inflate to one-hot enc @@ -198,8 +199,14 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: # save relevant predictions in the caches, then calibrate the ICP pred_cache = icp_df.pop(f'__predicted_{ns.target}').values - if ns.is_multi_ts: + if ns.is_multi_ts and ns.is_classification: + pred_cache = pd.get_dummies(np.array([p[0] for p in pred_cache])).values # TODO: don't use dummies if not all columns are present, use OHE instead + # el + elif ns.is_multi_ts: pred_cache = np.array([np.array(p) for p in pred_cache]) + # elif ns.is_classification: + # pred_cache = pd.get_dummies(pred_cache).values # inflate to one-hot enc + icps[tuple(group)].nc_function.model.prediction_cache = pred_cache icp_df, y = 
clean_df(icp_df, ns, output.get('label_encoders', None)) if icps[tuple(group)].nc_function.normalizer is not None: @@ -386,6 +393,8 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] for i in range(1, ns.tss.horizon)] icp.nc_function.model.prediction_cache = X[target_cols].values [X.pop(col) for col in target_cols] + elif is_multi_ts and is_categorical: + icp.nc_function.model.prediction_cache = pd.get_dummies(X.pop(ns.target_name)).values else: icp.nc_function.model.prediction_cache = X.pop(ns.target_name).values if icp.nc_function.normalizer: @@ -431,7 +440,7 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] all_ranges = np.array([icp.predict(X.values)]) all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) significances = get_categorical_conf(all_confs) - result.loc[X.index, 'significance'] = significances + result.loc[X.index, 'significance'] = significances.flatten() row_insights['confidence'] = result['significance'] diff --git a/lightwood/data/timeseries_transform.py b/lightwood/data/timeseries_transform.py index a57ee526a..2204e15d6 100644 --- a/lightwood/data/timeseries_transform.py +++ b/lightwood/data/timeseries_transform.py @@ -69,7 +69,7 @@ def transform_timeseries( subsets = [] for group in groups: if (tss.group_by and group != '__default') or not tss.group_by: - idxs, subset = get_group_matches(data, group, tss.group_by) + idxs, subset = get_group_matches(data, group, tss.group_by, copy=True) if subset.shape[0] > 0: if periods.get(group, periods['__default']) == 0 and subset.shape[0] > 1: raise Exception( diff --git a/lightwood/helpers/ts.py b/lightwood/helpers/ts.py index 0acaaab27..445492cf6 100644 --- a/lightwood/helpers/ts.py +++ b/lightwood/helpers/ts.py @@ -18,7 +18,8 @@ def get_ts_groups(df: pd.DataFrame, tss) -> list: def get_group_matches( data: Union[pd.Series, pd.DataFrame], combination: tuple, - group_columns: List[str] + group_columns: List[str], + copy: bool = False ) -> Tuple[list, pd.DataFrame]: """Given a particular group combination, return the data subset that belongs to it.""" @@ -34,6 +35,8 @@ def get_group_matches( for val, col in zip(combination, group_columns): subset = subset[subset[col] == val] if len(subset) > 0: + if copy: + subset = subset.copy() return list(subset.index), subset else: return [], pd.DataFrame() From c5508d279cb92f8bff2a8e4ba08f23f53bab1b87 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 21:36:33 -0300 Subject: [PATCH 17/31] cleanup --- lightwood/analysis/nc/calibrate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index 789bea7e3..8b8e43b95 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -201,11 +201,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: pred_cache = icp_df.pop(f'__predicted_{ns.target}').values if ns.is_multi_ts and ns.is_classification: pred_cache = pd.get_dummies(np.array([p[0] for p in pred_cache])).values # TODO: don't use dummies if not all columns are present, use OHE instead - # el elif ns.is_multi_ts: pred_cache = np.array([np.array(p) for p in pred_cache]) - # elif ns.is_classification: - # pred_cache = pd.get_dummies(pred_cache).values # inflate to one-hot enc icps[tuple(group)].nc_function.model.prediction_cache = pred_cache icp_df, y = clean_df(icp_df, ns, output.get('label_encoders', None)) @@ -528,6 +525,5 @@ def _ts_assign_confs(result, df, confs, 
significances, tss) -> pd.DataFrame: added_cols = [f'{base_col}_timestep_{t}' for t in range(1, tss.horizon)] cols = [base_col] + added_cols result.loc[df.index, base_col] = result.loc[df.index, cols].values.tolist() - # result[base_col] = result[cols].values.tolist() return result From d82e3428425474ff19631f651499707da2c0b3bc Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 21:58:28 -0300 Subject: [PATCH 18/31] rm get_dummies --- lightwood/analysis/nc/calibrate.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index 8b8e43b95..fb2e2a19f 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -118,7 +118,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: preds = np.array([p[0] for p in ns.normal_predictions['prediction']]) else: preds = ns.normal_predictions['prediction'] - predicted_classes = pd.get_dummies(preds).values # inflate to one-hot enc + predicted_classes = output['label_encoders'].transform(preds.reshape(-1, 1)) # inflate to OHE icp.nc_function.model.prediction_cache = predicted_classes elif ns.is_multi_ts or pred_is_list: @@ -200,7 +200,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: # save relevant predictions in the caches, then calibrate the ICP pred_cache = icp_df.pop(f'__predicted_{ns.target}').values if ns.is_multi_ts and ns.is_classification: - pred_cache = pd.get_dummies(np.array([p[0] for p in pred_cache])).values # TODO: don't use dummies if not all columns are present, use OHE instead + # output['label_encoders'].transform(preds.reshape(-1, 1)) + pred_cache = output['label_encoders'].transform([[p[0] for p in pred_cache]]) elif ns.is_multi_ts: pred_cache = np.array([np.array(p) for p in pred_cache]) @@ -338,7 +339,8 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] for icol, cat_col in enumerate(all_cat_cols): row_insights.loc[X.index, cat_col] = class_dists[:, icol] else: - class_dists = pd.get_dummies(preds).values + ohe_enc = ns.analysis['label_encoders'] + class_dists = ohe_enc.transform(np.array([p[0] for p in preds]).reshape(-1, 1)) base_icp.nc_function.model.prediction_cache = class_dists @@ -391,7 +393,10 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object] icp.nc_function.model.prediction_cache = X[target_cols].values [X.pop(col) for col in target_cols] elif is_multi_ts and is_categorical: - icp.nc_function.model.prediction_cache = pd.get_dummies(X.pop(ns.target_name)).values + ohe_enc = ns.analysis['label_encoders'] + preds = X.pop(ns.target_name).values + pred_cache = ohe_enc.transform(np.array([p[0] for p in preds]).reshape(-1, 1)) + icp.nc_function.model.prediction_cache = pred_cache else: icp.nc_function.model.prediction_cache = X.pop(ns.target_name).values if icp.nc_function.normalizer: From d055585de0578ef11c518aff487971d34a34ce5e Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 21 Feb 2023 22:28:01 -0300 Subject: [PATCH 19/31] fix catch --- lightwood/analysis/nc/calibrate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index fb2e2a19f..9c0d576f2 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -107,7 +107,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: # setup prediction cache to 
From d055585de0578ef11c518aff487971d34a34ce5e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 22:28:01 -0300
Subject: [PATCH 19/31] fix catch

---
 lightwood/analysis/nc/calibrate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index fb2e2a19f..9c0d576f2 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -107,7 +107,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         # setup prediction cache to avoid additional .predict() calls
         try:
             pred_is_list = isinstance(ns.normal_predictions['prediction'][0], list)
-        except TypeError:
+        except KeyError:
             pred_is_list = False

         if ns.is_classification:

From d1629d9c9830df758043940548893cffb2588566 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 22:57:04 -0300
Subject: [PATCH 20/31] fix: values.reshape

---
 lightwood/analysis/nc/calibrate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index 9c0d576f2..c1f3ff7ce 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -118,7 +118,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
                 preds = np.array([p[0] for p in ns.normal_predictions['prediction']])
             else:
                 preds = ns.normal_predictions['prediction']
-            predicted_classes = output['label_encoders'].transform(preds.reshape(-1, 1))  # inflate to OHE
+            predicted_classes = output['label_encoders'].transform(preds.values.reshape(-1, 1))  # inflate OHE
             icp.nc_function.model.prediction_cache = predicted_classes

         elif ns.is_multi_ts or pred_is_list:

From 736c3e90235de2906e5cb6f7b6c39e1fa01b297e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Tue, 21 Feb 2023 23:42:18 -0300
Subject: [PATCH 21/31] fix: values.reshape

---
 lightwood/analysis/nc/calibrate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index c1f3ff7ce..4da8a93cd 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -117,8 +117,8 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
             if ns.is_multi_ts:
                 preds = np.array([p[0] for p in ns.normal_predictions['prediction']])
             else:
-                preds = ns.normal_predictions['prediction']
-            predicted_classes = output['label_encoders'].transform(preds.values.reshape(-1, 1))  # inflate OHE
+                preds = ns.normal_predictions['prediction'].values
+            predicted_classes = output['label_encoders'].transform(preds.reshape(-1, 1))  # inflate OHE
             icp.nc_function.model.prediction_cache = predicted_classes

         elif ns.is_multi_ts or pred_is_list:

From da3e3bdc0b7662087b9a05d73aa65ac1abcf4f2e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 00:04:58 -0300
Subject: [PATCH 22/31] fix mdb#4360

---
 lightwood/analysis/nc/calibrate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py
index 4da8a93cd..e58ec5d82 100644
--- a/lightwood/analysis/nc/calibrate.py
+++ b/lightwood/analysis/nc/calibrate.py
@@ -366,8 +366,8 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
                     result.loc[X.index, 'significance'] = significances

                 else:
-                    significances = get_categorical_conf(all_confs.squeeze())
-                    result.loc[X.index, 'significance'] = significances
+                    significances = get_categorical_conf(all_confs)
+                    result.loc[X.index, 'significance'] = significances.flatten()

             # grouped time series, we replace bounds in rows that have a trained ICP
             if ns.analysis['icp'].get('__mdb_groups', False):
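Patches 20 and 21 chase the same regression from two sides: preds arrives as a pandas Series, and Series lost its .reshape method in pandas 1.0 (reshaping only works on the underlying numpy array), so the cache has to go through .values first. Patch 22 is the matching fix on the output side, flattening the significance array on assignment rather than squeezing the input. A minimal sketch of the Series pitfall:

    import pandas as pd

    preds = pd.Series(['a', 'b', 'a'])
    # preds.reshape(-1, 1)                   # AttributeError on modern pandas: Series has no .reshape
    as_column = preds.values.reshape(-1, 1)  # the numpy detour works
    print(as_column.shape)                   # (3, 1), ready for encoder.transform()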
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 type_infer ==0.0.9
-dataprep_ml ==0.0.8
+dataprep_ml ==0.0.9
 mindsdb-evaluator ==0.0.6
 numpy
 nltk >=3,<3.6

From 67c97d6d49e1b4716eb80e243ba80a1da699c71a Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 02:03:09 -0300
Subject: [PATCH 24/31] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 30718a06d..0c943a690 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 type_infer ==0.0.9
-dataprep_ml ==0.0.9
+dataprep_ml ==0.0.8
 mindsdb-evaluator ==0.0.6
 numpy
 nltk >=3,<3.6

From c63223597da9d916e58919df9ed989977cf6052e Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 02:09:53 -0300
Subject: [PATCH 25/31] checkpoint

---
 lightwood/mixer/gluonts.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index ce5ca90fd..fdd3cea42 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -90,13 +90,19 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         cat_ds = ConcatedEncodedDs([train_data, dev_data])
         fit_groups = list(cat_ds.data_frame[self.grouped_by[0]].unique()) if self.grouped_by != ['__default'] else None
         train_ds = self._make_initial_ds(cat_ds.data_frame, phase='train', groups=fit_groups)
+        batch_size = 32

         self.model_train_stats = TrainingHistory()
         self.estimator = DeepAREstimator(
             freq=train_ds.freq,
             prediction_length=self.horizon,
             distr_output=self.distribution,
-            trainer=Trainer(epochs=self.n_epochs, callbacks=[EarlyStop(patience=self.patience), self.model_train_stats])
+            lags_seq=[i + 1 for i in range(self.window)],
+            batch_size=batch_size,
+            trainer=Trainer(
+                epochs=self.n_epochs,
+                num_batches_per_epoch=max(1, len(train_ds) // batch_size),
+                callbacks=[EarlyStop(patience=self.patience), self.model_train_stats])
         )
         self.model = self.estimator.train(train_ds)
         self.prepared = True
@@ -162,10 +168,10 @@ def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
         return ydf

     def _make_initial_ds(self, df=None, phase='predict', groups=None):
-        oby = self.ts_analysis["tss"].order_by
+        oby_col_name = '__gluon_timestamp'
         gby = self.ts_analysis["tss"].group_by if self.ts_analysis["tss"].group_by else []
         freq = self.ts_analysis['sample_freqs']['__default']
-        keep_cols = [f'__mdb_original_{oby}', self.target] + [col for col in gby]
+        keep_cols = [self.target] + [col for col in gby]

         if groups is None and gby:
             groups = self.groups
@@ -208,7 +214,8 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
             gby = '__default_group'
             df[gby] = '__default_group'

-        ds = PandasDataset.from_long_dataframe(df, target=self.target, item_id=gby, freq=freq)
+        df[oby_col_name] = df.index
+        ds = PandasDataset.from_long_dataframe(df, target=self.target, item_id=gby, freq=freq, timestamp=oby_col_name)

         return ds

From 64952fd2d76fb9ae54261a77fe836212c237c354 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 22 Feb 2023 12:43:05 -0300
Subject: [PATCH 26/31] fix batch size

---
 lightwood/mixer/gluonts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index fdd3cea42..1061241f6 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -101,7 +101,7 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
             trainer=Trainer(
                 epochs=self.n_epochs,
-                num_batches_per_epoch=max(1, len(train_ds) // batch_size),
+                num_batches_per_epoch=max(1, len(cat_ds.data_frame) // batch_size),
                 callbacks=[EarlyStop(patience=self.patience), self.model_train_stats])
         )
         self.model = self.estimator.train(train_ds)
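Patches 25 and 26 size the training loop from the data instead of relying on the GluonTS Trainer default for num_batches_per_epoch, which is a fixed constant independent of dataset size. The follow-up switch from len(train_ds) to len(cat_ds.data_frame) matters presumably because the length of a PandasDataset built from a long dataframe counts series (one per group), not rows. A toy calculation with illustrative sizes:

    batch_size = 32
    n_rows = 10_000   # total observations across all groups
    n_series = 8      # what len() of a grouped PandasDataset would report

    print(max(1, n_series // batch_size))  # 1: a single batch per epoch, training starves
    print(max(1, n_rows // batch_size))    # 312: proportional to the actual data volume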
From 817c93b2f6365e5aba66a9b4ae3739f13c537ffe Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 1 Mar 2023 02:05:58 -0300
Subject: [PATCH 27/31] add static real feature

---
 lightwood/mixer/gluonts.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 1061241f6..d44dae897 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -36,6 +36,8 @@ def __init__(
             early_stop_patience: int = 3,
             distribution_output: str = '',
             seed: int = 0,
+            static_features_cat: Optional[list[str]] = None,
+            static_features_real: Optional[list[str]] = None,
     ):
         """
         Wrapper around GluonTS probabilistic deep learning models. For now, only DeepAR is supported.
@@ -71,6 +73,8 @@ def __init__(
         self.patience = early_stop_patience
         self.seed = seed
         self.trains_once = True
+        self.static_features_cat = static_features_cat if static_features_cat else []
+        self.static_features_real = static_features_real if static_features_real else []

         dist_module = importlib.import_module('.'.join(['gluonts.mx.distribution',
                                                         *distribution_output.split(".")[:-1]]))
@@ -99,6 +103,8 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
             distr_output=self.distribution,
             lags_seq=[i + 1 for i in range(self.window)],
             batch_size=batch_size,
+            use_feat_static_cat=True if self.static_features_cat else False,
+            use_feat_static_real=True if self.static_features_real else False,
             trainer=Trainer(
                 epochs=self.n_epochs,
                 num_batches_per_epoch=max(1, len(cat_ds.data_frame) // batch_size),
@@ -171,7 +177,13 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
         oby_col_name = '__gluon_timestamp'
         gby = self.ts_analysis["tss"].group_by if self.ts_analysis["tss"].group_by else []
         freq = self.ts_analysis['sample_freqs']['__default']
-        keep_cols = [self.target] + [col for col in gby]
+        keep_cols = [self.target] + [col for col in gby] + self.static_features_cat + self.static_features_real
+
+        agg_map = {self.target: 'sum'}
+        for col in self.static_features_cat:
+            agg_map[col] = 'first'
+        for col in self.static_features_real:
+            agg_map[col] = 'mean'

         if groups is None and gby:
             groups = self.groups
@@ -207,15 +219,25 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
             return None

         if gby:
-            df = df.groupby(by=gby[0]).resample(freq).sum().reset_index(level=[0])  # @TODO: multiple group support and remove groups without enough data
+            df = df.groupby(by=gby[0]).resample(freq).agg(agg_map).reset_index(level=[0])
         else:
-            df = df.resample(freq).sum()
+            df = df.resample(freq).agg(agg_map)
             gby = '__default_group'
             df[gby] = '__default_group'

         df[oby_col_name] = df.index
-        ds = PandasDataset.from_long_dataframe(df, target=self.target, item_id=gby, freq=freq, timestamp=oby_col_name)
+        ds = PandasDataset.from_long_dataframe(
+            df,
+            target=self.target,
+            item_id=gby,
+            freq=freq,
+            timestamp=oby_col_name,
+            # feat_dynamic_real=None,
+            # feat_dynamic_cat=None,
+            feat_static_real=self.static_features_real if self.static_features_real else None,
+            feat_static_cat=self.static_features_cat if self.static_features_cat else None,
+        )

         return ds
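The agg_map introduced in patch 27 exists because resampling to a regular frequency must treat columns differently: the target is summed over each resampled bucket, while static features should pass through unchanged, hence 'first' for categoricals and 'mean' for reals. A self-contained sketch with hypothetical column names:

    import pandas as pd

    idx = pd.date_range('2023-01-01', periods=6, freq='D')
    df = pd.DataFrame({'target': [1, 2, 3, 4, 5, 6],
                       'store_type': ['A'] * 6,     # static categorical (illustrative)
                       'store_size': [120.0] * 6},  # static real (illustrative)
                      index=idx)

    agg_map = {'target': 'sum', 'store_type': 'first', 'store_size': 'mean'}
    print(df.resample('2D').agg(agg_map))  # target adds up; statics survive intact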
From 0ce12c1a36c0df4f8a0a468798267a9697a95b05 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 1 Mar 2023 02:39:00 -0300
Subject: [PATCH 28/31] add static cat features

---
 lightwood/mixer/gluonts.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index d44dae897..92972ce08 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pandas as pd
 import mxnet as mx
+from sklearn.preprocessing import OrdinalEncoder

 from gluonts.dataset.pandas import PandasDataset

@@ -73,6 +74,7 @@ def __init__(
         self.patience = early_stop_patience
         self.seed = seed
         self.trains_once = True
+        self.static_features_cat_encoders = {}
         self.static_features_cat = static_features_cat if static_features_cat else []
         self.static_features_real = static_features_real if static_features_real else []

@@ -92,6 +94,9 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         # prepare data
         cat_ds = ConcatedEncodedDs([train_data, dev_data])
+        for col in self.static_features_cat:
+            self.static_features_cat_encoders[col] = OrdinalEncoder().fit(cat_ds.data_frame[col].values.reshape(-1, 1))
+
         fit_groups = list(cat_ds.data_frame[self.grouped_by[0]].unique()) if self.grouped_by != ['__default'] else None
         train_ds = self._make_initial_ds(cat_ds.data_frame, phase='train', groups=fit_groups)
         batch_size = 32
@@ -227,14 +232,15 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
             df[gby] = '__default_group'

         df[oby_col_name] = df.index
+        for col in self.static_features_cat:
+            df[col] = self.static_features_cat_encoders[col].transform(df[col].values.reshape(-1,1))
+
         ds = PandasDataset.from_long_dataframe(
             df,
             target=self.target,
             item_id=gby,
             freq=freq,
             timestamp=oby_col_name,
-            # feat_dynamic_real=None,
-            # feat_dynamic_cat=None,
             feat_static_real=self.static_features_real if self.static_features_real else None,
             feat_static_cat=self.static_features_cat if self.static_features_cat else None,
         )

From ae18b5d96ae4cfdafe0b46ab64d8a6ec6c938259 Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Wed, 1 Mar 2023 02:39:26 -0300
Subject: [PATCH 29/31] lint: flake8

---
 lightwood/mixer/gluonts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 92972ce08..ac7d2bc5f 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -233,7 +233,7 @@ def _make_initial_ds(self, df=None, phase='predict', groups=None):
         df[oby_col_name] = df.index
         for col in self.static_features_cat:
-            df[col] = self.static_features_cat_encoders[col].transform(df[col].values.reshape(-1,1))
+            df[col] = self.static_features_cat_encoders[col].transform(df[col].values.reshape(-1, 1))

         ds = PandasDataset.from_long_dataframe(
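Patch 28's encoder bookkeeping follows from GluonTS expecting integer codes for static categorical features: one OrdinalEncoder per column is fit during fit(), and the very same encoder is reused at inference, so codes stay consistent between phases. A small sketch (the region values are illustrative, not from the repository):

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    train_vals = np.array(['north', 'south', 'north']).reshape(-1, 1)
    enc = OrdinalEncoder().fit(train_vals)        # what self.static_features_cat_encoders stores per column
    print(enc.transform(np.array([['south']])))  # [[1.]]: the same integer code at predict time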
From fc49017a063a0435db25d7cdaa12d5771014b832 Mon Sep 17 00:00:00 2001
From: Max Stepanov
Date: Mon, 13 Mar 2023 20:38:29 +0300
Subject: [PATCH 30/31] bound pandas requirement

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0c943a690..990f4de54 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ mindsdb-evaluator ==0.0.6
 numpy
 nltk >=3,<3.6
 python-dateutil >=2.8.1
-pandas >=1.1.5
+pandas >=1.1.5, <1.5.0
 schema >=0.6.8
 torch >=1.13.0, <1.14.0
 requests >=2.0.0

From 019df037d03b8c72545f3d3447f225ba7bb82fbe Mon Sep 17 00:00:00 2001
From: Patricio Cerda Mardini
Date: Mon, 13 Mar 2023 17:51:21 -0300
Subject: [PATCH 31/31] version bump: 23.3.2.0

---
 lightwood/__about__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightwood/__about__.py b/lightwood/__about__.py
index e8fa521e5..ede584c93 100644
--- a/lightwood/__about__.py
+++ b/lightwood/__about__.py
@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '23.2.1.0'
+__version__ = '23.3.2.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "community@mindsdb.com"
 __author__ = 'MindsDB Inc'
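The pandas upper bound in patch 30 pins away from the 1.5 series, presumably to avoid API changes the pinned GluonTS code was not yet ready for. If a louder failure mode than pip resolution is wanted, a hypothetical runtime guard (not part of the repository) could assert the same range:

    import pandas as pd
    from packaging.version import Version

    assert Version('1.1.5') <= Version(pd.__version__) < Version('1.5.0'), \
        f'pandas {pd.__version__} is outside the range pinned in requirements.txt'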