More major refactoring

rvandewater · Oct 17, 2024 · 47c594a · 47c594a
1 parent 151d68f
commit 47c594a
Show file tree

Hide file tree

Showing 14 changed files with 126 additions and 110 deletions.
diff --git a/icu_benchmarks/cross_validation.py b/icu_benchmarks/cross_validation.py
@@ -13,7 +13,6 @@
 from icu_benchmarks.run_utils import log_full_line
 from icu_benchmarks.constants import RunMode
 
-
 @gin.configurable
 def execute_repeated_cv(
     data_dir: Path,
@@ -104,10 +103,6 @@ def execute_repeated_cv(
                 runmode=mode,
                 complete_train=complete_train,
             )
-            # logging.debug(f"{data}")
-            # data_pickle_path = log_dir / "data.pkl"
-            # with open(data_pickle_path, "wb") as f:
-            #     pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
             preprocess_time = datetime.now() - start_time
             start_time = datetime.now()
             agg_loss += train_common(
@@ -123,8 +118,6 @@ def execute_repeated_cv(
                 verbose=verbose,
                 use_wandb=wandb,
                 train_only=complete_train,
-                epochs=20,
-                patience=5,
             )
             train_time = datetime.now() - start_time
 
@@ -146,3 +139,4 @@ def execute_repeated_cv(
         log_full_line(f"FINISHED CV REPETITION {repetition}", level=logging.INFO, char="=", num_newlines=3)
 
     return agg_loss / (cv_repetitions_to_train * cv_folds_to_train)
+
diff --git a/icu_benchmarks/data/loader.py b/icu_benchmarks/data/loader.py
@@ -42,7 +42,6 @@ def __init__(
             self.features_df = data[split][Segment.features]
             self.features_df = self.features_df.sort([self.vars["GROUP"], self.vars["SEQUENCE"]])
             self.features_df = self.features_df.drop(self.vars["SEQUENCE"])
-            self.grouping_df = self.grouping_df.sort([self.vars["GROUP"], self.vars["SEQUENCE"]])
         else:
             # We have a static dataset
             logging.info("Using static dataset")

diff --git a/icu_benchmarks/data/pooling.py b/icu_benchmarks/data/pooling.py
@@ -66,8 +66,7 @@ def generate(
             if folder.is_dir():
                 if folder.name in datasets:
                     data[folder.name] = {
-                        f: pq.read_table(folder / self.file_names[f]).to_pandas(self_destruct=True)
-                        for f in self.file_names
+                        f: pq.read_table(folder / self.file_names[f]).to_pandas(self_destruct=True) for f in self.file_names
                     }
         data = self._pool_datasets(
             datasets=data,
@@ -103,9 +102,9 @@ def _save_pooled_data(self, data_dir, data, datasets, file_names, samples=10000)
 
     def _pool_datasets(
         self,
-        datasets={},
+        datasets=None,
         samples=10000,
-        vars=[],
+        vars=None,
         seed=42,
         shuffle=True,
         runmode=RunMode.classification,
@@ -126,6 +125,10 @@ def _pool_datasets(
         Returns:
             pooled dataset
         """
+        if datasets is None:
+            datasets = {}
+        if vars is None:
+            vars = []
         if len(datasets) == 0:
             raise ValueError("No datasets supplied.")
         pooled_data = {Segment.static: [], Segment.dynamic: [], Segment.outcome: []}

diff --git a/icu_benchmarks/data/preprocessor.py b/icu_benchmarks/data/preprocessor.py
@@ -44,6 +44,8 @@ def set_imputation_model(self, imputation_model):
             update_wandb_config({"imputation_model": self.imputation_model.__class__.__name__})
 
 
+
+
 @gin.configurable("base_classification_preprocessor")
 class PolarsClassificationPreprocessor(Preprocessor):
     def __init__(
@@ -260,14 +262,17 @@ def _process_outcome(self, data, vars, split):
         outcome_rec = Recipe(data[split][Segment.outcome], vars["LABEL"], [], vars["GROUP"])
         # If the range is predefined, use predefined transformation function
         if self.outcome_max is not None and self.outcome_min is not None:
-            outcome_rec.add_step(
-                StepSklearn(
-                    sklearn_transformer=FunctionTransformer(
-                        func=lambda x: ((x - self.outcome_min) / (self.outcome_max - self.outcome_min))
-                    ),
-                    sel=all_outcomes(),
+            if self.outcome_max == self.outcome_min:
+                logging.warning("outcome_max equals outcome_min. Skipping outcome scaling.")
+            else:
+                outcome_rec.add_step(
+                    StepSklearn(
+                        sklearn_transformer=FunctionTransformer(
+                            func=lambda x: ((x - self.outcome_min) / (self.outcome_max - self.outcome_min))
+                        ),
+                        sel=all_outcomes(),
+                    )
                 )
-            )
         else:
             # If the range is not predefined, use MinMaxScaler
             outcome_rec.add_step(StepSklearn(MinMaxScaler(), sel=all_outcomes()))

diff --git a/icu_benchmarks/data/split_process_data.py b/icu_benchmarks/data/split_process_data.py
@@ -13,6 +13,7 @@
 from sklearn.model_selection import StratifiedKFold, KFold, StratifiedShuffleSplit, ShuffleSplit
 from icu_benchmarks.data.preprocessor import Preprocessor, PandasClassificationPreprocessor, PolarsClassificationPreprocessor
 from icu_benchmarks.constants import RunMode
+from icu_benchmarks.run_utils import check_required_keys
 from .constants import DataSplit as Split, DataSegment as Segment, VarType as Var
 
 
@@ -38,7 +39,6 @@ def preprocess_data(
     complete_train: bool = False,
     runmode: RunMode = RunMode.classification,
     label: str = None,
-    vars_to_exclude: list[str] = [],
 ) -> dict[dict[pl.DataFrame]] or dict[dict[pd.DataFrame]]:
     """Perform loading, splitting, imputing and normalising of task data.
 
@@ -67,7 +67,8 @@ def preprocess_data(
     """
 
     cache_dir = data_dir / "cache"
-
+    required_keys = ["GROUP", "SEQUENCE", "LABEL"]
+    check_required_keys(vars, required_keys)
     if not use_static:
         file_names.pop(Segment.static)
         vars.pop(Segment.static)

diff --git a/icu_benchmarks/models/custom_metrics.py b/icu_benchmarks/models/custom_metrics.py
@@ -138,12 +138,8 @@ def confusion_matrix(y_true: ndarray, y_pred: ndarray, normalize=False) -> torch
     confusion = sk_confusion_matrix(y_true, y_pred)
     if normalize:
         confusion = confusion / confusion.sum()
-    # confusion_tensor = torch.tensor(confusion)
-    # confusion = confusion.tolist()
     confusion_dict = {}
     for i in range(confusion.shape[0]):
         for j in range(confusion.shape[1]):
             confusion_dict[f"class_{i}_pred_{j}"] = confusion[i][j]
-    # logging.info(f"Confusion matrix: {confusion_dict}")
-    # dict = {"TP": confusion[0][0], "FP": confusion[0][1], "FN": confusion[1][0], "TN": confusion[1][1]}
     return confusion_dict
diff --git a/icu_benchmarks/models/dl_models/rnn.py b/icu_benchmarks/models/dl_models/rnn.py
@@ -13,7 +13,7 @@ class RNNet(DLPredictionWrapper):
 
     def __init__(self, input_size, hidden_dim, layer_dim, num_classes, *args, **kwargs):
         super().__init__(
-            input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, *args, **kwargs
+            *args, input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes,**kwargs
         )
         self.hidden_dim = hidden_dim
         self.layer_dim = layer_dim
@@ -39,7 +39,7 @@ class LSTMNet(DLPredictionWrapper):
 
     def __init__(self, input_size, hidden_dim, layer_dim, num_classes, *args, **kwargs):
         super().__init__(
-            input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, *args, **kwargs
+            *args, input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, **kwargs
         )
         self.hidden_dim = hidden_dim
         self.layer_dim = layer_dim
@@ -66,7 +66,7 @@ class GRUNet(DLPredictionWrapper):
 
     def __init__(self, input_size, hidden_dim, layer_dim, num_classes, *args, **kwargs):
         super().__init__(
-            input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, *args, **kwargs
+            *args, input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, **kwargs
         )
         self.hidden_dim = hidden_dim
         self.layer_dim = layer_dim
@@ -81,5 +81,4 @@ def forward(self, x):
         h0 = self.init_hidden(x)
         out, hn = self.rnn(x, h0)
         pred = self.logit(out)
-
         return pred
diff --git a/icu_benchmarks/models/dl_models/tcn.py b/icu_benchmarks/models/dl_models/tcn.py
@@ -17,10 +17,10 @@ class TemporalConvNet(DLPredictionWrapper):
 
     def __init__(self, input_size, num_channels, num_classes, *args, max_seq_length=0, kernel_size=2, dropout=0.0, **kwargs):
         super().__init__(
+            *args,
             input_size=input_size,
             num_channels=num_channels,
             num_classes=num_classes,
-            *args,
             max_seq_length=max_seq_length,
             kernel_size=kernel_size,
             dropout=dropout,

diff --git a/icu_benchmarks/models/dl_models/transformer.py b/icu_benchmarks/models/dl_models/transformer.py
@@ -6,26 +6,36 @@
 from icu_benchmarks.models.wrappers import DLPredictionWrapper
 
 
+class BaseTransformer(DLPredictionWrapper):
+    _supported_run_modes = [RunMode.classification, RunMode.regression]
+
+    def forward(self, x):
+        x = self.input_embedding(x)
+        if self.pos_encoder is not None:
+            x = self.pos_encoder(x)
+        x = self.tblocks(x)
+        pred = self.logit(x)
+        return pred
+
+
 @gin.configurable
-class Transformer(DLPredictionWrapper):
+class Transformer(BaseTransformer):
     """Transformer model as defined by the HiRID-Benchmark (https://github.com/ratschlab/HIRID-ICU-Benchmark)."""
 
-    _supported_run_modes = [RunMode.classification, RunMode.regression]
-
     def __init__(
-        self,
-        input_size,
-        hidden,
-        heads,
-        ff_hidden_mult,
-        depth,
-        num_classes,
-        *args,
-        dropout=0.0,
-        l1_reg=0,
-        pos_encoding=True,
-        dropout_att=0.0,
-        **kwargs,
+            self,
+            input_size,
+            hidden,
+            heads,
+            ff_hidden_mult,
+            depth,
+            num_classes,
+            dropout=0.0,
+            l1_reg=0,
+            pos_encoding=True,
+            dropout_att=0.0,
+            *args,
+            **kwargs,
     ):
         super().__init__(
             input_size=input_size,
@@ -66,35 +76,26 @@ def __init__(
         self.logit = nn.Linear(hidden, num_classes)
         self.l1_reg = l1_reg
 
-    def forward(self, x):
-        x = self.input_embedding(x)
-        if self.pos_encoder is not None:
-            x = self.pos_encoder(x)
-        x = self.tblocks(x)
-        pred = self.logit(x)
-
-        return pred
-
 
 @gin.configurable
-class LocalTransformer(DLPredictionWrapper):
+class LocalTransformer(BaseTransformer):
     _supported_run_modes = [RunMode.classification, RunMode.regression]
 
     def __init__(
-        self,
-        input_size,
-        hidden,
-        heads,
-        ff_hidden_mult,
-        depth,
-        num_classes,
-        *args,
-        dropout=0.0,
-        l1_reg=0,
-        pos_encoding=True,
-        local_context=1,
-        dropout_att=0.0,
-        **kwargs,
+            self,
+            input_size,
+            hidden,
+            heads,
+            ff_hidden_mult,
+            depth,
+            num_classes,
+            *args,
+            dropout=0.0,
+            l1_reg=0,
+            pos_encoding=True,
+            local_context=1,
+            dropout_att=0.0,
+            **kwargs,
     ):
         super().__init__(
             input_size=input_size,
@@ -137,12 +138,3 @@ def __init__(
         self.tblocks = nn.Sequential(*tblocks)
         self.logit = nn.Linear(hidden, num_classes)
         self.l1_reg = l1_reg
-
-    def forward(self, x):
-        x = self.input_embedding(x)
-        if self.pos_encoder is not None:
-            x = self.pos_encoder(x)
-        x = self.tblocks(x)
-        pred = self.logit(x)
-
-        return pred
diff --git a/icu_benchmarks/models/ml_models/catboost.py b/icu_benchmarks/models/ml_models/catboost.py
@@ -8,10 +8,9 @@
 class CBClassifier(MLWrapper):
     _supported_run_modes = [RunMode.classification]
 
-    def __init__(self, *args, **kwargs):
-        # self.model = self.set_model_args(cb.CatBoostClassifier, task_type="GPU"
-        # if not kwargs['cpu'] else "CPU", *args, **kwargs)
-        self.model = self.set_model_args(cb.CatBoostClassifier, task_type="CPU", *args, **kwargs)
+    def __init__(self, task_type="CPU", *args, **kwargs):
+        model_kwargs = {'task_type': task_type, **kwargs}
+        self.model = self.set_model_args(cb.CatBoostClassifier, *args, **model_kwargs)
         super().__init__(*args, **kwargs)
 
     def predict(self, features):

diff --git a/icu_benchmarks/models/ml_models/xgboost.py b/icu_benchmarks/models/ml_models/xgboost.py
@@ -74,8 +74,6 @@ def test_step(self, dataset, _):
             # Save as: id, time (hours), ground truth, prediction 0, prediction 1
             np.savetxt(os.path.join(self.logger.save_dir, "pred_indicators.csv"), pred_indicators, delimiter=",")
             logging.debug(f"Saved row indicators to {os.path.join(self.logger.save_dir,f'row_indicators.csv')}")
-        if self._explain_values and self.explainer is not None:
-            self.test_shap_values = self.explainer(test_rep)
         if self.mps:
             self.log("test/loss", np.float32(self.loss(test_label, test_pred)), sync_dist=True)
             self.log_metrics(np.float32(test_label), np.float32(test_pred), "test")