Skip to content

Commit

Permalink
More major refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
rvandewater committed Oct 17, 2024
1 parent 151d68f commit 47c594a
Show file tree
Hide file tree
Showing 14 changed files with 126 additions and 110 deletions.
8 changes: 1 addition & 7 deletions icu_benchmarks/cross_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from icu_benchmarks.run_utils import log_full_line
from icu_benchmarks.constants import RunMode


@gin.configurable
def execute_repeated_cv(
data_dir: Path,
Expand Down Expand Up @@ -104,10 +103,6 @@ def execute_repeated_cv(
runmode=mode,
complete_train=complete_train,
)
# logging.debug(f"{data}")
# data_pickle_path = log_dir / "data.pkl"
# with open(data_pickle_path, "wb") as f:
# pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
preprocess_time = datetime.now() - start_time
start_time = datetime.now()
agg_loss += train_common(
Expand All @@ -123,8 +118,6 @@ def execute_repeated_cv(
verbose=verbose,
use_wandb=wandb,
train_only=complete_train,
epochs=20,
patience=5,
)
train_time = datetime.now() - start_time

Expand All @@ -146,3 +139,4 @@ def execute_repeated_cv(
log_full_line(f"FINISHED CV REPETITION {repetition}", level=logging.INFO, char="=", num_newlines=3)

return agg_loss / (cv_repetitions_to_train * cv_folds_to_train)

1 change: 0 additions & 1 deletion icu_benchmarks/data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def __init__(
self.features_df = data[split][Segment.features]
self.features_df = self.features_df.sort([self.vars["GROUP"], self.vars["SEQUENCE"]])
self.features_df = self.features_df.drop(self.vars["SEQUENCE"])
self.grouping_df = self.grouping_df.sort([self.vars["GROUP"], self.vars["SEQUENCE"]])
else:
# We have a static dataset
logging.info("Using static dataset")
Expand Down
11 changes: 7 additions & 4 deletions icu_benchmarks/data/pooling.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ def generate(
if folder.is_dir():
if folder.name in datasets:
data[folder.name] = {
f: pq.read_table(folder / self.file_names[f]).to_pandas(self_destruct=True)
for f in self.file_names
f: pq.read_table(folder / self.file_names[f]).to_pandas(self_destruct=True) for f in self.file_names
}
data = self._pool_datasets(
datasets=data,
Expand Down Expand Up @@ -103,9 +102,9 @@ def _save_pooled_data(self, data_dir, data, datasets, file_names, samples=10000)

def _pool_datasets(
self,
datasets={},
datasets=None,
samples=10000,
vars=[],
vars=None,
seed=42,
shuffle=True,
runmode=RunMode.classification,
Expand All @@ -126,6 +125,10 @@ def _pool_datasets(
Returns:
pooled dataset
"""
if datasets is None:
datasets = {}
if vars is None:
vars = []
if len(datasets) == 0:
raise ValueError("No datasets supplied.")
pooled_data = {Segment.static: [], Segment.dynamic: [], Segment.outcome: []}
Expand Down
19 changes: 12 additions & 7 deletions icu_benchmarks/data/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def set_imputation_model(self, imputation_model):
update_wandb_config({"imputation_model": self.imputation_model.__class__.__name__})




@gin.configurable("base_classification_preprocessor")
class PolarsClassificationPreprocessor(Preprocessor):
def __init__(
Expand Down Expand Up @@ -260,14 +262,17 @@ def _process_outcome(self, data, vars, split):
outcome_rec = Recipe(data[split][Segment.outcome], vars["LABEL"], [], vars["GROUP"])
# If the range is predefined, use predefined transformation function
if self.outcome_max is not None and self.outcome_min is not None:
outcome_rec.add_step(
StepSklearn(
sklearn_transformer=FunctionTransformer(
func=lambda x: ((x - self.outcome_min) / (self.outcome_max - self.outcome_min))
),
sel=all_outcomes(),
if self.outcome_max == self.outcome_min:
logging.warning("outcome_max equals outcome_min. Skipping outcome scaling.")
else:
outcome_rec.add_step(
StepSklearn(
sklearn_transformer=FunctionTransformer(
func=lambda x: ((x - self.outcome_min) / (self.outcome_max - self.outcome_min))
),
sel=all_outcomes(),
)
)
)
else:
# If the range is not predefined, use MinMaxScaler
outcome_rec.add_step(StepSklearn(MinMaxScaler(), sel=all_outcomes()))
Expand Down
5 changes: 3 additions & 2 deletions icu_benchmarks/data/split_process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedShuffleSplit, ShuffleSplit
from icu_benchmarks.data.preprocessor import Preprocessor, PandasClassificationPreprocessor, PolarsClassificationPreprocessor
from icu_benchmarks.constants import RunMode
from icu_benchmarks.run_utils import check_required_keys
from .constants import DataSplit as Split, DataSegment as Segment, VarType as Var


Expand All @@ -38,7 +39,6 @@ def preprocess_data(
complete_train: bool = False,
runmode: RunMode = RunMode.classification,
label: str = None,
vars_to_exclude: list[str] = [],
) -> dict[dict[pl.DataFrame]] or dict[dict[pd.DataFrame]]:
"""Perform loading, splitting, imputing and normalising of task data.
Expand Down Expand Up @@ -67,7 +67,8 @@ def preprocess_data(
"""

cache_dir = data_dir / "cache"

required_keys = ["GROUP", "SEQUENCE", "LABEL"]
check_required_keys(vars, required_keys)
if not use_static:
file_names.pop(Segment.static)
vars.pop(Segment.static)
Expand Down
4 changes: 0 additions & 4 deletions icu_benchmarks/models/custom_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,8 @@ def confusion_matrix(y_true: ndarray, y_pred: ndarray, normalize=False) -> torch
confusion = sk_confusion_matrix(y_true, y_pred)
if normalize:
confusion = confusion / confusion.sum()
# confusion_tensor = torch.tensor(confusion)
# confusion = confusion.tolist()
confusion_dict = {}
for i in range(confusion.shape[0]):
for j in range(confusion.shape[1]):
confusion_dict[f"class_{i}_pred_{j}"] = confusion[i][j]
# logging.info(f"Confusion matrix: {confusion_dict}")
# dict = {"TP": confusion[0][0], "FP": confusion[0][1], "FN": confusion[1][0], "TN": confusion[1][1]}
return confusion_dict
7 changes: 3 additions & 4 deletions icu_benchmarks/models/dl_models/rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class RNNet(DLPredictionWrapper):

def __init__(self, input_size, hidden_dim, layer_dim, num_classes, *args, **kwargs):
super().__init__(
input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, *args, **kwargs
*args, input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes,**kwargs
)
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
Expand All @@ -39,7 +39,7 @@ class LSTMNet(DLPredictionWrapper):

def __init__(self, input_size, hidden_dim, layer_dim, num_classes, *args, **kwargs):
super().__init__(
input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, *args, **kwargs
*args, input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, **kwargs
)
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
Expand All @@ -66,7 +66,7 @@ class GRUNet(DLPredictionWrapper):

def __init__(self, input_size, hidden_dim, layer_dim, num_classes, *args, **kwargs):
super().__init__(
input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, *args, **kwargs
*args, input_size=input_size, hidden_dim=hidden_dim, layer_dim=layer_dim, num_classes=num_classes, **kwargs
)
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
Expand All @@ -81,5 +81,4 @@ def forward(self, x):
h0 = self.init_hidden(x)
out, hn = self.rnn(x, h0)
pred = self.logit(out)

return pred
2 changes: 1 addition & 1 deletion icu_benchmarks/models/dl_models/tcn.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ class TemporalConvNet(DLPredictionWrapper):

def __init__(self, input_size, num_channels, num_classes, *args, max_seq_length=0, kernel_size=2, dropout=0.0, **kwargs):
super().__init__(
*args,
input_size=input_size,
num_channels=num_channels,
num_classes=num_classes,
*args,
max_seq_length=max_seq_length,
kernel_size=kernel_size,
dropout=dropout,
Expand Down
90 changes: 41 additions & 49 deletions icu_benchmarks/models/dl_models/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,36 @@
from icu_benchmarks.models.wrappers import DLPredictionWrapper


class BaseTransformer(DLPredictionWrapper):
    """Common forward pass shared by the transformer model variants.

    This class defines no layers of its own. ``forward`` reads
    ``self.input_embedding``, ``self.pos_encoder``, ``self.tblocks`` and
    ``self.logit``, so a subclass has to assign all four before the model
    is called.
    """

    _supported_run_modes = [RunMode.classification, RunMode.regression]

    def forward(self, x):
        """Embed ``x``, optionally add positional encoding, and predict.

        Args:
            x: Input passed directly to ``self.input_embedding``.

        Returns:
            The output of ``self.logit`` applied to the transformer blocks'
            output.
        """
        embedded = self.input_embedding(x)
        # Positional encoding is optional; ``None`` means "skip this stage".
        if self.pos_encoder is not None:
            embedded = self.pos_encoder(embedded)
        encoded = self.tblocks(embedded)
        return self.logit(encoded)


@gin.configurable
class Transformer(DLPredictionWrapper):
class Transformer(BaseTransformer):
"""Transformer model as defined by the HiRID-Benchmark (https://github.com/ratschlab/HIRID-ICU-Benchmark)."""

_supported_run_modes = [RunMode.classification, RunMode.regression]

def __init__(
self,
input_size,
hidden,
heads,
ff_hidden_mult,
depth,
num_classes,
*args,
dropout=0.0,
l1_reg=0,
pos_encoding=True,
dropout_att=0.0,
**kwargs,
self,
input_size,
hidden,
heads,
ff_hidden_mult,
depth,
num_classes,
dropout=0.0,
l1_reg=0,
pos_encoding=True,
dropout_att=0.0,
*args,
**kwargs,
):
super().__init__(
input_size=input_size,
Expand Down Expand Up @@ -66,35 +76,26 @@ def __init__(
self.logit = nn.Linear(hidden, num_classes)
self.l1_reg = l1_reg

def forward(self, x):
x = self.input_embedding(x)
if self.pos_encoder is not None:
x = self.pos_encoder(x)
x = self.tblocks(x)
pred = self.logit(x)

return pred


@gin.configurable
class LocalTransformer(DLPredictionWrapper):
class LocalTransformer(BaseTransformer):
_supported_run_modes = [RunMode.classification, RunMode.regression]

def __init__(
self,
input_size,
hidden,
heads,
ff_hidden_mult,
depth,
num_classes,
*args,
dropout=0.0,
l1_reg=0,
pos_encoding=True,
local_context=1,
dropout_att=0.0,
**kwargs,
self,
input_size,
hidden,
heads,
ff_hidden_mult,
depth,
num_classes,
*args,
dropout=0.0,
l1_reg=0,
pos_encoding=True,
local_context=1,
dropout_att=0.0,
**kwargs,
):
super().__init__(
input_size=input_size,
Expand Down Expand Up @@ -137,12 +138,3 @@ def __init__(
self.tblocks = nn.Sequential(*tblocks)
self.logit = nn.Linear(hidden, num_classes)
self.l1_reg = l1_reg

def forward(self, x):
x = self.input_embedding(x)
if self.pos_encoder is not None:
x = self.pos_encoder(x)
x = self.tblocks(x)
pred = self.logit(x)

return pred
7 changes: 3 additions & 4 deletions icu_benchmarks/models/ml_models/catboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
class CBClassifier(MLWrapper):
_supported_run_modes = [RunMode.classification]

def __init__(self, *args, **kwargs):
# self.model = self.set_model_args(cb.CatBoostClassifier, task_type="GPU"
# if not kwargs['cpu'] else "CPU", *args, **kwargs)
self.model = self.set_model_args(cb.CatBoostClassifier, task_type="CPU", *args, **kwargs)
def __init__(self, task_type="CPU", *args, **kwargs):
    """Build the CatBoost classifier and initialise the wrapper base class.

    Args:
        task_type: Forwarded to ``set_model_args`` as the ``task_type``
            keyword; defaults to ``"CPU"``. Because this parameter is
            declared before ``*args``, the first positional argument given
            to the constructor binds to it rather than to ``*args``.
        *args: Remaining positional arguments, forwarded both to
            ``set_model_args`` and to the parent initialiser.
        **kwargs: Keyword arguments, forwarded both to ``set_model_args``
            (together with ``task_type``) and to the parent initialiser
            (without ``task_type``).
    """
    # ``task_type`` goes first so the keyword order matches a literal
    # ``{'task_type': ..., **kwargs}`` merge.
    catboost_options = {"task_type": task_type}
    catboost_options.update(kwargs)
    self.model = self.set_model_args(cb.CatBoostClassifier, *args, **catboost_options)
    super().__init__(*args, **kwargs)

def predict(self, features):
Expand Down
2 changes: 0 additions & 2 deletions icu_benchmarks/models/ml_models/xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@ def test_step(self, dataset, _):
# Save as: id, time (hours), ground truth, prediction 0, prediction 1
np.savetxt(os.path.join(self.logger.save_dir, "pred_indicators.csv"), pred_indicators, delimiter=",")
logging.debug(f"Saved row indicators to {os.path.join(self.logger.save_dir,f'row_indicators.csv')}")
if self._explain_values and self.explainer is not None:
self.test_shap_values = self.explainer(test_rep)
if self.mps:
self.log("test/loss", np.float32(self.loss(test_label, test_pred)), sync_dist=True)
self.log_metrics(np.float32(test_label), np.float32(test_pred), "test")
Expand Down
Loading

0 comments on commit 47c594a

Please sign in to comment.