From 7a2cffdef8936b1b3273ea52a8493626f0cee92d Mon Sep 17 00:00:00 2001 From: Florence Townend Date: Wed, 31 Jan 2024 16:17:56 +0000 Subject: [PATCH 1/6] Added input arguments to traintestdatamodule and traintestgraphdatamodule to specify own test indices --- .gitignore | 5 +++++ fusilli/data.py | 54 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index a290e06..b61c06e 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,8 @@ cython_debug/ # tracks the version _version.py + +# rogue directories from example notebooks running in local space +checkpoints/ +loss_figures/ +loss_logs/ \ No newline at end of file diff --git a/fusilli/data.py b/fusilli/data.py index 2a2a505..b507651 100644 --- a/fusilli/data.py +++ b/fusilli/data.py @@ -422,6 +422,9 @@ class TrainTestDataModule(pl.LightningDataModule): Early stopping callback class. num_workers : int Number of workers for the dataloader (default 0). + test_indices : list + List of indices to use for testing (default None). If None, the test indices are + randomly selected using the test_size parameter. kwargs : dict Dictionary of extra arguments for the subspace method class. """ @@ -443,6 +446,7 @@ def __init__( extra_log_string_dict=None, own_early_stopping_callback=None, num_workers=0, + test_indices=None, kwargs=None, ): """ @@ -482,6 +486,9 @@ def __init__( Early stopping callback class (default None). num_workers : int Number of workers for the dataloader (default 0). + test_indices : list + List of indices to use for testing (default None). If None, the test indices are + randomly selected using the test_size parameter. kwargs : dict Dictionary of extra arguments for the subspace method class. 
""" @@ -515,6 +522,7 @@ def __init__( self.max_epochs = max_epochs self.own_early_stopping_callback = own_early_stopping_callback self.num_workers = num_workers + self.test_indices = test_indices self.kwargs = kwargs def prepare_data(self): @@ -555,9 +563,17 @@ def setup( """ # split the dataset into train and test sets - [self.train_dataset, self.test_dataset] = torch.utils.data.random_split( - self.dataset, [1 - self.test_size, self.test_size] - ) + if self.test_indices is None: + [self.train_dataset, self.test_dataset] = torch.utils.data.random_split( + self.dataset, [1 - self.test_size, self.test_size] + ) + else: + self.test_dataset = torch.utils.data.Subset( + self.dataset, self.test_indices + ) + self.train_dataset = torch.utils.data.Subset( + self.dataset, list(set(range(len(self.dataset))) - set(self.test_indices)) + ) if self.subspace_method is not None: # if subspace method is specified if ( @@ -834,6 +850,7 @@ def kfold_split(self): """ # split the dataset into k folds + # TODO change this into a function which takes in indices or random split directives kf = KFold(n_splits=self.num_folds, shuffle=True) # get the indices of the dataset @@ -1046,7 +1063,9 @@ class TrainTestGraphDataModule: List of indices for testing. Created in setup(). graph_data : graph data structure Graph data structure. Created in setup(). - + own_test_indices : list + List of indices to use for testing (default None). If None, the test indices are + randomly selected using the test_size parameter. """ def __init__( @@ -1058,6 +1077,7 @@ def __init__( image_downsample_size=None, layer_mods=None, extra_log_string_dict=None, + own_test_indices=None, ): """ Parameters @@ -1079,6 +1099,9 @@ def __init__( (default None) extra_log_string_dict : dict Dictionary of extra strings to add to the log. + own_test_indices : list + List of indices to use for testing (default None). If None, the test indices are + randomly selected using the test_size parameter. 
""" @@ -1107,6 +1130,7 @@ def __init__( self.test_size = test_size self.graph_creation_method = graph_creation_method self.layer_mods = layer_mods + self.own_test_indices = own_test_indices def prepare_data(self): """ @@ -1133,11 +1157,17 @@ def setup(self): None """ # get random train and test idxs - [train_dataset, test_dataset] = torch.utils.data.random_split( - self.dataset, [1 - self.test_size, self.test_size] - ) - self.train_idxs = train_dataset.indices - self.test_idxs = test_dataset.indices + if self.own_test_indices is not None: + [train_dataset, test_dataset] = torch.utils.data.random_split( + self.dataset, [1 - self.test_size, self.test_size] + ) + self.train_idxs = train_dataset.indices + self.test_idxs = test_dataset.indices + else: + self.test_idxs = self.own_test_indices + self.train_idxs = list( + set(range(len(self.dataset))) - set(self.test_idxs) + ) # get the graph data structure self.graph_maker_instance = self.graph_creation_method(self.dataset) @@ -1378,6 +1408,7 @@ def prepare_fusion_data( extra_log_string_dict=None, own_early_stopping_callback=None, num_workers=0, + test_indices=None, **kwargs, ): """ @@ -1425,6 +1456,8 @@ def prepare_fusion_data( Early stopping callback class (default None). num_workers : int Number of workers for the dataloader (default 0). + test_indices : list or None + List of indices to use for testing (default None). If None, then random split is used. **kwargs : dict Extra keyword arguments. Usable for extra arguments for the subspace method MCVAE's early stopping callback: "mcvae_patience" and "mcvae_tolerance". 
@@ -1461,6 +1494,7 @@ def prepare_fusion_data( image_downsample_size=image_downsample_size, layer_mods=layer_mods, extra_log_string_dict=extra_log_string_dict, + # here is where the kfold split will go ) else: graph_data_module = TrainTestGraphDataModule( @@ -1471,6 +1505,7 @@ def prepare_fusion_data( image_downsample_size=image_downsample_size, layer_mods=layer_mods, extra_log_string_dict=extra_log_string_dict, + own_test_indices=test_indices, ) graph_data_module.prepare_data() @@ -1519,6 +1554,7 @@ def prepare_fusion_data( extra_log_string_dict=extra_log_string_dict, own_early_stopping_callback=own_early_stopping_callback, num_workers=num_workers, + test_indices=test_indices, kwargs=kwargs, ) data_module.prepare_data() From 4a9948a57b6fb0b1f1daefefbbc47bfca8c02c44 Mon Sep 17 00:00:00 2001 From: Florence Townend Date: Wed, 31 Jan 2024 16:29:20 +0000 Subject: [PATCH 2/6] added test for traintestdatamodule using specified test indices --- fusilli/data.py | 1 + tests/test_data/test_TrainTestDataModule.py | 43 +++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/fusilli/data.py b/fusilli/data.py index b507651..17ff5fb 100644 --- a/fusilli/data.py +++ b/fusilli/data.py @@ -571,6 +571,7 @@ def setup( self.test_dataset = torch.utils.data.Subset( self.dataset, self.test_indices ) + self.train_dataset = torch.utils.data.Subset( self.dataset, list(set(range(len(self.dataset))) - set(self.test_indices)) ) diff --git a/tests/test_data/test_TrainTestDataModule.py b/tests/test_data/test_TrainTestDataModule.py index e0b2289..f31a1e6 100644 --- a/tests/test_data/test_TrainTestDataModule.py +++ b/tests/test_data/test_TrainTestDataModule.py @@ -255,6 +255,49 @@ def test_setup_calls_subspace_method(create_test_files): ) +# Testing that the test indices are correctly input and used instead of a random split +def test_owntestindices(create_test_files_more_features): + tabular1_csv = create_test_files_more_features["tabular1_csv"] + tabular2_csv = 
create_test_files_more_features["tabular2_csv"] + image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"] + + test_size = 0.2 + prediction_task = "binary" + multiclass_dimensions = None + + sources = [tabular1_csv, tabular2_csv, image_torch_file_2d] + batch_size = 23 + + example_fusion_model = Mock() + example_fusion_model.modality_type = "tabular_image" + + # make test indices people 30 to 36 + test_indices = list(range(25, 36)) + + datamodule = TrainTestDataModule(fusion_model=example_fusion_model, + sources=sources, + output_paths=None, + prediction_task=prediction_task, + batch_size=batch_size, + test_size=test_size, + multiclass_dimensions=multiclass_dimensions, + num_folds=None, + test_indices=test_indices) + datamodule.prepare_data() + datamodule.setup() + + # check that the test indices are correctly input + assert datamodule.test_indices == test_indices + # look at the test dataset + test_dataset = datamodule.test_dataset + # check that the test dataset has the correct number of people + assert len(test_dataset) == len(test_indices) + # check train dataset + train_dataset = datamodule.train_dataset + # check that the train dataset has the correct number of people + assert len(train_dataset) == 25 + + # Run pytest if __name__ == "__main__": pytest.main() From a22507753b79d528bab8ed097e138d684c8dc192 Mon Sep 17 00:00:00 2001 From: Florence Townend Date: Wed, 31 Jan 2024 16:33:29 +0000 Subject: [PATCH 3/6] added test for the traintestgraphdatamodule --- fusilli/data.py | 2 +- tests/test_data/test_TrainTestDataModule.py | 2 +- .../test_TrainTestGraphDataModule.py | 44 +++++++++++++++++-- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/fusilli/data.py b/fusilli/data.py index 17ff5fb..ad32de9 100644 --- a/fusilli/data.py +++ b/fusilli/data.py @@ -1158,7 +1158,7 @@ def setup(self): None """ # get random train and test idxs - if self.own_test_indices is not None: + if self.own_test_indices is None: [train_dataset, test_dataset] 
= torch.utils.data.random_split( self.dataset, [1 - self.test_size, self.test_size] ) diff --git a/tests/test_data/test_TrainTestDataModule.py b/tests/test_data/test_TrainTestDataModule.py index f31a1e6..d8ce212 100644 --- a/tests/test_data/test_TrainTestDataModule.py +++ b/tests/test_data/test_TrainTestDataModule.py @@ -271,7 +271,7 @@ def test_owntestindices(create_test_files_more_features): example_fusion_model = Mock() example_fusion_model.modality_type = "tabular_image" - # make test indices people 30 to 36 + # make test indices people 25 to 36 test_indices = list(range(25, 36)) datamodule = TrainTestDataModule(fusion_model=example_fusion_model, diff --git a/tests/test_data/test_TrainTestGraphDataModule.py b/tests/test_data/test_TrainTestGraphDataModule.py index e5c3711..49be25a 100644 --- a/tests/test_data/test_TrainTestGraphDataModule.py +++ b/tests/test_data/test_TrainTestGraphDataModule.py @@ -1,6 +1,6 @@ import pytest from fusilli.data import TrainTestGraphDataModule -from .test_TrainTestDataModule import create_test_files +from .test_TrainTestDataModule import create_test_files, create_test_files_more_features from pytest import approx from pytest_mock import mocker @@ -28,8 +28,6 @@ def create_graph_data_module(create_test_files): sources = [tabular1_csv, tabular2_csv, image_torch_file_2d] batch_size = 23 - # modality_type = "tabular_tabular" - class example_fusion_model: modality_type = "tabular_tabular" @@ -98,5 +96,45 @@ def test_get_lightning_module(create_graph_data_module): assert lightning_module is not None +# Testing the TrainTestGraphDataModule class for the case where the user specifies their own test indices +def test_owntestindices(create_test_files_more_features): + params = { + "test_size": 0.3, + "pred_type": "binary", + "multiclass_dims": None, + } + + tabular1_csv = create_test_files_more_features["tabular1_csv"] + tabular2_csv = create_test_files_more_features["tabular2_csv"] + image_torch_file_2d = 
create_test_files_more_features["image_torch_file_2d"] + + sources = [tabular1_csv, tabular2_csv, image_torch_file_2d] + batch_size = 23 + + # make test indices people 25 to 36 + test_indices = list(range(25, 36)) + + class example_fusion_model: + modality_type = "tabular_tabular" + + def __init__(self): + pass + + data_module = TrainTestGraphDataModule( + fusion_model=example_fusion_model, + sources=sources, + graph_creation_method=MockGraphMakerModule, + test_size=params["test_size"], + own_test_indices=test_indices, + ) + + data_module.prepare_data() + data_module.setup() + lightning_module = data_module.get_lightning_module() + + # check that the test indices are the same as the ones we specified + assert data_module.test_idxs == test_indices + + if __name__ == "__main__": pytest.main() From f789c832970ea1107282cefb65c0812f7fdc8ddf Mon Sep 17 00:00:00 2001 From: Florence Townend Date: Wed, 31 Jan 2024 16:59:22 +0000 Subject: [PATCH 4/6] added setting own kfold indices to kfolddatamodule and wrote test --- fusilli/data.py | 24 +++++++++++---- tests/test_data/test_KFoldDataModule.py | 41 ++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/fusilli/data.py b/fusilli/data.py index ad32de9..7a97d14 100644 --- a/fusilli/data.py +++ b/fusilli/data.py @@ -722,6 +722,10 @@ class KFoldDataModule(pl.LightningDataModule): Early stopping callback class. num_workers : int Number of workers for the dataloader (default 0). + own_kfold_indices : list + List of indices to use for k-fold cross validation (default None). If None, the k-fold + indices are randomly selected. Structure is a list of tuples of (train_indices, + test_indices). Must be the same length as num_folds. kwargs : dict Dictionary of extra arguments for the subspace method class. 
""" @@ -743,6 +747,7 @@ def __init__( extra_log_string_dict=None, own_early_stopping_callback=None, num_workers=0, + own_kfold_indices=None, kwargs=None, ): """ @@ -782,6 +787,10 @@ def __init__( Early stopping callback class (default None). num_workers : int Number of workers for the dataloader (default 0). + own_kfold_indices : list + List of indices to use for k-fold cross validation (default None). If None, the k-fold + indices are randomly selected. Structure is a list of tuples of (train_indices, + test_indices). Must be the same length as num_folds. kwargs : dict Dictionary of extra arguments for the subspace method class. """ @@ -822,6 +831,7 @@ def __init__( self.max_epochs = max_epochs self.own_early_stopping_callback = own_early_stopping_callback self.num_workers = num_workers + self.own_kfold_indices = own_kfold_indices self.kwargs = kwargs def prepare_data(self): @@ -849,16 +859,18 @@ def kfold_split(self): folds : list List of tuples of (train_dataset, test_dataset) """ - - # split the dataset into k folds - # TODO change this into a function which takes in indices or random split directives - kf = KFold(n_splits=self.num_folds, shuffle=True) - # get the indices of the dataset indices = list(range(len(self.dataset))) + # split the dataset into k folds + if self.own_kfold_indices is None: + kf = KFold(n_splits=self.num_folds, shuffle=True) + split_kf = kf.split(indices) + else: + split_kf = self.own_kfold_indices + folds = [] - for train_indices, val_indices in kf.split(indices): + for train_indices, val_indices in split_kf: # split the dataset into train and test sets for each fold train_dataset = torch.utils.data.Subset(self.dataset, train_indices) test_dataset = torch.utils.data.Subset(self.dataset, val_indices) diff --git a/tests/test_data/test_KFoldDataModule.py b/tests/test_data/test_KFoldDataModule.py index bd2b2c8..8d05f5e 100644 --- a/tests/test_data/test_KFoldDataModule.py +++ b/tests/test_data/test_KFoldDataModule.py @@ -1,8 +1,9 @@ import 
pytest import torch from fusilli.data import KFoldDataModule -from .test_TrainTestDataModule import create_test_files, MockSubspaceMethod +from .test_TrainTestDataModule import create_test_files, MockSubspaceMethod, create_test_files_more_features from unittest.mock import patch, Mock +from sklearn.model_selection import KFold @pytest.fixture @@ -66,6 +67,44 @@ def test_kfold_split(create_kfold_data_module): assert len(folds) == 5 # Check if the correct number of folds is generated +def test_kfold_split_own_indices(create_test_files_more_features): + tabular1_csv = create_test_files_more_features["tabular1_csv"] + tabular2_csv = create_test_files_more_features["tabular2_csv"] + image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"] + + test_size = 0.2 + prediction_task = "binary" + multiclass_dimensions = None + + sources = [tabular1_csv, tabular2_csv, image_torch_file_2d] + + # specifying own kfold indices using a non random split + own_folds = [(train_index, test_index) for train_index, test_index in KFold(n_splits=5).split(range(36))] + + example_fusion_model = Mock() + example_fusion_model.modality_type = "tabular_image" + + datamodule = KFoldDataModule( + fusion_model=example_fusion_model, + sources=sources, + output_paths={}, + prediction_task=prediction_task, + multiclass_dimensions=multiclass_dimensions, + num_folds=5, + test_size=test_size, + own_kfold_indices=own_folds, + batch_size=9, + ) + + datamodule.prepare_data() + folds = datamodule.kfold_split() # returns list of tuples of datasets + + assert len(folds) == 5 # Check if the correct number of folds is generated + + # check if the correct number of samples is in each fold + assert len(folds[0][0]) == len(own_folds[0][0]) + + def test_train_dataloader(create_kfold_data_module): data_module = create_kfold_data_module data_module.prepare_data() From 022569c5d2d4ca4ab79d2c21c9edc6780ad08142 Mon Sep 17 00:00:00 2001 From: Florence Townend Date: Wed, 31 Jan 2024 17:14:13 +0000 
Subject: [PATCH 5/6] separated out kfold and train test data module function call in prepare_fusion_data so we dont need to specify test size for kfold and num folds for train test redundantly --- fusilli/data.py | 96 +++++++++++++------- tests/test_data/test_KFoldDataModule.py | 3 - tests/test_data/test_KFoldGraphDataModule.py | 36 +++++++- tests/test_data/test_TrainTestDataModule.py | 8 +- 4 files changed, 98 insertions(+), 45 deletions(-) diff --git a/fusilli/data.py b/fusilli/data.py index 7a97d14..b6c54ef 100644 --- a/fusilli/data.py +++ b/fusilli/data.py @@ -437,7 +437,6 @@ def __init__( prediction_task, batch_size, test_size, - num_folds, # not needed for train/test split multiclass_dimensions, subspace_method=None, image_downsample_size=None, @@ -465,8 +464,6 @@ def __init__( Batch size (default 8). test_size : float Fraction of data to use for testing (default 0.2). - num_folds : int - Total number of folds. Not needed for this class for train/test split but it's here to be consistent with KFoldDataModule. multiclass_dimensions : int Number of classes for multiclass prediction (default None). subspace_method : class @@ -738,7 +735,6 @@ def __init__( prediction_task, batch_size, num_folds, - test_size, # not needed for k-fold multiclass_dimensions, subspace_method=None, image_downsample_size=None, @@ -1259,6 +1255,7 @@ def __init__( image_downsample_size=None, layer_mods=None, extra_log_string_dict=None, + own_kfold_indices=None, ): """ Parameters @@ -1279,6 +1276,10 @@ def __init__( (default None) extra_log_string_dict : dict Dictionary of extra strings to add to the log. + own_kfold_indices : list + List of indices to use for k-fold cross validation (default None). If None, the k-fold + indices are randomly selected. Structure is a list of tuples of (train_indices, + test_indices). Must be the same length as num_folds. 
""" super().__init__() self.num_folds = num_folds # total number of folds @@ -1304,6 +1305,7 @@ def __init__( self.modality_type = self.fusion_model.modality_type self.graph_creation_method = graph_creation_method self.layer_mods = layer_mods + self.own_kfold_indices = own_kfold_indices def prepare_data(self): """ @@ -1322,21 +1324,28 @@ def kfold_split(self): Returns ------ folds : list - List of tuples of (graph_data, train_idxs, test_idxs) + List of tuples of (train_dataset, test_dataset) """ - # splits the dataset into k folds - kf = KFold(n_splits=self.num_folds, shuffle=True) - indices = list(range(len(self.dataset))) # get the indices of the dataset + # get the indices of the dataset + indices = list(range(len(self.dataset))) + + # split the dataset into k folds + if self.own_kfold_indices is None: + kf = KFold(n_splits=self.num_folds, shuffle=True) + split_kf = kf.split(indices) + else: + split_kf = self.own_kfold_indices folds = [] - for train_indices, val_indices in kf.split(indices): + for train_indices, val_indices in split_kf: # split the dataset into train and test sets for each fold train_dataset = torch.utils.data.Subset(self.dataset, train_indices) test_dataset = torch.utils.data.Subset(self.dataset, val_indices) - folds.append( - (train_dataset, test_dataset) - ) # list of tuples of (train_dataset, test_dataset) - return folds + + # append the train and test datasets to the folds list + folds.append((train_dataset, test_dataset)) + + return folds # list of tuples of (train_dataset, test_dataset) def setup(self): """ @@ -1422,6 +1431,7 @@ def prepare_fusion_data( own_early_stopping_callback=None, num_workers=0, test_indices=None, + own_kfold_indices=None, **kwargs, ): """ @@ -1471,6 +1481,8 @@ def prepare_fusion_data( Number of workers for the dataloader (default 0). test_indices : list or None List of indices to use for testing (default None). If None, then random split is used. 
+ own_kfold_indices : list or None + List of indices to use for k-fold cross validation (default None). If None, then random split is used. **kwargs : dict Extra keyword arguments. Usable for extra arguments for the subspace method MCVAE's early stopping callback: "mcvae_patience" and "mcvae_tolerance". @@ -1547,29 +1559,43 @@ def prepare_fusion_data( else: # another other than graph fusion if kfold: - datamodule_func = KFoldDataModule + data_module = KFoldDataModule( + fusion_model, + sources=data_sources, + output_paths=output_paths, + prediction_task=prediction_task, + batch_size=batch_size, + num_folds=num_folds, + multiclass_dimensions=multiclass_dimensions, + subspace_method=fusion_model.subspace_method, + image_downsample_size=image_downsample_size, + layer_mods=layer_mods, + max_epochs=max_epochs, + extra_log_string_dict=extra_log_string_dict, + own_early_stopping_callback=own_early_stopping_callback, + num_workers=num_workers, + own_kfold_indices=own_kfold_indices, + kwargs=kwargs, + ) else: - datamodule_func = TrainTestDataModule - - data_module = datamodule_func( - fusion_model, - sources=data_sources, - output_paths=output_paths, - prediction_task=prediction_task, - batch_size=batch_size, - test_size=test_size, - num_folds=num_folds, - multiclass_dimensions=multiclass_dimensions, - subspace_method=fusion_model.subspace_method, - image_downsample_size=image_downsample_size, - layer_mods=layer_mods, - max_epochs=max_epochs, - extra_log_string_dict=extra_log_string_dict, - own_early_stopping_callback=own_early_stopping_callback, - num_workers=num_workers, - test_indices=test_indices, - kwargs=kwargs, - ) + data_module = TrainTestDataModule( + fusion_model, + sources=data_sources, + output_paths=output_paths, + prediction_task=prediction_task, + batch_size=batch_size, + test_size=test_size, + multiclass_dimensions=multiclass_dimensions, + subspace_method=fusion_model.subspace_method, + image_downsample_size=image_downsample_size, + layer_mods=layer_mods, + 
max_epochs=max_epochs, + extra_log_string_dict=extra_log_string_dict, + own_early_stopping_callback=own_early_stopping_callback, + num_workers=num_workers, + test_indices=test_indices, + kwargs=kwargs, + ) data_module.prepare_data() data_module.setup(checkpoint_path=checkpoint_path) diff --git a/tests/test_data/test_KFoldDataModule.py b/tests/test_data/test_KFoldDataModule.py index 8d05f5e..1ef7d2e 100644 --- a/tests/test_data/test_KFoldDataModule.py +++ b/tests/test_data/test_KFoldDataModule.py @@ -32,7 +32,6 @@ def create_kfold_data_module(create_test_files): multiclass_dimensions=params["multiclass_dims"], num_folds=params["num_k"], batch_size=batch_size, - test_size=0.2 ) return data_module @@ -72,7 +71,6 @@ def test_kfold_split_own_indices(create_test_files_more_features): tabular2_csv = create_test_files_more_features["tabular2_csv"] image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"] - test_size = 0.2 prediction_task = "binary" multiclass_dimensions = None @@ -91,7 +89,6 @@ def test_kfold_split_own_indices(create_test_files_more_features): prediction_task=prediction_task, multiclass_dimensions=multiclass_dimensions, num_folds=5, - test_size=test_size, own_kfold_indices=own_folds, batch_size=9, ) diff --git a/tests/test_data/test_KFoldGraphDataModule.py b/tests/test_data/test_KFoldGraphDataModule.py index 28b97e8..cc6d36f 100644 --- a/tests/test_data/test_KFoldGraphDataModule.py +++ b/tests/test_data/test_KFoldGraphDataModule.py @@ -1,10 +1,11 @@ import pytest from fusilli.data import KFoldGraphDataModule -from .test_TrainTestDataModule import create_test_files +from .test_TrainTestDataModule import create_test_files, create_test_files_more_features import torch_geometric import numpy as np from unittest.mock import patch, Mock from pytest_mock import mocker +from sklearn.model_selection import KFold class MockGraphMakerModule: @@ -54,6 +55,39 @@ def test_kfold_split(create_graph_data_module): assert len(fold) == 2 +def 
test_kfold_split_own_indices(create_test_files_more_features): + tabular1_csv = create_test_files_more_features["tabular1_csv"] + tabular2_csv = create_test_files_more_features["tabular2_csv"] + image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"] + + prediction_task = "binary" + multiclass_dimensions = None + + sources = [tabular1_csv, tabular2_csv, image_torch_file_2d] + + # specifying own kfold indices using a non random split + own_folds = [(train_index, test_index) for train_index, test_index in KFold(n_splits=5).split(range(36))] + + example_fusion_model = Mock() + example_fusion_model.modality_type = "tabular_image" + + datamodule = KFoldGraphDataModule( + num_folds=5, + fusion_model=example_fusion_model, + sources=sources, + graph_creation_method=MockGraphMakerModule, + own_kfold_indices=own_folds, + ) + + datamodule.prepare_data() + folds = datamodule.kfold_split() # returns list of tuples of datasets + + assert len(folds) == 5 # Check if the correct number of folds is generated + + # check if the correct number of samples is in each fold + assert len(folds[0][0]) == len(own_folds[0][0]) + + def test_setup(create_graph_data_module, mocker): datamodule = create_graph_data_module mocker.patch.object( diff --git a/tests/test_data/test_TrainTestDataModule.py b/tests/test_data/test_TrainTestDataModule.py index d8ce212..4901bf6 100644 --- a/tests/test_data/test_TrainTestDataModule.py +++ b/tests/test_data/test_TrainTestDataModule.py @@ -153,8 +153,7 @@ def test_train_dataloader(create_test_files): prediction_task=prediction_task, batch_size=batch_size, test_size=test_size, - multiclass_dimensions=None, - num_folds=None) + multiclass_dimensions=None, ) datamodule.prepare_data() datamodule.setup() @@ -190,8 +189,7 @@ def test_val_dataloader(create_test_files): prediction_task=prediction_task, batch_size=batch_size, test_size=test_size, - multiclass_dimensions=multiclass_dimensions, - num_folds=None) + 
multiclass_dimensions=multiclass_dimensions, ) datamodule.prepare_data() datamodule.setup() @@ -244,7 +242,6 @@ def test_setup_calls_subspace_method(create_test_files): batch_size=batch_size, test_size=test_size, multiclass_dimensions=multiclass_dimensions, - num_folds=None, subspace_method=mock_subspace_method) datamodule.prepare_data() datamodule.setup() @@ -281,7 +278,6 @@ def test_owntestindices(create_test_files_more_features): batch_size=batch_size, test_size=test_size, multiclass_dimensions=multiclass_dimensions, - num_folds=None, test_indices=test_indices) datamodule.prepare_data() datamodule.setup() From 0f5fdb82930e4f3f6cfe554e5a8a2663588056f0 Mon Sep 17 00:00:00 2001 From: Florence Townend Date: Wed, 31 Jan 2024 17:43:34 +0000 Subject: [PATCH 6/6] added train test customisability to documentation on customising training --- docs/customising_training.rst | 63 +++++++++++++++++++ .../test_subspace_and_graph_methods.py | 2 - .../test_subspace_modifications.py | 10 ++- 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/docs/customising_training.rst b/docs/customising_training.rst index 9a80418..5a455c4 100644 --- a/docs/customising_training.rst +++ b/docs/customising_training.rst @@ -11,6 +11,7 @@ We will cover the following topics: * Number of epochs * Checkpoint suffix modification * Number of workers in PyTorch DataLoader +* Train/test and cross-validation splitting yourself Early stopping -------------- @@ -248,3 +249,65 @@ You can change the number of workers in the PyTorch DataLoader using the ``num_w fusion_model=example_model, ) + + +----- + +Train/test and cross-validation splitting yourself +--------------------------------------------------- + +By default, fusilli will split your data into train/test or cross-validation splits for you randomly based on a test size or a number of folds you specify in the :func:`~.fusilli.data.prepare_fusion_data` function. 
+ +You can remove the randomness and specify the data indices for train and test, or for the different cross validation folds yourself by passing in optional arguments to :func:`~.fusilli.data.prepare_fusion_data`. + + +For train/test splitting, the argument `test_indices` should be a list of indices for the test set. To make the test set the first 6 data points in the overall dataset, follow the example below: + +.. code-block:: python + + from fusilli.data import prepare_fusion_data + from fusilli.train import train_and_save_models + + test_indices = [0, 1, 2, 3, 4, 5] + + datamodule = prepare_fusion_data( + prediction_task="binary", + fusion_model=example_model, + data_paths=data_paths, + output_paths=output_path, + test_indices=test_indices, + ) + +For specifying your own cross validation folds, the argument `own_kfold_indices` should be a list of ``(train_indices, test_indices)`` tuples, one tuple per fold, with as many tuples as ``num_folds``. + +If you wanted to have non-random cross validation folds through your data, you can either specify the folds by hand, like so for 3 folds: + +.. code-block:: python + + own_kfold_indices = [ + ([ 4, 5, 6, 7, 8, 9, 10, 11], [0, 1, 2, 3]), # first fold + ([ 0, 1, 2, 3, 8, 9, 10, 11], [4, 5, 6, 7]), # second fold + ([ 0, 1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11]) # third fold + ] + +Or to do this automatically, use the Scikit-Learn `KFold functionality <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html>`_ to generate the folds outside of the fusilli functions, like so: + +..
code-block:: python + + from sklearn.model_selection import KFold + + num_folds = 5 + + own_kfold_indices = [(train_index, test_index) for train_index, test_index in KFold(n_splits=num_folds).split(range(len(dataset)))] + + + datamodule = prepare_fusion_data( + kfold=True, + prediction_task="binary", + fusion_model=example_model, + data_paths=data_paths, + output_paths=output_path, + own_kfold_indices=own_kfold_indices, + num_folds=num_folds, + ) + diff --git a/tests/test_models/test_subspace_and_graph_methods.py b/tests/test_models/test_subspace_and_graph_methods.py index 5bc9a37..a30a514 100644 --- a/tests/test_models/test_subspace_and_graph_methods.py +++ b/tests/test_models/test_subspace_and_graph_methods.py @@ -58,7 +58,6 @@ def sample_datamodule(create_test_files): prediction_task="binary", batch_size=8, test_size=0.3, - num_folds=None, multiclass_dimensions=None, ) @@ -96,7 +95,6 @@ def sample_tabimg_datamodule(create_test_files): prediction_task="binary", batch_size=8, test_size=0.3, - num_folds=None, multiclass_dimensions=None, ) diff --git a/tests/test_modifications/test_subspace_modifications.py b/tests/test_modifications/test_subspace_modifications.py index f5f8e16..9deade7 100644 --- a/tests/test_modifications/test_subspace_modifications.py +++ b/tests/test_modifications/test_subspace_modifications.py @@ -731,8 +731,7 @@ def model_instance_denoising_autoencoder_subspace_method_2D(create_test_files): prediction_task="binary", batch_size=batch_size, test_size=0.2, - multiclass_dimensions=None, - num_folds=None) + multiclass_dimensions=None, ) dm.prepare_data() dm.setup() @@ -757,8 +756,7 @@ def model_instance_denoising_autoencoder_subspace_method_3D(create_test_files): prediction_task="binary", batch_size=batch_size, test_size=0.2, - multiclass_dimensions=None, - num_folds=None) + multiclass_dimensions=None, ) dm.prepare_data() dm.setup() @@ -783,7 +781,7 @@ def model_instance_concat_img_latent_tab_subspace_method_2D(create_test_files): 
prediction_task="binary", batch_size=batch_size, test_size=0.2, - multiclass_dimensions=None, num_folds=None) + multiclass_dimensions=None, ) dm.prepare_data() dm.setup() @@ -808,7 +806,7 @@ def model_instance_concat_img_latent_tab_subspace_method_3D(create_test_files): prediction_task="binary", batch_size=batch_size, test_size=0.2, - multiclass_dimensions=None, num_folds=None) + multiclass_dimensions=None, ) datamodule.prepare_data() datamodule.setup()