From 7a2cffdef8936b1b3273ea52a8493626f0cee92d Mon Sep 17 00:00:00 2001
From: Florence Townend <f.j.townend@live.com>
Date: Wed, 31 Jan 2024 16:17:56 +0000
Subject: [PATCH 1/6] Added input arguments to traintestdatamodule and
 traintestgraphdatamodule to specify own test indices

---
 .gitignore      |  5 +++++
 fusilli/data.py | 54 ++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index a290e06..b61c06e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,8 @@ cython_debug/
 
 # tracks the version
 _version.py
+
+# rogue directories from example notebooks running in local space
+checkpoints/
+loss_figures/
+loss_logs/
\ No newline at end of file
diff --git a/fusilli/data.py b/fusilli/data.py
index 2a2a505..b507651 100644
--- a/fusilli/data.py
+++ b/fusilli/data.py
@@ -422,6 +422,9 @@ class TrainTestDataModule(pl.LightningDataModule):
         Early stopping callback class.
     num_workers : int
         Number of workers for the dataloader (default 0).
+    test_indices : list
+        List of indices to use for testing (default None). If None, the test indices are
+        randomly selected using the test_size parameter.
     kwargs : dict
         Dictionary of extra arguments for the subspace method class.
     """
@@ -443,6 +446,7 @@ def __init__(
             extra_log_string_dict=None,
             own_early_stopping_callback=None,
             num_workers=0,
+            test_indices=None,
             kwargs=None,
     ):
         """
@@ -482,6 +486,9 @@ def __init__(
             Early stopping callback class (default None).
         num_workers : int
             Number of workers for the dataloader (default 0).
+        test_indices : list
+            List of indices to use for testing (default None). If None, the test indices are
+            randomly selected using the test_size parameter.
         kwargs : dict
             Dictionary of extra arguments for the subspace method class.
         """
@@ -515,6 +522,7 @@ def __init__(
         self.max_epochs = max_epochs
         self.own_early_stopping_callback = own_early_stopping_callback
         self.num_workers = num_workers
+        self.test_indices = test_indices
         self.kwargs = kwargs
 
     def prepare_data(self):
@@ -555,9 +563,17 @@ def setup(
         """
 
         # split the dataset into train and test sets
-        [self.train_dataset, self.test_dataset] = torch.utils.data.random_split(
-            self.dataset, [1 - self.test_size, self.test_size]
-        )
+        if self.test_indices is None:
+            [self.train_dataset, self.test_dataset] = torch.utils.data.random_split(
+                self.dataset, [1 - self.test_size, self.test_size]
+            )
+        else:
+            self.test_dataset = torch.utils.data.Subset(
+                self.dataset, self.test_indices
+            )
+            self.train_dataset = torch.utils.data.Subset(
+                self.dataset, list(set(range(len(self.dataset))) - set(self.test_indices))
+            )
 
         if self.subspace_method is not None:  # if subspace method is specified
             if (
@@ -834,6 +850,7 @@ def kfold_split(self):
         """
 
         # split the dataset into k folds
+        # TODO change this into a function which takes in indices or random split directives
         kf = KFold(n_splits=self.num_folds, shuffle=True)
 
         # get the indices of the dataset
@@ -1046,7 +1063,9 @@ class TrainTestGraphDataModule:
         List of indices for testing. Created in setup().
     graph_data : graph data structure
         Graph data structure. Created in setup().
-
+    own_test_indices : list
+        List of indices to use for testing (default None). If None, the test indices are
+        randomly selected using the test_size parameter.
     """
 
     def __init__(
@@ -1058,6 +1077,7 @@ def __init__(
             image_downsample_size=None,
             layer_mods=None,
             extra_log_string_dict=None,
+            own_test_indices=None,
     ):
         """
         Parameters
@@ -1079,6 +1099,9 @@ def __init__(
             (default None)
         extra_log_string_dict : dict
             Dictionary of extra strings to add to the log.
+        own_test_indices : list
+            List of indices to use for testing (default None). If None, the test indices are
+            randomly selected using the test_size parameter.
 
         """
 
@@ -1107,6 +1130,7 @@ def __init__(
         self.test_size = test_size
         self.graph_creation_method = graph_creation_method
         self.layer_mods = layer_mods
+        self.own_test_indices = own_test_indices
 
     def prepare_data(self):
         """
@@ -1133,11 +1157,17 @@ def setup(self):
         None
         """
         # get random train and test idxs
-        [train_dataset, test_dataset] = torch.utils.data.random_split(
-            self.dataset, [1 - self.test_size, self.test_size]
-        )
-        self.train_idxs = train_dataset.indices
-        self.test_idxs = test_dataset.indices
+        if self.own_test_indices is not None:
+            [train_dataset, test_dataset] = torch.utils.data.random_split(
+                self.dataset, [1 - self.test_size, self.test_size]
+            )
+            self.train_idxs = train_dataset.indices
+            self.test_idxs = test_dataset.indices
+        else:
+            self.test_idxs = self.own_test_indices
+            self.train_idxs = list(
+                set(range(len(self.dataset))) - set(self.test_idxs)
+            )
 
         # get the graph data structure
         self.graph_maker_instance = self.graph_creation_method(self.dataset)
@@ -1378,6 +1408,7 @@ def prepare_fusion_data(
         extra_log_string_dict=None,
         own_early_stopping_callback=None,
         num_workers=0,
+        test_indices=None,
         **kwargs,
 ):
     """
@@ -1425,6 +1456,8 @@ def prepare_fusion_data(
         Early stopping callback class (default None).
     num_workers : int
         Number of workers for the dataloader (default 0).
+    test_indices : list or None
+        List of indices to use for testing (default None). If None, then random split is used.
     **kwargs : dict
         Extra keyword arguments. Usable for extra arguments for the subspace method MCVAE's early stopping callback: "mcvae_patience" and "mcvae_tolerance".
 
@@ -1461,6 +1494,7 @@ def prepare_fusion_data(
                 image_downsample_size=image_downsample_size,
                 layer_mods=layer_mods,
                 extra_log_string_dict=extra_log_string_dict,
+                # here is where the kfold split will go
             )
         else:
             graph_data_module = TrainTestGraphDataModule(
@@ -1471,6 +1505,7 @@ def prepare_fusion_data(
                 image_downsample_size=image_downsample_size,
                 layer_mods=layer_mods,
                 extra_log_string_dict=extra_log_string_dict,
+                own_test_indices=test_indices,
             )
 
         graph_data_module.prepare_data()
@@ -1519,6 +1554,7 @@ def prepare_fusion_data(
             extra_log_string_dict=extra_log_string_dict,
             own_early_stopping_callback=own_early_stopping_callback,
             num_workers=num_workers,
+            test_indices=test_indices,
             kwargs=kwargs,
         )
         data_module.prepare_data()

From 4a9948a57b6fb0b1f1daefefbbc47bfca8c02c44 Mon Sep 17 00:00:00 2001
From: Florence Townend <f.j.townend@live.com>
Date: Wed, 31 Jan 2024 16:29:20 +0000
Subject: [PATCH 2/6] added test for traintestdatamodule using specified test
 indices

---
 fusilli/data.py                             |  1 +
 tests/test_data/test_TrainTestDataModule.py | 43 +++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/fusilli/data.py b/fusilli/data.py
index b507651..17ff5fb 100644
--- a/fusilli/data.py
+++ b/fusilli/data.py
@@ -571,6 +571,7 @@ def setup(
             self.test_dataset = torch.utils.data.Subset(
                 self.dataset, self.test_indices
             )
+
             self.train_dataset = torch.utils.data.Subset(
                 self.dataset, list(set(range(len(self.dataset))) - set(self.test_indices))
             )
diff --git a/tests/test_data/test_TrainTestDataModule.py b/tests/test_data/test_TrainTestDataModule.py
index e0b2289..f31a1e6 100644
--- a/tests/test_data/test_TrainTestDataModule.py
+++ b/tests/test_data/test_TrainTestDataModule.py
@@ -255,6 +255,49 @@ def test_setup_calls_subspace_method(create_test_files):
         )
 
 
+# Testing that the test indices are correctly input and used instead of a random split
+def test_owntestindices(create_test_files_more_features):
+    tabular1_csv = create_test_files_more_features["tabular1_csv"]
+    tabular2_csv = create_test_files_more_features["tabular2_csv"]
+    image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"]
+
+    test_size = 0.2
+    prediction_task = "binary"
+    multiclass_dimensions = None
+
+    sources = [tabular1_csv, tabular2_csv, image_torch_file_2d]
+    batch_size = 23
+
+    example_fusion_model = Mock()
+    example_fusion_model.modality_type = "tabular_image"
+
+    # make test indices people 30 to 36
+    test_indices = list(range(25, 36))
+
+    datamodule = TrainTestDataModule(fusion_model=example_fusion_model,
+                                     sources=sources,
+                                     output_paths=None,
+                                     prediction_task=prediction_task,
+                                     batch_size=batch_size,
+                                     test_size=test_size,
+                                     multiclass_dimensions=multiclass_dimensions,
+                                     num_folds=None,
+                                     test_indices=test_indices)
+    datamodule.prepare_data()
+    datamodule.setup()
+
+    # check that the test indices are correctly input
+    assert datamodule.test_indices == test_indices
+    # look at the test dataset
+    test_dataset = datamodule.test_dataset
+    # check that the test dataset has the correct number of people
+    assert len(test_dataset) == len(test_indices)
+    # check train dataset
+    train_dataset = datamodule.train_dataset
+    # check that the train dataset has the correct number of people
+    assert len(train_dataset) == 25
+
+
 # Run pytest
 if __name__ == "__main__":
     pytest.main()

From a22507753b79d528bab8ed097e138d684c8dc192 Mon Sep 17 00:00:00 2001
From: Florence Townend <f.j.townend@live.com>
Date: Wed, 31 Jan 2024 16:33:29 +0000
Subject: [PATCH 3/6] added test for the traintestgraphdatamodule

---
 fusilli/data.py                               |  2 +-
 tests/test_data/test_TrainTestDataModule.py   |  2 +-
 .../test_TrainTestGraphDataModule.py          | 44 +++++++++++++++++--
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/fusilli/data.py b/fusilli/data.py
index 17ff5fb..ad32de9 100644
--- a/fusilli/data.py
+++ b/fusilli/data.py
@@ -1158,7 +1158,7 @@ def setup(self):
         None
         """
         # get random train and test idxs
-        if self.own_test_indices is not None:
+        if self.own_test_indices is None:
             [train_dataset, test_dataset] = torch.utils.data.random_split(
                 self.dataset, [1 - self.test_size, self.test_size]
             )
diff --git a/tests/test_data/test_TrainTestDataModule.py b/tests/test_data/test_TrainTestDataModule.py
index f31a1e6..d8ce212 100644
--- a/tests/test_data/test_TrainTestDataModule.py
+++ b/tests/test_data/test_TrainTestDataModule.py
@@ -271,7 +271,7 @@ def test_owntestindices(create_test_files_more_features):
     example_fusion_model = Mock()
     example_fusion_model.modality_type = "tabular_image"
 
-    # make test indices people 30 to 36
+    # make test indices people 25 to 35 (range end is exclusive)
     test_indices = list(range(25, 36))
 
     datamodule = TrainTestDataModule(fusion_model=example_fusion_model,
diff --git a/tests/test_data/test_TrainTestGraphDataModule.py b/tests/test_data/test_TrainTestGraphDataModule.py
index e5c3711..49be25a 100644
--- a/tests/test_data/test_TrainTestGraphDataModule.py
+++ b/tests/test_data/test_TrainTestGraphDataModule.py
@@ -1,6 +1,6 @@
 import pytest
 from fusilli.data import TrainTestGraphDataModule
-from .test_TrainTestDataModule import create_test_files
+from .test_TrainTestDataModule import create_test_files, create_test_files_more_features
 from pytest import approx
 from pytest_mock import mocker
 
@@ -28,8 +28,6 @@ def create_graph_data_module(create_test_files):
     sources = [tabular1_csv, tabular2_csv, image_torch_file_2d]
     batch_size = 23
 
-    # modality_type = "tabular_tabular"
-
     class example_fusion_model:
         modality_type = "tabular_tabular"
 
@@ -98,5 +96,45 @@ def test_get_lightning_module(create_graph_data_module):
     assert lightning_module is not None
 
 
+# Testing the TrainTestGraphDataModule class for the case where the user specifies their own test indices
+def test_owntestindices(create_test_files_more_features):
+    params = {
+        "test_size": 0.3,
+        "pred_type": "binary",
+        "multiclass_dims": None,
+    }
+
+    tabular1_csv = create_test_files_more_features["tabular1_csv"]
+    tabular2_csv = create_test_files_more_features["tabular2_csv"]
+    image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"]
+
+    sources = [tabular1_csv, tabular2_csv, image_torch_file_2d]
+    batch_size = 23
+
+    # make test indices people 25 to 35 (range end is exclusive)
+    test_indices = list(range(25, 36))
+
+    class example_fusion_model:
+        modality_type = "tabular_tabular"
+
+        def __init__(self):
+            pass
+
+    data_module = TrainTestGraphDataModule(
+        fusion_model=example_fusion_model,
+        sources=sources,
+        graph_creation_method=MockGraphMakerModule,
+        test_size=params["test_size"],
+        own_test_indices=test_indices,
+    )
+
+    data_module.prepare_data()
+    data_module.setup()
+    lightning_module = data_module.get_lightning_module()
+
+    # check that the test indices are the same as the ones we specified
+    assert data_module.test_idxs == test_indices
+
+
 if __name__ == "__main__":
     pytest.main()

From f789c832970ea1107282cefb65c0812f7fdc8ddf Mon Sep 17 00:00:00 2001
From: Florence Townend <f.j.townend@live.com>
Date: Wed, 31 Jan 2024 16:59:22 +0000
Subject: [PATCH 4/6] added setting own kfold indices to kfolddatamodule and
 wrote test

---
 fusilli/data.py                         | 24 +++++++++++----
 tests/test_data/test_KFoldDataModule.py | 41 ++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/fusilli/data.py b/fusilli/data.py
index ad32de9..7a97d14 100644
--- a/fusilli/data.py
+++ b/fusilli/data.py
@@ -722,6 +722,10 @@ class KFoldDataModule(pl.LightningDataModule):
         Early stopping callback class.
     num_workers : int
         Number of workers for the dataloader (default 0).
+    own_kfold_indices : list
+        List of indices to use for k-fold cross validation (default None). If None, the k-fold
+        indices are randomly selected. Structure is a list of tuples of (train_indices,
+        test_indices). Must be the same length as num_folds.
     kwargs : dict
         Dictionary of extra arguments for the subspace method class.
     """
@@ -743,6 +747,7 @@ def __init__(
             extra_log_string_dict=None,
             own_early_stopping_callback=None,
             num_workers=0,
+            own_kfold_indices=None,
             kwargs=None,
     ):
         """
@@ -782,6 +787,10 @@ def __init__(
             Early stopping callback class (default None).
         num_workers : int
             Number of workers for the dataloader (default 0).
+        own_kfold_indices : list
+            List of indices to use for k-fold cross validation (default None). If None, the k-fold
+            indices are randomly selected. Structure is a list of tuples of (train_indices,
+            test_indices). Must be the same length as num_folds.
         kwargs : dict
             Dictionary of extra arguments for the subspace method class.
         """
@@ -822,6 +831,7 @@ def __init__(
         self.max_epochs = max_epochs
         self.own_early_stopping_callback = own_early_stopping_callback
         self.num_workers = num_workers
+        self.own_kfold_indices = own_kfold_indices
         self.kwargs = kwargs
 
     def prepare_data(self):
@@ -849,16 +859,18 @@ def kfold_split(self):
         folds : list
             List of tuples of (train_dataset, test_dataset)
         """
-
-        # split the dataset into k folds
-        # TODO change this into a function which takes in indices or random split directives
-        kf = KFold(n_splits=self.num_folds, shuffle=True)
-
         # get the indices of the dataset
         indices = list(range(len(self.dataset)))
 
+        # split the dataset into k folds
+        if self.own_kfold_indices is None:
+            kf = KFold(n_splits=self.num_folds, shuffle=True)
+            split_kf = kf.split(indices)
+        else:
+            split_kf = self.own_kfold_indices
+
         folds = []
-        for train_indices, val_indices in kf.split(indices):
+        for train_indices, val_indices in split_kf:
             # split the dataset into train and test sets for each fold
             train_dataset = torch.utils.data.Subset(self.dataset, train_indices)
             test_dataset = torch.utils.data.Subset(self.dataset, val_indices)
diff --git a/tests/test_data/test_KFoldDataModule.py b/tests/test_data/test_KFoldDataModule.py
index bd2b2c8..8d05f5e 100644
--- a/tests/test_data/test_KFoldDataModule.py
+++ b/tests/test_data/test_KFoldDataModule.py
@@ -1,8 +1,9 @@
 import pytest
 import torch
 from fusilli.data import KFoldDataModule
-from .test_TrainTestDataModule import create_test_files, MockSubspaceMethod
+from .test_TrainTestDataModule import create_test_files, MockSubspaceMethod, create_test_files_more_features
 from unittest.mock import patch, Mock
+from sklearn.model_selection import KFold
 
 
 @pytest.fixture
@@ -66,6 +67,44 @@ def test_kfold_split(create_kfold_data_module):
     assert len(folds) == 5  # Check if the correct number of folds is generated
 
 
+def test_kfold_split_own_indices(create_test_files_more_features):
+    tabular1_csv = create_test_files_more_features["tabular1_csv"]
+    tabular2_csv = create_test_files_more_features["tabular2_csv"]
+    image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"]
+
+    test_size = 0.2
+    prediction_task = "binary"
+    multiclass_dimensions = None
+
+    sources = [tabular1_csv, tabular2_csv, image_torch_file_2d]
+
+    # specifying own kfold indices using a non random split
+    own_folds = [(train_index, test_index) for train_index, test_index in KFold(n_splits=5).split(range(36))]
+
+    example_fusion_model = Mock()
+    example_fusion_model.modality_type = "tabular_image"
+
+    datamodule = KFoldDataModule(
+        fusion_model=example_fusion_model,
+        sources=sources,
+        output_paths={},
+        prediction_task=prediction_task,
+        multiclass_dimensions=multiclass_dimensions,
+        num_folds=5,
+        test_size=test_size,
+        own_kfold_indices=own_folds,
+        batch_size=9,
+    )
+
+    datamodule.prepare_data()
+    folds = datamodule.kfold_split()  # returns list of tuples of datasets
+
+    assert len(folds) == 5  # Check if the correct number of folds is generated
+
+    # check if the correct number of samples is in each fold
+    assert len(folds[0][0]) == len(own_folds[0][0])
+
+
 def test_train_dataloader(create_kfold_data_module):
     data_module = create_kfold_data_module
     data_module.prepare_data()

From 022569c5d2d4ca4ab79d2c21c9edc6780ad08142 Mon Sep 17 00:00:00 2001
From: Florence Townend <f.j.townend@live.com>
Date: Wed, 31 Jan 2024 17:14:13 +0000
Subject: [PATCH 5/6] separated out kfold and train test data module function
 call in prepare_fusion_data so we don't need to specify test size for kfold
 and num folds for train test redundantly

---
 fusilli/data.py                              | 96 +++++++++++++-------
 tests/test_data/test_KFoldDataModule.py      |  3 -
 tests/test_data/test_KFoldGraphDataModule.py | 36 +++++++-
 tests/test_data/test_TrainTestDataModule.py  |  8 +-
 4 files changed, 98 insertions(+), 45 deletions(-)

diff --git a/fusilli/data.py b/fusilli/data.py
index 7a97d14..b6c54ef 100644
--- a/fusilli/data.py
+++ b/fusilli/data.py
@@ -437,7 +437,6 @@ def __init__(
             prediction_task,
             batch_size,
             test_size,
-            num_folds,  # not needed for train/test split
             multiclass_dimensions,
             subspace_method=None,
             image_downsample_size=None,
@@ -465,8 +464,6 @@ def __init__(
             Batch size (default 8).
         test_size : float
             Fraction of data to use for testing (default 0.2).
-        num_folds : int
-            Total number of folds. Not needed for this class for train/test split but it's here to be consistent with KFoldDataModule.
         multiclass_dimensions : int
             Number of classes for multiclass prediction (default None).
         subspace_method : class
@@ -738,7 +735,6 @@ def __init__(
             prediction_task,
             batch_size,
             num_folds,
-            test_size,  # not needed for k-fold
             multiclass_dimensions,
             subspace_method=None,
             image_downsample_size=None,
@@ -1259,6 +1255,7 @@ def __init__(
             image_downsample_size=None,
             layer_mods=None,
             extra_log_string_dict=None,
+            own_kfold_indices=None,
     ):
         """
         Parameters
@@ -1279,6 +1276,10 @@ def __init__(
             (default None)
         extra_log_string_dict : dict
             Dictionary of extra strings to add to the log.
+        own_kfold_indices : list
+            List of indices to use for k-fold cross validation (default None). If None, the k-fold
+            indices are randomly selected. Structure is a list of tuples of (train_indices,
+            test_indices). Must be the same length as num_folds.
         """
         super().__init__()
         self.num_folds = num_folds  # total number of folds
@@ -1304,6 +1305,7 @@ def __init__(
         self.modality_type = self.fusion_model.modality_type
         self.graph_creation_method = graph_creation_method
         self.layer_mods = layer_mods
+        self.own_kfold_indices = own_kfold_indices
 
     def prepare_data(self):
         """
@@ -1322,21 +1324,28 @@ def kfold_split(self):
         Returns
         ------
         folds : list
-            List of tuples of (graph_data, train_idxs, test_idxs)
+            List of tuples of (train_dataset, test_dataset)
         """
-        # splits the dataset into k folds
-        kf = KFold(n_splits=self.num_folds, shuffle=True)
-        indices = list(range(len(self.dataset)))  # get the indices of the dataset
+        # get the indices of the dataset
+        indices = list(range(len(self.dataset)))
+
+        # split the dataset into k folds
+        if self.own_kfold_indices is None:
+            kf = KFold(n_splits=self.num_folds, shuffle=True)
+            split_kf = kf.split(indices)
+        else:
+            split_kf = self.own_kfold_indices
 
         folds = []
-        for train_indices, val_indices in kf.split(indices):
+        for train_indices, val_indices in split_kf:
             # split the dataset into train and test sets for each fold
             train_dataset = torch.utils.data.Subset(self.dataset, train_indices)
             test_dataset = torch.utils.data.Subset(self.dataset, val_indices)
-            folds.append(
-                (train_dataset, test_dataset)
-            )  # list of tuples of (train_dataset, test_dataset)
-        return folds
+
+            # append the train and test datasets to the folds list
+            folds.append((train_dataset, test_dataset))
+
+        return folds  # list of tuples of (train_dataset, test_dataset)
 
     def setup(self):
         """
@@ -1422,6 +1431,7 @@ def prepare_fusion_data(
         own_early_stopping_callback=None,
         num_workers=0,
         test_indices=None,
+        own_kfold_indices=None,
         **kwargs,
 ):
     """
@@ -1471,6 +1481,8 @@ def prepare_fusion_data(
         Number of workers for the dataloader (default 0).
     test_indices : list or None
         List of indices to use for testing (default None). If None, then random split is used.
+    own_kfold_indices : list or None
+        List of indices to use for k-fold cross validation (default None). If None, then random split is used.
     **kwargs : dict
         Extra keyword arguments. Usable for extra arguments for the subspace method MCVAE's early stopping callback: "mcvae_patience" and "mcvae_tolerance".
 
@@ -1547,29 +1559,43 @@ def prepare_fusion_data(
     else:
         # another other than graph fusion
         if kfold:
-            datamodule_func = KFoldDataModule
+            data_module = KFoldDataModule(
+                fusion_model,
+                sources=data_sources,
+                output_paths=output_paths,
+                prediction_task=prediction_task,
+                batch_size=batch_size,
+                num_folds=num_folds,
+                multiclass_dimensions=multiclass_dimensions,
+                subspace_method=fusion_model.subspace_method,
+                image_downsample_size=image_downsample_size,
+                layer_mods=layer_mods,
+                max_epochs=max_epochs,
+                extra_log_string_dict=extra_log_string_dict,
+                own_early_stopping_callback=own_early_stopping_callback,
+                num_workers=num_workers,
+                own_kfold_indices=own_kfold_indices,
+                kwargs=kwargs,
+            )
         else:
-            datamodule_func = TrainTestDataModule
-
-        data_module = datamodule_func(
-            fusion_model,
-            sources=data_sources,
-            output_paths=output_paths,
-            prediction_task=prediction_task,
-            batch_size=batch_size,
-            test_size=test_size,
-            num_folds=num_folds,
-            multiclass_dimensions=multiclass_dimensions,
-            subspace_method=fusion_model.subspace_method,
-            image_downsample_size=image_downsample_size,
-            layer_mods=layer_mods,
-            max_epochs=max_epochs,
-            extra_log_string_dict=extra_log_string_dict,
-            own_early_stopping_callback=own_early_stopping_callback,
-            num_workers=num_workers,
-            test_indices=test_indices,
-            kwargs=kwargs,
-        )
+            data_module = TrainTestDataModule(
+                fusion_model,
+                sources=data_sources,
+                output_paths=output_paths,
+                prediction_task=prediction_task,
+                batch_size=batch_size,
+                test_size=test_size,
+                multiclass_dimensions=multiclass_dimensions,
+                subspace_method=fusion_model.subspace_method,
+                image_downsample_size=image_downsample_size,
+                layer_mods=layer_mods,
+                max_epochs=max_epochs,
+                extra_log_string_dict=extra_log_string_dict,
+                own_early_stopping_callback=own_early_stopping_callback,
+                num_workers=num_workers,
+                test_indices=test_indices,
+                kwargs=kwargs,
+            )
         data_module.prepare_data()
         data_module.setup(checkpoint_path=checkpoint_path)
 
diff --git a/tests/test_data/test_KFoldDataModule.py b/tests/test_data/test_KFoldDataModule.py
index 8d05f5e..1ef7d2e 100644
--- a/tests/test_data/test_KFoldDataModule.py
+++ b/tests/test_data/test_KFoldDataModule.py
@@ -32,7 +32,6 @@ def create_kfold_data_module(create_test_files):
         multiclass_dimensions=params["multiclass_dims"],
         num_folds=params["num_k"],
         batch_size=batch_size,
-        test_size=0.2
     )
 
     return data_module
@@ -72,7 +71,6 @@ def test_kfold_split_own_indices(create_test_files_more_features):
     tabular2_csv = create_test_files_more_features["tabular2_csv"]
     image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"]
 
-    test_size = 0.2
     prediction_task = "binary"
     multiclass_dimensions = None
 
@@ -91,7 +89,6 @@ def test_kfold_split_own_indices(create_test_files_more_features):
         prediction_task=prediction_task,
         multiclass_dimensions=multiclass_dimensions,
         num_folds=5,
-        test_size=test_size,
         own_kfold_indices=own_folds,
         batch_size=9,
     )
diff --git a/tests/test_data/test_KFoldGraphDataModule.py b/tests/test_data/test_KFoldGraphDataModule.py
index 28b97e8..cc6d36f 100644
--- a/tests/test_data/test_KFoldGraphDataModule.py
+++ b/tests/test_data/test_KFoldGraphDataModule.py
@@ -1,10 +1,11 @@
 import pytest
 from fusilli.data import KFoldGraphDataModule
-from .test_TrainTestDataModule import create_test_files
+from .test_TrainTestDataModule import create_test_files, create_test_files_more_features
 import torch_geometric
 import numpy as np
 from unittest.mock import patch, Mock
 from pytest_mock import mocker
+from sklearn.model_selection import KFold
 
 
 class MockGraphMakerModule:
@@ -54,6 +55,39 @@ def test_kfold_split(create_graph_data_module):
         assert len(fold) == 2
 
 
+def test_kfold_split_own_indices(create_test_files_more_features):
+    tabular1_csv = create_test_files_more_features["tabular1_csv"]
+    tabular2_csv = create_test_files_more_features["tabular2_csv"]
+    image_torch_file_2d = create_test_files_more_features["image_torch_file_2d"]
+
+    prediction_task = "binary"
+    multiclass_dimensions = None
+
+    sources = [tabular1_csv, tabular2_csv, image_torch_file_2d]
+
+    # specifying own kfold indices using a non random split
+    own_folds = [(train_index, test_index) for train_index, test_index in KFold(n_splits=5).split(range(36))]
+
+    example_fusion_model = Mock()
+    example_fusion_model.modality_type = "tabular_image"
+
+    datamodule = KFoldGraphDataModule(
+        num_folds=5,
+        fusion_model=example_fusion_model,
+        sources=sources,
+        graph_creation_method=MockGraphMakerModule,
+        own_kfold_indices=own_folds,
+    )
+
+    datamodule.prepare_data()
+    folds = datamodule.kfold_split()  # returns list of tuples of datasets
+
+    assert len(folds) == 5  # Check if the correct number of folds is generated
+
+    # check if the correct number of samples is in each fold
+    assert len(folds[0][0]) == len(own_folds[0][0])
+
+
 def test_setup(create_graph_data_module, mocker):
     datamodule = create_graph_data_module
     mocker.patch.object(
diff --git a/tests/test_data/test_TrainTestDataModule.py b/tests/test_data/test_TrainTestDataModule.py
index d8ce212..4901bf6 100644
--- a/tests/test_data/test_TrainTestDataModule.py
+++ b/tests/test_data/test_TrainTestDataModule.py
@@ -153,8 +153,7 @@ def test_train_dataloader(create_test_files):
                                      prediction_task=prediction_task,
                                      batch_size=batch_size,
                                      test_size=test_size,
-                                     multiclass_dimensions=None,
-                                     num_folds=None)
+                                     multiclass_dimensions=None, )
 
     datamodule.prepare_data()
     datamodule.setup()
@@ -190,8 +189,7 @@ def test_val_dataloader(create_test_files):
                                      prediction_task=prediction_task,
                                      batch_size=batch_size,
                                      test_size=test_size,
-                                     multiclass_dimensions=multiclass_dimensions,
-                                     num_folds=None)
+                                     multiclass_dimensions=multiclass_dimensions, )
     datamodule.prepare_data()
     datamodule.setup()
 
@@ -244,7 +242,6 @@ def test_setup_calls_subspace_method(create_test_files):
                                          batch_size=batch_size,
                                          test_size=test_size,
                                          multiclass_dimensions=multiclass_dimensions,
-                                         num_folds=None,
                                          subspace_method=mock_subspace_method)
         datamodule.prepare_data()
         datamodule.setup()
@@ -281,7 +278,6 @@ def test_owntestindices(create_test_files_more_features):
                                      batch_size=batch_size,
                                      test_size=test_size,
                                      multiclass_dimensions=multiclass_dimensions,
-                                     num_folds=None,
                                      test_indices=test_indices)
     datamodule.prepare_data()
     datamodule.setup()

From 0f5fdb82930e4f3f6cfe554e5a8a2663588056f0 Mon Sep 17 00:00:00 2001
From: Florence Townend <f.j.townend@live.com>
Date: Wed, 31 Jan 2024 17:43:34 +0000
Subject: [PATCH 6/6] added train test customisability to documentation on
 customising training

---
 docs/customising_training.rst                 | 63 +++++++++++++++++++
 .../test_subspace_and_graph_methods.py        |  2 -
 .../test_subspace_modifications.py            | 10 ++-
 3 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/docs/customising_training.rst b/docs/customising_training.rst
index 9a80418..5a455c4 100644
--- a/docs/customising_training.rst
+++ b/docs/customising_training.rst
@@ -11,6 +11,7 @@ We will cover the following topics:
 * Number of epochs
 * Checkpoint suffix modification
 * Number of workers in PyTorch DataLoader
+* Train/test and cross-validation splitting yourself
 
 Early stopping
 --------------
@@ -248,3 +249,65 @@ You can change the number of workers in the PyTorch DataLoader using the ``num_w
             fusion_model=example_model,
         )
 
+
+
+-----
+
+Train/test and cross-validation splitting yourself
+---------------------------------------------------
+
+By default, fusilli randomly splits your data into train/test sets or cross-validation folds, based on the test size or the number of folds you specify in the :func:`~.fusilli.data.prepare_fusion_data` function.
+
+You can remove the randomness and specify the data indices for train and test, or for the different cross validation folds yourself by passing in optional arguments to :func:`~.fusilli.data.prepare_fusion_data`.
+
+
+For train/test splitting, the argument ``test_indices`` should be a list of indices for the test set. To make the test set the first 6 data points in the overall dataset, follow the example below:
+
+.. code-block:: python
+
+    from fusilli.data import prepare_fusion_data
+    from fusilli.train import train_and_save_models
+
+    test_indices = [0, 1, 2, 3, 4, 5]
+
+    datamodule = prepare_fusion_data(
+            prediction_task="binary",
+            fusion_model=example_model,
+            data_paths=data_paths,
+            output_paths=output_path,
+            test_indices=test_indices,
+        )
+
+To specify your own cross-validation folds, the argument ``own_kfold_indices`` should be a list of ``(train_indices, test_indices)`` tuples, with one tuple for each fold.
+
+If you want non-random cross-validation folds through your data, you can either specify the folds manually, as in this example with 3 folds:
+
+.. code-block:: python
+
+    own_kfold_indices = [
+        ([ 4,  5,  6,  7,  8,  9, 10, 11], [0, 1, 2, 3]), # first fold
+        ([ 0,  1,  2,  3,  8,  9, 10, 11], [4, 5, 6, 7]), # second fold
+        ([ 0,  1,  2,  3,  4,  5,  6,  7], [8, 9, 10, 11]) # third fold
+    ]
+
+Or to do this automatically, use the Scikit-Learn `KFold functionality <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html>`_ to generate the folds outside of the fusilli functions, like so:
+
+.. code-block:: python
+
+    from sklearn.model_selection import KFold
+
+    num_folds = 5
+
+    own_kfold_indices = [(train_index, test_index) for train_index, test_index in KFold(n_splits=num_folds).split(range(len(dataset)))]
+
+
+    datamodule = prepare_fusion_data(
+        kfold=True,
+        prediction_task="binary",
+        fusion_model=example_model,
+        data_paths=data_paths,
+        output_paths=output_path,
+        own_kfold_indices=own_kfold_indices,
+        num_folds=num_folds,
+    )
+
diff --git a/tests/test_models/test_subspace_and_graph_methods.py b/tests/test_models/test_subspace_and_graph_methods.py
index 5bc9a37..a30a514 100644
--- a/tests/test_models/test_subspace_and_graph_methods.py
+++ b/tests/test_models/test_subspace_and_graph_methods.py
@@ -58,7 +58,6 @@ def sample_datamodule(create_test_files):
                              prediction_task="binary",
                              batch_size=8,
                              test_size=0.3,
-                             num_folds=None,
                              multiclass_dimensions=None,
                              )
 
@@ -96,7 +95,6 @@ def sample_tabimg_datamodule(create_test_files):
                              prediction_task="binary",
                              batch_size=8,
                              test_size=0.3,
-                             num_folds=None,
                              multiclass_dimensions=None,
                              )
 
diff --git a/tests/test_modifications/test_subspace_modifications.py b/tests/test_modifications/test_subspace_modifications.py
index f5f8e16..9deade7 100644
--- a/tests/test_modifications/test_subspace_modifications.py
+++ b/tests/test_modifications/test_subspace_modifications.py
@@ -731,8 +731,7 @@ def model_instance_denoising_autoencoder_subspace_method_2D(create_test_files):
                              prediction_task="binary",
                              batch_size=batch_size,
                              test_size=0.2,
-                             multiclass_dimensions=None,
-                             num_folds=None)
+                             multiclass_dimensions=None, )
     dm.prepare_data()
     dm.setup()
 
@@ -757,8 +756,7 @@ def model_instance_denoising_autoencoder_subspace_method_3D(create_test_files):
                              prediction_task="binary",
                              batch_size=batch_size,
                              test_size=0.2,
-                             multiclass_dimensions=None,
-                             num_folds=None)
+                             multiclass_dimensions=None, )
     dm.prepare_data()
     dm.setup()
 
@@ -783,7 +781,7 @@ def model_instance_concat_img_latent_tab_subspace_method_2D(create_test_files):
                              prediction_task="binary",
                              batch_size=batch_size,
                              test_size=0.2,
-                             multiclass_dimensions=None, num_folds=None)
+                             multiclass_dimensions=None, )
     dm.prepare_data()
     dm.setup()
 
@@ -808,7 +806,7 @@ def model_instance_concat_img_latent_tab_subspace_method_3D(create_test_files):
                                      prediction_task="binary",
                                      batch_size=batch_size,
                                      test_size=0.2,
-                                     multiclass_dimensions=None, num_folds=None)
+                                     multiclass_dimensions=None, )
     datamodule.prepare_data()
     datamodule.setup()