added train test customisability to documentation on customising trai…

…ning
florencejt · Jan 31, 2024 · 0f5fdb8 · 0f5fdb8
1 parent 022569c
commit 0f5fdb8
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 8 deletions.
diff --git a/docs/customising_training.rst b/docs/customising_training.rst
@@ -11,6 +11,7 @@ We will cover the following topics:
 * Number of epochs
 * Checkpoint suffix modification
 * Number of workers in PyTorch DataLoader
+* Train/test and cross-validation splitting yourself
 
 Early stopping
 --------------
@@ -248,3 +249,65 @@ You can change the number of workers in the PyTorch DataLoader using the ``num_w
             fusion_model=example_model,
         )
 
+
+
+-----
+
+Train/test and cross-validation splitting yourself
+---------------------------------------------------
+
+By default, fusilli randomly splits your data into train/test sets or cross-validation folds for you, based on the test size or the number of folds you specify in the :func:`~.fusilli.data.prepare_fusion_data` function.
+
+You can remove the randomness and specify the data indices yourself, either for the train/test split or for each of the cross-validation folds, by passing optional arguments to :func:`~.fusilli.data.prepare_fusion_data`.
+
+
+For train/test splitting, the argument ``test_indices`` should be a list of indices for the test set. To make the test set the first 6 data points in the overall dataset, follow the example below:
+
+.. code-block:: python
+
+    from fusilli.data import prepare_fusion_data
+    from fusilli.train import train_and_save_models
+
+    test_indices = [0, 1, 2, 3, 4, 5]
+
+    datamodule = prepare_fusion_data(
+            prediction_task="binary",
+            fusion_model=example_model,
+            data_paths=data_paths,
+            output_paths=output_path,
+            test_indices=test_indices,
+        )
+
+To specify your own cross-validation folds, the argument ``own_kfold_indices`` should be a list containing one ``(train_indices, test_indices)`` pair per fold, as in the examples below.
+
+If you want non-random cross-validation folds through your data, you can either specify the folds by hand, like so for 3 folds:
+
+.. code-block:: python
+
+    own_kfold_indices = [
+        ([ 4,  5,  6,  7,  8,  9, 10, 11], [0, 1, 2, 3]), # first fold
+        ([ 0,  1,  2,  3,  8,  9, 10, 11], [4, 5, 6, 7]), # second fold
+        ([ 0,  1,  2,  3,  4,  5,  6,  7], [8, 9, 10, 11]) # third fold
+    ]
+
+Or to do this automatically, use the Scikit-Learn `KFold functionality <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html>`_ to generate the folds outside of the fusilli functions, like so:
+
+.. code-block:: python
+
+    from sklearn.model_selection import KFold
+
+    num_folds = 5
+
+    own_kfold_indices = [(train_index, test_index) for train_index, test_index in KFold(n_splits=num_folds).split(range(len(dataset)))]
+
+
+    datamodule = prepare_fusion_data(
+        kfold=True,
+        prediction_task="binary",
+        fusion_model=example_model,
+        data_paths=data_paths,
+        output_paths=output_path,
+        own_kfold_indices=own_kfold_indices,
+        num_folds=num_folds,
+    )
+
diff --git a/tests/test_models/test_subspace_and_graph_methods.py b/tests/test_models/test_subspace_and_graph_methods.py
@@ -58,7 +58,6 @@ def sample_datamodule(create_test_files):
                              prediction_task="binary",
                              batch_size=8,
                              test_size=0.3,
-                             num_folds=None,
                              multiclass_dimensions=None,
                              )
 
@@ -96,7 +95,6 @@ def sample_tabimg_datamodule(create_test_files):
                              prediction_task="binary",
                              batch_size=8,
                              test_size=0.3,
-                             num_folds=None,
                              multiclass_dimensions=None,
                              )
 

diff --git a/tests/test_modifications/test_subspace_modifications.py b/tests/test_modifications/test_subspace_modifications.py
@@ -731,8 +731,7 @@ def model_instance_denoising_autoencoder_subspace_method_2D(create_test_files):
                              prediction_task="binary",
                              batch_size=batch_size,
                              test_size=0.2,
-                             multiclass_dimensions=None,
-                             num_folds=None)
+                             multiclass_dimensions=None, )
     dm.prepare_data()
     dm.setup()
 
@@ -757,8 +756,7 @@ def model_instance_denoising_autoencoder_subspace_method_3D(create_test_files):
                              prediction_task="binary",
                              batch_size=batch_size,
                              test_size=0.2,
-                             multiclass_dimensions=None,
-                             num_folds=None)
+                             multiclass_dimensions=None, )
     dm.prepare_data()
     dm.setup()
 
@@ -783,7 +781,7 @@ def model_instance_concat_img_latent_tab_subspace_method_2D(create_test_files):
                              prediction_task="binary",
                              batch_size=batch_size,
                              test_size=0.2,
-                             multiclass_dimensions=None, num_folds=None)
+                             multiclass_dimensions=None, )
     dm.prepare_data()
     dm.setup()
 
@@ -808,7 +806,7 @@ def model_instance_concat_img_latent_tab_subspace_method_3D(create_test_files):
                                      prediction_task="binary",
                                      batch_size=batch_size,
                                      test_size=0.2,
-                                     multiclass_dimensions=None, num_folds=None)
+                                     multiclass_dimensions=None, )
     datamodule.prepare_data()
     datamodule.setup()