Started interpretation chapter

Plant-Food-Research-Open · Jan 25, 2024 · 3e556d0 · 3e556d0
1 parent d653713
commit 3e556d0
Show file tree

Hide file tree

Showing 7 changed files with 835 additions and 8 deletions.
diff --git a/_targets.R b/_targets.R
@@ -541,13 +541,38 @@ list(
   ## Results interpretation ----
   ##========================##
 
-  ## Formatting DIABLO output
+  ## Formatting outputs
+  tar_target(
+    spls_output,
+    get_output(spls_final_run)
+  ),
+
+  tar_target(
+    so2pls_output,
+    get_output(so2pls_final_run)
+  ),
+
+  tar_target(
+    mofa_output,
+    get_output(mofa_trained)
+  ),
+
   tar_target(
     diablo_output,
     get_output(diablo_final_run)
   ),
 
-  ## Formatting DIABLO output - individual latent dimensions
+  ## Formatting output - individual latent dimensions
+  tar_target(
+    spls_output_no_average,
+    get_output(spls_final_run, use_average_dimensions = FALSE)
+  ),
+
+  tar_target(
+    so2pls_output_no_average,
+    get_output(so2pls_final_run, use_average_dimensions = FALSE)
+  ),
+
   tar_target(
     diablo_output_no_average,
     get_output(diablo_final_run, use_average_dimensions = FALSE)

diff --git a/_targets/meta/meta b/_targets/meta/meta
@@ -53,6 +53,7 @@ mo_set_de|stem|2b7e2ace37c0bed5|50f81c565a9dc6aa|1d4929f8d72b5561|1698094396||t1
 mo_set_transformed|stem|28334a82bdfebaa7|1a49152e06c5222d|6f732476e2857317|385723480||t19739.9580970037s|5c19986154657d81|57773059|rds|local|vector|||0.252||
 mo_set_with_names|stem|e64ebcd89ddb70bd|b88cddcfa5d2a267|6eebd2f68cb73320|-1640374052||t19738.0568285842s|593ace30823e21f2|36753195|rds|local|vector|||0.114||
 mofa_input|stem|4d8e1c14434288d1|3d5db0316055019b|b1ff01b4e620143d|1284502458||t19746.8899165855s|4110beb5e0ca03cd|877459|rds|local|vector|||0.201|Dataset snps is to be modelled with a poisson likelihood, but is not integer. Transforming to integer.|
+mofa_output|stem|5352fa96ef81a7b7|0d8c9707690b395c|e568211333f1077a|-1474909642||t19746.9922984972s|a3806d14d44620e6|524047|rds|local|vector|||0.733||
 mofa_trained|stem|f523901c69cc8163|ec971dccb1086e60|dec5748f700e63fc|1854191013||t19746.8912693121s|b341cd3de1b49e2f|1004645|rds|local|vector|||116.597|No output filename provided. Using tmpRtmpaM7XWKmofa_20240125102128.hdf5 to store the trained model.. Factors 1 are strongly correlated with the total number of expressed features for at least one of your omics. Such factors appear when there are differences in the total levels between your samples, sometimes because of poor normalisation in the preprocessing steps.|
 omicspls_input|stem|488cc165105ef587|5103f8e1eb31dfb4|3fe933389d7e1821|1468063698||t19746.0144579806s|8dac9e868840a7ff|957717|rds|local|vector|||0.009||
 pca_mats_list|pattern|c831051151e4add2|72b302a65c0920ed||1078442447||||12164806|rds|local|list||pca_mats_list_302d7473*pca_mats_list_84d36937*pca_mats_list_64d37e6c|0.09||
@@ -81,12 +82,16 @@ so2pls_cv_sparsity_plot|stem|b5b7db300d09f754|2bf66d7f84b86f7d|93dca9057097acdb|
 so2pls_cv_sparsity_res|stem|bbb19afe218657af|067919e1b7030d67|dca9de770ecaf520|44551085||t19746.0265823152s|5a7113535e3388cd|147|rds|local|vector|||0.001||
 so2pls_cv_sparsity_table|stem|781bb5699071e489|ec976e7fed4ebc35|aed3d19ef29685db|1893373008||t19746.0265830958s|0cca50742c16a705|215|rds|local|vector|||0.042||
 so2pls_final_run|stem|1b5fda3f0c9ef4de|16cefdd61090d51e|2dad9253d5d29425|-1731732847||t19746.0266013504s|b09eaec0c8fed384|140427|rds|local|vector|||1.555||
+so2pls_output|stem|a5eca781575f961d|c5a60834f955fae6|71196bf0103fc69c|236025425||t19746.9923031487s|c7ea9b7f646d2838|130837|rds|local|vector|||0.225||
+so2pls_output_no_average|stem|d5a8f2b14dec6fb7|b225132f8f3d4451|71196bf0103fc69c|-886261613||t19747.0777562921s|d3f8ef5c8e7df8b4|131975|rds|local|vector|||0.242||
 spls_final_run|stem|aa85c5b6a3d8ec0c|bd201898ff611d8b|ae125251e89a9069|-1455060732||t19745.8622943744s|87decae32b6cb84a|1828750|rds|local|vector|||0.024||
 spls_input|stem|822f9a95396547da|3096087fd4d5c989|cb02bfd15d457e4a|70678725||t19745.8585853016s|8f898ab1f7e895d8|838523|rds|local|vector|||0.029||
 spls_multilevel1|stem|21cb78d9bdf5ef8f|740ca0497728ae2d|15eb1ccc2d3995df|-1820918706||t19745.8585170022s|bbe0e43529942e5e|163|rds|local|vector|||0.25||
 spls_multilevel2|stem|27547dc59b1b797d|de611361e6a5ff7d|1cd1f9a9228aec67|-2126981166||t19745.8585179626s|c8c79f11dd07570f|276|rds|local|vector|||0.064||
 spls_novarsel|stem|2073bd192ac0dfda|30372f335fc03181|d6f3e150a3c8976b|-1501000619||t19745.8585960049s|fdd5d4e3f6846793|1911012|rds|local|vector|||0.782||
 spls_optim_ncomp|stem|868e446eec7768a0|b1383d8e0854aeaf|def8f3d8099ba68d|1415549958||t19745.8587854776s|ded833868582137a|50|rds|local|vector|||0||
+spls_output|stem|629ea9b921d70759|6f07c3949ff47eeb|cfb7c1c93cfa9d4d|546222073||t19746.9923001833s|5d631a45034ba74c|9842|rds|local|vector|||0.075||
+spls_output_no_average|stem|8d15e72f10ebca21|c5dea81b152a78a1|cfb7c1c93cfa9d4d|-304276829||t19747.0777529793s|639b123d18780fa9|12261|rds|local|vector|||0.1||
 spls_perf_plot|stem|4dcf242f5c9a1018|47637718d3ef1230|0d5a2d90554e3cd7|851070939||t19745.8587849226s|9b5a79b4b56b45a1|2952976|rds|local|vector|||0.033||
 spls_perf_res|stem|1a1bcbdff1925787|511d051310ac405a|7e0bf23b466860d6|67901905||t19745.8587796978s|2ba6a48a8b1bb66b|2366615|rds|local|vector|||15.534||
 spls_smeta1|stem|f28cc0eb8f9dc6a1|2aeb86c6eeb5ba67|a4e7b89125ffabc5|1796145472||t19745.8579963991s|c3cbfe28e1565b13|202|rds|local|vector|||1.645||

diff --git a/docs/interpretation.html b/docs/interpretation.html
diff --git a/docs/prefiltering.html b/docs/prefiltering.html
@@ -296,7 +296,7 @@ <h1 class="quarto-secondary-nav-title"><span class="chapter-number">7</span>&nbs
 <a href="#features-preselection" id="toc-features-preselection" class="nav-link" data-scroll-target="#features-preselection"><span class="toc-section-number">7.3</span>  Features preselection</a>
   <ul class="collapse">
 <li><a href="#unsupervised-features-preselection" id="toc-unsupervised-features-preselection" class="nav-link" data-scroll-target="#unsupervised-features-preselection"><span class="toc-section-number">7.3.1</span>  Unsupervised features preselection</a></li>
-  <li><a href="#supervised-features-preselection" id="toc-supervised-features-preselection" class="nav-link" data-scroll-target="#supervised-features-preselection"><span class="toc-section-number">7.3.2</span>  Supervised features preselection</a></li>
+  <li><a href="#sec-prefiltering-supervised" id="toc-sec-prefiltering-supervised" class="nav-link" data-scroll-target="#sec-prefiltering-supervised"><span class="toc-section-number">7.3.2</span>  Supervised features preselection</a></li>
   </ul>
 </li>
   <li><a href="#recap-targets-list" id="toc-recap-targets-list" class="nav-link" data-scroll-target="#recap-targets-list"><span class="toc-section-number">7.4</span>  Recap – targets list</a></li>
@@ -614,7 +614,7 @@ <h1 class="title"><span id="sec-prefiltering" class="quarto-section-identifier d
 <div class="sourceCode" id="cb21"><pre class="downlit sourceCode r code-with-copy"><code class="sourceCode R"><span><span class="fu"><a href="https://rdrr.io/pkg/moiraine/man/select_features_mad.html">select_features_mad</a></span><span class="op">(</span><span class="va">mo_set_complete</span>, <span class="st">"rnaseq"</span>, to_keep_n <span class="op">=</span> <span class="fl">1000</span><span class="op">)</span></span>
 <span><span class="fu"><a href="https://rdrr.io/pkg/moiraine/man/select_features_mad.html">select_features_mad</a></span><span class="op">(</span><span class="va">mo_set_complete</span>, <span class="st">"rnaseq"</span>, to_keep_prop <span class="op">=</span> <span class="fl">0.5</span><span class="op">)</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
 </div>
-</section><section id="supervised-features-preselection" class="level3" data-number="7.3.2"><h3 data-number="7.3.2" class="anchored" data-anchor-id="supervised-features-preselection">
+</section><section id="sec-prefiltering-supervised" class="level3" data-number="7.3.2"><h3 data-number="7.3.2" class="anchored" data-anchor-id="sec-prefiltering-supervised">
 <span class="header-section-number">7.3.2</span> Supervised features preselection</h3>
 <p>Another approach to features preselection can be preferred when we are trying to assess the features most relevant to an outcome of interest or to differences between sample groups. In this scenario, prior to integrating the datasets, it could be useful to reduce the size of the datasets by filtering out the features that are least associated with the outcome of interest. In this case, we can use some single-omics feature selection method to perform a first “crude” prefiltering.</p>
 <p><code>moiraine</code> relies on the sPLS-DA algorithm implemented in the <code>mixOmics</code> package for this. <a href="https://mixomicsteam.github.io/mixOmics-Vignette/id_05.html">sparse Partial Least-Squares Discriminant Analysis</a> (or sPLS-DA for short) is a feature selection method that aims to detect, in a multivariate dataset, the variables or features that best discriminate a categorical outcome of interest in the observations. The advantages of sPLS-DA is that it can handle datasets in which there are more features than samples, which is typically the case in omics datasets. More information can be found in <span class="citation" data-cites="lêcao2011">Lê Cao, Boitard, and Besse (<a href="references.html#ref-l%C3%AAcao2011" role="doc-biblioref">2011</a>)</span> or in the <a href="https://mixomicsteam.github.io/mixOmics-Vignette/id_05.html"><code>mixOmics</code> vignette</a>. By running an sPLS-DA analysis on each dataset separately, we can remove the features that are least informative with respect to the trait or outcome of interest. We refer to this approach as supervised preselection, as it relies on information about the samples to select the features of interest.</p>

diff --git a/docs/search.json b/docs/search.json
diff --git a/interpretation.qmd b/interpretation.qmd
@@ -4,6 +4,256 @@
 #| child: "_setup.qmd"
 ```
 
+```{r loading-packages}
+#| include: false
+
+library(targets)
+library(moiraine)
+library(purrr)
+library(dplyr)
+library(ggplot2)
+library(patchwork)
+```
+
+```{r setup-visible}
+#| eval: false
+
+library(targets)
+library(moiraine)
+
+## For working with lists
+library(purrr)
+
+## For data-frames manipulation
+library(dplyr)
+
+## For colour palettes
+library(ggplot2)
+
+## For manipulating patchworks of plots
+library(patchwork)
+```
+
+<details>
+
+<summary>`_targets.R` script</summary>
+
+```{file targets-script}
+```
+
+::: {.panel-tabset group="method"}
+#### sPLS
+
+Tab content
+
+#### sO2PLS
+
+Tab content
+
+#### MOFA
+
+Tab content
+
+#### DIABLO
+
+Tab content
+:::
+
+</details>
+
+
+
+
+## Generating a standardised output
+
+Despite relying on very different statistical approaches, the different integration methods included in the pipeline all perform dimension reduction of the omics datasets through feature extraction. That is, they construct a small number of latent components/variables/dimensions (that we refer to as **latent dimensions** in the `moiraine` package) that capture as much information from the original datasets as possible. A dimension reduction approach typically returns, for each latent dimension constructed, two sets of values:
+
+* **Features weight**: the contribution of the features from the different omics datasets to the latent dimension. All methods included in the pipeline construct latent dimensions as linear combinations of the original features, and therefore the features contribution is quantified by their weight in the linear combination.
+
+* **Samples score**: the projection of the samples onto the latent dimension.
+
+In addition, the fraction or percentage of variance that each latent dimension explains in the different omics datasets is usually calculated.
+
+### `get_output` function
+
+
+In the `moiraine` package, the output of the different integration methods can be converted to a standardised output containing these three pieces of information (features weight, samples score and percentage of variance explained) stored in a consistent format. This enables us to construct functions for visualisation or analysis which can be applied to the results of any integration method, rather than having to implement one for each object type returned by the different integration packages.
+
+The `get_output()` function transforms the output from any integration package included in `moiraine` into an `output_dimension_reduction` object, which is a list with three tibbles: `features_weight`, `samples_score` and `variance_explained`:
+
+::: {.panel-tabset group="method"}
+#### sPLS
+
+```{targets get-output-spls}
+tar_target(
+  spls_output,
+  get_output(spls_final_run)
+)
+```
+
+```{r show-output-spls}
+tar_load(spls_output)
+spls_output
+```
+
+#### sO2PLS
+
+```{targets get-output-so2pls}
+tar_target(
+  so2pls_output,
+  get_output(so2pls_final_run)
+)
+```
+
+```{r show-output-so2pls}
+tar_load(so2pls_output)
+so2pls_output
+```
+
+#### MOFA
+
+```{targets get-output-mofa}
+tar_target(
+  mofa_output,
+  get_output(mofa_trained)
+)
+```
+
+```{r show-output-mofa}
+tar_load(mofa_output)
+mofa_output
+```
+
+#### DIABLO
+
+```{targets get-output-diablo}
+tar_target(
+  diablo_output,
+  get_output(diablo_final_run)
+)
+```
+
+```{r show-output-diablo}
+tar_load(diablo_output)
+diablo_output
+```
+:::
+
+The `features_weight` tibble contains one row per combination of feature and latent dimension. The ID of the features and the name of the dataset from which they originate are stored in the `feature_id` and `dataset` columns, respectively. The `latent_dimension` column gives the name of the latent dimension; this is a factor column. For each feature and latent dimension, the `weight` column shows the weight that was attributed to the feature for the corresponding latent dimension. In addition, the `importance` column contains the features importance score, which is computed as the absolute value of the features weight, divided by the maximum absolute weight across all features from the same omics dataset for the corresponding latent dimension. This importance score allows us to compare the contribution of the features across latent dimensions or integration methods, as the weight can be on different scales and thus cannot be directly compared. The importance scores range from 0 to 1. For any method performing feature selection (e.g. sPLS or DIABLO), features that were not selected for a given latent dimension are assigned a weight and importance score of 0.
+
+The `samples_score` tibble contains for each sample (`sample_id`) and latent dimension (`latent_dimension`) the sample's coordinate for the corresponding latent dimension.
+
+The `variance_explained` tibble gives for each latent dimension (`latent_dimension`) the proportion of variance explained (`prop_var_expl`) for each dataset (`dataset`). The values in `prop_var_expl` are between 0 and 1.
+
+For convenience, the `get_latent_dimensions()` function can be used on an `output_dimension_reduction` object to see the names of the latent dimensions (the levels used for the `latent_dimension` column in each tibble):
+
+::: {.panel-tabset group="method"}
+#### sPLS
+
+```{r show-latent-dimensions-spls}
+get_latent_dimensions(spls_output)
+```
+
+
+#### sO2PLS
+
+```{r show-latent-dimensions-so2pls}
+get_latent_dimensions(so2pls_output)
+```
+
+#### MOFA
+
+```{r show-latent-dimensions-mofa}
+get_latent_dimensions(mofa_output)
+```
+
+#### DIABLO
+
+```{r show-latent-dimensions-diablo}
+get_latent_dimensions(diablo_output)
+```
+:::
+
+::: {.callout-note}
+## Other methods covered by `get_output`
+
+Note that PCA and sPLS-DA (the method used for supervised features preselection in @sec-prefiltering-supervised) are also dimension reduction methods. Therefore, the `get_output` function also converts `pcaRes` objects (from `run_pca()` or `pcaMethods::pca()`) and `mixo_splsda` objects (from `run_splsda()` or `mixOmics::splsda()`).
+:::
+
+### Averaging latent dimensions over datasets
+
+While MOFA computes one score per sample for each latent dimension created, sPLS, DIABLO and sO2PLS all compute one score per dataset for each sample and latent dimension. For each latent dimension, the samples score obtained for the different datasets are then compared, to assess the agreement or covariation between datasets. Ideally, these scores should be highly correlated across datasets, since the methods aim at maximising the covariation between datasets, but it is not always the case. However, when they are highly correlated, it becomes redundant to interpret the latent dimensions for each dataset.
+
+Instead, the `mixOmics` authors proposed a solution for `DIABLO`, which is to construct a weighted average space: for each latent component, the samples score are averaged over the different datasets. The weight is given per dataset and determined by how well the corresponding dataset discriminates between the sample groups of interest. This way, rather than looking at samples score for each dataset for any given latent component, we can look at an average of them.
+
+The `get_output()` function uses this idea to construct, for the output of sPLS, sO2PLS and DIABLO, a set of average samples score for each latent dimension, rather than returning a set of samples score per dataset. For DIABLO, the average is weighted as explained above, while for sPLS and sO2PLS each dataset is given equal weight in the average. This calculation can be disabled in the `get_output()` function to extract the dataset-specific samples score, by setting the `use_average_dimensions` parameter to `FALSE`. Note that this only affects the `samples_score` tibble in terms of dimensions, but the name of the latent dimensions will change to reflect the dataset to which they refer.
+
+::: {.panel-tabset group="method"}
+#### sPLS
+
+```{targets get-output-spls-no-average}
+tar_target(
+  spls_output_no_average,
+  get_output(spls_final_run, use_average_dimensions = FALSE)
+)
+```
+
+```{r show-output-spls-no-average}
+tar_load(spls_output_no_average)
+
+get_latent_dimensions(spls_output_no_average)
+
+nrow(spls_output$samples_score)
+nrow(spls_output_no_average$samples_score)
+```
+
+#### sO2PLS
+
+```{targets get-output-so2pls-no-average}
+tar_target(
+  so2pls_output_no_average,
+  get_output(so2pls_final_run, use_average_dimensions = FALSE)
+)
+```
+
+```{r show-output-so2pls-no-average}
+tar_load(so2pls_output_no_average)
+
+get_latent_dimensions(so2pls_output_no_average)
+
+nrow(so2pls_output$samples_score)
+nrow(so2pls_output_no_average$samples_score)
+```
+
+#### DIABLO
+
+```{targets get-output-diablo-no-average}
+tar_target(
+  diablo_output_no_average,
+  get_output(diablo_final_run, use_average_dimensions = FALSE)
+)
+```
+
+```{r show-output-diablo-no-average}
+tar_load(diablo_output_no_average)
+  
+get_latent_dimensions(diablo_output_no_average)
+
+nrow(diablo_output$samples_score)
+nrow(diablo_output_no_average$samples_score)
+```
+:::
+
+## Interpretation
+
+Interpreting the results of a dimension reduction method involves:
+
+* Understanding the source of the variation captured by each latent dimension: is a given latent dimension representing an important source of biological variation, such as effect of a treatment, or age of the samples? Or does it show a source of technical variation, for example highlighting a group of outlier samples with different omics profiles from the rest of the observations? Answering these questions allows us to identify which latent dimensions capture the biological phenomenon investigated, or whether there are some sources of noise that should be accounted for in follow-up experiments.
+
+* Investigating which omics features are driving the latent dimensions: once we have identified some latent dimensions of interest, we can look at the features that contribute the most to understand the molecular mechanisms or pathways involved. This is typically done after looking into the phenomenon captured by the latent dimensions, but can also help to identify it.
+
+
+
 <!-- ## DIABLO -->
 
 <!-- Some of the plots that will be used to interpret the results require us to transform the DIABLO result object into a standard "dimensions reduction output" object, through the `get_output()` function. The returned object contains the results of the DIABLO run stored in a standard way for the `moiraine` package, i.e. as a list with the samples scores for the different latent components stored in the `samples_score` element, the features loading for the latent components stored in the `features_weight` element, and the proportion of variance explained by each latent component in the corresponding dataset in the `variance_explained` element. -->