Merge pull request #11 from alegiac95/correlation
Correlation
alegiac95 authored Nov 24, 2021
2 parents b2a12b7 + 4056a9f commit 54fc100
Showing 14 changed files with 471 additions and 178 deletions.
14 changes: 11 additions & 3 deletions docs/chapters/04_usage.rst
@@ -52,7 +52,15 @@ The library can be imported by running:
import imaging_transcriptomics as imt
Once imported the package will contain the core ``ImagingTranscriptomics`` class, along with other useful functions.
Once imported the package will contain the core ``ImagingTranscriptomics``
class, along with other useful functions. To see all the available functions
imported in the library run:

.. code:: python

dir(imt)

which will display all the functions and modules imported in the library.
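
As a reference, the public names made available by this version's ``imaging_transcriptomics/__init__.py`` (shown further down in this diff) can also be imported explicitly; a minimal sketch:

.. code:: python

    from imaging_transcriptomics import (
        ImagingTranscriptomics,
        GeneResults,
        bootstrap_pls,
        bootstrap_genes,
        bootstrap_correlation,
        load_gene_expression,
        load_gene_labels,
        get_components,
        read_scan,
        extract_average,
    )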

ImagingTranscriptomics Class
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -65,8 +73,8 @@ To start using the class, the first step is to initialise it. A way to do this is
my_analysis = imt.ImagingTranscriptomics(my_data, n_components=1)
``my_data`` is an array that contains the data you want to analyse (e.g., the average values from the scan). This vector must have the following characteristics:

* it has to be a ``numpy.array`` with length 41, which corresponds to the number of regions in the left hemisphere of the Desikan-Killiany atlas.
* it has to contain the values you want to analyse, but not their ``zscore``, as this is computed automatically during the initialisation.

When initialising the class one can use either the ``n_components`` or the ``var`` attribute, which represent, respectively, the number of components to use for the regression and the variance to extract.
After setting one, and only one, of these values, one can estimate the other by running the ``my_analysis.pls_all_components()`` method, which computes the PLS regression with 15 components and estimates the variance explained by the user-defined number of components.
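
A minimal sketch of the two initialisation options described above (the value passed to ``var`` is purely illustrative; check the class documentation for the exact scale it expects):

.. code:: python

    import numpy as np
    import imaging_transcriptomics as imt

    my_data = np.random.rand(41)  # one value per left-hemisphere DK region

    # Option 1: fix the number of PLS components to use
    my_analysis = imt.ImagingTranscriptomics(my_data, n_components=1)

    # Option 2: fix the variance to extract instead
    # my_analysis = imt.ImagingTranscriptomics(my_data, var=60)

    # Estimate the non-specified quantity from a 15-component PLS regression
    my_analysis.pls_all_components()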

2 changes: 1 addition & 1 deletion docs/chapters/08_faq.rst
@@ -14,4 +14,4 @@ FAQ

#. **Why did you use the pypls library instead of some more maintained PLS library, e.g., sklearn?**
We used pypls instead of sklearn because the latter, and most of the other available libraries, are implemented using the NIPALS algorithm, while pypls uses SIMPLS.
One of the main advantages of the SIMPLS algorithm in respoect to the NIPALS is that is is less time consuming.
One of the main advantages of the SIMPLS algorithm with respect to NIPALS is that it is less time consuming.
17 changes: 9 additions & 8 deletions imaging_transcriptomics/__init__.py
@@ -1,4 +1,4 @@
__version__ = "1.0.0"
__version__ = "1.0.1"

from . import errors
from . import bootstrap
@@ -8,11 +8,12 @@
from . import reporting

from .transcriptomics import ImagingTranscriptomics
from .bootstrap import (bootstrap_pls,
bootstrap_genes)
from .inputs import (load_gene_expression,
load_gene_labels,
get_components,
read_scan,
extract_average)
from .bootstrap import bootstrap_pls, bootstrap_genes, bootstrap_correlation
from .inputs import (
load_gene_expression,
load_gene_labels,
get_components,
read_scan,
extract_average,
)
from .genes import GeneResults
128 changes: 94 additions & 34 deletions imaging_transcriptomics/bootstrap.py
@@ -2,7 +2,11 @@
import logging.config
import yaml
from pathlib import Path
from itertools import product
from multiprocessing import Pool
from functools import partial

from scipy.stats import spearmanr
import numpy
import numpy as np
from tqdm import tqdm
@@ -25,9 +29,7 @@ def correlate(corr1, corr2):
:param corr1: first element to correlate.
:param corr2: second element to correlate.
"""
return np.corrcoef(np.hstack(
(corr1, corr2)
), rowvar=False)[0, 1:]
return np.corrcoef(np.hstack((corr1, corr2)), rowvar=False)[0, 1:]
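# Usage sketch (illustrative shapes only): with corr1 of shape (41, 1) and
# corr2 of shape (41, k), `correlate` returns a length-k vector holding the
# Pearson correlation of corr1 against each column of corr2, e.g.
#   r = correlate(np.random.rand(41, 1), np.random.rand(41, 3))  # r.shape == (3,)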


def bootstrap_pls(x, y, y_perm, dim, iterations=1_000):
@@ -50,26 +52,31 @@ def bootstrap_pls(x, y, y_perm, dim, iterations=1_000):
R_boot = np.zeros(dim)
p_boot = np.zeros(dim)
for component in range(1, dim + 1):
pls_results = pls_regression(x, y,
n_components=component,
n_perm=0,
n_boot=0)
pls_results = pls_regression(x, y, n_components=component, n_perm=0, n_boot=0)
exp_var = pls_results.get("varexp")
temp = 100 * exp_var.cumsum(axis=0)
R_squared = temp[component - 1]
R_sq = np.zeros(iterations)
for i in tqdm(range(1000), desc=f"Bootstrapping on PLS component {component}", unit=" iterations"):
for i in tqdm(
range(1000),
desc=f"Bootstrapping on PLS component {component}",
unit=" iterations",
):
y_data = y_perm[:, i].reshape(41, 1)
_result = pls_regression(x, y_data,
n_components=component,
n_perm=0,
n_boot=0)
_result = pls_regression(
x, y_data, n_components=component, n_perm=0, n_boot=0
)
_exp_var = 100 * np.cumsum(_result.get("varexp"))
R_sq[i] = _exp_var[component - 1]
R_boot[component - 1] = R_squared
p_boot[component - 1] = float(len(R_sq[numpy.nonzero(R_sq >= R_squared)])) / iterations
logger.debug("Computed components p value(s) and coefficient(s) of determination: \n R: %s \n p: %s",
R_boot, p_boot)
p_boot[component - 1] = (
float(len(R_sq[numpy.nonzero(R_sq >= R_squared)])) / iterations
)
logger.debug(
"Computed components p value(s) and coefficient(s) of determination: \n R: %s \n p: %s",
R_boot,
p_boot,
)
return R_boot, p_boot
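
A usage sketch for ``bootstrap_pls``; the array shapes are inferred from the function body above (41 regions, at least 1,000 permuted vectors) and the random inputs are illustrative only:

    import numpy as np
    import imaging_transcriptomics as imt

    x = np.random.rand(41, 15_633)      # gene expression (regions x genes)
    y = np.random.rand(41, 1)           # imaging vector (average values per region)
    y_perm = np.random.rand(41, 1_000)  # permuted imaging vectors, one per column

    R_boot, p_boot = imt.bootstrap_pls(x, y, y_perm, dim=3)  # R^2 and p for 1-3 components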


@@ -88,41 +95,94 @@ def bootstrap_genes(x_data, y, n_components, y_norm, genes, n_iterations=1000):
results as well.
"""
n_genes = 15_633
gene_index = np.array(list(range(1, n_genes+1)))
gene_index = np.array(list(range(1, n_genes + 1)))
results = pls_regression(x_data, y, n_components=n_components, n_boot=0, n_perm=0)
r1 = correlate(results.get("x_scores").reshape(41, n_components), y_norm.reshape(41, 1))
r1 = correlate(
results.get("x_scores").reshape(41, n_components), y_norm.reshape(41, 1)
)
logger.debug("Correlation between original data and regression scores: %s", r1)
weights = results.get("x_weights")
gene_results = GeneResults(n_components, dim1=weights.shape[0], dim2=weights.shape[1])
gene_results = GeneResults(
n_components, dim1=weights.shape[0], dim2=weights.shape[1]
)
scores = results.get("x_scores")
for i in range(r1.size):
if r1[i] < 0:
weights[:, i] *= -1
scores[:, i] *= -1
for idx in range(1, n_components+1):
x = np.argsort(weights[:, idx-1], kind='mergesort')[::-1]
gene_results.original_results.set_result_values(idx,
np.sort(weights[:, idx-1], kind='mergesort')[::-1],
x,
genes[x],
gene_index[x])
for idx in range(1, n_components + 1):
x = np.argsort(weights[:, idx - 1], kind="mergesort")[::-1]
gene_results.original_results.set_result_values(
idx,
np.sort(weights[:, idx - 1], kind="mergesort")[::-1],
x,
genes[x],
gene_index[x],
)

# Main genes bootstrap
for iteration in tqdm(range(n_iterations), desc="Bootstrapping gene list", unit=" iterations"):
for iteration in tqdm(
range(n_iterations), desc="Bootstrapping gene list", unit=" iterations"
):
my_resample = np.random.choice(41, size=41)
x_perm = x_data[my_resample, :]
y_perm = y[my_resample].reshape(41, 1)
results = pls_regression(x_perm, y_perm, n_components=n_components, n_perm=0, n_boot=0)
results = pls_regression(
x_perm, y_perm, n_components=n_components, n_perm=0, n_boot=0
)
_weights = results.get("x_weights")
for component in range(1, n_components+1):
__temp = _weights[:, component-1]
__new_weights = __temp[gene_results.original_results.index[component-1]]
__t_genes = np.array(gene_results.original_results.pls_weights[component-1])
for component in range(1, n_components + 1):
__temp = _weights[:, component - 1]
__new_weights = __temp[gene_results.original_results.index[component - 1]]
__t_genes = np.array(
gene_results.original_results.pls_weights[component - 1]
)
__correlation = correlate(
__t_genes.reshape(15633, 1),
__new_weights.reshape(15633, 1)
__t_genes.reshape(15633, 1), __new_weights.reshape(15633, 1)
)
if __correlation < 0:
__new_weights *= -1
gene_results.boot_results.pls_weights_boot[component-1][:, component-1, iteration] = __new_weights
gene_results.boot_results.pls_weights_boot[component - 1][
:, component - 1, iteration
] = __new_weights
return gene_results


def spearman_op(idx, permuted, y_data):
return spearmanr(permuted[:, idx[0]], y_data[:, idx[1]])[0]


def bootstrap_correlation(x_data, y_data, permuted, labels, n_iterations=1000):
"""
Bootstrap the results using Spearman correlation.
:param x_data: imaging data
:param y_data: gene expression data
:param permuted: permuted matrix of imaging data
:param labels: labels of the genes (original order)
:return gene_results: GeneResults class with correlation results.
"""
gene_results = GeneResults(n_comp=1, dim1=1, dim2=y_data.shape[1])
n_genes = 15633
pool = Pool()
# original correlation
corr_ = np.zeros(y_data.shape[1])
for gene in range(n_genes):
corr_[gene], _ = spearmanr(x_data, y_data[:, gene])
# order results and set indexes in gene results
gene_results.original_results.pls_weights = np.sort(corr_, kind="mergesort")[::-1]
__idx = np.argsort(corr_, kind="mergesort")[::-1]
gene_results.original_results.pls_gene = labels[__idx]
gene_results.original_results.gene_id = __idx
# bootstrap
__res = np.zeros((15633, 1000))
_iter = product(range(n_iterations), range(n_genes))
for ind, result in tqdm(enumerate(pool.imap(partial(
spearman_op, permuted=permuted, y_data=y_data),
_iter, chunksize=25_000
)),
desc="Bootstrapping correlation",
unit="iterations"):
__res.flat[ind] = result
gene_results.boot_results.pls_weights_boot = __res
return gene_results
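
A usage sketch for the new ``bootstrap_correlation``; the shapes are inferred from the function body (41 regions, 15,633 genes, 1,000 permutations), the random inputs are illustrative, and a real run is computationally heavy since 15,633 x 1,000 Spearman correlations are evaluated through a multiprocessing pool:

    import numpy as np
    import imaging_transcriptomics as imt

    x = np.random.rand(41)                # average imaging value per region
    y = np.random.rand(41, 15_633)        # gene expression (regions x genes)
    permuted = np.random.rand(41, 1_000)  # permuted imaging data, one column per permutation
    labels = np.array([f"gene_{i}" for i in range(15_633)])  # gene labels, original order

    gene_results = imt.bootstrap_correlation(x, y, permuted, labels)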
21 changes: 15 additions & 6 deletions imaging_transcriptomics/errors.py
@@ -10,8 +10,12 @@ class InvalidFormatError(Exception):
message -- optional user overridden error message to display.
"""

def __init__(self, error_file, message="The provided file has an invalid format. Please use files in the .nii, "
".nii.gz format."):
def __init__(
self,
error_file,
message="The provided file has an invalid format. Please use files in the .nii, "
".nii.gz format.",
):
self.error_file = error_file
self.message = message
super().__init__(self.message)
@@ -29,7 +33,9 @@ class InvalidSizeError(Exception):
* message -- optional user defined error message
"""

def __init__(self, shape=(182, 218, 182), message="The provided file has a wrong shape."):
def __init__(
self, shape=(182, 218, 182), message="The provided file has a wrong shape."
):
self.message = message
self.shape = shape
super().__init__(self.message)
@@ -40,16 +46,17 @@ def __str__(self):

# Checks Decorators
class CheckPath:
""" Decorator to check if a path exists.
"""Decorator to check if a path exists.
In order to run the decorated function, the path provided to it has to exist, otherwise an error is raised.
:raises FileNotFoundError:
"""

def __init__(self, function):
self.function = function

def __call__(self, path, *args, **kwargs):
if not Path(path).absolute().exists():
raise FileNotFoundError
@@ -81,9 +88,10 @@ class CheckShape:
:raises InvalidSizeError:
"""

def __init__(self, function):
self.function = function

def __call__(self, image, *args, **kwargs):
if not image.shape == (182, 218, 182):
raise InvalidSizeError(image.shape)
@@ -97,6 +105,7 @@ class CheckVariance:
:raises ValueError:
"""

def __init__(self, function):
self.function = function

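
A sketch of how the check decorators above appear to be used; the decorated function is hypothetical and it is assumed (the relevant lines are collapsed in this diff) that ``__call__`` ends by invoking the wrapped function:

    from imaging_transcriptomics.errors import CheckPath

    @CheckPath
    def load_scan(path):   # hypothetical helper, for illustration only
        return path        # e.g. read and return the NIfTI image at `path`

    load_scan("scan.nii")  # raises FileNotFoundError if the path does not exist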