Merge pull request #11 from alegiac95/correlation
Correlation
alegiac95 authored Nov 24, 2021
2 parents b2a12b7 + 4056a9f commit 54fc100
Showing 14 changed files with 471 additions and 178 deletions.
14 changes: 11 additions & 3 deletions docs/chapters/04_usage.rst
@@ -52,7 +52,15 @@ The library can be imported by running:
import imaging_transcriptomics as imt
Once imported the package will contain the core ``ImagingTranscriptomics`` class, along with other useful functions.
Once imported the package will contain the core ``ImagingTranscriptomics``
class, along with other useful functions. To see all the available functions
imported in the library run:

.. code:: python

dir(imt)

which will display all the functions and modules imported in the library.
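
As a reference, the public names made available by this version's ``imaging_transcriptomics/__init__.py`` (shown further down in this diff) can also be imported explicitly; a minimal sketch:

.. code:: python

    from imaging_transcriptomics import (
        ImagingTranscriptomics,
        GeneResults,
        bootstrap_pls,
        bootstrap_genes,
        bootstrap_correlation,
        load_gene_expression,
        load_gene_labels,
        get_components,
        read_scan,
        extract_average,
    )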

ImagingTranscriptomics Class
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -65,8 +73,8 @@ To start using the class, the first step is to initialise it. A way to do this is
my_analysis = imt.ImagingTranscriptomics(my_data, n_components=1)
``my_data`` is an array that contains the data you want to analyse (e.g., the average values from the scan). This vector must have the following characteristics:

* it has to be a ``numpy.array`` with length 41, which corresponds to the number of regions in the left hemisphere of the Desikan-Killiany atlas.
* it has to contain the values you want to analyse, but not their ``zscore``, as this is computed automatically during the initialisation.

When initialising the class one can use either the ``n_components`` or the ``var`` attribute, which represent, respectively, the number of components to use for the regression and the variance to extract.
After setting one, and only one, of these values, one can estimate the other by running the ``my_analysis.pls_all_components()`` method, which computes the PLS regression with 15 components and estimates the variance explained by the user-defined number of components.
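
A minimal sketch of the two initialisation options described above (the value passed to ``var`` is purely illustrative; check the class documentation for the exact scale it expects):

.. code:: python

    import numpy as np
    import imaging_transcriptomics as imt

    my_data = np.random.rand(41)  # one value per left-hemisphere DK region

    # Option 1: fix the number of PLS components to use
    my_analysis = imt.ImagingTranscriptomics(my_data, n_components=1)

    # Option 2: fix the variance to extract instead
    # my_analysis = imt.ImagingTranscriptomics(my_data, var=60)

    # Estimate the non-specified quantity from a 15-component PLS regression
    my_analysis.pls_all_components()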

2 changes: 1 addition & 1 deletion docs/chapters/08_faq.rst
@@ -14,4 +14,4 @@ FAQ

#. **Why did you use the pypls library instead of some more maintained PLS library, e.g., sklearn?**
We used pypls instead of sklearn because the latter, and most of the other available libraries, are implemented using the NIPALS algorithm, while pypls uses SIMPLS.
One of the main advantages of the SIMPLS algorithm in respoect to the NIPALS is that is is less time consuming.
One of the main advantages of the SIMPLS algorithm with respect to NIPALS is that it is less time consuming.
17 changes: 9 additions & 8 deletions imaging_transcriptomics/__init__.py
@@ -1,4 +1,4 @@
__version__ = "1.0.0"
__version__ = "1.0.1"

from . import errors
from . import bootstrap
@@ -8,11 +8,12 @@
from . import reporting

from .transcriptomics import ImagingTranscriptomics
from .bootstrap import (bootstrap_pls,
bootstrap_genes)
from .inputs import (load_gene_expression,
load_gene_labels,
get_components,
read_scan,
extract_average)
from .bootstrap import bootstrap_pls, bootstrap_genes, bootstrap_correlation
from .inputs import (
load_gene_expression,
load_gene_labels,
get_components,
read_scan,
extract_average,
)
from .genes import GeneResults
128 changes: 94 additions & 34 deletions imaging_transcriptomics/bootstrap.py
@@ -2,7 +2,11 @@
import logging.config
import yaml
from pathlib import Path
from itertools import product
from multiprocessing import Pool
from functools import partial

from scipy.stats import spearmanr
import numpy
import numpy as np
from tqdm import tqdm
@@ -25,9 +29,7 @@ def correlate(corr1, corr2):
:param corr1: first element to correlate.
:param corr2: second element to correlate.
"""
return np.corrcoef(np.hstack(
(corr1, corr2)
), rowvar=False)[0, 1:]
return np.corrcoef(np.hstack((corr1, corr2)), rowvar=False)[0, 1:]
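# Usage sketch (illustrative shapes only): with corr1 of shape (41, 1) and
# corr2 of shape (41, k), `correlate` returns a length-k vector holding the
# Pearson correlation of corr1 against each column of corr2, e.g.
#   r = correlate(np.random.rand(41, 1), np.random.rand(41, 3))  # r.shape == (3,)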


def bootstrap_pls(x, y, y_perm, dim, iterations=1_000):
@@ -50,26 +52,31 @@ def bootstrap_pls(x, y, y_perm, dim, iterations=1_000):
R_boot = np.zeros(dim)
p_boot = np.zeros(dim)
for component in range(1, dim + 1):
pls_results = pls_regression(x, y,
n_components=component,
n_perm=0,
n_boot=0)
pls_results = pls_regression(x, y, n_components=component, n_perm=0, n_boot=0)
exp_var = pls_results.get("varexp")
temp = 100 * exp_var.cumsum(axis=0)
R_squared = temp[component - 1]
R_sq = np.zeros(iterations)
for i in tqdm(range(1000), desc=f"Bootstrapping on PLS component {component}", unit=" iterations"):
for i in tqdm(
range(1000),
desc=f"Bootstrapping on PLS component {component}",
unit=" iterations",
):
y_data = y_perm[:, i].reshape(41, 1)
_result = pls_regression(x, y_data,
n_components=component,
n_perm=0,
n_boot=0)
_result = pls_regression(
x, y_data, n_components=component, n_perm=0, n_boot=0
)
_exp_var = 100 * np.cumsum(_result.get("varexp"))
R_sq[i] = _exp_var[component - 1]
R_boot[component - 1] = R_squared
p_boot[component - 1] = float(len(R_sq[numpy.nonzero(R_sq >= R_squared)])) / iterations
logger.debug("Computed components p value(s) and coefficient(s) of determination: \n R: %s \n p: %s",
R_boot, p_boot)
p_boot[component - 1] = (
float(len(R_sq[numpy.nonzero(R_sq >= R_squared)])) / iterations
)
logger.debug(
"Computed components p value(s) and coefficient(s) of determination: \n R: %s \n p: %s",
R_boot,
p_boot,
)
return R_boot, p_boot
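
A usage sketch for ``bootstrap_pls``; the array shapes are inferred from the function body above (41 regions, at least 1,000 permuted vectors) and the random inputs are illustrative only:

    import numpy as np
    import imaging_transcriptomics as imt

    x = np.random.rand(41, 15_633)      # gene expression (regions x genes)
    y = np.random.rand(41, 1)           # imaging vector (average values per region)
    y_perm = np.random.rand(41, 1_000)  # permuted imaging vectors, one per column

    R_boot, p_boot = imt.bootstrap_pls(x, y, y_perm, dim=3)  # R^2 and p for 1-3 components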


@@ -88,41 +95,94 @@ def bootstrap_genes(x_data, y, n_components, y_norm, genes, n_iterations=1000):
results as well.
"""
n_genes = 15_633
gene_index = np.array(list(range(1, n_genes+1)))
gene_index = np.array(list(range(1, n_genes + 1)))
results = pls_regression(x_data, y, n_components=n_components, n_boot=0, n_perm=0)
r1 = correlate(results.get("x_scores").reshape(41, n_components), y_norm.reshape(41, 1))
r1 = correlate(
results.get("x_scores").reshape(41, n_components), y_norm.reshape(41, 1)
)
logger.debug("Correlation between original data and regression scores: %s", r1)
weights = results.get("x_weights")
gene_results = GeneResults(n_components, dim1=weights.shape[0], dim2=weights.shape[1])
gene_results = GeneResults(
n_components, dim1=weights.shape[0], dim2=weights.shape[1]
)
scores = results.get("x_scores")
for i in range(r1.size):
if r1[i] < 0:
weights[:, i] *= -1
scores[:, i] *= -1
for idx in range(1, n_components+1):
x = np.argsort(weights[:, idx-1], kind='mergesort')[::-1]
gene_results.original_results.set_result_values(idx,
np.sort(weights[:, idx-1], kind='mergesort')[::-1],
x,
genes[x],
gene_index[x])
for idx in range(1, n_components + 1):
x = np.argsort(weights[:, idx - 1], kind="mergesort")[::-1]
gene_results.original_results.set_result_values(
idx,
np.sort(weights[:, idx - 1], kind="mergesort")[::-1],
x,
genes[x],
gene_index[x],
)

# Main genes bootstrap
for iteration in tqdm(range(n_iterations), desc="Bootstrapping gene list", unit=" iterations"):
for iteration in tqdm(
range(n_iterations), desc="Bootstrapping gene list", unit=" iterations"
):
my_resample = np.random.choice(41, size=41)
x_perm = x_data[my_resample, :]
y_perm = y[my_resample].reshape(41, 1)
results = pls_regression(x_perm, y_perm, n_components=n_components, n_perm=0, n_boot=0)
results = pls_regression(
x_perm, y_perm, n_components=n_components, n_perm=0, n_boot=0
)
_weights = results.get("x_weights")
for component in range(1, n_components+1):
__temp = _weights[:, component-1]
__new_weights = __temp[gene_results.original_results.index[component-1]]
__t_genes = np.array(gene_results.original_results.pls_weights[component-1])
for component in range(1, n_components + 1):
__temp = _weights[:, component - 1]
__new_weights = __temp[gene_results.original_results.index[component - 1]]
__t_genes = np.array(
gene_results.original_results.pls_weights[component - 1]
)
__correlation = correlate(
__t_genes.reshape(15633, 1),
__new_weights.reshape(15633, 1)
__t_genes.reshape(15633, 1), __new_weights.reshape(15633, 1)
)
if __correlation < 0:
__new_weights *= -1
gene_results.boot_results.pls_weights_boot[component-1][:, component-1, iteration] = __new_weights
gene_results.boot_results.pls_weights_boot[component - 1][
:, component - 1, iteration
] = __new_weights
return gene_results


def spearman_op(idx, permuted, y_data):
return spearmanr(permuted[:, idx[0]], y_data[:, idx[1]])[0]


def bootstrap_correlation(x_data, y_data, permuted, labels, n_iterations=1000):
"""
Bootstrap the results using Spearman correlation.
:param x_data: imaging data
:param y_data: gene expression data
:param permuted: permuted matrix of imaging data
:param labels: labels of the genes (original order)
:return gene_results: GeneResults class with correlation results.
"""
gene_results = GeneResults(n_comp=1, dim1=1, dim2=y_data.shape[1])
n_genes = 15633
pool = Pool()
# original correlation
corr_ = np.zeros(y_data.shape[1])
for gene in range(n_genes):
corr_[gene], _ = spearmanr(x_data, y_data[:, gene])
# order results and set indexes in gene results
gene_results.original_results.pls_weights = np.sort(corr_, kind="mergesort")[::-1]
__idx = np.argsort(corr_, kind="mergesort")[::-1]
gene_results.original_results.pls_gene = labels[__idx]
gene_results.original_results.gene_id = __idx
# bootstrap
__res = np.zeros((15633, 1000))
_iter = product(range(n_iterations), range(n_genes))
for ind, result in tqdm(enumerate(pool.imap(partial(
spearman_op, permuted=permuted, y_data=y_data),
_iter, chunksize=25_000
)),
desc="Bootstrapping correlation",
unit="iterations"):
__res.flat[ind] = result
gene_results.boot_results.pls_weights_boot = __res
return gene_results
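
A usage sketch for the new ``bootstrap_correlation``; the shapes are inferred from the function body (41 regions, 15,633 genes, 1,000 permutations), the random inputs are illustrative, and a real run is computationally heavy since 15,633 x 1,000 Spearman correlations are evaluated through a multiprocessing pool:

    import numpy as np
    import imaging_transcriptomics as imt

    x = np.random.rand(41)                # average imaging value per region
    y = np.random.rand(41, 15_633)        # gene expression (regions x genes)
    permuted = np.random.rand(41, 1_000)  # permuted imaging data, one column per permutation
    labels = np.array([f"gene_{i}" for i in range(15_633)])  # gene labels, original order

    gene_results = imt.bootstrap_correlation(x, y, permuted, labels)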
21 changes: 15 additions & 6 deletions imaging_transcriptomics/errors.py
@@ -10,8 +10,12 @@ class InvalidFormatError(Exception):
message -- optional user overridden error message to display.
"""

def __init__(self, error_file, message="The provided file has an invalid format. Please use files in the .nii, "
".nii.gz format."):
def __init__(
self,
error_file,
message="The provided file has an invalid format. Please use files in the .nii, "
".nii.gz format.",
):
self.error_file = error_file
self.message = message
super().__init__(self.message)
@@ -29,7 +33,9 @@ class InvalidSizeError(Exception):
* message -- optional user defined error message
"""

def __init__(self, shape=(182, 218, 182), message="The provided file has a wrong shape."):
def __init__(
self, shape=(182, 218, 182), message="The provided file has a wrong shape."
):
self.message = message
self.shape = shape
super().__init__(self.message)
@@ -40,16 +46,17 @@ def __str__(self):

# Checks Decorators
class CheckPath:
""" Decorator to check if a path exists.
"""Decorator to check if a path exists.
In order to run the decorated function, the path provided to it has to exist, otherwise an error is raised.
:raises FileNotFoundError:
"""

def __init__(self, function):
self.function = function

def __call__(self, path, *args, **kwargs):
if not Path(path).absolute().exists():
raise FileNotFoundError
@@ -81,9 +88,10 @@ class CheckShape:
:raises InvalidSizeError:
"""

def __init__(self, function):
self.function = function

def __call__(self, image, *args, **kwargs):
if not image.shape == (182, 218, 182):
raise InvalidSizeError(image.shape)
@@ -97,6 +105,7 @@ class CheckVariance:
:raises ValueError:
"""

def __init__(self, function):
self.function = function

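
A sketch of how the check decorators above appear to be used; the decorated function is hypothetical and it is assumed (the relevant lines are collapsed in this diff) that ``__call__`` ends by invoking the wrapped function:

    from imaging_transcriptomics.errors import CheckPath

    @CheckPath
    def load_scan(path):   # hypothetical helper, for illustration only
        return path        # e.g. read and return the NIfTI image at `path`

    load_scan("scan.nii")  # raises FileNotFoundError if the path does not exist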