From faef6fb177615825b7a6cd3e405f3a020b3cd7e1 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Fri, 22 Oct 2021 11:00:20 +0200 Subject: [PATCH 01/15] first commit to new branch --- imaging_transcriptomics/__init__.py | 15 ++-- imaging_transcriptomics/bootstrap.py | 84 +++++++++++--------- imaging_transcriptomics/errors.py | 21 +++-- imaging_transcriptomics/genes.py | 26 ++++--- imaging_transcriptomics/inputs.py | 28 ++++--- imaging_transcriptomics/reporting.py | 33 +++++--- imaging_transcriptomics/transcriptomics.py | 89 ++++++++++++++-------- script/imagingtranscriptomics | 86 ++++++++++++--------- tests/errors_test.py | 20 ++++- tests/inputs_test.py | 8 +- tests/transcriptomics_test.py | 56 +++++++++++--- 11 files changed, 305 insertions(+), 161 deletions(-) diff --git a/imaging_transcriptomics/__init__.py b/imaging_transcriptomics/__init__.py index e290e2f..d8527c4 100644 --- a/imaging_transcriptomics/__init__.py +++ b/imaging_transcriptomics/__init__.py @@ -8,11 +8,12 @@ from . import reporting from .transcriptomics import ImagingTranscriptomics -from .bootstrap import (bootstrap_pls, - bootstrap_genes) -from .inputs import (load_gene_expression, - load_gene_labels, - get_components, - read_scan, - extract_average) +from .bootstrap import bootstrap_pls, bootstrap_genes +from .inputs import ( + load_gene_expression, + load_gene_labels, + get_components, + read_scan, + extract_average, +) from .genes import GeneResults diff --git a/imaging_transcriptomics/bootstrap.py b/imaging_transcriptomics/bootstrap.py index 003adbb..2197c84 100644 --- a/imaging_transcriptomics/bootstrap.py +++ b/imaging_transcriptomics/bootstrap.py @@ -25,9 +25,7 @@ def correlate(corr1, corr2): :param corr1: first element to correlate. :param corr2: second element to correlate. """ - return np.corrcoef(np.hstack( - (corr1, corr2) - ), rowvar=False)[0, 1:] + return np.corrcoef(np.hstack((corr1, corr2)), rowvar=False)[0, 1:] def bootstrap_pls(x, y, y_perm, dim, iterations=1_000): @@ -50,26 +48,31 @@ def bootstrap_pls(x, y, y_perm, dim, iterations=1_000): R_boot = np.zeros(dim) p_boot = np.zeros(dim) for component in range(1, dim + 1): - pls_results = pls_regression(x, y, - n_components=component, - n_perm=0, - n_boot=0) + pls_results = pls_regression(x, y, n_components=component, n_perm=0, n_boot=0) exp_var = pls_results.get("varexp") temp = 100 * exp_var.cumsum(axis=0) R_squared = temp[component - 1] R_sq = np.zeros(iterations) - for i in tqdm(range(1000), desc=f"Bootstrapping on PLS component {component}", unit=" iterations"): + for i in tqdm( + range(1000), + desc=f"Bootstrapping on PLS component {component}", + unit=" iterations", + ): y_data = y_perm[:, i].reshape(41, 1) - _result = pls_regression(x, y_data, - n_components=component, - n_perm=0, - n_boot=0) + _result = pls_regression( + x, y_data, n_components=component, n_perm=0, n_boot=0 + ) _exp_var = 100 * np.cumsum(_result.get("varexp")) R_sq[i] = _exp_var[component - 1] R_boot[component - 1] = R_squared - p_boot[component - 1] = float(len(R_sq[numpy.nonzero(R_sq >= R_squared)])) / iterations - logger.debug("Computed components p value(s) and coefficient(s) of determination: \n R: %s \n p: %s", - R_boot, p_boot) + p_boot[component - 1] = ( + float(len(R_sq[numpy.nonzero(R_sq >= R_squared)])) / iterations + ) + logger.debug( + "Computed components p value(s) and coefficient(s) of determination: \n R: %s \n p: %s", + R_boot, + p_boot, + ) return R_boot, p_boot @@ -88,41 +91,54 @@ def bootstrap_genes(x_data, y, n_components, y_norm, genes, n_iterations=1000): results as well. """ n_genes = 15_633 - gene_index = np.array(list(range(1, n_genes+1))) + gene_index = np.array(list(range(1, n_genes + 1))) results = pls_regression(x_data, y, n_components=n_components, n_boot=0, n_perm=0) - r1 = correlate(results.get("x_scores").reshape(41, n_components), y_norm.reshape(41, 1)) + r1 = correlate( + results.get("x_scores").reshape(41, n_components), y_norm.reshape(41, 1) + ) logger.debug("Correlation between original data and regression scores: %s", r1) weights = results.get("x_weights") - gene_results = GeneResults(n_components, dim1=weights.shape[0], dim2=weights.shape[1]) + gene_results = GeneResults( + n_components, dim1=weights.shape[0], dim2=weights.shape[1] + ) scores = results.get("x_scores") for i in range(r1.size): if r1[i] < 0: weights[:, i] *= -1 scores[:, i] *= -1 - for idx in range(1, n_components+1): - x = np.argsort(weights[:, idx-1], kind='mergesort')[::-1] - gene_results.original_results.set_result_values(idx, - np.sort(weights[:, idx-1], kind='mergesort')[::-1], - x, - genes[x], - gene_index[x]) + for idx in range(1, n_components + 1): + x = np.argsort(weights[:, idx - 1], kind="mergesort")[::-1] + gene_results.original_results.set_result_values( + idx, + np.sort(weights[:, idx - 1], kind="mergesort")[::-1], + x, + genes[x], + gene_index[x], + ) # Main genes bootstrap - for iteration in tqdm(range(n_iterations), desc="Bootstrapping gene list", unit=" iterations"): + for iteration in tqdm( + range(n_iterations), desc="Bootstrapping gene list", unit=" iterations" + ): my_resample = np.random.choice(41, size=41) x_perm = x_data[my_resample, :] y_perm = y[my_resample].reshape(41, 1) - results = pls_regression(x_perm, y_perm, n_components=n_components, n_perm=0, n_boot=0) + results = pls_regression( + x_perm, y_perm, n_components=n_components, n_perm=0, n_boot=0 + ) _weights = results.get("x_weights") - for component in range(1, n_components+1): - __temp = _weights[:, component-1] - __new_weights = __temp[gene_results.original_results.index[component-1]] - __t_genes = np.array(gene_results.original_results.pls_weights[component-1]) + for component in range(1, n_components + 1): + __temp = _weights[:, component - 1] + __new_weights = __temp[gene_results.original_results.index[component - 1]] + __t_genes = np.array( + gene_results.original_results.pls_weights[component - 1] + ) __correlation = correlate( - __t_genes.reshape(15633, 1), - __new_weights.reshape(15633, 1) + __t_genes.reshape(15633, 1), __new_weights.reshape(15633, 1) ) if __correlation < 0: __new_weights *= -1 - gene_results.boot_results.pls_weights_boot[component-1][:, component-1, iteration] = __new_weights + gene_results.boot_results.pls_weights_boot[component - 1][ + :, component - 1, iteration + ] = __new_weights return gene_results diff --git a/imaging_transcriptomics/errors.py b/imaging_transcriptomics/errors.py index 2e5df27..932f7ea 100644 --- a/imaging_transcriptomics/errors.py +++ b/imaging_transcriptomics/errors.py @@ -10,8 +10,12 @@ class InvalidFormatError(Exception): message -- optional user overridden error message to display. """ - def __init__(self, error_file, message="The provided file has an invalid format. Please use files in the .nii, " - ".nii.gz format."): + def __init__( + self, + error_file, + message="The provided file has an invalid format. Please use files in the .nii, " + ".nii.gz format.", + ): self.error_file = error_file self.message = message super().__init__(self.message) @@ -29,7 +33,9 @@ class InvalidSizeError(Exception): * message -- optional user defined error message """ - def __init__(self, shape=(182, 218, 182), message="The provided file has a wrong shape."): + def __init__( + self, shape=(182, 218, 182), message="The provided file has a wrong shape." + ): self.message = message self.shape = shape super().__init__(self.message) @@ -40,16 +46,17 @@ def __str__(self): # Checks Decorators class CheckPath: - """ Decorator to check if a path exists. + """Decorator to check if a path exists. In order to run the function decorated the path provided to the function has to exists, otehrwise an error is raised. :raises FileNotFoundError: """ + def __init__(self, function): self.function = function - + def __call__(self, path, *args, **kwargs): if not Path(path).absolute().exists(): raise FileNotFoundError @@ -81,9 +88,10 @@ class CheckShape: :raises InvalidSizeError: """ + def __init__(self, function): self.function = function - + def __call__(self, image, *args, **kwargs): if not image.shape == (182, 218, 182): raise InvalidSizeError(image.shape) @@ -97,6 +105,7 @@ class CheckVariance: :raises ValueError: """ + def __init__(self, function): self.function = function diff --git a/imaging_transcriptomics/genes.py b/imaging_transcriptomics/genes.py index 8baa623..b25db57 100644 --- a/imaging_transcriptomics/genes.py +++ b/imaging_transcriptomics/genes.py @@ -19,6 +19,7 @@ class OriginalResults(dict): """Class to hold the result from gene analysis (non bootstrapped).""" + def __init__(self, n_comp): super().__init__() self.pls_weights = [None] * n_comp @@ -30,26 +31,27 @@ def __init__(self, n_comp): def set_result_values(self, idx, pls_weights, x, pls_genes, gene_id): """Set the values for all class attributes. - + :param idx: index of the component to set the values in the array (must be in the range 0:n_components-1). :param pls_weights: weights to pass to the array at the index idx. :param pls_genes: genes to pass to the array at the index idx :param gene_id: id of the genes to pass to the array at the index idx :param x: index of the pls genes in the original gene list. - :return: + :return: """ self.pls_weights[idx - 1] = np.array(pls_weights) - logger.debug("Weights at index %s set as %s", idx-1, pls_weights) + logger.debug("Weights at index %s set as %s", idx - 1, pls_weights) self.pls_gene[idx - 1] = np.array(pls_genes) - logger.debug("Genes at index %s set as %s", idx-1, pls_genes) + logger.debug("Genes at index %s set as %s", idx - 1, pls_genes) self.gene_id[idx - 1] = gene_id - logger.debug("Genes ids at index %s set as %s", idx-1, gene_id) + logger.debug("Genes ids at index %s set as %s", idx - 1, gene_id) self.index[idx - 1] = np.array(x) self.pls_weights_z[idx - 1] = np.array(zscore(pls_weights, axis=0, ddof=1)) class BootResults(dict): """Class to hold the results from the bootstrapping gene analysis.""" + def __init__(self, n_comp, dim1, dim2, n_perm=1000): super().__init__() self.pls_weights_boot = [np.zeros((dim1, dim2, n_perm))] * n_comp @@ -71,20 +73,24 @@ def compute_values(self, n_comp, original_weights, original_ids): logger.info("Computing bootstrap gene results.") for component in range(1, n_comp + 1): logger.debug("Computing results for component %s", component) - self.std[component - 1] = self.pls_weights_boot[component - 1][:, component - 1, :].std(ddof=1, axis=1) + self.std[component - 1] = self.pls_weights_boot[component - 1][ + :, component - 1, : + ].std(ddof=1, axis=1) __temp = original_weights[component - 1] / self.std[component - 1] - self.z_scores[component - 1] = np.sort(__temp, kind='mergesort')[::-1] - __idx = np.argsort(__temp, kind='mergesort')[::-1] + self.z_scores[component - 1] = np.sort(__temp, kind="mergesort")[::-1] + __idx = np.argsort(__temp, kind="mergesort")[::-1] self.pls_genes[component - 1] = original_ids[component - 1][__idx] __p = norm.sf(abs(self.z_scores[component - 1])) self.pval[component - 1] = __p - _, __p_corr, _, _ = multipletests(__p[::-1].reshape(1, 15633), - method="fdr_bh", is_sorted=True) + _, __p_corr, _, _ = multipletests( + __p[::-1].reshape(1, 15633), method="fdr_bh", is_sorted=True + ) self.pval_corrected[component - 1] = __p_corr class GeneResults(dict): """Class to save the gene results.""" + def __init__(self, n_comp, dim1, dim2): super().__init__() self.original_results = OriginalResults(n_comp) diff --git a/imaging_transcriptomics/inputs.py b/imaging_transcriptomics/inputs.py index f81f2ca..87ff7a6 100644 --- a/imaging_transcriptomics/inputs.py +++ b/imaging_transcriptomics/inputs.py @@ -8,10 +8,7 @@ import pandas as pd from scipy.stats import zscore -from .errors import (CheckShape, - CheckPath, - CheckExtension, - CheckVariance) +from .errors import CheckShape, CheckPath, CheckExtension, CheckVariance cfg_file_path = Path(__file__).parent / "log_config.yaml" @@ -53,8 +50,11 @@ def extract_average(imaging_matrix): """ n_regions = 41 logger.debug("Extracting average from scan.") - atlas_data = nib.load(Path(__file__).resolve().parent / "data" / - "atlas-desikankilliany_1mm_MNI152.nii.gz").get_fdata() + atlas_data = nib.load( + Path(__file__).resolve().parent + / "data" + / "atlas-desikankilliany_1mm_MNI152.nii.gz" + ).get_fdata() data = np.zeros(n_regions) for i in range(1, n_regions + 1): data[i - 1] = np.mean(imaging_matrix[np.where(atlas_data == i)]) @@ -74,9 +74,11 @@ def get_components(target_variance, explained_var): """ dim = 1 cumulative_var = np.cumsum(explained_var) - while cumulative_var[dim-1] < target_variance: + while cumulative_var[dim - 1] < target_variance: dim += 1 - logger.debug("Extracted variance is %s with %s components", cumulative_var[dim-1], dim) + logger.debug( + "Extracted variance is %s with %s components", cumulative_var[dim - 1], dim + ) return dim @@ -89,8 +91,10 @@ def load_gene_expression(): :return: numpy array with the gene expression data. """ logger.debug("Loading gene_expression data.") - expression_file_path = Path(__file__).resolve().parent / "data" / "gene_expression_data.csv" - expression_data = pd.read_csv(expression_file_path, sep=',') + expression_file_path = ( + Path(__file__).resolve().parent / "data" / "gene_expression_data.csv" + ) + expression_data = pd.read_csv(expression_file_path, sep=",") my_data_x = expression_data.iloc[0:41, 2:].to_numpy() return zscore(my_data_x, ddof=1) @@ -102,5 +106,7 @@ def load_gene_labels(): :return: numpy array with the labels of the genes. """ logger.debug("Loading gene labels.") - genes_labels_path = Path(__file__).resolve().parent / "data" / "gene_expression_labels.txt" + genes_labels_path = ( + Path(__file__).resolve().parent / "data" / "gene_expression_labels.txt" + ) return pd.read_fwf(genes_labels_path, header=None).to_numpy() diff --git a/imaging_transcriptomics/reporting.py b/imaging_transcriptomics/reporting.py index 6e948db..25e3087 100644 --- a/imaging_transcriptomics/reporting.py +++ b/imaging_transcriptomics/reporting.py @@ -11,14 +11,16 @@ class PDF(FPDF): - """Class to generate a PDF report for the imaging-transcriptomics script. - """ + """Class to generate a PDF report for the imaging-transcriptomics script.""" + def header(self): """The header will contain always the title.""" self.rect(10, 10, 190, 280) self.line(10, 50, 200, 50) self.set_font("Helvetica", "B", 14) - self.cell(w=0, h=15, align="C", txt="Imaging Transcriptomics Analysis Report", ln=True) + self.cell( + w=0, h=15, align="C", txt="Imaging Transcriptomics Analysis Report", ln=True + ) def analysis_info(self, filename, date, filepath): """Info on the analysis performed. Information included are the name of @@ -90,8 +92,14 @@ def make_plots(path, limit_x, data_y): # Plot cumulative percentage variance plt.plot(range(1, 16), varexp, marker="o", color="sandybrown") - plt.plot(limit_x, varexp[limit_x - 1], 'o', color="red") - plt.vlines(limit_x, varexp[0] - 10, varexp[limit_x - 1], colors="lightgrey", linestyles="dashed") + plt.plot(limit_x, varexp[limit_x - 1], "o", color="red") + plt.vlines( + limit_x, + varexp[0] - 10, + varexp[limit_x - 1], + colors="lightgrey", + linestyles="dashed", + ) plt.hlines(varexp[limit_x - 1], 0, limit_x, colors="lightgrey", linestyles="dashed") plt.title("Cumulative variance explained by PLS components") plt.ylabel("Total explained variance (%)") @@ -124,7 +132,7 @@ def create_pdf(filepath, save_dir): filepath = Path(filepath) analysis_date = datetime.now().strftime("%d-%m-%Y") - report = PDF(orientation="P", unit="mm", format='A4') + report = PDF(orientation="P", unit="mm", format="A4") report.add_page() report.analysis_info(filename=filepath.name, date=analysis_date, filepath=filepath) report.pls_regression(path_plots=save_dir) @@ -145,10 +153,13 @@ def create_csv(analysis_results, n_comp, save_dir): if not isinstance(analysis_results, GeneResults): raise TypeError("The data are not of the GeneResults class.") for i in range(n_comp): - data = np.vstack((np.array(analysis_results.boot_results.pls_genes[i].reshape(1, 15633)), - np.array(analysis_results.boot_results.z_scores[i]), - np.array(analysis_results.boot_results.pval[i]), - np.array(analysis_results.boot_results.pval_corrected[i]) - )).T + data = np.vstack( + ( + np.array(analysis_results.boot_results.pls_genes[i].reshape(1, 15633)), + np.array(analysis_results.boot_results.z_scores[i]), + np.array(analysis_results.boot_results.pval[i]), + np.array(analysis_results.boot_results.pval_corrected[i]), + ) + ).T data = pd.DataFrame(data, columns=["Gene ID", "Z", "p", "p corrected"]) data.to_csv(save_dir / f"PLS{i+1}.csv", index=False) diff --git a/imaging_transcriptomics/transcriptomics.py b/imaging_transcriptomics/transcriptomics.py index 3ac23e0..103bab1 100644 --- a/imaging_transcriptomics/transcriptomics.py +++ b/imaging_transcriptomics/transcriptomics.py @@ -13,9 +13,7 @@ warnings.filterwarnings("ignore") from netneurotools import freesurfer, stats -from .inputs import (load_gene_expression, - load_gene_labels, - get_components) +from .inputs import load_gene_expression, load_gene_labels, get_components from .bootstrap import bootstrap_pls, bootstrap_genes cfg_file_path = Path(__file__).parent / "log_config.yaml" @@ -63,8 +61,10 @@ def check_input_length(data): :return: data if it has correct length. """ if not len(data) == 41: - raise AttributeError("The data must have a length of 41, corresponding to the number of regions in the " - "left brain hemisphere!") + raise AttributeError( + "The data must have a length of 41, corresponding to the number of regions in the " + "left brain hemisphere!" + ) return data @staticmethod @@ -83,7 +83,9 @@ def check_in_var(variance): elif 0.0 <= variance <= 1.0: return variance elif 1.0 < variance < 100: - logger.warning("The variance inputted was in the range 1-100. It has been converted to the range 0.0-1.0") + logger.warning( + "The variance inputted was in the range 1-100. It has been converted to the range 0.0-1.0" + ) return variance / 100 elif variance < 0.0: raise ValueError("The input variance cannot be negative!") @@ -110,7 +112,9 @@ def check_in_components(components): @staticmethod def check_var_or_comp(variance, components): if variance is None and components is None: - raise AttributeError("You must set either the variance or the number of components!") + raise AttributeError( + "You must set either the variance or the number of components!" + ) def permute_data(self, iterations=1_000): """Permute the scan data for the analysis. @@ -138,14 +142,20 @@ def permute_data(self, iterations=1_000): parcel_centroids, parcel_hemi = freesurfer.find_parcel_centroids( lhannot=annot_lh, rhannot=annot_rh, - version='fsaverage5', - surf='sphere', - method="surface") + version="fsaverage5", + surf="sphere", + method="surface", + ) # Mask the results to have only the left hemisphere left_hemi_mask = parcel_hemi == 0 - parcel_centroids, parcel_hemi = parcel_centroids[left_hemi_mask], parcel_hemi[left_hemi_mask] + parcel_centroids, parcel_hemi = ( + parcel_centroids[left_hemi_mask], + parcel_hemi[left_hemi_mask], + ) # Get the spin samples - spins = stats.gen_spinsamples(parcel_centroids, parcel_hemi, n_rotate=iterations, method='vasa', seed=1234) + spins = stats.gen_spinsamples( + parcel_centroids, parcel_hemi, n_rotate=iterations, method="vasa", seed=1234 + ) cort_permuted = np.array(self._cortical[spins]).reshape(34, iterations) self.permuted[0:34, :] = cort_permuted logger.debug("End permutations.") @@ -157,8 +167,10 @@ def save_permutations(self, path): "~/Documents/my_permuted.csv" """ if self.permuted is None: - raise AttributeError("There are no permutations of the scan available to save. Before saving the " - "permutations you need to compute them.") + raise AttributeError( + "There are no permutations of the scan available to save. Before saving the " + "permutations you need to compute them." + ) logger.info("Saving permutations to file %s", path) pd.DataFrame(self.permuted).to_csv(Path(path), header=None, index=False) @@ -169,15 +181,22 @@ def pls_all_components(self): given by the components is estimated, depending on what is set by the user in the __init__() method. """ logger.debug("Performing PLS with all 15 components.") - results = pls_regression(self._gene_expression, self.zscore_data.reshape(41, 1), - n_components=15, n_perm=0, n_boot=0) + results = pls_regression( + self._gene_expression, + self.zscore_data.reshape(41, 1), + n_components=15, + n_perm=0, + n_boot=0, + ) var_exp = results.get("varexp") if self.n_components is None and self.var != 0.0: self.n_components = get_components(self.var, var_exp) logger.debug("Number of components has been set to: %s", self.n_components) elif self.var is None and self.n_components != 0: - self.var = np.cumsum(var_exp)[self.n_components-1] - logger.debug("Variance has been set to: %s", self.var) # add number of variance set + self.var = np.cumsum(var_exp)[self.n_components - 1] + logger.debug( + "Variance has been set to: %s", self.var + ) # add number of variance set self.var_components = var_exp def run(self, n_iter=1_000): @@ -188,18 +207,24 @@ def run(self, n_iter=1_000): logger.info("Starting imaging transcriptomics analysis.") self.pls_all_components() self.permute_data(iterations=n_iter) - self.r_boot, self.p_boot = bootstrap_pls(self._gene_expression, - self.zscore_data.reshape(41, 1), - self.permuted, - self.n_components, - iterations=n_iter) - self.gene_results = bootstrap_genes(self._gene_expression, - self.zscore_data.reshape(41, 1), - self.n_components, - self.scan_data, - self._gene_labels, - n_iter) - self.gene_results.boot_results.compute_values(self.n_components, - self.gene_results.original_results.pls_weights, - self.gene_results.original_results.pls_gene) + self.r_boot, self.p_boot = bootstrap_pls( + self._gene_expression, + self.zscore_data.reshape(41, 1), + self.permuted, + self.n_components, + iterations=n_iter, + ) + self.gene_results = bootstrap_genes( + self._gene_expression, + self.zscore_data.reshape(41, 1), + self.n_components, + self.scan_data, + self._gene_labels, + n_iter, + ) + self.gene_results.boot_results.compute_values( + self.n_components, + self.gene_results.original_results.pls_weights, + self.gene_results.original_results.pls_gene, + ) logger.info("Imaging transcriptomics analysis completed.") diff --git a/script/imagingtranscriptomics b/script/imagingtranscriptomics index bfb07c9..47241f7 100644 --- a/script/imagingtranscriptomics +++ b/script/imagingtranscriptomics @@ -20,7 +20,9 @@ def get_args(): :return: Data structure with all args parsed from the command line. """ - DESCRIPTION = """Perform imaging transcriptomics analysis on a neuroimaging scan. """ + DESCRIPTION = ( + """Perform imaging transcriptomics analysis on a neuroimaging scan. """ + ) EPILOG = """Check your results in the specified folder or in the file path of the input scan, if you have not specified an output path. If you used this software in your research please cite: * Imaging transcriptomics: **Convergent cellular, transcriptomic, and molecular neuroimaging signatures in the @@ -29,36 +31,53 @@ def get_args(): PET templates working group*; bioRxiv 2021.06.18.448872; doi: `https://doi.org/10.1101/2021.06.18.448872 `_ """ - parser = argparse.ArgumentParser(description=DESCRIPTION, - epilog=EPILOG) - - parser.add_argument("-i", "--input", - type=str, - help=("Input imaging file in NIfTI format (.nii, .nii.gz).\n" - "The input file is expected to have the same matrix size as the atlas used (182x218x182)," - "if the input image has different matrix size this can be resliced to match the" - "resolution of the MNI152 1mm template matrix size (e.g. the one provided with FSL)."), - required=True) - parser.add_argument("-o", "--out", - type=str, - help="Path where to save the output, if not specified the path of the path of the input scan " - "is used.", - required=False) + parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG) + + parser.add_argument( + "-i", + "--input", + type=str, + help=( + "Input imaging file in NIfTI format (.nii, .nii.gz).\n" + "The input file is expected to have the same matrix size as the atlas used (182x218x182)," + "if the input image has different matrix size this can be resliced to match the" + "resolution of the MNI152 1mm template matrix size (e.g. the one provided with FSL)." + ), + required=True, + ) + parser.add_argument( + "-o", + "--out", + type=str, + help="Path where to save the output, if not specified the path of the path of the input scan " + "is used.", + required=False, + ) group = parser.add_mutually_exclusive_group(required=True) - group.add_argument("-n", "--ncomp", - type=int, - help="Number of PLS components to use. The number of components has to be between 1 and 15.") - group.add_argument("-v", "--variance", - help="""Variance explained by the components. The variance input should be between 10 and + group.add_argument( + "-n", + "--ncomp", + type=int, + help="Number of PLS components to use. The number of components has to be between 1 and 15.", + ) + group.add_argument( + "-v", + "--variance", + help="""Variance explained by the components. The variance input should be between 10 and 100, and the program will select the number of components that explain a variance closest to - the desired (with the explained variance used as a minimum). """) + the desired (with the explained variance used as a minimum). """, + ) verbosity = parser.add_mutually_exclusive_group(required=False) - verbosity.add_argument("--suppress", - action="store_true", - help="Suppress the log on console. Only shows WARNINGS if present.") - verbosity.add_argument("--verbose", - action="store_true", - help="Show all logging messages from the script.") + verbosity.add_argument( + "--suppress", + action="store_true", + help="Suppress the log on console. Only shows WARNINGS if present.", + ) + verbosity.add_argument( + "--verbose", + action="store_true", + help="Show all logging messages from the script.", + ) return parser.parse_args() @@ -88,10 +107,7 @@ def main(): imaging_transcriptomics.inputs.read_scan(input_path) ) # Don't check inputs as checks are in the initialization of the analysis! - initial_dict = { - "variance": inputs.variance, - "n_components": inputs.ncomp - } + initial_dict = {"variance": inputs.variance, "n_components": inputs.ncomp} # Get IO paths to save files if not inputs.out: @@ -104,7 +120,9 @@ def main(): file_handler = logging.FileHandler(save_dir / "logs.log", "a") logger.addHandler(file_handler) logger.info("Setting up analysis ...") - analysis = imaging_transcriptomics.ImagingTranscriptomics(data_to_analyse, **initial_dict) + analysis = imaging_transcriptomics.ImagingTranscriptomics( + data_to_analyse, **initial_dict + ) logger.info("Running analysis.") analysis.run() @@ -114,5 +132,5 @@ def main(): reporting.create_pdf(input_path, save_dir) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tests/errors_test.py b/tests/errors_test.py index 8c41256..b92a23a 100644 --- a/tests/errors_test.py +++ b/tests/errors_test.py @@ -8,16 +8,18 @@ CheckShape, InvalidSizeError, CheckVariance, - CheckPath + CheckPath, ) def test_check_extension(): """Test that the CheckExtension decorator works""" + @CheckExtension def function(path): """Dummy function to test the decorator.""" return path + test_path = Path(__file__).parent / "data" p = function(test_path / "anatomical.nii") assert str(p) == str(Path(__file__).parent / "data" / "anatomical.nii") @@ -25,6 +27,7 @@ def function(path): def test_check_extension_error(): """Test that the CheckExtension decorator throws the correct error.""" + @CheckExtension def function(path): """Dummy function to test the decorator.""" @@ -34,8 +37,11 @@ def function(path): with pytest.raises(InvalidFormatError) as ex: function(test_path / "wrong_format.txt") - assert str(ex.value) == f"The provided file has an invalid format. Please use files in the .nii, .nii.gz format. " \ - f"The error was caused by the file {test_path.absolute()}/wrong_format.txt." + assert ( + str(ex.value) + == f"The provided file has an invalid format. Please use files in the .nii, .nii.gz format. " + f"The error was caused by the file {test_path.absolute()}/wrong_format.txt." + ) def test_check_shape(): @@ -46,6 +52,7 @@ def test_check_shape(): def function(in_matrix): """Dummy function to test the decorator.""" return in_matrix.shape + assert function(matrix) == (182, 218, 182) @@ -61,11 +68,15 @@ def function(in_matrix): with pytest.raises(InvalidSizeError) as ex: function(matrix) - assert str(ex.value) == "The provided file has a wrong shape. The file has shape: (171, 230, 167)" + assert ( + str(ex.value) + == "The provided file has a wrong shape. The file has shape: (171, 230, 167)" + ) def test_check_variance(): """Test the CheckVariance decorator.""" + @CheckVariance def function(var): """Dummy function to test the decorator.""" @@ -76,6 +87,7 @@ def function(var): def test_check_variance_error(): """Test that the CheckVariance decorator throws the correct error.""" + @CheckVariance def function(var): """Dummy function to test the decorator.""" diff --git a/tests/inputs_test.py b/tests/inputs_test.py index 875151c..575415c 100644 --- a/tests/inputs_test.py +++ b/tests/inputs_test.py @@ -1,7 +1,12 @@ import numpy as np import pytest -from imaging_transcriptomics.inputs import extract_average, get_components, load_gene_expression, load_gene_labels +from imaging_transcriptomics.inputs import ( + extract_average, + get_components, + load_gene_expression, + load_gene_labels, +) # def test_read_scan(): @@ -54,4 +59,3 @@ def test_gene_labels_load(): assert labels[1635] == "C6orf106" assert "SLC7A10" in labels assert "audhd49b" not in labels # just some randoms string to mimic a gene. - diff --git a/tests/transcriptomics_test.py b/tests/transcriptomics_test.py index e3b9209..ea2c7a0 100644 --- a/tests/transcriptomics_test.py +++ b/tests/transcriptomics_test.py @@ -31,7 +31,7 @@ def test_init_error_variance(): for test in t_param_err: with pytest.raises(ValueError): imt.ImagingTranscriptomics(np.ones(41), variance=test) - test = 'a' + test = "a" with pytest.raises(TypeError): imt.ImagingTranscriptomics(np.ones(41), variance=test) @@ -53,15 +53,51 @@ def test_init_length_error(): # METHODS TESTS def test_permutations(): """Test the permutations are computed correctly.""" - t_data = np.array([2.49176123, 2.12076098, 1.88912675, 2.29363057, 2.17108429, - 2.44944779, 1.9532944, 2.13951822, 1.81947959, 1.58996705, - 1.91860982, 2.30857561, 2.39706742, 2.03412347, 2.0920649, - 1.89473161, 2.05717326, 1.20646305, 1.72044527, 2.0083166, - 1.66318842, 2.06091217, 1.72413881, 2.33628019, 2.61411213, - 1.807411, 1.96163793, 1.85169722, 2.11455623, 1.92936416, - 1.28974378, 1.81579151, 2.66449885, 2.67599858, 1.13808303, - 1.40784474, 2.70367057, 2.00515875, 2.49107748, 1.75756543, - 2.29094877]) + t_data = np.array( + [ + 2.49176123, + 2.12076098, + 1.88912675, + 2.29363057, + 2.17108429, + 2.44944779, + 1.9532944, + 2.13951822, + 1.81947959, + 1.58996705, + 1.91860982, + 2.30857561, + 2.39706742, + 2.03412347, + 2.0920649, + 1.89473161, + 2.05717326, + 1.20646305, + 1.72044527, + 2.0083166, + 1.66318842, + 2.06091217, + 1.72413881, + 2.33628019, + 2.61411213, + 1.807411, + 1.96163793, + 1.85169722, + 2.11455623, + 1.92936416, + 1.28974378, + 1.81579151, + 2.66449885, + 2.67599858, + 1.13808303, + 1.40784474, + 2.70367057, + 2.00515875, + 2.49107748, + 1.75756543, + 2.29094877, + ] + ) test = imt.ImagingTranscriptomics(t_data, n_components=1) assert test.permuted is None test.permute_data(10) From a845cf069b2ee60e1bbbcaf9d6c0a3c0c4a4101c Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Tue, 26 Oct 2021 09:59:56 +0200 Subject: [PATCH 02/15] first correlation nanalysis draft --- imaging_transcriptomics/transcriptomics.py | 77 +++++++++++++++------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/imaging_transcriptomics/transcriptomics.py b/imaging_transcriptomics/transcriptomics.py index 103bab1..231fbdb 100644 --- a/imaging_transcriptomics/transcriptomics.py +++ b/imaging_transcriptomics/transcriptomics.py @@ -7,14 +7,14 @@ import pandas as pd import yaml from pyls import pls_regression -from scipy.stats import zscore +from scipy.stats import zscore, pearsonr with warnings.catch_warnings(): warnings.filterwarnings("ignore") from netneurotools import freesurfer, stats from .inputs import load_gene_expression, load_gene_labels, get_components -from .bootstrap import bootstrap_pls, bootstrap_genes +from .bootstrap import bootstrap_pls, bootstrap_genes, bootstrap_correlation cfg_file_path = Path(__file__).parent / "log_config.yaml" with open(cfg_file_path, "r") as config_file: @@ -174,6 +174,18 @@ def save_permutations(self, path): logger.info("Saving permutations to file %s", path) pd.DataFrame(self.permuted).to_csv(Path(path), header=None, index=False) + def correlation(self): + """ Calculate the correlation between the imaging and genetic data. + + :return corr_: pearson correlation coefficient + :return p_val: p_val of the correlation + """ + corr_ = np.zeros(self._gene_expression.shape) + p_val = np.zeros(self._gene_expression.shape) + for gene in range(15633): + corr_[:,gene], p_val[:,gene] = pearsonr(self.zscore_data, self._gene_expression[:,gene]) + return corr_, p_val + def pls_all_components(self): """Compute a PLS regression with all components. @@ -199,32 +211,47 @@ def pls_all_components(self): ) # add number of variance set self.var_components = var_exp - def run(self, n_iter=1_000): + def run(self, n_iter=1_000, method="pls"): """Run the analysis of the imaging scan. :param int n_iter: number of permutations to make. + :param str method: method to run the analysis, can be either "pls" + for pls regression or "corr" cor simple correlation analysis. """ logger.info("Starting imaging transcriptomics analysis.") - self.pls_all_components() - self.permute_data(iterations=n_iter) - self.r_boot, self.p_boot = bootstrap_pls( - self._gene_expression, - self.zscore_data.reshape(41, 1), - self.permuted, - self.n_components, - iterations=n_iter, - ) - self.gene_results = bootstrap_genes( - self._gene_expression, - self.zscore_data.reshape(41, 1), - self.n_components, - self.scan_data, - self._gene_labels, - n_iter, - ) - self.gene_results.boot_results.compute_values( - self.n_components, - self.gene_results.original_results.pls_weights, - self.gene_results.original_results.pls_gene, - ) + if method is "pls": + logger.info("Running analysis with PLS regression") + self.pls_all_components() + self.permute_data(iterations=n_iter) + self.r_boot, self.p_boot = bootstrap_pls( + self._gene_expression, + self.zscore_data.reshape(41, 1), + self.permuted, + self.n_components, + iterations=n_iter, + ) + self.gene_results = bootstrap_genes( + self._gene_expression, + self.zscore_data.reshape(41, 1), + self.n_components, + self.scan_data, + self._gene_labels, + n_iter, + ) + self.gene_results.boot_results.compute_values( + self.n_components, + self.gene_results.original_results.pls_weights, + self.gene_results.original_results.pls_gene, + ) + elif method is "corr": + logger.info("Running analysis with correlation.") + # run first analysis + self.permute_data(iterations=n_iter) + # bootstrap analysis + self.gene_results = bootstrap_correlation() + pass + else: + raise NotImplementedError(f"The method {method} does not exist. " + f"Please choose either pls or corr as " + f"method to run the analysis.") logger.info("Imaging transcriptomics analysis completed.") From 5f42f34ca2b1cd97da9a682f6d15499f92a43f40 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Thu, 28 Oct 2021 16:49:00 +0200 Subject: [PATCH 03/15] added correlation method --- imaging_transcriptomics/transcriptomics.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/imaging_transcriptomics/transcriptomics.py b/imaging_transcriptomics/transcriptomics.py index 231fbdb..efdf298 100644 --- a/imaging_transcriptomics/transcriptomics.py +++ b/imaging_transcriptomics/transcriptomics.py @@ -177,14 +177,18 @@ def save_permutations(self, path): def correlation(self): """ Calculate the correlation between the imaging and genetic data. - :return corr_: pearson correlation coefficient - :return p_val: p_val of the correlation + :return corr_genes: pearson correlation coefficient ordered in + descending order. + :return corr_gene_labels: labels of the genes ordered by correlation coefficient. """ - corr_ = np.zeros(self._gene_expression.shape) - p_val = np.zeros(self._gene_expression.shape) + corr_ = np.zeros(self._gene_expression.shape[1]) + p_val = np.zeros(self._gene_expression.shape[1]) for gene in range(15633): - corr_[:,gene], p_val[:,gene] = pearsonr(self.zscore_data, self._gene_expression[:,gene]) - return corr_, p_val + corr_[gene], p_val[gene] = pearsonr(self.zscore_data, + self._gene_expression[:,gene]) + corr_genes = np.sort(corr_) + corr_gene_labels = self._gene_labels[np.argsort(corr_)] + return corr_genes, corr_gene_labels def pls_all_components(self): """Compute a PLS regression with all components. @@ -248,7 +252,9 @@ def run(self, n_iter=1_000, method="pls"): # run first analysis self.permute_data(iterations=n_iter) # bootstrap analysis - self.gene_results = bootstrap_correlation() + self.gene_results = bootstrap_correlation(self.zscore_data, + self._gene_results, + self._gene_labels) pass else: raise NotImplementedError(f"The method {method} does not exist. " From 99352b301e3a335adaeffef546ea3e892c3516d2 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Thu, 28 Oct 2021 16:49:20 +0200 Subject: [PATCH 04/15] changed version --- imaging_transcriptomics/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imaging_transcriptomics/__init__.py b/imaging_transcriptomics/__init__.py index d8527c4..d6533df 100644 --- a/imaging_transcriptomics/__init__.py +++ b/imaging_transcriptomics/__init__.py @@ -1,4 +1,4 @@ -__version__ = "1.0.0" +__version__ = "1.0.1" from . import errors from . import bootstrap From 879a26f0b0c1b38808975b293c25c70ba189a3e1 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Fri, 5 Nov 2021 20:07:40 +0100 Subject: [PATCH 05/15] analysis works, need fix on genes value computation --- imaging_transcriptomics/__init__.py | 2 +- imaging_transcriptomics/bootstrap.py | 44 ++++++++++++++++++++++ imaging_transcriptomics/genes.py | 28 +++++++++++++- imaging_transcriptomics/transcriptomics.py | 32 ++++++++++------ 4 files changed, 92 insertions(+), 14 deletions(-) diff --git a/imaging_transcriptomics/__init__.py b/imaging_transcriptomics/__init__.py index d6533df..651e4d1 100644 --- a/imaging_transcriptomics/__init__.py +++ b/imaging_transcriptomics/__init__.py @@ -8,7 +8,7 @@ from . import reporting from .transcriptomics import ImagingTranscriptomics -from .bootstrap import bootstrap_pls, bootstrap_genes +from .bootstrap import bootstrap_pls, bootstrap_genes, bootstrap_correlation from .inputs import ( load_gene_expression, load_gene_labels, diff --git a/imaging_transcriptomics/bootstrap.py b/imaging_transcriptomics/bootstrap.py index 2197c84..92cdbc6 100644 --- a/imaging_transcriptomics/bootstrap.py +++ b/imaging_transcriptomics/bootstrap.py @@ -2,7 +2,11 @@ import logging.config import yaml from pathlib import Path +from itertools import product +from multiprocessing import Pool +from functools import partial +from scipy.stats import spearmanr import numpy import numpy as np from tqdm import tqdm @@ -142,3 +146,43 @@ def bootstrap_genes(x_data, y, n_components, y_norm, genes, n_iterations=1000): :, component - 1, iteration ] = __new_weights return gene_results + + +def spearman_op(idx, permuted, y_data): + return spearmanr(permuted[:, idx[0]], y_data[:, idx[1]])[0] + + +def bootstrap_correlation(x_data, y_data, permuted, labels, n_iterations=1000): + """ + Bootstrap the results using pearson correlation. + + :param x_data: imaging data + :param y_data: gene expression data + :param permuted: permuted matrix of imaging data + :param labels: labels of the genes (original order) + :return gene_results: GeneResults class with correlation results. + """ + gene_results = GeneResults(n_comp=1, dim1=1, dim2=y_data.shape[1]) + n_genes = 15633 + pool = Pool() + # original correlation + corr_ = np.zeros(y_data.shape[1]) + for gene in range(n_genes): + corr_[gene], _ = spearmanr(x_data, y_data[:, gene]) + # order results and set indexes in gene results + gene_results.original_results.pls_weights = np.sort(corr_, kind="mergesort")[::-1] + __idx = np.argsort(corr_, kind="mergesort")[::-1] + gene_results.original_results.labels = labels[__idx] + gene_results.original_results.gene_id = __idx + # bootstrap + __res = np.zeros((15633, 1000)) + _iter = product(range(n_iterations), range(n_genes)) + for ind, result in tqdm(enumerate(pool.imap(partial( + spearman_op, permuted=permuted, y_data=y_data), + _iter, chunksize=25_000 + )), + desc="Bootstrapping correlation", + unit="iterations"): + __res.flat[ind] = result + gene_results.boot_results.pls_weights_boot = __res + return gene_results diff --git a/imaging_transcriptomics/genes.py b/imaging_transcriptomics/genes.py index b25db57..d8fd570 100644 --- a/imaging_transcriptomics/genes.py +++ b/imaging_transcriptomics/genes.py @@ -66,8 +66,8 @@ def compute_values(self, n_comp, original_weights, original_ids): """Compute the values of the bootstrap for each of the components. :param n_comp: number of PLS components. - :param original_weights: weights obtained from the original analysis (non bootstrapped). - :param original_ids: original ids (labels) from the original analysis (non bootstrapped). + :param original_weights: weights obtained from the original analysis (not bootstrapped) + :param original_ids: original ids (labels) from the original analysis (not bootstrapped) :return: """ logger.info("Computing bootstrap gene results.") @@ -87,6 +87,30 @@ def compute_values(self, n_comp, original_weights, original_ids): ) self.pval_corrected[component - 1] = __p_corr + def compute_corr(self, original_corr, originals_ids, original_index): + """Compute bootstrapping values for correlation data. + :param + """ + logger.info("Computing bootstrap correlation results") + n_iter = 1000 + for i in range(n_iter): + self.pls_weights_boot[:, i] = self.pls_weights_boot[:, i][ + original_index + ] + p = np.zeros(15633) + for i in range(15633): + p[i] = ( + float( + len(np.nonzero(self.pls_weights_boot[i, :] >= original_corr[i])) + ) + / n_iter + ) + self.pval = p + _, p_corrected, _ ,_= multipletests(p[::-1], method="fdr_bh", + is_sorted=True) + self.pval_corrected = p_corrected + self.pls_genes = originals_ids[original_index] + class GeneResults(dict): """Class to save the gene results.""" diff --git a/imaging_transcriptomics/transcriptomics.py b/imaging_transcriptomics/transcriptomics.py index efdf298..7a94425 100644 --- a/imaging_transcriptomics/transcriptomics.py +++ b/imaging_transcriptomics/transcriptomics.py @@ -7,7 +7,7 @@ import pandas as pd import yaml from pyls import pls_regression -from scipy.stats import zscore, pearsonr +from scipy.stats import zscore, spearmanr with warnings.catch_warnings(): warnings.filterwarnings("ignore") @@ -175,7 +175,7 @@ def save_permutations(self, path): pd.DataFrame(self.permuted).to_csv(Path(path), header=None, index=False) def correlation(self): - """ Calculate the correlation between the imaging and genetic data. + """Calculate the correlation between the imaging and genetic data. :return corr_genes: pearson correlation coefficient ordered in descending order. @@ -184,8 +184,9 @@ def correlation(self): corr_ = np.zeros(self._gene_expression.shape[1]) p_val = np.zeros(self._gene_expression.shape[1]) for gene in range(15633): - corr_[gene], p_val[gene] = pearsonr(self.zscore_data, - self._gene_expression[:,gene]) + corr_[gene], p_val[gene] = spearmanr( + self.zscore_data, self._gene_expression[:, gene] + ) corr_genes = np.sort(corr_) corr_gene_labels = self._gene_labels[np.argsort(corr_)] return corr_genes, corr_gene_labels @@ -252,12 +253,21 @@ def run(self, n_iter=1_000, method="pls"): # run first analysis self.permute_data(iterations=n_iter) # bootstrap analysis - self.gene_results = bootstrap_correlation(self.zscore_data, - self._gene_results, - self._gene_labels) - pass + self.gene_results = bootstrap_correlation( + self.zscore_data, + self._gene_expression, + self.permuted, + self._gene_labels, + ) + self.gene_results.boot_results.compute_corr( + self.gene_results.original_results.pls_weights, + self.gene_results.original_results.pls_gene, + self.gene_results.original_results.gene_id, + ) else: - raise NotImplementedError(f"The method {method} does not exist. " - f"Please choose either pls or corr as " - f"method to run the analysis.") + raise NotImplementedError( + f"The method {method} does not exist. " + f"Please choose either pls or corr as " + f"method to run the analysis." + ) logger.info("Imaging transcriptomics analysis completed.") From 0cad67b9968b17ffcb37dd7faf0557abafc51598 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Mon, 8 Nov 2021 14:48:37 +0000 Subject: [PATCH 06/15] changes to analysis --- imaging_transcriptomics/genes.py | 9 +++++---- imaging_transcriptomics/transcriptomics.py | 2 +- script/imagingtranscriptomics | 6 +++++- tests/transcriptomics_test.py | 7 +++++++ 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/imaging_transcriptomics/genes.py b/imaging_transcriptomics/genes.py index d8fd570..4d24193 100644 --- a/imaging_transcriptomics/genes.py +++ b/imaging_transcriptomics/genes.py @@ -101,15 +101,16 @@ def compute_corr(self, original_corr, originals_ids, original_index): for i in range(15633): p[i] = ( float( - len(np.nonzero(self.pls_weights_boot[i, :] >= original_corr[i])) + len(np.nonzero(original_corr[i] < self.pls_weights_boot[ + i,:])) ) / n_iter ) self.pval = p - _, p_corrected, _ ,_= multipletests(p[::-1], method="fdr_bh", - is_sorted=True) + _, p_corrected, _, _ = multipletests(p[::-1], method="fdr_bh", + is_sorted=True) self.pval_corrected = p_corrected - self.pls_genes = originals_ids[original_index] + self.pls_genes = np.array(originals_ids)[original_index.astype(int)] class GeneResults(dict): diff --git a/imaging_transcriptomics/transcriptomics.py b/imaging_transcriptomics/transcriptomics.py index 7a94425..3067211 100644 --- a/imaging_transcriptomics/transcriptomics.py +++ b/imaging_transcriptomics/transcriptomics.py @@ -261,7 +261,7 @@ def run(self, n_iter=1_000, method="pls"): ) self.gene_results.boot_results.compute_corr( self.gene_results.original_results.pls_weights, - self.gene_results.original_results.pls_gene, + self.gene_results.original_results.labels, self.gene_results.original_results.gene_id, ) else: diff --git a/script/imagingtranscriptomics b/script/imagingtranscriptomics index 47241f7..1bc352d 100644 --- a/script/imagingtranscriptomics +++ b/script/imagingtranscriptomics @@ -53,7 +53,11 @@ def get_args(): "is used.", required=False, ) - group = parser.add_mutually_exclusive_group(required=True) + method = parser.add_mutually_exclusive_group(required=True) + method.add_argument("--corr", action="store_true", help="run with the " + "correlation " + "method") + group = method.add_mutually_exclusive_group(required=True) group.add_argument( "-n", "--ncomp", diff --git a/tests/transcriptomics_test.py b/tests/transcriptomics_test.py index ea2c7a0..50328a4 100644 --- a/tests/transcriptomics_test.py +++ b/tests/transcriptomics_test.py @@ -110,3 +110,10 @@ def test_saving_error(): test = imt.ImagingTranscriptomics(np.zeros(41), n_components=1) with pytest.raises(AttributeError): test.save_permutations(Path().cwd()) + + +def test_correlation(): + """Test the correlation method is performing correctly.""" + test = imt.ImagingTranscriptomics(np.random.rand(41), n_components=1) + genes, labels = test.correlation() + assert genes.shape == (41,) From c8d162ea0f663128751f8a663bb16f40d326949d Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 10 Nov 2021 08:57:42 +0000 Subject: [PATCH 07/15] fixed tests --- tests/auto_test.py | 2 +- tests/transcriptomics_test.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/auto_test.py b/tests/auto_test.py index 7c98e0c..61473b4 100644 --- a/tests/auto_test.py +++ b/tests/auto_test.py @@ -16,9 +16,9 @@ def test_modules_import(): assert "errors" in imported assert "genes" in imported assert "inputs" in imported - assert "permutations" in imported assert "reporting" in imported assert "transcriptomics" in imported + assert "oermutatiuons" not in imported def test_functions_import(): diff --git a/tests/transcriptomics_test.py b/tests/transcriptomics_test.py index 50328a4..0b17ad5 100644 --- a/tests/transcriptomics_test.py +++ b/tests/transcriptomics_test.py @@ -116,4 +116,5 @@ def test_correlation(): """Test the correlation method is performing correctly.""" test = imt.ImagingTranscriptomics(np.random.rand(41), n_components=1) genes, labels = test.correlation() - assert genes.shape == (41,) + assert genes.shape == (15633,) + assert labels.shape == (15633,1) From c444161ea8e17d3e72dd7011de4124d497712480 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 10 Nov 2021 12:53:53 +0000 Subject: [PATCH 08/15] added function to save correlation results --- imaging_transcriptomics/reporting.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/imaging_transcriptomics/reporting.py b/imaging_transcriptomics/reporting.py index 25e3087..948fcf8 100644 --- a/imaging_transcriptomics/reporting.py +++ b/imaging_transcriptomics/reporting.py @@ -163,3 +163,26 @@ def create_csv(analysis_results, n_comp, save_dir): ).T data = pd.DataFrame(data, columns=["Gene ID", "Z", "p", "p corrected"]) data.to_csv(save_dir / f"PLS{i+1}.csv", index=False) + + +def create_corr_csv(analysis_results, save_dir): + """Create a csv file for the correlation coefficients. + + :param analysis_results: GeneResults data structure with the results of bootstrapping. + :param save_dir: path where the output will be saved. + :return: + """ + if not isinstance(analysis_results, GeneResults): + raise TypeError("The data are not of the GeneResults class.") + if not isinstance(save_dir, Path): + save_dir = Path(save_dir) + data = np.vstack( + ( + np.array(analysis_results.boot_results.pls_genes.reshape(1, 15633)), + np.array(analysis_results.original_results.pls_weights), + np.array(analysis_results.boot_results.pval), + np.array(analysis_results.boot_results.pval_corrected), + ) + ).T + data = pd.DataFrame(data, columns=["Gene ID", "Correlation coefficient", "p", "p corrected"]) + data.to_csv(save_dir / "Correlation_coefficients.csv", index=False) From f7817417065b6d426fd7bec0767cfff62a32f107 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 10 Nov 2021 12:54:44 +0000 Subject: [PATCH 09/15] corrected genes to correlation function --- imaging_transcriptomics/transcriptomics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imaging_transcriptomics/transcriptomics.py b/imaging_transcriptomics/transcriptomics.py index 3067211..244e433 100644 --- a/imaging_transcriptomics/transcriptomics.py +++ b/imaging_transcriptomics/transcriptomics.py @@ -259,9 +259,9 @@ def run(self, n_iter=1_000, method="pls"): self.permuted, self._gene_labels, ) - self.gene_results.boot_results.compute_corr( + self.gene_results.boot_results.compute_correlation( self.gene_results.original_results.pls_weights, - self.gene_results.original_results.labels, + self.gene_results.original_results.pls_gene, self.gene_results.original_results.gene_id, ) else: From 8327cef894c7b99d603b64b242df7349d5fb7669 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 10 Nov 2021 12:55:08 +0000 Subject: [PATCH 10/15] fixed some typos --- docs/chapters/04_usage.rst | 14 +++++++++++--- docs/chapters/08_faq.rst | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/chapters/04_usage.rst b/docs/chapters/04_usage.rst index 37f89f6..758e51b 100644 --- a/docs/chapters/04_usage.rst +++ b/docs/chapters/04_usage.rst @@ -52,7 +52,15 @@ To the library can be imported by running: import imaging_transcriptomics as imt -Once imported the package will contain the core ``ImagingTranscriptomics`` class, along with other useful functions. +Once imported the package will contain the core ``ImagingTranscriptomics`` +class, along with other useful functions. To see all the available functions +imported in the library run: + +..code:: python + + dir(imt) + +which will display all the functions and modules imported in the library. ImagingTranscriptomics Class ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -65,8 +73,8 @@ To start using the class the first step is to initialise it. A way to do this is my_analysis = imt.ImagingTranscriptomics(my_data, n_components=1) The ``my_data`` is an array that contains the data you want to analyse (e.g., the average values from the scan). This vector has to have some characteristics, mainly: + * it has to be a ``numpy.array`` with length 41, which corresponds to the number of regions in the left hemisphere of the Desikan-Killiany atlas. * it has to contain the values you want to analyse but not the ``zscore`` of the values as this is computed automatically during the initialisation. -When initialising the class one might choose to initialise it using the ``n_components`` or the ``var`` attribute which represent the number of components to use for the regression or the variance to extract respectively. -After setting one, and only one, of the values are set, one can estiamte the other by running the ``my_analysis.pls_all_components()`` method which will compute the PLS regression with 15 components and estimate the variance explained by the user defined + diff --git a/docs/chapters/08_faq.rst b/docs/chapters/08_faq.rst index f81f92e..78771d9 100644 --- a/docs/chapters/08_faq.rst +++ b/docs/chapters/08_faq.rst @@ -14,4 +14,4 @@ FAQ #. **Why did you use the pypls library instead of some more maintained PLS library, e.g., sklearn?** We used pypls instead of sklearn because the latter one, and most of the other available, are implemented using the NIPALS algorithm, while pypls uses the SIMPLS. - One of the main advantages of the SIMPLS algorithm in respoect to the NIPALS is that is is less time consuming. + One of the main advantages of the SIMPLS algorithm in respect to the NIPALS is that is is less time consuming. From a489bf9509f49da6cbadfb125915fafe05608658 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 10 Nov 2021 12:55:35 +0000 Subject: [PATCH 11/15] fixed small indexing bug --- imaging_transcriptomics/bootstrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imaging_transcriptomics/bootstrap.py b/imaging_transcriptomics/bootstrap.py index 92cdbc6..e571ada 100644 --- a/imaging_transcriptomics/bootstrap.py +++ b/imaging_transcriptomics/bootstrap.py @@ -172,7 +172,7 @@ def bootstrap_correlation(x_data, y_data, permuted, labels, n_iterations=1000): # order results and set indexes in gene results gene_results.original_results.pls_weights = np.sort(corr_, kind="mergesort")[::-1] __idx = np.argsort(corr_, kind="mergesort")[::-1] - gene_results.original_results.labels = labels[__idx] + gene_results.original_results.pls_gene = labels[__idx] gene_results.original_results.gene_id = __idx # bootstrap __res = np.zeros((15633, 1000)) From a9385f8ff2f8f3aaa92e43fa6527736c1a625ae2 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Thu, 18 Nov 2021 12:21:15 +0000 Subject: [PATCH 12/15] general update --- imaging_transcriptomics/genes.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/imaging_transcriptomics/genes.py b/imaging_transcriptomics/genes.py index 4d24193..5757af9 100644 --- a/imaging_transcriptomics/genes.py +++ b/imaging_transcriptomics/genes.py @@ -87,30 +87,23 @@ def compute_values(self, n_comp, original_weights, original_ids): ) self.pval_corrected[component - 1] = __p_corr - def compute_corr(self, original_corr, originals_ids, original_index): - """Compute bootstrapping values for correlation data. - :param - """ + def compute_correlation(self, original_corr, originals_ids, original_index): + """Compute the p value of the correlation after the permutation.""" logger.info("Computing bootstrap correlation results") n_iter = 1000 for i in range(n_iter): - self.pls_weights_boot[:, i] = self.pls_weights_boot[:, i][ - original_index - ] + tmp = self.pls_weights_boot[:, i] + self.pls_weights_boot[:, i] = tmp[original_index] p = np.zeros(15633) for i in range(15633): - p[i] = ( - float( - len(np.nonzero(original_corr[i] < self.pls_weights_boot[ - i,:])) - ) - / n_iter - ) + original = original_corr[i] + boot = self.pls_weights_boot[i, :] + p[i] = float(len(boot[np.nonzero(boot >= original)])) / n_iter self.pval = p - _, p_corrected, _, _ = multipletests(p[::-1], method="fdr_bh", - is_sorted=True) + _, p_corrected, _, _ = multipletests(p, method="fdr_bh", + is_sorted=False) self.pval_corrected = p_corrected - self.pls_genes = np.array(originals_ids)[original_index.astype(int)] + self.pls_genes = np.array(originals_ids) class GeneResults(dict): From 81afae0af999e53591d7211eae154495f49caf61 Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 24 Nov 2021 15:52:21 +0000 Subject: [PATCH 13/15] added correlation and script update --- script/imagingtranscriptomics | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/script/imagingtranscriptomics b/script/imagingtranscriptomics index 1bc352d..4c3fd9a 100644 --- a/script/imagingtranscriptomics +++ b/script/imagingtranscriptomics @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import logging import logging.config @@ -53,11 +53,7 @@ def get_args(): "is used.", required=False, ) - method = parser.add_mutually_exclusive_group(required=True) - method.add_argument("--corr", action="store_true", help="run with the " - "correlation " - "method") - group = method.add_mutually_exclusive_group(required=True) + group = parser.add_mutually_exclusive_group(required=True) group.add_argument( "-n", "--ncomp", @@ -71,6 +67,8 @@ def get_args(): 100, and the program will select the number of components that explain a variance closest to the desired (with the explained variance used as a minimum). """, ) + group.add_argument("--corr", action="store_true", help="run with the correlation " + "method") verbosity = parser.add_mutually_exclusive_group(required=False) verbosity.add_argument( "--suppress", @@ -111,7 +109,8 @@ def main(): imaging_transcriptomics.inputs.read_scan(input_path) ) # Don't check inputs as checks are in the initialization of the analysis! - initial_dict = {"variance": inputs.variance, "n_components": inputs.ncomp} + n_comp = 1 if inputs.corr else inputs.ncomp + initial_dict = {"variance": inputs.variance, "n_components": n_comp} # Get IO paths to save files if not inputs.out: @@ -128,7 +127,8 @@ def main(): data_to_analyse, **initial_dict ) logger.info("Running analysis.") - analysis.run() + _method = "corr" if inputs.corr else "pls" + analysis.run(method=_method) # Save the results reporting.make_plots(save_dir, analysis.n_components, analysis.var_components) From 5c60bba7518acb7ba16742e76dcf07df32e4b89b Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 24 Nov 2021 15:54:41 +0000 Subject: [PATCH 14/15] removed graphs and pdf from correlation run --- script/imagingtranscriptomics | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/script/imagingtranscriptomics b/script/imagingtranscriptomics index 4c3fd9a..a3f5f9a 100644 --- a/script/imagingtranscriptomics +++ b/script/imagingtranscriptomics @@ -131,9 +131,11 @@ def main(): analysis.run(method=_method) # Save the results - reporting.make_plots(save_dir, analysis.n_components, analysis.var_components) + if not inputs.corr: + reporting.make_plots(save_dir, analysis.n_components, analysis.var_components) + reporting.create_pdf(input_path, save_dir) reporting.create_csv(analysis.gene_results, analysis.n_components, save_dir) - reporting.create_pdf(input_path, save_dir) + if __name__ == "__main__": From 4056a9f50f3a8a16ced3240c5dc014617449ea2b Mon Sep 17 00:00:00 2001 From: Alessio Giacomel Date: Wed, 24 Nov 2021 15:56:18 +0000 Subject: [PATCH 15/15] style cleaning --- script/imagingtranscriptomics | 1 - 1 file changed, 1 deletion(-) diff --git a/script/imagingtranscriptomics b/script/imagingtranscriptomics index a3f5f9a..4793160 100644 --- a/script/imagingtranscriptomics +++ b/script/imagingtranscriptomics @@ -137,6 +137,5 @@ def main(): reporting.create_csv(analysis.gene_results, analysis.n_components, save_dir) - if __name__ == "__main__": main()