Merge pull request #184 from rmarkello/areasim

[ENH] Adds interareal similarity thresholding
rmarkello · Mar 25, 2021 · 577e3ff · 577e3ff
2 parents bb23d7c + 66b07c3
commit 577e3ff
Show file tree

Hide file tree

Showing 7 changed files with 103 additions and 28 deletions.
diff --git a/abagen/allen.py b/abagen/allen.py
@@ -26,6 +26,7 @@ def get_expression_data(atlas,
                         ibf_threshold=0.5,
                         probe_selection='diff_stability',
                         donor_probes='aggregate',
+                        sim_threshold=None,
                         lr_mirror=None,
                         exact=None, missing=None,
                         tolerance=2,
@@ -126,6 +127,12 @@ def get_expression_data(atlas,
         donor ('independent'), or based on the most common selected probe
         across donors ('common'). Not all combinations of `probe_selection`
         and `donor_probes` methods are viable. Default: 'aggregate'
+    sim_threshold : (0, inf) float, optional
+        Threshold for inter-areal similarity filtering. Samples are correlated
+        across probes and those samples with a total correlation less than
+        `sim_threshold` standard deviations below the mean across samples are
+        excluded from futher analysis. If not specified no filtering is
+        performed. Default: None
     lr_mirror : {None, 'bidirectional', 'leftright', 'rightleft'}, optional
         Whether to mirror microarray expression samples across hemispheres to
         increase spatial coverage. Using 'bidirectional' will mirror samples
@@ -396,6 +403,15 @@ def get_expression_data(atlas,
     if n_gb > 1:
         LGR.warning(f'Output matrix may require up to {n_gb:.2f} GB RAM')
 
+    # get dataframe of probe information (reannotated or otherwise)
+    # the Probes.csv files are the same for every donor so just grab the first
+    probe_info = io.read_probes(first_entry(files, 'probes'))
+    if reannotated:
+        probe_info = probes_.reannotate_probes(probe_info)
+
+    # drop probes with no/invalid Entrez ID
+    probe_info = probe_info.dropna(subset=['entrez_id'])
+
     # update the annotation "files". this handles updating the MNI coordinates,
     # dropping mistmatched samples (where MNI coordinates don't match the
     # provided ontology), and mirroring samples across hemispheres, if desired
@@ -411,18 +427,13 @@ def get_expression_data(atlas,
         if lr_mirror is not None:
             annot = samples_.mirror_samples(annot, ontol, swap=lr_mirror)
         annot = samples_.drop_mismatch_samples(annot, ontol)
+        if sim_threshold is not None:
+            annot = samples_.similarity_threshold(data['microarray'],
+                                                  annot, probe_info,
+                                                  threshold=sim_threshold)
         data['annotation'] = annot
     annotation = flatten_dict(files, 'annotation')
 
-    # get dataframe of probe information (reannotated or otherwise)
-    # the Probes.csv files are the same for every donor so just grab the first
-    probe_info = io.read_probes(first_entry(files, 'probes'))
-    if reannotated:
-        probe_info = probes_.reannotate_probes(probe_info)
-
-    # drop probes with no/invalid Entrez ID
-    probe_info = probe_info.dropna(subset=['entrez_id'])
-
     # intensity-based filtering of probes
     probe_info = probes_.filter_probes(flatten_dict(files, 'pacall'),
                                        annotation, probe_info,

diff --git a/abagen/cli/run.py b/abagen/cli/run.py
@@ -204,6 +204,15 @@ def get_parser():
                              'mirror samples in the left hemisphere to the '
                              'right, and "rightleft" will mirror the right to '
                              'the left. Default: None')
+    w_data.add_argument('--sim_threshold', '--sim-threshold',
+                        type=_resolve_none, default=None, metavar='THRESHOLD',
+                        help='Threshold for inter-areal similarity filtering. '
+                             'Samples are correlated across probes and those '
+                             'samples with a total correlation less than the '
+                             'the provided threshold s.d. below the mean '
+                             'across samples are excluded from futher'
+                             'analysis. If not specified no filtering is '
+                             'performed. Default: None')
     w_data.add_argument('--missing', dest='missing', metavar='METHOD',
                         type=_resolve_none, default=None, choices=(
                             None, 'centroids', 'interpolate'),
@@ -358,6 +367,7 @@ def main(args=None):
                                      atlas_info=opts.atlas_info,
                                      ibf_threshold=opts.ibf_threshold,
                                      probe_selection=opts.probe_selection,
+                                     sim_threshold=opts.sim_threshold,
                                      lr_mirror=opts.lr_mirror,
                                      missing=opts.missing,
                                      tolerance=opts.tolerance,

diff --git a/abagen/io.py b/abagen/io.py
@@ -235,7 +235,8 @@ def read_probes(fname, copy=False):
     """
 
     try:
-        data = pd.read_csv(fname, index_col=0)
+        data = pd.read_csv(fname, index_col=0,
+                           dtype={'entrez_id': pd.Int64Dtype()})
     except (ValueError, TypeError):
         if not isinstance(fname, pd.DataFrame):
             raise TypeError('Provided fname must be filepath to Probes.csv '

diff --git a/abagen/samples_.py b/abagen/samples_.py
@@ -333,6 +333,42 @@ def mirror_samples(annotation, ontology, swap='bidirectional'):
     return annotation
 
 
+def similarity_threshold(microarray, annotation, probes, threshold=5):
+    """
+    Performs inter-areal similarity filtering of tissue samples
+
+    Parameters
+    ----------
+    microarray : str or pandas.DataFrame
+        Dataframe with `P` rows representing probes and `S` columns
+        representing distinct samples, with values indicating microarray
+        expression levels
+    annotation : str or pandas.DataFrame
+        DataFrame with information about `S` tissue samples in `microarray`
+    probes : str or pandas.DataFrame
+        Dataframe with information about `P` microarray probes in `microarray`
+    threshold : (0, np.inf) float, optional
+        Threshold for filtering samples. Specifies the standard deviation
+        cutoff used to remove samples. Default: 5
+
+    Returns
+    -------
+    samples : pandas.DataFrame
+        Dataframe containing information on samples that should be retained
+        according to inter-areal similarity filtering
+    """
+
+    annotation = io.read_annotation(annotation)
+    probes = io.read_probes(probes)
+    microarray = io.read_microarray(microarray, copy=False)
+
+    corrs = np.corrcoef(microarray.loc[probes.index, annotation.index].T)
+    sim = np.sum(corrs, axis=1)
+    thresh = np.mean(sim) - (threshold * np.std(sim, ddof=1))
+
+    return annotation.loc[sim >= thresh]
+
+
 def groupby_index(microarray, labels=None, metric='mean'):
     """
     Averages expression data in `microarray` over samples with same label

diff --git a/abagen/tests/test_samples.py b/abagen/tests/test_samples.py
@@ -8,7 +8,7 @@
 import pytest
 
 from abagen import samples_
-from abagen.utils import flatten_dict
+from abagen.utils import first_entry, flatten_dict
 
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 #  generate fake data (based largely on real data) so we know what to expect  #
@@ -309,3 +309,14 @@ def test__mirror_ontology_real(testfiles):
     for a, o, l in zip(annotation, ontology, orig):
         annot = samples_._mirror_ontology(a, o)
         assert len(annot) == l
+
+
+def test_similarity_threshold_real(testfiles):
+    annotation = first_entry(testfiles, 'annotation')
+    probes = first_entry(testfiles, 'probes')
+    microarray = first_entry(testfiles, 'microarray')
+
+    out1 = samples_.similarity_threshold(microarray, annotation, probes)
+    out2 = samples_.similarity_threshold(microarray, annotation, probes,
+                                         threshold=np.inf)
+    assert out1.shape[0] < out2.shape[0]
diff --git a/docs/user_guide/download.rst b/docs/user_guide/download.rst
@@ -87,6 +87,7 @@ looking at the :ref:`api_ref`. Notably, all IO functions return
 For example, you can load the annotation file for the first donor with:
 
 .. doctest::
+    :options: +NORMALIZE_WHITESPACE
 
     >>> data = files['9861']
     >>> annotation = abagen.io.read_annotation(data['annotation'])
@@ -107,18 +108,19 @@ For example, you can load the annotation file for the first donor with:
 And you can do the same for, e.g., the probe file with:
 
 .. doctest::
+   :options: +NORMALIZE_WHITESPACE
 
     >>> probes = abagen.io.read_probes(data['probes'])
     >>> print(probes)
-                          probe_name  gene_id  ... entrez_id chromosome
-    probe_id                                   ...
-    1058685              A_23_P20713      729  ...     733.0          9
-    1058684   CUST_15185_PI416261804      731  ...     735.0          5
-    1058683             A_32_P203917      731  ...     735.0          5
-    ...                          ...      ...  ...       ...        ...
-    1071209             A_32_P885445  1012197  ...       NaN        NaN
-    1071210               A_32_P9207  1012198  ...       NaN        NaN
-    1071211              A_32_P94122  1012199  ...       NaN        NaN
+                          probe_name  gene_id   gene_symbol                                  gene_name  entrez_id chromosome
+    probe_id
+    1058685              A_23_P20713      729           C8G  complement component 8, gamma polypeptide        733          9
+    1058684   CUST_15185_PI416261804      731            C9                     complement component 9        735          5
+    1058683             A_32_P203917      731            C9                     complement component 9        735          5
+    ...                          ...      ...           ...                                        ...        ...        ...
+    1071209             A_32_P885445  1012197  A_32_P885445    AGILENT probe A_32_P885445 (non-RefSeq)       <NA>        NaN
+    1071210               A_32_P9207  1012198    A_32_P9207      AGILENT probe A_32_P9207 (non-RefSeq)       <NA>        NaN
+    1071211              A_32_P94122  1012199   A_32_P94122     AGILENT probe A_32_P94122 (non-RefSeq)       <NA>        NaN
     <BLANKLINE>
     [58692 rows x 6 columns]
 

diff --git a/docs/user_guide/expression.rst b/docs/user_guide/expression.rst
@@ -46,25 +46,29 @@ about what's happening as it goes. However, briefly the function:
     4. Reannotates microarray probe-to-gene mappings with information from
        Arnatkevic̆iūtė et al., 2019, NeuroImage. This occurs by default; refer
        to parameter ``reannotated`` for more info.
-    5. Performs intensity-based filtering of probes to remove those that do not
+    5. Performs a similarity-based filtering of tissue samples, removing those
+       samples whose expression across probes is poorly correlated with other
+       samples. This does not occur by default; refer to parameter
+      ``sim_threshold`` for more info.
+    6. Performs intensity-based filtering of probes to remove those that do not
        exceed background noise. This occurs by default with a threshold of
        0.5 (i.e., probes must exceed background noise in 50% of all tissue
        samples); refer to parameter ``ibf_threshold`` for more info.
-    6. Selects a representative probe amongst those probes indexing the same
+    7. Selects a representative probe amongst those probes indexing the same
        gene. This occurs by default by selecting the probe with the highest
        differential stability amongst donors; refer to parameter
        ``probe_selection`` for more info (or see :ref:`usage_probes`).
-    7. Matches tissue samples to regions in the user-specified ``atlas``. Refer
+    8. Matches tissue samples to regions in the user-specified ``atlas``. Refer
        to parameters ``atlas``, ``atlas_info``, ``missing``, and ``tolerance``
        for more info (or see :ref:`usage_expression_missing`).
-    8. Normalizes expression values for each sample across genes for each
+    9. Normalizes expression values for each sample across genes for each
        donor. This occurs by default using a scaled robust sigmoid
        normalization function; refere to parameter ``sample_norm`` for more
        info.
-    9. Normalizes expression values for each gene across samples for each
-       donor. This occurs by default using a scaled robust sigmoid
-       normalization function; refer to parameter ``gene_norm`` for more info.
-    10. Aggregates samples within regions in the user-specified ``atlas`` based
+    10. Normalizes expression values for each gene across samples for each
+        donor. This occurs by default using a scaled robust sigmoid
+        normalization function; refer to parameter ``gene_norm`` for more info.
+    11. Aggregates samples within regions in the user-specified ``atlas`` based
         on matches made in Step 7. By default, samples are averaged separately
         for each donor and then averaged across donors. Refer to parameters
         ``region_agg``, ``agg_metric``, and ``return_donors`` for more info.