Skip to content

Commit

Permalink
Merge pull request #184 from rmarkello/areasim
Browse files Browse the repository at this point in the history
[ENH] Adds interareal similarity thresholding
  • Loading branch information
rmarkello authored Mar 25, 2021
2 parents bb23d7c + 66b07c3 commit 577e3ff
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 28 deletions.
29 changes: 20 additions & 9 deletions abagen/allen.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def get_expression_data(atlas,
ibf_threshold=0.5,
probe_selection='diff_stability',
donor_probes='aggregate',
sim_threshold=None,
lr_mirror=None,
exact=None, missing=None,
tolerance=2,
Expand Down Expand Up @@ -126,6 +127,12 @@ def get_expression_data(atlas,
donor ('independent'), or based on the most common selected probe
across donors ('common'). Not all combinations of `probe_selection`
and `donor_probes` methods are viable. Default: 'aggregate'
sim_threshold : (0, inf) float, optional
Threshold for inter-areal similarity filtering. Samples are correlated
across probes and those samples with a total correlation less than
`sim_threshold` standard deviations below the mean across samples are
excluded from futher analysis. If not specified no filtering is
performed. Default: None
lr_mirror : {None, 'bidirectional', 'leftright', 'rightleft'}, optional
Whether to mirror microarray expression samples across hemispheres to
increase spatial coverage. Using 'bidirectional' will mirror samples
Expand Down Expand Up @@ -396,6 +403,15 @@ def get_expression_data(atlas,
if n_gb > 1:
LGR.warning(f'Output matrix may require up to {n_gb:.2f} GB RAM')

# get dataframe of probe information (reannotated or otherwise)
# the Probes.csv files are the same for every donor so just grab the first
probe_info = io.read_probes(first_entry(files, 'probes'))
if reannotated:
probe_info = probes_.reannotate_probes(probe_info)

# drop probes with no/invalid Entrez ID
probe_info = probe_info.dropna(subset=['entrez_id'])

# update the annotation "files". this handles updating the MNI coordinates,
# dropping mistmatched samples (where MNI coordinates don't match the
# provided ontology), and mirroring samples across hemispheres, if desired
Expand All @@ -411,18 +427,13 @@ def get_expression_data(atlas,
if lr_mirror is not None:
annot = samples_.mirror_samples(annot, ontol, swap=lr_mirror)
annot = samples_.drop_mismatch_samples(annot, ontol)
if sim_threshold is not None:
annot = samples_.similarity_threshold(data['microarray'],
annot, probe_info,
threshold=sim_threshold)
data['annotation'] = annot
annotation = flatten_dict(files, 'annotation')

# get dataframe of probe information (reannotated or otherwise)
# the Probes.csv files are the same for every donor so just grab the first
probe_info = io.read_probes(first_entry(files, 'probes'))
if reannotated:
probe_info = probes_.reannotate_probes(probe_info)

# drop probes with no/invalid Entrez ID
probe_info = probe_info.dropna(subset=['entrez_id'])

# intensity-based filtering of probes
probe_info = probes_.filter_probes(flatten_dict(files, 'pacall'),
annotation, probe_info,
Expand Down
10 changes: 10 additions & 0 deletions abagen/cli/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,15 @@ def get_parser():
'mirror samples in the left hemisphere to the '
'right, and "rightleft" will mirror the right to '
'the left. Default: None')
w_data.add_argument('--sim_threshold', '--sim-threshold',
type=_resolve_none, default=None, metavar='THRESHOLD',
help='Threshold for inter-areal similarity filtering. '
'Samples are correlated across probes and those '
'samples with a total correlation less than the '
'the provided threshold s.d. below the mean '
'across samples are excluded from futher'
'analysis. If not specified no filtering is '
'performed. Default: None')
w_data.add_argument('--missing', dest='missing', metavar='METHOD',
type=_resolve_none, default=None, choices=(
None, 'centroids', 'interpolate'),
Expand Down Expand Up @@ -358,6 +367,7 @@ def main(args=None):
atlas_info=opts.atlas_info,
ibf_threshold=opts.ibf_threshold,
probe_selection=opts.probe_selection,
sim_threshold=opts.sim_threshold,
lr_mirror=opts.lr_mirror,
missing=opts.missing,
tolerance=opts.tolerance,
Expand Down
3 changes: 2 additions & 1 deletion abagen/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,8 @@ def read_probes(fname, copy=False):
"""

try:
data = pd.read_csv(fname, index_col=0)
data = pd.read_csv(fname, index_col=0,
dtype={'entrez_id': pd.Int64Dtype()})
except (ValueError, TypeError):
if not isinstance(fname, pd.DataFrame):
raise TypeError('Provided fname must be filepath to Probes.csv '
Expand Down
36 changes: 36 additions & 0 deletions abagen/samples_.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,42 @@ def mirror_samples(annotation, ontology, swap='bidirectional'):
return annotation


def similarity_threshold(microarray, annotation, probes, threshold=5):
"""
Performs inter-areal similarity filtering of tissue samples
Parameters
----------
microarray : str or pandas.DataFrame
Dataframe with `P` rows representing probes and `S` columns
representing distinct samples, with values indicating microarray
expression levels
annotation : str or pandas.DataFrame
DataFrame with information about `S` tissue samples in `microarray`
probes : str or pandas.DataFrame
Dataframe with information about `P` microarray probes in `microarray`
threshold : (0, np.inf) float, optional
Threshold for filtering samples. Specifies the standard deviation
cutoff used to remove samples. Default: 5
Returns
-------
samples : pandas.DataFrame
Dataframe containing information on samples that should be retained
according to inter-areal similarity filtering
"""

annotation = io.read_annotation(annotation)
probes = io.read_probes(probes)
microarray = io.read_microarray(microarray, copy=False)

corrs = np.corrcoef(microarray.loc[probes.index, annotation.index].T)
sim = np.sum(corrs, axis=1)
thresh = np.mean(sim) - (threshold * np.std(sim, ddof=1))

return annotation.loc[sim >= thresh]


def groupby_index(microarray, labels=None, metric='mean'):
"""
Averages expression data in `microarray` over samples with same label
Expand Down
13 changes: 12 additions & 1 deletion abagen/tests/test_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pytest

from abagen import samples_
from abagen.utils import flatten_dict
from abagen.utils import first_entry, flatten_dict

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# generate fake data (based largely on real data) so we know what to expect #
Expand Down Expand Up @@ -309,3 +309,14 @@ def test__mirror_ontology_real(testfiles):
for a, o, l in zip(annotation, ontology, orig):
annot = samples_._mirror_ontology(a, o)
assert len(annot) == l


def test_similarity_threshold_real(testfiles):
annotation = first_entry(testfiles, 'annotation')
probes = first_entry(testfiles, 'probes')
microarray = first_entry(testfiles, 'microarray')

out1 = samples_.similarity_threshold(microarray, annotation, probes)
out2 = samples_.similarity_threshold(microarray, annotation, probes,
threshold=np.inf)
assert out1.shape[0] < out2.shape[0]
20 changes: 11 additions & 9 deletions docs/user_guide/download.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ looking at the :ref:`api_ref`. Notably, all IO functions return
For example, you can load the annotation file for the first donor with:

.. doctest::
:options: +NORMALIZE_WHITESPACE

>>> data = files['9861']
>>> annotation = abagen.io.read_annotation(data['annotation'])
Expand All @@ -107,18 +108,19 @@ For example, you can load the annotation file for the first donor with:
And you can do the same for, e.g., the probe file with:

.. doctest::
:options: +NORMALIZE_WHITESPACE

>>> probes = abagen.io.read_probes(data['probes'])
>>> print(probes)
probe_name gene_id ... entrez_id chromosome
probe_id ...
1058685 A_23_P20713 729 ... 733.0 9
1058684 CUST_15185_PI416261804 731 ... 735.0 5
1058683 A_32_P203917 731 ... 735.0 5
... ... ... ... ... ...
1071209 A_32_P885445 1012197 ... NaN NaN
1071210 A_32_P9207 1012198 ... NaN NaN
1071211 A_32_P94122 1012199 ... NaN NaN
probe_name gene_id gene_symbol gene_name entrez_id chromosome
probe_id
1058685 A_23_P20713 729 C8G complement component 8, gamma polypeptide 733 9
1058684 CUST_15185_PI416261804 731 C9 complement component 9 735 5
1058683 A_32_P203917 731 C9 complement component 9 735 5
... ... ... ... ... ... ...
1071209 A_32_P885445 1012197 A_32_P885445 AGILENT probe A_32_P885445 (non-RefSeq) <NA> NaN
1071210 A_32_P9207 1012198 A_32_P9207 AGILENT probe A_32_P9207 (non-RefSeq) <NA> NaN
1071211 A_32_P94122 1012199 A_32_P94122 AGILENT probe A_32_P94122 (non-RefSeq) <NA> NaN
<BLANKLINE>
[58692 rows x 6 columns]

Expand Down
20 changes: 12 additions & 8 deletions docs/user_guide/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,25 +46,29 @@ about what's happening as it goes. However, briefly the function:
4. Reannotates microarray probe-to-gene mappings with information from
Arnatkevic̆iūtė et al., 2019, NeuroImage. This occurs by default; refer
to parameter ``reannotated`` for more info.
5. Performs intensity-based filtering of probes to remove those that do not
5. Performs a similarity-based filtering of tissue samples, removing those
samples whose expression across probes is poorly correlated with other
samples. This does not occur by default; refer to parameter
``sim_threshold`` for more info.
6. Performs intensity-based filtering of probes to remove those that do not
exceed background noise. This occurs by default with a threshold of
0.5 (i.e., probes must exceed background noise in 50% of all tissue
samples); refer to parameter ``ibf_threshold`` for more info.
6. Selects a representative probe amongst those probes indexing the same
7. Selects a representative probe amongst those probes indexing the same
gene. This occurs by default by selecting the probe with the highest
differential stability amongst donors; refer to parameter
``probe_selection`` for more info (or see :ref:`usage_probes`).
7. Matches tissue samples to regions in the user-specified ``atlas``. Refer
8. Matches tissue samples to regions in the user-specified ``atlas``. Refer
to parameters ``atlas``, ``atlas_info``, ``missing``, and ``tolerance``
for more info (or see :ref:`usage_expression_missing`).
8. Normalizes expression values for each sample across genes for each
9. Normalizes expression values for each sample across genes for each
donor. This occurs by default using a scaled robust sigmoid
normalization function; refere to parameter ``sample_norm`` for more
info.
9. Normalizes expression values for each gene across samples for each
donor. This occurs by default using a scaled robust sigmoid
normalization function; refer to parameter ``gene_norm`` for more info.
10. Aggregates samples within regions in the user-specified ``atlas`` based
10. Normalizes expression values for each gene across samples for each
donor. This occurs by default using a scaled robust sigmoid
normalization function; refer to parameter ``gene_norm`` for more info.
11. Aggregates samples within regions in the user-specified ``atlas`` based
on matches made in Step 7. By default, samples are averaged separately
for each donor and then averaged across donors. Refer to parameters
``region_agg``, ``agg_metric``, and ``return_donors`` for more info.
Expand Down

0 comments on commit 577e3ff

Please sign in to comment.