From b874deaacf01c91ad54d8115b5e02ba5475ad88c Mon Sep 17 00:00:00 2001 From: Olivier Winter Date: Tue, 23 Jan 2024 17:57:43 +0000 Subject: [PATCH] Cellatlas (#5) * refactor get cache function to return higher level instead of `flatmaps` folder * wip plenty of stuff * make sure slice dimension are consistent regardless of the volume memory layout * add the agea atlas backend with geometry * bugfixes * windows CI tests require memmap to be closed to remove tempdir * review comments modifications --- .flake8 | 2 +- CHANGELOG.md | 10 ++ README.md | 22 +++-- ...nomics_load_allen_gene_expression_atlas.py | 14 +++ iblatlas/__init__.py | 28 +++--- iblatlas/atlas.py | 99 ++++++++++--------- iblatlas/flatmaps.py | 18 ++-- iblatlas/genes.py | 38 ------- iblatlas/genomics/__init__.py | 0 iblatlas/genomics/agea.py | 59 +++++++++++ .../01-make_experiments_parquet.py | 72 ++++++++++++++ .../02-scrape-experiments.py | 49 +++++++++ .../03-scrape-correlations.py | 9 ++ .../04-read-experiments.py | 65 ++++++++++++ .../05-generate-atlas-templates.py | 84 ++++++++++++++++ iblatlas/plots.py | 2 +- iblatlas/tests/test_atlas.py | 40 +++++--- iblatlas/tests/test_genomics.py | 40 ++++++++ pyproject.toml | 3 +- 19 files changed, 523 insertions(+), 131 deletions(-) create mode 100644 examples/atlas_genomics_load_allen_gene_expression_atlas.py delete mode 100644 iblatlas/genes.py create mode 100644 iblatlas/genomics/__init__.py create mode 100644 iblatlas/genomics/agea.py create mode 100644 iblatlas/genomics/gene_expression_scrapping/01-make_experiments_parquet.py create mode 100644 iblatlas/genomics/gene_expression_scrapping/02-scrape-experiments.py create mode 100644 iblatlas/genomics/gene_expression_scrapping/03-scrape-correlations.py create mode 100644 iblatlas/genomics/gene_expression_scrapping/04-read-experiments.py create mode 100644 iblatlas/genomics/gene_expression_scrapping/05-generate-atlas-templates.py create mode 100644 iblatlas/tests/test_genomics.py diff --git a/.flake8 b/.flake8 index 3f09d70..50d87ec 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 130 -ignore = W504, W503, E266 +ignore = W504, W503, E266, D, BLK exclude = .git, __pycache__, diff --git a/CHANGELOG.md b/CHANGELOG.md index ebd49a7..4661b00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## [1.0.0] + +### Added +- `iblatlas.genomics` module for working with genomics data from Allen contains + - the Allen gene expression atlas in the `iblatlas.genomics.agea` module + - the Allen cell types atlas in the `iblatlas.genomics.merfish` module +### Modified +- slices of the atlas are now always returned with consistent sizes regardless of the volume layout on disk +- atlases now can have an extra dimension in the image volume, to allow for multiple layers + ## [0.3.0] ### Modified - Insertion._get_surface_intersection: can return None if and when mode is not set to raise and insertion is not intersecting with any surface diff --git a/README.md b/README.md index 0cc59d5..fa5fd6c 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,24 @@ # iblatlas -This repository contains tools to manipulate 3D representations of the mouse brain anatomy for electrophysiolgy experiments. -The tools are mainly using the the Allen Brain Atlas although some other atlases can be used and are written in Python. +Tools to manipulate hierarchical 3D representations of the mouse brain anatomy for electrophysiolgy experiments. +The tools are mainly using the the Allen CCF although some other atlases can be used. -One of the requirment of this repository is to use minimal requirements, based of standard matplotlib, numpy and scipy libraries, and excludes visualisation tool. +**This repository uses minimal requirements, based of standard `matplotlib`, `numpy` and `scipy` libraries, to the exclusion +of more complex visualization tools such as `pyqt`.** -The documentation can be found here: https://int-brain-lab.github.io/iblenv/notebooks_external/atlas_working_with_ibllib_atlas.html +## Documentation +[Full package documentation](https://int-brain-lab.github.io/iblenv/_autosummary/iblatlas.html#module-iblatlas) + +[Notebooks and examples](https://int-brain-lab.github.io/iblenv/notebooks_external/atlas_working_with_ibllib_atlas.html) ## Installation `pip install iblatlas` - ## Contributing Changes are merged by pull requests. Release checklist: -- [ ] Update version in `iblatlas/__init__.py` -- [ ] Update `CHANGELOG.md` -- [ ] Create a pull request to the `main` branch on GitHub +- [x] Update version in `iblatlas/__init__.py` +- [x] Update `CHANGELOG.md` +- [x] Create a pull request to the `main` branch on GitHub +- [x] Once the PR is merged, create a new tag and push the tag -Once merged the package is uploaded to PyPI using GitHub Actions. +Once a tag is pushed on main the package is uploaded to PyPI using GitHub Actions. diff --git a/examples/atlas_genomics_load_allen_gene_expression_atlas.py b/examples/atlas_genomics_load_allen_gene_expression_atlas.py new file mode 100644 index 0000000..fcff9a0 --- /dev/null +++ b/examples/atlas_genomics_load_allen_gene_expression_atlas.py @@ -0,0 +1,14 @@ +import matplotlib.pyplot as plt +from iblatlas.genomics import agea + +df_genes, gene_expression_volumes, atlas_agea = agea.load() + +igenes = (0,) +fig, axs = plt.subplots(3, 2, sharex=True, sharey=True) +atlas_agea.plot_cslice(0, ax=axs[0, 0]) +atlas_agea.plot_cslice(0, ax=axs[1, 0], volume='annotation') +atlas_agea.plot_cslice(0, ax=axs[2, 0], volume=gene_expression_volumes[igenes[0]], cmap='viridis') +atlas_agea.plot_sslice(0, ax=axs[0, 1]) +atlas_agea.plot_sslice(0, ax=axs[1, 1], volume='annotation') +atlas_agea.plot_sslice(0, ax=axs[2, 1], volume=gene_expression_volumes[igenes[0]], cmap='viridis') +fig.tight_layout() diff --git a/iblatlas/__init__.py b/iblatlas/__init__.py index b522ae2..098b670 100644 --- a/iblatlas/__init__.py +++ b/iblatlas/__init__.py @@ -3,8 +3,7 @@ For examples and tutorials on using the IBL atlas package, see https://docs.internationalbrainlab.org/atlas_examples.html -.. TODO Explain differences between this package and the Allen SDK. -Much of this was adapted from the `cortexlab allenCCF repository `_. + Terminology ----------- @@ -61,11 +60,15 @@ bregma and lambda are at the same DV height) [6]_, however this is not currently accounted for. -Mappings --------- -In addition to the atlases there are also multiple brain region mappings that serve one of two purposes: 1. control the -granularity particular brain regions; 2. support differing anatomical sub-devisions or nomenclature. The two Allen atlas mappings -below were created somewhat arbirarily by Nick Steinmetz to aid in analysis: +Brain Regions Mappings +---------------------- +The Allen CCF ontology regroups brain regions in a hierarchical manner, with up to 9 levels of parcellation. The +labels volumes provided by the Allen Institute are at the finest level of parcellation, ie. leaf nodes. +However it is often useful to regroup regions at higher levels of the hierarchy for aggregation purposes. The mappings +represent the correspondence between the leaf nodes and their grouping for given analysis purposes. This is useful for: + 1. control the granularity particular brain regions; + 2. support differing anatomical sub-devisions or nomenclature. +The two Allen atlas mappings below were created by Nick Steinmetz to aid in neural analysis: 1. Beryl - brain atlas annotations without layer sub-divisions or certain ganglial/nucleus sub-devisisions (e.g. the core/shell sub-division of the lateral geniculate nucleus). Fibre tracts, pia, etc. are also absent. The choice of which areas to combine @@ -90,7 +93,7 @@ Notes ----- The Allen atlas and the CCF annotations have different release dates and versions [10]_. The annotations used by IBL are the 2017 -version. +version (CCFv2). The IBL uses the following conventions: @@ -107,7 +110,7 @@ Examples -------- -Below are some breif API examples. For in depth tutorials on using the IBL atlas package, see +Below are some brief API examples. For in depth tutorials on using the IBL atlas package, see https://docs.internationalbrainlab.org/atlas_examples.html. Find bregma position in indices * resolution in um @@ -138,7 +141,7 @@ `Mappings`_. * **swanson_regions.npy** - A 1D array of length 323 containing the Allen CCF brain region IDs * **mappings.pqt** - A table of mappings. Each column defines a mapping, with the '-lr' suffix indicating a lateralized version. - The rows contain the correspondence of each mapping to the int64 index of the lateralized Allen structure tree. The table is + The rows contain the correspondence of each mapping to the int64 index of the lateralized Allen structure tree. The table is generated by iblatlas.regions.BrainRegions._compute_mappings. Remote files @@ -159,8 +162,6 @@ region were then simplified using the `Ramer Douglas Peucker algorithm `_ * **swanson2allen.npz** - TODO Document who made this, its contents, purpose and data type * **_.nrrd** - TODO Document who made this, its contents, purpose and data type -* **gene-expression.pqt** - TODO Document who made this, its contents, purpose and data type -* **gene-expression.bin** - TODO Document who made this, its contents, purpose and data type. .. [*] The annotation and average template volumes were created from the images provided in Supplemtary Data 4 of Chon et al. [3]_ and stitched together as a single volume using SimpleITK. @@ -192,6 +193,5 @@ ontology and flatmaps. J Comp Neurol. [doi 10.1002/cne.24381] .. [10] Allen Mouse Common Coordinate Framework Technical White Paper (October 2017 v3) http://help.brain-map.org/download/attachments/8323525/Mouse_Common_Coordinate_Framework.pdf - """ -__version__ = '0.3.0' \ No newline at end of file +__version__ = '0.4.0' diff --git a/iblatlas/atlas.py b/iblatlas/atlas.py index 2828c31..b2f1a93 100644 --- a/iblatlas/atlas.py +++ b/iblatlas/atlas.py @@ -148,19 +148,6 @@ def nxyz(self): """numpy.array: Coordinates of the element volume[0, 0, 0] in the coordinate space.""" return np.array([self.nx, self.ny, self.nz]) - """Methods ratios to indices""" - def r2ix(self, r): - # FIXME Document - return int((self.nx - 1) * r) - - def r2iy(self, r): - # FIXME Document - return int((self.nz - 1) * r) - - def r2iz(self, r): - # FIXME Document - return int((self.nz - 1) * r) - """Methods distance to indices""" @staticmethod def _round(i, round=True): @@ -459,10 +446,13 @@ class BrainAtlas: yet this class can be used for other atlases arises. """ - """numpy.array: An image volume.""" + """numpy.array: A 3D image volume.""" image = None - """numpy.array: An annotation label volume.""" + """numpy.array: A 3D annotation label volume.""" label = None + """numpy.array: One or several optional data volumes, the 3 last dimensions should match + the image and the label volumes dimensions""" + volumes = None def __init__(self, image, label, dxyz, regions, iorigin=[0, 0, 0], dims2xyz=[0, 1, 2], xyz2dims=[0, 1, 2]): @@ -478,8 +468,8 @@ def __init__(self, image, label, dxyz, regions, iorigin=[0, 0, 0], self.image = image self.label = label self.regions = regions - self.dims2xyz = dims2xyz - self.xyz2dims = xyz2dims + self.dims2xyz = np.array(dims2xyz) + self.xyz2dims = np.array(xyz2dims) assert np.all(self.dims2xyz[self.xyz2dims] == np.array([0, 1, 2])) assert np.all(self.xyz2dims[self.dims2xyz] == np.array([0, 1, 2])) # create the coordinate transform object that maps volume indices to real world coordinates @@ -492,8 +482,13 @@ def __init__(self, image, label, dxyz, regions, iorigin=[0, 0, 0], @staticmethod def _get_cache_dir(): + """ + ./histology/ATLAS/Needles/Allen + Where . is the main ONE cache directory + :return: pathlib.Path + """ par = one.params.get(silent=True) - path_atlas = Path(par.CACHE_DIR).joinpath('histology', 'ATLAS', 'Needles', 'Allen', 'flatmaps') + path_atlas = Path(par.CACHE_DIR).joinpath('histology', 'ATLAS', 'Needles', 'Allen') return path_atlas def mask(self): @@ -729,7 +724,7 @@ def _plot_slice(im, extent, ax=None, cmap=None, volume=None, **kwargs): cmap : str, matplotlib.colors.Colormap The Colormap instance or registered colormap name used to map scalar data to colors. Defaults to 'bone'. - volume : str + volume : str | np.array If 'boundary', assumes image is an outline of boundaries between all regions. FIXME How does this affect the plot? **kwargs @@ -746,10 +741,11 @@ def _plot_slice(im, extent, ax=None, cmap=None, volume=None, **kwargs): if not cmap: cmap = plt.get_cmap('bone') - if volume == 'boundary': - imb = np.zeros((*im.shape[:2], 4), dtype=np.uint8) - imb[im == 1] = np.array([0, 0, 0, 255]) - im = imb + if isinstance(volume, str): + if volume == 'boundary': + imb = np.zeros((*im.shape[:2], 4), dtype=np.uint8) + imb[im == 1] = np.array([0, 0, 0, 255]) + im = imb ax.imshow(im, extent=extent, cmap=cmap, **kwargs) return ax @@ -795,7 +791,10 @@ def slice(self, coordinate, axis, volume='image', mode='raise', region_values=No - if volume='volume', region_values must have shape ba.image.shape - if volume='value', region_values must have shape ba.regions.id :param mapping: mapping to use. Options can be found using ba.regions.mappings.keys() - :return: 2d array or 3d RGB numpy int8 array + :return: 2d array or 3d RGB numpy int8 array of dimensions: + - 0: nap x ndv (sagittal slice) + - 1: nml x ndv (coronal slice) + - 2: nap x nml (horizontal slice) """ if axis == 0: index = self.bc.x2i(np.array(coordinate), mode=mode) @@ -804,43 +803,53 @@ def slice(self, coordinate, axis, volume='image', mode='raise', region_values=No elif axis == 2: index = self.bc.z2i(np.array(coordinate), mode=mode) - # np.take is 50 thousand times slower than straight slicing ! def _take(vol, ind, axis): + """ + This is a 2 steps process to get the slice along the correct axis + 1) slice the volume according to the mapped axis corresponding to the sclice + we do this because np.take is 50 thousand times slower than straight slicing ! + 2) reshape the output array according to the slice specifications + """ + volume_axis = self.xyz2dims[axis] if mode == 'clip': - ind = np.minimum(np.maximum(ind, 0), vol.shape[axis] - 1) - if axis == 0: - return vol[ind, :, :] - elif axis == 1: - return vol[:, ind, :] - elif axis == 2: - return vol[:, :, ind] + ind = np.minimum(np.maximum(ind, 0), vol.shape[volume_axis] - 1) + if volume_axis == 0: + slic = vol[ind, :, :] + elif volume_axis == 1: + slic = vol[:, ind, :] + elif volume_axis == 2: + slic = vol[:, :, ind] + output_sizes = [[1, 2], [0, 2], [1, 0]] # we expect those sizes where index is the axis + if np.diff(self.xyz2dims[output_sizes[axis]])[0] < 0: + slic = slic.transpose().copy() + return slic def _take_remap(vol, ind, axis, mapping): # For the labels, remap the regions indices according to the mapping return self._get_mapping(mapping=mapping)[_take(vol, ind, axis)] if isinstance(volume, np.ndarray): - return _take(volume, index, axis=self.xyz2dims[axis]) + return _take(volume, index, axis=axis) elif volume in 'annotation': - iregion = _take_remap(self.label, index, self.xyz2dims[axis], mapping) + iregion = _take_remap(self.label, index, axis=axis, mapping=mapping) return self._label2rgb(iregion) elif volume == 'image': - return _take(self.image, index, axis=self.xyz2dims[axis]) + return _take(self.image, index, axis=axis) elif volume == 'value': - return region_values[_take_remap(self.label, index, self.xyz2dims[axis], mapping)] + return region_values[_take_remap(self.label, index, axis, mapping)] elif volume == 'image': - return _take(self.image, index, axis=self.xyz2dims[axis]) + return _take(self.image, index, axis=axis) elif volume in ['surface', 'edges']: self.compute_surface() - return _take(self.surface, index, axis=self.xyz2dims[axis]) + return _take(self.surface, index, axis=axis) elif volume == 'boundary': - iregion = _take_remap(self.label, index, self.xyz2dims[axis], mapping) + iregion = _take_remap(self.label, index, axis, mapping) return self.compute_boundaries(iregion) elif volume == 'volume': if bc is not None: index = bc.xyz2i(np.array([coordinate] * 3))[axis] - return _take(region_values, index, axis=self.xyz2dims[axis]) + return _take(region_values, index, axis=axis) def compute_boundaries(self, values): """ @@ -896,7 +905,6 @@ def plot_cslice(self, ap_coordinate, volume='image', mapping=None, region_values :param **kwargs: matplotlib.pyplot.imshow kwarg arguments :return: matplotlib ax object """ - cslice = self.slice(ap_coordinate, axis=1, volume=volume, mapping=mapping, region_values=region_values) return self._plot_slice(np.moveaxis(cslice, 0, 1), extent=self.extent(axis=1), volume=volume, **kwargs) @@ -964,14 +972,11 @@ def plot_top(self, volume='annotation', mapping=None, region_values=None, ax=Non :param kwargs: :return: """ - self.compute_surface() ix, iy = np.meshgrid(np.arange(self.bc.nx), np.arange(self.bc.ny)) iz = self.bc.z2i(self.top) inds = self._lookup_inds(np.stack((ix, iy, iz), axis=-1)) - regions = self._get_mapping(mapping=mapping)[self.label.flat[inds]] - if volume == 'annotation': im = self._label2rgb(regions) elif volume == 'image': @@ -1290,8 +1295,10 @@ class AllenAtlas(BrainAtlas): """numpy.array: A diffusion weighted imaging (DWI) image volume. - The Allen atlas DWI average template volume has with the shape (ap, ml, dv) and contains uint16 - values. FIXME What do the values represent? + This average template brain was created from images of 1,675 young adult C57BL/6J mouse brains + acquired using serial two-photon tomography. + This volume has a C-ordered shape of (ap, ml, dv) and contains uint16 + values. """ image = None diff --git a/iblatlas/flatmaps.py b/iblatlas/flatmaps.py index a5838ec..22e63b0 100644 --- a/iblatlas/flatmaps.py +++ b/iblatlas/flatmaps.py @@ -19,10 +19,8 @@ class FlatMap(AllenAtlas): - """The Allen Atlas flatmap. + """ - FIXME Document! How are these flatmaps determined? Are they related to the Swansan atlas or is - that something else? """ def __init__(self, flatmap='dorsal_cortex', res_um=25): @@ -46,7 +44,7 @@ def __init__(self, flatmap='dorsal_cortex', res_um=25): def _get_flatmap_from_file(self): # gets the file in the ONE cache for the flatmap name in the property, downloads it if needed - file_flatmap = self._get_cache_dir().joinpath(f'{self.name}_{self.res_um}.nrrd') + file_flatmap = self._get_cache_dir().joinpath('flatmaps', f'{self.name}_{self.res_um}.nrrd') if not file_flatmap.exists(): file_flatmap.parent.mkdir(exist_ok=True, parents=True) aws.s3_download_file(f'atlas/{file_flatmap.name}', file_flatmap) @@ -214,8 +212,12 @@ def circles(N=5, atlas=None, display='flat'): def swanson(filename="swanson2allen.npz"): """ - FIXME Document! Which publication to reference? Are these specifically for flat maps? - Shouldn't this be made into an Atlas class with a mapping or scaling applied? + A rasterized rendition of the Swanson projection of the mouse brain, which is a 2D + representation of the mouse brain. + Each pixel in the image corresponds to a region index in the Allen CCFv2 annotation volume. + [1] J. D. Hahn et al., “An open access mouse brain flatmap and upgraded rat and human + brain flatmaps based on current reference atlases,” + J Comp Neurol, vol. 529, no. 3, pp. 576–594, Feb. 2021, doi: 10.1002/cne.24966. Parameters ---------- @@ -231,7 +233,7 @@ def swanson(filename="swanson2allen.npz"): 'bb0554ecc704dd4b540151ab57f73822', # version 2022-05-02 (remapped) '7722c1307cf9a6f291ad7632e5dcc88b', # version 2022-05-09 (removed wolf pixels and 2 artefact regions) ] - npz_file = AllenAtlas._get_cache_dir().joinpath(filename) + npz_file = AllenAtlas._get_cache_dir().joinpath('flatmaps', filename) if not npz_file.exists() or md5(npz_file) in OLD_MD5: npz_file.parent.mkdir(exist_ok=True, parents=True) _logger.info(f'downloading swanson image from {aws.S3_BUCKET_IBL} s3 bucket...') @@ -260,7 +262,7 @@ def swanson_json(filename="swansonpaths.json", remap=True): # and CUL4 split (on s3 called swansonpaths_56daa.json) 'f848783954883c606ca390ceda9e37d2'] - json_file = AllenAtlas._get_cache_dir().joinpath(filename) + json_file = AllenAtlas._get_cache_dir().joinpath('flatmaps', filename) if not json_file.exists() or md5(json_file) in OLD_MD5: json_file.parent.mkdir(exist_ok=True, parents=True) _logger.info(f'downloading swanson paths from {aws.S3_BUCKET_IBL} s3 bucket...') diff --git a/iblatlas/genes.py b/iblatlas/genes.py deleted file mode 100644 index 34deadc..0000000 --- a/iblatlas/genes.py +++ /dev/null @@ -1,38 +0,0 @@ -"""Gene expression maps.""" -import logging -from pathlib import Path - -import numpy as np -import pandas as pd - -from iblutil.io.hashfile import md5 -import one.remote.aws as aws - -from iblatlas.atlas import AllenAtlas - -_logger = logging.getLogger(__name__) - - -def allen_gene_expression(filename='gene-expression.pqt', folder_cache=None): - """ - Reads in the Allen gene expression experiments binary data. - :param filename: - :param folder_cache: - :return: a dataframe of experiments, where each record corresponds to a single gene expression - and a memmap of all experiments volumes, size (4345, 58, 41, 67) corresponding to - (nexperiments, ml, dv, ap). The spacing between slices is 200 um - """ - OLD_MD5 = [] - DIM_EXP = (4345, 58, 41, 67) - folder_cache = folder_cache or AllenAtlas._get_cache_dir().joinpath(filename) - file_parquet = Path(folder_cache).joinpath('gene-expression.pqt') - file_bin = file_parquet.with_suffix(".bin") - - if not file_parquet.exists() or md5(file_parquet) in OLD_MD5: - file_parquet.parent.mkdir(exist_ok=True, parents=True) - _logger.info(f'downloading gene expression data from {aws.S3_BUCKET_IBL} s3 bucket...') - aws.s3_download_file(f'atlas/{file_parquet.name}', file_parquet) - aws.s3_download_file(f'atlas/{file_bin.name}', file_bin) - df_genes = pd.read_parquet(file_parquet) - gexp_all = np.memmap(file_bin, dtype=np.float16, mode='r', offset=0, shape=DIM_EXP) - return df_genes, gexp_all diff --git a/iblatlas/genomics/__init__.py b/iblatlas/genomics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/iblatlas/genomics/agea.py b/iblatlas/genomics/agea.py new file mode 100644 index 0000000..8788a88 --- /dev/null +++ b/iblatlas/genomics/agea.py @@ -0,0 +1,59 @@ +""" +[1] E. S. Lein et al., “Genome-wide atlas of gene expression in the adult mouse brain,” + Nature, vol. 445, no. 7124, Art. no. 7124, Jan. 2007, doi: 10.1038/nature05453. +[2] L. Ng et al., “An anatomic gene expression atlas of the adult mouse brain,” + Nat Neurosci, vol. 12, no. 3, Art. no. 3, Mar. 2009, doi: 10.1038/nn.2281. +""" +import logging +from pathlib import Path + +import numpy as np +import pandas as pd + +import one.remote.aws as aws + +from iblatlas import atlas + +_logger = logging.getLogger(__name__) + +_, NML, NDV, NAP = DIM_EXP = (4345, 58, 41, 67) # nexperiments, nml, ndv, nap + + +def load(folder_cache=None, expression_size=DIM_EXP): + """ + Reads in the Allen gene expression experiments binary data. + Generation scripts from the Allen Institute are available in the gene-expression-scrapping folder + :param filename: + :param folder_cache: + :return: + a dataframe of experiments (4345, 2), where each record corresponds to a single gene expression + a memmap of all experiments brain volumes, size (4345, 58, 41, 67) corresponding to + (nexperiments, ml, dv, ap). The spacing between slices is 200 um + a brainatlas object with the labels and coordinates matching the gene expression volumes + """ + OLD_VERSIONS = ['2023-06-12'] + folder_cache = Path(folder_cache or atlas.AllenAtlas._get_cache_dir().joinpath('agea')) + # check the AWS version and download the files if needed + version_flag = next(folder_cache.glob('*.version'), None) + if version_flag is None or version_flag.stem in OLD_VERSIONS: + _logger.info(f'downloading gene expression data from {aws.S3_BUCKET_IBL} s3 bucket...') + aws.s3_download_folder('atlas/agea', folder_cache) + + # load the genes dataframe and the gene expression volumes + file_parquet = Path(folder_cache).joinpath('gene-expression.pqt') + file_expression = Path(folder_cache).joinpath('gene-expression.bin') + df_genes = pd.read_parquet(file_parquet) + expression_volumes = np.memmap(file_expression, dtype=np.float16, mode='r', offset=0, shape=expression_size) + + # create a brain atlas object with the gene expression volume geometry and pre-computed labels + atlas_agea = atlas.BrainAtlas( + image=np.load(Path(folder_cache).joinpath('image.npy')), + label=np.load(Path(folder_cache).joinpath('label.npy')), + dxyz=200 / 1e6 * np.array([1, -1, -1]), + regions=atlas.BrainRegions(), + iorigin=atlas.ALLEN_CCF_LANDMARKS_MLAPDV_UM['bregma'] / 200 + np.array([0, 0, 0]), + dims2xyz=[0, 2, 1], + xyz2dims=[0, 2, 1] + ) + + return df_genes, expression_volumes, atlas_agea diff --git a/iblatlas/genomics/gene_expression_scrapping/01-make_experiments_parquet.py b/iblatlas/genomics/gene_expression_scrapping/01-make_experiments_parquet.py new file mode 100644 index 0000000..2bd7c2e --- /dev/null +++ b/iblatlas/genomics/gene_expression_scrapping/01-make_experiments_parquet.py @@ -0,0 +1,72 @@ +""" +Module to download and format gene expression atlas data from Allen Institute AGEA project +https://mouse.brain-map.org/agea +https://www.nature.com/articles/nature05453 +http://help.brain-map.org/display/mousebrain/Documentation + +volumes for all genes (size 51533 voxels x 4376 genes) ~ .84 Gb float32 +correlation volumes (size 51533 voxels x 51533 genes) ~ 9.89 Gb float32 + +They are 200 micron voxel + +ElementSpacing = 200 200 200 +DimSize = 67 41 58 (159_326 voxels) + +Which means you need to query: +X = 0 to (66*200) +Y = 0 to (40*200) +Z = 0 to (57*200) +""" +# %% +from pathlib import Path +import numpy as np +import pandas as pd + + +folder_download = Path("/datadisk/gdrive/2022/08_gene_expression_atlas") +file_parquet = folder_download.joinpath('experiments.pqt') + + +def make_experiments_parquet(file_parquet): + """ + Query experiments and gene acronyms for the xml API and write dataframe + with ids and corresponding acronyms + :param file_parquet: + :return: + """ + import requests + from xml.etree import ElementTree + from copy import copy + + http_query = ("http://api.brain-map.org/api/v2/data/query.xml?criteria=model::SectionDataSet,rma::" + "criteria,[failed$eqfalse],products[id$eq1],plane_of_section[name$eqcoronal]," + "treatments[name$eqISH],rma::include,genes,rma::options[num_rows$eqall]") + response = requests.get(http_query) + tree = ElementTree.fromstring(response.text) + + def dictify(r, root=True, tags=None): + if root: + return {r.tag: dictify(r, False)} + d = copy(r.attrib) + if r.text and r.text.strip() != '': + d = r.text.strip() + for x in r.findall("./*"): + if x.tag not in d: + d[x.tag] = [] + d[x.tag].append(dictify(x, False)) + return d + + datasets = dictify(tree)['Response']['section-data-sets'][0]['section-data-set'] + ids = np.zeros(len(datasets), dtype=np.int64) + genes = [] + for i, ds in enumerate(datasets): + ids[i] = ds['id'][0] + genes.append(ds['genes'][0]['gene'][0]['acronym'][0]) + + df_genes = pd.DataFrame(data=np.c_[ids, np.array(genes)], columns=['id', 'gene']) + df_genes.to_parquet(file_parquet) + + +if not file_parquet.exists(): + make_experiments_parquet(file_parquet) +df_experiments = pd.read_parquet(file_parquet) diff --git a/iblatlas/genomics/gene_expression_scrapping/02-scrape-experiments.py b/iblatlas/genomics/gene_expression_scrapping/02-scrape-experiments.py new file mode 100644 index 0000000..51ec768 --- /dev/null +++ b/iblatlas/genomics/gene_expression_scrapping/02-scrape-experiments.py @@ -0,0 +1,49 @@ +from pathlib import Path +import subprocess +import shutil + +import numpy as np +import pandas as pd +from zipfile import ZipFile +import skimage.io as io + +folder_download = Path("/datadisk/gdrive/2022/08_gene_expression_atlas") +file_parquet = folder_download.joinpath('gene-expression.pqt') +folder_experiments = folder_download.joinpath('experiments') +experiments = pd.read_parquet(file_parquet) + + +# %% scrape all zip files +folder_experiments.mkdir(exist_ok=True) +for i, exp in experiments.iterrows(): + file_experiment = folder_experiments.joinpath(f"{exp.id}_{exp.gene}.zip") + if not file_experiment.exists(): + print(i, file_experiment) + cmd = f"wget http://api.brain-map.org/grid_data/download/{exp.id} -P {folder_experiments}" + process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + info, error = process.communicate() + shutil.move(folder_experiments.joinpath(exp.id), file_experiment) + + +# %% Interpret +file_out_npy = folder_download.joinpath("gene-expression.bin") +scratch_dir = folder_download.joinpath('scratch') + +with open(file_out_npy, 'wb+') as fp: + for i, exp in experiments.iterrows(): + print(i, file_experiment) + file_experiment = folder_experiments.joinpath(f"{exp.id}_{exp.gene}.zip") + scratch_dir.mkdir(exist_ok=True) + with ZipFile(file_experiment, 'r') as zip: + zip.extractall(scratch_dir) + + # (58, 41, 67) corresponds to ML, DV, AP + mhd_file = next(scratch_dir.glob('*.mhd')) + img = io.imread(mhd_file, plugin='simpleitk') + + shutil.rmtree(scratch_dir) + + img.astype(np.float16).tofile(fp) + # import matplotlib.pyplot as plt + # plt.imshow(np.squeeze(img[:, :, 25].T)) diff --git a/iblatlas/genomics/gene_expression_scrapping/03-scrape-correlations.py b/iblatlas/genomics/gene_expression_scrapping/03-scrape-correlations.py new file mode 100644 index 0000000..9503d0d --- /dev/null +++ b/iblatlas/genomics/gene_expression_scrapping/03-scrape-correlations.py @@ -0,0 +1,9 @@ +# download xross correlation values +from pathlib import Path +from one.webclient import http_download_file +url = "https://mouse.brain-map.org/agea/data/P56/download?seed=6800,4200,5600&age=P56" +zip_name = "6800_4200_5600.zip" +folder_download = Path("/datadisk/gdrive/2022/08_gene_expression_atlas") + +fn = http_download_file(url, target_dir=folder_download) +Path(fn).rename(zip_name) diff --git a/iblatlas/genomics/gene_expression_scrapping/04-read-experiments.py b/iblatlas/genomics/gene_expression_scrapping/04-read-experiments.py new file mode 100644 index 0000000..1a116a4 --- /dev/null +++ b/iblatlas/genomics/gene_expression_scrapping/04-read-experiments.py @@ -0,0 +1,65 @@ +from pathlib import Path + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.neural_network import MLPClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import train_test_split + +from ibllib.atlas import AllenAtlas, BrainCoordinates + +DIM_EXP = (4345, 58, 41, 67) +ne, nml, ndv, nap = DIM_EXP +folder_download = Path("/datadisk/gdrive/2022/08_gene_expression_atlas") +file_parquet = folder_download.joinpath('gene-expression.pqt') +file_bin = file_parquet.with_suffix(".bin") + +folder_experiments = folder_download.joinpath('experiments') +experiments = pd.read_parquet(file_parquet) + +ba = AllenAtlas() + +gexp_all = np.memmap(file_bin, dtype=np.float16, mode='r', offset=0, shape=DIM_EXP) + +# %% (58, 41, 67) corresponds to ML, DV, AP +ie = np.where(experiments['gene'] == 'Pantr1')[0][0] + +gexp = gexp_all[ie, :, :, :].astype(np.float32) +gexp[gexp <= 0] = np.nan +bc = BrainCoordinates(nxyz=[nml, nap, ndv], dxyz=[200, 200, 200]) + +xyz = np.array([0, 0, -0.002]) +ba.plot_slices(xyz, volume="annotation") + +iml, iap, idv = np.round(ba.xyz2ccf(xyz) / 200).astype(np.int32) + +# BrainAtlas(image=gexp, label, dxyz, regions, iorigin=[0, 0, 0], +# dims2xyz=[0, 1, 2], xyz2dims=[0, 1, 2] +fig, axs = plt.subplots(2, 2) +axs[0, 0].imshow(np.squeeze(gexp[:, :, iap]).T) +axs[0, 1].imshow(np.squeeze(gexp[iml, :, :])) +axs[1, 0].imshow(np.squeeze(gexp[:, idv, :])) + + +# %% This cell takes +ccf_coords = np.meshgrid(*[np.arange(DIM_EXP[i]) * 200 for i in [1, 2, 3]]) # ml, dv, ap +xyzs = ba.ccf2xyz(np.c_[ccf_coords[0].flatten(), ccf_coords[2].flatten(), ccf_coords[1].flatten()]) +aids = ba.get_labels(xyzs, mode='clip', mapping='Cosmos') + +sel = ~np.logical_or(aids == 0, aids == 997) # remove void and root voxels +# reshape in a big array nexp x nvoxels +gexps = gexp_all.reshape((ne, nml * nap * ndv))[:, sel].astype(np.float32).transpose() +xyzs = xyzs[sel] +aids = aids[sel] + +# %% ok ready for the learning part +X_train, X_test, y_train, y_test = train_test_split(gexps, aids, stratify=aids) +scaler = StandardScaler() +scaler.fit(gexps) +X_train = scaler.transform(X_train) +X_test = scaler.transform(X_test) +clf = MLPClassifier(random_state=1, max_iter=300, verbose=True).fit(X_train, y_train) +clf.predict_proba(X_test[:1]) +clf.predict(X_test) +clf.score(X_test, y_test) diff --git a/iblatlas/genomics/gene_expression_scrapping/05-generate-atlas-templates.py b/iblatlas/genomics/gene_expression_scrapping/05-generate-atlas-templates.py new file mode 100644 index 0000000..cc6b187 --- /dev/null +++ b/iblatlas/genomics/gene_expression_scrapping/05-generate-atlas-templates.py @@ -0,0 +1,84 @@ +""" +When looking at one or a few genes it is possible to upsample the gene expression volume +to the 25um or even the 10um resolution of the Allen CCF. +However when looking at thousand of genes, it is more feasible to keep the 200um resolution +Here we are downsampling a version of the CCF to 200um resolution that matches the gene expression atlas to +use as a label volume and to compute the brain coordinates of the gene expression voxels +""" +# %% +from pathlib import Path + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import scipy.stats + +from iblatlas import atlas + +DIM_EXP = (4345, 58, 41, 67) # nexperiments, nml, ndv, nap +# path_agea_cache = atlas.AllenAtlas._get_cache_dir().joinpath('agea') +path_agea_cache = Path('/Users/olivier/Downloads/ONE/alyx.internationalbrainlab.org/histology/ATLAS/Needles/Allen/agea') +df_genes = pd.read_parquet(Path(path_agea_cache).joinpath('gene-expression.pqt')) +gexp_all = np.memmap(Path(path_agea_cache).joinpath('gene-expression.bin'), dtype=np.float16, mode='r', offset=0, shape=DIM_EXP) + +# create a brain atlas object with the gene expression volume geometry +bg = atlas.BrainAtlas( + image=gexp_all[0], + label=gexp_all[0] != -1, + dxyz=200 / 1e6 * np.array([1, -1, -1]), + regions=atlas.BrainRegions(), + iorigin=atlas.ALLEN_CCF_LANDMARKS_MLAPDV_UM['bregma'] / 200 + np.array([0, 0, 0]), + dims2xyz=[0, 2, 1], + xyz2dims=[0, 2, 1] +) +# instantiate the allen atlas at 25um resolution and compute the transform to the agea volume +ba = atlas.AllenAtlas(res_um=25) +xia2g = bg.bc.x2i(ba.bc.xscale) +yia2g = bg.bc.y2i(ba.bc.yscale) +zia2g = bg.bc.z2i(ba.bc.zscale) +# meshgrid the coordinates and aggregate per agea voxel, mean for image, and mode for the label +# (this takes less than a minute) +Xia2g, Yia2g, Zia2g = [aa.flatten() for aa in np.meshgrid(xia2g, yia2g, zia2g)] +df = pd.DataFrame( + np.c_[Xia2g, Yia2g, Zia2g, ba.image.flatten(), ba.label.flatten()], + columns=['ix', 'iy', 'iz', 'image', 'label']) +gene_expression_labels = df.groupby(['ix', 'iy', 'iz']).agg( + image=pd.NamedAgg(column="image", aggfunc="mean"), + label=pd.NamedAgg(column="label", aggfunc=lambda x: scipy.stats.mode(x)[0][0]), +).reset_index() + +# create the image and label volume for the agea brain atlas +image = np.zeros(gexp_all[0].shape, dtype=np.float32) +label = np.zeros(gexp_all[0].shape, dtype=np.int16) +dim_columns = [['ix', 'iy', 'iz'][i] for i in bg.xyz2dims] +linear_indices = np.ravel_multi_index([gene_expression_labels[c].values for c in dim_columns], image.shape) +np.put(image, linear_indices, gene_expression_labels['image'].values) +np.put(label, linear_indices, gene_expression_labels['label'].values) + +# make sure the transform went well by plotting a coronal and a sagittal slice from +# the same coordinate in both atlases +np.save(Path(path_agea_cache).joinpath('image.npy'), image) +np.save(Path(path_agea_cache).joinpath('label.npy'), label) +bg.image = image +bg.label = label + +igenes = (0, 2325) +fig, axs = plt.subplots(3, 2, sharex=True, sharey=True) +ba.plot_cslice(0, ax=axs[0, 0]) +bg.plot_cslice(0, ax=axs[1, 0]) +ba.plot_cslice(0, ax=axs[0, 1], volume='annotation') +bg.plot_cslice(0, ax=axs[1, 1], volume='annotation') +bg.plot_cslice(0, ax=axs[2, 0], volume=gexp_all[igenes[0]], cmap='viridis') +bg.plot_cslice(0, ax=axs[2, 1], volume=gexp_all[igenes[1]], cmap='viridis') +fig.tight_layout() + +fig, axs = plt.subplots(3, 2, sharex=True, sharey=True) +ba.plot_sslice(0, ax=axs[0, 0]) +bg.plot_sslice(0, ax=axs[1, 0]) +ba.plot_sslice(0, ax=axs[0, 1], volume='annotation') +bg.plot_sslice(0, ax=axs[1, 1], volume='annotation') +bg.plot_sslice(0, ax=axs[2, 0], volume=gexp_all[igenes[0]], cmap='viridis') +bg.plot_sslice(0, ax=axs[2, 1], volume=gexp_all[igenes[1]], cmap='viridis') +fig.tight_layout() + +# %% diff --git a/iblatlas/plots.py b/iblatlas/plots.py index 1e2e8d7..d7c6afc 100644 --- a/iblatlas/plots.py +++ b/iblatlas/plots.py @@ -231,7 +231,7 @@ def load_slice_files(slice, mapping): 'top': [] } - slice_file = AllenAtlas._get_cache_dir().parent.joinpath('svg', f'{slice}_{mapping}_paths.npy') + slice_file = AllenAtlas._get_cache_dir().joinpath('svg', f'{slice}_{mapping}_paths.npy') if not slice_file.exists() or md5(slice_file) in OLD_MD5[slice]: slice_file.parent.mkdir(exist_ok=True, parents=True) _logger.info(f'downloading swanson paths from {aws.S3_BUCKET_IBL} s3 bucket...') diff --git a/iblatlas/tests/test_atlas.py b/iblatlas/tests/test_atlas.py index d8a242d..49d9aba 100644 --- a/iblatlas/tests/test_atlas.py +++ b/iblatlas/tests/test_atlas.py @@ -3,7 +3,7 @@ import numpy as np import matplotlib.pyplot as plt -from iblatlas.atlas import (BrainCoordinates, cart2sph, sph2cart, Trajectory, +from iblatlas.atlas import (BrainCoordinates, cart2sph, sph2cart, Trajectory, BrainAtlas, Insertion, ALLEN_CCF_LANDMARKS_MLAPDV_UM, AllenAtlas) from iblatlas.regions import BrainRegions, FranklinPaxinosRegions from iblatlas.plots import prepare_lr_data, reorder_data @@ -420,21 +420,35 @@ def test_plot_slices(self): axs = self.ba.plot_slices(np.array([0, -.0024, -.0038])) assert axs.shape == (2, 2) - def test_slice(self): - ba = self.ba - nx, ny, nz = ba.bc.nxyz - # tests output shapes - self.assertTrue(ba.slice(axis=0, coordinate=0).shape == (ny, nz)) # sagittal - self.assertTrue(ba.slice(axis=1, coordinate=0).shape == (nx, nz)) # coronal - self.assertTrue(ba.slice(axis=2, coordinate=-.002).shape == (ny, nx)) # horizontal - # tests out of bound - with self.assertRaises(ValueError): - ba.slice(axis=1, coordinate=123) - self.assertTrue(ba.slice(axis=1, coordinate=21, mode='clip').shape == (nx, nz)) + def test_slice_axes(self): + nx, nz, ny = (58, 41, 67) + ba_gene_atlas_like = BrainAtlas( + image=np.random.randn(nx, nz, ny), + label=np.ones((nx, nz, ny)), + dxyz=200 / 1e6, + regions=BrainRegions(), + iorigin=[0, 0, 0], + dims2xyz=[0, 2, 1], + xyz2dims=[0, 2, 1] + ) + for k, ba in {'allen': self.ba, 'gene_expression': ba_gene_atlas_like}.items(): + with self.subTest(ba=k): + nx, ny, nz = ba.bc.nxyz + # tests output shapes + self.assertTrue(ba.slice(axis=0, coordinate=0).shape == (ny, nz)) # sagittal + self.assertTrue(ba.slice(axis=1, coordinate=0).shape == (nx, nz)) # coronal + self.assertTrue(ba.slice(axis=2, coordinate=.0002).shape == (ny, nx)) # horizontal + # tests out of bound + with self.assertRaises(ValueError): + ba.slice(axis=1, coordinate=123) + self.assertTrue(ba.slice(axis=1, coordinate=21, mode='clip').shape == (nx, nz)) # another coronal + + def test_slice_volumes(self): """ here we test the different volumes and mappings """ - # the label volume contains the region index (not id!), + ba = self.ba + nx, ny, nz = ba.bc.nxyz iregions = ba.slice(axis=0, coordinate=0, volume=ba.label) assert np.all(np.unique(iregions) == np.array([0, 1327])) rgb_slice = ba.slice(axis=0, coordinate=0, volume='annotation') diff --git a/iblatlas/tests/test_genomics.py b/iblatlas/tests/test_genomics.py new file mode 100644 index 0000000..75ff9e4 --- /dev/null +++ b/iblatlas/tests/test_genomics.py @@ -0,0 +1,40 @@ +import unittest +import tempfile +from pathlib import Path +import unittest.mock + +import numpy as np +import pandas as pd + +from iblatlas.genomics import agea + + +class TestLoadAgea(unittest.TestCase): + + def test_load(self): + """ + Tests the mechanics of the load function by faking a 3 experiments gene expression dataset + Here we just want to make sure the function returns the right dataframes and volumes, + and that the atlas reads the right label and image volumes + """ + with tempfile.TemporaryDirectory() as folder_cache: + df_genes = pd.DataFrame({ + 'id': np.array(['74658173', '71247618', '74511936']), + 'gene': ['Ndufa10', 'Grik2', 'Dmp1'] + }) + df_genes.to_parquet(Path(folder_cache).joinpath('gene-expression.pqt')) + expression_matrices = np.random.rand(3, *agea.DIM_EXP[1:]) + expression_matrices.tofile(Path(folder_cache).joinpath('gene-expression.bin')) + image = np.random.rand(*agea.DIM_EXP[1:]) + np.save(Path(folder_cache).joinpath('image.npy'), image) + label = np.random.rand(*agea.DIM_EXP[1:]) + np.save(Path(folder_cache).joinpath('label.npy'), label) + Path(folder_cache).joinpath('toto.version').touch() + df_genes, gene_expression_volumes, atlas_agea = agea.load( + folder_cache=folder_cache, expression_size=(3, 58, 41, 67)) + self.assertEqual(df_genes.shape, (3, 2)) + self.assertEqual(gene_expression_volumes.shape, (3, 58, 41, 67)) + self.assertEqual(atlas_agea.image.shape, (58, 41, 67)) + self.assertEqual(atlas_agea.label.shape, (58, 41, 67)) + # on windows we need to close the memmap for the tempdir to be deleted + gene_expression_volumes._mmap.close() diff --git a/pyproject.toml b/pyproject.toml index 2a0f4fc..4718e52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,4 +35,5 @@ version = {attr = "iblatlas.__version__"} include-package-data = true [tool.setuptools.packages.find] -exclude = ["iblatlas.tests*", "examples*"] # exclude packages matching these glob patterns +# exclude packages matching these glob patterns +exclude = ["iblatlas.tests*", "examples*", "iblatlas.genomics.gene_expression_scrapping"]