From e09c88937fb08b25da47fb70e50b4b2ef598eb96 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Thu, 22 Feb 2024 08:54:10 +0100 Subject: [PATCH 1/9] fast fit average densities --- atlas_densities/densities/fitting.py | 99 +++++++++++++++++++++++++++- tests/densities/test_fitting.py | 12 ++-- 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 7d2c440..7181e6e 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -214,10 +214,105 @@ def compute_average_intensity( return 0.0 +#from atlas_densities.densities import utils +#import voxcell +#region_map = voxcell.RegionMap.load_json('input-data/hierarchy_ccfv2_l23split_barrelsplit.json') +# +#region_name = 'root' +#hierarchy_info = utils.get_hierarchy_info(region_map, root=region_name) + +def add_depths(region_map_df, root_id): + region_map_df['depth'] = 0 + parent_ids = [root_id, ] + depth = 1 + while parent_ids: + children_ids = region_map_df[np.isin(region_map_df['parent_id'], list(parent_ids))].index + region_map_df.loc[children_ids, 'depth'] = depth + parent_ids = set(children_ids) + depth += 1 + +def fill_densities(region_map, region_map_df, df): + order_idx = np.argsort(region_map_df.depth.to_numpy()) + + for idx in region_map_df.index[order_idx[::-1]]: + if idx not in region_map._children: + continue + children = region_map._children[idx] + if not children: + continue + print(region_map_df.loc[idx, 'name'], region_map_df.loc[idx, 'depth']) + voxel_count = df.loc[children, "voxel_count"] + count = voxel_count.sum() + if count: + df.loc[idx, "voxel_count"] = count + df.loc[idx, "density"] = np.average(df.loc[children, "density"], weights=voxel_count) + + def compute_average_intensities( annotation: AnnotationT, gene_marker_volumes: MarkerVolumes, hierarchy_info: pd.DataFrame, + region_map, +) -> pd.DataFrame: + + densities = {k: pd.DataFrame( + data=0.0, + index=hierarchy_info.index, + columns=['voxel_count', 'density'], + ) for k in gene_marker_volumes} + + for id_ in np.unique(annotation): + if id_ == 0: + continue + mask = annotation == id_ + for marker, intensity in gene_marker_volumes.items(): + intensity, slices = intensity["intensity"], intensity["slices"] + + if slices is None: + restricted_mask = mask + else: + restricted_mask = np.zeros_like(mask, dtype=bool) + slices_ = [slice_ for slice_ in slices if 0 <= slice_ < mask.shape[0]] + restricted_mask[slices_] = True + restricted_mask = np.logical_and(restricted_mask, mask) + + count = restricted_mask.sum() + if count <= 0: + continue + + mean_density = np.mean(intensity[restricted_mask]) + if mean_density == 0.: + #print(f"Mean density for {hierarchy_info.loc[id_]} and {marker}") + breakpoint() # XXX BREAKPOINT + pass + + densities[marker.lower()].loc[id_]['voxel_count'] = count + densities[marker.lower()].loc[id_]['density'] = mean_density + + hierarchy_info = hierarchy_info.set_index("brain_region") + result = pd.DataFrame( + data=np.nan, + index=hierarchy_info.index, + columns=[marker_name.lower() for marker_name in gene_marker_volumes.keys()], + ) + + region_map_df = region_map.as_dataframe() + add_depths(region_map_df, 997) + + for marker in gene_marker_volumes: + df = densities[marker.lower()] + fill_densities(region_map, region_map_df, df) + df['name'] = region_map_df.loc[df.index].name + df = df.set_index('name') + result[marker.lower()] = df['density'] + + return result + + +def compute_average_intensities1( + annotation: AnnotationT, + gene_marker_volumes: MarkerVolumes, + hierarchy_info: pd.DataFrame, ) -> pd.DataFrame: """ Compute the average marker intensity of every region in `hierarchy_info` for every marker @@ -732,9 +827,7 @@ def linear_fitting( # pylint: disable=too-many-arguments invert=True, ) average_intensities = compute_average_intensities( - annotation, - gene_marker_volumes, - hierarchy_info.drop(hierarchy_info.index[indexes]), + annotation, gene_marker_volumes, hierarchy_info, region_map ) L.info("Computing fitting coefficients ...") diff --git a/tests/densities/test_fitting.py b/tests/densities/test_fitting.py index 51be985..4047a2c 100644 --- a/tests/densities/test_fitting.py +++ b/tests/densities/test_fitting.py @@ -50,7 +50,7 @@ def test_create_dataframe_from_known_densities(): @pytest.fixture -def hierarchy_info(): +def region_map(): hierarchy = { "id": 8, "name": "Basic cell groups and regions", @@ -104,7 +104,11 @@ def hierarchy_info(): ], } - return utils.get_hierarchy_info(RegionMap.from_dict(hierarchy)) + return RegionMap.from_dict(hierarchy) + +@pytest.fixture +def hierarchy_info(region_map): + return utils.get_hierarchy_info(region_map) def test_fill_in_homogenous_regions(hierarchy_info): @@ -226,7 +230,7 @@ def test_compute_average_intensity(): assert actual == 0 -def test_compute_average_intensities(hierarchy_info): +def test_compute_average_intensities(region_map, hierarchy_info): annotation = np.array( [[[0, 976], [976, 936]], [[976, 936], [936, 936]]] # 976 = Lobule II, 936 = "Declive (VI)"" ) @@ -261,7 +265,7 @@ def test_compute_average_intensities(hierarchy_info): index=hierarchy_info["brain_region"], ) - actual = tested.compute_average_intensities(annotation, marker_volumes, hierarchy_info) + actual = tested.compute_average_intensities(annotation, marker_volumes, hierarchy_info, region_map) pdt.assert_frame_equal(actual, expected) From 5a0d86cc4e0c90503d9b625d47883bd5d60588b8 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Thu, 22 Feb 2024 18:32:42 +0100 Subject: [PATCH 2/9] joblib --- atlas_densities/densities/fitting.py | 147 +++++++++++++++++++-------- 1 file changed, 105 insertions(+), 42 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 7181e6e..0d7cc21 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -221,9 +221,9 @@ def compute_average_intensity( #region_name = 'root' #hierarchy_info = utils.get_hierarchy_info(region_map, root=region_name) -def add_depths(region_map_df, root_id): +def add_depths(region_map_df): region_map_df['depth'] = 0 - parent_ids = [root_id, ] + parent_ids = [-1, ] depth = 1 while parent_ids: children_ids = region_map_df[np.isin(region_map_df['parent_id'], list(parent_ids))].index @@ -231,21 +231,69 @@ def add_depths(region_map_df, root_id): parent_ids = set(children_ids) depth += 1 + def fill_densities(region_map, region_map_df, df): order_idx = np.argsort(region_map_df.depth.to_numpy()) - for idx in region_map_df.index[order_idx[::-1]]: - if idx not in region_map._children: + ids = region_map_df.index[order_idx[::-1]] + for id_ in ids: + #if id_ == 607 or id_ == 112: + # breakpoint() # XXX BREAKPOINT + + if id_ not in region_map._children: continue - children = region_map._children[idx] + children = region_map._children[id_] if not children: continue - print(region_map_df.loc[idx, 'name'], region_map_df.loc[idx, 'depth']) + #print(region_map_df.loc[idx, 'name'], region_map_df.loc[idx, 'depth']) voxel_count = df.loc[children, "voxel_count"] count = voxel_count.sum() if count: - df.loc[idx, "voxel_count"] = count - df.loc[idx, "density"] = np.average(df.loc[children, "density"], weights=voxel_count) + df.loc[id_, "voxel_count"] = count + df.loc[id_, "density"] = np.average(df.loc[children, "density"], weights=voxel_count) + + +from joblib import delayed, Parallel + +def _apply_density_slices(gene_marker_volumes): + ret = {} + for marker, intensity in gene_marker_volumes.items(): + intensity, slices = intensity["intensity"], intensity["slices"] + + if slices is None: + mask = np.ones_like(intensity, dtype=bool) + else: + mask = np.zeros_like(intensity, dtype=bool) + slices_ = [slice_ for slice_ in slices if 0 <= slice_ < mask.shape[0]] + mask[slices_] = True + + ret[marker] = { + 'intensity': intensity, + 'slices': slices, + 'mask': mask + } + + return ret + + +def _compute_average_intensities_helper(annotation, gene_marker_volumes, id_): + mask = annotation == id_ + res = [] + for marker, intensity in gene_marker_volumes.items(): + restricted_mask = np.logical_and(intensity['mask'], mask) + count = restricted_mask.sum() + + if count <= 0: + continue + + mean_density = np.mean(intensity['intensity'][restricted_mask]) + if mean_density == 0.: + #print(f"Mean density for {hierarchy_info.loc[id_]} and {marker}") + breakpoint() # XXX BREAKPOINT + pass + res.append((marker.lower(), id_, count, mean_density)) + + return res def compute_average_intensities( @@ -255,60 +303,75 @@ def compute_average_intensities( region_map, ) -> pd.DataFrame: - densities = {k: pd.DataFrame( - data=0.0, - index=hierarchy_info.index, - columns=['voxel_count', 'density'], - ) for k in gene_marker_volumes} + gene_marker_volumes = _apply_density_slices(gene_marker_volumes) + _helper = delayed(_compute_average_intensities_helper) + work = [] for id_ in np.unique(annotation): if id_ == 0: continue - mask = annotation == id_ - for marker, intensity in gene_marker_volumes.items(): - intensity, slices = intensity["intensity"], intensity["slices"] + work.append(_helper(annotation, gene_marker_volumes, id_)) - if slices is None: - restricted_mask = mask - else: - restricted_mask = np.zeros_like(mask, dtype=bool) - slices_ = [slice_ for slice_ in slices if 0 <= slice_ < mask.shape[0]] - restricted_mask[slices_] = True - restricted_mask = np.logical_and(restricted_mask, mask) - count = restricted_mask.sum() - if count <= 0: - continue + res = Parallel(n_jobs=-2)(work) + densities = (pd.DataFrame(list(it.chain(*res)), + columns=['marker', 'id', 'voxel_count', 'density']) + .set_index('id') + .pivot(columns='marker') + .swaplevel(axis=1) + ) - mean_density = np.mean(intensity[restricted_mask]) - if mean_density == 0.: - #print(f"Mean density for {hierarchy_info.loc[id_]} and {marker}") - breakpoint() # XXX BREAKPOINT - pass - - densities[marker.lower()].loc[id_]['voxel_count'] = count - densities[marker.lower()].loc[id_]['density'] = mean_density + region_map_df = region_map.as_dataframe() + add_depths(region_map_df) - hierarchy_info = hierarchy_info.set_index("brain_region") + #hierarchy_info = hierarchy_info.set_index("brain_region") + #gene_marker_volumes = ['gad67', 'pv', 'sst', 'vip'] result = pd.DataFrame( data=np.nan, index=hierarchy_info.index, - columns=[marker_name.lower() for marker_name in gene_marker_volumes.keys()], + columns=[marker_name.lower() for marker_name in gene_marker_volumes], ) - - region_map_df = region_map.as_dataframe() - add_depths(region_map_df, 997) + result['name'] = region_map_df.loc[result.index].name for marker in gene_marker_volumes: - df = densities[marker.lower()] + df = pd.DataFrame(data=0.0, index=result.index, columns=['voxel_count', 'density']) + df.update(densities[marker.lower()]) fill_densities(region_map, region_map_df, df) - df['name'] = region_map_df.loc[df.index].name - df = df.set_index('name') result[marker.lower()] = df['density'] + result = result.set_index('name') return result +''' +from atlas_densities.densities import fitting +from voxcell import VoxelData, RegionMap +from atlas_densities.densities import utils + +annotation = VoxelData.load_nrrd('input-data/annotation_ccfv2_l23split_barrelsplit_validated.nrrd') + +gene_voxeldata = { + 'gad67': VoxelData.load_nrrd('input-data/gene_gad67_correctednissl.nrrd'), + 'pv': VoxelData.load_nrrd('input-data/gene_pv_correctednissl.nrrd'), + 'sst': VoxelData.load_nrrd('input-data/gene_sst_correctednissl.nrrd'), + 'vip': VoxelData.load_nrrd('input-data/gene_vip_correctednissl.nrrd'), +} +gids = { 'gad67': '479', 'pv': '868', 'sst': '1001', 'vip': '77371835', } +slices = utils.load_json('input-data/realigned_slices.json') +gene_marker_volumes = { + gene: { + "intensity": gene_data.raw, + "slices": slices[gids[gene]], # list of integer slice indices + } + for (gene, gene_data) in gene_voxeldata.items() +} +region_map = RegionMap.load_json('input-data/hierarchy_ccfv2_l23split_barrelsplit.json') +hierarchy_info = utils.get_hierarchy_info(region_map, root='root') + +fitting.compute_average_intensities(annotation, gene_marker_volumes, hierarchy_info, region_map) +breakpoint() # XXX BREAKPOINT +''' + def compute_average_intensities1( annotation: AnnotationT, gene_marker_volumes: MarkerVolumes, From d33a1cd53ac42b55081dd541393da7eec09e1885 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Fri, 1 Mar 2024 15:49:46 +0100 Subject: [PATCH 3/9] faster compute_average_intensities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most of the speed up comes from being able to parallelize across cores: old: 1920s new: 2min 26s ± 199 ms so about 13 times faster Small change in computed densities: (Pdb++) np.abs(result.to_numpy() - old.to_numpy()).max() 9.518734625513225e-08 Test code: ``` from atlas_densities.densities import fitting from voxcell import VoxelData, RegionMap from atlas_densities.densities import utils annotation = VoxelData.load_nrrd('input-data/annotation_ccfv2_l23split_barrelsplit_validated.nrrd') gene_voxeldata = { 'gad67': VoxelData.load_nrrd('input-data/gene_gad67_correctednissl.nrrd'), 'pv': VoxelData.load_nrrd('input-data/gene_pv_correctednissl.nrrd'), 'sst': VoxelData.load_nrrd('input-data/gene_sst_correctednissl.nrrd'), 'vip': VoxelData.load_nrrd('input-data/gene_vip_correctednissl.nrrd'), } gids = { 'gad67': '479', 'pv': '868', 'sst': '1001', 'vip': '77371835', } slices = utils.load_json('input-data/realigned_slices.json') gene_marker_volumes = { gene: { "intensity": gene_data.raw, "slices": slices[gids[gene]], # list of integer slice indices } for (gene, gene_data) in gene_voxeldata.items() } region_map = RegionMap.load_json('input-data/hierarchy_ccfv2_l23split_barrelsplit.json') hierarchy_info = utils.get_hierarchy_info(region_map, root='root') res = fitting.compute_average_intensities(annotation.raw, gene_marker_volumes, hierarchy_info, region_map) ``` --- atlas_densities/densities/fitting.py | 90 ---------------------------- 1 file changed, 90 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 0d7cc21..b002eb6 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -324,8 +324,6 @@ def compute_average_intensities( region_map_df = region_map.as_dataframe() add_depths(region_map_df) - #hierarchy_info = hierarchy_info.set_index("brain_region") - #gene_marker_volumes = ['gad67', 'pv', 'sst', 'vip'] result = pd.DataFrame( data=np.nan, index=hierarchy_info.index, @@ -343,94 +341,6 @@ def compute_average_intensities( return result -''' -from atlas_densities.densities import fitting -from voxcell import VoxelData, RegionMap -from atlas_densities.densities import utils - -annotation = VoxelData.load_nrrd('input-data/annotation_ccfv2_l23split_barrelsplit_validated.nrrd') - -gene_voxeldata = { - 'gad67': VoxelData.load_nrrd('input-data/gene_gad67_correctednissl.nrrd'), - 'pv': VoxelData.load_nrrd('input-data/gene_pv_correctednissl.nrrd'), - 'sst': VoxelData.load_nrrd('input-data/gene_sst_correctednissl.nrrd'), - 'vip': VoxelData.load_nrrd('input-data/gene_vip_correctednissl.nrrd'), -} -gids = { 'gad67': '479', 'pv': '868', 'sst': '1001', 'vip': '77371835', } -slices = utils.load_json('input-data/realigned_slices.json') -gene_marker_volumes = { - gene: { - "intensity": gene_data.raw, - "slices": slices[gids[gene]], # list of integer slice indices - } - for (gene, gene_data) in gene_voxeldata.items() -} -region_map = RegionMap.load_json('input-data/hierarchy_ccfv2_l23split_barrelsplit.json') -hierarchy_info = utils.get_hierarchy_info(region_map, root='root') - -fitting.compute_average_intensities(annotation, gene_marker_volumes, hierarchy_info, region_map) -breakpoint() # XXX BREAKPOINT -''' - -def compute_average_intensities1( - annotation: AnnotationT, - gene_marker_volumes: MarkerVolumes, - hierarchy_info: pd.DataFrame, -) -> pd.DataFrame: - """ - Compute the average marker intensity of every region in `hierarchy_info` for every marker - of `gene_marker_volumes`. - - If a region does not intersect any of the slices of a gene marker volume, the average density - of the marked cell type of this region is set with np.nan. - - Args: - annotation: int array of shape (W, H, D) holding the annotation of the whole - brain model. (The integers W, H and D are the dimensions of the array). - gene_marker_volumes: dict of the form { - "gad67": {"intensity": , "slices": }, - "pv": {"intensity": , "slices": }, - ... - } - where each intensity array is of shape (W, H, D) and where the items of each slice - list range in [0, W - 1]. - hierarchy_info: data frame with colums "descendant_ids" (set[int]) and "brain_region" - (str) and whose index is a list of region ids. - See :fun:`atlas_densities.densities.utils.get_hierarchy_info`. - - Returns: - A data frame of the following form (values are fake): - gad67 pv sst vip - Isocortex 1.5 1.1 0.2 12.0 - Cerebellum 2.5 0.9 0.1 11.0 - ... ... ... ... ... - The index of the data frame is the list of regions `hierarchy_info["brain_region"]`. - The column labels are the keys of `gene_marker_volumes` in lower case. - """ - region_count = len(hierarchy_info["brain_region"]) - data = np.full((region_count, len(gene_marker_volumes)), np.nan) - hierarchy_info = hierarchy_info.set_index("brain_region") - result = pd.DataFrame( - data=data, - index=hierarchy_info.index, - columns=[marker_name.lower() for marker_name in gene_marker_volumes.keys()], - ) - - L.info( - "Computing average intensities for %d markers in %d regions ...", - len(gene_marker_volumes), - region_count, - ) - for region_name in tqdm(result.index): - region_mask = np.isin(annotation, list(hierarchy_info.at[region_name, "descendant_ids"])) - for marker, intensity in gene_marker_volumes.items(): - result.at[region_name, marker.lower()] = compute_average_intensity( - intensity["intensity"], region_mask, intensity["slices"] - ) - - return result - - def linear_fitting_xy( xdata: list[float], ydata: list[float], sigma: Union[list[float], FloatArray] ) -> dict[str, float]: From ab7b30908b02953680e4a8d5f73bc83330a694c1 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Thu, 21 Mar 2024 15:09:33 +0100 Subject: [PATCH 4/9] cleanup --- atlas_densities/densities/fitting.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index b002eb6..5bc2ef5 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -26,6 +26,7 @@ import numpy as np import pandas as pd from atlas_commons.typing import AnnotationT, BoolArray, FloatArray +from joblib import Parallel, delayed from scipy.optimize import curve_fit from tqdm import tqdm @@ -237,15 +238,14 @@ def fill_densities(region_map, region_map_df, df): ids = region_map_df.index[order_idx[::-1]] for id_ in ids: - #if id_ == 607 or id_ == 112: - # breakpoint() # XXX BREAKPOINT - if id_ not in region_map._children: continue + children = region_map._children[id_] + if not children: continue - #print(region_map_df.loc[idx, 'name'], region_map_df.loc[idx, 'depth']) + voxel_count = df.loc[children, "voxel_count"] count = voxel_count.sum() if count: @@ -253,8 +253,6 @@ def fill_densities(region_map, region_map_df, df): df.loc[id_, "density"] = np.average(df.loc[children, "density"], weights=voxel_count) -from joblib import delayed, Parallel - def _apply_density_slices(gene_marker_volumes): ret = {} for marker, intensity in gene_marker_volumes.items(): @@ -288,9 +286,7 @@ def _compute_average_intensities_helper(annotation, gene_marker_volumes, id_): mean_density = np.mean(intensity['intensity'][restricted_mask]) if mean_density == 0.: - #print(f"Mean density for {hierarchy_info.loc[id_]} and {marker}") - breakpoint() # XXX BREAKPOINT - pass + L.warning("Mean density for id=%s and marker=%s", id_, marker) res.append((marker.lower(), id_, count, mean_density)) return res @@ -329,7 +325,7 @@ def compute_average_intensities( index=hierarchy_info.index, columns=[marker_name.lower() for marker_name in gene_marker_volumes], ) - result['name'] = region_map_df.loc[result.index].name + result['brain_region'] = region_map_df.loc[result.index].name for marker in gene_marker_volumes: df = pd.DataFrame(data=0.0, index=result.index, columns=['voxel_count', 'density']) @@ -337,7 +333,7 @@ def compute_average_intensities( fill_densities(region_map, region_map_df, df) result[marker.lower()] = df['density'] - result = result.set_index('name') + result = result.set_index('brain_region') return result From 6fb1a2a37bed9ed8a3dc9ff08e0e728d5e7baff8 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Thu, 21 Mar 2024 15:19:22 +0100 Subject: [PATCH 5/9] docstrings --- atlas_densities/densities/fitting.py | 100 +++++++++++++++++---------- tests/densities/test_fitting.py | 5 +- 2 files changed, 68 insertions(+), 37 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 5bc2ef5..6d1c961 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -215,33 +215,35 @@ def compute_average_intensity( return 0.0 -#from atlas_densities.densities import utils -#import voxcell -#region_map = voxcell.RegionMap.load_json('input-data/hierarchy_ccfv2_l23split_barrelsplit.json') -# -#region_name = 'root' -#hierarchy_info = utils.get_hierarchy_info(region_map, root=region_name) - -def add_depths(region_map_df): - region_map_df['depth'] = 0 - parent_ids = [-1, ] +def _add_depths(region_map_df): + """Rdd a `depth` column to the `region_map_df` with how deep it is within the hierarchy.""" + region_map_df["depth"] = 0 + parent_ids = [ + -1, + ] depth = 1 while parent_ids: - children_ids = region_map_df[np.isin(region_map_df['parent_id'], list(parent_ids))].index - region_map_df.loc[children_ids, 'depth'] = depth + children_ids = region_map_df[np.isin(region_map_df["parent_id"], list(parent_ids))].index + region_map_df.loc[children_ids, "depth"] = depth parent_ids = set(children_ids) depth += 1 -def fill_densities(region_map, region_map_df, df): +def _fill_densities(region_map, region_map_df, df): + """Fill all `voxel_count` and `density` for each region within `def`. + + This assumes the leaf nodes (ie: the deapest nodes in region_map) have had + their values filled in, and thus each of their parents can recursively be + filled in. + """ order_idx = np.argsort(region_map_df.depth.to_numpy()) ids = region_map_df.index[order_idx[::-1]] for id_ in ids: - if id_ not in region_map._children: + if id_ not in region_map._children: # pylint: disable=protected-access continue - children = region_map._children[id_] + children = region_map._children[id_] # pylint: disable=protected-access if not children: continue @@ -254,6 +256,7 @@ def fill_densities(region_map, region_map_df, df): def _apply_density_slices(gene_marker_volumes): + """For each marker in `gene_marker_volumes`, apply the slice mask to the indensities volume.""" ret = {} for marker, intensity in gene_marker_volumes.items(): intensity, slices = intensity["intensity"], intensity["slices"] @@ -265,27 +268,24 @@ def _apply_density_slices(gene_marker_volumes): slices_ = [slice_ for slice_ in slices if 0 <= slice_ < mask.shape[0]] mask[slices_] = True - ret[marker] = { - 'intensity': intensity, - 'slices': slices, - 'mask': mask - } + ret[marker] = {"intensity": intensity, "slices": slices, "mask": mask} return ret def _compute_average_intensities_helper(annotation, gene_marker_volumes, id_): + """Compute the average intensity for `id_`, for all makers in `gene_marker_volumes`""" mask = annotation == id_ res = [] for marker, intensity in gene_marker_volumes.items(): - restricted_mask = np.logical_and(intensity['mask'], mask) + restricted_mask = np.logical_and(intensity["mask"], mask) count = restricted_mask.sum() if count <= 0: continue - mean_density = np.mean(intensity['intensity'][restricted_mask]) - if mean_density == 0.: + mean_density = np.mean(intensity["intensity"][restricted_mask]) + if mean_density == 0.0: L.warning("Mean density for id=%s and marker=%s", id_, marker) res.append((marker.lower(), id_, count, mean_density)) @@ -298,7 +298,36 @@ def compute_average_intensities( hierarchy_info: pd.DataFrame, region_map, ) -> pd.DataFrame: + """ + Compute the average marker intensity of every region in `hierarchy_info` for every marker + of `gene_marker_volumes`. + + If a region does not intersect any of the slices of a gene marker volume, the average density + of the marked cell type of this region is set with np.nan. + + Args: + annotation: int array of shape (W, H, D) holding the annotation of the whole + brain model. (The integers W, H and D are the dimensions of the array). + gene_marker_volumes: dict of the form { + "gad67": {"intensity": , "slices": }, + "pv": {"intensity": , "slices": }, + ... + } + where each intensity array is of shape (W, H, D) and where the items of each slice + list range in [0, W - 1]. + hierarchy_info: data frame with colums "descendant_ids" (set[int]) and "brain_region" + (str) and whose index is a list of region ids. + See :fun:`atlas_densities.densities.utils.get_hierarchy_info`. + Returns: + A data frame of the following form (values are fake): + gad67 pv sst vip + Isocortex 1.5 1.1 0.2 12.0 + Cerebellum 2.5 0.9 0.1 11.0 + ... ... ... ... ... + The index of the data frame is the list of regions `hierarchy_info["brain_region"]`. + The column labels are the keys of `gene_marker_volumes` in lower case. + """ gene_marker_volumes = _apply_density_slices(gene_marker_volumes) _helper = delayed(_compute_average_intensities_helper) @@ -308,32 +337,31 @@ def compute_average_intensities( continue work.append(_helper(annotation, gene_marker_volumes, id_)) - res = Parallel(n_jobs=-2)(work) - densities = (pd.DataFrame(list(it.chain(*res)), - columns=['marker', 'id', 'voxel_count', 'density']) - .set_index('id') - .pivot(columns='marker') - .swaplevel(axis=1) - ) + densities = ( + pd.DataFrame(list(it.chain(*res)), columns=["marker", "id", "voxel_count", "density"]) + .set_index("id") + .pivot(columns="marker") + .swaplevel(axis=1) + ) region_map_df = region_map.as_dataframe() - add_depths(region_map_df) + _add_depths(region_map_df) result = pd.DataFrame( data=np.nan, index=hierarchy_info.index, columns=[marker_name.lower() for marker_name in gene_marker_volumes], ) - result['brain_region'] = region_map_df.loc[result.index].name + result["brain_region"] = region_map_df.loc[result.index].name for marker in gene_marker_volumes: - df = pd.DataFrame(data=0.0, index=result.index, columns=['voxel_count', 'density']) + df = pd.DataFrame(data=0.0, index=result.index, columns=["voxel_count", "density"]) df.update(densities[marker.lower()]) - fill_densities(region_map, region_map_df, df) - result[marker.lower()] = df['density'] + _fill_densities(region_map, region_map_df, df) + result[marker.lower()] = df["density"] - result = result.set_index('brain_region') + result = result.set_index("brain_region") return result diff --git a/tests/densities/test_fitting.py b/tests/densities/test_fitting.py index 4047a2c..b696b6f 100644 --- a/tests/densities/test_fitting.py +++ b/tests/densities/test_fitting.py @@ -106,6 +106,7 @@ def region_map(): return RegionMap.from_dict(hierarchy) + @pytest.fixture def hierarchy_info(region_map): return utils.get_hierarchy_info(region_map) @@ -265,7 +266,9 @@ def test_compute_average_intensities(region_map, hierarchy_info): index=hierarchy_info["brain_region"], ) - actual = tested.compute_average_intensities(annotation, marker_volumes, hierarchy_info, region_map) + actual = tested.compute_average_intensities( + annotation, marker_volumes, hierarchy_info, region_map + ) pdt.assert_frame_equal(actual, expected) From 0a87506751129d0f201d171b2f334743994e4753 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Fri, 22 Mar 2024 13:49:52 +0100 Subject: [PATCH 6/9] clean up test_mtype_densities_from_probability_map test --- atlas_densities/densities/fitting.py | 2 +- tests/app/test_mtype_densities.py | 67 ++++++++++++++-------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 6d1c961..25b9b0f 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -337,7 +337,7 @@ def compute_average_intensities( continue work.append(_helper(annotation, gene_marker_volumes, id_)) - res = Parallel(n_jobs=-2)(work) + res = Parallel()(work) densities = ( pd.DataFrame(list(it.chain(*res)), columns=["marker", "id", "voxel_count", "density"]) .set_index("id") diff --git a/tests/app/test_mtype_densities.py b/tests/app/test_mtype_densities.py index 4f16ae0..408b504 100644 --- a/tests/app/test_mtype_densities.py +++ b/tests/app/test_mtype_densities.py @@ -87,54 +87,53 @@ def test_mtype_densities_from_profiles(tmp_path): assert "neuron density file" in str(result.exception) -def get_result_from_probablity_map_(runner, td): - return runner.invoke( - tested.app, - [ - # fmt: off - "--log-output-path", td, - "create-from-probability-map", - "--annotation-path", "annotation.nrrd", - "--hierarchy-path", "hierarchy.json", - "--probability-map", "probability_map01.csv", - "--probability-map", "probability_map02.csv", - "--marker", "pv", "pv.nrrd", - "--marker", "sst", "sst.nrrd", - "--marker", "vip", "vip.nrrd", - "--marker", "gad67", "gad67.nrrd", - "--marker", "approx_lamp5", "approx_lamp5.nrrd", - "--synapse-class", "EXC", - "--output-dir", "output_dir", - # fmt: on - ], - ) - - def test_mtype_densities_from_probability_map(tmp_path): data = create_from_probability_map_data() runner = CliRunner() with runner.isolated_filesystem(temp_dir=tmp_path) as td: - data["annotation"].save_nrrd("annotation.nrrd") - write_json("hierarchy.json", data["hierarchy"]) + td = Path(td) + data["annotation"].save_nrrd(td / "annotation.nrrd") + with open(td / "hierarchy.json", "w", encoding="utf-8") as file: + json.dump(data["hierarchy"], file) - data["probability_map01"].to_csv("probability_map01.csv", index=True) - data["probability_map02"].to_csv("probability_map02.csv", index=True) + data["probability_map01"].to_csv(td / "probability_map01.csv", index=True) + data["probability_map02"].to_csv(td / "probability_map02.csv", index=True) - for molecular_type, raw_data in data["molecular_type_densities"].items(): + for molecular_type, raw in data["molecular_type_densities"].items(): VoxelData( - raw_data, + raw, voxel_dimensions=data["annotation"].voxel_dimensions, - ).save_nrrd(f"{molecular_type}.nrrd") - - result = get_result_from_probablity_map_(runner, td) + ).save_nrrd(td / f"{molecular_type}.nrrd") + + result = runner.invoke( + tested.app, + [ + # fmt: off + "--log-output-path", str(td), + "create-from-probability-map", + "--annotation-path", "annotation.nrrd", + "--hierarchy-path", "hierarchy.json", + "--probability-map", "probability_map01.csv", + "--probability-map", "probability_map02.csv", + "--marker", "pv", "pv.nrrd", + "--marker", "sst", "sst.nrrd", + "--marker", "vip", "vip.nrrd", + "--marker", "gad67", "gad67.nrrd", + "--marker", "approx_lamp5", "approx_lamp5.nrrd", + "--synapse-class", "EXC", + "--output-dir", "output_dir", + # fmt: on + ], + ) assert result.exit_code == 0 - BPbAC = VoxelData.load_nrrd(str(Path("output_dir") / "BP|bAC_EXC_densities.nrrd")) + BPbAC = VoxelData.load_nrrd(Path("output_dir") / "BP|bAC_EXC_densities.nrrd") assert BPbAC.raw.dtype == float npt.assert_array_equal(BPbAC.voxel_dimensions, data["annotation"].voxel_dimensions) - with open(str(Path("output_dir") / "metadata.json"), "r") as file: + with open(Path("output_dir") / "metadata.json", "r") as file: metadata = json.load(file) + assert "BP" in metadata["density_files"] assert "bAC" in metadata["density_files"]["BP"] assert "EXC" == metadata["synapse_class"] From c4fc10a99f41d8efbf8a1ec7ba95ecef61066440 Mon Sep 17 00:00:00 2001 From: Eleftherios Zisis Date: Mon, 25 Mar 2024 11:43:12 +0100 Subject: [PATCH 7/9] Use voxel index to speed up the computation of densities (#77) * Use voxel index to speed up the computation of densities * Revert format changes * Make lint happy --- atlas_densities/densities/fitting.py | 80 ++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 10 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 25b9b0f..4084960 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -26,7 +26,6 @@ import numpy as np import pandas as pd from atlas_commons.typing import AnnotationT, BoolArray, FloatArray -from joblib import Parallel, delayed from scipy.optimize import curve_fit from tqdm import tqdm @@ -273,18 +272,21 @@ def _apply_density_slices(gene_marker_volumes): return ret -def _compute_average_intensities_helper(annotation, gene_marker_volumes, id_): +def _compute_average_intensities_helper(index, gene_marker_volumes, id_): """Compute the average intensity for `id_`, for all makers in `gene_marker_volumes`""" - mask = annotation == id_ + voxel_ids = index.value_to_1d_indices(id_) + res = [] for marker, intensity in gene_marker_volumes.items(): - restricted_mask = np.logical_and(intensity["mask"], mask) - count = restricted_mask.sum() + mask_voxels = index.ravel(intensity["mask"])[voxel_ids] + + count = mask_voxels.sum() if count <= 0: continue - mean_density = np.mean(intensity["intensity"][restricted_mask]) + mean_density = index.ravel(intensity["intensity"])[voxel_ids][mask_voxels].sum() / count + if mean_density == 0.0: L.warning("Mean density for id=%s and marker=%s", id_, marker) res.append((marker.lower(), id_, count, mean_density)) @@ -292,6 +294,62 @@ def _compute_average_intensities_helper(annotation, gene_marker_volumes, id_): return res +class ValueToIndexVoxels: + """Efficient access to indices of unique values of the values array. + + Useful for when one has an "annotations volume" or "brain region volume" that has + regions indicated by unique values, and these are used to create masks. Often, + it's faster to avoid mask creation, and use indices directly + + Example: + vtiv = ValueToIndexVoxels(br.raw) + values, funcs = zip(*((i, np.sum if i % 2 else np.mean) for i in vtiv.values[:10])) + list(vtiv.apply(values, funcs, density.raw)) + """ + + def __init__(self, values): + """Initialize. + + Args: + values(np.array): volume with each voxel marked with a value; usually to group regions + """ + self._order = "C" if values.flags["C_CONTIGUOUS"] else "F" + + values = values.ravel(order=self._order) + uniques, codes, counts = np.unique(values, return_inverse=True, return_counts=True) + + offsets = np.empty(len(counts) + 1, dtype=np.uint64) + offsets[0] = 0 + offsets[1:] = np.cumsum(counts) + + self._codes = codes + self._offsets = offsets + self._indices = np.argsort(values, kind="stable") + self._mapping = {v: i for i, v in enumerate(uniques)} + self.index_dtype = values.dtype + + def ravel(self, voxel_data): + """Ensures `voxel_data` matches the layout that the 1D indices can be used.""" + return voxel_data.ravel(order=self._order) + + @property + def values(self): + """Unique values that are found in the original volume.""" + return np.fromiter(self._mapping, dtype=self.index_dtype) + + def value_to_1d_indices(self, value): + """Return the indices array indices corresponding to the 'value'. + + Note: These are 1D indices, so the assumption is they are applied to a volume + who has been ValueToIndexVoxels::ravel(volume) + """ + if value not in self._mapping: + return np.array([], dtype=np.uint64) + + group_index = self._mapping[value] + return self._indices[self._offsets[group_index] : self._offsets[group_index + 1]] + + def compute_average_intensities( annotation: AnnotationT, gene_marker_volumes: MarkerVolumes, @@ -330,14 +388,16 @@ def compute_average_intensities( """ gene_marker_volumes = _apply_density_slices(gene_marker_volumes) - _helper = delayed(_compute_average_intensities_helper) + index = ValueToIndexVoxels(annotation) + + _helper = _compute_average_intensities_helper work = [] - for id_ in np.unique(annotation): + for id_ in index.values: if id_ == 0: continue - work.append(_helper(annotation, gene_marker_volumes, id_)) + work.append(_helper(index, gene_marker_volumes, id_)) - res = Parallel()(work) + res = work densities = ( pd.DataFrame(list(it.chain(*res)), columns=["marker", "id", "voxel_count", "density"]) .set_index("id") From d428080ede6babd1abf4c6ccb0c7f52155704978 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Tue, 26 Mar 2024 10:50:57 +0100 Subject: [PATCH 8/9] switch to ValueToIndexVoxels in voxcell --- atlas_densities/densities/fitting.py | 68 ++-------------------------- 1 file changed, 5 insertions(+), 63 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 4084960..6c4968c 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -21,21 +21,19 @@ import itertools as it import logging import warnings -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd from atlas_commons.typing import AnnotationT, BoolArray, FloatArray from scipy.optimize import curve_fit from tqdm import tqdm +from voxcell import voxel_data, RegionMap from atlas_densities.densities import utils from atlas_densities.densities.measurement_to_density import remove_unknown_regions from atlas_densities.exceptions import AtlasDensitiesError, AtlasDensitiesWarning -if TYPE_CHECKING: # pragma: no cover - from voxcell import RegionMap - L = logging.getLogger(__name__) MarkerVolumes = Dict[str, Dict[str, Union[FloatArray, List[int]]]] @@ -294,62 +292,6 @@ def _compute_average_intensities_helper(index, gene_marker_volumes, id_): return res -class ValueToIndexVoxels: - """Efficient access to indices of unique values of the values array. - - Useful for when one has an "annotations volume" or "brain region volume" that has - regions indicated by unique values, and these are used to create masks. Often, - it's faster to avoid mask creation, and use indices directly - - Example: - vtiv = ValueToIndexVoxels(br.raw) - values, funcs = zip(*((i, np.sum if i % 2 else np.mean) for i in vtiv.values[:10])) - list(vtiv.apply(values, funcs, density.raw)) - """ - - def __init__(self, values): - """Initialize. - - Args: - values(np.array): volume with each voxel marked with a value; usually to group regions - """ - self._order = "C" if values.flags["C_CONTIGUOUS"] else "F" - - values = values.ravel(order=self._order) - uniques, codes, counts = np.unique(values, return_inverse=True, return_counts=True) - - offsets = np.empty(len(counts) + 1, dtype=np.uint64) - offsets[0] = 0 - offsets[1:] = np.cumsum(counts) - - self._codes = codes - self._offsets = offsets - self._indices = np.argsort(values, kind="stable") - self._mapping = {v: i for i, v in enumerate(uniques)} - self.index_dtype = values.dtype - - def ravel(self, voxel_data): - """Ensures `voxel_data` matches the layout that the 1D indices can be used.""" - return voxel_data.ravel(order=self._order) - - @property - def values(self): - """Unique values that are found in the original volume.""" - return np.fromiter(self._mapping, dtype=self.index_dtype) - - def value_to_1d_indices(self, value): - """Return the indices array indices corresponding to the 'value'. - - Note: These are 1D indices, so the assumption is they are applied to a volume - who has been ValueToIndexVoxels::ravel(volume) - """ - if value not in self._mapping: - return np.array([], dtype=np.uint64) - - group_index = self._mapping[value] - return self._indices[self._offsets[group_index] : self._offsets[group_index + 1]] - - def compute_average_intensities( annotation: AnnotationT, gene_marker_volumes: MarkerVolumes, @@ -388,7 +330,7 @@ def compute_average_intensities( """ gene_marker_volumes = _apply_density_slices(gene_marker_volumes) - index = ValueToIndexVoxels(annotation) + index = voxel_data.ValueToIndexVoxels(annotation) _helper = _compute_average_intensities_helper work = [] @@ -732,7 +674,7 @@ def _check_average_densities_sanity(average_densities: pd.DataFrame) -> None: ) -def _get_group_names(region_map: "RegionMap", group_ids_config: dict) -> dict[str, set[str]]: +def _get_group_names(region_map: RegionMap, group_ids_config: dict) -> dict[str, set[str]]: """ Get AIBS names for regions in several region groups of interest. @@ -780,7 +722,7 @@ def _get_group_region_names(groups): def linear_fitting( # pylint: disable=too-many-arguments - region_map: "RegionMap", + region_map: RegionMap, annotation: AnnotationT, neuron_density: FloatArray, gene_marker_volumes: MarkerVolumes, From 651062d4675cfecbe45725b1bc33e6f1ed05f33c Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Tue, 26 Mar 2024 11:06:35 +0100 Subject: [PATCH 9/9] cleanup --- atlas_densities/densities/fitting.py | 64 ++++++++++++++-------------- tests/app/test_mtype_densities.py | 5 +-- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 6c4968c..6f9d558 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -28,7 +28,7 @@ from atlas_commons.typing import AnnotationT, BoolArray, FloatArray from scipy.optimize import curve_fit from tqdm import tqdm -from voxcell import voxel_data, RegionMap +from voxcell import RegionMap, voxel_data from atlas_densities.densities import utils from atlas_densities.densities.measurement_to_density import remove_unknown_regions @@ -270,25 +270,39 @@ def _apply_density_slices(gene_marker_volumes): return ret -def _compute_average_intensities_helper(index, gene_marker_volumes, id_): +def _compute_per_marker_intensity(annotation, gene_marker_volumes): """Compute the average intensity for `id_`, for all makers in `gene_marker_volumes`""" - voxel_ids = index.value_to_1d_indices(id_) + vtiv = voxel_data.ValueToIndexVoxels(annotation) - res = [] - for marker, intensity in gene_marker_volumes.items(): - mask_voxels = index.ravel(intensity["mask"])[voxel_ids] + count_and_density = [] + for id_ in vtiv.values: + if id_ == 0: + continue - count = mask_voxels.sum() + voxel_ids = vtiv.value_to_1d_indices(id_) - if count <= 0: - continue + res = [] + for marker, intensity in gene_marker_volumes.items(): + mask_voxels = vtiv.ravel(intensity["mask"])[voxel_ids] + + count = mask_voxels.sum() - mean_density = index.ravel(intensity["intensity"])[voxel_ids][mask_voxels].sum() / count + if count <= 0: + continue - if mean_density == 0.0: - L.warning("Mean density for id=%s and marker=%s", id_, marker) - res.append((marker.lower(), id_, count, mean_density)) + mean_density = vtiv.ravel(intensity["intensity"])[voxel_ids][mask_voxels].sum() / count + if mean_density == 0.0: + L.warning("Mean density for id=%s and marker=%s", id_, marker) + + count_and_density.append((marker.lower(), id_, count, mean_density)) + + res = ( + pd.DataFrame(count_and_density, columns=["marker", "id", "voxel_count", "density"]) + .set_index("id") + .pivot(columns="marker") + .swaplevel(axis=1) + ) return res @@ -330,22 +344,7 @@ def compute_average_intensities( """ gene_marker_volumes = _apply_density_slices(gene_marker_volumes) - index = voxel_data.ValueToIndexVoxels(annotation) - - _helper = _compute_average_intensities_helper - work = [] - for id_ in index.values: - if id_ == 0: - continue - work.append(_helper(index, gene_marker_volumes, id_)) - - res = work - densities = ( - pd.DataFrame(list(it.chain(*res)), columns=["marker", "id", "voxel_count", "density"]) - .set_index("id") - .pivot(columns="marker") - .swaplevel(axis=1) - ) + intensity = _compute_per_marker_intensity(annotation, gene_marker_volumes) region_map_df = region_map.as_dataframe() _add_depths(region_map_df) @@ -359,7 +358,7 @@ def compute_average_intensities( for marker in gene_marker_volumes: df = pd.DataFrame(data=0.0, index=result.index, columns=["voxel_count", "density"]) - df.update(densities[marker.lower()]) + df.update(intensity[marker.lower()]) _fill_densities(region_map, region_map_df, df) result[marker.lower()] = df["density"] @@ -826,7 +825,10 @@ def linear_fitting( # pylint: disable=too-many-arguments invert=True, ) average_intensities = compute_average_intensities( - annotation, gene_marker_volumes, hierarchy_info, region_map + annotation, + gene_marker_volumes, + hierarchy_info.drop(hierarchy_info.index[indexes]), + region_map, ) L.info("Computing fitting coefficients ...") diff --git a/tests/app/test_mtype_densities.py b/tests/app/test_mtype_densities.py index 408b504..1ca562f 100644 --- a/tests/app/test_mtype_densities.py +++ b/tests/app/test_mtype_densities.py @@ -92,10 +92,9 @@ def test_mtype_densities_from_probability_map(tmp_path): runner = CliRunner() with runner.isolated_filesystem(temp_dir=tmp_path) as td: td = Path(td) - data["annotation"].save_nrrd(td / "annotation.nrrd") - with open(td / "hierarchy.json", "w", encoding="utf-8") as file: - json.dump(data["hierarchy"], file) + data["annotation"].save_nrrd(td / "annotation.nrrd") + write_json("hierarchy.json", data["hierarchy"]) data["probability_map01"].to_csv(td / "probability_map01.csv", index=True) data["probability_map02"].to_csv(td / "probability_map02.csv", index=True)