Fix fitting and optimization steps (#75)

* Offset slices indexes to take into account marker dataset offset. * Remove unnecessary check on inhibitory neuron density. if a mother region count is given as certain, children regions can be given as uncertain. This can happen when literature provides certain counts in mother region (e.g., Striatum is purely inhibitory). * Limit fitting to regions groups. * Update docs
BlueBrain · Mar 25, 2024 · 31a341d · 31a341d
1 parent 2b89cfd
commit 31a341d
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 97 deletions.
diff --git a/atlas_densities/app/cell_densities.py b/atlas_densities/app/cell_densities.py
@@ -905,13 +905,18 @@ def fit_average_densities(
         for (name, stddev) in cell_density_stddev.items()
     }
 
-    gene_marker_volumes = {
-        gene: {
+    gene_marker_volumes = {}
+    for gene, gene_data in gene_voxeldata.items():
+        loc_slices = slices[config["sectionDataSetID"][gene]]
+        gene_marker_volumes[gene] = {
             "intensity": gene_data.raw,
-            "slices": slices[config["sectionDataSetID"][gene]],  # list of integer slice indices
+            # list of integer slice indices
+            "slices": (
+                loc_slices - np.asarray(gene_data.offset / gene_data.voxel_dimensions, dtype=int)[0]
+                if loc_slices is not None
+                else None
+            ),
         }
-        for (gene, gene_data) in gene_voxeldata.items()
-    }
 
     group_ids_config = utils.load_json(group_ids_config_path)
 
@@ -932,7 +937,7 @@ def fit_average_densities(
         group_ids_config=group_ids_config,
     )
 
-    # Turn index into column so as to ease off the save and load operations on csv files
+    # Turn index into column to ease off the save and load operations on csv files
     fitted_densities_df["brain_region"] = fitted_densities_df.index
 
     L.info("Saving fitted densities to file %s ...", fitted_densities_output_path)

diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py
@@ -18,6 +18,7 @@
 """
 from __future__ import annotations
 
+import itertools as it
 import logging
 import warnings
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
@@ -135,6 +136,9 @@ def fill_in_homogenous_regions(
             in number of neurons per mm^3.
         densities: the data frame returned by
             :fun:`atlas_densities.densities.fitting.create_dataframe_from_known_densities`.
+        hierarchy_info: data frame with columns "descendant_id_set" (set[int]) and "brain_region"
+            (str) and whose index is a list of region ids.
+            See :fun:`atlas_densities.densities.utils.get_hierarchy_info`.
         cell_density_stddevs: (Optional) dict whose keys are brain regions names and whose values
             are standard deviations of average cell densities of the corresponding regions.
             Defaults to None, in which case the output standard deviation is set with the density
@@ -418,24 +422,32 @@ def compute_fitting_coefficients(
             len(cell_types),
         )
         for region_name in tqdm(groups[group_name]):
-            for cell_type in cell_types:
-                intensity = average_intensities.at[region_name, cell_type[:-1]]
-                density = densities.at[region_name, cell_type]
-                if not (
-                    np.isnan(density) or np.isnan(intensity) or intensity == 0.0 or density == 0.0
-                ):
-                    assert_msg = f"in region {region_name} for cell type {cell_type}"
-                    assert intensity >= 0.0, "Negative average intensity " + assert_msg
-                    assert density >= 0.0, "Negative density " + assert_msg
-                    standard_deviation = densities.at[
-                        region_name, cell_type + "_standard_deviation"
-                    ]
-                    assert not np.isnan(standard_deviation), "NaN standard deviation " + assert_msg
-                    assert standard_deviation >= 0.0, "Negative standard deviation " + assert_msg
-
-                    clouds[group_name][cell_type]["xdata"].append(intensity)
-                    clouds[group_name][cell_type]["ydata"].append(density)
-                    clouds[group_name][cell_type]["sigma"].append(standard_deviation)
+            if region_name in average_intensities.index:
+                for cell_type in cell_types:
+                    intensity = average_intensities.at[region_name, cell_type[:-1]]
+                    density = densities.at[region_name, cell_type]
+                    if not (
+                        np.isnan(density)
+                        or np.isnan(intensity)
+                        or intensity == 0.0
+                        or density == 0.0
+                    ):
+                        assert_msg = f"in region {region_name} for cell type {cell_type}"
+                        assert intensity >= 0.0, "Negative average intensity " + assert_msg
+                        assert density >= 0.0, "Negative density " + assert_msg
+                        standard_deviation = densities.at[
+                            region_name, cell_type + "_standard_deviation"
+                        ]
+                        assert not np.isnan(standard_deviation), (
+                            "NaN standard deviation " + assert_msg
+                        )
+                        assert standard_deviation >= 0.0, (
+                            "Negative standard deviation " + assert_msg
+                        )
+
+                        clouds[group_name][cell_type]["xdata"].append(intensity)
+                        clouds[group_name][cell_type]["ydata"].append(density)
+                        clouds[group_name][cell_type]["sigma"].append(standard_deviation)
 
         L.info("Computing regression coefficients for %d cell types ...", len(cell_types))
         for cell_type in tqdm(cell_types):
@@ -479,23 +491,51 @@ def fit_unknown_densities(
     for group_name, region_names in groups.items():
         for region_name in region_names:
             for marker in average_intensities.columns:
-                intensity = average_intensities.at[region_name, marker]
-                cell_type = marker + "+"
-                if np.isnan(densities.at[region_name, cell_type]) and not np.isnan(intensity):
-                    fitting = fitting_coefficients[group_name][cell_type]
-                    if np.isnan(fitting["coefficient"]):
-                        warnings.warn(
-                            f"Interpolating density of region {region_name} for cell type "
-                            f"{cell_type} using NaN fitting coefficient of region group "
-                            f"{group_name}.",
-                            AtlasDensitiesWarning,
-                        )
-                    fitted_value = fitting["coefficient"] * intensity
-                    standard_deviation = fitted_value * fitting["standard_deviation"]
-                    densities.at[region_name, cell_type] = fitted_value
-                    densities.at[
-                        region_name, cell_type + "_standard_deviation"
-                    ] = standard_deviation
+                if region_name in average_intensities.index:
+                    _apply_fitting(
+                        region_name,
+                        marker,
+                        group_name,
+                        average_intensities,
+                        densities,
+                        fitting_coefficients,
+                    )
+
+
+def _apply_fitting(
+    region_name, marker, group_name, average_intensities, densities, fitting_coefficients
+):
+    """
+    Apply the fitting function to a given region and marker based on the corresponding marker
+    intensity.
+
+    Args:
+        region_name: region name
+        marker: ISH marker name
+        group_name: fitting group name
+        average_intensities: pandas data frame returned by
+          :fun:`atlas_densities.densities.fitting.compute_average_intensities`.
+        densities: data frame returned by
+          :fun:`atlas_densities.densities.fitting.create_dataframe_from_known_densities`.
+        fitting_coefficients: dict returned by
+          :fun:`atlas_densities.densities.fitting.compute_fitting_coefficients`.
+    """
+
+    intensity = average_intensities.at[region_name, marker]
+    cell_type = marker + "+"
+    if np.isnan(densities.at[region_name, cell_type]) and not np.isnan(intensity):
+        fitting = fitting_coefficients[group_name][cell_type]
+        if np.isnan(fitting["coefficient"]):
+            warnings.warn(
+                f"Interpolating density of region {region_name} for cell type "
+                f"{cell_type} using NaN fitting coefficient of region group "
+                f"{group_name}.",
+                AtlasDensitiesWarning,
+            )
+        fitted_value = fitting["coefficient"] * intensity
+        standard_deviation = fitted_value * fitting["standard_deviation"]
+        densities.at[region_name, cell_type] = fitted_value
+        densities.at[region_name, cell_type + "_standard_deviation"] = standard_deviation
 
 
 def _check_homogenous_regions_sanity(homogenous_regions: pd.DataFrame) -> None:
@@ -556,7 +596,7 @@ def _get_group_names(region_map: "RegionMap", group_ids_config: dict) -> dict[st
 
     group_ids = utils.get_group_ids(region_map, group_ids_config)
 
-    # Make the Rest group stable under taking descendants
+    # Make the Rest group stable undertaking descendants
     # The name of any ascendant region of the Cerebellum and
     #    Isocortex groups are removed from the names of the Rest group. This makes sure that
     #    the set of names of the Rest group is closed under taking descendants.
@@ -573,6 +613,20 @@ def _get_group_names(region_map: "RegionMap", group_ids_config: dict) -> dict[st
     }
 
 
+def _get_group_region_names(groups):
+    """
+    Create a set of the union of AIBS region names that belong to the given groups.
+
+    Args:
+        groups: dictionary from group name to list of region name.
+
+    Returns:
+        A list of region names.
+    """
+
+    return list(set(it.chain.from_iterable(groups.values())))
+
+
 def linear_fitting(  # pylint: disable=too-many-arguments
     region_map: "RegionMap",
     annotation: AnnotationT,
@@ -601,6 +655,9 @@ def linear_fitting(  # pylint: disable=too-many-arguments
             define the regions whose cell densities will be computed.
         annotation: int array of shape (W, H, D) holding the annotation of the whole
             brain model. (The integers W, H and D are the dimensions of the array).
+        neuron_density: non-negative float array of shape (W, H, D) where W, H and D are integer
+            dimensions. This array holds the volumetric neuron density of the brain model expressed
+            in number of neurons per mm^3.
         gene_marker_volumes: dict of the form {
                   "gad67": {"intensity": <array>, "slices": <list>},
                   "pv": {"intensity": <array>, "slices": <list>},
@@ -616,7 +673,7 @@ def linear_fitting(  # pylint: disable=too-many-arguments
         homogenous_regions: data frame with two columns, brain_region and cell_type.
             The brain_region column holds region names and the values of the cell_type column are
             "inhibitory" or "excitatory" depending on whether every neuron of the region is
-            inhibitory or excitatory. Note that the "inhibitory" cell type will superseded by
+            inhibitory or excitatory. Note that the "inhibitory" cell type will supersede by
             its synonym "gad67+" in the output data frame.
         cell_density_stddevs: dict whose keys are brain regions names and whose values are
             standard deviations of average cell densities of the corresponding regions.
@@ -667,16 +724,23 @@ def linear_fitting(  # pylint: disable=too-many-arguments
     )
 
     L.info("Computing average intensities ...")
+    groups = _get_group_names(region_map, group_ids_config)
+    # Limit the fitting to the regions groups.
+    indexes = np.isin(
+        hierarchy_info["brain_region"].to_numpy(),
+        _get_group_region_names(groups),
+        invert=True,
+    )
     average_intensities = compute_average_intensities(
-        annotation, gene_marker_volumes, hierarchy_info
+        annotation,
+        gene_marker_volumes,
+        hierarchy_info.drop(hierarchy_info.index[indexes]),
     )
 
-    L.info("Getting group names ...")
-    # We want group region names to be stable under taking descendants
-    groups = _get_group_names(region_map, group_ids_config)
-
     L.info("Computing fitting coefficients ...")
-    fitting_coefficients = compute_fitting_coefficients(groups, average_intensities, densities)
+    fitting_coefficients = compute_fitting_coefficients(
+        groups, average_intensities, densities.drop(densities.index[indexes])
+    )
     L.info("Fitting unknown average densities ...")
     fit_unknown_densities(groups, average_intensities, densities, fitting_coefficients)
 

diff --git a/atlas_densities/densities/inhibitory_neuron_densities_optimization.py b/atlas_densities/densities/inhibitory_neuron_densities_optimization.py
@@ -16,8 +16,8 @@
 across the brain region hierarchy.
 
 The file ``doc/source/bbpp82_628_linear_program.pdf`` serves as a documentation of the linear
-program, see Section 2 in particular. The initalization of the linear program relies on average
-density estimates, obtained obtained for instance via the ``fitting`` module. We refer to these
+program, see Section 2 in particular. The initialization of the linear program relies on average
+density estimates, obtained for instance via the ``fitting`` module. We refer to these
 estimates as the initial estimates.
 
 The output are volumetric density nrrd files, one for each of the aforementioned inhibitory
@@ -286,7 +286,7 @@ def _zero_descendants(id_, cell_type, x_result, deltas, hierarchy_info):
     L.info("Setting known values ...")
     # Set delta_{r, m} with `SKIP` if the corresponding cell count estimate is missing.
     # This means that the region r won't impose any constraint wrt to the marker m in the linear
-    # program, i. e., the delta_{r, m} variable is omitted for such r and m.
+    # program, i.e., the delta_{r, m} variable is omitted for such r and m.
     for id_, region_name in zip(hierarchy_info.index, region_counts.index):
         for cell_type in cell_types:
             # A region without cell count estimate for `cell_type` adds no constraint
@@ -353,7 +353,7 @@ def create_bounds(
             bound.
         - `x_map` is a dict mapping pairs (id_, cell_type) to indices of `bounds`.
             The keys of this dict corresponds to cell count variables in the linear program.
-        - `deltas_map` is a dict mapping pairs (region_name, cell_type) to to indices of `bounds`.
+        - `deltas_map` is a dict mapping pairs (region_name, cell_type) to indices of `bounds`.
             The keys of this dict corresponds to the slack "delta" variables which represent
             distances of cell count variables to initial estimates.
     """
@@ -434,7 +434,7 @@ def check_constraint_3e(
         constraint: FloatArray, b_value: float, region_name: str, id_: int
     ) -> bool:
         """
-        Check if `contraint` is a valid inequality constraint of type (3e).
+        Check if `constraint` is a valid inequality constraint of type (3e).
 
         Returns:
             True if the constraint is not void.
@@ -571,7 +571,7 @@ def _compute_initial_cell_counts(
             :fun:`densities.inhibitory_densities_helper.average_densities_to_cell_counts`.
             It holds region cell counts as well as associated standard deviations.
         - `id_counts` has the return value format of
-            :fun:`densities.inhbitory_densities_helper.average_densities_to_cell_counts` except
+            :fun:`densities.inhibitory_densities_helper.average_densities_to_cell_counts` except
             that its index is a list of unique integer region identifiers.
             It holds the cell counts of the 3D regions labeled by a unique identifier, as well as
             associated standard deviations (descendant subregions are excluded).
@@ -595,47 +595,29 @@ def _compute_initial_cell_counts(
 
 def _check_variables_consistency(
     x_result: pd.DataFrame,
-    deltas: pd.DataFrame,
     cell_types: List[str],
     neuron_counts: pd.DataFrame,
     hierarchy_info: pd.DataFrame,
 ) -> None:
     """
-    Raises an error if a delta variable indexed by (region_name, cell_type) is set with a non-NaN
-    value whereas an x_result variable indexed by (desc_id, cell_type) is set with a NaN value for
-    some descendant id of region_name.
-
     Raises also an error if a cell count value marked as final in `x_result` exceeds its
     prescribed upper bound from `neuron_counts`.
 
     Args:
         x_result: see return value of :fun:`set_known_values`.
-        deltas: idem
         cell_types: list of cell type names, e.g., ["pv+", "sst+", "vip+", "gad67+"].
         hierarchy_info: data frame returned by
             :func:`atlas_densities.densities.utils.get_hierarchy_info`.
         neuron_counts: data frame with one column "cell_count" holding the neuron count each 3D
             region labeled by an integer identifier in `neuron_count.index`.
 
     Raises:
-        AtlasDensitiesError if on the the following assumptions is violated:
-        - if cell count estimate of a region is known with certainty for a given cell type,
-        then the cell count of every descendant region is also known with certainty.
-        - a cell count estimate which is given for certain does not
+        AtlasDensitiesError if a cell count estimate which is given for certain exceeds the neuron
+        count constraint.
     """
     cell_count_tolerance = 1e-2  # absolute tolerance to rule out round-off errors
-    for region_name, id_, id_set in zip(
-        deltas.index, hierarchy_info.index, hierarchy_info["descendant_ids"]
-    ):
+    for id_ in hierarchy_info.index:
         for cell_type in cell_types:
-            if np.isfinite(deltas.loc[region_name, cell_type]):
-                for desc_id in id_set:
-                    if np.isnan(x_result.loc[desc_id, cell_type]):
-                        raise AtlasDensitiesError(
-                            f"Cell count estimate of region named '{region_name}' for cell type "
-                            f"{cell_type} was given for certain whereas the cell count of "
-                            f"descendant id {desc_id} is not certain."
-                        )
             neuron_count = neuron_counts.loc[id_, "cell_count"]
             if (
                 not np.isnan(x_result.loc[id_, cell_type])
@@ -770,7 +752,7 @@ def create_inhibitory_neuron_densities(  # pylint: disable=too-many-locals
     # We assume that if the cell count of `cell_type` in `region_name` is known with certainty
     # then the same holds in every descendant subregion.
     _check_variables_consistency(
-        x_result, deltas, get_cell_types(region_counts), neuron_counts, hierarchy_info
+        x_result, get_cell_types(region_counts), neuron_counts, hierarchy_info
     )
 
     L.info("Setting variable bounds and further inequality constraints ...")

diff --git a/atlas_densities/densities/utils.py b/atlas_densities/densities/utils.py
@@ -412,7 +412,7 @@ def get_group_ids(region_map: "RegionMap", config: dict, _skip_check=False) -> d
 
     Args:
         region_map: object to navigate the mouse brain regions hierarchy
-            (instantied from AIBS 1.json).
+            (instantiated from AIBS 1.json).
         config: mapping of regions to their constituent ids
     Returns:
         A dictionary whose keys are region group names and whose values are