From 6e76e8cbf507d00e2d28516d9e4f7592a58c3dac Mon Sep 17 00:00:00 2001
From: Dimitri RODARIE <d.rodarie@gmail.com>
Date: Wed, 21 Feb 2024 11:12:21 +0100
Subject: [PATCH] Add r_square values to the results of the fitting. (#68)

---
 README.rst                           |  8 ++++--
 atlas_densities/densities/fitting.py | 38 ++++++++++++++++++----------
 tests/densities/test_fitting.py      |  7 +++++
 3 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/README.rst b/README.rst
index 7c544e6..7b3d708 100644
--- a/README.rst
+++ b/README.rst
@@ -265,8 +265,11 @@ details). This step leverages AIBS ISH marker datasets (in their expression form
 literature density values.
 These transfer functions are used to obtain first estimates of neuron densities in regions not
 covered by literature.
-The result of the following command is a list of first density estimates for each neuron type and
-for each region of the annotation volume.
+
+The results of the following command includes a csv file (`first_estimates.csv`) storing the list
+of first density estimates for each neuron type and for each region of the annotation volume.
+The `fitting.json` output file contains the coefficients fitted by the algorithm together with their
+respective standard deviation and `coefficient of determination`_ (`r_square`).
 
 .. code-block:: bash
 
@@ -449,6 +452,7 @@ Copyright (c) 2022-2024 Blue Brain Project/EPFL
 .. _`mmc3.xlsx`: https://github.com/BlueBrain/atlas-densities/blob/main/atlas_densities/app/data/measurements/mmc3.xlsx
 .. _`gaba_papers.xlsx`: https://github.com/BlueBrain/atlas-densities/blob/main/atlas_densities/app/data/measurements/gaba_papers.xlsx
 .. _`mtypes_probability_map_config.yaml`: https://github.com/BlueBrain/atlas-densities/blob/main/atlas_densities/app/data/mtypes/mtypes_probability_map_config.yaml
+.. _`coefficient of determination`: https://en.wikipedia.org/wiki/Coefficient_of_determination
 
 .. substitutions
 
diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py
index 14428ad..3595160 100644
--- a/atlas_densities/densities/fitting.py
+++ b/atlas_densities/densities/fitting.py
@@ -285,7 +285,9 @@ def linear_fitting_xy(
             otherwise by 1.0.
 
     Returns:
-        a dict of the form {"coefficient": <float>, "standard_deviation": <float>}.
+        a dict of the form {"coefficient": <float>,
+                            "standard_deviation": <float>,
+                            "r_square": <float>}.
 
     Raises:
        AtlasDensitiesError if some of the `sigma`values are negative.
@@ -311,15 +313,24 @@ def linear_fitting_xy(
         least_positive = np.min(sigma[~zero_mask])
         sigma[zero_mask] = least_positive / 2.0
 
+    def _optimize_func(x, coefficient):
+        return coefficient * np.array(x)
+
     parameters = curve_fit(
-        lambda x, coefficient: coefficient * x,
+        _optimize_func,
         xdata=xdata,
         ydata=ydata,
         sigma=sigma,
         absolute_sigma=True,
     )
-
-    return {"coefficient": parameters[0][0], "standard_deviation": np.sqrt(parameters[1][0][0])}
+    ss_reg = np.sum((_optimize_func(xdata, parameters[0][0]) - np.mean(ydata)) ** 2)
+    ss_tot = np.sum((np.array(ydata) - _optimize_func(xdata, parameters[0][0])) ** 2) + ss_reg
+    # if total sum of square is null, no variance can be explained by the fitting
+    return {
+        "coefficient": parameters[0][0],
+        "standard_deviation": np.sqrt(parameters[1][0][0]),
+        "r_square": (ss_reg / ss_tot) if ss_tot > 0 else np.nan,
+    }
 
 
 FittingData = Dict[str, Dict[str, Dict[str, float]]]
@@ -350,16 +361,16 @@ def compute_fitting_coefficients(
         dict of the form (values are fake):
         {
             "Isocortex" : {
-                "gad67": { "coefficient": 0.85, "standard_deviation": 0.15},
-                "pv": { "coefficient": 1.00, "standard_deviation": 0.5},
-                "sst": { "coefficient": 1.07, "standard_deviation": 0.75},
-                "vip": { "coefficient": 0.95, "standard_deviation": 0.15},
+                "gad67": { "coefficient": 0.85, "standard_deviation": 0.15, "r_square": 0.75},
+                "pv": { "coefficient": 1.00, "standard_deviation": 0.5, "r_square": 0.66},
+                "sst": { "coefficient": 1.07, "standard_deviation": 0.75, "r_square": 0.80},
+                "vip": { "coefficient": 0.95, "standard_deviation": 0.15, "r_square": 0.90},
             },
             "Cerebellum" : {
-                "gad67": { "coefficient": 0.75, "standard_deviation": 0.15},
-                "pv": { "coefficient": 1.10, "standard_deviation": 0.55},
-                "sst": { "coefficient": 1.20, "standard_deviation": 0.45},
-                "vip": { "coefficient": 0.95, "standard_deviation": 0.15},
+                "gad67": { "coefficient": 0.75, "standard_deviation": 0.15, "r_square": 0.55},
+                "pv": { "coefficient": 1.10, "standard_deviation": 0.55, "r_square": 0.40},
+                "sst": { "coefficient": 1.20, "standard_deviation": 0.45, "r_square": 0.58},
+                "vip": { "coefficient": 0.95, "standard_deviation": 0.15, "r_square": 0.77},
             },
            ...
         }
@@ -367,6 +378,7 @@ def compute_fitting_coefficients(
         performed. A "coefficient" value is the value of the linear regression coefficient for a
         given region group and a given cell type.
         The "standard_deviation" value is the standard deviation of the coefficient value.
+        The "r_square" value is the coefficient of determination of the coefficient value.
     """
 
     if len(densities.index) != len(average_intensities.index) or np.any(
@@ -387,7 +399,7 @@ def compute_fitting_coefficients(
 
     result = {
         group_name: {
-            cell_type: {"coefficient": np.nan, "standard_deviation": np.nan}
+            cell_type: {"coefficient": np.nan, "standard_deviation": np.nan, "r_square": np.nan}
             for cell_type in cell_types
         }
         for group_name in groups
diff --git a/tests/densities/test_fitting.py b/tests/densities/test_fitting.py
index 6bf319d..51be985 100644
--- a/tests/densities/test_fitting.py
+++ b/tests/densities/test_fitting.py
@@ -268,15 +268,18 @@ def test_compute_average_intensities(hierarchy_info):
 def test_linear_fitting_xy():
     actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0])
     assert np.allclose(actual["coefficient"], 2.0)
+    assert np.allclose(actual["r_square"], 1.0)
     assert not np.isinf(actual["standard_deviation"])
 
     actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5])
     assert np.allclose(actual["coefficient"], 2.0)
     assert not np.isinf(actual["standard_deviation"])
+    assert np.allclose(actual["r_square"], 0.89286)
 
     actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0])
     assert np.allclose(actual["coefficient"], 2.0)
     assert not np.isinf(actual["standard_deviation"])
+    assert np.allclose(actual["r_square"], 1.0)
 
 
 def get_fitting_input_data_(hierarchy_info):
@@ -319,6 +322,10 @@ def test_compute_fitting_coefficients(hierarchy_info):
         assert not np.isnan(actual[group_name]["gad67+"]["standard_deviation"])
         assert not np.isinf(actual[group_name]["pv+"]["standard_deviation"])
         assert not np.isnan(actual[group_name]["pv+"]["standard_deviation"])
+    assert actual["Whole"]["pv+"]["r_square"] == 1.0
+    assert actual["Whole"]["gad67+"]["r_square"] == 1.0
+    assert np.isnan(actual["Central lobule"]["pv+"]["r_square"])  # only one value so no variation.
+    assert np.isnan(actual["Central lobule"]["gad67+"]["r_square"])
 
 
 def test_compute_fitting_coefficients_exceptions(hierarchy_info):