Add APIs for attributing unit change without a mechanism change

Signed-off-by: Kailash <[email protected]>
EgorKraevTransferwise · Nov 14, 2022 · ac04706 · ac04706
1 parent 675cbc4
commit ac04706
Show file tree

Hide file tree

Showing 3 changed files with 288 additions and 9 deletions.
diff --git a/dowhy/gcm/__init__.py b/dowhy/gcm/__init__.py
@@ -25,5 +25,6 @@
 from .independence_test import approx_kernel_based, independence_test, kernel_based, regression_based
 from .influence import arrow_strength, intrinsic_causal_influence
 from .stochastic_models import BayesianGaussianMixtureDistribution, EmpiricalDistribution, ScipyDistribution
+from .unit_change import unit_change
 from .validation import RejectionResult, refute_causal_structure, refute_invertible_model
 from .whatif import average_causal_effect, counterfactual_samples, interventional_samples
diff --git a/dowhy/gcm/unit_change.py b/dowhy/gcm/unit_change.py
@@ -31,6 +31,50 @@ def coefficients(self) -> np.ndarray:
         return self.sklearn_model.coef_
 
 
+def unit_change(
+    background_df: pd.DataFrame,
+    foreground_df: pd.DataFrame,
+    input_column_names: List[str],
+    background_mechanism: PredictionModel,
+    foreground_mechanism: Optional[PredictionModel] = None,
+    shapley_config: Optional[ShapleyConfig] = None,
+) -> pd.DataFrame:
+    """
+    Attributes the change in the output value of a deterministic mechanism for a statistical unit to each input
+    and, if `foreground_mechanism` is provided, also to the change in the mechanism itself.
+    The technical method is described in the following research paper:
+    Kailash Budhathoki, George Michailidis, Dominik Janzing. *Explaining the root causes of unit-level changes*. arXiv, 2022.
+
+    :param background_df: The background dataset.
+    :param foreground_df: The foreground dataset.
+    :param input_column_names: The names of the input columns.
+    :param background_mechanism: The background mechanism. If the mechanism does not change, then this mechanism is used for attribution.
+    :param foreground_mechanism: The foreground mechanism. If provided, the method also attributes the output change to the change in the mechanism.
+    :param shapley_config: The configuration for calculating Shapley values. It is only forwarded to the non-linear (Shapley-based) methods.
+    :return: A dataframe containing the contributions of each input and optionally the mechanism to the change in the output values of the deterministic mechanism(s) for given inputs.
+    """
+    # Compare against None explicitly: a mechanism object that happens to be falsy (e.g. one defining __len__ or
+    # __bool__) must still be treated as "provided".
+    if foreground_mechanism is None:
+        # No mechanism change: attribute the output change to the inputs only.
+        if isinstance(background_mechanism, LinearPredictionModel):
+            return unit_change_linear_input_only(background_mechanism, background_df, foreground_df, input_column_names)
+        return unit_change_nonlinear_input_only(
+            background_mechanism, background_df, foreground_df, input_column_names, shapley_config
+        )
+
+    # Take the linear route only when BOTH mechanisms are linear. Dispatching on the background mechanism alone
+    # would hand a non-linear foreground mechanism to unit_change_linear.
+    # NOTE(review): unit_change_linear presumably reads the coefficients of both mechanisms -- confirm.
+    both_linear = isinstance(background_mechanism, LinearPredictionModel) and isinstance(
+        foreground_mechanism, LinearPredictionModel
+    )
+    if both_linear:
+        return unit_change_linear(
+            background_mechanism, background_df, foreground_mechanism, foreground_df, input_column_names
+        )
+    return unit_change_nonlinear(
+        background_mechanism,
+        background_df,
+        foreground_mechanism,
+        foreground_df,
+        input_column_names,
+        shapley_config,
+    )
+
+
 def unit_change_nonlinear(
     background_mechanism: PredictionModel,
     background_df: pd.DataFrame,
@@ -54,13 +98,14 @@ def unit_change_nonlinear(
     """
     _check_if_input_columns_exist(background_df, foreground_df, input_column_names)
 
-    def payoff(binary_vector: List[int]) -> np.ndarray:
+    def payoff(player_indicator: List[int]) -> np.ndarray:
-        """The last cell in the binary vector represents the player 'mechanism'."""
+        """The last cell in the player indicator represents the player 'mechanism'."""
-        background_column_names = [input_column_names[i] for i, val in enumerate(binary_vector[:-1]) if val == 0]
-        foreground_column_names = [input_column_names[i] for i, val in enumerate(binary_vector[:-1]) if val == 1]
-        df = pd.concat([background_df[background_column_names], foreground_df[foreground_column_names]], axis=1)
-        mechanism = foreground_mechanism if binary_vector[-1] else background_mechanism
-        return mechanism.predict(df[input_column_names].values).flatten()
+        input_arrays = []
+        for i, is_player_active in enumerate(player_indicator[:-1]):
+            selected_df = foreground_df if is_player_active else background_df
+            input_arrays.append(selected_df[input_column_names[i]].to_numpy())
+        mechanism = foreground_mechanism if player_indicator[-1] else background_mechanism
+        return mechanism.predict(np.column_stack(input_arrays)).flatten()
 
     contributions = estimate_shapley_values(payoff, len(input_column_names) + 1, shapley_config)
     root_causes = input_column_names + ["f"]
@@ -99,6 +144,60 @@ def unit_change_linear(
     return contribution_df
 
 
+def unit_change_nonlinear_input_only(
+    mechanism: PredictionModel,
+    background_df: pd.DataFrame,
+    foreground_df: pd.DataFrame,
+    input_column_names: List[str],
+    shapley_config: Optional[ShapleyConfig] = None,
+) -> pd.DataFrame:
+    """
+    Attributes the change in the output of a single (possibly non-linear) deterministic mechanism to its inputs.
+    Each input column is a player in a Shapley game; the mechanism itself is not a player. This is a modification
+    of the attribution method described in the following research paper:
+    Kailash Budhathoki, George Michailidis, Dominik Janzing. *Explaining the root causes of unit-level changes*. arXiv, 2022.
+
+    :param mechanism: The mechanism.
+    :param background_df: The background data.
+    :param foreground_df: The foreground data.
+    :param input_column_names: The names of the input (features) columns in both dataframes.
+    :param shapley_config: The configuration for calculating Shapley values.
+    :return: A pandas dataframe with one column per input, holding the estimated Shapley contributions.
+    """
+    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)
+
+    def payoff(player_indicator: List[int]) -> np.ndarray:
+        """Evaluates the mechanism with active players read from the foreground data and the rest from the background data."""
+        mixed_columns = [
+            (foreground_df if is_player_active else background_df)[input_column_names[i]].to_numpy()
+            for i, is_player_active in enumerate(player_indicator)
+        ]
+        return mechanism.predict(np.column_stack(mixed_columns)).flatten()
+
+    num_players = len(input_column_names)
+    shapley_values = estimate_shapley_values(payoff, num_players, shapley_config)
+    return pd.DataFrame(shapley_values, columns=input_column_names)
+
+
+def unit_change_linear_input_only(
+    mechanism: LinearPredictionModel,
+    background_df: pd.DataFrame,
+    foreground_df: pd.DataFrame,
+    input_column_names: List[str],
+) -> pd.DataFrame:
+    """
+    Calculates the contributions of each input to the change in the output values of a linear deterministic mechanism.
+    The contribution of an input is its coefficient multiplied by the change of that input between the background
+    and the foreground data.
+
+    :param mechanism: The linear mechanism.
+    :param background_df: The background data.
+    :param foreground_df: The foreground data.
+    :param input_column_names: The names of the input (features) columns in both dataframes.
+    :return: A pandas dataframe with attributions to each cause for the change in each output row of provided dataframes.
+    """
+    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)
+
+    input_diff = foreground_df[input_column_names].to_numpy() - background_df[input_column_names].to_numpy()  # n x p
+    # Scale column i of the differences by coefficient i. The coefficients are laid out as a 1 x p row so that
+    # they broadcast across the n rows; this is the element-wise product the former einsum("ij,ki->ki") computed.
+    contribution_input = input_diff * mechanism.coefficients.reshape(1, -1)
+    return pd.DataFrame(contribution_input, columns=input_column_names)
+
+
 def _check_if_input_columns_exist(
     background_df: pd.DataFrame, foreground_df: pd.DataFrame, input_column_names: List[str]
 ) -> None:

diff --git a/tests/gcm/test_unit_change.py b/tests/gcm/test_unit_change.py
@@ -7,7 +7,14 @@
 from sklearn.linear_model import LinearRegression
 
 from dowhy.gcm.ml.regression import SklearnRegressionModel
-from dowhy.gcm.unit_change import SklearnLinearRegressionModel, unit_change_linear, unit_change_nonlinear
+from dowhy.gcm.unit_change import (
+    SklearnLinearRegressionModel,
+    unit_change,
+    unit_change_linear,
+    unit_change_linear_input_only,
+    unit_change_nonlinear,
+    unit_change_nonlinear_input_only,
+)
 
 
 @flaky(max_runs=5)
@@ -40,7 +47,7 @@ def test_given_fitted_linear_mechanisms_with_output_change_when_evaluate_unit_ch
         )
     )
 
-    np.testing.assert_array_almost_equal(actual_contributions.to_numpy(), expected_contributions.to_numpy(), decimal=1)
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
 
 
 @flaky(max_runs=5)
@@ -71,7 +78,7 @@ def test_given_fitted_linear_mechanisms_with_output_change_when_evaluate_unit_ch
         background_mechanism, background_df, foreground_mechanism, foreground_df, ["A", "B"]
     )
 
-    np.testing.assert_array_almost_equal(actual_contributions.to_numpy(), expected_contributions.to_numpy(), decimal=1)
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
 
 
 def test_given_unfitted_mechanisms_when_evaluate_unit_change_methods_then_raises_exception():
@@ -112,3 +119,175 @@ def test_given_fitted_nonlinnear_mechanisms_when_evaluate_unit_change_linear_met
             pd.DataFrame(data=dict(A=np.random.normal(size=100), B=np.random.normal(size=100))),
             ["A", "B"],
         )
+
+
+@flaky(max_runs=5)
+def test_given_fitted_mechanisms_with_no_input_change_when_evaluate_unit_change_input_only_methods_then_returns_zero_attributions():
+    # Background and foreground data are identical, so no input can be responsible for any output change.
+    num_rows = 100
+    A = np.random.normal(size=num_rows)
+    B = np.random.normal(size=num_rows)
+    C = 3 * A + 2 * B
+
+    background_df = pd.DataFrame(data=dict(A=A, B=B, C=C))
+    foreground_df = pd.DataFrame(data=dict(A=A, B=B, C=C))
+    expected_contributions = pd.DataFrame(data=dict(A=np.zeros(num_rows), B=np.zeros(num_rows)))
+
+    actual_contributions = unit_change_nonlinear_input_only(
+        SklearnRegressionModel(RFR().fit(np.column_stack((A, B)), C)),
+        background_df,
+        foreground_df,
+        ["A", "B"],
+    )
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
+
+    # The test name covers both input-only methods, so the linear variant is checked as well.
+    actual_contributions = unit_change_linear_input_only(
+        SklearnLinearRegressionModel(LinearRegression().fit(np.column_stack((A, B)), C)),
+        background_df,
+        foreground_df,
+        ["A", "B"],
+    )
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
+
+
+@flaky(max_runs=5)
+def test_given_fitted_linear_mechanism_with_input_change_when_evaluate_unit_change_linear_input_only_then_returns_correct_attributions():
+    # Data follows C = 3 * A + 2 * B in both the background and the foreground sample.
+    num_rows = 100
+    coef_a, coef_b = 3, 2
+
+    A1 = np.random.normal(size=num_rows)
+    B1 = np.random.normal(size=num_rows)
+    A2 = np.random.normal(size=num_rows)
+    B2 = np.random.normal(size=num_rows)
+
+    background_df = pd.DataFrame({"A": A1, "B": B1, "C": coef_a * A1 + coef_b * B1})
+    foreground_df = pd.DataFrame({"A": A2, "B": B2, "C": coef_a * A2 + coef_b * B2})
+
+    # The coefficients are set by hand instead of being estimated, so the expected attributions are exact.
+    fitted_linear_reg = LinearRegression()
+    fitted_linear_reg.coef_ = np.array([coef_a, coef_b])
+    mechanism = SklearnLinearRegressionModel(fitted_linear_reg)
+
+    actual_contributions = unit_change_linear_input_only(mechanism, background_df, foreground_df, ["A", "B"])
+    expected_contributions = pd.DataFrame({"A": coef_a * (A2 - A1), "B": coef_b * (B2 - B1)})
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
+
+
+@flaky(max_runs=5)
+def test_given_fitted_linear_mechanism_with_input_change_when_evaluate_unit_change_input_only_methods_then_attributions_are_consistent():
+    # For a linear mechanism, the Shapley-based method should agree with the closed-form linear method.
+    num_rows = 100
+    input_names = ["A", "B"]
+
+    A1 = np.random.normal(size=num_rows)
+    B1 = np.random.normal(size=num_rows)
+    C1 = 3 * A1 + 2 * B1
+    background_df = pd.DataFrame({"A": A1, "B": B1, "C": C1})
+
+    A2 = np.random.normal(size=num_rows)
+    B2 = np.random.normal(size=num_rows)
+    C2 = 3 * A2 + 2 * B2
+    foreground_df = pd.DataFrame({"A": A2, "B": B2, "C": C2})
+
+    fitted_regression = LinearRegression().fit(np.column_stack((A1, B1)), C1)
+    mechanism = SklearnLinearRegressionModel(fitted_regression)
+
+    actual_contributions = unit_change_nonlinear_input_only(mechanism, background_df, foreground_df, input_names)
+    expected_contributions = unit_change_linear_input_only(mechanism, background_df, foreground_df, input_names)
+
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
+
+
+def test_given_unfitted_mechanisms_when_evaluate_unit_change_input_only_methods_then_raises_exception():
+    # Every input-only method is expected to raise NotFittedError when handed an unfitted mechanism.
+    def random_inputs() -> pd.DataFrame:
+        return pd.DataFrame(data=dict(A=np.random.normal(size=100), B=np.random.normal(size=100)))
+
+    # Linear method with an unfitted linear model.
+    with pytest.raises(NotFittedError):
+        unit_change_linear_input_only(
+            SklearnLinearRegressionModel(LinearRegression()), random_inputs(), random_inputs(), ["A", "B"]
+        )
+
+    # Non-linear method with an unfitted linear model.
+    with pytest.raises(NotFittedError):
+        unit_change_nonlinear_input_only(
+            SklearnLinearRegressionModel(LinearRegression()), random_inputs(), random_inputs(), ["A", "B"]
+        )
+
+    # Non-linear method with an unfitted random forest.
+    with pytest.raises(NotFittedError):
+        unit_change_nonlinear_input_only(SklearnRegressionModel(RFR()), random_inputs(), random_inputs(), ["A", "B"])
+
+
+def test_given_fitted_nonlinnear_mechanism_when_evaluate_unit_change_linear_input_only_method_then_raises_exception():
+    # The linear method is expected to fail with AttributeError when given a fitted random forest wrapper.
+    def random_inputs() -> pd.DataFrame:
+        return pd.DataFrame(data=dict(A=np.random.normal(size=100), B=np.random.normal(size=100)))
+
+    with pytest.raises(AttributeError):
+        unit_change_linear_input_only(
+            SklearnRegressionModel(RFR().fit(np.random.normal(size=(100, 2)), np.random.normal(size=100))),
+            random_inputs(),
+            random_inputs(),
+            ["A", "B"],
+        )
+
+
+@flaky(max_runs=5)
+def test_given_single_mechanism_with_default_optional_parameters_when_evaluate_unit_change_then_returns_correct_attributions_to_input_only():
+    # Without a foreground mechanism, unit_change must match the input-only method for the mechanism type.
+    num_rows = 100
+    A1 = np.random.normal(size=num_rows)
+    B1 = np.random.normal(size=num_rows)
+    C1 = 2 * A1 + 3 * B1
+
+    # Only the foreground inputs are needed; no foreground target is used because no second mechanism is fitted.
+    A2 = np.random.normal(size=num_rows)
+    B2 = np.random.normal(size=num_rows)
+
+    background_df = pd.DataFrame(data=dict(A=A1, B=B1))
+    foreground_df = pd.DataFrame(data=dict(A=A2, B=B2))
+
+    # Linear mechanism: compared against unit_change_linear_input_only.
+    mechanism = SklearnLinearRegressionModel(LinearRegression(fit_intercept=False).fit(np.column_stack((A1, B1)), C1))
+
+    actual_contributions = unit_change(background_df, foreground_df, ["A", "B"], mechanism)
+    expected_contributions = unit_change_linear_input_only(mechanism, background_df, foreground_df, ["A", "B"])
+
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
+
+    # Non-linear mechanism: compared against unit_change_nonlinear_input_only.
+    mechanism = SklearnRegressionModel(RFR().fit(np.column_stack((A1, B1)), C1))
+
+    actual_contributions = unit_change(background_df, foreground_df, ["A", "B"], mechanism)
+    expected_contributions = unit_change_nonlinear_input_only(mechanism, background_df, foreground_df, ["A", "B"])
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
+
+
+@flaky(max_runs=5)
+def test_given_two_mechanisms_when_evaluate_unit_change_then_returns_correct_attributions_to_both_mechanism_and_input():
+    # With a foreground mechanism, unit_change must match the method that also attributes to the mechanism.
+    num_rows = 100
+    input_names = ["A", "B"]
+
+    A1 = np.random.normal(size=num_rows)
+    B1 = np.random.normal(size=num_rows)
+    C1 = 2 * A1 + 3 * B1
+    background_inputs = np.column_stack((A1, B1))
+
+    A2 = np.random.normal(size=num_rows)
+    B2 = np.random.normal(size=num_rows)
+    C2 = 3 * A2 + 2 * B2
+    foreground_inputs = np.column_stack((A2, B2))
+
+    background_df = pd.DataFrame({"A": A1, "B": B1})
+    foreground_df = pd.DataFrame({"A": A2, "B": B2})
+
+    # Two linear mechanisms: compared against unit_change_linear.
+    background_mechanism = SklearnLinearRegressionModel(
+        LinearRegression(fit_intercept=False).fit(background_inputs, C1)
+    )
+    foreground_mechanism = SklearnLinearRegressionModel(
+        LinearRegression(fit_intercept=False).fit(foreground_inputs, C2)
+    )
+
+    actual_contributions = unit_change(
+        background_df, foreground_df, input_names, background_mechanism, foreground_mechanism
+    )
+    expected_contributions = unit_change_linear(
+        background_mechanism, background_df, foreground_mechanism, foreground_df, input_names
+    )
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)
+
+    # Two random-forest mechanisms: compared against unit_change_nonlinear.
+    background_mechanism = SklearnRegressionModel(RFR().fit(background_inputs, C1))
+    foreground_mechanism = SklearnRegressionModel(RFR().fit(foreground_inputs, C2))
+
+    actual_contributions = unit_change(
+        background_df, foreground_df, input_names, background_mechanism, foreground_mechanism
+    )
+    expected_contributions = unit_change_nonlinear(
+        background_mechanism, background_df, foreground_mechanism, foreground_df, input_names
+    )
+    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)