Skip to content

Commit

Permalink
Add APIs for attributing unit change without a mechanism change
Browse files Browse the repository at this point in the history
Signed-off-by: Kailash <[email protected]>
  • Loading branch information
kailashbuki committed Nov 14, 2022
1 parent 675cbc4 commit ac04706
Show file tree
Hide file tree
Showing 3 changed files with 288 additions and 9 deletions.
1 change: 1 addition & 0 deletions dowhy/gcm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,6 @@
from .independence_test import approx_kernel_based, independence_test, kernel_based, regression_based
from .influence import arrow_strength, intrinsic_causal_influence
from .stochastic_models import BayesianGaussianMixtureDistribution, EmpiricalDistribution, ScipyDistribution
from .unit_change import unit_change
from .validation import RejectionResult, refute_causal_structure, refute_invertible_model
from .whatif import average_causal_effect, counterfactual_samples, interventional_samples
111 changes: 105 additions & 6 deletions dowhy/gcm/unit_change.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,50 @@ def coefficients(self) -> np.ndarray:
return self.sklearn_model.coef_


def unit_change(
    background_df: pd.DataFrame,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
    background_mechanism: PredictionModel,
    foreground_mechanism: Optional[PredictionModel] = None,
    shapley_config: Optional[ShapleyConfig] = None,
) -> pd.DataFrame:
    """
    This function attributes the change in the output value of a deterministic mechanism for a statistical unit to
    each input and optionally for the mechanism if `foreground_mechanism` is provided.

    The technical method is described in the following research paper:
    Kailash Budhathoki, George Michailidis, Dominik Janzing. *Explaining the root causes of unit-level changes*.
    arXiv, 2022.

    Dispatch rules:

    - If `foreground_mechanism` is given and both mechanisms are `LinearPredictionModel` instances, the call is
      forwarded to `unit_change_linear`; otherwise it is forwarded to `unit_change_nonlinear`.
    - If `foreground_mechanism` is `None`, the call is forwarded to `unit_change_linear_input_only` for a
      `LinearPredictionModel` and to `unit_change_nonlinear_input_only` otherwise.

    :param background_df: The background dataset.
    :param foreground_df: The foreground dataset.
    :param input_column_names: The names of the input columns.
    :param background_mechanism: The background mechanism. If the mechanism does not change, then this mechanism is
                                 used for attribution.
    :param foreground_mechanism: The foreground mechanism. If provided, the method also attributes the output change
                                 to the change in the mechanism.
    :param shapley_config: The configuration for calculating Shapley values. It is only forwarded to the non-linear
                           variants; the linear variants are called without it.
    :return: A dataframe containing the contributions of each input and optionally the mechanism to the change in the
             output values of the deterministic mechanism(s) for given inputs.
    """
    # Compare against None explicitly: a mechanism object could define __bool__/__len__ and evaluate as falsy,
    # which would silently drop the mechanism attribution.
    if foreground_mechanism is not None:
        # NOTE(review): unit_change_linear is typed for linear mechanisms and presumably reads coefficients from
        # both of them, so the linear path is only taken when BOTH mechanisms are linear. A mixed pair falls back
        # to the model-agnostic non-linear attribution instead of failing on the non-linear mechanism.
        both_linear = isinstance(background_mechanism, LinearPredictionModel) and isinstance(
            foreground_mechanism, LinearPredictionModel
        )
        if both_linear:
            return unit_change_linear(
                background_mechanism, background_df, foreground_mechanism, foreground_df, input_column_names
            )
        return unit_change_nonlinear(
            background_mechanism,
            background_df,
            foreground_mechanism,
            foreground_df,
            input_column_names,
            shapley_config,
        )

    # No foreground mechanism: only the change in the inputs is attributed.
    if isinstance(background_mechanism, LinearPredictionModel):
        return unit_change_linear_input_only(background_mechanism, background_df, foreground_df, input_column_names)
    return unit_change_nonlinear_input_only(
        background_mechanism, background_df, foreground_df, input_column_names, shapley_config
    )


def unit_change_nonlinear(
background_mechanism: PredictionModel,
background_df: pd.DataFrame,
Expand All @@ -54,13 +98,14 @@ def unit_change_nonlinear(
"""
_check_if_input_columns_exist(background_df, foreground_df, input_column_names)

def payoff(binary_vector: List[int]) -> np.ndarray:
def payoff(player_indicator: List[int]) -> np.ndarray:
"""The last cell in the binary vector represents the player 'mechanism'."""
background_column_names = [input_column_names[i] for i, val in enumerate(binary_vector[:-1]) if val == 0]
foreground_column_names = [input_column_names[i] for i, val in enumerate(binary_vector[:-1]) if val == 1]
df = pd.concat([background_df[background_column_names], foreground_df[foreground_column_names]], axis=1)
mechanism = foreground_mechanism if binary_vector[-1] else background_mechanism
return mechanism.predict(df[input_column_names].values).flatten()
input_arrays = []
for i, is_player_active in enumerate(player_indicator[:-1]):
selected_df = foreground_df if is_player_active else background_df
input_arrays.append(selected_df[input_column_names[i]].to_numpy())
mechanism = foreground_mechanism if player_indicator[-1] else background_mechanism
return mechanism.predict(np.column_stack(input_arrays)).flatten()

contributions = estimate_shapley_values(payoff, len(input_column_names) + 1, shapley_config)
root_causes = input_column_names + ["f"]
Expand Down Expand Up @@ -99,6 +144,60 @@ def unit_change_linear(
return contribution_df


def unit_change_nonlinear_input_only(
    mechanism: PredictionModel,
    background_df: pd.DataFrame,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
    shapley_config: Optional[ShapleyConfig] = None,
) -> pd.DataFrame:
    """
    Attributes the change in the output values of a non-linear deterministic mechanism to each of its inputs.

    Each input column is treated as one player of a Shapley game; the mechanism itself is not a player here. This is
    a modification of the attribution method described in the following research paper:
    Kailash Budhathoki, George Michailidis, Dominik Janzing. *Explaining the root causes of unit-level changes*.
    arXiv, 2022.

    :param mechanism: The mechanism.
    :param background_df: The background data.
    :param foreground_df: The foreground data.
    :param input_column_names: The names of the input (features) columns in both dataframes.
    :param shapley_config: The configuration for calculating Shapley values.
    :return: A pandas dataframe with attributions to each cause for the change in each output row of provided
             dataframes.
    """
    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)

    def payoff(player_indicator: List[int]) -> np.ndarray:
        """Predicts on hybrid inputs: active players take foreground values, the others background values."""
        hybrid_columns = [
            (foreground_df if takes_foreground else background_df)[column_name].to_numpy()
            for column_name, takes_foreground in zip(input_column_names, player_indicator)
        ]
        return mechanism.predict(np.column_stack(hybrid_columns)).flatten()

    num_players = len(input_column_names)
    shapley_values = estimate_shapley_values(payoff, num_players, shapley_config)
    return pd.DataFrame(shapley_values, columns=input_column_names)


def unit_change_linear_input_only(
    mechanism: LinearPredictionModel,
    background_df: pd.DataFrame,
    foreground_df: pd.DataFrame,
    input_column_names: List[str],
) -> pd.DataFrame:
    """
    Attributes the change in the output values of a linear deterministic mechanism to each of its inputs.

    The contribution of an input in a row is its coefficient multiplied by the difference between the foreground and
    the background value of that input.

    :param mechanism: The linear mechanism.
    :param background_df: The background data.
    :param foreground_df: The foreground data.
    :param input_column_names: The names of the input (features) columns in both dataframes.
    :return: A pandas dataframe with attributions to each cause for the change in each output row of provided
             dataframes.
    """
    _check_if_input_columns_exist(background_df, foreground_df, input_column_names)

    foreground_inputs = foreground_df[input_column_names].to_numpy()
    background_inputs = background_df[input_column_names].to_numpy()
    input_change = foreground_inputs - background_inputs  # n x p

    coefficient_column = mechanism.coefficients.reshape(-1, 1)  # p x 1
    # Scales column i of the input change by coefficient i; the summed axis j has length one.
    scaled_change = np.einsum("ij,ki->ki", coefficient_column, input_change)
    return pd.DataFrame(scaled_change, columns=input_column_names)


def _check_if_input_columns_exist(
background_df: pd.DataFrame, foreground_df: pd.DataFrame, input_column_names: List[str]
) -> None:
Expand Down
185 changes: 182 additions & 3 deletions tests/gcm/test_unit_change.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,14 @@
from sklearn.linear_model import LinearRegression

from dowhy.gcm.ml.regression import SklearnRegressionModel
from dowhy.gcm.unit_change import SklearnLinearRegressionModel, unit_change_linear, unit_change_nonlinear
from dowhy.gcm.unit_change import (
SklearnLinearRegressionModel,
unit_change,
unit_change_linear,
unit_change_linear_input_only,
unit_change_nonlinear,
unit_change_nonlinear_input_only,
)


@flaky(max_runs=5)
Expand Down Expand Up @@ -40,7 +47,7 @@ def test_given_fitted_linear_mechanisms_with_output_change_when_evaluate_unit_ch
)
)

np.testing.assert_array_almost_equal(actual_contributions.to_numpy(), expected_contributions.to_numpy(), decimal=1)
np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)


@flaky(max_runs=5)
Expand Down Expand Up @@ -71,7 +78,7 @@ def test_given_fitted_linear_mechanisms_with_output_change_when_evaluate_unit_ch
background_mechanism, background_df, foreground_mechanism, foreground_df, ["A", "B"]
)

np.testing.assert_array_almost_equal(actual_contributions.to_numpy(), expected_contributions.to_numpy(), decimal=1)
np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)


def test_given_unfitted_mechanisms_when_evaluate_unit_change_methods_then_raises_exception():
Expand Down Expand Up @@ -112,3 +119,175 @@ def test_given_fitted_nonlinnear_mechanisms_when_evaluate_unit_change_linear_met
pd.DataFrame(data=dict(A=np.random.normal(size=100), B=np.random.normal(size=100))),
["A", "B"],
)


@flaky(max_runs=5)
def test_given_fitted_mechanisms_with_no_input_change_when_evaluate_unit_change_input_only_methods_then_returns_zero_attributions():
    """Identical background and foreground inputs must yield (approximately) zero attributions."""
    num_rows = 100
    input_names = ["A", "B"]
    A = np.random.normal(size=num_rows)
    B = np.random.normal(size=num_rows)
    C = 3 * A + 2 * B

    # Both datasets hold exactly the same values, so no input changes between them.
    background_df = pd.DataFrame({"A": A, "B": B, "C": C})
    foreground_df = pd.DataFrame({"A": A, "B": B, "C": C})

    mechanism = SklearnRegressionModel(RFR().fit(np.column_stack((A, B)), C))
    actual_contributions = unit_change_nonlinear_input_only(mechanism, background_df, foreground_df, input_names)

    expected_contributions = pd.DataFrame({"A": np.zeros(num_rows), "B": np.zeros(num_rows)})
    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)


@flaky(max_runs=5)
def test_given_fitted_linear_mechanism_with_input_change_when_evaluate_unit_change_linear_input_only_then_returns_correct_attributions():
    """For a linear mechanism, each attribution must equal the coefficient times the change of that input."""
    num_rows = 100
    coef_a, coef_b = 3, 2

    A1 = np.random.normal(size=num_rows)
    B1 = np.random.normal(size=num_rows)
    C1 = coef_a * A1 + coef_b * B1

    A2 = np.random.normal(size=num_rows)
    B2 = np.random.normal(size=num_rows)
    C2 = coef_a * A2 + coef_b * B2

    background_df = pd.DataFrame({"A": A1, "B": B1, "C": C1})
    foreground_df = pd.DataFrame({"A": A2, "B": B2, "C": C2})

    # The coefficients are set by hand instead of being fitted, so they are known exactly.
    fitted_linear_reg = LinearRegression()
    fitted_linear_reg.coef_ = np.array([coef_a, coef_b])
    mechanism = SklearnLinearRegressionModel(fitted_linear_reg)

    actual_contributions = unit_change_linear_input_only(mechanism, background_df, foreground_df, ["A", "B"])

    expected_contributions = pd.DataFrame({"A": coef_a * (A2 - A1), "B": coef_b * (B2 - B1)})
    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)


@flaky(max_runs=5)
def test_given_fitted_linear_mechanism_with_input_change_when_evaluate_unit_change_input_only_methods_then_attributions_are_consistent():
    """The Shapley-based and the closed-form linear attribution must agree for a linear mechanism."""
    num_rows = 100
    input_names = ["A", "B"]

    A1 = np.random.normal(size=num_rows)
    B1 = np.random.normal(size=num_rows)
    C1 = 3 * A1 + 2 * B1

    A2 = np.random.normal(size=num_rows)
    B2 = np.random.normal(size=num_rows)
    C2 = 3 * A2 + 2 * B2

    background_df = pd.DataFrame({"A": A1, "B": B1, "C": C1})
    foreground_df = pd.DataFrame({"A": A2, "B": B2, "C": C2})

    # The same fitted linear mechanism is evaluated by both attribution methods.
    background_inputs = np.column_stack((A1, B1))
    mechanism = SklearnLinearRegressionModel(LinearRegression().fit(background_inputs, C1))

    shapley_based = unit_change_nonlinear_input_only(mechanism, background_df, foreground_df, input_names)
    closed_form = unit_change_linear_input_only(mechanism, background_df, foreground_df, input_names)

    np.testing.assert_array_almost_equal(shapley_based, closed_form, decimal=1)


def test_given_unfitted_mechanisms_when_evaluate_unit_change_input_only_methods_then_raises_exception():
    """Every input-only attribution method must raise NotFittedError when given an unfitted mechanism."""

    def random_inputs() -> pd.DataFrame:
        # Fresh random data per call; column A is drawn before column B.
        return pd.DataFrame(data=dict(A=np.random.normal(size=100), B=np.random.normal(size=100)))

    unfitted_cases = [
        (unit_change_linear_input_only, SklearnLinearRegressionModel(LinearRegression())),
        (unit_change_nonlinear_input_only, SklearnLinearRegressionModel(LinearRegression())),
        (unit_change_nonlinear_input_only, SklearnRegressionModel(RFR())),
    ]

    for attribution_method, unfitted_mechanism in unfitted_cases:
        with pytest.raises(NotFittedError):
            attribution_method(unfitted_mechanism, random_inputs(), random_inputs(), ["A", "B"])


def test_given_fitted_nonlinnear_mechanism_when_evaluate_unit_change_linear_input_only_method_then_raises_exception():
    """The linear input-only attribution must raise AttributeError for a fitted non-linear mechanism."""
    training_inputs = np.random.normal(size=(100, 2))
    training_target = np.random.normal(size=100)
    nonlinear_mechanism = SklearnRegressionModel(RFR().fit(training_inputs, training_target))

    background_df = pd.DataFrame({"A": np.random.normal(size=100), "B": np.random.normal(size=100)})
    foreground_df = pd.DataFrame({"A": np.random.normal(size=100), "B": np.random.normal(size=100)})

    with pytest.raises(AttributeError):
        unit_change_linear_input_only(nonlinear_mechanism, background_df, foreground_df, ["A", "B"])


@flaky(max_runs=5)
def test_given_single_mechanism_with_default_optional_parameters_when_evaluate_unit_change_then_returns_correct_attributions_to_input_only():
    """Without a foreground mechanism, `unit_change` must match the corresponding input-only method."""
    num_rows = 100
    input_names = ["A", "B"]

    A1 = np.random.normal(size=num_rows)
    B1 = np.random.normal(size=num_rows)
    C1 = 2 * A1 + 3 * B1

    # Only foreground inputs are drawn; no foreground target is needed because no foreground mechanism is fitted.
    A2 = np.random.normal(size=num_rows)
    B2 = np.random.normal(size=num_rows)

    background_df = pd.DataFrame({"A": A1, "B": B1})
    foreground_df = pd.DataFrame({"A": A2, "B": B2})
    background_inputs = np.column_stack((A1, B1))

    # Linear mechanism: expected to match the closed-form linear input-only attribution.
    linear_mechanism = SklearnLinearRegressionModel(LinearRegression(fit_intercept=False).fit(background_inputs, C1))
    actual_contributions = unit_change(background_df, foreground_df, input_names, linear_mechanism)
    expected_contributions = unit_change_linear_input_only(
        linear_mechanism, background_df, foreground_df, input_names
    )
    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)

    # Non-linear mechanism: expected to match the Shapley-based input-only attribution.
    nonlinear_mechanism = SklearnRegressionModel(RFR().fit(background_inputs, C1))
    actual_contributions = unit_change(background_df, foreground_df, input_names, nonlinear_mechanism)
    expected_contributions = unit_change_nonlinear_input_only(
        nonlinear_mechanism, background_df, foreground_df, input_names
    )
    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)


@flaky(max_runs=5)
def test_given_two_mechanisms_when_evaluate_unit_change_then_returns_correct_attributions_to_both_mechanism_and_input():
    """With a foreground mechanism, `unit_change` must match the corresponding two-mechanism method."""
    num_rows = 100
    input_names = ["A", "B"]

    A1 = np.random.normal(size=num_rows)
    B1 = np.random.normal(size=num_rows)
    C1 = 2 * A1 + 3 * B1

    A2 = np.random.normal(size=num_rows)
    B2 = np.random.normal(size=num_rows)
    C2 = 3 * A2 + 2 * B2

    background_df = pd.DataFrame({"A": A1, "B": B1})
    foreground_df = pd.DataFrame({"A": A2, "B": B2})
    background_inputs = np.column_stack((A1, B1))
    foreground_inputs = np.column_stack((A2, B2))

    # Linear mechanisms: expected to match unit_change_linear.
    background_mechanism = SklearnLinearRegressionModel(
        LinearRegression(fit_intercept=False).fit(background_inputs, C1)
    )
    foreground_mechanism = SklearnLinearRegressionModel(
        LinearRegression(fit_intercept=False).fit(foreground_inputs, C2)
    )
    actual_contributions = unit_change(
        background_df, foreground_df, input_names, background_mechanism, foreground_mechanism
    )
    expected_contributions = unit_change_linear(
        background_mechanism, background_df, foreground_mechanism, foreground_df, input_names
    )
    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)

    # Non-linear mechanisms: expected to match unit_change_nonlinear.
    background_mechanism = SklearnRegressionModel(RFR().fit(background_inputs, C1))
    foreground_mechanism = SklearnRegressionModel(RFR().fit(foreground_inputs, C2))
    actual_contributions = unit_change(
        background_df, foreground_df, input_names, background_mechanism, foreground_mechanism
    )
    expected_contributions = unit_change_nonlinear(
        background_mechanism, background_df, foreground_mechanism, foreground_df, input_names
    )
    np.testing.assert_array_almost_equal(actual_contributions, expected_contributions, decimal=1)

0 comments on commit ac04706

Please sign in to comment.