From 868ea5d9b53d683a6e32ca84eae87e9b90098f22 Mon Sep 17 00:00:00 2001 From: Michael Foster Date: Tue, 30 Jan 2024 09:16:43 +0000 Subject: [PATCH 1/5] We now support placing conditions on the data again. --- causal_testing/json_front/json_class.py | 5 ++- causal_testing/testing/estimators.py | 47 +++++++++++++++++++------ 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py index f60e86c0..3a5bb25e 100644 --- a/causal_testing/json_front/json_class.py +++ b/causal_testing/json_front/json_class.py @@ -301,9 +301,6 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima """Create the necessary inputs for a single test case :param causal_test_case: The concrete test case to be executed :param test: Single JSON test definition stored in a mapping (dict) - :param conditions: A list of conditions which should be applied to the - data. Conditions should be in the query format detailed at - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html :returns: - estimation_model - Estimator instance for the test being run """ @@ -315,6 +312,7 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima "formulas" ) estimator_kwargs["formula"] = test["formula"] + estimator_kwargs["query"] = test["query"] if "query" in test else "" estimator_kwargs["adjustment_set"] = None else: minimal_adjustment_set = self.causal_specification.causal_dag.identification( @@ -328,6 +326,7 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima estimator_kwargs["control_value"] = causal_test_case.control_value estimator_kwargs["outcome"] = causal_test_case.outcome_variable.name estimator_kwargs["effect_modifiers"] = causal_test_case.effect_modifier_configuration + estimator_kwargs["df"] = self.data_collector.collect_data() estimator_kwargs["alpha"] = test["alpha"] if "alpha" in test else 0.05 estimation_model = test["estimator"](**estimator_kwargs) diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py index 4e56562c..4ed80736 100644 --- a/causal_testing/testing/estimators.py +++ b/causal_testing/testing/estimators.py @@ -50,14 +50,16 @@ def __init__( df: pd.DataFrame = None, effect_modifiers: dict[str:Any] = None, alpha: float = 0.05, + query: str = "", ): self.treatment = treatment self.treatment_value = treatment_value self.control_value = control_value self.adjustment_set = adjustment_set self.outcome = outcome - self.df = df self.alpha = alpha + self.df = df.query(query) if query else df + if effect_modifiers is None: self.effect_modifiers = {} elif isinstance(effect_modifiers, dict): @@ -65,6 +67,8 @@ def __init__( else: raise ValueError(f"Unsupported type for effect_modifiers {effect_modifiers}. Expected iterable") self.modelling_assumptions = [] + if query: + self.modelling_assumptions.append(query) self.add_modelling_assumptions() logger.debug("Effect Modifiers: %s", self.effect_modifiers) @@ -100,8 +104,18 @@ def __init__( df: pd.DataFrame = None, effect_modifiers: dict[str:Any] = None, formula: str = None, + query: str = "", ): - super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers) + super().__init__( + treatment=treatment, + treatment_value=treatment_value, + control_value=control_value, + adjustment_set=adjustment_set, + outcome=outcome, + df=df, + effect_modifiers=effect_modifiers, + query=query, + ) self.model = None @@ -116,13 +130,13 @@ def add_modelling_assumptions(self): Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that must hold if the resulting causal inference is to be considered valid. """ - self.modelling_assumptions += ( + self.modelling_assumptions.append( "The variables in the data must fit a shape which can be expressed as a linear" "combination of parameters and functions of variables. Note that these functions" "do not need to be linear." ) - self.modelling_assumptions += "The outcome must be binary." - self.modelling_assumptions += "Independently and identically distributed errors." + self.modelling_assumptions.append("The outcome must be binary.") + self.modelling_assumptions.append("Independently and identically distributed errors.") def _run_logistic_regression(self, data) -> RegressionResultsWrapper: """Run logistic regression of the treatment and adjustment set against the outcome and return the model. @@ -291,9 +305,18 @@ def __init__( effect_modifiers: dict[Variable:Any] = None, formula: str = None, alpha: float = 0.05, + query: str = "", ): super().__init__( - treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, alpha=alpha + treatment, + treatment_value, + control_value, + adjustment_set, + outcome, + df, + effect_modifiers, + alpha=alpha, + query=query, ) self.model = None @@ -314,7 +337,7 @@ def add_modelling_assumptions(self): Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that must hold if the resulting causal inference is to be considered valid. """ - self.modelling_assumptions += ( + self.modelling_assumptions.append( "The variables in the data must fit a shape which can be expressed as a linear" "combination of parameters and functions of variables. Note that these functions" "do not need to be linear." @@ -468,13 +491,17 @@ def add_modelling_assumptions(self): Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that must hold if the resulting causal inference is to be considered valid. """ - self.modelling_assumptions += """The instrument and the treatment, and the treatment and the outcome must be + self.modelling_assumptions.append( + """The instrument and the treatment, and the treatment and the outcome must be related linearly in the form Y = aX + b.""" - self.modelling_assumptions += """The three IV conditions must hold + ) + self.modelling_assumptions.append( + """The three IV conditions must hold (i) Instrument is associated with treatment (ii) Instrument does not affect outcome except through its potential effect on treatment (iii) Instrument and outcome do not share causes """ + ) def estimate_iv_coefficient(self, df): """ @@ -517,7 +544,7 @@ def add_modelling_assumptions(self): :return self: Update self.modelling_assumptions """ - self.modelling_assumptions += "Non-parametric estimator: no restrictions imposed on the data." + self.modelling_assumptions.append("Non-parametric estimator: no restrictions imposed on the data.") def estimate_ate(self) -> float: """Estimate the average treatment effect. From d18c40742fa155840dd83d7f5406f1c3eec2daf4 Mon Sep 17 00:00:00 2001 From: Michael Foster Date: Tue, 30 Jan 2024 09:27:29 +0000 Subject: [PATCH 2/5] codecov --- tests/testing_tests/test_estimators.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/testing_tests/test_estimators.py b/tests/testing_tests/test_estimators.py index 835a1144..64ad4ac3 100644 --- a/tests/testing_tests/test_estimators.py +++ b/tests/testing_tests/test_estimators.py @@ -124,16 +124,14 @@ def test_ate_adjustment(self): logistic_regression_estimator = LogisticRegressionEstimator( "length_in", 65, 55, {"large_gauge"}, "completed", df ) - ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config = {"large_gauge": 0}) + ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config={"large_gauge": 0}) self.assertEqual(round(ate, 4), -0.3388) def test_ate_invalid_adjustment(self): df = self.scarf_df.copy() logistic_regression_estimator = LogisticRegressionEstimator("length_in", 65, 55, {}, "completed", df) with self.assertRaises(ValueError): - ate, _ = logistic_regression_estimator.estimate_ate( - adjustment_config = {"large_gauge": 0} - ) + ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config={"large_gauge": 0}) def test_ate_effect_modifiers(self): df = self.scarf_df.copy() @@ -215,6 +213,13 @@ def setUpClass(cls) -> None: cls.nhefs_df = load_nhefs_df() cls.chapter_11_df = load_chapter_11_df() + def test_query(self): + df = self.nhefs_df + linear_regression_estimator = LinearRegressionEstimator( + "treatments", None, None, set(), "outcomes", df, query="sex==1" + ) + self.assertTrue(linear_regression_estimator.df.sex.all()) + def test_program_11_2(self): """Test whether our linear regression implementation produces the same results as program 11.2 (p. 141).""" df = self.chapter_11_df @@ -394,7 +399,7 @@ def test_program_15_no_interaction_ate_calculated(self): # for term_to_square in terms_to_square: ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated( - adjustment_config = {k: self.nhefs_df.mean()[k] for k in covariates} + adjustment_config={k: self.nhefs_df.mean()[k] for k in covariates} ) self.assertEqual(round(ate, 1), 3.5) self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [1.9, 5]) From 2c52186ec172c68b65729c6bdf5e606087cb3b8a Mon Sep 17 00:00:00 2001 From: Michael Foster Date: Tue, 30 Jan 2024 14:40:51 +0000 Subject: [PATCH 3/5] iv estimators with new param format --- causal_testing/testing/causal_test_result.py | 5 +++-- causal_testing/testing/estimators.py | 14 +++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/causal_testing/testing/causal_test_result.py b/causal_testing/testing/causal_test_result.py index c0af8bc9..7963afd1 100644 --- a/causal_testing/testing/causal_test_result.py +++ b/causal_testing/testing/causal_test_result.py @@ -59,9 +59,10 @@ def push(s, inc=" "): f"Treatment value: {self.estimator.treatment_value}\n" f"Outcome: {self.estimator.outcome}\n" f"Adjustment set: {self.adjustment_set}\n" - f"Formula: {self.estimator.formula}\n" - f"{self.test_value.type}: {result_str}\n" ) + if hasattr(self.estimator, "formula"): + base_str += f"Formula: {self.estimator.formula}\n" + base_str += f"{self.test_value.type}: {result_str}\n" confidence_str = "" if self.confidence_intervals: ci_str = " " + str(self.confidence_intervals) diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py index 4ed80736..74a4bed5 100644 --- a/causal_testing/testing/estimators.py +++ b/causal_testing/testing/estimators.py @@ -480,8 +480,20 @@ def __init__( df: pd.DataFrame = None, intercept: int = 1, effect_modifiers: dict = None, # Not used (yet?). Needed for compatibility + alpha: float = 0.05, + query: str = "", ): - super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, None) + super().__init__( + treatment=treatment, + treatment_value=treatment_value, + control_value=control_value, + adjustment_set=adjustment_set, + outcome=outcome, + df=df, + effect_modifiers=None, + alpha=alpha, + query=query, + ) self.intercept = intercept self.model = None self.instrument = instrument From 2cca321430c4eeecdc0b730c12092bafb662cce1 Mon Sep 17 00:00:00 2001 From: Michael Foster Date: Tue, 30 Jan 2024 14:54:24 +0000 Subject: [PATCH 4/5] fixed bug query in formula if statement --- causal_testing/json_front/json_class.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py index 3a5bb25e..88386441 100644 --- a/causal_testing/json_front/json_class.py +++ b/causal_testing/json_front/json_class.py @@ -312,7 +312,6 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima "formulas" ) estimator_kwargs["formula"] = test["formula"] - estimator_kwargs["query"] = test["query"] if "query" in test else "" estimator_kwargs["adjustment_set"] = None else: minimal_adjustment_set = self.causal_specification.causal_dag.identification( @@ -321,6 +320,7 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima minimal_adjustment_set = minimal_adjustment_set - {causal_test_case.treatment_variable} estimator_kwargs["adjustment_set"] = minimal_adjustment_set + estimator_kwargs["query"] = test["query"] if "query" in test else "" estimator_kwargs["treatment"] = causal_test_case.treatment_variable.name estimator_kwargs["treatment_value"] = causal_test_case.treatment_value estimator_kwargs["control_value"] = causal_test_case.control_value From bb8e3f53088d9262fef26ac4b39dced1f9fda22c Mon Sep 17 00:00:00 2001 From: Michael Foster Date: Wed, 31 Jan 2024 08:25:13 +0000 Subject: [PATCH 5/5] pylint --- causal_testing/testing/estimators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py index 74a4bed5..492ae417 100644 --- a/causal_testing/testing/estimators.py +++ b/causal_testing/testing/estimators.py @@ -10,7 +10,7 @@ import statsmodels.api as sm import statsmodels.formula.api as smf from econml.dml import CausalForestDML -from patsy import dmatrix +from patsy import dmatrix # pylint: disable = no-name-in-module from sklearn.ensemble import GradientBoostingRegressor from statsmodels.regression.linear_model import RegressionResultsWrapper