From 576920f6f087805f4b7f2d164d085b71db31b249 Mon Sep 17 00:00:00 2001 From: pzivich Date: Wed, 17 Jul 2019 13:13:31 -0400 Subject: [PATCH 01/42] v0.8.1 label --- zepid/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zepid/version.py b/zepid/version.py index 32a90a3..ef72cc0 100644 --- a/zepid/version.py +++ b/zepid/version.py @@ -1 +1 @@ -__version__ = '0.8.0' +__version__ = '0.8.1' From 032d3f8a9c05bd630a1d0344262a3998d706f3e8 Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 1 Aug 2019 14:10:13 -0400 Subject: [PATCH 02/42] Adding ATT and ATU to g-formula --- tests/test_gformula.py | 19 ++++++++++++ zepid/causal/gformula/TimeFixed.py | 50 +++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/tests/test_gformula.py b/tests/test_gformula.py index d0c7ba0..42d5cd2 100644 --- a/tests/test_gformula.py +++ b/tests/test_gformula.py @@ -268,6 +268,25 @@ def test_conditional_stochastic_warning(self): with pytest.warns(UserWarning): g.fit_stochastic(p=[1.0, 1.0], conditional=["(g['L']==1) | (g['L']==0)", "g['L']==0"]) + def test_exposed_standardized1(self, data): + g = TimeFixedGFormula(data, exposure='A', outcome='Y', standardize='exposed') + g.outcome_model(model='A + L + A:L', print_results=False) + g.fit(treatment='all') + rm = g.marginal_outcome + g.fit(treatment='none') + rs = g.marginal_outcome + npt.assert_allclose(rm - rs, 0.16, rtol=1e-5) + npt.assert_allclose(rm / rs, 1.387097, rtol=1e-5) + + g = TimeFixedGFormula(data, exposure='A', outcome='Y', standardize='unexposed') + g.outcome_model(model='A + L + A:L', print_results=False) + g.fit(treatment='all') + rm = g.marginal_outcome + g.fit(treatment='none') + rs = g.marginal_outcome + npt.assert_allclose(rm - rs, 0.16, rtol=1e-5) + npt.assert_allclose(rm / rs, 1.384615, rtol=1e-5) + class TestSurvivalGFormula: diff --git a/zepid/causal/gformula/TimeFixed.py b/zepid/causal/gformula/TimeFixed.py index 5431192..f459064 100644 --- a/zepid/causal/gformula/TimeFixed.py +++ b/zepid/causal/gformula/TimeFixed.py @@ -56,6 +56,13 @@ class TimeFixedGFormula: Column name for outcome variable outcome_type : str, optional Outcome variable type. Currently only 'binary', 'normal', and 'poisson variable types are supported + standardize : str, optional + Who the estimate corresponds to. Options are the entire population, the exposed, or the unexposed. See + Sato & Matsuyama Epidemiology (2003) for details on weighting to exposed/unexposed. Weighting to the + exposed or unexposed is also referred to as SMR weighting. Options for standardization are: + * 'population' : weight to entire population + * 'exposed' : weight to exposed individuals + * 'unexposed' : weight to unexposed individuals weights : str, optional Column name for weights. Default is None, which assumes every observations has the same weight (i.e. 1) @@ -140,7 +147,8 @@ class TimeFixedGFormula: population disease burden: a step-by-step illustration of causal inference methods. American Journal of Epidemiology, 169(9), 1140-1147. """ - def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type='binary', weights=None): + def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type='binary', standardize='population', + weights=None): self.gf = df.copy() self.exposure = exposure self.outcome = outcome @@ -157,6 +165,12 @@ def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type=' raise ValueError('Only binary or continuous exposures are currently supported. Please specify "binary", ' '"categorical", or "continuous".') + if standardize in ['population', 'exposed', 'unexposed']: + self.standardize = standardize + else: + raise ValueError('Please specify one of the currently supported standardizations: ' + + 'population, exposed, unexposed') + self._weights = weights self._outcome_model = None self._predicted_y_ = None @@ -263,10 +277,24 @@ def fit(self, treatment): # Getting predictions g[self.outcome] = np.nan g[self.outcome] = self._outcome_model.predict(g) + if self._weights is None: # unweighted marginal estimate - self.marginal_outcome = np.mean(g[self.outcome]) + if self.standardize == 'population': + self.marginal_outcome = np.mean(g[self.outcome]) + elif self.standardize == 'exposed': + self.marginal_outcome = np.mean(g.loc[self.gf[self.exposure] == 1, self.outcome]) + else: + self.marginal_outcome = np.mean(g.loc[self.gf[self.exposure] == 0, self.outcome]) else: # weighted marginal estimate - self.marginal_outcome = np.average(g[self.outcome], weights=self.gf[self._weights]) + if self.standardize == 'population': + self.marginal_outcome = np.average(g[self.outcome], weights=self.gf[self._weights]) + elif self.standardize == 'exposed': + self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 1, self.outcome], + weights=self.gf[self._weights]) + else: + self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 0, self.outcome], + weights=self.gf[self._weights]) + self.predicted_df = g def fit_stochastic(self, p, conditional=None, samples=100, seed=None): @@ -338,9 +366,21 @@ def fit_stochastic(self, p, conditional=None, samples=100, seed=None): g[self.outcome] = np.nan g[self.outcome] = self._outcome_model.predict(g) if self._weights is None: # unweighted marginal estimate - marginals.append(np.mean(g[self.outcome])) + if self.standardize == 'population': + marginals.append(np.mean(g[self.outcome])) + elif self.standardize == 'exposed': + marginals.append(np.mean(g.loc[self.gf[self.exposure] == 1, self.outcome])) + else: + marginals.append(np.mean(g.loc[self.gf[self.exposure] == 0, self.outcome])) else: # weighted marginal estimate - marginals.append(np.average(g[self.outcome], weights=g[self._weights])) + if self.standardize == 'population': + marginals.append(np.average(g[self.outcome], weights=g[self._weights])) + elif self.standardize == 'exposed': + marginals.append(np.average(g.loc[self.gf[self.exposure] == 1, self.outcome], + weights=g[self._weights])) + else: + marginals.append(np.average(g.loc[self.gf[self.exposure] == 0, self.outcome], + weights=g[self._weights])) self.marginal_outcome = np.mean(marginals) From 2024027904d9c17d0f06633ddecb7306b250190e Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 1 Aug 2019 17:10:16 -0400 Subject: [PATCH 03/42] Refactoring IPTW --- zepid/causal/ipw/IPTW.py | 77 ++++++---------------------------------- zepid/causal/utils.py | 51 ++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 67 deletions(-) diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py index 8c10b2c..71769ea 100644 --- a/zepid/causal/ipw/IPTW.py +++ b/zepid/causal/ipw/IPTW.py @@ -8,7 +8,7 @@ import matplotlib.pyplot as plt from zepid.causal.utils import (propensity_score, plot_boxplot, plot_kde, plot_love, - standardized_mean_differences, positivity, _bounding_) + standardized_mean_differences, positivity, _bounding_, iptw_calculator) class IPTW: @@ -226,32 +226,15 @@ def treatment_model(self, model_denominator, model_numerator='1', stabilized=Tru """ # Calculating denominator probabilities self.__mdenom = model_denominator - denominator_model = propensity_score(self.df, self.treatment + ' ~ ' + model_denominator, - weights=self._weight_, - print_results=print_results) - d = denominator_model.predict(self.df) - self.df['__denom__'] = d - - # Calculating numerator probabilities (if stabilized) - if stabilized is True: - numerator_model = propensity_score(self.df, self.treatment + ' ~ ' + model_numerator, - weights=self._weight_, - print_results=print_results) - n = numerator_model.predict(self.df) - else: - if model_numerator != '1': - raise ValueError('Argument for model_numerator is only used for stabilized=True') - n = 1 - self.df['__numer__'] = n - - # Bounding predicted probabilities if requested - if bound: - self.df['__denom__'] = _bounding_(self.df['__denom__'], bounds=bound) - self.df['__numer__'] = _bounding_(self.df['__numer__'], bounds=bound) - - # Calculating weights - self.iptw = self._weight_calculator(self.df, denominator='__denom__', - numerator='__numer__', stabilized=stabilized) + self.df['__denom__'], self.df['__numer__'], self.iptw = iptw_calculator(df=self.df, + treatment=self.treatment, + model_denom=model_denominator, + model_numer=model_numerator, + weight=self._weight_, + stabilized=stabilized, + standardize=self.standardize, + bound=bound, + print_results=print_results) def missing_model(self, model_denominator, model_numerator=None, stabilized=True, bound=False, print_results=True): """Estimation of Pr(M=0|A=a,L), which is the missing data mechanism for the outcome. The corresponding @@ -656,46 +639,6 @@ def plot_love(self, color_unweighted='r', color_weighted='b', shape_unweighted=' shape_unweighted=shape_unweighted, shape_weighted=shape_weighted) return ax - def _weight_calculator(self, df, denominator, numerator, stabilized): - """Calculates the IPTW based on the predicted probabilities and the specified group to standardize to in the - background for the fit() function. Not intended to be used by users - - df is the dataframe, denominator is the string indicating the column of Pr, numerator is the string indicating - the column of Pr - """ - if stabilized: # Stabilized weights - if self.standardize == 'population': - df['w'] = np.where(df[self.treatment] == 1, (df[numerator] / df[denominator]), - ((1 - df[numerator]) / (1 - df[denominator]))) - df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w']) - # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed) - elif self.standardize == 'exposed': - df['w'] = np.where(df[self.treatment] == 1, 1, - ((df[denominator] / (1 - df[denominator])) * ((1 - df[numerator]) / - df[numerator]))) - df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w']) - # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed) - else: - df['w'] = np.where(df[self.treatment] == 1, - (((1 - df[denominator]) / df[denominator]) * (df[numerator] / - (1 - df[numerator]))), - 1) - df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w']) - - else: # Unstabilized weights - if self.standardize == 'population': - df['w'] = np.where(df[self.treatment] == 1, 1 / df[denominator], 1 / (1 - df[denominator])) - df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w']) - # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed) - elif self.standardize == 'exposed': - df['w'] = np.where(df[self.treatment] == 1, 1, (df[denominator] / (1 - df[denominator]))) - df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w']) - # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed) - else: - df['w'] = np.where(df[self.treatment] == 1, ((1 - df[denominator]) / df[denominator]), 1) - df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w']) - return df['w'] - class StochasticIPTW: r"""Calculates the IPTW estimate for stochastic treatment plans. `StochasticIPTW` will returns the estimated diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py index 9264e35..db460f7 100644 --- a/zepid/causal/utils.py +++ b/zepid/causal/utils.py @@ -177,6 +177,57 @@ def _bounding_(v, bounds): return v +def iptw_calculator(df, treatment, model_denom, model_numer, weight, stabilized, standardize, bound, print_results): + """Background function to calculate inverse probability of treatment weights. Used by `IPTW`, `IPSW`, `AIPSW` + """ + denominator_model = propensity_score(df, treatment + ' ~ ' + model_denom, + weights=weight, print_results=print_results) + d = denominator_model.predict(df) + + # Calculating numerator probabilities (if stabilized) + if stabilized is True: + numerator_model = propensity_score(df, treatment + ' ~ ' + model_numer, + weights=weight, print_results=print_results) + n = numerator_model.predict(df) + else: + if model_numer != '1': + raise ValueError('Argument for model_numerator is only used for stabilized=True') + n = 1 + + # Bounding predicted probabilities if requested + if bound: + d = _bounding_(d, bounds=bound) + n = _bounding_(n, bounds=bound) + + # Calculating weights + if stabilized: # Stabilized weights + if standardize == 'population': + iptw = np.where(df[treatment] == 1, (n / d), ((1 - n) / (1 - d))) + iptw = np.where(df[treatment].isna(), np.nan, iptw) + # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed) + elif standardize == 'exposed': + iptw = np.where(df[treatment] == 1, 1, (d / (1 - d)) * ((1 - n) / n)) + iptw = np.where(df[treatment].isna(), np.nan, iptw) + # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed) + else: + iptw = np.where(df[treatment] == 1, (((1 - d) / d) * (n / (1 - n))), 1) + iptw = np.where(df[treatment].isna(), np.nan, iptw) + + else: # Unstabilized weights + if standardize == 'population': + iptw = np.where(df[treatment] == 1, 1 / d, 1 / (1 - d)) + iptw = np.where(df[treatment].isna(), np.nan, iptw) + # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed) + elif standardize == 'exposed': + iptw = np.where(df[treatment] == 1, 1, (d / (1 - d))) + iptw = np.where(df[treatment].isna(), np.nan, iptw) + # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed) + else: + iptw = np.where(df[treatment] == 1, ((1 - d) / d), 1) + iptw = np.where(df[treatment].isna(), np.nan, iptw) + return d, n, iptw + + def plot_kde(df, treatment, probability, measure='probability', bw_method='scott', fill=True, color_e='b', color_u='r'): """Generates a density plot that can be used to check whether positivity may be violated qualitatively. The From b8038401973ddbe16f6e405c2c184e55d2248a3c Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 1 Aug 2019 17:10:43 -0400 Subject: [PATCH 04/42] Adding IPTW to IPSW and AIPSW classes --- tests/test_generalize.py | 54 ++++----- zepid/causal/generalize/estimators.py | 155 +++++++++++++++++++++----- 2 files changed, 150 insertions(+), 59 deletions(-) diff --git a/tests/test_generalize.py b/tests/test_generalize.py index ac8f888..bbd5e77 100644 --- a/tests/test_generalize.py +++ b/tests/test_generalize.py @@ -21,22 +21,12 @@ def df_c(): return df -@pytest.fixture -def df_iptw(df_c): - dfs = df_c.loc[df_c['S'] == 1].copy() - - ipt = IPTW(dfs, treatment='A', outcome='Y') - ipt.treatment_model('L', stabilized=True, print_results=False) - dfs['iptw'] = ipt.iptw - return pd.concat([dfs, df_c.loc[df_c['S'] == 0]], ignore_index=True, sort=False) - - class TestIPSW: def test_stabilize_error(self, df_c): - ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', stabilized=False) + ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S') with pytest.raises(ValueError): - ipsw.regression_models('L + W_sq', model_numerator='W', print_results=False) + ipsw.weight_model('L + W_sq', model_numerator='W', stabilized=False, print_results=False) def test_no_model_error(self, df_c): ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True) @@ -44,43 +34,45 @@ def test_no_model_error(self, df_c): ipsw.fit() def test_generalize_unstabilized(self, df_r): - ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=False) - ipsw.regression_models('L + W_sq', print_results=False) + ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S') + ipsw.weight_model('L + W_sq', stabilized=False, print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4) def test_generalize_stabilized(self, df_r): - ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=True) - ipsw.regression_models('L + W_sq', print_results=False) + ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S') + ipsw.weight_model('L + W_sq', stabilized=True, print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4) def test_transport_unstabilized(self, df_r): - ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=False, generalize=False) - ipsw.regression_models('L + W_sq', print_results=False) + ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False) + ipsw.weight_model('L + W_sq', stabilized=False, print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4) def test_transport_stabilized(self, df_r): - ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=True, generalize=False) - ipsw.regression_models('L + W_sq', print_results=False) + ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False) + ipsw.weight_model('L + W_sq', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4) - def test_generalize_iptw(self, df_iptw): - ipsw = IPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=True, weights='iptw') - ipsw.regression_models('L + W + W_sq', print_results=False) + def test_generalize_iptw(self, df_c): + ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True) + ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.treatment_model('L', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.167213, atol=1e-4) - def test_transport_iptw(self, df_iptw): - ipsw = IPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=False, weights='iptw') - ipsw.regression_models('L + W + W_sq', print_results=False) + def test_transport_iptw(self, df_c): + ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False) + ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.treatment_model('L', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.1372, atol=1e-4) @@ -154,17 +146,19 @@ def test_transport(self, df_r): npt.assert_allclose(aipw.risk_difference, 0.05479, atol=1e-5) npt.assert_allclose(aipw.risk_ratio, 1.16352, atol=1e-4) - def test_generalize_conf(self, df_iptw): - aipw = AIPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=True, weights='iptw') + def test_generalize_conf(self, df_c): + aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True) aipw.weight_model('L + W_sq', print_results=False) + aipw.treatment_model('L', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5) npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4) - def test_transport_conf(self, df_iptw): - aipw = AIPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=False, weights='iptw') + def test_transport_conf(self, df_c): + aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False) aipw.weight_model('L + W_sq', print_results=False) + aipw.treatment_model('L', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5) diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py index b120c49..c331007 100644 --- a/zepid/causal/generalize/estimators.py +++ b/zepid/causal/generalize/estimators.py @@ -3,7 +3,7 @@ import statsmodels.api as sm import statsmodels.formula.api as smf -from zepid.causal.utils import propensity_score +from zepid.causal.utils import propensity_score, iptw_calculator, _bounding_ class IPSW: @@ -44,8 +44,6 @@ class IPSW: generalize : bool, optional Whether the problem is a generalizability (True) problem or a transportability (False) problem. See notes for further details on the difference between the two estimation methods - stabilized : bool, optional - Whether to return stabilized or unstabilized weights. Default is stabilized weights (True) weights : None, str, optional For conditionally randomized trials, or observational research, inverse probability of treatment weights can be used to adjust for confounding. Before estimating the effect measures, this weight vector and the @@ -107,27 +105,26 @@ class IPSW: randomized trial to a new target population. arXiv preprint arXiv:1805.00550. """ - def __init__(self, df, exposure, outcome, selection, generalize=True, stabilized=True, weights=None): + def __init__(self, df, exposure, outcome, selection, generalize=True, weights=None): self.df = df.copy() self.sample = df.loc[df[selection] == 1].copy() self.target = df.loc[df[selection] == 0].copy() self.generalize = generalize # determines whether IPSW or IOSW are calculated - self.stabilized = stabilized - self.exposure = exposure self.outcome = outcome self.selection = selection self.weight = weights + self.ipsw = None + self.iptw = None self.risk_difference = None self.risk_ratio = None - self.Weight = None self._denominator_model = False - def regression_models(self, model_denominator, model_numerator='1', print_results=True): - """Logistic regression model(s) for estimating weights. The model denominator must be specified for both - stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the + def weight_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True): + """Logistic regression model(s) for estimating sampling weights. The model denominator must be specified for + both stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the stabilization factor for the weight numerator. By default model results are returned Parameters @@ -140,10 +137,18 @@ def regression_models(self, model_denominator, model_numerator='1', print_result numerator. Default ('1') calculates the overall probability of selection. In general, this is recommended. Adding in other variables means they are no longer accounted for in estimation of IPSW. Argument is also only used when calculating stabilized weights + bound : float, list, optional + Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations. + Specifying this argument can improve finite sample performance for random positivity violations. However, + inference becomes limited to the restricted population. Default is False, meaning no truncation of + predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of + floats can be provided for asymmetric trunctation + stabilized : bool, optional + Whether to generated stabilized IPSW. Default is True, which returns the stabilized IPSW print_results : bool, optional Whether to print the model results from the regression models. Default is True """ - if not self.stabilized: + if not stabilized: if model_numerator != '1': raise ValueError('Argument for model_numerator is only used for stabilized=True') @@ -153,25 +158,63 @@ def regression_models(self, model_denominator, model_numerator='1', print_result self._denominator_model = True # Stabilization factor if valid - if self.stabilized: + if stabilized: nmodel = propensity_score(self.df, self.selection + ' ~ ' + model_numerator, print_results=print_results) self.sample['__numer__'] = nmodel.predict(self.sample) else: self.sample['__numer__'] = 1 + if bound: + self.sample['__denom__'] = _bounding_(self.sample['__denom__'], bounds=bound) + self.sample['__numer__'] = _bounding_(self.sample['__numer__'], bounds=bound) + # Calculate IPSW (generalizability) if self.generalize: self.sample['__ipsw__'] = self.sample['__numer__'] / self.sample['__denom__'] # Calculate IOSW (transportability) else: - if self.stabilized: + if stabilized: self.sample['__ipsw__'] = (((1 - self.sample['__denom__']) / self.sample['__denom__']) * (self.sample['__numer__'] / (1 - self.sample['__numer__']))) else: self.sample['__ipsw__'] = (1 - self.sample['__denom__']) / self.sample['__denom__'] - self.Weight = self.sample['__ipsw__'] + self.ipsw = self.sample['__ipsw__'] + + def treatment_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True): + """Logistic regression model(s) for estimating inverse probability of treatment weights (IPTW). The model + denominator must be specified for both stabilized and unstabilized weights. The optional argument + 'model_numerator' allows specification of the stabilization factor for the weight numerator. By default model + results are returned + + Parameters + ---------- + model_denominator : str + String listing variables to predict the exposure, separated by +. For example, 'var1 + var2 + var3'. This + is for the predicted probabilities of the denominator + model_numerator : str, optional + Optional string listing variables to predict the selection separated by +. Only used to calculate the + numerator. Default ('1') calculates the overall probability of selection. In general, this is recommended. + Adding in other variables means they are no longer accounted for in estimation of IPSW. Argument is also + only used when calculating stabilized weights + bound : float, list, optional + Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations. + Specifying this argument can improve finite sample performance for random positivity violations. However, + inference becomes limited to the restricted population. Default is False, meaning no truncation of + predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of + floats can be provided for asymmetric trunctation + stabilized : bool, optional + Whether to generated stabilized IPSW. Default is True, which returns the stabilized IPSW + print_results : bool, optional + Whether to print the model results from the regression models. Default is True + """ + d, n, self.iptw = iptw_calculator(df=self.sample, + treatment=self.exposure, + model_denom=model_denominator, model_numer=model_numerator, + weight=self.weight, stabilized=stabilized, + standardize='population', + bound=bound, print_results=print_results) def fit(self): """Uses the calculated IPSW to obtain the risk difference and risk ratio from the sample. If weights are @@ -186,10 +229,17 @@ def fit(self): """ if not self._denominator_model: raise ValueError('The regression_models() function must be specified before effect measure estimation') + if self.weight is not None: - self.sample['__ipw__'] = self.Weight * self.sample[self.weight] + if self.iptw is None: + self.sample['__ipw__'] = self.ipsw * self.sample[self.weight] + else: + self.sample['__ipw__'] = self.ipsw * self.iptw * self.sample[self.weight] else: - self.sample['__ipw__'] = self.Weight + if self.iptw is None: + self.sample['__ipw__'] = self.ipsw + else: + self.sample['__ipw__'] = self.ipsw * self.iptw exp = self.sample[self.sample[self.exposure] == 1].copy() uxp = self.sample[self.sample[self.exposure] == 0].copy() @@ -218,11 +268,16 @@ def summary(self, decimal=4): print(fmt.format(self.exposure, self.sample.shape[0])) fmt = 'Outcome: {:<15} Target Observations: {:<20}' print(fmt.format(self.outcome, self.target.shape[0])) - fmt = 'Target estimate: {:<15}' + fmt = 'Target estimate: {:<15} IP Treatment Weights: {:<20}' if self.generalize: - print(fmt.format('Generalize')) + g = 'Generalize' else: - print(fmt.format('Transport')) + g = 'Transport' + if self.iptw is None: + w = 'No' + else: + w = 'Yes' + print(fmt.format(g, w)) print('----------------------------------------------------------------------') print('Risk Difference: ', round(float(self.risk_difference), decimal)) @@ -542,7 +597,8 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No self.selection = selection self.weight = weights - self.Weight = None + self.ipsw = None + self.iptw = None self._denominator_model = False self._outcome_model = False self._YA1 = None @@ -572,7 +628,6 @@ def weight_model(self, model_denominator, model_numerator='1', stabilized=True, Whether to print the model results from the regression models. Default is True """ dmodel = propensity_score(self.df, self.selection + ' ~ ' + model_denominator, print_results=print_results) - self.df['__denom__'] = dmodel.predict(self.df) self._denominator_model = True @@ -595,10 +650,41 @@ def weight_model(self, model_denominator, model_numerator='1', stabilized=True, else: self.df['__ipsw__'] = (1 - self.df['__denom__']) / self.df['__denom__'] - if self.weight is not None: - self.Weight = self.df['__ipsw__'] * self.df[self.weight] - else: - self.Weight = self.df['__ipsw__'] + self.ipsw = self.df['__ipsw__'] + + def treatment_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=False): + """Logistic regression model(s) for estimating inverse probability of treatment weights (IPTW). The model + denominator must be specified for both stabilized and unstabilized weights. The optional argument + 'model_numerator' allows specification of the stabilization factor for the weight numerator. By default model + results are returned + + Parameters + ---------- + model_denominator : str + String listing variables to predict the exposure, separated by +. For example, 'var1 + var2 + var3'. This + is for the predicted probabilities of the denominator + model_numerator : str, optional + Optional string listing variables to predict the selection separated by +. Only used to calculate the + numerator. Default ('1') calculates the overall probability of selection. In general, this is recommended. + Adding in other variables means they are no longer accounted for in estimation of IPSW. Argument is also + only used when calculating stabilized weights + bound : float, list, optional + Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations. + Specifying this argument can improve finite sample performance for random positivity violations. However, + inference becomes limited to the restricted population. Default is False, meaning no truncation of + predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of + floats can be provided for asymmetric trunctation + stabilized : bool, optional + Whether to generated stabilized IPTW. Default is True, which returns the stabilized IPTW + print_results : bool, optional + Whether to print the model results from the regression models. Default is True + """ + d, n, self.iptw = iptw_calculator(df=self.df, + treatment=self.exposure, + model_denom=model_denominator, model_numer=model_numerator, + weight=self.weight, stabilized=stabilized, + standardize='population', + bound=bound, print_results=print_results) def outcome_model(self, model, outcome_type='binary', print_results=True): """Build the g-transport model for the outcome. This is also referred to at the Q-model. @@ -652,27 +738,38 @@ def fit(self): if not self._outcome_model: raise ValueError('The outcome_model() function must be specified before effect measure estimation') + if self.weight is not None: + if self.iptw is None: + self.df['__ipw__'] = self.ipsw * self.sample[self.weight] + else: + self.df['__ipw__'] = self.ipsw * self.iptw * self.sample[self.weight] + else: + if self.iptw is None: + self.df['__ipw__'] = self.ipsw + else: + self.df['__ipw__'] = self.ipsw * self.iptw + # Generalizability problem if self.generalize: part1 = self._YA1 part2 = np.where(self.sample & (self.df[self.exposure] == 1), - self.Weight * (self.df[self.outcome] - self._YA1), 0) + self.df['__ipw__'] * (self.df[self.outcome] - self._YA1), 0) r1 = np.mean(part1 + part2) part1 = self._YA0 part2 = np.where(self.sample & (self.df[self.exposure] == 0), - self.Weight * (self.df[self.outcome] - self._YA0), 0) + self.df['__ipw__'] * (self.df[self.outcome] - self._YA0), 0) r0 = np.mean(part1 + part2) # Transportability problem else: part1 = np.where(self.sample & (self.df[self.exposure] == 1), - self.Weight * (self.df[self.outcome] - self._YA1), 0) + self.df['__ipw__'] * (self.df[self.outcome] - self._YA1), 0) part2 = (1 - self.df[self.selection]) * self._YA1 r1 = np.sum(part1 + part2) / np.sum(1 - self.df[self.selection]) part1 = np.where(self.sample & (self.df[self.exposure] == 0), - self.Weight * (self.df[self.outcome] - self._YA0), 0) + self.df['__ipw__'] * (self.df[self.outcome] - self._YA0), 0) part2 = (1 - self.df[self.selection]) * self._YA0 r0 = np.sum(part1 + part2) / np.sum(1 - self.df[self.selection]) From 7b6f594884cf89f447fabe01842f95a453ce55fb Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 1 Aug 2019 17:27:58 -0400 Subject: [PATCH 05/42] Refactoring AIPTW --- zepid/causal/doublyrobust/AIPW.py | 13 +++++++------ zepid/causal/utils.py | 3 ++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/zepid/causal/doublyrobust/AIPW.py b/zepid/causal/doublyrobust/AIPW.py index d3ced32..901dfc8 100644 --- a/zepid/causal/doublyrobust/AIPW.py +++ b/zepid/causal/doublyrobust/AIPW.py @@ -7,7 +7,7 @@ from statsmodels.stats.weightstats import DescrStatsW from scipy.stats import norm -from zepid.causal.utils import (propensity_score, plot_kde, plot_love, +from zepid.causal.utils import (propensity_score, plot_kde, plot_love, iptw_calculator, standardized_mean_differences, positivity, _bounding_, plot_kde_accuracy, outcome_accuracy) @@ -181,12 +181,13 @@ def exposure_model(self, model, bound=False, print_results=True): """ self.__mweight = model self._exp_model = self.exposure + ' ~ ' + model - fitmodel = propensity_score(self.df, self._exp_model, weights=self._weight_, print_results=print_results) - ps = fitmodel.predict(self.df) - self.df['_g1_'] = ps - self.df['_g0_'] = 1 - ps + d, n, iptw = iptw_calculator(df=self.df, treatment=self.exposure, model_denom=model, model_numer='1', + weight=self._weight_, stabilized=False, standardize='population', + bound=None, print_results=print_results) - # If bounds are requested + self.df['_g1_'] = d + self.df['_g0_'] = 1 - d + # Applying bounds AFTER extracting g1 and g0 if bound: self.df['_g1_'] = _bounding_(self.df['_g1_'], bounds=bound) self.df['_g0_'] = _bounding_(self.df['_g0_'], bounds=bound) diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py index db460f7..afb7c94 100644 --- a/zepid/causal/utils.py +++ b/zepid/causal/utils.py @@ -178,7 +178,8 @@ def _bounding_(v, bounds): def iptw_calculator(df, treatment, model_denom, model_numer, weight, stabilized, standardize, bound, print_results): - """Background function to calculate inverse probability of treatment weights. Used by `IPTW`, `IPSW`, `AIPSW` + """Background function to calculate inverse probability of treatment weights. Used by `IPTW`, `AIPTW`, `IPSW`, + `AIPSW` """ denominator_model = propensity_score(df, treatment + ' ~ ' + model_denom, weights=weight, print_results=print_results) From 0113e1865500a5e16e7be7947a8be52f0a08fb41 Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 12 Aug 2019 15:58:13 -0400 Subject: [PATCH 06/42] Update to generalizability estimators for IPTW update --- tests/test_generalize.py | 49 +++++++++++++++++++++++++++ zepid/causal/generalize/estimators.py | 23 +++++++------ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/tests/test_generalize.py b/tests/test_generalize.py index bbd5e77..cf52be0 100644 --- a/tests/test_generalize.py +++ b/tests/test_generalize.py @@ -18,6 +18,7 @@ def df_r(): def df_c(): df = ze.load_generalize_data(True) df['W_sq'] = df['W'] ** 2 + df['weight'] = 3 return df @@ -77,6 +78,22 @@ def test_transport_iptw(self, df_c): npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.1372, atol=1e-4) + def test_generalize_weight(self, df_c): + ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight') + ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.treatment_model('L', print_results=False) + ipsw.fit() + npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5) + npt.assert_allclose(ipsw.risk_ratio, 1.167213, atol=1e-4) + + def test_transport_weight(self, df_c): + ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight') + ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.treatment_model('L', print_results=False) + ipsw.fit() + npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5) + npt.assert_allclose(ipsw.risk_ratio, 1.1372, atol=1e-4) + class TestGTransport: @@ -113,6 +130,20 @@ def test_transport_conf(self, df_c): npt.assert_allclose(gtf.risk_difference, 0.042574, atol=1e-5) npt.assert_allclose(gtf.risk_ratio, 1.124257, atol=1e-4) + def test_generalize_weight(self, df_c): + gtf = GTransportFormula(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight') + gtf.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) + gtf.fit() + npt.assert_allclose(gtf.risk_difference, 0.048949, atol=1e-5) + npt.assert_allclose(gtf.risk_ratio, 1.149556, atol=1e-4) + + def test_transport_weight(self, df_c): + gtf = GTransportFormula(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight') + gtf.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) + gtf.fit() + npt.assert_allclose(gtf.risk_difference, 0.042574, atol=1e-5) + npt.assert_allclose(gtf.risk_ratio, 1.124257, atol=1e-4) + class TestAIPSW: @@ -163,3 +194,21 @@ def test_transport_conf(self, df_c): aipw.fit() npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5) npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4) + + def test_generalize_weight(self, df_c): + aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight') + aipw.weight_model('L + W_sq', print_results=False) + aipw.treatment_model('L', print_results=False) + aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) + aipw.fit() + npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5) + npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4) + + def test_transport_weight(self, df_c): + aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight') + aipw.weight_model('L + W_sq', print_results=False) + aipw.treatment_model('L', print_results=False) + aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) + aipw.fit() + npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5) + npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4) diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py index c331007..106bc47 100644 --- a/zepid/causal/generalize/estimators.py +++ b/zepid/causal/generalize/estimators.py @@ -322,9 +322,7 @@ class GTransportFormula: Whether the problem is a generalizability (True) problem or a transportability (False) problem. See notes for further details on the difference between the two estimation methods weights : None, str, optional - If there are associated sampling (or other types of) weights, these can be included in this statement. - Sampling weights may be used for a transportability problem? Either way, I would like to keep this as an - option (to mirror TimeFixedGFormula) + Optional argument for weights. Can be used to input inverse probability of missing weights Note ---- @@ -538,10 +536,7 @@ class AIPSW: Whether the problem is a generalizability (True) problem or a transportability (False) problem. See notes for further details on the difference between the two estimation methods weights : None, str, optional - For conditionally randomized trials, or observational research, inverse probability of treatment weights - can be used to adjust for confounding in IPSW. Before estimating the effect measures, this weight vector - and the IPSW are multiplied to calculate new weights - When weights is None, the data is assumed to come from a randomized trial, and does not need to be adjusted + Optional argument for weights. Can be used to input inverse probability of missing weights Note ---- @@ -710,8 +705,13 @@ def outcome_model(self, model, outcome_type='binary', print_results=True): # Modeling the outcome df = self.df[self.sample].copy() - m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist) - self._outcome_model = m.fit() + + if self.weight is None: + m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist) + self._outcome_model = m.fit() + else: + m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist, freq_weights=df[self.weight]) + self._outcome_model = m.fit() # Printing results of the model and if any observations were dropped if print_results: @@ -740,9 +740,10 @@ def fit(self): if self.weight is not None: if self.iptw is None: - self.df['__ipw__'] = self.ipsw * self.sample[self.weight] + self.df['__ipw__'] = self.ipsw * self.df[self.weight] else: - self.df['__ipw__'] = self.ipsw * self.iptw * self.sample[self.weight] + self.df['__ipw__'] = self.ipsw * self.iptw * self.df[self.weight] + print(self.ipsw * self.iptw * self.df[self.weight]) else: if self.iptw is None: self.df['__ipw__'] = self.ipsw From 4ef25b0cd16b68a7daa0ad956cc10ee5ad469424 Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 12 Aug 2019 15:58:34 -0400 Subject: [PATCH 07/42] Fixing issue with custom weights in IPTW --- zepid/causal/ipw/IPTW.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py index 71769ea..284b583 100644 --- a/zepid/causal/ipw/IPTW.py +++ b/zepid/causal/ipw/IPTW.py @@ -343,12 +343,12 @@ def fit(self, continuous_distribution='gaussian'): if self._weight_ is None: df['_ipfw_'] = self.iptw else: - df['_ipfw_'] = self.iptw * self._weight_ + df['_ipfw_'] = self.iptw * self.df[self._weight_] else: if self._weight_ is None: df['_ipfw_'] = self.iptw * self.ipmw else: - df['_ipfw_'] = self.iptw * self.ipmw * self._weight_ + df['_ipfw_'] = self.iptw * self.ipmw * self.df[self._weight_] df = df.dropna() if self._continuous_outcome: From 8d238fe074658e65424d46ab4f721fa0c01e256f Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 12 Aug 2019 15:58:55 -0400 Subject: [PATCH 08/42] Updating CHANGELOG for v0.8.1 --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55fb2af..004bdee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ ## Change logs +### v0.8.1 +`IPSW` and `AIPSW` now natively support adjusting for confounding. Both now have the `treatment_model()` function, +which calculates the inverse probability of treatment weights. How weights are handled in `AIPSW` are updated. They +are used in both the weight and the outcome models. + +`IPSW` and `AIPSW` both add censoring... + +`TimeFixedGFormula` has added support for the average treatment effect in the treated (ATT), and average treatment +effect in the untreated (ATU). + +Background refactoring for IPTW. `utils.py` now contains a function to calculate inverse probability of treatment +weights. The function `iptw_calculator` is used by `IPTW`, `AIPTW`, `IPSW`, and `AIPSW` to calculate the weights now + ### v0.8.0 `IPTW` had a massive overhaul. It now follows a similar structure to `AIPTW` and other causal inference methods. One *major* change is that missing data is dropped before any calculations. Therefore, if missing data was present for From 39035b96561df38a13e0f4589026add5cc286627 Mon Sep 17 00:00:00 2001 From: pzivich Date: Fri, 16 Aug 2019 08:17:52 -0400 Subject: [PATCH 09/42] Removing extraneous print statement from G-estimation --- zepid/causal/snm/g_estimation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zepid/causal/snm/g_estimation.py b/zepid/causal/snm/g_estimation.py index e9c38fc..14044f2 100644 --- a/zepid/causal/snm/g_estimation.py +++ b/zepid/causal/snm/g_estimation.py @@ -355,7 +355,6 @@ def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e- y_vals = patsy.dmatrix(self._snm_ + ' - 1', yf, return_type='dataframe') # Solving for the array of Psi values - print(df.columns) self.psi = self._closed_form_solver_(treat=self.exposure, model=self.exposure + ' ~ ' + self._treatment_model, df=df, From ae93874e92310c29ca5f559202f3f03d30c7236e Mon Sep 17 00:00:00 2001 From: pzivich Date: Fri, 16 Aug 2019 08:48:01 -0400 Subject: [PATCH 10/42] Adding better Q-model warning #119 --- zepid/causal/doublyrobust/AIPW.py | 2 +- zepid/causal/doublyrobust/TMLE.py | 3 ++ zepid/causal/generalize/estimators.py | 7 ++++ zepid/causal/gformula/TimeFixed.py | 49 ++++++++++++++++++++++----- zepid/causal/snm/g_estimation.py | 3 ++ 5 files changed, 55 insertions(+), 9 deletions(-) diff --git a/zepid/causal/doublyrobust/AIPW.py b/zepid/causal/doublyrobust/AIPW.py index 901dfc8..119bec9 100644 --- a/zepid/causal/doublyrobust/AIPW.py +++ b/zepid/causal/doublyrobust/AIPW.py @@ -277,7 +277,7 @@ def outcome_model(self, model, continuous_distribution='gaussian', print_results Whether to print the fitted model results. Default is True (prints results) """ if self.exposure not in model: - warnings.warn("It looks like the exposure variable is missing from the outcome model", UserWarning) + warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") self._out_model = self.outcome + ' ~ ' + model diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index bad0ba7..fcc5191 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -346,6 +346,9 @@ def outcome_model(self, model, custom_model=None, bound=False, print_results=Tru Distribution to use for continuous outcomes. Options are 'gaussian' for normal distributions and 'poisson' for Poisson distributions """ + if self.exposure not in model: + warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") + self._out_model = self.outcome + ' ~ ' + model if self._miss_flag: diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py index 106bc47..f154a53 100644 --- a/zepid/causal/generalize/estimators.py +++ b/zepid/causal/generalize/estimators.py @@ -1,3 +1,4 @@ +import warnings import numpy as np import pandas as pd import statsmodels.api as sm @@ -395,6 +396,9 @@ def outcome_model(self, model, print_results=True): print_results : bool, optional Whether to print the logistic regression results to the terminal. Default is True """ + if self.exposure not in model: + warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") + if self.outcome_type == 'binary': linkdist = sm.families.family.Binomial() elif self.outcome_type == 'normal': @@ -694,6 +698,9 @@ def outcome_model(self, model, outcome_type='binary', print_results=True): print_results : bool, optional Whether to print the logistic regression results to the terminal. Default is True """ + if self.exposure not in model: + warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") + if outcome_type == 'binary': linkdist = sm.families.family.Binomial() elif outcome_type == 'normal': diff --git a/zepid/causal/gformula/TimeFixed.py b/zepid/causal/gformula/TimeFixed.py index f459064..c6be6fb 100644 --- a/zepid/causal/gformula/TimeFixed.py +++ b/zepid/causal/gformula/TimeFixed.py @@ -149,7 +149,14 @@ class TimeFixedGFormula: """ def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type='binary', standardize='population', weights=None): - self.gf = df.copy() + if df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0] != df.shape[0]: + warnings.warn("There is missing data that is not the outcome in the data set. TimeFixedGFormula will drop " + "all missing data that is not missing outcome data. TimeFixedGFormula will fit " + + str(df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0]) + + ' of ' + str(df.shape[0]) + ' observations', UserWarning) + self.gf = df.copy().dropna(subset=[d for d in df.columns if d != outcome]).reset_index() + else: + self.gf = df.copy().reset_index() self.exposure = exposure self.outcome = outcome @@ -190,6 +197,9 @@ def outcome_model(self, model, print_results=True): print_results : bool, optional Whether to print the logistic regression results to the terminal. Default is True """ + if self.exposure not in model: + warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") + if self.outcome_type == 'binary': linkdist = sm.families.family.Binomial() elif self.outcome_type == 'normal': @@ -213,7 +223,7 @@ def outcome_model(self, model, print_results=True): if print_results: print(self._outcome_model.summary()) - def fit(self, treatment): + def fit(self, treatment, predict_missing=True): """Fit the parametric g-formula as specified. Binary and multivariate treatments are available. This implementation has three options for the binary treatment courses. For multivariate treatments, the user must specify custom treatment plans. @@ -229,6 +239,12 @@ def fit(self, treatment): * custom -create a custom treatment. When specifying this, the dataframe must be referred to as 'g' The following is an example that selects those whose age is 25 or older and are females; ``treatment="((g['age0']>=25) & (g['male']==0))`` + predict_missing : bool, optional + Whether missing outcome values should be predicted via the model fit to the data with observed outcomes. + Default is set to True, which assumes that outcome data is missing completely at random conditional on + the variables included in the outcome model. Note this is the less restrictive assumption regarding missing + outcome data. If set to False, missing outcome data is assumed to be missing completely at random + (also referred to as non-informative censoring) Returns ------- @@ -277,8 +293,13 @@ def fit(self, treatment): # Getting predictions g[self.outcome] = np.nan g[self.outcome] = self._outcome_model.predict(g) + if not predict_missing: + g[self.outcome] = np.where(self.gf[self.outcome].isna(), np.nan, g[self.outcome]) + + self.predicted_df = g if self._weights is None: # unweighted marginal estimate + g = g.dropna() if self.standardize == 'population': self.marginal_outcome = np.mean(g[self.outcome]) elif self.standardize == 'exposed': @@ -286,18 +307,17 @@ def fit(self, treatment): else: self.marginal_outcome = np.mean(g.loc[self.gf[self.exposure] == 0, self.outcome]) else: # weighted marginal estimate + g = g.dropna() if self.standardize == 'population': - self.marginal_outcome = np.average(g[self.outcome], weights=self.gf[self._weights]) + self.marginal_outcome = np.average(g[self.outcome], weights=g[self._weights]) elif self.standardize == 'exposed': self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 1, self.outcome], - weights=self.gf[self._weights]) + weights=g[self._weights]) else: self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 0, self.outcome], - weights=self.gf[self._weights]) + weights=g[self._weights]) - self.predicted_df = g - - def fit_stochastic(self, p, conditional=None, samples=100, seed=None): + def fit_stochastic(self, p, conditional=None, samples=100, predict_missing=True, seed=None): """Fits the g-formula for a stochastic intervention. As currently implemented, `p` percent of the population is randomly treated. This process is repeated `n` times and the mean is the marginal stochastic outcome. @@ -310,6 +330,12 @@ def fit_stochastic(self, p, conditional=None, samples=100, seed=None): of the list of probabilities in 'p' samples: int, optional Number of resamples to calculate the marginal outcome. Default is 100 + predict_missing : bool, optional + Whether missing outcome values should be predicted via the model fit to the data with observed outcomes. + Default is set to True, which assumes that outcome data is missing completely at random conditional on + the variables included in the outcome model. Note this is the less restrictive assumption regarding missing + outcome data. If set to False, missing outcome data is assumed to be missing completely at random + (also referred to as non-informative censoring) seed: int, optional Seed for the random process selection @@ -365,6 +391,10 @@ def fit_stochastic(self, p, conditional=None, samples=100, seed=None): # Getting predictions g[self.outcome] = np.nan g[self.outcome] = self._outcome_model.predict(g) + if not predict_missing: + g[self.outcome] = np.where(self.gf[self.outcome].isna(), np.nan, g[self.outcome]) + g = g.dropna() + if self._weights is None: # unweighted marginal estimate if self.standardize == 'population': marginals.append(np.mean(g[self.outcome])) @@ -557,6 +587,9 @@ def outcome_model(self, model, print_results=True): print_results : bool, optional Whether to print the logistic regression results to the terminal. Default is True """ + if self.exposure not in model: + warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") + # Modeling the outcome linkdist = sm.families.family.Binomial() if self._weights is None: diff --git a/zepid/causal/snm/g_estimation.py b/zepid/causal/snm/g_estimation.py index 14044f2..9cd63f8 100644 --- a/zepid/causal/snm/g_estimation.py +++ b/zepid/causal/snm/g_estimation.py @@ -225,6 +225,9 @@ def structural_nested_model(self, model): interaction terms as needed. Interactions should be indicated via patsy magic. For example, 'A + A:V + A:C' """ + if self.exposure not in model: + warnings.warn("It looks like '" + self.exposure + "' is not included in the structural nested model.") + self._snm_ = model def missing_model(self, model_denominator, model_numerator=None, stabilized=True, bound=False, print_results=True): From 013786143fcd8b0f251f8f6856d4cda6447268cd Mon Sep 17 00:00:00 2001 From: pzivich Date: Fri, 16 Aug 2019 08:57:55 -0400 Subject: [PATCH 11/42] Better naming convention for IPSW model functions --- ...pid.causal.generalize.estimators.AIPSW.rst | 3 +- ...epid.causal.generalize.estimators.IPSW.rst | 3 +- zepid/causal/generalize/estimators.py | 39 +++++++++++-------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst b/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst index 6d7bd6e..00959a2 100644 --- a/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst +++ b/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst @@ -10,7 +10,8 @@ zepid.causal.generalize.estimators.AIPSW .. autosummary:: - ~AIPSW.weight_model + ~AIPSW.sampling_model + ~AIPSW.treatment_model ~AIPSW.outcome_model ~AIPSW.fit ~AIPSW.summary diff --git a/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst b/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst index 8d4e889..90b01b9 100644 --- a/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst +++ b/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst @@ -10,6 +10,7 @@ zepid.causal.generalize.estimators.IPSW .. autosummary:: - ~IPSW.regression_models + ~IPSW.sampling_model + ~IPSW.treatment_model ~IPSW.fit ~IPSW.summary diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py index f154a53..b00ef54 100644 --- a/zepid/causal/generalize/estimators.py +++ b/zepid/causal/generalize/estimators.py @@ -70,27 +70,22 @@ class IPSW: Generalizability for RCT results >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S', generalize=True) - >>> ipsw.regression_models('L + W + L:W', print_results=False) + >>> ipsw.sampling_model('L + W + L:W', print_results=False) >>> ipsw.fit() >>> ipsw.summary() Transportability for RCT results >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S', generalize=False) - >>> ipsw.regression_models('L + W + L:W', print_results=False) + >>> ipsw.sampling_model('L + W + L:W', print_results=False) >>> ipsw.fit() >>> ipsw.summary() - For observational studies, IPTW can be used to account for confounders + For observational studies, IPTW can be used to account for confounders via the `treatment_model()` function - >>> from zepid.causal.ipw import IPTW - >>> iptw = IPTW(df, treatment='A') - >>> iptw.regression_models('L') - >>> iptw.fit() - >>> df['iptw'] = iptw.Weight - - >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S', weights='iptw', generalize=False) - >>> ipsw.regression_models('L + W + L:W', print_results=False) + >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S') + >>> ipsw.sampling_model('L + W + L:W', print_results=False) + >>> ipsw.treatment_model('L', print_results=False) >>> ipsw.fit() >>> ipsw.summary() @@ -123,7 +118,7 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No self.risk_ratio = None self._denominator_model = False - def weight_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True): + def sampling_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True): """Logistic regression model(s) for estimating sampling weights. The model denominator must be specified for both stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the stabilization factor for the weight numerator. By default model results are returned @@ -558,18 +553,28 @@ class AIPSW: >>> from zepid.causal.generalize import AIPSW >>> df = load_generalize_data(False) - Generalizability + Generalizability for RCT >>> aipw = AIPSW(df, exposure='A', outcome='Y', selection='S', generalize=True) - >>> aipw.weight_model('L + W_sq', print_results=False) + >>> aipw.sampling_model('L + W_sq', print_results=False) >>> aipw.outcome_model('A + L + L:A + W + W:A + W:A:L', print_results=False) >>> aipw.fit() >>> aipw.summary() - Transportability + Transportability for RCT + + >>> aipw = AIPSW(df, exposure='A', outcome='Y', selection='S', generalize=False) + >>> aipw.sampling_model('L + W_sq', print_results=False) + >>> aipw.outcome_model('A + L + L:A + W + W:A + W:A:L', print_results=False) + >>> aipw.fit() + >>> aipw.summary() + + Transportability for Observational study + >>> df = load_generalize_data(True) >>> aipw = AIPSW(df, exposure='A', outcome='Y', selection='S', generalize=False) - >>> aipw.weight_model('L + W_sq', print_results=False) + >>> aipw.sampling_model('L + W_sq', print_results=False) + >>> aipw.treatment_model('L', print_results=False) >>> aipw.outcome_model('A + L + L:A + W + W:A + W:A:L', print_results=False) >>> aipw.fit() >>> aipw.summary() @@ -606,7 +611,7 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No self.risk_difference = None self.risk_ratio = None - def weight_model(self, model_denominator, model_numerator='1', stabilized=True, print_results=True): + def sampling_model(self, model_denominator, model_numerator='1', stabilized=True, print_results=True): """Logistic regression model(s) for estimating IPSW. The model denominator must be specified for both stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the stabilization factor for the weight numerator. By default model results are returned From 925e3a2162928d6457a93bb59501e1e21d846a4a Mon Sep 17 00:00:00 2001 From: pzivich Date: Fri, 16 Aug 2019 08:59:56 -0400 Subject: [PATCH 12/42] Updating tests to reflect IPSW and AIPSW function naming changes --- tests/test_generalize.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/test_generalize.py b/tests/test_generalize.py index cf52be0..a4d277e 100644 --- a/tests/test_generalize.py +++ b/tests/test_generalize.py @@ -27,7 +27,7 @@ class TestIPSW: def test_stabilize_error(self, df_c): ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S') with pytest.raises(ValueError): - ipsw.weight_model('L + W_sq', model_numerator='W', stabilized=False, print_results=False) + ipsw.sampling_model('L + W_sq', model_numerator='W', stabilized=False, print_results=False) def test_no_model_error(self, df_c): ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True) @@ -36,35 +36,35 @@ def test_no_model_error(self, df_c): def test_generalize_unstabilized(self, df_r): ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S') - ipsw.weight_model('L + W_sq', stabilized=False, print_results=False) + ipsw.sampling_model('L + W_sq', stabilized=False, print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4) def test_generalize_stabilized(self, df_r): ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S') - ipsw.weight_model('L + W_sq', stabilized=True, print_results=False) + ipsw.sampling_model('L + W_sq', stabilized=True, print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4) def test_transport_unstabilized(self, df_r): ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False) - ipsw.weight_model('L + W_sq', stabilized=False, print_results=False) + ipsw.sampling_model('L + W_sq', stabilized=False, print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4) def test_transport_stabilized(self, df_r): ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False) - ipsw.weight_model('L + W_sq', print_results=False) + ipsw.sampling_model('L + W_sq', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5) npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4) def test_generalize_iptw(self, df_c): ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True) - ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.sampling_model('L + W + W_sq', print_results=False) ipsw.treatment_model('L', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5) @@ -72,7 +72,7 @@ def test_generalize_iptw(self, df_c): def test_transport_iptw(self, df_c): ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False) - ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.sampling_model('L + W + W_sq', print_results=False) ipsw.treatment_model('L', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5) @@ -80,7 +80,7 @@ def test_transport_iptw(self, df_c): def test_generalize_weight(self, df_c): ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight') - ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.sampling_model('L + W + W_sq', print_results=False) ipsw.treatment_model('L', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5) @@ -88,7 +88,7 @@ def test_generalize_weight(self, df_c): def test_transport_weight(self, df_c): ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight') - ipsw.weight_model('L + W + W_sq', print_results=False) + ipsw.sampling_model('L + W + W_sq', print_results=False) ipsw.treatment_model('L', print_results=False) ipsw.fit() npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5) @@ -152,7 +152,7 @@ def test_no_model_error(self, df_c): with pytest.raises(ValueError): aipw.fit() - aipw.weight_model('L', print_results=False) + aipw.sampling_model('L', print_results=False) with pytest.raises(ValueError): aipw.fit() @@ -163,7 +163,7 @@ def test_no_model_error(self, df_c): def test_generalize(self, df_r): aipw = AIPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=True) - aipw.weight_model('L + W_sq', print_results=False) + aipw.sampling_model('L + W_sq', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() npt.assert_allclose(aipw.risk_difference, 0.061382, atol=1e-5) @@ -171,7 +171,7 @@ def test_generalize(self, df_r): def test_transport(self, df_r): aipw = AIPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False) - aipw.weight_model('L + W_sq', print_results=False) + aipw.sampling_model('L + W_sq', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() npt.assert_allclose(aipw.risk_difference, 0.05479, atol=1e-5) @@ -179,7 +179,7 @@ def test_transport(self, df_r): def test_generalize_conf(self, df_c): aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True) - aipw.weight_model('L + W_sq', print_results=False) + aipw.sampling_model('L + W_sq', print_results=False) aipw.treatment_model('L', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() @@ -188,7 +188,7 @@ def test_generalize_conf(self, df_c): def test_transport_conf(self, df_c): aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False) - aipw.weight_model('L + W_sq', print_results=False) + aipw.sampling_model('L + W_sq', print_results=False) aipw.treatment_model('L', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() @@ -197,7 +197,7 @@ def test_transport_conf(self, df_c): def test_generalize_weight(self, df_c): aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight') - aipw.weight_model('L + W_sq', print_results=False) + aipw.sampling_model('L + W_sq', print_results=False) aipw.treatment_model('L', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() @@ -206,7 +206,7 @@ def test_generalize_weight(self, df_c): def test_transport_weight(self, df_c): aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight') - aipw.weight_model('L + W_sq', print_results=False) + aipw.sampling_model('L + W_sq', print_results=False) aipw.treatment_model('L', print_results=False) aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) aipw.fit() From c6898c2f3e3010c0a1dac7d4679cee2a809ca989 Mon Sep 17 00:00:00 2001 From: pzivich Date: Fri, 16 Aug 2019 09:11:28 -0400 Subject: [PATCH 13/42] Resolving issue with outcome_model check and categorical treatments for g-formula --- zepid/causal/gformula/TimeFixed.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zepid/causal/gformula/TimeFixed.py b/zepid/causal/gformula/TimeFixed.py index c6be6fb..2df065a 100644 --- a/zepid/causal/gformula/TimeFixed.py +++ b/zepid/causal/gformula/TimeFixed.py @@ -197,8 +197,9 @@ def outcome_model(self, model, print_results=True): print_results : bool, optional Whether to print the logistic regression results to the terminal. Default is True """ - if self.exposure not in model: - warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") + if type(self.exposure) is not list: + if self.exposure not in model: + warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.") if self.outcome_type == 'binary': linkdist = sm.families.family.Binomial() From 27ee7ba756e88fc82b12bd7ecb1291113a2aa8ee Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 3 Oct 2019 08:36:23 -0400 Subject: [PATCH 14/42] Copying dataframe rather than using actual --- zepid/causal/ipw/IPTW.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py index 284b583..8e9e9d5 100644 --- a/zepid/causal/ipw/IPTW.py +++ b/zepid/causal/ipw/IPTW.py @@ -338,7 +338,7 @@ def fit(self, continuous_distribution='gaussian'): ind = sm.cov_struct.Independence() full_msm = self.outcome + ' ~ ' + self.ms_model - df = self.df + df = self.df.copy() if self.ipmw is None: if self._weight_ is None: df['_ipfw_'] = self.iptw From 4a91e6ac1202849397b25e0d9245424ef4f25f8c Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 3 Oct 2019 08:36:41 -0400 Subject: [PATCH 15/42] Updating changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 004bdee..13c6172 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ are used in both the weight and the outcome models. `TimeFixedGFormula` has added support for the average treatment effect in the treated (ATT), and average treatment effect in the untreated (ATU). +Improved warnings when the treatment/exposure variable is not included in models that it should be in (such as the +outcome model or in structural nested models). + Background refactoring for IPTW. `utils.py` now contains a function to calculate inverse probability of treatment weights. The function `iptw_calculator` is used by `IPTW`, `AIPTW`, `IPSW`, and `AIPSW` to calculate the weights now From cfe7aa447a94f579c42b66781852473b405808f9 Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 3 Oct 2019 08:37:50 -0400 Subject: [PATCH 16/42] Adding TODO list to generalize --- zepid/causal/generalize/estimators.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py index b00ef54..7c3a1cc 100644 --- a/zepid/causal/generalize/estimators.py +++ b/zepid/causal/generalize/estimators.py @@ -212,6 +212,8 @@ def treatment_model(self, model_denominator, model_numerator='1', bound=None, st standardize='population', bound=bound, print_results=print_results) + # TODO add missing_model + def fit(self): """Uses the calculated IPSW to obtain the risk difference and risk ratio from the sample. If weights are provided in the initial step, those weights are multiplied with IPSW to obtain the overall weights. @@ -455,8 +457,8 @@ def fit(self): yn = self._outcome_model.predict(dfn) if self.weight is not None: - r1 = np.average(ya, weights=self.df[self.weight]) - r0 = np.average(yn, weights=self.df[self.weight]) + r1 = np.average(ya, weights=self.target[self.weight]) + r0 = np.average(yn, weights=self.target[self.weight]) else: r1 = np.mean(ya) r0 = np.mean(yn) @@ -755,7 +757,6 @@ def fit(self): self.df['__ipw__'] = self.ipsw * self.df[self.weight] else: self.df['__ipw__'] = self.ipsw * self.iptw * self.df[self.weight] - print(self.ipsw * self.iptw * self.df[self.weight]) else: if self.iptw is None: self.df['__ipw__'] = self.ipsw From d990d1fc1445e31f4443d26ad9035220a5164809 Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 3 Oct 2019 09:32:39 -0400 Subject: [PATCH 17/42] Updating version-ing --- CHANGELOG.md | 2 +- zepid/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13c6172..c3f6f31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## Change logs -### v0.8.1 +### v0.8.2 `IPSW` and `AIPSW` now natively support adjusting for confounding. Both now have the `treatment_model()` function, which calculates the inverse probability of treatment weights. How weights are handled in `AIPSW` are updated. They are used in both the weight and the outcome models. diff --git a/zepid/version.py b/zepid/version.py index ef72cc0..4ca39e7 100644 --- a/zepid/version.py +++ b/zepid/version.py @@ -1 +1 @@ -__version__ = '0.8.1' +__version__ = '0.8.2' From 148b4c3a6b5d7ecfa155d238829a3fe0237ca072 Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 18 Nov 2019 11:32:22 -0500 Subject: [PATCH 18/42] Stochastic TMLE -- general structure --- zepid/causal/doublyrobust/TMLE.py | 376 ++++++++++++++++++++++++-- zepid/causal/doublyrobust/__init__.py | 2 +- 2 files changed, 356 insertions(+), 22 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index fcc5191..5d20aab 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -7,7 +7,7 @@ from scipy.stats import logistic, norm from zepid.causal.utils import propensity_score -from zepid.calc import probability_to_odds +from zepid.calc import probability_to_odds, odds_to_probability from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, missing_machine_learner, _bounding_, plot_kde, plot_love, standardized_mean_differences, positivity, plot_kde_accuracy, outcome_accuracy) @@ -15,13 +15,14 @@ class TMLE: r"""Implementation of target maximum likelihood estimator. This implementation calculates TMLE for a - time-fixed exposure and a single time-point outcome.It uses standard regression models to calculate the estimate - of interest. The TMLE estimator allows users to instead use machine learning algorithms from sklearn. + time-fixed exposure and a single time-point outcome. By default standard parametric regression models are used to + calculate the estimate of interest. The TMLE estimator allows users to instead use machine learning algorithms + from sklearn and PyGAM. Note ---- - Support for machine learning will be dropped in v0.9.0 due to poor confidence interval coverage. There will be - new cross-fitting procedures and classes for causal inference with machine learning. + Valid confidence intervals are only attainable with certain machine learning algorithms. These algorithms must be + Donsker class for valid confidence intervals. GAM and LASSO are examples of alogorithms that are Donsker class Parameters ---------- @@ -167,8 +168,8 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005): self._continuous_min = np.min(df[outcome]) self._continuous_max = np.max(df[outcome]) self._cb = continuous_bound - self.df[outcome] = self._unit_bounds(y=df[outcome], mini=self._continuous_min, - maxi=self._continuous_max, bound=self._cb) + self.df[outcome] = _tmle_unit_bounds_(y=df[outcome], mini=self._continuous_min, + maxi=self._continuous_max, bound=self._cb) self._out_model = None self._exp_model = None @@ -468,13 +469,13 @@ def fit(self): delta = np.where(self.df[self._missing_indicator] == 1, 1, 0) if self._continuous_outcome: # Calculating Average Treatment Effect - Qstar = self._unit_unbound(Qstar, mini=self._continuous_min, maxi=self._continuous_max) - Qstar1 = self._unit_unbound(Qstar1, mini=self._continuous_min, maxi=self._continuous_max) - Qstar0 = self._unit_unbound(Qstar0, mini=self._continuous_min, maxi=self._continuous_max) + Qstar = _tmle_unit_unbound_(Qstar, mini=self._continuous_min, maxi=self._continuous_max) + Qstar1 = _tmle_unit_unbound_(Qstar1, mini=self._continuous_min, maxi=self._continuous_max) + Qstar0 = _tmle_unit_unbound_(Qstar0, mini=self._continuous_min, maxi=self._continuous_max) self.average_treatment_effect = np.nanmean(Qstar1 - Qstar0) # Influence Curve for CL - y_unbound = self._unit_unbound(self.df[self.outcome], mini=self._continuous_min, maxi=self._continuous_max) + y_unbound = _tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min, maxi=self._continuous_max) ic = np.where(delta == 1, HAW * (y_unbound - Qstar) + (Qstar1 - Qstar0) - self.average_treatment_effect, Qstar1 - Qstar0 - self.average_treatment_effect) @@ -800,15 +801,348 @@ def plot_love(self, color_unweighted='r', color_weighted='b', shape_unweighted=' shape_unweighted=shape_unweighted, shape_weighted=shape_weighted) return ax - @staticmethod - def _unit_bounds(y, mini, maxi, bound): - # bounding for continuous outcomes - v = (y - mini) / (maxi - mini) - v = np.where(np.less(v, bound), bound, v) - v = np.where(np.greater(v, 1-bound), 1-bound, v) - return v + +class StochasticTMLE: + r"""Implementation of target maximum likelihood estimator for stochastic treatment plans. This implementation + calculates TMLE for a time-fixed exposure and a single time-point outcome under a stochastic treatment plan of + interest. By default, standard parametric regression models are used to calculate the estimate of interest. The + StochasticTMLE estimator allows users to instead use machine learning algorithms from sklearn and PyGAM. + + Note + ---- + Valid confidence intervals are only attainable with certain machine learning algorithms. These algorithms must be + Donsker class for valid confidence intervals. GAM and LASSO are examples of alogorithms that are Donsker class + + Parameters + ---------- + df : DataFrame + Pandas dataframe containing the variables of interest + exposure : str + Column label for the exposure of interest + outcome : str + Column label for the outcome of interest + alpha : float, optional + Alpha for confidence interval level. Default is 0.05 + continuous_bound : float, optional + Optional argument to control the bounding feature for continuous outcomes. The bounding process may result + in values of 0,1 which are undefined for logit(x). This parameter adds or substracts from the scenarios of + 0,1 respectively. Default value is 0.0005 + verbose : bool, optional + Optional argument for verbose estimation. With verbose estimation, the model fits for each result are printed + to the console. It is highly recommended to turn this parameter to True when conducting model diagnostics + + Note + ---- + TMLE is a doubly-robust substitution estimator. TMLE obtains the target estimate in a single step. The + single-step TMLE is described further by van der Laan. For further details, see the listed references. + + Continuous outcomes must be bounded between 0 and 1. TMLE does this automatically for the user. Additionally, + the average treatment effect is estimate is back converted to the original scale of Y. When scaling Y as Y*, + some values may take the value of 0 or 1, which breaks a logit(Y*) transformation. To avoid this issue, Y* is + bounded by the `continuous_bound` argument. The default is 0.0005, the same as R's tmle + + Examples + -------- + Setting up environment + + >>> from zepid import load_sample_data, spline + >>> from zepid.causal.doublyrobust import StochasticTMLE + >>> df = load_sample_data(False).dropna() + >>> df[['cd4_rs1', 'cd4_rs2']] = spline(df, 'cd40', n_knots=3, term=2, restricted=True) + + Estimating TMLE using logistic regression + + >>> tmle = TMLE(df, exposure='art', outcome='dead') + >>> # Specifying exposure/treatment model + >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + >>> # Specifying outcome model + >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + >>> # TMLE estimation procedure + >>> tmle.fit() + >>> # Printing main results + >>> tmle.summary() + >>> # Extracting risk difference and confidence intervals, respectively + >>> tmle.risk_difference + >>> tmle.risk_difference_ci + + Estimating TMLE with machine learning algorithm from sklearn + + >>> from sklearn.linear_model import LogisticRegression + >>> log1 = LogisticRegression(penalty='l1', random_state=201) + >>> tmle = TMLE(df, 'art', 'dead') + >>> # custom_model allows specification of machine learning algorithms + >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1) + >>> tmle.outcome_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1) + >>> tmle.fit() + + References + ---------- + Muñoz ID, and Van Der Laan MJ. Population intervention causal effects based on stochastic interventions. + Biometrics 68.2 (2012): 541-549. + + Van der Laan MJ, and Sherri R. Targeted learning in data science: causal inference for complex longitudinal + studies. Springer Science & Business Media, 2011. + """ + def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, verbose=False): + # Going through missing data + if df.dropna().shape[0] != df.shape[0]: + warnings.warn("There is missing data that is not the outcome in the data set. StochasticTMLE will drop all " + "missing data. StochasticTMLE will fit " + + str(df.dropna().shape[0]) + + ' of ' + str(df.shape[0]) + ' observations', UserWarning) + self.df = df.copy().dropna().reset_index() + else: + self.df = df.copy().reset_index() + + if not df[exposure].value_counts().index.isin([0, 1]).all(): + raise ValueError("StochasticTMLE only supports binary exposures currently") + + # Manage outcomes + if df[outcome].dropna().value_counts().index.isin([0, 1]).all(): + self._continuous_outcome = False + self._cb = 0.0 + else: + self._continuous_outcome = True + self._continuous_min = np.min(df[outcome]) + self._continuous_max = np.max(df[outcome]) + self._cb = continuous_bound + self.df[outcome] = _tmle_unit_bounds_(y=df[outcome], mini=self._continuous_min, + maxi=self._continuous_max, bound=self._cb) + + self.exposure = exposure + self.outcome = outcome + + # Output attributes + self.marginals_vector = None + self.marginal_outcome = None + self.alpha = alpha + self.marginal_ci = None + self.conditional_ci = None + + # Storage for items I need later + self._outcome_model = None + self._q_model = None + self._Qinit_ = None + self._treatment_model = None + self._g_model = None + self._denominator_ = None + self._verbose_ = verbose + + # Custom model / machine learner storage + self._g_custom_ = None + self._q_custom_ = None + + def exposure_model(self, model, custom_model=None, bound=False): + """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature + + Parameters + ---------- + model : str + Independent variables to predict the exposure. Example) 'var1 + var2 + var3' + custom_model : optional + Input for a custom model that is used in place of the logit model (default). The model must have the + "fit()" and "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the + background, TMLE will fit the custom model and generate the predicted probablities + bound : float, list, optional + Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations. + Specifying this argument can improve finite sample performance for random positivity violations. However, + truncating weights leads to additional confounding. Default is False, meaning no truncation of + predicted probabilities occurs. Providing a single float assumes symmetric trunctation, where values below + or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for + asymmetric trunctation, with the first value being the lower bound and the second being the upper bound + """ + self._g_model = self.exposure + ' ~ ' + model + + if custom_model is None: # Standard parametric regression model + fitmodel = propensity_score(self.df, self._g_model, print_results=self._verbose_) + pred = fitmodel.predict(self.df) + else: # User-specified prediction model + # TODO need to create smart warning system + self._exp_model_custom = True + data = patsy.dmatrix(model + ' - 1', self.df) + pred = exposure_machine_learner(xdata=np.asarray(data), ydata=np.asarray(self.df[self.exposure]), + ml_model=custom_model, print_results=self._verbose_) + + if bound: # Bounding predicted probabilities if requested + self._denominator_ = _bounding_(self._denominator_, bounds=bound) + + self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred) + + def outcome_model(self, model, custom_model=None, bound=False, continuous_distribution='gaussian'): + """Estimation of E(Y|A,L), which is also written sometimes as Q(A,W) or Pr(Y=1|A,W). + + Parameters + ---------- + model : str + Independent variables to predict the exposure. Example) 'var1 + var2 + var3' + custom_model : optional + Input for a custom model that is used in place of the logit model (default). The model must have the + "fit()" and "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the + background, TMLE will fit the custom model and generate the predicted probablities + bound : bool, optional + This argument should ONLY be used if the outcome is continuous. Value between 0,1 to truncate the bounded + predicted outcomes. Default is `False`, meaning no truncation of predicted outcomes occurs (unless a + predicted outcome is outside the bounded continuous outcome). Providing a single float assumes symmetric + trunctation. A list of floats can be provided for asymmetric trunctation. + print_results : bool, optional + Whether to print the fitted model results. Default is True (prints results) + continuous_distribution : str, optional + Distribution to use for continuous outcomes. Options are 'gaussian' for normal distributions and 'poisson' + for Poisson distributions + """ + self._q_model = self.outcome + ' ~ ' + model + + # Step 1) Prediction for Q (estimation of Q-model) + if custom_model is None: # Standard parametric regression + self._continuous_type = continuous_distribution + if self._continuous_outcome: + if (continuous_distribution.lower() == 'gaussian') or (continuous_distribution.lower() == 'normal'): + f = sm.families.family.Gaussian() + elif continuous_distribution.lower() == 'poisson': + f = sm.families.family.Poisson() + else: + raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous " + "outcomes") + log = smf.glm(self._q_model, self.df, family=f).fit() + else: + f = sm.families.family.Binomial() + log = smf.glm(self._q_model, self.df, family=f).fit() + + if self._verbose_: + print('==============================================================================') + print('Q-model') + print(self._outcome_model.summary()) + + # Step 2) Estimation under the scenarios + self._Qinit_ = log.predict(self.df) + + else: # User-specified model + # TODO need to create smart warning system + self._out_model_custom = True + data = patsy.dmatrix(model + ' - 1', df) + self._Qinit_ + self._outcome_model + + if not bound: # Bounding predicted probabilities if requested + bound = self._cb + + # This bounding step prevents continuous outcomes from being outside the range + self._Qinit_ = _bounding_(self._Qinit_, bounds=bound) + + def fit(self, p, condition=None, samples=100): + """Calculate the effect from the predicted exposure probabilities and predicted outcome values using the TMLE + procedure. Confidence intervals are calculated using influence curves. + + Note + ---- + Exposure and outcome models must be specified prior to `fit()` + + Returns + ------- + TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and + `average _treatment_effect` for continuous outcomes + """ + # Step 4) Calculating clever covariate (HAW) + # TODO need to apply p properly + haw = p / self._denominator_ + + # Step 5) Estimating TMLE + epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_) + + # Step 6) Estimating psi + q_star_list = [] + for i in range(samples): + # Applying treatment plan + df = self.df.copy() + if condition is None: + df[self.exposure] = np.random.binomial(n=1, p=p, size=df.shape[0]) + else: + df[self.exposure] = np.nan + for c, prop in zip(condition, p): + df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0]) + + # Outcome model under treatment plan + y_star = self._outcome_model.predict(df) + + # Targeted Estimate + logit_qstar = np.log(probability_to_odds(y_star)) + epsilon # NOTE: log(Y^*) + e + q_star = odds_to_probability(np.exp(logit_qstar)) + q_star_list.append(q_star) + + if self._continuous_outcome: + self.marginals_vector = _tmle_unit_unbound_(q_star_list, + mini=self._continuous_min, maxi=self._continuous_max) + y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min, + maxi=self._continuous_max)) + yq0_ = _tmle_unit_unbound_(self._Qinit_, mini=self._continuous_min, maxi=self._continuous_max) + else: + self.marginals_vector = q_star_list + y_ = np.array(self.df[self.outcome]) + yq0_ = self._Qinit_ + + self.marginal_outcome = np.mean(self.marginals_vector) + + # Step 7) Estimating Var(psi) + if self.alpha == 0.05: # Without this, won't match R exactly. R relies on 1.96, while I use SciPy + zalpha = 1.96 + else: + zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1) + # TODO add remainder of variance calculations + + def summary(self, decimal=3): + if self.marginal_outcome is None: + raise ValueError('The fit() statement must be ran before summary()') + + print('======================================================================') + print(' Stochastic Targeted Maximum Likelihood Estimator ') + print('======================================================================') + fmt = 'Treatment: {:<15} No. Observations: {:<20}' + print(fmt.format(self.exposure, self.df.shape[0])) + fmt = 'Outcome: {:<15}' + print(fmt.format(self.outcome)) + fmt = 'Q-Model: {:<15}' + print(fmt.format('Logistic')) + # TODO add custom model check + fmt = 'g-Model: {:<15}' + print(fmt.format('Logistic')) + + # TODO add treatment plan information + # TODO add information on m (number of re-samples) + + print('======================================================================') + print('Overall incidence: ', np.round(self.marginal_outcome, decimals=decimal)) + # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal)) + print('======================================================================') + print('Conditional') + # print('95% CL: ', np.round(self.ci_cond, decimals=decimal)) + print('======================================================================') @staticmethod - def _unit_unbound(ystar, mini, maxi): - # unbounding of bounded continuous outcomes - return ystar*(maxi - mini) + mini + def targeting_step(y, q_init, iptw, verbose): + f = sm.families.family.Binomial() + log = sm.GLM(y, # Outcome / dependent variable + np.repeat(1, y.shape[0]), # Generating intercept only model + offset=np.log(probability_to_odds(q_init)), # Offset by g-formula predictions + freq_weights=iptw, # Weighted by calculated IPW + family=f).fit() + + if verbose: # Optional argument to print each intermediary result + print('==============================================================================') + print('Targeting Model') + print(log.summary()) + + return log.params[0] # Returns single-step estimated Epsilon term + + +# Functions that all TMLEs can call are below +def _tmle_unit_bounds_(y, mini, maxi, bound): + # bounding for continuous outcomes + v = (y - mini) / (maxi - mini) + v = np.where(np.less(v, bound), bound, v) + v = np.where(np.greater(v, 1-bound), 1-bound, v) + return v + + +def _tmle_unit_unbound_(ystar, mini, maxi): + # unbounding of bounded continuous outcomes + return ystar*(maxi - mini) + mini diff --git a/zepid/causal/doublyrobust/__init__.py b/zepid/causal/doublyrobust/__init__.py index fb3d55b..fb76187 100644 --- a/zepid/causal/doublyrobust/__init__.py +++ b/zepid/causal/doublyrobust/__init__.py @@ -1,2 +1,2 @@ from .AIPW import AIPTW -from .TMLE import TMLE +from .TMLE import TMLE, StochasticTMLE From 6b6be91f19f87d0ea65903382232791a801d3b17 Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 18 Nov 2019 13:17:28 -0500 Subject: [PATCH 19/42] Moving check_conditional to utils.py --- zepid/causal/ipw/IPTW.py | 18 +++--------------- zepid/causal/utils.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py index 8e9e9d5..83d5fbb 100644 --- a/zepid/causal/ipw/IPTW.py +++ b/zepid/causal/ipw/IPTW.py @@ -7,7 +7,7 @@ from statsmodels.tools.sm_exceptions import DomainWarning import matplotlib.pyplot as plt -from zepid.causal.utils import (propensity_score, plot_boxplot, plot_kde, plot_love, +from zepid.causal.utils import (propensity_score, plot_boxplot, plot_kde, plot_love, stochastic_check_conditional, standardized_mean_differences, positivity, _bounding_, iptw_calculator) @@ -782,7 +782,7 @@ def fit(self, p, conditional=None): if conditional is None: df['_numer_'] = np.where(df[self.treatment] == 1, p, 1 - p) else: - self._check_conditional(conditional=conditional) + stochastic_check_conditional(df=df, conditional=conditional) df['_numer_'] = np.nan for c, prop in zip(conditional, p): df['_numer_'] = np.where(eval(c), @@ -809,7 +809,7 @@ def summary(self, decimal=3): raise ValueError('The fit() function must be specified before summary()') print('======================================================================') - print(' Stochastic IPTW') + print(' Stochastic IPTW ') print('======================================================================') fmt = 'Treatment: {:<15} No. Observations: {:<20}' print(fmt.format(self.treatment, self.df.shape[0])) @@ -818,15 +818,3 @@ def summary(self, decimal=3): print('======================================================================') print('Risk: ', round(self.marginal_outcome, decimal)) print('======================================================================') - - def _check_conditional(self, conditional): - """Check that conditionals are exclusive for the stochastic fit process. Generates a warning if not true - """ - df = self.df.copy() - a = np.array([0] * df.shape[0]) - for c in conditional: - a = np.add(a, np.where(eval(c), 1, 0)) - - if np.sum(np.where(a > 1, 1, 0)): - warnings.warn("It looks like your conditional categories are NOT exclusive. For appropriate estimation, " - "the conditions that designate each category should be exclusive", UserWarning) diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py index afb7c94..29f031c 100644 --- a/zepid/causal/utils.py +++ b/zepid/causal/utils.py @@ -609,3 +609,15 @@ def plot_kde_accuracy(values, bw_method='scott', fill=True, color='b'): ax.set_xlabel(r'$Y_{pred} - Y$') ax.set_ylabel('Density') return ax + + +def stochastic_check_conditional(df, conditional): + """Check that conditionals are exclusive for the stochastic fit process. Generates a warning if not true + """ + a = np.array([0] * df.shape[0]) + for c in conditional: + a = np.add(a, np.where(eval(c), 1, 0)) + + if np.sum(np.where(a > 1, 1, 0)): + warnings.warn("It looks like your conditional categories are NOT exclusive. For appropriate estimation, " + "the conditions that designate each category should be exclusive", UserWarning) From 8d3ca81c04c497d05e09c376bf44ca62b3bc9ed5 Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 18 Nov 2019 13:17:54 -0500 Subject: [PATCH 20/42] Fixing conditional IPTW for TMLE --- zepid/causal/doublyrobust/TMLE.py | 47 ++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 5d20aab..355b3f3 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -6,7 +6,7 @@ import matplotlib.pyplot as plt from scipy.stats import logistic, norm -from zepid.causal.utils import propensity_score +from zepid.causal.utils import propensity_score, stochastic_check_conditional from zepid.calc import probability_to_odds, odds_to_probability from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, missing_machine_learner, _bounding_, plot_kde, plot_love, standardized_mean_differences, positivity, @@ -984,8 +984,6 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri predicted outcomes. Default is `False`, meaning no truncation of predicted outcomes occurs (unless a predicted outcome is outside the bounded continuous outcome). Providing a single float assumes symmetric trunctation. A list of floats can be provided for asymmetric trunctation. - print_results : bool, optional - Whether to print the fitted model results. Default is True (prints results) continuous_distribution : str, optional Distribution to use for continuous outcomes. Options are 'gaussian' for normal distributions and 'poisson' for Poisson distributions @@ -1004,6 +1002,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous " "outcomes") log = smf.glm(self._q_model, self.df, family=f).fit() + else: f = sm.families.family.Binomial() log = smf.glm(self._q_model, self.df, family=f).fit() @@ -1019,7 +1018,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri else: # User-specified model # TODO need to create smart warning system self._out_model_custom = True - data = patsy.dmatrix(model + ' - 1', df) + data = patsy.dmatrix(model + ' - 1', self.df) self._Qinit_ self._outcome_model @@ -1029,7 +1028,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri # This bounding step prevents continuous outcomes from being outside the range self._Qinit_ = _bounding_(self._Qinit_, bounds=bound) - def fit(self, p, condition=None, samples=100): + def fit(self, p, conditional=None, samples=100): """Calculate the effect from the predicted exposure probabilities and predicted outcome values using the TMLE procedure. Confidence intervals are calculated using influence curves. @@ -1042,9 +1041,29 @@ def fit(self, p, condition=None, samples=100): TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and `average _treatment_effect` for continuous outcomes """ + # Error checking + if self._denominator_ is None: + raise ValueError("The exposure_model() function must be specified before the fit() function") + if self._Qinit_ is None: + raise ValueError("The outcome_model() function must be specified before the fit() function") + + p = np.array(p) + if np.any(p > 1): + raise ValueError("All specified treatment probabilities must be less than 1") + if conditional is not None: + if len(p) != len(conditional): + raise ValueError("'p' and 'conditional' must be the same length") + # Step 4) Calculating clever covariate (HAW) - # TODO need to apply p properly - haw = p / self._denominator_ + if conditional is None: + numerator = np.where(self.df[self.exposure] == 1, p, 1 - p) + else: + stochastic_check_conditional(df=self.df, conditional=conditional) + numerator = np.array([np.nan] for i in range(self.df.shape[0])) + for c, prop in zip(conditional, p): + numerator = np.where(eval(c), np.where(self.df[self.exposure] == 1, prop, 1 - prop), numerator) + + haw = numerator / self._denominator_ # Step 5) Estimating TMLE epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_) @@ -1054,26 +1073,26 @@ def fit(self, p, condition=None, samples=100): for i in range(samples): # Applying treatment plan df = self.df.copy() - if condition is None: + if conditional is None: df[self.exposure] = np.random.binomial(n=1, p=p, size=df.shape[0]) else: df[self.exposure] = np.nan - for c, prop in zip(condition, p): + for c, prop in zip(conditional, p): df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0]) # Outcome model under treatment plan y_star = self._outcome_model.predict(df) # Targeted Estimate - logit_qstar = np.log(probability_to_odds(y_star)) + epsilon # NOTE: log(Y^*) + e - q_star = odds_to_probability(np.exp(logit_qstar)) + logit_qstar = np.log(probability_to_odds(y_star)) + epsilon # logit(Y^*) + e + q_star = odds_to_probability(np.exp(logit_qstar)) # Y^* q_star_list.append(q_star) if self._continuous_outcome: self.marginals_vector = _tmle_unit_unbound_(q_star_list, - mini=self._continuous_min, maxi=self._continuous_max) + mini=self._continuous_min, maxi=self._continuous_max) y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min, - maxi=self._continuous_max)) + maxi=self._continuous_max)) yq0_ = _tmle_unit_unbound_(self._Qinit_, mini=self._continuous_min, maxi=self._continuous_max) else: self.marginals_vector = q_star_list @@ -1088,7 +1107,7 @@ def fit(self, p, condition=None, samples=100): else: zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1) # TODO add remainder of variance calculations - + def summary(self, decimal=3): if self.marginal_outcome is None: raise ValueError('The fit() statement must be ran before summary()') From feca583268f2ad61fc62d9e107398faa985b49ee Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 18 Nov 2019 13:33:56 -0500 Subject: [PATCH 21/42] improved testing for OOB p in StochasticIPTW --- tests/test_ipw.py | 8 +++++++- zepid/causal/ipw/IPTW.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_ipw.py b/tests/test_ipw.py index e98d6b9..31adb3c 100644 --- a/tests/test_ipw.py +++ b/tests/test_ipw.py @@ -253,11 +253,14 @@ def test_error_no_model(self, sdata): with pytest.raises(ValueError): sipw.fit(p=0.8) - def test_error_prob_high(self, sdata): + def test_error_p_oob(self, sdata): sipw = StochasticIPTW(sdata.dropna(), treatment='art', outcome='dead') with pytest.raises(ValueError): sipw.fit(p=1.8) + with pytest.raises(ValueError): + sipw.fit(p=-0.8) + def test_error_summary(self, sdata): sipw = StochasticIPTW(sdata.dropna(), treatment='art', outcome='dead') with pytest.raises(ValueError): @@ -270,6 +273,9 @@ def test_error_conditional(self, sdata): with pytest.raises(ValueError): sipw.fit(p=[0.8, 0.1, 0.1], conditional=["df['male']==1", "df['male']==0"]) + with pytest.raises(ValueError): + sipw.fit(p=[0.8], conditional=["df['male']==1", "df['male']==0"]) + def test_uncond_treatment(self, sdata): r_pred = 0.1165162207 diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py index 83d5fbb..2a28eb2 100644 --- a/zepid/causal/ipw/IPTW.py +++ b/zepid/causal/ipw/IPTW.py @@ -771,8 +771,8 @@ def fit(self, p, conditional=None): if self._pdenom_ is None: raise ValueError("The treatment_model() function must be specified before the fit() function") - if np.any(p > 1): - raise ValueError("All specified treatment probabilities must be less than 1") + if np.any(p > 1) or np.any(p < 0): + raise ValueError("All specified treatment probabilities must be between 0 and 1") if conditional is not None: if len(p) != len(conditional): raise ValueError("'p' and 'conditional' must be the same length") From ff3f0bb9a596d4c3394d381110e860b0077eca9a Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 18 Nov 2019 14:11:52 -0500 Subject: [PATCH 22/42] starting tests for StochasticTMLE --- tests/test_doublyrobust.py | 187 +++++++++++++++++++++++++++++- zepid/causal/doublyrobust/TMLE.py | 15 +-- 2 files changed, 191 insertions(+), 11 deletions(-) diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py index 1c6cf6f..dbe737e 100644 --- a/tests/test_doublyrobust.py +++ b/tests/test_doublyrobust.py @@ -2,10 +2,11 @@ import numpy as np import pandas as pd import numpy.testing as npt +import pandas.testing as pdt from sklearn.linear_model import LogisticRegression, LinearRegression import zepid as ze -from zepid.causal.doublyrobust import TMLE, AIPTW +from zepid.causal.doublyrobust import TMLE, AIPTW, StochasticTMLE class TestTMLE: @@ -43,24 +44,43 @@ def test_drop_missing_data(self): tmle = TMLE(df, exposure='art', outcome='dead') assert df.dropna(subset=['cd4_wk45']).shape[0] == tmle.df.shape[0] - def test_error_when_no_models_specified1(self, df): + def test_error_when_no_models_specified(self, df): tmle = TMLE(df, exposure='art', outcome='dead') with pytest.raises(ValueError): tmle.fit() - def test_error_when_no_models_specified2(self, df): tmle = TMLE(df, exposure='art', outcome='dead') tmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False) with pytest.raises(ValueError): tmle.fit() - def test_error_when_no_models_specified3(self, df): tmle = TMLE(df, exposure='art', outcome='dead') tmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False) with pytest.raises(ValueError): tmle.fit() + def test_continuous_processing(self): + a_list = [0, 1, 1, 0, 1, 1, 0, 0] + y_list = [1, -1, 5, 0, 0, 0, 10, -5] + df = pd.DataFrame() + df['A'] = a_list + df['Y'] = y_list + + tmle = TMLE(df=df, exposure='A', outcome='Y', continuous_bound=0.0001) + + # Checking all flagged parts are correct + assert tmle._continuous_outcome is True + assert tmle._continuous_min == -5 + assert tmle._continuous_max == 10 + assert tmle._cb == 0.0001 + + # Checking that TMLE bounding works as intended + y_bound = [2 / 5, 4 / 15, 2 / 3, 1 / 3, 1 / 3, 1 / 3, 0.9999, 0.0001] + pdt.assert_series_equal(pd.Series(y_bound), + tmle.df['Y'], + check_dtype=False, check_names=False) + def test_match_r_epsilons(self, df): r_epsilons = [-0.016214091, 0.003304079] tmle = TMLE(df, exposure='art', outcome='dead') @@ -315,6 +335,165 @@ def test_sklearn_in_tmle_missing(self, mf): npt.assert_allclose(tmle.odds_ratio_ci, [0.213980, 0.978331], rtol=1e-4) +class TestStochasticTMLE: + + @pytest.fixture + def df(self): + df = ze.load_sample_data(False) + df[['cd4_rs1', 'cd4_rs2']] = ze.spline(df, 'cd40', n_knots=3, term=2, restricted=True) + df[['age_rs1', 'age_rs2']] = ze.spline(df, 'age0', n_knots=3, term=2, restricted=True) + return df.drop(columns=['cd4_wk45']).dropna() + + @pytest.fixture + def cf(self): + df = ze.load_sample_data(False) + df[['cd4_rs1', 'cd4_rs2']] = ze.spline(df, 'cd40', n_knots=3, term=2, restricted=True) + df[['age_rs1', 'age_rs2']] = ze.spline(df, 'age0', n_knots=3, term=2, restricted=True) + return df.drop(columns=['dead']).dropna() + + @pytest.fixture + def simple_df(self): + expected = pd.DataFrame([[1, 1, 1, 1, 1], + [0, 0, 0, -1, 2], + [0, 1, 0, 5, 1], + [0, 0, 1, 0, 0], + [1, 0, 0, 0, 1], + [1, 0, 1, 0, 0], + [0, 1, 0, 10, 1], + [0, 0, 0, -5, 0], + [1, 1, 0, -5, 2]], + columns=["W", "A", "Y", "C", "S"], + index=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + return expected + + def test_error_continuous_exp(self, df): + with pytest.raises(ValueError): + StochasticTMLE(df=df, exposure='cd40', outcome='dead') + + def test_error_fit(self, df): + stmle = StochasticTMLE(df=df, exposure='art', outcome='dead') + with pytest.raises(ValueError): + stmle.fit(p=0.5) + + stmle = StochasticTMLE(df=df, exposure='art', outcome='dead') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + with pytest.raises(ValueError): + stmle.fit(p=0.5) + + stmle = StochasticTMLE(df=df, exposure='art', outcome='dead') + stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + with pytest.raises(ValueError): + stmle.fit(p=0.5) + + def test_error_p_oob(self, df): + stmle = StochasticTMLE(df=df, exposure='art', outcome='dead') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + with pytest.raises(ValueError): + stmle.fit(p=1.1) + + with pytest.raises(ValueError): + stmle.fit(p=-0.1) + + def test_error_p_cond_len(self, df): + stmle = StochasticTMLE(df=df, exposure='art', outcome='dead') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + with pytest.raises(ValueError): + stmle.fit(p=[0.1], conditional=["df['male']==1", "df['male']==0"]) + + with pytest.raises(ValueError): + stmle.fit(p=[0.1, 0.3], conditional=["df['male']==1"]) + + def test_error_summary(self, df): + stmle = StochasticTMLE(df=df, exposure='art', outcome='dead') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + with pytest.raises(ValueError): + stmle.summary() + + def test_continuous_processing(self): + a_list = [0, 1, 1, 0, 1, 1, 0, 0] + y_list = [1, -1, 5, 0, 0, 0, 10, -5] + df = pd.DataFrame() + df['A'] = a_list + df['Y'] = y_list + + stmle = StochasticTMLE(df=df, exposure='A', outcome='Y', continuous_bound=0.0001) + + # Checking all flagged parts are correct + assert stmle._continuous_outcome is True + assert stmle._continuous_min == -5 + assert stmle._continuous_max == 10 + assert stmle._cb == 0.0001 + + # Checking that TMLE bounding works as intended + y_bound = [2 / 5, 4 / 15, 2 / 3, 1 / 3, 1 / 3, 1 / 3, 0.9999, 0.0001] + pdt.assert_series_equal(pd.Series(y_bound), + stmle.df['Y'], + check_dtype=False, check_names=False) + + def test_marginal_vector_length_stoch(self, df): + stmle = StochasticTMLE(df=df, exposure='art', outcome='dead') + stmle.exposure_model('male') + stmle.outcome_model('art + male + age0') + stmle.fit(p=0.4, samples=7) + assert len(stmle.marginals_vector) == 7 + + def test_qmodel_params(self, simple_df): + # Comparing to SAS logit model + sas_params = [-1.0699, -0.9525, 1.5462] + sas_preds = [0.3831332, 0.2554221, 0.1168668, 0.2554221, 0.6168668, 0.6168668, 0.1168668, 0.2554221, 0.3831332] + + stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='Y') + stmle.outcome_model('A + W') + est_params = stmle._outcome_model.params + est_preds = stmle._Qinit_ + + npt.assert_allclose(sas_params, est_params, atol=1e-4) + npt.assert_allclose(sas_preds, est_preds, atol=1e-6) + + def test_qmodel_params2(self, simple_df): + # Comparing to SAS linear model + sas_params = [0.3876, 0.3409, -0.2030, -0.0883] + sas_preds = [0.437265, 0.210957, 0.6402345, 0.3876202, 0.0963188, 0.1846502, 0.6402345, 0.38762016, 0.34893314] + + stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C') + stmle.outcome_model('A + W + S', continuous_distribution='normal') + est_params = stmle._outcome_model.params + est_preds = stmle._Qinit_ + + npt.assert_allclose(sas_params, est_params, atol=1e-4) + npt.assert_allclose(sas_preds, est_preds, atol=1e-6) + + def test_qmodel_params3(self, simple_df): + # Comparing to SAS Poisson model + sas_params = [-1.0478, 0.9371, -0.5321, -0.2733] + sas_preds = [0.4000579, 0.2030253, 0.6811115, 0.3507092, 0.1567304, 0.20599265, 0.6811115, 0.3507092, 0.3043857] + + stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C') + stmle.outcome_model('A + W + S', continuous_distribution='Poisson') + est_params = stmle._outcome_model.params + est_preds = stmle._Qinit_ + + npt.assert_allclose(sas_params, est_params, atol=1e-4) + npt.assert_allclose(sas_preds, est_preds, atol=1e-6) + + def test_gmodel_params(self, simple_df): + # Comparing to SAS Poisson model + sas_preds = [2.0, 1.6666666667, 2.5, 1.6666666667, 2, 2, 2.5, 1.6666666667, 2] + + stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C') + stmle.exposure_model('W') + est_preds = 1 / stmle._denominator_ + + npt.assert_allclose(sas_preds, est_preds, atol=1e-6) + + # TODO check bounding + + # TODO compare to R in several versions + + class TestAIPTW: @pytest.fixture diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 355b3f3..b6eae4e 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -1001,11 +1001,11 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri else: raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous " "outcomes") - log = smf.glm(self._q_model, self.df, family=f).fit() + self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit() else: f = sm.families.family.Binomial() - log = smf.glm(self._q_model, self.df, family=f).fit() + self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit() if self._verbose_: print('==============================================================================') @@ -1013,7 +1013,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri print(self._outcome_model.summary()) # Step 2) Estimation under the scenarios - self._Qinit_ = log.predict(self.df) + self._Qinit_ = self._outcome_model.predict(self.df) else: # User-specified model # TODO need to create smart warning system @@ -1041,6 +1041,7 @@ def fit(self, p, conditional=None, samples=100): TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and `average _treatment_effect` for continuous outcomes """ + # TODO add a seed # Error checking if self._denominator_ is None: raise ValueError("The exposure_model() function must be specified before the fit() function") @@ -1048,8 +1049,8 @@ def fit(self, p, conditional=None, samples=100): raise ValueError("The outcome_model() function must be specified before the fit() function") p = np.array(p) - if np.any(p > 1): - raise ValueError("All specified treatment probabilities must be less than 1") + if np.any(p > 1) or np.any(p < 0): + raise ValueError("All specified treatment probabilities must be between 0 and 1") if conditional is not None: if len(p) != len(conditional): raise ValueError("'p' and 'conditional' must be the same length") @@ -1086,7 +1087,7 @@ def fit(self, p, conditional=None, samples=100): # Targeted Estimate logit_qstar = np.log(probability_to_odds(y_star)) + epsilon # logit(Y^*) + e q_star = odds_to_probability(np.exp(logit_qstar)) # Y^* - q_star_list.append(q_star) + q_star_list.append(np.mean(q_star)) # E[Y^*] if self._continuous_outcome: self.marginals_vector = _tmle_unit_unbound_(q_star_list, @@ -1102,7 +1103,7 @@ def fit(self, p, conditional=None, samples=100): self.marginal_outcome = np.mean(self.marginals_vector) # Step 7) Estimating Var(psi) - if self.alpha == 0.05: # Without this, won't match R exactly. R relies on 1.96, while I use SciPy + if self.alpha == 0.05: # Without this, won't match R exactly. R relies on 1.96, while I use SciPy usually zalpha = 1.96 else: zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1) From a146880fcf2d6432c18f544655fc32beab440af6 Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 18 Nov 2019 16:16:00 -0500 Subject: [PATCH 23/42] more tests for StochasticTMLE --- tests/test_doublyrobust.py | 18 ++++++++++++++++++ zepid/causal/doublyrobust/TMLE.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py index dbe737e..2555d42 100644 --- a/tests/test_doublyrobust.py +++ b/tests/test_doublyrobust.py @@ -489,6 +489,24 @@ def test_gmodel_params(self, simple_df): npt.assert_allclose(sas_preds, est_preds, atol=1e-6) + def test_compare_tmle(self, df): + stmle = StochasticTMLE(df, exposure='art', outcome='dead') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.fit(p=1.0, samples=1) + all_treat = stmle.marginal_outcome + stmle.fit(p=0.0, samples=1) + non_treat = stmle.marginal_outcome + + tmle = TMLE(df, exposure='art', outcome='dead') + tmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False) + tmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', + print_results=False) + tmle.fit() + expected = tmle.risk_difference + + npt.assert_allclose(expected, all_treat - non_treat, atol=1e-4) + # TODO check bounding # TODO compare to R in several versions diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index b6eae4e..b71920a 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -1090,7 +1090,7 @@ def fit(self, p, conditional=None, samples=100): q_star_list.append(np.mean(q_star)) # E[Y^*] if self._continuous_outcome: - self.marginals_vector = _tmle_unit_unbound_(q_star_list, + self.marginals_vector = _tmle_unit_unbound_(np.array(q_star_list), mini=self._continuous_min, maxi=self._continuous_max) y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min, maxi=self._continuous_max)) From c7346a47838eb8775571158f9236e5e0793b9cd6 Mon Sep 17 00:00:00 2001 From: pzivich Date: Tue, 19 Nov 2019 08:09:05 -0500 Subject: [PATCH 24/42] updated StochasticTMLE docs --- zepid/causal/doublyrobust/TMLE.py | 59 ++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index b71920a..187c350 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -841,6 +841,38 @@ class StochasticTMLE: some values may take the value of 0 or 1, which breaks a logit(Y*) transformation. To avoid this issue, Y* is bounded by the `continuous_bound` argument. The default is 0.0005, the same as R's tmle + Following is a general narrative of the estimation procedure for TMLE with stochastic treatments + + 1. Initial estimators for g-model (IPTW) and Q-model (g-formula) are fit. By default these estimators are based + on parametric regression models. Additionally, machine learning algorithms can be used to estimate the g-model and + Q-model. + + 2. The auxiliary covariate is calculated (i.e. IPTW). + + .. math:: + + H(A=a, L) = \frac{I(A=a) p}{\widehat{\Pr}(A=a)} + + where `p` is the probability of treatment `a` under the stochastic intervention of interest. + + 3. Targeting step occurs through estimation of `e` via a logistic regression model. Briefly a weighted logistic + regression model (weighted by the auxiliary covariates) with the dependent variable as the observed outcome and + an offset term of the Q-model predictions under the observed treatment (A). + + .. math:: + + \text{logit}(Y) = \text{logit}(Q(A, W)) + \epsilon + + 4. Stochastic interventions are evaluated through Monte Carlo integration for binary treatments. The different + treatment plans are randomly applied and evaluated through the Q-model and then the targeting step via + + .. math:: + + E[\text{logit}(Q(A=a, W)) + \hat{\epsilon}] + + This process is repeated a large number of times and the point estimate is the average of those individual treatment + plans. + Examples -------- Setting up environment @@ -850,30 +882,30 @@ class StochasticTMLE: >>> df = load_sample_data(False).dropna() >>> df[['cd4_rs1', 'cd4_rs2']] = spline(df, 'cd40', n_knots=3, term=2, restricted=True) - Estimating TMLE using logistic regression + Estimating TMLE for 0.2 being treated with ART >>> tmle = TMLE(df, exposure='art', outcome='dead') - >>> # Specifying exposure/treatment model >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') - >>> # Specifying outcome model >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') - >>> # TMLE estimation procedure - >>> tmle.fit() - >>> # Printing main results + >>> tmle.fit(p=0.2) + >>> tmle.summary() + + Estimating TMLE for conditional plan + + >>> tmle = TMLE(df, exposure='art', outcome='dead') + >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + >>> tmle.fit(p=[0.6, 0.4], conditional=["df['male']==1", "df['male']==0"]) >>> tmle.summary() - >>> # Extracting risk difference and confidence intervals, respectively - >>> tmle.risk_difference - >>> tmle.risk_difference_ci Estimating TMLE with machine learning algorithm from sklearn >>> from sklearn.linear_model import LogisticRegression >>> log1 = LogisticRegression(penalty='l1', random_state=201) >>> tmle = TMLE(df, 'art', 'dead') - >>> # custom_model allows specification of machine learning algorithms >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1) >>> tmle.outcome_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1) - >>> tmle.fit() + >>> tmle.fit(p=0.75) References ---------- @@ -1059,12 +1091,13 @@ def fit(self, p, conditional=None, samples=100): if conditional is None: numerator = np.where(self.df[self.exposure] == 1, p, 1 - p) else: + df = self.df.copy() stochastic_check_conditional(df=self.df, conditional=conditional) numerator = np.array([np.nan] for i in range(self.df.shape[0])) for c, prop in zip(conditional, p): - numerator = np.where(eval(c), np.where(self.df[self.exposure] == 1, prop, 1 - prop), numerator) + numerator = np.where(eval(c), np.where(df[self.exposure] == 1, prop, 1 - prop), numerator) - haw = numerator / self._denominator_ + haw = np.array(numerator / self._denominator_).astype(float) # Step 5) Estimating TMLE epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_) From f4d7cd422c0a498610f3912ac55a237ad35fce5f Mon Sep 17 00:00:00 2001 From: pzivich Date: Tue, 19 Nov 2019 08:23:30 -0500 Subject: [PATCH 25/42] variance estimation tweaking --- zepid/causal/doublyrobust/TMLE.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 187c350..0a37297 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -948,7 +948,9 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v self.marginals_vector = None self.marginal_outcome = None self.alpha = alpha + self.marginal_se = None self.marginal_ci = None + self.conditional_se = None self.conditional_ci = None # Storage for items I need later @@ -1140,7 +1142,19 @@ def fit(self, p, conditional=None, samples=100): zalpha = 1.96 else: zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1) - # TODO add remainder of variance calculations + + # Marginal variance estimator + # variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_, + # y_pred_targeted=, psi=self.marginal_outcome) + # self.marginal_se = np.sqrt(variance_marginal) + # self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se, + # self.marginal_outcome + zalpha * self.marginal_se] + + # Conditional on W variance estimator + variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_) + self.conditional_se = np.sqrt(variance_conditional) + self.conditional_ci = [self.marginal_outcome - zalpha * self.conditional_se, + self.marginal_outcome + zalpha * self.conditional_se] def summary(self, decimal=3): if self.marginal_outcome is None: @@ -1186,6 +1200,18 @@ def targeting_step(y, q_init, iptw, verbose): return log.params[0] # Returns single-step estimated Epsilon term + @staticmethod + def est_marginal_variance(haw, y_obs, y_pred, y_pred_targeted, psi): + doqg_psi_sq = (haw*(y_obs - y_pred) + y_pred_targeted - psi)**2 + var_est = np.mean(doqg_psi_sq) + return var_est + + @staticmethod + def est_conditional_variance(haw, y_obs, y_pred): + doqg_psi_sq = (haw*(y_obs - y_pred))**2 + var_est = np.mean(doqg_psi_sq) + return var_est + # Functions that all TMLEs can call are below def _tmle_unit_bounds_(y, mini, maxi, bound): From a046ecce37a99986e7a0716e25e908f7f890aa7e Mon Sep 17 00:00:00 2001 From: pzivich Date: Tue, 19 Nov 2019 08:26:20 -0500 Subject: [PATCH 26/42] adding variance divisor --- zepid/causal/doublyrobust/TMLE.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 0a37297..3d5867e 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -1146,13 +1146,13 @@ def fit(self, p, conditional=None, samples=100): # Marginal variance estimator # variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_, # y_pred_targeted=, psi=self.marginal_outcome) - # self.marginal_se = np.sqrt(variance_marginal) + # self.marginal_se = np.sqrt(variance_marginal) / np.sqrt(self.df.shape[0]) # self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se, # self.marginal_outcome + zalpha * self.marginal_se] # Conditional on W variance estimator variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_) - self.conditional_se = np.sqrt(variance_conditional) + self.conditional_se = np.sqrt(variance_conditional) / np.sqrt(self.df.shape[0]) self.conditional_ci = [self.marginal_outcome - zalpha * self.conditional_se, self.marginal_outcome + zalpha * self.conditional_se] @@ -1181,7 +1181,7 @@ def summary(self, decimal=3): # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal)) print('======================================================================') print('Conditional') - # print('95% CL: ', np.round(self.ci_cond, decimals=decimal)) + print('95% CL: ', np.round(self.conditional_ci, decimals=decimal)) print('======================================================================') @staticmethod From f51927c49666ff154d170b99a65163a2578e5855 Mon Sep 17 00:00:00 2001 From: pzivich Date: Tue, 19 Nov 2019 08:37:35 -0500 Subject: [PATCH 27/42] both variances for testing versus R --- zepid/causal/doublyrobust/TMLE.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 3d5867e..a2e115a 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -1106,6 +1106,7 @@ def fit(self, p, conditional=None, samples=100): # Step 6) Estimating psi q_star_list = [] + q_i_star_list = [] for i in range(samples): # Applying treatment plan df = self.df.copy() @@ -1122,7 +1123,8 @@ def fit(self, p, conditional=None, samples=100): # Targeted Estimate logit_qstar = np.log(probability_to_odds(y_star)) + epsilon # logit(Y^*) + e q_star = odds_to_probability(np.exp(logit_qstar)) # Y^* - q_star_list.append(np.mean(q_star)) # E[Y^*] + q_i_star_list.append(q_star) # Saving Y_i^* for marginal variance + q_star_list.append(np.mean(q_star)) # Saving E[Y^*] if self._continuous_outcome: self.marginals_vector = _tmle_unit_unbound_(np.array(q_star_list), @@ -1130,10 +1132,13 @@ def fit(self, p, conditional=None, samples=100): y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min, maxi=self._continuous_max)) yq0_ = _tmle_unit_unbound_(self._Qinit_, mini=self._continuous_min, maxi=self._continuous_max) + yqstar_ = _tmle_unit_unbound_(np.array(q_i_star_list), mini=self._continuous_min, maxi=self._continuous_max) + else: self.marginals_vector = q_star_list y_ = np.array(self.df[self.outcome]) yq0_ = self._Qinit_ + yqstar_ = np.array(q_i_star_list) self.marginal_outcome = np.mean(self.marginals_vector) @@ -1144,11 +1149,12 @@ def fit(self, p, conditional=None, samples=100): zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1) # Marginal variance estimator - # variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_, - # y_pred_targeted=, psi=self.marginal_outcome) - # self.marginal_se = np.sqrt(variance_marginal) / np.sqrt(self.df.shape[0]) - # self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se, - # self.marginal_outcome + zalpha * self.marginal_se] + variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_, + y_pred_targeted=np.mean(yqstar_, axis=0), + psi=self.marginal_outcome) + self.marginal_se = np.sqrt(variance_marginal) / np.sqrt(self.df.shape[0]) + self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se, + self.marginal_outcome + zalpha * self.marginal_se] # Conditional on W variance estimator variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_) @@ -1180,6 +1186,8 @@ def summary(self, decimal=3): print('Overall incidence: ', np.round(self.marginal_outcome, decimals=decimal)) # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal)) print('======================================================================') + print('Marginal') + print('95% CL: ', np.round(self.marginal_ci, decimals=decimal)) print('Conditional') print('95% CL: ', np.round(self.conditional_ci, decimals=decimal)) print('======================================================================') From c9c6a1b9ae1ee0688f019c24434bdab93d045663 Mon Sep 17 00:00:00 2001 From: pzivich Date: Mon, 25 Nov 2019 18:12:02 -0500 Subject: [PATCH 28/42] adding diagnostics option --- zepid/causal/doublyrobust/TMLE.py | 66 +++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index a2e115a..992d934 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -945,6 +945,7 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v self.outcome = outcome # Output attributes + self.epsilon = None self.marginals_vector = None self.marginal_outcome = None self.alpha = alpha @@ -959,6 +960,7 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v self._Qinit_ = None self._treatment_model = None self._g_model = None + self._specified_bound_ = None self._denominator_ = None self._verbose_ = verbose @@ -999,6 +1001,8 @@ def exposure_model(self, model, custom_model=None, bound=False): if bound: # Bounding predicted probabilities if requested self._denominator_ = _bounding_(self._denominator_, bounds=bound) + # self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0) + # ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0)) self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred) @@ -1053,8 +1057,8 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri # TODO need to create smart warning system self._out_model_custom = True data = patsy.dmatrix(model + ' - 1', self.df) - self._Qinit_ - self._outcome_model + # self._Qinit_ + # self._outcome_model if not bound: # Bounding predicted probabilities if requested bound = self._cb @@ -1102,7 +1106,8 @@ def fit(self, p, conditional=None, samples=100): haw = np.array(numerator / self._denominator_).astype(float) # Step 5) Estimating TMLE - epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_) + self.epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, + verbose=self._verbose_) # Step 6) Estimating psi q_star_list = [] @@ -1121,7 +1126,7 @@ def fit(self, p, conditional=None, samples=100): y_star = self._outcome_model.predict(df) # Targeted Estimate - logit_qstar = np.log(probability_to_odds(y_star)) + epsilon # logit(Y^*) + e + logit_qstar = np.log(probability_to_odds(y_star)) + self.epsilon # logit(Y^*) + e q_star = odds_to_probability(np.exp(logit_qstar)) # Y^* q_i_star_list.append(q_star) # Saving Y_i^* for marginal variance q_star_list.append(np.mean(q_star)) # Saving E[Y^*] @@ -1163,6 +1168,13 @@ def fit(self, p, conditional=None, samples=100): self.marginal_outcome + zalpha * self.conditional_se] def summary(self, decimal=3): + """Prints summary of the estimated incidence under the specified treatment plan + + Parameters + ---------- + decimal : int, optional + Number of decimal places to display. Default is 3 + """ if self.marginal_outcome is None: raise ValueError('The fit() statement must be ran before summary()') @@ -1171,25 +1183,45 @@ def summary(self, decimal=3): print('======================================================================') fmt = 'Treatment: {:<15} No. Observations: {:<20}' print(fmt.format(self.exposure, self.df.shape[0])) - fmt = 'Outcome: {:<15}' - print(fmt.format(self.outcome)) - fmt = 'Q-Model: {:<15}' - print(fmt.format('Logistic')) - # TODO add custom model check - fmt = 'g-Model: {:<15}' - print(fmt.format('Logistic')) + + fmt = 'Outcome: {:<15} No. Truncated: {:<20}' + if self._specified_bound_ is None: + b = 0 + else: + b = self._specified_bound_ + print(fmt.format(self.outcome, b)) + + fmt = 'Q-Model: {:<15} g-model: {:<20}' + print(fmt.format('Logistic', 'Logistic')) # TODO add treatment plan information # TODO add information on m (number of re-samples) print('======================================================================') print('Overall incidence: ', np.round(self.marginal_outcome, decimals=decimal)) - # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal)) print('======================================================================') - print('Marginal') - print('95% CL: ', np.round(self.marginal_ci, decimals=decimal)) - print('Conditional') - print('95% CL: ', np.round(self.conditional_ci, decimals=decimal)) + print('Marginal 95% CL: ', np.round(self.marginal_ci, decimals=decimal)) + print('Conditional 95% CL: ', np.round(self.conditional_ci, decimals=decimal)) + print('======================================================================') + + def run_diagnostics(self, decimal=3): + """Provides some summary diagnostics for `StochasticTMLE`. Diagnostics must be called are the fit() function is + called since weights are dependent on the specific treatment plan of interest + + Parameters + ---------- + decimal : int, optional + Number of decimal places to display. Default is 3 + """ + if self.marginal_outcome is None: + raise ValueError('The fit() statement must be ran before summary()') + + print('======================================================================') + print(' Diagnostics ') + print('======================================================================') + # TODO print epsilon + # TODO print outcome model accuracy + # TODO print distribution of weights / IPW summary print('======================================================================') @staticmethod @@ -1210,12 +1242,14 @@ def targeting_step(y, q_init, iptw, verbose): @staticmethod def est_marginal_variance(haw, y_obs, y_pred, y_pred_targeted, psi): + # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4965321/ doqg_psi_sq = (haw*(y_obs - y_pred) + y_pred_targeted - psi)**2 var_est = np.mean(doqg_psi_sq) return var_est @staticmethod def est_conditional_variance(haw, y_obs, y_pred): + # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4965321/ doqg_psi_sq = (haw*(y_obs - y_pred))**2 var_est = np.mean(doqg_psi_sq) return var_est From 22e85493d27baa28fb913a2eb5e6a829c5e63639 Mon Sep 17 00:00:00 2001 From: pzivich Date: Wed, 8 Jan 2020 13:37:53 -0500 Subject: [PATCH 29/42] CONFIRMED: psi must be transformed then averaged --- tests/test_doublyrobust.py | 14 ++++++++++ zepid/causal/doublyrobust/TMLE.py | 45 ++++++++++++++++--------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py index 2555d42..c31d219 100644 --- a/tests/test_doublyrobust.py +++ b/tests/test_doublyrobust.py @@ -412,6 +412,20 @@ def test_error_summary(self, df): with pytest.raises(ValueError): stmle.summary() + def test_warn_missing_data(self): + df = pd.DataFrame() + df['A'] = [1, 1, 0, 0, np.nan] + df['Y'] = [np.nan, 0, 1, 0, 1] + with pytest.warns(UserWarning): + StochasticTMLE(df=df, exposure='A', outcome='Y') + + def test_drop_missing_data(self): + df = pd.DataFrame() + df['A'] = [1, 1, 0, 0, np.nan] + df['Y'] = [np.nan, 0, 1, 0, 1] + stmle = StochasticTMLE(df=df, exposure='A', outcome='Y') + assert stmle.df.shape[0] == 3 + def test_continuous_processing(self): a_list = [0, 1, 1, 0, 1, 1, 0, 0] y_list = [1, -1, 5, 0, 0, 0, 10, -5] diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 992d934..0ab6ae3 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -884,7 +884,7 @@ class StochasticTMLE: Estimating TMLE for 0.2 being treated with ART - >>> tmle = TMLE(df, exposure='art', outcome='dead') + >>> tmle = StochasticTMLE(df, exposure='art', outcome='dead') >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') >>> tmle.fit(p=0.2) @@ -892,7 +892,7 @@ class StochasticTMLE: Estimating TMLE for conditional plan - >>> tmle = TMLE(df, exposure='art', outcome='dead') + >>> tmle = StochasticTMLE(df, exposure='art', outcome='dead') >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') >>> tmle.fit(p=[0.6, 0.4], conditional=["df['male']==1", "df['male']==0"]) @@ -902,7 +902,7 @@ class StochasticTMLE: >>> from sklearn.linear_model import LogisticRegression >>> log1 = LogisticRegression(penalty='l1', random_state=201) - >>> tmle = TMLE(df, 'art', 'dead') + >>> tmle = StochasticTMLE(df, 'art', 'dead') >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1) >>> tmle.outcome_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1) >>> tmle.fit(p=0.75) @@ -916,10 +916,10 @@ class StochasticTMLE: studies. Springer Science & Business Media, 2011. """ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, verbose=False): - # Going through missing data + # Dropping ALL missing data (currently doesn't allow for censored outcomes) if df.dropna().shape[0] != df.shape[0]: - warnings.warn("There is missing data that is not the outcome in the data set. StochasticTMLE will drop all " - "missing data. StochasticTMLE will fit " + warnings.warn("There is missing data in the data set. StochasticTMLE will drop all missing data. " + "StochasticTMLE will fit " + str(df.dropna().shape[0]) + ' of ' + str(df.shape[0]) + ' observations', UserWarning) self.df = df.copy().dropna().reset_index() @@ -960,6 +960,7 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v self._Qinit_ = None self._treatment_model = None self._g_model = None + self._resamples_ = None self._specified_bound_ = None self._denominator_ = None self._verbose_ = verbose @@ -1001,8 +1002,8 @@ def exposure_model(self, model, custom_model=None, bound=False): if bound: # Bounding predicted probabilities if requested self._denominator_ = _bounding_(self._denominator_, bounds=bound) - # self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0) - # ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0)) + self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0) + ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0)) self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred) @@ -1028,7 +1029,6 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri """ self._q_model = self.outcome + ' ~ ' + model - # Step 1) Prediction for Q (estimation of Q-model) if custom_model is None: # Standard parametric regression self._continuous_type = continuous_distribution if self._continuous_outcome: @@ -1057,6 +1057,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri # TODO need to create smart warning system self._out_model_custom = True data = patsy.dmatrix(model + ' - 1', self.df) + # TODO machine learning predictions # self._Qinit_ # self._outcome_model @@ -1080,7 +1081,7 @@ def fit(self, p, conditional=None, samples=100): `average _treatment_effect` for continuous outcomes """ # TODO add a seed - # Error checking + if self._denominator_ is None: raise ValueError("The exposure_model() function must be specified before the fit() function") if self._Qinit_ is None: @@ -1093,7 +1094,7 @@ def fit(self, p, conditional=None, samples=100): if len(p) != len(conditional): raise ValueError("'p' and 'conditional' must be the same length") - # Step 4) Calculating clever covariate (HAW) + # Step 1) Calculating clever covariate (HAW) if conditional is None: numerator = np.where(self.df[self.exposure] == 1, p, 1 - p) else: @@ -1105,13 +1106,17 @@ def fit(self, p, conditional=None, samples=100): haw = np.array(numerator / self._denominator_).astype(float) - # Step 5) Estimating TMLE + # Step 2) Estimate from Q-model + # process completed in outcome_model() function and stored in self._Qinit_ + + # Step 3) Target parameter TMLE self.epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_) - # Step 6) Estimating psi + # Step 4) Monte-Carlo Integration procedure q_star_list = [] q_i_star_list = [] + self._resamples_ = samples for i in range(samples): # Applying treatment plan df = self.df.copy() @@ -1147,11 +1152,8 @@ def fit(self, p, conditional=None, samples=100): self.marginal_outcome = np.mean(self.marginals_vector) - # Step 7) Estimating Var(psi) - if self.alpha == 0.05: # Without this, won't match R exactly. R relies on 1.96, while I use SciPy usually - zalpha = 1.96 - else: - zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1) + # Step 5) Estimating Var(psi) + zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1) # Marginal variance estimator variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_, @@ -1161,7 +1163,7 @@ def fit(self, p, conditional=None, samples=100): self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se, self.marginal_outcome + zalpha * self.marginal_se] - # Conditional on W variance estimator + # Conditional on W variance estimator (not generally recommended but I need it for other work) variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_) self.conditional_se = np.sqrt(variance_conditional) / np.sqrt(self.df.shape[0]) self.conditional_ci = [self.marginal_outcome - zalpha * self.conditional_se, @@ -1183,19 +1185,18 @@ def summary(self, decimal=3): print('======================================================================') fmt = 'Treatment: {:<15} No. Observations: {:<20}' print(fmt.format(self.exposure, self.df.shape[0])) - fmt = 'Outcome: {:<15} No. Truncated: {:<20}' if self._specified_bound_ is None: b = 0 else: b = self._specified_bound_ print(fmt.format(self.outcome, b)) - fmt = 'Q-Model: {:<15} g-model: {:<20}' print(fmt.format('Logistic', 'Logistic')) + fmt = 'No. Resamples: {:<15}' + print(fmt.format(self._resamples_)) # TODO add treatment plan information - # TODO add information on m (number of re-samples) print('======================================================================') print('Overall incidence: ', np.round(self.marginal_outcome, decimals=decimal)) From 5b4ad08f32783162cdf879ce171cdb4ad6267ce9 Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 9 Jan 2020 11:40:32 -0500 Subject: [PATCH 30/42] updating run_diagnostics --- zepid/causal/doublyrobust/TMLE.py | 54 +++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 0ab6ae3..c3173aa 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -1001,9 +1001,9 @@ def exposure_model(self, model, custom_model=None, bound=False): ml_model=custom_model, print_results=self._verbose_) if bound: # Bounding predicted probabilities if requested - self._denominator_ = _bounding_(self._denominator_, bounds=bound) - self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0) - ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0)) + pred = _bounding_(pred, bounds=bound) + self._specified_bound_ = np.sum(np.where(pred == bound, 1, 0) + ) + np.sum(np.where(pred == 1 / bound, 1, 0)) self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred) @@ -1220,11 +1220,53 @@ def run_diagnostics(self, decimal=3): print('======================================================================') print(' Diagnostics ') print('======================================================================') - # TODO print epsilon - # TODO print outcome model accuracy - # TODO print distribution of weights / IPW summary + print(' Natural Course Prediction Accuracy ') + value = self.df[self.outcome] - self._Qinit_ + fmt = 'Mean: {:<20}' + print(fmt.format(np.round(np.mean(value), decimals=decimal))) + fmt = 'Standard Deviation: {:<20}' + print(fmt.format(np.round(np.std(value, ddof=1), decimals=decimal))) + fmt = 'Median: {:<20}' + print(fmt.format(np.round(np.median(value), decimals=decimal))) + fmt = 'Minimum value: {:<20}' + print(fmt.format(np.round(np.min(value), decimals=decimal))) + fmt = 'Maximum value: {:<20}' + print(fmt.format(np.round(np.max(value), decimals=decimal))) + + print('----------------------------------------------------------------------') + print(' Inverse Probability Weight Denominator ') + w = 1 / self._denominator_ + fmt = 'Mean (expected=2): {:<20}' + print(fmt.format(np.round(np.mean(w), decimals=decimal))) + fmt = 'Standard Deviation: {:<20}' + print(fmt.format(np.round(np.std(w, ddof=1), decimals=decimal))) + fmt = 'Minimum value: {:<20}' + print(fmt.format(np.round(np.min(w), decimals=decimal))) + fmt = 'Maximum value: {:<20}' + print(fmt.format(np.round(np.max(w), decimals=decimal))) + + print('----------------------------------------------------------------------') + print(' Targeting Step ') + fmt = 'Epsilon: {:<20}' + print(fmt.format(np.round(self.epsilon, decimals=decimal))) + print('======================================================================') + # Generating plots + df = self.df.copy() + df['_g1_'] = np.where(df[self.exposure] == 1, self._denominator_, 1 - self._denominator_) + + plt.figure(figsize=[8, 4]) + plt.subplot(121) + plot_kde(df=df, treatment=self.exposure, probability='_g1_') + plt.title("Kernel Density of Propensity Scores") + + plt.subplot(122) + plot_kde_accuracy(values=value, color='green') + plt.title("Kernel Density of Accuracy") + plt.tight_layout() + plt.show() + @staticmethod def targeting_step(y, q_init, iptw, verbose): f = sm.families.family.Binomial() From edfcad1eb96b7f75cea30d91c08ab19b9c37edf4 Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 9 Jan 2020 11:40:55 -0500 Subject: [PATCH 31/42] adding reference pages for StochasticTMLE --- docs/Reference/Causal.rst | 1 + ....causal.doublyrobust.TMLE.StochasticTMLE.rst | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst diff --git a/docs/Reference/Causal.rst b/docs/Reference/Causal.rst index 5c421bf..d30b44d 100644 --- a/docs/Reference/Causal.rst +++ b/docs/Reference/Causal.rst @@ -69,6 +69,7 @@ Targeted Maximum Likelihood Estimator :toctree: generated/ TMLE + StochasticTMLE G-estimation of SNM ------------------- diff --git a/docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst b/docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst new file mode 100644 index 0000000..9428a15 --- /dev/null +++ b/docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst @@ -0,0 +1,17 @@ +zepid.causal.doublyrobust.TMLE.StochastsicTMLE +============================================== + +.. currentmodule:: zepid.causal.doublyrobust.StochasticTMLE + +.. autoclass:: StochasticTMLE + :members: + + .. rubric:: Methods + + .. autosummary:: + + ~TMLE.exposure_model + ~TMLE.outcome_model + ~TMLE.fit + ~TMLE.summary + ~TMLE.run_diagnostics From 08c1dcd6eb82737fc1ca6310b0ac02b7f94ed14f Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 9 Jan 2020 14:44:32 -0500 Subject: [PATCH 32/42] adding support for machine-learning estimators in StochasticTMLE --- zepid/causal/doublyrobust/TMLE.py | 28 +++++++++----- zepid/causal/utils.py | 64 ++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index c3173aa..fcdd23f 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -8,9 +8,9 @@ from zepid.causal.utils import propensity_score, stochastic_check_conditional from zepid.calc import probability_to_odds, odds_to_probability -from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, missing_machine_learner, _bounding_, - plot_kde, plot_love, standardized_mean_differences, positivity, - plot_kde_accuracy, outcome_accuracy) +from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, stochastic_outcome_machine_learner, + stochastic_outcome_predict, missing_machine_learner, _bounding_, plot_kde, plot_love, + standardized_mean_differences, positivity, plot_kde_accuracy, outcome_accuracy) class TMLE: @@ -1001,9 +1001,8 @@ def exposure_model(self, model, custom_model=None, bound=False): ml_model=custom_model, print_results=self._verbose_) if bound: # Bounding predicted probabilities if requested - pred = _bounding_(pred, bounds=bound) - self._specified_bound_ = np.sum(np.where(pred == bound, 1, 0) - ) + np.sum(np.where(pred == 1 / bound, 1, 0)) + pred2 = _bounding_(pred, bounds=bound) + self._specified_bound_ = np.sum(np.where(pred2 == pred, 0, 1)) self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred) @@ -1057,9 +1056,12 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri # TODO need to create smart warning system self._out_model_custom = True data = patsy.dmatrix(model + ' - 1', self.df) - # TODO machine learning predictions - # self._Qinit_ - # self._outcome_model + output = stochastic_outcome_machine_learner(xdata=np.asarray(data), + ydata=np.asarray(self.df[self.outcome]), + ml_model=custom_model, + continuous=self._continuous_outcome, + print_results=self._verbose_) + self._Qinit_, self._outcome_model = output if not bound: # Bounding predicted probabilities if requested bound = self._cb @@ -1128,7 +1130,13 @@ def fit(self, p, conditional=None, samples=100): df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0]) # Outcome model under treatment plan - y_star = self._outcome_model.predict(df) + if self._out_model_custom : + data_star = patsy.dmatrix(self._q_model + ' - 1', self.df) + y_star = stochastic_outcome_predict(xdata=data_star, + fit_ml_model=self._outcome_model, + continuous=self._continuous_outcome) + else: + y_star = self._outcome_model.predict(df) # Targeted Estimate logit_qstar = np.log(probability_to_odds(y_star)) + self.epsilon # logit(Y^*) + e diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py index 29f031c..9e99cf0 100644 --- a/zepid/causal/utils.py +++ b/zepid/causal/utils.py @@ -79,7 +79,7 @@ def exposure_machine_learner(xdata, ydata, ml_model, print_results=True): def outcome_machine_learner(xdata, ydata, all_a, none_a, ml_model, continuous, print_results=True): """Function to fit machine learning predictions. Used by TMLE to generate predicted probabilities of outcome - (i.e. Pr(Y=1 | A=1, L) and Pr(Y=1 | A=0, L)). Future update will include continuous Y functionality (i.e. E(Y)) + (i.e. Pr(Y=1 | A=1, L) and Pr(Y=1 | A=0, L)). """ # Trying to fit Machine Learning model try: @@ -114,6 +114,68 @@ def outcome_machine_learner(xdata, ydata, all_a, none_a, ml_model, continuous, p raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute") +def stochastic_outcome_machine_learner(xdata, ydata, ml_model, continuous, print_results=True): + """Function to fit machine learning predictions. Used by StochasticTMLE to generate predicted probabilities of + outcome (i.e. Pr(Y=1 | A, L) + """ + # Trying to fit Machine Learning model + try: + fm = ml_model.fit(X=xdata, y=ydata) + except TypeError: + raise TypeError("Currently custom_model must have the 'fit' function with arguments 'X', 'y'. This " + "covers both sklearn and supylearner. If there is a predictive model you would " + "like to use, please open an issue at https://github.com/pzivich/zepid and I " + "can work on adding support") + if print_results and hasattr(fm, 'summarize'): # Nice summarize option from SuPyLearner + fm.summarize() + + # Generating predictions + if continuous: + if hasattr(fm, 'predict'): + qa = fm.predict(xdata) + return qa, fm + else: + raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute") + + else: + if hasattr(fm, 'predict_proba'): + qa = fm.predict_proba(xdata) + if qa.ndim == 1: # Allows for PyGAM + return qa, fm + else: + return qa[:, 1], fm + elif hasattr(fm, 'predict'): + qa = fm.predict(xdata) + return qa, fm + else: + raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute") + + +def stochastic_outcome_predict(xdata, fit_ml_model, continuous): + """Function to generate predictions machine learning predictions. Used by StochasticTMLE to generate predicted + probabilities of outcome (i.e. Pr(Y=1 | A=a*, L) in the Monte-Carlo integration procedure + """ + # Generating predictions + if continuous: + if hasattr(fit_ml_model, 'predict'): + qa_star = fit_ml_model.predict(xdata) + return qa_star + else: + raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute") + else: + if hasattr(fit_ml_model, 'predict_proba'): + qa_star = fit_ml_model.predict_proba(xdata) + if qa_star.ndim == 1: # Allows for PyGAM + return qa_star + else: + return qa_star[:, 1] + elif hasattr(fit_ml_model, 'predict'): + qa_star = fit_ml_model.predict(xdata) + return qa_star + else: + raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute") + + def missing_machine_learner(xdata, mdata, all_a, none_a, ml_model, print_results=True): """Function to fit machine learning predictions. Used by TMLE to generate predicted probabilities of missing outcome data, Pr(M=1|A,L) From 9b6508db5daff4c87b91a18908514c57d7eae259 Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 9 Jan 2020 14:59:20 -0500 Subject: [PATCH 33/42] fixing error in patsy data parsing for ML --- zepid/causal/doublyrobust/TMLE.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index fcdd23f..387903a 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -964,6 +964,9 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v self._specified_bound_ = None self._denominator_ = None self._verbose_ = verbose + self._out_model_custom = False + self._exp_model_custom = False + self._continuous_type = None # Custom model / machine learner storage self._g_custom_ = None @@ -1029,6 +1032,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri self._q_model = self.outcome + ' ~ ' + model if custom_model is None: # Standard parametric regression + self._out_model_custom = False self._continuous_type = continuous_distribution if self._continuous_outcome: if (continuous_distribution.lower() == 'gaussian') or (continuous_distribution.lower() == 'normal'): @@ -1130,8 +1134,8 @@ def fit(self, p, conditional=None, samples=100): df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0]) # Outcome model under treatment plan - if self._out_model_custom : - data_star = patsy.dmatrix(self._q_model + ' - 1', self.df) + if self._out_model_custom: + _, data_star = patsy.dmatrices(self._q_model + ' - 1', self.df) y_star = stochastic_outcome_predict(xdata=data_star, fit_ml_model=self._outcome_model, continuous=self._continuous_outcome) From 1987dcc9bb389d2955efb831eca44dadd8c8ad7d Mon Sep 17 00:00:00 2001 From: pzivich Date: Thu, 9 Jan 2020 16:53:04 -0500 Subject: [PATCH 34/42] seed and checking documentation --- zepid/causal/doublyrobust/TMLE.py | 40 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index 387903a..a0c8698 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -851,7 +851,7 @@ class StochasticTMLE: .. math:: - H(A=a, L) = \frac{I(A=a) p}{\widehat{\Pr}(A=a)} + H = \frac{p}{\widehat{\Pr}(A=a)} where `p` is the probability of treatment `a` under the stochastic intervention of interest. @@ -912,7 +912,7 @@ class StochasticTMLE: Muñoz ID, and Van Der Laan MJ. Population intervention causal effects based on stochastic interventions. Biometrics 68.2 (2012): 541-549. - Van der Laan MJ, and Sherri R. Targeted learning in data science: causal inference for complex longitudinal + van der Laan MJ, and Sherri R. Targeted learning in data science: causal inference for complex longitudinal studies. Springer Science & Business Media, 2011. """ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, verbose=False): @@ -973,7 +973,8 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v self._q_custom_ = None def exposure_model(self, model, custom_model=None, bound=False): - """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature + """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature. This value is used as the denominator + for the inverse probability weights. Parameters ---------- @@ -997,7 +998,7 @@ def exposure_model(self, model, custom_model=None, bound=False): fitmodel = propensity_score(self.df, self._g_model, print_results=self._verbose_) pred = fitmodel.predict(self.df) else: # User-specified prediction model - # TODO need to create smart warning system + # TODO need to create smart warning system -- Issue #124 self._exp_model_custom = True data = patsy.dmatrix(model + ' - 1', self.df) pred = exposure_machine_learner(xdata=np.asarray(data), ydata=np.asarray(self.df[self.exposure]), @@ -1043,11 +1044,9 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous " "outcomes") self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit() - else: f = sm.families.family.Binomial() self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit() - if self._verbose_: print('==============================================================================') print('Q-model') @@ -1057,7 +1056,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri self._Qinit_ = self._outcome_model.predict(self.df) else: # User-specified model - # TODO need to create smart warning system + # TODO need to create smart warning system -- Issue #124 self._out_model_custom = True data = patsy.dmatrix(model + ' - 1', self.df) output = stochastic_outcome_machine_learner(xdata=np.asarray(data), @@ -1073,26 +1072,40 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri # This bounding step prevents continuous outcomes from being outside the range self._Qinit_ = _bounding_(self._Qinit_, bounds=bound) - def fit(self, p, conditional=None, samples=100): + def fit(self, p, conditional=None, samples=100, seed=None): """Calculate the effect from the predicted exposure probabilities and predicted outcome values using the TMLE procedure. Confidence intervals are calculated using influence curves. + Parameters + ---------- + p : float, list, tuple + Proportion that correspond to the number of persons treated (all values must be between 0.0 and 1.0). If + conditional is specified, p must be a list/tuple of floats of the same length + conditional : None, list, tuple, optional + A + samples : int, optional + Number of samples to use for the Monte Carlo integration procedure + seed : None, int, optional + Seed for the Monte Carlo integration procedure + Note ---- Exposure and outcome models must be specified prior to `fit()` Returns ------- - TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and - `average _treatment_effect` for continuous outcomes + `StochasticTMLE` gains `marginal_vector` and `marginal_outcome` along with `marginal_ci` """ - # TODO add a seed - if self._denominator_ is None: raise ValueError("The exposure_model() function must be specified before the fit() function") if self._Qinit_ is None: raise ValueError("The outcome_model() function must be specified before the fit() function") + if seed is None: + pass + else: + np.random.seed(seed) + p = np.array(p) if np.any(p > 1) or np.any(p < 0): raise ValueError("All specified treatment probabilities must be between 0 and 1") @@ -1195,6 +1208,7 @@ def summary(self, decimal=3): print('======================================================================') print(' Stochastic Targeted Maximum Likelihood Estimator ') print('======================================================================') + fmt = 'Treatment: {:<15} No. Observations: {:<20}' print(fmt.format(self.exposure, self.df.shape[0])) fmt = 'Outcome: {:<15} No. Truncated: {:<20}' @@ -1208,8 +1222,6 @@ def summary(self, decimal=3): fmt = 'No. Resamples: {:<15}' print(fmt.format(self._resamples_)) - # TODO add treatment plan information - print('======================================================================') print('Overall incidence: ', np.round(self.marginal_outcome, decimals=decimal)) print('======================================================================') From 83b769e121aa8b07b0f2f52286a6068812241896 Mon Sep 17 00:00:00 2001 From: pzivich Date: Fri, 10 Jan 2020 11:38:50 -0500 Subject: [PATCH 35/42] updating ML model specification --- tests/test_doublyrobust.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py index c31d219..a1e5114 100644 --- a/tests/test_doublyrobust.py +++ b/tests/test_doublyrobust.py @@ -240,7 +240,7 @@ def test_match_r_continuous_poisson(self, cf): npt.assert_allclose(tmle.average_treatment_effect_ci, r_ci, rtol=1e-3) def test_sklearn_in_tmle(self, df): - log = LogisticRegression(C=1.0) + log = LogisticRegression(C=1.0, solver='liblinear') tmle = TMLE(df, exposure='art', outcome='dead') tmle.exposure_model('male + age0 + cd40 + dvl0', custom_model=log) tmle.outcome_model('art + male + age0 + cd40 + dvl0', custom_model=log) @@ -257,7 +257,7 @@ def test_sklearn_in_tmle(self, df): npt.assert_allclose(tmle.odds_ratio_ci, [0.2139277755, 0.944971255], rtol=1e-5) def test_sklearn_in_tmle2(self, cf): - log = LogisticRegression(C=1.0) + log = LogisticRegression(C=1.0, solver='liblinear') lin = LinearRegression() tmle = TMLE(cf, exposure='art', outcome='cd4_wk45') tmle.exposure_model('male + age0 + cd40 + dvl0', custom_model=log) @@ -317,7 +317,7 @@ def test_missing_continuous_outcome(self, mcf): npt.assert_allclose(tmle.average_treatment_effect_ci, r_ci, rtol=1e-3) def test_sklearn_in_tmle_missing(self, mf): - log = LogisticRegression(C=1.0) + log = LogisticRegression(C=1.0, solver='liblinear') tmle = TMLE(mf, exposure='art', outcome='dead') tmle.exposure_model('male + age0 + cd40 + dvl0', custom_model=log, print_results=False) tmle.missing_model('male + age0 + cd40 + dvl0', custom_model=log, print_results=False) @@ -525,6 +525,15 @@ def test_compare_tmle(self, df): # TODO compare to R in several versions + def test_machine_learning_runs(self, df): + # Only verifies that machine learning doesn't throw an error + log = LogisticRegression(penalty='l1', solver='liblinear', random_state=201) + + tmle = StochasticTMLE(df, exposure='art', outcome='dead') + tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0 + male:dvl0', custom_model=log) + tmle.outcome_model('art + male + age0 + dvl0 + cd40', custom_model=log) + tmle.fit(p=0.4, samples=20) + class TestAIPTW: From 03106795259d2e5847b54dfe7165b019a167a4a9 Mon Sep 17 00:00:00 2001 From: pzivich Date: Fri, 10 Jan 2020 12:37:03 -0500 Subject: [PATCH 36/42] updating StochasticTMLE tests --- tests/test_doublyrobust.py | 66 +++++++++++++++++++++++++++++-- zepid/causal/doublyrobust/TMLE.py | 1 + 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py index a1e5114..185be4c 100644 --- a/tests/test_doublyrobust.py +++ b/tests/test_doublyrobust.py @@ -503,7 +503,7 @@ def test_gmodel_params(self, simple_df): npt.assert_allclose(sas_preds, est_preds, atol=1e-6) - def test_compare_tmle(self, df): + def test_compare_tmle_binary(self, df): stmle = StochasticTMLE(df, exposure='art', outcome='dead') stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') @@ -521,9 +521,69 @@ def test_compare_tmle(self, df): npt.assert_allclose(expected, all_treat - non_treat, atol=1e-4) - # TODO check bounding + def test_compare_tmle_continuous(self, cf): + cf['cd4_wk45'] = np.log(cf['cd4_wk45']) + stmle = StochasticTMLE(cf, exposure='art', outcome='cd4_wk45') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.fit(p=1.0, samples=1) + all_treat = stmle.marginal_outcome + stmle.fit(p=0.0, samples=1) + non_treat = stmle.marginal_outcome + + tmle = TMLE(cf, exposure='art', outcome='cd4_wk45') + tmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False) + tmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', + print_results=False) + tmle.fit() + expected = tmle.average_treatment_effect + + npt.assert_allclose(expected, all_treat - non_treat, atol=1e-3) + + def test_qmodel_bound(self, simple_df): + # Comparing to SAS logit model + sas_params = [-1.0699, -0.9525, 1.5462] + sas_preds = [0.3831332, 0.2554221, 0.2, 0.2554221, 0.6, 0.6, 0.2, 0.2554221, 0.3831332] + + stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='Y') + stmle.outcome_model('A + W', bound=[0.2, 0.6]) + est_params = stmle._outcome_model.params + est_preds = stmle._Qinit_ + + npt.assert_allclose(sas_params, est_params, atol=1e-4) + npt.assert_allclose(sas_preds, est_preds, atol=1e-6) + + def test_gmodel_bound(self, simple_df): + # Comparing to SAS Poisson model + sas_preds = [2, 1/0.55, 1/0.45, 1/0.55, 2, 2, 1/0.45, 1/0.55, 2] + + stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C') + stmle.exposure_model('W', bound=[0.45, 0.55]) + est_preds = 1 / stmle._denominator_ + + npt.assert_allclose(sas_preds, est_preds, atol=1e-6) + + def test_calculate_epsilon1(self, df): + stmle = StochasticTMLE(df, exposure='art', outcome='dead') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + dvl0 + cd40 + cd4_rs1 + cd4_rs2') + + stmle.fit(p=0.15, samples=1) + npt.assert_allclose(-0.0157043107, stmle.epsilon, atol=1e-6) + + stmle.fit(p=0.4, samples=1) + npt.assert_allclose(-0.0381559025, stmle.epsilon, atol=1e-6) + + def test_calculate_epsilon2(self, cf): + stmle = StochasticTMLE(cf, exposure='art', outcome='cd4_wk45') + stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0') + stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + dvl0 + cd40 + cd4_rs1 + cd4_rs2') + + stmle.fit(p=0.15, samples=1) + npt.assert_allclose(-0.0059476590, stmle.epsilon, atol=1e-6) - # TODO compare to R in several versions + stmle.fit(p=0.4, samples=1) + npt.assert_allclose(-0.0154923643, stmle.epsilon, atol=1e-6) def test_machine_learning_runs(self, df): # Only verifies that machine learning doesn't throw an error diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py index a0c8698..31a379a 100644 --- a/zepid/causal/doublyrobust/TMLE.py +++ b/zepid/causal/doublyrobust/TMLE.py @@ -1007,6 +1007,7 @@ def exposure_model(self, model, custom_model=None, bound=False): if bound: # Bounding predicted probabilities if requested pred2 = _bounding_(pred, bounds=bound) self._specified_bound_ = np.sum(np.where(pred2 == pred, 0, 1)) + pred = pred2 self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred) From ab6c6b3634ea18b24d3466a6fde6068ca4a9d320 Mon Sep 17 00:00:00 2001 From: pzivich Date: Sun, 5 Jul 2020 12:10:30 -0400 Subject: [PATCH 37/42] updating based on review --- zepid/causal/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py index 9e99cf0..fb13755 100644 --- a/zepid/causal/utils.py +++ b/zepid/causal/utils.py @@ -676,10 +676,10 @@ def plot_kde_accuracy(values, bw_method='scott', fill=True, color='b'): def stochastic_check_conditional(df, conditional): """Check that conditionals are exclusive for the stochastic fit process. Generates a warning if not true """ - a = np.array([0] * df.shape[0]) + a = np.zeros(df.shape[0]) for c in conditional: - a = np.add(a, np.where(eval(c), 1, 0)) + a = a + np.where(eval(c), 1, 0) - if np.sum(np.where(a > 1, 1, 0)): + if np.any(a > 1): warnings.warn("It looks like your conditional categories are NOT exclusive. For appropriate estimation, " "the conditions that designate each category should be exclusive", UserWarning) From a4243b060eb95bcc8cf55d3f38027667ef52a0f8 Mon Sep 17 00:00:00 2001 From: pzivich Date: Sun, 12 Jul 2020 14:19:53 -0400 Subject: [PATCH 38/42] adding error with the AIPSW weights since seems to be underlying issue --- tests/test_generalize.py | 36 +++++++++++++-------------- zepid/causal/generalize/estimators.py | 12 +++++---- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/tests/test_generalize.py b/tests/test_generalize.py index a4d277e..35cecf1 100644 --- a/tests/test_generalize.py +++ b/tests/test_generalize.py @@ -18,7 +18,7 @@ def df_r(): def df_c(): df = ze.load_generalize_data(True) df['W_sq'] = df['W'] ** 2 - df['weight'] = 3 + df['weight'] = 2 return df @@ -195,20 +195,20 @@ def test_transport_conf(self, df_c): npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5) npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4) - def test_generalize_weight(self, df_c): - aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight') - aipw.sampling_model('L + W_sq', print_results=False) - aipw.treatment_model('L', print_results=False) - aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) - aipw.fit() - npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5) - npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4) - - def test_transport_weight(self, df_c): - aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight') - aipw.sampling_model('L + W_sq', print_results=False) - aipw.treatment_model('L', print_results=False) - aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) - aipw.fit() - npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5) - npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4) + # def test_generalize_weight(self, df_c): + # aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight') + # aipw.sampling_model('L + W_sq', print_results=False) + # aipw.treatment_model('L', print_results=False) + # aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) + # aipw.fit() + # npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5) + # npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4) + + # def test_transport_weight(self, df_c): + # aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight') + # aipw.sampling_model('L + W_sq', print_results=False) + # aipw.treatment_model('L', print_results=False) + # aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False) + # aipw.fit() + # npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5) + # npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4) diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py index 7c3a1cc..d156b03 100644 --- a/zepid/causal/generalize/estimators.py +++ b/zepid/causal/generalize/estimators.py @@ -601,6 +601,8 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No self.exposure = exposure self.outcome = outcome self.selection = selection + if weights is not None: + raise ValueError("AIPSW is not stable with `weights`. This functionality is currently unavailable") self.weight = weights self.ipsw = None @@ -732,11 +734,11 @@ def outcome_model(self, model, outcome_type='binary', print_results=True): print(self._outcome_model.summary()) dfa = self.df.copy() - dfn = self.df.copy() dfa[self.exposure] = 1 - dfn[self.exposure] = 0 - self._YA1 = self._outcome_model.predict(dfa) + + dfn = self.df.copy() + dfn[self.exposure] = 0 self._YA0 = self._outcome_model.predict(dfn) def fit(self): @@ -748,9 +750,9 @@ def fit(self): the exposure-outcome relationship based on the data and IPSW model """ if not self._denominator_model: - raise ValueError('The weight_model() function must be specified before effect measure estimation') + raise ValueError('sampling_model() function must be specified before effect measure estimation') if not self._outcome_model: - raise ValueError('The outcome_model() function must be specified before effect measure estimation') + raise ValueError('outcome_model() function must be specified before effect measure estimation') if self.weight is not None: if self.iptw is None: From 043cbd9bd1339283957358aa47d83b18e3c77c77 Mon Sep 17 00:00:00 2001 From: pzivich Date: Sun, 12 Jul 2020 14:22:00 -0400 Subject: [PATCH 39/42] updating the travisci 3.7 call: --- .travis.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index f086f5b..4e14b95 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,12 +3,7 @@ language: python python: - "3.5" - "3.6" - -matrix: - include: - - python: 3.7 - dist: xenial - sudo: true + - "3.7" notifications: email: false From dff5c34b4d13f7b5e39fd838b2f88b7315c633bb Mon Sep 17 00:00:00 2001 From: pzivich Date: Sun, 12 Jul 2020 14:23:09 -0400 Subject: [PATCH 40/42] constrating numpy in travisci for python 3.5 --- travis-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis-requirements.txt b/travis-requirements.txt index 32a3c3a..b6ac319 100644 --- a/travis-requirements.txt +++ b/travis-requirements.txt @@ -1,4 +1,4 @@ -numpy +numpy<=1.18.5 pandas>=0.18 statsmodels>=0.7.0 matplotlib>=2.1.0 From bc6e6d00317a186a3ca9688573b5f6dcd304beca Mon Sep 17 00:00:00 2001 From: pzivich Date: Sun, 12 Jul 2020 14:31:29 -0400 Subject: [PATCH 41/42] scipy version restrictions --- travis-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis-requirements.txt b/travis-requirements.txt index b6ac319..2fe235e 100644 --- a/travis-requirements.txt +++ b/travis-requirements.txt @@ -2,7 +2,7 @@ numpy<=1.18.5 pandas>=0.18 statsmodels>=0.7.0 matplotlib>=2.1.0 -scipy +scipy<=1.0.1 tabulate patsy sklearn From 1d0076c349810799dec66c45673ccb02fc188ec8 Mon Sep 17 00:00:00 2001 From: pzivich Date: Sun, 12 Jul 2020 14:38:58 -0400 Subject: [PATCH 42/42] another attempt to get travis ci working for 3.5 and 3.7 --- .travis.yml | 5 +---- travis-requirements.txt | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4e14b95..27c8426 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,12 +8,9 @@ python: notifications: email: false -before_install: - - "pip install -U pip" - - "python setup.py install" - install: - pip install -r travis-requirements.txt + - python setup.py install script: - pytest \ No newline at end of file diff --git a/travis-requirements.txt b/travis-requirements.txt index 2fe235e..666490f 100644 --- a/travis-requirements.txt +++ b/travis-requirements.txt @@ -2,7 +2,7 @@ numpy<=1.18.5 pandas>=0.18 statsmodels>=0.7.0 matplotlib>=2.1.0 -scipy<=1.0.1 +scipy<=1.4.1 tabulate patsy sklearn