From 3ac849d7350d4dd9e8f0e13f9974de3cdb59837e Mon Sep 17 00:00:00 2001 From: pzivich Date: Wed, 17 Jul 2019 13:01:07 -0400 Subject: [PATCH] Adding missing_model() to GEstimationSNM --- ...causal.snm.g_estimation.GEstimationSNM.rst | 1 + tests/test_snm.py | 51 +++++++ zepid/causal/snm/g_estimation.py | 142 ++++++++++++++++-- 3 files changed, 178 insertions(+), 16 deletions(-) diff --git a/docs/Reference/generated/zepid.causal.snm.g_estimation.GEstimationSNM.rst b/docs/Reference/generated/zepid.causal.snm.g_estimation.GEstimationSNM.rst index b131571..57d0964 100644 --- a/docs/Reference/generated/zepid.causal.snm.g_estimation.GEstimationSNM.rst +++ b/docs/Reference/generated/zepid.causal.snm.g_estimation.GEstimationSNM.rst @@ -12,5 +12,6 @@ zepid.causal.snm.g\_estimation.GEstimationSNM ~GEstimationSNM.exposure_model ~GEstimationSNM.structural_nested_model + ~GEstimationSNM.missing_model ~GEstimationSNM.fit ~GEstimationSNM.summary diff --git a/tests/test_snm.py b/tests/test_snm.py index 934816e..7b68c21 100644 --- a/tests/test_snm.py +++ b/tests/test_snm.py @@ -157,6 +157,57 @@ def test_weighted_model(self): npt.assert_allclose(snm.psi, [244.379181], atol=1e-5) + def test_missing_model(self): + df = load_sample_data(False).drop(columns=['dead']) + df['age_sq'] = df['age0'] ** 2 + df['age_cu'] = df['age0'] ** 3 + df['cd4_sq'] = df['cd40'] ** 2 + df['cd4_cu'] = df['cd40'] ** 3 + + snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45') + snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False) + snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', + stabilized=False, print_results=False) + snm.structural_nested_model('art') + snm.fit() + + npt.assert_allclose(snm.psi, [244.379181], atol=1e-5) + + snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45') + snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False) + snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', + stabilized=False, print_results=False) + snm.structural_nested_model('art') + snm.fit(solver='search') + + npt.assert_allclose(snm.psi, [244.379181], atol=1e-5) + + def test_missing_w_weights(self): + df = load_sample_data(False).drop(columns=['dead']) + df['age_sq'] = df['age0'] ** 2 + df['age_cu'] = df['age0'] ** 3 + df['cd4_sq'] = df['cd40'] ** 2 + df['cd4_cu'] = df['cd40'] ** 3 + df['weight'] = 2 + + snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45', weights='weight') + snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False) + snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', + stabilized=False, print_results=False) + snm.structural_nested_model('art') + snm.fit() + + npt.assert_allclose(snm.psi, [244.379181], atol=1e-5) + + snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45') + snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False) + snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', + stabilized=False, print_results=False) + snm.structural_nested_model('art') + snm.fit(solver='search') + + npt.assert_allclose(snm.psi, [244.379181], atol=1e-5) + def test_match_r_continuous(self, data_c): # Comparing to R's DTRreg r_1param = [251.4032] diff --git a/zepid/causal/snm/g_estimation.py b/zepid/causal/snm/g_estimation.py index 92b547e..e9c38fc 100644 --- a/zepid/causal/snm/g_estimation.py +++ b/zepid/causal/snm/g_estimation.py @@ -4,7 +4,7 @@ import pandas as pd import scipy.optimize -from zepid.causal.utils import propensity_score +from zepid.causal.utils import propensity_score, _bounding_ class GEstimationSNM: @@ -140,12 +140,26 @@ class GEstimationSNM: nonresponse models. Journal of the American Statistical Association, 94(448), 1096-1120. """ def __init__(self, df, exposure, outcome, weights=None): - if df.dropna().shape[0] != df.shape[0]: - warnings.warn("There is missing data in the dataset. By default, GEstimationSNM will drop all missing " - "data. GEstimationSNM will fit " + str(df.dropna().shape[0]) + ' of ' + str(df.shape[0]) + - " observations", UserWarning) - self.df = df.copy().dropna().reset_index() + # Checking for missing data that is non-outcome + if df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0] != df.shape[0]: + warnings.warn("There is missing data that is not the outcome in the data set. IPTW will drop " + "all missing data that is not missing outcome data. IPTW will fit " + + str(df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0]) + + ' of ' + str(df.shape[0]) + ' observations', UserWarning) + self.df = df.copy().dropna(subset=[d for d in df.columns if d != outcome]).reset_index() + else: + self.df = df.copy().reset_index() + + # Checking to see if missing outcome data occurs + self._missing_indicator = '__missing_indicator__' + if self.df.dropna(subset=[outcome]).shape[0] != self.df.shape[0]: + self._miss_flag = True + self.df[self._missing_indicator] = np.where(self.df[outcome].isna(), 0, 1) + else: + self._miss_flag = False + self.df[self._missing_indicator] = 1 + # Checking binary exposure only if not self.df[exposure].value_counts().index.isin([0, 1]).all(): raise ValueError("GEstimationSNM only supports binary exposures currently") @@ -155,9 +169,11 @@ def __init__(self, df, exposure, outcome, weights=None): self.psi = None self.psi_labels = None - self._weights = weights + self._weight_ = weights + self.ipmw = None self._alphas = None self._treatment_model = None + self._fit_missing_ = False self._predicted_A = None self._snm_ = None self._print_results = True @@ -211,6 +227,70 @@ def structural_nested_model(self, model): """ self._snm_ = model + def missing_model(self, model_denominator, model_numerator=None, stabilized=True, bound=False, print_results=True): + """Estimation of Pr(M=0|A=a,L), which is the missing data mechanism for the outcome. The corresponding + observation probabilities are used to account for informative censoring by observed variables. The missing_model + only accounts for missing outcome data. + + The inverse probability weights calculated by this function account for informative censoring (missing data on + the outcome) by observed variables. + + Note + ---- + The treatment variable should be included in the model + + Parameters + ---------- + model_denominator: str + String listing variables predicting missingness of outcomes via `patsy` syntax. For example, ` + 'var1 + var2 + var3'. This is for the predicted probabilities of the denominator + model_numerator : str, optional + Optional string listing variables to predict the exposure, separated by +. Only used to calculate the + numerator. Default (None) calculates the probability of censoring by treatment only. In general this is + recommended. If assessing effect modifcation, this variable should be included in the numerator as well. + Argument is only used when calculating stabilized weights + stabilized : bool, optional + Whether to use stabilized inverse probability of censoring weights + bound : float, list, optional + Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations. + Specifying this argument can improve finite sample performance for random positivity violations. However, + inference becomes limited to the restricted population. Default is False, meaning no truncation of + predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of + floats can be provided for asymmetric trunctation + print_results: bool, optional + """ + # Error if no missing outcome data + if not self._miss_flag: + raise ValueError("No missing outcome data is present in the data set") + + # Warning if exposure is not included in the missingness of outcome model + if self.exposure not in model_denominator: + warnings.warn("For the specified missing outcome model, the exposure variable should be included in the " + "model", UserWarning) + + miss_model = self._missing_indicator + ' ~ ' + model_denominator + fitmodel = propensity_score(self.df, miss_model, print_results=print_results) + + if stabilized: + if model_numerator is None: + mnum = self.exposure + else: + mnum = model_numerator + numerator_model = propensity_score(self.df, self._missing_indicator + ' ~ ' + mnum, + weights=self._weight_, + print_results=print_results) + n = numerator_model.predict(self.df) + else: + n = 1 + + if bound: # Bounding predicted probabilities if requested + d = _bounding_(fitmodel.predict(self.df), bounds=bound) + else: + d = fitmodel.predict(self.df) + + self.ipmw = np.where(self.df[self._missing_indicator] == 1, n / d, np.nan) + self._fit_missing_ = True + def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e-7, verbose_solver=False, maxiter=500): """Using the treatment model and the format of the structural nested mean model, the solutions for psi are @@ -244,26 +324,47 @@ def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e- raise ValueError("The exposure_model() and structural_nested_model() must be specified before the fit " "procedure") + if self._miss_flag and not self._fit_missing_: + warnings.warn("All missing outcome data is assumed to be missing completely at random. To relax this " + "assumption to outcome data is missing at random please use the `missing_model()` " + "function", UserWarning) + + # Assigning label for weights + df = self.df.copy() + if self.ipmw is None: + if self._weight_ is None: + weight_col = None + else: + weight_col = self._weight_ + else: + weight_col = '_w_' + if self._weight_ is None: + df['_w_'] = self.ipmw + else: + df['_w_'] = self.ipmw * df[self._weight_] + # Pulling out data set up for SNM via patsy - snm = patsy.dmatrix(self._snm_ + ' - 1', self.df, return_type='dataframe') + df = df.dropna() + snm = patsy.dmatrix(self._snm_ + ' - 1', df, return_type='dataframe') self.psi_labels = snm.columns.values.tolist() # Grabs labels for the solved psi values if solver == 'closed': # Pulling array of outcomes with the interaction terms (copy and rename column to get right interactions) - yf = self.df.copy().drop(columns=[self.exposure]) + yf = df.copy().drop(columns=[self.exposure]) yf = yf.rename(columns={self.outcome: self.exposure}) y_vals = patsy.dmatrix(self._snm_ + ' - 1', yf, return_type='dataframe') # Solving for the array of Psi values + print(df.columns) self.psi = self._closed_form_solver_(treat=self.exposure, model=self.exposure + ' ~ ' + self._treatment_model, - df=self.df, + df=df, snm_matrix=snm, y_matrix=y_vals, - weights=self._weights, print_results=self._print_results) + weights=weight_col, print_results=self._print_results) elif solver == 'search': # Adding other potential SNM variables to the input data - sf = pd.concat([self.df, snm.drop(columns=[self.exposure])], axis=1) + sf = pd.concat([df, snm.drop(columns=[self.exposure])], axis=1) # Resolving if not initial parameters if starting_value is None: @@ -272,7 +373,7 @@ def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e- # Passing to optimization procedure self._scipy_solver_obj = self._grid_search_(data_set=sf, treatment=self.exposure, outcome=self.outcome, - weights=self._weights, + weights=weight_col, model=self._treatment_model, snm_terms=self.psi_labels, start_vals=starting_value, alpha_shift=np.array(alpha_value), tolerance=tolerance, verbose_solver=verbose_solver, @@ -298,9 +399,18 @@ def summary(self, decimal=3): print('======================================================================') print(' G-estimation of Structural Nested Mean Model ') print('======================================================================') - fmt = 'Treatment: {:<24} No. Observations: {:<10}' + fmt = 'Treatment: {:<22} No. Observations: {:<10}' print(fmt.format(self.exposure, self.df.shape[0])) - print('Outcome: ', self.outcome) + fmt = 'Outcome: {:<22} No. Missing Outcome: {:<10}' + print(fmt.format(self.outcome, np.sum(self.df[self.outcome].isnull()))) + + fmt = 'Missing model: {:<15}' + if self._fit_missing_: + m = 'Logistic' + else: + m = 'None' + + print(fmt.format(m)) # Printing scipy optimization if possible if self._scipy_solver_obj is not None: @@ -308,6 +418,7 @@ def summary(self, decimal=3): else: self._print_closed_results() + print('----------------------------------------------------------------------') print('SNM: ' + snm_form) print('----------------------------------------------------------------------') @@ -391,7 +502,6 @@ def _print_scipy_results(optimized_function, alpha_values): fmt = 'Method: {:<24} No. Iterations: {:<10}' print(fmt.format('Nelder-Mead', optimized_function.nit)) fmt = 'Alpha values: {:<24} Optimized: {:<10}' - print(optimized_function.success) print(fmt.format(np.str(alpha_values), str(optimized_function.success))) @staticmethod