Skip to content

Commit

Permalink
Adding missing_model() to GEstimationSNM
Browse files Browse the repository at this point in the history
  • Loading branch information
pzivich committed Jul 17, 2019
1 parent 69d7def commit 3ac849d
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ zepid.causal.snm.g\_estimation.GEstimationSNM

~GEstimationSNM.exposure_model
~GEstimationSNM.structural_nested_model
~GEstimationSNM.missing_model
~GEstimationSNM.fit
~GEstimationSNM.summary
51 changes: 51 additions & 0 deletions tests/test_snm.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,57 @@ def test_weighted_model(self):

npt.assert_allclose(snm.psi, [244.379181], atol=1e-5)

def test_missing_model(self):
df = load_sample_data(False).drop(columns=['dead'])
df['age_sq'] = df['age0'] ** 2
df['age_cu'] = df['age0'] ** 3
df['cd4_sq'] = df['cd40'] ** 2
df['cd4_cu'] = df['cd40'] ** 3

snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45')
snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False)
snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0',
stabilized=False, print_results=False)
snm.structural_nested_model('art')
snm.fit()

npt.assert_allclose(snm.psi, [244.379181], atol=1e-5)

snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45')
snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False)
snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0',
stabilized=False, print_results=False)
snm.structural_nested_model('art')
snm.fit(solver='search')

npt.assert_allclose(snm.psi, [244.379181], atol=1e-5)

def test_missing_w_weights(self):
df = load_sample_data(False).drop(columns=['dead'])
df['age_sq'] = df['age0'] ** 2
df['age_cu'] = df['age0'] ** 3
df['cd4_sq'] = df['cd40'] ** 2
df['cd4_cu'] = df['cd40'] ** 3
df['weight'] = 2

snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45', weights='weight')
snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False)
snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0',
stabilized=False, print_results=False)
snm.structural_nested_model('art')
snm.fit()

npt.assert_allclose(snm.psi, [244.379181], atol=1e-5)

snm = GEstimationSNM(df, exposure='art', outcome='cd4_wk45')
snm.exposure_model('male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0', print_results=False)
snm.missing_model('art + male + age0 + age_sq + age_cu + cd40 + cd4_sq + cd4_cu + dvl0',
stabilized=False, print_results=False)
snm.structural_nested_model('art')
snm.fit(solver='search')

npt.assert_allclose(snm.psi, [244.379181], atol=1e-5)

def test_match_r_continuous(self, data_c):
# Comparing to R's DTRreg
r_1param = [251.4032]
Expand Down
142 changes: 126 additions & 16 deletions zepid/causal/snm/g_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
import scipy.optimize

from zepid.causal.utils import propensity_score
from zepid.causal.utils import propensity_score, _bounding_


class GEstimationSNM:
Expand Down Expand Up @@ -140,12 +140,26 @@ class GEstimationSNM:
nonresponse models. Journal of the American Statistical Association, 94(448), 1096-1120.
"""
def __init__(self, df, exposure, outcome, weights=None):
if df.dropna().shape[0] != df.shape[0]:
warnings.warn("There is missing data in the dataset. By default, GEstimationSNM will drop all missing "
"data. GEstimationSNM will fit " + str(df.dropna().shape[0]) + ' of ' + str(df.shape[0]) +
" observations", UserWarning)
self.df = df.copy().dropna().reset_index()
# Checking for missing data that is non-outcome
if df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0] != df.shape[0]:
warnings.warn("There is missing data that is not the outcome in the data set. IPTW will drop "
"all missing data that is not missing outcome data. IPTW will fit "
+ str(df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0]) +
' of ' + str(df.shape[0]) + ' observations', UserWarning)
self.df = df.copy().dropna(subset=[d for d in df.columns if d != outcome]).reset_index()
else:
self.df = df.copy().reset_index()

# Checking to see if missing outcome data occurs
self._missing_indicator = '__missing_indicator__'
if self.df.dropna(subset=[outcome]).shape[0] != self.df.shape[0]:
self._miss_flag = True
self.df[self._missing_indicator] = np.where(self.df[outcome].isna(), 0, 1)
else:
self._miss_flag = False
self.df[self._missing_indicator] = 1

# Checking binary exposure only
if not self.df[exposure].value_counts().index.isin([0, 1]).all():
raise ValueError("GEstimationSNM only supports binary exposures currently")

Expand All @@ -155,9 +169,11 @@ def __init__(self, df, exposure, outcome, weights=None):
self.psi = None
self.psi_labels = None

self._weights = weights
self._weight_ = weights
self.ipmw = None
self._alphas = None
self._treatment_model = None
self._fit_missing_ = False
self._predicted_A = None
self._snm_ = None
self._print_results = True
Expand Down Expand Up @@ -211,6 +227,70 @@ def structural_nested_model(self, model):
"""
self._snm_ = model

def missing_model(self, model_denominator, model_numerator=None, stabilized=True, bound=False, print_results=True):
"""Estimation of Pr(M=0|A=a,L), which is the missing data mechanism for the outcome. The corresponding
observation probabilities are used to account for informative censoring by observed variables. The missing_model
only accounts for missing outcome data.
The inverse probability weights calculated by this function account for informative censoring (missing data on
the outcome) by observed variables.
Note
----
The treatment variable should be included in the model
Parameters
----------
model_denominator: str
String listing variables predicting missingness of outcomes via `patsy` syntax. For example, `
'var1 + var2 + var3'. This is for the predicted probabilities of the denominator
model_numerator : str, optional
Optional string listing variables to predict the exposure, separated by +. Only used to calculate the
numerator. Default (None) calculates the probability of censoring by treatment only. In general this is
recommended. If assessing effect modifcation, this variable should be included in the numerator as well.
Argument is only used when calculating stabilized weights
stabilized : bool, optional
Whether to use stabilized inverse probability of censoring weights
bound : float, list, optional
Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
Specifying this argument can improve finite sample performance for random positivity violations. However,
inference becomes limited to the restricted population. Default is False, meaning no truncation of
predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of
floats can be provided for asymmetric trunctation
print_results: bool, optional
"""
# Error if no missing outcome data
if not self._miss_flag:
raise ValueError("No missing outcome data is present in the data set")

# Warning if exposure is not included in the missingness of outcome model
if self.exposure not in model_denominator:
warnings.warn("For the specified missing outcome model, the exposure variable should be included in the "
"model", UserWarning)

miss_model = self._missing_indicator + ' ~ ' + model_denominator
fitmodel = propensity_score(self.df, miss_model, print_results=print_results)

if stabilized:
if model_numerator is None:
mnum = self.exposure
else:
mnum = model_numerator
numerator_model = propensity_score(self.df, self._missing_indicator + ' ~ ' + mnum,
weights=self._weight_,
print_results=print_results)
n = numerator_model.predict(self.df)
else:
n = 1

if bound: # Bounding predicted probabilities if requested
d = _bounding_(fitmodel.predict(self.df), bounds=bound)
else:
d = fitmodel.predict(self.df)

self.ipmw = np.where(self.df[self._missing_indicator] == 1, n / d, np.nan)
self._fit_missing_ = True

def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e-7, verbose_solver=False,
maxiter=500):
"""Using the treatment model and the format of the structural nested mean model, the solutions for psi are
Expand Down Expand Up @@ -244,26 +324,47 @@ def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e-
raise ValueError("The exposure_model() and structural_nested_model() must be specified before the fit "
"procedure")

if self._miss_flag and not self._fit_missing_:
warnings.warn("All missing outcome data is assumed to be missing completely at random. To relax this "
"assumption to outcome data is missing at random please use the `missing_model()` "
"function", UserWarning)

# Assigning label for weights
df = self.df.copy()
if self.ipmw is None:
if self._weight_ is None:
weight_col = None
else:
weight_col = self._weight_
else:
weight_col = '_w_'
if self._weight_ is None:
df['_w_'] = self.ipmw
else:
df['_w_'] = self.ipmw * df[self._weight_]

# Pulling out data set up for SNM via patsy
snm = patsy.dmatrix(self._snm_ + ' - 1', self.df, return_type='dataframe')
df = df.dropna()
snm = patsy.dmatrix(self._snm_ + ' - 1', df, return_type='dataframe')
self.psi_labels = snm.columns.values.tolist() # Grabs labels for the solved psi values

if solver == 'closed':
# Pulling array of outcomes with the interaction terms (copy and rename column to get right interactions)
yf = self.df.copy().drop(columns=[self.exposure])
yf = df.copy().drop(columns=[self.exposure])
yf = yf.rename(columns={self.outcome: self.exposure})
y_vals = patsy.dmatrix(self._snm_ + ' - 1', yf, return_type='dataframe')

# Solving for the array of Psi values
print(df.columns)
self.psi = self._closed_form_solver_(treat=self.exposure,
model=self.exposure + ' ~ ' + self._treatment_model,
df=self.df,
df=df,
snm_matrix=snm, y_matrix=y_vals,
weights=self._weights, print_results=self._print_results)
weights=weight_col, print_results=self._print_results)

elif solver == 'search':
# Adding other potential SNM variables to the input data
sf = pd.concat([self.df, snm.drop(columns=[self.exposure])], axis=1)
sf = pd.concat([df, snm.drop(columns=[self.exposure])], axis=1)

# Resolving if not initial parameters
if starting_value is None:
Expand All @@ -272,7 +373,7 @@ def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e-
# Passing to optimization procedure
self._scipy_solver_obj = self._grid_search_(data_set=sf,
treatment=self.exposure, outcome=self.outcome,
weights=self._weights,
weights=weight_col,
model=self._treatment_model, snm_terms=self.psi_labels,
start_vals=starting_value, alpha_shift=np.array(alpha_value),
tolerance=tolerance, verbose_solver=verbose_solver,
Expand All @@ -298,16 +399,26 @@ def summary(self, decimal=3):
print('======================================================================')
print(' G-estimation of Structural Nested Mean Model ')
print('======================================================================')
fmt = 'Treatment: {:<24} No. Observations: {:<10}'
fmt = 'Treatment: {:<22} No. Observations: {:<10}'
print(fmt.format(self.exposure, self.df.shape[0]))
print('Outcome: ', self.outcome)
fmt = 'Outcome: {:<22} No. Missing Outcome: {:<10}'
print(fmt.format(self.outcome, np.sum(self.df[self.outcome].isnull())))

fmt = 'Missing model: {:<15}'
if self._fit_missing_:
m = 'Logistic'
else:
m = 'None'

print(fmt.format(m))

# Printing scipy optimization if possible
if self._scipy_solver_obj is not None:
self._print_scipy_results(self._scipy_solver_obj, self._alphas)
else:
self._print_closed_results()

print('----------------------------------------------------------------------')
print('SNM: ' + snm_form)
print('----------------------------------------------------------------------')

Expand Down Expand Up @@ -391,7 +502,6 @@ def _print_scipy_results(optimized_function, alpha_values):
fmt = 'Method: {:<24} No. Iterations: {:<10}'
print(fmt.format('Nelder-Mead', optimized_function.nit))
fmt = 'Alpha values: {:<24} Optimized: {:<10}'
print(optimized_function.success)
print(fmt.format(np.str(alpha_values), str(optimized_function.success)))

@staticmethod
Expand Down

0 comments on commit 3ac849d

Please sign in to comment.