From 576920f6f087805f4b7f2d164d085b71db31b249 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Wed, 17 Jul 2019 13:13:31 -0400
Subject: [PATCH 01/42] v0.8.1 label

---
 zepid/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zepid/version.py b/zepid/version.py
index 32a90a3..ef72cc0 100644
--- a/zepid/version.py
+++ b/zepid/version.py
@@ -1 +1 @@
-__version__ = '0.8.0'
+__version__ = '0.8.1'

From 032d3f8a9c05bd630a1d0344262a3998d706f3e8 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 1 Aug 2019 14:10:13 -0400
Subject: [PATCH 02/42] Adding ATT and ATU to g-formula

---
 tests/test_gformula.py             | 19 ++++++++++++
 zepid/causal/gformula/TimeFixed.py | 50 +++++++++++++++++++++++++++---
 2 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/tests/test_gformula.py b/tests/test_gformula.py
index d0c7ba0..42d5cd2 100644
--- a/tests/test_gformula.py
+++ b/tests/test_gformula.py
@@ -268,6 +268,25 @@ def test_conditional_stochastic_warning(self):
         with pytest.warns(UserWarning):
             g.fit_stochastic(p=[1.0, 1.0], conditional=["(g['L']==1) | (g['L']==0)", "g['L']==0"])
 
+    def test_exposed_standardized1(self, data):
+        g = TimeFixedGFormula(data, exposure='A', outcome='Y', standardize='exposed')
+        g.outcome_model(model='A + L + A:L', print_results=False)
+        g.fit(treatment='all')
+        rm = g.marginal_outcome
+        g.fit(treatment='none')
+        rs = g.marginal_outcome
+        npt.assert_allclose(rm - rs, 0.16, rtol=1e-5)
+        npt.assert_allclose(rm / rs, 1.387097, rtol=1e-5)
+
+        g = TimeFixedGFormula(data, exposure='A', outcome='Y', standardize='unexposed')
+        g.outcome_model(model='A + L + A:L', print_results=False)
+        g.fit(treatment='all')
+        rm = g.marginal_outcome
+        g.fit(treatment='none')
+        rs = g.marginal_outcome
+        npt.assert_allclose(rm - rs, 0.16, rtol=1e-5)
+        npt.assert_allclose(rm / rs, 1.384615, rtol=1e-5)
+
 
 class TestSurvivalGFormula:
 
diff --git a/zepid/causal/gformula/TimeFixed.py b/zepid/causal/gformula/TimeFixed.py
index 5431192..f459064 100644
--- a/zepid/causal/gformula/TimeFixed.py
+++ b/zepid/causal/gformula/TimeFixed.py
@@ -56,6 +56,13 @@ class TimeFixedGFormula:
         Column name for outcome variable
     outcome_type : str, optional
         Outcome variable type. Currently only 'binary', 'normal', and 'poisson variable types are supported
+    standardize : str, optional
+        Who the estimate corresponds to. Options are the entire population, the exposed, or the unexposed. See
+        Sato & Matsuyama Epidemiology (2003) for details on weighting to exposed/unexposed. Weighting to the
+        exposed or unexposed is also referred to as SMR weighting. Options for standardization are:
+        * 'population'    :   weight to entire population
+        * 'exposed'       :   weight to exposed individuals
+        * 'unexposed'     :   weight to unexposed individuals
     weights : str, optional
         Column name for weights. Default is None, which assumes every observations has the same weight (i.e. 1)
 
@@ -140,7 +147,8 @@ class TimeFixedGFormula:
     population disease burden: a step-by-step illustration of causal inference methods. American Journal of
     Epidemiology, 169(9), 1140-1147.
     """
-    def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type='binary', weights=None):
+    def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type='binary', standardize='population',
+                 weights=None):
         self.gf = df.copy()
         self.exposure = exposure
         self.outcome = outcome
@@ -157,6 +165,12 @@ def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type='
             raise ValueError('Only binary or continuous exposures are currently supported. Please specify "binary", '
                              '"categorical", or "continuous".')
 
+        if standardize in ['population', 'exposed', 'unexposed']:
+            self.standardize = standardize
+        else:
+            raise ValueError('Please specify one of the currently supported standardizations: ' +
+                             'population, exposed, unexposed')
+
         self._weights = weights
         self._outcome_model = None
         self._predicted_y_ = None
@@ -263,10 +277,24 @@ def fit(self, treatment):
         # Getting predictions
         g[self.outcome] = np.nan
         g[self.outcome] = self._outcome_model.predict(g)
+
         if self._weights is None:  # unweighted marginal estimate
-            self.marginal_outcome = np.mean(g[self.outcome])
+            if self.standardize == 'population':
+                self.marginal_outcome = np.mean(g[self.outcome])
+            elif self.standardize == 'exposed':
+                self.marginal_outcome = np.mean(g.loc[self.gf[self.exposure] == 1, self.outcome])
+            else:
+                self.marginal_outcome = np.mean(g.loc[self.gf[self.exposure] == 0, self.outcome])
         else:  # weighted marginal estimate
-            self.marginal_outcome = np.average(g[self.outcome], weights=self.gf[self._weights])
+            if self.standardize == 'population':
+                self.marginal_outcome = np.average(g[self.outcome], weights=self.gf[self._weights])
+            elif self.standardize == 'exposed':
+                self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 1, self.outcome],
+                                                   weights=self.gf[self._weights])
+            else:
+                self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 0, self.outcome],
+                                                   weights=self.gf[self._weights])
+
         self.predicted_df = g
 
     def fit_stochastic(self, p, conditional=None, samples=100, seed=None):
@@ -338,9 +366,21 @@ def fit_stochastic(self, p, conditional=None, samples=100, seed=None):
             g[self.outcome] = np.nan
             g[self.outcome] = self._outcome_model.predict(g)
             if self._weights is None:  # unweighted marginal estimate
-                marginals.append(np.mean(g[self.outcome]))
+                if self.standardize == 'population':
+                    marginals.append(np.mean(g[self.outcome]))
+                elif self.standardize == 'exposed':
+                    marginals.append(np.mean(g.loc[self.gf[self.exposure] == 1, self.outcome]))
+                else:
+                    marginals.append(np.mean(g.loc[self.gf[self.exposure] == 0, self.outcome]))
             else:  # weighted marginal estimate
-                marginals.append(np.average(g[self.outcome], weights=g[self._weights]))
+                if self.standardize == 'population':
+                    marginals.append(np.average(g[self.outcome], weights=g[self._weights]))
+                elif self.standardize == 'exposed':
+                    marginals.append(np.average(g.loc[self.gf[self.exposure] == 1, self.outcome],
+                                                weights=g[self._weights]))
+                else:
+                    marginals.append(np.average(g.loc[self.gf[self.exposure] == 0, self.outcome],
+                                                weights=g[self._weights]))
 
         self.marginal_outcome = np.mean(marginals)
 

From 2024027904d9c17d0f06633ddecb7306b250190e Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 1 Aug 2019 17:10:16 -0400
Subject: [PATCH 03/42] Refactoring IPTW

---
 zepid/causal/ipw/IPTW.py | 77 ++++++----------------------------------
 zepid/causal/utils.py    | 51 ++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 67 deletions(-)

diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py
index 8c10b2c..71769ea 100644
--- a/zepid/causal/ipw/IPTW.py
+++ b/zepid/causal/ipw/IPTW.py
@@ -8,7 +8,7 @@
 import matplotlib.pyplot as plt
 
 from zepid.causal.utils import (propensity_score, plot_boxplot, plot_kde, plot_love,
-                                standardized_mean_differences, positivity, _bounding_)
+                                standardized_mean_differences, positivity, _bounding_, iptw_calculator)
 
 
 class IPTW:
@@ -226,32 +226,15 @@ def treatment_model(self, model_denominator, model_numerator='1', stabilized=Tru
         """
         # Calculating denominator probabilities
         self.__mdenom = model_denominator
-        denominator_model = propensity_score(self.df, self.treatment + ' ~ ' + model_denominator,
-                                             weights=self._weight_,
-                                             print_results=print_results)
-        d = denominator_model.predict(self.df)
-        self.df['__denom__'] = d
-
-        # Calculating numerator probabilities (if stabilized)
-        if stabilized is True:
-            numerator_model = propensity_score(self.df, self.treatment + ' ~ ' + model_numerator,
-                                               weights=self._weight_,
-                                               print_results=print_results)
-            n = numerator_model.predict(self.df)
-        else:
-            if model_numerator != '1':
-                raise ValueError('Argument for model_numerator is only used for stabilized=True')
-            n = 1
-        self.df['__numer__'] = n
-
-        # Bounding predicted probabilities if requested
-        if bound:
-            self.df['__denom__'] = _bounding_(self.df['__denom__'], bounds=bound)
-            self.df['__numer__'] = _bounding_(self.df['__numer__'], bounds=bound)
-
-        # Calculating weights
-        self.iptw = self._weight_calculator(self.df, denominator='__denom__',
-                                            numerator='__numer__', stabilized=stabilized)
+        self.df['__denom__'], self.df['__numer__'], self.iptw = iptw_calculator(df=self.df,
+                                                                                treatment=self.treatment,
+                                                                                model_denom=model_denominator,
+                                                                                model_numer=model_numerator,
+                                                                                weight=self._weight_,
+                                                                                stabilized=stabilized,
+                                                                                standardize=self.standardize,
+                                                                                bound=bound,
+                                                                                print_results=print_results)
 
     def missing_model(self, model_denominator, model_numerator=None, stabilized=True, bound=False, print_results=True):
         """Estimation of Pr(M=0|A=a,L), which is the missing data mechanism for the outcome. The corresponding
@@ -656,46 +639,6 @@ def plot_love(self, color_unweighted='r', color_weighted='b', shape_unweighted='
                        shape_unweighted=shape_unweighted, shape_weighted=shape_weighted)
         return ax
 
-    def _weight_calculator(self, df, denominator, numerator, stabilized):
-        """Calculates the IPTW based on the predicted probabilities and the specified group to standardize to in the
-        background for the fit() function. Not intended to be used by users
-
-        df is the dataframe, denominator is the string indicating the column of Pr, numerator is the string indicating
-        the column of Pr
-        """
-        if stabilized:  # Stabilized weights
-            if self.standardize == 'population':
-                df['w'] = np.where(df[self.treatment] == 1, (df[numerator] / df[denominator]),
-                                   ((1 - df[numerator]) / (1 - df[denominator])))
-                df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w'])
-            # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed)
-            elif self.standardize == 'exposed':
-                df['w'] = np.where(df[self.treatment] == 1, 1,
-                                   ((df[denominator] / (1 - df[denominator])) * ((1 - df[numerator]) /
-                                                                                 df[numerator])))
-                df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w'])
-            # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed)
-            else:
-                df['w'] = np.where(df[self.treatment] == 1,
-                                   (((1 - df[denominator]) / df[denominator]) * (df[numerator] /
-                                                                                 (1 - df[numerator]))),
-                                   1)
-                df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w'])
-
-        else:  # Unstabilized weights
-            if self.standardize == 'population':
-                df['w'] = np.where(df[self.treatment] == 1, 1 / df[denominator], 1 / (1 - df[denominator]))
-                df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w'])
-            # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed)
-            elif self.standardize == 'exposed':
-                df['w'] = np.where(df[self.treatment] == 1, 1, (df[denominator] / (1 - df[denominator])))
-                df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w'])
-            # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed)
-            else:
-                df['w'] = np.where(df[self.treatment] == 1, ((1 - df[denominator]) / df[denominator]), 1)
-                df['w'] = np.where(df[self.treatment].isna(), np.nan, df['w'])
-        return df['w']
-
 
 class StochasticIPTW:
     r"""Calculates the IPTW estimate for stochastic treatment plans. `StochasticIPTW` will returns the estimated
diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py
index 9264e35..db460f7 100644
--- a/zepid/causal/utils.py
+++ b/zepid/causal/utils.py
@@ -177,6 +177,57 @@ def _bounding_(v, bounds):
     return v
 
 
+def iptw_calculator(df, treatment, model_denom, model_numer, weight, stabilized, standardize, bound, print_results):
+    """Background function to calculate inverse probability of treatment weights. Used by `IPTW`, `IPSW`, `AIPSW`
+    """
+    denominator_model = propensity_score(df, treatment + ' ~ ' + model_denom,
+                                         weights=weight, print_results=print_results)
+    d = denominator_model.predict(df)
+
+    # Calculating numerator probabilities (if stabilized)
+    if stabilized is True:
+        numerator_model = propensity_score(df, treatment + ' ~ ' + model_numer,
+                                           weights=weight, print_results=print_results)
+        n = numerator_model.predict(df)
+    else:
+        if model_numer != '1':
+            raise ValueError('Argument for model_numerator is only used for stabilized=True')
+        n = 1
+
+    # Bounding predicted probabilities if requested
+    if bound:
+        d = _bounding_(d, bounds=bound)
+        n = _bounding_(n, bounds=bound)
+
+    # Calculating weights
+    if stabilized:  # Stabilized weights
+        if standardize == 'population':
+            iptw = np.where(df[treatment] == 1, (n / d), ((1 - n) / (1 - d)))
+            iptw = np.where(df[treatment].isna(), np.nan, iptw)
+        # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed)
+        elif standardize == 'exposed':
+            iptw = np.where(df[treatment] == 1, 1, (d / (1 - d)) * ((1 - n) / n))
+            iptw = np.where(df[treatment].isna(), np.nan, iptw)
+        # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed)
+        else:
+            iptw = np.where(df[treatment] == 1, (((1 - d) / d) * (n / (1 - n))), 1)
+            iptw = np.where(df[treatment].isna(), np.nan, iptw)
+
+    else:  # Unstabilized weights
+        if standardize == 'population':
+            iptw = np.where(df[treatment] == 1, 1 / d, 1 / (1 - d))
+            iptw = np.where(df[treatment].isna(), np.nan, iptw)
+        # Stabilizing to exposed (compares all exposed if they were exposed versus unexposed)
+        elif standardize == 'exposed':
+            iptw = np.where(df[treatment] == 1, 1, (d / (1 - d)))
+            iptw = np.where(df[treatment].isna(), np.nan, iptw)
+        # Stabilizing to unexposed (compares all unexposed if they were exposed versus unexposed)
+        else:
+            iptw = np.where(df[treatment] == 1, ((1 - d) / d), 1)
+            iptw = np.where(df[treatment].isna(), np.nan, iptw)
+    return d, n, iptw
+
+
 def plot_kde(df, treatment, probability,
              measure='probability', bw_method='scott', fill=True, color_e='b', color_u='r'):
     """Generates a density plot that can be used to check whether positivity may be violated qualitatively. The

From b8038401973ddbe16f6e405c2c184e55d2248a3c Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 1 Aug 2019 17:10:43 -0400
Subject: [PATCH 04/42] Adding IPTW to IPSW and AIPSW classes

---
 tests/test_generalize.py              |  54 ++++-----
 zepid/causal/generalize/estimators.py | 155 +++++++++++++++++++++-----
 2 files changed, 150 insertions(+), 59 deletions(-)

diff --git a/tests/test_generalize.py b/tests/test_generalize.py
index ac8f888..bbd5e77 100644
--- a/tests/test_generalize.py
+++ b/tests/test_generalize.py
@@ -21,22 +21,12 @@ def df_c():
     return df
 
 
-@pytest.fixture
-def df_iptw(df_c):
-    dfs = df_c.loc[df_c['S'] == 1].copy()
-
-    ipt = IPTW(dfs, treatment='A', outcome='Y')
-    ipt.treatment_model('L', stabilized=True, print_results=False)
-    dfs['iptw'] = ipt.iptw
-    return pd.concat([dfs, df_c.loc[df_c['S'] == 0]], ignore_index=True, sort=False)
-
-
 class TestIPSW:
 
     def test_stabilize_error(self, df_c):
-        ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', stabilized=False)
+        ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S')
         with pytest.raises(ValueError):
-            ipsw.regression_models('L + W_sq', model_numerator='W', print_results=False)
+            ipsw.weight_model('L + W_sq', model_numerator='W', stabilized=False, print_results=False)
 
     def test_no_model_error(self, df_c):
         ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True)
@@ -44,43 +34,45 @@ def test_no_model_error(self, df_c):
             ipsw.fit()
 
     def test_generalize_unstabilized(self, df_r):
-        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=False)
-        ipsw.regression_models('L + W_sq', print_results=False)
+        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S')
+        ipsw.weight_model('L + W_sq', stabilized=False, print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4)
 
     def test_generalize_stabilized(self, df_r):
-        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=True)
-        ipsw.regression_models('L + W_sq', print_results=False)
+        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S')
+        ipsw.weight_model('L + W_sq', stabilized=True, print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4)
 
     def test_transport_unstabilized(self, df_r):
-        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=False, generalize=False)
-        ipsw.regression_models('L + W_sq', print_results=False)
+        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False)
+        ipsw.weight_model('L + W_sq', stabilized=False, print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4)
 
     def test_transport_stabilized(self, df_r):
-        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', stabilized=True, generalize=False)
-        ipsw.regression_models('L + W_sq', print_results=False)
+        ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False)
+        ipsw.weight_model('L + W_sq', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4)
 
-    def test_generalize_iptw(self, df_iptw):
-        ipsw = IPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=True, weights='iptw')
-        ipsw.regression_models('L + W + W_sq', print_results=False)
+    def test_generalize_iptw(self, df_c):
+        ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True)
+        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.treatment_model('L', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.167213, atol=1e-4)
 
-    def test_transport_iptw(self, df_iptw):
-        ipsw = IPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=False, weights='iptw')
-        ipsw.regression_models('L + W + W_sq', print_results=False)
+    def test_transport_iptw(self, df_c):
+        ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False)
+        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.treatment_model('L', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.1372, atol=1e-4)
@@ -154,17 +146,19 @@ def test_transport(self, df_r):
         npt.assert_allclose(aipw.risk_difference, 0.05479, atol=1e-5)
         npt.assert_allclose(aipw.risk_ratio, 1.16352, atol=1e-4)
 
-    def test_generalize_conf(self, df_iptw):
-        aipw = AIPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=True, weights='iptw')
+    def test_generalize_conf(self, df_c):
+        aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True)
         aipw.weight_model('L + W_sq', print_results=False)
+        aipw.treatment_model('L', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()
         npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5)
         npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4)
 
-    def test_transport_conf(self, df_iptw):
-        aipw = AIPSW(df_iptw, exposure='A', outcome='Y', selection='S', generalize=False, weights='iptw')
+    def test_transport_conf(self, df_c):
+        aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False)
         aipw.weight_model('L + W_sq', print_results=False)
+        aipw.treatment_model('L', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()
         npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5)
diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py
index b120c49..c331007 100644
--- a/zepid/causal/generalize/estimators.py
+++ b/zepid/causal/generalize/estimators.py
@@ -3,7 +3,7 @@
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
 
-from zepid.causal.utils import propensity_score
+from zepid.causal.utils import propensity_score, iptw_calculator, _bounding_
 
 
 class IPSW:
@@ -44,8 +44,6 @@ class IPSW:
     generalize : bool, optional
         Whether the problem is a generalizability (True) problem or a transportability (False) problem. See notes
         for further details on the difference between the two estimation methods
-    stabilized : bool, optional
-        Whether to return stabilized or unstabilized weights. Default is stabilized weights (True)
     weights : None, str, optional
         For conditionally randomized trials, or observational research, inverse probability of treatment weights
         can be used to adjust for confounding. Before estimating the effect measures, this weight vector and the
@@ -107,27 +105,26 @@ class IPSW:
     randomized trial to a new target population. arXiv preprint arXiv:1805.00550.
     """
 
-    def __init__(self, df, exposure, outcome, selection, generalize=True, stabilized=True, weights=None):
+    def __init__(self, df, exposure, outcome, selection, generalize=True, weights=None):
         self.df = df.copy()
         self.sample = df.loc[df[selection] == 1].copy()
         self.target = df.loc[df[selection] == 0].copy()
 
         self.generalize = generalize  # determines whether IPSW or IOSW are calculated
-        self.stabilized = stabilized
-
         self.exposure = exposure
         self.outcome = outcome
         self.selection = selection
         self.weight = weights
 
+        self.ipsw = None
+        self.iptw = None
         self.risk_difference = None
         self.risk_ratio = None
-        self.Weight = None
         self._denominator_model = False
 
-    def regression_models(self, model_denominator, model_numerator='1', print_results=True):
-        """Logistic regression model(s) for estimating weights. The model denominator must be specified for both
-        stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the
+    def weight_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True):
+        """Logistic regression model(s) for estimating sampling weights. The model denominator must be specified for
+        both stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the
         stabilization factor for the weight numerator. By default model results are returned
 
         Parameters
@@ -140,10 +137,18 @@ def regression_models(self, model_denominator, model_numerator='1', print_result
             numerator. Default ('1') calculates the overall probability of selection. In general, this is recommended.
             Adding in other variables means they are no longer accounted for in estimation of IPSW. Argument is also
             only used when calculating stabilized weights
+        bound : float, list, optional
+            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
+            Specifying this argument can improve finite sample performance for random positivity violations. However,
+            inference becomes limited to the restricted population. Default is False, meaning no truncation of
+            predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of
+            floats can be provided for asymmetric trunctation
+        stabilized : bool, optional
+            Whether to generated stabilized IPSW. Default is True, which returns the stabilized IPSW
         print_results : bool, optional
             Whether to print the model results from the regression models. Default is True
         """
-        if not self.stabilized:
+        if not stabilized:
             if model_numerator != '1':
                 raise ValueError('Argument for model_numerator is only used for stabilized=True')
 
@@ -153,25 +158,63 @@ def regression_models(self, model_denominator, model_numerator='1', print_result
         self._denominator_model = True
 
         # Stabilization factor if valid
-        if self.stabilized:
+        if stabilized:
             nmodel = propensity_score(self.df, self.selection + ' ~ ' + model_numerator, print_results=print_results)
             self.sample['__numer__'] = nmodel.predict(self.sample)
         else:
             self.sample['__numer__'] = 1
 
+        if bound:
+            self.sample['__denom__'] = _bounding_(self.sample['__denom__'], bounds=bound)
+            self.sample['__numer__'] = _bounding_(self.sample['__numer__'], bounds=bound)
+
         # Calculate IPSW (generalizability)
         if self.generalize:
             self.sample['__ipsw__'] = self.sample['__numer__'] / self.sample['__denom__']
 
         # Calculate IOSW (transportability)
         else:
-            if self.stabilized:
+            if stabilized:
                 self.sample['__ipsw__'] = (((1 - self.sample['__denom__']) / self.sample['__denom__']) *
                                            (self.sample['__numer__'] / (1 - self.sample['__numer__'])))
             else:
                 self.sample['__ipsw__'] = (1 - self.sample['__denom__']) / self.sample['__denom__']
 
-        self.Weight = self.sample['__ipsw__']
+        self.ipsw = self.sample['__ipsw__']
+
+    def treatment_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True):
+        """Logistic regression model(s) for estimating inverse probability of treatment weights (IPTW). The model
+        denominator must be specified for both stabilized and unstabilized weights. The optional argument
+        'model_numerator' allows specification of the stabilization factor for the weight numerator. By default model
+        results are returned
+
+        Parameters
+        ----------
+        model_denominator : str
+            String listing variables to predict the exposure, separated by +. For example, 'var1 + var2 + var3'. This
+            is for the predicted probabilities of the denominator
+        model_numerator : str, optional
+            Optional string listing variables to predict the selection separated by +. Only used to calculate the
+            numerator. Default ('1') calculates the overall probability of selection. In general, this is recommended.
+            Adding in other variables means they are no longer accounted for in estimation of IPSW. Argument is also
+            only used when calculating stabilized weights
+        bound : float, list, optional
+            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
+            Specifying this argument can improve finite sample performance for random positivity violations. However,
+            inference becomes limited to the restricted population. Default is False, meaning no truncation of
+            predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of
+            floats can be provided for asymmetric trunctation
+        stabilized : bool, optional
+            Whether to generated stabilized IPSW. Default is True, which returns the stabilized IPSW
+        print_results : bool, optional
+            Whether to print the model results from the regression models. Default is True
+        """
+        d, n, self.iptw = iptw_calculator(df=self.sample,
+                                          treatment=self.exposure,
+                                          model_denom=model_denominator, model_numer=model_numerator,
+                                          weight=self.weight, stabilized=stabilized,
+                                          standardize='population',
+                                          bound=bound, print_results=print_results)
 
     def fit(self):
         """Uses the calculated IPSW to obtain the risk difference and risk ratio from the sample. If weights are
@@ -186,10 +229,17 @@ def fit(self):
         """
         if not self._denominator_model:
             raise ValueError('The regression_models() function must be specified before effect measure estimation')
+
         if self.weight is not None:
-            self.sample['__ipw__'] = self.Weight * self.sample[self.weight]
+            if self.iptw is None:
+                self.sample['__ipw__'] = self.ipsw * self.sample[self.weight]
+            else:
+                self.sample['__ipw__'] = self.ipsw * self.iptw * self.sample[self.weight]
         else:
-            self.sample['__ipw__'] = self.Weight
+            if self.iptw is None:
+                self.sample['__ipw__'] = self.ipsw
+            else:
+                self.sample['__ipw__'] = self.ipsw * self.iptw
 
         exp = self.sample[self.sample[self.exposure] == 1].copy()
         uxp = self.sample[self.sample[self.exposure] == 0].copy()
@@ -218,11 +268,16 @@ def summary(self, decimal=4):
         print(fmt.format(self.exposure, self.sample.shape[0]))
         fmt = 'Outcome:          {:<15} Target Observations:  {:<20}'
         print(fmt.format(self.outcome, self.target.shape[0]))
-        fmt = 'Target estimate:  {:<15}'
+        fmt = 'Target estimate:  {:<15} IP Treatment Weights: {:<20}'
         if self.generalize:
-            print(fmt.format('Generalize'))
+            g = 'Generalize'
         else:
-            print(fmt.format('Transport'))
+            g = 'Transport'
+        if self.iptw is None:
+            w = 'No'
+        else:
+            w = 'Yes'
+        print(fmt.format(g, w))
 
         print('----------------------------------------------------------------------')
         print('Risk Difference: ', round(float(self.risk_difference), decimal))
@@ -542,7 +597,8 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No
         self.selection = selection
         self.weight = weights
 
-        self.Weight = None
+        self.ipsw = None
+        self.iptw = None
         self._denominator_model = False
         self._outcome_model = False
         self._YA1 = None
@@ -572,7 +628,6 @@ def weight_model(self, model_denominator, model_numerator='1', stabilized=True,
             Whether to print the model results from the regression models. Default is True
         """
         dmodel = propensity_score(self.df, self.selection + ' ~ ' + model_denominator, print_results=print_results)
-
         self.df['__denom__'] = dmodel.predict(self.df)
         self._denominator_model = True
 
@@ -595,10 +650,41 @@ def weight_model(self, model_denominator, model_numerator='1', stabilized=True,
             else:
                 self.df['__ipsw__'] = (1 - self.df['__denom__']) / self.df['__denom__']
 
-        if self.weight is not None:
-            self.Weight = self.df['__ipsw__'] * self.df[self.weight]
-        else:
-            self.Weight = self.df['__ipsw__']
+        self.ipsw = self.df['__ipsw__']
+
+    def treatment_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=False):
+        """Logistic regression model(s) for estimating inverse probability of treatment weights (IPTW). The model
+        denominator must be specified for both stabilized and unstabilized weights. The optional argument
+        'model_numerator' allows specification of the stabilization factor for the weight numerator. By default model
+        results are returned
+
+        Parameters
+        ----------
+        model_denominator : str
+            String listing variables to predict the exposure, separated by +. For example, 'var1 + var2 + var3'. This
+            is for the predicted probabilities of the denominator
+        model_numerator : str, optional
+            Optional string listing variables to predict the selection separated by +. Only used to calculate the
+            numerator. Default ('1') calculates the overall probability of selection. In general, this is recommended.
+            Adding in other variables means they are no longer accounted for in estimation of IPSW. Argument is also
+            only used when calculating stabilized weights
+        bound : float, list, optional
+            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
+            Specifying this argument can improve finite sample performance for random positivity violations. However,
+            inference becomes limited to the restricted population. Default is False, meaning no truncation of
+            predicted probabilities occurs. Providing a single float assumes symmetric trunctation. A collection of
+            floats can be provided for asymmetric trunctation
+        stabilized : bool, optional
+            Whether to generated stabilized IPTW. Default is True, which returns the stabilized IPTW
+        print_results : bool, optional
+            Whether to print the model results from the regression models. Default is True
+        """
+        d, n, self.iptw = iptw_calculator(df=self.df,
+                                          treatment=self.exposure,
+                                          model_denom=model_denominator, model_numer=model_numerator,
+                                          weight=self.weight, stabilized=stabilized,
+                                          standardize='population',
+                                          bound=bound, print_results=print_results)
 
     def outcome_model(self, model, outcome_type='binary', print_results=True):
         """Build the g-transport model for the outcome. This is also referred to at the Q-model.
@@ -652,27 +738,38 @@ def fit(self):
         if not self._outcome_model:
             raise ValueError('The outcome_model() function must be specified before effect measure estimation')
 
+        if self.weight is not None:
+            if self.iptw is None:
+                self.df['__ipw__'] = self.ipsw * self.sample[self.weight]
+            else:
+                self.df['__ipw__'] = self.ipsw * self.iptw * self.sample[self.weight]
+        else:
+            if self.iptw is None:
+                self.df['__ipw__'] = self.ipsw
+            else:
+                self.df['__ipw__'] = self.ipsw * self.iptw
+
         # Generalizability problem
         if self.generalize:
             part1 = self._YA1
             part2 = np.where(self.sample & (self.df[self.exposure] == 1),
-                             self.Weight * (self.df[self.outcome] - self._YA1), 0)
+                             self.df['__ipw__'] * (self.df[self.outcome] - self._YA1), 0)
             r1 = np.mean(part1 + part2)
 
             part1 = self._YA0
             part2 = np.where(self.sample & (self.df[self.exposure] == 0),
-                             self.Weight * (self.df[self.outcome] - self._YA0), 0)
+                             self.df['__ipw__'] * (self.df[self.outcome] - self._YA0), 0)
             r0 = np.mean(part1 + part2)
 
         # Transportability problem
         else:
             part1 = np.where(self.sample & (self.df[self.exposure] == 1),
-                             self.Weight * (self.df[self.outcome] - self._YA1), 0)
+                             self.df['__ipw__'] * (self.df[self.outcome] - self._YA1), 0)
             part2 = (1 - self.df[self.selection]) * self._YA1
             r1 = np.sum(part1 + part2) / np.sum(1 - self.df[self.selection])
 
             part1 = np.where(self.sample & (self.df[self.exposure] == 0),
-                             self.Weight * (self.df[self.outcome] - self._YA0), 0)
+                             self.df['__ipw__'] * (self.df[self.outcome] - self._YA0), 0)
             part2 = (1 - self.df[self.selection]) * self._YA0
             r0 = np.sum(part1 + part2) / np.sum(1 - self.df[self.selection])
 

From 7b6f594884cf89f447fabe01842f95a453ce55fb Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 1 Aug 2019 17:27:58 -0400
Subject: [PATCH 05/42] Refactoring AIPTW

---
 zepid/causal/doublyrobust/AIPW.py | 13 +++++++------
 zepid/causal/utils.py             |  3 ++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/zepid/causal/doublyrobust/AIPW.py b/zepid/causal/doublyrobust/AIPW.py
index d3ced32..901dfc8 100644
--- a/zepid/causal/doublyrobust/AIPW.py
+++ b/zepid/causal/doublyrobust/AIPW.py
@@ -7,7 +7,7 @@
 from statsmodels.stats.weightstats import DescrStatsW
 from scipy.stats import norm
 
-from zepid.causal.utils import (propensity_score, plot_kde, plot_love,
+from zepid.causal.utils import (propensity_score, plot_kde, plot_love, iptw_calculator,
                                 standardized_mean_differences, positivity, _bounding_,
                                 plot_kde_accuracy, outcome_accuracy)
 
@@ -181,12 +181,13 @@ def exposure_model(self, model, bound=False, print_results=True):
         """
         self.__mweight = model
         self._exp_model = self.exposure + ' ~ ' + model
-        fitmodel = propensity_score(self.df, self._exp_model, weights=self._weight_, print_results=print_results)
-        ps = fitmodel.predict(self.df)
-        self.df['_g1_'] = ps
-        self.df['_g0_'] = 1 - ps
+        d, n, iptw = iptw_calculator(df=self.df, treatment=self.exposure, model_denom=model, model_numer='1',
+                                     weight=self._weight_, stabilized=False, standardize='population',
+                                     bound=None, print_results=print_results)
 
-        # If bounds are requested
+        self.df['_g1_'] = d
+        self.df['_g0_'] = 1 - d
+        # Applying bounds AFTER extracting g1 and g0
         if bound:
             self.df['_g1_'] = _bounding_(self.df['_g1_'], bounds=bound)
             self.df['_g0_'] = _bounding_(self.df['_g0_'], bounds=bound)
diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py
index db460f7..afb7c94 100644
--- a/zepid/causal/utils.py
+++ b/zepid/causal/utils.py
@@ -178,7 +178,8 @@ def _bounding_(v, bounds):
 
 
 def iptw_calculator(df, treatment, model_denom, model_numer, weight, stabilized, standardize, bound, print_results):
-    """Background function to calculate inverse probability of treatment weights. Used by `IPTW`, `IPSW`, `AIPSW`
+    """Background function to calculate inverse probability of treatment weights. Used by `IPTW`, `AIPTW`, `IPSW`,
+    `AIPSW`
     """
     denominator_model = propensity_score(df, treatment + ' ~ ' + model_denom,
                                          weights=weight, print_results=print_results)

From 0113e1865500a5e16e7be7947a8be52f0a08fb41 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 12 Aug 2019 15:58:13 -0400
Subject: [PATCH 06/42] Update to generalizability estimators for IPTW update

---
 tests/test_generalize.py              | 49 +++++++++++++++++++++++++++
 zepid/causal/generalize/estimators.py | 23 +++++++------
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/tests/test_generalize.py b/tests/test_generalize.py
index bbd5e77..cf52be0 100644
--- a/tests/test_generalize.py
+++ b/tests/test_generalize.py
@@ -18,6 +18,7 @@ def df_r():
 def df_c():
     df = ze.load_generalize_data(True)
     df['W_sq'] = df['W'] ** 2
+    df['weight'] = 3
     return df
 
 
@@ -77,6 +78,22 @@ def test_transport_iptw(self, df_c):
         npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.1372, atol=1e-4)
 
+    def test_generalize_weight(self, df_c):
+        ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight')
+        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.treatment_model('L', print_results=False)
+        ipsw.fit()
+        npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5)
+        npt.assert_allclose(ipsw.risk_ratio, 1.167213, atol=1e-4)
+
+    def test_transport_weight(self, df_c):
+        ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight')
+        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.treatment_model('L', print_results=False)
+        ipsw.fit()
+        npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5)
+        npt.assert_allclose(ipsw.risk_ratio, 1.1372, atol=1e-4)
+
 
 class TestGTransport:
 
@@ -113,6 +130,20 @@ def test_transport_conf(self, df_c):
         npt.assert_allclose(gtf.risk_difference, 0.042574, atol=1e-5)
         npt.assert_allclose(gtf.risk_ratio, 1.124257, atol=1e-4)
 
+    def test_generalize_weight(self, df_c):
+        gtf = GTransportFormula(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight')
+        gtf.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
+        gtf.fit()
+        npt.assert_allclose(gtf.risk_difference, 0.048949, atol=1e-5)
+        npt.assert_allclose(gtf.risk_ratio, 1.149556, atol=1e-4)
+
+    def test_transport_weight(self, df_c):
+        gtf = GTransportFormula(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight')
+        gtf.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
+        gtf.fit()
+        npt.assert_allclose(gtf.risk_difference, 0.042574, atol=1e-5)
+        npt.assert_allclose(gtf.risk_ratio, 1.124257, atol=1e-4)
+
 
 class TestAIPSW:
 
@@ -163,3 +194,21 @@ def test_transport_conf(self, df_c):
         aipw.fit()
         npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5)
         npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4)
+
+    def test_generalize_weight(self, df_c):
+        aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight')
+        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.treatment_model('L', print_results=False)
+        aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
+        aipw.fit()
+        npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5)
+        npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4)
+
+    def test_transport_weight(self, df_c):
+        aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight')
+        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.treatment_model('L', print_results=False)
+        aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
+        aipw.fit()
+        npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5)
+        npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4)
diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py
index c331007..106bc47 100644
--- a/zepid/causal/generalize/estimators.py
+++ b/zepid/causal/generalize/estimators.py
@@ -322,9 +322,7 @@ class GTransportFormula:
         Whether the problem is a generalizability (True) problem or a transportability (False) problem. See notes
         for further details on the difference between the two estimation methods
     weights : None, str, optional
-        If there are associated sampling (or other types of) weights, these can be included in this statement.
-        Sampling weights may be used for a transportability problem? Either way, I would like to keep this as an
-        option (to mirror TimeFixedGFormula)
+        Optional argument for weights. Can be used to input inverse probability of missing weights
 
     Note
     ----
@@ -538,10 +536,7 @@ class AIPSW:
         Whether the problem is a generalizability (True) problem or a transportability (False) problem. See notes
         for further details on the difference between the two estimation methods
     weights : None, str, optional
-        For conditionally randomized trials, or observational research, inverse probability of treatment weights
-        can be used to adjust for confounding in IPSW. Before estimating the effect measures, this weight vector
-        and the IPSW are multiplied to calculate new weights
-        When weights is None, the data is assumed to come from a randomized trial, and does not need to be adjusted
+        Optional argument for weights. Can be used to input inverse probability of missing weights
 
     Note
     ----
@@ -710,8 +705,13 @@ def outcome_model(self, model, outcome_type='binary', print_results=True):
 
         # Modeling the outcome
         df = self.df[self.sample].copy()
-        m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist)
-        self._outcome_model = m.fit()
+
+        if self.weight is None:
+            m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist)
+            self._outcome_model = m.fit()
+        else:
+            m = smf.glm(self.outcome+' ~ '+model, df, family=linkdist, freq_weights=df[self.weight])
+            self._outcome_model = m.fit()
 
         # Printing results of the model and if any observations were dropped
         if print_results:
@@ -740,9 +740,10 @@ def fit(self):
 
         if self.weight is not None:
             if self.iptw is None:
-                self.df['__ipw__'] = self.ipsw * self.sample[self.weight]
+                self.df['__ipw__'] = self.ipsw * self.df[self.weight]
             else:
-                self.df['__ipw__'] = self.ipsw * self.iptw * self.sample[self.weight]
+                self.df['__ipw__'] = self.ipsw * self.iptw * self.df[self.weight]
+                print(self.ipsw * self.iptw * self.df[self.weight])
         else:
             if self.iptw is None:
                 self.df['__ipw__'] = self.ipsw

From 4ef25b0cd16b68a7daa0ad956cc10ee5ad469424 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 12 Aug 2019 15:58:34 -0400
Subject: [PATCH 07/42] Fixing issue with custom weights in IPTW

---
 zepid/causal/ipw/IPTW.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py
index 71769ea..284b583 100644
--- a/zepid/causal/ipw/IPTW.py
+++ b/zepid/causal/ipw/IPTW.py
@@ -343,12 +343,12 @@ def fit(self, continuous_distribution='gaussian'):
             if self._weight_ is None:
                 df['_ipfw_'] = self.iptw
             else:
-                df['_ipfw_'] = self.iptw * self._weight_
+                df['_ipfw_'] = self.iptw * self.df[self._weight_]
         else:
             if self._weight_ is None:
                 df['_ipfw_'] = self.iptw * self.ipmw
             else:
-                df['_ipfw_'] = self.iptw * self.ipmw * self._weight_
+                df['_ipfw_'] = self.iptw * self.ipmw * self.df[self._weight_]
         df = df.dropna()
 
         if self._continuous_outcome:

From 8d238fe074658e65424d46ab4f721fa0c01e256f Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 12 Aug 2019 15:58:55 -0400
Subject: [PATCH 08/42] Updating CHANGELOG for v0.8.1

---
 CHANGELOG.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 55fb2af..004bdee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 ## Change logs
 
+### v0.8.1
+`IPSW` and `AIPSW` now natively support adjusting for confounding. Both now have the `treatment_model()` function, 
+which calculates the inverse probability of treatment weights. How weights are handled in `AIPSW` are updated. They 
+are used in both the weight and the outcome models.
+
+`IPSW` and `AIPSW` both add censoring...
+
+`TimeFixedGFormula` has added support for the average treatment effect in the treated (ATT), and average treatment 
+effect in the untreated (ATU). 
+
+Background refactoring for IPTW. `utils.py` now contains a function to calculate inverse probability of treatment 
+weights. The function `iptw_calculator` is used by `IPTW`, `AIPTW`, `IPSW`, and `AIPSW` to calculate the weights now
+
 ### v0.8.0
 `IPTW` had a massive overhaul. It now follows a similar structure to `AIPTW` and other causal inference methods. 
 One *major* change is that missing data is dropped before any calculations. Therefore, if missing data was present for

From 39035b96561df38a13e0f4589026add5cc286627 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Fri, 16 Aug 2019 08:17:52 -0400
Subject: [PATCH 09/42] Removing extraneous print statement from G-estimation

---
 zepid/causal/snm/g_estimation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/zepid/causal/snm/g_estimation.py b/zepid/causal/snm/g_estimation.py
index e9c38fc..14044f2 100644
--- a/zepid/causal/snm/g_estimation.py
+++ b/zepid/causal/snm/g_estimation.py
@@ -355,7 +355,6 @@ def fit(self, solver='closed', starting_value=None, alpha_value=0, tolerance=1e-
             y_vals = patsy.dmatrix(self._snm_ + ' - 1', yf, return_type='dataframe')
 
             # Solving for the array of Psi values
-            print(df.columns)
             self.psi = self._closed_form_solver_(treat=self.exposure,
                                                  model=self.exposure + ' ~ ' + self._treatment_model,
                                                  df=df,

From ae93874e92310c29ca5f559202f3f03d30c7236e Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Fri, 16 Aug 2019 08:48:01 -0400
Subject: [PATCH 10/42] Adding better Q-model warning #119

---
 zepid/causal/doublyrobust/AIPW.py     |  2 +-
 zepid/causal/doublyrobust/TMLE.py     |  3 ++
 zepid/causal/generalize/estimators.py |  7 ++++
 zepid/causal/gformula/TimeFixed.py    | 49 ++++++++++++++++++++++-----
 zepid/causal/snm/g_estimation.py      |  3 ++
 5 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/zepid/causal/doublyrobust/AIPW.py b/zepid/causal/doublyrobust/AIPW.py
index 901dfc8..119bec9 100644
--- a/zepid/causal/doublyrobust/AIPW.py
+++ b/zepid/causal/doublyrobust/AIPW.py
@@ -277,7 +277,7 @@ def outcome_model(self, model, continuous_distribution='gaussian', print_results
             Whether to print the fitted model results. Default is True (prints results)
         """
         if self.exposure not in model:
-            warnings.warn("It looks like the exposure variable is missing from the outcome model", UserWarning)
+            warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
 
         self._out_model = self.outcome + ' ~ ' + model
 
diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index bad0ba7..fcc5191 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -346,6 +346,9 @@ def outcome_model(self, model, custom_model=None, bound=False, print_results=Tru
             Distribution to use for continuous outcomes. Options are 'gaussian' for normal distributions and 'poisson'
             for Poisson distributions
         """
+        if self.exposure not in model:
+            warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
+
         self._out_model = self.outcome + ' ~ ' + model
 
         if self._miss_flag:
diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py
index 106bc47..f154a53 100644
--- a/zepid/causal/generalize/estimators.py
+++ b/zepid/causal/generalize/estimators.py
@@ -1,3 +1,4 @@
+import warnings
 import numpy as np
 import pandas as pd
 import statsmodels.api as sm
@@ -395,6 +396,9 @@ def outcome_model(self, model, print_results=True):
         print_results : bool, optional
             Whether to print the logistic regression results to the terminal. Default is True
         """
+        if self.exposure not in model:
+            warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
+
         if self.outcome_type == 'binary':
             linkdist = sm.families.family.Binomial()
         elif self.outcome_type == 'normal':
@@ -694,6 +698,9 @@ def outcome_model(self, model, outcome_type='binary', print_results=True):
         print_results : bool, optional
             Whether to print the logistic regression results to the terminal. Default is True
         """
+        if self.exposure not in model:
+            warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
+
         if outcome_type == 'binary':
             linkdist = sm.families.family.Binomial()
         elif outcome_type == 'normal':
diff --git a/zepid/causal/gformula/TimeFixed.py b/zepid/causal/gformula/TimeFixed.py
index f459064..c6be6fb 100644
--- a/zepid/causal/gformula/TimeFixed.py
+++ b/zepid/causal/gformula/TimeFixed.py
@@ -149,7 +149,14 @@ class TimeFixedGFormula:
     """
     def __init__(self, df, exposure, outcome, exposure_type='binary', outcome_type='binary', standardize='population',
                  weights=None):
-        self.gf = df.copy()
+        if df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0] != df.shape[0]:
+            warnings.warn("There is missing data that is not the outcome in the data set. TimeFixedGFormula will drop "
+                          "all missing data that is not missing outcome data. TimeFixedGFormula will fit "
+                          + str(df.dropna(subset=[d for d in df.columns if d != outcome]).shape[0]) +
+                          ' of ' + str(df.shape[0]) + ' observations', UserWarning)
+            self.gf = df.copy().dropna(subset=[d for d in df.columns if d != outcome]).reset_index()
+        else:
+            self.gf = df.copy().reset_index()
         self.exposure = exposure
         self.outcome = outcome
 
@@ -190,6 +197,9 @@ def outcome_model(self, model, print_results=True):
         print_results : bool, optional
             Whether to print the logistic regression results to the terminal. Default is True
         """
+        if self.exposure not in model:
+            warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
+
         if self.outcome_type == 'binary':
             linkdist = sm.families.family.Binomial()
         elif self.outcome_type == 'normal':
@@ -213,7 +223,7 @@ def outcome_model(self, model, print_results=True):
         if print_results:
             print(self._outcome_model.summary())
 
-    def fit(self, treatment):
+    def fit(self, treatment, predict_missing=True):
         """Fit the parametric g-formula as specified. Binary and multivariate treatments are available. This
         implementation has three options for the binary treatment courses. For multivariate treatments, the user must
         specify custom treatment plans.
@@ -229,6 +239,12 @@ def fit(self, treatment):
               * custom  -create a custom treatment. When specifying this, the dataframe must be referred to as 'g' The
                 following is an example that selects those whose age is 25 or older and are females;
                 ``treatment="((g['age0']>=25) & (g['male']==0))``
+        predict_missing : bool, optional
+            Whether missing outcome values should be predicted via the model fit to the data with observed outcomes.
+            Default is set to True, which assumes that outcome data is missing completely at random conditional on
+            the variables included in the outcome model. Note this is the less restrictive assumption regarding missing
+            outcome data. If set to False, missing outcome data is assumed to be missing completely at random
+            (also referred to as non-informative censoring)
 
         Returns
         -------
@@ -277,8 +293,13 @@ def fit(self, treatment):
         # Getting predictions
         g[self.outcome] = np.nan
         g[self.outcome] = self._outcome_model.predict(g)
+        if not predict_missing:
+            g[self.outcome] = np.where(self.gf[self.outcome].isna(), np.nan, g[self.outcome])
+
+        self.predicted_df = g
 
         if self._weights is None:  # unweighted marginal estimate
+            g = g.dropna()
             if self.standardize == 'population':
                 self.marginal_outcome = np.mean(g[self.outcome])
             elif self.standardize == 'exposed':
@@ -286,18 +307,17 @@ def fit(self, treatment):
             else:
                 self.marginal_outcome = np.mean(g.loc[self.gf[self.exposure] == 0, self.outcome])
         else:  # weighted marginal estimate
+            g = g.dropna()
             if self.standardize == 'population':
-                self.marginal_outcome = np.average(g[self.outcome], weights=self.gf[self._weights])
+                self.marginal_outcome = np.average(g[self.outcome], weights=g[self._weights])
             elif self.standardize == 'exposed':
                 self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 1, self.outcome],
-                                                   weights=self.gf[self._weights])
+                                                   weights=g[self._weights])
             else:
                 self.marginal_outcome = np.average(g.loc[self.gf[self.exposure] == 0, self.outcome],
-                                                   weights=self.gf[self._weights])
+                                                   weights=g[self._weights])
 
-        self.predicted_df = g
-
-    def fit_stochastic(self, p, conditional=None, samples=100, seed=None):
+    def fit_stochastic(self, p, conditional=None, samples=100, predict_missing=True, seed=None):
         """Fits the g-formula for a stochastic intervention. As currently implemented, `p` percent of the population
         is randomly treated. This process is repeated `n` times and the mean is the marginal stochastic outcome.
 
@@ -310,6 +330,12 @@ def fit_stochastic(self, p, conditional=None, samples=100, seed=None):
             of the list of probabilities in 'p'
         samples: int, optional
             Number of resamples to calculate the marginal outcome. Default is 100
+        predict_missing : bool, optional
+            Whether missing outcome values should be predicted via the model fit to the data with observed outcomes.
+            Default is set to True, which assumes that outcome data is missing completely at random conditional on
+            the variables included in the outcome model. Note this is the less restrictive assumption regarding missing
+            outcome data. If set to False, missing outcome data is assumed to be missing completely at random
+            (also referred to as non-informative censoring)
         seed: int, optional
             Seed for the random process selection
 
@@ -365,6 +391,10 @@ def fit_stochastic(self, p, conditional=None, samples=100, seed=None):
             # Getting predictions
             g[self.outcome] = np.nan
             g[self.outcome] = self._outcome_model.predict(g)
+            if not predict_missing:
+                g[self.outcome] = np.where(self.gf[self.outcome].isna(), np.nan, g[self.outcome])
+                g = g.dropna()
+
             if self._weights is None:  # unweighted marginal estimate
                 if self.standardize == 'population':
                     marginals.append(np.mean(g[self.outcome]))
@@ -557,6 +587,9 @@ def outcome_model(self, model, print_results=True):
         print_results : bool, optional
             Whether to print the logistic regression results to the terminal. Default is True
         """
+        if self.exposure not in model:
+            warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
+
         # Modeling the outcome
         linkdist = sm.families.family.Binomial()
         if self._weights is None:
diff --git a/zepid/causal/snm/g_estimation.py b/zepid/causal/snm/g_estimation.py
index 14044f2..9cd63f8 100644
--- a/zepid/causal/snm/g_estimation.py
+++ b/zepid/causal/snm/g_estimation.py
@@ -225,6 +225,9 @@ def structural_nested_model(self, model):
             interaction terms as needed. Interactions should be indicated via patsy magic.
             For example, 'A + A:V + A:C'
         """
+        if self.exposure not in model:
+            warnings.warn("It looks like '" + self.exposure + "' is not included in the structural nested model.")
+
         self._snm_ = model
 
     def missing_model(self, model_denominator, model_numerator=None, stabilized=True, bound=False, print_results=True):

From 013786143fcd8b0f251f8f6856d4cda6447268cd Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Fri, 16 Aug 2019 08:57:55 -0400
Subject: [PATCH 11/42] Better naming convention for IPSW model functions

---
 ...pid.causal.generalize.estimators.AIPSW.rst |  3 +-
 ...epid.causal.generalize.estimators.IPSW.rst |  3 +-
 zepid/causal/generalize/estimators.py         | 39 +++++++++++--------
 3 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst b/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst
index 6d7bd6e..00959a2 100644
--- a/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst
+++ b/docs/Reference/generated/zepid.causal.generalize.estimators.AIPSW.rst
@@ -10,7 +10,8 @@ zepid.causal.generalize.estimators.AIPSW
 
    .. autosummary::
 
-      ~AIPSW.weight_model
+      ~AIPSW.sampling_model
+      ~AIPSW.treatment_model
       ~AIPSW.outcome_model
       ~AIPSW.fit
       ~AIPSW.summary
diff --git a/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst b/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst
index 8d4e889..90b01b9 100644
--- a/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst
+++ b/docs/Reference/generated/zepid.causal.generalize.estimators.IPSW.rst
@@ -10,6 +10,7 @@ zepid.causal.generalize.estimators.IPSW
 
    .. autosummary::
 
-      ~IPSW.regression_models
+      ~IPSW.sampling_model
+      ~IPSW.treatment_model
       ~IPSW.fit
       ~IPSW.summary
diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py
index f154a53..b00ef54 100644
--- a/zepid/causal/generalize/estimators.py
+++ b/zepid/causal/generalize/estimators.py
@@ -70,27 +70,22 @@ class IPSW:
     Generalizability for RCT results
 
     >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S', generalize=True)
-    >>> ipsw.regression_models('L + W + L:W', print_results=False)
+    >>> ipsw.sampling_model('L + W + L:W', print_results=False)
     >>> ipsw.fit()
     >>> ipsw.summary()
 
     Transportability for RCT results
 
     >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S', generalize=False)
-    >>> ipsw.regression_models('L + W + L:W', print_results=False)
+    >>> ipsw.sampling_model('L + W + L:W', print_results=False)
     >>> ipsw.fit()
     >>> ipsw.summary()
 
-    For observational studies, IPTW can be used to account for confounders
+    For observational studies, IPTW can be used to account for confounders via the `treatment_model()` function
 
-    >>> from zepid.causal.ipw import IPTW
-    >>> iptw = IPTW(df, treatment='A')
-    >>> iptw.regression_models('L')
-    >>> iptw.fit()
-    >>> df['iptw'] = iptw.Weight
-
-    >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S', weights='iptw', generalize=False)
-    >>> ipsw.regression_models('L + W + L:W', print_results=False)
+    >>> ipsw = IPSW(df, exposure='A', outcome='Y', selection='S')
+    >>> ipsw.sampling_model('L + W + L:W', print_results=False)
+    >>> ipsw.treatment_model('L', print_results=False)
     >>> ipsw.fit()
     >>> ipsw.summary()
 
@@ -123,7 +118,7 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No
         self.risk_ratio = None
         self._denominator_model = False
 
-    def weight_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True):
+    def sampling_model(self, model_denominator, model_numerator='1', bound=None, stabilized=True, print_results=True):
         """Logistic regression model(s) for estimating sampling weights. The model denominator must be specified for
         both stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the
         stabilization factor for the weight numerator. By default model results are returned
@@ -558,18 +553,28 @@ class AIPSW:
     >>> from zepid.causal.generalize import AIPSW
     >>> df = load_generalize_data(False)
 
-    Generalizability
+    Generalizability for RCT
 
     >>> aipw = AIPSW(df, exposure='A', outcome='Y', selection='S', generalize=True)
-    >>> aipw.weight_model('L + W_sq', print_results=False)
+    >>> aipw.sampling_model('L + W_sq', print_results=False)
     >>> aipw.outcome_model('A + L + L:A + W + W:A + W:A:L', print_results=False)
     >>> aipw.fit()
     >>> aipw.summary()
 
-    Transportability
+    Transportability for RCT
+
+    >>> aipw = AIPSW(df, exposure='A', outcome='Y', selection='S', generalize=False)
+    >>> aipw.sampling_model('L + W_sq', print_results=False)
+    >>> aipw.outcome_model('A + L + L:A + W + W:A + W:A:L', print_results=False)
+    >>> aipw.fit()
+    >>> aipw.summary()
+
+    Transportability for Observational study
 
+    >>> df = load_generalize_data(True)
     >>> aipw = AIPSW(df, exposure='A', outcome='Y', selection='S', generalize=False)
-    >>> aipw.weight_model('L + W_sq', print_results=False)
+    >>> aipw.sampling_model('L + W_sq', print_results=False)
+    >>> aipw.treatment_model('L', print_results=False)
     >>> aipw.outcome_model('A + L + L:A + W + W:A + W:A:L', print_results=False)
     >>> aipw.fit()
     >>> aipw.summary()
@@ -606,7 +611,7 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No
         self.risk_difference = None
         self.risk_ratio = None
 
-    def weight_model(self, model_denominator, model_numerator='1', stabilized=True, print_results=True):
+    def sampling_model(self, model_denominator, model_numerator='1', stabilized=True, print_results=True):
         """Logistic regression model(s) for estimating IPSW. The model denominator must be specified for both
         stabilized and unstabilized weights. The optional argument 'model_numerator' allows specification of the
         stabilization factor for the weight numerator. By default model results are returned

From 925e3a2162928d6457a93bb59501e1e21d846a4a Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Fri, 16 Aug 2019 08:59:56 -0400
Subject: [PATCH 12/42] Updating tests to reflect IPSW and AIPSW function
 naming changes

---
 tests/test_generalize.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tests/test_generalize.py b/tests/test_generalize.py
index cf52be0..a4d277e 100644
--- a/tests/test_generalize.py
+++ b/tests/test_generalize.py
@@ -27,7 +27,7 @@ class TestIPSW:
     def test_stabilize_error(self, df_c):
         ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S')
         with pytest.raises(ValueError):
-            ipsw.weight_model('L + W_sq', model_numerator='W', stabilized=False, print_results=False)
+            ipsw.sampling_model('L + W_sq', model_numerator='W', stabilized=False, print_results=False)
 
     def test_no_model_error(self, df_c):
         ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True)
@@ -36,35 +36,35 @@ def test_no_model_error(self, df_c):
 
     def test_generalize_unstabilized(self, df_r):
         ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S')
-        ipsw.weight_model('L + W_sq', stabilized=False, print_results=False)
+        ipsw.sampling_model('L + W_sq', stabilized=False, print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4)
 
     def test_generalize_stabilized(self, df_r):
         ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S')
-        ipsw.weight_model('L + W_sq', stabilized=True, print_results=False)
+        ipsw.sampling_model('L + W_sq', stabilized=True, print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.046809, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.13905, atol=1e-4)
 
     def test_transport_unstabilized(self, df_r):
         ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False)
-        ipsw.weight_model('L + W_sq', stabilized=False, print_results=False)
+        ipsw.sampling_model('L + W_sq', stabilized=False, print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4)
 
     def test_transport_stabilized(self, df_r):
         ipsw = IPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False)
-        ipsw.weight_model('L + W_sq', print_results=False)
+        ipsw.sampling_model('L + W_sq', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.034896, atol=1e-5)
         npt.assert_allclose(ipsw.risk_ratio, 1.097139, atol=1e-4)
 
     def test_generalize_iptw(self, df_c):
         ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True)
-        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.sampling_model('L + W + W_sq', print_results=False)
         ipsw.treatment_model('L', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5)
@@ -72,7 +72,7 @@ def test_generalize_iptw(self, df_c):
 
     def test_transport_iptw(self, df_c):
         ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False)
-        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.sampling_model('L + W + W_sq', print_results=False)
         ipsw.treatment_model('L', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5)
@@ -80,7 +80,7 @@ def test_transport_iptw(self, df_c):
 
     def test_generalize_weight(self, df_c):
         ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight')
-        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.sampling_model('L + W + W_sq', print_results=False)
         ipsw.treatment_model('L', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.055034, atol=1e-5)
@@ -88,7 +88,7 @@ def test_generalize_weight(self, df_c):
 
     def test_transport_weight(self, df_c):
         ipsw = IPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight')
-        ipsw.weight_model('L + W + W_sq', print_results=False)
+        ipsw.sampling_model('L + W + W_sq', print_results=False)
         ipsw.treatment_model('L', print_results=False)
         ipsw.fit()
         npt.assert_allclose(ipsw.risk_difference, 0.047296, atol=1e-5)
@@ -152,7 +152,7 @@ def test_no_model_error(self, df_c):
         with pytest.raises(ValueError):
             aipw.fit()
 
-        aipw.weight_model('L', print_results=False)
+        aipw.sampling_model('L', print_results=False)
         with pytest.raises(ValueError):
             aipw.fit()
 
@@ -163,7 +163,7 @@ def test_no_model_error(self, df_c):
 
     def test_generalize(self, df_r):
         aipw = AIPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=True)
-        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.sampling_model('L + W_sq', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()
         npt.assert_allclose(aipw.risk_difference, 0.061382, atol=1e-5)
@@ -171,7 +171,7 @@ def test_generalize(self, df_r):
 
     def test_transport(self, df_r):
         aipw = AIPSW(df_r, exposure='A', outcome='Y', selection='S', generalize=False)
-        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.sampling_model('L + W_sq', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()
         npt.assert_allclose(aipw.risk_difference, 0.05479, atol=1e-5)
@@ -179,7 +179,7 @@ def test_transport(self, df_r):
 
     def test_generalize_conf(self, df_c):
         aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True)
-        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.sampling_model('L + W_sq', print_results=False)
         aipw.treatment_model('L', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()
@@ -188,7 +188,7 @@ def test_generalize_conf(self, df_c):
 
     def test_transport_conf(self, df_c):
         aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False)
-        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.sampling_model('L + W_sq', print_results=False)
         aipw.treatment_model('L', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()
@@ -197,7 +197,7 @@ def test_transport_conf(self, df_c):
 
     def test_generalize_weight(self, df_c):
         aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight')
-        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.sampling_model('L + W_sq', print_results=False)
         aipw.treatment_model('L', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()
@@ -206,7 +206,7 @@ def test_generalize_weight(self, df_c):
 
     def test_transport_weight(self, df_c):
         aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight')
-        aipw.weight_model('L + W_sq', print_results=False)
+        aipw.sampling_model('L + W_sq', print_results=False)
         aipw.treatment_model('L', print_results=False)
         aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
         aipw.fit()

From c6898c2f3e3010c0a1dac7d4679cee2a809ca989 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Fri, 16 Aug 2019 09:11:28 -0400
Subject: [PATCH 13/42] Resolving issue with outcome_model check and
 categorical treatments for g-formula

---
 zepid/causal/gformula/TimeFixed.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/zepid/causal/gformula/TimeFixed.py b/zepid/causal/gformula/TimeFixed.py
index c6be6fb..2df065a 100644
--- a/zepid/causal/gformula/TimeFixed.py
+++ b/zepid/causal/gformula/TimeFixed.py
@@ -197,8 +197,9 @@ def outcome_model(self, model, print_results=True):
         print_results : bool, optional
             Whether to print the logistic regression results to the terminal. Default is True
         """
-        if self.exposure not in model:
-            warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
+        if type(self.exposure) is not list:
+            if self.exposure not in model:
+                warnings.warn("It looks like '" + self.exposure + "' is not included in the outcome model.")
 
         if self.outcome_type == 'binary':
             linkdist = sm.families.family.Binomial()

From 27ee7ba756e88fc82b12bd7ecb1291113a2aa8ee Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 3 Oct 2019 08:36:23 -0400
Subject: [PATCH 14/42] Copying dataframe rather than using actual

---
 zepid/causal/ipw/IPTW.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py
index 284b583..8e9e9d5 100644
--- a/zepid/causal/ipw/IPTW.py
+++ b/zepid/causal/ipw/IPTW.py
@@ -338,7 +338,7 @@ def fit(self, continuous_distribution='gaussian'):
         ind = sm.cov_struct.Independence()
         full_msm = self.outcome + ' ~ ' + self.ms_model
 
-        df = self.df
+        df = self.df.copy()
         if self.ipmw is None:
             if self._weight_ is None:
                 df['_ipfw_'] = self.iptw

From 4a91e6ac1202849397b25e0d9245424ef4f25f8c Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 3 Oct 2019 08:36:41 -0400
Subject: [PATCH 15/42] Updating changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 004bdee..13c6172 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,9 @@ are used in both the weight and the outcome models.
 `TimeFixedGFormula` has added support for the average treatment effect in the treated (ATT), and average treatment 
 effect in the untreated (ATU). 
 
+Improved warnings when the treatment/exposure variable is not included in models that it should be in (such as the 
+outcome model or in structural nested models).
+
 Background refactoring for IPTW. `utils.py` now contains a function to calculate inverse probability of treatment 
 weights. The function `iptw_calculator` is used by `IPTW`, `AIPTW`, `IPSW`, and `AIPSW` to calculate the weights now
 

From cfe7aa447a94f579c42b66781852473b405808f9 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 3 Oct 2019 08:37:50 -0400
Subject: [PATCH 16/42] Adding TODO list to generalize

---
 zepid/causal/generalize/estimators.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py
index b00ef54..7c3a1cc 100644
--- a/zepid/causal/generalize/estimators.py
+++ b/zepid/causal/generalize/estimators.py
@@ -212,6 +212,8 @@ def treatment_model(self, model_denominator, model_numerator='1', bound=None, st
                                           standardize='population',
                                           bound=bound, print_results=print_results)
 
+    # TODO add missing_model
+
     def fit(self):
         """Uses the calculated IPSW to obtain the risk difference and risk ratio from the sample. If weights are
         provided in the initial step, those weights are multiplied with IPSW to obtain the overall weights.
@@ -455,8 +457,8 @@ def fit(self):
             yn = self._outcome_model.predict(dfn)
 
             if self.weight is not None:
-                r1 = np.average(ya, weights=self.df[self.weight])
-                r0 = np.average(yn, weights=self.df[self.weight])
+                r1 = np.average(ya, weights=self.target[self.weight])
+                r0 = np.average(yn, weights=self.target[self.weight])
             else:
                 r1 = np.mean(ya)
                 r0 = np.mean(yn)
@@ -755,7 +757,6 @@ def fit(self):
                 self.df['__ipw__'] = self.ipsw * self.df[self.weight]
             else:
                 self.df['__ipw__'] = self.ipsw * self.iptw * self.df[self.weight]
-                print(self.ipsw * self.iptw * self.df[self.weight])
         else:
             if self.iptw is None:
                 self.df['__ipw__'] = self.ipsw

From d990d1fc1445e31f4443d26ad9035220a5164809 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 3 Oct 2019 09:32:39 -0400
Subject: [PATCH 17/42] Updating version-ing

---
 CHANGELOG.md     | 2 +-
 zepid/version.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 13c6172..c3f6f31 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 ## Change logs
 
-### v0.8.1
+### v0.8.2
 `IPSW` and `AIPSW` now natively support adjusting for confounding. Both now have the `treatment_model()` function, 
 which calculates the inverse probability of treatment weights. How weights are handled in `AIPSW` are updated. They 
 are used in both the weight and the outcome models.
diff --git a/zepid/version.py b/zepid/version.py
index ef72cc0..4ca39e7 100644
--- a/zepid/version.py
+++ b/zepid/version.py
@@ -1 +1 @@
-__version__ = '0.8.1'
+__version__ = '0.8.2'

From 148b4c3a6b5d7ecfa155d238829a3fe0237ca072 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 18 Nov 2019 11:32:22 -0500
Subject: [PATCH 18/42] Stochastic TMLE -- general structure

---
 zepid/causal/doublyrobust/TMLE.py     | 376 ++++++++++++++++++++++++--
 zepid/causal/doublyrobust/__init__.py |   2 +-
 2 files changed, 356 insertions(+), 22 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index fcc5191..5d20aab 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -7,7 +7,7 @@
 from scipy.stats import logistic, norm
 
 from zepid.causal.utils import propensity_score
-from zepid.calc import probability_to_odds
+from zepid.calc import probability_to_odds, odds_to_probability
 from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, missing_machine_learner, _bounding_,
                                 plot_kde, plot_love, standardized_mean_differences, positivity,
                                 plot_kde_accuracy, outcome_accuracy)
@@ -15,13 +15,14 @@
 
 class TMLE:
     r"""Implementation of target maximum likelihood estimator. This implementation calculates TMLE for a
-    time-fixed exposure and a single time-point outcome.It uses standard regression models to calculate the estimate
-    of interest. The TMLE estimator allows users to instead use machine learning algorithms from sklearn.
+    time-fixed exposure and a single time-point outcome. By default standard parametric regression models are used to
+    calculate the estimate of interest. The TMLE estimator allows users to instead use machine learning algorithms
+    from sklearn and PyGAM.
 
     Note
     ----
-    Support for machine learning will be dropped in v0.9.0 due to poor confidence interval coverage. There will be
-    new cross-fitting procedures and classes for causal inference with machine learning.
+    Valid confidence intervals are only attainable with certain machine learning algorithms. These algorithms must be
+    Donsker class for valid confidence intervals. GAM and LASSO are examples of alogorithms that are Donsker class
 
     Parameters
     ----------
@@ -167,8 +168,8 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005):
             self._continuous_min = np.min(df[outcome])
             self._continuous_max = np.max(df[outcome])
             self._cb = continuous_bound
-            self.df[outcome] = self._unit_bounds(y=df[outcome], mini=self._continuous_min,
-                                                 maxi=self._continuous_max, bound=self._cb)
+            self.df[outcome] = _tmle_unit_bounds_(y=df[outcome], mini=self._continuous_min,
+                                                  maxi=self._continuous_max, bound=self._cb)
 
         self._out_model = None
         self._exp_model = None
@@ -468,13 +469,13 @@ def fit(self):
         delta = np.where(self.df[self._missing_indicator] == 1, 1, 0)
         if self._continuous_outcome:
             # Calculating Average Treatment Effect
-            Qstar = self._unit_unbound(Qstar, mini=self._continuous_min, maxi=self._continuous_max)
-            Qstar1 = self._unit_unbound(Qstar1, mini=self._continuous_min, maxi=self._continuous_max)
-            Qstar0 = self._unit_unbound(Qstar0, mini=self._continuous_min, maxi=self._continuous_max)
+            Qstar = _tmle_unit_unbound_(Qstar, mini=self._continuous_min, maxi=self._continuous_max)
+            Qstar1 = _tmle_unit_unbound_(Qstar1, mini=self._continuous_min, maxi=self._continuous_max)
+            Qstar0 = _tmle_unit_unbound_(Qstar0, mini=self._continuous_min, maxi=self._continuous_max)
 
             self.average_treatment_effect = np.nanmean(Qstar1 - Qstar0)
             # Influence Curve for CL
-            y_unbound = self._unit_unbound(self.df[self.outcome], mini=self._continuous_min, maxi=self._continuous_max)
+            y_unbound = _tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min, maxi=self._continuous_max)
             ic = np.where(delta == 1,
                           HAW * (y_unbound - Qstar) + (Qstar1 - Qstar0) - self.average_treatment_effect,
                           Qstar1 - Qstar0 - self.average_treatment_effect)
@@ -800,15 +801,348 @@ def plot_love(self, color_unweighted='r', color_weighted='b', shape_unweighted='
                        shape_unweighted=shape_unweighted, shape_weighted=shape_weighted)
         return ax
 
-    @staticmethod
-    def _unit_bounds(y, mini, maxi, bound):
-        # bounding for continuous outcomes
-        v = (y - mini) / (maxi - mini)
-        v = np.where(np.less(v, bound), bound, v)
-        v = np.where(np.greater(v, 1-bound), 1-bound, v)
-        return v
+
+class StochasticTMLE:
+    r"""Implementation of target maximum likelihood estimator for stochastic treatment plans. This implementation
+    calculates TMLE for a time-fixed exposure and a single time-point outcome under a stochastic treatment plan of
+    interest. By default, standard parametric regression models are used to calculate the estimate of interest. The
+    StochasticTMLE estimator allows users to instead use machine learning algorithms from sklearn and PyGAM.
+
+    Note
+    ----
+    Valid confidence intervals are only attainable with certain machine learning algorithms. These algorithms must be
+    Donsker class for valid confidence intervals. GAM and LASSO are examples of alogorithms that are Donsker class
+
+    Parameters
+    ----------
+    df : DataFrame
+        Pandas dataframe containing the variables of interest
+    exposure : str
+        Column label for the exposure of interest
+    outcome : str
+        Column label for the outcome of interest
+    alpha : float, optional
+        Alpha for confidence interval level. Default is 0.05
+    continuous_bound : float, optional
+        Optional argument to control the bounding feature for continuous outcomes. The bounding process may result
+        in values of 0,1 which are undefined for logit(x). This parameter adds or substracts from the scenarios of
+        0,1 respectively. Default value is 0.0005
+    verbose : bool, optional
+        Optional argument for verbose estimation. With verbose estimation, the model fits for each result are printed
+        to the console. It is highly recommended to turn this parameter to True when conducting model diagnostics
+
+    Note
+    ----
+    TMLE is a doubly-robust substitution estimator. TMLE obtains the target estimate in a single step. The
+    single-step TMLE is described further by van der Laan. For further details, see the listed references.
+
+    Continuous outcomes must be bounded between 0 and 1. TMLE does this automatically for the user. Additionally,
+    the average treatment effect is estimate is back converted to the original scale of Y. When scaling Y as Y*,
+    some values may take the value of 0 or 1, which breaks a logit(Y*) transformation. To avoid this issue, Y* is
+    bounded by the `continuous_bound` argument. The default is 0.0005, the same as R's tmle
+
+    Examples
+    --------
+    Setting up environment
+
+    >>> from zepid import load_sample_data, spline
+    >>> from zepid.causal.doublyrobust import StochasticTMLE
+    >>> df = load_sample_data(False).dropna()
+    >>> df[['cd4_rs1', 'cd4_rs2']] = spline(df, 'cd40', n_knots=3, term=2, restricted=True)
+
+    Estimating TMLE using logistic regression
+
+    >>> tmle = TMLE(df, exposure='art', outcome='dead')
+    >>> # Specifying exposure/treatment model
+    >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+    >>> # Specifying outcome model
+    >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+    >>> # TMLE estimation procedure
+    >>> tmle.fit()
+    >>> # Printing main results
+    >>> tmle.summary()
+    >>> # Extracting risk difference and confidence intervals, respectively
+    >>> tmle.risk_difference
+    >>> tmle.risk_difference_ci
+
+    Estimating TMLE with machine learning algorithm from sklearn
+
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> log1 = LogisticRegression(penalty='l1', random_state=201)
+    >>> tmle = TMLE(df, 'art', 'dead')
+    >>> # custom_model allows specification of machine learning algorithms
+    >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1)
+    >>> tmle.outcome_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1)
+    >>> tmle.fit()
+
+    References
+    ----------
+    Muñoz ID, and Van Der Laan MJ. Population intervention causal effects based on stochastic interventions.
+    Biometrics 68.2 (2012): 541-549.
+
+    Van der Laan MJ, and Sherri R. Targeted learning in data science: causal inference for complex longitudinal
+    studies. Springer Science & Business Media, 2011.
+    """
+    def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, verbose=False):
+        # Going through missing data
+        if df.dropna().shape[0] != df.shape[0]:
+            warnings.warn("There is missing data that is not the outcome in the data set. StochasticTMLE will drop all "
+                          "missing data. StochasticTMLE will fit "
+                          + str(df.dropna().shape[0]) +
+                          ' of ' + str(df.shape[0]) + ' observations', UserWarning)
+            self.df = df.copy().dropna().reset_index()
+        else:
+            self.df = df.copy().reset_index()
+
+        if not df[exposure].value_counts().index.isin([0, 1]).all():
+            raise ValueError("StochasticTMLE only supports binary exposures currently")
+
+        # Manage outcomes
+        if df[outcome].dropna().value_counts().index.isin([0, 1]).all():
+            self._continuous_outcome = False
+            self._cb = 0.0
+        else:
+            self._continuous_outcome = True
+            self._continuous_min = np.min(df[outcome])
+            self._continuous_max = np.max(df[outcome])
+            self._cb = continuous_bound
+            self.df[outcome] = _tmle_unit_bounds_(y=df[outcome], mini=self._continuous_min,
+                                                  maxi=self._continuous_max, bound=self._cb)
+
+        self.exposure = exposure
+        self.outcome = outcome
+
+        # Output attributes
+        self.marginals_vector = None
+        self.marginal_outcome = None
+        self.alpha = alpha
+        self.marginal_ci = None
+        self.conditional_ci = None
+
+        # Storage for items I need later
+        self._outcome_model = None
+        self._q_model = None
+        self._Qinit_ = None
+        self._treatment_model = None
+        self._g_model = None
+        self._denominator_ = None
+        self._verbose_ = verbose
+
+        # Custom model / machine learner storage
+        self._g_custom_ = None
+        self._q_custom_ = None
+
+    def exposure_model(self, model, custom_model=None, bound=False):
+        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature
+
+        Parameters
+        ----------
+        model : str
+            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
+        custom_model : optional
+            Input for a custom model that is used in place of the logit model (default). The model must have the
+            "fit()" and  "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
+            background, TMLE will fit the custom model and generate the predicted probablities
+        bound : float, list, optional
+            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
+            Specifying this argument can improve finite sample performance for random positivity violations. However,
+            truncating weights leads to additional confounding. Default is False, meaning no truncation of
+            predicted probabilities occurs. Providing a single float assumes symmetric trunctation, where values below
+            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
+            asymmetric trunctation, with the first value being the lower bound and the second being the upper bound
+        """
+        self._g_model = self.exposure + ' ~ ' + model
+
+        if custom_model is None:  # Standard parametric regression model
+            fitmodel = propensity_score(self.df, self._g_model, print_results=self._verbose_)
+            pred = fitmodel.predict(self.df)
+        else:  # User-specified prediction model
+            # TODO need to create smart warning system
+            self._exp_model_custom = True
+            data = patsy.dmatrix(model + ' - 1', self.df)
+            pred = exposure_machine_learner(xdata=np.asarray(data), ydata=np.asarray(self.df[self.exposure]),
+                                            ml_model=custom_model, print_results=self._verbose_)
+
+        if bound:  # Bounding predicted probabilities if requested
+            self._denominator_ = _bounding_(self._denominator_, bounds=bound)
+
+        self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred)
+
+    def outcome_model(self, model, custom_model=None, bound=False, continuous_distribution='gaussian'):
+        """Estimation of E(Y|A,L), which is also written sometimes as Q(A,W) or Pr(Y=1|A,W).
+
+        Parameters
+        ----------
+        model : str
+            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
+        custom_model : optional
+            Input for a custom model that is used in place of the logit model (default). The model must have the
+            "fit()" and  "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
+            background, TMLE will fit the custom model and generate the predicted probablities
+        bound : bool, optional
+            This argument should ONLY be used if the outcome is continuous. Value between 0,1 to truncate the bounded
+            predicted outcomes. Default is `False`, meaning no truncation of predicted outcomes occurs (unless a
+            predicted outcome is outside the bounded continuous outcome). Providing a single float assumes symmetric
+            trunctation. A list of floats can be provided for asymmetric trunctation.
+        print_results : bool, optional
+            Whether to print the fitted model results. Default is True (prints results)
+        continuous_distribution : str, optional
+            Distribution to use for continuous outcomes. Options are 'gaussian' for normal distributions and 'poisson'
+            for Poisson distributions
+        """
+        self._q_model = self.outcome + ' ~ ' + model
+
+        # Step 1) Prediction for Q (estimation of Q-model)
+        if custom_model is None:  # Standard parametric regression
+            self._continuous_type = continuous_distribution
+            if self._continuous_outcome:
+                if (continuous_distribution.lower() == 'gaussian') or (continuous_distribution.lower() == 'normal'):
+                    f = sm.families.family.Gaussian()
+                elif continuous_distribution.lower() == 'poisson':
+                    f = sm.families.family.Poisson()
+                else:
+                    raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous "
+                                     "outcomes")
+                log = smf.glm(self._q_model, self.df, family=f).fit()
+            else:
+                f = sm.families.family.Binomial()
+                log = smf.glm(self._q_model, self.df, family=f).fit()
+
+            if self._verbose_:
+                print('==============================================================================')
+                print('Q-model')
+                print(self._outcome_model.summary())
+
+            # Step 2) Estimation under the scenarios
+            self._Qinit_ = log.predict(self.df)
+
+        else:  # User-specified model
+            # TODO need to create smart warning system
+            self._out_model_custom = True
+            data = patsy.dmatrix(model + ' - 1', df)
+            self._Qinit_
+            self._outcome_model
+
+        if not bound:  # Bounding predicted probabilities if requested
+            bound = self._cb
+
+        # This bounding step prevents continuous outcomes from being outside the range
+        self._Qinit_ = _bounding_(self._Qinit_, bounds=bound)
+
+    def fit(self, p, condition=None, samples=100):
+        """Calculate the effect from the predicted exposure probabilities and predicted outcome values using the TMLE
+        procedure. Confidence intervals are calculated using influence curves.
+
+        Note
+        ----
+        Exposure and outcome models must be specified prior to `fit()`
+
+        Returns
+        -------
+        TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and
+        `average _treatment_effect` for continuous outcomes
+        """
+        # Step 4) Calculating clever covariate (HAW)
+        # TODO need to apply p properly
+        haw = p / self._denominator_
+
+        # Step 5) Estimating TMLE
+        epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_)
+
+        # Step 6) Estimating psi
+        q_star_list = []
+        for i in range(samples):
+            # Applying treatment plan
+            df = self.df.copy()
+            if condition is None:
+                df[self.exposure] = np.random.binomial(n=1, p=p, size=df.shape[0])
+            else:
+                df[self.exposure] = np.nan
+                for c, prop in zip(condition, p):
+                    df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0])
+
+            # Outcome model under treatment plan
+            y_star = self._outcome_model.predict(df)
+
+            # Targeted Estimate
+            logit_qstar = np.log(probability_to_odds(y_star)) + epsilon  # NOTE: log(Y^*) + e
+            q_star = odds_to_probability(np.exp(logit_qstar))
+            q_star_list.append(q_star)
+
+        if self._continuous_outcome:
+            self.marginals_vector = _tmle_unit_unbound_(q_star_list,
+                                                      mini=self._continuous_min, maxi=self._continuous_max)
+            y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min,
+                                            maxi=self._continuous_max))
+            yq0_ = _tmle_unit_unbound_(self._Qinit_, mini=self._continuous_min, maxi=self._continuous_max)
+        else:
+            self.marginals_vector = q_star_list
+            y_ = np.array(self.df[self.outcome])
+            yq0_ = self._Qinit_
+
+        self.marginal_outcome = np.mean(self.marginals_vector)
+
+        # Step 7) Estimating Var(psi)
+        if self.alpha == 0.05:  # Without this, won't match R exactly. R relies on 1.96, while I use SciPy
+            zalpha = 1.96
+        else:
+            zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
+        # TODO add remainder of variance calculations
+    
+    def summary(self, decimal=3):
+        if self.marginal_outcome is None:
+            raise ValueError('The fit() statement must be ran before summary()')
+
+        print('======================================================================')
+        print('          Stochastic Targeted Maximum Likelihood Estimator            ')
+        print('======================================================================')
+        fmt = 'Treatment:        {:<15} No. Observations:     {:<20}'
+        print(fmt.format(self.exposure, self.df.shape[0]))
+        fmt = 'Outcome:          {:<15}'
+        print(fmt.format(self.outcome))
+        fmt = 'Q-Model:          {:<15}'
+        print(fmt.format('Logistic'))
+        # TODO add custom model check
+        fmt = 'g-Model:          {:<15}'
+        print(fmt.format('Logistic'))
+
+        # TODO add treatment plan information
+        # TODO add information on m (number of re-samples)
+
+        print('======================================================================')
+        print('Overall incidence:      ', np.round(self.marginal_outcome, decimals=decimal))
+        # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal))
+        print('======================================================================')
+        print('Conditional')
+        # print('95% CL:    ', np.round(self.ci_cond, decimals=decimal))
+        print('======================================================================')
 
     @staticmethod
-    def _unit_unbound(ystar, mini, maxi):
-        # unbounding of bounded continuous outcomes
-        return ystar*(maxi - mini) + mini
+    def targeting_step(y, q_init, iptw, verbose):
+        f = sm.families.family.Binomial()
+        log = sm.GLM(y,  # Outcome / dependent variable
+                     np.repeat(1, y.shape[0]),  # Generating intercept only model
+                     offset=np.log(probability_to_odds(q_init)),  # Offset by g-formula predictions
+                     freq_weights=iptw,  # Weighted by calculated IPW
+                     family=f).fit()
+
+        if verbose:  # Optional argument to print each intermediary result
+            print('==============================================================================')
+            print('Targeting Model')
+            print(log.summary())
+
+        return log.params[0]  # Returns single-step estimated Epsilon term
+
+
+# Functions that all TMLEs can call are below
+def _tmle_unit_bounds_(y, mini, maxi, bound):
+    # bounding for continuous outcomes
+    v = (y - mini) / (maxi - mini)
+    v = np.where(np.less(v, bound), bound, v)
+    v = np.where(np.greater(v, 1-bound), 1-bound, v)
+    return v
+
+
+def _tmle_unit_unbound_(ystar, mini, maxi):
+    # unbounding of bounded continuous outcomes
+    return ystar*(maxi - mini) + mini
diff --git a/zepid/causal/doublyrobust/__init__.py b/zepid/causal/doublyrobust/__init__.py
index fb3d55b..fb76187 100644
--- a/zepid/causal/doublyrobust/__init__.py
+++ b/zepid/causal/doublyrobust/__init__.py
@@ -1,2 +1,2 @@
 from .AIPW import AIPTW
-from .TMLE import TMLE
+from .TMLE import TMLE, StochasticTMLE

From 6b6be91f19f87d0ea65903382232791a801d3b17 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 18 Nov 2019 13:17:28 -0500
Subject: [PATCH 19/42] Moving check_conditional to utils.py

---
 zepid/causal/ipw/IPTW.py | 18 +++---------------
 zepid/causal/utils.py    | 12 ++++++++++++
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py
index 8e9e9d5..83d5fbb 100644
--- a/zepid/causal/ipw/IPTW.py
+++ b/zepid/causal/ipw/IPTW.py
@@ -7,7 +7,7 @@
 from statsmodels.tools.sm_exceptions import DomainWarning
 import matplotlib.pyplot as plt
 
-from zepid.causal.utils import (propensity_score, plot_boxplot, plot_kde, plot_love,
+from zepid.causal.utils import (propensity_score, plot_boxplot, plot_kde, plot_love, stochastic_check_conditional,
                                 standardized_mean_differences, positivity, _bounding_, iptw_calculator)
 
 
@@ -782,7 +782,7 @@ def fit(self, p, conditional=None):
         if conditional is None:
             df['_numer_'] = np.where(df[self.treatment] == 1, p, 1 - p)
         else:
-            self._check_conditional(conditional=conditional)
+            stochastic_check_conditional(df=df, conditional=conditional)
             df['_numer_'] = np.nan
             for c, prop in zip(conditional, p):
                 df['_numer_'] = np.where(eval(c),
@@ -809,7 +809,7 @@ def summary(self, decimal=3):
             raise ValueError('The fit() function must be specified before summary()')
 
         print('======================================================================')
-        print('                       Stochastic IPTW')
+        print('                       Stochastic IPTW                                ')
         print('======================================================================')
         fmt = 'Treatment:        {:<15} No. Observations:     {:<20}'
         print(fmt.format(self.treatment, self.df.shape[0]))
@@ -818,15 +818,3 @@ def summary(self, decimal=3):
         print('======================================================================')
         print('Risk:  ', round(self.marginal_outcome, decimal))
         print('======================================================================')
-
-    def _check_conditional(self, conditional):
-        """Check that conditionals are exclusive for the stochastic fit process. Generates a warning if not true
-        """
-        df = self.df.copy()
-        a = np.array([0] * df.shape[0])
-        for c in conditional:
-            a = np.add(a, np.where(eval(c), 1, 0))
-
-        if np.sum(np.where(a > 1, 1, 0)):
-            warnings.warn("It looks like your conditional categories are NOT exclusive. For appropriate estimation, "
-                          "the conditions that designate each category should be exclusive", UserWarning)
diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py
index afb7c94..29f031c 100644
--- a/zepid/causal/utils.py
+++ b/zepid/causal/utils.py
@@ -609,3 +609,15 @@ def plot_kde_accuracy(values, bw_method='scott', fill=True, color='b'):
     ax.set_xlabel(r'$Y_{pred} - Y$')
     ax.set_ylabel('Density')
     return ax
+
+
+def stochastic_check_conditional(df, conditional):
+    """Check that conditionals are exclusive for the stochastic fit process. Generates a warning if not true
+    """
+    a = np.array([0] * df.shape[0])
+    for c in conditional:
+        a = np.add(a, np.where(eval(c), 1, 0))
+
+    if np.sum(np.where(a > 1, 1, 0)):
+        warnings.warn("It looks like your conditional categories are NOT exclusive. For appropriate estimation, "
+                      "the conditions that designate each category should be exclusive", UserWarning)

From 8d3ca81c04c497d05e09c376bf44ca62b3bc9ed5 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 18 Nov 2019 13:17:54 -0500
Subject: [PATCH 20/42] Fixing conditional IPTW for TMLE

---
 zepid/causal/doublyrobust/TMLE.py | 47 ++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 5d20aab..355b3f3 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -6,7 +6,7 @@
 import matplotlib.pyplot as plt
 from scipy.stats import logistic, norm
 
-from zepid.causal.utils import propensity_score
+from zepid.causal.utils import propensity_score, stochastic_check_conditional
 from zepid.calc import probability_to_odds, odds_to_probability
 from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, missing_machine_learner, _bounding_,
                                 plot_kde, plot_love, standardized_mean_differences, positivity,
@@ -984,8 +984,6 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
             predicted outcomes. Default is `False`, meaning no truncation of predicted outcomes occurs (unless a
             predicted outcome is outside the bounded continuous outcome). Providing a single float assumes symmetric
             trunctation. A list of floats can be provided for asymmetric trunctation.
-        print_results : bool, optional
-            Whether to print the fitted model results. Default is True (prints results)
         continuous_distribution : str, optional
             Distribution to use for continuous outcomes. Options are 'gaussian' for normal distributions and 'poisson'
             for Poisson distributions
@@ -1004,6 +1002,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
                     raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous "
                                      "outcomes")
                 log = smf.glm(self._q_model, self.df, family=f).fit()
+
             else:
                 f = sm.families.family.Binomial()
                 log = smf.glm(self._q_model, self.df, family=f).fit()
@@ -1019,7 +1018,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
         else:  # User-specified model
             # TODO need to create smart warning system
             self._out_model_custom = True
-            data = patsy.dmatrix(model + ' - 1', df)
+            data = patsy.dmatrix(model + ' - 1', self.df)
             self._Qinit_
             self._outcome_model
 
@@ -1029,7 +1028,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
         # This bounding step prevents continuous outcomes from being outside the range
         self._Qinit_ = _bounding_(self._Qinit_, bounds=bound)
 
-    def fit(self, p, condition=None, samples=100):
+    def fit(self, p, conditional=None, samples=100):
         """Calculate the effect from the predicted exposure probabilities and predicted outcome values using the TMLE
         procedure. Confidence intervals are calculated using influence curves.
 
@@ -1042,9 +1041,29 @@ def fit(self, p, condition=None, samples=100):
         TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and
         `average _treatment_effect` for continuous outcomes
         """
+        # Error checking
+        if self._denominator_ is None:
+            raise ValueError("The exposure_model() function must be specified before the fit() function")
+        if self._Qinit_ is None:
+            raise ValueError("The outcome_model() function must be specified before the fit() function")
+
+        p = np.array(p)
+        if np.any(p > 1):
+            raise ValueError("All specified treatment probabilities must be less than 1")
+        if conditional is not None:
+            if len(p) != len(conditional):
+                raise ValueError("'p' and 'conditional' must be the same length")
+
         # Step 4) Calculating clever covariate (HAW)
-        # TODO need to apply p properly
-        haw = p / self._denominator_
+        if conditional is None:
+            numerator = np.where(self.df[self.exposure] == 1, p, 1 - p)
+        else:
+            stochastic_check_conditional(df=self.df, conditional=conditional)
+            numerator = np.array([np.nan] for i in range(self.df.shape[0]))
+            for c, prop in zip(conditional, p):
+                numerator = np.where(eval(c), np.where(self.df[self.exposure] == 1, prop, 1 - prop), numerator)
+
+        haw = numerator / self._denominator_
 
         # Step 5) Estimating TMLE
         epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_)
@@ -1054,26 +1073,26 @@ def fit(self, p, condition=None, samples=100):
         for i in range(samples):
             # Applying treatment plan
             df = self.df.copy()
-            if condition is None:
+            if conditional is None:
                 df[self.exposure] = np.random.binomial(n=1, p=p, size=df.shape[0])
             else:
                 df[self.exposure] = np.nan
-                for c, prop in zip(condition, p):
+                for c, prop in zip(conditional, p):
                     df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0])
 
             # Outcome model under treatment plan
             y_star = self._outcome_model.predict(df)
 
             # Targeted Estimate
-            logit_qstar = np.log(probability_to_odds(y_star)) + epsilon  # NOTE: log(Y^*) + e
-            q_star = odds_to_probability(np.exp(logit_qstar))
+            logit_qstar = np.log(probability_to_odds(y_star)) + epsilon  # logit(Y^*) + e
+            q_star = odds_to_probability(np.exp(logit_qstar))  # Y^*
             q_star_list.append(q_star)
 
         if self._continuous_outcome:
             self.marginals_vector = _tmle_unit_unbound_(q_star_list,
-                                                      mini=self._continuous_min, maxi=self._continuous_max)
+                                                        mini=self._continuous_min, maxi=self._continuous_max)
             y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min,
-                                            maxi=self._continuous_max))
+                                              maxi=self._continuous_max))
             yq0_ = _tmle_unit_unbound_(self._Qinit_, mini=self._continuous_min, maxi=self._continuous_max)
         else:
             self.marginals_vector = q_star_list
@@ -1088,7 +1107,7 @@ def fit(self, p, condition=None, samples=100):
         else:
             zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
         # TODO add remainder of variance calculations
-    
+
     def summary(self, decimal=3):
         if self.marginal_outcome is None:
             raise ValueError('The fit() statement must be ran before summary()')

From feca583268f2ad61fc62d9e107398faa985b49ee Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 18 Nov 2019 13:33:56 -0500
Subject: [PATCH 21/42] improved testing for OOB p in StochasticIPTW

---
 tests/test_ipw.py        | 8 +++++++-
 zepid/causal/ipw/IPTW.py | 4 ++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/test_ipw.py b/tests/test_ipw.py
index e98d6b9..31adb3c 100644
--- a/tests/test_ipw.py
+++ b/tests/test_ipw.py
@@ -253,11 +253,14 @@ def test_error_no_model(self, sdata):
         with pytest.raises(ValueError):
             sipw.fit(p=0.8)
 
-    def test_error_prob_high(self, sdata):
+    def test_error_p_oob(self, sdata):
         sipw = StochasticIPTW(sdata.dropna(), treatment='art', outcome='dead')
         with pytest.raises(ValueError):
             sipw.fit(p=1.8)
 
+        with pytest.raises(ValueError):
+            sipw.fit(p=-0.8)
+
     def test_error_summary(self, sdata):
         sipw = StochasticIPTW(sdata.dropna(), treatment='art', outcome='dead')
         with pytest.raises(ValueError):
@@ -270,6 +273,9 @@ def test_error_conditional(self, sdata):
         with pytest.raises(ValueError):
             sipw.fit(p=[0.8, 0.1, 0.1], conditional=["df['male']==1", "df['male']==0"])
 
+        with pytest.raises(ValueError):
+            sipw.fit(p=[0.8], conditional=["df['male']==1", "df['male']==0"])
+
     def test_uncond_treatment(self, sdata):
         r_pred = 0.1165162207
 
diff --git a/zepid/causal/ipw/IPTW.py b/zepid/causal/ipw/IPTW.py
index 83d5fbb..2a28eb2 100644
--- a/zepid/causal/ipw/IPTW.py
+++ b/zepid/causal/ipw/IPTW.py
@@ -771,8 +771,8 @@ def fit(self, p, conditional=None):
 
         if self._pdenom_ is None:
             raise ValueError("The treatment_model() function must be specified before the fit() function")
-        if np.any(p > 1):
-            raise ValueError("All specified treatment probabilities must be less than 1")
+        if np.any(p > 1) or np.any(p < 0):
+            raise ValueError("All specified treatment probabilities must be between 0 and 1")
         if conditional is not None:
             if len(p) != len(conditional):
                 raise ValueError("'p' and 'conditional' must be the same length")

From ff3f0bb9a596d4c3394d381110e860b0077eca9a Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 18 Nov 2019 14:11:52 -0500
Subject: [PATCH 22/42] starting tests for StochasticTMLE

---
 tests/test_doublyrobust.py        | 187 +++++++++++++++++++++++++++++-
 zepid/causal/doublyrobust/TMLE.py |  15 +--
 2 files changed, 191 insertions(+), 11 deletions(-)

diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py
index 1c6cf6f..dbe737e 100644
--- a/tests/test_doublyrobust.py
+++ b/tests/test_doublyrobust.py
@@ -2,10 +2,11 @@
 import numpy as np
 import pandas as pd
 import numpy.testing as npt
+import pandas.testing as pdt
 from sklearn.linear_model import LogisticRegression, LinearRegression
 
 import zepid as ze
-from zepid.causal.doublyrobust import TMLE, AIPTW
+from zepid.causal.doublyrobust import TMLE, AIPTW, StochasticTMLE
 
 
 class TestTMLE:
@@ -43,24 +44,43 @@ def test_drop_missing_data(self):
         tmle = TMLE(df, exposure='art', outcome='dead')
         assert df.dropna(subset=['cd4_wk45']).shape[0] == tmle.df.shape[0]
 
-    def test_error_when_no_models_specified1(self, df):
+    def test_error_when_no_models_specified(self, df):
         tmle = TMLE(df, exposure='art', outcome='dead')
         with pytest.raises(ValueError):
             tmle.fit()
 
-    def test_error_when_no_models_specified2(self, df):
         tmle = TMLE(df, exposure='art', outcome='dead')
         tmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False)
         with pytest.raises(ValueError):
             tmle.fit()
 
-    def test_error_when_no_models_specified3(self, df):
         tmle = TMLE(df, exposure='art', outcome='dead')
         tmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',
                            print_results=False)
         with pytest.raises(ValueError):
             tmle.fit()
 
+    def test_continuous_processing(self):
+        a_list = [0, 1, 1, 0, 1, 1, 0, 0]
+        y_list = [1, -1, 5, 0, 0, 0, 10, -5]
+        df = pd.DataFrame()
+        df['A'] = a_list
+        df['Y'] = y_list
+
+        tmle = TMLE(df=df, exposure='A', outcome='Y', continuous_bound=0.0001)
+
+        # Checking all flagged parts are correct
+        assert tmle._continuous_outcome is True
+        assert tmle._continuous_min == -5
+        assert tmle._continuous_max == 10
+        assert tmle._cb == 0.0001
+
+        # Checking that TMLE bounding works as intended
+        y_bound = [2 / 5, 4 / 15, 2 / 3, 1 / 3, 1 / 3, 1 / 3, 0.9999, 0.0001]
+        pdt.assert_series_equal(pd.Series(y_bound),
+                                tmle.df['Y'],
+                                check_dtype=False, check_names=False)
+
     def test_match_r_epsilons(self, df):
         r_epsilons = [-0.016214091, 0.003304079]
         tmle = TMLE(df, exposure='art', outcome='dead')
@@ -315,6 +335,165 @@ def test_sklearn_in_tmle_missing(self, mf):
         npt.assert_allclose(tmle.odds_ratio_ci, [0.213980, 0.978331], rtol=1e-4)
 
 
+class TestStochasticTMLE:
+
+    @pytest.fixture
+    def df(self):
+        df = ze.load_sample_data(False)
+        df[['cd4_rs1', 'cd4_rs2']] = ze.spline(df, 'cd40', n_knots=3, term=2, restricted=True)
+        df[['age_rs1', 'age_rs2']] = ze.spline(df, 'age0', n_knots=3, term=2, restricted=True)
+        return df.drop(columns=['cd4_wk45']).dropna()
+
+    @pytest.fixture
+    def cf(self):
+        df = ze.load_sample_data(False)
+        df[['cd4_rs1', 'cd4_rs2']] = ze.spline(df, 'cd40', n_knots=3, term=2, restricted=True)
+        df[['age_rs1', 'age_rs2']] = ze.spline(df, 'age0', n_knots=3, term=2, restricted=True)
+        return df.drop(columns=['dead']).dropna()
+
+    @pytest.fixture
+    def simple_df(self):
+        expected = pd.DataFrame([[1, 1, 1, 1, 1],
+                                 [0, 0, 0, -1, 2],
+                                 [0, 1, 0, 5, 1],
+                                 [0, 0, 1, 0, 0],
+                                 [1, 0, 0, 0, 1],
+                                 [1, 0, 1, 0, 0],
+                                 [0, 1, 0, 10, 1],
+                                 [0, 0, 0, -5, 0],
+                                 [1, 1, 0, -5, 2]],
+                                columns=["W", "A", "Y", "C", "S"],
+                                index=[1, 2, 3, 4, 5, 6, 7, 8, 9])
+        return expected
+
+    def test_error_continuous_exp(self, df):
+        with pytest.raises(ValueError):
+            StochasticTMLE(df=df, exposure='cd40', outcome='dead')
+
+    def test_error_fit(self, df):
+        stmle = StochasticTMLE(df=df, exposure='art', outcome='dead')
+        with pytest.raises(ValueError):
+            stmle.fit(p=0.5)
+
+        stmle = StochasticTMLE(df=df, exposure='art', outcome='dead')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        with pytest.raises(ValueError):
+            stmle.fit(p=0.5)
+
+        stmle = StochasticTMLE(df=df, exposure='art', outcome='dead')
+        stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        with pytest.raises(ValueError):
+            stmle.fit(p=0.5)
+
+    def test_error_p_oob(self, df):
+        stmle = StochasticTMLE(df=df, exposure='art', outcome='dead')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        with pytest.raises(ValueError):
+            stmle.fit(p=1.1)
+
+        with pytest.raises(ValueError):
+            stmle.fit(p=-0.1)
+
+    def test_error_p_cond_len(self, df):
+        stmle = StochasticTMLE(df=df, exposure='art', outcome='dead')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        with pytest.raises(ValueError):
+            stmle.fit(p=[0.1], conditional=["df['male']==1", "df['male']==0"])
+
+        with pytest.raises(ValueError):
+            stmle.fit(p=[0.1, 0.3], conditional=["df['male']==1"])
+
+    def test_error_summary(self, df):
+        stmle = StochasticTMLE(df=df, exposure='art', outcome='dead')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.outcome_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        with pytest.raises(ValueError):
+            stmle.summary()
+
+    def test_continuous_processing(self):
+        a_list = [0, 1, 1, 0, 1, 1, 0, 0]
+        y_list = [1, -1, 5, 0, 0, 0, 10, -5]
+        df = pd.DataFrame()
+        df['A'] = a_list
+        df['Y'] = y_list
+
+        stmle = StochasticTMLE(df=df, exposure='A', outcome='Y', continuous_bound=0.0001)
+
+        # Checking all flagged parts are correct
+        assert stmle._continuous_outcome is True
+        assert stmle._continuous_min == -5
+        assert stmle._continuous_max == 10
+        assert stmle._cb == 0.0001
+
+        # Checking that TMLE bounding works as intended
+        y_bound = [2 / 5, 4 / 15, 2 / 3, 1 / 3, 1 / 3, 1 / 3, 0.9999, 0.0001]
+        pdt.assert_series_equal(pd.Series(y_bound),
+                                stmle.df['Y'],
+                                check_dtype=False, check_names=False)
+
+    def test_marginal_vector_length_stoch(self, df):
+        stmle = StochasticTMLE(df=df, exposure='art', outcome='dead')
+        stmle.exposure_model('male')
+        stmle.outcome_model('art + male + age0')
+        stmle.fit(p=0.4, samples=7)
+        assert len(stmle.marginals_vector) == 7
+
+    def test_qmodel_params(self, simple_df):
+        # Comparing to SAS logit model
+        sas_params = [-1.0699, -0.9525, 1.5462]
+        sas_preds = [0.3831332, 0.2554221, 0.1168668, 0.2554221, 0.6168668, 0.6168668, 0.1168668, 0.2554221, 0.3831332]
+
+        stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='Y')
+        stmle.outcome_model('A + W')
+        est_params = stmle._outcome_model.params
+        est_preds = stmle._Qinit_
+
+        npt.assert_allclose(sas_params, est_params, atol=1e-4)
+        npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
+
+    def test_qmodel_params2(self, simple_df):
+        # Comparing to SAS linear model
+        sas_params = [0.3876, 0.3409, -0.2030, -0.0883]
+        sas_preds = [0.437265, 0.210957, 0.6402345, 0.3876202, 0.0963188, 0.1846502, 0.6402345, 0.38762016, 0.34893314]
+
+        stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C')
+        stmle.outcome_model('A + W + S', continuous_distribution='normal')
+        est_params = stmle._outcome_model.params
+        est_preds = stmle._Qinit_
+
+        npt.assert_allclose(sas_params, est_params, atol=1e-4)
+        npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
+
+    def test_qmodel_params3(self, simple_df):
+        # Comparing to SAS Poisson model
+        sas_params = [-1.0478, 0.9371, -0.5321, -0.2733]
+        sas_preds = [0.4000579, 0.2030253, 0.6811115, 0.3507092, 0.1567304, 0.20599265, 0.6811115, 0.3507092, 0.3043857]
+
+        stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C')
+        stmle.outcome_model('A + W + S', continuous_distribution='Poisson')
+        est_params = stmle._outcome_model.params
+        est_preds = stmle._Qinit_
+
+        npt.assert_allclose(sas_params, est_params, atol=1e-4)
+        npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
+
+    def test_gmodel_params(self, simple_df):
+        # Comparing to SAS Poisson model
+        sas_preds = [2.0, 1.6666666667, 2.5, 1.6666666667, 2, 2, 2.5, 1.6666666667, 2]
+
+        stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C')
+        stmle.exposure_model('W')
+        est_preds = 1 / stmle._denominator_
+
+        npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
+
+    # TODO check bounding
+
+    # TODO compare to R in several versions
+
+
 class TestAIPTW:
 
     @pytest.fixture
diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 355b3f3..b6eae4e 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -1001,11 +1001,11 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
                 else:
                     raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous "
                                      "outcomes")
-                log = smf.glm(self._q_model, self.df, family=f).fit()
+                self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit()
 
             else:
                 f = sm.families.family.Binomial()
-                log = smf.glm(self._q_model, self.df, family=f).fit()
+                self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit()
 
             if self._verbose_:
                 print('==============================================================================')
@@ -1013,7 +1013,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
                 print(self._outcome_model.summary())
 
             # Step 2) Estimation under the scenarios
-            self._Qinit_ = log.predict(self.df)
+            self._Qinit_ = self._outcome_model.predict(self.df)
 
         else:  # User-specified model
             # TODO need to create smart warning system
@@ -1041,6 +1041,7 @@ def fit(self, p, conditional=None, samples=100):
         TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and
         `average _treatment_effect` for continuous outcomes
         """
+        # TODO add a seed
         # Error checking
         if self._denominator_ is None:
             raise ValueError("The exposure_model() function must be specified before the fit() function")
@@ -1048,8 +1049,8 @@ def fit(self, p, conditional=None, samples=100):
             raise ValueError("The outcome_model() function must be specified before the fit() function")
 
         p = np.array(p)
-        if np.any(p > 1):
-            raise ValueError("All specified treatment probabilities must be less than 1")
+        if np.any(p > 1) or np.any(p < 0):
+            raise ValueError("All specified treatment probabilities must be between 0 and 1")
         if conditional is not None:
             if len(p) != len(conditional):
                 raise ValueError("'p' and 'conditional' must be the same length")
@@ -1086,7 +1087,7 @@ def fit(self, p, conditional=None, samples=100):
             # Targeted Estimate
             logit_qstar = np.log(probability_to_odds(y_star)) + epsilon  # logit(Y^*) + e
             q_star = odds_to_probability(np.exp(logit_qstar))  # Y^*
-            q_star_list.append(q_star)
+            q_star_list.append(np.mean(q_star))  # E[Y^*]
 
         if self._continuous_outcome:
             self.marginals_vector = _tmle_unit_unbound_(q_star_list,
@@ -1102,7 +1103,7 @@ def fit(self, p, conditional=None, samples=100):
         self.marginal_outcome = np.mean(self.marginals_vector)
 
         # Step 7) Estimating Var(psi)
-        if self.alpha == 0.05:  # Without this, won't match R exactly. R relies on 1.96, while I use SciPy
+        if self.alpha == 0.05:  # Without this, won't match R exactly. R relies on 1.96, while I use SciPy usually
             zalpha = 1.96
         else:
             zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

From a146880fcf2d6432c18f544655fc32beab440af6 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 18 Nov 2019 16:16:00 -0500
Subject: [PATCH 23/42] more tests for StochasticTMLE

---
 tests/test_doublyrobust.py        | 18 ++++++++++++++++++
 zepid/causal/doublyrobust/TMLE.py |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py
index dbe737e..2555d42 100644
--- a/tests/test_doublyrobust.py
+++ b/tests/test_doublyrobust.py
@@ -489,6 +489,24 @@ def test_gmodel_params(self, simple_df):
 
         npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
 
+    def test_compare_tmle(self, df):
+        stmle = StochasticTMLE(df, exposure='art', outcome='dead')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.fit(p=1.0, samples=1)
+        all_treat = stmle.marginal_outcome
+        stmle.fit(p=0.0, samples=1)
+        non_treat = stmle.marginal_outcome
+
+        tmle = TMLE(df, exposure='art', outcome='dead')
+        tmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False)
+        tmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',
+                           print_results=False)
+        tmle.fit()
+        expected = tmle.risk_difference
+
+        npt.assert_allclose(expected, all_treat - non_treat, atol=1e-4)
+
     # TODO check bounding
 
     # TODO compare to R in several versions
diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index b6eae4e..b71920a 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -1090,7 +1090,7 @@ def fit(self, p, conditional=None, samples=100):
             q_star_list.append(np.mean(q_star))  # E[Y^*]
 
         if self._continuous_outcome:
-            self.marginals_vector = _tmle_unit_unbound_(q_star_list,
+            self.marginals_vector = _tmle_unit_unbound_(np.array(q_star_list),
                                                         mini=self._continuous_min, maxi=self._continuous_max)
             y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min,
                                               maxi=self._continuous_max))

From c7346a47838eb8775571158f9236e5e0793b9cd6 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Tue, 19 Nov 2019 08:09:05 -0500
Subject: [PATCH 24/42] updated StochasticTMLE docs

---
 zepid/causal/doublyrobust/TMLE.py | 59 ++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index b71920a..187c350 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -841,6 +841,38 @@ class StochasticTMLE:
     some values may take the value of 0 or 1, which breaks a logit(Y*) transformation. To avoid this issue, Y* is
     bounded by the `continuous_bound` argument. The default is 0.0005, the same as R's tmle
 
+    Following is a general narrative of the estimation procedure for TMLE with stochastic treatments
+
+    1. Initial estimators for g-model (IPTW) and Q-model (g-formula) are fit. By default these estimators are based
+    on parametric regression models. Additionally, machine learning algorithms can be used to estimate the g-model and
+    Q-model.
+
+    2. The auxiliary covariate is calculated (i.e. IPTW).
+
+    .. math::
+
+        H(A=a, L) = \frac{I(A=a) p}{\widehat{\Pr}(A=a)}
+
+    where `p` is the probability of treatment `a` under the stochastic intervention of interest.
+
+    3. Targeting step occurs through estimation of `e` via a logistic regression model. Briefly a weighted logistic
+    regression model (weighted by the auxiliary covariates) with the dependent variable as the observed outcome and
+    an offset term of the Q-model predictions under the observed treatment (A).
+
+    .. math::
+
+        \text{logit}(Y) = \text{logit}(Q(A, W)) + \epsilon
+
+    4. Stochastic interventions are evaluated through Monte Carlo integration for binary treatments. The different
+    treatment plans are randomly applied and evaluated through the Q-model and then the targeting step via
+
+    .. math::
+
+        E[\text{logit}(Q(A=a, W)) + \hat{\epsilon}]
+
+    This process is repeated a large number of times and the point estimate is the average of those individual treatment
+    plans.
+
     Examples
     --------
     Setting up environment
@@ -850,30 +882,30 @@ class StochasticTMLE:
     >>> df = load_sample_data(False).dropna()
     >>> df[['cd4_rs1', 'cd4_rs2']] = spline(df, 'cd40', n_knots=3, term=2, restricted=True)
 
-    Estimating TMLE using logistic regression
+    Estimating TMLE for 0.2 being treated with ART
 
     >>> tmle = TMLE(df, exposure='art', outcome='dead')
-    >>> # Specifying exposure/treatment model
     >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
-    >>> # Specifying outcome model
     >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
-    >>> # TMLE estimation procedure
-    >>> tmle.fit()
-    >>> # Printing main results
+    >>> tmle.fit(p=0.2)
+    >>> tmle.summary()
+
+    Estimating TMLE for conditional plan
+
+    >>> tmle = TMLE(df, exposure='art', outcome='dead')
+    >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+    >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+    >>> tmle.fit(p=[0.6, 0.4], conditional=["df['male']==1", "df['male']==0"])
     >>> tmle.summary()
-    >>> # Extracting risk difference and confidence intervals, respectively
-    >>> tmle.risk_difference
-    >>> tmle.risk_difference_ci
 
     Estimating TMLE with machine learning algorithm from sklearn
 
     >>> from sklearn.linear_model import LogisticRegression
     >>> log1 = LogisticRegression(penalty='l1', random_state=201)
     >>> tmle = TMLE(df, 'art', 'dead')
-    >>> # custom_model allows specification of machine learning algorithms
     >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1)
     >>> tmle.outcome_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1)
-    >>> tmle.fit()
+    >>> tmle.fit(p=0.75)
 
     References
     ----------
@@ -1059,12 +1091,13 @@ def fit(self, p, conditional=None, samples=100):
         if conditional is None:
             numerator = np.where(self.df[self.exposure] == 1, p, 1 - p)
         else:
+            df = self.df.copy()
             stochastic_check_conditional(df=self.df, conditional=conditional)
             numerator = np.array([np.nan] for i in range(self.df.shape[0]))
             for c, prop in zip(conditional, p):
-                numerator = np.where(eval(c), np.where(self.df[self.exposure] == 1, prop, 1 - prop), numerator)
+                numerator = np.where(eval(c), np.where(df[self.exposure] == 1, prop, 1 - prop), numerator)
 
-        haw = numerator / self._denominator_
+        haw = np.array(numerator / self._denominator_).astype(float)
 
         # Step 5) Estimating TMLE
         epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_)

From f4d7cd422c0a498610f3912ac55a237ad35fce5f Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Tue, 19 Nov 2019 08:23:30 -0500
Subject: [PATCH 25/42] variance estimation tweaking

---
 zepid/causal/doublyrobust/TMLE.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 187c350..0a37297 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -948,7 +948,9 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v
         self.marginals_vector = None
         self.marginal_outcome = None
         self.alpha = alpha
+        self.marginal_se = None
         self.marginal_ci = None
+        self.conditional_se = None
         self.conditional_ci = None
 
         # Storage for items I need later
@@ -1140,7 +1142,19 @@ def fit(self, p, conditional=None, samples=100):
             zalpha = 1.96
         else:
             zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
-        # TODO add remainder of variance calculations
+
+        # Marginal variance estimator
+        # variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_,
+        #                                                 y_pred_targeted=, psi=self.marginal_outcome)
+        # self.marginal_se = np.sqrt(variance_marginal)
+        # self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se,
+        #                     self.marginal_outcome + zalpha * self.marginal_se]
+
+        # Conditional on W variance estimator
+        variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_)
+        self.conditional_se = np.sqrt(variance_conditional)
+        self.conditional_ci = [self.marginal_outcome - zalpha * self.conditional_se,
+                               self.marginal_outcome + zalpha * self.conditional_se]
 
     def summary(self, decimal=3):
         if self.marginal_outcome is None:
@@ -1186,6 +1200,18 @@ def targeting_step(y, q_init, iptw, verbose):
 
         return log.params[0]  # Returns single-step estimated Epsilon term
 
+    @staticmethod
+    def est_marginal_variance(haw, y_obs, y_pred, y_pred_targeted, psi):
+        doqg_psi_sq = (haw*(y_obs - y_pred) + y_pred_targeted - psi)**2
+        var_est = np.mean(doqg_psi_sq)
+        return var_est
+
+    @staticmethod
+    def est_conditional_variance(haw, y_obs, y_pred):
+        doqg_psi_sq = (haw*(y_obs - y_pred))**2
+        var_est = np.mean(doqg_psi_sq)
+        return var_est
+
 
 # Functions that all TMLEs can call are below
 def _tmle_unit_bounds_(y, mini, maxi, bound):

From a046ecce37a99986e7a0716e25e908f7f890aa7e Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Tue, 19 Nov 2019 08:26:20 -0500
Subject: [PATCH 26/42] adding variance divisor

---
 zepid/causal/doublyrobust/TMLE.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 0a37297..3d5867e 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -1146,13 +1146,13 @@ def fit(self, p, conditional=None, samples=100):
         # Marginal variance estimator
         # variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_,
         #                                                 y_pred_targeted=, psi=self.marginal_outcome)
-        # self.marginal_se = np.sqrt(variance_marginal)
+        # self.marginal_se = np.sqrt(variance_marginal) / np.sqrt(self.df.shape[0])
         # self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se,
         #                     self.marginal_outcome + zalpha * self.marginal_se]
 
         # Conditional on W variance estimator
         variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_)
-        self.conditional_se = np.sqrt(variance_conditional)
+        self.conditional_se = np.sqrt(variance_conditional) / np.sqrt(self.df.shape[0])
         self.conditional_ci = [self.marginal_outcome - zalpha * self.conditional_se,
                                self.marginal_outcome + zalpha * self.conditional_se]
 
@@ -1181,7 +1181,7 @@ def summary(self, decimal=3):
         # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal))
         print('======================================================================')
         print('Conditional')
-        # print('95% CL:    ', np.round(self.ci_cond, decimals=decimal))
+        print('95% CL:    ', np.round(self.conditional_ci, decimals=decimal))
         print('======================================================================')
 
     @staticmethod

From f51927c49666ff154d170b99a65163a2578e5855 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Tue, 19 Nov 2019 08:37:35 -0500
Subject: [PATCH 27/42] both variances for testing versus R

---
 zepid/causal/doublyrobust/TMLE.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 3d5867e..a2e115a 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -1106,6 +1106,7 @@ def fit(self, p, conditional=None, samples=100):
 
         # Step 6) Estimating psi
         q_star_list = []
+        q_i_star_list = []
         for i in range(samples):
             # Applying treatment plan
             df = self.df.copy()
@@ -1122,7 +1123,8 @@ def fit(self, p, conditional=None, samples=100):
             # Targeted Estimate
             logit_qstar = np.log(probability_to_odds(y_star)) + epsilon  # logit(Y^*) + e
             q_star = odds_to_probability(np.exp(logit_qstar))  # Y^*
-            q_star_list.append(np.mean(q_star))  # E[Y^*]
+            q_i_star_list.append(q_star)  # Saving Y_i^* for marginal variance
+            q_star_list.append(np.mean(q_star))  # Saving E[Y^*]
 
         if self._continuous_outcome:
             self.marginals_vector = _tmle_unit_unbound_(np.array(q_star_list),
@@ -1130,10 +1132,13 @@ def fit(self, p, conditional=None, samples=100):
             y_ = np.array(_tmle_unit_unbound_(self.df[self.outcome], mini=self._continuous_min,
                                               maxi=self._continuous_max))
             yq0_ = _tmle_unit_unbound_(self._Qinit_, mini=self._continuous_min, maxi=self._continuous_max)
+            yqstar_ = _tmle_unit_unbound_(np.array(q_i_star_list), mini=self._continuous_min, maxi=self._continuous_max)
+
         else:
             self.marginals_vector = q_star_list
             y_ = np.array(self.df[self.outcome])
             yq0_ = self._Qinit_
+            yqstar_ = np.array(q_i_star_list)
 
         self.marginal_outcome = np.mean(self.marginals_vector)
 
@@ -1144,11 +1149,12 @@ def fit(self, p, conditional=None, samples=100):
             zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
 
         # Marginal variance estimator
-        # variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_,
-        #                                                 y_pred_targeted=, psi=self.marginal_outcome)
-        # self.marginal_se = np.sqrt(variance_marginal) / np.sqrt(self.df.shape[0])
-        # self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se,
-        #                     self.marginal_outcome + zalpha * self.marginal_se]
+        variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_,
+                                                       y_pred_targeted=np.mean(yqstar_, axis=0),
+                                                       psi=self.marginal_outcome)
+        self.marginal_se = np.sqrt(variance_marginal) / np.sqrt(self.df.shape[0])
+        self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se,
+                            self.marginal_outcome + zalpha * self.marginal_se]
 
         # Conditional on W variance estimator
         variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_)
@@ -1180,6 +1186,8 @@ def summary(self, decimal=3):
         print('Overall incidence:      ', np.round(self.marginal_outcome, decimals=decimal))
         # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal))
         print('======================================================================')
+        print('Marginal')
+        print('95% CL:    ', np.round(self.marginal_ci, decimals=decimal))
         print('Conditional')
         print('95% CL:    ', np.round(self.conditional_ci, decimals=decimal))
         print('======================================================================')

From c9c6a1b9ae1ee0688f019c24434bdab93d045663 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Mon, 25 Nov 2019 18:12:02 -0500
Subject: [PATCH 28/42] adding diagnostics option

---
 zepid/causal/doublyrobust/TMLE.py | 66 +++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index a2e115a..992d934 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -945,6 +945,7 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v
         self.outcome = outcome
 
         # Output attributes
+        self.epsilon = None
         self.marginals_vector = None
         self.marginal_outcome = None
         self.alpha = alpha
@@ -959,6 +960,7 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v
         self._Qinit_ = None
         self._treatment_model = None
         self._g_model = None
+        self._specified_bound_ = None
         self._denominator_ = None
         self._verbose_ = verbose
 
@@ -999,6 +1001,8 @@ def exposure_model(self, model, custom_model=None, bound=False):
 
         if bound:  # Bounding predicted probabilities if requested
             self._denominator_ = _bounding_(self._denominator_, bounds=bound)
+            # self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0)
+            #                                ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0))
 
         self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred)
 
@@ -1053,8 +1057,8 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
             # TODO need to create smart warning system
             self._out_model_custom = True
             data = patsy.dmatrix(model + ' - 1', self.df)
-            self._Qinit_
-            self._outcome_model
+            # self._Qinit_
+            # self._outcome_model
 
         if not bound:  # Bounding predicted probabilities if requested
             bound = self._cb
@@ -1102,7 +1106,8 @@ def fit(self, p, conditional=None, samples=100):
         haw = np.array(numerator / self._denominator_).astype(float)
 
         # Step 5) Estimating TMLE
-        epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw, verbose=self._verbose_)
+        self.epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw,
+                                           verbose=self._verbose_)
 
         # Step 6) Estimating psi
         q_star_list = []
@@ -1121,7 +1126,7 @@ def fit(self, p, conditional=None, samples=100):
             y_star = self._outcome_model.predict(df)
 
             # Targeted Estimate
-            logit_qstar = np.log(probability_to_odds(y_star)) + epsilon  # logit(Y^*) + e
+            logit_qstar = np.log(probability_to_odds(y_star)) + self.epsilon  # logit(Y^*) + e
             q_star = odds_to_probability(np.exp(logit_qstar))  # Y^*
             q_i_star_list.append(q_star)  # Saving Y_i^* for marginal variance
             q_star_list.append(np.mean(q_star))  # Saving E[Y^*]
@@ -1163,6 +1168,13 @@ def fit(self, p, conditional=None, samples=100):
                                self.marginal_outcome + zalpha * self.conditional_se]
 
     def summary(self, decimal=3):
+        """Prints summary of the estimated incidence under the specified treatment plan
+
+        Parameters
+        ----------
+        decimal : int, optional
+            Number of decimal places to display. Default is 3
+        """
         if self.marginal_outcome is None:
             raise ValueError('The fit() statement must be ran before summary()')
 
@@ -1171,25 +1183,45 @@ def summary(self, decimal=3):
         print('======================================================================')
         fmt = 'Treatment:        {:<15} No. Observations:     {:<20}'
         print(fmt.format(self.exposure, self.df.shape[0]))
-        fmt = 'Outcome:          {:<15}'
-        print(fmt.format(self.outcome))
-        fmt = 'Q-Model:          {:<15}'
-        print(fmt.format('Logistic'))
-        # TODO add custom model check
-        fmt = 'g-Model:          {:<15}'
-        print(fmt.format('Logistic'))
+
+        fmt = 'Outcome:          {:<15} No. Truncated:        {:<20}'
+        if self._specified_bound_ is None:
+            b = 0
+        else:
+            b = self._specified_bound_
+        print(fmt.format(self.outcome, b))
+
+        fmt = 'Q-Model:          {:<15} g-model:              {:<20}'
+        print(fmt.format('Logistic', 'Logistic'))
 
         # TODO add treatment plan information
         # TODO add information on m (number of re-samples)
 
         print('======================================================================')
         print('Overall incidence:      ', np.round(self.marginal_outcome, decimals=decimal))
-        # print('Var(Overall incidence): ', np.round(np.var(self.marginals_vector, ddof=1), decimals=decimal))
         print('======================================================================')
-        print('Marginal')
-        print('95% CL:    ', np.round(self.marginal_ci, decimals=decimal))
-        print('Conditional')
-        print('95% CL:    ', np.round(self.conditional_ci, decimals=decimal))
+        print('Marginal 95% CL:        ', np.round(self.marginal_ci, decimals=decimal))
+        print('Conditional 95% CL:     ', np.round(self.conditional_ci, decimals=decimal))
+        print('======================================================================')
+
+    def run_diagnostics(self, decimal=3):
+        """Provides some summary diagnostics for `StochasticTMLE`. Diagnostics must be called are the fit() function is
+        called since weights are dependent on the specific treatment plan of interest
+
+        Parameters
+        ----------
+        decimal : int, optional
+            Number of decimal places to display. Default is 3
+        """
+        if self.marginal_outcome is None:
+            raise ValueError('The fit() statement must be ran before summary()')
+
+        print('======================================================================')
+        print('                         Diagnostics                                  ')
+        print('======================================================================')
+        # TODO print epsilon
+        # TODO print outcome model accuracy
+        # TODO print distribution of weights / IPW summary
         print('======================================================================')
 
     @staticmethod
@@ -1210,12 +1242,14 @@ def targeting_step(y, q_init, iptw, verbose):
 
     @staticmethod
     def est_marginal_variance(haw, y_obs, y_pred, y_pred_targeted, psi):
+        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4965321/
         doqg_psi_sq = (haw*(y_obs - y_pred) + y_pred_targeted - psi)**2
         var_est = np.mean(doqg_psi_sq)
         return var_est
 
     @staticmethod
     def est_conditional_variance(haw, y_obs, y_pred):
+        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4965321/
         doqg_psi_sq = (haw*(y_obs - y_pred))**2
         var_est = np.mean(doqg_psi_sq)
         return var_est

From 22e85493d27baa28fb913a2eb5e6a829c5e63639 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Wed, 8 Jan 2020 13:37:53 -0500
Subject: [PATCH 29/42] CONFIRMED: psi must be transformed then averaged

---
 tests/test_doublyrobust.py        | 14 ++++++++++
 zepid/causal/doublyrobust/TMLE.py | 45 ++++++++++++++++---------------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py
index 2555d42..c31d219 100644
--- a/tests/test_doublyrobust.py
+++ b/tests/test_doublyrobust.py
@@ -412,6 +412,20 @@ def test_error_summary(self, df):
         with pytest.raises(ValueError):
             stmle.summary()
 
+    def test_warn_missing_data(self):
+        df = pd.DataFrame()
+        df['A'] = [1, 1, 0, 0, np.nan]
+        df['Y'] = [np.nan, 0, 1, 0, 1]
+        with pytest.warns(UserWarning):
+            StochasticTMLE(df=df, exposure='A', outcome='Y')
+
+    def test_drop_missing_data(self):
+        df = pd.DataFrame()
+        df['A'] = [1, 1, 0, 0, np.nan]
+        df['Y'] = [np.nan, 0, 1, 0, 1]
+        stmle = StochasticTMLE(df=df, exposure='A', outcome='Y')
+        assert stmle.df.shape[0] == 3
+
     def test_continuous_processing(self):
         a_list = [0, 1, 1, 0, 1, 1, 0, 0]
         y_list = [1, -1, 5, 0, 0, 0, 10, -5]
diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 992d934..0ab6ae3 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -884,7 +884,7 @@ class StochasticTMLE:
 
     Estimating TMLE for 0.2 being treated with ART
 
-    >>> tmle = TMLE(df, exposure='art', outcome='dead')
+    >>> tmle = StochasticTMLE(df, exposure='art', outcome='dead')
     >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
     >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
     >>> tmle.fit(p=0.2)
@@ -892,7 +892,7 @@ class StochasticTMLE:
 
     Estimating TMLE for conditional plan
 
-    >>> tmle = TMLE(df, exposure='art', outcome='dead')
+    >>> tmle = StochasticTMLE(df, exposure='art', outcome='dead')
     >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
     >>> tmle.outcome_model('art + male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
     >>> tmle.fit(p=[0.6, 0.4], conditional=["df['male']==1", "df['male']==0"])
@@ -902,7 +902,7 @@ class StochasticTMLE:
 
     >>> from sklearn.linear_model import LogisticRegression
     >>> log1 = LogisticRegression(penalty='l1', random_state=201)
-    >>> tmle = TMLE(df, 'art', 'dead')
+    >>> tmle = StochasticTMLE(df, 'art', 'dead')
     >>> tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1)
     >>> tmle.outcome_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', custom_model=log1)
     >>> tmle.fit(p=0.75)
@@ -916,10 +916,10 @@ class StochasticTMLE:
     studies. Springer Science & Business Media, 2011.
     """
     def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, verbose=False):
-        # Going through missing data
+        # Dropping ALL missing data (currently doesn't allow for censored outcomes)
         if df.dropna().shape[0] != df.shape[0]:
-            warnings.warn("There is missing data that is not the outcome in the data set. StochasticTMLE will drop all "
-                          "missing data. StochasticTMLE will fit "
+            warnings.warn("There is missing data in the data set. StochasticTMLE will drop all missing data. "
+                          "StochasticTMLE will fit "
                           + str(df.dropna().shape[0]) +
                           ' of ' + str(df.shape[0]) + ' observations', UserWarning)
             self.df = df.copy().dropna().reset_index()
@@ -960,6 +960,7 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v
         self._Qinit_ = None
         self._treatment_model = None
         self._g_model = None
+        self._resamples_ = None
         self._specified_bound_ = None
         self._denominator_ = None
         self._verbose_ = verbose
@@ -1001,8 +1002,8 @@ def exposure_model(self, model, custom_model=None, bound=False):
 
         if bound:  # Bounding predicted probabilities if requested
             self._denominator_ = _bounding_(self._denominator_, bounds=bound)
-            # self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0)
-            #                                ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0))
+            self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0)
+                                            ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0))
 
         self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred)
 
@@ -1028,7 +1029,6 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
         """
         self._q_model = self.outcome + ' ~ ' + model
 
-        # Step 1) Prediction for Q (estimation of Q-model)
         if custom_model is None:  # Standard parametric regression
             self._continuous_type = continuous_distribution
             if self._continuous_outcome:
@@ -1057,6 +1057,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
             # TODO need to create smart warning system
             self._out_model_custom = True
             data = patsy.dmatrix(model + ' - 1', self.df)
+            # TODO machine learning predictions
             # self._Qinit_
             # self._outcome_model
 
@@ -1080,7 +1081,7 @@ def fit(self, p, conditional=None, samples=100):
         `average _treatment_effect` for continuous outcomes
         """
         # TODO add a seed
-        # Error checking
+
         if self._denominator_ is None:
             raise ValueError("The exposure_model() function must be specified before the fit() function")
         if self._Qinit_ is None:
@@ -1093,7 +1094,7 @@ def fit(self, p, conditional=None, samples=100):
             if len(p) != len(conditional):
                 raise ValueError("'p' and 'conditional' must be the same length")
 
-        # Step 4) Calculating clever covariate (HAW)
+        # Step 1) Calculating clever covariate (HAW)
         if conditional is None:
             numerator = np.where(self.df[self.exposure] == 1, p, 1 - p)
         else:
@@ -1105,13 +1106,17 @@ def fit(self, p, conditional=None, samples=100):
 
         haw = np.array(numerator / self._denominator_).astype(float)
 
-        # Step 5) Estimating TMLE
+        # Step 2) Estimate from Q-model
+        # process completed in outcome_model() function and stored in self._Qinit_
+
+        # Step 3) Target parameter TMLE
         self.epsilon = self.targeting_step(y=self.df[self.outcome], q_init=self._Qinit_, iptw=haw,
                                            verbose=self._verbose_)
 
-        # Step 6) Estimating psi
+        # Step 4) Monte-Carlo Integration procedure
         q_star_list = []
         q_i_star_list = []
+        self._resamples_ = samples
         for i in range(samples):
             # Applying treatment plan
             df = self.df.copy()
@@ -1147,11 +1152,8 @@ def fit(self, p, conditional=None, samples=100):
 
         self.marginal_outcome = np.mean(self.marginals_vector)
 
-        # Step 7) Estimating Var(psi)
-        if self.alpha == 0.05:  # Without this, won't match R exactly. R relies on 1.96, while I use SciPy usually
-            zalpha = 1.96
-        else:
-            zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
+        # Step 5) Estimating Var(psi)
+        zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
 
         # Marginal variance estimator
         variance_marginal = self.est_marginal_variance(haw=haw, y_obs=y_, y_pred=yq0_,
@@ -1161,7 +1163,7 @@ def fit(self, p, conditional=None, samples=100):
         self.marginal_ci = [self.marginal_outcome - zalpha * self.marginal_se,
                             self.marginal_outcome + zalpha * self.marginal_se]
 
-        # Conditional on W variance estimator
+        # Conditional on W variance estimator (not generally recommended but I need it for other work)
         variance_conditional = self.est_conditional_variance(haw=haw, y_obs=y_, y_pred=yq0_)
         self.conditional_se = np.sqrt(variance_conditional) / np.sqrt(self.df.shape[0])
         self.conditional_ci = [self.marginal_outcome - zalpha * self.conditional_se,
@@ -1183,19 +1185,18 @@ def summary(self, decimal=3):
         print('======================================================================')
         fmt = 'Treatment:        {:<15} No. Observations:     {:<20}'
         print(fmt.format(self.exposure, self.df.shape[0]))
-
         fmt = 'Outcome:          {:<15} No. Truncated:        {:<20}'
         if self._specified_bound_ is None:
             b = 0
         else:
             b = self._specified_bound_
         print(fmt.format(self.outcome, b))
-
         fmt = 'Q-Model:          {:<15} g-model:              {:<20}'
         print(fmt.format('Logistic', 'Logistic'))
+        fmt = 'No. Resamples:    {:<15}'
+        print(fmt.format(self._resamples_))
 
         # TODO add treatment plan information
-        # TODO add information on m (number of re-samples)
 
         print('======================================================================')
         print('Overall incidence:      ', np.round(self.marginal_outcome, decimals=decimal))

From 5b4ad08f32783162cdf879ce171cdb4ad6267ce9 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 9 Jan 2020 11:40:32 -0500
Subject: [PATCH 30/42] updating run_diagnostics

---
 zepid/causal/doublyrobust/TMLE.py | 54 +++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 0ab6ae3..c3173aa 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -1001,9 +1001,9 @@ def exposure_model(self, model, custom_model=None, bound=False):
                                             ml_model=custom_model, print_results=self._verbose_)
 
         if bound:  # Bounding predicted probabilities if requested
-            self._denominator_ = _bounding_(self._denominator_, bounds=bound)
-            self._specified_bound_ = np.sum(np.where(self._denominator_ == bound, 1, 0)
-                                            ) + np.sum(np.where(self._denominator_ == 1 / bound, 1, 0))
+            pred = _bounding_(pred, bounds=bound)
+            self._specified_bound_ = np.sum(np.where(pred == bound, 1, 0)
+                                            ) + np.sum(np.where(pred == 1 / bound, 1, 0))
 
         self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred)
 
@@ -1220,11 +1220,53 @@ def run_diagnostics(self, decimal=3):
         print('======================================================================')
         print('                         Diagnostics                                  ')
         print('======================================================================')
-        # TODO print epsilon
-        # TODO print outcome model accuracy
-        # TODO print distribution of weights / IPW summary
+        print('                 Natural Course Prediction Accuracy                   ')
+        value = self.df[self.outcome] - self._Qinit_
+        fmt = 'Mean:                    {:<20}'
+        print(fmt.format(np.round(np.mean(value), decimals=decimal)))
+        fmt = 'Standard Deviation:      {:<20}'
+        print(fmt.format(np.round(np.std(value, ddof=1), decimals=decimal)))
+        fmt = 'Median:                  {:<20}'
+        print(fmt.format(np.round(np.median(value), decimals=decimal)))
+        fmt = 'Minimum value:           {:<20}'
+        print(fmt.format(np.round(np.min(value), decimals=decimal)))
+        fmt = 'Maximum value:           {:<20}'
+        print(fmt.format(np.round(np.max(value), decimals=decimal)))
+
+        print('----------------------------------------------------------------------')
+        print('               Inverse Probability Weight Denominator                 ')
+        w = 1 / self._denominator_
+        fmt = 'Mean (expected=2):       {:<20}'
+        print(fmt.format(np.round(np.mean(w), decimals=decimal)))
+        fmt = 'Standard Deviation:      {:<20}'
+        print(fmt.format(np.round(np.std(w, ddof=1), decimals=decimal)))
+        fmt = 'Minimum value:           {:<20}'
+        print(fmt.format(np.round(np.min(w), decimals=decimal)))
+        fmt = 'Maximum value:           {:<20}'
+        print(fmt.format(np.round(np.max(w), decimals=decimal)))
+
+        print('----------------------------------------------------------------------')
+        print('                        Targeting Step                                ')
+        fmt = 'Epsilon:                 {:<20}'
+        print(fmt.format(np.round(self.epsilon, decimals=decimal)))
+
         print('======================================================================')
 
+        # Generating plots
+        df = self.df.copy()
+        df['_g1_'] = np.where(df[self.exposure] == 1, self._denominator_, 1 - self._denominator_)
+
+        plt.figure(figsize=[8, 4])
+        plt.subplot(121)
+        plot_kde(df=df, treatment=self.exposure, probability='_g1_')
+        plt.title("Kernel Density of Propensity Scores")
+
+        plt.subplot(122)
+        plot_kde_accuracy(values=value, color='green')
+        plt.title("Kernel Density of Accuracy")
+        plt.tight_layout()
+        plt.show()
+
     @staticmethod
     def targeting_step(y, q_init, iptw, verbose):
         f = sm.families.family.Binomial()

From edfcad1eb96b7f75cea30d91c08ab19b9c37edf4 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 9 Jan 2020 11:40:55 -0500
Subject: [PATCH 31/42] adding reference pages for StochasticTMLE

---
 docs/Reference/Causal.rst                       |  1 +
 ....causal.doublyrobust.TMLE.StochasticTMLE.rst | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst

diff --git a/docs/Reference/Causal.rst b/docs/Reference/Causal.rst
index 5c421bf..d30b44d 100644
--- a/docs/Reference/Causal.rst
+++ b/docs/Reference/Causal.rst
@@ -69,6 +69,7 @@ Targeted Maximum Likelihood Estimator
    :toctree: generated/
 
    TMLE
+   StochasticTMLE
 
 G-estimation of SNM
 -------------------
diff --git a/docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst b/docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst
new file mode 100644
index 0000000..9428a15
--- /dev/null
+++ b/docs/Reference/generated/zepid.causal.doublyrobust.TMLE.StochasticTMLE.rst
@@ -0,0 +1,17 @@
+zepid.causal.doublyrobust.TMLE.StochastsicTMLE
+==============================================
+
+.. currentmodule:: zepid.causal.doublyrobust.StochasticTMLE
+
+.. autoclass:: StochasticTMLE
+   :members:
+
+   .. rubric:: Methods
+
+   .. autosummary::
+
+      ~TMLE.exposure_model
+      ~TMLE.outcome_model
+      ~TMLE.fit
+      ~TMLE.summary
+      ~TMLE.run_diagnostics

From 08c1dcd6eb82737fc1ca6310b0ac02b7f94ed14f Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 9 Jan 2020 14:44:32 -0500
Subject: [PATCH 32/42] adding support for machine-learning estimators in
 StochasticTMLE

---
 zepid/causal/doublyrobust/TMLE.py | 28 +++++++++-----
 zepid/causal/utils.py             | 64 ++++++++++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 11 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index c3173aa..fcdd23f 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -8,9 +8,9 @@
 
 from zepid.causal.utils import propensity_score, stochastic_check_conditional
 from zepid.calc import probability_to_odds, odds_to_probability
-from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, missing_machine_learner, _bounding_,
-                                plot_kde, plot_love, standardized_mean_differences, positivity,
-                                plot_kde_accuracy, outcome_accuracy)
+from zepid.causal.utils import (exposure_machine_learner, outcome_machine_learner, stochastic_outcome_machine_learner,
+                                stochastic_outcome_predict, missing_machine_learner, _bounding_, plot_kde, plot_love,
+                                standardized_mean_differences, positivity, plot_kde_accuracy, outcome_accuracy)
 
 
 class TMLE:
@@ -1001,9 +1001,8 @@ def exposure_model(self, model, custom_model=None, bound=False):
                                             ml_model=custom_model, print_results=self._verbose_)
 
         if bound:  # Bounding predicted probabilities if requested
-            pred = _bounding_(pred, bounds=bound)
-            self._specified_bound_ = np.sum(np.where(pred == bound, 1, 0)
-                                            ) + np.sum(np.where(pred == 1 / bound, 1, 0))
+            pred2 = _bounding_(pred, bounds=bound)
+            self._specified_bound_ = np.sum(np.where(pred2 == pred, 0, 1))
 
         self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred)
 
@@ -1057,9 +1056,12 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
             # TODO need to create smart warning system
             self._out_model_custom = True
             data = patsy.dmatrix(model + ' - 1', self.df)
-            # TODO machine learning predictions
-            # self._Qinit_
-            # self._outcome_model
+            output = stochastic_outcome_machine_learner(xdata=np.asarray(data),
+                                                        ydata=np.asarray(self.df[self.outcome]),
+                                                        ml_model=custom_model,
+                                                        continuous=self._continuous_outcome,
+                                                        print_results=self._verbose_)
+            self._Qinit_, self._outcome_model = output
 
         if not bound:  # Bounding predicted probabilities if requested
             bound = self._cb
@@ -1128,7 +1130,13 @@ def fit(self, p, conditional=None, samples=100):
                     df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0])
 
             # Outcome model under treatment plan
-            y_star = self._outcome_model.predict(df)
+            if self._out_model_custom :
+                data_star = patsy.dmatrix(self._q_model + ' - 1', self.df)
+                y_star = stochastic_outcome_predict(xdata=data_star,
+                                                    fit_ml_model=self._outcome_model,
+                                                    continuous=self._continuous_outcome)
+            else:
+                y_star = self._outcome_model.predict(df)
 
             # Targeted Estimate
             logit_qstar = np.log(probability_to_odds(y_star)) + self.epsilon  # logit(Y^*) + e
diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py
index 29f031c..9e99cf0 100644
--- a/zepid/causal/utils.py
+++ b/zepid/causal/utils.py
@@ -79,7 +79,7 @@ def exposure_machine_learner(xdata, ydata, ml_model, print_results=True):
 
 def outcome_machine_learner(xdata, ydata, all_a, none_a, ml_model, continuous, print_results=True):
     """Function to fit machine learning predictions. Used by TMLE to generate predicted probabilities of outcome
-    (i.e. Pr(Y=1 | A=1, L) and Pr(Y=1 | A=0, L)). Future update will include continuous Y functionality (i.e. E(Y))
+    (i.e. Pr(Y=1 | A=1, L) and Pr(Y=1 | A=0, L)).
     """
     # Trying to fit Machine Learning model
     try:
@@ -114,6 +114,68 @@ def outcome_machine_learner(xdata, ydata, all_a, none_a, ml_model, continuous, p
             raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute")
 
 
+def stochastic_outcome_machine_learner(xdata, ydata, ml_model, continuous, print_results=True):
+    """Function to fit machine learning predictions. Used by StochasticTMLE to generate predicted probabilities of
+    outcome (i.e. Pr(Y=1 | A, L)
+    """
+    # Trying to fit Machine Learning model
+    try:
+        fm = ml_model.fit(X=xdata, y=ydata)
+    except TypeError:
+        raise TypeError("Currently custom_model must have the 'fit' function with arguments 'X', 'y'. This "
+                        "covers both sklearn and supylearner. If there is a predictive model you would "
+                        "like to use, please open an issue at https://github.com/pzivich/zepid and I "
+                        "can work on adding support")
+    if print_results and hasattr(fm, 'summarize'):  # Nice summarize option from SuPyLearner
+        fm.summarize()
+
+    # Generating predictions
+    if continuous:
+        if hasattr(fm, 'predict'):
+            qa = fm.predict(xdata)
+            return qa, fm
+        else:
+            raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute")
+
+    else:
+        if hasattr(fm, 'predict_proba'):
+            qa = fm.predict_proba(xdata)
+            if qa.ndim == 1:  # Allows for PyGAM
+                return qa, fm
+            else:
+                return qa[:, 1], fm
+        elif hasattr(fm, 'predict'):
+            qa = fm.predict(xdata)
+            return qa, fm
+        else:
+            raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute")
+
+
+def stochastic_outcome_predict(xdata, fit_ml_model, continuous):
+    """Function to generate predictions machine learning predictions. Used by StochasticTMLE to generate predicted
+    probabilities of outcome (i.e. Pr(Y=1 | A=a*, L) in the Monte-Carlo integration procedure
+    """
+    # Generating predictions
+    if continuous:
+        if hasattr(fit_ml_model, 'predict'):
+            qa_star = fit_ml_model.predict(xdata)
+            return qa_star
+        else:
+            raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute")
+    else:
+        if hasattr(fit_ml_model, 'predict_proba'):
+            qa_star = fit_ml_model.predict_proba(xdata)
+            if qa_star.ndim == 1:  # Allows for PyGAM
+                return qa_star
+            else:
+                return qa_star[:, 1]
+        elif hasattr(fit_ml_model, 'predict'):
+            qa_star = fit_ml_model.predict(xdata)
+            return qa_star
+        else:
+            raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute")
+
+
 def missing_machine_learner(xdata, mdata, all_a, none_a, ml_model, print_results=True):
     """Function to fit machine learning predictions. Used by TMLE to generate predicted probabilities of missing
      outcome data, Pr(M=1|A,L)

From 9b6508db5daff4c87b91a18908514c57d7eae259 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 9 Jan 2020 14:59:20 -0500
Subject: [PATCH 33/42] fixing error in patsy data parsing for ML

---
 zepid/causal/doublyrobust/TMLE.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index fcdd23f..387903a 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -964,6 +964,9 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v
         self._specified_bound_ = None
         self._denominator_ = None
         self._verbose_ = verbose
+        self._out_model_custom = False
+        self._exp_model_custom = False
+        self._continuous_type = None
 
         # Custom model / machine learner storage
         self._g_custom_ = None
@@ -1029,6 +1032,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
         self._q_model = self.outcome + ' ~ ' + model
 
         if custom_model is None:  # Standard parametric regression
+            self._out_model_custom = False
             self._continuous_type = continuous_distribution
             if self._continuous_outcome:
                 if (continuous_distribution.lower() == 'gaussian') or (continuous_distribution.lower() == 'normal'):
@@ -1130,8 +1134,8 @@ def fit(self, p, conditional=None, samples=100):
                     df[self.exposure] = np.random.binomial(n=1, p=prop, size=df.shape[0])
 
             # Outcome model under treatment plan
-            if self._out_model_custom :
-                data_star = patsy.dmatrix(self._q_model + ' - 1', self.df)
+            if self._out_model_custom:
+                _, data_star = patsy.dmatrices(self._q_model + ' - 1', self.df)
                 y_star = stochastic_outcome_predict(xdata=data_star,
                                                     fit_ml_model=self._outcome_model,
                                                     continuous=self._continuous_outcome)

From 1987dcc9bb389d2955efb831eca44dadd8c8ad7d Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Thu, 9 Jan 2020 16:53:04 -0500
Subject: [PATCH 34/42] seed and checking documentation

---
 zepid/causal/doublyrobust/TMLE.py | 40 ++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index 387903a..a0c8698 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -851,7 +851,7 @@ class StochasticTMLE:
 
     .. math::
 
-        H(A=a, L) = \frac{I(A=a) p}{\widehat{\Pr}(A=a)}
+        H = \frac{p}{\widehat{\Pr}(A=a)}
 
     where `p` is the probability of treatment `a` under the stochastic intervention of interest.
 
@@ -912,7 +912,7 @@ class StochasticTMLE:
     Muñoz ID, and Van Der Laan MJ. Population intervention causal effects based on stochastic interventions.
     Biometrics 68.2 (2012): 541-549.
 
-    Van der Laan MJ, and Sherri R. Targeted learning in data science: causal inference for complex longitudinal
+    van der Laan MJ, and Sherri R. Targeted learning in data science: causal inference for complex longitudinal
     studies. Springer Science & Business Media, 2011.
     """
     def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, verbose=False):
@@ -973,7 +973,8 @@ def __init__(self, df, exposure, outcome, alpha=0.05, continuous_bound=0.0005, v
         self._q_custom_ = None
 
     def exposure_model(self, model, custom_model=None, bound=False):
-        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature
+        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature. This value is used as the denominator
+        for the inverse probability weights.
 
         Parameters
         ----------
@@ -997,7 +998,7 @@ def exposure_model(self, model, custom_model=None, bound=False):
             fitmodel = propensity_score(self.df, self._g_model, print_results=self._verbose_)
             pred = fitmodel.predict(self.df)
         else:  # User-specified prediction model
-            # TODO need to create smart warning system
+            # TODO need to create smart warning system  -- Issue #124
             self._exp_model_custom = True
             data = patsy.dmatrix(model + ' - 1', self.df)
             pred = exposure_machine_learner(xdata=np.asarray(data), ydata=np.asarray(self.df[self.exposure]),
@@ -1043,11 +1044,9 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
                     raise ValueError("Only 'gaussian' and 'poisson' distributions are supported for continuous "
                                      "outcomes")
                 self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit()
-
             else:
                 f = sm.families.family.Binomial()
                 self._outcome_model = smf.glm(self._q_model, self.df, family=f).fit()
-
             if self._verbose_:
                 print('==============================================================================')
                 print('Q-model')
@@ -1057,7 +1056,7 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
             self._Qinit_ = self._outcome_model.predict(self.df)
 
         else:  # User-specified model
-            # TODO need to create smart warning system
+            # TODO need to create smart warning system -- Issue #124
             self._out_model_custom = True
             data = patsy.dmatrix(model + ' - 1', self.df)
             output = stochastic_outcome_machine_learner(xdata=np.asarray(data),
@@ -1073,26 +1072,40 @@ def outcome_model(self, model, custom_model=None, bound=False, continuous_distri
         # This bounding step prevents continuous outcomes from being outside the range
         self._Qinit_ = _bounding_(self._Qinit_, bounds=bound)
 
-    def fit(self, p, conditional=None, samples=100):
+    def fit(self, p, conditional=None, samples=100, seed=None):
         """Calculate the effect from the predicted exposure probabilities and predicted outcome values using the TMLE
         procedure. Confidence intervals are calculated using influence curves.
 
+        Parameters
+        ----------
+        p : float, list, tuple
+            Proportion that correspond to the number of persons treated (all values must be between 0.0 and 1.0). If
+            conditional is specified, p must be a list/tuple of floats of the same length
+        conditional : None, list, tuple, optional
+            A
+        samples : int, optional
+            Number of samples to use for the Monte Carlo integration procedure
+        seed : None, int, optional
+            Seed for the Monte Carlo integration procedure
+
         Note
         ----
         Exposure and outcome models must be specified prior to `fit()`
 
         Returns
         -------
-        TMLE gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and
-        `average _treatment_effect` for continuous outcomes
+        `StochasticTMLE` gains `marginal_vector` and `marginal_outcome` along with `marginal_ci`
         """
-        # TODO add a seed
-
         if self._denominator_ is None:
             raise ValueError("The exposure_model() function must be specified before the fit() function")
         if self._Qinit_ is None:
             raise ValueError("The outcome_model() function must be specified before the fit() function")
 
+        if seed is None:
+            pass
+        else:
+            np.random.seed(seed)
+
         p = np.array(p)
         if np.any(p > 1) or np.any(p < 0):
             raise ValueError("All specified treatment probabilities must be between 0 and 1")
@@ -1195,6 +1208,7 @@ def summary(self, decimal=3):
         print('======================================================================')
         print('          Stochastic Targeted Maximum Likelihood Estimator            ')
         print('======================================================================')
+
         fmt = 'Treatment:        {:<15} No. Observations:     {:<20}'
         print(fmt.format(self.exposure, self.df.shape[0]))
         fmt = 'Outcome:          {:<15} No. Truncated:        {:<20}'
@@ -1208,8 +1222,6 @@ def summary(self, decimal=3):
         fmt = 'No. Resamples:    {:<15}'
         print(fmt.format(self._resamples_))
 
-        # TODO add treatment plan information
-
         print('======================================================================')
         print('Overall incidence:      ', np.round(self.marginal_outcome, decimals=decimal))
         print('======================================================================')

From 83b769e121aa8b07b0f2f52286a6068812241896 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Fri, 10 Jan 2020 11:38:50 -0500
Subject: [PATCH 35/42] updating ML model specification

---
 tests/test_doublyrobust.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py
index c31d219..a1e5114 100644
--- a/tests/test_doublyrobust.py
+++ b/tests/test_doublyrobust.py
@@ -240,7 +240,7 @@ def test_match_r_continuous_poisson(self, cf):
         npt.assert_allclose(tmle.average_treatment_effect_ci, r_ci, rtol=1e-3)
 
     def test_sklearn_in_tmle(self, df):
-        log = LogisticRegression(C=1.0)
+        log = LogisticRegression(C=1.0, solver='liblinear')
         tmle = TMLE(df, exposure='art', outcome='dead')
         tmle.exposure_model('male + age0 + cd40 + dvl0', custom_model=log)
         tmle.outcome_model('art + male + age0 + cd40 + dvl0', custom_model=log)
@@ -257,7 +257,7 @@ def test_sklearn_in_tmle(self, df):
         npt.assert_allclose(tmle.odds_ratio_ci, [0.2139277755, 0.944971255], rtol=1e-5)
 
     def test_sklearn_in_tmle2(self, cf):
-        log = LogisticRegression(C=1.0)
+        log = LogisticRegression(C=1.0, solver='liblinear')
         lin = LinearRegression()
         tmle = TMLE(cf, exposure='art', outcome='cd4_wk45')
         tmle.exposure_model('male + age0 + cd40 + dvl0', custom_model=log)
@@ -317,7 +317,7 @@ def test_missing_continuous_outcome(self, mcf):
         npt.assert_allclose(tmle.average_treatment_effect_ci, r_ci, rtol=1e-3)
 
     def test_sklearn_in_tmle_missing(self, mf):
-        log = LogisticRegression(C=1.0)
+        log = LogisticRegression(C=1.0, solver='liblinear')
         tmle = TMLE(mf, exposure='art', outcome='dead')
         tmle.exposure_model('male + age0 + cd40 + dvl0', custom_model=log, print_results=False)
         tmle.missing_model('male + age0 + cd40 + dvl0', custom_model=log, print_results=False)
@@ -525,6 +525,15 @@ def test_compare_tmle(self, df):
 
     # TODO compare to R in several versions
 
+    def test_machine_learning_runs(self, df):
+        # Only verifies that machine learning doesn't throw an error
+        log = LogisticRegression(penalty='l1', solver='liblinear', random_state=201)
+
+        tmle = StochasticTMLE(df, exposure='art', outcome='dead')
+        tmle.exposure_model('male + age0 + cd40 + cd4_rs1 + cd4_rs2 + dvl0 + male:dvl0', custom_model=log)
+        tmle.outcome_model('art + male + age0 + dvl0  + cd40', custom_model=log)
+        tmle.fit(p=0.4, samples=20)
+
 
 class TestAIPTW:
 

From 03106795259d2e5847b54dfe7165b019a167a4a9 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Fri, 10 Jan 2020 12:37:03 -0500
Subject: [PATCH 36/42] updating StochasticTMLE tests

---
 tests/test_doublyrobust.py        | 66 +++++++++++++++++++++++++++++--
 zepid/causal/doublyrobust/TMLE.py |  1 +
 2 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/tests/test_doublyrobust.py b/tests/test_doublyrobust.py
index a1e5114..185be4c 100644
--- a/tests/test_doublyrobust.py
+++ b/tests/test_doublyrobust.py
@@ -503,7 +503,7 @@ def test_gmodel_params(self, simple_df):
 
         npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
 
-    def test_compare_tmle(self, df):
+    def test_compare_tmle_binary(self, df):
         stmle = StochasticTMLE(df, exposure='art', outcome='dead')
         stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
         stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
@@ -521,9 +521,69 @@ def test_compare_tmle(self, df):
 
         npt.assert_allclose(expected, all_treat - non_treat, atol=1e-4)
 
-    # TODO check bounding
+    def test_compare_tmle_continuous(self, cf):
+        cf['cd4_wk45'] = np.log(cf['cd4_wk45'])
+        stmle = StochasticTMLE(cf, exposure='art', outcome='cd4_wk45')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.fit(p=1.0, samples=1)
+        all_treat = stmle.marginal_outcome
+        stmle.fit(p=0.0, samples=1)
+        non_treat = stmle.marginal_outcome
+
+        tmle = TMLE(cf, exposure='art', outcome='cd4_wk45')
+        tmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0', print_results=False)
+        tmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0',
+                           print_results=False)
+        tmle.fit()
+        expected = tmle.average_treatment_effect
+
+        npt.assert_allclose(expected, all_treat - non_treat, atol=1e-3)
+
+    def test_qmodel_bound(self, simple_df):
+        # Comparing to SAS logit model
+        sas_params = [-1.0699, -0.9525, 1.5462]
+        sas_preds = [0.3831332, 0.2554221, 0.2, 0.2554221, 0.6, 0.6, 0.2, 0.2554221, 0.3831332]
+
+        stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='Y')
+        stmle.outcome_model('A + W', bound=[0.2, 0.6])
+        est_params = stmle._outcome_model.params
+        est_preds = stmle._Qinit_
+
+        npt.assert_allclose(sas_params, est_params, atol=1e-4)
+        npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
+
+    def test_gmodel_bound(self, simple_df):
+        # Comparing to SAS Poisson model
+        sas_preds = [2, 1/0.55, 1/0.45, 1/0.55, 2, 2, 1/0.45, 1/0.55, 2]
+
+        stmle = StochasticTMLE(df=simple_df, exposure='A', outcome='C')
+        stmle.exposure_model('W', bound=[0.45, 0.55])
+        est_preds = 1 / stmle._denominator_
+
+        npt.assert_allclose(sas_preds, est_preds, atol=1e-6)
+
+    def test_calculate_epsilon1(self, df):
+        stmle = StochasticTMLE(df, exposure='art', outcome='dead')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + dvl0  + cd40 + cd4_rs1 + cd4_rs2')
+
+        stmle.fit(p=0.15, samples=1)
+        npt.assert_allclose(-0.0157043107, stmle.epsilon, atol=1e-6)
+
+        stmle.fit(p=0.4, samples=1)
+        npt.assert_allclose(-0.0381559025, stmle.epsilon, atol=1e-6)
+
+    def test_calculate_epsilon2(self, cf):
+        stmle = StochasticTMLE(cf, exposure='art', outcome='cd4_wk45')
+        stmle.exposure_model('male + age0 + age_rs1 + age_rs2 + cd40 + cd4_rs1 + cd4_rs2 + dvl0')
+        stmle.outcome_model('art + male + age0 + age_rs1 + age_rs2 + dvl0  + cd40 + cd4_rs1 + cd4_rs2')
+
+        stmle.fit(p=0.15, samples=1)
+        npt.assert_allclose(-0.0059476590, stmle.epsilon, atol=1e-6)
 
-    # TODO compare to R in several versions
+        stmle.fit(p=0.4, samples=1)
+        npt.assert_allclose(-0.0154923643, stmle.epsilon, atol=1e-6)
 
     def test_machine_learning_runs(self, df):
         # Only verifies that machine learning doesn't throw an error
diff --git a/zepid/causal/doublyrobust/TMLE.py b/zepid/causal/doublyrobust/TMLE.py
index a0c8698..31a379a 100644
--- a/zepid/causal/doublyrobust/TMLE.py
+++ b/zepid/causal/doublyrobust/TMLE.py
@@ -1007,6 +1007,7 @@ def exposure_model(self, model, custom_model=None, bound=False):
         if bound:  # Bounding predicted probabilities if requested
             pred2 = _bounding_(pred, bounds=bound)
             self._specified_bound_ = np.sum(np.where(pred2 == pred, 0, 1))
+            pred = pred2
 
         self._denominator_ = np.where(self.df[self.exposure] == 1, pred, 1 - pred)
 

From ab6c6b3634ea18b24d3466a6fde6068ca4a9d320 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Sun, 5 Jul 2020 12:10:30 -0400
Subject: [PATCH 37/42] updating based on review

---
 zepid/causal/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/zepid/causal/utils.py b/zepid/causal/utils.py
index 9e99cf0..fb13755 100644
--- a/zepid/causal/utils.py
+++ b/zepid/causal/utils.py
@@ -676,10 +676,10 @@ def plot_kde_accuracy(values, bw_method='scott', fill=True, color='b'):
 def stochastic_check_conditional(df, conditional):
     """Check that conditionals are exclusive for the stochastic fit process. Generates a warning if not true
     """
-    a = np.array([0] * df.shape[0])
+    a = np.zeros(df.shape[0])
     for c in conditional:
-        a = np.add(a, np.where(eval(c), 1, 0))
+        a = a + np.where(eval(c), 1, 0)
 
-    if np.sum(np.where(a > 1, 1, 0)):
+    if np.any(a > 1):
         warnings.warn("It looks like your conditional categories are NOT exclusive. For appropriate estimation, "
                       "the conditions that designate each category should be exclusive", UserWarning)

From a4243b060eb95bcc8cf55d3f38027667ef52a0f8 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Sun, 12 Jul 2020 14:19:53 -0400
Subject: [PATCH 38/42] adding error with the AIPSW weights since seems to be
 underlying issue

---
 tests/test_generalize.py              | 36 +++++++++++++--------------
 zepid/causal/generalize/estimators.py | 12 +++++----
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/tests/test_generalize.py b/tests/test_generalize.py
index a4d277e..35cecf1 100644
--- a/tests/test_generalize.py
+++ b/tests/test_generalize.py
@@ -18,7 +18,7 @@ def df_r():
 def df_c():
     df = ze.load_generalize_data(True)
     df['W_sq'] = df['W'] ** 2
-    df['weight'] = 3
+    df['weight'] = 2
     return df
 
 
@@ -195,20 +195,20 @@ def test_transport_conf(self, df_c):
         npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5)
         npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4)
 
-    def test_generalize_weight(self, df_c):
-        aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight')
-        aipw.sampling_model('L + W_sq', print_results=False)
-        aipw.treatment_model('L', print_results=False)
-        aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
-        aipw.fit()
-        npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5)
-        npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4)
-
-    def test_transport_weight(self, df_c):
-        aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight')
-        aipw.sampling_model('L + W_sq', print_results=False)
-        aipw.treatment_model('L', print_results=False)
-        aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
-        aipw.fit()
-        npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5)
-        npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4)
+    # def test_generalize_weight(self, df_c):
+    #    aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=True, weights='weight')
+    #    aipw.sampling_model('L + W_sq', print_results=False)
+    #    aipw.treatment_model('L', print_results=False)
+    #    aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
+    #    aipw.fit()
+    #    npt.assert_allclose(aipw.risk_difference, 0.048129, atol=1e-5)
+    #    npt.assert_allclose(aipw.risk_ratio, 1.146787, atol=1e-4)
+
+    # def test_transport_weight(self, df_c):
+    #    aipw = AIPSW(df_c, exposure='A', outcome='Y', selection='S', generalize=False, weights='weight')
+    #    aipw.sampling_model('L + W_sq', print_results=False)
+    #    aipw.treatment_model('L', print_results=False)
+    #    aipw.outcome_model('A + L + L:A + W_sq + W_sq:A + W_sq:A:L', print_results=False)
+    #    aipw.fit()
+    #    npt.assert_allclose(aipw.risk_difference, 0.041407, atol=1e-5)
+    #    npt.assert_allclose(aipw.risk_ratio, 1.120556, atol=1e-4)
diff --git a/zepid/causal/generalize/estimators.py b/zepid/causal/generalize/estimators.py
index 7c3a1cc..d156b03 100644
--- a/zepid/causal/generalize/estimators.py
+++ b/zepid/causal/generalize/estimators.py
@@ -601,6 +601,8 @@ def __init__(self, df, exposure, outcome, selection, generalize=True, weights=No
         self.exposure = exposure
         self.outcome = outcome
         self.selection = selection
+        if weights is not None:
+            raise ValueError("AIPSW is not stable with `weights`. This functionality is currently unavailable")
         self.weight = weights
 
         self.ipsw = None
@@ -732,11 +734,11 @@ def outcome_model(self, model, outcome_type='binary', print_results=True):
             print(self._outcome_model.summary())
 
         dfa = self.df.copy()
-        dfn = self.df.copy()
         dfa[self.exposure] = 1
-        dfn[self.exposure] = 0
-
         self._YA1 = self._outcome_model.predict(dfa)
+
+        dfn = self.df.copy()
+        dfn[self.exposure] = 0
         self._YA0 = self._outcome_model.predict(dfn)
 
     def fit(self):
@@ -748,9 +750,9 @@ def fit(self):
         the exposure-outcome relationship based on the data and IPSW model
         """
         if not self._denominator_model:
-            raise ValueError('The weight_model() function must be specified before effect measure estimation')
+            raise ValueError('sampling_model() function must be specified before effect measure estimation')
         if not self._outcome_model:
-            raise ValueError('The outcome_model() function must be specified before effect measure estimation')
+            raise ValueError('outcome_model() function must be specified before effect measure estimation')
 
         if self.weight is not None:
             if self.iptw is None:

From 043cbd9bd1339283957358aa47d83b18e3c77c77 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Sun, 12 Jul 2020 14:22:00 -0400
Subject: [PATCH 39/42] updating the travisci 3.7 call:

---
 .travis.yml | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index f086f5b..4e14b95 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,12 +3,7 @@ language: python
 python:
    - "3.5"
    - "3.6"
-
-matrix:
-  include:
-    - python: 3.7
-      dist: xenial
-      sudo: true
+   - "3.7"
 
 notifications:
   email: false

From dff5c34b4d13f7b5e39fd838b2f88b7315c633bb Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Sun, 12 Jul 2020 14:23:09 -0400
Subject: [PATCH 40/42] constrating numpy in travisci for python 3.5

---
 travis-requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis-requirements.txt b/travis-requirements.txt
index 32a3c3a..b6ac319 100644
--- a/travis-requirements.txt
+++ b/travis-requirements.txt
@@ -1,4 +1,4 @@
-numpy
+numpy<=1.18.5
 pandas>=0.18
 statsmodels>=0.7.0
 matplotlib>=2.1.0

From bc6e6d00317a186a3ca9688573b5f6dcd304beca Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Sun, 12 Jul 2020 14:31:29 -0400
Subject: [PATCH 41/42] scipy version restrictions

---
 travis-requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis-requirements.txt b/travis-requirements.txt
index b6ac319..2fe235e 100644
--- a/travis-requirements.txt
+++ b/travis-requirements.txt
@@ -2,7 +2,7 @@ numpy<=1.18.5
 pandas>=0.18
 statsmodels>=0.7.0
 matplotlib>=2.1.0
-scipy
+scipy<=1.0.1
 tabulate
 patsy
 sklearn

From 1d0076c349810799dec66c45673ccb02fc188ec8 Mon Sep 17 00:00:00 2001
From: pzivich <pzivich@live.unc.edu>
Date: Sun, 12 Jul 2020 14:38:58 -0400
Subject: [PATCH 42/42] another attempt to get travis ci working for 3.5 and
 3.7

---
 .travis.yml             | 5 +----
 travis-requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 4e14b95..27c8426 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,12 +8,9 @@ python:
 notifications:
   email: false
 
-before_install:
-  - "pip install -U pip"
-  - "python setup.py install"
-
 install:
   - pip install -r travis-requirements.txt
+  - python setup.py install
 
 script:
   - pytest
\ No newline at end of file
diff --git a/travis-requirements.txt b/travis-requirements.txt
index 2fe235e..666490f 100644
--- a/travis-requirements.txt
+++ b/travis-requirements.txt
@@ -2,7 +2,7 @@ numpy<=1.18.5
 pandas>=0.18
 statsmodels>=0.7.0
 matplotlib>=2.1.0
-scipy<=1.0.1
+scipy<=1.4.1
 tabulate
 patsy
 sklearn