Merge pull request #42 from sdfordham/fix-penalized-synth
Fix penalized synth
sdfordham authored Feb 3, 2024
2 parents 982dd1b + fe71ea3 commit fb07cbd
Showing 5 changed files with 218 additions and 65 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -7,7 +7,7 @@ A python module for the synthetic control method that provides implementations of
 - Synthetic Control Method (Abadie & Gardeazabal 2003)
 - Robust Synthetic Control Method (Amjad, Shah & Shen 2018)
 - Augmented Synthetic Control Method (Ben-Michael, Feller & Rothstein 2021)
-- ~~Penalized Synthetic Control Method (Abadie & L'Hour 2021)~~ (the implementation of this method is flawed, and in the process of being corrected.)
+- Penalized Synthetic Control Method (Abadie & L'Hour 2021)
 
 The package also provides methods for performing placebo tests with the above.

examples/penalized/basque_penalized.ipynb: 118 changes (83 additions, 35 deletions)

Large diffs are not rendered by default.

pysyncon/penalized.py: 110 changes (85 additions, 25 deletions)
@@ -1,7 +1,8 @@
 from __future__ import annotations
-from typing import Optional
+from typing import Optional, Literal, Union
 
 import numpy as np
+import pandas as pd
 from scipy.optimize import minimize, Bounds, LinearConstraint
 
 from .dataprep import Dataprep
@@ -11,24 +12,33 @@
 class PenalizedOptimMixin:
     @staticmethod
     def w_optimize(
+        V_mat: np.ndarray,
         X0: np.ndarray,
         X1: np.ndarray,
         lambda_: float,
         initial: Optional[np.ndarray] = None,
+        qp_method: Literal["SLSQP"] = "SLSQP",
+        qp_options: dict = {"maxiter": 1000},
     ) -> tuple[np.ndarray, float]:
         """Solves the weight optimisation problem in the penalized setting.
 
         Parameters
         ----------
-        X0 : numpy.ndarray, shape (m, c)
+        V_mat : numpy.ndarray, shape (c, c)
+            The V matrix (using the notation of the Abadie, Diamond &
+            Hainmueller paper; this matrix is denoted by Γ in the Abadie and
+            L'Hour paper).
+        X0 : numpy.ndarray, shape (c, m)
             Matrix with each column corresponding to a control unit and each
             row to a covariate.
-        X1 : numpy.ndarray, shape (m,)
+        X1 : numpy.ndarray, shape (c,)
             Column vector giving the covariate values for the treated unit.
         lambda_ : float
             Regularization parameter.
         initial : numpy.ndarray, shape (m,), optional
             Initial point to use in the optimisation problem.
+        qp_method : str, optional
+            Minimization routine to use in scipy minimize to solve the
+            problem, by default "SLSQP".
+        qp_options : dict, optional
+            Options for scipy minimize, by default {"maxiter": 1000}.
 
         Returns
         -------
@@ -38,21 +48,28 @@ def w_optimize(
         :meta private:
         """
         _, n_c = X0.shape
 
         diff = np.subtract(X0, X1.reshape(-1, 1))
-        r = np.linalg.norm(diff, axis=0)
+        r = np.diag(diff.T @ V_mat @ diff)
+
+        P = X0.T @ V_mat @ X0
+        q = -1.0 * X1.T @ V_mat @ X0 + (lambda_ / 2.0) * r.T
 
         def fun(x):
-            return (X1 - X0 @ x).T @ (X1 - X0 @ x) + lambda_ * (r.T @ x)
+            return q.T @ x + 0.5 * x.T @ P @ x
 
         bounds = Bounds(lb=np.full(n_c, 0.0), ub=np.full(n_c, 1.0))
         constraints = LinearConstraint(A=np.full(n_c, 1.0), lb=1.0, ub=1.0)
 
-        if initial:
-            x0 = initial
-        else:
-            x0 = np.full(n_c, 1 / n_c)
-
-        res = minimize(fun=fun, x0=x0, bounds=bounds, constraints=constraints)
+        x0 = np.full(n_c, 1 / n_c)
+        res = minimize(
+            fun=fun,
+            x0=x0,
+            bounds=bounds,
+            constraints=constraints,
+            method=qp_method,
+            options=qp_options,
+        )
         W, loss_W = res["x"], res["fun"]
         return W, loss_W.item()
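For reference, the new `fun` is the Abadie & L'Hour penalized objective, `(X1 - X0 @ w) @ V @ (X1 - X0 @ w) + lambda_ * r @ w`, rewritten in standard quadratic-program form. A minimal standalone sketch (illustrative sizes and names, not part of this commit) checking that `q @ w + 0.5 * w @ P @ w` differs from the penalized objective only by a factor of 2 and a constant independent of `w`, so both share the same minimiser:

```python
import numpy as np

rng = np.random.default_rng(0)
c, m = 3, 5                    # c covariates, m control units (made-up sizes)
X0 = rng.random((c, m))        # one column per control unit
X1 = rng.random(c)             # covariate values for the treated unit
V = np.identity(c)             # Γ in the Abadie & L'Hour notation
lambda_ = 0.01

# Pairwise discrepancies r_j = (X1 - X0_j)' V (X1 - X0_j), as in the diff above.
diff = np.subtract(X0, X1.reshape(-1, 1))
r = np.diag(diff.T @ V @ diff)

P = X0.T @ V @ X0
q = -1.0 * X1.T @ V @ X0 + (lambda_ / 2.0) * r

w = rng.dirichlet(np.ones(m))  # an arbitrary point on the unit simplex

direct = (X1 - X0 @ w) @ V @ (X1 - X0 @ w) + lambda_ * r @ w
quadratic = q @ w + 0.5 * w @ P @ w

# The two objectives differ by the constant X1' V X1 (up to a factor of 2),
# so minimising one minimises the other.
assert np.isclose(direct, 2.0 * quadratic + X1 @ V @ X1)
```

Since `P = X0.T @ V_mat @ X0` is positive semi-definite whenever `V_mat` is, the problem handed to SLSQP is a convex QP over the unit simplex.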

@@ -64,24 +81,67 @@ class PenalizedSynth(BaseSynth, PenalizedOptimMixin):

     def __init__(self) -> None:
         super().__init__()
         self.W: Optional[np.ndarray] = None
+        self.loss_W: Optional[float] = None
         self.lambda_: Optional[float] = None
 
-    def fit(self, dataprep: Dataprep, lambda_: float) -> None:
+    def fit(
+        self,
+        dataprep: Optional[Dataprep] = None,
+        X0: Optional[pd.DataFrame] = None,
+        X1: Optional[Union[pd.Series, pd.DataFrame]] = None,
+        lambda_: Optional[float] = 0.01,
+        custom_V: Optional[np.ndarray] = None,
+    ) -> None:
         """Fit the model/calculate the weights.
 
         Parameters
         ----------
-        dataprep : Dataprep
-            :class:`Dataprep` object containing data to model.
-        lambda_ : float
-            Ridge parameter to use.
+        dataprep : Dataprep, optional
+            :class:`Dataprep` object containing data to model, by default None.
+        X0 : pandas.DataFrame, shape (c, m), optional
+            Matrix with each column corresponding to a control unit and each
+            row to a covariate value, by default None.
+        X1 : pandas.Series | pandas.DataFrame, shape (c, 1), optional
+            Column vector giving the covariate values for the treated unit, by
+            default None.
+        lambda_ : float, optional
+            Ridge parameter to use, by default 0.01.
+        custom_V : numpy.ndarray, shape (c,), optional
+            Provide the diagonal of the V matrix (using the notation of the
+            Abadie, Diamond & Hainmueller paper; this matrix is denoted by Γ
+            in the Abadie and L'Hour paper). If not provided then the identity
+            matrix is used (equal importance to all covariates).
 
         Returns
         -------
         NoneType
             None
 
+        Raises
+        ------
+        ValueError
+            If neither a Dataprep object nor both of (X0, X1) are supplied.
         """
-        self.dataprep = dataprep
+        if dataprep:
+            self.dataprep = dataprep
+            X0, X1 = dataprep.make_covariate_mats()
+        else:
+            if X0 is None or X1 is None:
+                raise ValueError("dataprep must be set or (X0, X1) must both be set.")
         self.lambda_ = lambda_
 
-        X0_df, X1_df = dataprep.make_outcome_mats()
-        X0, X1 = X0_df.to_numpy(), X1_df.to_numpy()
+        X = pd.concat([X0, X1], axis=1)
+        X_scaled = X.divide(X.std(axis=1), axis=0)
+        X0_scaled, X1_scaled = X_scaled.drop(columns=X1.name), X_scaled[X1.name]
+
+        X0_arr = X0_scaled.to_numpy()
+        X1_arr = X1_scaled.to_numpy()
+
+        if custom_V is None:
+            V_mat = np.identity(X0_arr.shape[0])
+        else:
+            V_mat = np.diag(custom_V)
 
-        W, _ = self.w_optimize(X0=X0, X1=X1, lambda_=lambda_)
-        self.W = W
+        W, loss_W = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr, lambda_=lambda_)
+        self.W, self.loss_W = W, loss_W
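A minimal usage sketch of the new `fit` signature (the `dataprep` object is assumed; see the test fixture below for a full construction). Two behavioural notes from the diff: `fit` now matches on the covariate matrices from `make_covariate_mats` rather than the outcome matrices, and each covariate row is scaled by its standard deviation across all units before the weights are solved for:

```python
import numpy as np
from pysyncon import PenalizedSynth

pen = PenalizedSynth()

# Default: identity V, i.e. equal importance for every covariate row.
pen.fit(dataprep=dataprep, lambda_=0.01)
print(pen.W)       # weights, one per control unit
print(pen.loss_W)  # objective value at the optimum

# Custom V: supply the diagonal, one entry per covariate row
# (four here, matching the fixture in tests/test_penalized.py below).
pen.fit(dataprep=dataprep, lambda_=0.01, custom_V=np.full(4, 1.0))
```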
tests/test_penalized.py: 43 changes (43 additions, 0 deletions)
@@ -0,0 +1,43 @@
+import unittest
+import numpy as np
+import pandas as pd
+
+import pysyncon
+
+
+class TestPenalizedSynth(unittest.TestCase):
+    def setUp(self):
+        self.dataprep = pysyncon.Dataprep(
+            foo=pd.DataFrame(
+                {
+                    "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4],
+                    "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
+                    "dependent": np.random.random(12),
+                    "predictor1": np.random.random(12),
+                    "predictor2": np.random.random(12),
+                }
+            ),
+            predictors=["predictor1"],
+            predictors_op="mean",
+            dependent="dependent",
+            unit_variable="name",
+            time_variable="time",
+            treatment_identifier=1,
+            controls_identifier=[2, 3],
+            time_predictors_prior=[2, 3],
+            time_optimize_ssr=[1, 2, 3],
+            special_predictors=[
+                ("predictor1", [2], "mean"),
+                ("predictor2", [1, 2], "median"),
+                ("predictor2", [1, 2], "std"),
+            ],
+        )
+        self.custom_V = np.full(4, 1.0)
+
+    def test_fit_no_data(self):
+        pen = pysyncon.PenalizedSynth()
+        self.assertRaises(ValueError, pen.fit)
+
+    def test_fit_custom_V(self):
+        pen = pysyncon.PenalizedSynth()
+        pen.fit(dataprep=self.dataprep, custom_V=self.custom_V)
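The fixture yields four covariate rows (the mean of predictor1 over the prior period, plus the three special predictors), which is why `custom_V` is a length-4 vector. Assuming the usual repository layout, the new module can be run from the repository root with `python -m unittest discover -s tests -p "test_penalized.py"`.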
tests/test_penalized_basque.py: 10 changes (6 additions, 4 deletions)
@@ -50,27 +50,29 @@ def setUp(self):
"Navarra (Comunidad Foral De)",
"Principado De Asturias",
"Rioja (La)",
"Spain (Espana)",
],
time_optimize_ssr=range(1960, 1970),
)
self.lambda_ = 0.01
self.weights = {
"Aragon": 0.0,
"Baleares (Islas)": 0.21680004,
"Baleares (Islas)": 0.0,
"Andalucia": 0.0,
"Canarias": 0.0,
"Cantabria": 0.0,
"Cantabria": 0.241,
"Castilla Y Leon": 0.0,
"Castilla-La Mancha": 0.0,
"Cataluna": 0.636479454,
"Cataluna": 0.759,
"Comunidad Valenciana": 0.0,
"Extremadura": 0.0,
"Galicia": 0.0,
"Madrid (Comunidad De)": 0.146720506,
"Madrid (Comunidad De)": 0.0,
"Murcia (Region de)": 0.0,
"Navarra (Comunidad Foral De)": 0.0,
"Principado De Asturias": 0.0,
"Rioja (La)": 0.0,
"Spain (Espana)": 0.0,
}

def test_weights(self):
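Note that the updated expected weights lie on the unit simplex: the two nonzero entries, 0.241 (Cantabria) and 0.759 (Cataluna), sum to one, consistent with the equality constraint in `w_optimize`.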
