Commit a4e6030

Merge pull request #17 from washingtonpost/ELEX-3031-voterflow-models 🎉
ELEX-3031: Potential Voterflow Solvers

dmnapolitano authored Mar 29, 2024
2 parents 07b58fd + 06385f0 commit a4e6030
Showing 8 changed files with 598 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -4,10 +4,10 @@ jobs:
   test:
     name: Run unit tests
     runs-on: ubuntu-latest
-    timeout-minutes: 5
+    timeout-minutes: 15
     strategy:
       matrix:
-        python-version: ['3.10']
+        python-version: ['3.11']
     steps:
       - uses: actions/checkout@v2
       - name: Setup Python
4 changes: 2 additions & 2 deletions README.md
@@ -16,8 +16,8 @@ We have our own implementation of ordinary least squares in Python because this
 ## Quantile Regression
 Since we did not find any implementations of quantile regression in Python that fit our needs, we decided to write one ourselves. At the moment this uses two libraries; the version that solves the non-regularized problem uses `numpy` and solves the dual based on [this](https://arxiv.org/pdf/2305.12616.pdf) paper. The version that solves the regularized problem uses [`cvxpy`](https://www.cvxpy.org/#) and sets up the problem as a normal optimization problem. Eventually, we are planning on replacing the regularized version with the dual also.

-## Transition matrices
-We also have a solver for transition matrices. While this works arbitrarily, we have used this in the past for our primary election night model. We can still use this to create the sankey diagram coefficients.
+## Transition Matrices
+We also have a matrix regression solver built with `cvxpy`. We've used this for our primary election model and analysis. The transitions it generates form the transitions displayed in our sankey diagrams.

 ## Development
 We welcome contributions to this repo. Please open a Github issue for any issues or comments you have.
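As context for the Quantile Regression section quoted above: a minimal, illustrative sketch of how a regularized quantile-regression problem can be posed in `cvxpy` with a pinball loss plus an L2 penalty. This is not the package's actual implementation; the toy data and the `tau` and `lam` values are assumptions.

```python
# Illustrative only -- not elex-solver's implementation.
import cvxpy as cp
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))                              # toy design matrix
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=100)  # toy response
tau, lam = 0.5, 0.1                                        # quantile level and ridge strength (assumed)

beta = cp.Variable(3)
res = y - X @ beta
pinball = cp.sum(0.5 * cp.abs(res) + (tau - 0.5) * res)    # quantile (pinball) loss
problem = cp.Problem(cp.Minimize(pinball + lam * cp.sum_squares(beta)))
problem.solve()
print(beta.value)
```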
6 changes: 5 additions & 1 deletion setup.cfg
@@ -2,4 +2,8 @@
 universal = 1

 [pycodestyle]
-max-line-length = 160
+max-line-length = 160
+
+[pylint]
+max-line-length = 160
+disable = invalid-name, duplicate-code, missing-function-docstring, too-many-instance-attributes, too-many-arguments, too-many-locals, missing-module-docstring
4 changes: 2 additions & 2 deletions setup.py
@@ -3,7 +3,7 @@

 from setuptools import find_packages, setup

-INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "scipy~=1.11"]
+INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "scipy~=1.12"]

 THIS_FILE_DIR = os.path.dirname(__file__)

@@ -29,7 +29,7 @@
         "Intended Audience :: Developers",
         "License :: OSI Approved :: MIT License",
         "Programming Language :: Python",
-        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
     ],
     description="A package for optimization solvers",
     long_description=LONG_DESCRIPTION,
100 changes: 81 additions & 19 deletions src/elexsolver/TransitionMatrixSolver.py
@@ -1,28 +1,90 @@
 import logging
+import warnings

 import cvxpy as cp
 import numpy as np

 from elexsolver.logging import initialize_logging
+from elexsolver.TransitionSolver import TransitionSolver

 initialize_logging()

 LOG = logging.getLogger(__name__)


-class TransitionMatrixSolver:
-    def __init__(self):
-        self.transition_matrix = None
+class TransitionMatrixSolver(TransitionSolver):
+    """
+    Matrix regression transition solver using CVXPY.
+    """
+
+    def __init__(self, strict: bool = True, lam: float | None = None):
+        """
+        Parameters
+        ----------
+        strict : bool, default True
+            If `True`, solution will be constrained so that all coefficients are >= 0,
+            <= 1, and the sum of each row equals 1.
+        lam : float, optional
+            `lam != 0` will enable L2 regularization (Ridge).
+        """
+        super().__init__()
+        self._strict = strict
+        self._lambda = lam

     @staticmethod
-    def __get_constraint(X, strict):
+    def __get_constraints(coef: np.ndarray, strict: bool) -> list:
         if strict:
-            return [cp.sum(X, axis=1) == 1]
-        return [cp.sum(X, axis=1) <= 1.1, cp.sum(X, axis=1) >= 0.9]
-
-    def __solve(self, A, B, strict):
-        transition_matrix = cp.Variable((A.shape[1], B.shape[1]))
-        loss_function = cp.norm(A @ transition_matrix - B, "fro")
-        objective = cp.Minimize(loss_function)
-        constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, strict)
-        problem = cp.Problem(objective, constraint)
-        problem.solve()
+            return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1]
+        return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9]

+    def __standard_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray) -> cp.Minimize:
+        loss_function = cp.norm(A @ beta - B, "fro")
+        return cp.Minimize(loss_function)
+
+    def __ridge_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray) -> cp.Minimize:
+        # Based on https://www.cvxpy.org/examples/machine_learning/ridge_regression.html
+        lam = cp.Parameter(nonneg=True, value=self._lambda)
+        loss_function = cp.pnorm(A @ beta - B, p=2) ** 2
+        regularizer = cp.pnorm(beta, p=2) ** 2
+        return cp.Minimize(loss_function + lam * regularizer)
+
+    def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray) -> np.ndarray:
+        transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True)
+        Aw = np.dot(weights, A)
+        Bw = np.dot(weights, B)
+
+        if self._lambda is None or self._lambda == 0:
+            objective = self.__standard_objective(Aw, Bw, transition_matrix)
+        else:
+            objective = self.__ridge_objective(Aw, Bw, transition_matrix)
+
+        constraints = TransitionMatrixSolver.__get_constraints(transition_matrix, self._strict)
+        problem = cp.Problem(objective, constraints)
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            try:
+                problem.solve(solver=cp.CLARABEL)
+            except (UserWarning, cp.error.SolverError) as e:
+                raise RuntimeError(e) from e
+
         return transition_matrix.value

-    def fit(self, A, B, strict=False):
-        transition_matrix = self.__solve(A, B, strict)
-        self.transition_matrix = transition_matrix
+    def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None) -> np.ndarray:
+        self._check_any_element_nan_or_inf(X)
+        self._check_any_element_nan_or_inf(Y)
+        self._check_for_zero_units(X)
+        self._check_for_zero_units(Y)
+
+        if not isinstance(X, np.ndarray):
+            X = X.to_numpy()
+        if not isinstance(Y, np.ndarray):
+            Y = Y.to_numpy()
+
+        if X.shape[0] != Y.shape[0]:
+            raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).")
+
+        weights = self._check_and_prepare_weights(X, Y, sample_weight)

-    def predict(self, A):
-        return A @ self.transition_matrix
+        self.coefficients = self.__solve(X, Y, weights)
+        return self
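To show how the new `TransitionMatrixSolver` API above fits together, here is a brief usage sketch. The vote-count matrices and weights are made-up toy values, and the printed coefficients depend on the solver; this is illustrative, not output from the real model.

```python
# Illustrative sketch of the API added in this diff; the toy matrices are assumptions.
import numpy as np
from elexsolver.TransitionMatrixSolver import TransitionMatrixSolver

# Each row is a unit (e.g. a precinct); columns of X are groups in election 1,
# columns of Y are groups in election 2. Row counts must match.
X = np.array([[100, 50], [80, 120], [60, 90]])
Y = np.array([[90, 60], [100, 100], [70, 80]])

solver = TransitionMatrixSolver(strict=True)          # rows of the solution sum to 1
solver.fit(X, Y, sample_weight=np.array([1, 2, 1]))   # optional per-unit weights

print(solver.coefficients)   # (columns of X) x (columns of Y) transition matrix
print(solver.predict(X))     # Y_hat = X @ coefficients
```

With `strict=True`, each row of `coefficients` is constrained to lie in [0, 1] and sum to 1, so a row can be read as the share of that X-group flowing to each Y-group.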
95 changes: 95 additions & 0 deletions src/elexsolver/TransitionSolver.py
@@ -0,0 +1,95 @@
+import logging
+
+import numpy as np
+
+from elexsolver.LinearSolver import LinearSolver
+from elexsolver.logging import initialize_logging
+
+initialize_logging()
+
+LOG = logging.getLogger(__name__)
+
+
+class TransitionSolver(LinearSolver):
+    """
+    Abstract class for transition solvers.
+    """
+
+    def __init__(self):
+        """
+        After model-fit, `self.coefficients` will contain
+        the solved coefficients, an np.ndarray matrix of float of shape
+        (number of columns in `X`) x (number of columns in `Y`).
+        Each float represents the percent of how much of row x is part of column y.
+        """
+        super().__init__()
+
+    def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None):
+        """
+        Parameters
+        ----------
+        X : np.ndarray matrix or pandas.DataFrame of int
+            Must have the same number of rows as `Y` but can have any number of columns greater than the number of rows.
+        Y : np.ndarray matrix or pandas.DataFrame of int
+            Must have the same number of rows as `X` but can have any number of columns greater than the number of rows.
+        sample_weight : list or np.ndarray or pandas.Series of int, optional
+            Must have the same length (number of rows) as both `X` and `Y`.
+        Returns
+        -------
+        `self` and populates `betas` with the beta coefficients determined by this solver.
+        `betas` is an np.ndarray matrix of float of shape (number of columns in `X`) x (number of columns in `Y`).
+        Each float represents the percent of how much of row x is part of column y.
+        """
+        raise NotImplementedError
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """
+        Parameters
+        ----------
+        X : np.ndarray matrix or pandas.DataFrame of int
+            Must have the same dimensions as the `X` supplied to `fit()`.
+        Returns
+        -------
+        `Y_hat`, np.ndarray of float of the same shape as Y.
+        """
+        if self.coefficients is None:
+            raise RuntimeError("Solver must be fit before prediction can be performed.")
+
+        self._check_any_element_nan_or_inf(X)
+
+        return X @ self.coefficients
+
+    def _check_for_zero_units(self, A: np.ndarray):
+        """
+        If we have at least one unit whose columns are all zero, most if not all of our solvers will fail.
+        """
+        if np.any(np.sum(A, axis=1) == 0):
+            raise ValueError("Matrix cannot contain any rows (units) where all columns (things) are zero.")
+
+    def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None) -> np.ndarray:
+        """
+        If `weights` is not None, and `weights` has the same number of rows in both matrices `X` and `Y`,
+        we'll rescale the weights by taking the square root after dividing them by their sum,
+        then return a diagonal matrix containing these now-normalized weights.
+        If `weights` is None, return a diagonal matrix of ones.
+        Parameters
+        ----------
+        X : np.ndarray matrix of int (same number of rows as `Y`)
+        Y : np.ndarray matrix of int (same number of rows as `X`)
+        weights : np.ndarray of int of the shape (number of rows in `X` and `Y`, 1), optional
+        """
+
+        if weights is not None:
+            if len(weights) != X.shape[0] and len(weights) != Y.shape[0]:
+                raise ValueError("weights must be the same length as the number of rows in X and Y.")
+            if isinstance(weights, list):
+                weights = np.array(weights).copy()
+            elif not isinstance(weights, np.ndarray):
+                # pandas.Series
+                weights = weights.values.copy()
+            return np.diag(np.sqrt(weights.flatten() / weights.sum()))
+
+        return np.diag(np.ones((Y.shape[0],)))
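As a concrete illustration of the weight handling in `_check_and_prepare_weights` above, here is a small standalone sketch (made-up weights) of the normalization it performs.

```python
# Standalone sketch of the weight normalization above: divide by the sum,
# take the square root, place on a diagonal. Weights are illustrative values.
import numpy as np

weights = np.array([1, 2, 1])
normalized = np.sqrt(weights / weights.sum())   # [0.5, 0.707..., 0.5]
W = np.diag(normalized)                         # diagonal matrix used to rescale X and Y
print(W)

# TransitionMatrixSolver then solves on the weighted matrices W @ X and W @ Y,
# so applying sqrt weights on each side weights each unit's squared residual
# by its share of the total weight.
```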