From e0580cd60964e098307a7ad09607e5b86d841976 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 6 Oct 2023 16:11:34 -0400 Subject: [PATCH 001/135] Adding TransitionSolver abstract base class --- src/elexsolver/TransitionSolver.py | 42 ++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/elexsolver/TransitionSolver.py diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py new file mode 100644 index 00000000..d80930ff --- /dev/null +++ b/src/elexsolver/TransitionSolver.py @@ -0,0 +1,42 @@ +import logging +import warnings +from abc import ABC + +import numpy as np + +from elexsolver.logging import initialize_logging + +initialize_logging() + +LOG = logging.getLogger(__name__) + + +class TransitionSolver(ABC): + """ + Abstract class for (voter) transition solvers. + """ + + def fit_predict(self, X: np.ndarray, Y: np.ndarray): + raise NotImplementedError + + def mean_absolute_error(self, X: np.ndarray, Y: np.ndarray): + raise NotImplementedError + + def _get_expected_totals(self, A: np.ndarray): + output = np.sum(A, axis=0) + # rescaling in case any columns had been dropped previously + return output / sum(output) + + def _check_any_element_nan_or_inf(self, A: np.ndarray): + """ + Check whether any element in a matrix or vector is NaN or infinity + """ + if np.any(np.isnan(A)) or np.any(np.isinf(A)): + raise ValueError("Matrix contains NaN or Infinity") + + def _check_percentages(self, A: np.ndarray): + """ + Verify that every element in matrix A is >= 0 and <= 1. + """ + if not np.all((A >= 0) & (A <= 1)): + raise ValueError("Matrix contains values less than 0 or greater than 1.") From 37b756c629e573874b62dba1f24decb77bbb6f2e Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 6 Oct 2023 16:12:41 -0400 Subject: [PATCH 002/135] Running pre-commit on TransitionSolver and removing unused warnings import --- src/elexsolver/TransitionSolver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index d80930ff..a43279db 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -1,5 +1,4 @@ import logging -import warnings from abc import ABC import numpy as np @@ -26,7 +25,7 @@ def _get_expected_totals(self, A: np.ndarray): output = np.sum(A, axis=0) # rescaling in case any columns had been dropped previously return output / sum(output) - + def _check_any_element_nan_or_inf(self, A: np.ndarray): """ Check whether any element in a matrix or vector is NaN or infinity From da59677ac87700954bad1eb9808ba514d4660de4 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 6 Oct 2023 16:13:54 -0400 Subject: [PATCH 003/135] Modifying TransitionMatrixSolver according to some experiments I've run and also to inherit from TransitionSolver --- src/elexsolver/TransitionMatrixSolver.py | 53 +++++++++++++++++------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 8e29cad3..916696ff 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -1,28 +1,53 @@ +import logging + import cvxpy as cp +import numpy as np + +from elexsolver.logging import initialize_logging +from elexsolver.TransitionSolver import TransitionSolver + +initialize_logging() + +LOG = logging.getLogger(__name__) -class TransitionMatrixSolver: - def __init__(self): - self.transition_matrix = None +class TransitionMatrixSolver(TransitionSolver): + def __init__(self, strict=True): + super().__init__() + self._transition_matrix = None + self._strict = strict @staticmethod - def __get_constraint(X, strict): + def __get_constraint(coef, strict): if strict: - return [cp.sum(X, axis=1) == 1] - return [cp.sum(X, axis=1) <= 1.1, cp.sum(X, axis=1) >= 0.9] + return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] + return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] - def __solve(self, A, B, strict): + def __solve(self, A, B): transition_matrix = cp.Variable((A.shape[1], B.shape[1])) - loss_function = cp.norm(A @ transition_matrix - B, "fro") + loss_function = cp.norm(A.values @ transition_matrix - B.values, "fro") objective = cp.Minimize(loss_function) - constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, strict) + constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) problem = cp.Problem(objective, constraint) problem.solve() return transition_matrix.value - def fit(self, A, B, strict=False): - transition_matrix = self.__solve(A, B, strict) - self.transition_matrix = transition_matrix + def mean_absolute_error(self, X, Y): + x = self._get_expected_totals(X) + y = self._get_expected_totals(Y) + + absolute_errors = np.abs(np.matmul(x, self._transition_matrix) - y) + error_sum = np.sum(absolute_errors) + mae = error_sum / len(absolute_errors) + + return mae + + def fit_predict(self, X, Y): + self._check_any_element_nan_or_inf(X) + self._check_any_element_nan_or_inf(Y) + self._check_percentages(X) + self._check_percentages(Y) - def predict(self, A): - return A @ self.transition_matrix + self._transition_matrix = self.__solve(X, Y) + LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(X, Y), 4))) + return np.diag(self._get_expected_totals(X)) @ self._transition_matrix From 3d6aef0b10a18e124c5a8255cc0bfdf017f5d0e3 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 6 Oct 2023 18:16:29 -0400 Subject: [PATCH 004/135] Adding placeholder for prediction intervals method --- src/elexsolver/TransitionSolver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index a43279db..590d348f 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -21,6 +21,9 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray): def mean_absolute_error(self, X: np.ndarray, Y: np.ndarray): raise NotImplementedError + def get_prediction_interval(self, pi: float): + raise NotImplementedError + def _get_expected_totals(self, A: np.ndarray): output = np.sum(A, axis=0) # rescaling in case any columns had been dropped previously From 9ebe5c5283a65f89fc7505828fe826ea8260fbad Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 6 Oct 2023 18:16:46 -0400 Subject: [PATCH 005/135] Initial check-in of EI-based transition solver --- src/elexsolver/EITransitionSolver.py | 93 ++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 src/elexsolver/EITransitionSolver.py diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py new file mode 100644 index 00000000..2991c209 --- /dev/null +++ b/src/elexsolver/EITransitionSolver.py @@ -0,0 +1,93 @@ +import logging + +import pymc as pm +import numpy as np + +from elexsolver.logging import initialize_logging +from elexsolver.TransitionSolver import TransitionSolver + +initialize_logging() + +LOG = logging.getLogger(__name__) + + +class EITransitionSolver(TransitionSolver): + """ + A (voter) transition solver based on RxC ecological inference. + Largely adapted from version 1.0.1 of + Knudson et al., (2021). PyEI: A Python package for ecological inference. + Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 + """ + + def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=1): + super().__init__() + self._n = n + self._alpha = alpha # lmbda1 in PyEI + self._beta = beta # lmbda2 in PyEI, supplied as an int then used as 1 / lmbda2 + self._chains = sampling_chains + self._sampled = None # will not be None after model-fit + + def mean_absolute_error(self, X, Y): + # x = self._get_expected_totals(X) + # y = self._get_expected_totals(Y) + + # absolute_errors = np.abs(np.matmul(x, self._transition_matrix) - y) + # error_sum = np.sum(absolute_errors) + # mae = error_sum / len(absolute_errors) + + # return mae + return 0 # TODO + + def fit_predict(self, X, Y): + self._check_any_element_nan_or_inf(X) + self._check_any_element_nan_or_inf(Y) + self._check_percentages(X) + self._check_percentages(Y) + + # TODO: check if these matrices are (long x short), then transpose + # currently assuming this is the case since the other solver expects (long x short) + X = np.transpose(X) + Y = np.transpose(Y) + + num_units = len(self._n) # should be the same as the number of units in Y + num_rows = X.shape[0] # number of things in X that are being transitioned "from" + num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" + + # reshaping and rounding + Y_obs = np.swapaxes(Y * self._n, 0, 1).round() + X_extended = np.expand_dims(X, axis=2) + X_extended = np.repeat(X_extended, num_cols, axis=2) + X_extended = np.swapaxes(X_extended, 0, 1) + + with pm.Model() as model: + conc_params = pm.Gamma( + "conc_params", alpha=self._alpha, beta=self._beta, shape=(num_rows, num_cols) + ) + beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) + theta = (X_extended * beta).sum(axis=1) + yhat = pm.Multinomial( + "transfers", + n=self._n, + p=theta, + observed=Y_obs, + shape=(num_units, num_cols), + ) + # TODO: allow other samplers; this one is very good but slow + model_trace = pm.sample(chains=self._chains) + + b_values = np.transpose( + model_trace["posterior"]["beta"].stack(all_draws=["chain", "draw"]).values, axes=(3, 0, 1, 2)) + samples_converted = np.transpose(b_values, axes=(3, 0, 1, 2)) * X.T.values + samples_summed_across = samples_converted.sum(axis=2) + self._sampled = np.transpose(samples_summed_across / X.T.sum(axis=0).values, axes=(1, 2, 0)) + + posterior_mean_rxc = self._sampled.mean(axis=0) + X_totals = self._get_expected_totals(np.transpose(X)) + # TODO + # LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(X, Y), 4))) + # to go from inferences to transitions + transitions = [] + for col in posterior_mean_rxc.T: + transitions.append(col * X_totals) + return np.array(transitions).T + From 9f49d83a80474665851a84de14830380dc272c5d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 9 Oct 2023 12:30:43 -0400 Subject: [PATCH 006/135] Adding some required-matching-length exceptions to EITransitionSolver --- src/elexsolver/EITransitionSolver.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 2991c209..992aa371 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -49,6 +49,11 @@ def fit_predict(self, X, Y): X = np.transpose(X) Y = np.transpose(Y) + if X.shape[1] != Y.shape[1]: + raise ValueError(f"Number of units in X ({X.shape[1]}) != number of units in Y ({Y.shape[1]}).") + if Y.shape[1] != len(self._n): + raise ValueError(f"Number of units in Y ({Y.shape[1]}) != number of units in n ({len(self._n)}).") + num_units = len(self._n) # should be the same as the number of units in Y num_rows = X.shape[0] # number of things in X that are being transitioned "from" num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" From 7f965d8ea01a879eb7e9c4827febeab8028252ca Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 9 Oct 2023 16:07:16 -0400 Subject: [PATCH 007/135] Rescale if needed, check for things x units --- src/elexsolver/EITransitionSolver.py | 31 ++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 992aa371..7b7fa814 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -38,22 +38,38 @@ def mean_absolute_error(self, X, Y): # return mae return 0 # TODO + def _check_and_rescale(self, A): + if not np.all(A.sum(axis=0) == 1): + LOG.warn("Each column (unit) needs to sum to 1. Rescaling...") + if isinstance(A, np.ndarray): + for j in range(0, A.shape[1]): + A[:, j] /= A[:, j].sum() + else: + # pandas.DataFrame() + for col in A.columns: + A[col] /= A[col].sum() + return A + def fit_predict(self, X, Y): self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) self._check_percentages(X) self._check_percentages(Y) - # TODO: check if these matrices are (long x short), then transpose - # currently assuming this is the case since the other solver expects (long x short) - X = np.transpose(X) - Y = np.transpose(Y) + # matrices should be (things x units), where the number of units is > the number of things + if X.shape[0] > X.shape[1]: + X = X.T + if Y.shape[0] > Y.shape[1]: + Y = Y.T if X.shape[1] != Y.shape[1]: raise ValueError(f"Number of units in X ({X.shape[1]}) != number of units in Y ({Y.shape[1]}).") if Y.shape[1] != len(self._n): raise ValueError(f"Number of units in Y ({Y.shape[1]}) != number of units in n ({len(self._n)}).") + X = self._check_and_rescale(X) + Y = self._check_and_rescale(Y) + num_units = len(self._n) # should be the same as the number of units in Y num_rows = X.shape[0] # number of things in X that are being transitioned "from" num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" @@ -77,8 +93,11 @@ def fit_predict(self, X, Y): observed=Y_obs, shape=(num_units, num_cols), ) - # TODO: allow other samplers; this one is very good but slow - model_trace = pm.sample(chains=self._chains) + try: + # TODO: allow other samplers; this one is very good but slow + model_trace = pm.sample(chains=self._chains) + except: + print(model.debug()) b_values = np.transpose( model_trace["posterior"]["beta"].stack(all_draws=["chain", "draw"]).values, axes=(3, 0, 1, 2)) From 9e5a3fefd01cd9a9cad5e112eb358e7e3d4a1867 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 10 Oct 2023 09:36:45 -0400 Subject: [PATCH 008/135] Adding _check_and_rescale() to superclass so the same check can be performed before the cvxpy solver --- src/elexsolver/EITransitionSolver.py | 12 ------------ src/elexsolver/TransitionMatrixSolver.py | 11 +++++++++++ src/elexsolver/TransitionSolver.py | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 7b7fa814..58bf8a7c 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -38,18 +38,6 @@ def mean_absolute_error(self, X, Y): # return mae return 0 # TODO - def _check_and_rescale(self, A): - if not np.all(A.sum(axis=0) == 1): - LOG.warn("Each column (unit) needs to sum to 1. Rescaling...") - if isinstance(A, np.ndarray): - for j in range(0, A.shape[1]): - A[:, j] /= A[:, j].sum() - else: - # pandas.DataFrame() - for col in A.columns: - A[col] /= A[col].sum() - return A - def fit_predict(self, X, Y): self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 916696ff..2c3ee139 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -48,6 +48,17 @@ def fit_predict(self, X, Y): self._check_percentages(X) self._check_percentages(Y) + # matrices should be (units x things), where the number of units is > the number of things + if X.shape[1] > X.shape[0]: + X = X.T + if Y.shape[1] > Y.shape[0]: + Y = Y.T + + X = self._check_and_rescale(X.T) + X = X.T + Y = self._check_and_rescale(Y.T) + Y = Y.T + self._transition_matrix = self.__solve(X, Y) LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(X, Y), 4))) return np.diag(self._get_expected_totals(X)) @ self._transition_matrix diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 590d348f..2dae4c37 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -42,3 +42,18 @@ def _check_percentages(self, A: np.ndarray): """ if not np.all((A >= 0) & (A <= 1)): raise ValueError("Matrix contains values less than 0 or greater than 1.") + + def _check_and_rescale(self, A: np.ndarray): + """ + Rescale columns (units) so that they sum to 1 (100%). + """ + if not np.all(A.sum(axis=0) == 1): + LOG.warn("Each column (unit) needs to sum to 1. Rescaling...") + if isinstance(A, np.ndarray): + for j in range(0, A.shape[1]): + A[:, j] /= A[:, j].sum() + else: + # pandas.DataFrame() + for col in A.columns: + A[col] /= A[col].sum() + return A From 76f4b617d1869df216aa48fa55f7234444b4b6a8 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 10 Oct 2023 09:50:41 -0400 Subject: [PATCH 009/135] Cleaning up code and exception-handling with pre-commit --- src/elexsolver/EITransitionSolver.py | 35 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 58bf8a7c..9b4deec4 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -1,7 +1,7 @@ import logging -import pymc as pm import numpy as np +import pymc as pm from elexsolver.logging import initialize_logging from elexsolver.TransitionSolver import TransitionSolver @@ -18,14 +18,14 @@ class EITransitionSolver(TransitionSolver): Knudson et al., (2021). PyEI: A Python package for ecological inference. Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 """ - + def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=1): super().__init__() self._n = n - self._alpha = alpha # lmbda1 in PyEI - self._beta = beta # lmbda2 in PyEI, supplied as an int then used as 1 / lmbda2 + self._alpha = alpha # lmbda1 in PyEI + self._beta = beta # lmbda2 in PyEI, supplied as an int then used as 1 / lmbda2 self._chains = sampling_chains - self._sampled = None # will not be None after model-fit + self._sampled = None # will not be None after model-fit def mean_absolute_error(self, X, Y): # x = self._get_expected_totals(X) @@ -36,7 +36,7 @@ def mean_absolute_error(self, X, Y): # mae = error_sum / len(absolute_errors) # return mae - return 0 # TODO + return 0 # TODO def fit_predict(self, X, Y): self._check_any_element_nan_or_inf(X) @@ -58,9 +58,9 @@ def fit_predict(self, X, Y): X = self._check_and_rescale(X) Y = self._check_and_rescale(Y) - num_units = len(self._n) # should be the same as the number of units in Y - num_rows = X.shape[0] # number of things in X that are being transitioned "from" - num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" + num_units = len(self._n) # should be the same as the number of units in Y + num_rows = X.shape[0] # number of things in X that are being transitioned "from" + num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" # reshaping and rounding Y_obs = np.swapaxes(Y * self._n, 0, 1).round() @@ -69,13 +69,11 @@ def fit_predict(self, X, Y): X_extended = np.swapaxes(X_extended, 0, 1) with pm.Model() as model: - conc_params = pm.Gamma( - "conc_params", alpha=self._alpha, beta=self._beta, shape=(num_rows, num_cols) - ) + conc_params = pm.Gamma("conc_params", alpha=self._alpha, beta=self._beta, shape=(num_rows, num_cols)) beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) theta = (X_extended * beta).sum(axis=1) - yhat = pm.Multinomial( - "transfers", + pm.Multinomial( + "result_fractions", n=self._n, p=theta, observed=Y_obs, @@ -84,11 +82,13 @@ def fit_predict(self, X, Y): try: # TODO: allow other samplers; this one is very good but slow model_trace = pm.sample(chains=self._chains) - except: - print(model.debug()) + except Exception as e: + LOG.debug(model.debug()) + raise e b_values = np.transpose( - model_trace["posterior"]["beta"].stack(all_draws=["chain", "draw"]).values, axes=(3, 0, 1, 2)) + model_trace["posterior"]["beta"].stack(all_draws=["chain", "draw"]).values, axes=(3, 0, 1, 2) + ) samples_converted = np.transpose(b_values, axes=(3, 0, 1, 2)) * X.T.values samples_summed_across = samples_converted.sum(axis=2) self._sampled = np.transpose(samples_summed_across / X.T.sum(axis=0).values, axes=(1, 2, 0)) @@ -102,4 +102,3 @@ def fit_predict(self, X, Y): for col in posterior_mean_rxc.T: transitions.append(col * X_totals) return np.array(transitions).T - From 79076a774700c29f9b14a28081f5a06f20a4e6f3 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 10 Oct 2023 11:01:41 -0400 Subject: [PATCH 010/135] Adding MAE computation/reporting to ETTransitionSolver --- src/elexsolver/EITransitionSolver.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 9b4deec4..acbae345 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -28,15 +28,12 @@ def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=1): self._sampled = None # will not be None after model-fit def mean_absolute_error(self, X, Y): - # x = self._get_expected_totals(X) - # y = self._get_expected_totals(Y) - - # absolute_errors = np.abs(np.matmul(x, self._transition_matrix) - y) - # error_sum = np.sum(absolute_errors) - # mae = error_sum / len(absolute_errors) - - # return mae - return 0 # TODO + y_pred = self._get_expected_totals(X) + y = self._get_expected_totals(Y.T) + absolute_errors = np.abs(y_pred - y) + error_sum = np.sum(absolute_errors) + mae = error_sum / len(absolute_errors) + return mae def fit_predict(self, X, Y): self._check_any_element_nan_or_inf(X) @@ -95,10 +92,10 @@ def fit_predict(self, X, Y): posterior_mean_rxc = self._sampled.mean(axis=0) X_totals = self._get_expected_totals(np.transpose(X)) - # TODO - # LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(X, Y), 4))) # to go from inferences to transitions transitions = [] for col in posterior_mean_rxc.T: transitions.append(col * X_totals) - return np.array(transitions).T + transitions = np.array(transitions).T + LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(transitions, Y), 4))) + return transitions From 2254a65005a83a1aa9ab52741f94defc254d6e71 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 12 Oct 2023 09:54:45 -0400 Subject: [PATCH 011/135] Adding the ability to compute a prediction (credible) interval with the EI solver and also experimenting with different sampler options there --- src/elexsolver/EITransitionSolver.py | 48 ++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index acbae345..1c155709 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -19,13 +19,17 @@ class EITransitionSolver(TransitionSolver): Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 """ - def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=1): + def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=1, random_seed=None): super().__init__() self._n = n self._alpha = alpha # lmbda1 in PyEI self._beta = beta # lmbda2 in PyEI, supplied as an int then used as 1 / lmbda2 self._chains = sampling_chains - self._sampled = None # will not be None after model-fit + self._seed = random_seed + + # class members that are instantiated during model-fit + self._sampled = None + self._X_totals = None def mean_absolute_error(self, X, Y): y_pred = self._get_expected_totals(X) @@ -78,7 +82,7 @@ def fit_predict(self, X, Y): ) try: # TODO: allow other samplers; this one is very good but slow - model_trace = pm.sample(chains=self._chains) + model_trace = pm.sample(chains=self._chains, random_seed=self._seed, nuts_sampler="numpyro") except Exception as e: LOG.debug(model.debug()) raise e @@ -91,11 +95,37 @@ def fit_predict(self, X, Y): self._sampled = np.transpose(samples_summed_across / X.T.sum(axis=0).values, axes=(1, 2, 0)) posterior_mean_rxc = self._sampled.mean(axis=0) - X_totals = self._get_expected_totals(np.transpose(X)) - # to go from inferences to transitions - transitions = [] - for col in posterior_mean_rxc.T: - transitions.append(col * X_totals) - transitions = np.array(transitions).T + self._X_totals = self._get_expected_totals(np.transpose(X)) + transitions = self._get_transitions(posterior_mean_rxc) LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(transitions, Y), 4))) return transitions + + def _get_transitions(self, A: np.ndarray): + # to go from inferences to transitions + transitions = [] + for col in A.T: + transitions.append(col * self._X_totals) + return np.array(transitions).T + + def get_prediction_interval(self, pi): + """ + Note: this is actually a credible interval, not a prediction interval. + """ + if pi <= 1: + pi = pi * 100 + if pi < 0 or pi > 100: + raise ValueError(f"Invalid prediction interval {pi}.") + + lower = (100 - pi) / 2 + upper = pi + lower + A_dict = { + lower: np.zeros((self._sampled.shape[1], self._sampled.shape[2])), + upper: np.zeros((self._sampled.shape[1], self._sampled.shape[2])), + } + + for ci in [lower, upper]: + for i in range(0, self._sampled.shape[1]): + for j in range(0, self._sampled.shape[2]): + A_dict[ci][i][j] = np.percentile(self._sampled[:, i, j], ci) + + return (self._get_transitions(A_dict[lower]), self._get_transitions(A_dict[upper])) From 3aa0f5bdeb351342b0db3f09fe11869181867644 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 12 Oct 2023 16:55:13 -0400 Subject: [PATCH 012/135] Handle zero and other weird division correctly when rescaling the percentages to sum to 1 at the unit-level --- src/elexsolver/TransitionSolver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 2dae4c37..945da4ea 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -52,8 +52,10 @@ def _check_and_rescale(self, A: np.ndarray): if isinstance(A, np.ndarray): for j in range(0, A.shape[1]): A[:, j] /= A[:, j].sum() + return np.nan_to_num(A, nan=0, posinf=0, neginf=0) else: # pandas.DataFrame() for col in A.columns: A[col] /= A[col].sum() + return A.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) return A From a67304f6dd0445760e3e3462883c3b7b51b35e8e Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 16 Oct 2023 10:01:35 -0400 Subject: [PATCH 013/135] Adding a check to make sure we have enough units --- src/elexsolver/TransitionSolver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 945da4ea..dd3facab 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -45,10 +45,14 @@ def _check_percentages(self, A: np.ndarray): def _check_and_rescale(self, A: np.ndarray): """ - Rescale columns (units) so that they sum to 1 (100%). + After ensuring that A is (things x units), make sure we have enough units. + If that's the case, rescale columns (units) so that they sum to 1 (100%). """ + if A.shape[1] <= A.shape[0] or (A.shape[1] // 2) <= A.shape[0]: + raise ValueError(f"Not enough units ({A.shape[1]}) relative to the number of things ({A.shape[0]}).") + if not np.all(A.sum(axis=0) == 1): - LOG.warn("Each column (unit) needs to sum to 1. Rescaling...") + LOG.warn("Each unit needs to sum to 1. Rescaling...") if isinstance(A, np.ndarray): for j in range(0, A.shape[1]): A[:, j] /= A[:, j].sum() From e54ba8592ba1d9a1452b4c5da82e8beeeee7c544 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 17 Oct 2023 12:31:42 -0400 Subject: [PATCH 014/135] Silencing some warnings --- src/elexsolver/EITransitionSolver.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 1c155709..e0b018a5 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -2,6 +2,7 @@ import numpy as np import pymc as pm +import pymc.sampling.jax as pmjax from elexsolver.logging import initialize_logging from elexsolver.TransitionSolver import TransitionSolver @@ -64,7 +65,7 @@ def fit_predict(self, X, Y): num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" # reshaping and rounding - Y_obs = np.swapaxes(Y * self._n, 0, 1).round() + Y_obs = np.transpose(Y * self._n).round() X_extended = np.expand_dims(X, axis=2) X_extended = np.repeat(X_extended, num_cols, axis=2) X_extended = np.swapaxes(X_extended, 0, 1) @@ -81,8 +82,8 @@ def fit_predict(self, X, Y): shape=(num_units, num_cols), ) try: - # TODO: allow other samplers; this one is very good but slow - model_trace = pm.sample(chains=self._chains, random_seed=self._seed, nuts_sampler="numpyro") + # TODO: keep trying to tune this for performance and speed + model_trace = pmjax.sample_numpyro_nuts(chains=self._chains, random_seed=self._seed, target_accept=0.95) except Exception as e: LOG.debug(model.debug()) raise e From aedcdfff0ae32529bd7c544faeb012949b601436 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 17 Oct 2023 13:57:47 -0400 Subject: [PATCH 015/135] Increasing the target_accept on the sampler and making sure the sampling_chains argument is an int --- src/elexsolver/EITransitionSolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index e0b018a5..49f24538 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -25,7 +25,7 @@ def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=1, random_s self._n = n self._alpha = alpha # lmbda1 in PyEI self._beta = beta # lmbda2 in PyEI, supplied as an int then used as 1 / lmbda2 - self._chains = sampling_chains + self._chains = int(sampling_chains) self._seed = random_seed # class members that are instantiated during model-fit @@ -83,7 +83,7 @@ def fit_predict(self, X, Y): ) try: # TODO: keep trying to tune this for performance and speed - model_trace = pmjax.sample_numpyro_nuts(chains=self._chains, random_seed=self._seed, target_accept=0.95) + model_trace = pmjax.sample_numpyro_nuts(chains=self._chains, random_seed=self._seed, target_accept=0.99) except Exception as e: LOG.debug(model.debug()) raise e From e9fc725ae344fed9bd9929138ef2d35546a13e7c Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 18 Oct 2023 08:56:33 -0400 Subject: [PATCH 016/135] Trying out some more EI solver optimizations --- src/elexsolver/EITransitionSolver.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 49f24538..b0c9ddd3 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -2,7 +2,6 @@ import numpy as np import pymc as pm -import pymc.sampling.jax as pmjax from elexsolver.logging import initialize_logging from elexsolver.TransitionSolver import TransitionSolver @@ -70,7 +69,7 @@ def fit_predict(self, X, Y): X_extended = np.repeat(X_extended, num_cols, axis=2) X_extended = np.swapaxes(X_extended, 0, 1) - with pm.Model() as model: + with pm.Model(check_bounds=False) as model: conc_params = pm.Gamma("conc_params", alpha=self._alpha, beta=self._beta, shape=(num_rows, num_cols)) beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) theta = (X_extended * beta).sum(axis=1) @@ -83,7 +82,9 @@ def fit_predict(self, X, Y): ) try: # TODO: keep trying to tune this for performance and speed - model_trace = pmjax.sample_numpyro_nuts(chains=self._chains, random_seed=self._seed, target_accept=0.99) + model_trace = pm.sample( + chains=self._chains, random_seed=self._seed, nuts_sampler="numpyro", cores=1, draws=1500, tune=500 + ) except Exception as e: LOG.debug(model.debug()) raise e From 08401eb35e0ba8bb5774dfb2ea592e3b964c9c18 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 18 Oct 2023 09:09:05 -0400 Subject: [PATCH 017/135] Explicitly specifying the cvxpy solver to use since the default is about to change (also squashes warning) --- src/elexsolver/TransitionMatrixSolver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 2c3ee139..beb20b03 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -29,7 +29,9 @@ def __solve(self, A, B): objective = cp.Minimize(loss_function) constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) problem = cp.Problem(objective, constraint) - problem.solve() + # preferring CVXPY's prior default solver, ECOS, over its new default, Clarabel + # because sometimes Clarabel produces negative-values results for our problem + problem.solve(solver=cp.ECOS) return transition_matrix.value def mean_absolute_error(self, X, Y): From 3d35f674bc229d5721f4b999c8cb271553b75da4 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 18 Oct 2023 09:12:11 -0400 Subject: [PATCH 018/135] Fixing poorly-worded comment in transition matrix solver --- src/elexsolver/TransitionMatrixSolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index beb20b03..9aad9f4a 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -29,8 +29,8 @@ def __solve(self, A, B): objective = cp.Minimize(loss_function) constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) problem = cp.Problem(objective, constraint) - # preferring CVXPY's prior default solver, ECOS, over its new default, Clarabel - # because sometimes Clarabel produces negative-values results for our problem + # preferring cvxpy's prior default solver, ECOS, over its new default, Clarabel + # because sometimes Clarabel produces negative-valued results for our problem problem.solve(solver=cp.ECOS) return transition_matrix.value From 1ac1c78c8edb170fa3c7a4183952d1d6061a0bc7 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 18 Oct 2023 16:44:33 -0400 Subject: [PATCH 019/135] Default sampling chains of 2 on 2 cores --- src/elexsolver/EITransitionSolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index b0c9ddd3..35db5c20 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -19,7 +19,7 @@ class EITransitionSolver(TransitionSolver): Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 """ - def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=1, random_seed=None): + def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=2, random_seed=None): super().__init__() self._n = n self._alpha = alpha # lmbda1 in PyEI @@ -83,7 +83,7 @@ def fit_predict(self, X, Y): try: # TODO: keep trying to tune this for performance and speed model_trace = pm.sample( - chains=self._chains, random_seed=self._seed, nuts_sampler="numpyro", cores=1, draws=1500, tune=500 + chains=self._chains, random_seed=self._seed, nuts_sampler="numpyro", cores=self._chains, draws=1500, tune=500 ) except Exception as e: LOG.debug(model.debug()) From c87c25ba3eb13832d89032a10c5b10919f0285c6 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 20 Oct 2023 13:38:19 -0400 Subject: [PATCH 020/135] Making sure pymc is a requirement --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 15340bfa..9fd7e86d 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "scipy~=1.11"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "pymc~=5.9", "scipy~=1.11"] THIS_FILE_DIR = os.path.dirname(__file__) From 8d4637d735a7ff48c9eae5629355b8a2fbb04458 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 23 Oct 2023 09:39:22 -0400 Subject: [PATCH 021/135] Experimenting with drawing fewer samples, different Dirichlet alphas --- src/elexsolver/EITransitionSolver.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 35db5c20..a712a50a 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -19,13 +19,15 @@ class EITransitionSolver(TransitionSolver): Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 """ - def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=2, random_seed=None): + def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=2, random_seed=None, draws=300): super().__init__() self._n = n self._alpha = alpha # lmbda1 in PyEI - self._beta = beta # lmbda2 in PyEI, supplied as an int then used as 1 / lmbda2 + self._beta = beta # lmbda2 in PyEI, in PyEI supplied as an int then used as 1 / lmbda2 self._chains = int(sampling_chains) self._seed = random_seed + self._draws = draws + self._tune = draws // 2 # class members that are instantiated during model-fit self._sampled = None @@ -72,6 +74,7 @@ def fit_predict(self, X, Y): with pm.Model(check_bounds=False) as model: conc_params = pm.Gamma("conc_params", alpha=self._alpha, beta=self._beta, shape=(num_rows, num_cols)) beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) + # beta = pm.Dirichlet("beta", a=np.ones((num_rows, num_cols)), shape=(num_units, num_rows, num_cols)) theta = (X_extended * beta).sum(axis=1) pm.Multinomial( "result_fractions", @@ -83,7 +86,12 @@ def fit_predict(self, X, Y): try: # TODO: keep trying to tune this for performance and speed model_trace = pm.sample( - chains=self._chains, random_seed=self._seed, nuts_sampler="numpyro", cores=self._chains, draws=1500, tune=500 + chains=self._chains, + random_seed=self._seed, + nuts_sampler="numpyro", + cores=self._chains, + draws=self._draws, + tune=self._tune, ) except Exception as e: LOG.debug(model.debug()) From 8fc068cfc6cd8d53e844894f2ebfaa5df940cc22 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 23 Oct 2023 09:46:42 -0400 Subject: [PATCH 022/135] Semi-relaxing the check for units summing to 1 using np.allclose() --- src/elexsolver/TransitionSolver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index dd3facab..c6e6eb24 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -51,7 +51,8 @@ def _check_and_rescale(self, A: np.ndarray): if A.shape[1] <= A.shape[0] or (A.shape[1] // 2) <= A.shape[0]: raise ValueError(f"Not enough units ({A.shape[1]}) relative to the number of things ({A.shape[0]}).") - if not np.all(A.sum(axis=0) == 1): + unit_totals = A.sum(axis=0) + if not np.allclose(unit_totals, np.ones(unit_totals.shape)): LOG.warn("Each unit needs to sum to 1. Rescaling...") if isinstance(A, np.ndarray): for j in range(0, A.shape[1]): From c05acdeb561c0aa58ad693c5e3ad934db6c45504 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 23 Oct 2023 13:22:08 -0400 Subject: [PATCH 023/135] Adding numpyro to the requirements --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9fd7e86d..49282d44 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "pymc~=5.9", "scipy~=1.11"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "numpyro~=0.13", "pymc~=5.9", "scipy~=1.11"] THIS_FILE_DIR = os.path.dirname(__file__) From 39c974934d60436cee2f26a5d5a3ac1f5aa771f4 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 23 Oct 2023 13:41:05 -0400 Subject: [PATCH 024/135] Test against Python 3.11 because I don't think some of the requirements are available for 3.10 --- .github/workflows/test.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5ff24e13..cfb5df9d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,7 +7,7 @@ jobs: timeout-minutes: 5 strategy: matrix: - python-version: ['3.10'] + python-version: ['3.11'] steps: - uses: actions/checkout@v2 - name: Setup Python diff --git a/setup.py b/setup.py index 49282d44..8dfe9d39 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python", - "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], description="A package for optimization solvers", long_description=LONG_DESCRIPTION, From 9d595e8e790c346c40439c1a0b069df6c2d6230b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 23 Oct 2023 13:49:00 -0400 Subject: [PATCH 025/135] Increasing github test timeout to 10 minutes --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cfb5df9d..af3b9ce1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,7 @@ jobs: test: name: Run unit tests runs-on: ubuntu-latest - timeout-minutes: 5 + timeout-minutes: 10 strategy: matrix: python-version: ['3.11'] From 78d5655997ad6f16ca532b55db7b1ee21bcf9769 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 23 Oct 2023 14:03:54 -0400 Subject: [PATCH 026/135] Trying a slightly-older version of numpy to be compatible with the current version of pytensor --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8dfe9d39..7b1a4189 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "numpyro~=0.13", "pymc~=5.9", "scipy~=1.11"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.25", "numpyro~=0.13", "pymc~=5.9", "scipy~=1.11"] THIS_FILE_DIR = os.path.dirname(__file__) From 54bd39f979d7ba9a2382ed09ecc3157243002c9c Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 23 Oct 2023 16:30:51 -0400 Subject: [PATCH 027/135] Switching to HalfNormal prior instead of Gamma after preliminary evaluation --- src/elexsolver/EITransitionSolver.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index a712a50a..7a32e2c1 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -19,11 +19,10 @@ class EITransitionSolver(TransitionSolver): Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 """ - def __init__(self, n: np.ndarray, alpha=4, beta=0.5, sampling_chains=2, random_seed=None, draws=300): + def __init__(self, n: np.ndarray, sigma=1, sampling_chains=2, random_seed=None, draws=300): super().__init__() self._n = n - self._alpha = alpha # lmbda1 in PyEI - self._beta = beta # lmbda2 in PyEI, in PyEI supplied as an int then used as 1 / lmbda2 + self._sigma = sigma self._chains = int(sampling_chains) self._seed = random_seed self._draws = draws @@ -72,9 +71,8 @@ def fit_predict(self, X, Y): X_extended = np.swapaxes(X_extended, 0, 1) with pm.Model(check_bounds=False) as model: - conc_params = pm.Gamma("conc_params", alpha=self._alpha, beta=self._beta, shape=(num_rows, num_cols)) + conc_params = pm.HalfNormal("conc_params", sigma=self._sigma, shape=(num_rows, num_cols)) beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) - # beta = pm.Dirichlet("beta", a=np.ones((num_rows, num_cols)), shape=(num_units, num_rows, num_cols)) theta = (X_extended * beta).sum(axis=1) pm.Multinomial( "result_fractions", From 5ce550cc7ca6904e95b996bfb63dee8232f54606 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 24 Oct 2023 13:44:29 -0400 Subject: [PATCH 028/135] Addressing some pylint complaints --- setup.cfg | 6 +++++- src/elexsolver/EITransitionSolver.py | 2 +- src/elexsolver/TransitionMatrixSolver.py | 2 +- src/elexsolver/TransitionSolver.py | 11 +++++------ 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/setup.cfg b/setup.cfg index 000bb99c..9a638abb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,4 +2,8 @@ universal = 1 [pycodestyle] -max-line-length = 160 \ No newline at end of file +max-line-length = 160 + +[pylint] +max-line-length = 160 +disable = invalid-name, duplicate-code, missing-function-docstring, too-many-instance-attributes, too-many-arguments, too-many-locals \ No newline at end of file diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 7a32e2c1..7fb1cff5 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -105,7 +105,7 @@ def fit_predict(self, X, Y): posterior_mean_rxc = self._sampled.mean(axis=0) self._X_totals = self._get_expected_totals(np.transpose(X)) transitions = self._get_transitions(posterior_mean_rxc) - LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(transitions, Y), 4))) + LOG.info("MAE = %s", np.around(self.mean_absolute_error(transitions, Y), 4)) return transitions def _get_transitions(self, A: np.ndarray): diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 9aad9f4a..3a43ff4a 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -62,5 +62,5 @@ def fit_predict(self, X, Y): Y = Y.T self._transition_matrix = self.__solve(X, Y) - LOG.info("MAE = {}".format(np.around(self.mean_absolute_error(X, Y), 4))) + LOG.info("MAE = %s", np.around(self.mean_absolute_error(X, Y), 4)) return np.diag(self._get_expected_totals(X)) @ self._transition_matrix diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index c6e6eb24..ce4c52b4 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -53,14 +53,13 @@ def _check_and_rescale(self, A: np.ndarray): unit_totals = A.sum(axis=0) if not np.allclose(unit_totals, np.ones(unit_totals.shape)): - LOG.warn("Each unit needs to sum to 1. Rescaling...") + LOG.warning("Each unit needs to sum to 1. Rescaling...") if isinstance(A, np.ndarray): for j in range(0, A.shape[1]): A[:, j] /= A[:, j].sum() return np.nan_to_num(A, nan=0, posinf=0, neginf=0) - else: - # pandas.DataFrame() - for col in A.columns: - A[col] /= A[col].sum() - return A.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) + # pandas.DataFrame() + for col in A.columns: + A[col] /= A[col].sum() + return A.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) return A From c25abecf438b534c8ff6b0df8416fc08696bbfb3 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 24 Oct 2023 14:29:46 -0400 Subject: [PATCH 029/135] Starting on TransitionSolver unit tests --- tests/test_transition_solver.py | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/test_transition_solver.py diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py new file mode 100644 index 00000000..5ac50545 --- /dev/null +++ b/tests/test_transition_solver.py @@ -0,0 +1,65 @@ +from unittest.mock import patch + +import numpy as np +import pytest + +from elexsolver.TransitionSolver import TransitionSolver + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_fit_predict(): + with pytest.raises(NotImplementedError): + ts = TransitionSolver() + ts.fit_predict(None, None) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_mean_absolute_error(): + with pytest.raises(NotImplementedError): + ts = TransitionSolver() + ts.mean_absolute_error(None, None) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_get_prediction_interval(): + with pytest.raises(NotImplementedError): + ts = TransitionSolver() + ts.get_prediction_interval(0) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_get_expected_totals(): + A = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + expected = np.array([0.23809524, 0.33333333, 0.42857143]) + ts = TransitionSolver() + np.testing.assert_allclose(ts._get_expected_totals(A), expected) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_any_element_nan_or_inf_with_nan(): + with pytest.raises(ValueError): + A = np.array([[0.1, 0.2, 0.3], [0.4, np.nan, 0.6]]) + ts = TransitionSolver() + ts._check_any_element_nan_or_inf(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_any_element_nan_or_inf_without_nan(): + A = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + ts = TransitionSolver() + ts._check_any_element_nan_or_inf(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_percentages_bad(): + with pytest.raises(ValueError): + A = np.array([[0.1, 0.2, 3], [0.4, 0.5, 0.6]]) + ts = TransitionSolver() + ts._check_percentages(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_percentages_good(): + A = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + ts = TransitionSolver() + ts._check_percentages(A) # pylint: disable=protected-access From 14d5239acb5fb90b85da310729066412a2cafeb7 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 24 Oct 2023 14:36:08 -0400 Subject: [PATCH 030/135] Splitting TransitionSolver._check_and_rescale() into two separate methods --- src/elexsolver/EITransitionSolver.py | 6 ++++-- src/elexsolver/TransitionMatrixSolver.py | 8 ++++---- src/elexsolver/TransitionSolver.py | 10 +++++++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 7fb1cff5..cec1961f 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -57,8 +57,10 @@ def fit_predict(self, X, Y): if Y.shape[1] != len(self._n): raise ValueError(f"Number of units in Y ({Y.shape[1]}) != number of units in n ({len(self._n)}).") - X = self._check_and_rescale(X) - Y = self._check_and_rescale(Y) + self._check_dimensions(X) + X = self._rescale(X) + self._check_dimensions(Y) + Y = self._rescale(Y) num_units = len(self._n) # should be the same as the number of units in Y num_rows = X.shape[0] # number of things in X that are being transitioned "from" diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 3a43ff4a..8076afb9 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -56,10 +56,10 @@ def fit_predict(self, X, Y): if Y.shape[1] > Y.shape[0]: Y = Y.T - X = self._check_and_rescale(X.T) - X = X.T - Y = self._check_and_rescale(Y.T) - Y = Y.T + self._check_dimensions(X.T) + X = self._rescale(X.T).T + self._check_dimensions(Y.T) + Y = self._rescale(Y.T).T self._transition_matrix = self.__solve(X, Y) LOG.info("MAE = %s", np.around(self.mean_absolute_error(X, Y), 4)) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index ce4c52b4..fa0aeaf0 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -43,14 +43,18 @@ def _check_percentages(self, A: np.ndarray): if not np.all((A >= 0) & (A <= 1)): raise ValueError("Matrix contains values less than 0 or greater than 1.") - def _check_and_rescale(self, A: np.ndarray): + def _check_dimensions(self, A: np.ndarray): """ - After ensuring that A is (things x units), make sure we have enough units. - If that's the case, rescale columns (units) so that they sum to 1 (100%). + Ensure that in our (things x units) matrix, the number of units is + at least twice as large as the number of things. """ if A.shape[1] <= A.shape[0] or (A.shape[1] // 2) <= A.shape[0]: raise ValueError(f"Not enough units ({A.shape[1]}) relative to the number of things ({A.shape[0]}).") + def _rescale(self, A: np.ndarray): + """ + Rescale columns (units) to ensure they sum to 1 (100%). + """ unit_totals = A.sum(axis=0) if not np.allclose(unit_totals, np.ones(unit_totals.shape)): LOG.warning("Each unit needs to sum to 1. Rescaling...") From 6e2dfaa020bb77e47e24d3ea4f53218c82ca14a7 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 24 Oct 2023 15:16:13 -0400 Subject: [PATCH 031/135] Finishing TransitionSolver unit tests --- tests/test_transition_solver.py | 36 +++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 5ac50545..3fc54d35 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -63,3 +63,39 @@ def test_check_percentages_good(): A = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) ts = TransitionSolver() ts._check_percentages(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_dimensions_bad(): + with pytest.raises(ValueError): + A = np.array([[0.1, 0.2, 0.3]]) + ts = TransitionSolver() + ts._check_dimensions(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_dimensions_good(): + A = np.array( + [ + [0.1, 0.4, 0.7, 0.1, 0.4, 0.7, 0.1, 0.4, 0.7], + [0.2, 0.5, 0.8, 0.2, 0.5, 0.8, 0.2, 0.5, 0.8], + [0.3, 0.6, 0.9, 0.3, 0.6, 0.9, 0.3, 0.6, 0.9], + ] + ) + ts = TransitionSolver() + ts._check_dimensions(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_rescale_skipped(): + A = np.ones((10, 2)) / 10 + ts = TransitionSolver() + np.testing.assert_array_equal(ts._rescale(A), A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_rescale_rescaled(): + A = np.ones((2, 2)) + expected = np.array([[0.5, 0.5], [0.5, 0.5]]) + ts = TransitionSolver() + np.testing.assert_array_equal(ts._rescale(A), expected) # pylint: disable=protected-access From a5ced360a37de37f1c761e5bc539e00ea576c3fd Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 24 Oct 2023 16:40:44 -0400 Subject: [PATCH 032/135] Make sure X and Y are numpy arrays for cvxpy if they're not already --- src/elexsolver/TransitionMatrixSolver.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 8076afb9..7cfa2cff 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -25,7 +25,7 @@ def __get_constraint(coef, strict): def __solve(self, A, B): transition_matrix = cp.Variable((A.shape[1], B.shape[1])) - loss_function = cp.norm(A.values @ transition_matrix - B.values, "fro") + loss_function = cp.norm(A @ transition_matrix - B, "fro") objective = cp.Minimize(loss_function) constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) problem = cp.Problem(objective, constraint) @@ -61,6 +61,11 @@ def fit_predict(self, X, Y): self._check_dimensions(Y.T) Y = self._rescale(Y.T).T + if not isinstance(X, np.ndarray): + X = X.to_numpy() + if not isinstance(Y, np.ndarray): + Y = Y.to_numpy() + self._transition_matrix = self.__solve(X, Y) LOG.info("MAE = %s", np.around(self.mean_absolute_error(X, Y), 4)) return np.diag(self._get_expected_totals(X)) @ self._transition_matrix From dabea4e539a653a24affd9a4b34177fbf8db57b0 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 24 Oct 2023 16:41:29 -0400 Subject: [PATCH 033/135] Adding unit tests for the transition matrix solver --- tests/test_transition_matrix_solver.py | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/test_transition_matrix_solver.py diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py new file mode 100644 index 00000000..770701d3 --- /dev/null +++ b/tests/test_transition_matrix_solver.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest + +from elexsolver.TransitionMatrixSolver import TransitionMatrixSolver + + +def test_fit_predict(): + X = np.array( + [ + [0.13991186, 0.19010302], + [0.13774767, 0.19199878], + [0.17240947, 0.16163616], + [0.19983843, 0.13760927], + [0.15796058, 0.17429292], + [0.192132, 0.14435986], + ] + ) + + Y = np.array( + [ + [0.15551131, 0.16977255], + [0.1573689, 0.16925536], + [0.16995309, 0.16575166], + [0.15144583, 0.17090446], + [0.16700258, 0.16657314], + [0.19871829, 0.15774283], + ] + ) + + expected = np.array([[0.29134295, 0.20806254], [0.2076699, 0.29292461]]) + + tms = TransitionMatrixSolver() + current = tms.fit_predict(X, Y) + np.testing.assert_allclose(expected, current) + + +def test_mean_absolute_error(): + X = np.ones((10, 3)) + Y = np.ones((10, 4)) + expected = 0.0 + tms = TransitionMatrixSolver() + tms.fit_predict(X, Y) + current = np.around(tms.mean_absolute_error(X, Y), 6) + np.testing.assert_allclose(expected, current) + + +def test_get_prediction_interval(): + with pytest.raises(NotImplementedError): + tms = TransitionMatrixSolver() + tms.get_prediction_interval(0) From bc1a7693fdf612cae16f6df225d48ec434ed98fd Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 24 Oct 2023 16:45:33 -0400 Subject: [PATCH 034/135] Adding test for rescaling the matrix if it's a pandas.DataFrame --- tests/test_transition_solver.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 3fc54d35..8ddde53e 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -94,8 +94,18 @@ def test_rescale_skipped(): @patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_rescale_rescaled(): +def test_rescale_rescaled_numpy(): A = np.ones((2, 2)) expected = np.array([[0.5, 0.5], [0.5, 0.5]]) ts = TransitionSolver() np.testing.assert_array_equal(ts._rescale(A), expected) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_rescale_rescaled_pandas(): + import pandas + + a_df = pandas.DataFrame(np.ones((2, 2)), columns=["A", "B"]) + expected_df = pandas.DataFrame([[0.5, 0.5], [0.5, 0.5]], columns=["A", "B"]) + ts = TransitionSolver() + np.testing.assert_array_equal(ts._rescale(a_df), expected_df) # pylint: disable=protected-access From 9f2683c6e13d9ba702c3d9bf869e0e4cdeede83b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 25 Oct 2023 14:07:52 -0400 Subject: [PATCH 035/135] Pass in matrixes of vote counts (integers) so I can ensure the percentage transitions are being computed correctly :disappointed: --- src/elexsolver/EITransitionSolver.py | 47 ++++++++++++------------ src/elexsolver/TransitionMatrixSolver.py | 30 +++++++-------- src/elexsolver/TransitionSolver.py | 38 ++++++------------- 3 files changed, 49 insertions(+), 66 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index cec1961f..0d393e44 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -14,37 +14,33 @@ class EITransitionSolver(TransitionSolver): """ A (voter) transition solver based on RxC ecological inference. - Largely adapted from version 1.0.1 of + Somewhat adapted from version 1.0.1 of Knudson et al., (2021). PyEI: A Python package for ecological inference. Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 + See also: + Ori Rosen, Wenxin Jiang, Gary King, and Martin A Tanner. 2001. + “Bayesian and Frequentist Inference for Ecological Inference: The RxC Case.” + Statistica Neerlandica, 55, Pp. 134–156. Copy at https://tinyurl.com/yajkae6n """ - def __init__(self, n: np.ndarray, sigma=1, sampling_chains=2, random_seed=None, draws=300): + def __init__(self, sigma=1, sampling_chains=2, random_seed=None, n_samples=300): super().__init__() - self._n = n self._sigma = sigma self._chains = int(sampling_chains) self._seed = random_seed - self._draws = draws - self._tune = draws // 2 + self._draws = n_samples + self._tune = n_samples // 2 # class members that are instantiated during model-fit self._sampled = None self._X_totals = None - def mean_absolute_error(self, X, Y): - y_pred = self._get_expected_totals(X) - y = self._get_expected_totals(Y.T) - absolute_errors = np.abs(y_pred - y) - error_sum = np.sum(absolute_errors) - mae = error_sum / len(absolute_errors) - return mae - def fit_predict(self, X, Y): + """ + X and Y are matrixes of integers. + """ self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) - self._check_percentages(X) - self._check_percentages(Y) # matrices should be (things x units), where the number of units is > the number of things if X.shape[0] > X.shape[1]: @@ -52,22 +48,25 @@ def fit_predict(self, X, Y): if Y.shape[0] > Y.shape[1]: Y = Y.T + self._check_dimensions(X) + self._check_dimensions(Y) + if X.shape[1] != Y.shape[1]: raise ValueError(f"Number of units in X ({X.shape[1]}) != number of units in Y ({Y.shape[1]}).") - if Y.shape[1] != len(self._n): - raise ValueError(f"Number of units in Y ({Y.shape[1]}) != number of units in n ({len(self._n)}).") - self._check_dimensions(X) + self._X_totals = X.sum(axis=1) / X.sum(axis=1).sum() + Y_expected_totals = Y.sum(axis=1) / Y.sum(axis=1).sum() + n = Y.sum(axis=0) + X = self._rescale(X) - self._check_dimensions(Y) Y = self._rescale(Y) - num_units = len(self._n) # should be the same as the number of units in Y + num_units = len(n) # should be the same as the number of units in Y num_rows = X.shape[0] # number of things in X that are being transitioned "from" num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" # reshaping and rounding - Y_obs = np.transpose(Y * self._n).round() + Y_obs = np.transpose(Y * n).round() X_extended = np.expand_dims(X, axis=2) X_extended = np.repeat(X_extended, num_cols, axis=2) X_extended = np.swapaxes(X_extended, 0, 1) @@ -78,7 +77,7 @@ def fit_predict(self, X, Y): theta = (X_extended * beta).sum(axis=1) pm.Multinomial( "result_fractions", - n=self._n, + n=n, p=theta, observed=Y_obs, shape=(num_units, num_cols), @@ -105,9 +104,9 @@ def fit_predict(self, X, Y): self._sampled = np.transpose(samples_summed_across / X.T.sum(axis=0).values, axes=(1, 2, 0)) posterior_mean_rxc = self._sampled.mean(axis=0) - self._X_totals = self._get_expected_totals(np.transpose(X)) transitions = self._get_transitions(posterior_mean_rxc) - LOG.info("MAE = %s", np.around(self.mean_absolute_error(transitions, Y), 4)) + Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() + LOG.info("MAE = %s", np.around(self.mean_absolute_error(Y_pred_totals, Y_expected_totals), 4)) return transitions def _get_transitions(self, A: np.ndarray): diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 7cfa2cff..2f921513 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -34,21 +34,12 @@ def __solve(self, A, B): problem.solve(solver=cp.ECOS) return transition_matrix.value - def mean_absolute_error(self, X, Y): - x = self._get_expected_totals(X) - y = self._get_expected_totals(Y) - - absolute_errors = np.abs(np.matmul(x, self._transition_matrix) - y) - error_sum = np.sum(absolute_errors) - mae = error_sum / len(absolute_errors) - - return mae - def fit_predict(self, X, Y): + """ + X and Y are matrixes of integers. + """ self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) - self._check_percentages(X) - self._check_percentages(Y) # matrices should be (units x things), where the number of units is > the number of things if X.shape[1] > X.shape[0]: @@ -57,15 +48,22 @@ def fit_predict(self, X, Y): Y = Y.T self._check_dimensions(X.T) - X = self._rescale(X.T).T self._check_dimensions(Y.T) - Y = self._rescale(Y.T).T if not isinstance(X, np.ndarray): X = X.to_numpy() if not isinstance(Y, np.ndarray): Y = Y.to_numpy() + X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() + Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() + + X = self._rescale(X.T).T + Y = self._rescale(Y.T).T + self._transition_matrix = self.__solve(X, Y) - LOG.info("MAE = %s", np.around(self.mean_absolute_error(X, Y), 4)) - return np.diag(self._get_expected_totals(X)) @ self._transition_matrix + transitions = np.diag(X_expected_totals) @ self._transition_matrix + Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() + LOG.info("MAE = %s", np.around(self.mean_absolute_error(Y_expected_totals, Y_pred_totals), 4)) + + return transitions diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index fa0aeaf0..4815aa3f 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -18,16 +18,13 @@ class TransitionSolver(ABC): def fit_predict(self, X: np.ndarray, Y: np.ndarray): raise NotImplementedError - def mean_absolute_error(self, X: np.ndarray, Y: np.ndarray): - raise NotImplementedError - def get_prediction_interval(self, pi: float): raise NotImplementedError - def _get_expected_totals(self, A: np.ndarray): - output = np.sum(A, axis=0) - # rescaling in case any columns had been dropped previously - return output / sum(output) + def mean_absolute_error(self, Y_expected: np.ndarray, Y_pred: np.ndarray): + absolute_errors = np.abs(Y_pred - Y_expected) + error_sum = np.sum(absolute_errors) + return error_sum / len(absolute_errors) def _check_any_element_nan_or_inf(self, A: np.ndarray): """ @@ -36,13 +33,6 @@ def _check_any_element_nan_or_inf(self, A: np.ndarray): if np.any(np.isnan(A)) or np.any(np.isinf(A)): raise ValueError("Matrix contains NaN or Infinity") - def _check_percentages(self, A: np.ndarray): - """ - Verify that every element in matrix A is >= 0 and <= 1. - """ - if not np.all((A >= 0) & (A <= 1)): - raise ValueError("Matrix contains values less than 0 or greater than 1.") - def _check_dimensions(self, A: np.ndarray): """ Ensure that in our (things x units) matrix, the number of units is @@ -55,15 +45,11 @@ def _rescale(self, A: np.ndarray): """ Rescale columns (units) to ensure they sum to 1 (100%). """ - unit_totals = A.sum(axis=0) - if not np.allclose(unit_totals, np.ones(unit_totals.shape)): - LOG.warning("Each unit needs to sum to 1. Rescaling...") - if isinstance(A, np.ndarray): - for j in range(0, A.shape[1]): - A[:, j] /= A[:, j].sum() - return np.nan_to_num(A, nan=0, posinf=0, neginf=0) - # pandas.DataFrame() - for col in A.columns: - A[col] /= A[col].sum() - return A.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) - return A + if isinstance(A, np.ndarray): + for j in range(0, A.shape[1]): + A[:, j] /= A[:, j].sum() + return np.nan_to_num(A, nan=0, posinf=0, neginf=0) + # pandas.DataFrame() + for col in A.columns: + A[col] /= A[col].sum() + return A.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) From 53d4daa6f2b88eda37935a140064322883df7afb Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 25 Oct 2023 14:36:36 -0400 Subject: [PATCH 036/135] Getting unit tests working again after all those changes --- src/elexsolver/TransitionSolver.py | 2 +- tests/test_transition_matrix_solver.py | 38 ++++++++++---------------- tests/test_transition_solver.py | 32 ++++------------------ 3 files changed, 21 insertions(+), 51 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 4815aa3f..e841ddc8 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -47,7 +47,7 @@ def _rescale(self, A: np.ndarray): """ if isinstance(A, np.ndarray): for j in range(0, A.shape[1]): - A[:, j] /= A[:, j].sum() + A[:, j] = A[:, j] / A[:, j].sum() return np.nan_to_num(A, nan=0, posinf=0, neginf=0) # pandas.DataFrame() for col in A.columns: diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 770701d3..c8a0a0cd 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -7,41 +7,31 @@ def test_fit_predict(): X = np.array( [ - [0.13991186, 0.19010302], - [0.13774767, 0.19199878], - [0.17240947, 0.16163616], - [0.19983843, 0.13760927], - [0.15796058, 0.17429292], - [0.192132, 0.14435986], + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], ] ) Y = np.array( [ - [0.15551131, 0.16977255], - [0.1573689, 0.16925536], - [0.16995309, 0.16575166], - [0.15144583, 0.17090446], - [0.16700258, 0.16657314], - [0.19871829, 0.15774283], + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], ] ) - expected = np.array([[0.29134295, 0.20806254], [0.2076699, 0.29292461]]) + expected = np.array([[0.230769, 0.230769], [0.269231, 0.269231]]) tms = TransitionMatrixSolver() current = tms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current) - - -def test_mean_absolute_error(): - X = np.ones((10, 3)) - Y = np.ones((10, 4)) - expected = 0.0 - tms = TransitionMatrixSolver() - tms.fit_predict(X, Y) - current = np.around(tms.mean_absolute_error(X, Y), 6) - np.testing.assert_allclose(expected, current) + np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) def test_get_prediction_interval(): diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 8ddde53e..6f2f486d 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -13,13 +13,6 @@ def test_superclass_fit_predict(): ts.fit_predict(None, None) -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_superclass_mean_absolute_error(): - with pytest.raises(NotImplementedError): - ts = TransitionSolver() - ts.mean_absolute_error(None, None) - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_superclass_get_prediction_interval(): with pytest.raises(NotImplementedError): @@ -28,11 +21,13 @@ def test_superclass_get_prediction_interval(): @patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_get_expected_totals(): - A = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) - expected = np.array([0.23809524, 0.33333333, 0.42857143]) +def test_mean_absolute_error(): + Y = np.ones((5, 4)) + Y_pred = Y - 0.02 + expected = 0.08 ts = TransitionSolver() - np.testing.assert_allclose(ts._get_expected_totals(A), expected) # pylint: disable=protected-access + current = np.around(ts.mean_absolute_error(Y, Y_pred), 6) + np.testing.assert_allclose(expected, current) @patch.object(TransitionSolver, "__abstractmethods__", set()) @@ -50,21 +45,6 @@ def test_check_any_element_nan_or_inf_without_nan(): ts._check_any_element_nan_or_inf(A) # pylint: disable=protected-access -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_check_percentages_bad(): - with pytest.raises(ValueError): - A = np.array([[0.1, 0.2, 3], [0.4, 0.5, 0.6]]) - ts = TransitionSolver() - ts._check_percentages(A) # pylint: disable=protected-access - - -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_check_percentages_good(): - A = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) - ts = TransitionSolver() - ts._check_percentages(A) # pylint: disable=protected-access - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_check_dimensions_bad(): with pytest.raises(ValueError): From 1d2e26b7b13c41e79052e3a6e041bc6564343cf1 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 25 Oct 2023 15:13:04 -0400 Subject: [PATCH 037/135] Add check for integer data --- src/elexsolver/EITransitionSolver.py | 2 ++ src/elexsolver/TransitionMatrixSolver.py | 2 ++ src/elexsolver/TransitionSolver.py | 6 +++++- tests/test_transition_solver.py | 22 +++++++++++++++------- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 0d393e44..58fa03f3 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -39,6 +39,8 @@ def fit_predict(self, X, Y): """ X and Y are matrixes of integers. """ + self._check_data_type(X) + self._check_data_type(Y) self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 2f921513..4366b0e6 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -38,6 +38,8 @@ def fit_predict(self, X, Y): """ X and Y are matrixes of integers. """ + self._check_data_type(X) + self._check_data_type(Y) self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index e841ddc8..d5b09529 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -31,7 +31,11 @@ def _check_any_element_nan_or_inf(self, A: np.ndarray): Check whether any element in a matrix or vector is NaN or infinity """ if np.any(np.isnan(A)) or np.any(np.isinf(A)): - raise ValueError("Matrix contains NaN or Infinity") + raise ValueError("Matrix contains NaN or Infinity.") + + def _check_data_type(self, A: np.ndarray): + if not np.all(A.astype("int64") == A): + raise ValueError("Matrix must contain integers.") def _check_dimensions(self, A: np.ndarray): """ diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 6f2f486d..78e9a5c2 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -66,13 +66,6 @@ def test_check_dimensions_good(): ts._check_dimensions(A) # pylint: disable=protected-access -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_rescale_skipped(): - A = np.ones((10, 2)) / 10 - ts = TransitionSolver() - np.testing.assert_array_equal(ts._rescale(A), A) # pylint: disable=protected-access - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_rescale_rescaled_numpy(): A = np.ones((2, 2)) @@ -89,3 +82,18 @@ def test_rescale_rescaled_pandas(): expected_df = pandas.DataFrame([[0.5, 0.5], [0.5, 0.5]], columns=["A", "B"]) ts = TransitionSolver() np.testing.assert_array_equal(ts._rescale(a_df), expected_df) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_data_type_good(): + A = np.array([[1, 2, 3], [4, 5, 6]]) + ts = TransitionSolver() + ts._check_data_type(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_data_type_bad(): + with pytest.raises(ValueError): + A = np.array([[0.1, 0.2, 0.3]]) + ts = TransitionSolver() + ts._check_data_type(A) # pylint: disable=protected-access From 9ffdc70a1ca6ef4707ea0735658e114e9da035fb Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 2 Nov 2023 11:51:44 -0400 Subject: [PATCH 038/135] Silencing perfectly-ok warning about division-by-zero in rescale method in TransitionSolver --- src/elexsolver/TransitionSolver.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index d5b09529..a1507149 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -1,4 +1,5 @@ import logging +import warnings from abc import ABC import numpy as np @@ -50,9 +51,15 @@ def _rescale(self, A: np.ndarray): Rescale columns (units) to ensure they sum to 1 (100%). """ if isinstance(A, np.ndarray): - for j in range(0, A.shape[1]): - A[:, j] = A[:, j] / A[:, j].sum() - return np.nan_to_num(A, nan=0, posinf=0, neginf=0) + with warnings.catch_warnings(): + # Zeros are completely ok here; + # means the candidate received zero votes. + warnings.filterwarnings( + "ignore", category=RuntimeWarning, message="invalid value encountered in divide" + ) + for j in range(0, A.shape[1]): + A[:, j] = A[:, j] / A[:, j].sum() + return np.nan_to_num(A, nan=0, posinf=0, neginf=0) # pandas.DataFrame() for col in A.columns: A[col] /= A[col].sum() From c2a8909c13e0de3508b0f6d74f652f515d9599c3 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 20 Nov 2023 12:00:07 -0500 Subject: [PATCH 039/135] Adding check and exception for units that are completely zero --- src/elexsolver/TransitionSolver.py | 17 +++++++++---- tests/test_transition_solver.py | 39 +++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index a1507149..8d020c6b 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -40,20 +40,27 @@ def _check_data_type(self, A: np.ndarray): def _check_dimensions(self, A: np.ndarray): """ - Ensure that in our (things x units) matrix, the number of units is + Ensure that in our (units x things) matrix, the number of units is at least twice as large as the number of things. """ - if A.shape[1] <= A.shape[0] or (A.shape[1] // 2) <= A.shape[0]: - raise ValueError(f"Not enough units ({A.shape[1]}) relative to the number of things ({A.shape[0]}).") + if A.shape[0] <= A.shape[1] or (A.shape[0] // 2) <= A.shape[1]: + raise ValueError(f"Not enough units ({A.shape[0]}) relative to the number of things ({A.shape[1]}).") + + def _check_for_zero_units(self, A: np.ndarray): + """ + If we have at least one unit whose columns are all zero, we can't continue. + """ + if np.any(np.sum(A, axis=1) == 0): + raise ValueError("Matrix cannot contain any rows (units) where all columns (things) are zero.") def _rescale(self, A: np.ndarray): """ - Rescale columns (units) to ensure they sum to 1 (100%). + Rescale columns (things) to ensure they sum to 1 (100%). """ if isinstance(A, np.ndarray): with warnings.catch_warnings(): # Zeros are completely ok here; - # means the candidate received zero votes. + # means the thing (e.g. candidate) received zero votes. warnings.filterwarnings( "ignore", category=RuntimeWarning, message="invalid value encountered in divide" ) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 78e9a5c2..28d33fee 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -57,15 +57,48 @@ def test_check_dimensions_bad(): def test_check_dimensions_good(): A = np.array( [ - [0.1, 0.4, 0.7, 0.1, 0.4, 0.7, 0.1, 0.4, 0.7], - [0.2, 0.5, 0.8, 0.2, 0.5, 0.8, 0.2, 0.5, 0.8], - [0.3, 0.6, 0.9, 0.3, 0.6, 0.9, 0.3, 0.6, 0.9], + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + [0.7, 0.8, 0.9], + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + [0.7, 0.8, 0.9], + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + [0.7, 0.8, 0.9], ] ) ts = TransitionSolver() ts._check_dimensions(A) # pylint: disable=protected-access +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_for_zero_units_good(): + A = np.array( + [ + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + [0.7, 0.8, 0.9], + ] + ) + ts = TransitionSolver() + ts._check_for_zero_units(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_for_zero_units_bad(): + with pytest.raises(ValueError): + A = np.array( + [ + [0.1, 0.2, 0.3], + [0.0, 0.0, 0.0], + [0.7, 0.8, 0.9], + ] + ) + ts = TransitionSolver() + ts._check_for_zero_units(A) # pylint: disable=protected-access + + @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_rescale_rescaled_numpy(): A = np.ones((2, 2)) From 6beb55830109fbde7b2d5a219e9d65bae026e95b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 20 Nov 2023 12:11:56 -0500 Subject: [PATCH 040/135] Need to push the changes I made to the matrix solver so the tests pass remotely lol --- src/elexsolver/TransitionMatrixSolver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 4366b0e6..2cc4ff71 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -49,8 +49,10 @@ def fit_predict(self, X, Y): if Y.shape[1] > Y.shape[0]: Y = Y.T - self._check_dimensions(X.T) - self._check_dimensions(Y.T) + self._check_dimensions(X) + self._check_dimensions(Y) + self._check_for_zero_units(X) + self._check_for_zero_units(Y) if not isinstance(X, np.ndarray): X = X.to_numpy() From f734896282681b59f0b298dbfd0780ff6caa7391 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 20 Nov 2023 13:24:52 -0500 Subject: [PATCH 041/135] Adding the check for zero units and adding some consistency to checks in the EI solver --- src/elexsolver/EITransitionSolver.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 58fa03f3..830eb932 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -44,17 +44,24 @@ def fit_predict(self, X, Y): self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) - # matrices should be (things x units), where the number of units is > the number of things - if X.shape[0] > X.shape[1]: + # first, ensure matrices are (units x things), where the number of units is > the number of things + # this will allow us to re-use some of the same checks we use for all other solvers + if X.shape[1] > X.shape[0]: X = X.T - if Y.shape[0] > Y.shape[1]: + if Y.shape[1] > Y.shape[0]: Y = Y.T + if X.shape[0] != Y.shape[0]: + raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") + self._check_dimensions(X) self._check_dimensions(Y) + self._check_for_zero_units(X) + self._check_for_zero_units(Y) - if X.shape[1] != Y.shape[1]: - raise ValueError(f"Number of units in X ({X.shape[1]}) != number of units in Y ({Y.shape[1]}).") + # but for this solver, we need our matrices to be (things x units) + X = X.T + Y = Y.T self._X_totals = X.sum(axis=1) / X.sum(axis=1).sum() Y_expected_totals = Y.sum(axis=1) / Y.sum(axis=1).sum() From 3d19897051717c806f5ee5552c67487f96d626a5 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 20 Nov 2023 15:18:15 -0500 Subject: [PATCH 042/135] Silencing some extraneous/unnecessary pymc and jax logging messages --- src/elexsolver/EITransitionSolver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 830eb932..100ca2d1 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -9,6 +9,8 @@ initialize_logging() LOG = logging.getLogger(__name__) +logging.getLogger("pymc").setLevel(logging.ERROR) +logging.getLogger("jax").setLevel(logging.ERROR) class EITransitionSolver(TransitionSolver): From d32844c069ca5780a7b3bd9b2179bfc176797530 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 1 Dec 2023 17:49:34 -0500 Subject: [PATCH 043/135] Independent function for MAE calculation and MAE as a class member/property --- src/elexsolver/EITransitionSolver.py | 5 +++-- src/elexsolver/TransitionMatrixSolver.py | 5 +++-- src/elexsolver/TransitionSolver.py | 16 ++++++++++++---- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 100ca2d1..9efc51ea 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -4,7 +4,7 @@ import pymc as pm from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver +from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error initialize_logging() @@ -117,7 +117,8 @@ def fit_predict(self, X, Y): posterior_mean_rxc = self._sampled.mean(axis=0) transitions = self._get_transitions(posterior_mean_rxc) Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() - LOG.info("MAE = %s", np.around(self.mean_absolute_error(Y_pred_totals, Y_expected_totals), 4)) + self._mae = mean_absolute_error(Y_pred_totals, Y_expected_totals) + LOG.info("MAE = %s", np.around(self._mae, 4)) return transitions def _get_transitions(self, A: np.ndarray): diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 2cc4ff71..4a917d50 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -4,7 +4,7 @@ import numpy as np from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver +from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error initialize_logging() @@ -68,6 +68,7 @@ def fit_predict(self, X, Y): self._transition_matrix = self.__solve(X, Y) transitions = np.diag(X_expected_totals) @ self._transition_matrix Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() - LOG.info("MAE = %s", np.around(self.mean_absolute_error(Y_expected_totals, Y_pred_totals), 4)) + self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) + LOG.info("MAE = %s", np.around(self._mae, 4)) return transitions diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 8d020c6b..5b8f5dd1 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -11,21 +11,29 @@ LOG = logging.getLogger(__name__) +def mean_absolute_error(Y_expected: np.ndarray, Y_pred: np.ndarray): + absolute_errors = np.abs(Y_pred - Y_expected) + error_sum = np.sum(absolute_errors) + return error_sum / len(absolute_errors) + + class TransitionSolver(ABC): """ Abstract class for (voter) transition solvers. """ + def __init__(self): + self._mae = None + def fit_predict(self, X: np.ndarray, Y: np.ndarray): raise NotImplementedError def get_prediction_interval(self, pi: float): raise NotImplementedError - def mean_absolute_error(self, Y_expected: np.ndarray, Y_pred: np.ndarray): - absolute_errors = np.abs(Y_pred - Y_expected) - error_sum = np.sum(absolute_errors) - return error_sum / len(absolute_errors) + @property + def MAE(self): + return self._mae def _check_any_element_nan_or_inf(self, A: np.ndarray): """ From e6a80335dc91dea78f8c92e1e98dba8af686df8d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 1 Dec 2023 22:12:55 -0500 Subject: [PATCH 044/135] Handle situation where numpy arrays passed in to MAE function are actually lists and not numpy arrays --- src/elexsolver/TransitionSolver.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 5b8f5dd1..0f5cc917 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -12,6 +12,11 @@ def mean_absolute_error(Y_expected: np.ndarray, Y_pred: np.ndarray): + if isinstance(Y_expected, list): + Y_expected = np.array(Y_expected) + if isinstance(Y_pred, list): + Y_pred = np.array(Y_pred) + absolute_errors = np.abs(Y_pred - Y_expected) error_sum = np.sum(absolute_errors) return error_sum / len(absolute_errors) From f8e65c66534adaa03a4c32aac07e0ee566896181 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 1 Dec 2023 22:19:57 -0500 Subject: [PATCH 045/135] Updating the MAE unit test since it's no longer a class method --- tests/test_transition_solver.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 28d33fee..a63d16de 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -3,7 +3,15 @@ import numpy as np import pytest -from elexsolver.TransitionSolver import TransitionSolver +from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error + + +def test_mean_absolute_error(): + Y = np.ones((5, 4)) + Y_pred = Y - 0.02 + expected = 0.08 + current = np.around(mean_absolute_error(Y, Y_pred), 6) + np.testing.assert_allclose(expected, current) @patch.object(TransitionSolver, "__abstractmethods__", set()) @@ -20,16 +28,6 @@ def test_superclass_get_prediction_interval(): ts.get_prediction_interval(0) -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_mean_absolute_error(): - Y = np.ones((5, 4)) - Y_pred = Y - 0.02 - expected = 0.08 - ts = TransitionSolver() - current = np.around(ts.mean_absolute_error(Y, Y_pred), 6) - np.testing.assert_allclose(expected, current) - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_check_any_element_nan_or_inf_with_nan(): with pytest.raises(ValueError): From d1044d604c80487a021d06bdf019654692189250 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 6 Dec 2023 10:48:57 -0500 Subject: [PATCH 046/135] Fixing issue where integer division was being performed when rescaling so actual numbers weren't being used to get like any actual voterflows --- src/elexsolver/TransitionSolver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 0f5cc917..5b6ebff6 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -70,6 +70,8 @@ def _rescale(self, A: np.ndarray): """ Rescale columns (things) to ensure they sum to 1 (100%). """ + A = A.copy().astype(float) + if isinstance(A, np.ndarray): with warnings.catch_warnings(): # Zeros are completely ok here; From 8a315580eff804bdb7f6bac6e8b8f193594a389e Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 6 Dec 2023 11:32:33 -0500 Subject: [PATCH 047/135] Fixing the matrix solver fit_predict() unit test now that I fixed such a huge mistake --- tests/test_transition_matrix_solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index c8a0a0cd..1d7c908f 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -27,7 +27,7 @@ def test_fit_predict(): ] ) - expected = np.array([[0.230769, 0.230769], [0.269231, 0.269231]]) + expected = np.array([[0.35096678, 0.11057168], [0.11665334, 0.4218082]]) tms = TransitionMatrixSolver() current = tms.fit_predict(X, Y) From 399daaf2959e95046ff1ca91cfcd9d1055ee1e36 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 6 Dec 2023 12:55:40 -0500 Subject: [PATCH 048/135] Ensuring that the tests for rescale() test using integers with dtype int --- tests/test_transition_solver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index a63d16de..3d90fd13 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -99,7 +99,7 @@ def test_check_for_zero_units_bad(): @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_rescale_rescaled_numpy(): - A = np.ones((2, 2)) + A = np.ones((2, 2)).astype(int) expected = np.array([[0.5, 0.5], [0.5, 0.5]]) ts = TransitionSolver() np.testing.assert_array_equal(ts._rescale(A), expected) # pylint: disable=protected-access @@ -109,7 +109,7 @@ def test_rescale_rescaled_numpy(): def test_rescale_rescaled_pandas(): import pandas - a_df = pandas.DataFrame(np.ones((2, 2)), columns=["A", "B"]) + a_df = pandas.DataFrame(np.ones((2, 2)), columns=["A", "B"]).astype(int) expected_df = pandas.DataFrame([[0.5, 0.5], [0.5, 0.5]], columns=["A", "B"]) ts = TransitionSolver() np.testing.assert_array_equal(ts._rescale(a_df), expected_df) # pylint: disable=protected-access From 6e130ba670b9d0b4244f9ffa36b323028d99fd72 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 6 Dec 2023 14:02:10 -0500 Subject: [PATCH 049/135] Using the Clarabel solver instead of ECOS after all since it seems more robust; setting transition_matrix pos=True helps prevent negative results --- src/elexsolver/TransitionMatrixSolver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 4a917d50..695a55a8 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -24,14 +24,12 @@ def __get_constraint(coef, strict): return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] def __solve(self, A, B): - transition_matrix = cp.Variable((A.shape[1], B.shape[1])) + transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) loss_function = cp.norm(A @ transition_matrix - B, "fro") objective = cp.Minimize(loss_function) constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) problem = cp.Problem(objective, constraint) - # preferring cvxpy's prior default solver, ECOS, over its new default, Clarabel - # because sometimes Clarabel produces negative-valued results for our problem - problem.solve(solver=cp.ECOS) + problem.solve(solver=cp.CLARABEL) return transition_matrix.value def fit_predict(self, X, Y): From 5d33777b9156faf5799bf0f7310b8d3e6651f2f4 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 8 Dec 2023 09:42:42 -0500 Subject: [PATCH 050/135] SUPER preliminary version of a bootstrap matrix solver --- src/elexsolver/TransitionMatrixSolver.py | 50 +++++++++++++++++++----- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 695a55a8..55d974ef 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -14,16 +14,23 @@ class TransitionMatrixSolver(TransitionSolver): def __init__(self, strict=True): super().__init__() - self._transition_matrix = None self._strict = strict + # class members that are instantiated during model-fit + # for bootstrapping + self._residuals = None + self._X = None + self._Y = None + self._X_expected_totals = None + self._Y_expected_totals = None + @staticmethod def __get_constraint(coef, strict): if strict: return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] - def __solve(self, A, B): + def _solve(self, A, B): transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) loss_function = cp.norm(A @ transition_matrix - B, "fro") objective = cp.Minimize(loss_function) @@ -57,16 +64,41 @@ def fit_predict(self, X, Y): if not isinstance(Y, np.ndarray): Y = Y.to_numpy() - X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() - Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() + self._X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() + self._Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() - X = self._rescale(X.T).T - Y = self._rescale(Y.T).T + self._X = self._rescale(X.T).T + self._Y = self._rescale(Y.T).T - self._transition_matrix = self.__solve(X, Y) - transitions = np.diag(X_expected_totals) @ self._transition_matrix + transition_matrix = self._solve(self._X, self._Y) + transitions = np.diag(self._X_expected_totals) @ transition_matrix Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() - self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) + self._mae = mean_absolute_error(self._Y_expected_totals, Y_pred_totals) LOG.info("MAE = %s", np.around(self._mae, 4)) + self._residuals = Y_pred_totals - self._Y_expected_totals return transitions + + +class BootstrapTransitionMatrixSolver(TransitionSolver): + def __init__(self, B=1, strict=True): + super().__init__() + self._strict = strict + + def fit_predict(self, X, Y): + tm = TransitionMatrixSolver(strict=self._strict) + _ = tm.fit_predict(X, Y) + + from sklearn.utils import resample # to be replaced + + residuals_hat = resample(tm._residuals, replace=True, random_state=1024) + Y_hat = tm._Y.copy() + for j in range(0, Y_hat.shape[1]): + Y_hat[:, j] = Y_hat[:, j] + (residuals_hat[j] / len(Y_hat)) + + transition_matrix_hat = tm._solve(tm._X, Y_hat) + transitions_hat = np.diag(tm._X_expected_totals) @ transition_matrix_hat + Y_pred_totals = np.sum(transitions_hat, axis=0) / np.sum(transitions_hat, axis=0).sum() + self._mae = mean_absolute_error(tm._Y_expected_totals, Y_pred_totals) + LOG.info("MAE = %s", np.around(self._mae, 4)) + return transitions_hat From ccea996678bf78ea94015116599b0d30fa5daa95 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 8 Dec 2023 11:01:47 -0500 Subject: [PATCH 051/135] Now generating random residuals for each unit/candidate rather than just distributing them uniformly across all units/candidates --- src/elexsolver/TransitionMatrixSolver.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 55d974ef..2e97bd14 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -85,6 +85,18 @@ def __init__(self, B=1, strict=True): super().__init__() self._strict = strict + def _constrained_random_numbers(self, n, M, seed=None): + """ + Generate n random numbers that sum to M. + Based on: https://stackoverflow.com/a/30659457/224912 + """ + rng = np.random.default_rng(seed=seed) + splits = [0] + [rng.random() for _ in range(0, n - 1)] + [1] + splits.sort() + diffs = [x - splits[i - 1] for (i, x) in enumerate(splits)][1:] + result = map(lambda x: x * M, diffs) + return list(result) + def fit_predict(self, X, Y): tm = TransitionMatrixSolver(strict=self._strict) _ = tm.fit_predict(X, Y) @@ -94,7 +106,8 @@ def fit_predict(self, X, Y): residuals_hat = resample(tm._residuals, replace=True, random_state=1024) Y_hat = tm._Y.copy() for j in range(0, Y_hat.shape[1]): - Y_hat[:, j] = Y_hat[:, j] + (residuals_hat[j] / len(Y_hat)) + residuals_j = self._constrained_random_numbers(len(Y_hat), residuals_hat[j], seed=j) + Y_hat[:, j] = Y_hat[:, j] + residuals_j transition_matrix_hat = tm._solve(tm._X, Y_hat) transitions_hat = np.diag(tm._X_expected_totals) @ transition_matrix_hat From f67852b9ea6fbd15b56333d0b92aeab16383112b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 8 Dec 2023 11:36:52 -0500 Subject: [PATCH 052/135] Hmmmmmm... --- src/elexsolver/TransitionMatrixSolver.py | 35 +++++++++++++++--------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 2e97bd14..1bbef11c 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -81,9 +81,10 @@ def fit_predict(self, X, Y): class BootstrapTransitionMatrixSolver(TransitionSolver): - def __init__(self, B=1, strict=True): + def __init__(self, B=1000, strict=True): super().__init__() self._strict = strict + self._B = B def _constrained_random_numbers(self, n, M, seed=None): """ @@ -99,19 +100,27 @@ def _constrained_random_numbers(self, n, M, seed=None): def fit_predict(self, X, Y): tm = TransitionMatrixSolver(strict=self._strict) - _ = tm.fit_predict(X, Y) + transitions = tm.fit_predict(X, Y) from sklearn.utils import resample # to be replaced - residuals_hat = resample(tm._residuals, replace=True, random_state=1024) - Y_hat = tm._Y.copy() - for j in range(0, Y_hat.shape[1]): - residuals_j = self._constrained_random_numbers(len(Y_hat), residuals_hat[j], seed=j) - Y_hat[:, j] = Y_hat[:, j] + residuals_j + maes = [] - transition_matrix_hat = tm._solve(tm._X, Y_hat) - transitions_hat = np.diag(tm._X_expected_totals) @ transition_matrix_hat - Y_pred_totals = np.sum(transitions_hat, axis=0) / np.sum(transitions_hat, axis=0).sum() - self._mae = mean_absolute_error(tm._Y_expected_totals, Y_pred_totals) - LOG.info("MAE = %s", np.around(self._mae, 4)) - return transitions_hat + for b in range(0, self._B): + residuals_hat = resample(tm._residuals, replace=True, random_state=b) + Y_hat = tm._Y.copy() + for j in range(0, Y_hat.shape[1]): + residuals_j = self._constrained_random_numbers(len(Y_hat), residuals_hat[j], seed=j) + Y_hat[:, j] = Y_hat[:, j] + residuals_j + + transition_matrix_hat = tm._solve(tm._X, Y_hat) + transitions_hat = np.diag(tm._X_expected_totals) @ transition_matrix_hat + transitions = transitions + transitions_hat + + Y_pred_totals = np.sum(transitions_hat, axis=0) / np.sum(transitions_hat, axis=0).sum() + this_mae = mean_absolute_error(tm._Y_expected_totals, Y_pred_totals) + maes.append(this_mae) + LOG.info("MAE = %s", np.around(this_mae, 4)) + + self._mae = np.mean(maes) + return transitions / self._B From baf0e417a07fde9c8a752e09985914a2b4ef0a68 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 8 Dec 2023 12:00:09 -0500 Subject: [PATCH 053/135] Shuffling the random per unit/candidate residuals, which does nothing lol --- src/elexsolver/TransitionMatrixSolver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 1bbef11c..52bedb3d 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -95,8 +95,9 @@ def _constrained_random_numbers(self, n, M, seed=None): splits = [0] + [rng.random() for _ in range(0, n - 1)] + [1] splits.sort() diffs = [x - splits[i - 1] for (i, x) in enumerate(splits)][1:] - result = map(lambda x: x * M, diffs) - return list(result) + result = list(map(lambda x: x * M, diffs)) + rng.shuffle(result) + return result def fit_predict(self, X, Y): tm = TransitionMatrixSolver(strict=self._strict) From b1f3a7a39ce886be97ed7594112ad4d1b9f16c4c Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 8 Dec 2023 15:41:24 -0500 Subject: [PATCH 054/135] Adding option for weights to matrix solver --- src/elexsolver/TransitionMatrixSolver.py | 13 +++++++--- src/elexsolver/TransitionSolver.py | 16 +++++++++++- tests/test_transition_matrix_solver.py | 32 ++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 52bedb3d..bb69285b 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -30,18 +30,21 @@ def __get_constraint(coef, strict): return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] - def _solve(self, A, B): + def _solve(self, A, B, weights): transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) - loss_function = cp.norm(A @ transition_matrix - B, "fro") + Aw = np.dot(weights, A) + Bw = np.dot(weights, B) + loss_function = cp.norm(Aw @ transition_matrix - Bw, "fro") objective = cp.Minimize(loss_function) constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) problem = cp.Problem(objective, constraint) problem.solve(solver=cp.CLARABEL) return transition_matrix.value - def fit_predict(self, X, Y): + def fit_predict(self, X, Y, weights=None): """ X and Y are matrixes of integers. + weights is a list or numpy array with the same length as both X and Y. """ self._check_data_type(X) self._check_data_type(Y) @@ -70,7 +73,9 @@ def fit_predict(self, X, Y): self._X = self._rescale(X.T).T self._Y = self._rescale(Y.T).T - transition_matrix = self._solve(self._X, self._Y) + weights = self._check_and_prepare_weights(self._X, self._Y, weights) + + transition_matrix = self._solve(self._X, self._Y, weights) transitions = np.diag(self._X_expected_totals) @ transition_matrix Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() self._mae = mean_absolute_error(self._Y_expected_totals, Y_pred_totals) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 5b6ebff6..16414963 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -30,7 +30,7 @@ class TransitionSolver(ABC): def __init__(self): self._mae = None - def fit_predict(self, X: np.ndarray, Y: np.ndarray): + def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): raise NotImplementedError def get_prediction_interval(self, pi: float): @@ -86,3 +86,17 @@ def _rescale(self, A: np.ndarray): for col in A.columns: A[col] /= A[col].sum() return A.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) + + def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None): + if weights is not None: + if len(weights) != X.shape[0] and len(weights) != Y.shape[0]: + raise ValueError("weights must be the same length as the number of rows in X and Y.") + if isinstance(weights, list): + weights = np.array(weights).copy() + elif not isinstance(weights, np.ndarray): + # pandas.Series + weights = weights.values.copy() + else: + weights = np.ones((Y.shape[0],)) + + return np.diag(np.sqrt(weights.flatten() / weights.sum())) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 1d7c908f..f4a36285 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -34,6 +34,38 @@ def test_fit_predict(): np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) +def test_fit_predict_with_weights(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) + + expected = np.array([[0.340306, 0.121233], [0.124163, 0.414298]]) + + tms = TransitionMatrixSolver() + current = tms.fit_predict(X, Y, weights=weights) + np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) + + def test_get_prediction_interval(): with pytest.raises(NotImplementedError): tms = TransitionMatrixSolver() From 5fa30c74ee917b28915f612c7bc82f594eb4230f Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 8 Dec 2023 17:12:48 -0500 Subject: [PATCH 055/135] Switching the EI solver over to (units x candidates) to match the matrix solver and remove confusion --- src/elexsolver/EITransitionSolver.py | 29 ++++++++++++---------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 9efc51ea..f81da2bb 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -37,7 +37,7 @@ def __init__(self, sigma=1, sampling_chains=2, random_seed=None, n_samples=300): self._sampled = None self._X_totals = None - def fit_predict(self, X, Y): + def fit_predict(self, X, Y, weights=None): """ X and Y are matrixes of integers. """ @@ -61,26 +61,21 @@ def fit_predict(self, X, Y): self._check_for_zero_units(X) self._check_for_zero_units(Y) - # but for this solver, we need our matrices to be (things x units) - X = X.T - Y = Y.T + self._X_totals = X.sum(axis=0) / X.sum(axis=0).sum() + Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() + n = Y.sum(axis=1) - self._X_totals = X.sum(axis=1) / X.sum(axis=1).sum() - Y_expected_totals = Y.sum(axis=1) / Y.sum(axis=1).sum() - n = Y.sum(axis=0) - - X = self._rescale(X) - Y = self._rescale(Y) + X = self._rescale(X.T).T + Y = self._rescale(Y.T).T num_units = len(n) # should be the same as the number of units in Y - num_rows = X.shape[0] # number of things in X that are being transitioned "from" - num_cols = Y.shape[0] # number of things in Y that are being transitioned "to" + num_rows = X.shape[1] # number of things in X that are being transitioned "from" + num_cols = Y.shape[1] # number of things in Y that are being transitioned "to" # reshaping and rounding - Y_obs = np.transpose(Y * n).round() + Y_obs = (Y.T * n).round() X_extended = np.expand_dims(X, axis=2) X_extended = np.repeat(X_extended, num_cols, axis=2) - X_extended = np.swapaxes(X_extended, 0, 1) with pm.Model(check_bounds=False) as model: conc_params = pm.HalfNormal("conc_params", sigma=self._sigma, shape=(num_rows, num_cols)) @@ -90,7 +85,7 @@ def fit_predict(self, X, Y): "result_fractions", n=n, p=theta, - observed=Y_obs, + observed=Y_obs.T, shape=(num_units, num_cols), ) try: @@ -110,9 +105,9 @@ def fit_predict(self, X, Y): b_values = np.transpose( model_trace["posterior"]["beta"].stack(all_draws=["chain", "draw"]).values, axes=(3, 0, 1, 2) ) - samples_converted = np.transpose(b_values, axes=(3, 0, 1, 2)) * X.T.values + samples_converted = np.transpose(b_values, axes=(3, 0, 1, 2)) * X.values samples_summed_across = samples_converted.sum(axis=2) - self._sampled = np.transpose(samples_summed_across / X.T.sum(axis=0).values, axes=(1, 2, 0)) + self._sampled = np.transpose(samples_summed_across / X.sum(axis=0).values, axes=(1, 2, 0)) posterior_mean_rxc = self._sampled.mean(axis=0) transitions = self._get_transitions(posterior_mean_rxc) From 80512b17c8d36c42a4d0640fdb0c5d4c37260816 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 11 Dec 2023 09:37:24 -0500 Subject: [PATCH 056/135] Finish up the conversion of the EI solver from (things x units) to (units x things) --- src/elexsolver/EITransitionSolver.py | 16 ++++++++-------- src/elexsolver/TransitionMatrixSolver.py | 4 ++-- src/elexsolver/TransitionSolver.py | 21 ++++++--------------- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index f81da2bb..c32203f5 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -46,8 +46,7 @@ def fit_predict(self, X, Y, weights=None): self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) - # first, ensure matrices are (units x things), where the number of units is > the number of things - # this will allow us to re-use some of the same checks we use for all other solvers + # matrices should be (units x things), where the number of units is > the number of things if X.shape[1] > X.shape[0]: X = X.T if Y.shape[1] > Y.shape[0]: @@ -65,15 +64,16 @@ def fit_predict(self, X, Y, weights=None): Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() n = Y.sum(axis=1) - X = self._rescale(X.T).T - Y = self._rescale(Y.T).T + X = self._rescale(X) + Y_obs = Y.copy() + Y = self._rescale(Y) num_units = len(n) # should be the same as the number of units in Y num_rows = X.shape[1] # number of things in X that are being transitioned "from" num_cols = Y.shape[1] # number of things in Y that are being transitioned "to" # reshaping and rounding - Y_obs = (Y.T * n).round() + Y_obs = Y_obs.round() X_extended = np.expand_dims(X, axis=2) X_extended = np.repeat(X_extended, num_cols, axis=2) @@ -85,7 +85,7 @@ def fit_predict(self, X, Y, weights=None): "result_fractions", n=n, p=theta, - observed=Y_obs.T, + observed=Y_obs, shape=(num_units, num_cols), ) try: @@ -105,9 +105,9 @@ def fit_predict(self, X, Y, weights=None): b_values = np.transpose( model_trace["posterior"]["beta"].stack(all_draws=["chain", "draw"]).values, axes=(3, 0, 1, 2) ) - samples_converted = np.transpose(b_values, axes=(3, 0, 1, 2)) * X.values + samples_converted = np.transpose(b_values, axes=(3, 0, 1, 2)) * X samples_summed_across = samples_converted.sum(axis=2) - self._sampled = np.transpose(samples_summed_across / X.sum(axis=0).values, axes=(1, 2, 0)) + self._sampled = np.transpose(samples_summed_across / X.sum(axis=0), axes=(1, 2, 0)) posterior_mean_rxc = self._sampled.mean(axis=0) transitions = self._get_transitions(posterior_mean_rxc) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index bb69285b..60ad6b01 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -70,8 +70,8 @@ def fit_predict(self, X, Y, weights=None): self._X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() self._Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() - self._X = self._rescale(X.T).T - self._Y = self._rescale(Y.T).T + self._X = self._rescale(X) + self._Y = self._rescale(Y) weights = self._check_and_prepare_weights(self._X, self._Y, weights) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 16414963..cc6d427a 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -68,24 +68,15 @@ def _check_for_zero_units(self, A: np.ndarray): def _rescale(self, A: np.ndarray): """ - Rescale columns (things) to ensure they sum to 1 (100%). + Rescale rows (units) to ensure they sum to 1 (100%). """ A = A.copy().astype(float) - if isinstance(A, np.ndarray): - with warnings.catch_warnings(): - # Zeros are completely ok here; - # means the thing (e.g. candidate) received zero votes. - warnings.filterwarnings( - "ignore", category=RuntimeWarning, message="invalid value encountered in divide" - ) - for j in range(0, A.shape[1]): - A[:, j] = A[:, j] / A[:, j].sum() - return np.nan_to_num(A, nan=0, posinf=0, neginf=0) - # pandas.DataFrame() - for col in A.columns: - A[col] /= A[col].sum() - return A.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in divide") + A = (A.T / A.sum(axis=1)).T + + return np.nan_to_num(A, nan=0, posinf=0, neginf=0) def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None): if weights is not None: From b453261e40404cb5935e88dafe974ce08ce2e91a Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 11 Dec 2023 09:47:20 -0500 Subject: [PATCH 057/135] Removing some redundancy in the EI solver --- src/elexsolver/EITransitionSolver.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index c32203f5..c769adf0 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -64,16 +64,12 @@ def fit_predict(self, X, Y, weights=None): Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() n = Y.sum(axis=1) - X = self._rescale(X) - Y_obs = Y.copy() - Y = self._rescale(Y) - num_units = len(n) # should be the same as the number of units in Y num_rows = X.shape[1] # number of things in X that are being transitioned "from" num_cols = Y.shape[1] # number of things in Y that are being transitioned "to" - # reshaping and rounding - Y_obs = Y_obs.round() + # rescaling and reshaping + X = self._rescale(X) X_extended = np.expand_dims(X, axis=2) X_extended = np.repeat(X_extended, num_cols, axis=2) @@ -85,7 +81,7 @@ def fit_predict(self, X, Y, weights=None): "result_fractions", n=n, p=theta, - observed=Y_obs, + observed=Y, shape=(num_units, num_cols), ) try: From 1ed57ae28d612de6a923a4101f1a96a4944f2ecb Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 11 Dec 2023 10:26:39 -0500 Subject: [PATCH 058/135] Fixing mistake in preparing the weights --- src/elexsolver/TransitionSolver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index cc6d427a..fdbb0585 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -87,7 +87,6 @@ def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.n elif not isinstance(weights, np.ndarray): # pandas.Series weights = weights.values.copy() - else: - weights = np.ones((Y.shape[0],)) + return np.diag(np.sqrt(weights.flatten() / weights.sum())) - return np.diag(np.sqrt(weights.flatten() / weights.sum())) + return np.diag(np.ones((Y.shape[0],))) From 5088ee83db5d574cb19093cc361027ddcf848b9d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 11 Dec 2023 13:01:28 -0500 Subject: [PATCH 059/135] Finished EI solver with weights (I think) --- src/elexsolver/EITransitionSolver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index c769adf0..2210a6f7 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -63,6 +63,7 @@ def fit_predict(self, X, Y, weights=None): self._X_totals = X.sum(axis=0) / X.sum(axis=0).sum() Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() n = Y.sum(axis=1) + weights = self._check_and_prepare_weights(X, Y, weights) num_units = len(n) # should be the same as the number of units in Y num_rows = X.shape[1] # number of things in X that are being transitioned "from" @@ -70,13 +71,13 @@ def fit_predict(self, X, Y, weights=None): # rescaling and reshaping X = self._rescale(X) - X_extended = np.expand_dims(X, axis=2) + X_extended = np.expand_dims(np.dot(weights, X), axis=2) X_extended = np.repeat(X_extended, num_cols, axis=2) with pm.Model(check_bounds=False) as model: conc_params = pm.HalfNormal("conc_params", sigma=self._sigma, shape=(num_rows, num_cols)) beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) - theta = (X_extended * beta).sum(axis=1) + theta = pm.math.dot(weights, (X_extended * beta).sum(axis=1)) pm.Multinomial( "result_fractions", n=n, From 5af483560ced7a974c2d0fd6e950cebdfe0c5cf4 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 11 Dec 2023 14:56:54 -0500 Subject: [PATCH 060/135] Switching to 'classic' bootstrap matrix solver which seems to produce more interesting results... --- src/elexsolver/TransitionMatrixSolver.py | 93 +++++++++++------------- 1 file changed, 42 insertions(+), 51 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 60ad6b01..1dcecc35 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -16,21 +16,13 @@ def __init__(self, strict=True): super().__init__() self._strict = strict - # class members that are instantiated during model-fit - # for bootstrapping - self._residuals = None - self._X = None - self._Y = None - self._X_expected_totals = None - self._Y_expected_totals = None - @staticmethod def __get_constraint(coef, strict): if strict: return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] - def _solve(self, A, B, weights): + def __solve(self, A, B, weights): transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) Aw = np.dot(weights, A) Bw = np.dot(weights, B) @@ -57,6 +49,9 @@ def fit_predict(self, X, Y, weights=None): if Y.shape[1] > Y.shape[0]: Y = Y.T + if X.shape[0] != Y.shape[0]: + raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") + self._check_dimensions(X) self._check_dimensions(Y) self._check_for_zero_units(X) @@ -67,20 +62,19 @@ def fit_predict(self, X, Y, weights=None): if not isinstance(Y, np.ndarray): Y = Y.to_numpy() - self._X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() - self._Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() + X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() + Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() - self._X = self._rescale(X) - self._Y = self._rescale(Y) + X = self._rescale(X) + Y = self._rescale(Y) - weights = self._check_and_prepare_weights(self._X, self._Y, weights) + weights = self._check_and_prepare_weights(X, Y, weights) - transition_matrix = self._solve(self._X, self._Y, weights) - transitions = np.diag(self._X_expected_totals) @ transition_matrix + transition_matrix = self.__solve(X, Y, weights) + transitions = np.diag(X_expected_totals) @ transition_matrix Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() - self._mae = mean_absolute_error(self._Y_expected_totals, Y_pred_totals) + self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) LOG.info("MAE = %s", np.around(self._mae, 4)) - self._residuals = Y_pred_totals - self._Y_expected_totals return transitions @@ -91,42 +85,39 @@ def __init__(self, B=1000, strict=True): self._strict = strict self._B = B - def _constrained_random_numbers(self, n, M, seed=None): - """ - Generate n random numbers that sum to M. - Based on: https://stackoverflow.com/a/30659457/224912 - """ - rng = np.random.default_rng(seed=seed) - splits = [0] + [rng.random() for _ in range(0, n - 1)] + [1] - splits.sort() - diffs = [x - splits[i - 1] for (i, x) in enumerate(splits)][1:] - result = list(map(lambda x: x * M, diffs)) - rng.shuffle(result) - return result - - def fit_predict(self, X, Y): - tm = TransitionMatrixSolver(strict=self._strict) - transitions = tm.fit_predict(X, Y) - - from sklearn.utils import resample # to be replaced - + def fit_predict(self, X, Y, weights=None): maes = [] + predicted_transitions = [] + + # assuming pandas.DataFrame + if not isinstance(X, np.ndarray): + X = X.to_numpy() + if not isinstance(Y, np.ndarray): + Y = Y.to_numpy() - for b in range(0, self._B): - residuals_hat = resample(tm._residuals, replace=True, random_state=b) - Y_hat = tm._Y.copy() - for j in range(0, Y_hat.shape[1]): - residuals_j = self._constrained_random_numbers(len(Y_hat), residuals_hat[j], seed=j) - Y_hat[:, j] = Y_hat[:, j] + residuals_j + tm = TransitionMatrixSolver(strict=self._strict) + predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) + maes.append(tm.MAE) - transition_matrix_hat = tm._solve(tm._X, Y_hat) - transitions_hat = np.diag(tm._X_expected_totals) @ transition_matrix_hat - transitions = transitions + transitions_hat + from sklearn.utils import resample # to be replaced - Y_pred_totals = np.sum(transitions_hat, axis=0) / np.sum(transitions_hat, axis=0).sum() - this_mae = mean_absolute_error(tm._Y_expected_totals, Y_pred_totals) - maes.append(this_mae) - LOG.info("MAE = %s", np.around(this_mae, 4)) + for b in range(0, self._B - 1): + X_resampled = [] + Y_resampled = [] + weights_resampled = [] + for i in resample(range(0, len(X)), replace=True, random_state=b): + X_resampled.append(X[i]) + Y_resampled.append(Y[i]) + if weights is not None: + weights_resampled.append(weights[i]) + if weights is None: + weights_resampled = None + else: + weights_resampled = np.array(weights_resampled) + predicted_transitions.append( + tm.fit_predict(np.array(X_resampled), np.array(Y_resampled), weights=weights_resampled) + ) + maes.append(tm.MAE) self._mae = np.mean(maes) - return transitions / self._B + return np.mean(predicted_transitions, axis=0) From 871bf11ea10a55d2155007f56d85f78b1b907cc8 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 11 Dec 2023 16:44:35 -0500 Subject: [PATCH 061/135] Fix typing with weights in bootstrap matrix solver --- src/elexsolver/TransitionMatrixSolver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 1dcecc35..ff9b833d 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -94,6 +94,9 @@ def fit_predict(self, X, Y, weights=None): X = X.to_numpy() if not isinstance(Y, np.ndarray): Y = Y.to_numpy() + # assuming pandas.Series + if weights is not None and not isinstance(weights, np.ndarray): + weights = weights.values tm = TransitionMatrixSolver(strict=self._strict) predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) From 2f9c75bdd633d184d7f7af0ac8a76c848e8eae61 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 12 Dec 2023 11:11:26 -0500 Subject: [PATCH 062/135] Improving some of the logging generated by the matrix solvers --- setup.py | 2 +- src/elexsolver/TransitionMatrixSolver.py | 46 +++++++++++++----------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 7b1a4189..762371e5 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.25", "numpyro~=0.13", "pymc~=5.9", "scipy~=1.11"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.25", "numpyro~=0.13", "pymc~=5.9", "scipy~=1.11", "tqdm~=4.66"] THIS_FILE_DIR = os.path.dirname(__file__) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index ff9b833d..6ccc0e15 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -2,6 +2,8 @@ import cvxpy as cp import numpy as np +from tqdm import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm from elexsolver.logging import initialize_logging from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error @@ -12,9 +14,10 @@ class TransitionMatrixSolver(TransitionSolver): - def __init__(self, strict=True): + def __init__(self, strict=True, verbose=True): super().__init__() self._strict = strict + self._verbose = verbose @staticmethod def __get_constraint(coef, strict): @@ -74,7 +77,8 @@ def fit_predict(self, X, Y, weights=None): transitions = np.diag(X_expected_totals) @ transition_matrix Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) - LOG.info("MAE = %s", np.around(self._mae, 4)) + if self._verbose: + LOG.info("MAE = %s", np.around(self._mae, 4)) return transitions @@ -98,29 +102,31 @@ def fit_predict(self, X, Y, weights=None): if weights is not None and not isinstance(weights, np.ndarray): weights = weights.values - tm = TransitionMatrixSolver(strict=self._strict) + tm = TransitionMatrixSolver(strict=self._strict, verbose=False) predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) maes.append(tm.MAE) from sklearn.utils import resample # to be replaced - for b in range(0, self._B - 1): - X_resampled = [] - Y_resampled = [] - weights_resampled = [] - for i in resample(range(0, len(X)), replace=True, random_state=b): - X_resampled.append(X[i]) - Y_resampled.append(Y[i]) - if weights is not None: - weights_resampled.append(weights[i]) - if weights is None: - weights_resampled = None - else: - weights_resampled = np.array(weights_resampled) - predicted_transitions.append( - tm.fit_predict(np.array(X_resampled), np.array(Y_resampled), weights=weights_resampled) - ) - maes.append(tm.MAE) + with logging_redirect_tqdm(loggers=[LOG]): + for b in tqdm(range(0, self._B - 1), desc="Bootstrapping"): + X_resampled = [] + Y_resampled = [] + weights_resampled = [] + for i in resample(range(0, len(X)), replace=True, random_state=b): + X_resampled.append(X[i]) + Y_resampled.append(Y[i]) + if weights is not None: + weights_resampled.append(weights[i]) + if weights is None: + weights_resampled = None + else: + weights_resampled = np.array(weights_resampled) + predicted_transitions.append( + tm.fit_predict(np.array(X_resampled), np.array(Y_resampled), weights=weights_resampled) + ) + maes.append(tm.MAE) self._mae = np.mean(maes) + LOG.info("MAE = %s", np.around(self._mae, 4)) return np.mean(predicted_transitions, axis=0) From 833cb828cb23fee98d4a87144f121af7095eb77f Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 12 Dec 2023 12:26:42 -0500 Subject: [PATCH 063/135] Trying out some error handling --- src/elexsolver/TransitionMatrixSolver.py | 56 ++++++++++++++---------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 6ccc0e15..f11ea056 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -1,9 +1,9 @@ import logging +import warnings import cvxpy as cp import numpy as np from tqdm import tqdm -from tqdm.contrib.logging import logging_redirect_tqdm from elexsolver.logging import initialize_logging from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error @@ -33,7 +33,15 @@ def __solve(self, A, B, weights): objective = cp.Minimize(loss_function) constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) problem = cp.Problem(objective, constraint) - problem.solve(solver=cp.CLARABEL) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + try: + problem.solve(solver=cp.CLARABEL) + except (UserWarning, cp.error.SolverError) as e: + LOG.error(e) + return np.zeros((A.shape[1], B.shape[1])) + return transition_matrix.value def fit_predict(self, X, Y, weights=None): @@ -75,8 +83,13 @@ def fit_predict(self, X, Y, weights=None): transition_matrix = self.__solve(X, Y, weights) transitions = np.diag(X_expected_totals) @ transition_matrix - Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() - self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) + + if np.sum(transitions, axis=0).sum() != 0: + Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() + self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) + else: + # would have logged an error above + self._mae = 1 if self._verbose: LOG.info("MAE = %s", np.around(self._mae, 4)) @@ -108,24 +121,23 @@ def fit_predict(self, X, Y, weights=None): from sklearn.utils import resample # to be replaced - with logging_redirect_tqdm(loggers=[LOG]): - for b in tqdm(range(0, self._B - 1), desc="Bootstrapping"): - X_resampled = [] - Y_resampled = [] - weights_resampled = [] - for i in resample(range(0, len(X)), replace=True, random_state=b): - X_resampled.append(X[i]) - Y_resampled.append(Y[i]) - if weights is not None: - weights_resampled.append(weights[i]) - if weights is None: - weights_resampled = None - else: - weights_resampled = np.array(weights_resampled) - predicted_transitions.append( - tm.fit_predict(np.array(X_resampled), np.array(Y_resampled), weights=weights_resampled) - ) - maes.append(tm.MAE) + for b in tqdm(range(0, self._B - 1), desc="Bootstrapping"): + X_resampled = [] + Y_resampled = [] + weights_resampled = [] + for i in resample(range(0, len(X)), replace=True, random_state=b): + X_resampled.append(X[i]) + Y_resampled.append(Y[i]) + if weights is not None: + weights_resampled.append(weights[i]) + if weights is None: + weights_resampled = None + else: + weights_resampled = np.array(weights_resampled) + predicted_transitions.append( + tm.fit_predict(np.array(X_resampled), np.array(Y_resampled), weights=weights_resampled) + ) + maes.append(tm.MAE) self._mae = np.mean(maes) LOG.info("MAE = %s", np.around(self._mae, 4)) From 211e5b51ce88d36fc5de35c4ebdb2205f0874217 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 12 Dec 2023 16:29:00 -0500 Subject: [PATCH 064/135] Use the weights in the bootstrap to draw a weighted sample --- src/elexsolver/TransitionMatrixSolver.py | 27 +++++++----------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index f11ea056..6bd23aca 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -111,32 +111,21 @@ def fit_predict(self, X, Y, weights=None): X = X.to_numpy() if not isinstance(Y, np.ndarray): Y = Y.to_numpy() - # assuming pandas.Series - if weights is not None and not isinstance(weights, np.ndarray): - weights = weights.values tm = TransitionMatrixSolver(strict=self._strict, verbose=False) predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) maes.append(tm.MAE) - from sklearn.utils import resample # to be replaced - for b in tqdm(range(0, self._B - 1), desc="Bootstrapping"): - X_resampled = [] - Y_resampled = [] - weights_resampled = [] - for i in resample(range(0, len(X)), replace=True, random_state=b): - X_resampled.append(X[i]) - Y_resampled.append(Y[i]) - if weights is not None: - weights_resampled.append(weights[i]) - if weights is None: - weights_resampled = None - else: - weights_resampled = np.array(weights_resampled) - predicted_transitions.append( - tm.fit_predict(np.array(X_resampled), np.array(Y_resampled), weights=weights_resampled) + rng = np.random.default_rng(seed=b) + X_resampled = rng.choice( + X, len(X), replace=True, axis=0, p=(weights / weights.sum() if weights is not None else None) ) + Y_resampled = [] + for x in X_resampled: + index = np.where((X == x).all(axis=1))[0][0] + Y_resampled.append(Y[index]) + predicted_transitions.append(tm.fit_predict(X_resampled, np.array(Y_resampled), weights=None)) maes.append(tm.MAE) self._mae = np.mean(maes) From 518a6f83e3544652277f8583ec3c87d8ff72d8ca Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 13 Dec 2023 09:40:53 -0500 Subject: [PATCH 065/135] Speeding up the bootstrap matrix solver a bit --- src/elexsolver/TransitionMatrixSolver.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 6bd23aca..885c7e13 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -121,11 +121,9 @@ def fit_predict(self, X, Y, weights=None): X_resampled = rng.choice( X, len(X), replace=True, axis=0, p=(weights / weights.sum() if weights is not None else None) ) - Y_resampled = [] - for x in X_resampled: - index = np.where((X == x).all(axis=1))[0][0] - Y_resampled.append(Y[index]) - predicted_transitions.append(tm.fit_predict(X_resampled, np.array(Y_resampled), weights=None)) + indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] + Y_resampled = Y[indices] + predicted_transitions.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) maes.append(tm.MAE) self._mae = np.mean(maes) From b2232d4302346f0e9bfd6938e7040c157980ebbd Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 14 Dec 2023 16:17:34 -0500 Subject: [PATCH 066/135] Removing weighting from EI solver for now since it's SUPER slow and probably not implemented right; also making the prediction/credible interval distinction clear --- src/elexsolver/EITransitionSolver.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 2210a6f7..42984123 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -40,6 +40,7 @@ def __init__(self, sigma=1, sampling_chains=2, random_seed=None, n_samples=300): def fit_predict(self, X, Y, weights=None): """ X and Y are matrixes of integers. + NOTE: weighting is not currently implemented. """ self._check_data_type(X) self._check_data_type(Y) @@ -63,7 +64,6 @@ def fit_predict(self, X, Y, weights=None): self._X_totals = X.sum(axis=0) / X.sum(axis=0).sum() Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() n = Y.sum(axis=1) - weights = self._check_and_prepare_weights(X, Y, weights) num_units = len(n) # should be the same as the number of units in Y num_rows = X.shape[1] # number of things in X that are being transitioned "from" @@ -71,13 +71,13 @@ def fit_predict(self, X, Y, weights=None): # rescaling and reshaping X = self._rescale(X) - X_extended = np.expand_dims(np.dot(weights, X), axis=2) + X_extended = np.expand_dims(X, axis=2) X_extended = np.repeat(X_extended, num_cols, axis=2) with pm.Model(check_bounds=False) as model: conc_params = pm.HalfNormal("conc_params", sigma=self._sigma, shape=(num_rows, num_cols)) beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) - theta = pm.math.dot(weights, (X_extended * beta).sum(axis=1)) + theta = (X_extended * beta).sum(axis=1) pm.Multinomial( "result_fractions", n=n, @@ -120,17 +120,14 @@ def _get_transitions(self, A: np.ndarray): transitions.append(col * self._X_totals) return np.array(transitions).T - def get_prediction_interval(self, pi): - """ - Note: this is actually a credible interval, not a prediction interval. - """ - if pi <= 1: - pi = pi * 100 - if pi < 0 or pi > 100: - raise ValueError(f"Invalid prediction interval {pi}.") + def get_credible_interval(self, ci): + if ci <= 1: + ci = ci * 100 + if ci < 0 or ci > 100: + raise ValueError(f"Invalid prediction interval {ci}.") - lower = (100 - pi) / 2 - upper = pi + lower + lower = (100 - ci) / 2 + upper = ci + lower A_dict = { lower: np.zeros((self._sampled.shape[1], self._sampled.shape[2])), upper: np.zeros((self._sampled.shape[1], self._sampled.shape[2])), From a77409c15725eec99b9ac239399c80c6d221a33d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 15 Dec 2023 13:10:49 -0500 Subject: [PATCH 067/135] Correcting comment prediction => credible --- src/elexsolver/EITransitionSolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 42984123..92277efa 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -124,7 +124,7 @@ def get_credible_interval(self, ci): if ci <= 1: ci = ci * 100 if ci < 0 or ci > 100: - raise ValueError(f"Invalid prediction interval {ci}.") + raise ValueError(f"Invalid credible interval {ci}.") lower = (100 - ci) / 2 upper = ci + lower From 5559e228c481dc35221f367ad9473d69e8222ed5 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 15 Dec 2023 13:15:23 -0500 Subject: [PATCH 068/135] Adding method for confidence interval to bootstrap solver --- src/elexsolver/TransitionMatrixSolver.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 885c7e13..0ad9bb8b 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -102,9 +102,12 @@ def __init__(self, B=1000, strict=True): self._strict = strict self._B = B + # class members that are instantiated during model-fit + self._predicted_transitions = None + def fit_predict(self, X, Y, weights=None): maes = [] - predicted_transitions = [] + self._predicted_transitions = [] # assuming pandas.DataFrame if not isinstance(X, np.ndarray): @@ -113,7 +116,7 @@ def fit_predict(self, X, Y, weights=None): Y = Y.to_numpy() tm = TransitionMatrixSolver(strict=self._strict, verbose=False) - predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) + self._predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) maes.append(tm.MAE) for b in tqdm(range(0, self._B - 1), desc="Bootstrapping"): @@ -123,9 +126,22 @@ def fit_predict(self, X, Y, weights=None): ) indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] Y_resampled = Y[indices] - predicted_transitions.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) + self._predicted_transitions.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) maes.append(tm.MAE) self._mae = np.mean(maes) LOG.info("MAE = %s", np.around(self._mae, 4)) - return np.mean(predicted_transitions, axis=0) + return np.mean(self._predicted_transitions, axis=0) + + def get_confidence_interval(self, alpha): + if alpha > 1: + alpha = alpha / 100 + if alpha < 0 or alpha >= 1: + raise ValueError(f"Invalid confidence interval {alpha}.") + + p_lower = ((1.0 - alpha) / 2.0) * 100 + p_upper = ((1.0 + alpha) / 2.0) * 100 + return ( + np.percentile(self._predicted_transitions, p_lower, axis=0), + np.percentile(self._predicted_transitions, p_upper, axis=0), + ) From 11d23bc1510f445c4480cfc79ce00a5e46d46df5 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 15 Dec 2023 14:33:39 -0500 Subject: [PATCH 069/135] Adding unit tests for the weights standardization/checking --- tests/test_transition_solver.py | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 3d90fd13..9f7b6d45 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -112,7 +112,7 @@ def test_rescale_rescaled_pandas(): a_df = pandas.DataFrame(np.ones((2, 2)), columns=["A", "B"]).astype(int) expected_df = pandas.DataFrame([[0.5, 0.5], [0.5, 0.5]], columns=["A", "B"]) ts = TransitionSolver() - np.testing.assert_array_equal(ts._rescale(a_df), expected_df) # pylint: disable=protected-access + np.testing.assert_array_equal(expected_df, ts._rescale(a_df)) # pylint: disable=protected-access @patch.object(TransitionSolver, "__abstractmethods__", set()) @@ -128,3 +128,36 @@ def test_check_data_type_bad(): A = np.array([[0.1, 0.2, 0.3]]) ts = TransitionSolver() ts._check_data_type(A) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_bad(): + with pytest.raises(ValueError): + weights = [1, 2] + A = np.array([[1, 2], [3, 4], [5, 6]]) + B = A.copy() + ts = TransitionSolver() + ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_none(): + A = np.array([[1, 2], [3, 4], [5, 6]]) + B = A.copy() + expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, None) # pylint: disable=protected-access + np.testing.assert_array_equal(expected, current) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_with_weights(): + A = np.array([[1, 2, 3], [4, 5, 6]]) + B = A.copy() + weights = np.array([0.6, 0.4]) + expected = np.array([[0.77459667, 0], [0, 0.63245553]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + np.testing.assert_allclose(expected, current) From f661ce8123fa6415aa3db52bbc910ca9a0011288 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 15 Dec 2023 15:04:52 -0500 Subject: [PATCH 070/135] Adding option to hide the progress bar during bootstrapping --- src/elexsolver/TransitionMatrixSolver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 0ad9bb8b..5ca6d0f3 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -97,10 +97,11 @@ def fit_predict(self, X, Y, weights=None): class BootstrapTransitionMatrixSolver(TransitionSolver): - def __init__(self, B=1000, strict=True): + def __init__(self, B=1000, strict=True, verbose=True): super().__init__() self._strict = strict self._B = B + self._verbose = verbose # class members that are instantiated during model-fit self._predicted_transitions = None @@ -119,7 +120,7 @@ def fit_predict(self, X, Y, weights=None): self._predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) maes.append(tm.MAE) - for b in tqdm(range(0, self._B - 1), desc="Bootstrapping"): + for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not (self._verbose)): rng = np.random.default_rng(seed=b) X_resampled = rng.choice( X, len(X), replace=True, axis=0, p=(weights / weights.sum() if weights is not None else None) From 9e481348aaae88447982fd91035495221c8b3f0c Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 15 Dec 2023 15:28:51 -0500 Subject: [PATCH 071/135] Removing extraneous parentheses from bootstrap solver --- src/elexsolver/TransitionMatrixSolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 5ca6d0f3..af8051ff 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -120,7 +120,7 @@ def fit_predict(self, X, Y, weights=None): self._predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) maes.append(tm.MAE) - for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not (self._verbose)): + for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not self._verbose): rng = np.random.default_rng(seed=b) X_resampled = rng.choice( X, len(X), replace=True, axis=0, p=(weights / weights.sum() if weights is not None else None) From 8bda934c9381aebd100c8e3f6603020811c7a26d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 15 Dec 2023 15:50:53 -0500 Subject: [PATCH 072/135] Don't fail the one unit test requiring pandas if the user doesn't have pandas --- tests/test_transition_solver.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 9f7b6d45..468efa67 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -107,12 +107,16 @@ def test_rescale_rescaled_numpy(): @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_rescale_rescaled_pandas(): - import pandas + try: + import pandas # pylint: disable=import-outside-toplevel - a_df = pandas.DataFrame(np.ones((2, 2)), columns=["A", "B"]).astype(int) - expected_df = pandas.DataFrame([[0.5, 0.5], [0.5, 0.5]], columns=["A", "B"]) - ts = TransitionSolver() - np.testing.assert_array_equal(expected_df, ts._rescale(a_df)) # pylint: disable=protected-access + a_df = pandas.DataFrame(np.ones((2, 2)), columns=["A", "B"]).astype(int) + expected_df = pandas.DataFrame([[0.5, 0.5], [0.5, 0.5]], columns=["A", "B"]) + ts = TransitionSolver() + np.testing.assert_array_equal(expected_df, ts._rescale(a_df)) # pylint: disable=protected-access + except ImportError: + # pass this test through since pandas isn't a requirement for elex-solver + assert True @patch.object(TransitionSolver, "__abstractmethods__", set()) From c0efe08bf6ca22ac87c5eece31e253d5e25dc369 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 15 Dec 2023 15:57:39 -0500 Subject: [PATCH 073/135] Adding unit tests for the bootstrap solver --- tests/test_transition_matrix_solver.py | 103 ++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 4 deletions(-) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index f4a36285..4ea2908f 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -1,10 +1,10 @@ import numpy as np import pytest -from elexsolver.TransitionMatrixSolver import TransitionMatrixSolver +from elexsolver.TransitionMatrixSolver import BootstrapTransitionMatrixSolver, TransitionMatrixSolver -def test_fit_predict(): +def test_matrix_fit_predict(): X = np.array( [ [1, 2], @@ -34,7 +34,7 @@ def test_fit_predict(): np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) -def test_fit_predict_with_weights(): +def test_matrix_fit_predict_with_weights(): X = np.array( [ [1, 2], @@ -66,7 +66,102 @@ def test_fit_predict_with_weights(): np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) -def test_get_prediction_interval(): +def test_matrix_get_prediction_interval(): with pytest.raises(NotImplementedError): tms = TransitionMatrixSolver() tms.get_prediction_interval(0) + + +def test_bootstrap_fit_predict(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected = np.array([[0.374623, 0.087791], [0.093755, 0.44383]]) + + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + current = btms.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) + + +def test_bootstrap_fit_predict_with_weights(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) + + expected = np.array([[0.319791, 0.112347], [0.130296, 0.437565]]) + + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + current = btms.fit_predict(X, Y, weights=weights) + np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) + + +def test_bootstrap_confidence_interval(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected_lower = np.array([[0.34326, 0.045649], [0.047865, 0.418057]]) + expected_upper = np.array([[0.429978, 0.112171], [0.119081, 0.477393]]) + + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + _ = btms.fit_predict(X, Y) + (current_lower, current_upper) = btms.get_confidence_interval(0.95) + np.testing.assert_allclose(expected_lower, current_lower, rtol=1e-08, atol=1e-02) + np.testing.assert_allclose(expected_upper, current_upper, rtol=1e-08, atol=1e-02) From cec6a3c94c53e4e3c2ab2ef917e553fd10cc9530 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 18 Dec 2023 16:05:24 -0500 Subject: [PATCH 074/135] Changing constraint to constraints --- src/elexsolver/TransitionMatrixSolver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index af8051ff..abe5e459 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -20,7 +20,7 @@ def __init__(self, strict=True, verbose=True): self._verbose = verbose @staticmethod - def __get_constraint(coef, strict): + def __get_constraints(coef, strict): if strict: return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] @@ -31,8 +31,8 @@ def __solve(self, A, B, weights): Bw = np.dot(weights, B) loss_function = cp.norm(Aw @ transition_matrix - Bw, "fro") objective = cp.Minimize(loss_function) - constraint = TransitionMatrixSolver.__get_constraint(transition_matrix, self._strict) - problem = cp.Problem(objective, constraint) + constraints = TransitionMatrixSolver.__get_constraints(transition_matrix, self._strict) + problem = cp.Problem(objective, constraints) with warnings.catch_warnings(): warnings.simplefilter("error") From 260446f611ea8519a7d0bfe35cca2352e5e95589 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 19 Dec 2023 10:46:15 -0500 Subject: [PATCH 075/135] Adding L2 regularization option to matrix solver --- src/elexsolver/TransitionMatrixSolver.py | 25 +++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index abe5e459..d5886d34 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -14,10 +14,14 @@ class TransitionMatrixSolver(TransitionSolver): - def __init__(self, strict=True, verbose=True): + def __init__(self, strict=True, verbose=True, lam=None): + """ + `lam` > 0 will enable L2 regularization (Ridge). + """ super().__init__() self._strict = strict self._verbose = verbose + self._lambda = lam @staticmethod def __get_constraints(coef, strict): @@ -25,12 +29,27 @@ def __get_constraints(coef, strict): return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] + def __standard_objective(self, A, B, beta): + loss_function = cp.norm(A @ beta - B, "fro") + return cp.Minimize(loss_function) + + def __ridge_objective(self, A, B, beta): + # Based on https://www.cvxpy.org/examples/machine_learning/ridge_regression.html + lam = cp.Parameter(nonneg=True, value=self._lambda) + loss_function = cp.pnorm(A @ beta - B, p=2) ** 2 + regularizer = cp.pnorm(beta, p=2) ** 2 + return cp.Minimize(loss_function + lam * regularizer) + def __solve(self, A, B, weights): transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) Aw = np.dot(weights, A) Bw = np.dot(weights, B) - loss_function = cp.norm(Aw @ transition_matrix - Bw, "fro") - objective = cp.Minimize(loss_function) + + if self._lambda is None or self._lambda == 0: + objective = self.__standard_objective(Aw, Bw, transition_matrix) + else: + objective = self.__ridge_objective(Aw, Bw, transition_matrix) + constraints = TransitionMatrixSolver.__get_constraints(transition_matrix, self._strict) problem = cp.Problem(objective, constraints) From e32fb806284450f563e8c4086e227e19681a9ddf Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 19 Dec 2023 13:28:21 -0500 Subject: [PATCH 076/135] Adding lambda argument to bootstrap solver to enable bootstrap ridge --- src/elexsolver/TransitionMatrixSolver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index d5886d34..a7726522 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -116,11 +116,12 @@ def fit_predict(self, X, Y, weights=None): class BootstrapTransitionMatrixSolver(TransitionSolver): - def __init__(self, B=1000, strict=True, verbose=True): + def __init__(self, B=1000, strict=True, verbose=True, lam=None): super().__init__() self._strict = strict self._B = B self._verbose = verbose + self._lambda = lam # class members that are instantiated during model-fit self._predicted_transitions = None @@ -135,7 +136,7 @@ def fit_predict(self, X, Y, weights=None): if not isinstance(Y, np.ndarray): Y = Y.to_numpy() - tm = TransitionMatrixSolver(strict=self._strict, verbose=False) + tm = TransitionMatrixSolver(strict=self._strict, verbose=False, lam=self._lambda) self._predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) maes.append(tm.MAE) From 20e14b663f48f77ee39bd380812c4912c77fc69d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 20 Dec 2023 14:40:56 -0500 Subject: [PATCH 077/135] Clarifying predicted percentages vs. transitions --- src/elexsolver/EITransitionSolver.py | 14 +++++++----- src/elexsolver/TransitionMatrixSolver.py | 29 ++++++++++++++---------- src/elexsolver/TransitionSolver.py | 5 ++++ tests/test_transition_matrix_solver.py | 12 +++++----- 4 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 92277efa..61fc878b 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -107,20 +107,20 @@ def fit_predict(self, X, Y, weights=None): self._sampled = np.transpose(samples_summed_across / X.sum(axis=0), axes=(1, 2, 0)) posterior_mean_rxc = self._sampled.mean(axis=0) - transitions = self._get_transitions(posterior_mean_rxc) - Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() + self._transitions = self._get_transitions(posterior_mean_rxc) + Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() self._mae = mean_absolute_error(Y_pred_totals, Y_expected_totals) LOG.info("MAE = %s", np.around(self._mae, 4)) - return transitions + return posterior_mean_rxc def _get_transitions(self, A: np.ndarray): - # to go from inferences to transitions + # to go from inferred percentages to transitions transitions = [] for col in A.T: transitions.append(col * self._X_totals) return np.array(transitions).T - def get_credible_interval(self, ci): + def get_credible_interval(self, ci, transitions=False): if ci <= 1: ci = ci * 100 if ci < 0 or ci > 100: @@ -138,4 +138,6 @@ def get_credible_interval(self, ci): for j in range(0, self._sampled.shape[2]): A_dict[ci][i][j] = np.percentile(self._sampled[:, i, j], ci) - return (self._get_transitions(A_dict[lower]), self._get_transitions(A_dict[upper])) + if transitions: + return (self._get_transitions(A_dict[lower]), self._get_transitions(A_dict[upper])) + return (A_dict[lower], A_dict[upper]) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index a7726522..bc194009 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -100,11 +100,11 @@ def fit_predict(self, X, Y, weights=None): weights = self._check_and_prepare_weights(X, Y, weights) - transition_matrix = self.__solve(X, Y, weights) - transitions = np.diag(X_expected_totals) @ transition_matrix + percentages = self.__solve(X, Y, weights) + self._transitions = np.diag(X_expected_totals) @ percentages - if np.sum(transitions, axis=0).sum() != 0: - Y_pred_totals = np.sum(transitions, axis=0) / np.sum(transitions, axis=0).sum() + if np.sum(self._transitions, axis=0).sum() != 0: + Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) else: # would have logged an error above @@ -112,7 +112,7 @@ def fit_predict(self, X, Y, weights=None): if self._verbose: LOG.info("MAE = %s", np.around(self._mae, 4)) - return transitions + return percentages class BootstrapTransitionMatrixSolver(TransitionSolver): @@ -124,11 +124,12 @@ def __init__(self, B=1000, strict=True, verbose=True, lam=None): self._lambda = lam # class members that are instantiated during model-fit - self._predicted_transitions = None + self._predicted_percentages = None def fit_predict(self, X, Y, weights=None): maes = [] - self._predicted_transitions = [] + self._predicted_percentages = [] + predicted_transitions = [] # assuming pandas.DataFrame if not isinstance(X, np.ndarray): @@ -137,8 +138,9 @@ def fit_predict(self, X, Y, weights=None): Y = Y.to_numpy() tm = TransitionMatrixSolver(strict=self._strict, verbose=False, lam=self._lambda) - self._predicted_transitions.append(tm.fit_predict(X, Y, weights=weights)) + self._predicted_percentages.append(tm.fit_predict(X, Y, weights=weights)) maes.append(tm.MAE) + predicted_transitions.append(tm.transitions) for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not self._verbose): rng = np.random.default_rng(seed=b) @@ -147,14 +149,17 @@ def fit_predict(self, X, Y, weights=None): ) indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] Y_resampled = Y[indices] - self._predicted_transitions.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) + self._predicted_percentages.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) maes.append(tm.MAE) + predicted_transitions.append(tm.transitions) self._mae = np.mean(maes) LOG.info("MAE = %s", np.around(self._mae, 4)) - return np.mean(self._predicted_transitions, axis=0) + self._transitions = np.mean(predicted_transitions, axis=0) + return np.mean(self._predicted_percentages, axis=0) def get_confidence_interval(self, alpha): + # TODO: option to get this in transition form if alpha > 1: alpha = alpha / 100 if alpha < 0 or alpha >= 1: @@ -163,6 +168,6 @@ def get_confidence_interval(self, alpha): p_lower = ((1.0 - alpha) / 2.0) * 100 p_upper = ((1.0 + alpha) / 2.0) * 100 return ( - np.percentile(self._predicted_transitions, p_lower, axis=0), - np.percentile(self._predicted_transitions, p_upper, axis=0), + np.percentile(self._predicted_percentages, p_lower, axis=0), + np.percentile(self._predicted_percentages, p_upper, axis=0), ) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index fdbb0585..f63524ba 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -29,6 +29,7 @@ class TransitionSolver(ABC): def __init__(self): self._mae = None + self._transitions = None def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): raise NotImplementedError @@ -36,6 +37,10 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = def get_prediction_interval(self, pi: float): raise NotImplementedError + @property + def transitions(self): + return self._transitions + @property def MAE(self): return self._mae diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 4ea2908f..62129412 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -27,7 +27,7 @@ def test_matrix_fit_predict(): ] ) - expected = np.array([[0.35096678, 0.11057168], [0.11665334, 0.4218082]]) + expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) tms = TransitionMatrixSolver() current = tms.fit_predict(X, Y) @@ -59,7 +59,7 @@ def test_matrix_fit_predict_with_weights(): weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - expected = np.array([[0.340306, 0.121233], [0.124163, 0.414298]]) + expected = np.array([[0.737329, 0.262671], [0.230589, 0.769411]]) tms = TransitionMatrixSolver() current = tms.fit_predict(X, Y, weights=weights) @@ -95,7 +95,7 @@ def test_bootstrap_fit_predict(): ] ) - expected = np.array([[0.374623, 0.087791], [0.093755, 0.44383]]) + expected = np.array([[0.809393, 0.190607], [0.173843, 0.826157]]) btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) current = btms.fit_predict(X, Y) @@ -127,7 +127,7 @@ def test_bootstrap_fit_predict_with_weights(): weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - expected = np.array([[0.319791, 0.112347], [0.130296, 0.437565]]) + expected = np.array([[0.739798, 0.260202], [0.229358, 0.770642]]) btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) current = btms.fit_predict(X, Y, weights=weights) @@ -157,8 +157,8 @@ def test_bootstrap_confidence_interval(): ] ) - expected_lower = np.array([[0.34326, 0.045649], [0.047865, 0.418057]]) - expected_upper = np.array([[0.429978, 0.112171], [0.119081, 0.477393]]) + expected_lower = np.array([[0.757573, 0.095978], [0.09128, 0.779471]]) + expected_upper = np.array([[0.904022, 0.242427], [0.220529, 0.90872]]) btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) _ = btms.fit_predict(X, Y) From 0905e66c5bc7591ef4a0a85fee1b3de609b0e5da Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 26 Dec 2023 10:14:11 -0500 Subject: [PATCH 078/135] Adding function for WAPE --- src/elexsolver/TransitionSolver.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index f63524ba..6a3b7de4 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -22,6 +22,17 @@ def mean_absolute_error(Y_expected: np.ndarray, Y_pred: np.ndarray): return error_sum / len(absolute_errors) +def weighted_absolute_percentage_error(Y_expected: np.ndarray, Y_pred: np.ndarray): + if isinstance(Y_expected, list): + Y_expected = np.array(Y_expected) + if isinstance(Y_pred, list): + Y_pred = np.array(Y_pred) + + absolute_errors = np.abs(Y_expected - Y_pred) + error_sum = np.sum(absolute_errors) + return error_sum / np.sum(Y_expected) + + class TransitionSolver(ABC): """ Abstract class for (voter) transition solvers. From 5f7efcbffa9e03cdf95926997d8a19b40e60dcc8 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 26 Dec 2023 14:29:52 -0500 Subject: [PATCH 079/135] Converting the model-fit score to WAPE, too --- src/elexsolver/EITransitionSolver.py | 6 +++--- src/elexsolver/TransitionMatrixSolver.py | 18 +++++++++--------- src/elexsolver/TransitionSolver.py | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 61fc878b..dfb0d5f8 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -4,7 +4,7 @@ import pymc as pm from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error +from elexsolver.TransitionSolver import TransitionSolver, weighted_absolute_percentage_error initialize_logging() @@ -109,8 +109,8 @@ def fit_predict(self, X, Y, weights=None): posterior_mean_rxc = self._sampled.mean(axis=0) self._transitions = self._get_transitions(posterior_mean_rxc) Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() - self._mae = mean_absolute_error(Y_pred_totals, Y_expected_totals) - LOG.info("MAE = %s", np.around(self._mae, 4)) + self._wape = weighted_absolute_percentage_error(Y_pred_totals, Y_expected_totals) + LOG.info("WAPE = %s", np.around(self._wape, 4)) return posterior_mean_rxc def _get_transitions(self, A: np.ndarray): diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index bc194009..5f91b7e2 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -6,7 +6,7 @@ from tqdm import tqdm from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error +from elexsolver.TransitionSolver import TransitionSolver, weighted_absolute_percentage_error initialize_logging() @@ -105,12 +105,12 @@ def fit_predict(self, X, Y, weights=None): if np.sum(self._transitions, axis=0).sum() != 0: Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() - self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) + self._wape = weighted_absolute_percentage_error(Y_expected_totals, Y_pred_totals) else: # would have logged an error above - self._mae = 1 + self._wape = 1 if self._verbose: - LOG.info("MAE = %s", np.around(self._mae, 4)) + LOG.info("WAPE = %s", np.around(self._wape, 4)) return percentages @@ -127,7 +127,7 @@ def __init__(self, B=1000, strict=True, verbose=True, lam=None): self._predicted_percentages = None def fit_predict(self, X, Y, weights=None): - maes = [] + wapes = [] self._predicted_percentages = [] predicted_transitions = [] @@ -139,7 +139,7 @@ def fit_predict(self, X, Y, weights=None): tm = TransitionMatrixSolver(strict=self._strict, verbose=False, lam=self._lambda) self._predicted_percentages.append(tm.fit_predict(X, Y, weights=weights)) - maes.append(tm.MAE) + wapes.append(tm.score) predicted_transitions.append(tm.transitions) for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not self._verbose): @@ -150,11 +150,11 @@ def fit_predict(self, X, Y, weights=None): indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] Y_resampled = Y[indices] self._predicted_percentages.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) - maes.append(tm.MAE) + wapes.append(tm.score) predicted_transitions.append(tm.transitions) - self._mae = np.mean(maes) - LOG.info("MAE = %s", np.around(self._mae, 4)) + self._wape = np.mean(wapes) + LOG.info("Average WAPE = %s", np.around(self._wape, 4)) self._transitions = np.mean(predicted_transitions, axis=0) return np.mean(self._predicted_percentages, axis=0) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 6a3b7de4..e6812e1b 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -39,7 +39,7 @@ class TransitionSolver(ABC): """ def __init__(self): - self._mae = None + self._wape = None self._transitions = None def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): @@ -53,8 +53,8 @@ def transitions(self): return self._transitions @property - def MAE(self): - return self._mae + def score(self): + return self._wape def _check_any_element_nan_or_inf(self, A: np.ndarray): """ From a5346deeda19ad5775da81321a4c267a98a2ab09 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 27 Dec 2023 09:40:21 -0500 Subject: [PATCH 080/135] Handle situation in calculating WAPE when the expected Y is 0 --- src/elexsolver/TransitionSolver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index e6812e1b..795652c4 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -30,6 +30,9 @@ def weighted_absolute_percentage_error(Y_expected: np.ndarray, Y_pred: np.ndarra absolute_errors = np.abs(Y_expected - Y_pred) error_sum = np.sum(absolute_errors) + + if np.sum(Y_expected) == 0: + return error_sum return error_sum / np.sum(Y_expected) From 7955e8252f193ebb55e25ac0109a2c36a545af2c Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 27 Dec 2023 11:22:25 -0500 Subject: [PATCH 081/135] Letting WAPE remain undefined when Y_expected is zero --- src/elexsolver/TransitionSolver.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 795652c4..5752379f 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -31,8 +31,6 @@ def weighted_absolute_percentage_error(Y_expected: np.ndarray, Y_pred: np.ndarra absolute_errors = np.abs(Y_expected - Y_pred) error_sum = np.sum(absolute_errors) - if np.sum(Y_expected) == 0: - return error_sum return error_sum / np.sum(Y_expected) From 27a92bef9cf4f7a8e77a5f8bb2591843f9eb1c18 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 27 Dec 2023 11:43:08 -0500 Subject: [PATCH 082/135] Switching back to MAE because WAPE is undefined when Y expected is zero --- src/elexsolver/EITransitionSolver.py | 6 +++--- src/elexsolver/TransitionMatrixSolver.py | 18 +++++++++--------- src/elexsolver/TransitionSolver.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index dfb0d5f8..61fc878b 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -4,7 +4,7 @@ import pymc as pm from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver, weighted_absolute_percentage_error +from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error initialize_logging() @@ -109,8 +109,8 @@ def fit_predict(self, X, Y, weights=None): posterior_mean_rxc = self._sampled.mean(axis=0) self._transitions = self._get_transitions(posterior_mean_rxc) Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() - self._wape = weighted_absolute_percentage_error(Y_pred_totals, Y_expected_totals) - LOG.info("WAPE = %s", np.around(self._wape, 4)) + self._mae = mean_absolute_error(Y_pred_totals, Y_expected_totals) + LOG.info("MAE = %s", np.around(self._mae, 4)) return posterior_mean_rxc def _get_transitions(self, A: np.ndarray): diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 5f91b7e2..96cb22cd 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -6,7 +6,7 @@ from tqdm import tqdm from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver, weighted_absolute_percentage_error +from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error initialize_logging() @@ -105,12 +105,12 @@ def fit_predict(self, X, Y, weights=None): if np.sum(self._transitions, axis=0).sum() != 0: Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() - self._wape = weighted_absolute_percentage_error(Y_expected_totals, Y_pred_totals) + self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) else: # would have logged an error above - self._wape = 1 + self._mae = 1 if self._verbose: - LOG.info("WAPE = %s", np.around(self._wape, 4)) + LOG.info("MAE = %s", np.around(self._mae, 4)) return percentages @@ -127,7 +127,7 @@ def __init__(self, B=1000, strict=True, verbose=True, lam=None): self._predicted_percentages = None def fit_predict(self, X, Y, weights=None): - wapes = [] + maes = [] self._predicted_percentages = [] predicted_transitions = [] @@ -139,7 +139,7 @@ def fit_predict(self, X, Y, weights=None): tm = TransitionMatrixSolver(strict=self._strict, verbose=False, lam=self._lambda) self._predicted_percentages.append(tm.fit_predict(X, Y, weights=weights)) - wapes.append(tm.score) + maes.append(tm.score) predicted_transitions.append(tm.transitions) for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not self._verbose): @@ -150,11 +150,11 @@ def fit_predict(self, X, Y, weights=None): indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] Y_resampled = Y[indices] self._predicted_percentages.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) - wapes.append(tm.score) + maes.append(tm.score) predicted_transitions.append(tm.transitions) - self._wape = np.mean(wapes) - LOG.info("Average WAPE = %s", np.around(self._wape, 4)) + self._mae = np.mean(maes) + LOG.info("Average MAE = %s", np.around(self._mae, 4)) self._transitions = np.mean(predicted_transitions, axis=0) return np.mean(self._predicted_percentages, axis=0) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 5752379f..3eee22fd 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -40,7 +40,7 @@ class TransitionSolver(ABC): """ def __init__(self): - self._wape = None + self._mae = None self._transitions = None def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): @@ -55,7 +55,7 @@ def transitions(self): @property def score(self): - return self._wape + return self._mae def _check_any_element_nan_or_inf(self, A: np.ndarray): """ From 61d2f01f86e3da1a85239ffa3f5aa54d844e3a98 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 27 Dec 2023 22:00:04 -0500 Subject: [PATCH 083/135] Adding option to compute MAE with sample weights --- src/elexsolver/TransitionSolver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 3eee22fd..0c98b597 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -11,15 +11,13 @@ LOG = logging.getLogger(__name__) -def mean_absolute_error(Y_expected: np.ndarray, Y_pred: np.ndarray): +def mean_absolute_error(Y_expected: np.ndarray, Y_pred: np.ndarray, weights: np.ndarray | None = None): if isinstance(Y_expected, list): Y_expected = np.array(Y_expected) if isinstance(Y_pred, list): Y_pred = np.array(Y_pred) - absolute_errors = np.abs(Y_pred - Y_expected) - error_sum = np.sum(absolute_errors) - return error_sum / len(absolute_errors) + return np.average(np.abs(Y_expected - Y_pred), weights=weights) def weighted_absolute_percentage_error(Y_expected: np.ndarray, Y_pred: np.ndarray): From 372cef3258848b832df225076f757a9a5c36f78f Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 27 Dec 2023 22:11:23 -0500 Subject: [PATCH 084/135] And apparently fixing a mistake I had made in the MAE formula... --- tests/test_transition_solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 468efa67..69a0c14f 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -9,7 +9,7 @@ def test_mean_absolute_error(): Y = np.ones((5, 4)) Y_pred = Y - 0.02 - expected = 0.08 + expected = 0.02 current = np.around(mean_absolute_error(Y, Y_pred), 6) np.testing.assert_allclose(expected, current) From a93a189d0e79bca5d9265a26431611d44d0b6438 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 15 Jan 2024 15:57:25 -0500 Subject: [PATCH 085/135] Moving MAE computation out of elex-solver --- src/elexsolver/EITransitionSolver.py | 6 +----- src/elexsolver/TransitionMatrixSolver.py | 23 +++------------------ src/elexsolver/TransitionSolver.py | 26 ------------------------ tests/test_transition_solver.py | 10 +-------- 4 files changed, 5 insertions(+), 60 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 61fc878b..02eaf21d 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -4,7 +4,7 @@ import pymc as pm from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error +from elexsolver.TransitionSolver import TransitionSolver initialize_logging() @@ -62,7 +62,6 @@ def fit_predict(self, X, Y, weights=None): self._check_for_zero_units(Y) self._X_totals = X.sum(axis=0) / X.sum(axis=0).sum() - Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() n = Y.sum(axis=1) num_units = len(n) # should be the same as the number of units in Y @@ -108,9 +107,6 @@ def fit_predict(self, X, Y, weights=None): posterior_mean_rxc = self._sampled.mean(axis=0) self._transitions = self._get_transitions(posterior_mean_rxc) - Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() - self._mae = mean_absolute_error(Y_pred_totals, Y_expected_totals) - LOG.info("MAE = %s", np.around(self._mae, 4)) return posterior_mean_rxc def _get_transitions(self, A: np.ndarray): diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 96cb22cd..12266c91 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -6,7 +6,7 @@ from tqdm import tqdm from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error +from elexsolver.TransitionSolver import TransitionSolver initialize_logging() @@ -14,13 +14,12 @@ class TransitionMatrixSolver(TransitionSolver): - def __init__(self, strict=True, verbose=True, lam=None): + def __init__(self, strict=True, lam=None): """ `lam` > 0 will enable L2 regularization (Ridge). """ super().__init__() self._strict = strict - self._verbose = verbose self._lambda = lam @staticmethod @@ -93,7 +92,6 @@ def fit_predict(self, X, Y, weights=None): Y = Y.to_numpy() X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() - Y_expected_totals = Y.sum(axis=0) / Y.sum(axis=0).sum() X = self._rescale(X) Y = self._rescale(Y) @@ -102,16 +100,6 @@ def fit_predict(self, X, Y, weights=None): percentages = self.__solve(X, Y, weights) self._transitions = np.diag(X_expected_totals) @ percentages - - if np.sum(self._transitions, axis=0).sum() != 0: - Y_pred_totals = np.sum(self._transitions, axis=0) / np.sum(self._transitions, axis=0).sum() - self._mae = mean_absolute_error(Y_expected_totals, Y_pred_totals) - else: - # would have logged an error above - self._mae = 1 - if self._verbose: - LOG.info("MAE = %s", np.around(self._mae, 4)) - return percentages @@ -127,7 +115,6 @@ def __init__(self, B=1000, strict=True, verbose=True, lam=None): self._predicted_percentages = None def fit_predict(self, X, Y, weights=None): - maes = [] self._predicted_percentages = [] predicted_transitions = [] @@ -137,9 +124,8 @@ def fit_predict(self, X, Y, weights=None): if not isinstance(Y, np.ndarray): Y = Y.to_numpy() - tm = TransitionMatrixSolver(strict=self._strict, verbose=False, lam=self._lambda) + tm = TransitionMatrixSolver(strict=self._strict, lam=self._lambda) self._predicted_percentages.append(tm.fit_predict(X, Y, weights=weights)) - maes.append(tm.score) predicted_transitions.append(tm.transitions) for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not self._verbose): @@ -150,11 +136,8 @@ def fit_predict(self, X, Y, weights=None): indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] Y_resampled = Y[indices] self._predicted_percentages.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) - maes.append(tm.score) predicted_transitions.append(tm.transitions) - self._mae = np.mean(maes) - LOG.info("Average MAE = %s", np.around(self._mae, 4)) self._transitions = np.mean(predicted_transitions, axis=0) return np.mean(self._predicted_percentages, axis=0) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 0c98b597..6346ba56 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -11,34 +11,12 @@ LOG = logging.getLogger(__name__) -def mean_absolute_error(Y_expected: np.ndarray, Y_pred: np.ndarray, weights: np.ndarray | None = None): - if isinstance(Y_expected, list): - Y_expected = np.array(Y_expected) - if isinstance(Y_pred, list): - Y_pred = np.array(Y_pred) - - return np.average(np.abs(Y_expected - Y_pred), weights=weights) - - -def weighted_absolute_percentage_error(Y_expected: np.ndarray, Y_pred: np.ndarray): - if isinstance(Y_expected, list): - Y_expected = np.array(Y_expected) - if isinstance(Y_pred, list): - Y_pred = np.array(Y_pred) - - absolute_errors = np.abs(Y_expected - Y_pred) - error_sum = np.sum(absolute_errors) - - return error_sum / np.sum(Y_expected) - - class TransitionSolver(ABC): """ Abstract class for (voter) transition solvers. """ def __init__(self): - self._mae = None self._transitions = None def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): @@ -51,10 +29,6 @@ def get_prediction_interval(self, pi: float): def transitions(self): return self._transitions - @property - def score(self): - return self._mae - def _check_any_element_nan_or_inf(self, A: np.ndarray): """ Check whether any element in a matrix or vector is NaN or infinity diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 69a0c14f..095e40ce 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -3,15 +3,7 @@ import numpy as np import pytest -from elexsolver.TransitionSolver import TransitionSolver, mean_absolute_error - - -def test_mean_absolute_error(): - Y = np.ones((5, 4)) - Y_pred = Y - 0.02 - expected = 0.02 - current = np.around(mean_absolute_error(Y, Y_pred), 6) - np.testing.assert_allclose(expected, current) +from elexsolver.TransitionSolver import TransitionSolver @patch.object(TransitionSolver, "__abstractmethods__", set()) From 54a47b0a4f2cf4bfc8bd5f6dd01d39b6cf519896 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 17 Jan 2024 15:28:38 -0500 Subject: [PATCH 086/135] Finishing up unit tests for TransitionSolver abstract base class --- tests/test_transition_solver.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 095e40ce..1c7f1df3 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -157,3 +157,33 @@ def test_check_and_prepare_weights_with_weights(): ts = TransitionSolver() current = ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access np.testing.assert_allclose(expected, current) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_with_weights_list(): + A = np.array([[1, 2, 3], [4, 5, 6]]) + B = A.copy() + weights = [0.6, 0.4] + expected = np.array([[0.77459667, 0], [0, 0.63245553]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + np.testing.assert_allclose(expected, current) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_check_and_prepare_weights_with_weights_pandas(): + try: + import pandas # pylint: disable=import-outside-toplevel + + A = np.array([[1, 2, 3], [4, 5, 6]]) + B = A.copy() + weights = pandas.Series([0.6, 0.4]) + expected = np.array([[0.77459667, 0], [0, 0.63245553]]) + + ts = TransitionSolver() + current = ts._check_and_prepare_weights(A, B, weights) # pylint: disable=protected-access + np.testing.assert_allclose(expected, current) + except ImportError: + # pass this test through since pandas isn't a requirement for elex-solver + assert True From 9ec424ea9abb79510fa748d74353ccefd014201c Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 17 Jan 2024 15:40:43 -0500 Subject: [PATCH 087/135] Adding type hints to TransitionMatrixSolver --- src/elexsolver/TransitionMatrixSolver.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 12266c91..cc2d1206 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -23,23 +23,23 @@ def __init__(self, strict=True, lam=None): self._lambda = lam @staticmethod - def __get_constraints(coef, strict): + def __get_constraints(coef: np.ndarray, strict: bool): if strict: return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] - def __standard_objective(self, A, B, beta): + def __standard_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray): loss_function = cp.norm(A @ beta - B, "fro") return cp.Minimize(loss_function) - def __ridge_objective(self, A, B, beta): + def __ridge_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray): # Based on https://www.cvxpy.org/examples/machine_learning/ridge_regression.html lam = cp.Parameter(nonneg=True, value=self._lambda) loss_function = cp.pnorm(A @ beta - B, p=2) ** 2 regularizer = cp.pnorm(beta, p=2) ** 2 return cp.Minimize(loss_function + lam * regularizer) - def __solve(self, A, B, weights): + def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray): transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) Aw = np.dot(weights, A) Bw = np.dot(weights, B) @@ -62,10 +62,10 @@ def __solve(self, A, B, weights): return transition_matrix.value - def fit_predict(self, X, Y, weights=None): + def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): """ - X and Y are matrixes of integers. - weights is a list or numpy array with the same length as both X and Y. + X and Y are matrixes (numpy or pandas.DataFrame) of integers. + weights is a list, numpy array, or pandas.Series with the same length as both X and Y. """ self._check_data_type(X) self._check_data_type(Y) @@ -104,7 +104,7 @@ def fit_predict(self, X, Y, weights=None): class BootstrapTransitionMatrixSolver(TransitionSolver): - def __init__(self, B=1000, strict=True, verbose=True, lam=None): + def __init__(self, B: int = 1000, strict: bool = True, verbose: bool = True, lam: int | None = None): super().__init__() self._strict = strict self._B = B @@ -114,7 +114,7 @@ def __init__(self, B=1000, strict=True, verbose=True, lam=None): # class members that are instantiated during model-fit self._predicted_percentages = None - def fit_predict(self, X, Y, weights=None): + def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): self._predicted_percentages = [] predicted_transitions = [] @@ -141,7 +141,7 @@ def fit_predict(self, X, Y, weights=None): self._transitions = np.mean(predicted_transitions, axis=0) return np.mean(self._predicted_percentages, axis=0) - def get_confidence_interval(self, alpha): + def get_confidence_interval(self, alpha: float): # TODO: option to get this in transition form if alpha > 1: alpha = alpha / 100 From 1fc45b8d3555c0f89199c3d840d46b7c4363176b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 22 Jan 2024 10:36:43 -0500 Subject: [PATCH 088/135] Missed a few type hints in TransitionMatrixSolver constructor --- src/elexsolver/TransitionMatrixSolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index cc2d1206..30d1ef7f 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -14,7 +14,7 @@ class TransitionMatrixSolver(TransitionSolver): - def __init__(self, strict=True, lam=None): + def __init__(self, strict: bool = True, lam: float | None = None): """ `lam` > 0 will enable L2 regularization (Ridge). """ From 12861ede7018e06518ad4e2a7afdb3f85cd82619 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 22 Jan 2024 10:39:33 -0500 Subject: [PATCH 089/135] Adding type hints to EITransitionSolver --- src/elexsolver/EITransitionSolver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 02eaf21d..1f20c0a1 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -25,7 +25,7 @@ class EITransitionSolver(TransitionSolver): Statistica Neerlandica, 55, Pp. 134–156. Copy at https://tinyurl.com/yajkae6n """ - def __init__(self, sigma=1, sampling_chains=2, random_seed=None, n_samples=300): + def __init__(self, sigma: int = 1, sampling_chains: int = 2, random_seed: int | None = None, n_samples: int = 300): super().__init__() self._sigma = sigma self._chains = int(sampling_chains) @@ -37,7 +37,7 @@ def __init__(self, sigma=1, sampling_chains=2, random_seed=None, n_samples=300): self._sampled = None self._X_totals = None - def fit_predict(self, X, Y, weights=None): + def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): """ X and Y are matrixes of integers. NOTE: weighting is not currently implemented. @@ -116,7 +116,7 @@ def _get_transitions(self, A: np.ndarray): transitions.append(col * self._X_totals) return np.array(transitions).T - def get_credible_interval(self, ci, transitions=False): + def get_credible_interval(self, ci: float, transitions: bool = False): if ci <= 1: ci = ci * 100 if ci < 0 or ci > 100: From 45796780c301d9216eb66dbfa41de6f6cdf27ba7 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 22 Jan 2024 15:55:33 -0500 Subject: [PATCH 090/135] Adding unit test for strict constraints with matrix solver and global RTOL and ATOL constants for all matrix solver unit tests --- tests/test_transition_matrix_solver.py | 45 ++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 62129412..9ac349e7 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -3,6 +3,9 @@ from elexsolver.TransitionMatrixSolver import BootstrapTransitionMatrixSolver, TransitionMatrixSolver +RTOL = 1e-04 +ATOL = 1e-04 + def test_matrix_fit_predict(): X = np.array( @@ -31,7 +34,7 @@ def test_matrix_fit_predict(): tms = TransitionMatrixSolver() current = tms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_with_weights(): @@ -63,7 +66,37 @@ def test_matrix_fit_predict_with_weights(): tms = TransitionMatrixSolver() current = tms.fit_predict(X, Y, weights=weights) - np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + +def test_matrix_fit_predict_not_strict(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected = np.array([[0.760451, 0.239558], [0.216624, 0.783369]]) + + tms = TransitionMatrixSolver(strict=False) + current = tms.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) def test_matrix_get_prediction_interval(): @@ -99,7 +132,7 @@ def test_bootstrap_fit_predict(): btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) current = btms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) def test_bootstrap_fit_predict_with_weights(): @@ -131,7 +164,7 @@ def test_bootstrap_fit_predict_with_weights(): btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) current = btms.fit_predict(X, Y, weights=weights) - np.testing.assert_allclose(expected, current, rtol=1e-08, atol=1e-02) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) def test_bootstrap_confidence_interval(): @@ -163,5 +196,5 @@ def test_bootstrap_confidence_interval(): btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) _ = btms.fit_predict(X, Y) (current_lower, current_upper) = btms.get_confidence_interval(0.95) - np.testing.assert_allclose(expected_lower, current_lower, rtol=1e-08, atol=1e-02) - np.testing.assert_allclose(expected_upper, current_upper, rtol=1e-08, atol=1e-02) + np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) From bfd05a8fe948464e8b1a2304a17c34308e69be4f Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 22 Jan 2024 15:59:24 -0500 Subject: [PATCH 091/135] Adding unit test for matrix solver with L2 regularization --- tests/test_transition_matrix_solver.py | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 9ac349e7..179d8893 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -99,6 +99,36 @@ def test_matrix_fit_predict_not_strict(): np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) +def test_ridge_matrix_fit_predict(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected = np.array([[0.479416, 0.520584], [0.455918, 0.544082]]) + + tms = TransitionMatrixSolver(lam=1) + current = tms.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + def test_matrix_get_prediction_interval(): with pytest.raises(NotImplementedError): tms = TransitionMatrixSolver() From 1b583b8a3a6ded769c34ca1afd53133a4048807d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 22 Jan 2024 16:29:22 -0500 Subject: [PATCH 092/135] Adding a matrix solver unit test where the matrix needs to be pivoted first and clarifying the use of lambda and L2 regularization --- src/elexsolver/TransitionMatrixSolver.py | 2 +- tests/test_transition_matrix_solver.py | 30 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 30d1ef7f..bdd7f988 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -16,7 +16,7 @@ class TransitionMatrixSolver(TransitionSolver): def __init__(self, strict: bool = True, lam: float | None = None): """ - `lam` > 0 will enable L2 regularization (Ridge). + `lam` != 0 will enable L2 regularization (Ridge). """ super().__init__() self._strict = strict diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 179d8893..8cf83d89 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -129,6 +129,36 @@ def test_ridge_matrix_fit_predict(): np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) +def test_matrix_fit_predict_pivoted(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ).T + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ).T + + expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + + tms = TransitionMatrixSolver() + current = tms.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + def test_matrix_get_prediction_interval(): with pytest.raises(NotImplementedError): tms = TransitionMatrixSolver() From bba196fe99dbaa428c78dfbb344e5e99be681901 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 22 Jan 2024 16:51:15 -0500 Subject: [PATCH 093/135] Removing redundant/error-prone 'taking the mean' to get bootstrapped transitions --- src/elexsolver/TransitionMatrixSolver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index bdd7f988..d11d63d8 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -116,7 +116,6 @@ def __init__(self, B: int = 1000, strict: bool = True, verbose: bool = True, lam def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): self._predicted_percentages = [] - predicted_transitions = [] # assuming pandas.DataFrame if not isinstance(X, np.ndarray): @@ -124,9 +123,10 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = if not isinstance(Y, np.ndarray): Y = Y.to_numpy() + X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() + tm = TransitionMatrixSolver(strict=self._strict, lam=self._lambda) self._predicted_percentages.append(tm.fit_predict(X, Y, weights=weights)) - predicted_transitions.append(tm.transitions) for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not self._verbose): rng = np.random.default_rng(seed=b) @@ -136,13 +136,13 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] Y_resampled = Y[indices] self._predicted_percentages.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) - predicted_transitions.append(tm.transitions) - self._transitions = np.mean(predicted_transitions, axis=0) - return np.mean(self._predicted_percentages, axis=0) + percentages = np.mean(self._predicted_percentages, axis=0) + self._transitions = np.diag(X_expected_totals) @ percentages + return percentages def get_confidence_interval(self, alpha: float): - # TODO: option to get this in transition form + # TODO: option to return transitions as well as (or instead of) percentages if alpha > 1: alpha = alpha / 100 if alpha < 0 or alpha >= 1: From 98b9e8168802b66aaba2b1274bb98fab3ebbbfcb Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 22 Jan 2024 16:59:04 -0500 Subject: [PATCH 094/135] Two more unit tests on the bootstrap confidence interval --- tests/test_transition_matrix_solver.py | 63 ++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 8cf83d89..60f4b1c2 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -258,3 +258,66 @@ def test_bootstrap_confidence_interval(): (current_lower, current_upper) = btms.get_confidence_interval(0.95) np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) + + +def test_bootstrap_confidence_interval_greater_than_1(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected_lower = np.array([[0.757573, 0.095978], [0.09128, 0.779471]]) + expected_upper = np.array([[0.904022, 0.242427], [0.220529, 0.90872]]) + + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + _ = btms.fit_predict(X, Y) + (current_lower, current_upper) = btms.get_confidence_interval(95) + np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) + + +def test_bootstrap_confidence_interval_invalid(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + _ = btms.fit_predict(X, Y) + + with pytest.raises(ValueError): + btms.get_confidence_interval(-34) From 3e57ffda3d4d1a108f6f6fd64d501d251d3ce9f1 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 23 Jan 2024 10:01:25 -0500 Subject: [PATCH 095/135] Starting work on EI solver unit tests --- tests/test_ei_transition_solver.py | 105 +++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 tests/test_ei_transition_solver.py diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py new file mode 100644 index 00000000..646fde0a --- /dev/null +++ b/tests/test_ei_transition_solver.py @@ -0,0 +1,105 @@ +import numpy as np +import pytest + +from elexsolver.EITransitionSolver import EITransitionSolver + +RTOL = 1e-04 +ATOL = 1e-04 + + +def test_ei_fit_predict(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected = np.array([[0.883539, 0.116461], [0.09511, 0.90489]]) + + ei = EITransitionSolver(random_seed=1024) + current = ei.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + +# def test_matrix_fit_predict_with_weights(): +# X = np.array( +# [ +# [1, 2], +# [3, 4], +# [5, 6], +# [7, 8], +# [9, 10], +# [11, 12], +# ] +# ) + +# Y = np.array( +# [ +# [2, 3], +# [4, 5], +# [6, 7], +# [8, 9], +# [10, 11], +# [12, 13], +# ] +# ) + +# weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) + +# expected = np.array([[0.737329, 0.262671], [0.230589, 0.769411]]) + +# tms = TransitionMatrixSolver() +# current = tms.fit_predict(X, Y, weights=weights) +# np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + +# def test_matrix_fit_predict_pivoted(): +# X = np.array( +# [ +# [1, 2], +# [3, 4], +# [5, 6], +# [7, 8], +# [9, 10], +# [11, 12], +# ] +# ).T + +# Y = np.array( +# [ +# [2, 3], +# [4, 5], +# [6, 7], +# [8, 9], +# [10, 11], +# [12, 13], +# ] +# ).T + +# expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + +# tms = TransitionMatrixSolver() +# current = tms.fit_predict(X, Y) +# np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + +def test_ei_get_prediction_interval(): + with pytest.raises(NotImplementedError): + ei = EITransitionSolver() + ei.get_prediction_interval(0) From 8064b6e68bcf3b3a12b17ab7e4cb481a99cbf131 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 23 Jan 2024 10:08:46 -0500 Subject: [PATCH 096/135] Adding two more EI solver unit tests --- tests/test_ei_transition_solver.py | 121 +++++++++++++++-------------- 1 file changed, 61 insertions(+), 60 deletions(-) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 646fde0a..6af0098c 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -37,66 +37,67 @@ def test_ei_fit_predict(): np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) -# def test_matrix_fit_predict_with_weights(): -# X = np.array( -# [ -# [1, 2], -# [3, 4], -# [5, 6], -# [7, 8], -# [9, 10], -# [11, 12], -# ] -# ) - -# Y = np.array( -# [ -# [2, 3], -# [4, 5], -# [6, 7], -# [8, 9], -# [10, 11], -# [12, 13], -# ] -# ) - -# weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - -# expected = np.array([[0.737329, 0.262671], [0.230589, 0.769411]]) - -# tms = TransitionMatrixSolver() -# current = tms.fit_predict(X, Y, weights=weights) -# np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) - - -# def test_matrix_fit_predict_pivoted(): -# X = np.array( -# [ -# [1, 2], -# [3, 4], -# [5, 6], -# [7, 8], -# [9, 10], -# [11, 12], -# ] -# ).T - -# Y = np.array( -# [ -# [2, 3], -# [4, 5], -# [6, 7], -# [8, 9], -# [10, 11], -# [12, 13], -# ] -# ).T - -# expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) - -# tms = TransitionMatrixSolver() -# current = tms.fit_predict(X, Y) -# np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) +def test_ei_fit_predict_with_weights(): + # NOTE: currently, supplying weights to the EI solver does nothing. + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) + + expected = np.array([[0.883539, 0.116461], [0.09511, 0.90489]]) + + ei = EITransitionSolver(random_seed=1024) + current = ei.fit_predict(X, Y, weights=weights) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + +def test_ei_fit_predict_pivoted(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ).T + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ).T + + expected = np.array([[0.883539, 0.116461], [0.09511, 0.90489]]) + + ei = EITransitionSolver(random_seed=1024) + current = ei.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) def test_ei_get_prediction_interval(): From 006de0e54c291b9811748f9c0a741f265092f5d8 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 23 Jan 2024 15:46:21 -0500 Subject: [PATCH 097/135] Updating some requirement version numbers --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 762371e5..76891b5a 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.25", "numpyro~=0.13", "pymc~=5.9", "scipy~=1.11", "tqdm~=4.66"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "numpyro~=0.13", "pymc~=5.10", "scipy~=1.12", "tqdm~=4.66"] THIS_FILE_DIR = os.path.dirname(__file__) From 85c911473a498dee66cfb7462b69c46e7486c9df Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 23 Jan 2024 15:47:30 -0500 Subject: [PATCH 098/135] Increasing the test timeout on github --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index af3b9ce1..b9065b11 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,7 @@ jobs: test: name: Run unit tests runs-on: ubuntu-latest - timeout-minutes: 10 + timeout-minutes: 15 strategy: matrix: python-version: ['3.11'] From fd2a344ea3af638e83452ef6ff68662180de24f7 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 23 Jan 2024 16:08:52 -0500 Subject: [PATCH 099/135] Maybe the numpyro NUTS sampler is the problem? --- setup.py | 2 +- src/elexsolver/EITransitionSolver.py | 4 ++-- tests/test_ei_transition_solver.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 76891b5a..959e7497 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "numpyro~=0.13", "pymc~=5.10", "scipy~=1.12", "tqdm~=4.66"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "pymc~=5.10", "scipy~=1.12", "tqdm~=4.66"] THIS_FILE_DIR = os.path.dirname(__file__) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 1f20c0a1..9e6b2895 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -85,11 +85,11 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = shape=(num_units, num_cols), ) try: - # TODO: keep trying to tune this for performance and speed + # DO NOT USE THE NUMPYRO NUTS SAMPLER + # IT IS UNSTABLE model_trace = pm.sample( chains=self._chains, random_seed=self._seed, - nuts_sampler="numpyro", cores=self._chains, draws=self._draws, tune=self._tune, diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 6af0098c..7a1bcf63 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -30,7 +30,7 @@ def test_ei_fit_predict(): ] ) - expected = np.array([[0.883539, 0.116461], [0.09511, 0.90489]]) + expected = np.array([[0.530026, 0.469974], [0.401865, 0.598135]]) ei = EITransitionSolver(random_seed=1024) current = ei.fit_predict(X, Y) @@ -63,7 +63,7 @@ def test_ei_fit_predict_with_weights(): weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - expected = np.array([[0.883539, 0.116461], [0.09511, 0.90489]]) + expected = np.array([[0.530026, 0.469974], [0.401865, 0.598135]]) ei = EITransitionSolver(random_seed=1024) current = ei.fit_predict(X, Y, weights=weights) @@ -93,7 +93,7 @@ def test_ei_fit_predict_pivoted(): ] ).T - expected = np.array([[0.883539, 0.116461], [0.09511, 0.90489]]) + expected = np.array([[0.530026, 0.469974], [0.401865, 0.598135]]) ei = EITransitionSolver(random_seed=1024) current = ei.fit_predict(X, Y) From 707f641174338a164ebf98c5c3ab028dc85c59d4 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 23 Jan 2024 16:53:29 -0500 Subject: [PATCH 100/135] Please let me have fixed these unit tests --- tests/test_ei_transition_solver.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 7a1bcf63..198585a7 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -3,8 +3,9 @@ from elexsolver.EITransitionSolver import EITransitionSolver -RTOL = 1e-04 -ATOL = 1e-04 +# high tolerance to match PyMC's unit tests +RTOL = 1e-01 +ATOL = 1e-01 def test_ei_fit_predict(): @@ -30,9 +31,9 @@ def test_ei_fit_predict(): ] ) - expected = np.array([[0.530026, 0.469974], [0.401865, 0.598135]]) + expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) - ei = EITransitionSolver(random_seed=1024) + ei = EITransitionSolver(random_seed=1024, n_samples=100, sampling_chains=1) current = ei.fit_predict(X, Y) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) @@ -63,9 +64,9 @@ def test_ei_fit_predict_with_weights(): weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - expected = np.array([[0.530026, 0.469974], [0.401865, 0.598135]]) + expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) - ei = EITransitionSolver(random_seed=1024) + ei = EITransitionSolver(random_seed=1024, n_samples=100, sampling_chains=1) current = ei.fit_predict(X, Y, weights=weights) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) @@ -93,9 +94,9 @@ def test_ei_fit_predict_pivoted(): ] ).T - expected = np.array([[0.530026, 0.469974], [0.401865, 0.598135]]) + expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) - ei = EITransitionSolver(random_seed=1024) + ei = EITransitionSolver(random_seed=1024, n_samples=100, sampling_chains=1) current = ei.fit_predict(X, Y) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) From 9997d13be163effe6852b62139e1d70932d98370 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 12:44:07 -0500 Subject: [PATCH 101/135] Trying to fix my failing pymc-related unit tests by adding a pytensor-compilation environment variable --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b9065b11..f224ffb2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,6 +4,8 @@ jobs: test: name: Run unit tests runs-on: ubuntu-latest + env: + PYTENSOR_FLAGS: floatX=float64,gcc__cxxflags='-march=native' timeout-minutes: 15 strategy: matrix: From 8c69d6e26850f2fc1cf3fa20ba0d82b70f4de1dd Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 12:50:05 -0500 Subject: [PATCH 102/135] Hopefully fixing bad whitespacing in .github/workflows/test.yml ? --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f224ffb2..00faf315 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ jobs: name: Run unit tests runs-on: ubuntu-latest env: - PYTENSOR_FLAGS: floatX=float64,gcc__cxxflags='-march=native' + PYTENSOR_FLAGS: floatX=float64,gcc__cxxflags='-march=native' timeout-minutes: 15 strategy: matrix: From 6d6973ba0dddfb27625e889ccab766e1c304af8b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 13:23:55 -0500 Subject: [PATCH 103/135] Reverting the commit where I try adding an environment variable to the github tests --- .github/workflows/test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 00faf315..b9065b11 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,8 +4,6 @@ jobs: test: name: Run unit tests runs-on: ubuntu-latest - env: - PYTENSOR_FLAGS: floatX=float64,gcc__cxxflags='-march=native' timeout-minutes: 15 strategy: matrix: From 0a9fd86cab6f70325907b4bddf17e7bc6fd23590 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 13:26:04 -0500 Subject: [PATCH 104/135] Maybe setting a numpy random seed will help --- tests/test_ei_transition_solver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 198585a7..5e0faedc 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -7,6 +7,8 @@ RTOL = 1e-01 ATOL = 1e-01 +np.random.seed(1024) + def test_ei_fit_predict(): X = np.array( From 9f7fea346479a1569e735825a3d13b65dbab2865 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 13:34:52 -0500 Subject: [PATCH 105/135] Reducing the number of samples drawn; want to see if the tests fail the same way here and on Ubuntu --- tests/test_ei_transition_solver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 5e0faedc..8e30a4fc 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -35,7 +35,7 @@ def test_ei_fit_predict(): expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) - ei = EITransitionSolver(random_seed=1024, n_samples=100, sampling_chains=1) + ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) current = ei.fit_predict(X, Y) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) @@ -68,7 +68,7 @@ def test_ei_fit_predict_with_weights(): expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) - ei = EITransitionSolver(random_seed=1024, n_samples=100, sampling_chains=1) + ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) current = ei.fit_predict(X, Y, weights=weights) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) @@ -98,7 +98,7 @@ def test_ei_fit_predict_pivoted(): expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) - ei = EITransitionSolver(random_seed=1024, n_samples=100, sampling_chains=1) + ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) current = ei.fit_predict(X, Y) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) From ca5729242202270d16f5519b5f7fd201d5549809 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 13:41:37 -0500 Subject: [PATCH 106/135] Starting to see same results on macOS/M1 and Ubuntu/x86-64... --- tests/test_ei_transition_solver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 8e30a4fc..74a1c6e4 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -33,7 +33,7 @@ def test_ei_fit_predict(): ] ) - expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) + expected = np.array([[0.34566509, 0.65433491], [0.40254308, 0.59745692]]) ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) current = ei.fit_predict(X, Y) @@ -66,7 +66,7 @@ def test_ei_fit_predict_with_weights(): weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) + expected = np.array([[0.34566509, 0.65433491], [0.40254308, 0.59745692]]) ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) current = ei.fit_predict(X, Y, weights=weights) @@ -96,7 +96,7 @@ def test_ei_fit_predict_pivoted(): ] ).T - expected = np.array([[0.735609, 0.264391], [0.204346, 0.795654]]) + expected = np.array([[0.34566509, 0.65433491], [0.40254308, 0.59745692]]) ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) current = ei.fit_predict(X, Y) From 8e594c441c01583313840191f47288c0a30572e5 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 14:14:05 -0500 Subject: [PATCH 107/135] Adding a test for credible interval, increasing the number of samples, decreasing the tolerance --- tests/test_ei_transition_solver.py | 49 +++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 74a1c6e4..1ea0649a 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -4,8 +4,8 @@ from elexsolver.EITransitionSolver import EITransitionSolver # high tolerance to match PyMC's unit tests -RTOL = 1e-01 -ATOL = 1e-01 +RTOL = 1e-02 +ATOL = 1e-02 np.random.seed(1024) @@ -33,9 +33,9 @@ def test_ei_fit_predict(): ] ) - expected = np.array([[0.34566509, 0.65433491], [0.40254308, 0.59745692]]) + expected = np.array([[0.279297, 0.720703], [0.623953, 0.376047]]) - ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) current = ei.fit_predict(X, Y) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) @@ -66,9 +66,9 @@ def test_ei_fit_predict_with_weights(): weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - expected = np.array([[0.34566509, 0.65433491], [0.40254308, 0.59745692]]) + expected = np.array([[0.279297, 0.720703], [0.623953, 0.376047]]) - ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) current = ei.fit_predict(X, Y, weights=weights) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) @@ -96,9 +96,9 @@ def test_ei_fit_predict_pivoted(): ] ).T - expected = np.array([[0.34566509, 0.65433491], [0.40254308, 0.59745692]]) + expected = np.array([[0.279297, 0.720703], [0.623953, 0.376047]]) - ei = EITransitionSolver(random_seed=1024, n_samples=5, sampling_chains=1) + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) current = ei.fit_predict(X, Y) np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) @@ -107,3 +107,36 @@ def test_ei_get_prediction_interval(): with pytest.raises(NotImplementedError): ei = EITransitionSolver() ei.get_prediction_interval(0) + + +def test_ei_credible_interval_percentages(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected_lower = np.array([[0.037212, 0.356174], [0.424652, 0.117605]]) + expected_upper = np.array([[0.643826, 0.962788], [0.882395, 0.575348]]) + + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) + _ = ei.fit_predict(X, Y) + (current_lower, current_upper) = ei.get_credible_interval(99, transitions=False) + np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) From 0bf5278e298a750485e08096e0a3de7fdb38eb02 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 14:46:01 -0500 Subject: [PATCH 108/135] Adding unit tests for other values that could be specified to credible interval and also getting credible interval transitions --- tests/test_ei_transition_solver.py | 99 +++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 1ea0649a..8d354bcb 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -3,7 +3,8 @@ from elexsolver.EITransitionSolver import EITransitionSolver -# high tolerance to match PyMC's unit tests +# high tolerance due to random sampling +# (which can produce different outcomes on different architectures, despite setting seeds) RTOL = 1e-02 ATOL = 1e-02 @@ -140,3 +141,99 @@ def test_ei_credible_interval_percentages(): (current_lower, current_upper) = ei.get_credible_interval(99, transitions=False) np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) + + +def test_ei_credible_interval_percentages_float_interval(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected_lower = np.array([[0.037212, 0.356174], [0.424652, 0.117605]]) + expected_upper = np.array([[0.643826, 0.962788], [0.882395, 0.575348]]) + + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) + _ = ei.fit_predict(X, Y) + (current_lower, current_upper) = ei.get_credible_interval(0.99, transitions=False) + np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) + + +def test_ei_credible_interval_invalid(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) + _ = ei.fit_predict(X, Y) + + with pytest.raises(ValueError): + ei.get_credible_interval(3467838976, transitions=False) + + +def test_ei_credible_interval_transitions(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected_lower = np.array([[0.017175, 0.164388], [0.228659, 0.063326]]) + expected_upper = np.array([[0.29715, 0.444364], [0.475136, 0.309803]]) + + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) + _ = ei.fit_predict(X, Y) + (current_lower, current_upper) = ei.get_credible_interval(99, transitions=True) + np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) From f66e2362f347297b17b6dd1099e2a228c5b1290b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 15:05:07 -0500 Subject: [PATCH 109/135] More matrix solver unit tests, particularly involving pandas --- tests/test_transition_matrix_solver.py | 112 ++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 60f4b1c2..115d964b 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -160,11 +160,76 @@ def test_matrix_fit_predict_pivoted(): def test_matrix_get_prediction_interval(): + tms = TransitionMatrixSolver() with pytest.raises(NotImplementedError): - tms = TransitionMatrixSolver() tms.get_prediction_interval(0) +def test_matrix_fit_predict_bad_dimensions(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + ] + ) + + tms = TransitionMatrixSolver() + with pytest.raises(ValueError): + tms.fit_predict(X, Y) + + +def test_matrix_fit_predict_pandas(): + try: + import pandas # pylint: disable=import-outside-toplevel + + X = pandas.DataFrame( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ], + columns=["x1", "x2"], + ) + + Y = pandas.DataFrame( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ], + columns=["y1", "y2"], + ) + + expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + + tms = TransitionMatrixSolver() + current = tms.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + except ImportError: + # pass this test through since pandas isn't a requirement for elex-solver + assert True + + def test_bootstrap_fit_predict(): X = np.array( [ @@ -321,3 +386,48 @@ def test_bootstrap_confidence_interval_invalid(): with pytest.raises(ValueError): btms.get_confidence_interval(-34) + + +def test_bootstrap_get_prediction_interval(): + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + with pytest.raises(NotImplementedError): + btms.get_prediction_interval(0) + + +def test_bootstrap_fit_predict_pandas(): + try: + import pandas # pylint: disable=import-outside-toplevel + + X = pandas.DataFrame( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ], + columns=["x1", "x2"], + ) + + Y = pandas.DataFrame( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ], + columns=["y1", "y2"], + ) + + expected = np.array([[0.809393, 0.190607], [0.173843, 0.826157]]) + + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + current = btms.fit_predict(X, Y) + np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + + except ImportError: + # pass this test through since pandas isn't a requirement for elex-solver + assert True From c28a6f5d331c0f585e8e1f6671a3fdd8fdd67a22 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 15:14:22 -0500 Subject: [PATCH 110/135] 100% code coverage in transition solver base class :tada: --- tests/test_transition_solver.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 1c7f1df3..692d0c0e 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -20,6 +20,12 @@ def test_superclass_get_prediction_interval(): ts.get_prediction_interval(0) +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_get_transitions(): + ts = TransitionSolver() + assert ts.transitions is None + + @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_check_any_element_nan_or_inf_with_nan(): with pytest.raises(ValueError): From bc747630c0297aad3f0632b011c60c07e576dc76 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 15:21:34 -0500 Subject: [PATCH 111/135] One last EI solver unit test :tada: --- tests/test_ei_transition_solver.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py index 8d354bcb..1bf47130 100644 --- a/tests/test_ei_transition_solver.py +++ b/tests/test_ei_transition_solver.py @@ -104,6 +104,31 @@ def test_ei_fit_predict_pivoted(): np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) +def test_ei_fit_predict_bad_dimensions(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) + with pytest.raises(ValueError): + ei.fit_predict(X, Y) + + def test_ei_get_prediction_interval(): with pytest.raises(NotImplementedError): ei = EITransitionSolver() From f2d0578226b6d368017886c84299e20d858d0dff Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 15:45:46 -0500 Subject: [PATCH 112/135] Adding some method docstrings and return types to the transition solver base class --- src/elexsolver/TransitionSolver.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 6346ba56..88112fd9 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -26,7 +26,7 @@ def get_prediction_interval(self, pi: float): raise NotImplementedError @property - def transitions(self): + def transitions(self) -> np.ndarray: return self._transitions def _check_any_element_nan_or_inf(self, A: np.ndarray): @@ -55,7 +55,7 @@ def _check_for_zero_units(self, A: np.ndarray): if np.any(np.sum(A, axis=1) == 0): raise ValueError("Matrix cannot contain any rows (units) where all columns (things) are zero.") - def _rescale(self, A: np.ndarray): + def _rescale(self, A: np.ndarray) -> np.ndarray: """ Rescale rows (units) to ensure they sum to 1 (100%). """ @@ -67,7 +67,20 @@ def _rescale(self, A: np.ndarray): return np.nan_to_num(A, nan=0, posinf=0, neginf=0) - def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None): + def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None) -> np.ndarray: + """ + If `weights` is not None, and `weights` has the same number of rows in both matrices `X` and `Y`, + we'll rescale the weights by taking the square root after dividing them by their sum, + then return a diagonal matrix containing these now-normalized weights. + If `weights` is None, return a diagonal matrix of ones. + + Parameters + ---------- + `X` : np.ndarray matrix of int (same number of rows as `Y`) + `Y` : np.ndarray matrix of int (same number of rows as `X`) + `weights` : np.ndarray of int of the shape (number of rows in `X` and `Y`, 1), optional + """ + if weights is not None: if len(weights) != X.shape[0] and len(weights) != Y.shape[0]: raise ValueError("weights must be the same length as the number of rows in X and Y.") From d62680bf711ef585596eef7254bc8ce2453f4a9f Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 16:27:59 -0500 Subject: [PATCH 113/135] Writing one single docstring for fit_predict() in the base class --- src/elexsolver/TransitionSolver.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 88112fd9..df65c868 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -20,6 +20,23 @@ def __init__(self): self._transitions = None def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): + """ + After this method finishes, transitions will be available in the `transitions` class member. + + Parameters + ---------- + `X` : np.ndarray matrix or pandas.DataFrame of int + Must have the same number of rows as `Y` but can have any number of columns greater than the number of rows. + `Y` : np.ndarray matrix or pandas.DataFrame of int + Must have the same number of rows as `X` but can have any number of columns greater than the number of rows. + `weights` : list, np.ndarray, or pandas.Series of int, optional + Must have the same length (number of rows) as both `X` and `Y`. + + Returns + ------- + np.ndarray matrix of float of shape (number of columns in `X`) x (number of columns in `Y`). + Each float represents the percent of how much of row x is part of column y. + """ raise NotImplementedError def get_prediction_interval(self, pi: float): From 9c3379db0f61aef1b3facdc6d7ef2e5cb367ffdb Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 16:28:40 -0500 Subject: [PATCH 114/135] Matrix and bootstrap solvers docstrings, return types, and modifying the confidence interval method to return transitions --- src/elexsolver/TransitionMatrixSolver.py | 75 +++++++++++++++++++----- 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index d11d63d8..ad1dc9f4 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -14,32 +14,42 @@ class TransitionMatrixSolver(TransitionSolver): + """ + Matrix regression transition solver using CVXPY. + """ + def __init__(self, strict: bool = True, lam: float | None = None): """ - `lam` != 0 will enable L2 regularization (Ridge). + Parameters + ---------- + `strict` : bool, default True + If `True`, solution will be constrainted so that all coefficients are >= 0, + <= 1, and the sum of each row equals 1. + `lam` : float, optional + `lam` != 0 will enable L2 regularization (Ridge). """ super().__init__() self._strict = strict self._lambda = lam @staticmethod - def __get_constraints(coef: np.ndarray, strict: bool): + def __get_constraints(coef: np.ndarray, strict: bool) -> list: if strict: return [0 <= coef, coef <= 1, cp.sum(coef, axis=1) == 1] return [cp.sum(coef, axis=1) <= 1.1, cp.sum(coef, axis=1) >= 0.9] - def __standard_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray): + def __standard_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray) -> cp.Minimize: loss_function = cp.norm(A @ beta - B, "fro") return cp.Minimize(loss_function) - def __ridge_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray): + def __ridge_objective(self, A: np.ndarray, B: np.ndarray, beta: np.ndarray) -> cp.Minimize: # Based on https://www.cvxpy.org/examples/machine_learning/ridge_regression.html lam = cp.Parameter(nonneg=True, value=self._lambda) loss_function = cp.pnorm(A @ beta - B, p=2) ** 2 regularizer = cp.pnorm(beta, p=2) ** 2 return cp.Minimize(loss_function + lam * regularizer) - def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray): + def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray) -> np.ndarray: transition_matrix = cp.Variable((A.shape[1], B.shape[1]), pos=True) Aw = np.dot(weights, A) Bw = np.dot(weights, B) @@ -62,11 +72,7 @@ def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray): return transition_matrix.value - def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): - """ - X and Y are matrixes (numpy or pandas.DataFrame) of integers. - weights is a list, numpy array, or pandas.Series with the same length as both X and Y. - """ + def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None) -> np.ndarray: self._check_data_type(X) self._check_data_type(Y) self._check_any_element_nan_or_inf(X) @@ -104,7 +110,24 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = class BootstrapTransitionMatrixSolver(TransitionSolver): + """ + Bootstrap version of the matrix regression transition solver. + """ + def __init__(self, B: int = 1000, strict: bool = True, verbose: bool = True, lam: int | None = None): + """ + Parameters + ---------- + `B` : int, default 1000 + Number of bootstrap samples to draw and matrix solver models to fit/predict. + `strict` : bool, default True + If `True`, solution will be constrainted so that all coefficients are >= 0, + <= 1, and the sum of each row equals 1. + `verbose` : bool, default True + If `False`, this will reduce the amount of logging produced for each of the `B` bootstrap samples. + `lam` : float, optional + `lam` != 0 will enable L2 regularization (Ridge). + """ super().__init__() self._strict = strict self._B = B @@ -113,8 +136,9 @@ def __init__(self, B: int = 1000, strict: bool = True, verbose: bool = True, lam # class members that are instantiated during model-fit self._predicted_percentages = None + self._X_expected_totals = None - def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): + def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None) -> np.ndarray: self._predicted_percentages = [] # assuming pandas.DataFrame @@ -123,7 +147,7 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = if not isinstance(Y, np.ndarray): Y = Y.to_numpy() - X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() + self._X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() tm = TransitionMatrixSolver(strict=self._strict, lam=self._lambda) self._predicted_percentages.append(tm.fit_predict(X, Y, weights=weights)) @@ -138,11 +162,22 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = self._predicted_percentages.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) percentages = np.mean(self._predicted_percentages, axis=0) - self._transitions = np.diag(X_expected_totals) @ percentages + self._transitions = np.diag(self._X_expected_totals) @ percentages return percentages - def get_confidence_interval(self, alpha: float): - # TODO: option to return transitions as well as (or instead of) percentages + def get_confidence_interval(self, alpha: float, transitions: bool = False) -> (np.ndarray, np.ndarray): + """ + Parameters + ---------- + `alpha` : float + Value between [0, 1). If greater than 1, will be divided by 100. + `transitions` : bool, default False + If True, the returned matrix will represent transitions, not percentages. + + Returns + ------- + A tuple of two np.ndarray matrices of float. Element 0 has the lower bound and 1 has the upper bound. + """ if alpha > 1: alpha = alpha / 100 if alpha < 0 or alpha >= 1: @@ -150,7 +185,15 @@ def get_confidence_interval(self, alpha: float): p_lower = ((1.0 - alpha) / 2.0) * 100 p_upper = ((1.0 + alpha) / 2.0) * 100 - return ( + + percentages = ( np.percentile(self._predicted_percentages, p_lower, axis=0), np.percentile(self._predicted_percentages, p_upper, axis=0), ) + + if transitions: + return ( + np.diag(self._X_expected_totals) @ percentages[0], + np.diag(self._X_expected_totals) @ percentages[1], + ) + return percentages From 1ac03f3dd56c936e82b46e525377b1356347af44 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 30 Jan 2024 17:11:00 -0500 Subject: [PATCH 115/135] Adding unit test for bootstrap matrix confidence interval transitions --- tests/test_transition_matrix_solver.py | 39 ++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 115d964b..a11ffdcd 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -292,7 +292,7 @@ def test_bootstrap_fit_predict_with_weights(): np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) -def test_bootstrap_confidence_interval(): +def test_bootstrap_confidence_interval_percentages(): X = np.array( [ [1, 2], @@ -320,7 +320,7 @@ def test_bootstrap_confidence_interval(): btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) _ = btms.fit_predict(X, Y) - (current_lower, current_upper) = btms.get_confidence_interval(0.95) + (current_lower, current_upper) = btms.get_confidence_interval(0.95, transitions=False) np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) @@ -353,7 +353,7 @@ def test_bootstrap_confidence_interval_greater_than_1(): btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) _ = btms.fit_predict(X, Y) - (current_lower, current_upper) = btms.get_confidence_interval(95) + (current_lower, current_upper) = btms.get_confidence_interval(95, transitions=False) np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) @@ -388,6 +388,39 @@ def test_bootstrap_confidence_interval_invalid(): btms.get_confidence_interval(-34) +def test_bootstrap_confidence_interval_transitions(): + X = np.array( + [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + ) + + Y = np.array( + [ + [2, 3], + [4, 5], + [6, 7], + [8, 9], + [10, 11], + [12, 13], + ] + ) + + expected_lower = np.array([[0.349649, 0.044297], [0.049151, 0.419715]]) + expected_upper = np.array([[0.417241, 0.111889], [0.118746, 0.489311]]) + + btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) + _ = btms.fit_predict(X, Y) + (current_lower, current_upper) = btms.get_confidence_interval(0.95, transitions=True) + np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) + + def test_bootstrap_get_prediction_interval(): btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) with pytest.raises(NotImplementedError): From 53404929fe0062eb2c9557d0ff9a3fa199285427 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 31 Jan 2024 13:55:34 -0500 Subject: [PATCH 116/135] Adding docstrings to EI transition solver and cleaning up a few others --- src/elexsolver/EITransitionSolver.py | 31 +++++++++++++++++++++--- src/elexsolver/TransitionMatrixSolver.py | 4 +-- src/elexsolver/TransitionSolver.py | 2 +- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 9e6b2895..1c3ceb36 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -15,7 +15,7 @@ class EITransitionSolver(TransitionSolver): """ - A (voter) transition solver based on RxC ecological inference. + A transition solver based on RxC ecological inference. Somewhat adapted from version 1.0.1 of Knudson et al., (2021). PyEI: A Python package for ecological inference. Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 @@ -26,6 +26,18 @@ class EITransitionSolver(TransitionSolver): """ def __init__(self, sigma: int = 1, sampling_chains: int = 2, random_seed: int | None = None, n_samples: int = 300): + """ + Parameters + ---------- + `sigma` : int, default 1 + Standard deviation of the half-normal distribution that provides alphas to the Dirichlet distribution. + `sampling_chains` : int, default 2 + The number of sampling chains to run in parallel, each of which will draw `n_samples`. + `random_seed` : int, optional + For seeding the NUTS sampler. + `n_samples` : int, default 300 + The number of samples to draw. Before sampling, the NUTS sampler will be tuned using `n_samples // 2` samples. + """ super().__init__() self._sigma = sigma self._chains = int(sampling_chains) @@ -39,7 +51,6 @@ def __init__(self, sigma: int = 1, sampling_chains: int = 2, random_seed: int | def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): """ - X and Y are matrixes of integers. NOTE: weighting is not currently implemented. """ self._check_data_type(X) @@ -117,6 +128,18 @@ def _get_transitions(self, A: np.ndarray): return np.array(transitions).T def get_credible_interval(self, ci: float, transitions: bool = False): + """ + Parameters + ---------- + `ci` : float + Size of the credible interval [0, 100). If <= 1, will be multiplied by 100. + `transitions` : bool, default False + If True, the returned matrices will represent transitions, not percentages. + + Returns + ------- + A tuple of two np.ndarray matrices of float. Element 0 has the lower bound and 1 has the upper bound. + """ if ci <= 1: ci = ci * 100 if ci < 0 or ci > 100: @@ -129,10 +152,10 @@ def get_credible_interval(self, ci: float, transitions: bool = False): upper: np.zeros((self._sampled.shape[1], self._sampled.shape[2])), } - for ci in [lower, upper]: + for interval in [lower, upper]: for i in range(0, self._sampled.shape[1]): for j in range(0, self._sampled.shape[2]): - A_dict[ci][i][j] = np.percentile(self._sampled[:, i, j], ci) + A_dict[interval][i][j] = np.percentile(self._sampled[:, i, j], interval) if transitions: return (self._get_transitions(A_dict[lower]), self._get_transitions(A_dict[upper])) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index ad1dc9f4..38ac22e7 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -23,7 +23,7 @@ def __init__(self, strict: bool = True, lam: float | None = None): Parameters ---------- `strict` : bool, default True - If `True`, solution will be constrainted so that all coefficients are >= 0, + If True, solution will be constrainted so that all coefficients are >= 0, <= 1, and the sum of each row equals 1. `lam` : float, optional `lam` != 0 will enable L2 regularization (Ridge). @@ -172,7 +172,7 @@ def get_confidence_interval(self, alpha: float, transitions: bool = False) -> (n `alpha` : float Value between [0, 1). If greater than 1, will be divided by 100. `transitions` : bool, default False - If True, the returned matrix will represent transitions, not percentages. + If True, the returned matrices will represent transitions, not percentages. Returns ------- diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index df65c868..923686d6 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -13,7 +13,7 @@ class TransitionSolver(ABC): """ - Abstract class for (voter) transition solvers. + Abstract class for transition solvers. """ def __init__(self): From 7c69f8874670cc0ef3bd24b926bf296cd48e62cf Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 31 Jan 2024 14:18:16 -0500 Subject: [PATCH 117/135] Making EI solver's get_transitions() method super private --- src/elexsolver/EITransitionSolver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py index 1c3ceb36..6f81c53c 100644 --- a/src/elexsolver/EITransitionSolver.py +++ b/src/elexsolver/EITransitionSolver.py @@ -117,10 +117,10 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = self._sampled = np.transpose(samples_summed_across / X.sum(axis=0), axes=(1, 2, 0)) posterior_mean_rxc = self._sampled.mean(axis=0) - self._transitions = self._get_transitions(posterior_mean_rxc) + self._transitions = self.__get_transitions(posterior_mean_rxc) return posterior_mean_rxc - def _get_transitions(self, A: np.ndarray): + def __get_transitions(self, A: np.ndarray): # to go from inferred percentages to transitions transitions = [] for col in A.T: @@ -158,5 +158,5 @@ def get_credible_interval(self, ci: float, transitions: bool = False): A_dict[interval][i][j] = np.percentile(self._sampled[:, i, j], interval) if transitions: - return (self._get_transitions(A_dict[lower]), self._get_transitions(A_dict[upper])) + return (self.__get_transitions(A_dict[lower]), self.__get_transitions(A_dict[upper])) return (A_dict[lower], A_dict[upper]) From 160066cabde7ef477e7947aa6d1e4b48d39ec070 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 31 Jan 2024 14:36:21 -0500 Subject: [PATCH 118/135] Adding text to the README: --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fcd98ee0..c56c8f6e 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,13 @@ We have our own implementation of ordinary least squares in Python because this Since we did not find any implementations of quantile regression in Python that fit our needs, we decided to write one ourselves. At the moment this uses two libraries, the version that solves the non-regularized problem uses `numpy`and solves the dual based on [this](https://arxiv.org/pdf/2305.12616.pdf) paper. The version that solves the regularized problem uses [`cvxpy`](https://www.cvxpy.org/#) and sets up the problem as a normal optimization problem. Eventually, we are planning on replacing the regularized version with the dual also. ## Transition matrices -We also have a solver for transition matrices. While this works arbitrarily, we have used this in the past for our primary election night model. We can still use this to create the sankey diagram coefficients. +We have three solvers for transition matrices: + +1. A matrix regression solver built using `cvxpy`; +2. A bootstrapped version of #1; +3. A Bayesian ecological inference solver built using [`pymc`](https://www.pymc.io/) based on [Knudson et al., (2021)](https://doi.org/10.21105/joss.03397) and [Rosen et al., (2001)](https://tinyurl.com/yajkae6n). + +We have used #1 for our primary election model and analysis. The transitions it generates form the transitions displayed in our sankey diagrams, but all three solvers could be used for the same purpose. ## Development We welcome contributions to this repo. Please open a Github issue for any issues or comments you have. From 67dd426c8d8eb9ba8085386532ea24e226396063 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 31 Jan 2024 14:38:10 -0500 Subject: [PATCH 119/135] Capitalizing the 'm' in the section header --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c56c8f6e..51125a2d 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ We have our own implementation of ordinary least squares in Python because this ## Quantile Regression Since we did not find any implementations of quantile regression in Python that fit our needs, we decided to write one ourselves. At the moment this uses two libraries, the version that solves the non-regularized problem uses `numpy`and solves the dual based on [this](https://arxiv.org/pdf/2305.12616.pdf) paper. The version that solves the regularized problem uses [`cvxpy`](https://www.cvxpy.org/#) and sets up the problem as a normal optimization problem. Eventually, we are planning on replacing the regularized version with the dual also. -## Transition matrices +## Transition Matrices We have three solvers for transition matrices: 1. A matrix regression solver built using `cvxpy`; From fe69ce4d39dfffe9c3e434be1cdd300ebe059a16 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 31 Jan 2024 17:13:46 -0500 Subject: [PATCH 120/135] Removing the bootstrap matrix solver from this branch in favor of it living in the ELEX-3830 branch --- setup.py | 2 +- src/elexsolver/TransitionMatrixSolver.py | 91 --------- tests/test_transition_matrix_solver.py | 238 +---------------------- 3 files changed, 2 insertions(+), 329 deletions(-) diff --git a/setup.py b/setup.py index 959e7497..9c1fcf59 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "pymc~=5.10", "scipy~=1.12", "tqdm~=4.66"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "pymc~=5.10", "scipy~=1.12"] THIS_FILE_DIR = os.path.dirname(__file__) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 38ac22e7..f9e6dd58 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -3,7 +3,6 @@ import cvxpy as cp import numpy as np -from tqdm import tqdm from elexsolver.logging import initialize_logging from elexsolver.TransitionSolver import TransitionSolver @@ -107,93 +106,3 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = percentages = self.__solve(X, Y, weights) self._transitions = np.diag(X_expected_totals) @ percentages return percentages - - -class BootstrapTransitionMatrixSolver(TransitionSolver): - """ - Bootstrap version of the matrix regression transition solver. - """ - - def __init__(self, B: int = 1000, strict: bool = True, verbose: bool = True, lam: int | None = None): - """ - Parameters - ---------- - `B` : int, default 1000 - Number of bootstrap samples to draw and matrix solver models to fit/predict. - `strict` : bool, default True - If `True`, solution will be constrainted so that all coefficients are >= 0, - <= 1, and the sum of each row equals 1. - `verbose` : bool, default True - If `False`, this will reduce the amount of logging produced for each of the `B` bootstrap samples. - `lam` : float, optional - `lam` != 0 will enable L2 regularization (Ridge). - """ - super().__init__() - self._strict = strict - self._B = B - self._verbose = verbose - self._lambda = lam - - # class members that are instantiated during model-fit - self._predicted_percentages = None - self._X_expected_totals = None - - def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None) -> np.ndarray: - self._predicted_percentages = [] - - # assuming pandas.DataFrame - if not isinstance(X, np.ndarray): - X = X.to_numpy() - if not isinstance(Y, np.ndarray): - Y = Y.to_numpy() - - self._X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() - - tm = TransitionMatrixSolver(strict=self._strict, lam=self._lambda) - self._predicted_percentages.append(tm.fit_predict(X, Y, weights=weights)) - - for b in tqdm(range(0, self._B - 1), desc="Bootstrapping", disable=not self._verbose): - rng = np.random.default_rng(seed=b) - X_resampled = rng.choice( - X, len(X), replace=True, axis=0, p=(weights / weights.sum() if weights is not None else None) - ) - indices = [np.where((X == x).all(axis=1))[0][0] for x in X_resampled] - Y_resampled = Y[indices] - self._predicted_percentages.append(tm.fit_predict(X_resampled, Y_resampled, weights=None)) - - percentages = np.mean(self._predicted_percentages, axis=0) - self._transitions = np.diag(self._X_expected_totals) @ percentages - return percentages - - def get_confidence_interval(self, alpha: float, transitions: bool = False) -> (np.ndarray, np.ndarray): - """ - Parameters - ---------- - `alpha` : float - Value between [0, 1). If greater than 1, will be divided by 100. - `transitions` : bool, default False - If True, the returned matrices will represent transitions, not percentages. - - Returns - ------- - A tuple of two np.ndarray matrices of float. Element 0 has the lower bound and 1 has the upper bound. - """ - if alpha > 1: - alpha = alpha / 100 - if alpha < 0 or alpha >= 1: - raise ValueError(f"Invalid confidence interval {alpha}.") - - p_lower = ((1.0 - alpha) / 2.0) * 100 - p_upper = ((1.0 + alpha) / 2.0) * 100 - - percentages = ( - np.percentile(self._predicted_percentages, p_lower, axis=0), - np.percentile(self._predicted_percentages, p_upper, axis=0), - ) - - if transitions: - return ( - np.diag(self._X_expected_totals) @ percentages[0], - np.diag(self._X_expected_totals) @ percentages[1], - ) - return percentages diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index a11ffdcd..44d4a4bd 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from elexsolver.TransitionMatrixSolver import BootstrapTransitionMatrixSolver, TransitionMatrixSolver +from elexsolver.TransitionMatrixSolver import TransitionMatrixSolver RTOL = 1e-04 ATOL = 1e-04 @@ -228,239 +228,3 @@ def test_matrix_fit_predict_pandas(): except ImportError: # pass this test through since pandas isn't a requirement for elex-solver assert True - - -def test_bootstrap_fit_predict(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected = np.array([[0.809393, 0.190607], [0.173843, 0.826157]]) - - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - current = btms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) - - -def test_bootstrap_fit_predict_with_weights(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - - expected = np.array([[0.739798, 0.260202], [0.229358, 0.770642]]) - - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - current = btms.fit_predict(X, Y, weights=weights) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) - - -def test_bootstrap_confidence_interval_percentages(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected_lower = np.array([[0.757573, 0.095978], [0.09128, 0.779471]]) - expected_upper = np.array([[0.904022, 0.242427], [0.220529, 0.90872]]) - - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - _ = btms.fit_predict(X, Y) - (current_lower, current_upper) = btms.get_confidence_interval(0.95, transitions=False) - np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) - np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) - - -def test_bootstrap_confidence_interval_greater_than_1(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected_lower = np.array([[0.757573, 0.095978], [0.09128, 0.779471]]) - expected_upper = np.array([[0.904022, 0.242427], [0.220529, 0.90872]]) - - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - _ = btms.fit_predict(X, Y) - (current_lower, current_upper) = btms.get_confidence_interval(95, transitions=False) - np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) - np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) - - -def test_bootstrap_confidence_interval_invalid(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - _ = btms.fit_predict(X, Y) - - with pytest.raises(ValueError): - btms.get_confidence_interval(-34) - - -def test_bootstrap_confidence_interval_transitions(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected_lower = np.array([[0.349649, 0.044297], [0.049151, 0.419715]]) - expected_upper = np.array([[0.417241, 0.111889], [0.118746, 0.489311]]) - - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - _ = btms.fit_predict(X, Y) - (current_lower, current_upper) = btms.get_confidence_interval(0.95, transitions=True) - np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) - np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) - - -def test_bootstrap_get_prediction_interval(): - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - with pytest.raises(NotImplementedError): - btms.get_prediction_interval(0) - - -def test_bootstrap_fit_predict_pandas(): - try: - import pandas # pylint: disable=import-outside-toplevel - - X = pandas.DataFrame( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ], - columns=["x1", "x2"], - ) - - Y = pandas.DataFrame( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ], - columns=["y1", "y2"], - ) - - expected = np.array([[0.809393, 0.190607], [0.173843, 0.826157]]) - - btms = BootstrapTransitionMatrixSolver(B=10, verbose=False) - current = btms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) - - except ImportError: - # pass this test through since pandas isn't a requirement for elex-solver - assert True From e9ef2edb0b74c6a03de5d515b6cd9e7e9e19b765 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 31 Jan 2024 17:14:43 -0500 Subject: [PATCH 121/135] Forgot to remove bootstrap mentions from the README --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 51125a2d..da575283 100644 --- a/README.md +++ b/README.md @@ -17,13 +17,12 @@ We have our own implementation of ordinary least squares in Python because this Since we did not find any implementations of quantile regression in Python that fit our needs, we decided to write one ourselves. At the moment this uses two libraries, the version that solves the non-regularized problem uses `numpy`and solves the dual based on [this](https://arxiv.org/pdf/2305.12616.pdf) paper. The version that solves the regularized problem uses [`cvxpy`](https://www.cvxpy.org/#) and sets up the problem as a normal optimization problem. Eventually, we are planning on replacing the regularized version with the dual also. ## Transition Matrices -We have three solvers for transition matrices: +We have two solvers for transition matrices: 1. A matrix regression solver built using `cvxpy`; -2. A bootstrapped version of #1; -3. A Bayesian ecological inference solver built using [`pymc`](https://www.pymc.io/) based on [Knudson et al., (2021)](https://doi.org/10.21105/joss.03397) and [Rosen et al., (2001)](https://tinyurl.com/yajkae6n). +2. A Bayesian ecological inference solver built using [`pymc`](https://www.pymc.io/) based on [Knudson et al., (2021)](https://doi.org/10.21105/joss.03397) and [Rosen et al., (2001)](https://tinyurl.com/yajkae6n). -We have used #1 for our primary election model and analysis. The transitions it generates form the transitions displayed in our sankey diagrams, but all three solvers could be used for the same purpose. +We have used #1 for our primary election model and analysis. The transitions it generates form the transitions displayed in our sankey diagrams, but all two solvers could be used for the same purpose. ## Development We welcome contributions to this repo. Please open a Github issue for any issues or comments you have. From 92641ac0c16fe150ab20fa0cb7e0e2f2dd38df81 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 1 Feb 2024 09:32:57 -0500 Subject: [PATCH 122/135] Moving the EI solver to elex-voterflow-model as the EI model --- README.md | 7 +- setup.py | 2 +- src/elexsolver/EITransitionSolver.py | 162 ---------------- tests/test_ei_transition_solver.py | 264 --------------------------- 4 files changed, 2 insertions(+), 433 deletions(-) delete mode 100644 src/elexsolver/EITransitionSolver.py delete mode 100644 tests/test_ei_transition_solver.py diff --git a/README.md b/README.md index da575283..93258818 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,7 @@ We have our own implementation of ordinary least squares in Python because this Since we did not find any implementations of quantile regression in Python that fit our needs, we decided to write one ourselves. At the moment this uses two libraries, the version that solves the non-regularized problem uses `numpy`and solves the dual based on [this](https://arxiv.org/pdf/2305.12616.pdf) paper. The version that solves the regularized problem uses [`cvxpy`](https://www.cvxpy.org/#) and sets up the problem as a normal optimization problem. Eventually, we are planning on replacing the regularized version with the dual also. ## Transition Matrices -We have two solvers for transition matrices: - -1. A matrix regression solver built using `cvxpy`; -2. A Bayesian ecological inference solver built using [`pymc`](https://www.pymc.io/) based on [Knudson et al., (2021)](https://doi.org/10.21105/joss.03397) and [Rosen et al., (2001)](https://tinyurl.com/yajkae6n). - -We have used #1 for our primary election model and analysis. The transitions it generates form the transitions displayed in our sankey diagrams, but all two solvers could be used for the same purpose. +We also have a matrix regression solver built with `cvxpy`. We've used this for our primary election model and analysis. The transitions it generates form the transitions displayed in our sankey diagrams. ## Development We welcome contributions to this repo. Please open a Github issue for any issues or comments you have. diff --git a/setup.py b/setup.py index 9c1fcf59..3a74a7a8 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "pymc~=5.10", "scipy~=1.12"] +INSTALL_REQUIRES = ["cvxpy~=1.4", "numpy~=1.26", "scipy~=1.12"] THIS_FILE_DIR = os.path.dirname(__file__) diff --git a/src/elexsolver/EITransitionSolver.py b/src/elexsolver/EITransitionSolver.py deleted file mode 100644 index 6f81c53c..00000000 --- a/src/elexsolver/EITransitionSolver.py +++ /dev/null @@ -1,162 +0,0 @@ -import logging - -import numpy as np -import pymc as pm - -from elexsolver.logging import initialize_logging -from elexsolver.TransitionSolver import TransitionSolver - -initialize_logging() - -LOG = logging.getLogger(__name__) -logging.getLogger("pymc").setLevel(logging.ERROR) -logging.getLogger("jax").setLevel(logging.ERROR) - - -class EITransitionSolver(TransitionSolver): - """ - A transition solver based on RxC ecological inference. - Somewhat adapted from version 1.0.1 of - Knudson et al., (2021). PyEI: A Python package for ecological inference. - Journal of Open Source Software, 6(64), 3397, https://doi.org/10.21105/joss.03397 - See also: - Ori Rosen, Wenxin Jiang, Gary King, and Martin A Tanner. 2001. - “Bayesian and Frequentist Inference for Ecological Inference: The RxC Case.” - Statistica Neerlandica, 55, Pp. 134–156. Copy at https://tinyurl.com/yajkae6n - """ - - def __init__(self, sigma: int = 1, sampling_chains: int = 2, random_seed: int | None = None, n_samples: int = 300): - """ - Parameters - ---------- - `sigma` : int, default 1 - Standard deviation of the half-normal distribution that provides alphas to the Dirichlet distribution. - `sampling_chains` : int, default 2 - The number of sampling chains to run in parallel, each of which will draw `n_samples`. - `random_seed` : int, optional - For seeding the NUTS sampler. - `n_samples` : int, default 300 - The number of samples to draw. Before sampling, the NUTS sampler will be tuned using `n_samples // 2` samples. - """ - super().__init__() - self._sigma = sigma - self._chains = int(sampling_chains) - self._seed = random_seed - self._draws = n_samples - self._tune = n_samples // 2 - - # class members that are instantiated during model-fit - self._sampled = None - self._X_totals = None - - def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): - """ - NOTE: weighting is not currently implemented. - """ - self._check_data_type(X) - self._check_data_type(Y) - self._check_any_element_nan_or_inf(X) - self._check_any_element_nan_or_inf(Y) - - # matrices should be (units x things), where the number of units is > the number of things - if X.shape[1] > X.shape[0]: - X = X.T - if Y.shape[1] > Y.shape[0]: - Y = Y.T - - if X.shape[0] != Y.shape[0]: - raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") - - self._check_dimensions(X) - self._check_dimensions(Y) - self._check_for_zero_units(X) - self._check_for_zero_units(Y) - - self._X_totals = X.sum(axis=0) / X.sum(axis=0).sum() - n = Y.sum(axis=1) - - num_units = len(n) # should be the same as the number of units in Y - num_rows = X.shape[1] # number of things in X that are being transitioned "from" - num_cols = Y.shape[1] # number of things in Y that are being transitioned "to" - - # rescaling and reshaping - X = self._rescale(X) - X_extended = np.expand_dims(X, axis=2) - X_extended = np.repeat(X_extended, num_cols, axis=2) - - with pm.Model(check_bounds=False) as model: - conc_params = pm.HalfNormal("conc_params", sigma=self._sigma, shape=(num_rows, num_cols)) - beta = pm.Dirichlet("beta", a=conc_params, shape=(num_units, num_rows, num_cols)) - theta = (X_extended * beta).sum(axis=1) - pm.Multinomial( - "result_fractions", - n=n, - p=theta, - observed=Y, - shape=(num_units, num_cols), - ) - try: - # DO NOT USE THE NUMPYRO NUTS SAMPLER - # IT IS UNSTABLE - model_trace = pm.sample( - chains=self._chains, - random_seed=self._seed, - cores=self._chains, - draws=self._draws, - tune=self._tune, - ) - except Exception as e: - LOG.debug(model.debug()) - raise e - - b_values = np.transpose( - model_trace["posterior"]["beta"].stack(all_draws=["chain", "draw"]).values, axes=(3, 0, 1, 2) - ) - samples_converted = np.transpose(b_values, axes=(3, 0, 1, 2)) * X - samples_summed_across = samples_converted.sum(axis=2) - self._sampled = np.transpose(samples_summed_across / X.sum(axis=0), axes=(1, 2, 0)) - - posterior_mean_rxc = self._sampled.mean(axis=0) - self._transitions = self.__get_transitions(posterior_mean_rxc) - return posterior_mean_rxc - - def __get_transitions(self, A: np.ndarray): - # to go from inferred percentages to transitions - transitions = [] - for col in A.T: - transitions.append(col * self._X_totals) - return np.array(transitions).T - - def get_credible_interval(self, ci: float, transitions: bool = False): - """ - Parameters - ---------- - `ci` : float - Size of the credible interval [0, 100). If <= 1, will be multiplied by 100. - `transitions` : bool, default False - If True, the returned matrices will represent transitions, not percentages. - - Returns - ------- - A tuple of two np.ndarray matrices of float. Element 0 has the lower bound and 1 has the upper bound. - """ - if ci <= 1: - ci = ci * 100 - if ci < 0 or ci > 100: - raise ValueError(f"Invalid credible interval {ci}.") - - lower = (100 - ci) / 2 - upper = ci + lower - A_dict = { - lower: np.zeros((self._sampled.shape[1], self._sampled.shape[2])), - upper: np.zeros((self._sampled.shape[1], self._sampled.shape[2])), - } - - for interval in [lower, upper]: - for i in range(0, self._sampled.shape[1]): - for j in range(0, self._sampled.shape[2]): - A_dict[interval][i][j] = np.percentile(self._sampled[:, i, j], interval) - - if transitions: - return (self.__get_transitions(A_dict[lower]), self.__get_transitions(A_dict[upper])) - return (A_dict[lower], A_dict[upper]) diff --git a/tests/test_ei_transition_solver.py b/tests/test_ei_transition_solver.py deleted file mode 100644 index 1bf47130..00000000 --- a/tests/test_ei_transition_solver.py +++ /dev/null @@ -1,264 +0,0 @@ -import numpy as np -import pytest - -from elexsolver.EITransitionSolver import EITransitionSolver - -# high tolerance due to random sampling -# (which can produce different outcomes on different architectures, despite setting seeds) -RTOL = 1e-02 -ATOL = 1e-02 - -np.random.seed(1024) - - -def test_ei_fit_predict(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected = np.array([[0.279297, 0.720703], [0.623953, 0.376047]]) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - current = ei.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) - - -def test_ei_fit_predict_with_weights(): - # NOTE: currently, supplying weights to the EI solver does nothing. - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - - expected = np.array([[0.279297, 0.720703], [0.623953, 0.376047]]) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - current = ei.fit_predict(X, Y, weights=weights) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) - - -def test_ei_fit_predict_pivoted(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ).T - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ).T - - expected = np.array([[0.279297, 0.720703], [0.623953, 0.376047]]) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - current = ei.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) - - -def test_ei_fit_predict_bad_dimensions(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - with pytest.raises(ValueError): - ei.fit_predict(X, Y) - - -def test_ei_get_prediction_interval(): - with pytest.raises(NotImplementedError): - ei = EITransitionSolver() - ei.get_prediction_interval(0) - - -def test_ei_credible_interval_percentages(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected_lower = np.array([[0.037212, 0.356174], [0.424652, 0.117605]]) - expected_upper = np.array([[0.643826, 0.962788], [0.882395, 0.575348]]) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - _ = ei.fit_predict(X, Y) - (current_lower, current_upper) = ei.get_credible_interval(99, transitions=False) - np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) - np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) - - -def test_ei_credible_interval_percentages_float_interval(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected_lower = np.array([[0.037212, 0.356174], [0.424652, 0.117605]]) - expected_upper = np.array([[0.643826, 0.962788], [0.882395, 0.575348]]) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - _ = ei.fit_predict(X, Y) - (current_lower, current_upper) = ei.get_credible_interval(0.99, transitions=False) - np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) - np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) - - -def test_ei_credible_interval_invalid(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - _ = ei.fit_predict(X, Y) - - with pytest.raises(ValueError): - ei.get_credible_interval(3467838976, transitions=False) - - -def test_ei_credible_interval_transitions(): - X = np.array( - [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], - ] - ) - - Y = np.array( - [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], - ] - ) - - expected_lower = np.array([[0.017175, 0.164388], [0.228659, 0.063326]]) - expected_upper = np.array([[0.29715, 0.444364], [0.475136, 0.309803]]) - - ei = EITransitionSolver(random_seed=1024, n_samples=10, sampling_chains=1) - _ = ei.fit_predict(X, Y) - (current_lower, current_upper) = ei.get_credible_interval(99, transitions=True) - np.testing.assert_allclose(expected_lower, current_lower, rtol=RTOL, atol=ATOL) - np.testing.assert_allclose(expected_upper, current_upper, rtol=RTOL, atol=ATOL) From 109b718d2cf5dbdadba7317a1a0708d8ad37c32b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 1 Feb 2024 09:42:38 -0500 Subject: [PATCH 123/135] Asking pylint to ignore missing-module-docstring --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 9a638abb..70749624 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,4 +6,4 @@ max-line-length = 160 [pylint] max-line-length = 160 -disable = invalid-name, duplicate-code, missing-function-docstring, too-many-instance-attributes, too-many-arguments, too-many-locals \ No newline at end of file +disable = invalid-name, duplicate-code, missing-function-docstring, too-many-instance-attributes, too-many-arguments, too-many-locals, missing-module-docstring \ No newline at end of file From 70bbaf7ebb61443a253d34cccd820b2414975633 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 6 Feb 2024 15:11:10 -0500 Subject: [PATCH 124/135] When the Clarabel solver fails or throws a warning, chain that to a RuntimeError and raise the exception --- src/elexsolver/TransitionMatrixSolver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index f9e6dd58..07674231 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -66,8 +66,7 @@ def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray) -> np.ndarr try: problem.solve(solver=cp.CLARABEL) except (UserWarning, cp.error.SolverError) as e: - LOG.error(e) - return np.zeros((A.shape[1], B.shape[1])) + raise RuntimeError(e) from e return transition_matrix.value From 1c188834f3736fb2e9e5554f0817f7a87a98f56a Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 18 Mar 2024 15:42:03 -0400 Subject: [PATCH 125/135] Removing get_prediction_interval() method since it hasn't been implemented anywhere yet --- src/elexsolver/TransitionSolver.py | 3 --- tests/test_transition_matrix_solver.py | 6 ------ tests/test_transition_solver.py | 7 ------- 3 files changed, 16 deletions(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 923686d6..8a77966a 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -39,9 +39,6 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = """ raise NotImplementedError - def get_prediction_interval(self, pi: float): - raise NotImplementedError - @property def transitions(self) -> np.ndarray: return self._transitions diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 44d4a4bd..48071e0e 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -159,12 +159,6 @@ def test_matrix_fit_predict_pivoted(): np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) -def test_matrix_get_prediction_interval(): - tms = TransitionMatrixSolver() - with pytest.raises(NotImplementedError): - tms.get_prediction_interval(0) - - def test_matrix_fit_predict_bad_dimensions(): X = np.array( [ diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 692d0c0e..0a201611 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -13,13 +13,6 @@ def test_superclass_fit_predict(): ts.fit_predict(None, None) -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_superclass_get_prediction_interval(): - with pytest.raises(NotImplementedError): - ts = TransitionSolver() - ts.get_prediction_interval(0) - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_superclass_get_transitions(): ts = TransitionSolver() From ffde8aa597775209d5b9f06afa9af145ff1f6246 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Mon, 18 Mar 2024 16:48:19 -0400 Subject: [PATCH 126/135] Moving rules about matrix dimensions that are specific to voterflow over to elex-voterflow-model --- src/elexsolver/TransitionMatrixSolver.py | 15 +++---------- src/elexsolver/TransitionSolver.py | 8 ------- tests/test_transition_matrix_solver.py | 11 +++++++++- tests/test_transition_solver.py | 27 ------------------------ 4 files changed, 13 insertions(+), 48 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 07674231..805766ba 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -75,18 +75,6 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = self._check_data_type(Y) self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) - - # matrices should be (units x things), where the number of units is > the number of things - if X.shape[1] > X.shape[0]: - X = X.T - if Y.shape[1] > Y.shape[0]: - Y = Y.T - - if X.shape[0] != Y.shape[0]: - raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") - - self._check_dimensions(X) - self._check_dimensions(Y) self._check_for_zero_units(X) self._check_for_zero_units(Y) @@ -95,6 +83,9 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = if not isinstance(Y, np.ndarray): Y = Y.to_numpy() + if X.shape[0] != Y.shape[0]: + raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") + X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() X = self._rescale(X) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 8a77966a..ce59a59b 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -54,14 +54,6 @@ def _check_data_type(self, A: np.ndarray): if not np.all(A.astype("int64") == A): raise ValueError("Matrix must contain integers.") - def _check_dimensions(self, A: np.ndarray): - """ - Ensure that in our (units x things) matrix, the number of units is - at least twice as large as the number of things. - """ - if A.shape[0] <= A.shape[1] or (A.shape[0] // 2) <= A.shape[1]: - raise ValueError(f"Not enough units ({A.shape[0]}) relative to the number of things ({A.shape[1]}).") - def _check_for_zero_units(self, A: np.ndarray): """ If we have at least one unit whose columns are all zero, we can't continue. diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 48071e0e..c5a621f1 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -152,7 +152,16 @@ def test_matrix_fit_predict_pivoted(): ] ).T - expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + expected = np.array( + [ + [0.68274443, 0.18437159, 0.06760119, 0.03363495, 0.0197597, 0.01188814], + [0.13541428, 0.48122828, 0.22128163, 0.0960816, 0.04540571, 0.02058852], + [0.04545795, 0.16052607, 0.38881747, 0.27665629, 0.12758087, 0.00096135], + [0.02289342, 0.06401812, 0.17970185, 0.28708764, 0.28820718, 0.15809178], + [0.01424566, 0.03468587, 0.08136858, 0.21299756, 0.26935036, 0.38735196], + [0.00995853, 0.02159863, 0.04337214, 0.1113991, 0.30326763, 0.51040397], + ] + ) tms = TransitionMatrixSolver() current = tms.fit_predict(X, Y) diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 0a201611..3d8030a5 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -34,33 +34,6 @@ def test_check_any_element_nan_or_inf_without_nan(): ts._check_any_element_nan_or_inf(A) # pylint: disable=protected-access -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_check_dimensions_bad(): - with pytest.raises(ValueError): - A = np.array([[0.1, 0.2, 0.3]]) - ts = TransitionSolver() - ts._check_dimensions(A) # pylint: disable=protected-access - - -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_check_dimensions_good(): - A = np.array( - [ - [0.1, 0.2, 0.3], - [0.4, 0.5, 0.6], - [0.7, 0.8, 0.9], - [0.1, 0.2, 0.3], - [0.4, 0.5, 0.6], - [0.7, 0.8, 0.9], - [0.1, 0.2, 0.3], - [0.4, 0.5, 0.6], - [0.7, 0.8, 0.9], - ] - ) - ts = TransitionSolver() - ts._check_dimensions(A) # pylint: disable=protected-access - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_check_for_zero_units_good(): A = np.array( From 71f29c9a35d36527393c7a4f28f404e2dc8bb7bb Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 19 Mar 2024 11:24:33 -0400 Subject: [PATCH 127/135] Splitting fit_predict() into two methods, renaming 'percentages' to 'self._betas', renaming 'weights' to 'sample_weight' to be consistent with sklearn --- src/elexsolver/TransitionMatrixSolver.py | 10 ++--- src/elexsolver/TransitionSolver.py | 37 +++++++++++++--- tests/test_transition_matrix_solver.py | 56 +++++++++++++----------- tests/test_transition_solver.py | 17 ++++++- 4 files changed, 83 insertions(+), 37 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 805766ba..acdbe035 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -70,7 +70,7 @@ def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray) -> np.ndarr return transition_matrix.value - def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None) -> np.ndarray: + def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None) -> np.ndarray: self._check_data_type(X) self._check_data_type(Y) self._check_any_element_nan_or_inf(X) @@ -91,8 +91,8 @@ def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = X = self._rescale(X) Y = self._rescale(Y) - weights = self._check_and_prepare_weights(X, Y, weights) + weights = self._check_and_prepare_weights(X, Y, sample_weight) - percentages = self.__solve(X, Y, weights) - self._transitions = np.diag(X_expected_totals) @ percentages - return percentages + self._betas = self.__solve(X, Y, weights) + self._transitions = np.diag(X_expected_totals) @ self._betas + return self diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index ce59a59b..034c5827 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -17,32 +17,59 @@ class TransitionSolver(ABC): """ def __init__(self): + self._betas = None self._transitions = None - def fit_predict(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None = None): + def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None): """ - After this method finishes, transitions will be available in the `transitions` class member. - Parameters ---------- `X` : np.ndarray matrix or pandas.DataFrame of int Must have the same number of rows as `Y` but can have any number of columns greater than the number of rows. `Y` : np.ndarray matrix or pandas.DataFrame of int Must have the same number of rows as `X` but can have any number of columns greater than the number of rows. - `weights` : list, np.ndarray, or pandas.Series of int, optional + `sample_weight` : list, np.ndarray, or pandas.Series of int, optional Must have the same length (number of rows) as both `X` and `Y`. Returns ------- - np.ndarray matrix of float of shape (number of columns in `X`) x (number of columns in `Y`). + `self` and populates `betas` with the beta coefficients determined by this solver. + `betas` is an np.ndarray matrix of float of shape (number of columns in `X`) x (number of columns in `Y`). Each float represents the percent of how much of row x is part of column y. """ raise NotImplementedError + def predict(self, X: np.ndarray): + """ + Parameters + ---------- + `X` : np.ndarray matrix or pandas.DataFrame of int + Must have the same dimensions as the `X` supplied to `fit()`. + + Returns + ------- + `Y_hat`, np.ndarray of float of the same shape as Y. + """ + if self._betas is None: + raise RuntimeError("Solver must be fit before prediction can be performed.") + return X @ self._betas + @property def transitions(self) -> np.ndarray: return self._transitions + @property + def betas(self) -> np.ndarray: + """ + Returns + ------- + The solved coefficients, an np.ndarray matrix of float of shape + (number of columns in `X`) x (number of columns in `Y`). + Each float represents the percent of how much of row x is part of column y. + Will return `None` if `fit()` hasn't been called yet. + """ + return self._betas + def _check_any_element_nan_or_inf(self, A: np.ndarray): """ Check whether any element in a matrix or vector is NaN or infinity diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index c5a621f1..4317f89c 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -30,11 +30,22 @@ def test_matrix_fit_predict(): ] ) - expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + expected_betas = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + expected_yhat = np.array( + [ + [1.19371187, 1.80628813], + [3.14785177, 3.85214823], + [5.10199167, 5.89800833], + [7.05613156, 7.94386844], + [9.01027146, 9.98972854], + [10.96441136, 12.03558864], + ] + ) - tms = TransitionMatrixSolver() - current = tms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + tms = TransitionMatrixSolver().fit(X, Y) + current_yhat = tms.predict(X) + np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_yhat, current_yhat, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_with_weights(): @@ -62,11 +73,10 @@ def test_matrix_fit_predict_with_weights(): weights = np.array([500, 250, 125, 62.5, 31.25, 15.625]) - expected = np.array([[0.737329, 0.262671], [0.230589, 0.769411]]) + expected_betas = np.array([[0.737329, 0.262671], [0.230589, 0.769411]]) - tms = TransitionMatrixSolver() - current = tms.fit_predict(X, Y, weights=weights) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + tms = TransitionMatrixSolver().fit(X, Y, sample_weight=weights) + np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_not_strict(): @@ -92,11 +102,10 @@ def test_matrix_fit_predict_not_strict(): ] ) - expected = np.array([[0.760451, 0.239558], [0.216624, 0.783369]]) + expected_betas = np.array([[0.760451, 0.239558], [0.216624, 0.783369]]) - tms = TransitionMatrixSolver(strict=False) - current = tms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + tms = TransitionMatrixSolver(strict=False).fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) def test_ridge_matrix_fit_predict(): @@ -122,11 +131,10 @@ def test_ridge_matrix_fit_predict(): ] ) - expected = np.array([[0.479416, 0.520584], [0.455918, 0.544082]]) + expected_betas = np.array([[0.479416, 0.520584], [0.455918, 0.544082]]) - tms = TransitionMatrixSolver(lam=1) - current = tms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + tms = TransitionMatrixSolver(lam=1).fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_pivoted(): @@ -152,7 +160,7 @@ def test_matrix_fit_predict_pivoted(): ] ).T - expected = np.array( + expected_betas = np.array( [ [0.68274443, 0.18437159, 0.06760119, 0.03363495, 0.0197597, 0.01188814], [0.13541428, 0.48122828, 0.22128163, 0.0960816, 0.04540571, 0.02058852], @@ -163,9 +171,8 @@ def test_matrix_fit_predict_pivoted(): ] ) - tms = TransitionMatrixSolver() - current = tms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + tms = TransitionMatrixSolver().fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_bad_dimensions(): @@ -191,7 +198,7 @@ def test_matrix_fit_predict_bad_dimensions(): tms = TransitionMatrixSolver() with pytest.raises(ValueError): - tms.fit_predict(X, Y) + tms.fit(X, Y) def test_matrix_fit_predict_pandas(): @@ -222,11 +229,10 @@ def test_matrix_fit_predict_pandas(): columns=["y1", "y2"], ) - expected = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) + expected_betas = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) - tms = TransitionMatrixSolver() - current = tms.fit_predict(X, Y) - np.testing.assert_allclose(expected, current, rtol=RTOL, atol=ATOL) + tms = TransitionMatrixSolver().fit(X, Y) + np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) except ImportError: # pass this test through since pandas isn't a requirement for elex-solver diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 3d8030a5..1d1b82d5 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -7,10 +7,17 @@ @patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_superclass_fit_predict(): +def test_superclass_fit(): with pytest.raises(NotImplementedError): ts = TransitionSolver() - ts.fit_predict(None, None) + ts.fit(None, None) + + +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_predict(): + with pytest.raises(RuntimeError): + ts = TransitionSolver() + ts.predict(None) @patch.object(TransitionSolver, "__abstractmethods__", set()) @@ -19,6 +26,12 @@ def test_superclass_get_transitions(): assert ts.transitions is None +@patch.object(TransitionSolver, "__abstractmethods__", set()) +def test_superclass_get_betas(): + ts = TransitionSolver() + assert ts.betas is None + + @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_check_any_element_nan_or_inf_with_nan(): with pytest.raises(ValueError): From b5a2e07c59179c84baf0fe0dc29cdd20611ee134 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 19 Mar 2024 16:42:39 -0400 Subject: [PATCH 128/135] Specifying return type for TransitionSolver.predict() --- src/elexsolver/TransitionSolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 034c5827..7bf83d2f 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -39,7 +39,7 @@ def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = N """ raise NotImplementedError - def predict(self, X: np.ndarray): + def predict(self, X: np.ndarray) -> np.ndarray: """ Parameters ---------- From 73f0c9e3cf1f8c8405b006c43d90114a2172480d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 19 Mar 2024 16:44:38 -0400 Subject: [PATCH 129/135] A version of TransitionSolver that inherets from LinearSolver --- src/elexsolver/TransitionMatrixSolver.py | 4 ++-- src/elexsolver/TransitionSolver.py | 19 ++++++------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index acdbe035..f9d8d4d6 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -93,6 +93,6 @@ def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = N weights = self._check_and_prepare_weights(X, Y, sample_weight) - self._betas = self.__solve(X, Y, weights) - self._transitions = np.diag(X_expected_totals) @ self._betas + self.coefficients = self.__solve(X, Y, weights) + self._transitions = np.diag(X_expected_totals) @ self.coefficients return self diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 7bf83d2f..743f797c 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -1,23 +1,23 @@ import logging import warnings -from abc import ABC import numpy as np from elexsolver.logging import initialize_logging +from elexsolver.LinearSolver import LinearSolver initialize_logging() LOG = logging.getLogger(__name__) -class TransitionSolver(ABC): +class TransitionSolver(LinearSolver): """ Abstract class for transition solvers. """ def __init__(self): - self._betas = None + super().__init__() self._transitions = None def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None): @@ -50,9 +50,9 @@ def predict(self, X: np.ndarray) -> np.ndarray: ------- `Y_hat`, np.ndarray of float of the same shape as Y. """ - if self._betas is None: + if self.coefficients is None: raise RuntimeError("Solver must be fit before prediction can be performed.") - return X @ self._betas + return X @ self.coefficients @property def transitions(self) -> np.ndarray: @@ -68,14 +68,7 @@ def betas(self) -> np.ndarray: Each float represents the percent of how much of row x is part of column y. Will return `None` if `fit()` hasn't been called yet. """ - return self._betas - - def _check_any_element_nan_or_inf(self, A: np.ndarray): - """ - Check whether any element in a matrix or vector is NaN or infinity - """ - if np.any(np.isnan(A)) or np.any(np.isinf(A)): - raise ValueError("Matrix contains NaN or Infinity.") + return self.coefficients def _check_data_type(self, A: np.ndarray): if not np.all(A.astype("int64") == A): From 2705e8bcab70263cda58f744c96a49ca98819b9d Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Tue, 19 Mar 2024 16:46:09 -0400 Subject: [PATCH 130/135] Forgot to run pre-commit --- src/elexsolver/TransitionSolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 743f797c..3108bb80 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -3,8 +3,8 @@ import numpy as np -from elexsolver.logging import initialize_logging from elexsolver.LinearSolver import LinearSolver +from elexsolver.logging import initialize_logging initialize_logging() From 268ad1563d868d8251f9711184e9f9ba5095b920 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 20 Mar 2024 14:16:57 -0400 Subject: [PATCH 131/135] Removing math to convert the coefficients to 'transitions' from TransitionSolver/TransitionMatrixSolver --- src/elexsolver/TransitionMatrixSolver.py | 3 --- src/elexsolver/TransitionSolver.py | 5 ----- tests/test_transition_solver.py | 6 ------ 3 files changed, 14 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index f9d8d4d6..5e8925e2 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -86,13 +86,10 @@ def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = N if X.shape[0] != Y.shape[0]: raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") - X_expected_totals = X.sum(axis=0) / X.sum(axis=0).sum() - X = self._rescale(X) Y = self._rescale(Y) weights = self._check_and_prepare_weights(X, Y, sample_weight) self.coefficients = self.__solve(X, Y, weights) - self._transitions = np.diag(X_expected_totals) @ self.coefficients return self diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 3108bb80..a2c6db39 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -18,7 +18,6 @@ class TransitionSolver(LinearSolver): def __init__(self): super().__init__() - self._transitions = None def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None): """ @@ -54,10 +53,6 @@ def predict(self, X: np.ndarray) -> np.ndarray: raise RuntimeError("Solver must be fit before prediction can be performed.") return X @ self.coefficients - @property - def transitions(self) -> np.ndarray: - return self._transitions - @property def betas(self) -> np.ndarray: """ diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 1d1b82d5..9bf3af53 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -20,12 +20,6 @@ def test_superclass_predict(): ts.predict(None) -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_superclass_get_transitions(): - ts = TransitionSolver() - assert ts.transitions is None - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_superclass_get_betas(): ts = TransitionSolver() From f714ef4f155deca20f03adb0ebefbfcaa3760c19 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 22 Mar 2024 15:36:56 -0400 Subject: [PATCH 132/135] Cleaning up some docstring formatting and removing unnecessary property --- src/elexsolver/TransitionMatrixSolver.py | 8 +++--- src/elexsolver/TransitionSolver.py | 34 +++++++++++------------- tests/test_transition_matrix_solver.py | 12 ++++----- tests/test_transition_solver.py | 4 +-- 4 files changed, 28 insertions(+), 30 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 5e8925e2..03da3e2f 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -21,11 +21,11 @@ def __init__(self, strict: bool = True, lam: float | None = None): """ Parameters ---------- - `strict` : bool, default True - If True, solution will be constrainted so that all coefficients are >= 0, + strict : bool, default True + If `True`, solution will be constrainted so that all coefficients are >= 0, <= 1, and the sum of each row equals 1. - `lam` : float, optional - `lam` != 0 will enable L2 regularization (Ridge). + lam : float, optional + `lam != 0` will enable L2 regularization (Ridge). """ super().__init__() self._strict = strict diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index a2c6db39..bb70949a 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -17,17 +17,23 @@ class TransitionSolver(LinearSolver): """ def __init__(self): + """ + After model-fit, `self.coefficients` will contain + the solved coefficients, an np.ndarray matrix of float of shape + (number of columns in `X`) x (number of columns in `Y`). + Each float represents the percent of how much of row x is part of column y. + """ super().__init__() def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None): """ Parameters ---------- - `X` : np.ndarray matrix or pandas.DataFrame of int + X : np.ndarray matrix or pandas.DataFrame of int Must have the same number of rows as `Y` but can have any number of columns greater than the number of rows. - `Y` : np.ndarray matrix or pandas.DataFrame of int + Y : np.ndarray matrix or pandas.DataFrame of int Must have the same number of rows as `X` but can have any number of columns greater than the number of rows. - `sample_weight` : list, np.ndarray, or pandas.Series of int, optional + sample_weight : list or np.ndarray or pandas.Series of int, optional Must have the same length (number of rows) as both `X` and `Y`. Returns @@ -42,7 +48,7 @@ def predict(self, X: np.ndarray) -> np.ndarray: """ Parameters ---------- - `X` : np.ndarray matrix or pandas.DataFrame of int + X : np.ndarray matrix or pandas.DataFrame of int Must have the same dimensions as the `X` supplied to `fit()`. Returns @@ -53,19 +59,11 @@ def predict(self, X: np.ndarray) -> np.ndarray: raise RuntimeError("Solver must be fit before prediction can be performed.") return X @ self.coefficients - @property - def betas(self) -> np.ndarray: + def _check_data_type(self, A: np.ndarray): """ - Returns - ------- - The solved coefficients, an np.ndarray matrix of float of shape - (number of columns in `X`) x (number of columns in `Y`). - Each float represents the percent of how much of row x is part of column y. - Will return `None` if `fit()` hasn't been called yet. + Make sure we're starting with count data which we'll standardize to percentages + by calling `self._rescale(A)` later. """ - return self.coefficients - - def _check_data_type(self, A: np.ndarray): if not np.all(A.astype("int64") == A): raise ValueError("Matrix must contain integers.") @@ -97,9 +95,9 @@ def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.n Parameters ---------- - `X` : np.ndarray matrix of int (same number of rows as `Y`) - `Y` : np.ndarray matrix of int (same number of rows as `X`) - `weights` : np.ndarray of int of the shape (number of rows in `X` and `Y`, 1), optional + X : np.ndarray matrix of int (same number of rows as `Y`) + Y : np.ndarray matrix of int (same number of rows as `X`) + weights : np.ndarray of int of the shape (number of rows in `X` and `Y`, 1), optional """ if weights is not None: diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 4317f89c..7ab73e3b 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -44,7 +44,7 @@ def test_matrix_fit_predict(): tms = TransitionMatrixSolver().fit(X, Y) current_yhat = tms.predict(X) - np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) np.testing.assert_allclose(expected_yhat, current_yhat, rtol=RTOL, atol=ATOL) @@ -76,7 +76,7 @@ def test_matrix_fit_predict_with_weights(): expected_betas = np.array([[0.737329, 0.262671], [0.230589, 0.769411]]) tms = TransitionMatrixSolver().fit(X, Y, sample_weight=weights) - np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_not_strict(): @@ -105,7 +105,7 @@ def test_matrix_fit_predict_not_strict(): expected_betas = np.array([[0.760451, 0.239558], [0.216624, 0.783369]]) tms = TransitionMatrixSolver(strict=False).fit(X, Y) - np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) def test_ridge_matrix_fit_predict(): @@ -134,7 +134,7 @@ def test_ridge_matrix_fit_predict(): expected_betas = np.array([[0.479416, 0.520584], [0.455918, 0.544082]]) tms = TransitionMatrixSolver(lam=1).fit(X, Y) - np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_pivoted(): @@ -172,7 +172,7 @@ def test_matrix_fit_predict_pivoted(): ) tms = TransitionMatrixSolver().fit(X, Y) - np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) def test_matrix_fit_predict_bad_dimensions(): @@ -232,7 +232,7 @@ def test_matrix_fit_predict_pandas(): expected_betas = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) tms = TransitionMatrixSolver().fit(X, Y) - np.testing.assert_allclose(expected_betas, tms.betas, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) except ImportError: # pass this test through since pandas isn't a requirement for elex-solver diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 9bf3af53..2adaf03f 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -21,9 +21,9 @@ def test_superclass_predict(): @patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_superclass_get_betas(): +def test_superclass_get_coefficients(): ts = TransitionSolver() - assert ts.betas is None + assert ts.coefficients is None @patch.object(TransitionSolver, "__abstractmethods__", set()) From 4edb1fd46b01dbb2f3d0668394eeb5d5b9ccb511 Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Wed, 27 Mar 2024 16:48:29 -0400 Subject: [PATCH 133/135] BROKEN testing the requirement of integer data and the forced-rescaling here --- src/elexsolver/TransitionMatrixSolver.py | 5 ---- src/elexsolver/TransitionSolver.py | 21 -------------- tests/test_transition_solver.py | 37 ------------------------ 3 files changed, 63 deletions(-) diff --git a/src/elexsolver/TransitionMatrixSolver.py b/src/elexsolver/TransitionMatrixSolver.py index 03da3e2f..95e31252 100644 --- a/src/elexsolver/TransitionMatrixSolver.py +++ b/src/elexsolver/TransitionMatrixSolver.py @@ -71,8 +71,6 @@ def __solve(self, A: np.ndarray, B: np.ndarray, weights: np.ndarray) -> np.ndarr return transition_matrix.value def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = None) -> np.ndarray: - self._check_data_type(X) - self._check_data_type(Y) self._check_any_element_nan_or_inf(X) self._check_any_element_nan_or_inf(Y) self._check_for_zero_units(X) @@ -86,9 +84,6 @@ def fit(self, X: np.ndarray, Y: np.ndarray, sample_weight: np.ndarray | None = N if X.shape[0] != Y.shape[0]: raise ValueError(f"Number of units in X ({X.shape[0]}) != number of units in Y ({Y.shape[0]}).") - X = self._rescale(X) - Y = self._rescale(Y) - weights = self._check_and_prepare_weights(X, Y, sample_weight) self.coefficients = self.__solve(X, Y, weights) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index bb70949a..77e26162 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -1,5 +1,4 @@ import logging -import warnings import numpy as np @@ -59,14 +58,6 @@ def predict(self, X: np.ndarray) -> np.ndarray: raise RuntimeError("Solver must be fit before prediction can be performed.") return X @ self.coefficients - def _check_data_type(self, A: np.ndarray): - """ - Make sure we're starting with count data which we'll standardize to percentages - by calling `self._rescale(A)` later. - """ - if not np.all(A.astype("int64") == A): - raise ValueError("Matrix must contain integers.") - def _check_for_zero_units(self, A: np.ndarray): """ If we have at least one unit whose columns are all zero, we can't continue. @@ -74,18 +65,6 @@ def _check_for_zero_units(self, A: np.ndarray): if np.any(np.sum(A, axis=1) == 0): raise ValueError("Matrix cannot contain any rows (units) where all columns (things) are zero.") - def _rescale(self, A: np.ndarray) -> np.ndarray: - """ - Rescale rows (units) to ensure they sum to 1 (100%). - """ - A = A.copy().astype(float) - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in divide") - A = (A.T / A.sum(axis=1)).T - - return np.nan_to_num(A, nan=0, posinf=0, neginf=0) - def _check_and_prepare_weights(self, X: np.ndarray, Y: np.ndarray, weights: np.ndarray | None) -> np.ndarray: """ If `weights` is not None, and `weights` has the same number of rows in both matrices `X` and `Y`, diff --git a/tests/test_transition_solver.py b/tests/test_transition_solver.py index 2adaf03f..b7bc9aa3 100644 --- a/tests/test_transition_solver.py +++ b/tests/test_transition_solver.py @@ -68,43 +68,6 @@ def test_check_for_zero_units_bad(): ts._check_for_zero_units(A) # pylint: disable=protected-access -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_rescale_rescaled_numpy(): - A = np.ones((2, 2)).astype(int) - expected = np.array([[0.5, 0.5], [0.5, 0.5]]) - ts = TransitionSolver() - np.testing.assert_array_equal(ts._rescale(A), expected) # pylint: disable=protected-access - - -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_rescale_rescaled_pandas(): - try: - import pandas # pylint: disable=import-outside-toplevel - - a_df = pandas.DataFrame(np.ones((2, 2)), columns=["A", "B"]).astype(int) - expected_df = pandas.DataFrame([[0.5, 0.5], [0.5, 0.5]], columns=["A", "B"]) - ts = TransitionSolver() - np.testing.assert_array_equal(expected_df, ts._rescale(a_df)) # pylint: disable=protected-access - except ImportError: - # pass this test through since pandas isn't a requirement for elex-solver - assert True - - -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_check_data_type_good(): - A = np.array([[1, 2, 3], [4, 5, 6]]) - ts = TransitionSolver() - ts._check_data_type(A) # pylint: disable=protected-access - - -@patch.object(TransitionSolver, "__abstractmethods__", set()) -def test_check_data_type_bad(): - with pytest.raises(ValueError): - A = np.array([[0.1, 0.2, 0.3]]) - ts = TransitionSolver() - ts._check_data_type(A) # pylint: disable=protected-access - - @patch.object(TransitionSolver, "__abstractmethods__", set()) def test_check_and_prepare_weights_bad(): with pytest.raises(ValueError): From 9beef4885ccb496f2613d61fdd99603caefdb88b Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 28 Mar 2024 11:28:04 -0400 Subject: [PATCH 134/135] FIXED updated unit tests; integer data is no longer a requirement for transition solvers :tada: --- tests/test_transition_matrix_solver.py | 175 +++++++++++++++---------- 1 file changed, 108 insertions(+), 67 deletions(-) diff --git a/tests/test_transition_matrix_solver.py b/tests/test_transition_matrix_solver.py index 7ab73e3b..5796c887 100644 --- a/tests/test_transition_matrix_solver.py +++ b/tests/test_transition_matrix_solver.py @@ -7,7 +7,7 @@ ATOL = 1e-04 -def test_matrix_fit_predict(): +def test_matrix_fit_predict_with_integers(): X = np.array( [ [1, 2], @@ -30,15 +30,56 @@ def test_matrix_fit_predict(): ] ) + expected_betas = np.array([[9.99831808e-01, 1.68191521e-04], [1.49085896e-04, 9.99850914e-01]]) + expected_yhat = np.array( + [ + [1.00012998, 1.99987002], + [3.00009177, 3.99990823], + [5.00005356, 5.99994644], + [7.00001535, 7.99998465], + [8.99997714, 10.00002286], + [10.99993892, 12.00006108], + ] + ) + + tms = TransitionMatrixSolver().fit(X, Y) + current_yhat = tms.predict(X) + np.testing.assert_allclose(expected_betas, tms.coefficients, rtol=RTOL, atol=ATOL) + np.testing.assert_allclose(expected_yhat, current_yhat, rtol=RTOL, atol=ATOL) + + +def test_matrix_fit_predict(): + X = np.array( + [ + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], + ] + ) + + Y = np.array( + [ + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], + ] + ) + expected_betas = np.array([[0.760428, 0.239572], [0.216642, 0.783358]]) expected_yhat = np.array( [ - [1.19371187, 1.80628813], - [3.14785177, 3.85214823], - [5.10199167, 5.89800833], - [7.05613156, 7.94386844], - [9.01027146, 9.98972854], - [10.96441136, 12.03558864], + [0.39790396, 0.60209604], + [0.44969311, 0.55030689], + [0.46381742, 0.53618258], + [0.47040877, 0.52959123], + [0.47422481, 0.52577519], + [0.47671354, 0.52328646], ] ) @@ -51,23 +92,23 @@ def test_matrix_fit_predict(): def test_matrix_fit_predict_with_weights(): X = np.array( [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], ] ) Y = np.array( [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], ] ) @@ -82,23 +123,23 @@ def test_matrix_fit_predict_with_weights(): def test_matrix_fit_predict_not_strict(): X = np.array( [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], ] ) Y = np.array( [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], ] ) @@ -111,23 +152,23 @@ def test_matrix_fit_predict_not_strict(): def test_ridge_matrix_fit_predict(): X = np.array( [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], ] ) Y = np.array( [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], ] ) @@ -162,12 +203,12 @@ def test_matrix_fit_predict_pivoted(): expected_betas = np.array( [ - [0.68274443, 0.18437159, 0.06760119, 0.03363495, 0.0197597, 0.01188814], - [0.13541428, 0.48122828, 0.22128163, 0.0960816, 0.04540571, 0.02058852], - [0.04545795, 0.16052607, 0.38881747, 0.27665629, 0.12758087, 0.00096135], - [0.02289342, 0.06401812, 0.17970185, 0.28708764, 0.28820718, 0.15809178], - [0.01424566, 0.03468587, 0.08136858, 0.21299756, 0.26935036, 0.38735196], - [0.00995853, 0.02159863, 0.04337214, 0.1113991, 0.30326763, 0.51040397], + [9.99706e-01, 1.85000e-04, 5.00000e-05, 2.80000e-05, 1.90000e-05, 1.30000e-05], + [4.80000e-05, 9.99464e-01, 3.43000e-04, 8.10000e-05, 4.00000e-05, 2.40000e-05], + [1.70000e-05, 1.56000e-04, 9.99188e-01, 4.86000e-04, 1.06000e-04, 4.70000e-05], + [1.00000e-05, 4.60000e-05, 2.76000e-04, 9.98960e-01, 5.93000e-04, 1.14000e-04], + [7.00000e-06, 2.40000e-05, 7.40000e-05, 3.88000e-04, 9.98887e-01, 6.20000e-04], + [5.00000e-06, 1.50000e-05, 3.60000e-05, 9.70000e-05, 4.66000e-04, 9.99382e-01], ] ) @@ -178,12 +219,12 @@ def test_matrix_fit_predict_pivoted(): def test_matrix_fit_predict_bad_dimensions(): X = np.array( [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], ] ) @@ -207,24 +248,24 @@ def test_matrix_fit_predict_pandas(): X = pandas.DataFrame( [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [0.33333333, 0.66666667], + [0.42857143, 0.57142857], + [0.45454545, 0.54545455], + [0.46666667, 0.53333333], + [0.47368421, 0.52631579], + [0.47826087, 0.52173913], ], columns=["x1", "x2"], ) Y = pandas.DataFrame( [ - [2, 3], - [4, 5], - [6, 7], - [8, 9], - [10, 11], - [12, 13], + [0.4, 0.6], + [0.44444444, 0.55555556], + [0.46153846, 0.53846154], + [0.47058824, 0.52941176], + [0.47619048, 0.52380952], + [0.48, 0.52], ], columns=["y1", "y2"], ) From 06385f02e4b867b6b33c70af3b93fd8078d0cfad Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Thu, 28 Mar 2024 11:33:42 -0400 Subject: [PATCH 135/135] Adding more information to why self._check_for_zero_units() is important --- src/elexsolver/TransitionSolver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/elexsolver/TransitionSolver.py b/src/elexsolver/TransitionSolver.py index 77e26162..b55ffae9 100644 --- a/src/elexsolver/TransitionSolver.py +++ b/src/elexsolver/TransitionSolver.py @@ -56,11 +56,14 @@ def predict(self, X: np.ndarray) -> np.ndarray: """ if self.coefficients is None: raise RuntimeError("Solver must be fit before prediction can be performed.") + + self._check_any_element_nan_or_inf(X) + return X @ self.coefficients def _check_for_zero_units(self, A: np.ndarray): """ - If we have at least one unit whose columns are all zero, we can't continue. + If we have at least one unit whose columns are all zero, most if not all of our solvers will fail. """ if np.any(np.sum(A, axis=1) == 0): raise ValueError("Matrix cannot contain any rows (units) where all columns (things) are zero.")