diff --git a/SystemUpdates/ModelDefinitions.py b/SystemUpdates/ModelDefinitions.py
index b2f5634..26591c8 100644
--- a/SystemUpdates/ModelDefinitions.py
+++ b/SystemUpdates/ModelDefinitions.py
@@ -1,128 +1,14 @@
 # The ModelList is a list of dictionaries that define a range of models for the project
-import sys
-# sys.path.append('../')
-sys.path.append('../Tools')
-#sys.path.append('../Intermediates')
-# sklearn
+import settings
+
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import GradientBoostingRegressor
-from sklearn.ensemble import HistGradientBoostingRegressor
-from sklearn.ensemble import HistGradientBoostingClassifier
-from sklearn.ensemble import AdaBoostRegressor
-from sklearn import linear_model
-from sklearn.metrics import mean_squared_error
-from sklearn import preprocessing
-from sklearn.linear_model import ElasticNet
-from sklearn.datasets import make_regression
-
-from xgboost import XGBRegressor
-from xgboost import XGBClassifier
-from xgboost import XGBRFRegressor, XGBRFClassifier
-
-from lightgbm import LGBMClassifier, LGBMRegressor
-
-from ViewsEstimators import *
-
-class FixedFirstSplitRegression(BaseEstimator):
-    """ Regression model which makes the first split according to a specified feature and then splits according to other
-    algorithms. The model optimizes onset-situation predictions by fitting a two-part model and combining predictions:
-        1) binary classifier
-        2) continuous regression
-    Implementeted as a valid sklearn estimator, so it can be used in pipelines and GridSearch objects.
-    Args:
-        ones_name: model to estimate if z variable is one (e.g. "onset")
-        zeros_name: model to estimate if z variable is zeros (e.g. "continuation")
-        ones_params: dict of parameters to pass to "ones" sub-model when initialized
-        zeros_params: dict of parameters to pass to "zeros" sub-model when initialized
-    """
-
-    def __init__(self,
-                 ones_name: str = 'RFRegressor',
-                 zeros_name: str = 'RFRegressor',
-                 ones_indicator: str = '',
-                 ones_params: Optional[dict] = None,
-                 zeros_params: Optional[dict] = None):
-
-        self.ones_name = ones_name
-        self.zeros_name = zeros_name
-        self.ones_indicator = ones_indicator
-        self.ones_params = ones_params
-        self.zeros_params = zeros_params
-        self.ones_fi = []
-        self.zeros_fi = []
-
-    @staticmethod
-    def _resolve_estimator(func_name: str):
-        """ Lookup table for supported estimators.
-        This is necessary because sklearn estimator default arguments
-        must pass equality test, and instantiated sub-estimators are not equal. """
-
-        funcs = {'linear': LinearRegression(),
-                 'logistic': LogisticRegression(solver='liblinear'),
-                 'LGBMRegressor': LGBMRegressor(n_estimators=250),
-                 'LGBMClassifier': LGBMClassifier(n_estimators=250),
-                 'RFRegressor': XGBRFRegressor(n_estimators=250,n_jobs=-2),
-                 'RFClassifier': XGBRFClassifier(n_estimators=250,n_jobs=-2),
-                 'GBMRegressor': GradientBoostingRegressor(n_estimators=200),
-                 'GBMClassifier': GradientBoostingClassifier(n_estimators=200),
-                 'XGBRegressor': XGBRegressor(n_estimators=100,learning_rate=0.05,n_jobs=-2),
-                 'XGBClassifier': XGBClassifier(n_estimators=100,learning_rate=0.05,n_jobs=-2),
-                 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200),
-                 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200),
-                 }
-
-        return funcs[func_name]
-
-    def fit(self,
-            X: Union[np.ndarray, pd.DataFrame],
-            y: Union[np.ndarray, pd.Series],
-            z: Union[np.ndarray, pd.Series]):
-        X, y = check_X_y(X, y, dtype=None,
-                         accept_sparse=False,
-                         accept_large_sparse=False,
-                         force_all_finite='allow-nan')
-        z = X[ones_indicator]
-
-        if X.shape[1] < 2:
-            raise ValueError('Cannot fit model when n_features = 1')
-
-        self.ones_ = self._resolve_estimator(self.ones_name)
-        if self.ones_params:
-            self.ones_.set_params(**self.ones_params)
-        self.ones_.fit(X[z==1], y[z==1])
-        self.ones_fi = self.ones_.feature_importances_
-
-        self.zeros_ = self._resolve_estimator(self.zeros_name)
-        if self.zeros_params:
-            self.zeros_.set_params(**self.zeros_params)
-        self.zeros_.fit(X[z==0], y[z==0])
-        self.zeros_fi = self.zeros_.feature_importances_
-
-        self.is_fitted_ = True
-        return self
-
-
-    def predict(self, X: Union[np.ndarray, pd.DataFrame]):
-#    def predict_expected_value(self, X: Union[np.ndarray, pd.DataFrame]):
-        """ Predict combined response using probabilistic classification outcome """
-        X = check_array(X, accept_sparse=False, accept_large_sparse=False)
-        check_is_fitted(self, 'is_fitted_')
-#        predict =
-        return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)
-
-def manual_test():
-    """ Validate estimator using sklearn's provided utility and ensure it can fit and predict on fake dataset. """
-    check_estimator(HurdleRegression)
-    from sklearn.datasets import make_regression
-    X, y = make_regression()
-    reg = FixedFirstSplitRegression()
-    reg.fit(X, y)
-    reg.predict(X)
-
-
+from xgboost import XGBRegressor, XGBRFRegressor
+from lightgbm import LGBMRegressor
+from Tools.models.hurdle_regression_model import HurdleRegression
+from Tools.models.fixed_first_split_regression_model import FixedFirstSplitRegression
 
 
 def DefineEnsembleModels(level):
@@ -131,18 +17,18 @@ def DefineEnsembleModels(level):
 
     if level == 'cm':
         nj = 12
-#        model = {
-#            'modelname': 'fatalities003_baseline_ons',
-#            'algorithm': FixedFirstSplitRegression(ones_name='LGBMClassifier', zeros_name='LGBMRegressor',onset_indicator = ''),
-#            'depvar': 'ln_ged_sb_dep',
-#            'data_train': 'baseline002',
-#            'queryset': 'fatalities002_baseline',
-#            'preprocessing': 'float_it',
-#            'level': 'cm',
-#            'description': 'Baseline model with a few conflict history features as well as log population, random forests regression model.',
-#            'long_description': 'A very simple model with only five data columns (each column representing one feature): The number of fatalities in the same country at $t-1$, three decay functions of time since there was at least five fatalities in a single month, for each of the UCDP conflict types -- state-based, one-sided, or non-state conflict -- and log population size (Hegre2020RP,Pettersson2021JPR).The features in the baseline are included in all the models described below. This ensures that all models in the ensemble provides at least moderately good predictions, while guaranteeing diversity in feature sets and modelling approaches.'
-#        }
-#        ModelList.append(model)
+        #model = {
+        #    'modelname': 'fatalities003_baseline_ons',
+        #    'algorithm': FixedFirstSplitRegression(ones_model_name='LGBMClassifier', zeros_model_name='LGBMClassifier', split_by='split_by'),
+        #    'depvar': 'ln_ged_sb_dep',
+        #    'data_train': 'baseline002',
+        #    'queryset': 'fatalities002_baseline',
+        #    'preprocessing': 'float_it',
+        #    'level': 'cm',
+        #    'description': 'Baseline model with a few conflict history features as well as log population, random forests regression model.',
+        #    'long_description': 'A very simple model with only five data columns (each column representing one feature): the number of fatalities in the same country at $t-1$, three decay functions of time since there were at least five fatalities in a single month, for each of the UCDP conflict types -- state-based, one-sided, or non-state conflict -- and log population size (Hegre2020RP, Pettersson2021JPR). The features in the baseline are included in all the models described below. This ensures that all models in the ensemble provide at least moderately good predictions, while guaranteeing diversity in feature sets and modelling approaches.'
+        #    #}
+        #    #ModelList.append(model)
 
         model = {
             'modelname': 'fatalities003_nl_baseline_rf',
diff --git a/SystemUpdates/settings.py b/SystemUpdates/settings.py
new file mode 100644
index 0000000..f946a68
--- /dev/null
+++ b/SystemUpdates/settings.py
@@ -0,0 +1,7 @@
+import sys
+import os
+
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+sys.path.append(parent_dir)
+sys.path.append(os.path.join(parent_dir, 'Tools'))
+sys.path.append(os.path.join(parent_dir, 'Intermediates'))
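The new settings module works entirely by side effect: importing it appends the repository root, Tools/ and Intermediates/ to sys.path, which is what lets scripts in SystemUpdates/ import Tools.models as a package. A minimal sketch of the intended pattern, assuming the interpreter is started inside SystemUpdates/:

    import settings  # noqa: F401 -- extends sys.path as a side effect

    from Tools.models.hurdle_regression_model import HurdleRegression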
diff --git a/SystemUpdates/test_hurdle_regression.ipynb b/SystemUpdates/test_hurdle_regression.ipynb
new file mode 100644
index 0000000..24295e3
--- /dev/null
+++ b/SystemUpdates/test_hurdle_regression.ipynb
@@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8855fab3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ef27dd3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import settings  # to append paths\n",
+    "\n",
+    "from Tools.models.model_tests import test_hurdle_regression, test_fixed_first_split_regression\n",
+    "from Tools.models.hurdle_regression_model import HurdleRegression"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Test Hurdle Regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_pairs = (\n",
+    "    ['logistic', 'linear'],\n",
+    "    ['LGBMClassifier', 'LGBMRegressor'],\n",
+    "    ['RFClassifier', 'RFRegressor'],\n",
+    "    ['XGBRFClassifier', 'XGBRFRegressor'],\n",
+    "    ['GBMClassifier', 'GBMRegressor'],\n",
+    "    ['XGBClassifier', 'XGBRegressor'],\n",
+    "    ['HGBClassifier', 'HGBRegressor'],\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Mean Squared Error for combined predictions with positive class probability from the classifier: 6338\n",
+      "Mean Squared Error for combined predictions with binary outcome from the classifier: 5458\n"
+     ]
+    }
+   ],
+   "source": [
+    "clf_name = 'RFClassifier'\n",
+    "reg_name = 'RFRegressor'\n",
+    "\n",
+    "test_hurdle_regression(clf_name=clf_name, reg_name=reg_name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Tools/ViewsEstimators.py b/Tools/ViewsEstimators.py
deleted file mode 100644
index 6cfc66d..0000000
--- a/Tools/ViewsEstimators.py
+++ /dev/null
@@ -1,123 +0,0 @@
-from typing import Optional, Union
-import numpy as np
-import pandas as pd
-
-from sklearn.linear_model import LinearRegression, LogisticRegression
-from sklearn.base import BaseEstimator
-from sklearn.utils.estimator_checks import check_estimator
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
-from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble import HistGradientBoostingRegressor
-from sklearn.ensemble import HistGradientBoostingClassifier
-from xgboost import XGBRegressor
-from xgboost import XGBClassifier
-from xgboost import XGBRFRegressor, XGBRFClassifier
-from lightgbm import LGBMClassifier, LGBMRegressor
-
-#from lightgbm import LGBMClassifier, LGBMRegressor
-
-
-class HurdleRegression(BaseEstimator):
-    """ Regression model which handles excessive zeros by fitting a two-part model and combining predictions:
-        1) binary classifier
-        2) continuous regression
-    Implementeted as a valid sklearn estimator, so it can be used in pipelines and GridSearch objects.
-    Args:
-        clf_name: currently supports either 'logistic' or 'LGBMClassifier'
-        reg_name: currently supports either 'linear' or 'LGBMRegressor'
-        clf_params: dict of parameters to pass to classifier sub-model when initialized
-        reg_params: dict of parameters to pass to regression sub-model when initialized
-    """
-
-    def __init__(self,
-                 clf_name: str = 'logistic',
-                 reg_name: str = 'linear',
-                 clf_params: Optional[dict] = None,
-                 reg_params: Optional[dict] = None):
-
-        self.clf_name = clf_name
-        self.reg_name = reg_name
-        self.clf_params = clf_params
-        self.reg_params = reg_params
-        self.clf_fi = []
-        self.reg_fi = []
-
-    @staticmethod
-    def _resolve_estimator(func_name: str):
-        """ Lookup table for supported estimators.
-        This is necessary because sklearn estimator default arguments
-        must pass equality test, and instantiated sub-estimators are not equal. """
-
-        funcs = {'linear': LinearRegression(),
-                 'logistic': LogisticRegression(solver='liblinear'),
-                 'LGBMRegressor': LGBMRegressor(n_estimators=250),
-                 'LGBMClassifier': LGBMClassifier(n_estimators=250),
-                 'RFRegressor': XGBRFRegressor(n_estimators=250,n_jobs=-2),
-                 'RFClassifier': XGBRFClassifier(n_estimators=250,n_jobs=-2),
-                 'GBMRegressor': GradientBoostingRegressor(n_estimators=200),
-                 'GBMClassifier': GradientBoostingClassifier(n_estimators=200),
-                 'XGBRegressor': XGBRegressor(n_estimators=100,learning_rate=0.05,n_jobs=-2),
-                 'XGBClassifier': XGBClassifier(n_estimators=100,learning_rate=0.05,n_jobs=-2),
-                 'HGBRegressor': HistGradientBoostingRegressor(max_iter=200),
-                 'HGBClassifier': HistGradientBoostingClassifier(max_iter=200),
-                 }
-
-        return funcs[func_name]
-
-    def fit(self,
-            X: Union[np.ndarray, pd.DataFrame],
-            y: Union[np.ndarray, pd.Series]):
-        X, y = check_X_y(X, y, dtype=None,
-                         accept_sparse=False,
-                         accept_large_sparse=False,
-                         force_all_finite='allow-nan')
-
-        if X.shape[1] < 2:
-            raise ValueError('Cannot fit model when n_features = 1')
-
-        self.clf_ = self._resolve_estimator(self.clf_name)
-        if self.clf_params:
-            self.clf_.set_params(**self.clf_params)
-        self.clf_.fit(X, y > 0)
-        self.clf_fi = self.clf_.feature_importances_
-
-        self.reg_ = self._resolve_estimator(self.reg_name)
-        if self.reg_params:
-            self.reg_.set_params(**self.reg_params)
-        self.reg_.fit(X[y > 0], y[y > 0])
-        self.reg_fi = self.reg_.feature_importances_
-
-        self.is_fitted_ = True
-        return self
-
-
-#    def predict(self, X: Union[np.ndarray, pd.DataFrame]):
-    def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]):
-        """ Predict combined response using binary classification outcome """
-        X = check_array(X, accept_sparse=False, accept_large_sparse=False)
-        check_is_fitted(self, 'is_fitted_')
-        return self.clf_.predict(X) * self.reg_.predict(X)
-
-    def predict(self, X: Union[np.ndarray, pd.DataFrame]):
-#    def predict_expected_value(self, X: Union[np.ndarray, pd.DataFrame]):
-        """ Predict combined response using probabilistic classification outcome """
-        X = check_array(X, accept_sparse=False, accept_large_sparse=False)
-        check_is_fitted(self, 'is_fitted_')
-        return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)
-
-def manual_test():
-    """ Validate estimator using sklearn's provided utility and ensure it can fit and predict on fake dataset. """
-    check_estimator(HurdleRegression)
-    from sklearn.datasets import make_regression
-    X, y = make_regression()
-    reg = HurdleRegression()
-    reg.fit(X, y)
-    reg.predict(X)
-
-
-
-
-#if __name__ == '__main__':
-#    manual_test()
\ No newline at end of file
""" + if isinstance(X, np.ndarray): + X = pd.DataFrame(X) + if self.split_by not in X.columns: + raise ValueError(f"split_by column '{self.split_by}' not found in X") + + split_indicator = X[self.split_by].astype(int) + X = X.drop(columns=[self.split_by]) + X, y = check_X_y(X, y, accept_sparse=False, force_all_finite='allow-nan', dtype=np.float64) + + self.ones_ = self._resolve_estimator(self.ones_model_name) + if self.ones_model_params: + self.ones_.set_params(**self.ones_model_params) + self.ones_.fit(X[split_indicator == 1], y[split_indicator == 1]) + + self.zeros_ = self._resolve_estimator(self.zeros_model_name) + if self.zeros_model_params: + self.zeros_.set_params(**self.zeros_model_params) + self.zeros_.fit(X[split_indicator == 0], y[split_indicator == 0]) + + self.is_fitted_ = True + return self + + def predict(self, X: Union[np.ndarray, pd.DataFrame]): + """ Predict the response based on the split_by indicator values. """ + if isinstance(X, np.ndarray): + X = pd.DataFrame(X) + + check_is_fitted(self, 'is_fitted_') + + if self.split_by not in X: + raise ValueError(f"split_by column '{self.split_by}' not found in X") + + split_indicator = X[self.split_by].astype(int) + X = X.drop(columns=[self.split_by]) + + pred_ones = self.ones_.predict(X[split_indicator == 1]) + pred_zeros = self.zeros_.predict(X[split_indicator == 0]) + + pred = np.zeros(X.shape[0]) + pred[split_indicator == 1] = pred_ones + pred[split_indicator == 0] = pred_zeros + + # Create a DataFrame with predictions and split_by values + result = pd.DataFrame({ + 'prediction': pred, + 'split_by': split_indicator + }) + + return result + + diff --git a/Tools/models/hurdle_regression_model.py b/Tools/models/hurdle_regression_model.py new file mode 100644 index 0000000..9e09809 --- /dev/null +++ b/Tools/models/hurdle_regression_model.py @@ -0,0 +1,170 @@ +from typing import Optional, Union +import numpy as np +import pandas as pd + +from sklearn.linear_model import LinearRegression, LogisticRegression +from sklearn.base import BaseEstimator +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, HistGradientBoostingRegressor, HistGradientBoostingClassifier + +from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier +from lightgbm import LGBMClassifier, LGBMRegressor + + +class HurdleRegression(BaseEstimator): + """ Regression model which handles excessive zeros by fitting a two-part model and combining predictions: + 1) binary classifier + 2) continuous regression + Implementeted as a valid sklearn estimator, so it can be used in pipelines and GridSearch objects. + Args: + clf_name: name of a classifier sub-model to use + reg_name: name of a regression sub-model to use + clf_params: dict of parameters to pass to classifier sub-model when initialized + reg_params: dict of parameters to pass to regression sub-model when initialized + """ + # Define the constructor method for the class + def __init__(self, + clf_name: str = 'logistic', + reg_name: str = 'linear', + clf_params: Optional[dict] = None, + reg_params: Optional[dict] = None): + + self.clf_name = clf_name + self.reg_name = reg_name + self.clf_params = clf_params + self.reg_params = reg_params + + @staticmethod + def _resolve_estimator(estimator_name: str, random_state=None): + """ Lookup table for supported estimators. 
diff --git a/Tools/models/hurdle_regression_model.py b/Tools/models/hurdle_regression_model.py
new file mode 100644
index 0000000..9e09809
--- /dev/null
+++ b/Tools/models/hurdle_regression_model.py
@@ -0,0 +1,170 @@
+from typing import Optional, Union
+import numpy as np
+import pandas as pd
+
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.base import BaseEstimator
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, HistGradientBoostingRegressor, HistGradientBoostingClassifier
+
+from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier
+from lightgbm import LGBMClassifier, LGBMRegressor
+
+
+class HurdleRegression(BaseEstimator):
+    """ Regression model which handles excessive zeros by fitting a two-part model and combining predictions:
+        1) binary classifier
+        2) continuous regression
+    Implemented as a valid sklearn estimator, so it can be used in pipelines and GridSearch objects.
+    Args:
+        clf_name: name of a classifier sub-model to use
+        reg_name: name of a regression sub-model to use
+        clf_params: dict of parameters to pass to classifier sub-model when initialized
+        reg_params: dict of parameters to pass to regression sub-model when initialized
+    """
+    # Define the constructor method for the class
+    def __init__(self,
+                 clf_name: str = 'logistic',
+                 reg_name: str = 'linear',
+                 clf_params: Optional[dict] = None,
+                 reg_params: Optional[dict] = None):
+
+        self.clf_name = clf_name
+        self.reg_name = reg_name
+        self.clf_params = clf_params
+        self.reg_params = reg_params
+
+    @staticmethod
+    def _resolve_estimator(estimator_name: str, random_state=None):
+        """ Lookup table for supported estimators.
+        This is necessary because sklearn estimator default arguments
+        must pass equality test, and instantiated sub-estimators are not equal. """
+
+        estimators = {
+            'linear': LinearRegression(),
+            'logistic': LogisticRegression(solver='liblinear', random_state=random_state),
+
+            'LGBMRegressor': LGBMRegressor(n_estimators=250, random_state=random_state),
+            'LGBMClassifier': LGBMClassifier(n_estimators=250, random_state=random_state),
+
+            'RFRegressor': RandomForestRegressor(n_estimators=250, n_jobs=-2, random_state=random_state),
+            'RFClassifier': RandomForestClassifier(n_estimators=250, n_jobs=-2, random_state=random_state),
+
+            'XGBRFRegressor': XGBRFRegressor(n_estimators=100, learning_rate=0.05, random_state=random_state),
+            'XGBRFClassifier': XGBRFClassifier(n_estimators=100, learning_rate=0.05, random_state=random_state),
+
+            'GBMRegressor': GradientBoostingRegressor(n_estimators=200, random_state=random_state),
+            'GBMClassifier': GradientBoostingClassifier(n_estimators=200, random_state=random_state),
+
+            'XGBRegressor': XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-2, random_state=random_state),
+            'XGBClassifier': XGBClassifier(n_estimators=100, learning_rate=0.05, n_jobs=-2, random_state=random_state),
+
+            'HGBRegressor': HistGradientBoostingRegressor(max_iter=200, random_state=random_state),
+            'HGBClassifier': HistGradientBoostingClassifier(max_iter=200, random_state=random_state),
+        }
+        estimator = estimators.get(estimator_name)
+
+        if estimator is None:
+            raise ValueError(f"Unknown estimator: {estimator_name}")
+
+        return estimator
+
+    def fit(self,
+            X: Union[np.ndarray, pd.DataFrame],
+            y: Union[np.ndarray, pd.Series]):
+        """ Fit the model. """
+        # Run several sanity checks on the input
+        X, y = check_X_y(X,
+                         y,
+                         dtype=None,
+                         accept_sparse=False,
+                         accept_large_sparse=False,
+                         force_all_finite='allow-nan')  # allow X and y to contain NaNs but not infinities
+
+        if len(np.unique(y)) < 2:
+            raise ValueError("Input y for classifier must have more than one unique value.")
+
+        # Save n of features in X to make checks later
+        self.n_features_in_ = X.shape[1]
+
+        # Instantiate the classifier
+        self.clf_ = self._resolve_estimator(self.clf_name, random_state=42)
+
+        # If defined, add parameters to the classifier
+        if self.clf_params:
+            self.clf_.set_params(**self.clf_params)
+
+        # Fit the classifier
+        self.clf_.fit(X, y > 0)
+
+        # Instantiate the regressor
+        self.reg_ = self._resolve_estimator(self.reg_name, random_state=42)
+
+        # If defined, add parameters to the regressor
+        if self.reg_params:
+            self.reg_.set_params(**self.reg_params)
+
+        # Fit the regressor for data where y > 0 only
+        self.reg_.fit(X[y > 0], y[y > 0])
+
+        # Set the is_fitted_ flag to True for further sanity checks
+        self.is_fitted_ = True
+
+        return self
+
+    def predict(self,
+                X: Union[np.ndarray, pd.DataFrame]):
+        """ Predict combined response using probabilistic classification outcome """
+        # Run several sanity checks on the input
+        X = check_array(X,
+                        dtype=None,
+                        accept_sparse=False,
+                        accept_large_sparse=False,
+                        force_all_finite='allow-nan')
+
+        check_is_fitted(self, 'is_fitted_')
+
+        if X.shape[1] != self.n_features_in_:
+            raise ValueError("Number of features of the model must match the input. "
+                             f"Model n_features_in_ is {self.n_features_in_} "
+                             f"and input n_features is {X.shape[1]}")
+
+        # Predict with the classifier - take probability to be in class 1
+        clf_predictions_proba = self.clf_.predict_proba(X)[:, 1]
+
+        # Make predictions using the regressor
+        reg_predictions = self.reg_.predict(X)
+
+        # The final prediction is the product of the classifier and regressor predictions
+        combined_predictions = clf_predictions_proba * reg_predictions
+
+        return combined_predictions
+
+    def predict_bin(self,
+                    X: Union[np.ndarray, pd.DataFrame]):
+        """ Predict combined response using binary classification outcome. """
+        # Run several sanity checks on the input
+        X = check_array(X,
+                        dtype=None,
+                        accept_sparse=False,
+                        accept_large_sparse=False,
+                        force_all_finite='allow-nan')
+
+        check_is_fitted(self, 'is_fitted_')
+
+        if X.shape[1] != self.n_features_in_:
+            raise ValueError("Number of features of the model must match the input. "
+                             f"Model n_features_in_ is {self.n_features_in_} "
+                             f"and input n_features is {X.shape[1]}")
+
+        # Predict with the classifier - take classes, 0 or 1
+        clf_predictions_bin = self.clf_.predict(X)
+
+        # Predict with the regressor
+        reg_predictions = self.reg_.predict(X)
+
+        # Make the final prediction
+        combined_predictions = clf_predictions_bin * reg_predictions
+
+        return combined_predictions
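HurdleRegression.predict composes the two stages as an expected value: for a zero-inflated target it returns P(y > 0) * E[y | y > 0], taking the first factor from the classifier's predict_proba and the second from the regressor fitted on the positive rows only, while predict_bin substitutes the hard 0/1 class for the probability. A minimal usage sketch on synthetic zero-inflated data (sub-model choices are illustrative):

    import numpy as np
    from sklearn.datasets import make_regression
    from Tools.models.hurdle_regression_model import HurdleRegression

    X, y = make_regression(n_samples=500, n_features=10, random_state=0)
    y = np.where(y > 0, y, 0.0)  # censor the negative half to create excess zeros

    hr = HurdleRegression(clf_name='logistic', reg_name='linear')
    hr.fit(X, y)              # classifier fit on y > 0, regressor on y[y > 0]
    expected = hr.predict(X)  # P(y > 0) * E[y | y > 0]
    hard = hr.predict_bin(X)  # 0/1 class * E[y | y > 0]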
+ f"Model n_features_in_ is {self.n_features_in_}" + f"and input n_features is {X.shape[1]}") + + # Predict with the classifier - take probability to be in class 1 + clf_predictions_proba = self.clf_.predict_proba(X)[:, 1] + + # Make predictions using the regressor + reg_predictions = self.reg_.predict(X) + + # The final prediction is the product of the classifier and regressor predictions + combined_predictions = clf_predictions_proba * reg_predictions + + return combined_predictions + + def predict_bin(self, + X: Union[np.ndarray, pd.DataFrame]): + """ Predict combined response using binary classification outcome. """ + # Run several sanity checks on the input + X = check_array(X, + dtype=None, + accept_sparse=False, + accept_large_sparse=False, + force_all_finite='allow-nan') + + check_is_fitted(self, 'is_fitted_') + + if X.shape[1] != self.n_features_in_: + raise ValueError("Number of features of the model must match the input." + f"Model n_features_in_ is {self.n_features_in_}" + f"and input n_features is {X.shape[1]}") + + # Predict with the classifier - take classes, 0 or 1 + clf_predictions_bin = self.clf_.predict(X) + + # Predict with the regressor + reg_predictions = self.reg_.predict(X) + + # Make the final prediction + combined_predictions = clf_predictions_bin * reg_predictions + + return combined_predictions diff --git a/Tools/models/model_tests.py b/Tools/models/model_tests.py new file mode 100644 index 0000000..6fa0f7b --- /dev/null +++ b/Tools/models/model_tests.py @@ -0,0 +1,61 @@ +import pandas as pd +import numpy as np + +from sklearn.datasets import make_regression, make_classification +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error + +from Tools.models.hurdle_regression_model import HurdleRegression +from Tools.models.fixed_first_split_regression_model import FixedFirstSplitRegression + + +def test_hurdle_regression(clf_name:str='logistic', + reg_name:str='linear'): + """ Validate estimator using sklearn's provided utility and ensure it can fit and predict on fake dataset resembling distribution for hurdle regression.""" + # Create a synthetic dataset that simulates dataset with many zeroes + X_reg, y_reg = make_regression(n_samples=1000, n_features=20) + X_clf, y_clf = make_classification(n_samples=1000, n_features=20) + X = np.hstack([X_clf, X_reg]) + y = y_clf * y_reg + + # Split the data into training and testing sets + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + + # Instantiate a HurdleRegression object + hr = HurdleRegression(clf_name=clf_name, reg_name=reg_name) + + # Fit the model to the data + hr.fit(X_train, y_train) + + # Make predictions with probability of positive class from the classifier + y_pred = hr.predict(X_test) + print('Mean Squared Error for combined predictions with positive class probability from the classifier:', int(mean_squared_error(y_test, y_pred))) + assert y_pred.shape == y_test.shape, "Predictions and y do not have the same shape" + + # Make predictions with binary outcome from the classifier + y_pred_bin = hr.predict_bin(X_test) + print('Mean Squared Error or combined predictions with binary outcome from the classifier:', int(mean_squared_error(y_test, y_pred_bin))) + assert y_pred_bin.shape == y_test.shape, "Probability predictions and y do not have the same shape" + + +def test_fixed_first_split_regression(zeros_model_name:str='linear', + ones_model_name:str='linear', + split_by = "split_by"): + """ Validate estimator using sklearn's provided 
utility and ensure it can fit and predict on fake dataset. """ + # Create a dataset + X, y = make_classification(n_samples=100, n_features=20) + + # Convert to DataFrame and add a 'split_by' column + X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])]) + X['split_by'] = np.random.randint(0, 2, len(y)) # Random 0s and 1s for the indicator + + # Initialize and fit the FixedFirstSplitRegression model + reg = FixedFirstSplitRegression(zeros_model_name=zeros_model_name, + ones_model_name=ones_model_name, + split_by=split_by) + + reg.fit(X, y) + + # Make predictions + result = reg.predict(X) + print("Predictions:", result)
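Both smoke tests can also be run outside the notebook; a minimal sketch, assuming the interpreter is started from SystemUpdates/ so the settings import can extend sys.path first:

    import settings  # noqa: F401 -- path setup side effect

    from Tools.models.model_tests import (test_hurdle_regression,
                                          test_fixed_first_split_regression)

    test_hurdle_regression(clf_name='LGBMClassifier', reg_name='LGBMRegressor')
    test_fixed_first_split_regression(zeros_model_name='linear', ones_model_name='linear')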