Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make hurdle regression work #51

Open
wants to merge 20 commits into
base: fatalities003
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 18 additions & 132 deletions SystemUpdates/ModelDefinitions.py
Original file line number Diff line number Diff line change
@@ -1,128 +1,14 @@
# The ModelList is a list of dictionaries that define a range of models for the project

import sys
# sys.path.append('../')
sys.path.append('../Tools')
#sys.path.append('../Intermediates')
# sklearn
import settings

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

from ViewsEstimators import *

class FixedFirstSplitRegression(BaseEstimator):
    """ Regression model whose first split is fixed to a binary indicator feature.

    The training rows are partitioned by an indicator ``z``: rows with ``z == 1`` are used to fit
    the "ones" sub-model and rows with ``z == 0`` are used to fit the "zeros" sub-model. When
    predicting, every row is routed to the sub-model that matches its indicator value.

    Args:
        ones_name: lookup name of the model fitted where the z variable is one (e.g. "onset")
        zeros_name: lookup name of the model fitted where the z variable is zero (e.g. "continuation")
        ones_indicator: name of the column of X that holds z. When set, X must be a pandas
            DataFrame containing that column at fit time. When empty, z has to be passed
            explicitly to ``fit`` and ``predict``.
        ones_params: dict of parameters to pass to "ones" sub-model when initialized
        zeros_params: dict of parameters to pass to "zeros" sub-model when initialized

    Attributes:
        ones_fi: feature importances of the fitted "ones" sub-model ([] if it exposes none)
        zeros_fi: feature importances of the fitted "zeros" sub-model ([] if it exposes none)
    """

    def __init__(self,
                 ones_name: str = 'RFRegressor',
                 zeros_name: str = 'RFRegressor',
                 ones_indicator: str = '',
                 ones_params: Optional[dict] = None,
                 zeros_params: Optional[dict] = None):

        self.ones_name = ones_name
        self.zeros_name = zeros_name
        self.ones_indicator = ones_indicator
        self.ones_params = ones_params
        self.zeros_params = zeros_params
        self.ones_fi = []
        self.zeros_fi = []

    @staticmethod
    def _resolve_estimator(func_name: str):
        """ Lookup table for supported estimators.

        This is necessary because sklearn estimator default arguments
        must pass equality test, and instantiated sub-estimators are not equal.

        Args:
            func_name: one of the keys of the table below.

        Returns:
            A freshly constructed, unfitted estimator.

        Raises:
            KeyError: if ``func_name`` is not a supported estimator name.
        """
        # Factories instead of instances: only the requested estimator is constructed.
        factories = {
            'linear': lambda: LinearRegression(),
            'logistic': lambda: LogisticRegression(solver='liblinear'),
            'LGBMRegressor': lambda: LGBMRegressor(n_estimators=250),
            'LGBMClassifier': lambda: LGBMClassifier(n_estimators=250),
            'RFRegressor': lambda: XGBRFRegressor(n_estimators=250, n_jobs=-2),
            'RFClassifier': lambda: XGBRFClassifier(n_estimators=250, n_jobs=-2),
            'GBMRegressor': lambda: GradientBoostingRegressor(n_estimators=200),
            'GBMClassifier': lambda: GradientBoostingClassifier(n_estimators=200),
            'XGBRegressor': lambda: XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=-2),
            'XGBClassifier': lambda: XGBClassifier(n_estimators=100, learning_rate=0.05, n_jobs=-2),
            'HGBRegressor': lambda: HistGradientBoostingRegressor(max_iter=200),
            'HGBClassifier': lambda: HistGradientBoostingClassifier(max_iter=200),
        }

        try:
            factory = factories[func_name]
        except KeyError:
            raise KeyError(
                f'Unknown estimator name {func_name!r}; supported names: {sorted(factories)}'
            ) from None
        return factory()

    @staticmethod
    def _indicator_masks(X, z, ones_index):
        """ Return the boolean row masks ``(z == 1, z == 0)`` for the validated array X.

        The indicator is taken from column ``ones_index`` of X when that index is known,
        otherwise from the explicitly supplied ``z``.
        """
        if ones_index is not None:
            z = X[:, ones_index]
        elif z is None:
            raise ValueError('No split indicator available: set ones_indicator or pass z')
        else:
            z = np.asarray(z).ravel()
            if z.shape[0] != X.shape[0]:
                raise ValueError('z must contain exactly one value per row of X')
        return np.asarray(z == 1, dtype=bool), np.asarray(z == 0, dtype=bool)

    def fit(self,
            X: Union[np.ndarray, pd.DataFrame],
            y: Union[np.ndarray, pd.Series],
            z: Optional[Union[np.ndarray, pd.Series]] = None):
        """ Fit the "ones" sub-model on rows with z == 1 and the "zeros" sub-model on rows with z == 0.

        Args:
            X: feature matrix; must be a DataFrame containing ``ones_indicator`` when that is set.
            y: target values.
            z: split indicator, only used when ``ones_indicator`` is empty.

        Returns:
            self

        Raises:
            ValueError: if no indicator can be determined, if X has fewer than two features,
                or if one of the two groups has no training rows.
        """
        # The column position has to be looked up before validation: check_X_y returns a
        # plain array, so the column labels are no longer available afterwards.
        if self.ones_indicator:
            if not isinstance(X, pd.DataFrame) or self.ones_indicator not in X.columns:
                raise ValueError(
                    f'ones_indicator={self.ones_indicator!r} requires X to be a DataFrame '
                    'containing that column'
                )
            ones_index = list(X.columns).index(self.ones_indicator)
        else:
            ones_index = None

        X, y = check_X_y(X, y, dtype=None,
                         accept_sparse=False,
                         accept_large_sparse=False,
                         force_all_finite='allow-nan')

        if X.shape[1] < 2:
            raise ValueError('Cannot fit model when n_features = 1')

        is_one, is_zero = self._indicator_masks(X, z, ones_index)
        if not is_one.any() or not is_zero.any():
            raise ValueError('Both groups (z == 1 and z == 0) need at least one training row')

        # Remembered so that predict can route rows without access to column labels.
        self.ones_index_ = ones_index

        self.ones_ = self._resolve_estimator(self.ones_name)
        if self.ones_params:
            self.ones_.set_params(**self.ones_params)
        self.ones_.fit(X[is_one], y[is_one])
        # Not every supported estimator exposes feature importances (e.g. the linear ones).
        self.ones_fi = getattr(self.ones_, 'feature_importances_', [])

        self.zeros_ = self._resolve_estimator(self.zeros_name)
        if self.zeros_params:
            self.zeros_.set_params(**self.zeros_params)
        self.zeros_.fit(X[is_zero], y[is_zero])
        self.zeros_fi = getattr(self.zeros_, 'feature_importances_', [])

        self.is_fitted_ = True
        return self

    def predict(self,
                X: Union[np.ndarray, pd.DataFrame],
                z: Optional[Union[np.ndarray, pd.Series]] = None):
        """ Predict each row with the sub-model selected by its indicator value.

        Args:
            X: feature matrix with the same column layout as used in ``fit``.
            z: split indicator, only used when the model was fitted without ``ones_indicator``.

        Returns:
            1-d float array with one prediction per row. Rows whose indicator is neither
            0 nor 1 are left as NaN because no sub-model was trained for them.
        """
        check_is_fitted(self, 'is_fitted_')
        # NaNs are allowed here as well, mirroring the validation done in fit.
        X = check_array(X, accept_sparse=False, accept_large_sparse=False,
                        force_all_finite='allow-nan')

        is_one, is_zero = self._indicator_masks(X, z, getattr(self, 'ones_index_', None))

        predictions = np.full(X.shape[0], np.nan)
        if is_one.any():
            predictions[is_one] = self.ones_.predict(X[is_one])
        if is_zero.any():
            predictions[is_zero] = self.zeros_.predict(X[is_zero])
        return predictions

def manual_test():
    """ Ensure FixedFirstSplitRegression can fit and predict on a fake dataset.

    sklearn's ``check_estimator`` is not run here: it calls ``fit(X, y)`` on plain arrays
    without any split indicator, which this estimator cannot work with.
    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_regression

    X, y = make_regression()
    frame = pd.DataFrame(X, columns=[f'x{i}' for i in range(X.shape[1])])
    # Alternating 0/1 indicator so that both sub-models receive training rows.
    frame['split_by'] = np.arange(len(frame)) % 2

    reg = FixedFirstSplitRegression(ones_indicator='split_by')
    # The indicator is handed over as z as well, matching the (X, y, z) fit signature.
    reg.fit(frame, y, frame['split_by'])
    reg.predict(frame)


from xgboost import XGBRegressor, XGBRFRegressor
from lightgbm import LGBMRegressor

from Tools.models.hurdle_regression_model import HurdleRegression
from Tools.models.fixed_first_split_regression_model import FixedFirstSplitRegression


def DefineEnsembleModels(level):
Expand All @@ -131,18 +17,18 @@ def DefineEnsembleModels(level):
if level == 'cm':
nj = 12

# model = {
# 'modelname': 'fatalities003_baseline_ons',
# 'algorithm': FixedFirstSplitRegression(ones_name='LGBMClassifier', zeros_name='LGBMRegressor',onset_indicator = ''),
# 'depvar': 'ln_ged_sb_dep',
# 'data_train': 'baseline002',
# 'queryset': 'fatalities002_baseline',
# 'preprocessing': 'float_it',
# 'level': 'cm',
# 'description': 'Baseline model with a few conflict history features as well as log population, random forests regression model.',
# 'long_description': 'A very simple model with only five data columns (each column representing one feature): The number of fatalities in the same country at $t-1$, three decay functions of time since there was at least five fatalities in a single month, for each of the UCDP conflict types -- state-based, one-sided, or non-state conflict -- and log population size (Hegre2020RP,Pettersson2021JPR).The features in the baseline are included in all the models described below. This ensures that all models in the ensemble provides at least moderately good predictions, while guaranteeing diversity in feature sets and modelling approaches.'
# }
# ModelList.append(model)
#model = {
# 'modelname': 'fatalities003_baseline_ons',
# 'algorithm': FixedFirstSplitRegression(ones_name='LGBMClassifier', zeros_name='LGBMClassifier',onset_indicator = 'split_by'),
# 'depvar': 'ln_ged_sb_dep',
# 'data_train': 'baseline002',
# 'queryset': 'fatalities002_baseline',
# 'preprocessing': 'float_it',
# 'level': 'cm',
# 'description': 'Baseline model with a few conflict history features as well as log population, random forests regression model.',
# 'long_description': 'A very simple model with only five data columns (each column representing one feature): The number of fatalities in the same country at $t-1$, three decay functions of time since there was at least five fatalities in a single month, for each of the UCDP conflict types -- state-based, one-sided, or non-state conflict -- and log population size (Hegre2020RP,Pettersson2021JPR).The features in the baseline are included in all the models described below. This ensures that all models in the ensemble provides at least moderately good predictions, while guaranteeing diversity in feature sets and modelling approaches.'
#}
#ModelList.append(model)

model = {
'modelname': 'fatalities003_nl_baseline_rf',
Expand Down
7 changes: 7 additions & 0 deletions SystemUpdates/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import sys
import os

# Directory one level above the folder that contains this file.
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))

# Put that directory and its Tools / Intermediates sub-folders on the import path,
# in this order, behind whatever is already there.
sys.path.extend([
    parent_dir,
    os.path.join(parent_dir, 'Tools'),
    os.path.join(parent_dir, 'Intermediates'),
])
94 changes: 94 additions & 0 deletions SystemUpdates/test_hurdle_regression.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "8855fab3",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ef27dd3d",
"metadata": {},
"outputs": [],
"source": [
"import settings # to append paths\n",
"\n",
"from Tools.models.model_tests import test_hurdle_regression, test_fixed_first_split_regression\n",
"from Tools.models.hurdle_regression_model import HurdleRegression"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test Hurdle Regression"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"model_pairs = (\n",
" ['logistic', 'linear'], \n",
" ['LGBMClassifier', 'LGBMRegressor'],\n",
" ['RFClassifier', 'RFRegressor'],\n",
" ['XGBRFClassifier', 'XGBRFRegressor'],\n",
" ['GBMClassifier', 'GBMRegressor'],\n",
" ['XGBClassifier', 'XGBRegressor'],\n",
" ['HGBClassifier' , 'HGBRegressor'],\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Squared Error for combined predictions with positive class probability from the classifier: 6338\n",
"Mean Squared Error or combined predictions with binary outcome from the classifier: 5458\n"
]
}
],
"source": [
"clf_name = 'RFClassifier'\n",
"reg_name = 'RFRegressor'\n",
"\n",
"test_hurdle_regression(clf_name=clf_name, reg_name=reg_name)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
123 changes: 0 additions & 123 deletions Tools/ViewsEstimators.py

This file was deleted.

Loading