Commit: tests, docstrings, some renaming
fwick-panasonic committed Aug 14, 2023
1 parent 87fe88c commit 3c9b723
Showing 6 changed files with 159 additions and 28 deletions.
6 changes: 3 additions & 3 deletions cyclic_boosting/__init__.py
@@ -36,7 +36,7 @@


from cyclic_boosting.base import CyclicBoostingBase
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor, CBQuantileRegressor
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor, CBMultiplicativeQuantileRegressor
from cyclic_boosting.price import CBExponential
from cyclic_boosting.location import CBLocationRegressor, CBLocPoissonRegressor
from cyclic_boosting.nbinom import CBNBinomC
@@ -51,7 +51,7 @@
pipeline_CBLocPoissonRegressor,
pipeline_CBNBinomC,
pipeline_CBGBSRegressor,
pipeline_CBQuantileRegressor,
pipeline_CBMultiplicativeQuantileRegressor,
)

__all__ = [
@@ -73,7 +73,7 @@
"pipeline_CBLocPoissonRegressor",
"pipeline_CBNBinomC",
"pipeline_CBGBSRegressor",
"pipeline_CBQuantileRegressor",
"pipeline_CBMultiplicativeQuantileRegressor",
]

__version__ = "1.0"
10 changes: 5 additions & 5 deletions cyclic_boosting/pipelines.py
@@ -7,7 +7,7 @@
CBNBinomC,
CBClassifier,
CBGBSRegressor,
CBQuantileRegressor,
CBMultiplicativeQuantileRegressor,
binning,
)

@@ -126,7 +126,7 @@ def pipeline_CB(
aggregate=aggregate,
regalpha=regalpha,
)
elif estimator == CBQuantileRegressor:
elif estimator == CBMultiplicativeQuantileRegressor:
estimatorCB = estimator(
feature_groups=feature_groups,
feature_properties=feature_properties,
@@ -205,8 +205,8 @@ def pipeline_CBGBSRegressor(**kwargs):
return pipeline_CB(CBGBSRegressor, **kwargs)


def pipeline_CBQuantileRegressor(**kwargs):
def pipeline_CBMultiplicativeQuantileRegressor(**kwargs):
"""
Convenience function containing CBQuantileRegressor (estimator) + binning.
Convenience function containing CBMultiplicativeQuantileRegressor (estimator) + binning.
"""
return pipeline_CB(CBQuantileRegressor, **kwargs)
return pipeline_CB(CBMultiplicativeQuantileRegressor, **kwargs)
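For downstream code, the rename is mechanical: only the function and class names change, not the call signature. Below is a minimal usage sketch of the renamed pipeline, with illustrative data and feature setup (the feature names and properties are assumptions for demonstration, not taken from the repository):

```python
import numpy as np
import pandas as pd
from cyclic_boosting import flags
from cyclic_boosting.pipelines import pipeline_CBMultiplicativeQuantileRegressor

# illustrative data: two categorical features and a non-negative target
X = pd.DataFrame({
    "store": np.random.randint(0, 5, 1000),
    "weekday": np.random.randint(0, 7, 1000),
})
y = np.random.gamma(2.0, 2.0, 1000)  # multiplicative mode requires y >= 0

fp = {"store": flags.IS_UNORDERED, "weekday": flags.IS_ORDERED}

# binning + estimator pipeline predicting the 90% quantile
CB_est = pipeline_CBMultiplicativeQuantileRegressor(
    quantile=0.9,
    feature_properties=fp,
    feature_groups=["store", "weekday"],
)
CB_est.fit(X.copy(), y)
yhat = CB_est.predict(X)  # in-sample 90% quantile predictions
```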
144 changes: 132 additions & 12 deletions cyclic_boosting/regression.py
@@ -183,7 +183,26 @@ def calc_parameters(self, feature, y, pred, prefit_data):
return _calc_factors_and_uncertainties(alpha=prefit_data, beta=prediction_sum_of_bins, link_func=self.link_func)


class CBQuantileRegressor(CBBaseRegressor):
class CBMultiplicativeQuantileRegressor(CBBaseRegressor):
"""
Cyclic Boosting multiplicative quantile-regression mode. While its
general structure allows arbitrary (including empirical) target ranges
and distributions, the multiplicative model of this mode requires
non-negative target values.

A quantile loss, according to the desired quantile to be predicted, is
minimized in each bin of each feature. While binning, feature cycles,
smoothing, and iterations work in the same way as usual in Cyclic
Boosting, the minimization itself is performed via
``scipy.optimize.minimize`` (instead of an analytical solution like,
e.g., in ``CBPoissonRegressor``, ``CBNBinomRegressor``, or
``CBLocationRegressor``).

Parameters
----------
quantile : float
quantile to be estimated

See :class:`cyclic_boosting.base` for all other parameters.
"""

def __init__(
self,
feature_groups=None,
Expand Down Expand Up @@ -222,6 +241,25 @@ def precalc_parameters(self, feature, y, pred):
pass

def loss(self, prediction, y, weights):
"""
Calculation of the in-sample quantile loss, or to be exact costs,
(potentially including sample weights) after full feature cycles, i.e.,
iterations, to be used as stopping criteria.
Parameters
----------
prediction : np.ndarray
(in-sample) predictions for desired quantile, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type
Returns
-------
float
calcualted quantile costs
"""
if not len(y) > 0:
raise ValueError("Loss cannot be computed on empty data")
else:
@@ -235,6 +273,17 @@ def loss(self, prediction, y, weights):
return sum_weighted_error / np.nansum(weights)

def _init_global_scale(self, X, y):
"""
Calculation of the global scale for quantile regression, corresponding
to the (continuous approximation of the) respective quantile of the
target values used in the training.
The exact value of the global scale is not critical for the model
accuracy (as the model has enough parameters to compensate), but a
value not representating a good overall average leads to factors with
averages unequal to 1 for each feature (making interpretation more
difficult).
"""
if self.weights is None:
raise RuntimeError("The weights have to be initialized.")

@@ -264,21 +313,92 @@ def _init_global_scale(self, X, y):
)
self.prior_pred_link_offset_ = float(self.global_scale_link_)

def quantile_loss(self, params, yhat_others, y, weights):
sum_weighted_error = np.nansum(
(
(y < (params[0] * yhat_others)) * (1 - self.quantile) * (params[0] * yhat_others - y)
+ (y >= (params[0] * yhat_others)) * self.quantile * (y - params[0] * yhat_others)
)
* weights
)
def quantile_costs(self, param, yhat_others, y, weights):
"""
Calculation of the in-sample quantile costs (potentially including
sample weights) for individual feature bins according to a quantile
loss function, to be minimized subsequently.
Parameters
----------
param : float
Factor to be estimated for the feature bin at hand.
yhat_others : np.ndarray
(in-sample) predictions of all other features (excluding the one at
hand) for the bin at hand, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type
Returns
-------
float
calcualted quantile costs
"""
quantile_loss = (y < (param * yhat_others)) * (1 - self.quantile) * (param * yhat_others - y) + (
y >= (param * yhat_others)
) * self.quantile * (y - param * yhat_others)
sum_weighted_error = np.nansum(quantile_loss * weights)
return sum_weighted_error / np.nansum(weights)

def optimization(self, y, yhat_others, weights):
res = minimize(self.quantile_loss, 1, args=(yhat_others, y, weights))
return res.x, np.sqrt(np.log(1 + 2 + np.sum(y)) - np.log(2 + np.sum(y)))
"""
Minimization of the quantile costs (potentially including sample
weights) for individual feature bins. The initial value for the factors
is set to 1 (neutral value for multiplicative model).
Parameters
----------
param : float
Factor to be estimated for the feature bin at hand.
yhat_others : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand) for the bin at hand, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete).
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type
Returns
-------
float, float
estimated parameter (factor) and its uncertainty
"""
res = minimize(self.quantile_costs, 1, args=(yhat_others, y, weights))
# use moment-matching of a Gamma posterior with a log-normal
# distribution as approximation
uncertainty = np.sqrt(np.log(1 + 2 + np.sum(y)) - np.log(2 + np.sum(y)))
return res.x, uncertainty

def calc_parameters(self, feature, y, pred, prefit_data):
"""
Calling of the optimization (quantile loss minimization) for the
different bins of the feature at hand. In contrast to the analytical
solution in most other Cyclic Boosting modes (e.g.,
``CBPoissonRegressor``), working simply via bin statistics
(`bincount`), the optimization here requires a dedicated loss funtion
to be called for each observation.
Parameters
----------
feature : :class:`Feature`
feature for which the parameters of each bin are estimated
y : np.ndarray
target variable, containing data with `float` type (potentially
discrete)
pred : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand), containing data with `float` type
prefit_data
data returned by :meth:`~.precalc_parameters` during fit, not used
here
Returns
-------
float, float
estimated parameter (factor) and its uncertainty
"""
sorting = feature.lex_binned_data.argsort()
sorted_bins = feature.lex_binned_data[sorting]
splits_indices = np.unique(sorted_bins, return_index=True)[1][1:]
Expand All @@ -305,4 +425,4 @@ def calc_parameters(self, feature, y, pred, prefit_data):
return np.log(factors), uncertainties


__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor", "CBQuantileRegressor"]
__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor", "CBMultiplicativeQuantileRegressor"]
2 changes: 1 addition & 1 deletion cyclic_boosting/utils.py
@@ -984,7 +984,7 @@ def continuous_quantile_from_discrete(y, quantile):
Parameters
----------
y : np.ndarray
variable with potentially discrete values
containing data with `float` type (potentially discrete)
quantile : float
desired quantile
14 changes: 7 additions & 7 deletions tests/test_integration.py
@@ -15,7 +15,7 @@
pipeline_CBNBinomRegressor,
pipeline_CBNBinomC,
pipeline_CBGBSRegressor,
pipeline_CBQuantileRegressor,
pipeline_CBMultiplicativeQuantileRegressor,
)


@@ -472,7 +472,7 @@ def evaluate_quantile(y, yhat):
return quantile_acc


def cb_quantile_regressor_model(quantile):
def cb_multiplicative_quantile_regressor_model(quantile):
features = get_features()

fp = feature_properties()
@@ -486,7 +486,7 @@ def cb_quantile_regressor_model(quantile):
observers.PlottingObserver(iteration=-1),
]

CB_pipeline = pipeline_CBQuantileRegressor(
CB_pipeline = pipeline_CBMultiplicativeQuantileRegressor(
quantile=quantile,
feature_properties=fp,
feature_groups=features,
@@ -500,15 +500,15 @@ def test_quantile_regression_median():
return CB_pipeline


def test_quantile_regression_median():
def test_multiplicative_quantile_regression_median():
np.random.seed(42)

df = pd.read_csv("./tests/integration_test_data.csv")

X, y = prepare_data(df)

quantile = 0.5
CB_est = cb_quantile_regressor_model(quantile)
CB_est = cb_multiplicative_quantile_regressor_model(quantile)
CB_est.fit(X.copy(), y)
# plot_CB('analysis_CB_iterfirst',
# [CB_est[-1].observers[0]], CB_est[-2])
@@ -524,15 +524,15 @@ def test_quantile_regression_90():
np.testing.assert_almost_equal(mad, 1.6559, 3)


def test_quantile_regression_90():
def test_multiplicative_quantile_regression_90():
np.random.seed(42)

df = pd.read_csv("./tests/integration_test_data.csv")

X, y = prepare_data(df)

quantile = 0.9
CB_est = cb_quantile_regressor_model(quantile)
CB_est = cb_multiplicative_quantile_regressor_model(quantile)
CB_est.fit(X.copy(), y)
# plot_CB('analysis_CB_iterfirst',
# [CB_est[-1].observers[0]], CB_est[-2])
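The body of evaluate_quantile is collapsed in this diff (only its `return quantile_acc` is visible above). A plausible sketch of such a coverage check, stated as an assumption rather than the repository's actual implementation:

```python
import numpy as np

def evaluate_quantile(y, yhat):
    # assumed behavior: empirical coverage of the predicted quantile,
    # i.e., the fraction of observations at or below the prediction;
    # for a well-calibrated q-quantile this should be close to q
    return np.mean(y <= yhat)
```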
11 changes: 11 additions & 0 deletions tests/test_utils.py
@@ -36,3 +36,14 @@ def test_get_feature_column_names():
np.testing.assert_equal(features, ["a", "b", "c"])
features = utils.get_feature_column_names(X, exclude_columns=["a"])
np.testing.assert_equal(features, ["b", "c"])


def test_continuous_quantile_from_discrete():
y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
quantile_value = utils.continuous_quantile_from_discrete(y, 0.8)
assert quantile_value == 8.0
quantile_value = utils.continuous_quantile_from_discrete(y, 0.35)
assert quantile_value == 3.0
y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2)
quantile_value = utils.continuous_quantile_from_discrete(y, 0.35)
assert quantile_value == 3.5
