Commit: tests, docstrings, some renaming
fwick-panasonic committed Aug 14, 2023
1 parent 87fe88c commit 3c9b723
Showing 6 changed files with 159 additions and 28 deletions.
6 changes: 3 additions & 3 deletions cyclic_boosting/__init__.py
@@ -36,7 +36,7 @@


from cyclic_boosting.base import CyclicBoostingBase
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor, CBQuantileRegressor
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor, CBMultiplicativeQuantileRegressor
from cyclic_boosting.price import CBExponential
from cyclic_boosting.location import CBLocationRegressor, CBLocPoissonRegressor
from cyclic_boosting.nbinom import CBNBinomC
@@ -51,7 +51,7 @@
pipeline_CBLocPoissonRegressor,
pipeline_CBNBinomC,
pipeline_CBGBSRegressor,
pipeline_CBQuantileRegressor,
pipeline_CBMultiplicativeQuantileRegressor,
)

__all__ = [
@@ -73,7 +73,7 @@
"pipeline_CBLocPoissonRegressor",
"pipeline_CBNBinomC",
"pipeline_CBGBSRegressor",
"pipeline_CBQuantileRegressor",
"pipeline_CBMultiplicativeQuantileRegressor",
]

__version__ = "1.0"
10 changes: 5 additions & 5 deletions cyclic_boosting/pipelines.py
@@ -7,7 +7,7 @@
CBNBinomC,
CBClassifier,
CBGBSRegressor,
CBQuantileRegressor,
CBMultiplicativeQuantileRegressor,
binning,
)

@@ -126,7 +126,7 @@ def pipeline_CB(
aggregate=aggregate,
regalpha=regalpha,
)
elif estimator == CBQuantileRegressor:
elif estimator == CBMultiplicativeQuantileRegressor:
estimatorCB = estimator(
feature_groups=feature_groups,
feature_properties=feature_properties,
@@ -205,8 +205,8 @@ def pipeline_CBGBSRegressor(**kwargs):
return pipeline_CB(CBGBSRegressor, **kwargs)


def pipeline_CBQuantileRegressor(**kwargs):
def pipeline_CBMultiplicativeQuantileRegressor(**kwargs):
"""
Convenience function containing CBQuantileRegressor (estimator) + binning.
Convenience function containing CBMultiplicativeQuantileRegressor (estimator) + binning.
"""
return pipeline_CB(CBQuantileRegressor, **kwargs)
return pipeline_CB(CBMultiplicativeQuantileRegressor, **kwargs)
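For downstream code, the rename is mechanical: only the function and class names change, not the call signature. Below is a minimal usage sketch of the renamed pipeline, with illustrative data and feature setup (the feature names and properties are assumptions for demonstration, not taken from the repository):

```python
import numpy as np
import pandas as pd
from cyclic_boosting import flags
from cyclic_boosting.pipelines import pipeline_CBMultiplicativeQuantileRegressor

# illustrative data: two categorical features and a non-negative target
X = pd.DataFrame({
    "store": np.random.randint(0, 5, 1000),
    "weekday": np.random.randint(0, 7, 1000),
})
y = np.random.gamma(2.0, 2.0, 1000)  # multiplicative mode requires y >= 0

fp = {"store": flags.IS_UNORDERED, "weekday": flags.IS_ORDERED}

# binning + estimator pipeline predicting the 90% quantile
CB_est = pipeline_CBMultiplicativeQuantileRegressor(
    quantile=0.9,
    feature_properties=fp,
    feature_groups=["store", "weekday"],
)
CB_est.fit(X.copy(), y)
yhat = CB_est.predict(X)  # in-sample 90% quantile predictions
```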
144 changes: 132 additions & 12 deletions cyclic_boosting/regression.py
@@ -183,7 +183,26 @@ def calc_parameters(self, feature, y, pred, prefit_data):
return _calc_factors_and_uncertainties(alpha=prefit_data, beta=prediction_sum_of_bins, link_func=self.link_func)


class CBQuantileRegressor(CBBaseRegressor):
class CBMultiplicativeQuantileRegressor(CBBaseRegressor):
"""
Cyclic Boosting multiplicative quantile-regression mode. While its
general structure allows arbitrary (including empirical) target ranges
and distributions, the multiplicative model of this mode requires
non-negative target values.

A quantile loss, according to the desired quantile to be predicted, is
minimized in each bin of each feature. While binning, feature cycles,
smoothing, and iterations work in the same way as usual in Cyclic
Boosting, the minimization itself is performed via
``scipy.optimize.minimize`` (instead of an analytical solution like,
e.g., in ``CBPoissonRegressor``, ``CBNBinomRegressor``, or
``CBLocationRegressor``).

Parameters
----------
quantile : float
quantile to be estimated

See :class:`cyclic_boosting.base` for all other parameters.
"""

def __init__(
self,
feature_groups=None,
Expand Down Expand Up @@ -222,6 +241,25 @@ def precalc_parameters(self, feature, y, pred):
pass

def loss(self, prediction, y, weights):
"""
Calculation of the in-sample quantile loss, or to be exact costs,
(potentially including sample weights) after full feature cycles, i.e.,
iterations, to be used as stopping criteria.
Parameters
----------
prediction : np.ndarray
(in-sample) predictions for desired quantile, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type
Returns
-------
float
calcualted quantile costs
"""
if not len(y) > 0:
raise ValueError("Loss cannot be computed on empty data")
else:
@@ -235,6 +273,17 @@ def loss(self, prediction, y, weights):
return sum_weighted_error / np.nansum(weights)

def _init_global_scale(self, X, y):
"""
Calculation of the global scale for quantile regression, corresponding
to the (continuous approximation of the) respective quantile of the
target values used in the training.
The exact value of the global scale is not critical for the model
accuracy (as the model has enough parameters to compensate), but a
value not representating a good overall average leads to factors with
averages unequal to 1 for each feature (making interpretation more
difficult).
"""
if self.weights is None:
raise RuntimeError("The weights have to be initialized.")

@@ -264,21 +313,92 @@ def _init_global_scale(self, X, y):
)
self.prior_pred_link_offset_ = float(self.global_scale_link_)

def quantile_loss(self, params, yhat_others, y, weights):
sum_weighted_error = np.nansum(
(
(y < (params[0] * yhat_others)) * (1 - self.quantile) * (params[0] * yhat_others - y)
+ (y >= (params[0] * yhat_others)) * self.quantile * (y - params[0] * yhat_others)
)
* weights
)
def quantile_costs(self, param, yhat_others, y, weights):
"""
Calculation of the in-sample quantile costs (potentially including
sample weights) for individual feature bins according to a quantile
loss function, to be minimized subsequently.
Parameters
----------
param : float
Factor to be estimated for the feature bin at hand.
yhat_others : np.ndarray
(in-sample) predictions of all other features (excluding the one at
hand) for the bin at hand, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type
Returns
-------
float
calcualted quantile costs
"""
quantile_loss = (y < (param * yhat_others)) * (1 - self.quantile) * (param * yhat_others - y) + (
y >= (param * yhat_others)
) * self.quantile * (y - param * yhat_others)
sum_weighted_error = np.nansum(quantile_loss * weights)
return sum_weighted_error / np.nansum(weights)

def optimization(self, y, yhat_others, weights):
res = minimize(self.quantile_loss, 1, args=(yhat_others, y, weights))
return res.x, np.sqrt(np.log(1 + 2 + np.sum(y)) - np.log(2 + np.sum(y)))
"""
Minimization of the quantile costs (potentially including sample
weights) for individual feature bins. The initial value for the factors
is set to 1 (neutral value for multiplicative model).
Parameters
----------
param : float
Factor to be estimated for the feature bin at hand.
yhat_others : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand) for the bin at hand, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete).
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type
Returns
-------
float, float
estimated parameter (factor) and its uncertainty
"""
res = minimize(self.quantile_costs, 1, args=(yhat_others, y, weights))
# use moment-matching of a Gamma posterior with a log-normal
# distribution as approximation
uncertainty = np.sqrt(np.log(1 + 2 + np.sum(y)) - np.log(2 + np.sum(y)))
return res.x, uncertainty

def calc_parameters(self, feature, y, pred, prefit_data):
"""
Calling of the optimization (quantile loss minimization) for the
different bins of the feature at hand. In contrast to the analytical
solution in most other Cyclic Boosting modes (e.g.,
``CBPoissonRegressor``), working simply via bin statistics
(`bincount`), the optimization here requires a dedicated loss funtion
to be called for each observation.
Parameters
----------
feature : :class:`Feature`
feature for which the parameters of each bin are estimated
y : np.ndarray
target variable, containing data with `float` type (potentially
discrete)
pred : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand), containing data with `float` type
prefit_data
data returned by :meth:`~.precalc_parameters` during fit, not used
here
Returns
-------
float, float
estimated parameter (factor) and its uncertainty
"""
sorting = feature.lex_binned_data.argsort()
sorted_bins = feature.lex_binned_data[sorting]
splits_indices = np.unique(sorted_bins, return_index=True)[1][1:]
Expand All @@ -305,4 +425,4 @@ def calc_parameters(self, feature, y, pred, prefit_data):
return np.log(factors), uncertainties


__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor", "CBQuantileRegressor"]
__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor", "CBMultiplicativeQuantileRegressor"]
2 changes: 1 addition & 1 deletion cyclic_boosting/utils.py
@@ -984,7 +984,7 @@ def continuous_quantile_from_discrete(y, quantile):
Parameters
----------
y : np.ndarray
variable with potentially discrete values
containing data with `float` type (potentially discrete)
quantile : float
desired quantile
14 changes: 7 additions & 7 deletions tests/test_integration.py
@@ -15,7 +15,7 @@
pipeline_CBNBinomRegressor,
pipeline_CBNBinomC,
pipeline_CBGBSRegressor,
pipeline_CBQuantileRegressor,
pipeline_CBMultiplicativeQuantileRegressor,
)


@@ -472,7 +472,7 @@ def evaluate_quantile(y, yhat):
return quantile_acc


def cb_quantile_regressor_model(quantile):
def cb_multiplicative_quantile_regressor_model(quantile):
features = get_features()

fp = feature_properties()
@@ -486,7 +486,7 @@ def cb_quantile_regressor_model(quantile):
observers.PlottingObserver(iteration=-1),
]

CB_pipeline = pipeline_CBQuantileRegressor(
CB_pipeline = pipeline_CBMultiplicativeQuantileRegressor(
quantile=quantile,
feature_properties=fp,
feature_groups=features,
@@ -500,15 +500,15 @@ def test_quantile_regression_median():
return CB_pipeline


def test_quantile_regression_median():
def test_multiplicative_quantile_regression_median():
np.random.seed(42)

df = pd.read_csv("./tests/integration_test_data.csv")

X, y = prepare_data(df)

quantile = 0.5
CB_est = cb_quantile_regressor_model(quantile)
CB_est = cb_multiplicative_quantile_regressor_model(quantile)
CB_est.fit(X.copy(), y)
# plot_CB('analysis_CB_iterfirst',
# [CB_est[-1].observers[0]], CB_est[-2])
@@ -524,15 +524,15 @@ def test_quantile_regression_90():
np.testing.assert_almost_equal(mad, 1.6559, 3)


def test_quantile_regression_90():
def test_multiplicative_quantile_regression_90():
np.random.seed(42)

df = pd.read_csv("./tests/integration_test_data.csv")

X, y = prepare_data(df)

quantile = 0.9
CB_est = cb_quantile_regressor_model(quantile)
CB_est = cb_multiplicative_quantile_regressor_model(quantile)
CB_est.fit(X.copy(), y)
# plot_CB('analysis_CB_iterfirst',
# [CB_est[-1].observers[0]], CB_est[-2])
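The body of evaluate_quantile is collapsed in this diff (only its `return quantile_acc` is visible above). A plausible sketch of such a coverage check, stated as an assumption rather than the repository's actual implementation:

```python
import numpy as np

def evaluate_quantile(y, yhat):
    # assumed behavior: empirical coverage of the predicted quantile,
    # i.e., the fraction of observations at or below the prediction;
    # for a well-calibrated q-quantile this should be close to q
    return np.mean(y <= yhat)
```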
11 changes: 11 additions & 0 deletions tests/test_utils.py
@@ -36,3 +36,14 @@ def test_get_feature_column_names():
np.testing.assert_equal(features, ["a", "b", "c"])
features = utils.get_feature_column_names(X, exclude_columns=["a"])
np.testing.assert_equal(features, ["b", "c"])


def test_continuous_quantile_from_discrete():
y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
quantile_value = utils.continuous_quantile_from_discrete(y, 0.8)
assert quantile_value == 8.0
quantile_value = utils.continuous_quantile_from_discrete(y, 0.35)
assert quantile_value == 3.0
y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2)
quantile_value = utils.continuous_quantile_from_discrete(y, 0.35)
assert quantile_value == 3.5
