introduced ordered smoothing of categorical features

Blue-Yonder-OSS · Oct 31, 2023 · d5b3d31 · d5b3d31
1 parent 350caf4
commit d5b3d31
Show file tree

Hide file tree

Showing 7 changed files with 152 additions and 10 deletions.
diff --git a/cyclic_boosting/common_smoothers.py b/cyclic_boosting/common_smoothers.py
@@ -47,7 +47,7 @@ def _simplify_flags(feature_property: int, feature_group: Optional[str] = None):
     elif flags.is_continuous_set(feature_property):
         return flags.IS_CONTINUOUS
     elif flags.is_ordered_set(feature_property):
-        return flags.IS_UNORDERED
+        return flags.IS_ORDERED
     elif flags.is_unordered_set(feature_property):
         return flags.IS_UNORDERED
     else:
@@ -67,7 +67,7 @@ def _simplify_flags(feature_property: int, feature_group: Optional[str] = None):
 def _default_smoother_types(neutral_factor_link=0, use_normalization=True):
     smoother_types = {
         flags.IS_UNORDERED: smoothing.onedim.WeightedMeanSmoother(prior_prediction=neutral_factor_link),
-        flags.IS_ORDERED: smoothing.onedim.WeightedMeanSmoother(prior_prediction=neutral_factor_link),
+        flags.IS_ORDERED: smoothing.onedim.WeightedMeanSmootherNeighbors(),
         flags.IS_CONTINUOUS: smoothing.onedim.OrthogonalPolynomialSmoother(),
         flags.IS_LINEAR: smoothing.extrapolate.LinearExtrapolator(),
         flags.IS_SEASONAL:

diff --git a/cyclic_boosting/smoothing/onedim.py b/cyclic_boosting/smoothing/onedim.py
@@ -220,6 +220,20 @@ def fit(self, X_for_smoother, y):
         self.smoothed_y_ = utils.regularize_to_error_weighted_mean(y, X_for_smoother[:, 2], self.prior_prediction)
 
 
+class WeightedMeanSmootherNeighbors(AbstractBinSmoother, PredictingBinValueMixin):
+    """
+    Smoother for regularizing one-dimensional bin values with uncertainties to
+    the weighted mean of a window including only its left and right neigboring
+    bins.
+    """
+
+    def fit(self, X_for_smoother, y):
+        if len(y) < 3:
+            self.smoothed_y_ = utils.regularize_to_error_weighted_mean(y, X_for_smoother[:, 2])
+        else:
+            self.smoothed_y_ = utils.regularize_to_error_weighted_mean_neighbors(y, X_for_smoother[:, 2], window_size=3)
+
+
 class OrthogonalPolynomialSmoother(AbstractBinSmoother):
     """A polynomial fit that uses orthogonal polynomials as basis functions.
 

diff --git a/cyclic_boosting/utils.py b/cyclic_boosting/utils.py
@@ -756,7 +756,7 @@ def regularize_to_prior_expectation(values, uncertainties, prior_expectation, th
 
 
 def regularize_to_error_weighted_mean(values, uncertainties, prior_prediction=None):
-    r"""Regularize values with uncertainties to the error weighted mean.
+    r"""Regularize values with uncertainties to its error-weighted mean.
 
     :param values: measured values
     :type values: :class:`numpy.ndarray` (float64, dim=1)
@@ -822,7 +822,7 @@ def regularize_to_error_weighted_mean(values, uncertainties, prior_prediction=No
     ValueError: <values> and <uncertainties> must have the same shape
     """
     if values.shape != uncertainties.shape:
-        raise ValueError("<values> and <uncertainties> " "must have the same shape")
+        raise ValueError("values and uncertainties must have the same shape")
     if len(values) < 1 or (prior_prediction is None and len(values) == 1):
         return values
 
@@ -834,14 +834,54 @@ def regularize_to_error_weighted_mean(values, uncertainties, prior_prediction=No
             # if all values are the same,
             # regularizing to the mean makes no sense
             return x
-        x_mean = np.sum(wx * x) / np.sum(wx)
+        x_mean = np.sum(wx * x) / sum_wx
     else:
         if np.allclose(x, prior_prediction):
             return x
         x_mean = prior_prediction
     wx_incl = 1.0 / (np.sum(wx * np.square(x - x_mean)) / sum_wx)
     res = (wx * x + wx_incl * x_mean) / (wx + wx_incl)
+    return res
+
+
+def regularize_to_error_weighted_mean_neighbors(values, uncertainties, window_size=3):
+    """
+    Regularize values with uncertainties to its error-weighted mean, using a
+    sliding window.
+
+    Parameters
+    ----------
+    values : np.ndarray
+        data (`float` type) to be regularized
+    uncertainties : np.ndarray
+        uncertainties (`float` type) of values
+    window_size : int
+        size of the sliding window to be used (e.g., 3 means include direct
+        left and right neighbors)
 
+    Returns
+    -------
+    np.ndarray
+        regularized values
+    """
+    if values.shape != uncertainties.shape:
+        raise ValueError("values and uncertainties must have the same shape")
+
+    if len(values) < 3:
+        return regularize_to_error_weighted_mean(values, uncertainties)
+
+    window_arr = np.ones(window_size)
+    x = values
+    wx = 1.0 / np.square(uncertainties)
+
+    sum_wx = np.convolve(wx, window_arr, "same")
+    x_mean = np.convolve(wx * x, window_arr, "same") / sum_wx
+
+    wx_incl = np.ones(len(x))
+    for i in range(len(x)):
+        wx_incl[i] = 1.0 / (np.convolve(wx * np.square(x - x_mean[i]), window_arr, "same") / sum_wx)[i]
+
+    res = (wx * x + wx_incl * x_mean) / (wx + wx_incl)
     return res
 
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -38,10 +38,10 @@ def feature_properties() -> dict:
     fp["P_ID"] = flags.IS_UNORDERED
     fp["PG_ID_3"] = flags.IS_UNORDERED
     fp["L_ID"] = flags.IS_UNORDERED
-    fp["dayofweek"] = flags.IS_ORDERED
+    fp["dayofweek"] = flags.IS_UNORDERED
     fp["dayofyear"] = flags.IS_CONTINUOUS | flags.IS_LINEAR
     fp["price_ratio"] = flags.IS_CONTINUOUS | flags.HAS_MISSING | flags.MISSING_NOT_LEARNED
-    fp["PROMOTION_TYPE"] = flags.IS_ORDERED
+    fp["PROMOTION_TYPE"] = flags.IS_UNORDERED
     return fp
 
 

diff --git a/tests/test_integration.py b/tests/test_integration.py
@@ -68,6 +68,57 @@ def test_poisson_regression(is_plot, prepare_data, cb_poisson_regressor_model):
     np.testing.assert_almost_equal(mad, 1.70, 3)
 
 
+@pytest.fixture(scope="function")
+def cb_poisson_regressor_model_ordered_smoothing(features, feature_properties):
+    explicit_smoothers = {
+        ("dayofyear",): SeasonalSmoother(order=3),
+        ("price_ratio",): IsotonicRegressor(increasing=False),
+    }
+
+    plobs = [
+        observers.PlottingObserver(iteration=1),
+        observers.PlottingObserver(iteration=-1),
+    ]
+
+    fp = feature_properties.copy()
+    fp["P_ID"] = flags.IS_ORDERED
+    fp["PG_ID_3"] = flags.IS_ORDERED
+    fp["L_ID"] = flags.IS_ORDERED
+    fp["dayofweek"] = flags.IS_ORDERED
+    fp["price_ratio"] = flags.IS_ORDERED
+
+    CB_pipeline = pipeline_CBPoissonRegressor(
+        feature_properties=fp,
+        feature_groups=features,
+        observers=plobs,
+        maximal_iterations=50,
+        smoother_choice=common_smoothers.SmootherChoiceGroupBy(
+            use_regression_type=True, use_normalization=False, explicit_smoothers=explicit_smoothers
+        ),
+    )
+
+    return CB_pipeline
+
+
+def test_poisson_regression_ordered_smoothing(is_plot, prepare_data, cb_poisson_regressor_model_ordered_smoothing):
+    X, y = prepare_data
+
+    # make the effect visible with high-uncertainty bin
+    X["P_ID"].iloc[1] = 11.5
+
+    CB_est = cb_poisson_regressor_model_ordered_smoothing
+    CB_est.fit(X.copy(), y)
+
+    if is_plot:
+        plot_CB("analysis_CB_iterfirst_ordered", [CB_est[-1].observers[0]], CB_est[-2])
+        plot_CB("analysis_CB_iterlast_ordered", [CB_est[-1].observers[-1]], CB_est[-2])
+
+    yhat = CB_est.predict(X.copy())
+
+    mad = np.nanmean(np.abs(y - yhat))
+    np.testing.assert_almost_equal(mad, 1.70, 3)
+
+
 @pytest.fixture(scope="function")
 def cb_poisson_regressor_model_hierarchical(features, feature_properties):
     explicit_smoothers = {
@@ -178,7 +229,7 @@ def test_regression_ndarray_w_feature_properties(prepare_data, default_features,
     CB_est.fit(X.copy(), y)
     yhat = CB_est.predict(X.copy())
     mad = np.nanmean(np.abs(y - yhat))
-    np.testing.assert_almost_equal(mad, 1.695, 3)
+    np.testing.assert_almost_equal(mad, 1.697, 3)
 
 
 def test_poisson_regression_default_features_and_properties(is_plot, prepare_data, default_features):

diff --git a/tests/test_smoothing_onedim.py b/tests/test_smoothing_onedim.py
@@ -265,9 +265,26 @@ def test_crosstest_weighted_mean_smoother():
     X_for_reg = np.c_[np.arange(n), np.ones(n), [0.05, 0.05, 0.05, 0.05, 0.05, 0.15, 0.15, 0.05]]
     smoother = smoothing.onedim.WeightedMeanSmoother()
     smoother.fit(X_for_reg, y_for_reg)
-    erg = smoother.smoothed_y_
+    res = smoother.smoothed_y_
     ref = regularize1d(y_for_reg, X_for_reg[:, 2])
-    np.testing.assert_allclose(erg, ref)
+    np.testing.assert_allclose(res, ref)
+
+
+def test_crosstest_weighted_mean_neighbors_smoother():
+    y_for_reg = np.array([0.5, 2.0, 0.5])
+    n = len(y_for_reg)
+    X_for_reg = np.c_[np.arange(n), np.ones(n), np.ones(n)]
+    smoother = smoothing.onedim.WeightedMeanSmootherNeighbors()
+    smoother.fit(X_for_reg, y_for_reg)
+    res = smoother.smoothed_y_
+    ref = np.array(
+        [
+            (0.5 + 1.0 / (0.75 * 0.75) * 1.25) / (1 + 1.0 / (0.75 * 0.75)),
+            (2.0 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0)) / (1 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0)),
+            (0.5 + 1.0 / (0.75 * 0.75) * 1.25) / (1 + 1.0 / (0.75 * 0.75)),
+        ]
+    )
+    np.testing.assert_allclose(res, ref)
 
 
 def test_weighted_mean_smoother_raise_not_fitted_exception():

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -73,3 +73,23 @@ def test_get_normalized_values(values, expected_result):
         np.testing.assert_almost_equal(sum(normalized_values), 1.0, 6)
     else:
         np.testing.assert_almost_equal(sum(normalized_values), 0.0, 6)
+
+
+def test_regularize_to_error_weighted_mean_neighbors():
+    X = pd.DataFrame({"a": [0, 1], "b": [3, 6], "c": [2, 7]})
+    features = utils.get_feature_column_names(X)
+    np.testing.assert_equal(features, ["a", "b", "c"])
+    features = utils.get_feature_column_names(X, exclude_columns=["a"])
+    np.testing.assert_equal(features, ["b", "c"])
+
+    values = np.array([0.5, 2.0, 0.5])
+    uncertainties = np.array([1.0, 1.0, 1.0])
+    res = utils.regularize_to_error_weighted_mean_neighbors(values, uncertainties)
+    ref = np.array(
+        [
+            (0.5 + 1.0 / (0.75 * 0.75) * 1.25) / (1 + 1.0 / (0.75 * 0.75)),
+            (2.0 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0)) / (1 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0)),
+            (0.5 + 1.0 / (0.75 * 0.75) * 1.25) / (1 + 1.0 / (0.75 * 0.75)),
+        ]
+    )
+    np.testing.assert_allclose(res, ref)