Skip to content

Commit

Permalink
introduced ordered smoothing of categorical features
Browse files Browse the repository at this point in the history
  • Loading branch information
FelixWick committed Oct 31, 2023
1 parent 350caf4 commit d5b3d31
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 10 deletions.
4 changes: 2 additions & 2 deletions cyclic_boosting/common_smoothers.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _simplify_flags(feature_property: int, feature_group: Optional[str] = None):
elif flags.is_continuous_set(feature_property):
return flags.IS_CONTINUOUS
elif flags.is_ordered_set(feature_property):
return flags.IS_UNORDERED
return flags.IS_ORDERED
elif flags.is_unordered_set(feature_property):
return flags.IS_UNORDERED
else:
Expand All @@ -67,7 +67,7 @@ def _simplify_flags(feature_property: int, feature_group: Optional[str] = None):
def _default_smoother_types(neutral_factor_link=0, use_normalization=True):
smoother_types = {
flags.IS_UNORDERED: smoothing.onedim.WeightedMeanSmoother(prior_prediction=neutral_factor_link),
flags.IS_ORDERED: smoothing.onedim.WeightedMeanSmoother(prior_prediction=neutral_factor_link),
flags.IS_ORDERED: smoothing.onedim.WeightedMeanSmootherNeighbors(),
flags.IS_CONTINUOUS: smoothing.onedim.OrthogonalPolynomialSmoother(),
flags.IS_LINEAR: smoothing.extrapolate.LinearExtrapolator(),
flags.IS_SEASONAL:
Expand Down
14 changes: 14 additions & 0 deletions cyclic_boosting/smoothing/onedim.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,20 @@ def fit(self, X_for_smoother, y):
self.smoothed_y_ = utils.regularize_to_error_weighted_mean(y, X_for_smoother[:, 2], self.prior_prediction)


class WeightedMeanSmootherNeighbors(AbstractBinSmoother, PredictingBinValueMixin):
    """
    Smoother that regularizes each one-dimensional bin value (with its
    uncertainty) towards the error-weighted mean of a three-bin window made up
    of the bin itself and its direct left and right neighboring bins.

    With fewer than three bins there is no full window, so all bins are
    regularized to their common error-weighted mean instead.
    """

    def fit(self, X_for_smoother, y):
        """Compute the regularized bin values and store them in ``smoothed_y_``.

        The third column of ``X_for_smoother`` is handed to the regularization
        helpers as the uncertainties of ``y``.
        """
        has_full_window = len(y) >= 3
        uncertainties = X_for_smoother[:, 2]
        if has_full_window:
            smoothed = utils.regularize_to_error_weighted_mean_neighbors(y, uncertainties, window_size=3)
        else:
            smoothed = utils.regularize_to_error_weighted_mean(y, uncertainties)
        self.smoothed_y_ = smoothed


class OrthogonalPolynomialSmoother(AbstractBinSmoother):
"""A polynomial fit that uses orthogonal polynomials as basis functions.
Expand Down
46 changes: 43 additions & 3 deletions cyclic_boosting/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ def regularize_to_prior_expectation(values, uncertainties, prior_expectation, th


def regularize_to_error_weighted_mean(values, uncertainties, prior_prediction=None):
r"""Regularize values with uncertainties to the error weighted mean.
r"""Regularize values with uncertainties to its error-weighted mean.
:param values: measured values
:type values: :class:`numpy.ndarray` (float64, dim=1)
Expand Down Expand Up @@ -822,7 +822,7 @@ def regularize_to_error_weighted_mean(values, uncertainties, prior_prediction=No
ValueError: <values> and <uncertainties> must have the same shape
"""
if values.shape != uncertainties.shape:
raise ValueError("<values> and <uncertainties> " "must have the same shape")
raise ValueError("values and uncertainties must have the same shape")
if len(values) < 1 or (prior_prediction is None and len(values) == 1):
return values

Expand All @@ -834,14 +834,54 @@ def regularize_to_error_weighted_mean(values, uncertainties, prior_prediction=No
# if all values are the same,
# regularizing to the mean makes no sense
return x
x_mean = np.sum(wx * x) / np.sum(wx)
x_mean = np.sum(wx * x) / sum_wx
else:
if np.allclose(x, prior_prediction):
return x
x_mean = prior_prediction
wx_incl = 1.0 / (np.sum(wx * np.square(x - x_mean)) / sum_wx)
res = (wx * x + wx_incl * x_mean) / (wx + wx_incl)
return res


def regularize_to_error_weighted_mean_neighbors(values, uncertainties, window_size=3):
    """
    Regularize values with uncertainties to their error-weighted mean, using a
    sliding window.

    For every bin, the error-weighted mean and the error-weighted variance of
    the values inside the window around that bin are computed. The bin value
    is then pulled towards the window mean, with the inverse window variance
    acting as the weight of the mean. Windows are truncated at the array
    borders.

    Parameters
    ----------
    values : np.ndarray
        data (`float` type) to be regularized
    uncertainties : np.ndarray
        uncertainties (`float` type) of values
    window_size : int
        size of the sliding window to be used (e.g., 3 means include direct
        left and right neighbors)

    Returns
    -------
    np.ndarray
        regularized values

    Raises
    ------
    ValueError
        if `values` and `uncertainties` do not have the same shape
    """
    if values.shape != uncertainties.shape:
        raise ValueError("values and uncertainties must have the same shape")

    # Fall back to the global error-weighted mean when there are fewer than
    # three values or when the input is shorter than the window. In the latter
    # case np.convolve(..., "same") would return window_size elements instead
    # of len(values), so the arrays below would no longer line up.
    if len(values) < max(window_size, 3):
        return regularize_to_error_weighted_mean(values, uncertainties)

    window_arr = np.ones(window_size)
    x = values
    wx = 1.0 / np.square(uncertainties)

    sum_wx = np.convolve(wx, window_arr, "same")
    x_mean = np.convolve(wx * x, window_arr, "same") / sum_wx

    # Extent of the window around bin i, matching the alignment of
    # np.convolve(..., "same"): bins [i - n_left, i + n_right].
    n_left = window_size // 2
    n_right = window_size - n_left - 1
    n_bins = len(x)

    # Error-weighted variance of each window around its own mean. Only the
    # bins inside the window contribute, so slice them out directly instead of
    # convolving the whole array once per bin.
    window_var = np.empty(n_bins)
    for i in range(n_bins):
        lo = max(0, i - n_left)
        hi = min(n_bins, i + n_right + 1)
        window_var[i] = np.sum(wx[lo:hi] * np.square(x[lo:hi] - x_mean[i])) / sum_wx[i]

    # A window without any spread has nothing to regularize: the weight of the
    # mean would be infinite and the result would degrade to NaN. Its limit is
    # the window mean itself, which is used as the starting point here.
    res = np.array(x_mean, dtype=float)
    has_spread = window_var > 0
    wx_incl = 1.0 / window_var[has_spread]
    res[has_spread] = (wx[has_spread] * x[has_spread] + wx_incl * x_mean[has_spread]) / (
        wx[has_spread] + wx_incl
    )
    return res


Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ def feature_properties() -> dict:
fp["P_ID"] = flags.IS_UNORDERED
fp["PG_ID_3"] = flags.IS_UNORDERED
fp["L_ID"] = flags.IS_UNORDERED
fp["dayofweek"] = flags.IS_ORDERED
fp["dayofweek"] = flags.IS_UNORDERED
fp["dayofyear"] = flags.IS_CONTINUOUS | flags.IS_LINEAR
fp["price_ratio"] = flags.IS_CONTINUOUS | flags.HAS_MISSING | flags.MISSING_NOT_LEARNED
fp["PROMOTION_TYPE"] = flags.IS_ORDERED
fp["PROMOTION_TYPE"] = flags.IS_UNORDERED
return fp


Expand Down
53 changes: 52 additions & 1 deletion tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,57 @@ def test_poisson_regression(is_plot, prepare_data, cb_poisson_regressor_model):
np.testing.assert_almost_equal(mad, 1.70, 3)


@pytest.fixture(scope="function")
def cb_poisson_regressor_model_ordered_smoothing(features, feature_properties):
    """Build a Poisson regressor pipeline with several features flagged as ordered.

    Starts from a copy of the shared ``feature_properties`` and overrides the
    listed features with ``flags.IS_ORDERED``, so the shared dictionary itself
    stays untouched.
    """
    smoothers_by_feature = {
        ("dayofyear",): SeasonalSmoother(order=3),
        ("price_ratio",): IsotonicRegressor(increasing=False),
    }

    # One plotting observer for the first and one for the last iteration.
    plotting_observers = [observers.PlottingObserver(iteration=iteration) for iteration in (1, -1)]

    ordered_features = ("P_ID", "PG_ID_3", "L_ID", "dayofweek", "price_ratio")
    fp = feature_properties.copy()
    fp.update({name: flags.IS_ORDERED for name in ordered_features})

    smoother_choice = common_smoothers.SmootherChoiceGroupBy(
        use_regression_type=True,
        use_normalization=False,
        explicit_smoothers=smoothers_by_feature,
    )

    return pipeline_CBPoissonRegressor(
        feature_properties=fp,
        feature_groups=features,
        observers=plotting_observers,
        maximal_iterations=50,
        smoother_choice=smoother_choice,
    )


def test_poisson_regression_ordered_smoothing(is_plot, prepare_data, cb_poisson_regressor_model_ordered_smoothing):
    """Fit the ordered-smoothing Poisson pipeline and check its mean absolute deviation."""
    X, y = prepare_data

    # make the effect visible with high-uncertainty bin
    # Write through a single positional indexer on the frame. The chained form
    # ``X["P_ID"].iloc[1] = ...`` assigns into an intermediate Series, which
    # pandas warns about and which no longer updates ``X`` under Copy-on-Write.
    X.iloc[1, X.columns.get_loc("P_ID")] = 11.5

    CB_est = cb_poisson_regressor_model_ordered_smoothing
    CB_est.fit(X.copy(), y)

    if is_plot:
        plot_CB("analysis_CB_iterfirst_ordered", [CB_est[-1].observers[0]], CB_est[-2])
        plot_CB("analysis_CB_iterlast_ordered", [CB_est[-1].observers[-1]], CB_est[-2])

    yhat = CB_est.predict(X.copy())

    mad = np.nanmean(np.abs(y - yhat))
    np.testing.assert_almost_equal(mad, 1.70, 3)


@pytest.fixture(scope="function")
def cb_poisson_regressor_model_hierarchical(features, feature_properties):
explicit_smoothers = {
Expand Down Expand Up @@ -178,7 +229,7 @@ def test_regression_ndarray_w_feature_properties(prepare_data, default_features,
CB_est.fit(X.copy(), y)
yhat = CB_est.predict(X.copy())
mad = np.nanmean(np.abs(y - yhat))
np.testing.assert_almost_equal(mad, 1.695, 3)
np.testing.assert_almost_equal(mad, 1.697, 3)


def test_poisson_regression_default_features_and_properties(is_plot, prepare_data, default_features):
Expand Down
21 changes: 19 additions & 2 deletions tests/test_smoothing_onedim.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,9 +265,26 @@ def test_crosstest_weighted_mean_smoother():
X_for_reg = np.c_[np.arange(n), np.ones(n), [0.05, 0.05, 0.05, 0.05, 0.05, 0.15, 0.15, 0.05]]
smoother = smoothing.onedim.WeightedMeanSmoother()
smoother.fit(X_for_reg, y_for_reg)
erg = smoother.smoothed_y_
res = smoother.smoothed_y_
ref = regularize1d(y_for_reg, X_for_reg[:, 2])
np.testing.assert_allclose(erg, ref)
np.testing.assert_allclose(res, ref)


def test_crosstest_weighted_mean_neighbors_smoother():
    """Cross-check WeightedMeanSmootherNeighbors against hand-computed values for three bins."""
    bin_values = np.array([0.5, 2.0, 0.5])
    n_bins = len(bin_values)
    # Columns: bin index, ones, ones (the third column is all ones here).
    smoother_input = np.c_[np.arange(n_bins), np.ones(n_bins), np.ones(n_bins)]

    smoother = smoothing.onedim.WeightedMeanSmootherNeighbors()
    smoother.fit(smoother_input, bin_values)
    res = smoother.smoothed_y_

    # The two border bins are mirror images of each other, so they share one
    # expected value; the middle bin sees all three values.
    border = (0.5 + 1.0 / (0.75 * 0.75) * 1.25) / (1 + 1.0 / (0.75 * 0.75))
    middle = (2.0 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0)) / (1 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0))
    ref = np.array([border, middle, border])

    np.testing.assert_allclose(res, ref)


def test_weighted_mean_smoother_raise_not_fitted_exception():
Expand Down
20 changes: 20 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,23 @@ def test_get_normalized_values(values, expected_result):
np.testing.assert_almost_equal(sum(normalized_values), 1.0, 6)
else:
np.testing.assert_almost_equal(sum(normalized_values), 0.0, 6)


def test_regularize_to_error_weighted_mean_neighbors():
    """Check the sliding-window regularization against hand-computed values for three bins."""
    # NOTE(review): the column-name assertions below exercise
    # utils.get_feature_column_names, not the function named by this test;
    # they look like a copy-paste leftover -- confirm whether they belong here.
    frame = pd.DataFrame({"a": [0, 1], "b": [3, 6], "c": [2, 7]})
    all_columns = utils.get_feature_column_names(frame)
    np.testing.assert_equal(all_columns, ["a", "b", "c"])
    columns_without_a = utils.get_feature_column_names(frame, exclude_columns=["a"])
    np.testing.assert_equal(columns_without_a, ["b", "c"])

    values = np.array([0.5, 2.0, 0.5])
    uncertainties = np.array([1.0, 1.0, 1.0])
    res = utils.regularize_to_error_weighted_mean_neighbors(values, uncertainties)

    # The two border bins are mirror images of each other, so they share one
    # expected value; the middle bin sees all three values.
    border = (0.5 + 1.0 / (0.75 * 0.75) * 1.25) / (1 + 1.0 / (0.75 * 0.75))
    middle = (2.0 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0)) / (1 + 1.0 / ((2 * 0.5 * 0.5 + 1) / 3.0))
    ref = np.array([border, middle, border])

    np.testing.assert_allclose(res, ref)

0 comments on commit d5b3d31

Please sign in to comment.