diff --git a/cyclic_boosting/quantile_matching.py b/cyclic_boosting/quantile_matching.py index 3d879c6..958348d 100644 --- a/cyclic_boosting/quantile_matching.py +++ b/cyclic_boosting/quantile_matching.py @@ -1,11 +1,161 @@ import numpy as np +from numpy import exp, log, sinh, arcsinh, arccosh from scipy.optimize import curve_fit -from scipy.stats import norm, gamma, nbinom +from scipy.stats import norm, gamma, nbinom, logistic from scipy.interpolate import InterpolatedUnivariateSpline from typing import Optional +class J_QPD_S: + """ + Implementation of the semi-bounded mode of Johnson Quantile-Parameterized + Distributions (J-QPD), see https://repositories.lib.utexas.edu/bitstream/handle/2152/63037/HADLOCK-DISSERTATION-2017.pdf. + A distribution is parameterized by a symmetric-percentile triplet (SPT). + + Parameters + ---------- + alpha : float + lower quantile of SPT (upper is ``1 - alpha``) + qv_low : float + quantile function value of ``alpha`` + qv_median : float + quantile function value of quantile 0.5 + qv_high : float + quantile function value of quantile ``1 - alpha`` + l : float + lower bound of semi-bounded range (default is 0) + version: str + options are ``normal`` (default) or ``logistic`` + """ + + def __init__( + self, + alpha: float, + qv_low: float, + qv_median: float, + qv_high: float, + l: Optional[float] = 0, + version: Optional[str] = "normal", + ): + if version == "normal": + self.phi = norm() + elif version == "logistic": + self.phi = logistic() + else: + raise Exception("Invalid version.") + + self.l = l + + self.c = self.phi.ppf(1 - alpha) + + self.L = log(qv_low - l) + self.H = log(qv_high - l) + self.B = log(qv_median - l) + + if self.L + self.H - 2 * self.B > 0: + self.n = 1 + self.theta = qv_low - l + elif self.L + self.H - 2 * self.B < 0: + self.n = -1 + self.theta = qv_high - l + else: + self.n = 0 + self.theta = qv_median - l + + self.delta = 1.0 / self.c * sinh(arccosh((self.H - self.L) / (2 * min(self.B - self.L, self.H - self.B)))) + + self.kappa = 1.0 / (self.delta * self.c) * min(self.H - self.B, self.B - self.L) + + def ppf(self, x): + return self.l + self.theta * exp( + self.kappa * sinh(arcsinh(self.delta * self.phi.ppf(x)) + arcsinh(self.n * self.c * self.delta)) + ) + + def cdf(self, x): + return self.phi.cdf( + 1.0 + / self.delta + * sinh(arcsinh(1.0 / self.kappa * log((x - self.l) / self.theta)) - arcsinh(self.n * self.c * self.delta)) + ) + + +class J_QPD_B: + """ + Implementation of the bounded mode of Johnson Quantile-Parameterized + Distributions (J-QPD), see https://repositories.lib.utexas.edu/bitstream/handle/2152/63037/HADLOCK-DISSERTATION-2017.pdf. + A distribution is parameterized by a symmetric-percentile triplet (SPT). + + Parameters + ---------- + alpha : float + lower quantile of SPT (upper is ``1 - alpha``) + qv_low : float + quantile function value of ``alpha`` + qv_median : float + quantile function value of quantile 0.5 + qv_high : float + quantile function value of quantile ``1 - alpha`` + l : float + lower bound of supported range + u : float + upper bound of supported range + version: str + options are ``normal`` (default) or ``logistic`` + """ + + def __init__( + self, + alpha: float, + qv_low: float, + qv_median: float, + qv_high: float, + l: float, + u: float, + version: Optional[str] = "normal", + ): + if version == "normal": + self.phi = norm() + elif version == "logistic": + self.phi = logistic() + else: + raise Exception("Invalid version.") + + self.l = l + self.u = u + + self.c = self.phi.ppf(1 - alpha) + + self.L = self.phi.ppf((qv_low - l) / (u - l)) + self.H = self.phi.ppf((qv_high - l) / (u - l)) + self.B = self.phi.ppf((qv_median - l) / (u - l)) + + if self.L + self.H - 2 * self.B > 0: + self.n = 1 + self.xi = self.L + elif self.L + self.H - 2 * self.B < 0: + self.n = -1 + self.xi = self.H + else: + self.n = 0 + self.xi = self.B + + self.delta = 1.0 / self.c * arccosh((self.H - self.L) / (2 * min(self.B - self.L, self.H - self.B))) + + self.kappa = (self.H - self.L) / sinh(2 * self.delta * self.c) + + def ppf(self, x): + return self.l + (self.u - self.l) * self.phi.cdf( + self.xi + self.kappa * sinh(self.delta * (self.phi.ppf(x) + self.n * self.c)) + ) + + def cdf(self, x): + return self.phi.cdf( + 1.0 / self.delta * arcsinh(1.0 / self.kappa * (self.phi.ppf((x - self.l) / (self.u - self.l)) - self.xi)) + - self.n * self.c + ) + + def quantile_fit_gaussian(quantiles: np.ndarray, quantile_values: np.ndarray, mode: Optional[str] = "ppf") -> callable: """ Interpolation of a quantile function (with quantiles estimated, e.g., by @@ -21,8 +171,8 @@ def quantile_fit_gaussian(quantiles: np.ndarray, quantile_values: np.ndarray, mo mode : str decides about kind of returned callable, possible values are: - - ``ppf``: quantile (default) - - ``dist``: fitted negative binomial (scipy function) + - ``ppf``: quantile function (default) + - ``dist``: fitted Gaussian function (scipy function) - ``cdf``: CDF function Returns @@ -60,8 +210,8 @@ def quantile_fit_gamma(quantiles: np.ndarray, quantile_values: np.ndarray, mode: mode : str decides about kind of returned callable, possible values are: - - ``ppf``: quantile (default) - - ``dist``: fitted negative binomial (scipy function) + - ``ppf``: quantile function (default) + - ``dist``: fitted Gamma function (scipy function) - ``cdf``: CDF function Returns @@ -123,8 +273,8 @@ def quantile_fit_nbinom(quantiles: np.ndarray, quantile_values: np.ndarray, mode mode : str decides about kind of returned callable, possible values are: - - ``ppf``: quantile (default) - - ``dist``: fitted negative binomial (scipy function) + - ``ppf``: quantile function (default) + - ``dist``: fitted negative binomial function (scipy function) - ``cdf``: CDF function Returns diff --git a/docs/source/tutorial.md b/docs/source/tutorial.md index 0588f9d..f03d105 100644 --- a/docs/source/tutorial.md +++ b/docs/source/tutorial.md @@ -143,3 +143,29 @@ the individual predictions of a given data set: ```python CB_est.get_feature_contributions(X_test) ``` + + +## Quantile Regression +Below you can find an example of a quantile regression model for three +different quantiles, with a subsequent quantile matching (to get a full +individual probability distribution from the estimated quantiles) by means of a +quantile-parameterized distribution for an arbitrary test sample: +```python +from cyclic_boosting.pipelines import pipeline_CBMultiplicativeQuantileRegressor +from cyclic_boosting.quantile_matching import J_QPD_S + +CB_est_qlow = pipeline_CBMultiplicativeQuantileRegressor(quantile=0.2) +CB_est_qlow.fit(X_train.copy(), y) +yhat_qlow = CB_est_qlow.predict(X_test.copy()) + +CB_est_qmedian = pipeline_CBMultiplicativeQuantileRegressor(quantile=0.5) +CB_est_qmedian.fit(X_train.copy(), y) +yhat_qmedian = CB_est_qmedian.predict(X_test.copy()) + +CB_est_qhigh = pipeline_CBMultiplicativeQuantileRegressor(quantile=0.8) +CB_est_qhigh.fit(X_train.copy(), y) +yhat_qhigh = CB_est_qhigh.predict(X_test.copy()) + +j_qpd_s_42 = J_QPD_S(0.2, yhat_qlow[42], yhat_qmedian[42], yhat_qhigh[42]) +yhat_42_percentile95 = j_qpd_s_42.ppf(0.95) +``` diff --git a/tests/test_integration.py b/tests/test_integration.py index 8877b6e..7407ce1 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -20,7 +20,7 @@ pipeline_CBAdditiveGenericCRegressor, pipeline_CBGenericClassifier, ) -from cyclic_boosting.quantile_matching import quantile_fit_gamma, quantile_fit_nbinom, quantile_fit_spline +from cyclic_boosting.quantile_matching import quantile_fit_gamma, quantile_fit_nbinom, quantile_fit_spline, J_QPD_S from cyclic_boosting.utils import smear_discrete_cdftruth from tests.utils import plot_CB, costs_mad, costs_mse @@ -424,6 +424,53 @@ def test_multiplicative_quantile_regression_90(is_plot, prepare_data, features, np.testing.assert_almost_equal(quantile_acc, 0.9015, 3) +@pytest.mark.skip(reason="Long running time") +def test_multiplicative_quantile_regression_pdf_J_QPD_S(is_plot, prepare_data, features, feature_properties): + X, y = prepare_data + + quantiles = [] + quantile_values = [] + for quantile in [0.2, 0.5, 0.8]: + CB_est = cb_multiplicative_quantile_regressor_model( + quantile=quantile, features=features, feature_properties=feature_properties + ) + CB_est.fit(X.copy(), y) + yhat = CB_est.predict(X.copy()) + quantile_values.append(yhat) + quantiles.append(quantile) + + quantiles = np.asarray(quantiles) + quantile_values = np.asarray(quantile_values) + + cdf_truth_list = [] + n_samples = len(X) + for i in range(n_samples): + j_qpd_s = J_QPD_S(0.2, quantile_values[0, i], quantile_values[1, i], quantile_values[2, i]) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.2), quantile_values[0, i], 3) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.5), quantile_values[1, i], 3) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.8), quantile_values[2, i], 3) + if i == 24: + np.testing.assert_almost_equal(j_qpd_s.ppf(0.1), 0.592, 3) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.9), 5.783, 3) + + if is_plot: + plt.plot([0.2, 0.5, 0.8], [quantile_values[0, i], quantile_values[1, i], quantile_values[2, i]], "ro") + xs = np.linspace(0.0, 1.0, 100) + plt.plot(xs, j_qpd_s.ppf(xs)) + plt.savefig("J_QPD_S_integration_" + str(i) + ".png") + plt.clf() + + if is_plot: + cdf_truth = smear_discrete_cdftruth(j_qpd_s.cdf, y[i]) + cdf_truth_list.append(cdf_truth) + + cdf_truth = np.asarray(cdf_truth_list) + if is_plot: + plt.hist(cdf_truth[cdf_truth > 0], bins=30) + plt.savefig("J_QPD_S_cdf_truth_histo.png") + plt.clf() + + @pytest.mark.skip(reason="Long running time") def test_multiplicative_quantile_regression_spline(is_plot, prepare_data, features, feature_properties): X, y = prepare_data diff --git a/tests/test_quantile_matching.py b/tests/test_quantile_matching.py index 2b4f222..569241b 100644 --- a/tests/test_quantile_matching.py +++ b/tests/test_quantile_matching.py @@ -3,6 +3,8 @@ import matplotlib.pyplot as plt from cyclic_boosting.quantile_matching import ( + J_QPD_S, + J_QPD_B, quantile_fit_gaussian, quantile_fit_gamma, quantile_fit_nbinom, @@ -10,6 +12,130 @@ ) +def test_J_QPD_S(is_plot): + alpha = 0.2 + # Gamma-like + qv_low = 1.12 + qv_median = 2.023 + qv_high = 3.322 + + j_qpd_s = J_QPD_S(alpha, qv_low, qv_median, qv_high) + + np.testing.assert_almost_equal(j_qpd_s.ppf(0.2), qv_low, 3) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.5), qv_median, 3) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.8), qv_high, 3) + # expectation from Gamma + np.testing.assert_almost_equal(j_qpd_s.ppf(0.1), 0.79, 2) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.3), 1.42, 2) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.7), 2.78, 2) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.9), 4.18, 2) + + np.testing.assert_almost_equal(j_qpd_s.cdf(qv_low), 0.2, 3) + np.testing.assert_almost_equal(j_qpd_s.cdf(qv_median), 0.5, 3) + np.testing.assert_almost_equal(j_qpd_s.cdf(qv_high), 0.8, 3) + # expectation from Gamma + np.testing.assert_almost_equal(j_qpd_s.cdf(0.79), 0.1, 2) + np.testing.assert_almost_equal(j_qpd_s.cdf(1.42), 0.3, 2) + np.testing.assert_almost_equal(j_qpd_s.cdf(2.78), 0.7, 2) + np.testing.assert_almost_equal(j_qpd_s.cdf(4.18), 0.9, 2) + + if is_plot: + plt.plot([alpha, 0.5, 1 - alpha], [qv_low, qv_median, qv_high], "ro") + xs = np.linspace(0.0, 1.0, 100) + plt.plot(xs, j_qpd_s.ppf(xs)) + plt.savefig("J_QPD_S.png") + plt.clf() + + j_qpd_s = J_QPD_S(alpha, qv_low, qv_median, qv_high, version="logistic") + + np.testing.assert_almost_equal(j_qpd_s.ppf(0.2), qv_low, 3) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.5), qv_median, 3) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.8), qv_high, 3) + # deviate from Gamma expecation + np.testing.assert_almost_equal(j_qpd_s.ppf(0.1), 0.76, 2) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.3), 1.42, 2) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.7), 2.78, 2) + np.testing.assert_almost_equal(j_qpd_s.ppf(0.9), 4.28, 2) + + np.testing.assert_almost_equal(j_qpd_s.cdf(qv_low), 0.2, 3) + np.testing.assert_almost_equal(j_qpd_s.cdf(qv_median), 0.5, 3) + np.testing.assert_almost_equal(j_qpd_s.cdf(qv_high), 0.8, 3) + # deviate from Gamma expecation + np.testing.assert_almost_equal(j_qpd_s.cdf(0.76), 0.1, 2) + np.testing.assert_almost_equal(j_qpd_s.cdf(1.42), 0.3, 2) + np.testing.assert_almost_equal(j_qpd_s.cdf(2.78), 0.7, 2) + np.testing.assert_almost_equal(j_qpd_s.cdf(4.28), 0.9, 2) + + if is_plot: + plt.plot([alpha, 0.5, 1 - alpha], [qv_low, qv_median, qv_high], "ro") + xs = np.linspace(0.0, 1.0, 100) + plt.plot(xs, j_qpd_s.ppf(xs)) + plt.savefig("J_QPD_S_logistic.png") + plt.clf() + + +def test_J_QPD_B(is_plot): + alpha = 0.2 + # Gaussian-like + qv_low = -0.878 + qv_median = 0.3 + qv_high = 1.478 + + j_qpd_b = J_QPD_B(alpha, qv_low, qv_median, qv_high, -5.0, 10.0) + + np.testing.assert_almost_equal(j_qpd_b.ppf(0.2), qv_low, 3) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.5), qv_median, 3) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.8), qv_high, 3) + # expect from Gaussian + np.testing.assert_almost_equal(j_qpd_b.ppf(0.1), -1.49, 2) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.3), -0.43, 2) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.7), 1.03, 2) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.9), 2.09, 1) + + np.testing.assert_almost_equal(j_qpd_b.cdf(qv_low), 0.2, 3) + np.testing.assert_almost_equal(j_qpd_b.cdf(qv_median), 0.5, 3) + np.testing.assert_almost_equal(j_qpd_b.cdf(qv_high), 0.8, 3) + # expect from Gaussian + np.testing.assert_almost_equal(j_qpd_b.cdf(-1.49), 0.1, 2) + np.testing.assert_almost_equal(j_qpd_b.cdf(-0.43), 0.3, 2) + np.testing.assert_almost_equal(j_qpd_b.cdf(1.03), 0.7, 2) + np.testing.assert_almost_equal(j_qpd_b.cdf(2.09), 0.9, 2) + + if is_plot: + plt.plot([alpha, 0.5, 1 - alpha], [qv_low, qv_median, qv_high], "ro") + xs = np.linspace(0.0, 1.0, 100) + plt.plot(xs, j_qpd_b.ppf(xs)) + plt.savefig("J_QPD_B.png") + plt.clf() + + j_qpd_b = J_QPD_B(alpha, qv_low, qv_median, qv_high, -5.0, 10.0, version="logistic") + + np.testing.assert_almost_equal(j_qpd_b.ppf(0.2), qv_low, 3) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.5), qv_median, 3) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.8), qv_high, 3) + # deviate from Gaussian expectation + np.testing.assert_almost_equal(j_qpd_b.ppf(0.1), -1.57, 2) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.3), -0.43, 2) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.7), 1.03, 2) + np.testing.assert_almost_equal(j_qpd_b.ppf(0.9), 2.19, 2) + + np.testing.assert_almost_equal(j_qpd_b.cdf(qv_low), 0.2, 3) + np.testing.assert_almost_equal(j_qpd_b.cdf(qv_median), 0.5, 3) + np.testing.assert_almost_equal(j_qpd_b.cdf(qv_high), 0.8, 3) + # deviate from Gaussian expectation + np.testing.assert_almost_equal(j_qpd_b.cdf(-1.57), 0.1, 2) + np.testing.assert_almost_equal(j_qpd_b.cdf(-0.43), 0.3, 2) + np.testing.assert_almost_equal(j_qpd_b.cdf(1.03), 0.7, 2) + np.testing.assert_almost_equal(j_qpd_b.cdf(2.19), 0.9, 2) + + if is_plot: + plt.plot([alpha, 0.5, 1 - alpha], [qv_low, qv_median, qv_high], "ro") + xs = np.linspace(0.0, 1.0, 100) + plt.plot(xs, j_qpd_b.ppf(xs)) + plt.savefig("J_QPD_B_logistic.png") + plt.clf() + + def test_cdf_fit_gaussian(is_plot): quantiles = np.array([0.1, 0.3, 0.5, 0.7, 0.9]) mu_exp = 0.3