diff --git a/examples/plot_huggingface_model.py b/examples/plot_huggingface_model.py index 9508bfb..2310b8d 100755 --- a/examples/plot_huggingface_model.py +++ b/examples/plot_huggingface_model.py @@ -6,15 +6,16 @@ (QRF) model from Hugging Face Hub and use it to estimate new quantiles. In this scenario, a QRF has been trained with default parameters on a train-test split of the California housing dataset and uploaded to Hugging Face Hub. The -model is downloaded, and inference is performed over several quantiles for -each instance in the dataset. The estimates are visualized by the latitude and -longitude of each instance. The model used is available on Hugging Face Hub +model is downloaded and used to perform inference across several quantiles for +each dataset sample. The results are visualized by the latitude and longitude +of each sample. The model used is available on Hugging Face Hub `here `_. """ import os import pickle import shutil +import tempfile import altair as alt import numpy as np @@ -134,13 +135,16 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= # Create the repository on the Hugging Face Hub if it does not exist. # Push the model to the repository. - hub_utils.push( - repo_id=repo_id, - source=local_dir, - token=token, # personal token to be downloaded from Hugging Face - commit_message="Model commit", - create_remote=True, - ) + try: + hub_utils.push( + repo_id=repo_id, + source=local_dir, + token=token, # personal token to be downloaded from Hugging Face + commit_message="Model commit", + create_remote=True, + ) + except Exception as e: + print(f"Error pushing model to Hugging Face Hub: {e}") os.remove(model_filename) shutil.rmtree(local_dir) @@ -149,17 +153,13 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= if not load_existing: fit_and_upload_model(token, repo_id, random_state=random_seed) -# Download the repository locally. -local_dir = "./local_repo" -if os.path.exists(local_dir): - shutil.rmtree(local_dir) -hub_utils.download(repo_id=repo_id, dst=local_dir) - -# Load the fitted model. +# Download the repository locally and load the fitted model. model_filename = "model.pkl" -with open(f"{local_dir}/{model_filename}", "rb") as file: - qrf = pickle.load(file) -shutil.rmtree(local_dir) +local_dir = "./local_repo" +with tempfile.TemporaryDirectory() as local_dir: + hub_utils.download(repo_id=repo_id, dst=local_dir) + with open(f"{local_dir}/{model_filename}", "rb") as file: + qrf = pickle.load(file) # Estimate quantiles. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) @@ -174,7 +174,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= ) -def plot_quantiles_by_latlon(df, quantiles): +def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"): # Slider for varying the displayed quantile estimates. slider = alt.binding_range( min=0, @@ -183,11 +183,11 @@ def plot_quantiles_by_latlon(df, quantiles): name="Predicted Quantile: ", ) - quantile_selection = alt.param(value=0.5, bind=slider, name="quantile") + quantile_val = alt.param(value=0.5, bind=slider, name="quantile") chart = ( alt.Chart(df) - .add_params(quantile_selection) + .add_params(quantile_val) .transform_filter("datum.quantile == quantile") .mark_circle() .encode( @@ -203,7 +203,7 @@ def plot_quantiles_by_latlon(df, quantiles): scale=alt.Scale(zero=False), title="Latitude", ), - color=alt.Color("value:Q", scale=alt.Scale(scheme="cividis"), title="Prediction"), + color=alt.Color("value:Q", scale=alt.Scale(scheme=color_scheme), title="Prediction"), size=alt.Size("Population:Q"), tooltip=[ alt.Tooltip("index:N", title="Row ID"), diff --git a/examples/plot_predict_custom.py b/examples/plot_predict_custom.py index 94f434f..bc5e3cc 100755 --- a/examples/plot_predict_custom.py +++ b/examples/plot_predict_custom.py @@ -3,12 +3,11 @@ ============================================ This example demonstrates how to extract the empirical distribution from a -quantile regression forest (QRF) for one or more samples to calculate a -user-specified function of interest. While a QRF is designed to estimate -quantiles from the empirical distribution calculated for each sample, it can -also be useful to use this empirical distribution to calculate other -quantities of interest. Here, we calculate the empirical cumulative -distribution function (ECDF) for a test sample. +quantile regression forest (QRF) to calculate user-specified functions of +interest. While a QRF is designed to estimate quantiles, their empirical +distributions can also be used to calculate other statistical quantities. In +this scenario, we compute the empirical cumulative distribution function +(ECDF) for test samples. """ from itertools import chain @@ -93,7 +92,7 @@ def predict(reg, X, quantiles=0.5, what=None): "y_val": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])), "y_val2": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))[1:] + [np.nan], "proba": list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf])), - "sample_idx": [idx] * n_quantiles, + "index": [idx] * n_quantiles, } ) dfs.append(df_i) @@ -101,12 +100,12 @@ def predict(reg, X, quantiles=0.5, what=None): def plot_ecdf(df): - min_idx = df["sample_idx"].min() - max_idx = df["sample_idx"].max() + min_idx = df["index"].min() + max_idx = df["index"].max() # Slider for determining the sample index for which the custom function is being visualized. - slider = alt.binding_range(min=min_idx, max=max_idx, step=1, name="Sample Index: ") - sample_selection = alt.param(value=0, bind=slider, name="sample_idx") + slider = alt.binding_range(min=min_idx, max=max_idx, step=1, name="Test Sample Index: ") + index_selection = alt.selection_point(value=0, bind=slider, fields=["index"]) tooltip = [ alt.Tooltip("y_val:Q", title="Response Value"), @@ -136,8 +135,8 @@ def plot_ecdf(df): chart = ( (circles + lines) - .add_params(sample_selection) - .transform_filter(alt.datum["sample_idx"] == sample_selection) + .add_params(index_selection) + .transform_filter(index_selection) .properties( height=400, width=650, diff --git a/examples/plot_proximity_counts.py b/examples/plot_proximity_counts.py index f927de9..c35d87d 100644 --- a/examples/plot_proximity_counts.py +++ b/examples/plot_proximity_counts.py @@ -3,16 +3,15 @@ ================================================== This example demonstrates the use of quantile regression forest (QRF) -proximity counts to identify similar samples in an unsupervised manner, as the -target values are not used during model fitting. In this scenario, we train a -QRF on a noisy dataset to predict individual pixel values (i.e., denoise). We -then retrieve the proximity values for samples in a noisy test set. For each -test sample digit, we visualize it alongside a set of similar (non-noisy) -training samples determined by their proximity counts, as well as the -non-noisy digit. The similar samples are ordered from the highest to the -lowest proximity count for each digit, arranged from left to right and top to -bottom. This example illustrates the effectiveness of proximity counts in -identifying similar samples, even when using noisy training and test data. +proximity counts to identify similar samples. In this scenario, we train a QRF +on a noisy dataset to predict individual pixel values in an unsupervised +manner (the target labels are not used during training) for denoising +purposes. We then retrieve the proximity values for the noisy test samples. We +visualize each test sample alongside a set of similar (non-noisy) training +samples determined by their proximity counts. These similar samples are +ordered from the highest to the lowest proximity count. This illustrates how +proximity counts can effectively identify similar samples even in noisy +conditions. """ import altair as alt @@ -41,12 +40,7 @@ def add_gaussian_noise(X, mean=0, std=0.1, random_state=None): """Add Gaussian noise to input data.""" - if random_state is None: - rng = check_random_state(0) - elif isinstance(random_state, int): - rng = check_random_state(random_state) - else: - rng = random_state + rng = check_random_state(random_state) scaler = MinMaxScaler() X_scaled = scaler.fit_transform(X) @@ -97,7 +91,6 @@ def extract_floats(combined_df, scale=100): .join(y_test) .reset_index() .join(df_prox) - .iloc[:n_test_samples] .explode("prox") .assign( **{ @@ -140,26 +133,16 @@ def plot_digits_proximities( subplot_dim = (width - subplot_spacing * (n_subplot_rows - 1)) / n_subplot_rows # Slider for determining the test index for which the data is being visualized. - slider = alt.binding_range( - min=0, - max=n_samples - 1, - step=1, - name="Test Index: ", - ) - - idx_val = alt.selection_point( - value=0, - bind=slider, - fields=["index"], - ) + slider = alt.binding_range(min=0, max=n_samples - 1, step=1, name="Test Sample Index: ") + index_selection = alt.selection_point(value=0, bind=slider, fields=["index"]) scale = alt.Scale(domain=[x_min, x_max], scheme="greys") - opacity = (alt.value(0), alt.value(0.67)) + opacity = (alt.value(0), alt.value(0.5)) - base = alt.Chart(df).add_params(idx_val).transform_filter(idx_val) + base = alt.Chart(df).add_params(index_selection).transform_filter(index_selection) chart1 = ( - base.transform_filter(f"datum.prox_idx == 0") + base.transform_filter("datum.prox_idx == 0") .transform_fold(fold=pixel_cols, as_=["pixel", "value"]) .transform_calculate(value_clean=f"floor(datum.value / {pixel_scale})") .transform_calculate(value_noisy=f"datum.value - (datum.value_clean * {pixel_scale})") @@ -172,7 +155,7 @@ def plot_digits_proximities( opacity=alt.condition(alt.datum["value_noisy"] == 0, *opacity), tooltip=[ alt.Tooltip("target:Q", title="Digit"), - alt.Tooltip("value_noisy:Q", format=".3f", title="Pixel Value"), + alt.Tooltip("value_noisy:Q", format=",.3f", title="Pixel Value"), alt.Tooltip("x:Q", title="Pixel X"), alt.Tooltip("y:Q", title="Pixel Y"), ], @@ -213,7 +196,7 @@ def plot_digits_proximities( ) chart3 = ( - base.transform_filter(f"datum.prox_idx == 0") + base.transform_filter("datum.prox_idx == 0") .transform_fold(fold=pixel_cols, as_=["pixel", "value"]) .transform_calculate(value_clean=f"floor(datum.value / {pixel_scale})") .transform_calculate(x=pixel_x, y=pixel_y) @@ -225,7 +208,7 @@ def plot_digits_proximities( opacity=alt.condition(alt.datum["value_clean"] == 0, *opacity), tooltip=[ alt.Tooltip("target:Q", title="Digit"), - alt.Tooltip("value_clean:Q", title="Pixel Value"), + alt.Tooltip("value_clean:Q", format=",.3f", title="Pixel Value"), alt.Tooltip("x:Q", title="Pixel X"), alt.Tooltip("y:Q", title="Pixel Y"), ], diff --git a/examples/plot_quantile_conformalized.py b/examples/plot_quantile_conformalized.py index 52d1857..9c14879 100755 --- a/examples/plot_quantile_conformalized.py +++ b/examples/plot_quantile_conformalized.py @@ -4,12 +4,13 @@ This example demonstrates the use of a quantile regression forest (QRF) to construct reliable prediction intervals using conformalized quantile -regression (CQR). CQR provides prediction intervals that attain valid -coverage, whereas QRF may require additional calibration for reliable interval -estimates. In this example, by using CQR, we achieve a level of coverage -(i.e., the percentage of samples that actually fall within their prediction -interval) that is generally closer to the target level. This example is -adapted from `"Prediction intervals: Quantile Regression Forests" +regression (CQR). While QRFs can estimate quantiles, they may require +additional calibration to provide reliable interval estimates. CQR provides +prediction intervals that attain valid coverage. In this example, we use CQR +to enhance QRF by producing prediction intervals that achieve a level of +coverage (i.e., the percentage of samples that actually fall within their +prediction interval) that is generally closer to the target level. This +example is adapted from `"Prediction intervals: Quantile Regression Forests" `_ by Carl McBride Ellis. """ @@ -69,6 +70,7 @@ def mean_width_score(y_pred_low, y_pred_upp): def qrf_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None): + """QRF (baseline) strategy.""" quantiles = [alpha / 2, 1 - alpha / 2] qrf = RandomForestQuantileRegressor(random_state=random_state) @@ -89,6 +91,7 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None): def cqr_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None): + """Conformalized Quantile Regression (CQR) strategy with a QRF.""" quantiles = [alpha / 2, 1 - alpha / 2] # Create calibration set. @@ -160,8 +163,7 @@ def plot_prediction_intervals_by_strategy(df): def plot_prediction_intervals(df, domain): # Slider for varying the target coverage level. slider = alt.binding_range(min=0, max=1, step=0.1, name="Coverage Target: ") - cov_selection = alt.param(value=0.9, bind=slider, name="coverage") - cov_tol = 0.01 + coverage_val = alt.param(value=0.9, bind=slider, name="coverage") click = alt.selection_point(fields=["y_label"], bind="legend") @@ -175,10 +177,7 @@ def plot_prediction_intervals(df, domain): base = ( alt.Chart(df) - .transform_filter( - (1 - alt.datum["alpha"] - cov_tol <= cov_selection) - & (1 - alt.datum["alpha"] + cov_tol >= cov_selection) - ) + .transform_filter("round((1 - datum.alpha) * 100) / 100 == coverage") .transform_calculate( y_label=( "((datum.y_test >= datum.y_pred_low) & (datum.y_test <= datum.y_pred_upp))" @@ -249,8 +248,8 @@ def plot_prediction_intervals(df, domain): ) .transform_calculate( coverage_text=( - f"'Coverage: ' + format(datum.coverage * 100, '.1f') + '%'" - f" + ' (target = ' + format((1 - datum.alpha) * 100, '.1f') + '%)'" + "'Coverage: ' + format(datum.coverage * 100, '.1f') + '%'" + " + ' (target = ' + format((1 - datum.alpha) * 100, '.1f') + '%)'" ) ) .mark_text(align="left", baseline="top") @@ -262,9 +261,7 @@ def plot_prediction_intervals(df, domain): ) text_with = ( base.transform_aggregate(width="mean(width)", groupby=["strategy"]) - .transform_calculate( - width_text=f"'Interval Width: ' + format({alt.datum['width']}, '$,d')" - ) + .transform_calculate(width_text="'Interval Width: ' + format(datum.width, '$,d')") .mark_text(align="left", baseline="top") .encode( x=alt.value(5), @@ -275,7 +272,7 @@ def plot_prediction_intervals(df, domain): chart = ( bar + tick_low + tick_upp + circle + diagonal + text_coverage + text_with - ).add_params(cov_selection) + ).add_params(coverage_val) return chart diff --git a/examples/plot_quantile_example.py b/examples/plot_quantile_example.py index a3a1008..7910b37 100755 --- a/examples/plot_quantile_example.py +++ b/examples/plot_quantile_example.py @@ -3,8 +3,9 @@ =========================================== This example demonstrates the use of a quantile regression forest (QRF) to -predict a conditional median and prediction intervals. The example compares -the QRF predictions to a ground truth function used to generate noisy samples. +predict the conditional median and construct prediction intervals. The QRF +predictions are compared to the ground truth function used to generate noisy +samples. """ import altair as alt @@ -27,8 +28,8 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0): f = x * np.sin(x) sigma = 0.25 + x / 10 - noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2) - y = f + (noise if add_noise else 0) + noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2) if add_noise else np.zeros_like(f) + y = f + noise return np.atleast_2d(x).T, y @@ -47,7 +48,7 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0): df = pd.DataFrame( { - "X": np.concatenate([X_func.reshape(-1), X_test.reshape(-1)]), + "x": np.concatenate([X_func.reshape(-1), X_test.reshape(-1)]), "y": np.concatenate([y_func, y_test]), "y_pred": np.concatenate([y_pred_func[:, 1], y_pred_test[:, 1]]), "y_pred_low": np.concatenate([y_pred_func[:, 0], y_pred_test[:, 0]]), @@ -63,11 +64,11 @@ def plot_fit_and_intervals(df): .transform_filter(alt.datum["test"]) # filter to test data .mark_circle(color="#f2a619") .encode( - x=alt.X("X:Q", scale=alt.Scale(nice=False)), + x=alt.X("x:Q", scale=alt.Scale(nice=False)), y=alt.Y("y:Q", title=""), color=alt.Color("point_label:N", scale=alt.Scale(range=["#f2a619"]), title=None), tooltip=[ - alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("x:Q", format=",.3f", title="X"), alt.Tooltip("y:Q", format=",.3f", title="Y"), ], ) @@ -78,11 +79,11 @@ def plot_fit_and_intervals(df): .transform_filter(~alt.datum["test"]) # filter to training data .mark_line(color="black", size=3) .encode( - x=alt.X("X:Q", scale=alt.Scale(nice=False)), - y=alt.Y("y:Q", title="f(x)"), + x=alt.X("x:Q", scale=alt.Scale(nice=False)), + y=alt.Y("y:Q", title="Y"), color=alt.Color("y_true_label:N", scale=alt.Scale(range=["black"]), title=None), tooltip=[ - alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("x:Q", format=",.3f", title="X"), alt.Tooltip("y:Q", format=",.3f", title="Y"), ], ) @@ -93,11 +94,11 @@ def plot_fit_and_intervals(df): .transform_filter(~alt.datum["test"]) # filter to training data .mark_line(color="#006aff", size=5) .encode( - x=alt.X("X:Q", scale=alt.Scale(nice=False)), + x=alt.X("x:Q", scale=alt.Scale(nice=False)), y=alt.Y("y_pred:Q", title=""), color=alt.Color("y_pred_label:N", scale=alt.Scale(range=["#006aff"]), title=None), tooltip=[ - alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("x:Q", format=",.3f", title="X"), alt.Tooltip("y:Q", format=",.3f", title="Y"), alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"), ], @@ -109,11 +110,11 @@ def plot_fit_and_intervals(df): .transform_filter(~alt.datum["test"]) # filter to training data .mark_area(color="#e0f2ff", opacity=0.8) .encode( - x=alt.X("X:Q", scale=alt.Scale(nice=False), title="x"), + x=alt.X("x:Q", scale=alt.Scale(nice=False), title="X"), y=alt.Y("y_pred_low:Q", title=""), y2=alt.Y2("y_pred_upp:Q", title=None), tooltip=[ - alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("x:Q", format=",.3f", title="X"), alt.Tooltip("y:Q", format=",.3f", title="Y"), alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"), alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"), diff --git a/examples/plot_quantile_extrapolation.py b/examples/plot_quantile_extrapolation.py index 0ce3a5f..6718ac4 100755 --- a/examples/plot_quantile_extrapolation.py +++ b/examples/plot_quantile_extrapolation.py @@ -435,8 +435,8 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None base = alt.Chart(df.assign(**{"point_label": "Observations", "line_label": func_str})) points_true = base.mark_circle(size=20).encode( - x=alt.X("X_true:Q", scale=x_scale, title="x"), - y=alt.Y("y_true:Q", scale=y_scale, title="y"), + x=alt.X("X_true:Q", scale=x_scale, title="X"), + y=alt.Y("y_true:Q", scale=y_scale, title="Y"), color=points_color, tooltip=tooltip_true, ) @@ -471,7 +471,7 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None base.transform_aggregate(coverage="mean(coverage)") .transform_calculate( coverage_text=( - f"'Extrapolated Coverage: '" + "'Extrapolated Coverage: '" f" + format({alt.datum['coverage'] * 100}, '.1f') + '%'" f" + ' (target = {(quantiles[1] - quantiles[0]) * 100}%)'" ) diff --git a/examples/plot_quantile_interpolation.py b/examples/plot_quantile_interpolation.py index 147ecc9..c9f589d 100755 --- a/examples/plot_quantile_interpolation.py +++ b/examples/plot_quantile_interpolation.py @@ -2,12 +2,14 @@ Comparing Quantile Interpolation Methods ======================================== -This example illustrates the interpolation methods that can be applied during -prediction when the desired quantile lies between two data points. In this toy -example, the forest estimator creates a single split that separates samples -1–3 and samples 4–5, with quantiles calculated separately for these two groups -based on the actual sample values. The interpolation methods are used when a -calculated quantile does not precisely correspond to one of the actual values. +This example illustrates different interpolation methods that can be used +during prediction in quantile regression forests (QRF). When a desired quantile +lies between two data points, interpolation methods determine the predicted +value. In this toy example, the QRF creates a split that divides the samples +into two groups (samples 1–3 and samples 4–5), with quantiles calculated +separately for each. The interpolation methods demonstrate how predictions are +handled when a quantile does not exactly match a data point. + """ import altair as alt @@ -23,6 +25,8 @@ X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]]) y = np.array([-2, -1, 0, 1, 2]) +# We use a single estimator that retains all leaf samples and is trained without bootstrap. +# By construction of the data, this leads to samples split between two terminal leaf nodes. qrf = RandomForestQuantileRegressor( n_estimators=1, max_samples_leaf=None, @@ -55,10 +59,11 @@ "quantile_upp": [None] * len(y), } + quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)] + # Populate data based on prediction results with different interpolations. for interpolation in interpolations: # Get predictions at median and prediction intervals. - quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)] y_pred = qrf.predict(X, quantiles=quantiles, interpolation=interpolation.lower()) data["method"].extend([interpolation] * len(y)) @@ -77,7 +82,7 @@ def plot_interpolations(df, legend): # Slider for varying the prediction interval that determines the quantiles being interpolated. slider = alt.binding_range(min=0, max=1, step=0.01, name="Prediction Interval: ") - interval_selection = alt.param(value=0.9, bind=slider, name="interval") + interval_val = alt.param(value=0.9, bind=slider, name="interval") click = alt.selection_point(fields=["method"], bind="legend") @@ -90,9 +95,9 @@ def plot_interpolations(df, legend): tooltip = [ alt.Tooltip("method:N", title="Method"), alt.Tooltip("X:N", title="X Values"), - alt.Tooltip("y_pred:Q", format=".3f", title="Predicted Y"), - alt.Tooltip("y_pred_low:Q", format=".3f", title="Predicted Lower Y"), - alt.Tooltip("y_pred_upp:Q", format=".3f", title="Predicted Upper Y"), + alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"), + alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"), + alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"), alt.Tooltip("quantile_low:Q", format=".3f", title="Lower Quantile"), alt.Tooltip("quantile_upp:Q", format=".3f", title="Upper Quantile"), ] @@ -132,7 +137,7 @@ def plot_interpolations(df, legend): chart = ( (area + point) - .add_params(interval_selection, click) + .add_params(interval_val, click) .transform_filter( "(datum.method == 'Actual')" "| (datum.quantile_low == round((0.5 - interval / 2) * 1000) / 1000)" diff --git a/examples/plot_quantile_intervals.py b/examples/plot_quantile_intervals.py index 72412a4..e678ac1 100755 --- a/examples/plot_quantile_intervals.py +++ b/examples/plot_quantile_intervals.py @@ -3,8 +3,8 @@ ================================================ This example demonstrates how to use quantile regression forests (QRF) to -generate prediction intervals on the California Housing dataset. Inspired by -Figure 3 of `"Quantile Regression Forests" +generate prediction intervals on the California Housing dataset. The +visualization is inspired by Figure 3 of `"Quantile Regression Forests" `_ by Meinshausen. """ diff --git a/examples/plot_quantile_multioutput.py b/examples/plot_quantile_multioutput.py index af035bf..be97688 100644 --- a/examples/plot_quantile_multioutput.py +++ b/examples/plot_quantile_multioutput.py @@ -5,7 +5,7 @@ This example demonstrates how to fit a single quantile regressor for multiple target variables on a toy dataset. For each target, multiple quantiles can be estimated simultaneously. In this example, the target variable has two output -values for each sample, with a single regressor used to estimate many +values for each sample, with a single regressor used to estimate multiple quantiles simultaneously. Three of these quantiles are visualized concurrently for each target: the median line and the area defined by the interval points. """ @@ -52,8 +52,7 @@ def make_func_Xy(funcs, bounds, n_samples): def format_frac(fraction): - formatted = ("%.3g" % fraction).rstrip("0").rstrip(".") - return formatted if formatted else "0" + return f"{fraction:.3g}".rstrip("0").rstrip(".") or "0" # Create the dataset with multiple target variables. @@ -67,28 +66,26 @@ def format_frac(fraction): # Get multi-target predictions at specified quantiles. y_pred = qrf.predict(X, quantiles=quantiles) +# Prepare the data frame. df = pd.DataFrame( { "x": np.tile(X.squeeze(), len(funcs)), "y": y.reshape(-1, order="F"), "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]), "y_pred": np.concatenate([y_pred[:, i, len(quantiles) // 2] for i in range(len(funcs))]), - "target": np.concatenate([[f"{i}"] * len(X) for i in range(len(funcs))]), + "target": np.concatenate([[str(i)] * len(X) for i in range(len(funcs))]), } -).join( - pd.DataFrame( - { - f"q_{format_frac(q)}": np.concatenate([y_pred[:, t, idx] for t in range(len(funcs))]) - for idx, q in enumerate(quantiles) - } - ) ) +# Add quantile predictions to the data frame. +for idx, q in enumerate(quantiles): + df[f"q_{format_frac(q)}"] = y_pred[:, :, idx].flatten(order="F") + def plot_multitargets(df, legend): # Slider for varying the displayed prediction intervals. slider = alt.binding_range(min=0, max=1, step=0.05, name="Prediction Interval: ") - interval_selection = alt.param(value=0.95, bind=slider, name="interval") + interval_val = alt.param(value=0.95, bind=slider, name="interval") click = alt.selection_point(fields=["target"], bind="legend") @@ -119,14 +116,14 @@ def plot_multitargets(df, legend): base = ( alt.Chart(df) .transform_calculate( - quantile_low=f"round((0.5 - interval / 2) * 1000) / 1000", - quantile_upp=f"round((0.5 + interval / 2) * 1000) / 1000", + quantile_low="round((0.5 - interval / 2) * 1000) / 1000", + quantile_upp="round((0.5 + interval / 2) * 1000) / 1000", quantile_low_col="'q_' + datum.quantile_low", quantile_upp_col="'q_' + datum.quantile_upp", ) .transform_calculate( - y_pred_low=f"datum[datum.quantile_low_col]", - y_pred_upp=f"datum[datum.quantile_upp_col]", + y_pred_low="datum[datum.quantile_low_col]", + y_pred_upp="datum[datum.quantile_upp_col]", ) ) @@ -138,15 +135,15 @@ def plot_multitargets(df, legend): ) line = base.mark_line(color="black", size=3).encode( - x=alt.X("x:Q", scale=alt.Scale(nice=False), title="x"), - y=alt.Y("y_pred:Q", title="y"), + x=alt.X("x:Q", scale=alt.Scale(nice=False), title="X"), + y=alt.Y("y_pred:Q", title="Y"), color=color, tooltip=tooltip, ) area = base.mark_area(opacity=0.25).encode( - x=alt.X("x:Q", scale=alt.Scale(nice=False), title="x"), - y=alt.Y("y_pred_low:Q", title="y"), + x=alt.X("x:Q", scale=alt.Scale(nice=False), title="X"), + y=alt.Y("y_pred_low:Q", title="Y"), y2=alt.Y2("y_pred_upp:Q", title=None), color=color, tooltip=tooltip, @@ -154,7 +151,7 @@ def plot_multitargets(df, legend): chart = ( (points + area + line) - .add_params(interval_selection, click) + .add_params(interval_val, click) .configure_range(category=alt.RangeScheme(list(legend.values()))) .properties( height=400, diff --git a/examples/plot_quantile_ranks.py b/examples/plot_quantile_ranks.py index 746052e..730d333 100644 --- a/examples/plot_quantile_ranks.py +++ b/examples/plot_quantile_ranks.py @@ -3,10 +3,10 @@ =================================================== This example demonstrates the use of quantile regression forest (QRF) quantile -ranks to identify potential outlier samples. In this scenario, we train a QRF -model on a toy dataset and use quantile ranks to highlight values that deviate -significantly from the expected range. Potential outliers are defined as -points whose quantile rank falls outside the specified threshold interval +ranks to identify potential outliers in a dataset. In this scenario, we train +a QRF model on a toy dataset and use quantile ranks to highlight values that +deviate significantly from the expected range. Potential outliers are defined +as points whose quantile rank falls outside the specified threshold interval around the median. """ @@ -33,37 +33,33 @@ def make_toy_dataset(n_samples, bounds, random_seed=0): X, y = make_toy_dataset(n_samples, bounds, random_seed=random_seed) -params = {"max_samples_leaf": None, "min_samples_leaf": 50, "random_state": random_seed} -qrf = RandomForestQuantileRegressor(**params).fit(X, y) +qrf = RandomForestQuantileRegressor( + max_samples_leaf=None, + min_samples_leaf=50, + random_state=random_seed, +).fit(X, y) y_pred = qrf.predict(X, quantiles=0.5) # Get the quantile rank for all samples. y_ranks = qrf.quantile_ranks(X, y) -df = pd.DataFrame( - { - "x": X.reshape(-1), - "y": y, - "y_pred": y_pred, - "y_rank": y_ranks, - } -) +df = pd.DataFrame({"x": X.reshape(-1), "y": y, "y_pred": y_pred, "y_rank": y_ranks}) def plot_fit_and_ranks(df): # Slider for varying the interval that defines the upper and lower quantile rank thresholds. slider = alt.binding_range(min=0, max=1, step=0.01, name="Rank Interval Threshold: ") - interval_selection = alt.param("interval_selection", bind=slider, value=0.05) + interval_val = alt.param(value=0.05, bind=slider, name="interval") click = alt.selection_point(fields=["outlier"], bind="legend") base = alt.Chart(df) points = ( - base.add_params(interval_selection, click) + base.add_params(interval_val, click) .transform_calculate( - outlier="abs(datum.y_rank - 0.5) > (0.5 - interval_selection / 2) ? 'Yes' : 'No'" + outlier="abs(datum.y_rank - 0.5) > (0.5 - interval / 2) ? 'Yes' : 'No'" ) .mark_circle(opacity=0.5, size=25) .encode( @@ -79,8 +75,8 @@ def plot_fit_and_ranks(df): alt.value("lightgray"), ), tooltip=[ - alt.Tooltip("x:Q", format=".3f", title="x"), - alt.Tooltip("y:Q", format=".3f", title="f(x)"), + alt.Tooltip("x:Q", format=",.3f", title="X"), + alt.Tooltip("y:Q", format=",.3f", title="Y"), alt.Tooltip("y_rank:Q", format=".3f", title="Quantile Rank"), alt.Tooltip("outlier:N", title="Outlier"), ], @@ -88,8 +84,13 @@ def plot_fit_and_ranks(df): ) line_pred = base.mark_line(color="#006aff", size=4).encode( - x=alt.X("x:Q", axis=alt.Axis(title="x")), - y=alt.Y("y_pred:Q", axis=alt.Axis(title="f(x)")), + x=alt.X("x:Q", axis=alt.Axis(title="X")), + y=alt.Y("y_pred:Q", axis=alt.Axis(title="Y")), + tooltip=[ + alt.Tooltip("x:Q", format=",.3f", title="X"), + alt.Tooltip("y:Q", format=",.3f", title="Y"), + alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"), + ], ) # For desired legend labels. diff --git a/examples/plot_quantile_vs_standard.py b/examples/plot_quantile_vs_standard.py index 56aa5ea..ee7ccd8 100755 --- a/examples/plot_quantile_vs_standard.py +++ b/examples/plot_quantile_vs_standard.py @@ -3,12 +3,12 @@ ============================================== This example compares the predictions generated by a quantile regression -forest (QRF) and a standard random forest regressor on a synthetic +forest (QRF) and a standard random forest regressor (RF) on a synthetic right-skewed dataset. In a right-skewed distribution, the mean is to the right -of the median. As illustrated by a greater overlap in the frequencies of the -actual and predicted values, the median (quantile = 0.5) predicted by a -quantile regressor can be a more reliable estimator of a skewed distribution -than the mean. +of the median. This example demonstrates how the median (quantile = 0.5) +predicted by a quantile regressor (QRF) can be a more reliable estimator than +the mean predicted by a standard random forest when dealing with skewed +distributions. """ import altair as alt @@ -24,18 +24,18 @@ random_seed = 0 rng = check_random_state(random_seed) -quantiles = np.linspace(0, 1, num=21, endpoint=True).round(2).tolist() +quantiles = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist() # Create right-skewed dataset. n_samples = 5000 -a, loc, scale = 5, -1, 1 +a, loc, scale = 7, -1, 1 skewnorm_rv = sp.stats.skewnorm(a, loc, scale) skewnorm_rv.random_state = rng y = skewnorm_rv.rvs(n_samples) X = rng.randn(n_samples, 2) * y.reshape(-1, 1) -regr_rf = RandomForestRegressor(n_estimators=10, random_state=random_seed) -regr_qrf = RandomForestQuantileRegressor(n_estimators=10, random_state=random_seed) +regr_rf = RandomForestRegressor(random_state=random_seed) +regr_qrf = RandomForestQuantileRegressor(random_state=random_seed) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed) @@ -53,14 +53,15 @@ def format_frac(fraction): - formatted = ("%.3g" % fraction).rstrip("0").rstrip(".") - return formatted if formatted else "0" + return f"{fraction:.3g}".rstrip("0").rstrip(".") or "0" -df = pd.DataFrame({"actual": y_test, "rf": y_pred_rf}).join( - pd.DataFrame( - {f"qrf_{format_frac(q)}": y_pred_qrf[..., idx] for idx, q in enumerate(quantiles)} - ) +df = pd.DataFrame( + { + "actual": y_test, + "rf": y_pred_rf, + **{f"qrf_{format_frac(q)}": y_pred_qrf[..., idx] for idx, q in enumerate(quantiles)}, + } ) @@ -72,13 +73,13 @@ def plot_prediction_histograms(df, legend): step=0.5 if len(quantiles) == 1 else 1 / (len(quantiles) - 1), name="Predicted Quantile: ", ) - quantile_selection = alt.param(value=0.5, bind=slider, name="quantile") + quantile_val = alt.param(value=0.5, bind=slider, name="quantile") click = alt.selection_point(fields=["label"], bind="legend") chart = ( alt.Chart(df) - .add_params(quantile_selection, click) + .add_params(quantile_val, click) .transform_calculate(qrf_col="'qrf_' + quantile") .transform_calculate(qrf="datum[datum.qrf_col]") .transform_calculate(calculate="round(datum.actual * 10) / 10", as_="Actual") @@ -112,7 +113,7 @@ def plot_prediction_histograms(df, legend): .properties( height=400, width=650, - title="Distribution of RF vs. QRF Predictions on Right-Skewed Distribution", + title="Distribution of QRF vs. RF Predictions on Right-Skewed Distribution", ) ) return chart diff --git a/examples/plot_treeshap_example.py b/examples/plot_treeshap_example.py index 6bc13b2..6d05c2f 100644 --- a/examples/plot_treeshap_example.py +++ b/examples/plot_treeshap_example.py @@ -3,14 +3,15 @@ ========================================== This example demonstrates the use of SHAP (SHapley Additive exPlanations) to -explain the predictions of a quantile regression forest (QRF) model. We -generate a waterfall plot using the `Tree SHAP +explain the predictions of a quantile regression forest (QRF) model. SHAP +values offer insight into the contribution of each feature to the predicted +value. We generate a waterfall plot using the `Tree SHAP `_ -method to visualize the explanations for a single instance across multiple +method to visualize the explanations for a single sample across multiple quantiles. In a QRF, quantile estimation is applied during inference, meaning -the selected quantile affects the expected value of the model output but does -not alter the feature contributions. This plot allows us to observe how the -SHAP explanations vary with different quantile choices. +the selected quantile affects the specific value of the model output but does +not alter the underlying feature contributions. This plot allows us to observe +how the SHAP explanations vary with different quantile choices. """ import altair as alt @@ -29,6 +30,27 @@ def get_shap_values(qrf, X, quantile=0.5, **kwargs): + """Get QRF model SHAP values using Tree SHAP. + + Note that, at the time of writing, SHAP does not natively provide support + for models in which quantile estimation is applied during inference, such + as with QRFs. To address this limitation, this function adjusts the + explainer outputs based on the difference between the mean and quantile + predictions for each sample, creating new base and expected values. + + Parameters + ---------- + qrf : BaseForestQuantileRegressor + Quantile forest model object. + X : array-like, of shape (n_samples, n_features) + Input dataset for which SHAP values are calculated. + quantiles : float, default=0.5 + Quantile for which SHAP values are calculated. + + Returns + ------- + shap.Explanation: The SHAP values explanation object. + """ # Define a custom tree model. model = { "objective": qrf.criterion, @@ -60,6 +82,7 @@ def get_shap_values(qrf, X, quantile=0.5, **kwargs): def get_shap_value_by_index(shap_values, index): + """Get QRF model SHAP values for a specified index.""" shap_values_i = shap_values[index] shap_values_i.base_values = shap_values.base_values[index] return shap_values_i @@ -75,6 +98,10 @@ def get_shap_value_by_index(shap_values, index): qrf = RandomForestQuantileRegressor(random_state=random_seed) qrf.fit(X_train, y_train) +shap_values_list = [ + get_shap_value_by_index(get_shap_values(qrf, X_test, quantile=q), test_idx) for q in quantiles +] + df = pd.concat( [ pd.DataFrame( @@ -89,8 +116,7 @@ def get_shap_value_by_index(shap_values, index): "quantile": q, } ) - for q in quantiles - for shap_i in [get_shap_value_by_index(get_shap_values(qrf, X_test, quantile=q), test_idx)] + for shap_i, q in zip(shap_values_list, quantiles) ], ignore_index=True, ) @@ -106,7 +132,7 @@ def plot_shap_waterfall_with_quantiles(df, height=300): step=0.5 if len(quantiles) == 1 else 1 / (len(quantiles) - 1), name="Predicted Quantile: ", ) - quantile_selection = alt.param(value=0.5, bind=slider, name="quantile") + quantile_val = alt.param(value=0.5, bind=slider, name="quantile") df_grouped = ( df.groupby("quantile")[df.columns.tolist()] @@ -187,7 +213,8 @@ def plot_shap_waterfall_with_quantiles(df, height=300): alt.datum["shap_value"] > 0, alt.value("#ff0251"), alt.value("#018bfb") ), tooltip=[ - alt.Tooltip("feature_name:N", title="Feature"), + alt.Tooltip("feature_name:N", title="Feature Name"), + alt.Tooltip("feature_value:Q", title="Feature Value"), alt.Tooltip("shap_value:Q", format=",.2f", title="SHAP Value"), alt.Tooltip("start:Q", format=",.2f", title="SHAP Start"), alt.Tooltip("end:Q", format=",.2f", title="SHAP End"), @@ -270,7 +297,7 @@ def plot_shap_waterfall_with_quantiles(df, height=300): chart = ( (bars + points + text + rules) - .add_params(quantile_selection) + .add_params(quantile_val) .configure_view(strokeOpacity=0) .properties( width=600, height=height, title="Waterfall Plot of SHAP Values for QRF Predictions"