From f72ba1b9d2268d98e888c52780aeaddc072fda3b Mon Sep 17 00:00:00 2001 From: Reid Johnson Date: Mon, 5 Aug 2024 19:47:12 -0700 Subject: [PATCH] Update example plots --- examples/plot_huggingface_model.py | 2 +- examples/plot_proximity_counts.py | 50 ++++++----- examples/plot_quantile_conformalized.py | 5 +- examples/plot_quantile_interpolation.py | 2 +- examples/plot_quantile_multioutput.py | 112 +++++++++++------------- examples/plot_quantile_vs_standard.py | 4 +- examples/plot_treeshap_example.py | 2 +- 7 files changed, 86 insertions(+), 91 deletions(-) diff --git a/examples/plot_huggingface_model.py b/examples/plot_huggingface_model.py index de01c67..c263159 100755 --- a/examples/plot_huggingface_model.py +++ b/examples/plot_huggingface_model.py @@ -31,7 +31,7 @@ repo_id = "quantile-forest/california-housing-example" load_existing = True -quantiles = list((np.arange(5) * 25) / 100) +quantiles = np.arange(0, 1.25, 0.25).round(2).tolist() sample_frac = 1 diff --git a/examples/plot_proximity_counts.py b/examples/plot_proximity_counts.py index 9a2008a..e1cff22 100644 --- a/examples/plot_proximity_counts.py +++ b/examples/plot_proximity_counts.py @@ -27,13 +27,13 @@ rng = check_random_state(0) -n_examples = 25 +n_test = 25 noise_std = 0.1 # Load the Digits dataset. X, y = datasets.load_digits(return_X_y=True, as_frame=True) -X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, random_state=0) def add_gaussian_noise(X, mean=0, std=0.1, random_state=None): @@ -72,27 +72,29 @@ def extract_floats(combined_df, scale=100): return df1, df2 -# Randomly corrupt a fraction of the training and test data. -X_train_corrupt = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=rng) -X_test_corrupt = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=rng) +# Randomly add noise to the training and test data. +X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=rng) +X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=rng) -# We set `max_samples_leaf=None` so that all leaf node samples are stored. +# We set `max_samples_leaf=None` to ensure that every sample in the training +# data is stored in the leaf nodes. By doing this, we allow the model to +# consider all samples as potential candidates for proximity calculations. qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=0) -qrf.fit(X_train_corrupt, X_train) +qrf.fit(X_train_noisy, X_train) # Get the proximity counts. 
-proximities = qrf.proximity_counts(X_test_corrupt) +proximities = qrf.proximity_counts(X_test_noisy) df_prox = pd.DataFrame( {"prox": [[(j, *p) for j, p in enumerate(proximities[i])] for i in range(len(X_test))]} ) df = ( - combine_floats(X_test, X_test_corrupt) + combine_floats(X_test, X_test_noisy) .join(y_test) .reset_index() .join(df_prox) - .iloc[:n_examples] + .iloc[:n_test] .explode("prox") .assign( **{ @@ -106,7 +108,7 @@ def extract_floats(combined_df, scale=100): ) df_lookup = ( - combine_floats(X_train, X_train_corrupt) + combine_floats(X_train, X_train_noisy) .assign(**{"index": np.arange(len(X_train))}) .join(y_train) ) @@ -146,23 +148,23 @@ def plot_digits_proximities( fold=[f"pixel_{y}_{x}" for y in range(8) for x in range(8)], as_=["pixel", "value"], ) - .transform_calculate(value_orig="floor(datum.value / 100)") - .transform_calculate(value_corr="datum.value - (datum.value_orig * 100)") + .transform_calculate(value_clean="floor(datum.value / 100)") + .transform_calculate(value_noisy="datum.value - (datum.value_clean * 100)") .transform_calculate(x="substring(datum.pixel, 8, 9)", y="substring(datum.pixel, 6, 7)") .mark_rect() .encode( x=alt.X("x:N", axis=None), y=alt.Y("y:N", axis=None), - color=alt.Color("value_corr:Q", legend=None, scale=alt.Scale(scheme="greys")), - opacity=alt.condition(alt.datum["value_corr"] == 0, alt.value(0), alt.value(0.67)), + color=alt.Color("value_noisy:Q", legend=None, scale=alt.Scale(scheme="greys")), + opacity=alt.condition(alt.datum["value_noisy"] == 0, alt.value(0), alt.value(0.67)), tooltip=[ alt.Tooltip("target:Q", title="Digit"), - alt.Tooltip("value_corr:Q", format=".3f", title="Pixel Value"), + alt.Tooltip("value_noisy:Q", format=".3f", title="Pixel Value"), alt.Tooltip("x:Q", title="Pixel X"), alt.Tooltip("y:Q", title="Pixel Y"), ], ) - .properties(height=height, width=width, title="Test Digit (corrupted)") + .properties(height=height, width=width, title="Test Digit (noisy)") ) chart2 = ( @@ -171,8 +173,8 @@ def plot_digits_proximities( .encode( x=alt.X("x:N", axis=None), y=alt.Y("y:N", axis=None), - color=alt.Color("value_orig:Q", legend=None, scale=alt.Scale(scheme="greys")), - opacity=alt.condition(alt.datum["value_orig"] == 0, alt.value(0), alt.value(0.67)), + color=alt.Color("value_clean:Q", legend=None, scale=alt.Scale(scheme="greys")), + opacity=alt.condition(alt.datum["value_clean"] == 0, alt.value(0), alt.value(0.67)), tooltip=[ alt.Tooltip("prox_cnt", title="Proximity Count"), alt.Tooltip("target:Q", title="Digit"), @@ -196,7 +198,7 @@ def plot_digits_proximities( fold=[f"pixel_{y}_{x}" for y in range(8) for x in range(8)], as_=["pixel", "value"], ) - .transform_calculate(value_orig="floor(datum.value / 100)") + .transform_calculate(value_clean="floor(datum.value / 100)") .transform_calculate(x="substring(datum.pixel, 8, 9)", y="substring(datum.pixel, 6, 7)") .properties( height=subplot_dim, width=subplot_dim, title=f"Proximity Digits (top {n_prox})" @@ -209,17 +211,17 @@ def plot_digits_proximities( fold=[f"pixel_{y}_{x}" for y in range(8) for x in range(8)], as_=["pixel", "value"], ) - .transform_calculate(value_orig="floor(datum.value / 100)") + .transform_calculate(value_clean="floor(datum.value / 100)") .transform_calculate(x="substring(datum.pixel, 8, 9)", y="substring(datum.pixel, 6, 7)") .mark_rect() .encode( x=alt.X("x:N", axis=None), y=alt.Y("y:N", axis=None), - color=alt.Color("value_orig:Q", legend=None, scale=alt.Scale(scheme="greys")), - opacity=alt.condition(alt.datum["value_orig"] == 0, alt.value(0), 
alt.value(0.67)), + color=alt.Color("value_clean:Q", legend=None, scale=alt.Scale(scheme="greys")), + opacity=alt.condition(alt.datum["value_clean"] == 0, alt.value(0), alt.value(0.67)), tooltip=[ alt.Tooltip("target:Q", title="Digit"), - alt.Tooltip("value_orig:Q", title="Pixel Value"), + alt.Tooltip("value_clean:Q", title="Pixel Value"), alt.Tooltip("x:Q", title="Pixel X"), alt.Tooltip("y:Q", title="Pixel Y"), ], diff --git a/examples/plot_quantile_conformalized.py b/examples/plot_quantile_conformalized.py index e9c9e9f..2448805 100755 --- a/examples/plot_quantile_conformalized.py +++ b/examples/plot_quantile_conformalized.py @@ -33,7 +33,7 @@ random_state = 0 rng = check_random_state(random_state) -coverages = list(np.arange(11) / 10) # the "coverage level" +coverages = np.arange(0, 1.1, 0.1).round(1).tolist() # the "coverage level" # Load the California Housing Prices dataset. california = datasets.fetch_california_housing() @@ -146,7 +146,8 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test): "coverage": coverage_score(grp["y_test"], grp["y_pred_low"], grp["y_pred_upp"]), "width": mean_width_score(grp["y_pred_low"], grp["y_pred_upp"]), } - ) + ), + include_groups=False, ) .reset_index() ) diff --git a/examples/plot_quantile_interpolation.py b/examples/plot_quantile_interpolation.py index 7292fac..aa479a1 100755 --- a/examples/plot_quantile_interpolation.py +++ b/examples/plot_quantile_interpolation.py @@ -16,7 +16,7 @@ from quantile_forest import RandomForestQuantileRegressor -intervals = list(np.arange(101) / 100) +intervals = np.arange(0, 1.01, 0.01).round(2).tolist() # Create toy dataset. X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]]) diff --git a/examples/plot_quantile_multioutput.py b/examples/plot_quantile_multioutput.py index 2583df4..4b1001b 100755 --- a/examples/plot_quantile_multioutput.py +++ b/examples/plot_quantile_multioutput.py @@ -2,11 +2,12 @@ Multiple-Output Quantile Regression with QRFs ============================================= -This example demonstrates fitting a single quantile regressor for multiple +This example demonstrates how to fit a single quantile regressor for multiple target variables on a toy dataset. For each target, multiple quantiles can be estimated simultaneously. In this example, the target variable has two output -values for each sample, with a single regressor used to estimate three -quantiles (the median and interval points) for each target. +values for each sample, with a single regressor used to estimate many +quantiles simultaneously. Three of these quantiles are visualized concurrently +for each target: the median line and the area defined by the interval points. """ import altair as alt @@ -16,13 +17,11 @@ from quantile_forest import RandomForestQuantileRegressor -alt.data_transformers.disable_max_rows() - np.random.seed(0) n_samples = 2500 bounds = [0, 100] -intervals = list(np.arange(21) / 20) +quantiles = np.arange(0, 1.025, 0.025).round(3).tolist() # Define functions that generate targets; each function maps to one target. funcs = [ @@ -50,6 +49,11 @@ def make_func_Xy(funcs, bounds, n_samples): return np.atleast_2d(x).T, y +def format_frac(fraction): + formatted = ("%.3g" % fraction).rstrip("0").rstrip(".") + return formatted if formatted else "0" + + # Create the dataset with multiple target variables. 
X, y = make_func_Xy(funcs, bounds, n_samples) @@ -58,33 +62,30 @@ def make_func_Xy(funcs, bounds, n_samples): qrf = RandomForestQuantileRegressor(max_samples_leaf=None, max_depth=4, random_state=0) qrf.fit(X_train, y_train) # fit on all of the targets simultaneously -dfs = [] -for idx, interval in enumerate(intervals): - # Get multiple-output predictions at median and prediction intervals. - quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)] - y_pred = qrf.predict(X, quantiles=quantiles) +# Get multiple-output predictions at many quantiles. +y_pred = qrf.predict(X, quantiles=quantiles) - df_i = pd.DataFrame( +df = pd.DataFrame( + { + "x": np.tile(X.squeeze(), len(funcs)), + "y": y.reshape(-1, order="F"), + "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]), + "y_pred": np.concatenate([y_pred[:, i, len(quantiles) // 2] for i in range(len(funcs))]), + "target": np.concatenate([[f"{i}"] * len(X) for i in range(len(funcs))]), + } +).join( + pd.DataFrame( { - "x": np.tile(X.squeeze(), len(funcs)), - "y": y.reshape(-1, order="F"), - "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]), - "y_pred": np.concatenate([y_pred[:, i, 0] for i in range(len(funcs))]), - "y_pred_low": np.concatenate([y_pred[:, i, 1] for i in range(len(funcs))]), - "y_pred_upp": np.concatenate([y_pred[:, i, 2] for i in range(len(funcs))]), - "quantile_low": np.concatenate([[quantiles[1]] * len(X) for i in range(len(funcs))]), - "quantile_upp": np.concatenate([[quantiles[2]] * len(X) for i in range(len(funcs))]), - "target": np.concatenate([[f"{i}"] * len(X) for i in range(len(funcs))]), + f"q_{format_frac(q)}": np.concatenate([y_pred[:, t, idx] for t in range(len(funcs))]) + for idx, q in enumerate(quantiles) } ) - dfs.append(df_i) -df = pd.concat(dfs) +) def plot_multioutputs(df, legend): slider = alt.binding_range(min=0, max=1, step=0.05, name="Prediction Interval: ") interval_selection = alt.param(value=0.95, bind=slider, name="interval") - interval_tol = 0.001 click = alt.selection_point(fields=["target"], bind="legend") @@ -112,52 +113,43 @@ def plot_multioutputs(df, legend): alt.Tooltip("quantile_upp:Q", format=".3f", title="Upper Quantile"), ] - points = ( + base = ( alt.Chart(df) - .mark_circle(color="black", opacity=0.25, size=25) - .encode( - x=alt.X("x:Q", scale=alt.Scale(nice=False)), - y=alt.Y("y:Q"), - color=alt.condition(click, alt.Color("target:N"), alt.value("lightgray")), - tooltip=tooltip, + .transform_calculate( + quantile_low=f"round((0.5 - interval / 2) * 1000) / 1000", + quantile_upp=f"round((0.5 + interval / 2) * 1000) / 1000", + quantile_low_col="'q_' + datum.quantile_low", + quantile_upp_col="'q_' + datum.quantile_upp", + ) + .transform_calculate( + y_pred_low=f"datum[datum.quantile_low_col]", + y_pred_upp=f"datum[datum.quantile_upp_col]", ) ) - line = ( - alt.Chart(df) - .mark_line(color="black", size=3) - .encode( - x=alt.X("x:Q", scale=alt.Scale(nice=False), title="x"), - y=alt.Y("y_pred:Q", title="y"), - color=color, - tooltip=tooltip, - ) + points = base.mark_circle(color="black", opacity=0.25, size=25).encode( + x=alt.X("x:Q", scale=alt.Scale(nice=False)), + y=alt.Y("y:Q"), + color=alt.condition(click, alt.Color("target:N"), alt.value("lightgray")), + tooltip=tooltip, ) - area = ( - alt.Chart(df) - .mark_area(opacity=0.25) - .encode( - x=alt.X("x:Q", scale=alt.Scale(nice=False), title="x"), - y=alt.Y("y_pred_low:Q", title="y"), - y2=alt.Y2("y_pred_upp:Q", title=None), - color=color, - tooltip=tooltip, - ) + line = 
base.mark_line(color="black", size=3).encode( + x=alt.X("x:Q", scale=alt.Scale(nice=False), title="x"), + y=alt.Y("y_pred:Q", title="y"), + color=color, + tooltip=tooltip, ) + area = base.mark_area(opacity=0.25).encode( + x=alt.X("x:Q", scale=alt.Scale(nice=False), title="x"), + y=alt.Y("y_pred_low:Q", title="y"), + y2=alt.Y2("y_pred_upp:Q", title=None), + color=color, + tooltip=tooltip, + ) chart = ( (points + area + line) - .transform_filter( - ( - (alt.datum.quantile_low >= (0.5 - interval_selection / 2 - interval_tol)) - & (alt.datum.quantile_low <= (0.5 - interval_selection / 2 + interval_tol)) - ) - | ( - (alt.datum.quantile_upp >= (0.5 + interval_selection / 2 - interval_tol)) - & (alt.datum.quantile_upp <= (0.5 + interval_selection / 2 + interval_tol)) - ) - ) .add_params(interval_selection, click) .configure_range(category=alt.RangeScheme(list(legend.values()))) .properties(height=400, width=650, title="Multi-target Prediction Intervals") diff --git a/examples/plot_quantile_vs_standard.py b/examples/plot_quantile_vs_standard.py index 08f9d60..2d18973 100755 --- a/examples/plot_quantile_vs_standard.py +++ b/examples/plot_quantile_vs_standard.py @@ -25,6 +25,8 @@ rng = check_random_state(0) +quantiles = np.arange(0, 1.05, 0.05).round(2).tolist() + # Create right-skewed dataset. n_samples = 5000 a, loc, scale = 5, -1, 1 @@ -33,8 +35,6 @@ y = skewnorm_rv.rvs(n_samples) X = rng.randn(n_samples, 2) * y.reshape(-1, 1) -quantiles = list(np.arange(21) * 5 / 100) - regr_rf = RandomForestRegressor(n_estimators=10, random_state=0) regr_qrf = RandomForestQuantileRegressor(n_estimators=10, random_state=0) diff --git a/examples/plot_treeshap_example.py b/examples/plot_treeshap_example.py index 0bb8986..c6670ac 100644 --- a/examples/plot_treeshap_example.py +++ b/examples/plot_treeshap_example.py @@ -24,7 +24,7 @@ n_samples = 500 test_idx = 0 -quantiles = list((np.arange(11) * 10) / 100) +quantiles = np.arange(0, 1.1, 0.1).round(1).tolist() def get_shap_values(qrf, X, quantile=0.5, **kwargs):
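
A note on the quantile grids touched in most of the files above (plot_huggingface_model.py, plot_quantile_interpolation.py, plot_quantile_vs_standard.py, plot_treeshap_example.py, and the coverage grid in plot_quantile_conformalized.py): the new form spells out the bounds and step directly and appears intended to round away floating-point drift before converting to a plain list. A minimal sketch of the pattern, using the 0.1-step grid as an example:

import numpy as np

# np.arange accumulates floating-point error (e.g. the fourth element of
# np.arange(0, 1.1, 0.1) is 0.30000000000000004); rounding to the grid's
# precision before calling tolist() yields the intended quantile values.
quantiles = np.arange(0, 1.1, 0.1).round(1).tolist()
print(quantiles)  # [0.0, 0.1, 0.2, ..., 0.9, 1.0]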
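
On the include_groups=False change in plot_quantile_conformalized.py: pandas 2.2 deprecated passing the grouping columns into the function applied by groupby.apply, and include_groups=False opts into the future behavior and silences the warning. A toy illustration with made-up data, unrelated to the example's actual metrics:

import pandas as pd

# Requires pandas >= 2.2, where the include_groups keyword was added.
df = pd.DataFrame(
    {"strategy": ["qrf", "qrf", "cqr", "cqr"], "coverage": [0.91, 0.89, 0.95, 0.94]}
)
out = (
    df.groupby("strategy")
    .apply(
        # With include_groups=False, `grp` no longer contains the "strategy"
        # column, only the remaining data columns.
        lambda grp: pd.Series({"mean_coverage": grp["coverage"].mean()}),
        include_groups=False,
    )
    .reset_index()
)
print(out)  # one row per strategy with its mean coverage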
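
On the plot_quantile_multioutput.py refactor: instead of looping over prediction intervals and re-predicting three quantiles per iteration, the example now predicts the full quantile grid once and stores each quantile in its own "q_<quantile>" column; the Vega-Lite transform_calculate then resolves the lower and upper bound columns by name (datum[datum.quantile_low_col]) from the slider value, so the old tolerance-based filter is no longer needed. A minimal sketch of that wide layout, using made-up prediction values and a three-quantile grid rather than the example's full 0.025-step grid:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_samples, n_targets = 4, 2
quantiles = [0.025, 0.5, 0.975]  # illustrative subset of the full grid

# Stand-in for `qrf.predict(X, quantiles=quantiles)`, which for the
# multi-output model in the example returns an array of shape
# (n_samples, n_targets, n_quantiles).
y_pred = rng.random((n_samples, n_targets, len(quantiles)))


def format_frac(fraction):
    # Same helper as in the patch: trim trailing zeros so 0.500 -> "0.5"
    # and 0.0 -> "0", matching the column names the chart reconstructs.
    formatted = ("%.3g" % fraction).rstrip("0").rstrip(".")
    return formatted if formatted else "0"


# One "q_<quantile>" column per quantile, with the targets stacked so that
# each row corresponds to one (sample, target) pair.
df = pd.DataFrame(
    {
        f"q_{format_frac(q)}": np.concatenate(
            [y_pred[:, t, idx] for t in range(n_targets)]
        )
        for idx, q in enumerate(quantiles)
    }
)
print(df.columns.tolist())  # ['q_0.025', 'q_0.5', 'q_0.975']

Because every quantile is materialized up front, moving the interval slider only changes which pair of columns the chart reads; nothing is re-predicted.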