From ee9f371eb847f9b4308d2d572f719ce97e8f2a1f Mon Sep 17 00:00:00 2001 From: Reid Johnson Date: Sun, 18 Aug 2024 19:35:55 -0700 Subject: [PATCH] Update example plots --- examples/plot_huggingface_model.py | 4 ++- examples/plot_predict_custom.py | 23 +++++++++------- examples/plot_proximity_counts.py | 36 ++++++++++++------------- examples/plot_quantile_conformalized.py | 2 +- examples/plot_quantile_example.py | 25 ++++++++--------- examples/plot_quantile_extrapolation.py | 5 ++-- examples/plot_quantile_interpolation.py | 5 ++-- examples/plot_quantile_intervals.py | 4 +-- examples/plot_quantile_multioutput.py | 5 ++-- examples/plot_quantile_ranks.py | 4 ++- examples/plot_quantile_vs_standard.py | 20 +++++++++----- examples/plot_treeshap_example.py | 3 ++- 12 files changed, 78 insertions(+), 58 deletions(-) diff --git a/examples/plot_huggingface_model.py b/examples/plot_huggingface_model.py index 560fd62..be93c63 100755 --- a/examples/plot_huggingface_model.py +++ b/examples/plot_huggingface_model.py @@ -50,7 +50,9 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= from sklearn.model_selection import train_test_split from skops import card + # Load the California Housing dataset. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) # Fit the model. @@ -160,7 +162,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= with open(f"{local_dir}/{model_filename}", "rb") as file: qrf = pickle.load(file) -# Estimate quantiles. +# Fetch the California Housing dataset and estimate quantiles. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) y_pred = qrf.predict(X, quantiles=quantiles) * 100_000 # predict in dollars diff --git a/examples/plot_predict_custom.py b/examples/plot_predict_custom.py index 81d9507..38483a1 100755 --- a/examples/plot_predict_custom.py +++ b/examples/plot_predict_custom.py @@ -25,12 +25,12 @@ n_test_samples = 100 -def predict(reg, X, quantiles=0.5, what=None): +def predict(qrf, X, quantiles=0.5, what=None): """Custom prediction method that allows user-specified function. Parameters ---------- - reg : BaseForestQuantileRegressor + qrf : BaseForestQuantileRegressor Quantile forest model object. X : array-like, of shape (n_samples, n_features) New test data. @@ -45,16 +45,16 @@ def predict(reg, X, quantiles=0.5, what=None): Output array with the user-specified function applied (if any). """ if what is None: - return reg.predict(X, quantiles=quantiles) + return qrf.predict(X, quantiles=quantiles) - # Get the complete set of proximities (training indices) for each sample. - proximities = reg.proximity_counts(X) + # Get the complete set of proximities for each sample. + proximities = qrf.proximity_counts(X) # Retrieve the unsorted training responses from the model (stored in sorted order). - reverse_sorter = np.argsort(reg.sorter_, axis=0) - y_train = np.empty_like(reg.forest_.y_train).T + reverse_sorter = np.argsort(qrf.sorter_, axis=0) + y_train = np.empty_like(qrf.forest_.y_train).T for i in range(y_train.shape[1]): - y_train[:, i] = np.asarray(reg.forest_.y_train)[i, reverse_sorter[:, i]] + y_train[:, i] = np.asarray(qrf.forest_.y_train)[i, reverse_sorter[:, i]] # For each sample, construct an array of the training responses used for prediction. s = [np.concatenate([np.repeat(y_train[i[0]], i[1]) for i in prox]) for prox in proximities] @@ -65,25 +65,28 @@ def predict(reg, X, quantiles=0.5, what=None): return y_out +# Load the Diabetes dataset. X, y = datasets.load_diabetes(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=n_test_samples, random_state=random_state ) -reg = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train) +qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train) # Define a user-specified function. # Here we randomly sample 1,000 values with replacement from the empirical distribution. func = lambda x: random_state.choice(x, size=1000) # Output array with the user-specified function applied to each sample's empirical distribution. -y_out = predict(reg, X_test, what=func) +y_out = predict(qrf, X_test, what=func) dfs = [] for idx in range(n_test_samples): # Calculate the ECDF from output array. y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out[idx].reshape(1, -1)] + # Get quantiles (x-axis) and probabilities (y-axis). quantiles = list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])) probabilities = list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf])) diff --git a/examples/plot_proximity_counts.py b/examples/plot_proximity_counts.py index 9c950a2..bf701f4 100644 --- a/examples/plot_proximity_counts.py +++ b/examples/plot_proximity_counts.py @@ -5,13 +5,12 @@ This example demonstrates the use of quantile regression forest (QRF) proximity counts to identify similar samples. In this scenario, we train a QRF on a noisy dataset to predict individual pixel values in an unsupervised -manner (the target labels are not used during training) for denoising -purposes. We then retrieve the proximity values for the noisy test samples. We -visualize each test sample alongside a set of similar (non-noisy) training -samples determined by their proximity counts. These similar samples are -ordered from the highest to the lowest proximity count. This illustrates how -proximity counts can effectively identify similar samples even in noisy -conditions. +manner for denoising purposes; the target labels are not used during training. +We then retrieve the proximity values for the noisy test samples. We visualize +each test sample alongside a set of similar (non-noisy) training samples +determined by their proximity counts. These similar samples are ordered from +the highest to the lowest proximity count. This illustrates how proximity +counts can effectively identify similar samples even in noisy conditions. """ import altair as alt @@ -57,20 +56,20 @@ def add_gaussian_noise(X, mean=0, std=0.1, random_state=None): return X_noisy -def combine_floats(df1, df2, scale=100): +def combine_floats(df1, df2, scale=1000): """Combine two floats from separate data frames into a single number.""" combined_df = df1 * scale + df2 return combined_df -def extract_floats(combined_df, scale=100): +def extract_floats(combined_df, scale=1000): """Extract the original floats from the combined data frame.""" df1 = np.floor(combined_df / scale) df2 = combined_df - (df1 * scale) return df1, df2 -# Randomly add noise to the training and test data. +# Randomly add Gaussian noise to the training and test data. X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_state) X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_state) @@ -78,10 +77,13 @@ def extract_floats(combined_df, scale=100): # data is stored in the leaf nodes. By doing this, we allow the model to # consider all samples as potential candidates for proximity calculations. qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state) + +# Fit the model to predict the non-noisy pixels from noisy pixels (i.e., to denoise). +# We fit a single multi-target model, with each pixel treated as a distinct target. qrf.fit(X_train_noisy, X_train) # Get the proximity counts. -proximities = qrf.proximity_counts(X_test_noisy) +proximities = qrf.proximity_counts(X_test_noisy) # output is a list of tuples for each sample df_prox = pd.DataFrame( {"prox": [[(i, *p) for i, p in enumerate(proximities[x])] for x in range(len(X_test))]} @@ -107,8 +109,8 @@ def extract_floats(combined_df, scale=100): # Create a data frame for looking up training proximities. df_lookup = ( combine_floats(X_train, X_train_noisy, scale=pixel_scale) # combine to reduce transmitted data - .assign(**{"index": np.arange(len(X_train))}) .join(y_train) + .assign(**{"index": np.arange(len(X_train))}) # align with proximities, which are zero-indexed ) @@ -123,12 +125,10 @@ def plot_digits_proximities( height=225, width=225, ): - dim_x = pixel_dim[0] - dim_y = pixel_dim[1] - num_x = len(str(dim_x)) - num_y = len(str(dim_y)) + dim_x, dim_y = pixel_dim[0], pixel_dim[1] + dgt_x, dgt_y = len(str(dim_x)), len(str(dim_y)) - pixel_cols = [f"pixel_{y:0{num_y}}_{x:0{num_x}}" for y in range(dim_y) for x in range(dim_x)] + pixel_cols = [f"pixel_{y:0{dgt_y}}_{x:0{dgt_x}}" for y in range(dim_y) for x in range(dim_x)] pixel_x = "split(datum.pixel, '_')[2]" pixel_y = "split(datum.pixel, '_')[1]" @@ -171,7 +171,7 @@ def plot_digits_proximities( ) chart2 = ( - base.transform_filter(f"datum.prox_idx < {n_prox}") + base.transform_filter(f"datum.prox_idx < {n_prox}") # filter to the display proximities .transform_lookup( lookup="prox_val", from_=alt.LookupData(df_lookup, key="index", fields=pixel_cols + ["target"]), diff --git a/examples/plot_quantile_conformalized.py b/examples/plot_quantile_conformalized.py index 5612569..25b8713 100755 --- a/examples/plot_quantile_conformalized.py +++ b/examples/plot_quantile_conformalized.py @@ -34,7 +34,7 @@ "cqr": "Conformalized Quantile Regression (CQR)", } -# Load the California Housing Prices dataset. +# Load the California Housing dataset. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) perm = random_state.permutation(min(len(X), n_samples)) X = X.iloc[perm] diff --git a/examples/plot_quantile_example.py b/examples/plot_quantile_example.py index 97576dc..554a7bc 100755 --- a/examples/plot_quantile_example.py +++ b/examples/plot_quantile_example.py @@ -23,6 +23,7 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0): + """Make a toy dataset.""" random_state = check_random_state(random_state) x = random_state.uniform(*bounds, size=n_samples) @@ -35,7 +36,7 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0): return np.atleast_2d(x).T, y -# Create noisy data for modeling and non-noisy function data for illustration. +# Create a noisy dataset for modeling and a non-noisy version for illustration. X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0) X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_state=0) @@ -44,17 +45,17 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0): qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_state) qrf.fit(X_train, y_train) -y_pred_func = qrf.predict(X_func, quantiles=quantiles) -y_pred_test = qrf.predict(X_test, quantiles=quantiles) +y_pred_test = qrf.predict(X_test, quantiles=quantiles) # predict on noisy samples +y_pred_func = qrf.predict(X_func, quantiles=quantiles) # predict on non-noisy samples df = pd.DataFrame( { - "x": np.concatenate([X_func.reshape(-1), X_test.reshape(-1)]), - "y": np.concatenate([y_func, y_test]), - "y_pred": np.concatenate([y_pred_func[:, 1], y_pred_test[:, 1]]), - "y_pred_low": np.concatenate([y_pred_func[:, 0], y_pred_test[:, 0]]), - "y_pred_upp": np.concatenate([y_pred_func[:, 2], y_pred_test[:, 2]]), - "test": [False] * len(y_func) + [True] * len(y_test), + "x": np.concatenate([X_test.reshape(-1), X_func.reshape(-1)]), + "y": np.concatenate([y_test, y_func]), + "y_pred": np.concatenate([y_pred_test[:, 1], y_pred_func[:, 1]]), + "y_pred_low": np.concatenate([y_pred_test[:, 0], y_pred_func[:, 0]]), + "y_pred_upp": np.concatenate([y_pred_test[:, 2], y_pred_func[:, 2]]), + "test": [True] * len(y_test) + [False] * len(y_func), } ) @@ -62,7 +63,7 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0): def plot_fit_and_intervals(df): area_pred = ( alt.Chart(df) - .transform_filter(~alt.datum["test"]) # filter to training data + .transform_filter(~alt.datum["test"]) # filter to non-test data .mark_area(color="#e0f2ff", opacity=0.8) .encode( x=alt.X("x:Q", scale=alt.Scale(nice=False), title="X"), @@ -95,7 +96,7 @@ def plot_fit_and_intervals(df): line_true = ( alt.Chart(df.assign(**{"y_true_label": "f(x) = x sin(x)"})) - .transform_filter(~alt.datum["test"]) # filter to training data + .transform_filter(~alt.datum["test"]) # filter to non-test data .mark_line(color="black", size=3) .encode( x=alt.X("x:Q", scale=alt.Scale(nice=False)), @@ -110,7 +111,7 @@ def plot_fit_and_intervals(df): line_pred = ( alt.Chart(df.assign(**{"y_pred_label": "Predicted Median"})) - .transform_filter(~alt.datum["test"]) # filter to training data + .transform_filter(~alt.datum["test"]) # filter to non-test data .mark_line(color="#006aff", size=5) .encode( x=alt.X("x:Q", scale=alt.Scale(nice=False)), diff --git a/examples/plot_quantile_extrapolation.py b/examples/plot_quantile_extrapolation.py index 3c59082..9aa0e1e 100755 --- a/examples/plot_quantile_extrapolation.py +++ b/examples/plot_quantile_extrapolation.py @@ -34,6 +34,7 @@ def make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0): + """Make a dataset from a specified function.""" random_state = check_random_state(random_state) x = np.linspace(bounds[0], bounds[1], n_samples) @@ -366,7 +367,7 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, * return np.mean(xtra[test_indices]) -# Create the full dataset. +# Create a dataset that requires extrapolation. X, y = make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0) # Fit and extrapolate based on train-test split (depending on X). @@ -377,7 +378,7 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, * train_indices[sort_X[extrap_min_idx] : sort_X[extrap_max_idx]] = True res = train_test_split(train_indices, rng=random_state, **qrf_params) -# Get coverages on extrapolated samples. +# Get coverages for extrapolated samples. args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], random_state) cov_qrf = get_coverage_qrf(res["qmat"], *args) cov_xtr = get_coverage_xtr(res["bounds_list"], *args) diff --git a/examples/plot_quantile_interpolation.py b/examples/plot_quantile_interpolation.py index c6e0c29..3b19c9c 100755 --- a/examples/plot_quantile_interpolation.py +++ b/examples/plot_quantile_interpolation.py @@ -21,7 +21,7 @@ random_state = np.random.RandomState(0) intervals = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist() -# Create toy dataset. +# Create a simple toy dataset. X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]]) y = np.array([-2, -1, 0, 1, 2]) @@ -59,11 +59,12 @@ "quantile_upp": [None] * len(y), } + # Make predictions at the median and intervals. quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)] # Populate data based on prediction results with different interpolations. for interpolation in interpolations: - # Get predictions at median and prediction intervals. + # Get predictions using the specified quantiles and interpolation method. y_pred = qrf.predict(X, quantiles=quantiles, interpolation=interpolation.lower()) data["method"].extend([interpolation] * len(y)) diff --git a/examples/plot_quantile_intervals.py b/examples/plot_quantile_intervals.py index 638b889..6ee60a2 100755 --- a/examples/plot_quantile_intervals.py +++ b/examples/plot_quantile_intervals.py @@ -19,7 +19,7 @@ random_state = np.random.RandomState(0) n_samples = 1000 -# Load the California Housing Prices dataset. +# Load the California Housing dataset. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) perm = random_state.permutation(min(len(X), n_samples)) X = X.iloc[perm] @@ -27,7 +27,7 @@ qrf = RandomForestQuantileRegressor(random_state=random_state) -kf = KFold(n_splits=5) +kf = KFold(n_splits=5, shuffle=True, random_state=random_state) kf.get_n_splits(X) # Using k-fold cross-validation, get predictions for all samples. diff --git a/examples/plot_quantile_multioutput.py b/examples/plot_quantile_multioutput.py index 3089c82..a4a4aff 100644 --- a/examples/plot_quantile_multioutput.py +++ b/examples/plot_quantile_multioutput.py @@ -40,6 +40,7 @@ def make_func_Xy(funcs, bounds, n_samples): + """Make a dataset from a specified function.""" x = np.linspace(*bounds, n_samples) y = np.empty((len(x), len(funcs))) for i, func in enumerate(funcs): @@ -47,7 +48,7 @@ def make_func_Xy(funcs, bounds, n_samples): return np.atleast_2d(x).T, y -# Create the dataset with multiple target variables. +# Create a dataset with multiple target variables. X, y = make_func_Xy(funcs, bounds, n_samples) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) @@ -56,7 +57,7 @@ def make_func_Xy(funcs, bounds, n_samples): qrf.fit(X_train, y_train) # fit on all of the targets simultaneously # Get multi-target predictions at specified quantiles. -y_pred = qrf.predict(X, quantiles=quantiles) +y_pred = qrf.predict(X, quantiles=quantiles) # shape = (n_samples, n_targets, n_quantiles) df = pd.DataFrame( { diff --git a/examples/plot_quantile_ranks.py b/examples/plot_quantile_ranks.py index d9bdcad..24e28a3 100644 --- a/examples/plot_quantile_ranks.py +++ b/examples/plot_quantile_ranks.py @@ -25,6 +25,7 @@ def make_toy_dataset(n_samples, bounds, random_state=0): + """Make a toy dataset.""" random_state = check_random_state(random_state) X_1d = np.linspace(*bounds, num=n_samples) X = X_1d.reshape(-1, 1) @@ -32,6 +33,7 @@ def make_toy_dataset(n_samples, bounds, random_state=0): return X, y +# Create a toy dataset. X, y = make_toy_dataset(n_samples, bounds, random_state=0) qrf = RandomForestQuantileRegressor( @@ -43,7 +45,7 @@ def make_toy_dataset(n_samples, bounds, random_state=0): y_pred = qrf.predict(X, quantiles=0.5) # Get the quantile rank for all samples. -y_rank = qrf.quantile_ranks(X, y) +y_rank = qrf.quantile_ranks(X, y) # output is a value in the range [0, 1] for each sample df = pd.DataFrame({"x": X.reshape(-1), "y": y, "y_pred": y_pred, "y_rank": y_rank}) diff --git a/examples/plot_quantile_vs_standard.py b/examples/plot_quantile_vs_standard.py index 178d863..3b1f2dd 100755 --- a/examples/plot_quantile_vs_standard.py +++ b/examples/plot_quantile_vs_standard.py @@ -17,6 +17,7 @@ import scipy as sp from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split +from sklearn.utils.validation import check_random_state from quantile_forest import RandomForestQuantileRegressor @@ -24,12 +25,19 @@ n_samples = 5000 quantiles = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist() -# Create right-skewed dataset. -a, loc, scale = 7, -1, 1 -skewnorm_rv = sp.stats.skewnorm(a, loc, scale) -skewnorm_rv.random_state = random_state -y = skewnorm_rv.rvs(n_samples) -X = random_state.randn(n_samples, 2) * y.reshape(-1, 1) + +def make_skewed_dataset(a=7, loc=-1, scale=1, random_state=0): + """Make a skewed dataset.""" + random_state = check_random_state(random_state) + skewnorm_rv = sp.stats.skewnorm(a, loc, scale) + skewnorm_rv.random_state = random_state + y = skewnorm_rv.rvs(n_samples) + X = random_state.randn(n_samples, 2) * y.reshape(-1, 1) + return X, y + + +# Create a right-skewed toy dataset. +X, y = make_skewed_dataset(a=7, loc=-1, scale=1, random_state=0) regr_rf = RandomForestRegressor(random_state=random_state) regr_qrf = RandomForestQuantileRegressor(random_state=random_state) diff --git a/examples/plot_treeshap_example.py b/examples/plot_treeshap_example.py index dce47d4..b34f442 100644 --- a/examples/plot_treeshap_example.py +++ b/examples/plot_treeshap_example.py @@ -88,7 +88,7 @@ def get_shap_value_by_index(shap_values, index): return shap_values_i -# Load the California Housing Prices dataset. +# Load the California Housing dataset. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) perm = random_state.permutation(min(len(X), n_samples)) X = X.iloc[perm] @@ -100,6 +100,7 @@ def get_shap_value_by_index(shap_values, index): qrf = RandomForestQuantileRegressor(random_state=random_state) qrf.fit(X_train, y_train) +# Get the SHAP values at each quantile for the specified test sample. shap_values_list = [ get_shap_value_by_index(get_shap_values(qrf, X_test, quantile=q), test_idx) for q in quantiles ]