Skip to content

Commit

Permalink
Add Docs and Example for User-Specified Function (#43)
Browse files Browse the repository at this point in the history
* Update docs for user-specified function
* Add example for user-specified function
  • Loading branch information
reidjohnson authored Mar 18, 2024
1 parent 676cb2e commit 2132641
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 0 deletions.
5 changes: 5 additions & 0 deletions docs/user_guide/fit_predict.rst
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,8 @@ The predictions of a standard random forest can also be recovered from a quantil
>>> y_pred_qrf = qrf.predict(X_test, **kwargs)
>>> np.allclose(y_pred_rf, y_pred_qrf)
True

Predicting with User-Specified Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

While a QRF is designed to estimate quantiles from the empirical distribution calculated for each sample, in many cases it may be useful to use the empirical distribution to calculate other quantities of interest. For more details, see :ref:`gallery_plot_predict_custom`.
122 changes: 122 additions & 0 deletions examples/plot_predict_custom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
Calculating User-Specified Functions with QRFs
==============================================
An example that demonstrates a way of extracting the empirical distribution
from a quantile regression forest (QRF) for one or more samples in order to
calculate a user-specified function of interest. While a QRF is designed to
estimate quantiles from the empirical distribution calculated for each sample,
in many cases it may be useful to use the empirical distribution to calculate
other quantities of interest. Here, we calculate the ECDF for a test sample.
"""

from itertools import chain

import altair as alt
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import datasets
from sklearn.model_selection import train_test_split

from quantile_forest import RandomForestQuantileRegressor

np.random.seed(0)


def predict(reg, X, quantiles=0.5, what=None):
"""Custom prediction method that allows user-specified function.
Parameters
----------
reg : BaseForestQuantileRegressor
Quantile forest model object.
X : array-like, of shape (n_samples, n_features)
New test data.
quantiles : float or list of floats, default=0.5
Scalar or vector of quantiles for quantile prediction.
what : Any, optional
User-specified function for prediction used instead of quantiles.
Must return a numeric vector.
Returns
-------
Output array with the user-specified function applied (if any).
"""
if what is None:
return reg.predict(X, quantiles=quantiles)

# Get the complete set of proximities for each sample.
proximities = reg.proximity_counts(X)

# Retrieve the unsorted training responses from the model (stored in sorted order).
reverse_sorter = np.argsort(reg.sorter_, axis=0)
y_train = np.empty_like(reg.forest_.y_train).T
for i in range(y_train.shape[1]):
y_train[:, i] = np.asarray(reg.forest_.y_train)[i, reverse_sorter[:, i]]

# For each sample, construct an array of the training responses used for prediction.
s = [np.concatenate([np.repeat(y_train[i[0]], i[1]) for i in prox]) for prox in proximities]

# Apply the user-specified function for each sample. Must return a numeric vector.
y_out = np.array([what(s_i) for s_i in s])

return y_out


X, y = datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=0)

reg = RandomForestQuantileRegressor().fit(X_train, y_train)

# Define a user-specified function; here we randomly sample 1000 values with replacement.
func = lambda x: np.random.choice(x, size=1000)

# Output array with the user-specified function applied to each sample's empirical distribution.
y_out = predict(reg, X_test, what=func)

# Calculate the ECDF from output array.
y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out]

df = pd.DataFrame(
{
"y_value": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])),
"y_value2": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))[1:] + [np.nan],
"probability": list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf])),
}
)


def plot_ecdf(df):
tooltip = [
alt.Tooltip("y_value", title="Response Value"),
alt.Tooltip("probability", title="Probability"),
]

circles = (
alt.Chart(df)
.mark_circle(color="#006aff")
.encode(
x=alt.X("y_value", title="Response Value"),
y=alt.Y("probability", title="Probability"),
tooltip=tooltip,
)
)

lines = (
alt.Chart(df)
.mark_line(color="#006aff")
.encode(
x=alt.X("y_value", title="Response Value"),
x2=alt.X2("y_value2"),
y=alt.Y("probability", title="Probability"),
tooltip=tooltip,
)
)

chart = (circles + lines).properties(height=400, width=650)
return chart


chart = plot_ecdf(df)
chart

0 comments on commit 2132641

Please sign in to comment.