Skip to content

Commit

Permalink
deploy: 0f225d9
Browse files Browse the repository at this point in the history
  • Loading branch information
reidjohnson committed Mar 18, 2024
1 parent e6a057b commit bd49ec0
Show file tree
Hide file tree
Showing 23 changed files with 2,082 additions and 1,197 deletions.
22 changes: 11 additions & 11 deletions _sources/gallery/index.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@ General-purpose, introductory and illustrative examples.

<span class="gallery">

<a class="imagegroup" href="plot_predict_custom.html">
<span
class="image" alt="Calculating User-Specified Functions with QRFs"

style="background-image: url(../_static/plot_predict_custom-thumb.png);"

></span>

<span class="image-title">Calculating User-Specified Functions with QRFs</span>
</a>

<a class="imagegroup" href="plot_quantile_interpolation.html">
<span
class="image" alt="Comparing Quantile Interpolation Methods"
Expand Down Expand Up @@ -89,17 +100,6 @@ General-purpose, introductory and illustrative examples.
<span class="image-title">Quantile Regression Forests vs. Random Forests</span>
</a>

<a class="imagegroup" href="plot_quantile_weighting.html">
<span
class="image" alt="Weighted vs. Unweighted Quantile Runtimes"

style="background-image: url(../_static/plot_quantile_weighting-thumb.png);"

></span>

<span class="image-title">Weighted vs. Unweighted Quantile Runtimes</span>
</a>

</span>

<div style='clear:both;'></div>
Expand Down
250 changes: 250 additions & 0 deletions _sources/gallery/plot_predict_custom.rst.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@

:orphan:
:html_theme.sidebar_secondary.remove:

.. This document is auto-generated by the example-gallery extension. Do not modify directly.
.. _gallery_plot_predict_custom:


Calculating User-Specified Functions with QRFs
==============================================

An example that demonstrates a way of extracting the empirical distribution
from a quantile regression forest (QRF) for one or more samples in order to
calculate a user-specified function of interest. While a QRF is designed to
estimate quantiles from the empirical distribution calculated for each sample,
in many cases it may be useful to use the empirical distribution to calculate
other quantities of interest. Here, we calculate the ECDF for a test sample.


.. altair-plot::
:remove-code:



from itertools import chain

import altair as alt
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import datasets
from sklearn.model_selection import train_test_split

from quantile_forest import RandomForestQuantileRegressor

np.random.seed(0)


def predict(reg, X, quantiles=0.5, what=None):
"""Custom prediction method that allows user-specified function.

Parameters
----------
reg : BaseForestQuantileRegressor
Quantile forest model object.
X : array-like, of shape (n_samples, n_features)
New test data.
quantiles : float or list of floats, default=0.5
Scalar or vector of quantiles for quantile prediction.
what : Any, optional
User-specified function for prediction used instead of quantiles.
Must return a numeric vector.

Returns
-------
Output array with the user-specified function applied (if any).
"""
if what is None:
return reg.predict(X, quantiles=quantiles)

# Get the complete set of proximities for each sample.
proximities = reg.proximity_counts(X)

# Retrieve the unsorted training responses from the model (stored in sorted order).
reverse_sorter = np.argsort(reg.sorter_, axis=0)
y_train = np.empty_like(reg.forest_.y_train).T
for i in range(y_train.shape[1]):
y_train[:, i] = np.asarray(reg.forest_.y_train)[i, reverse_sorter[:, i]]

# For each sample, construct an array of the training responses used for prediction.
s = [np.concatenate([np.repeat(y_train[i[0]], i[1]) for i in prox]) for prox in proximities]

# Apply the user-specified function for each sample. Must return a numeric vector.
y_out = np.array([what(s_i) for s_i in s])

return y_out


X, y = datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=0)

reg = RandomForestQuantileRegressor().fit(X_train, y_train)

# Define a user-specified function; here we randomly sample 1000 values with replacement.
func = lambda x: np.random.choice(x, size=1000)

# Output array with the user-specified function applied to each sample's empirical distribution.
y_out = predict(reg, X_test, what=func)

# Calculate the ECDF from output array.
y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out]

df = pd.DataFrame(
{
"y_value": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])),
"y_value2": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))[1:] + [np.nan],
"probability": list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf])),
}
)


def plot_ecdf(df):
tooltip = [
alt.Tooltip("y_value", title="Response Value"),
alt.Tooltip("probability", title="Probability"),
]

circles = (
alt.Chart(df)
.mark_circle(color="#006aff")
.encode(
x=alt.X("y_value", title="Response Value"),
y=alt.Y("probability", title="Probability"),
tooltip=tooltip,
)
)

lines = (
alt.Chart(df)
.mark_line(color="#006aff")
.encode(
x=alt.X("y_value", title="Response Value"),
x2=alt.X2("y_value2"),
y=alt.Y("probability", title="Probability"),
tooltip=tooltip,
)
)

chart = (circles + lines).properties(height=400, width=650)
return chart


chart = plot_ecdf(df)
chart


.. code:: python
from itertools import chain
import altair as alt
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import datasets
from sklearn.model_selection import train_test_split
from quantile_forest import RandomForestQuantileRegressor
np.random.seed(0)
def predict(reg, X, quantiles=0.5, what=None):
"""Custom prediction method that allows user-specified function.
Parameters
----------
reg : BaseForestQuantileRegressor
Quantile forest model object.
X : array-like, of shape (n_samples, n_features)
New test data.
quantiles : float or list of floats, default=0.5
Scalar or vector of quantiles for quantile prediction.
what : Any, optional
User-specified function for prediction used instead of quantiles.
Must return a numeric vector.
Returns
-------
Output array with the user-specified function applied (if any).
"""
if what is None:
return reg.predict(X, quantiles=quantiles)
# Get the complete set of proximities for each sample.
proximities = reg.proximity_counts(X)
# Retrieve the unsorted training responses from the model (stored in sorted order).
reverse_sorter = np.argsort(reg.sorter_, axis=0)
y_train = np.empty_like(reg.forest_.y_train).T
for i in range(y_train.shape[1]):
y_train[:, i] = np.asarray(reg.forest_.y_train)[i, reverse_sorter[:, i]]
# For each sample, construct an array of the training responses used for prediction.
s = [np.concatenate([np.repeat(y_train[i[0]], i[1]) for i in prox]) for prox in proximities]
# Apply the user-specified function for each sample. Must return a numeric vector.
y_out = np.array([what(s_i) for s_i in s])
return y_out
X, y = datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=0)
reg = RandomForestQuantileRegressor().fit(X_train, y_train)
# Define a user-specified function; here we randomly sample 1000 values with replacement.
func = lambda x: np.random.choice(x, size=1000)
# Output array with the user-specified function applied to each sample's empirical distribution.
y_out = predict(reg, X_test, what=func)
# Calculate the ECDF from output array.
y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out]
df = pd.DataFrame(
{
"y_value": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])),
"y_value2": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))[1:] + [np.nan],
"probability": list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf])),
}
)
def plot_ecdf(df):
tooltip = [
alt.Tooltip("y_value", title="Response Value"),
alt.Tooltip("probability", title="Probability"),
]
circles = (
alt.Chart(df)
.mark_circle(color="#006aff")
.encode(
x=alt.X("y_value", title="Response Value"),
y=alt.Y("probability", title="Probability"),
tooltip=tooltip,
)
)
lines = (
alt.Chart(df)
.mark_line(color="#006aff")
.encode(
x=alt.X("y_value", title="Response Value"),
x2=alt.X2("y_value2"),
y=alt.Y("probability", title="Probability"),
tooltip=tooltip,
)
)
chart = (circles + lines).properties(height=400, width=650)
return chart
chart = plot_ecdf(df)
chart
Loading

0 comments on commit bd49ec0

Please sign in to comment.