Skip to content

Commit

Permalink
Update example plots
Browse files: browse the repository at this point in the history
  • Loading branch information
reidjohnson committed Aug 7, 2024
1 parent b95444d commit 6f81ee7
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 106 deletions.
18 changes: 10 additions & 8 deletions examples/plot_huggingface_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,17 @@

alt.data_transformers.disable_max_rows()

random_seed = 0

token = "<Hugging Face Access Token>"
repo_id = "quantile-forest/california-housing-example"
load_existing = True

quantiles = np.arange(0, 1.25, 0.25).round(2).tolist()
quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist()
sample_frac = 1


def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None):
"""Function used to fit the model and upload it to Hugging Face Hub."""
from pathlib import Path

Expand All @@ -49,10 +51,10 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
from skops import card

X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

# Fit the model.
qrf = RandomForestQuantileRegressor(random_state=0).fit(X_train, y_train)
qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)

# Save the model to a file.
model_filename = "model.pkl"
Expand Down Expand Up @@ -145,7 +147,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):


if not load_existing:
fit_and_upload_model(token, repo_id)
fit_and_upload_model(token, repo_id, random_state=random_seed)

# Download the repository locally.
local_dir = "./local_repo"
Expand All @@ -166,7 +168,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
df = (
pd.DataFrame(y_pred, columns=quantiles)
.reset_index()
.sample(frac=sample_frac, random_state=0)
.sample(frac=sample_frac, random_state=random_seed)
.melt(id_vars=["index"], var_name="quantile", value_name="value")
.merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
)
Expand All @@ -178,7 +180,7 @@ def plot_quantiles_by_latlon(df, quantiles):
min=0,
max=1,
step=0.5 if len(quantiles) == 1 else 1 / (len(quantiles) - 1),
name="Quantile: ",
name="Predicted Quantile: ",
)

q_val = alt.selection_point(
Expand Down Expand Up @@ -217,7 +219,7 @@ def plot_quantiles_by_latlon(df, quantiles):
.properties(
height=650,
width=650,
title="Quantile Estimates on the California Housing Dataset",
title="Quantile Predictions on the California Housing Dataset",
)
)
return chart
Expand Down
12 changes: 8 additions & 4 deletions examples/plot_predict_custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
import scipy as sp
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_random_state

from quantile_forest import RandomForestQuantileRegressor

np.random.seed(0)
random_seed = 0
rng = check_random_state(random_seed)

n_test_samples = 100

Expand Down Expand Up @@ -68,12 +70,14 @@ def predict(reg, X, quantiles=0.5, what=None):


X, y = datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test_samples, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=n_test_samples, random_state=random_seed
)

reg = RandomForestQuantileRegressor().fit(X_train, y_train)
reg = RandomForestQuantileRegressor(random_state=random_seed).fit(X_train, y_train)

# Define a user-specified function; here we randomly sample 1000 values with replacement.
func = lambda x: np.random.choice(x, size=1000)
func = lambda x: rng.choice(x, size=1000)

# Output array with the user-specified function applied to each sample's empirical distribution.
y_out = predict(reg, X_test, what=func)
Expand Down
16 changes: 11 additions & 5 deletions examples/plot_proximity_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,26 @@

from quantile_forest import RandomForestQuantileRegressor

rng = check_random_state(0)
random_seed = 0
rng = check_random_state(random_seed)

n_test_samples = 25
noise_std = 0.1

# Load the Digits dataset.
X, y = datasets.load_digits(return_X_y=True, as_frame=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test_samples, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=n_test_samples, random_state=random_seed
)


def add_gaussian_noise(X, mean=0, std=0.1, random_state=None):
"""Add Gaussian noise to input data."""
if random_state is None:
rng = check_random_state(0)
elif isinstance(random_state, int):
rng = check_random_state(random_state)
else:
rng = random_state

Expand Down Expand Up @@ -73,13 +78,13 @@ def extract_floats(combined_df, scale=100):


# Randomly add noise to the training and test data.
X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=rng)
X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=rng)
X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)
X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)

# We set `max_samples_leaf=None` to ensure that every sample in the training
# data is stored in the leaf nodes. By doing this, we allow the model to
# consider all samples as potential candidates for proximity calculations.
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=0)
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_seed)
qrf.fit(X_train_noisy, X_train)

# Get the proximity counts.
Expand Down Expand Up @@ -177,6 +182,7 @@ def plot_digits_proximities(
color=alt.Color("value_clean:Q", legend=None, scale=alt.Scale(scheme="greys")),
opacity=alt.condition(alt.datum["value_clean"] == 0, alt.value(0), alt.value(0.67)),
tooltip=[
alt.Tooltip("prox_idx", title="Proximity Index"),
alt.Tooltip("prox_cnt", title="Proximity Count"),
alt.Tooltip("target:Q", title="Digit"),
],
Expand Down
36 changes: 18 additions & 18 deletions examples/plot_quantile_conformalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,31 @@

alt.data_transformers.disable_max_rows()

random_seed = 0
rng = check_random_state(random_seed)

n_samples = 1000
coverages = np.linspace(0, 1, num=11, endpoint=True).round(1).tolist() # the "coverage level"

strategies = {
"qrf": "Quantile Regression Forest (QRF)",
"cqr": "Conformalized Quantile Regression (CQR)",
}

random_state = 0
rng = check_random_state(random_state)

coverages = np.arange(0, 1.1, 0.1).round(1).tolist() # the "coverage level"

# Load the California Housing Prices dataset.
california = datasets.fetch_california_housing()
n_samples = min(california.target.size, 1000)
perm = rng.permutation(n_samples)
X = california.data[perm]
y = california.target[perm]
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
perm = rng.permutation(min(len(X), n_samples))
X = X.iloc[perm]
y = y.iloc[perm]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)


def sort_y_values(y_test, y_pred, y_pis):
"""Sort the target values and predictions."""
indices = np.argsort(y_test)
return {
"y_test": y_test[indices],
"y_test": np.asarray(y_test)[indices],
"y_pred": y_pred[indices],
"y_pred_low": y_pis[:, 0][indices],
"y_pred_upp": y_pis[:, 1][indices],
Expand All @@ -68,10 +68,10 @@ def mean_width_score(y_pred_low, y_pred_upp):
return float(mean_width)


def qrf_strategy(alpha, X_train, X_test, y_train, y_test):
def qrf_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
quantiles = [alpha / 2, 1 - alpha / 2]

qrf = RandomForestQuantileRegressor(random_state=0)
qrf = RandomForestQuantileRegressor(random_state=random_state)
qrf.fit(X_train, y_train)

# Calculate the lower and upper quantile values on the test data.
Expand All @@ -88,15 +88,15 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test):
return pd.DataFrame(y_values).pipe(lambda x: x * 100_000).assign(strategy="qrf")


def cqr_strategy(alpha, X_train, X_test, y_train, y_test):
def cqr_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
quantiles = [alpha / 2, 1 - alpha / 2]

# Create calibration set.
X_train, X_calib, y_train, y_calib = train_test_split(
X_train, y_train, test_size=0.5, random_state=0
X_train, y_train, test_size=0.5, random_state=random_state
)

qrf = RandomForestQuantileRegressor(random_state=0)
qrf = RandomForestQuantileRegressor(random_state=random_state)
qrf.fit(X_train, y_train)

# Calculate the lower and upper quantile values on the test data.
Expand Down Expand Up @@ -134,7 +134,7 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test):
dfs = []
for cov_frac in coverages:
alpha = float(round(1 - cov_frac, 2))
args = (alpha, X_train, X_test, y_train, y_test)
args = (alpha, X_train, X_test, y_train, y_test, random_seed)
dfs.append(pd.concat([qrf_strategy(*args), cqr_strategy(*args)]).assign(alpha=alpha))
df = pd.concat(dfs)

Expand Down
11 changes: 6 additions & 5 deletions examples/plot_quantile_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from quantile_forest import RandomForestQuantileRegressor

random_seed = 0
n_samples = 1000
bounds = [0, 10]
quantiles = [0.025, 0.5, 0.975]
Expand All @@ -33,12 +34,12 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0):


# Create noisy data for modeling and non-noisy function data for illustration.
X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0)
X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=0)
X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=random_seed)
X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=random_seed)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)

qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=0)
qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_seed)
qrf.fit(X_train, y_train)

y_pred_func = qrf.predict(X_func, quantiles=quantiles)
Expand Down Expand Up @@ -133,7 +134,7 @@ def plot_fit_and_intervals(df):
chart = (
(area_pred + points + line_true + line_pred + blank)
.resolve_scale(color="independent")
.properties(height=400, width=650)
.properties(height=400, width=650, title="QRF Predictions vs. Ground Truth on Toy Dataset")
)

return chart
Expand Down
Loading

0 comments on commit 6f81ee7

Please sign in to comment.