Skip to content

Commit

Permalink
Update example plots
Browse files (browse the repository at this point in the history)
  • Loading branch information
reidjohnson committed Aug 16, 2024
1 parent ceceedb commit baf13bb
Show file tree
Hide file tree
Showing 12 changed files with 98 additions and 107 deletions.
10 changes: 5 additions & 5 deletions examples/plot_huggingface_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,19 @@
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.utils.validation import check_random_state
from skops import hub_utils

import quantile_forest
from quantile_forest import RandomForestQuantileRegressor

alt.data_transformers.disable_max_rows()

random_seed = 0

token = "<Hugging Face Access Token>"
repo_id = "quantile-forest/california-housing-example"
load_existing = True

random_state = check_random_state(0)
quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist()
sample_frac = 1

Expand Down Expand Up @@ -151,7 +151,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=


if not load_existing:
fit_and_upload_model(token, repo_id, random_state=random_seed)
fit_and_upload_model(token, repo_id, random_state=random_state)

# Download the repository locally and load the fitted model.
model_filename = "model.pkl"
Expand All @@ -168,7 +168,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
df = (
pd.DataFrame(y_pred, columns=quantiles)
.reset_index()
.sample(frac=sample_frac, random_state=random_seed)
.sample(frac=sample_frac, random_state=random_state)
.melt(id_vars=["index"], var_name="quantile", value_name="value")
.merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
)
Expand Down Expand Up @@ -213,9 +213,9 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"):
],
)
.properties(
title="Quantile Predictions on the California Housing Dataset",
height=650,
width=650,
title="Quantile Predictions on the California Housing Dataset",
)
)
return chart
Expand Down
15 changes: 7 additions & 8 deletions examples/plot_predict_custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@

from quantile_forest import RandomForestQuantileRegressor

random_seed = 0
rng = check_random_state(random_seed)

random_state = check_random_state(0)
n_test_samples = 100


Expand Down Expand Up @@ -70,13 +68,14 @@ def predict(reg, X, quantiles=0.5, what=None):

X, y = datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=n_test_samples, random_state=random_seed
X, y, test_size=n_test_samples, random_state=random_state
)

reg = RandomForestQuantileRegressor(random_state=random_seed).fit(X_train, y_train)
reg = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)

# Define a user-specified function; here we randomly sample 1000 values with replacement.
func = lambda x: rng.choice(x, size=1000)
# Define a user-specified function.
# Here we randomly sample 1,000 values with replacement from the empirical distribution.
func = lambda x: random_state.choice(x, size=1000)

# Output array with the user-specified function applied to each sample's empirical distribution.
y_out = predict(reg, X_test, what=func)
Expand Down Expand Up @@ -140,9 +139,9 @@ def plot_ecdf(df):
.add_params(index_selection)
.transform_filter(index_selection)
.properties(
title="Empirical Cumulative Distribution Function (ECDF) Plot",
height=400,
width=650,
title="Empirical Cumulative Distribution Function (ECDF) Plot",
)
)
return chart
Expand Down
26 changes: 12 additions & 14 deletions examples/plot_proximity_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,28 +24,26 @@

from quantile_forest import RandomForestQuantileRegressor

random_seed = 0
rng = check_random_state(random_seed)

random_state = check_random_state(0)
n_test_samples = 25
noise_std = 0.1

# Load the Digits dataset.
X, y = datasets.load_digits(return_X_y=True, as_frame=True)

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=n_test_samples, random_state=random_seed
X, y, test_size=n_test_samples, random_state=random_state
)


def add_gaussian_noise(X, mean=0, std=0.1, random_state=None):
"""Add Gaussian noise to input data."""
rng = check_random_state(random_state)
random_state = check_random_state(random_state)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

noise = rng.normal(mean, std, X_scaled.shape)
noise = random_state.normal(mean, std, X_scaled.shape)
X_noisy = np.clip(X_scaled + noise, 0, 1)

X_noisy = scaler.inverse_transform(X_noisy)
Expand All @@ -70,20 +68,20 @@ def extract_floats(combined_df, scale=100):


# Randomly add noise to the training and test data.
X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)
X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)
X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)
X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)

# We set `max_samples_leaf=None` to ensure that every sample in the training
# data is stored in the leaf nodes. By doing this, we allow the model to
# consider all samples as potential candidates for proximity calculations.
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_seed)
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)
qrf.fit(X_train_noisy, X_train)

# Get the proximity counts.
proximities = qrf.proximity_counts(X_test_noisy)

df_prox = pd.DataFrame(
{"prox": [[(j, *p) for j, p in enumerate(proximities[i])] for i in range(len(X_test))]}
{"prox": [[(i, *p) for i, p in enumerate(proximities[x])] for x in range(len(X_test))]}
)

df = (
Expand Down Expand Up @@ -142,7 +140,7 @@ def plot_digits_proximities(
base = alt.Chart(df).add_params(index_selection).transform_filter(index_selection)

chart1 = (
base.transform_filter("datum.prox_idx == 0")
base.transform_filter("datum.prox_idx == 0") # filter to one test sample row
.transform_fold(fold=pixel_cols, as_=["pixel", "value"])
.transform_calculate(value_clean=f"floor(datum.value / {pixel_scale})")
.transform_calculate(value_noisy=f"datum.value - (datum.value_clean * {pixel_scale})")
Expand Down Expand Up @@ -196,7 +194,7 @@ def plot_digits_proximities(
)

chart3 = (
base.transform_filter("datum.prox_idx == 0")
base.transform_filter("datum.prox_idx == 0") # filter to one test sample row
.transform_fold(fold=pixel_cols, as_=["pixel", "value"])
.transform_calculate(value_clean=f"floor(datum.value / {pixel_scale})")
.transform_calculate(x=pixel_x, y=pixel_y)
Expand All @@ -208,12 +206,12 @@ def plot_digits_proximities(
opacity=alt.condition(alt.datum["value_clean"] == 0, *opacity),
tooltip=[
alt.Tooltip("target:Q", title="Digit"),
alt.Tooltip("value_clean:Q", format=",.3f", title="Pixel Value"),
alt.Tooltip("value_clean:Q", format=",.0f", title="Pixel Value"),
alt.Tooltip("x:Q", title="Pixel X"),
alt.Tooltip("y:Q", title="Pixel Y"),
],
)
.properties(height=height, width=width, title="Test Digit (original)")
.properties(title="Test Digit (original)", height=height, width=width)
)

chart_spacer = alt.Chart(pd.DataFrame()).mark_rect().properties(width=subplot_dim * 2)
Expand Down
14 changes: 6 additions & 8 deletions examples/plot_quantile_conformalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@

from quantile_forest import RandomForestQuantileRegressor

alt.data_transformers.disable_max_rows()

random_seed = 0
rng = check_random_state(random_seed)
random_state = check_random_state(random_seed)

n_samples = 1000
n_samples = 900
coverages = np.linspace(0, 1, num=11, endpoint=True).round(1).tolist() # the "coverage level"

strategies = {
Expand All @@ -39,11 +37,11 @@

# Load the California Housing Prices dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
perm = rng.permutation(min(len(X), n_samples))
perm = random_state.permutation(min(len(X), n_samples))
X = X.iloc[perm]
y = y.iloc[perm]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)


def sort_y_values(y_test, y_pred, y_pis):
Expand Down Expand Up @@ -73,7 +71,7 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
"""QRF (baseline) strategy."""
quantiles = [alpha / 2, 1 - alpha / 2]

qrf = RandomForestQuantileRegressor(random_state=random_state)
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)
qrf.fit(X_train, y_train)

# Calculate the lower and upper quantile values on the test data.
Expand All @@ -99,7 +97,7 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
X_train, y_train, test_size=0.5, random_state=random_state
)

qrf = RandomForestQuantileRegressor(random_state=random_state)
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)
qrf.fit(X_train, y_train)

# Calculate the lower and upper quantile values on the test data.
Expand Down
23 changes: 12 additions & 11 deletions examples/plot_quantile_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,36 @@
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_random_state

from quantile_forest import RandomForestQuantileRegressor

random_seed = 0
random_state = check_random_state(0)
n_samples = 1000
bounds = [0, 10]
quantiles = [0.025, 0.5, 0.975]


def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0):
rng = np.random.RandomState(random_seed)
def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
random_state = check_random_state(random_state)

x = rng.uniform(*bounds, size=n_samples)
x = random_state.uniform(*bounds, size=n_samples)
f = x * np.sin(x)

sigma = 0.25 + x / 10
noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2) if add_noise else np.zeros_like(f)
y = f + noise
noise = random_state.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
y = f + (noise if add_noise else np.zeros_like(f))

return np.atleast_2d(x).T, y


# Create noisy data for modeling and non-noisy function data for illustration.
X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=random_seed)
X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=random_seed)
X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0)
X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_seed)
qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_state)
qrf.fit(X_train, y_train)

y_pred_func = qrf.predict(X_func, quantiles=quantiles)
Expand Down Expand Up @@ -135,7 +136,7 @@ def plot_fit_and_intervals(df):
chart = (
(area_pred + points + line_true + line_pred + blank)
.resolve_scale(color="independent")
.properties(height=400, width=650, title="QRF Predictions vs. Ground Truth on Toy Dataset")
.properties(title="QRF Predictions vs. Ground Truth on Toy Dataset", height=400, width=650)
)

return chart
Expand Down
23 changes: 12 additions & 11 deletions examples/plot_quantile_extrapolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,27 @@

from quantile_forest import RandomForestQuantileRegressor

random_seed = 0
rng = check_random_state(random_seed)

random_state = check_random_state(0)
n_samples = 500
bounds = [0, 15]
extrap_frac = 0.25
func = lambda x: x * np.sin(x)
func_str = "f(x) = x sin(x)"

quantiles = [0.025, 0.975, 0.5]
qrf_params = {"max_samples_leaf": None, "min_samples_leaf": 4, "random_state": random_seed}
qrf_params = {"max_samples_leaf": None, "min_samples_leaf": 4, "random_state": random_state}


def make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0):
random_state = check_random_state(random_state)

def make_func_Xy(func, bounds, n_samples, random_seed=0):
rng = np.random.RandomState(random_seed)
x = np.linspace(bounds[0], bounds[1], n_samples)
f = func(x)

std = 0.01 + np.abs(x - 5.0) / 5.0
noise = rng.normal(scale=std)
noise = random_state.normal(scale=std) if add_noise else np.zeros_like(f)
y = f + noise

return np.atleast_2d(x).T, y


Expand Down Expand Up @@ -367,18 +368,18 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, *


# Create the full dataset.
X, y = make_func_Xy(func, bounds, n_samples, random_seed=random_seed)
X, y = make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0)

# Fit and extrapolate based on train-test split (depending on X).
extrap_min_idx = int(n_samples * (extrap_frac / 2))
extrap_max_idx = int(n_samples - (n_samples * (extrap_frac / 2)))
sort_X = np.argsort(X.squeeze())
train_indices = np.repeat(False, len(y))
train_indices[sort_X[extrap_min_idx] : sort_X[extrap_max_idx]] = True
res = train_test_split(train_indices, rng=rng, **qrf_params)
res = train_test_split(train_indices, rng=random_state, **qrf_params)

# Get coverages on extrapolated samples.
args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], rng)
args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], random_state)
cov_qrf = get_coverage_qrf(res["qmat"], *args)
cov_xtr = get_coverage_xtr(res["bounds_list"], *args)

Expand Down Expand Up @@ -513,7 +514,7 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None
chart += blank
chart = chart.resolve_scale(color="independent")

chart = chart.properties(height=200, width=300, title=title)
chart = chart.properties(title=title, height=200, width=300)
return chart

kwargs = {"x_domain": [0, 15], "y_domain": [-15, 20]}
Expand Down
5 changes: 3 additions & 2 deletions examples/plot_quantile_interpolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
import altair as alt
import numpy as np
import pandas as pd
from sklearn.utils.validation import check_random_state

from quantile_forest import RandomForestQuantileRegressor

random_seed = 0
random_state = check_random_state(0)
intervals = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist()

# Create toy dataset.
Expand All @@ -31,7 +32,7 @@
n_estimators=1,
max_samples_leaf=None,
bootstrap=False,
random_state=random_seed,
random_state=random_state,
)
qrf.fit(X, y)

Expand Down
8 changes: 3 additions & 5 deletions examples/plot_quantile_intervals.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,16 @@

from quantile_forest import RandomForestQuantileRegressor

random_seed = 0
rng = check_random_state(random_seed)

random_state = check_random_state(0)
n_samples = 1000

# Load the California Housing Prices dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
perm = rng.permutation(min(len(X), n_samples))
perm = random_state.permutation(min(len(X), n_samples))
X = X.iloc[perm]
y = y.iloc[perm]

qrf = RandomForestQuantileRegressor(random_state=random_seed)
qrf = RandomForestQuantileRegressor(random_state=random_state)

kf = KFold(n_splits=5)
kf.get_n_splits(X)
Expand Down
Loading

0 comments on commit baf13bb

Please sign in to comment.