Update example plots
reidjohnson committed Aug 19, 2024
1 parent ceb4536 commit ee9f371
Showing 12 changed files with 78 additions and 58 deletions.
4 changes: 3 additions & 1 deletion examples/plot_huggingface_model.py
@@ -50,7 +50,9 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
from sklearn.model_selection import train_test_split
from skops import card

+ # Load the California Housing dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

# Fit the model.
@@ -160,7 +162,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
with open(f"{local_dir}/{model_filename}", "rb") as file:
qrf = pickle.load(file)

- # Estimate quantiles.
+ # Fetch the California Housing dataset and estimate quantiles.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
y_pred = qrf.predict(X, quantiles=quantiles) * 100_000 # predict in dollars

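For reference, the download-and-predict round trip can be sketched as follows. This is a minimal sketch, assuming the model was uploaded to the Hub as a pickle file and that `huggingface_hub` is installed; the repository ID and filename are hypothetical placeholders, not the example's actual values.

    import pickle

    from huggingface_hub import hf_hub_download
    from sklearn import datasets

    # Download the pickled model from the Hub (hypothetical repo ID and filename).
    model_path = hf_hub_download(repo_id="user/california-housing-qrf", filename="model.pkl")
    with open(model_path, "rb") as file:
        qrf = pickle.load(file)

    # Fetch the California Housing dataset and predict quantiles in dollars.
    X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
    y_pred = qrf.predict(X, quantiles=[0.25, 0.5, 0.75]) * 100_000
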
23 changes: 13 additions & 10 deletions examples/plot_predict_custom.py
@@ -25,12 +25,12 @@
n_test_samples = 100


- def predict(reg, X, quantiles=0.5, what=None):
+ def predict(qrf, X, quantiles=0.5, what=None):
"""Custom prediction method that allows user-specified function.
Parameters
----------
- reg : BaseForestQuantileRegressor
+ qrf : BaseForestQuantileRegressor
Quantile forest model object.
X : array-like, of shape (n_samples, n_features)
New test data.
@@ -45,16 +45,16 @@ def predict(reg, X, quantiles=0.5, what=None):
Output array with the user-specified function applied (if any).
"""
if what is None:
- return reg.predict(X, quantiles=quantiles)
+ return qrf.predict(X, quantiles=quantiles)

- # Get the complete set of proximities (training indices) for each sample.
- proximities = reg.proximity_counts(X)
+ # Get the complete set of proximities for each sample.
+ proximities = qrf.proximity_counts(X)

# Retrieve the unsorted training responses from the model (stored in sorted order).
- reverse_sorter = np.argsort(reg.sorter_, axis=0)
- y_train = np.empty_like(reg.forest_.y_train).T
+ reverse_sorter = np.argsort(qrf.sorter_, axis=0)
+ y_train = np.empty_like(qrf.forest_.y_train).T
for i in range(y_train.shape[1]):
- y_train[:, i] = np.asarray(reg.forest_.y_train)[i, reverse_sorter[:, i]]
+ y_train[:, i] = np.asarray(qrf.forest_.y_train)[i, reverse_sorter[:, i]]

# For each sample, construct an array of the training responses used for prediction.
s = [np.concatenate([np.repeat(y_train[i[0]], i[1]) for i in prox]) for prox in proximities]
@@ -65,25 +65,28 @@ def predict(reg, X, quantiles=0.5, what=None):
return y_out
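
The reconstruction step above can be illustrated with hypothetical proximity output; each tuple pairs a training index with the number of times that training sample shares a leaf with the query sample:

    import numpy as np

    y_train_toy = np.array([10.0, 20.0, 30.0])  # toy training responses
    prox_toy = [(2, 3), (0, 1)]  # hypothetical: index 2 co-occurs 3 times, index 0 once

    # Repeat each training response by its proximity count to form the
    # empirical distribution used for this sample's prediction.
    dist = np.concatenate([np.repeat(y_train_toy[idx], cnt) for idx, cnt in prox_toy])
    print(dist)  # [30. 30. 30. 10.]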


+ # Load the Diabetes dataset.
X, y = datasets.load_diabetes(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=n_test_samples, random_state=random_state
)

- reg = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)
+ qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)

# Define a user-specified function.
# Here we randomly sample 1,000 values with replacement from the empirical distribution.
func = lambda x: random_state.choice(x, size=1000)

# Output array with the user-specified function applied to each sample's empirical distribution.
- y_out = predict(reg, X_test, what=func)
+ y_out = predict(qrf, X_test, what=func)

dfs = []
for idx in range(n_test_samples):
# Calculate the ECDF from output array.
y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out[idx].reshape(1, -1)]

+ # Get quantiles (x-axis) and probabilities (y-axis).
quantiles = list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))
probabilities = list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf]))
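
Each ECDF can also be queried directly. A small sketch, assuming SciPy 1.11+ (where `scipy.stats.ecdf` was introduced):

    import numpy as np
    import scipy as sp

    sample = np.array([1.0, 2.0, 2.0, 5.0])
    cdf = sp.stats.ecdf(sample).cdf
    print(cdf.quantiles)      # unique sample values: [1. 2. 5.]
    print(cdf.probabilities)  # cumulative probabilities: [0.25 0.75 1.  ]
    print(cdf.evaluate(2.5))  # P(X <= 2.5) = 0.75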

36 changes: 18 additions & 18 deletions examples/plot_proximity_counts.py
@@ -5,13 +5,12 @@
This example demonstrates the use of quantile regression forest (QRF)
proximity counts to identify similar samples. In this scenario, we train a QRF
on a noisy dataset to predict individual pixel values in an unsupervised
- manner (the target labels are not used during training) for denoising
- purposes. We then retrieve the proximity values for the noisy test samples. We
- visualize each test sample alongside a set of similar (non-noisy) training
- samples determined by their proximity counts. These similar samples are
- ordered from the highest to the lowest proximity count. This illustrates how
- proximity counts can effectively identify similar samples even in noisy
- conditions.
+ manner for denoising purposes; the target labels are not used during training.
+ We then retrieve the proximity values for the noisy test samples. We visualize
+ each test sample alongside a set of similar (non-noisy) training samples
+ determined by their proximity counts. These similar samples are ordered from
+ the highest to the lowest proximity count. This illustrates how proximity
+ counts can effectively identify similar samples even in noisy conditions.
"""

import altair as alt
@@ -57,31 +56,34 @@ def add_gaussian_noise(X, mean=0, std=0.1, random_state=None):
return X_noisy


- def combine_floats(df1, df2, scale=100):
+ def combine_floats(df1, df2, scale=1000):
"""Combine two floats from separate data frames into a single number."""
combined_df = df1 * scale + df2
return combined_df


- def extract_floats(combined_df, scale=100):
+ def extract_floats(combined_df, scale=1000):
"""Extract the original floats from the combined data frame."""
df1 = np.floor(combined_df / scale)
df2 = combined_df - (df1 * scale)
return df1, df2
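
The round trip is exact as long as the values in the first frame are integer-valued and the values in the second stay within [0, scale); a quick usage check of the two helpers, with toy values:

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame({"pixel": [3.0, 16.0]})   # integer-valued originals
    df2 = pd.DataFrame({"pixel": [0.25, 0.87]})  # values within [0, scale)

    combined = combine_floats(df1, df2, scale=1000)    # 3000.25, 16000.87
    out1, out2 = extract_floats(combined, scale=1000)  # recovers both frames
    assert np.allclose(out1, df1) and np.allclose(out2, df2)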


- # Randomly add noise to the training and test data.
+ # Randomly add Gaussian noise to the training and test data.
X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)
X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)

+ # We set `max_samples_leaf=None` to ensure that every sample in the training
+ # data is stored in the leaf nodes. By doing this, we allow the model to
+ # consider all samples as potential candidates for proximity calculations.
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)

- # Fit the model to predict the non-noisy pixels from noisy pixels (i.e., to denoise).
+ # We fit a single multi-target model, with each pixel treated as a distinct target.
qrf.fit(X_train_noisy, X_train)

# Get the proximity counts.
- proximities = qrf.proximity_counts(X_test_noisy)
+ proximities = qrf.proximity_counts(X_test_noisy) # output is a list of tuples for each sample
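
For a single test digit, the most similar training samples can be read off directly. A small sketch, assuming each sample's tuples are ordered from highest to lowest count:

    # Inspect the three most proximate training samples for the first test digit.
    for train_idx, count in list(proximities[0])[:3]:
        print(f"train index {train_idx}: shares a leaf with the test digit {count} time(s)")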

df_prox = pd.DataFrame(
{"prox": [[(i, *p) for i, p in enumerate(proximities[x])] for x in range(len(X_test))]}
@@ -107,8 +109,8 @@ def extract_floats(combined_df, scale=100):
# Create a data frame for looking up training proximities.
df_lookup = (
combine_floats(X_train, X_train_noisy, scale=pixel_scale) # combine to reduce transmitted data
.assign(**{"index": np.arange(len(X_train))})
.join(y_train)
.assign(**{"index": np.arange(len(X_train))}) # align with proximities, which are zero-indexed
)


@@ -123,12 +125,10 @@ def plot_digits_proximities(
height=225,
width=225,
):
- dim_x = pixel_dim[0]
- dim_y = pixel_dim[1]
- num_x = len(str(dim_x))
- num_y = len(str(dim_y))
+ dim_x, dim_y = pixel_dim[0], pixel_dim[1]
+ dgt_x, dgt_y = len(str(dim_x)), len(str(dim_y))

- pixel_cols = [f"pixel_{y:0{num_y}}_{x:0{num_x}}" for y in range(dim_y) for x in range(dim_x)]
+ pixel_cols = [f"pixel_{y:0{dgt_y}}_{x:0{dgt_x}}" for y in range(dim_y) for x in range(dim_x)]
pixel_x = "split(datum.pixel, '_')[2]"
pixel_y = "split(datum.pixel, '_')[1]"

@@ -171,7 +171,7 @@
)

chart2 = (
base.transform_filter(f"datum.prox_idx < {n_prox}")
base.transform_filter(f"datum.prox_idx < {n_prox}") # filter to the display proximities
.transform_lookup(
lookup="prox_val",
from_=alt.LookupData(df_lookup, key="index", fields=pixel_cols + ["target"]),
2 changes: 1 addition & 1 deletion examples/plot_quantile_conformalized.py
@@ -34,7 +34,7 @@
"cqr": "Conformalized Quantile Regression (CQR)",
}

- # Load the California Housing Prices dataset.
+ # Load the California Housing dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
perm = random_state.permutation(min(len(X), n_samples))
X = X.iloc[perm]
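For context, CQR calibrates the QRF's intervals with conformity scores from a held-out calibration set. A minimal sketch of the standard adjustment (Romano et al., 2019); this is illustrative, not necessarily this example's exact implementation:

    import numpy as np

    def cqr_margin(y_calib, y_pred_low, y_pred_upp, alpha=0.5):
        """Compute the standard CQR calibration margin for a 1 - alpha interval."""
        # Conformity score: how far each calibration point falls outside its interval.
        scores = np.maximum(y_pred_low - y_calib, y_calib - y_pred_upp)
        # Finite-sample-corrected empirical quantile of the scores.
        n = len(y_calib)
        q_level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
        return np.quantile(scores, q_level, method="higher")

    # Widen (or shrink) the test intervals by the calibrated margin, e.g.:
    # margin = cqr_margin(y_calib, low_calib, upp_calib, alpha=0.5)
    # low_test, upp_test = low_test - margin, upp_test + margin
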
25 changes: 13 additions & 12 deletions examples/plot_quantile_example.py
@@ -23,6 +23,7 @@


def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
"""Make a toy dataset."""
random_state = check_random_state(random_state)

x = random_state.uniform(*bounds, size=n_samples)
@@ -35,7 +36,7 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
return np.atleast_2d(x).T, y


- # Create noisy data for modeling and non-noisy function data for illustration.
+ # Create a noisy dataset for modeling and a non-noisy version for illustration.
X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0)
X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_state=0)

@@ -44,25 +45,25 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_state)
qrf.fit(X_train, y_train)

- y_pred_func = qrf.predict(X_func, quantiles=quantiles)
- y_pred_test = qrf.predict(X_test, quantiles=quantiles)
+ y_pred_test = qrf.predict(X_test, quantiles=quantiles) # predict on noisy samples
+ y_pred_func = qrf.predict(X_func, quantiles=quantiles) # predict on non-noisy samples

df = pd.DataFrame(
{
"x": np.concatenate([X_func.reshape(-1), X_test.reshape(-1)]),
"y": np.concatenate([y_func, y_test]),
"y_pred": np.concatenate([y_pred_func[:, 1], y_pred_test[:, 1]]),
"y_pred_low": np.concatenate([y_pred_func[:, 0], y_pred_test[:, 0]]),
"y_pred_upp": np.concatenate([y_pred_func[:, 2], y_pred_test[:, 2]]),
"test": [False] * len(y_func) + [True] * len(y_test),
"x": np.concatenate([X_test.reshape(-1), X_func.reshape(-1)]),
"y": np.concatenate([y_test, y_func]),
"y_pred": np.concatenate([y_pred_test[:, 1], y_pred_func[:, 1]]),
"y_pred_low": np.concatenate([y_pred_test[:, 0], y_pred_func[:, 0]]),
"y_pred_upp": np.concatenate([y_pred_test[:, 2], y_pred_func[:, 2]]),
"test": [True] * len(y_test) + [False] * len(y_func),
}
)


def plot_fit_and_intervals(df):
area_pred = (
alt.Chart(df)
.transform_filter(~alt.datum["test"]) # filter to training data
.transform_filter(~alt.datum["test"]) # filter to non-test data
.mark_area(color="#e0f2ff", opacity=0.8)
.encode(
x=alt.X("x:Q", scale=alt.Scale(nice=False), title="X"),
@@ -95,7 +96,7 @@ def plot_fit_and_intervals(df):

line_true = (
alt.Chart(df.assign(**{"y_true_label": "f(x) = x sin(x)"}))
.transform_filter(~alt.datum["test"]) # filter to training data
.transform_filter(~alt.datum["test"]) # filter to non-test data
.mark_line(color="black", size=3)
.encode(
x=alt.X("x:Q", scale=alt.Scale(nice=False)),
@@ -110,7 +111,7 @@

line_pred = (
alt.Chart(df.assign(**{"y_pred_label": "Predicted Median"}))
.transform_filter(~alt.datum["test"]) # filter to training data
.transform_filter(~alt.datum["test"]) # filter to non-test data
.mark_line(color="#006aff", size=5)
.encode(
x=alt.X("x:Q", scale=alt.Scale(nice=False)),
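One way to sanity-check the fitted intervals is their empirical coverage on the held-out points, which should roughly match the nominal level implied by the lower and upper quantiles:

    # Empirical coverage of the prediction interval on the test rows.
    test_rows = df[df["test"]]
    covered = (test_rows["y"] >= test_rows["y_pred_low"]) & (test_rows["y"] <= test_rows["y_pred_upp"])
    print(f"Empirical coverage: {covered.mean():.3f}")
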
5 changes: 3 additions & 2 deletions examples/plot_quantile_extrapolation.py
@@ -34,6 +34,7 @@


def make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0):
"""Make a dataset from a specified function."""
random_state = check_random_state(random_state)

x = np.linspace(bounds[0], bounds[1], n_samples)
@@ -366,7 +367,7 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, *
return np.mean(xtra[test_indices])


- # Create the full dataset.
+ # Create a dataset that requires extrapolation.
X, y = make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0)

# Fit and extrapolate based on train-test split (depending on X).
@@ -377,7 +378,7 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, *
train_indices[sort_X[extrap_min_idx] : sort_X[extrap_max_idx]] = True
res = train_test_split(train_indices, rng=random_state, **qrf_params)

- # Get coverages on extrapolated samples.
+ # Get coverages for extrapolated samples.
args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], random_state)
cov_qrf = get_coverage_qrf(res["qmat"], *args)
cov_xtr = get_coverage_xtr(res["bounds_list"], *args)
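The split that forces extrapolation can be sketched in isolation: train only on the interior of the covariate range, then evaluate on both tails. The cutoff fractions here are illustrative:

    import numpy as np

    x = np.random.default_rng(0).uniform(0, 10, size=500)
    sort_idx = np.argsort(x)
    lo, hi = int(0.2 * len(x)), int(0.8 * len(x))  # illustrative interior cutoffs

    train_mask = np.zeros(len(x), dtype=bool)
    train_mask[sort_idx[lo:hi]] = True  # interior samples are used for training
    test_mask = ~train_mask             # both tails require extrapolation
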
5 changes: 3 additions & 2 deletions examples/plot_quantile_interpolation.py
@@ -21,7 +21,7 @@
random_state = np.random.RandomState(0)
intervals = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist()

- # Create toy dataset.
+ # Create a simple toy dataset.
X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]])
y = np.array([-2, -1, 0, 1, 2])

@@ -59,11 +59,12 @@
"quantile_upp": [None] * len(y),
}

+ # Make predictions at the median and intervals.
quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)]

# Populate data based on prediction results with different interpolations.
for interpolation in interpolations:
- # Get predictions at median and prediction intervals.
+ # Get predictions using the specified quantiles and interpolation method.
y_pred = qrf.predict(X, quantiles=quantiles, interpolation=interpolation.lower())

data["method"].extend([interpolation] * len(y))
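The interpolation keywords mirror NumPy's quantile methods, and their effect is easiest to see on a tiny sample where the requested quantile falls between two observations:

    import numpy as np

    values = np.array([1.0, 2.0, 3.0, 4.0])
    for method in ["linear", "lower", "higher", "midpoint", "nearest"]:
        print(method, np.quantile(values, 0.4, method=method))
    # linear 2.2, lower 2.0, higher 3.0, midpoint 2.5, nearest 2.0
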
4 changes: 2 additions & 2 deletions examples/plot_quantile_intervals.py
@@ -19,15 +19,15 @@
random_state = np.random.RandomState(0)
n_samples = 1000

- # Load the California Housing Prices dataset.
+ # Load the California Housing dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
perm = random_state.permutation(min(len(X), n_samples))
X = X.iloc[perm]
y = y.iloc[perm]

qrf = RandomForestQuantileRegressor(random_state=random_state)

- kf = KFold(n_splits=5)
+ kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
kf.get_n_splits(X)

# Using k-fold cross-validation, get predictions for all samples.
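The out-of-fold predictions can then be gathered with the usual split loop. A minimal sketch, with illustrative quantiles:

    import numpy as np

    # Collect out-of-fold interval predictions for every sample.
    y_pred_cv = np.zeros((len(X), 3))
    for train_idx, test_idx in kf.split(X):
        qrf.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred_cv[test_idx] = qrf.predict(X.iloc[test_idx], quantiles=[0.025, 0.5, 0.975])
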
5 changes: 3 additions & 2 deletions examples/plot_quantile_multioutput.py
@@ -40,14 +40,15 @@


def make_func_Xy(funcs, bounds, n_samples):
"""Make a dataset from a specified function."""
x = np.linspace(*bounds, n_samples)
y = np.empty((len(x), len(funcs)))
for i, func in enumerate(funcs):
y[:, i] = func["signal"](x) + func["noise"](x)
return np.atleast_2d(x).T, y


- # Create the dataset with multiple target variables.
+ # Create a dataset with multiple target variables.
X, y = make_func_Xy(funcs, bounds, n_samples)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
@@ -56,7 +57,7 @@ def make_func_Xy(funcs, bounds, n_samples):
qrf.fit(X_train, y_train) # fit on all of the targets simultaneously

# Get multi-target predictions at specified quantiles.
- y_pred = qrf.predict(X, quantiles=quantiles)
+ y_pred = qrf.predict(X, quantiles=quantiles) # shape = (n_samples, n_targets, n_quantiles)

df = pd.DataFrame(
{
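With the output shaped `(n_samples, n_targets, n_quantiles)`, per-target intervals come from indexing the last two axes; for example, assuming three quantiles ordered low/median/high:

    target = 0
    y_low = y_pred[:, target, 0]  # lower bound for the first target
    y_med = y_pred[:, target, 1]  # median for the first target
    y_upp = y_pred[:, target, 2]  # upper bound for the first target
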
4 changes: 3 additions & 1 deletion examples/plot_quantile_ranks.py
@@ -25,13 +25,15 @@


def make_toy_dataset(n_samples, bounds, random_state=0):
"""Make a toy dataset."""
random_state = check_random_state(random_state)
X_1d = np.linspace(*bounds, num=n_samples)
X = X_1d.reshape(-1, 1)
y = X_1d * np.cos(X_1d) + random_state.normal(scale=X_1d / math.e)
return X, y


+ # Create a toy dataset.
X, y = make_toy_dataset(n_samples, bounds, random_state=0)

qrf = RandomForestQuantileRegressor(
@@ -43,7 +45,7 @@ def make_toy_dataset(n_samples, bounds, random_state=0):
y_pred = qrf.predict(X, quantiles=0.5)

# Get the quantile rank for all samples.
- y_rank = qrf.quantile_ranks(X, y)
+ y_rank = qrf.quantile_ranks(X, y) # output is a value in the range [0, 1] for each sample

df = pd.DataFrame({"x": X.reshape(-1), "y": y, "y_pred": y_pred, "y_rank": y_rank})

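Because the ranks lie in [0, 1], values near the extremes flag observations that fall in the tails of their predicted distribution. The cutoffs below are illustrative:

    # Flag samples whose response is extreme relative to the predicted distribution.
    outlier = (y_rank < 0.025) | (y_rank > 0.975)  # illustrative cutoffs
    print(f"Flagged {outlier.sum()} of {len(y_rank)} samples as potential outliers")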
