Update example plots
reidjohnson committed Sep 14, 2024
1 parent f54d55c commit e20cbcd
Showing 4 changed files with 149 additions and 64 deletions.
2 changes: 2 additions & 0 deletions docs/sphinx_requirements.txt
@@ -1,5 +1,6 @@
sphinx
altair
geopandas
numpydoc
pillow
pydata_sphinx_theme
@@ -8,4 +9,5 @@ skops
sphinxcontrib.bibtex
sphinx-design
sphinxext-altair
vega-datasets
vl-convert-python
194 changes: 139 additions & 55 deletions examples/plot_qrf_huggingface_inference.py
@@ -3,28 +3,30 @@
==============================================
This example demonstrates how to download a trained quantile regression forest
(QRF) model from Hugging Face Hub and use it to estimate new quantiles. In
this scenario, a QRF has been trained with default parameters on a train-test
split of the California housing dataset and uploaded to Hugging Face Hub. The
model is downloaded and used to perform inference across several quantiles for
each dataset sample. The results are visualized by the latitude and longitude
of each sample. The model used is available on Hugging Face Hub
(QRF) model from Hugging Face Hub and use it to estimate quantiles. In this
scenario, a QRF has been trained with default parameters on the California
Housing dataset using k-fold cross-validation and uploaded to Hugging Face
Hub. The model is downloaded and used to perform inference across multiple
quantiles for each sample in the dataset. The predictions are aggregated by
county, based on the latitude and longitude of each sample, and visualized.
The trained model is available on Hugging Face Hub
`here <https://huggingface.co/quantile-forest/california-housing-example>`_.
"""

import os
import pickle
import shutil
import tempfile

import altair as alt
import geopandas as gpd
import joblib
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import KFold
from skops import hub_utils

import quantile_forest
from quantile_forest import RandomForestQuantileRegressor
from vega_datasets import data

alt.data_transformers.disable_max_rows()

@@ -33,8 +35,53 @@
load_existing = True

random_state = np.random.RandomState(0)
quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist()
sample_frac = 1
quantiles = np.linspace(0, 1, num=21, endpoint=True).round(2).tolist()
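# i.e., 21 evenly spaced quantiles: [0.0, 0.05, 0.1, ..., 0.95, 1.0]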


class CrossValidationPipeline(BaseEstimator, RegressorMixin):
"""Cross-validation pipeline for scikit-learn compatible models."""

def __init__(self, base_model, n_splits=5, random_state=None):
self.base_model = base_model
self.n_splits = n_splits
self.random_state = random_state
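        # Map each fold index to its fitted model and to the indices of the
        # samples held out by that fold.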
self.fold_models = {}
self.fold_indices = {}

def fit(self, X, y):
"""Fit the model using k-fold cross-validation."""
kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
X_train, y_train = X.iloc[train_idx], y[train_idx]
model = clone(self.base_model)
model.fit(X_train, y_train)
self.fold_models[fold_idx] = model
self.fold_indices[fold_idx] = test_idx
return self

def predict(self, X, quantiles=None):
"""Predict using the appropriate k-fold model."""
if quantiles is None:
quantiles = 0.5
if not isinstance(quantiles, list):
quantiles = [quantiles]
y_pred = np.empty((X.shape[0], len(quantiles)) if len(quantiles) > 1 else (X.shape[0]))
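        # Each sample is predicted by the model from the fold that held it out,
        # so every prediction is out-of-fold.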
for fold_idx, test_idx in self.fold_indices.items():
fold_model = self.fold_models[fold_idx]
y_pred[test_idx] = fold_model.predict(X.iloc[test_idx], quantiles=quantiles)
return y_pred

def save(self, filename):
with open(filename, "wb") as f:
joblib.dump(self.__getstate__(), f)

@classmethod
def load(cls, filename):
with open(filename, "rb") as f:
state = joblib.load(f)
obj = cls(base_model=None)
obj.__setstate__(state)
return obj
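
# A minimal usage sketch for `CrossValidationPipeline` (an illustration only;
# the toy settings below are not the values used in this example):
#
#   X_demo, y_demo = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
#   cv_qrf = CrossValidationPipeline(
#       RandomForestQuantileRegressor(n_estimators=10), n_splits=5, random_state=0
#   )
#   cv_qrf.fit(X_demo, y_demo)
#   y_med = cv_qrf.predict(X_demo, quantiles=0.5)  # out-of-fold median predictions
#   cv_qrf.save("cv_qrf.pkl")
#   cv_qrf = CrossValidationPipeline.load("cv_qrf.pkl")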


def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None):
@@ -47,21 +94,27 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
median_absolute_error,
r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from skops import card

import quantile_forest
from quantile_forest import RandomForestQuantileRegressor

# Load the California Housing dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
# Define the model pipeline.
qrf = RandomForestQuantileRegressor(random_state=random_state)
pipeline = Pipeline(
[("cv_model", CrossValidationPipeline(qrf, n_splits=5, random_state=random_state))]
)

# Fit the model.
qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)
# Fit the model pipeline.
pipeline.fit(X, y)

# Save the model to a file.
# Save the pipeline (with all models) to a file.
model_filename = "model.pkl"
with open(model_filename, mode="bw") as f:
pickle.dump(qrf, file=f)
pipeline.named_steps["cv_model"].save(model_filename)

# Prepare model repository.
if os.path.exists(local_dir):
@@ -74,7 +127,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
requirements=[f"quantile-forest={quantile_forest.__version__}"],
dst=local_dir,
task="tabular-regression",
data=X_train,
data=X,
)

# Create a model card.
@@ -91,19 +144,18 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
"prediction-intervals",
]
model_description = (
"This is a RandomForestQuantileRegressor trained on the California housing dataset."
"This is a RandomForestQuantileRegressor trained on the California Housing dataset."
)
limitations = "This model is not ready to be used in production."
training_procedure = (
"The model was trained using default parameters on a standard train-test split."
"The model was trained using default parameters on a 5-fold cross-validation pipeline."
)
get_started_code = """<details>
<summary> Click to expand </summary>
```python
import pickle
with open(qrf_pkl_filename, 'rb') as file:
qrf = pickle.load(file)
from examples.plot_qrf_huggingface_inference import CrossValidationPipeline
pipeline = CrossValidationPipeline.load(qrf_pkl_filename)
```
</details>"""
@@ -117,11 +169,11 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
)

# Add performance metrics to the model card.
y_pred = qrf.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
mdae = median_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
y_pred = pipeline.predict(X)
mape = mean_absolute_percentage_error(y, y_pred)
mdae = median_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
model_card.add_metrics(
**{
"Mean Absolute Percentage Error": mape,
@@ -159,23 +211,21 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
local_dir = "./local_repo"
with tempfile.TemporaryDirectory() as local_dir:
hub_utils.download(repo_id=repo_id, dst=local_dir)
with open(f"{local_dir}/{model_filename}", "rb") as file:
qrf = pickle.load(file)
pipeline = CrossValidationPipeline.load(f"{local_dir}/{model_filename}")

# Fetch the California Housing dataset and estimate quantiles.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
y_pred = qrf.predict(X, quantiles=quantiles) * 100_000 # predict in dollars
y_pred = pipeline.predict(X, quantiles=quantiles) * 100_000 # predict in dollars

df = (
pd.DataFrame(y_pred, columns=quantiles)
.reset_index()
.sample(frac=sample_frac, random_state=random_state)
.melt(id_vars=["index"], var_name="quantile", value_name="value")
.rename(columns={q: f"q_{q:.3g}" for q in quantiles})
.merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
)


def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"):
def plot_quantiles_by_latlon(df, quantiles, color_scheme="lightgreyred"):
"""Plot quantile predictions on California Housing dataset by lat/lon."""
# Slider for varying the displayed quantile estimates.
slider = alt.binding_range(
@@ -187,35 +237,69 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"):

quantile_val = alt.param(name="quantile", value=0.5, bind=slider)

# Load the US counties data and filter to California counties.
ca_counties = (
gpd.read_file(data.us_10m.url, layer="counties")
.set_crs("EPSG:4326")
.assign(**{"county_fips": lambda x: x["id"].astype(int)})
.drop(columns=["id"])
.query("(county_fips >= 6000) & (county_fips < 7000)")
)
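    # (California county FIPS codes are 6001-6115, hence the 6000-7000 filter.)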

x_min = df[[f"q_{q:.3g}" for q in quantiles]].min().min()
x_max = df[[f"q_{q:.3g}" for q in quantiles]].max().max()

df = (
gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]), crs="EPSG:4326"
)
.sjoin(ca_counties, how="right")
.drop(columns=["index_left0"])
.assign(
**{f"w_q_{q:.3g}": lambda x, q=q: x[f"q_{q:.3g}"] * x["Population"] for q in quantiles}
)
)

grouped = (
df.groupby("county_fips")
.agg({**{f"w_q_{q:.3g}": "sum" for q in quantiles}, **{"Population": "sum"}})
.reset_index()
.assign(
**{f"q_{q:.3g}": lambda x, q=q: x[f"w_q_{q:.3g}"] / x["Population"] for q in quantiles}
)
)

df = (
df[["county_fips", "Latitude", "Longitude", "geometry"]]
.drop_duplicates(subset=["county_fips"])
.merge(
grouped[["county_fips", "Population"] + [f"q_{q:.3g}" for q in quantiles]],
on="county_fips",
how="left",
)
)
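    # Worked example of the population weighting above (illustrative numbers):
    # a county with samples (Population=100, q_0.5=200,000) and (Population=300,
    # q_0.5=400,000) aggregates to (100*200,000 + 300*400,000) / 400 = 350,000.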

chart = (
alt.Chart(df)
.add_params(quantile_val)
.transform_filter("datum.quantile == quantile")
.mark_circle()
.transform_calculate(quantile_col="'q_' + quantile")
        .transform_calculate(value="datum[datum.quantile_col]")
.mark_geoshape(stroke="black", strokeWidth=0.5)
.encode(
x=alt.X(
"Longitude:Q",
axis=alt.Axis(tickMinStep=1, format=".1f"),
scale=alt.Scale(zero=False),
title="Longitude",
),
y=alt.Y(
"Latitude:Q",
axis=alt.Axis(tickMinStep=1, format=".1f"),
scale=alt.Scale(zero=False),
title="Latitude",
color=alt.Color(
"value:Q",
scale=alt.Scale(domain=[x_min, x_max], scheme=color_scheme),
title="Prediction",
),
color=alt.Color("value:Q", scale=alt.Scale(scheme=color_scheme), title="Prediction"),
size=alt.Size("Population:Q"),
tooltip=[
alt.Tooltip("index:N", title="Row ID"),
alt.Tooltip("Latitude:Q", format=".2f", title="Latitude"),
alt.Tooltip("Longitude:Q", format=".2f", title="Longitude"),
alt.Tooltip("county_fips:N", title="County FIPS"),
alt.Tooltip("Population:N", format=",.0f", title="Population"),
alt.Tooltip("value:Q", format="$,.0f", title="Predicted Value"),
],
)
.project(type="mercator")
.properties(
title="Quantile Predictions on the California Housing Dataset",
title="Quantile Predictions on the California Housing Dataset by County",
height=650,
width=650,
)
4 changes: 2 additions & 2 deletions examples/plot_qrf_interpolation_methods.py
@@ -53,8 +53,8 @@
"method": ["Actual"] * len(y),
"X": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())],
"y_pred": y.tolist(),
"y_pred_low": y.tolist(),
"y_pred_upp": y.tolist(),
"y_pred_low": [None] * len(y),
"y_pred_upp": [None] * len(y),
"quantile_low": [None] * len(y),
"quantile_upp": [None] * len(y),
}
13 changes: 6 additions & 7 deletions examples/plot_qrf_multitarget.py
@@ -23,7 +23,7 @@
quantiles = np.linspace(0, 1, num=41, endpoint=True).round(3).tolist()

# Define functions that generate targets; each function maps to one target.
funcs = [
target_funcs = [
{
"signal": lambda x: np.log1p(x + 1),
"noise": lambda x: np.log1p(x) * random_state.uniform(size=len(x)),
@@ -36,18 +36,19 @@
},
]

legend = {k: v for f in funcs for k, v in f["legend"].items()}


def make_funcs_Xy(funcs, n_samples, bounds):
"""Make a dataset from specified function(s) with signal and noise."""
"""Make a dataset from specified function(s)."""
x = np.linspace(*bounds, n_samples)
y = np.empty((len(x), len(funcs)))
for i, func in enumerate(funcs):
y[:, i] = func["signal"](x) + func["noise"](x)
y[:, i] = func(x)
return np.atleast_2d(x).T, y


# Bind each dict via a default argument (`f=f`) so each lambda captures its own
# target function rather than the loop's final value.
funcs = [lambda x, f=f: f["signal"](x) + f["noise"](x) for f in target_funcs]
legend = {k: v for f in target_funcs for k, v in f["legend"].items()}

# Create a dataset with multiple target variables.
X, y = make_funcs_Xy(funcs, n_samples, bounds)

@@ -63,7 +64,6 @@ def make_funcs_Xy(funcs, n_samples, bounds):
{
"x": np.tile(X.squeeze(), len(funcs)),
"y": y.reshape(-1, order="F"),
"y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]),
"y_pred": np.concatenate([y_pred[:, i, len(quantiles) // 2] for i in range(len(funcs))]),
"target": np.concatenate([[str(i)] * len(X) for i in range(len(funcs))]),
**{f"q_{q_i:.3g}": y_i.ravel() for q_i, y_i in zip(quantiles, y_pred.T)},
@@ -95,7 +95,6 @@ def plot_multitargets(df, legend):
alt.Tooltip("target:N", title="Target"),
alt.Tooltip("x:Q", format=",.3f", title="X"),
alt.Tooltip("y:Q", format=",.3f", title="Y"),
alt.Tooltip("y_true:Q", format=",.3f", title="Y"),
alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"),
alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"),
