From e20cbcd2e8b07ac51931f223d8ee31b964be909c Mon Sep 17 00:00:00 2001
From: Reid Johnson
Date: Sat, 14 Sep 2024 04:54:02 -0700
Subject: [PATCH] Update example plots

---
 docs/sphinx_requirements.txt               |   2 +
 examples/plot_qrf_huggingface_inference.py | 194 +++++++++++++++------
 examples/plot_qrf_interpolation_methods.py |   4 +-
 examples/plot_qrf_multitarget.py           |  13 +-
 4 files changed, 149 insertions(+), 64 deletions(-)

diff --git a/docs/sphinx_requirements.txt b/docs/sphinx_requirements.txt
index 826e7ba..494024f 100644
--- a/docs/sphinx_requirements.txt
+++ b/docs/sphinx_requirements.txt
@@ -1,5 +1,6 @@
 sphinx
 altair
+geopandas
 numpydoc
 pillow
 pydata_sphinx_theme
@@ -8,4 +9,5 @@ skops
 sphinxcontrib.bibtex
 sphinx-design
 sphinxext-altair
+vega-datasets
 vl-convert-python
diff --git a/examples/plot_qrf_huggingface_inference.py b/examples/plot_qrf_huggingface_inference.py
index 01f3b47..36dc25c 100755
--- a/examples/plot_qrf_huggingface_inference.py
+++ b/examples/plot_qrf_huggingface_inference.py
@@ -3,28 +3,30 @@
 ==============================================
 
 This example demonstrates how to download a trained quantile regression forest
-(QRF) model from Hugging Face Hub and use it to estimate new quantiles. In
-this scenario, a QRF has been trained with default parameters on a train-test
-split of the California housing dataset and uploaded to Hugging Face Hub. The
-model is downloaded and used to perform inference across several quantiles for
-each dataset sample. The results are visualized by the latitude and longitude
-of each sample. The model used is available on Hugging Face Hub
+(QRF) model from Hugging Face Hub and use it to estimate quantiles. In this
+scenario, a QRF has been trained with default parameters on the California
+Housing dataset using k-fold cross-validation and uploaded to Hugging Face
+Hub. The model is downloaded and used to perform inference across multiple
+quantiles for each sample in the dataset. The predictions are aggregated by
+county based on the latitude and longitude of each sample and visualized.
+The trained model is available on Hugging Face Hub
 `here `_.
""" import os -import pickle import shutil import tempfile import altair as alt +import geopandas as gpd +import joblib import numpy as np import pandas as pd from sklearn import datasets +from sklearn.base import BaseEstimator, RegressorMixin, clone +from sklearn.model_selection import KFold from skops import hub_utils - -import quantile_forest -from quantile_forest import RandomForestQuantileRegressor +from vega_datasets import data alt.data_transformers.disable_max_rows() @@ -33,8 +35,53 @@ load_existing = True random_state = np.random.RandomState(0) -quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist() -sample_frac = 1 +quantiles = np.linspace(0, 1, num=21, endpoint=True).round(2).tolist() + + +class CrossValidationPipeline(BaseEstimator, RegressorMixin): + """Cross-validation pipeline for scikit-learn compatible models.""" + + def __init__(self, base_model, n_splits=5, random_state=None): + self.base_model = base_model + self.n_splits = n_splits + self.random_state = random_state + self.fold_models = {} + self.fold_indices = {} + + def fit(self, X, y): + """Fit the model using k-fold cross-validation.""" + kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state) + for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)): + X_train, y_train = X.iloc[train_idx], y[train_idx] + model = clone(self.base_model) + model.fit(X_train, y_train) + self.fold_models[fold_idx] = model + self.fold_indices[fold_idx] = test_idx + return self + + def predict(self, X, quantiles=None): + """Predict using the appropriate k-fold model.""" + if quantiles is None: + quantiles = 0.5 + if not isinstance(quantiles, list): + quantiles = [quantiles] + y_pred = np.empty((X.shape[0], len(quantiles)) if len(quantiles) > 1 else (X.shape[0])) + for fold_idx, test_idx in self.fold_indices.items(): + fold_model = self.fold_models[fold_idx] + y_pred[test_idx] = fold_model.predict(X.iloc[test_idx], quantiles=quantiles) + return y_pred + + def save(self, filename): + with open(filename, "wb") as f: + joblib.dump(self.__getstate__(), f) + + @classmethod + def load(cls, filename): + with open(filename, "rb") as f: + state = joblib.load(f) + obj = cls(base_model=None) + obj.__setstate__(state) + return obj def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None): @@ -47,21 +94,27 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= median_absolute_error, r2_score, ) - from sklearn.model_selection import train_test_split + from sklearn.pipeline import Pipeline from skops import card + import quantile_forest + from quantile_forest import RandomForestQuantileRegressor + # Load the California Housing dataset. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) + # Define the model pipeline. + qrf = RandomForestQuantileRegressor(random_state=random_state) + pipeline = Pipeline( + [("cv_model", CrossValidationPipeline(qrf, n_splits=5, random_state=random_state))] + ) - # Fit the model. - qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train) + # Fit the model pipeline. + pipeline.fit(X, y) - # Save the model to a file. + # Save the pipeline (with all models) to a file. model_filename = "model.pkl" - with open(model_filename, mode="bw") as f: - pickle.dump(qrf, file=f) + pipeline.named_steps["cv_model"].save(model_filename) # Prepare model repository. 
     if os.path.exists(local_dir):
@@ -74,7 +127,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
         requirements=[f"quantile-forest={quantile_forest.__version__}"],
         dst=local_dir,
         task="tabular-regression",
-        data=X_train,
+        data=X,
     )
 
     # Create a model card.
@@ -91,19 +144,18 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
         "prediction-intervals",
     ]
     model_description = (
-        "This is a RandomForestQuantileRegressor trained on the California housing dataset."
+        "This is a RandomForestQuantileRegressor trained on the California Housing dataset."
     )
     limitations = "This model is not ready to be used in production."
     training_procedure = (
-        "The model was trained using default parameters on a standard train-test split."
+        "The model was trained with default parameters using a 5-fold cross-validation pipeline."
     )
     get_started_code = """
 Click to expand
 
 ```python
-import pickle
-with open(qrf_pkl_filename, 'rb') as file:
-    qrf = pickle.load(file)
+from examples.plot_qrf_huggingface_inference import CrossValidationPipeline
+pipeline = CrossValidationPipeline.load(qrf_pkl_filename)
 ```
 
""" @@ -117,11 +169,11 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= ) # Add performance metrics to the model card. - y_pred = qrf.predict(X_test) - mape = mean_absolute_percentage_error(y_test, y_pred) - mdae = median_absolute_error(y_test, y_pred) - mse = mean_squared_error(y_test, y_pred) - r2 = r2_score(y_test, y_pred) + y_pred = pipeline.predict(X) + mape = mean_absolute_percentage_error(y, y_pred) + mdae = median_absolute_error(y, y_pred) + mse = mean_squared_error(y, y_pred) + r2 = r2_score(y, y_pred) model_card.add_metrics( **{ "Mean Absolute Percentage Error": mape, @@ -159,23 +211,21 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= local_dir = "./local_repo" with tempfile.TemporaryDirectory() as local_dir: hub_utils.download(repo_id=repo_id, dst=local_dir) - with open(f"{local_dir}/{model_filename}", "rb") as file: - qrf = pickle.load(file) + pipeline = CrossValidationPipeline.load(f"{local_dir}/{model_filename}") # Fetch the California Housing dataset and estimate quantiles. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) -y_pred = qrf.predict(X, quantiles=quantiles) * 100_000 # predict in dollars +y_pred = pipeline.predict(X, quantiles=quantiles) * 100_000 # predict in dollars df = ( pd.DataFrame(y_pred, columns=quantiles) .reset_index() - .sample(frac=sample_frac, random_state=random_state) - .melt(id_vars=["index"], var_name="quantile", value_name="value") + .rename(columns={q: f"q_{q:.3g}" for q in quantiles}) .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right") ) -def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"): +def plot_quantiles_by_latlon(df, quantiles, color_scheme="lightgreyred"): """Plot quantile predictions on California Housing dataset by lat/lon.""" # Slider for varying the displayed quantile estimates. slider = alt.binding_range( @@ -187,35 +237,69 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"): quantile_val = alt.param(name="quantile", value=0.5, bind=slider) + # Load the US counties data and filter to California counties. 
+    ca_counties = (
+        gpd.read_file(data.us_10m.url, layer="counties")
+        .set_crs("EPSG:4326")
+        .assign(**{"county_fips": lambda x: x["id"].astype(int)})
+        .drop(columns=["id"])
+        .query("(county_fips >= 6000) & (county_fips < 7000)")
+    )
+
+    x_min = df[[f"q_{q:.3g}" for q in quantiles]].min().min()
+    x_max = df[[f"q_{q:.3g}" for q in quantiles]].max().max()
+
+    df = (
+        gpd.GeoDataFrame(
+            df, geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]), crs="EPSG:4326"
+        )
+        .sjoin(ca_counties, how="right")
+        .drop(columns=["index_left0"])
+        .assign(
+            **{f"w_q_{q:.3g}": lambda x, q=q: x[f"q_{q:.3g}"] * x["Population"] for q in quantiles}
+        )
+    )
+
+    grouped = (
+        df.groupby("county_fips")
+        .agg({**{f"w_q_{q:.3g}": "sum" for q in quantiles}, **{"Population": "sum"}})
+        .reset_index()
+        .assign(
+            **{f"q_{q:.3g}": lambda x, q=q: x[f"w_q_{q:.3g}"] / x["Population"] for q in quantiles}
+        )
+    )
+
+    df = (
+        df[["county_fips", "Latitude", "Longitude", "geometry"]]
+        .drop_duplicates(subset=["county_fips"])
+        .merge(
+            grouped[["county_fips", "Population"] + [f"q_{q:.3g}" for q in quantiles]],
+            on="county_fips",
+            how="left",
+        )
+    )
+
     chart = (
         alt.Chart(df)
         .add_params(quantile_val)
-        .transform_filter("datum.quantile == quantile")
-        .mark_circle()
+        .transform_calculate(quantile_col="'q_' + quantile")
+        .transform_calculate(value="datum[datum.quantile_col]")
+        .mark_geoshape(stroke="black", strokeWidth=0.5)
        .encode(
-            x=alt.X(
-                "Longitude:Q",
-                axis=alt.Axis(tickMinStep=1, format=".1f"),
-                scale=alt.Scale(zero=False),
-                title="Longitude",
-            ),
-            y=alt.Y(
-                "Latitude:Q",
-                axis=alt.Axis(tickMinStep=1, format=".1f"),
-                scale=alt.Scale(zero=False),
-                title="Latitude",
+            color=alt.Color(
+                "value:Q",
+                scale=alt.Scale(domain=[x_min, x_max], scheme=color_scheme),
+                title="Prediction",
             ),
-            color=alt.Color("value:Q", scale=alt.Scale(scheme=color_scheme), title="Prediction"),
-            size=alt.Size("Population:Q"),
             tooltip=[
-                alt.Tooltip("index:N", title="Row ID"),
-                alt.Tooltip("Latitude:Q", format=".2f", title="Latitude"),
-                alt.Tooltip("Longitude:Q", format=".2f", title="Longitude"),
+                alt.Tooltip("county_fips:N", title="County FIPS"),
+                alt.Tooltip("Population:Q", format=",.0f", title="Population"),
                 alt.Tooltip("value:Q", format="$,.0f", title="Predicted Value"),
             ],
         )
+        .project(type="mercator")
         .properties(
-            title="Quantile Predictions on the California Housing Dataset",
+            title="Quantile Predictions on the California Housing Dataset by County",
             height=650,
             width=650,
         )
diff --git a/examples/plot_qrf_interpolation_methods.py b/examples/plot_qrf_interpolation_methods.py
index 3120d37..576c34d 100755
--- a/examples/plot_qrf_interpolation_methods.py
+++ b/examples/plot_qrf_interpolation_methods.py
@@ -53,8 +53,8 @@
             "method": ["Actual"] * len(y),
             "X": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())],
             "y_pred": y.tolist(),
-            "y_pred_low": y.tolist(),
-            "y_pred_upp": y.tolist(),
+            "y_pred_low": [None] * len(y),
+            "y_pred_upp": [None] * len(y),
             "quantile_low": [None] * len(y),
             "quantile_upp": [None] * len(y),
         }
diff --git a/examples/plot_qrf_multitarget.py b/examples/plot_qrf_multitarget.py
index 1e16ffb..25c436c 100644
--- a/examples/plot_qrf_multitarget.py
+++ b/examples/plot_qrf_multitarget.py
@@ -23,7 +23,7 @@
 quantiles = np.linspace(0, 1, num=41, endpoint=True).round(3).tolist()
 
 # Define functions that generate targets; each function maps to one target.
-funcs = [
+target_funcs = [
     {
         "signal": lambda x: np.log1p(x + 1),
         "noise": lambda x: np.log1p(x) * random_state.uniform(size=len(x)),
@@ -36,18 +36,19 @@
     },
 ]
 
-legend = {k: v for f in funcs for k, v in f["legend"].items()}
-
 
 def make_funcs_Xy(funcs, n_samples, bounds):
-    """Make a dataset from specified function(s) with signal and noise."""
+    """Make a dataset from specified function(s)."""
     x = np.linspace(*bounds, n_samples)
     y = np.empty((len(x), len(funcs)))
     for i, func in enumerate(funcs):
-        y[:, i] = func["signal"](x) + func["noise"](x)
+        y[:, i] = func(x)
     return np.atleast_2d(x).T, y
 
 
+funcs = [lambda x, f=f: f["signal"](x) + f["noise"](x) for f in target_funcs]
+legend = {k: v for f in target_funcs for k, v in f["legend"].items()}
+
 # Create a dataset with multiple target variables.
 X, y = make_funcs_Xy(funcs, n_samples, bounds)
 
@@ -63,7 +64,6 @@ def make_funcs_Xy(funcs, n_samples, bounds):
     {
         "x": np.tile(X.squeeze(), len(funcs)),
         "y": y.reshape(-1, order="F"),
-        "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]),
         "y_pred": np.concatenate([y_pred[:, i, len(quantiles) // 2] for i in range(len(funcs))]),
         "target": np.concatenate([[str(i)] * len(X) for i in range(len(funcs))]),
         **{f"q_{q_i:.3g}": y_i.ravel() for q_i, y_i in zip(quantiles, y_pred.T)},
@@ -95,7 +95,6 @@ def plot_multitargets(df, legend):
             alt.Tooltip("target:N", title="Target"),
             alt.Tooltip("x:Q", format=",.3f", title="X"),
             alt.Tooltip("y:Q", format=",.3f", title="Y"),
-            alt.Tooltip("y_true:Q", format=",.3f", title="Y"),
             alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
             alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"),
             alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"),
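
A usage note for this patch: `CrossValidationPipeline.predict` routes each row through the fold model that held it out, using the fold indices stored at fit time, so it assumes it is given the same dataset, in the same row order, that was passed to `fit`, which is exactly how the example uses it. The inference path the patch sets up can be sketched as below. This is a minimal, untested sketch: the `repo_id` value is a placeholder, while `hub_utils.download`, `CrossValidationPipeline`, and the `model.pkl` filename all come from the example above.

```python
# Minimal sketch of the inference path set up by this patch. Assumptions:
# `repo_id` is a placeholder for the uploaded Hugging Face repository, and
# "model.pkl" matches the filename written by fit_and_upload_model().
import tempfile

from sklearn import datasets
from skops import hub_utils

from examples.plot_qrf_huggingface_inference import CrossValidationPipeline

repo_id = "<username>/<repo-name>"  # placeholder; set to the uploaded repo

with tempfile.TemporaryDirectory() as local_dir:
    # Fetch the repository contents, including the saved pipeline state.
    hub_utils.download(repo_id=repo_id, dst=local_dir)
    pipeline = CrossValidationPipeline.load(f"{local_dir}/model.pkl")

# Out-of-fold estimates: each sample is predicted by the fold model that
# did not see it during fitting, at the requested quantiles.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
y_pred = pipeline.predict(X, quantiles=[0.25, 0.5, 0.75])
print(y_pred.shape)  # (n_samples, 3)
```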