From 3c9c784e8c9cfaa0d4103934d8a539290e614d6d Mon Sep 17 00:00:00 2001 From: Reid Johnson Date: Sat, 14 Sep 2024 04:40:47 -0700 Subject: [PATCH] Update plot_qrf_huggingface_inference.py --- examples/plot_qrf_huggingface_inference.py | 121 ++++++++++++++------- 1 file changed, 84 insertions(+), 37 deletions(-) diff --git a/examples/plot_qrf_huggingface_inference.py b/examples/plot_qrf_huggingface_inference.py index 04c2432..36dc25c 100755 --- a/examples/plot_qrf_huggingface_inference.py +++ b/examples/plot_qrf_huggingface_inference.py @@ -3,31 +3,31 @@ ============================================== This example demonstrates how to download a trained quantile regression forest -(QRF) model from Hugging Face Hub and use it to estimate new quantiles. In -this scenario, a QRF has been trained with default parameters on a train-test -split of the California housing dataset and uploaded to Hugging Face Hub. The -model is downloaded and used to perform inference across several quantiles for -each dataset sample. The results are visualized by the latitude and longitude -of each sample. The model used is available on Hugging Face Hub +(QRF) model from Hugging Face Hub and use it to estimate quantiles. In this +scenario, a QRF has been trained with default parameters on the California +Housing dataset using k-fold cross-validation and uploaded to Hugging Face +Hub. The model is downloaded and used to perform inference across multiple +quantiles for each sample in the dataset. The predictions are aggregated by +county based on the latitude and longitude of each sample and visualized. +The trained model is available on Hugging Face Hub `here `_. 
""" import os -import pickle import shutil import tempfile import altair as alt import geopandas as gpd +import joblib import numpy as np import pandas as pd from sklearn import datasets +from sklearn.base import BaseEstimator, RegressorMixin, clone +from sklearn.model_selection import KFold from skops import hub_utils from vega_datasets import data -import quantile_forest -from quantile_forest import RandomForestQuantileRegressor - alt.data_transformers.disable_max_rows() token = "" @@ -35,8 +35,53 @@ load_existing = True random_state = np.random.RandomState(0) -quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist() -sample_frac = 1 +quantiles = np.linspace(0, 1, num=21, endpoint=True).round(2).tolist() + + +class CrossValidationPipeline(BaseEstimator, RegressorMixin): + """Cross-validation pipeline for scikit-learn compatible models.""" + + def __init__(self, base_model, n_splits=5, random_state=None): + self.base_model = base_model + self.n_splits = n_splits + self.random_state = random_state + self.fold_models = {} + self.fold_indices = {} + + def fit(self, X, y): + """Fit the model using k-fold cross-validation.""" + kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state) + for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)): + X_train, y_train = X.iloc[train_idx], y[train_idx] + model = clone(self.base_model) + model.fit(X_train, y_train) + self.fold_models[fold_idx] = model + self.fold_indices[fold_idx] = test_idx + return self + + def predict(self, X, quantiles=None): + """Predict using the appropriate k-fold model.""" + if quantiles is None: + quantiles = 0.5 + if not isinstance(quantiles, list): + quantiles = [quantiles] + y_pred = np.empty((X.shape[0], len(quantiles)) if len(quantiles) > 1 else (X.shape[0])) + for fold_idx, test_idx in self.fold_indices.items(): + fold_model = self.fold_models[fold_idx] + y_pred[test_idx] = fold_model.predict(X.iloc[test_idx], quantiles=quantiles) + return y_pred + + def 
save(self, filename): + with open(filename, "wb") as f: + joblib.dump(self.__getstate__(), f) + + @classmethod + def load(cls, filename): + with open(filename, "rb") as f: + state = joblib.load(f) + obj = cls(base_model=None) + obj.__setstate__(state) + return obj def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None): @@ -49,21 +94,27 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= median_absolute_error, r2_score, ) - from sklearn.model_selection import train_test_split + from sklearn.pipeline import Pipeline from skops import card + import quantile_forest + from quantile_forest import RandomForestQuantileRegressor + # Load the California Housing dataset. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) + # Define the model pipeline. + qrf = RandomForestQuantileRegressor(random_state=random_state) + pipeline = Pipeline( + [("cv_model", CrossValidationPipeline(qrf, n_splits=5, random_state=random_state))] + ) - # Fit the model. - qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train) + # Fit the model pipeline. + pipeline.fit(X, y) - # Save the model to a file. + # Save the pipeline (with all models) to a file. model_filename = "model.pkl" - with open(model_filename, mode="bw") as f: - pickle.dump(qrf, file=f) + pipeline.named_steps["cv_model"].save(model_filename) # Prepare model repository. if os.path.exists(local_dir): @@ -76,7 +127,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= requirements=[f"quantile-forest={quantile_forest.__version__}"], dst=local_dir, task="tabular-regression", - data=X_train, + data=X, ) # Create a model card. 
@@ -93,19 +144,18 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= "prediction-intervals", ] model_description = ( - "This is a RandomForestQuantileRegressor trained on the California housing dataset." + "This is a RandomForestQuantileRegressor trained on the California Housing dataset." ) limitations = "This model is not ready to be used in production." training_procedure = ( - "The model was trained using default parameters on a standard train-test split." + "The model was trained using default parameters on a 5-fold cross-validation pipeline." ) get_started_code = """
<details> <summary> Click to expand </summary> ```python -import pickle -with open(qrf_pkl_filename, 'rb') as file: - qrf = pickle.load(file) +from examples.plot_qrf_huggingface_inference import CrossValidationPipeline +pipeline = CrossValidationPipeline.load(qrf_pkl_filename) ``` </details>
""" @@ -119,11 +169,11 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= ) # Add performance metrics to the model card. - y_pred = qrf.predict(X_test) - mape = mean_absolute_percentage_error(y_test, y_pred) - mdae = median_absolute_error(y_test, y_pred) - mse = mean_squared_error(y_test, y_pred) - r2 = r2_score(y_test, y_pred) + y_pred = pipeline.predict(X) + mape = mean_absolute_percentage_error(y, y_pred) + mdae = median_absolute_error(y, y_pred) + mse = mean_squared_error(y, y_pred) + r2 = r2_score(y, y_pred) model_card.add_metrics( **{ "Mean Absolute Percentage Error": mape, @@ -161,18 +211,15 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= local_dir = "./local_repo" with tempfile.TemporaryDirectory() as local_dir: hub_utils.download(repo_id=repo_id, dst=local_dir) - with open(f"{local_dir}/{model_filename}", "rb") as file: - qrf = pickle.load(file) + pipeline = CrossValidationPipeline.load(f"{local_dir}/{model_filename}") # Fetch the California Housing dataset and estimate quantiles. 
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) -y_pred = qrf.predict(X, quantiles=quantiles) * 100_000 # predict in dollars - +y_pred = pipeline.predict(X, quantiles=quantiles) * 100_000 # predict in dollars df = ( pd.DataFrame(y_pred, columns=quantiles) .reset_index() - .sample(frac=sample_frac, random_state=random_state) .rename(columns={q: f"q_{q:.3g}" for q in quantiles}) .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right") ) @@ -237,7 +284,7 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="lightgreyred"): .add_params(quantile_val) .transform_calculate(quantile_col="'q_' + quantile") .transform_calculate(value=f"datum[datum.quantile_col]") - .mark_geoshape(stroke="black", strokeWidth=0) + .mark_geoshape(stroke="black", strokeWidth=0.5) .encode( color=alt.Color( "value:Q", @@ -252,7 +299,7 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="lightgreyred"): ) .project(type="mercator") .properties( - title="Quantile Predictions on the California Housing Dataset", + title="Quantile Predictions on the California Housing Dataset by County", height=650, width=650, )