From 3c9c784e8c9cfaa0d4103934d8a539290e614d6d Mon Sep 17 00:00:00 2001
From: Reid Johnson <reidj@zillowgroup.com>
Date: Sat, 14 Sep 2024 04:40:47 -0700
Subject: [PATCH] Update plot_qrf_huggingface_inference.py

---
 examples/plot_qrf_huggingface_inference.py | 121 ++++++++++++++-------
 1 file changed, 84 insertions(+), 37 deletions(-)
diff --git a/examples/plot_qrf_huggingface_inference.py b/examples/plot_qrf_huggingface_inference.py
index 04c2432..36dc25c 100755
--- a/examples/plot_qrf_huggingface_inference.py
+++ b/examples/plot_qrf_huggingface_inference.py
@@ -3,31 +3,31 @@
 ==============================================
 
 This example demonstrates how to download a trained quantile regression forest
-(QRF) model from Hugging Face Hub and use it to estimate new quantiles. In
-this scenario, a QRF has been trained with default parameters on a train-test
-split of the California housing dataset and uploaded to Hugging Face Hub. The
-model is downloaded and used to perform inference across several quantiles for
-each dataset sample. The results are visualized by the latitude and longitude
-of each sample. The model used is available on Hugging Face Hub
+(QRF) model from Hugging Face Hub and use it to estimate quantiles. In this
+scenario, a QRF has been trained with default parameters on the California
+Housing dataset using k-fold cross-validation and uploaded to Hugging Face
+Hub. The model is downloaded and used to perform inference across multiple
+quantiles for each sample in the dataset. The predictions are aggregated by
+county based on the latitude and longitude of each sample and visualized.
+The trained model is available on Hugging Face Hub
 `here <https://huggingface.co/quantile-forest/california-housing-example>`_.
 """
 
 import os
-import pickle
 import shutil
 import tempfile
 
 import altair as alt
 import geopandas as gpd
+import joblib
 import numpy as np
 import pandas as pd
 from sklearn import datasets
+from sklearn.base import BaseEstimator, RegressorMixin, clone
+from sklearn.model_selection import KFold
 from skops import hub_utils
 from vega_datasets import data
 
-import quantile_forest
-from quantile_forest import RandomForestQuantileRegressor
-
 alt.data_transformers.disable_max_rows()
 
 token = "<Hugging Face Access Token>"
@@ -35,8 +35,53 @@
 load_existing = True
 
 random_state = np.random.RandomState(0)
-quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist()
-sample_frac = 1
+quantiles = np.linspace(0, 1, num=21, endpoint=True).round(2).tolist()
+
+
+class CrossValidationPipeline(BaseEstimator, RegressorMixin):
+    """Fit one clone of `base_model` per k-fold split and predict each row out-of-fold."""
+
+    def __init__(self, base_model, n_splits=5, random_state=None):
+        self.base_model = base_model
+        self.n_splits = n_splits
+        self.random_state = random_state
+        self.fold_models = {}  # fold index -> model fitted on that fold's training rows
+        self.fold_indices = {}  # fold index -> positional indices of that fold's test rows
+
+    def fit(self, X, y):
+        """Fit a clone of the base model on each fold's training rows of DataFrame `X`."""
+        self.fold_models, self.fold_indices = {}, {}  # discard folds from any earlier fit
+        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
+        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
+            model = clone(self.base_model)
+            model.fit(X.iloc[train_idx], np.asarray(y)[train_idx])  # positional, not by label
+            self.fold_models[fold_idx] = model
+            self.fold_indices[fold_idx] = test_idx
+        return self
+
+    def predict(self, X, quantiles=None):
+        """Predict out-of-fold quantiles (default median) for the same `X` passed to `fit`."""
+        quantiles = [0.5] if quantiles is None else np.atleast_1d(quantiles).tolist()
+        shape = (X.shape[0], len(quantiles)) if len(quantiles) > 1 else (X.shape[0],)
+        # NaN-fill so rows outside every fold's test set are not uninitialized memory.
+        y_pred = np.full(shape, np.nan)
+        for fold_idx, test_idx in self.fold_indices.items():
+            fold_model = self.fold_models[fold_idx]
+            y_pred[test_idx] = fold_model.predict(X.iloc[test_idx], quantiles=quantiles)
+        return y_pred
+
+    def save(self, filename):
+        """Dump the state returned by `__getstate__` to `filename` with joblib."""
+        joblib.dump(self.__getstate__(), filename)
+
+    @classmethod
+    def load(cls, filename):
+        """Rebuild a pipeline from a state file written by `save`."""
+        # SECURITY: joblib.load unpickles and can run arbitrary code; load trusted files only.
+        state = joblib.load(filename)
+        obj = cls(base_model=None)
+        obj.__setstate__(state)
+        return obj
 
 
 def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None):
@@ -49,21 +94,27 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
         median_absolute_error,
         r2_score,
     )
-    from sklearn.model_selection import train_test_split
+    from sklearn.pipeline import Pipeline
     from skops import card
 
+    import quantile_forest
+    from quantile_forest import RandomForestQuantileRegressor
+
     # Load the California Housing dataset.
     X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
 
-    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
+    # Define the model pipeline.
+    qrf = RandomForestQuantileRegressor(random_state=random_state)
+    pipeline = Pipeline(
+        [("cv_model", CrossValidationPipeline(qrf, n_splits=5, random_state=random_state))]
+    )
 
-    # Fit the model.
-    qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)
+    # Fit the model pipeline.
+    pipeline.fit(X, y)
 
-    # Save the model to a file.
+    # Save the pipeline (with all models) to a file.
     model_filename = "model.pkl"
-    with open(model_filename, mode="bw") as f:
-        pickle.dump(qrf, file=f)
+    pipeline.named_steps["cv_model"].save(model_filename)
 
     # Prepare model repository.
     if os.path.exists(local_dir):
@@ -76,7 +127,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
         requirements=[f"quantile-forest={quantile_forest.__version__}"],
         dst=local_dir,
         task="tabular-regression",
-        data=X_train,
+        data=X,
     )
 
     # Create a model card.
@@ -93,19 +144,18 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
         "prediction-intervals",
     ]
     model_description = (
-        "This is a RandomForestQuantileRegressor trained on the California housing dataset."
+        "This is a RandomForestQuantileRegressor trained on the California Housing dataset."
     )
     limitations = "This model is not ready to be used in production."
     training_procedure = (
-        "The model was trained using default parameters on a standard train-test split."
+        "The model was trained using default parameters on a 5-fold cross-validation pipeline."
     )
     get_started_code = """<details>
 <summary> Click to expand </summary>
 
 ```python
-import pickle
-with open(qrf_pkl_filename, 'rb') as file:
-    qrf = pickle.load(file)
+from examples.plot_qrf_huggingface_inference import CrossValidationPipeline
+pipeline = CrossValidationPipeline.load(qrf_pkl_filename)
 ```
 
 </details>"""
@@ -119,11 +169,11 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
     )
 
     # Add performance metrics to the model card.
-    y_pred = qrf.predict(X_test)
-    mape = mean_absolute_percentage_error(y_test, y_pred)
-    mdae = median_absolute_error(y_test, y_pred)
-    mse = mean_squared_error(y_test, y_pred)
-    r2 = r2_score(y_test, y_pred)
+    y_pred = pipeline.predict(X)
+    mape = mean_absolute_percentage_error(y, y_pred)
+    mdae = median_absolute_error(y, y_pred)
+    mse = mean_squared_error(y, y_pred)
+    r2 = r2_score(y, y_pred)
     model_card.add_metrics(
         **{
             "Mean Absolute Percentage Error": mape,
@@ -161,18 +211,15 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
 local_dir = "./local_repo"
 with tempfile.TemporaryDirectory() as local_dir:
     hub_utils.download(repo_id=repo_id, dst=local_dir)
-    with open(f"{local_dir}/{model_filename}", "rb") as file:
-        qrf = pickle.load(file)
+    pipeline = CrossValidationPipeline.load(f"{local_dir}/{model_filename}")
 
 # Fetch the California Housing dataset and estimate quantiles.
 X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
-y_pred = qrf.predict(X, quantiles=quantiles) * 100_000  # predict in dollars
-
+y_pred = pipeline.predict(X, quantiles=quantiles) * 100_000  # predict in dollars
 
 df = (
     pd.DataFrame(y_pred, columns=quantiles)
     .reset_index()
-    .sample(frac=sample_frac, random_state=random_state)
     .rename(columns={q: f"q_{q:.3g}" for q in quantiles})
     .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
 )
@@ -237,7 +284,7 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="lightgreyred"):
         .add_params(quantile_val)
         .transform_calculate(quantile_col="'q_' + quantile")
         .transform_calculate(value=f"datum[datum.quantile_col]")
-        .mark_geoshape(stroke="black", strokeWidth=0)
+        .mark_geoshape(stroke="black", strokeWidth=0.5)
         .encode(
             color=alt.Color(
                 "value:Q",
@@ -252,7 +299,7 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="lightgreyred"):
         )
         .project(type="mercator")
         .properties(
-            title="Quantile Predictions on the California Housing Dataset",
+            title="Quantile Predictions on the California Housing Dataset by County",
             height=650,
             width=650,
         )