From e20cbcd2e8b07ac51931f223d8ee31b964be909c Mon Sep 17 00:00:00 2001
From: Reid Johnson
Date: Sat, 14 Sep 2024 04:54:02 -0700
Subject: [PATCH] Update example plots

---
 docs/sphinx_requirements.txt               |   2 +
 examples/plot_qrf_huggingface_inference.py | 194 +++++++++++++++------
 examples/plot_qrf_interpolation_methods.py |   4 +-
 examples/plot_qrf_multitarget.py           |  13 +-
 4 files changed, 149 insertions(+), 64 deletions(-)

diff --git a/docs/sphinx_requirements.txt b/docs/sphinx_requirements.txt
index 826e7ba..494024f 100644
--- a/docs/sphinx_requirements.txt
+++ b/docs/sphinx_requirements.txt
@@ -1,5 +1,6 @@
 sphinx
 altair
+geopandas
 numpydoc
 pillow
 pydata_sphinx_theme
@@ -8,4 +9,5 @@ skops
 sphinxcontrib.bibtex
 sphinx-design
 sphinxext-altair
+vega-datasets
 vl-convert-python
diff --git a/examples/plot_qrf_huggingface_inference.py b/examples/plot_qrf_huggingface_inference.py
index 01f3b47..36dc25c 100755
--- a/examples/plot_qrf_huggingface_inference.py
+++ b/examples/plot_qrf_huggingface_inference.py
@@ -3,28 +3,30 @@
 ==============================================
 
 This example demonstrates how to download a trained quantile regression forest
-(QRF) model from Hugging Face Hub and use it to estimate new quantiles. In
-this scenario, a QRF has been trained with default parameters on a train-test
-split of the California housing dataset and uploaded to Hugging Face Hub. The
-model is downloaded and used to perform inference across several quantiles for
-each dataset sample. The results are visualized by the latitude and longitude
-of each sample. The model used is available on Hugging Face Hub
+(QRF) model from Hugging Face Hub and use it to estimate quantiles. In this
+scenario, a QRF has been trained with default parameters on the California
+Housing dataset using k-fold cross-validation and uploaded to Hugging Face
+Hub. The model is downloaded and used to perform inference across multiple
+quantiles for each sample in the dataset. The predictions are aggregated by
+county based on the latitude and longitude of each sample and visualized.
+The trained model is available on Hugging Face Hub
 `here `_.
""" import os -import pickle import shutil import tempfile import altair as alt +import geopandas as gpd +import joblib import numpy as np import pandas as pd from sklearn import datasets +from sklearn.base import BaseEstimator, RegressorMixin, clone +from sklearn.model_selection import KFold from skops import hub_utils - -import quantile_forest -from quantile_forest import RandomForestQuantileRegressor +from vega_datasets import data alt.data_transformers.disable_max_rows() @@ -33,8 +35,53 @@ load_existing = True random_state = np.random.RandomState(0) -quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist() -sample_frac = 1 +quantiles = np.linspace(0, 1, num=21, endpoint=True).round(2).tolist() + + +class CrossValidationPipeline(BaseEstimator, RegressorMixin): + """Cross-validation pipeline for scikit-learn compatible models.""" + + def __init__(self, base_model, n_splits=5, random_state=None): + self.base_model = base_model + self.n_splits = n_splits + self.random_state = random_state + self.fold_models = {} + self.fold_indices = {} + + def fit(self, X, y): + """Fit the model using k-fold cross-validation.""" + kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state) + for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)): + X_train, y_train = X.iloc[train_idx], y[train_idx] + model = clone(self.base_model) + model.fit(X_train, y_train) + self.fold_models[fold_idx] = model + self.fold_indices[fold_idx] = test_idx + return self + + def predict(self, X, quantiles=None): + """Predict using the appropriate k-fold model.""" + if quantiles is None: + quantiles = 0.5 + if not isinstance(quantiles, list): + quantiles = [quantiles] + y_pred = np.empty((X.shape[0], len(quantiles)) if len(quantiles) > 1 else (X.shape[0])) + for fold_idx, test_idx in self.fold_indices.items(): + fold_model = self.fold_models[fold_idx] + y_pred[test_idx] = fold_model.predict(X.iloc[test_idx], quantiles=quantiles) + return y_pred + + def save(self, filename): + with open(filename, "wb") as f: + joblib.dump(self.__getstate__(), f) + + @classmethod + def load(cls, filename): + with open(filename, "rb") as f: + state = joblib.load(f) + obj = cls(base_model=None) + obj.__setstate__(state) + return obj def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None): @@ -47,21 +94,27 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= median_absolute_error, r2_score, ) - from sklearn.model_selection import train_test_split + from sklearn.pipeline import Pipeline from skops import card + import quantile_forest + from quantile_forest import RandomForestQuantileRegressor + # Load the California Housing dataset. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state) + # Define the model pipeline. + qrf = RandomForestQuantileRegressor(random_state=random_state) + pipeline = Pipeline( + [("cv_model", CrossValidationPipeline(qrf, n_splits=5, random_state=random_state))] + ) - # Fit the model. - qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train) + # Fit the model pipeline. + pipeline.fit(X, y) - # Save the model to a file. + # Save the pipeline (with all models) to a file. model_filename = "model.pkl" - with open(model_filename, mode="bw") as f: - pickle.dump(qrf, file=f) + pipeline.named_steps["cv_model"].save(model_filename) # Prepare model repository. 
     if os.path.exists(local_dir):
@@ -74,7 +127,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
         requirements=[f"quantile-forest={quantile_forest.__version__}"],
         dst=local_dir,
         task="tabular-regression",
-        data=X_train,
+        data=X,
     )
 
     # Create a model card.
@@ -91,19 +144,18 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
         "prediction-intervals",
     ]
     model_description = (
-        "This is a RandomForestQuantileRegressor trained on the California housing dataset."
+        "This is a RandomForestQuantileRegressor trained on the California Housing dataset."
     )
     limitations = "This model is not ready to be used in production."
     training_procedure = (
-        "The model was trained using default parameters on a standard train-test split."
+        "The model was trained with default parameters using a 5-fold cross-validation pipeline."
     )
     get_started_code = """
 Click to expand
 
 ```python
-import pickle
-with open(qrf_pkl_filename, 'rb') as file:
-    qrf = pickle.load(file)
+from examples.plot_qrf_huggingface_inference import CrossValidationPipeline
+pipeline = CrossValidationPipeline.load(qrf_pkl_filename)
 ```
 
""" @@ -117,11 +169,11 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= ) # Add performance metrics to the model card. - y_pred = qrf.predict(X_test) - mape = mean_absolute_percentage_error(y_test, y_pred) - mdae = median_absolute_error(y_test, y_pred) - mse = mean_squared_error(y_test, y_pred) - r2 = r2_score(y_test, y_pred) + y_pred = pipeline.predict(X) + mape = mean_absolute_percentage_error(y, y_pred) + mdae = median_absolute_error(y, y_pred) + mse = mean_squared_error(y, y_pred) + r2 = r2_score(y, y_pred) model_card.add_metrics( **{ "Mean Absolute Percentage Error": mape, @@ -159,23 +211,21 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state= local_dir = "./local_repo" with tempfile.TemporaryDirectory() as local_dir: hub_utils.download(repo_id=repo_id, dst=local_dir) - with open(f"{local_dir}/{model_filename}", "rb") as file: - qrf = pickle.load(file) + pipeline = CrossValidationPipeline.load(f"{local_dir}/{model_filename}") # Fetch the California Housing dataset and estimate quantiles. X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True) -y_pred = qrf.predict(X, quantiles=quantiles) * 100_000 # predict in dollars +y_pred = pipeline.predict(X, quantiles=quantiles) * 100_000 # predict in dollars df = ( pd.DataFrame(y_pred, columns=quantiles) .reset_index() - .sample(frac=sample_frac, random_state=random_state) - .melt(id_vars=["index"], var_name="quantile", value_name="value") + .rename(columns={q: f"q_{q:.3g}" for q in quantiles}) .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right") ) -def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"): +def plot_quantiles_by_latlon(df, quantiles, color_scheme="lightgreyred"): """Plot quantile predictions on California Housing dataset by lat/lon.""" # Slider for varying the displayed quantile estimates. slider = alt.binding_range( @@ -187,35 +237,69 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"): quantile_val = alt.param(name="quantile", value=0.5, bind=slider) + # Load the US counties data and filter to California counties. 
+    ca_counties = (
+        gpd.read_file(data.us_10m.url, layer="counties")
+        .set_crs("EPSG:4326")
+        .assign(**{"county_fips": lambda x: x["id"].astype(int)})
+        .drop(columns=["id"])
+        .query("(county_fips >= 6000) & (county_fips < 7000)")
+    )
+
+    x_min = df[[f"q_{q:.3g}" for q in quantiles]].min().min()
+    x_max = df[[f"q_{q:.3g}" for q in quantiles]].max().max()
+
+    df = (
+        gpd.GeoDataFrame(
+            df, geometry=gpd.points_from_xy(df["Longitude"], df["Latitude"]), crs="EPSG:4326"
+        )
+        .sjoin(ca_counties, how="right")
+        .drop(columns=["index_left0"])
+        .assign(
+            **{f"w_q_{q:.3g}": lambda x, q=q: x[f"q_{q:.3g}"] * x["Population"] for q in quantiles}
+        )
+    )
+
+    grouped = (
+        df.groupby("county_fips")
+        .agg({**{f"w_q_{q:.3g}": "sum" for q in quantiles}, **{"Population": "sum"}})
+        .reset_index()
+        .assign(
+            **{f"q_{q:.3g}": lambda x, q=q: x[f"w_q_{q:.3g}"] / x["Population"] for q in quantiles}
+        )
+    )
+
+    df = (
+        df[["county_fips", "Latitude", "Longitude", "geometry"]]
+        .drop_duplicates(subset=["county_fips"])
+        .merge(
+            grouped[["county_fips", "Population"] + [f"q_{q:.3g}" for q in quantiles]],
+            on="county_fips",
+            how="left",
+        )
+    )
+
     chart = (
         alt.Chart(df)
         .add_params(quantile_val)
-        .transform_filter("datum.quantile == quantile")
-        .mark_circle()
+        .transform_calculate(quantile_col="'q_' + quantile")
+        .transform_calculate(value="datum[datum.quantile_col]")
+        .mark_geoshape(stroke="black", strokeWidth=0.5)
        .encode(
-            x=alt.X(
-                "Longitude:Q",
-                axis=alt.Axis(tickMinStep=1, format=".1f"),
-                scale=alt.Scale(zero=False),
-                title="Longitude",
-            ),
-            y=alt.Y(
-                "Latitude:Q",
-                axis=alt.Axis(tickMinStep=1, format=".1f"),
-                scale=alt.Scale(zero=False),
-                title="Latitude",
+            color=alt.Color(
+                "value:Q",
+                scale=alt.Scale(domain=[x_min, x_max], scheme=color_scheme),
+                title="Prediction",
             ),
-            color=alt.Color("value:Q", scale=alt.Scale(scheme=color_scheme), title="Prediction"),
-            size=alt.Size("Population:Q"),
             tooltip=[
-                alt.Tooltip("index:N", title="Row ID"),
-                alt.Tooltip("Latitude:Q", format=".2f", title="Latitude"),
-                alt.Tooltip("Longitude:Q", format=".2f", title="Longitude"),
+                alt.Tooltip("county_fips:N", title="County FIPS"),
+                alt.Tooltip("Population:Q", format=",.0f", title="Population"),
                 alt.Tooltip("value:Q", format="$,.0f", title="Predicted Value"),
             ],
         )
+        .project(type="mercator")
         .properties(
-            title="Quantile Predictions on the California Housing Dataset",
+            title="Quantile Predictions on the California Housing Dataset by County",
             height=650,
             width=650,
         )
diff --git a/examples/plot_qrf_interpolation_methods.py b/examples/plot_qrf_interpolation_methods.py
index 3120d37..576c34d 100755
--- a/examples/plot_qrf_interpolation_methods.py
+++ b/examples/plot_qrf_interpolation_methods.py
@@ -53,8 +53,8 @@
             "method": ["Actual"] * len(y),
             "X": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())],
             "y_pred": y.tolist(),
-            "y_pred_low": y.tolist(),
-            "y_pred_upp": y.tolist(),
+            "y_pred_low": [None] * len(y),
+            "y_pred_upp": [None] * len(y),
             "quantile_low": [None] * len(y),
             "quantile_upp": [None] * len(y),
         }
diff --git a/examples/plot_qrf_multitarget.py b/examples/plot_qrf_multitarget.py
index 1e16ffb..25c436c 100644
--- a/examples/plot_qrf_multitarget.py
+++ b/examples/plot_qrf_multitarget.py
@@ -23,7 +23,7 @@
 quantiles = np.linspace(0, 1, num=41, endpoint=True).round(3).tolist()
 
 # Define functions that generate targets; each function maps to one target.
-funcs = [
+target_funcs = [
     {
         "signal": lambda x: np.log1p(x + 1),
         "noise": lambda x: np.log1p(x) * random_state.uniform(size=len(x)),
@@ -36,18 +36,19 @@
     },
 ]
 
-legend = {k: v for f in funcs for k, v in f["legend"].items()}
-
 
 def make_funcs_Xy(funcs, n_samples, bounds):
-    """Make a dataset from specified function(s) with signal and noise."""
+    """Make a dataset from specified function(s)."""
     x = np.linspace(*bounds, n_samples)
     y = np.empty((len(x), len(funcs)))
     for i, func in enumerate(funcs):
-        y[:, i] = func["signal"](x) + func["noise"](x)
+        y[:, i] = func(x)
     return np.atleast_2d(x).T, y
 
 
+funcs = [lambda x, f=f: f["signal"](x) + f["noise"](x) for f in target_funcs]
+legend = {k: v for f in target_funcs for k, v in f["legend"].items()}
+
 # Create a dataset with multiple target variables.
 X, y = make_funcs_Xy(funcs, n_samples, bounds)
 
@@ -63,7 +64,6 @@ def make_funcs_Xy(funcs, n_samples, bounds):
     {
         "x": np.tile(X.squeeze(), len(funcs)),
         "y": y.reshape(-1, order="F"),
-        "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]),
         "y_pred": np.concatenate([y_pred[:, i, len(quantiles) // 2] for i in range(len(funcs))]),
         "target": np.concatenate([[str(i)] * len(X) for i in range(len(funcs))]),
         **{f"q_{q_i:.3g}": y_i.ravel() for q_i, y_i in zip(quantiles, y_pred.T)},
@@ -95,7 +95,6 @@ def plot_multitargets(df, legend):
             alt.Tooltip("target:N", title="Target"),
             alt.Tooltip("x:Q", format=",.3f", title="X"),
             alt.Tooltip("y:Q", format=",.3f", title="Y"),
-            alt.Tooltip("y_true:Q", format=",.3f", title="Y"),
             alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
             alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"),
             alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"),
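
A usage note for this patch: `CrossValidationPipeline.predict` routes each row through the fold model that held it out, using the fold indices stored at fit time, so it assumes it is given the same dataset, in the same row order, that was passed to `fit`, which is exactly how the example uses it. The inference path the patch sets up can be sketched as below. This is a minimal, untested sketch: the `repo_id` value is a placeholder, while `hub_utils.download`, `CrossValidationPipeline`, and the `model.pkl` filename all come from the example above.

```python
# Minimal sketch of the inference path set up by this patch. Assumptions:
# `repo_id` is a placeholder for the uploaded Hugging Face repository, and
# "model.pkl" matches the filename written by fit_and_upload_model().
import tempfile

from sklearn import datasets
from skops import hub_utils

from examples.plot_qrf_huggingface_inference import CrossValidationPipeline

repo_id = "<username>/<repo-name>"  # placeholder; set to the uploaded repo

with tempfile.TemporaryDirectory() as local_dir:
    # Fetch the repository contents, including the saved pipeline state.
    hub_utils.download(repo_id=repo_id, dst=local_dir)
    pipeline = CrossValidationPipeline.load(f"{local_dir}/model.pkl")

# Out-of-fold estimates: each sample is predicted by the fold model that
# did not see it during fitting, at the requested quantiles.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
y_pred = pipeline.predict(X, quantiles=[0.25, 0.5, 0.75])
print(y_pred.shape)  # (n_samples, 3)
```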