From 18f3275e5a24822605f649d8b6c690d0bb97cd44 Mon Sep 17 00:00:00 2001
From: Reid Johnson <reid.johnson@gmail.com>
Date: Sun, 28 Jul 2024 17:38:04 -0700
Subject: [PATCH] Update plots (#68)

---
 examples/plot_huggingface_model.py      | 63 ++++++++++---------
 examples/plot_predict_custom.py         | 62 ++++++++++++-------
 examples/plot_quantile_interpolation.py | 82 +++++++++++++++++--------
 examples/plot_quantile_multioutput.py   | 56 ++++++++++++-----
 examples/plot_treeshap_example.py       | 38 +++++-------
 5 files changed, 187 insertions(+), 114 deletions(-)

diff --git a/examples/plot_huggingface_model.py b/examples/plot_huggingface_model.py
index 2d8a81f..9dd49ec 100755
--- a/examples/plot_huggingface_model.py
+++ b/examples/plot_huggingface_model.py
@@ -8,6 +8,8 @@
 housing dataset and uploaded to Hugging Face Hub. The model is downloaded and
 inference is performed over several quantiles for each instance in the dataset.
 The estimates are visualized by the latitude and longitude of each instance.
+The model used is availabe on Hugging Face Hub:
+https://huggingface.co/quantile-forest/california-housing-example
 """
 
 import os
@@ -29,6 +31,9 @@
 repo_id = "quantile-forest/california-housing-example"
 load_existing = True
 
+quantiles = list((np.arange(5) * 25) / 100)
+sample_frac = 1
+
 
 def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
     """Function used to fit the model and upload it to Hugging Face Hub."""
@@ -139,6 +144,35 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
     shutil.rmtree(local_dir)
 
 
+if not load_existing:
+    fit_and_upload_model(token, repo_id)
+
+# Download the repository locally.
+local_dir = "./local_repo"
+if os.path.exists(local_dir):
+    shutil.rmtree(local_dir)
+hub_utils.download(repo_id=repo_id, dst=local_dir)
+
+# Load the fitted model.
+model_filename = "model.pkl"
+with open(f"{local_dir}/{model_filename}", "rb") as file:
+    qrf = pickle.load(file)
+shutil.rmtree(local_dir)
+
+# Estimate quantiles.
+X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
+y_pred = qrf.predict(X, quantiles=quantiles) * 100_000  # predict in dollars
+
+df = (
+    pd.DataFrame(y_pred, columns=quantiles)
+    .reset_index()
+    .sample(frac=sample_frac, random_state=0)
+    .melt(id_vars=["index"], var_name="quantile", value_name="value")
+    .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
+)
+print(df)
+
+
 def plot_quantiles_by_latlon(df, quantiles):
     # Slider for varying the displayed quantile estimates.
     slider = alt.binding_range(
@@ -172,7 +206,7 @@ def plot_quantiles_by_latlon(df, quantiles):
                 scale=alt.Scale(zero=False),
                 title="Latitude",
             ),
-            color=alt.Color("value:Q", scale=alt.Scale(scheme="viridis"), title="Prediction"),
+            color=alt.Color("value:Q", scale=alt.Scale(scheme="cividis"), title="Prediction"),
             size=alt.Size("Population:Q"),
             tooltip=[
                 alt.Tooltip("index:N", title="Row ID"),
@@ -190,32 +224,5 @@ def plot_quantiles_by_latlon(df, quantiles):
     return chart
 
 
-if not load_existing:
-    fit_and_upload_model(token, repo_id)
-
-# Download the repository locally.
-local_dir = "./local_repo"
-if os.path.exists(local_dir):
-    shutil.rmtree(local_dir)
-hub_utils.download(repo_id=repo_id, dst=local_dir)
-
-# Load the fitted model.
-model_filename = "model.pkl"
-with open(f"{local_dir}/{model_filename}", "rb") as file:
-    qrf = pickle.load(file)
-shutil.rmtree(local_dir)
-
-# Estimate quantiles.
-quantiles = list((np.arange(11) * 10) / 100)
-X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
-y_pred = qrf.predict(X, quantiles=quantiles) * 100_000  # predict in dollars
-
-df = (
-    pd.DataFrame(y_pred, columns=quantiles)
-    .reset_index()
-    .melt(id_vars=["index"], var_name="quantile", value_name="value")
-    .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
-)
-
 chart = plot_quantiles_by_latlon(df, quantiles)
 chart
diff --git a/examples/plot_predict_custom.py b/examples/plot_predict_custom.py
index 2486299..03e6c20 100755
--- a/examples/plot_predict_custom.py
+++ b/examples/plot_predict_custom.py
@@ -23,6 +23,8 @@
 
 np.random.seed(0)
 
+n_test_samples = 10
+
 
 def predict(reg, X, quantiles=0.5, what=None):
     """Custom prediction method that allows user-specified function.
@@ -65,7 +67,7 @@ def predict(reg, X, quantiles=0.5, what=None):
 
 
 X, y = datasets.load_diabetes(return_X_y=True)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test_samples, random_state=0)
 
 reg = RandomForestQuantileRegressor().fit(X_train, y_train)
 
@@ -75,30 +77,41 @@ def predict(reg, X, quantiles=0.5, what=None):
 # Output array with the user-specified function applied to each sample's empirical distribution.
 y_out = predict(reg, X_test, what=func)
 
-# Calculate the ECDF from output array.
-y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out]
-
-df = pd.DataFrame(
-    {
-        "y_value": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])),
-        "y_value2": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))[1:] + [np.nan],
-        "probability": list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf])),
-    }
-)
+dfs = []
+for idx in range(n_test_samples):
+    # Calculate the ECDF from output array.
+    y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out[idx].reshape(1, -1)]
+    n_quantiles = len(list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])))
+
+    df_i = pd.DataFrame(
+        {
+            "y_val": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf])),
+            "y_val2": list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))[1:] + [np.nan],
+            "proba": list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf])),
+            "sample_idx": [idx] * n_quantiles,
+        }
+    )
+    dfs.append(df_i)
+df = pd.concat(dfs)
 
 
 def plot_ecdf(df):
+    min_idx = df["sample_idx"].min()
+    max_idx = df["sample_idx"].max()
+    slider = alt.binding_range(min=min_idx, max=max_idx, step=1, name="Sample Index:")
+    sample_selection = alt.param(value=0, bind=slider, name="sample_idx")
+
     tooltip = [
-        alt.Tooltip("y_value", title="Response Value"),
-        alt.Tooltip("probability", title="Probability"),
+        alt.Tooltip("y_val", title="Response Value"),
+        alt.Tooltip("proba", title="Probability"),
     ]
 
     circles = (
         alt.Chart(df)
         .mark_circle(color="#006aff", opacity=1, size=50)
         .encode(
-            x=alt.X("y_value", title="Response Value"),
-            y=alt.Y("probability", title="Probability"),
+            x=alt.X("y_val", title="Response Value"),
+            y=alt.Y("proba", title="Probability"),
             tooltip=tooltip,
         )
     )
@@ -107,17 +120,22 @@ def plot_ecdf(df):
         alt.Chart(df)
         .mark_line(color="#006aff", size=2)
         .encode(
-            x=alt.X("y_value", title="Response Value"),
-            x2=alt.X2("y_value2"),
-            y=alt.Y("probability", title="Probability"),
+            x=alt.X("y_val", title="Response Value"),
+            x2=alt.X2("y_val2"),
+            y=alt.Y("proba", title="Probability"),
             tooltip=tooltip,
         )
     )
 
-    chart = (circles + lines).properties(
-        height=400,
-        width=650,
-        title="Empirical Cumulative Distribution Function (ECDF) Plot",
+    chart = (
+        (circles + lines)
+        .transform_filter(alt.datum.sample_idx == sample_selection)
+        .add_params(sample_selection)
+        .properties(
+            height=400,
+            width=650,
+            title="Empirical Cumulative Distribution Function (ECDF) Plot",
+        )
     )
     return chart
 
diff --git a/examples/plot_quantile_interpolation.py b/examples/plot_quantile_interpolation.py
index 4b7588c..a34bd5d 100755
--- a/examples/plot_quantile_interpolation.py
+++ b/examples/plot_quantile_interpolation.py
@@ -17,17 +17,19 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
+intervals = list(np.arange(101) / 100)
+
 # Create toy dataset.
 X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]])
 y = np.array([-2, -1, 0, 1, 2])
 
-est = RandomForestQuantileRegressor(
+qrf = RandomForestQuantileRegressor(
     n_estimators=1,
     max_samples_leaf=None,
     bootstrap=False,
     random_state=0,
 )
-est.fit(X, y)
+qrf.fit(X, y)
 
 interpolations = {
     "Linear": "#006aff",
@@ -41,30 +43,43 @@
 legend = {"Actual": "#000000"}
 legend.update(interpolations)
 
-# Initialize data with actual values.
-data = {
-    "method": ["Actual"] * len(y),
-    "X": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())],
-    "y_pred": y.tolist(),
-    "y_pred_low": y.tolist(),
-    "y_pred_upp": y.tolist(),
-}
-
-# Populate data based on prediction results with different interpolations.
-for interpolation in interpolations:
-    # Get predictions at 95% prediction intervals and median.
-    y_pred = est.predict(X, quantiles=[0.025, 0.5, 0.975], interpolation=interpolation.lower())
-
-    data["method"].extend([interpolation] * len(y))
-    data["X"].extend([f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())])
-    data["y_pred"].extend(y_pred[:, 1])
-    data["y_pred_low"].extend(y_pred[:, 0])
-    data["y_pred_upp"].extend(y_pred[:, 2])
-
-df = pd.DataFrame(data)
+dfs = []
+for idx, interval in enumerate(intervals):
+    # Initialize data with actual values.
+    data = {
+        "method": ["Actual"] * len(y),
+        "X": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())],
+        "y_pred": y.tolist(),
+        "y_pred_low": y.tolist(),
+        "y_pred_upp": y.tolist(),
+        "quantile_low": [None] * len(y),
+        "quantile_upp": [None] * len(y),
+    }
+
+    # Populate data based on prediction results with different interpolations.
+    for interpolation in interpolations:
+        # Get predictions at median and prediction intervals.
+        quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)]
+        y_pred = qrf.predict(X, quantiles=quantiles, interpolation=interpolation.lower())
+
+        data["method"].extend([interpolation] * len(y))
+        data["X"].extend([f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())])
+        data["y_pred"].extend(y_pred[:, 0])
+        data["y_pred_low"].extend(y_pred[:, 1])
+        data["y_pred_upp"].extend(y_pred[:, 2])
+        data["quantile_low"].extend([quantiles[1]] * len(y))
+        data["quantile_upp"].extend([quantiles[2]] * len(y))
+
+    df_i = pd.DataFrame(data)
+    dfs.append(df_i)
+df = pd.concat(dfs)
 
 
 def plot_interpolations(df, legend):
+    slider = alt.binding_range(min=0, max=1, step=0.01, name="Prediction Interval:")
+    interval_selection = alt.param(value=0.95, bind=slider, name="interval")
+    interval_tol = 0.001
+
     click = alt.selection_point(fields=["method"], bind="legend")
 
     color = alt.condition(
@@ -76,9 +91,11 @@ def plot_interpolations(df, legend):
     tooltip = [
         alt.Tooltip("method:N", title="Method"),
         alt.Tooltip("X:N", title="X Values"),
-        alt.Tooltip("y_pred:N", format=".3f", title="Median Y Value"),
-        alt.Tooltip("y_pred_low:N", format=".3f", title="Lower Y Value"),
-        alt.Tooltip("y_pred_upp:N", format=".3f", title="Upper Y Value"),
+        alt.Tooltip("y_pred:Q", format=".3f", title="Predicted Y"),
+        alt.Tooltip("y_pred_low:Q", format=".3f", title="Predicted Lower Y"),
+        alt.Tooltip("y_pred_upp:Q", format=".3f", title="Predicted Upper Y"),
+        alt.Tooltip("quantile_low:Q", format=".3f", title="Lower Quantile"),
+        alt.Tooltip("quantile_upp:Q", format=".3f", title="Upper Quantile"),
     ]
 
     point = (
@@ -116,7 +133,18 @@ def plot_interpolations(df, legend):
 
     chart = (
         (area + point)
-        .add_params(click)
+        .transform_filter(
+            (
+                (alt.datum.quantile_low >= (0.5 - interval_selection / 2 - interval_tol))
+                & (alt.datum.quantile_low <= (0.5 - interval_selection / 2 + interval_tol))
+            )
+            | (
+                (alt.datum.quantile_upp >= (0.5 + interval_selection / 2 - interval_tol))
+                & (alt.datum.quantile_upp <= (0.5 + interval_selection / 2 + interval_tol))
+            )
+            | (alt.datum.method == "Actual")
+        )
+        .add_params(interval_selection, click)
         .properties(height=400)
         .facet(
             column=alt.Column(
diff --git a/examples/plot_quantile_multioutput.py b/examples/plot_quantile_multioutput.py
index 4766a77..1524931 100755
--- a/examples/plot_quantile_multioutput.py
+++ b/examples/plot_quantile_multioutput.py
@@ -6,7 +6,7 @@
 regressor for multiple target variables. For each target, multiple quantiles
 can be estimated simultaneously. In this example, the target variable has
 two output values for each sample, with a single regressor used to estimate
-three quantiles (the median and 95% interval) for each target output.
+three quantiles (the median and interval points) for each target.
 """
 
 import altair as alt
@@ -20,6 +20,7 @@
 
 n_samples = 2500
 bounds = [0, 100]
+intervals = list(np.arange(21) / 20)
 
 # Define functions that generate targets; each function maps to one target.
 funcs = [
@@ -55,23 +56,34 @@ def make_func_Xy(funcs, bounds, n_samples):
 qrf = RandomForestQuantileRegressor(max_samples_leaf=None, max_depth=4, random_state=0)
 qrf.fit(X_train, y_train)  # fit on all of the targets simultaneously
 
-# Get multiple-output predictions at 95% prediction intervals and median.
-y_pred = qrf.predict(X, quantiles=[0.025, 0.5, 0.975])
-
-df = pd.DataFrame(
-    {
-        "x": np.tile(X.squeeze(), len(funcs)),
-        "y": y.reshape(-1, order="F"),
-        "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]),
-        "y_pred": np.concatenate([y_pred[:, i, 1] for i in range(len(funcs))]),
-        "y_pred_low": np.concatenate([y_pred[:, i, 0] for i in range(len(funcs))]),
-        "y_pred_upp": np.concatenate([y_pred[:, i, 2] for i in range(len(funcs))]),
-        "target": np.concatenate([[f"{i}"] * len(X) for i in range(len(funcs))]),
-    }
-)
+dfs = []
+for idx, interval in enumerate(intervals):
+    # Get multiple-output predictions at median and prediction intervals.
+    quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)]
+    y_pred = qrf.predict(X, quantiles=quantiles)
+
+    df_i = pd.DataFrame(
+        {
+            "x": np.tile(X.squeeze(), len(funcs)),
+            "y": y.reshape(-1, order="F"),
+            "y_true": np.concatenate([f["signal"](X.squeeze()) for f in funcs]),
+            "y_pred": np.concatenate([y_pred[:, i, 0] for i in range(len(funcs))]),
+            "y_pred_low": np.concatenate([y_pred[:, i, 1] for i in range(len(funcs))]),
+            "y_pred_upp": np.concatenate([y_pred[:, i, 2] for i in range(len(funcs))]),
+            "quantile_low": np.concatenate([[quantiles[1]] * len(X) for i in range(len(funcs))]),
+            "quantile_upp": np.concatenate([[quantiles[2]] * len(X) for i in range(len(funcs))]),
+            "target": np.concatenate([[f"{i}"] * len(X) for i in range(len(funcs))]),
+        }
+    )
+    dfs.append(df_i)
+df = pd.concat(dfs)
 
 
 def plot_multioutputs(df, legend):
+    slider = alt.binding_range(min=0, max=1, step=0.05, name="Prediction Interval:")
+    interval_selection = alt.param(value=0.95, bind=slider, name="interval")
+    interval_tol = 0.001
+
     click = alt.selection_point(fields=["target"], bind="legend")
 
     color = alt.condition(
@@ -94,6 +106,8 @@ def plot_multioutputs(df, legend):
         alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
         alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"),
         alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"),
+        alt.Tooltip("quantile_low:Q", format=".3f", title="Lower Quantile"),
+        alt.Tooltip("quantile_upp:Q", format=".3f", title="Upper Quantile"),
     ]
 
     points = (
@@ -132,7 +146,17 @@ def plot_multioutputs(df, legend):
 
     chart = (
         (points + area + line)
-        .add_params(click)
+        .transform_filter(
+            (
+                (alt.datum.quantile_low >= (0.5 - interval_selection / 2 - interval_tol))
+                & (alt.datum.quantile_low <= (0.5 - interval_selection / 2 + interval_tol))
+            )
+            | (
+                (alt.datum.quantile_upp >= (0.5 + interval_selection / 2 - interval_tol))
+                & (alt.datum.quantile_upp <= (0.5 + interval_selection / 2 + interval_tol))
+            )
+        )
+        .add_params(interval_selection, click)
         .configure_range(category=alt.RangeScheme(list(legend.values())))
         .properties(height=400, width=650, title="Multi-target Prediction Intervals")
     )
diff --git a/examples/plot_treeshap_example.py b/examples/plot_treeshap_example.py
index b6c6275..d668f05 100644
--- a/examples/plot_treeshap_example.py
+++ b/examples/plot_treeshap_example.py
@@ -17,6 +17,10 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
+n_samples = 500
+test_idx = 0
+quantiles = list((np.arange(11) * 10) / 100)
+
 
 def get_shap_values(qrf, X, quantile=0.5, **kwargs):
     # Define a custom tree model.
@@ -57,39 +61,31 @@ def get_shap_value_by_index(shap_values, index):
 
 # Load the California Housing Prices dataset.
 X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
-X = X.iloc[:500]
-y = y[:500]
+X = X.iloc[:n_samples]
+y = y[:n_samples]
 y *= 100_000  # convert to dollars
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=0)
 
 qrf = RandomForestQuantileRegressor(random_state=0)
 qrf.fit(X_train, y_train)
 
-test_idx = 0
-quantiles = list((np.arange(11) * 10) / 100)
-
-dfs = []
-for quantile in quantiles:
-    # Get the SHAP values for the test data.
-    shap_values = get_shap_values(qrf, X_test, quantile=quantile)
-
-    # Get the SHAP values for a particular test instance (by index).
-    shap_values_by_idx = get_shap_value_by_index(shap_values, test_idx)
-
-    dfs.append(
+df = pd.concat(
+    [
         pd.DataFrame(
             {
                 "feature": [f"{X.iloc[test_idx, i]} = {X.columns[i]}" for i in range(X.shape[1])],
                 "feature_name": X.columns,
-                "shap_value": shap_values_by_idx.values,
-                "abs_shap_value": abs(shap_values_by_idx.values),
-                "base_value": shap_values_by_idx.base_values,
-                "model_output": shap_values_by_idx.base_values + sum(shap_values_by_idx.values),
-                "quantile": quantile,
+                "shap_value": shap_i.values,
+                "abs_shap_value": abs(shap_i.values),
+                "base_value": shap_i.base_values,
+                "model_output": shap_i.base_values + sum(shap_i.values),
+                "quantile": q,
             }
         )
-    )
-df = pd.concat(dfs)
+        for q in quantiles
+        for shap_i in [get_shap_value_by_index(get_shap_values(qrf, X_test, quantile=q), test_idx)]
+    ]
+)
 
 
 def plot_shap_waterfall_with_quantiles(df):