Update examples

zillow · Feb 20, 2024 · 6a1c9d0 · 6a1c9d0
1 parent 722f8ee
commit 6a1c9d0
Show file tree

Hide file tree

Showing 8 changed files with 205 additions and 237 deletions.
diff --git a/docs/index.rst b/docs/index.rst
@@ -23,21 +23,21 @@ quantile-forest
       :link-type: ref
       :link-alt: Getting started
 
-      A guide that provides installation instructions and information on testing the package.
+      A guide that provides installation requirements and instructions, as well as procedures for developers.
 
    .. grid-item-card:: User Guide
       :link: user_guide
       :link-type: ref
       :link-alt: User guide
 
-      Check out the User Guide for information on the key concepts behind quantile forests.
+      Information on the key concepts behind quantile forests and how they apply to this package.
 
    .. grid-item-card:: Examples
       :link: example-gallery
       :link-type: ref
       :link-alt: Examples
 
-      General-purpose, introductory and illustrative examples of using quantile forests.
+      Examples that demonstrate the broad applications and introductory concepts of quantile forests.
 
    .. grid-item-card:: API
       :link: api

diff --git a/quantile_forest/tests/examples/plot_quantile_conformalized.py b/quantile_forest/tests/examples/plot_quantile_conformalized.py
@@ -81,7 +81,9 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test):
     # Calculate the point predictions on the test data.
     y_pred = qrf.predict(X_test, quantiles="mean", aggregate_leaves_first=False)
 
-    return sort_y_values(y_test, y_pred, y_pis)
+    y_values = sort_y_values(y_test, y_pred, y_pis)
+
+    return pd.DataFrame(y_values).pipe(lambda x: x * 100_000).assign(strategy="qrf")
 
 
 def cqr_strategy(alpha, X_train, X_test, y_train, y_test):
@@ -121,32 +123,32 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test):
     # Calculate the point predictions on the test data.
     y_pred = qrf.predict(X_test, quantiles="mean", aggregate_leaves_first=False)
 
-    return sort_y_values(y_test, y_pred, y_pis)
+    y_values = sort_y_values(y_test, y_pred, y_pis)
+
+    return pd.DataFrame(y_values).pipe(lambda x: x * 100_000).assign(strategy="cqr")
 
 
 # Get strategy outputs as a data frame.
 args = (alpha, X_train, X_test, y_train, y_test)
-df = pd.concat(
-    [
-        pd.DataFrame(qrf_strategy(*args)).pipe(lambda x: x * 100_000).assign(strategy="qrf"),
-        pd.DataFrame(cqr_strategy(*args)).pipe(lambda x: x * 100_000).assign(strategy="cqr"),
-    ]
-)
+df = pd.concat([qrf_strategy(*args), cqr_strategy(*args)])
 
-# Add coverage and mean width metrics to the data frame.
-df = df.merge(
+# Calculate coverage and width metrics.
+metrics = (
     df.groupby("strategy")
     .apply(
-        lambda x: pd.Series(
+        lambda grp: pd.Series(
             {
-                "coverage": coverage_score(x["y_test"], x["y_pred_low"], x["y_pred_upp"]),
-                "width": mean_width_score(x["y_pred_low"], x["y_pred_upp"]),
+                "coverage": coverage_score(grp["y_test"], grp["y_pred_low"], grp["y_pred_upp"]),
+                "width": mean_width_score(grp["y_pred_low"], grp["y_pred_upp"]),
             }
         )
     )
     .reset_index()
 )
 
+# Merge the metrics into the data frame.
+df = df.merge(metrics, on="strategy", how="left")
+
 
 def plot_prediction_intervals(df, domain):
     click = alt.selection_point(fields=["y_label"], bind="legend")
@@ -222,9 +224,7 @@ def plot_prediction_intervals(df, domain):
     )
 
     text_coverage = (
-        base.transform_aggregate(
-            coverage="mean(coverage)", width="mean(width)", groupby=["strategy"]
-        )
+        base.transform_aggregate(coverage="mean(coverage)", groupby=["strategy"])
         .transform_calculate(
             coverage_text=(
                 f"'Coverage: ' + format({alt.datum['coverage'] * 100}, '.1f') + '%'"
@@ -239,9 +239,7 @@ def plot_prediction_intervals(df, domain):
         )
     )
     text_with = (
-        base.transform_aggregate(
-            coverage="mean(coverage)", width="mean(width)", groupby=["strategy"]
-        )
+        base.transform_aggregate(width="mean(width)", groupby=["strategy"])
         .transform_calculate(
             width_text=f"'Interval Width: ' + format({alt.datum['width']}, '$,d')"
         )

diff --git a/quantile_forest/tests/examples/plot_quantile_example.py b/quantile_forest/tests/examples/plot_quantile_example.py
@@ -8,127 +8,113 @@
 """
 
 import altair as alt
-import pandas as pd
 import numpy as np
+import pandas as pd
 from sklearn.model_selection import train_test_split
 
 from quantile_forest import RandomForestQuantileRegressor
 
 n_samples = 1000
 bounds = [0, 10]
+quantiles = [0.025, 0.5, 0.975]
 
 
-def make_toy_dataset(n_samples, bounds, random_seed=0):
+def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0):
     rng = np.random.RandomState(random_seed)
 
     x = rng.uniform(*bounds, size=n_samples)
     f = x * np.sin(x)
 
     sigma = 0.25 + x / 10
     noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
-    y = f + noise
+    y = f + (noise if add_noise else 0)
 
     return np.atleast_2d(x).T, y
 
 
-X, y = make_toy_dataset(n_samples, bounds)
+# Create noisy data for modeling and non-noisy function data for illustration.
+X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0)
+X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=0)
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 
-X_sampled = np.atleast_2d(np.linspace(*bounds, n_samples)).T
-y_sampled = (X_sampled * np.sin(X_sampled)).reshape(-1)
-
 qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=0)
 qrf.fit(X_train, y_train)
 
-y_pred = qrf.predict(X_sampled, quantiles=[0.025, 0.5, 0.975])
+y_pred_func = qrf.predict(X_func, quantiles=quantiles)
+y_pred_test = qrf.predict(X_test, quantiles=quantiles)
 
-df_train = pd.DataFrame(
+df = pd.DataFrame(
     {
-        "X_sampled": X_sampled.reshape(-1),
-        "y_sampled": y_sampled,
-        "y_pred_low": y_pred[:, 0],
-        "y_pred_med": y_pred[:, 1],
-        "y_pred_upp": y_pred[:, 2],
+        "X": np.concatenate([X_func.reshape(-1), X_test.reshape(-1)]),
+        "y": np.concatenate([y_func, y_test]),
+        "y_pred": np.concatenate([y_pred_func[:, 1], y_pred_test[:, 1]]),
+        "y_pred_low": np.concatenate([y_pred_func[:, 0], y_pred_test[:, 0]]),
+        "y_pred_upp": np.concatenate([y_pred_func[:, 2], y_pred_test[:, 2]]),
+        "test": [False] * len(y_func) + [True] * len(y_test),
     }
 )
 
-df_test = pd.DataFrame(
-    {
-        "X_test": X_test.reshape(-1),
-        "y_test": y_test,
-    }
-)
-
-
-def plot_fit_and_intervals(df_train, df_test):
-    df_train = df_train.copy()
-    df_test = df_test.copy()
-
-    df_train = df_train.assign(
-        **{
-            "y_true_label": "f(x) = x sin(x)",
-            "y_pred_label": "Predicted Median",
-            "y_area_label": "Predicted 95% Interval",
-        }
-    )
-
-    df_test["point_label"] = "Test Observations"
 
+def plot_fit_and_intervals(df):
     points = (
-        alt.Chart(df_test)
+        alt.Chart(df.assign(**{"point_label": "Test Observations"}))
+        .transform_filter(alt.datum["test"])  # filter to test data
         .mark_circle(color="#f2a619")
         .encode(
-            x=alt.X("X_test:Q", scale=alt.Scale(nice=False)),
-            y=alt.Y("y_test:Q", title=""),
+            x=alt.X("X:Q", scale=alt.Scale(nice=False)),
+            y=alt.Y("y:Q", title=""),
             color=alt.Color("point_label:N", scale=alt.Scale(range=["#f2a619"]), title=None),
             tooltip=[
-                alt.Tooltip("X_test:Q", format=",.3f", title="X"),
-                alt.Tooltip("y_test:Q", format=",.3f", title="Y"),
+                alt.Tooltip("X:Q", format=",.3f", title="X"),
+                alt.Tooltip("y:Q", format=",.3f", title="Y"),
             ],
         )
     )
 
     line_true = (
-        alt.Chart(df_train)
+        alt.Chart(df.assign(**{"y_true_label": "f(x) = x sin(x)"}))
+        .transform_filter(~alt.datum["test"])  # filter to noise-free function data (non-test rows)
         .mark_line(color="black", size=3)
         .encode(
-            x=alt.X("X_sampled:Q", scale=alt.Scale(nice=False)),
-            y=alt.Y("y_sampled:Q", title="f(x)"),
+            x=alt.X("X:Q", scale=alt.Scale(nice=False)),
+            y=alt.Y("y:Q", title="f(x)"),
             color=alt.Color("y_true_label:N", scale=alt.Scale(range=["black"]), title=None),
             tooltip=[
-                alt.Tooltip("X_sampled:Q", format=",.3f", title="X"),
-                alt.Tooltip("y_sampled:Q", format=",.3f", title="Y"),
+                alt.Tooltip("X:Q", format=",.3f", title="X"),
+                alt.Tooltip("y:Q", format=",.3f", title="Y"),
             ],
         )
     )
 
     line_pred = (
-        alt.Chart(df_train)
+        alt.Chart(df.assign(**{"y_pred_label": "Predicted Median"}))
+        .transform_filter(~alt.datum["test"])  # filter to noise-free function data (non-test rows)
         .mark_line(color="#006aff", size=5)
         .encode(
-            x=alt.X("X_sampled:Q", scale=alt.Scale(nice=False)),
-            y=alt.Y("y_pred_med:Q", title=""),
+            x=alt.X("X:Q", scale=alt.Scale(nice=False)),
+            y=alt.Y("y_pred:Q", title=""),
             color=alt.Color("y_pred_label:N", scale=alt.Scale(range=["#006aff"]), title=None),
             tooltip=[
-                alt.Tooltip("X_sampled:Q", format=",.3f", title="X"),
-                alt.Tooltip("y_sampled:Q", format=",.3f", title="Y"),
-                alt.Tooltip("y_pred_med:Q", format=",.3f", title="Predicted Y"),
+                alt.Tooltip("X:Q", format=",.3f", title="X"),
+                alt.Tooltip("y:Q", format=",.3f", title="Y"),
+                alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
             ],
         )
     )
 
     area_pred = (
-        alt.Chart(df_train)
+        alt.Chart(df)
+        .transform_filter(~alt.datum["test"])  # filter to noise-free function data (non-test rows)
         .mark_area(color="#e0f2ff", opacity=0.8)
         .encode(
-            x=alt.X("X_sampled:Q", scale=alt.Scale(nice=False), title="x"),
+            x=alt.X("X:Q", scale=alt.Scale(nice=False), title="x"),
             y=alt.Y("y_pred_low:Q", title=""),
             y2=alt.Y2("y_pred_upp:Q", title=None),
             tooltip=[
-                alt.Tooltip("X_sampled:Q", format=",.3f", title="X"),
-                alt.Tooltip("y_sampled:Q", format=",.3f", title="Y"),
-                alt.Tooltip("y_pred_med:Q", format=",.3f", title="Predicted Y"),
+                alt.Tooltip("X:Q", format=",.3f", title="X"),
+                alt.Tooltip("y:Q", format=",.3f", title="Y"),
+                alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"),
                 alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"),
                 alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"),
             ],
@@ -153,5 +139,5 @@ def plot_fit_and_intervals(df_train, df_test):
     return chart
 
 
-chart = plot_fit_and_intervals(df_train, df_test)
+chart = plot_fit_and_intervals(df)
 chart
diff --git a/quantile_forest/tests/examples/plot_quantile_extrapolation.py b/quantile_forest/tests/examples/plot_quantile_extrapolation.py
@@ -58,17 +58,12 @@ def get_test_X(X):
 X_train, y_train = get_train_Xy(X, y, extrap_min_idx, extrap_max_idx)
 X_test = get_test_X(X)
 
-qrf = RandomForestQuantileRegressor(
-    max_samples_leaf=None,
-    min_samples_leaf=10,
-    random_state=0,
-)
+qrf = RandomForestQuantileRegressor(max_samples_leaf=None, min_samples_leaf=10, random_state=0)
 qrf.fit(np.expand_dims(X_train, axis=-1), y_train)
 
 # Get predictions at 95% prediction intervals and median.
 y_pred = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])
 
-
 df = pd.DataFrame(
     {
         "X_true": X,
@@ -77,35 +72,13 @@ def get_test_X(X):
         "y_pred": y_pred[:, 1],
         "y_pred_low": y_pred[:, 0],
         "y_pred_upp": y_pred[:, 2],
-        "train": np.concatenate(
-            [
-                np.zeros(extrap_min_idx),
-                np.ones(extrap_max_idx - extrap_min_idx),
-                np.zeros(len(y) - extrap_max_idx),
-            ]
-        ),
-        "test_left": np.concatenate(
-            [
-                np.ones(extrap_min_idx),
-                np.zeros(len(y) - extrap_min_idx),
-            ]
-        ),
-        "test_right": np.concatenate(
-            [
-                np.zeros(extrap_max_idx),
-                np.ones(len(y) - extrap_max_idx),
-            ]
-        ),
+        "test_left": [True] * extrap_min_idx + [False] * (len(y) - extrap_min_idx),
+        "test_right": [False] * extrap_max_idx + [True] * (len(y) - extrap_max_idx),
     }
 )
 
 
 def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None):
-    df = df.copy()
-
-    df["point_label"] = "Observations"
-    df["line_label"] = func_str
-
     x_scale = None
     if x_domain is not None:
         x_scale = alt.Scale(domain=x_domain, nice=False, padding=0)
@@ -130,7 +103,7 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None
         alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"),
     ]
 
-    base = alt.Chart(df)
+    base = alt.Chart(df.assign(**{"point_label": "Observations", "line_label": func_str}))
 
     points_true = base.mark_circle(size=20).encode(
         x=alt.X("X_true:Q", scale=x_scale, title="x"),
@@ -195,17 +168,17 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None
 }
 
 chart1 = plot_extrapolations(
-    df.query(f"train == 1"), title="Prediction Intervals on Training Data", **kwargs
+    df.query("~(test_left | test_right)"), title="Prediction Intervals on Training Data", **kwargs
 )
 chart2 = alt.layer(
     plot_extrapolations(
-        df.query(f"(train == 1)"),
+        df.query("~(test_left | test_right)"),
         title="Prediction Intervals with Extrapolated Values",
         legend=True,
         **kwargs,
     ).resolve_scale(color="independent"),
-    plot_extrapolations(df.query(f"(test_left == 1)").assign(extrapolate=True), **kwargs),
-    plot_extrapolations(df.query(f"(test_right == 1)").assign(extrapolate=True), **kwargs),
+    plot_extrapolations(df.query("test_left").assign(extrapolate=True), **kwargs),
+    plot_extrapolations(df.query("test_right").assign(extrapolate=True), **kwargs),
 )
 chart = chart1 | chart2
 chart