Update example plots

zillow · Aug 16, 2024 · commit baf13bb (1 parent: ceceedb)
Show file tree

Hide file tree

Showing 12 changed files with 98 additions and 107 deletions.
diff --git a/examples/plot_huggingface_model.py b/examples/plot_huggingface_model.py
@@ -21,19 +21,19 @@
 import numpy as np
 import pandas as pd
 from sklearn import datasets
+from sklearn.utils.validation import check_random_state
 from skops import hub_utils
 
 import quantile_forest
 from quantile_forest import RandomForestQuantileRegressor
 
 alt.data_transformers.disable_max_rows()
 
-random_seed = 0
-
 token = "<Hugging Face Access Token>"
 repo_id = "quantile-forest/california-housing-example"
 load_existing = True
 
+random_state = check_random_state(0)
 quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist()
 sample_frac = 1
 
@@ -151,7 +151,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
 
 
 if not load_existing:
-    fit_and_upload_model(token, repo_id, random_state=random_seed)
+    fit_and_upload_model(token, repo_id, random_state=random_state)
 
 # Download the repository locally and load the fitted model.
 model_filename = "model.pkl"
@@ -168,7 +168,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
 df = (
     pd.DataFrame(y_pred, columns=quantiles)
     .reset_index()
-    .sample(frac=sample_frac, random_state=random_seed)
+    .sample(frac=sample_frac, random_state=random_state)
     .melt(id_vars=["index"], var_name="quantile", value_name="value")
     .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
 )
@@ -213,9 +213,9 @@ def plot_quantiles_by_latlon(df, quantiles, color_scheme="cividis"):
             ],
         )
         .properties(
+            title="Quantile Predictions on the California Housing Dataset",
             height=650,
             width=650,
-            title="Quantile Predictions on the California Housing Dataset",
         )
     )
     return chart

diff --git a/examples/plot_predict_custom.py b/examples/plot_predict_custom.py
@@ -22,9 +22,7 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
-random_seed = 0
-rng = check_random_state(random_seed)
-
+random_state = check_random_state(0)
 n_test_samples = 100
 
 
@@ -70,13 +68,14 @@ def predict(reg, X, quantiles=0.5, what=None):
 
 X, y = datasets.load_diabetes(return_X_y=True)
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=n_test_samples, random_state=random_seed
+    X, y, test_size=n_test_samples, random_state=random_state
 )
 
-reg = RandomForestQuantileRegressor(random_state=random_seed).fit(X_train, y_train)
+reg = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)
 
-# Define a user-specified function; here we randomly sample 1000 values with replacement.
-func = lambda x: rng.choice(x, size=1000)
+# Define a user-specified function.
+# Here we randomly sample 1,000 values with replacement from the empirical distribution.
+func = lambda x: random_state.choice(x, size=1000)
 
 # Output array with the user-specified function applied to each sample's empirical distribution.
 y_out = predict(reg, X_test, what=func)
@@ -140,9 +139,9 @@ def plot_ecdf(df):
         .add_params(index_selection)
         .transform_filter(index_selection)
         .properties(
+            title="Empirical Cumulative Distribution Function (ECDF) Plot",
             height=400,
             width=650,
-            title="Empirical Cumulative Distribution Function (ECDF) Plot",
         )
     )
     return chart

diff --git a/examples/plot_proximity_counts.py b/examples/plot_proximity_counts.py
@@ -24,28 +24,26 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
-random_seed = 0
-rng = check_random_state(random_seed)
-
+random_state = check_random_state(0)
 n_test_samples = 25
 noise_std = 0.1
 
 # Load the Digits dataset.
 X, y = datasets.load_digits(return_X_y=True, as_frame=True)
 
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=n_test_samples, random_state=random_seed
+    X, y, test_size=n_test_samples, random_state=random_state
 )
 
 
 def add_gaussian_noise(X, mean=0, std=0.1, random_state=None):
     """Add Gaussian noise to input data."""
-    rng = check_random_state(random_state)
+    random_state = check_random_state(random_state)
 
     scaler = MinMaxScaler()
     X_scaled = scaler.fit_transform(X)
 
-    noise = rng.normal(mean, std, X_scaled.shape)
+    noise = random_state.normal(mean, std, X_scaled.shape)
     X_noisy = np.clip(X_scaled + noise, 0, 1)
 
     X_noisy = scaler.inverse_transform(X_noisy)
@@ -70,20 +68,20 @@ def extract_floats(combined_df, scale=100):
 
 
 # Randomly add noise to the training and test data.
-X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)
-X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)
+X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)
+X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)
 
 # We set `max_samples_leaf=None` to ensure that every sample in the training
 # data is stored in the leaf nodes. By doing this, we allow the model to
 # consider all samples as potential candidates for proximity calculations.
-qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_seed)
+qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)
 qrf.fit(X_train_noisy, X_train)
 
 # Get the proximity counts.
 proximities = qrf.proximity_counts(X_test_noisy)
 
 df_prox = pd.DataFrame(
-    {"prox": [[(j, *p) for j, p in enumerate(proximities[i])] for i in range(len(X_test))]}
+    {"prox": [[(i, *p) for i, p in enumerate(proximities[x])] for x in range(len(X_test))]}
 )
 
 df = (
@@ -142,7 +140,7 @@ def plot_digits_proximities(
     base = alt.Chart(df).add_params(index_selection).transform_filter(index_selection)
 
     chart1 = (
-        base.transform_filter("datum.prox_idx == 0")
+        base.transform_filter("datum.prox_idx == 0")  # filter to one test sample row
         .transform_fold(fold=pixel_cols, as_=["pixel", "value"])
         .transform_calculate(value_clean=f"floor(datum.value / {pixel_scale})")
         .transform_calculate(value_noisy=f"datum.value - (datum.value_clean * {pixel_scale})")
@@ -196,7 +194,7 @@ def plot_digits_proximities(
     )
 
     chart3 = (
-        base.transform_filter("datum.prox_idx == 0")
+        base.transform_filter("datum.prox_idx == 0")  # filter to one test sample row
         .transform_fold(fold=pixel_cols, as_=["pixel", "value"])
         .transform_calculate(value_clean=f"floor(datum.value / {pixel_scale})")
         .transform_calculate(x=pixel_x, y=pixel_y)
@@ -208,12 +206,12 @@ def plot_digits_proximities(
             opacity=alt.condition(alt.datum["value_clean"] == 0, *opacity),
             tooltip=[
                 alt.Tooltip("target:Q", title="Digit"),
-                alt.Tooltip("value_clean:Q", format=",.3f", title="Pixel Value"),
+                alt.Tooltip("value_clean:Q", format=",.0f", title="Pixel Value"),
                 alt.Tooltip("x:Q", title="Pixel X"),
                 alt.Tooltip("y:Q", title="Pixel Y"),
             ],
         )
-        .properties(height=height, width=width, title="Test Digit (original)")
+        .properties(title="Test Digit (original)", height=height, width=width)
     )
 
     chart_spacer = alt.Chart(pd.DataFrame()).mark_rect().properties(width=subplot_dim * 2)

diff --git a/examples/plot_quantile_conformalized.py b/examples/plot_quantile_conformalized.py
@@ -24,12 +24,10 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
-alt.data_transformers.disable_max_rows()
-
 random_seed = 0
-rng = check_random_state(random_seed)
+random_state = check_random_state(random_seed)
 
-n_samples = 1000
+n_samples = 900
 coverages = np.linspace(0, 1, num=11, endpoint=True).round(1).tolist()  # the "coverage level"
 
 strategies = {
@@ -39,11 +37,11 @@
 
 # Load the California Housing Prices dataset.
 X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
-perm = rng.permutation(min(len(X), n_samples))
+perm = random_state.permutation(min(len(X), n_samples))
 X = X.iloc[perm]
 y = y.iloc[perm]
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
 
 
 def sort_y_values(y_test, y_pred, y_pis):
@@ -73,7 +71,7 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
     """QRF (baseline) strategy."""
     quantiles = [alpha / 2, 1 - alpha / 2]
 
-    qrf = RandomForestQuantileRegressor(random_state=random_state)
+    qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)
     qrf.fit(X_train, y_train)
 
     # Calculate the lower and upper quantile values on the test data.
@@ -99,7 +97,7 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
         X_train, y_train, test_size=0.5, random_state=random_state
     )
 
-    qrf = RandomForestQuantileRegressor(random_state=random_state)
+    qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)
     qrf.fit(X_train, y_train)
 
     # Calculate the lower and upper quantile values on the test data.

diff --git a/examples/plot_quantile_example.py b/examples/plot_quantile_example.py
@@ -12,35 +12,36 @@
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from sklearn.utils.validation import check_random_state
 
 from quantile_forest import RandomForestQuantileRegressor
 
-random_seed = 0
+random_state = check_random_state(0)
 n_samples = 1000
 bounds = [0, 10]
 quantiles = [0.025, 0.5, 0.975]
 
 
-def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0):
-    rng = np.random.RandomState(random_seed)
+def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
+    random_state = check_random_state(random_state)
 
-    x = rng.uniform(*bounds, size=n_samples)
+    x = random_state.uniform(*bounds, size=n_samples)
     f = x * np.sin(x)
 
     sigma = 0.25 + x / 10
-    noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2) if add_noise else np.zeros_like(f)
-    y = f + noise
+    noise = random_state.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)
+    y = f + (noise if add_noise else np.zeros_like(f))
 
     return np.atleast_2d(x).T, y
 
 
 # Create noisy data for modeling and non-noisy function data for illustration.
-X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=random_seed)
-X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=random_seed)
+X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0)
+X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_state=0)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
 
-qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_seed)
+qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_state)
 qrf.fit(X_train, y_train)
 
 y_pred_func = qrf.predict(X_func, quantiles=quantiles)
@@ -135,7 +136,7 @@ def plot_fit_and_intervals(df):
     chart = (
         (area_pred + points + line_true + line_pred + blank)
         .resolve_scale(color="independent")
-        .properties(height=400, width=650, title="QRF Predictions vs. Ground Truth on Toy Dataset")
+        .properties(title="QRF Predictions vs. Ground Truth on Toy Dataset", height=400, width=650)
     )
 
     return chart

diff --git a/examples/plot_quantile_extrapolation.py b/examples/plot_quantile_extrapolation.py
@@ -23,26 +23,27 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
-random_seed = 0
-rng = check_random_state(random_seed)
-
+random_state = check_random_state(0)
 n_samples = 500
 bounds = [0, 15]
 extrap_frac = 0.25
 func = lambda x: x * np.sin(x)
 func_str = "f(x) = x sin(x)"
 
 quantiles = [0.025, 0.975, 0.5]
-qrf_params = {"max_samples_leaf": None, "min_samples_leaf": 4, "random_state": random_seed}
+qrf_params = {"max_samples_leaf": None, "min_samples_leaf": 4, "random_state": random_state}
+
 
+def make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0):
+    random_state = check_random_state(random_state)
 
-def make_func_Xy(func, bounds, n_samples, random_seed=0):
-    rng = np.random.RandomState(random_seed)
     x = np.linspace(bounds[0], bounds[1], n_samples)
     f = func(x)
+
     std = 0.01 + np.abs(x - 5.0) / 5.0
-    noise = rng.normal(scale=std)
+    noise = random_state.normal(scale=std) if add_noise else np.zeros_like(f)
     y = f + noise
+
     return np.atleast_2d(x).T, y
 
 
@@ -367,18 +368,18 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, *
 
 
 # Create the full dataset.
-X, y = make_func_Xy(func, bounds, n_samples, random_seed=random_seed)
+X, y = make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0)
 
 # Fit and extrapolate based on train-test split (depending on X).
 extrap_min_idx = int(n_samples * (extrap_frac / 2))
 extrap_max_idx = int(n_samples - (n_samples * (extrap_frac / 2)))
 sort_X = np.argsort(X.squeeze())
 train_indices = np.repeat(False, len(y))
 train_indices[sort_X[extrap_min_idx] : sort_X[extrap_max_idx]] = True
-res = train_test_split(train_indices, rng=rng, **qrf_params)
+res = train_test_split(train_indices, rng=random_state, **qrf_params)
 
 # Get coverages on extrapolated samples.
-args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], rng)
+args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], random_state)
 cov_qrf = get_coverage_qrf(res["qmat"], *args)
 cov_xtr = get_coverage_xtr(res["bounds_list"], *args)
 
@@ -513,7 +514,7 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None
                 chart += blank
             chart = chart.resolve_scale(color="independent")
 
-        chart = chart.properties(height=200, width=300, title=title)
+        chart = chart.properties(title=title, height=200, width=300)
         return chart
 
     kwargs = {"x_domain": [0, 15], "y_domain": [-15, 20]}

diff --git a/examples/plot_quantile_interpolation.py b/examples/plot_quantile_interpolation.py
@@ -15,10 +15,11 @@
 import altair as alt
 import numpy as np
 import pandas as pd
+from sklearn.utils.validation import check_random_state
 
 from quantile_forest import RandomForestQuantileRegressor
 
-random_seed = 0
+random_state = check_random_state(0)
 intervals = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist()
 
 # Create toy dataset.
@@ -31,7 +32,7 @@
     n_estimators=1,
     max_samples_leaf=None,
     bootstrap=False,
-    random_state=random_seed,
+    random_state=random_state,
 )
 qrf.fit(X, y)
 

diff --git a/examples/plot_quantile_intervals.py b/examples/plot_quantile_intervals.py
@@ -17,18 +17,16 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
-random_seed = 0
-rng = check_random_state(random_seed)
-
+random_state = check_random_state(0)
 n_samples = 1000
 
 # Load the California Housing Prices dataset.
 X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
-perm = rng.permutation(min(len(X), n_samples))
+perm = random_state.permutation(min(len(X), n_samples))
 X = X.iloc[perm]
 y = y.iloc[perm]
 
-qrf = RandomForestQuantileRegressor(random_state=random_seed)
+qrf = RandomForestQuantileRegressor(random_state=random_state)
 
 kf = KFold(n_splits=5)
 kf.get_n_splits(X)