Update example plots

zillow · Aug 7, 2024 · 6f81ee7 · 6f81ee7
1 parent b95444d
commit 6f81ee7
Show file tree

Hide file tree

Showing 12 changed files with 149 additions and 106 deletions.
diff --git a/examples/plot_huggingface_model.py b/examples/plot_huggingface_model.py
@@ -27,15 +27,17 @@
 
 alt.data_transformers.disable_max_rows()
 
+random_seed = 0
+
 token = "<Hugging Face Access Token>"
 repo_id = "quantile-forest/california-housing-example"
 load_existing = True
 
-quantiles = np.arange(0, 1.25, 0.25).round(2).tolist()
+quantiles = np.linspace(0, 1, num=5, endpoint=True).round(2).tolist()
 sample_frac = 1
 
 
-def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
+def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=None):
     """Function used to fit the model and upload it to Hugging Face Hub."""
     from pathlib import Path
 
@@ -49,10 +51,10 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
     from skops import card
 
     X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
 
     # Fit the model.
-    qrf = RandomForestQuantileRegressor(random_state=0).fit(X_train, y_train)
+    qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)
 
     # Save the model to a file.
     model_filename = "model.pkl"
@@ -145,7 +147,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
 
 
 if not load_existing:
-    fit_and_upload_model(token, repo_id)
+    fit_and_upload_model(token, repo_id, random_state=random_seed)
 
 # Download the repository locally.
 local_dir = "./local_repo"
@@ -166,7 +168,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo"):
 df = (
     pd.DataFrame(y_pred, columns=quantiles)
     .reset_index()
-    .sample(frac=sample_frac, random_state=0)
+    .sample(frac=sample_frac, random_state=random_seed)
     .melt(id_vars=["index"], var_name="quantile", value_name="value")
     .merge(X[["Latitude", "Longitude", "Population"]].reset_index(), on="index", how="right")
 )
@@ -178,7 +180,7 @@ def plot_quantiles_by_latlon(df, quantiles):
         min=0,
         max=1,
         step=0.5 if len(quantiles) == 1 else 1 / (len(quantiles) - 1),
-        name="Quantile: ",
+        name="Predicted Quantile: ",
     )
 
     q_val = alt.selection_point(
@@ -217,7 +219,7 @@ def plot_quantiles_by_latlon(df, quantiles):
         .properties(
             height=650,
             width=650,
-            title="Quantile Estimates on the California Housing Dataset",
+            title="Quantile Predictions on the California Housing Dataset",
         )
     )
     return chart

diff --git a/examples/plot_predict_custom.py b/examples/plot_predict_custom.py
@@ -19,10 +19,12 @@
 import scipy as sp
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
+from sklearn.utils.validation import check_random_state
 
 from quantile_forest import RandomForestQuantileRegressor
 
-np.random.seed(0)
+random_seed = 0
+rng = check_random_state(random_seed)
 
 n_test_samples = 100
 
@@ -68,12 +70,14 @@ def predict(reg, X, quantiles=0.5, what=None):
 
 
 X, y = datasets.load_diabetes(return_X_y=True)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test_samples, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=n_test_samples, random_state=random_seed
+)
 
-reg = RandomForestQuantileRegressor().fit(X_train, y_train)
+reg = RandomForestQuantileRegressor(random_state=random_seed).fit(X_train, y_train)
 
 # Define a user-specified function; here we randomly sample 1000 values with replacement.
-func = lambda x: np.random.choice(x, size=1000)
+func = lambda x: rng.choice(x, size=1000)
 
 # Output array with the user-specified function applied to each sample's empirical distribution.
 y_out = predict(reg, X_test, what=func)

diff --git a/examples/plot_proximity_counts.py b/examples/plot_proximity_counts.py
@@ -25,21 +25,26 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
-rng = check_random_state(0)
+random_seed = 0
+rng = check_random_state(random_seed)
 
 n_test_samples = 25
 noise_std = 0.1
 
 # Load the Digits dataset.
 X, y = datasets.load_digits(return_X_y=True, as_frame=True)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test_samples, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=n_test_samples, random_state=random_seed
+)
 
 
 def add_gaussian_noise(X, mean=0, std=0.1, random_state=None):
     """Add Gaussian noise to input data."""
     if random_state is None:
         rng = check_random_state(0)
+    elif isinstance(random_state, int):
+        rng = check_random_state(random_state)
     else:
         rng = random_state
 
@@ -73,13 +78,13 @@ def extract_floats(combined_df, scale=100):
 
 
 # Randomly add noise to the training and test data.
-X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=rng)
-X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=rng)
+X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)
+X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_seed)
 
 # We set `max_samples_leaf=None` to ensure that every sample in the training
 # data is stored in the leaf nodes. By doing this, we allow the model to
 # consider all samples as potential candidates for proximity calculations.
-qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=0)
+qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_seed)
 qrf.fit(X_train_noisy, X_train)
 
 # Get the proximity counts.
@@ -177,6 +182,7 @@ def plot_digits_proximities(
             color=alt.Color("value_clean:Q", legend=None, scale=alt.Scale(scheme="greys")),
             opacity=alt.condition(alt.datum["value_clean"] == 0, alt.value(0), alt.value(0.67)),
             tooltip=[
+                alt.Tooltip("prox_idx", title="Proximity Index"),
                 alt.Tooltip("prox_cnt", title="Proximity Count"),
                 alt.Tooltip("target:Q", title="Digit"),
             ],

diff --git a/examples/plot_quantile_conformalized.py b/examples/plot_quantile_conformalized.py
@@ -25,31 +25,31 @@
 
 alt.data_transformers.disable_max_rows()
 
+random_seed = 0
+rng = check_random_state(random_seed)
+
+n_samples = 1000
+coverages = np.linspace(0, 1, num=11, endpoint=True).round(1).tolist()  # the "coverage level"
+
 strategies = {
     "qrf": "Quantile Regression Forest (QRF)",
     "cqr": "Conformalized Quantile Regression (CQR)",
 }
 
-random_state = 0
-rng = check_random_state(random_state)
-
-coverages = np.arange(0, 1.1, 0.1).round(1).tolist()  # the "coverage level"
-
 # Load the California Housing Prices dataset.
-california = datasets.fetch_california_housing()
-n_samples = min(california.target.size, 1000)
-perm = rng.permutation(n_samples)
-X = california.data[perm]
-y = california.target[perm]
+X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
+perm = rng.permutation(min(len(X), n_samples))
+X = X.iloc[perm]
+y = y.iloc[perm]
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)
 
 
 def sort_y_values(y_test, y_pred, y_pis):
     """Sort the target values and predictions."""
     indices = np.argsort(y_test)
     return {
-        "y_test": y_test[indices],
+        "y_test": np.asarray(y_test)[indices],
         "y_pred": y_pred[indices],
         "y_pred_low": y_pis[:, 0][indices],
         "y_pred_upp": y_pis[:, 1][indices],
@@ -68,10 +68,10 @@ def mean_width_score(y_pred_low, y_pred_upp):
     return float(mean_width)
 
 
-def qrf_strategy(alpha, X_train, X_test, y_train, y_test):
+def qrf_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
     quantiles = [alpha / 2, 1 - alpha / 2]
 
-    qrf = RandomForestQuantileRegressor(random_state=0)
+    qrf = RandomForestQuantileRegressor(random_state=random_state)
     qrf.fit(X_train, y_train)
 
     # Calculate the lower and upper quantile values on the test data.
@@ -88,15 +88,15 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test):
     return pd.DataFrame(y_values).pipe(lambda x: x * 100_000).assign(strategy="qrf")
 
 
-def cqr_strategy(alpha, X_train, X_test, y_train, y_test):
+def cqr_strategy(alpha, X_train, X_test, y_train, y_test, random_state=None):
     quantiles = [alpha / 2, 1 - alpha / 2]
 
     # Create calibration set.
     X_train, X_calib, y_train, y_calib = train_test_split(
-        X_train, y_train, test_size=0.5, random_state=0
+        X_train, y_train, test_size=0.5, random_state=random_state
     )
 
-    qrf = RandomForestQuantileRegressor(random_state=0)
+    qrf = RandomForestQuantileRegressor(random_state=random_state)
     qrf.fit(X_train, y_train)
 
     # Calculate the lower and upper quantile values on the test data.
@@ -134,7 +134,7 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test):
 dfs = []
 for cov_frac in coverages:
     alpha = float(round(1 - cov_frac, 2))
-    args = (alpha, X_train, X_test, y_train, y_test)
+    args = (alpha, X_train, X_test, y_train, y_test, random_seed)
     dfs.append(pd.concat([qrf_strategy(*args), cqr_strategy(*args)]).assign(alpha=alpha))
 df = pd.concat(dfs)
 

diff --git a/examples/plot_quantile_example.py b/examples/plot_quantile_example.py
@@ -14,6 +14,7 @@
 
 from quantile_forest import RandomForestQuantileRegressor
 
+random_seed = 0
 n_samples = 1000
 bounds = [0, 10]
 quantiles = [0.025, 0.5, 0.975]
@@ -33,12 +34,12 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0):
 
 
 # Create noisy data for modeling and non-noisy function data for illustration.
-X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0)
-X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=0)
+X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=random_seed)
+X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=random_seed)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_seed)
 
-qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=0)
+qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_seed)
 qrf.fit(X_train, y_train)
 
 y_pred_func = qrf.predict(X_func, quantiles=quantiles)
@@ -133,7 +134,7 @@ def plot_fit_and_intervals(df):
     chart = (
         (area_pred + points + line_true + line_pred + blank)
         .resolve_scale(color="independent")
-        .properties(height=400, width=650)
+        .properties(height=400, width=650, title="QRF Predictions vs. Ground Truth on Toy Dataset")
     )
 
     return chart