Update example plots
reidjohnson committed Aug 19, 2024
1 parent ceb4536 commit ee9f371
Showing 12 changed files with 78 additions and 58 deletions.
4 changes: 3 additions & 1 deletion examples/plot_huggingface_model.py
@@ -50,7 +50,9 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
from sklearn.model_selection import train_test_split
from skops import card

+ # Load the California Housing dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

# Fit the model.
@@ -160,7 +162,7 @@ def fit_and_upload_model(token, repo_id, local_dir="./local_repo", random_state=
with open(f"{local_dir}/{model_filename}", "rb") as file:
qrf = pickle.load(file)

- # Estimate quantiles.
+ # Fetch the California Housing dataset and estimate quantiles.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
y_pred = qrf.predict(X, quantiles=quantiles) * 100_000 # predict in dollars

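For reference, the download-and-predict round trip can be sketched as follows. This is a minimal sketch, assuming the model was uploaded to the Hub as a pickle file and that `huggingface_hub` is installed; the repository ID and filename are hypothetical placeholders, not the example's actual values.

    import pickle

    from huggingface_hub import hf_hub_download
    from sklearn import datasets

    # Download the pickled model from the Hub (hypothetical repo ID and filename).
    model_path = hf_hub_download(repo_id="user/california-housing-qrf", filename="model.pkl")
    with open(model_path, "rb") as file:
        qrf = pickle.load(file)

    # Fetch the California Housing dataset and predict quantiles in dollars.
    X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
    y_pred = qrf.predict(X, quantiles=[0.25, 0.5, 0.75]) * 100_000
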
23 changes: 13 additions & 10 deletions examples/plot_predict_custom.py
@@ -25,12 +25,12 @@
n_test_samples = 100


- def predict(reg, X, quantiles=0.5, what=None):
+ def predict(qrf, X, quantiles=0.5, what=None):
"""Custom prediction method that allows user-specified function.
Parameters
----------
- reg : BaseForestQuantileRegressor
+ qrf : BaseForestQuantileRegressor
Quantile forest model object.
X : array-like, of shape (n_samples, n_features)
New test data.
@@ -45,16 +45,16 @@ def predict(reg, X, quantiles=0.5, what=None):
Output array with the user-specified function applied (if any).
"""
if what is None:
- return reg.predict(X, quantiles=quantiles)
+ return qrf.predict(X, quantiles=quantiles)

- # Get the complete set of proximities (training indices) for each sample.
- proximities = reg.proximity_counts(X)
+ # Get the complete set of proximities for each sample.
+ proximities = qrf.proximity_counts(X)

# Retrieve the unsorted training responses from the model (stored in sorted order).
- reverse_sorter = np.argsort(reg.sorter_, axis=0)
- y_train = np.empty_like(reg.forest_.y_train).T
+ reverse_sorter = np.argsort(qrf.sorter_, axis=0)
+ y_train = np.empty_like(qrf.forest_.y_train).T
for i in range(y_train.shape[1]):
- y_train[:, i] = np.asarray(reg.forest_.y_train)[i, reverse_sorter[:, i]]
+ y_train[:, i] = np.asarray(qrf.forest_.y_train)[i, reverse_sorter[:, i]]

# For each sample, construct an array of the training responses used for prediction.
s = [np.concatenate([np.repeat(y_train[i[0]], i[1]) for i in prox]) for prox in proximities]
@@ -65,25 +65,28 @@ def predict(reg, X, quantiles=0.5, what=None):
return y_out
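
The reconstruction step above can be illustrated with hypothetical proximity output; each tuple pairs a training index with the number of times that training sample shares a leaf with the query sample:

    import numpy as np

    y_train_toy = np.array([10.0, 20.0, 30.0])  # toy training responses
    prox_toy = [(2, 3), (0, 1)]  # hypothetical: index 2 co-occurs 3 times, index 0 once

    # Repeat each training response by its proximity count to form the
    # empirical distribution used for this sample's prediction.
    dist = np.concatenate([np.repeat(y_train_toy[idx], cnt) for idx, cnt in prox_toy])
    print(dist)  # [30. 30. 30. 10.]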


+ # Load the Diabetes dataset.
X, y = datasets.load_diabetes(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=n_test_samples, random_state=random_state
)

- reg = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)
+ qrf = RandomForestQuantileRegressor(random_state=random_state).fit(X_train, y_train)

# Define a user-specified function.
# Here we randomly sample 1,000 values with replacement from the empirical distribution.
func = lambda x: random_state.choice(x, size=1000)

# Output array with the user-specified function applied to each sample's empirical distribution.
- y_out = predict(reg, X_test, what=func)
+ y_out = predict(qrf, X_test, what=func)

dfs = []
for idx in range(n_test_samples):
# Calculate the ECDF from output array.
y_ecdf = [sp.stats.ecdf(y_i).cdf for y_i in y_out[idx].reshape(1, -1)]

+ # Get quantiles (x-axis) and probabilities (y-axis).
quantiles = list(chain.from_iterable([y_i.quantiles for y_i in y_ecdf]))
probabilities = list(chain.from_iterable([y_i.probabilities for y_i in y_ecdf]))
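
Each ECDF can also be queried directly. A small sketch, assuming SciPy 1.11+ (where `scipy.stats.ecdf` was introduced):

    import numpy as np
    import scipy as sp

    sample = np.array([1.0, 2.0, 2.0, 5.0])
    cdf = sp.stats.ecdf(sample).cdf
    print(cdf.quantiles)      # unique sample values: [1. 2. 5.]
    print(cdf.probabilities)  # cumulative probabilities: [0.25 0.75 1.  ]
    print(cdf.evaluate(2.5))  # P(X <= 2.5) = 0.75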

36 changes: 18 additions & 18 deletions examples/plot_proximity_counts.py
@@ -5,13 +5,12 @@
This example demonstrates the use of quantile regression forest (QRF)
proximity counts to identify similar samples. In this scenario, we train a QRF
on a noisy dataset to predict individual pixel values in an unsupervised
- manner (the target labels are not used during training) for denoising
- purposes. We then retrieve the proximity values for the noisy test samples. We
- visualize each test sample alongside a set of similar (non-noisy) training
- samples determined by their proximity counts. These similar samples are
- ordered from the highest to the lowest proximity count. This illustrates how
- proximity counts can effectively identify similar samples even in noisy
- conditions.
+ manner for denoising purposes; the target labels are not used during training.
+ We then retrieve the proximity values for the noisy test samples. We visualize
+ each test sample alongside a set of similar (non-noisy) training samples
+ determined by their proximity counts. These similar samples are ordered from
+ the highest to the lowest proximity count. This illustrates how proximity
+ counts can effectively identify similar samples even in noisy conditions.
"""

import altair as alt
@@ -57,31 +56,34 @@ def add_gaussian_noise(X, mean=0, std=0.1, random_state=None):
return X_noisy


- def combine_floats(df1, df2, scale=100):
+ def combine_floats(df1, df2, scale=1000):
"""Combine two floats from separate data frames into a single number."""
combined_df = df1 * scale + df2
return combined_df


- def extract_floats(combined_df, scale=100):
+ def extract_floats(combined_df, scale=1000):
"""Extract the original floats from the combined data frame."""
df1 = np.floor(combined_df / scale)
df2 = combined_df - (df1 * scale)
return df1, df2
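
The round trip is exact as long as the values in the first frame are integer-valued and the values in the second stay within [0, scale); a quick usage check of the two helpers, with toy values:

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame({"pixel": [3.0, 16.0]})   # integer-valued originals
    df2 = pd.DataFrame({"pixel": [0.25, 0.87]})  # values within [0, scale)

    combined = combine_floats(df1, df2, scale=1000)    # 3000.25, 16000.87
    out1, out2 = extract_floats(combined, scale=1000)  # recovers both frames
    assert np.allclose(out1, df1) and np.allclose(out2, df2)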


- # Randomly add noise to the training and test data.
+ # Randomly add Gaussian noise to the training and test data.
X_train_noisy = X_train.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)
X_test_noisy = X_test.pipe(add_gaussian_noise, std=noise_std, random_state=random_state)

+ # We set `max_samples_leaf=None` to ensure that every sample in the training
+ # data is stored in the leaf nodes. By doing this, we allow the model to
+ # consider all samples as potential candidates for proximity calculations.
qrf = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=random_state)

- # Fit the model to predict the non-noisy pixels from noisy pixels (i.e., to denoise).
+ # We fit a single multi-target model, with each pixel treated as a distinct target.
qrf.fit(X_train_noisy, X_train)

# Get the proximity counts.
- proximities = qrf.proximity_counts(X_test_noisy)
+ proximities = qrf.proximity_counts(X_test_noisy) # output is a list of tuples for each sample
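
For a single test digit, the most similar training samples can be read off directly. A small sketch, assuming each sample's tuples are ordered from highest to lowest count:

    # Inspect the three most proximate training samples for the first test digit.
    for train_idx, count in list(proximities[0])[:3]:
        print(f"train index {train_idx}: shares a leaf with the test digit {count} time(s)")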

df_prox = pd.DataFrame(
{"prox": [[(i, *p) for i, p in enumerate(proximities[x])] for x in range(len(X_test))]}
@@ -107,8 +109,8 @@ def extract_floats(combined_df, scale=100):
# Create a data frame for looking up training proximities.
df_lookup = (
combine_floats(X_train, X_train_noisy, scale=pixel_scale) # combine to reduce transmitted data
.assign(**{"index": np.arange(len(X_train))})
.join(y_train)
.assign(**{"index": np.arange(len(X_train))}) # align with proximities, which are zero-indexed
)


@@ -123,12 +125,10 @@ def plot_digits_proximities(
height=225,
width=225,
):
- dim_x = pixel_dim[0]
- dim_y = pixel_dim[1]
- num_x = len(str(dim_x))
- num_y = len(str(dim_y))
+ dim_x, dim_y = pixel_dim[0], pixel_dim[1]
+ dgt_x, dgt_y = len(str(dim_x)), len(str(dim_y))

- pixel_cols = [f"pixel_{y:0{num_y}}_{x:0{num_x}}" for y in range(dim_y) for x in range(dim_x)]
+ pixel_cols = [f"pixel_{y:0{dgt_y}}_{x:0{dgt_x}}" for y in range(dim_y) for x in range(dim_x)]
pixel_x = "split(datum.pixel, '_')[2]"
pixel_y = "split(datum.pixel, '_')[1]"

@@ -171,7 +171,7 @@
)

chart2 = (
base.transform_filter(f"datum.prox_idx < {n_prox}")
base.transform_filter(f"datum.prox_idx < {n_prox}") # filter to the display proximities
.transform_lookup(
lookup="prox_val",
from_=alt.LookupData(df_lookup, key="index", fields=pixel_cols + ["target"]),
2 changes: 1 addition & 1 deletion examples/plot_quantile_conformalized.py
@@ -34,7 +34,7 @@
"cqr": "Conformalized Quantile Regression (CQR)",
}

- # Load the California Housing Prices dataset.
+ # Load the California Housing dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
perm = random_state.permutation(min(len(X), n_samples))
X = X.iloc[perm]
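For context, CQR calibrates the QRF's intervals with conformity scores from a held-out calibration set. A minimal sketch of the standard adjustment (Romano et al., 2019); this is illustrative, not necessarily this example's exact implementation:

    import numpy as np

    def cqr_margin(y_calib, y_pred_low, y_pred_upp, alpha=0.5):
        """Compute the standard CQR calibration margin for a 1 - alpha interval."""
        # Conformity score: how far each calibration point falls outside its interval.
        scores = np.maximum(y_pred_low - y_calib, y_calib - y_pred_upp)
        # Finite-sample-corrected empirical quantile of the scores.
        n = len(y_calib)
        q_level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
        return np.quantile(scores, q_level, method="higher")

    # Widen (or shrink) the test intervals by the calibrated margin, e.g.:
    # margin = cqr_margin(y_calib, low_calib, upp_calib, alpha=0.5)
    # low_test, upp_test = low_test - margin, upp_test + margin
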
25 changes: 13 additions & 12 deletions examples/plot_quantile_example.py
@@ -23,6 +23,7 @@


def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
"""Make a toy dataset."""
random_state = check_random_state(random_state)

x = random_state.uniform(*bounds, size=n_samples)
@@ -35,7 +36,7 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
return np.atleast_2d(x).T, y


- # Create noisy data for modeling and non-noisy function data for illustration.
+ # Create a noisy dataset for modeling and a non-noisy version for illustration.
X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0)
X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_state=0)

@@ -44,25 +45,25 @@ def make_toy_dataset(n_samples, bounds, add_noise=True, random_state=0):
qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=random_state)
qrf.fit(X_train, y_train)

- y_pred_func = qrf.predict(X_func, quantiles=quantiles)
- y_pred_test = qrf.predict(X_test, quantiles=quantiles)
+ y_pred_test = qrf.predict(X_test, quantiles=quantiles) # predict on noisy samples
+ y_pred_func = qrf.predict(X_func, quantiles=quantiles) # predict on non-noisy samples

df = pd.DataFrame(
{
"x": np.concatenate([X_func.reshape(-1), X_test.reshape(-1)]),
"y": np.concatenate([y_func, y_test]),
"y_pred": np.concatenate([y_pred_func[:, 1], y_pred_test[:, 1]]),
"y_pred_low": np.concatenate([y_pred_func[:, 0], y_pred_test[:, 0]]),
"y_pred_upp": np.concatenate([y_pred_func[:, 2], y_pred_test[:, 2]]),
"test": [False] * len(y_func) + [True] * len(y_test),
"x": np.concatenate([X_test.reshape(-1), X_func.reshape(-1)]),
"y": np.concatenate([y_test, y_func]),
"y_pred": np.concatenate([y_pred_test[:, 1], y_pred_func[:, 1]]),
"y_pred_low": np.concatenate([y_pred_test[:, 0], y_pred_func[:, 0]]),
"y_pred_upp": np.concatenate([y_pred_test[:, 2], y_pred_func[:, 2]]),
"test": [True] * len(y_test) + [False] * len(y_func),
}
)


def plot_fit_and_intervals(df):
area_pred = (
alt.Chart(df)
.transform_filter(~alt.datum["test"]) # filter to training data
.transform_filter(~alt.datum["test"]) # filter to non-test data
.mark_area(color="#e0f2ff", opacity=0.8)
.encode(
x=alt.X("x:Q", scale=alt.Scale(nice=False), title="X"),
@@ -95,7 +96,7 @@ def plot_fit_and_intervals(df):

line_true = (
alt.Chart(df.assign(**{"y_true_label": "f(x) = x sin(x)"}))
.transform_filter(~alt.datum["test"]) # filter to training data
.transform_filter(~alt.datum["test"]) # filter to non-test data
.mark_line(color="black", size=3)
.encode(
x=alt.X("x:Q", scale=alt.Scale(nice=False)),
@@ -110,7 +111,7 @@

line_pred = (
alt.Chart(df.assign(**{"y_pred_label": "Predicted Median"}))
.transform_filter(~alt.datum["test"]) # filter to training data
.transform_filter(~alt.datum["test"]) # filter to non-test data
.mark_line(color="#006aff", size=5)
.encode(
x=alt.X("x:Q", scale=alt.Scale(nice=False)),
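One way to sanity-check the fitted intervals is their empirical coverage on the held-out points, which should roughly match the nominal level implied by the lower and upper quantiles:

    # Empirical coverage of the prediction interval on the test rows.
    test_rows = df[df["test"]]
    covered = (test_rows["y"] >= test_rows["y_pred_low"]) & (test_rows["y"] <= test_rows["y_pred_upp"])
    print(f"Empirical coverage: {covered.mean():.3f}")
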
5 changes: 3 additions & 2 deletions examples/plot_quantile_extrapolation.py
@@ -34,6 +34,7 @@


def make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0):
"""Make a dataset from a specified function."""
random_state = check_random_state(random_state)

x = np.linspace(bounds[0], bounds[1], n_samples)
@@ -366,7 +367,7 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, *
return np.mean(xtra[test_indices])


- # Create the full dataset.
+ # Create a dataset that requires extrapolation.
X, y = make_func_Xy(func, bounds, n_samples, add_noise=True, random_state=0)

# Fit and extrapolate based on train-test split (depending on X).
@@ -377,7 +378,7 @@ def get_coverage_xtr(bounds_list, train_indices, test_indices, y_train, level, *
train_indices[sort_X[extrap_min_idx] : sort_X[extrap_max_idx]] = True
res = train_test_split(train_indices, rng=random_state, **qrf_params)

- # Get coverages on extrapolated samples.
+ # Get coverages for extrapolated samples.
args = (train_indices, ~train_indices, y[train_indices], quantiles[1] - quantiles[0], random_state)
cov_qrf = get_coverage_qrf(res["qmat"], *args)
cov_xtr = get_coverage_xtr(res["bounds_list"], *args)
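The split that forces extrapolation can be sketched in isolation: train only on the interior of the covariate range, then evaluate on both tails. The cutoff fractions here are illustrative:

    import numpy as np

    x = np.random.default_rng(0).uniform(0, 10, size=500)
    sort_idx = np.argsort(x)
    lo, hi = int(0.2 * len(x)), int(0.8 * len(x))  # illustrative interior cutoffs

    train_mask = np.zeros(len(x), dtype=bool)
    train_mask[sort_idx[lo:hi]] = True  # interior samples are used for training
    test_mask = ~train_mask             # both tails require extrapolation
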
5 changes: 3 additions & 2 deletions examples/plot_quantile_interpolation.py
@@ -21,7 +21,7 @@
random_state = np.random.RandomState(0)
intervals = np.linspace(0, 1, num=101, endpoint=True).round(2).tolist()

- # Create toy dataset.
+ # Create a simple toy dataset.
X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]])
y = np.array([-2, -1, 0, 1, 2])

@@ -59,11 +59,12 @@
"quantile_upp": [None] * len(y),
}

+ # Make predictions at the median and intervals.
quantiles = [0.5, round(0.5 - interval / 2, 3), round(0.5 + interval / 2, 3)]

# Populate data based on prediction results with different interpolations.
for interpolation in interpolations:
- # Get predictions at median and prediction intervals.
+ # Get predictions using the specified quantiles and interpolation method.
y_pred = qrf.predict(X, quantiles=quantiles, interpolation=interpolation.lower())

data["method"].extend([interpolation] * len(y))
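The interpolation keywords mirror NumPy's quantile methods, and their effect is easiest to see on a tiny sample where the requested quantile falls between two observations:

    import numpy as np

    values = np.array([1.0, 2.0, 3.0, 4.0])
    for method in ["linear", "lower", "higher", "midpoint", "nearest"]:
        print(method, np.quantile(values, 0.4, method=method))
    # linear 2.2, lower 2.0, higher 3.0, midpoint 2.5, nearest 2.0
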
4 changes: 2 additions & 2 deletions examples/plot_quantile_intervals.py
@@ -19,15 +19,15 @@
random_state = np.random.RandomState(0)
n_samples = 1000

- # Load the California Housing Prices dataset.
+ # Load the California Housing dataset.
X, y = datasets.fetch_california_housing(as_frame=True, return_X_y=True)
perm = random_state.permutation(min(len(X), n_samples))
X = X.iloc[perm]
y = y.iloc[perm]

qrf = RandomForestQuantileRegressor(random_state=random_state)

- kf = KFold(n_splits=5)
+ kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
kf.get_n_splits(X)

# Using k-fold cross-validation, get predictions for all samples.
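The out-of-fold predictions can then be gathered with the usual split loop. A minimal sketch, with illustrative quantiles:

    import numpy as np

    # Collect out-of-fold interval predictions for every sample.
    y_pred_cv = np.zeros((len(X), 3))
    for train_idx, test_idx in kf.split(X):
        qrf.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred_cv[test_idx] = qrf.predict(X.iloc[test_idx], quantiles=[0.025, 0.5, 0.975])
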
5 changes: 3 additions & 2 deletions examples/plot_quantile_multioutput.py
@@ -40,14 +40,15 @@


def make_func_Xy(funcs, bounds, n_samples):
"""Make a dataset from a specified function."""
x = np.linspace(*bounds, n_samples)
y = np.empty((len(x), len(funcs)))
for i, func in enumerate(funcs):
y[:, i] = func["signal"](x) + func["noise"](x)
return np.atleast_2d(x).T, y


- # Create the dataset with multiple target variables.
+ # Create a dataset with multiple target variables.
X, y = make_func_Xy(funcs, bounds, n_samples)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
@@ -56,7 +57,7 @@ def make_func_Xy(funcs, bounds, n_samples):
qrf.fit(X_train, y_train) # fit on all of the targets simultaneously

# Get multi-target predictions at specified quantiles.
- y_pred = qrf.predict(X, quantiles=quantiles)
+ y_pred = qrf.predict(X, quantiles=quantiles) # shape = (n_samples, n_targets, n_quantiles)

df = pd.DataFrame(
{
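With the output shaped `(n_samples, n_targets, n_quantiles)`, per-target intervals come from indexing the last two axes; for example, assuming three quantiles ordered low/median/high:

    target = 0
    y_low = y_pred[:, target, 0]  # lower bound for the first target
    y_med = y_pred[:, target, 1]  # median for the first target
    y_upp = y_pred[:, target, 2]  # upper bound for the first target
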
4 changes: 3 additions & 1 deletion examples/plot_quantile_ranks.py
@@ -25,13 +25,15 @@


def make_toy_dataset(n_samples, bounds, random_state=0):
"""Make a toy dataset."""
random_state = check_random_state(random_state)
X_1d = np.linspace(*bounds, num=n_samples)
X = X_1d.reshape(-1, 1)
y = X_1d * np.cos(X_1d) + random_state.normal(scale=X_1d / math.e)
return X, y


+ # Create a toy dataset.
X, y = make_toy_dataset(n_samples, bounds, random_state=0)

qrf = RandomForestQuantileRegressor(
@@ -43,7 +45,7 @@ def make_toy_dataset(n_samples, bounds, random_state=0):
y_pred = qrf.predict(X, quantiles=0.5)

# Get the quantile rank for all samples.
- y_rank = qrf.quantile_ranks(X, y)
+ y_rank = qrf.quantile_ranks(X, y) # output is a value in the range [0, 1] for each sample

df = pd.DataFrame({"x": X.reshape(-1), "y": y, "y_pred": y_pred, "y_rank": y_rank})

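Because the ranks lie in [0, 1], values near the extremes flag observations that fall in the tails of their predicted distribution. The cutoffs below are illustrative:

    # Flag samples whose response is extreme relative to the predicted distribution.
    outlier = (y_rank < 0.025) | (y_rank > 0.975)  # illustrative cutoffs
    print(f"Flagged {outlier.sum()} of {len(y_rank)} samples as potential outliers")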
