Update examples

zillow · Feb 19, 2024 · b69c2ae · b69c2ae
1 parent 8b1170d
commit b69c2ae
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 42 deletions.
diff --git a/quantile_forest/tests/examples/plot_quantile_extrapolation.py b/quantile_forest/tests/examples/plot_quantile_extrapolation.py
@@ -46,11 +46,15 @@ def get_test_X(X):
     return X_test
 
 
+# Create the full dataset.
 X, y = make_func_Xy(func, bounds, n_samples)
 
+# Calculate the extrapolation bounds.
 extrap_min_idx = int(n_samples * (extrap_frac / 2))
 extrap_max_idx = int(n_samples - (n_samples * (extrap_frac / 2)))
 
+# Based on the extrapolation bounds, get the training and test data.
+# Training data excludes extrapolated regions; test data includes them.
 X_train, y_train = get_train_Xy(X, y, extrap_min_idx, extrap_max_idx)
 X_test = get_test_X(X)
 
@@ -61,6 +65,7 @@ def get_test_X(X):
 )
 qrf.fit(np.expand_dims(X_train, axis=-1), y_train)
 
+# Get predictions for the bounds of the 95% prediction interval and the median.
 y_pred = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])
 
 

diff --git a/quantile_forest/tests/examples/plot_quantile_interpolation.py b/quantile_forest/tests/examples/plot_quantile_interpolation.py
@@ -41,38 +41,24 @@
 )
 est.fit(X, y)
 
-y_medians = []
-y_errs = []
-for interpolation in interpolations:
-    y_pred = est.predict(
-        X,
-        quantiles=[0.025, 0.5, 0.975],
-        interpolation=interpolation.lower(),
-    )
-    y_medians.append(y_pred[:, 1])
-    y_errs.append(
-        np.concatenate(
-            (
-                [y_pred[:, 1] - y_pred[:, 0]],
-                [y_pred[:, 2] - y_pred[:, 1]],
-            ),
-            axis=0,
-        )
-    )
-
+# Initialize data with actual values.
 data = {
     "method": ["Actual"] * len(y),
     "x": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())],
     "y_med": y.tolist(),
     "y_low": y.tolist(),
     "y_upp": y.tolist(),
 }
-for idx, interpolation in enumerate(interpolations):
+
+# Populate data based on prediction results with different interpolations.
+for interpolation in interpolations:
+    y_pred = est.predict(X, quantiles=[0.025, 0.5, 0.975], interpolation=interpolation.lower())
+
     data["method"].extend([interpolation] * len(y))
     data["x"].extend([f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())])
-    data["y_med"].extend(y_medians[idx])
-    data["y_low"].extend(y_medians[idx] - y_errs[idx][0])
-    data["y_upp"].extend(y_medians[idx] + y_errs[idx][1])
+    data["y_low"].extend(y_pred[:, 0])
+    data["y_med"].extend(y_pred[:, 1])
+    data["y_upp"].extend(y_pred[:, 2])
 
 df = pd.DataFrame(data)
 

diff --git a/quantile_forest/tests/examples/plot_quantile_intervals.py b/quantile_forest/tests/examples/plot_quantile_intervals.py
@@ -36,6 +36,7 @@
 y_pred_low = []
 y_pred_upp = []
 
+# Using k-fold cross-validation, get predictions for all samples.
 for train_index, test_index in kf.split(X):
     X_train, X_test, y_train, y_test = (
         X[train_index],

diff --git a/quantile_forest/tests/examples/plot_quantile_multioutput.py b/quantile_forest/tests/examples/plot_quantile_multioutput.py
@@ -21,6 +21,7 @@
 n_samples = 2500
 bounds = [0, 100]
 
+# Define functions that generate targets; each function maps to one target.
 funcs = [
     {
         "signal": lambda x: np.log1p(x + 1),
@@ -46,15 +47,16 @@ def make_func_Xy(funcs, bounds, n_samples):
     return np.atleast_2d(x).T, y
 
 
+# Create the dataset with multiple target variables.
 X, y = make_func_Xy(funcs, bounds, n_samples)
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
 
 qrf = RandomForestQuantileRegressor(max_samples_leaf=None, max_depth=4, random_state=0)
-qrf.fit(X_train, y_train)
+qrf.fit(X_train, y_train)  # fit on all of the targets simultaneously
 
-y_pred = qrf.predict(X, quantiles=[0.025, 0.5, 0.975], weighted_quantile=False)
-y_pred = y_pred.reshape(-1, 3, len(funcs))
+# Get multiple-output predictions for the 95% prediction interval bounds and the median.
+y_pred = qrf.predict(X, quantiles=[0.025, 0.5, 0.975])
 
 df = pd.DataFrame(
     {

diff --git a/quantile_forest/tests/examples/plot_quantile_vs_standard.py b/quantile_forest/tests/examples/plot_quantile_vs_standard.py
@@ -37,8 +37,8 @@
 regr_rf.fit(X_train, y_train)
 regr_qrf.fit(X_train, y_train)
 
-y_pred_rf = regr_rf.predict(X_test)
-y_pred_qrf = regr_qrf.predict(X_test, quantiles=0.5)
+y_pred_rf = regr_rf.predict(X_test)  # standard RF predictions (mean)
+y_pred_qrf = regr_qrf.predict(X_test, quantiles=0.5)  # QRF predictions (median)
 
 legend = {
     "Actual": "#c0c0c0",

diff --git a/quantile_forest/tests/examples/plot_quantile_weighting.py b/quantile_forest/tests/examples/plot_quantile_weighting.py
@@ -43,11 +43,10 @@ def timing():
 est_sizes = [1, 5, 10, 25, 50, 75, 100]
 n_repeats = 5
 
-timings = np.empty((len(est_sizes), n_repeats, 3))
+# Populate data with timing results over estimators.
+data = {"name": [], "n_estimators": [], "iteration": [], "runtime": []}
 for i, n_estimators in enumerate(est_sizes):
     for j in range(n_repeats):
-        result = {}
-
         rf = RandomForestRegressor(
             n_estimators=n_estimators,
             random_state=0,
@@ -68,19 +67,15 @@ def timing():
         with timing() as qrf_unweighted_time:
             _ = qrf.predict(X_test, quantiles=0.5, weighted_quantile=False)
 
-        timings[i, j, :] = [rf_time(), qrf_weighted_time(), qrf_unweighted_time()]
-        timings[i, j, :] *= 1000  # convert from milliseconds to seconds
+        timings = [rf_time(), qrf_weighted_time(), qrf_unweighted_time()]
 
-timings /= timings.min()  # normalize by minimum runtime
-timings = np.transpose(timings, axes=[2, 0, 1])  # put the estimator name first
+        for name, runtime in zip(legend.keys(), timings):
+            runtime *= 1000  # convert from seconds to milliseconds
 
-data = {"name": [], "n_estimators": [], "iteration": [], "runtime": []}
-for i, name in enumerate(legend):
-    for j in range(timings.shape[1]):
-        data["name"].extend([name] * n_repeats)
-        data["n_estimators"].extend([est_sizes[j]] * n_repeats)
-        data["iteration"].extend(list(range(n_repeats)))
-        data["runtime"].extend(timings[i, j])
+            data["name"].extend([name])
+            data["n_estimators"].extend([est_sizes[i]])
+            data["iteration"].extend([j])
+            data["runtime"].extend([runtime])
 
 df = (
     pd.DataFrame(data)