diff --git a/docs/index.rst b/docs/index.rst index 59249c4..97d7f63 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,21 +23,21 @@ quantile-forest :link-type: ref :link-alt: Getting started - A guide that provides installation instructions and information on testing the package. + A guide that provides installation requirements and instructions, as well as procedures for developers. .. grid-item-card:: User Guide :link: user_guide :link-type: ref :link-alt: User guide - Check out the User Guide for information on the key concepts behind quantile forests. + Information on the key concepts behind quantile forests and how they apply to this package. .. grid-item-card:: Examples :link: example-gallery :link-type: ref :link-alt: Examples - General-purpose, introductory and illustrative examples of using quantile forests. + Examples that demonstrate the broad applications and introductory concepts of quantile forests. .. grid-item-card:: API :link: api diff --git a/quantile_forest/tests/examples/plot_quantile_conformalized.py b/quantile_forest/tests/examples/plot_quantile_conformalized.py index 8b0418d..58b0e98 100755 --- a/quantile_forest/tests/examples/plot_quantile_conformalized.py +++ b/quantile_forest/tests/examples/plot_quantile_conformalized.py @@ -81,7 +81,9 @@ def qrf_strategy(alpha, X_train, X_test, y_train, y_test): # Calculate the point predictions on the test data. y_pred = qrf.predict(X_test, quantiles="mean", aggregate_leaves_first=False) - return sort_y_values(y_test, y_pred, y_pis) + y_values = sort_y_values(y_test, y_pred, y_pis) + + return pd.DataFrame(y_values).pipe(lambda x: x * 100_000).assign(strategy="qrf") def cqr_strategy(alpha, X_train, X_test, y_train, y_test): @@ -121,32 +123,32 @@ def cqr_strategy(alpha, X_train, X_test, y_train, y_test): # Calculate the point predictions on the test data. 
y_pred = qrf.predict(X_test, quantiles="mean", aggregate_leaves_first=False) - return sort_y_values(y_test, y_pred, y_pis) + y_values = sort_y_values(y_test, y_pred, y_pis) + + return pd.DataFrame(y_values).pipe(lambda x: x * 100_000).assign(strategy="cqr") # Get strategy outputs as a data frame. args = (alpha, X_train, X_test, y_train, y_test) -df = pd.concat( - [ - pd.DataFrame(qrf_strategy(*args)).pipe(lambda x: x * 100_000).assign(strategy="qrf"), - pd.DataFrame(cqr_strategy(*args)).pipe(lambda x: x * 100_000).assign(strategy="cqr"), - ] -) +df = pd.concat([qrf_strategy(*args), cqr_strategy(*args)]) -# Add coverage and mean width metrics to the data frame. -df = df.merge( +# Calculate coverage and width metrics. +metrics = ( df.groupby("strategy") .apply( - lambda x: pd.Series( + lambda grp: pd.Series( { - "coverage": coverage_score(x["y_test"], x["y_pred_low"], x["y_pred_upp"]), - "width": mean_width_score(x["y_pred_low"], x["y_pred_upp"]), + "coverage": coverage_score(grp["y_test"], grp["y_pred_low"], grp["y_pred_upp"]), + "width": mean_width_score(grp["y_pred_low"], grp["y_pred_upp"]), } ) ) .reset_index() ) +# Merge the metrics into the data frame. 
+df = df.merge(metrics, on="strategy", how="left") + def plot_prediction_intervals(df, domain): click = alt.selection_point(fields=["y_label"], bind="legend") @@ -222,9 +224,7 @@ def plot_prediction_intervals(df, domain): ) text_coverage = ( - base.transform_aggregate( - coverage="mean(coverage)", width="mean(width)", groupby=["strategy"] - ) + base.transform_aggregate(coverage="mean(coverage)", groupby=["strategy"]) .transform_calculate( coverage_text=( f"'Coverage: ' + format({alt.datum['coverage'] * 100}, '.1f') + '%'" @@ -239,9 +239,7 @@ def plot_prediction_intervals(df, domain): ) ) text_with = ( - base.transform_aggregate( - coverage="mean(coverage)", width="mean(width)", groupby=["strategy"] - ) + base.transform_aggregate(width="mean(width)", groupby=["strategy"]) .transform_calculate( width_text=f"'Interval Width: ' + format({alt.datum['width']}, '$,d')" ) diff --git a/quantile_forest/tests/examples/plot_quantile_example.py b/quantile_forest/tests/examples/plot_quantile_example.py index bbe4798..13e7e52 100755 --- a/quantile_forest/tests/examples/plot_quantile_example.py +++ b/quantile_forest/tests/examples/plot_quantile_example.py @@ -8,17 +8,18 @@ """ import altair as alt -import pandas as pd import numpy as np +import pandas as pd from sklearn.model_selection import train_test_split from quantile_forest import RandomForestQuantileRegressor n_samples = 1000 bounds = [0, 10] +quantiles = [0.025, 0.5, 0.975] -def make_toy_dataset(n_samples, bounds, random_seed=0): +def make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0): rng = np.random.RandomState(random_seed) x = rng.uniform(*bounds, size=n_samples) @@ -26,109 +27,94 @@ def make_toy_dataset(n_samples, bounds, random_seed=0): sigma = 0.25 + x / 10 noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2) - y = f + noise + y = f + (noise if add_noise else 0) return np.atleast_2d(x).T, y -X, y = make_toy_dataset(n_samples, bounds) +# Create noisy data for modeling and non-noisy function data 
for illustration. +X, y = make_toy_dataset(n_samples, bounds, add_noise=True, random_seed=0) +X_func, y_func = make_toy_dataset(n_samples, bounds, add_noise=False, random_seed=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) -X_sampled = np.atleast_2d(np.linspace(*bounds, n_samples)).T -y_sampled = (X_sampled * np.sin(X_sampled)).reshape(-1) - qrf = RandomForestQuantileRegressor(max_depth=3, min_samples_leaf=5, random_state=0) qrf.fit(X_train, y_train) -y_pred = qrf.predict(X_sampled, quantiles=[0.025, 0.5, 0.975]) +y_pred_func = qrf.predict(X_func, quantiles=quantiles) +y_pred_test = qrf.predict(X_test, quantiles=quantiles) -df_train = pd.DataFrame( +df = pd.DataFrame( { - "X_sampled": X_sampled.reshape(-1), - "y_sampled": y_sampled, - "y_pred_low": y_pred[:, 0], - "y_pred_med": y_pred[:, 1], - "y_pred_upp": y_pred[:, 2], + "X": np.concatenate([X_func.reshape(-1), X_test.reshape(-1)]), + "y": np.concatenate([y_func, y_test]), + "y_pred": np.concatenate([y_pred_func[:, 1], y_pred_test[:, 1]]), + "y_pred_low": np.concatenate([y_pred_func[:, 0], y_pred_test[:, 0]]), + "y_pred_upp": np.concatenate([y_pred_func[:, 2], y_pred_test[:, 2]]), + "test": [False] * len(y_func) + [True] * len(y_test), } ) -df_test = pd.DataFrame( - { - "X_test": X_test.reshape(-1), - "y_test": y_test, - } -) - - -def plot_fit_and_intervals(df_train, df_test): - df_train = df_train.copy() - df_test = df_test.copy() - - df_train = df_train.assign( - **{ - "y_true_label": "f(x) = x sin(x)", - "y_pred_label": "Predicted Median", - "y_area_label": "Predicted 95% Interval", - } - ) - - df_test["point_label"] = "Test Observations" +def plot_fit_and_intervals(df): points = ( - alt.Chart(df_test) + alt.Chart(df.assign(**{"point_label": "Test Observations"})) + .transform_filter(alt.datum["test"]) # filter to test data .mark_circle(color="#f2a619") .encode( - x=alt.X("X_test:Q", scale=alt.Scale(nice=False)), - y=alt.Y("y_test:Q", title=""), + x=alt.X("X:Q", 
scale=alt.Scale(nice=False)), + y=alt.Y("y:Q", title=""), color=alt.Color("point_label:N", scale=alt.Scale(range=["#f2a619"]), title=None), tooltip=[ - alt.Tooltip("X_test:Q", format=",.3f", title="X"), - alt.Tooltip("y_test:Q", format=",.3f", title="Y"), + alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("y:Q", format=",.3f", title="Y"), ], ) ) line_true = ( - alt.Chart(df_train) + alt.Chart(df.assign(**{"y_true_label": "f(x) = x sin(x)"})) + .transform_filter(~alt.datum["test"]) # filter to training data .mark_line(color="black", size=3) .encode( - x=alt.X("X_sampled:Q", scale=alt.Scale(nice=False)), - y=alt.Y("y_sampled:Q", title="f(x)"), + x=alt.X("X:Q", scale=alt.Scale(nice=False)), + y=alt.Y("y:Q", title="f(x)"), color=alt.Color("y_true_label:N", scale=alt.Scale(range=["black"]), title=None), tooltip=[ - alt.Tooltip("X_sampled:Q", format=",.3f", title="X"), - alt.Tooltip("y_sampled:Q", format=",.3f", title="Y"), + alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("y:Q", format=",.3f", title="Y"), ], ) ) line_pred = ( - alt.Chart(df_train) + alt.Chart(df.assign(**{"y_pred_label": "Predicted Median"})) + .transform_filter(~alt.datum["test"]) # filter to training data .mark_line(color="#006aff", size=5) .encode( - x=alt.X("X_sampled:Q", scale=alt.Scale(nice=False)), - y=alt.Y("y_pred_med:Q", title=""), + x=alt.X("X:Q", scale=alt.Scale(nice=False)), + y=alt.Y("y_pred:Q", title=""), color=alt.Color("y_pred_label:N", scale=alt.Scale(range=["#006aff"]), title=None), tooltip=[ - alt.Tooltip("X_sampled:Q", format=",.3f", title="X"), - alt.Tooltip("y_sampled:Q", format=",.3f", title="Y"), - alt.Tooltip("y_pred_med:Q", format=",.3f", title="Predicted Y"), + alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("y:Q", format=",.3f", title="Y"), + alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"), ], ) ) area_pred = ( - alt.Chart(df_train) + alt.Chart(df) + .transform_filter(~alt.datum["test"]) # filter to training data 
.mark_area(color="#e0f2ff", opacity=0.8) .encode( - x=alt.X("X_sampled:Q", scale=alt.Scale(nice=False), title="x"), + x=alt.X("X:Q", scale=alt.Scale(nice=False), title="x"), y=alt.Y("y_pred_low:Q", title=""), y2=alt.Y2("y_pred_upp:Q", title=None), tooltip=[ - alt.Tooltip("X_sampled:Q", format=",.3f", title="X"), - alt.Tooltip("y_sampled:Q", format=",.3f", title="Y"), - alt.Tooltip("y_pred_med:Q", format=",.3f", title="Predicted Y"), + alt.Tooltip("X:Q", format=",.3f", title="X"), + alt.Tooltip("y:Q", format=",.3f", title="Y"), + alt.Tooltip("y_pred:Q", format=",.3f", title="Predicted Y"), alt.Tooltip("y_pred_low:Q", format=",.3f", title="Predicted Lower Y"), alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"), ], @@ -153,5 +139,5 @@ def plot_fit_and_intervals(df_train, df_test): return chart -chart = plot_fit_and_intervals(df_train, df_test) +chart = plot_fit_and_intervals(df) chart diff --git a/quantile_forest/tests/examples/plot_quantile_extrapolation.py b/quantile_forest/tests/examples/plot_quantile_extrapolation.py index ae1ef9f..b5a664e 100755 --- a/quantile_forest/tests/examples/plot_quantile_extrapolation.py +++ b/quantile_forest/tests/examples/plot_quantile_extrapolation.py @@ -58,17 +58,12 @@ def get_test_X(X): X_train, y_train = get_train_Xy(X, y, extrap_min_idx, extrap_max_idx) X_test = get_test_X(X) -qrf = RandomForestQuantileRegressor( - max_samples_leaf=None, - min_samples_leaf=10, - random_state=0, -) +qrf = RandomForestQuantileRegressor(max_samples_leaf=None, min_samples_leaf=10, random_state=0) qrf.fit(np.expand_dims(X_train, axis=-1), y_train) # Get predictions at 95% prediction intervals and median. 
y_pred = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975]) - df = pd.DataFrame( { "X_true": X, @@ -77,35 +72,13 @@ def get_test_X(X): "y_pred": y_pred[:, 1], "y_pred_low": y_pred[:, 0], "y_pred_upp": y_pred[:, 2], - "train": np.concatenate( - [ - np.zeros(extrap_min_idx), - np.ones(extrap_max_idx - extrap_min_idx), - np.zeros(len(y) - extrap_max_idx), - ] - ), - "test_left": np.concatenate( - [ - np.ones(extrap_min_idx), - np.zeros(len(y) - extrap_min_idx), - ] - ), - "test_right": np.concatenate( - [ - np.zeros(extrap_max_idx), - np.ones(len(y) - extrap_max_idx), - ] - ), + "test_left": [True] * extrap_min_idx + [False] * (len(y) - extrap_min_idx), + "test_right": [False] * extrap_max_idx + [True] * (len(y) - extrap_max_idx), } ) def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None): - df = df.copy() - - df["point_label"] = "Observations" - df["line_label"] = func_str - x_scale = None if x_domain is not None: x_scale = alt.Scale(domain=x_domain, nice=False, padding=0) @@ -130,7 +103,7 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None alt.Tooltip("y_pred_upp:Q", format=",.3f", title="Predicted Upper Y"), ] - base = alt.Chart(df) + base = alt.Chart(df.assign(**{"point_label": "Observations", "line_label": func_str})) points_true = base.mark_circle(size=20).encode( x=alt.X("X_true:Q", scale=x_scale, title="x"), @@ -195,17 +168,17 @@ def plot_extrapolations(df, title="", legend=False, x_domain=None, y_domain=None } chart1 = plot_extrapolations( - df.query(f"train == 1"), title="Prediction Intervals on Training Data", **kwargs + df.query("~(test_left | test_right)"), title="Prediction Intervals on Training Data", **kwargs ) chart2 = alt.layer( plot_extrapolations( - df.query(f"(train == 1)"), + df.query("~(test_left | test_right)"), title="Prediction Intervals with Extrapolated Values", legend=True, **kwargs, ).resolve_scale(color="independent"), - plot_extrapolations(df.query(f"(test_left == 
1)").assign(extrapolate=True), **kwargs), - plot_extrapolations(df.query(f"(test_right == 1)").assign(extrapolate=True), **kwargs), + plot_extrapolations(df.query("test_left").assign(extrapolate=True), **kwargs), + plot_extrapolations(df.query("test_right").assign(extrapolate=True), **kwargs), ) chart = chart1 | chart2 chart diff --git a/quantile_forest/tests/examples/plot_quantile_interpolation.py b/quantile_forest/tests/examples/plot_quantile_interpolation.py index 3656ae0..07e760a 100755 --- a/quantile_forest/tests/examples/plot_quantile_interpolation.py +++ b/quantile_forest/tests/examples/plot_quantile_interpolation.py @@ -44,21 +44,22 @@ # Initialize data with actual values. data = { "method": ["Actual"] * len(y), - "x": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())], - "y_med": y.tolist(), - "y_low": y.tolist(), - "y_upp": y.tolist(), + "X": [f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())], + "y_pred": y.tolist(), + "y_pred_low": y.tolist(), + "y_pred_upp": y.tolist(), } # Populate data based on prediction results with different interpolations. for interpolation in interpolations: + # Get predictions at 95% prediction intervals and median. 
y_pred = est.predict(X, quantiles=[0.025, 0.5, 0.975], interpolation=interpolation.lower()) data["method"].extend([interpolation] * len(y)) - data["x"].extend([f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())]) - data["y_low"].extend(y_pred[:, 0]) - data["y_med"].extend(y_pred[:, 1]) - data["y_upp"].extend(y_pred[:, 2]) + data["X"].extend([f"Sample {idx + 1} ({x})" for idx, x in enumerate(X.tolist())]) + data["y_pred"].extend(y_pred[:, 1]) + data["y_pred_low"].extend(y_pred[:, 0]) + data["y_pred_upp"].extend(y_pred[:, 2]) df = pd.DataFrame(data) @@ -74,10 +75,10 @@ def plot_interpolations(df, legend): tooltip = [ alt.Tooltip("method:N", title="Method"), - alt.Tooltip("x:N", title="X Values"), - alt.Tooltip("y_med:N", format=".3f", title="Median Y Value"), - alt.Tooltip("y_low:N", format=".3f", title="Lower Y Value"), - alt.Tooltip("y_upp:N", format=".3f", title="Upper Y Value"), + alt.Tooltip("X:N", title="X Values"), + alt.Tooltip("y_pred:N", format=".3f", title="Median Y Value"), + alt.Tooltip("y_pred_low:N", format=".3f", title="Lower Y Value"), + alt.Tooltip("y_pred_upp:N", format=".3f", title="Upper Y Value"), ] point = ( @@ -90,7 +91,7 @@ def plot_interpolations(df, legend): sort=list(legend.keys()), title=None, ), - y=alt.Y("y_med:Q", title="Actual and Predicted Values"), + y=alt.Y("y_pred:Q", title="Actual and Predicted Values"), color=color, tooltip=tooltip, ) @@ -106,8 +107,8 @@ def plot_interpolations(df, legend): sort=list(legend.keys()), title=None, ), - y=alt.Y("y_low:Q", title=""), - y2=alt.Y2("y_upp:Q", title=None), + y=alt.Y("y_pred_low:Q", title=""), + y2=alt.Y2("y_pred_upp:Q", title=None), color=color, tooltip=tooltip, ) @@ -119,7 +120,7 @@ def plot_interpolations(df, legend): .properties(height=400) .facet( column=alt.Column( - "x:N", + "X:N", header=alt.Header(labelOrient="bottom", titleOrient="bottom"), title="Samples (Feature Values)", ) diff --git a/quantile_forest/tests/examples/plot_quantile_intervals.py 
b/quantile_forest/tests/examples/plot_quantile_intervals.py index 3477d2e..01d6b68 100755 --- a/quantile_forest/tests/examples/plot_quantile_intervals.py +++ b/quantile_forest/tests/examples/plot_quantile_intervals.py @@ -31,19 +31,11 @@ kf = KFold(n_splits=5) kf.get_n_splits(X) -y_true = [] -y_pred = [] -y_pred_low = [] -y_pred_upp = [] - # Using k-fold cross-validation, get predictions for all samples. +data = {"y_true": [], "y_pred": [], "y_pred_low": [], "y_pred_upp": []} for train_index, test_index in kf.split(X): - X_train, X_test, y_train, y_test = ( - X[train_index], - X[test_index], - y[train_index], - y[test_index], - ) + X_train, y_train = X[train_index], y[train_index] + X_test, y_test = X[test_index], y[test_index] qrf.set_params(max_features=X_train.shape[1] // 3) qrf.fit(X_train, y_train) @@ -51,21 +43,12 @@ # Get predictions at 95% prediction intervals and median. y_pred_i = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975]) - y_true.append(y_test) - y_pred.append(y_pred_i[:, 1]) - y_pred_low.append(y_pred_i[:, 0]) - y_pred_upp.append(y_pred_i[:, 2]) - -df = pd.DataFrame( - { - "y_true": np.concatenate(y_true), - "y_pred": np.concatenate(y_pred), - "y_pred_low": np.concatenate(y_pred_low), - "y_pred_upp": np.concatenate(y_pred_upp), - } -).pipe( - lambda x: x * 100_000 # convert to dollars -) + data["y_true"].extend(y_test) + data["y_pred"].extend(y_pred_i[:, 1]) + data["y_pred_low"].extend(y_pred_i[:, 0]) + data["y_pred_upp"].extend(y_pred_i[:, 2]) + +df = pd.DataFrame(data).pipe(lambda x: x * 100_000) # convert to dollars def plot_calibration_and_intervals(df): diff --git a/quantile_forest/tests/examples/plot_quantile_multioutput.py b/quantile_forest/tests/examples/plot_quantile_multioutput.py index e0dcc4f..6f445cb 100755 --- a/quantile_forest/tests/examples/plot_quantile_multioutput.py +++ b/quantile_forest/tests/examples/plot_quantile_multioutput.py @@ -1,6 +1,6 @@ """ -Multiple-Output Quantile Regression -=================================== 
+Multiple-Output Quantile Regression with QRFs +============================================= An example on a toy dataset that demonstrates fitting a single quantile regressor for multiple target variables. For each target, multiple quantiles @@ -10,8 +10,8 @@ """ import altair as alt -import pandas as pd import numpy as np +import pandas as pd from sklearn.model_selection import train_test_split from quantile_forest import RandomForestQuantileRegressor diff --git a/quantile_forest/tests/examples/plot_quantile_weighting.py b/quantile_forest/tests/examples/plot_quantile_weighting.py index 3e7efe3..1f79b22 100755 --- a/quantile_forest/tests/examples/plot_quantile_weighting.py +++ b/quantile_forest/tests/examples/plot_quantile_weighting.py @@ -1,38 +1,37 @@ """ -Weighted vs. Unweighted Quantile Estimates -========================================== +Weighted vs. Unweighted Quantile Runtimes +========================================= An example comparison of the prediction runtime when using a quantile regression forest with weighted and unweighted quantiles to compute the predicted output values. While weighted and unweighted quantiles produce identical outputs, the relative runtime of the methods depends on the number -of training samples and the total number of leaf samples used to calculate the -quantiles. A standard random forest regressor is included for comparison. +of training samples and the total number of leaf samples across all trees used +to calculate the quantiles. A standard random forest regressor is included for +comparison. 
""" import time from contextlib import contextmanager import altair as alt -import numpy as np import pandas as pd from sklearn import datasets from sklearn.ensemble import RandomForestRegressor -from sklearn.model_selection import train_test_split from quantile_forest import RandomForestQuantileRegressor +samples = [100, 175, 250, 325, 500] +estimators = [10, 25, 50, 75, 100] +repeats = 3 + @contextmanager def timing(): - t0 = time.time() + t0 = time.process_time() yield lambda: (t1 - t0) - t1 = time.time() - - -X, y = datasets.make_regression(n_samples=500, n_features=4, n_targets=5, random_state=0) + t1 = time.process_time() -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) legend = { "RF": "#f2a619", @@ -40,61 +39,71 @@ def timing(): "QRF Unweighted Quantile": "#001751", } -est_sizes = [1, 5, 10, 25, 50, 75, 100] -n_repeats = 5 - -# Populate data with timing results over estimators. -data = {"name": [], "n_estimators": [], "iteration": [], "runtime": []} -for i, n_estimators in enumerate(est_sizes): - for j in range(n_repeats): - rf = RandomForestRegressor( - n_estimators=n_estimators, - random_state=0, - ) - qrf = RandomForestQuantileRegressor( - n_estimators=n_estimators, - max_samples_leaf=None, - random_state=0, - ) - - rf.fit(X_train, y_train) - qrf.fit(X_train, y_train) - - with timing() as rf_time: - _ = rf.predict(X_test) - with timing() as qrf_weighted_time: - _ = qrf.predict(X_test, quantiles=0.5, weighted_quantile=True) - with timing() as qrf_unweighted_time: - _ = qrf.predict(X_test, quantiles=0.5, weighted_quantile=False) - - timings = [rf_time(), qrf_weighted_time(), qrf_unweighted_time()] - - for name, runtime in zip(legend.keys(), timings): - runtime *= 1000 # convert from milliseconds to seconds +dataset = datasets.make_regression( + n_samples=max(samples), n_features=3, n_targets=1, random_state=0 +) - data["name"].extend([name]) - data["n_estimators"].extend([est_sizes[i]]) - 
data["iteration"].extend([j]) - data["runtime"].extend([runtime]) +# Populate data with timing results over samples and estimators. +data = {"name": [], "n_samples": [], "n_estimators": [], "iteration": [], "runtime": []} +for n_samples in samples: + X = dataset[0][:n_samples, :] + y = dataset[1][:n_samples] + for n_estimators in estimators: + for repeat in range(repeats): + rf = RandomForestRegressor( + n_estimators=n_estimators, + max_depth=7, + random_state=0, + ) + qrf = RandomForestQuantileRegressor( + n_estimators=n_estimators, + max_depth=7, + max_samples_leaf=None, + random_state=0, + ) + + rf.fit(X, y) + qrf.fit(X, y) + + with timing() as rf_time: + _ = rf.predict(X) + with timing() as qrf_weighted_time: + _ = qrf.predict(X, quantiles=0.5, weighted_quantile=True) + with timing() as qrf_unweighted_time: + _ = qrf.predict(X, quantiles=0.5, weighted_quantile=False) + + timings = [rf_time(), qrf_weighted_time(), qrf_unweighted_time()] + + for name, runtime in zip(legend.keys(), timings): + runtime *= 1000 # convert from seconds to milliseconds + + data["name"].extend([name]) + data["n_samples"].extend([n_samples]) + data["n_estimators"].extend([n_estimators]) + data["iteration"].extend([repeat]) + data["runtime"].extend([runtime]) df = ( pd.DataFrame(data) - .groupby(["name", "n_estimators"]) + .groupby(["name", "n_samples", "n_estimators"]) .agg({"runtime": ["mean", "std"]}) .pipe(lambda x: x.set_axis(["_".join(map(str, col)) for col in x.columns], axis=1)) .reset_index() .assign( **{ - "mean": lambda x: x["runtime_mean"], - "std": lambda x: x["runtime_std"], - "ymin": lambda x: x["mean"] - (x["std"] / 2), - "ymax": lambda x: x["mean"] + (x["std"] / 2), + "ymean": lambda x: x["runtime_mean"], + "ystd": lambda x: x["runtime_std"], + "ymin": lambda x: x["ymean"] - (x["ystd"] / 2), + "ymax": lambda x: x["ymean"] + (x["ystd"] / 2), } ) + .drop(columns=["runtime_mean", "runtime_std"]) ) -def plot_timings_by_size(df, legend): +def plot_timings_by_factor(df, legend,
constant, factor, factor_title): + max_value = df[constant].max() + click = alt.selection_point(fields=["name"], bind="legend") color = alt.condition( @@ -108,43 +117,61 @@ def plot_timings_by_size(df, legend): alt.value("lightgray"), ) - line = ( + base = ( alt.Chart(df) - .mark_line() - .encode( - x=alt.X("n_estimators:Q", title="Number of Estimators"), - y=alt.Y("mean:Q", title="Prediction Runtime (normalized)"), - color=color, - ) + .transform_filter(alt.datum[constant] == max_value) # hold this factor constant + .transform_joinaggregate(min_runtime="min(ymin):Q", groupby=[constant]) + .transform_calculate(ymean_norm="datum.ymean / datum.min_runtime") + .transform_calculate(ystd_norm="datum.ystd / datum.min_runtime") + .transform_calculate(ymin_norm="datum.ymin / datum.min_runtime") + .transform_calculate(ymax_norm="datum.ymax / datum.min_runtime") ) - area = ( - alt.Chart(df) - .mark_area(opacity=0.1) + line = base.mark_line().encode( + x=alt.X(f"{factor}:Q", scale=alt.Scale(nice=False), title=factor_title), + y=alt.Y( + "ymean_norm:Q", + scale=alt.Scale(zero=False), + title="Prediction Runtime (normalized)", + ), + color=color, + ) + + area = base.mark_area(opacity=0.1).encode( + x=alt.X(f"{factor}:Q"), + y=alt.Y("ymin_norm:Q"), + y2=alt.Y2("ymax_norm:Q"), + color=color, + tooltip=[ + alt.Tooltip("name:N", title="Estimator Name"), + alt.Tooltip(f"{factor}:Q", format=",d", title=factor_title), + alt.Tooltip("ymean_norm:Q", format=",.3f", title="Average Runtime"), + alt.Tooltip("ymin_norm:Q", format=",.3f", title="Minimum Runtime"), + alt.Tooltip("ymax_norm:Q", format=",.3f", title="Maximum Runtime"), + ], + ) + + text = ( + base.transform_aggregate(coverage="mean(constant)", groupby=[constant]) + .transform_calculate(text=(f"'{constant} = ' + format({alt.datum[constant]}, ',d')")) + .mark_text(align="left", baseline="top") .encode( - x=alt.X("n_estimators:Q"), - y=alt.Y("ymin:Q"), - y2=alt.Y2("ymax:Q"), - color=color, - tooltip=[ - alt.Tooltip("name:N", 
title="Estimator Name"), - alt.Tooltip("n_estimators:Q", format=",d", title="Number of Estimators"), - alt.Tooltip("mean:Q", format=",.3f", title="Average Runtime"), - alt.Tooltip("ymin:Q", format=",.3f", title="Minimum Runtime"), - alt.Tooltip("ymax:Q", format=",.3f", title="Maximum Runtime"), - ], + x=alt.value(5), + y=alt.value(5), + text=alt.Text("text:N"), ) ) chart = ( - (line + area) + (line + area + text) .add_params(click) - .configure_range(category=alt.RangeScheme(list(legend.values()))) - .properties(height=400, width=650) + .properties(height=200, width=300, title=f"Runtime by {factor_title}") ) return chart -chart = plot_timings_by_size(df, legend) +chart1 = plot_timings_by_factor(df, legend, "n_samples", "n_estimators", "Number of Estimators") +chart2 = plot_timings_by_factor(df, legend, "n_estimators", "n_samples", "Number of Samples") +chart = (chart1 | chart2).configure_range(category=alt.RangeScheme(list(legend.values()))) chart