PrePredict Estimator (#1189)

DistrictDataLabs · Feb 19, 2022 · 062ad14 · 062ad14
1 parent df45161
commit 062ad14
Show file tree

Hide file tree

Showing 6 changed files with 340 additions and 0 deletions.
diff --git a/docs/api/contrib/index.rst b/docs/api/contrib/index.rst
@@ -13,6 +13,7 @@ The following contrib packages are currently available:
    :maxdepth: 1
 
    wrapper
+   prepredict
    statsmodels
    boundaries
    scatter

diff --git a/docs/api/contrib/prepredict.rst b/docs/api/contrib/prepredict.rst
@@ -0,0 +1,52 @@
+.. -*- mode: rst -*-
+
+PrePredict Estimators
+=====================
+
+Occasionally it is useful to be able to use predictions made during an inferencing workflow that does not involve Yellowbrick, for example when the inferencing process requires extra compute resources such as a cluster or when the model takes a very long time to train and run inference. In other instances there are models that Yellowbrick simply does not support, even with the :doc:`third-party estimator wrapper <wrapper>`, or the results may have been collected from some source out of your control.
+
+Some Yellowbrick visualizers are still able to create visual diagnostics with predictions already made using the contrib library ``PrePredict`` estimator, which is a simple wrapper around some data and an estimator type. Although not quite as straightforward as a scikit-learn metric in the form ``metric(y_true, y_pred)``, this estimator allows Yellowbrick to be used in the cases described above. An example is below:
+
+.. code:: python
+
+    # Import the prepredict estimator and a Yellowbrick visualizer
+    from yellowbrick.contrib.prepredict import PrePredict, CLASSIFIER
+    from yellowbrick.classifier import classification_report
+
+    # Instantiate the estimator with the pre-predicted data
+    model = PrePredict(y_pred, CLASSIFIER)
+
+    # Use the visualizer, setting X to None since it is not required
+    oz = classification_report(model, None, y_test)
+    oz.show()
+
+.. warning:: Many Yellowbrick visualizers inspect the estimator for learned attributes in order to deliver rich diagnostics. You may run into visualizers that cannot use the prepredict method as-is; in that case you can manually set the learned attributes the visualizer requires on the ``PrePredict`` estimator.
+
+In the case where you've saved pre-predicted data to disk, the ``PrePredict`` estimator can load it using ``np.load``. A full workflow is described below:
+
+.. code:: python
+
+    # Phase one: fit your estimator, make inferences, and save the inferences to disk
+    np.save("y_pred.npy", y_pred)
+
+    # Import the prepredict estimator and a Yellowbrick visualizer
+    from yellowbrick.contrib.prepredict import PrePredict, REGRESSOR
+    from yellowbrick.regressor import prediction_error
+
+    # Instantiate the estimator with the pre-predicted data and pass a path to where
+    # the data has been saved on disk.
+    model = PrePredict("y_pred.npy", REGRESSOR)
+
+    # Use the visualizer with the test data; predictions are loaded from the saved file
+    oz = prediction_error(model, X_test, y_test)
+    oz.show()
+
+The ``PrePredict`` estimator can use a callable function to return pre-predicted data, or a ``str``, file-like object, or ``pathlib.Path`` to load the data from disk using ``np.load``; otherwise it simply returns the data it wraps. See the API reference for more details.
+
+API Reference
+-------------
+
+.. automodule:: yellowbrick.contrib.prepredict
+    :members: PrePredict
+    :undoc-members:
+    :show-inheritance:
diff --git a/tests/baseline_images/test_contrib/test_prepredict/test_prepredict_classifier.png b/tests/baseline_images/test_contrib/test_prepredict/test_prepredict_classifier.png
diff --git a/tests/baseline_images/test_contrib/test_prepredict/test_prepredict_regressor.png b/tests/baseline_images/test_contrib/test_prepredict/test_prepredict_regressor.png
diff --git a/tests/test_contrib/test_prepredict.py b/tests/test_contrib/test_prepredict.py
@@ -0,0 +1,183 @@
+# tests.test_contrib.test_prepredict
+# Test the prepredict estimator.
+#
+# Author:  Benjamin Bengfort <[email protected]>
+# Created: Mon Jul 12 07:07:33 2021 -0400
+#
+# ID: test_prepredict.py [] [email protected] $
+
+"""
+Test the prepredict estimator.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
+
+import pytest
+
+from io import BytesIO
+from tests.fixtures import Dataset, Split
+from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase
+
+from sklearn.naive_bayes import GaussianNB
+from sklearn.cluster import MiniBatchKMeans
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split as tts
+from sklearn.datasets import make_classification, make_regression, make_blobs
+
+from yellowbrick.contrib.prepredict import *
+from yellowbrick.regressor import PredictionError
+from yellowbrick.classifier import ClassificationReport
+
+
+##########################################################################
+## Fixtures
+##########################################################################
+
+@pytest.fixture(scope="class")
+def multiclass(request):
+    """
+    Creates a random multiclass classification dataset fixture
+    """
+    X, y = make_classification(
+        n_samples=500,
+        n_features=20,
+        n_informative=8,
+        n_redundant=2,
+        n_classes=6,
+        n_clusters_per_class=3,
+        random_state=87,
+    )
+
+    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=93)
+
+    dataset = Dataset(Split(X_train, X_test), Split(y_train, y_test))
+    request.cls.multiclass = dataset
+
+
+@pytest.fixture(scope="class")
+def continuous(request):
+    """
+    Creates a random continuous regression dataset fixture
+    """
+    X, y = make_regression(
+        n_samples=500,
+        n_features=22,
+        n_informative=8,
+        random_state=42,
+        noise=0.2,
+        bias=0.2,
+    )
+
+    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=11)
+
+    # Set a class attribute for regression
+    request.cls.continuous = Dataset(Split(X_train, X_test), Split(y_train, y_test))
+
+
+@pytest.fixture(scope="class")
+def blobs(request):
+    """
+    Create a random blobs clustering dataset fixture
+    """
+    X, y = make_blobs(
+        n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42
+    )
+
+    # Set a class attribute for blobs
+    request.cls.blobs = Dataset(X, y)
+
+
+##########################################################################
+## Tests
+##########################################################################
+
+@pytest.mark.usefixtures("multiclass")
+@pytest.mark.usefixtures("continuous")
+@pytest.mark.usefixtures("blobs")
+class TestPrePrePredictEstimator(VisualTestCase):
+    """
+    Pre-predict contrib tests.
+    """
+
+    @pytest.mark.xfail(
+        IS_WINDOWS_OR_CONDA,
+        reason="image comparison failure on Conda 3.8 and 3.9 with RMS 19.307",
+    )
+    def test_prepredict_classifier(self):
+        """
+        Test the prepredict estimator with classification report
+        """
+        # Make prepredictions
+        X, y = self.multiclass.X, self.multiclass.y
+        y_pred = GaussianNB().fit(X.train, y.train).predict(X.test)
+
+        # Create prepredict estimator with prior predictions
+        estimator = PrePredict(y_pred, CLASSIFIER)
+        assert estimator.fit(X.train, y.train) is estimator
+        assert estimator.predict(X.train) is y_pred
+        assert estimator.score(X.test, y.test) == pytest.approx(0.41, rel=1e-3)
+
+        # Test that a visualizer works with the pre-predictions.
+        viz = ClassificationReport(estimator)
+        viz.fit(None, y.train)
+        viz.score(None, y.test)
+        viz.finalize()
+
+        self.assert_images_similar(viz)
+
+    def test_prepredict_regressor(self):
+        """
+        Test the prepredict estimator with a prediction error plot
+        """
+        # Make prepredictions
+        X, y = self.continuous.X, self.continuous.y
+        y_pred = LinearRegression().fit(X.train, y.train).predict(X.test)
+
+        # Create prepredict estimator with prior predictions
+        estimator = PrePredict(y_pred, REGRESSOR)
+        assert estimator.fit(X.train, y.train) is estimator
+        assert estimator.predict(X.train) is y_pred
+        assert estimator.score(X.test, y.test) == pytest.approx(0.9999983124154966, rel=1e-2)
+
+        # Test that a visualizer works with the pre-predictions.
+        viz = PredictionError(estimator)
+        viz.fit(X.train, y.train)
+        viz.score(X.test, y.test)
+        viz.finalize()
+
+        self.assert_images_similar(viz, tol=10.0)
+
+    def test_prepredict_clusterer(self):
+        """
+        Test the prepredict estimator with a silhouette visualizer
+        """
+        X = self.blobs.X
+        y_pred = MiniBatchKMeans(random_state=831).fit(X).predict(X)
+
+         # Create prepredict estimator with prior predictions
+        estimator = PrePredict(y_pred, CLUSTERER)
+        assert estimator.fit(X) is estimator
+        assert estimator.predict(X) is y_pred
+        assert estimator.score(X) == pytest.approx(0.5477478541994333, rel=1e-2)
+
+        # NOTE: there is currently no cluster visualizer that can take advantage of
+        # the prepredict utility since they all require learned attributes.
+
+    def test_load(self):
+        """
+        Test the various ways that prepredict loads data
+        """
+        # Test callable
+        ppe = PrePredict(lambda: self.multiclass.y.test)
+        assert ppe._load() is self.multiclass.y.test
+
+        # Test file-like object, assume that str and pathlib.Path work similarly
+        f = BytesIO()
+        np.save(f, self.continuous.y.test)
+        f.seek(0)
+        ppe = PrePredict(f)
+        assert np.array_equal(ppe._load(), self.continuous.y.test)
+
+        # Test direct array-like completed in other tests.
diff --git a/yellowbrick/contrib/prepredict.py b/yellowbrick/contrib/prepredict.py
@@ -0,0 +1,104 @@
+# yellowbrick.contrib.prepredict
+# PrePredict estimator allows Yellowbrick to work with results produced by an estimator.
+#
+# Author:  Benjamin Bengfort <[email protected]>
+# Created: Mon Jul 12 07:07:33 2021 -0400
+#
+# ID: prepredict.py [] [email protected] $
+
+"""
+PrePredict estimator allows Yellowbrick to work with results produced by an estimator
+prior to the visual diagnostic workflow, particularly for inferences that require
+extensive time or compute resources.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
+
+import pathlib
+import numpy as np
+
+from sklearn.base import BaseEstimator
+from sklearn.metrics import accuracy_score, r2_score, silhouette_score
+from yellowbrick.contrib.wrapper import CLASSIFIER, CLUSTERER, REGRESSOR
+
+
+class PrePredict(BaseEstimator):
+    """
+    The Passthrough estimator allows users to specify pre-predicted results to
+    Yellowbrick without the need to input the original estimator. Note that Yellowbrick
+    often uses the learned attributes of the estimator to produce rich visual
+    diagnostics, so this estimator may not work for all Yellowbrick visualizers.
+
+    The passthrough estimator can accept data either in memory as a numpy array or it
+    can accept a string, which it interprets as a path on disk to load the data from.
+
+    Currently passthrough does not support predict_proba or decision_function methods,
+    which it could if it was passed predicted data as 2D array instead of a 1D array.
+
+    Parameters
+    ----------
+    data : array-like, func, or file-like object, string, or pathlib.Path
+        The predicted values wrapped by the estimator and returned on predict() and
+        used by the score function. The default expectation is that data is a 1D numpy
+        array of y_hat or y_pred values produced by some other estimator. Data can also
+        be a func, which is called and returned, or a file-like object, string, or
+        pathlib.Path at which point the data is loaded from disk using ``np.load``.
+
+    estimator_type : str, optional
+        One of "classifier", "regressor", "clusterer", "DensityEstimator", or
+        "outlier_detector" that allows the contrib estimator to pass the scikit-learn
+        ``is_classifier``, etc. functions. If not specified, the Yellowbrick visualizer
+        you're trying to use may error.
+    """
+
+    def __init__(self, data, estimator_type=None):
+        self.data = data
+        self._estimator_type = estimator_type
+
+    def fit(self, X, y=None):
+        """
+        Fit is a no-op, simply returning self per the scikit-learn API.
+        """
+        return self
+
+    def predict(self, X):
+        """
+        Predict returns the embedded data but does not perform any checks on the
+        validity of X (e.g. that it has the same shape as the internal data).
+        """
+        return self._load()
+
+    def score(self, X, y=None):
+        """
+        Score uses an appropriate metric for the estimator type and compares the input
+        y values with the pre-predicted values.
+        """
+        if self._estimator_type == CLASSIFIER:
+            return accuracy_score(y, self._load())
+
+        if self._estimator_type == REGRESSOR:
+            return r2_score(y, self._load())
+
+        if self._estimator_type == CLUSTERER:
+            labels = y if y is not None else self._load()
+            return silhouette_score(X, labels)
+
+        # If the estimator type is unknown return NaN since the score can't be computed.
+        return np.nan
+
+    def _load(self):
+        """
+        Loads the data by performing type checking to determine if data is a callable
+        whose result needs to be returned, or an argument that supports from disk
+        loading. If neither of these things, then assumes the data is array-like and
+        returns it directly.
+        """
+        if callable(self.data):
+            return self.data()
+
+        if hasattr(self.data, "read") or isinstance(self.data, (str, pathlib.Path)):
+            return np.load(self.data)
+
+        return self.data