"""
Polynomial regression is a type of regression analysis that models the relationship
between a predictor x and the response y as an mth-degree polynomial:
y = β₀ + β₁x + β₂x² + ... + βₘxᵐ + ε
By treating x, x², ..., xᵐ as distinct variables, we see that polynomial regression is a
special case of multiple linear regression. Therefore, we can use ordinary least squares
(OLS) estimation to estimate the vector of model parameters β = (β₀, β₁, β₂, ..., βₘ)
for polynomial regression:
β = (XᵀX)⁻¹Xᵀy = X⁺y
where X is the design matrix, y is the response vector, and X⁺ denotes the Moore-Penrose
pseudoinverse of X. In the case of polynomial regression, the design matrix is
|1 x₁ x₁² ⋯ x₁ᵐ|
X = |1 x₂ x₂² ⋯ x₂ᵐ|
|⋮ ⋮ ⋮ ⋱ ⋮ |
|1 xₙ xₙ² ⋯ xₙᵐ|
In OLS estimation, inverting XᵀX to compute X⁺ can be very numerically unstable. This
implementation sidesteps this need to invert XᵀX by computing X⁺ using singular value
decomposition (SVD):
β = VΣ⁺Uᵀy
where UΣVᵀ is an SVD of X.
References:
- https://en.wikipedia.org/wiki/Polynomial_regression
- https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse
- https://en.wikipedia.org/wiki/Numerical_methods_for_linear_least_squares
- https://en.wikipedia.org/wiki/Singular_value_decomposition
"""
import matplotlib.pyplot as plt
import numpy as np
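

# A minimal, illustrative sketch (added alongside the algorithm below, not part of it)
# of the SVD route described in the module docstring: if X = UΣVᵀ, then the OLS
# solution is β = VΣ⁺Uᵀy, which is exactly what np.linalg.pinv(X) @ y computes.
# The helper name is hypothetical.
def _demo_svd_least_squares() -> bool:
    """
    Solve a small least-squares problem via an explicit SVD and check that the
    result matches NumPy's pseudoinverse.

    >>> _demo_svd_least_squares()
    True
    """
    x = np.arange(5.0)
    design = np.vander(x, N=3, increasing=True)  # columns: 1, x, x²
    y = design @ np.array([1.0, -2.0, 0.5])  # exact quadratic response, no noise
    u, s, vt = np.linalg.svd(design, full_matrices=False)
    beta_svd = vt.T @ np.diag(1 / s) @ u.T @ y  # β = VΣ⁺Uᵀy
    beta_pinv = np.linalg.pinv(design) @ y
    return bool(np.allclose(beta_svd, beta_pinv))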


class PolynomialRegression:
    __slots__ = "degree", "params"

    def __init__(self, degree: int) -> None:
        """
        @raises ValueError: if the polynomial degree is negative
        """
        if degree < 0:
            raise ValueError("Polynomial degree must be non-negative")

        self.degree = degree
        self.params = None

    @staticmethod
    def _design_matrix(data: np.ndarray, degree: int) -> np.ndarray:
        """
        Constructs a polynomial regression design matrix for the given input data. For
        input data x = (x₁, x₂, ..., xₙ) and polynomial degree m, the design matrix is
        the Vandermonde matrix

            |1  x₁  x₁² ⋯ x₁ᵐ|
        X = |1  x₂  x₂² ⋯ x₂ᵐ|
            |⋮  ⋮   ⋮   ⋱ ⋮  |
            |1  xₙ  xₙ² ⋯ xₙᵐ|

        Reference: https://en.wikipedia.org/wiki/Vandermonde_matrix

        @param data: the input predictor values x, either for model fitting or for
                     prediction
        @param degree: the polynomial degree m
        @returns: the Vandermonde matrix X (see above)
        @raises ValueError: if input data is not N x 1

        >>> x = np.array([0, 1, 2])
        >>> PolynomialRegression._design_matrix(x, degree=0)
        array([[1],
               [1],
               [1]])
        >>> PolynomialRegression._design_matrix(x, degree=1)
        array([[1, 0],
               [1, 1],
               [1, 2]])
        >>> PolynomialRegression._design_matrix(x, degree=2)
        array([[1, 0, 0],
               [1, 1, 1],
               [1, 2, 4]])
        >>> PolynomialRegression._design_matrix(x, degree=3)
        array([[1, 0, 0, 0],
               [1, 1, 1, 1],
               [1, 2, 4, 8]])
        >>> PolynomialRegression._design_matrix(np.array([[0, 0], [0 , 0]]), degree=3)
        Traceback (most recent call last):
        ...
        ValueError: Data must have dimensions N x 1
        """
        rows, *remaining = data.shape
        if remaining:
            raise ValueError("Data must have dimensions N x 1")

        return np.vander(data, N=degree + 1, increasing=True)

    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
        """
        Computes the polynomial regression model parameters using ordinary least
        squares (OLS) estimation:

        β = (XᵀX)⁻¹Xᵀy = X⁺y

        where X⁺ denotes the Moore-Penrose pseudoinverse of the design matrix X. This
        function computes X⁺ using singular value decomposition (SVD); see the
        conditioning sketch after this class for why that route is preferred.

        References:
            - https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse
            - https://en.wikipedia.org/wiki/Singular_value_decomposition
            - https://en.wikipedia.org/wiki/Multicollinearity

        @param x_train: the predictor values x for model fitting
        @param y_train: the response values y for model fitting
        @raises ArithmeticError: if X isn't full rank, then XᵀX is singular and β
                                 doesn't exist

        >>> x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        >>> y = x**3 - 2 * x**2 + 3 * x - 5
        >>> poly_reg = PolynomialRegression(degree=3)
        >>> poly_reg.fit(x, y)
        >>> poly_reg.params
        array([-5.,  3., -2.,  1.])
        >>> poly_reg = PolynomialRegression(degree=20)
        >>> poly_reg.fit(x, y)
        Traceback (most recent call last):
        ...
        ArithmeticError: Design matrix is not full rank, can't compute coefficients

        Make sure errors don't grow too large:
        >>> coefs = np.array([-250, 50, -2, 36, 20, -12, 10, 2, -1, -15, 1])
        >>> y = PolynomialRegression._design_matrix(x, len(coefs) - 1) @ coefs
        >>> poly_reg = PolynomialRegression(degree=len(coefs) - 1)
        >>> poly_reg.fit(x, y)
        >>> np.allclose(poly_reg.params, coefs, atol=10e-3)
        True
        """
        X = PolynomialRegression._design_matrix(x_train, self.degree)  # noqa: N806
        _, cols = X.shape
        if np.linalg.matrix_rank(X) < cols:
            raise ArithmeticError(
                "Design matrix is not full rank, can't compute coefficients"
            )

        # np.linalg.pinv() computes the Moore-Penrose pseudoinverse using SVD
        self.params = np.linalg.pinv(X) @ y_train

    def predict(self, data: np.ndarray) -> np.ndarray:
        """
        Computes the predicted response values y for the given input data by
        constructing the design matrix X and evaluating y = Xβ.

        @param data: the predictor values x for prediction
        @returns: the predicted response values y = Xβ
        @raises ArithmeticError: if this function is called before the model
                                 parameters are fit

        >>> x = np.array([0, 1, 2, 3, 4])
        >>> y = x**3 - 2 * x**2 + 3 * x - 5
        >>> poly_reg = PolynomialRegression(degree=3)
        >>> poly_reg.fit(x, y)
        >>> poly_reg.predict(np.array([-1]))
        array([-11.])
        >>> poly_reg.predict(np.array([-2]))
        array([-27.])
        >>> poly_reg.predict(np.array([6]))
        array([157.])
        >>> PolynomialRegression(degree=3).predict(x)
        Traceback (most recent call last):
        ...
        ArithmeticError: Predictor hasn't been fit yet
        """
        if self.params is None:
            raise ArithmeticError("Predictor hasn't been fit yet")

        return PolynomialRegression._design_matrix(data, self.degree) @ self.params
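

# A hedged illustration (added here, not from the original file) of the numerical
# point made in the module and fit() docstrings: forming XᵀX roughly squares the
# condition number of X, so solving the normal equations loses about twice as many
# digits of accuracy as the SVD-based pseudoinverse. The helper name is hypothetical.
def _demo_normal_equations_conditioning() -> bool:
    """
    Return True if squaring the design matrix worsens its conditioning, which is
    why fit() uses np.linalg.pinv(X) rather than inverting XᵀX directly.

    >>> _demo_normal_equations_conditioning()
    True
    """
    x = np.linspace(0, 1, 50)
    design = PolynomialRegression._design_matrix(x, degree=10)
    # cond(XᵀX) ≈ cond(X)², so this comparison holds by a wide margin
    return bool(np.linalg.cond(design.T @ design) > np.linalg.cond(design))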


def main() -> None:
    """
    Fit a polynomial regression model to predict fuel efficiency using seaborn's mpg
    dataset

    >>> pass  # Placeholder, function is only for demo purposes
    """
    import seaborn as sns

    mpg_data = sns.load_dataset("mpg")

    poly_reg = PolynomialRegression(degree=2)
    poly_reg.fit(mpg_data.weight, mpg_data.mpg)

    weight_sorted = np.sort(mpg_data.weight)
    predictions = poly_reg.predict(weight_sorted)

    plt.scatter(mpg_data.weight, mpg_data.mpg, color="gray", alpha=0.5)
    plt.plot(weight_sorted, predictions, color="red", linewidth=3)
    plt.title("Predicting Fuel Efficiency Using Polynomial Regression")
    plt.xlabel("Weight (lbs)")
    plt.ylabel("Fuel Efficiency (mpg)")
    plt.show()


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    main()