From 96317a8c1d97412951b27218ef375b3ce6034845 Mon Sep 17 00:00:00 2001 From: Anton Chernyatevich Date: Wed, 28 Jul 2021 23:57:49 +0300 Subject: [PATCH] Add SGD and OHE --- .gitignore | 3 + afsklearn/_class_weight.py | 62 ++ afsklearn/_classifier_mixin.py | 78 ++- afsklearn/_encode.py | 256 ++++++++ afsklearn/_multiclass.py | 34 ++ afsklearn/_validation.py | 52 ++ afsklearn/base.py | 46 +- afsklearn/linear_model/__init__.py | 0 afsklearn/linear_model/base.py | 53 ++ afsklearn/linear_model/sgd_base.py | 675 +++++++++++++++++++++ afsklearn/linear_model/sgd_classifier.py | 329 ++++++++++ afsklearn/neural_network/base.py | 48 +- afsklearn/neural_network/mlp_classifier.py | 8 +- afsklearn/patched_modules.yml | 10 + afsklearn/preprocessing/_encoders.py | 588 ++++++++++++++++++ afsklearn/preprocessing/_label.py | 120 ++++ requirements.txt | 2 +- setup.cfg | 15 + tests/test_one_hot_encoder.py | 27 + tests/test_sgd_classifier.py | 31 + 20 files changed, 2369 insertions(+), 68 deletions(-) create mode 100644 afsklearn/_class_weight.py create mode 100644 afsklearn/_encode.py create mode 100644 afsklearn/_multiclass.py create mode 100644 afsklearn/linear_model/__init__.py create mode 100644 afsklearn/linear_model/base.py create mode 100644 afsklearn/linear_model/sgd_base.py create mode 100644 afsklearn/linear_model/sgd_classifier.py create mode 100644 afsklearn/preprocessing/_encoders.py create mode 100644 afsklearn/preprocessing/_label.py create mode 100644 tests/test_one_hot_encoder.py create mode 100644 tests/test_sgd_classifier.py diff --git a/.gitignore b/.gitignore index 7c0cad9..c4b1e59 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,8 @@ build dist *.egg-info +# Static typing +.mypy_cache + # Virtual env venv diff --git a/afsklearn/_class_weight.py b/afsklearn/_class_weight.py new file mode 100644 index 0000000..a446605 --- /dev/null +++ b/afsklearn/_class_weight.py @@ -0,0 +1,62 @@ +import numpy as np # FIXME +from sklearn.utils import _deprecate_positional_args + + +@_deprecate_positional_args +def compute_class_weight(class_weight, *, classes, y): + """Estimate class weights for unbalanced datasets. + Parameters + ---------- + class_weight : dict, 'balanced' or None + If 'balanced', class weights will be given by + ``n_samples / (n_classes * np.bincount(y))``. + If a dictionary is given, keys are classes and values + are corresponding class weights. + If None is given, the class weights will be uniform. + classes : ndarray + Array of the classes occurring in the data, as given by + ``np.unique(y_org)`` with ``y_org`` the original class labels. + y : array-like of shape (n_samples,) + Array of original class labels per sample. + Returns + ------- + class_weight_vect : ndarray of shape (n_classes,) + Array with class_weight_vect[i] the weight for i-th class. + References + ---------- + The "balanced" heuristic is inspired by + Logistic Regression in Rare Events Data, King, Zen, 2001. + """ + # Import error caused by circular imports. + from .preprocessing._label import afLabelEncoder + + if set(y) - set(classes): + raise ValueError("classes should include all valid labels that can " + "be in y") + if class_weight is None or len(class_weight) == 0: + # uniform class weights + weight = np.ones(classes.shape[0], dtype=np.float64, order='C') + elif class_weight == 'balanced': + # Find the weight of each class as present in y. 
+ le = afLabelEncoder() + y_ind = le.fit_transform(y) + if not all(np.in1d(classes, le.classes_)): + raise ValueError("classes should have valid labels that are in y") + + recip_freq = len(y) / (len(le.classes_) * + np.bincount(y_ind).astype(np.float64)) + weight = recip_freq[le.transform(classes)] + else: + # user-defined dictionary + weight = np.ones(classes.shape[0], dtype=np.float64, order='C') + if not isinstance(class_weight, dict): + raise ValueError("class_weight must be dict, 'balanced', or None," + " got: %r" % class_weight) + for c in class_weight: + i = np.searchsorted(classes, c) + if i >= len(classes) or classes[i] != c: + raise ValueError("Class label {} not present.".format(c)) + else: + weight[i] = class_weight[c] + + return weight diff --git a/afsklearn/_classifier_mixin.py b/afsklearn/_classifier_mixin.py index db81060..2d87ed4 100644 --- a/afsklearn/_classifier_mixin.py +++ b/afsklearn/_classifier_mixin.py @@ -1,5 +1,6 @@ import arrayfire as af + def _weighted_sum(sample_score, sample_weight, normalize=False): if normalize: return np.average(sample_score, weights=sample_weight) @@ -8,6 +9,7 @@ def _weighted_sum(sample_score, sample_weight, normalize=False): else: return sample_score.sum() + def _check_targets(y_true, y_pred): """Check that y_true and y_pred belong to the same classification task This converts multiclass or binary types to a common shape, and raises a @@ -63,8 +65,6 @@ def _check_targets(y_true, y_pred): return y_type, y_true, y_pred - - def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): """Accuracy classification score. In multilabel classification, this function computes subset accuracy: @@ -123,6 +123,7 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) + class afClassifierMixin: """ArrayFire enabled Mixin class for all classifiers in scikit-learn.""" @@ -147,8 +148,77 @@ def score(self, X, y, sample_weight=None): score : float Mean accuracy of self.predict(X) wrt. y. """ - #return accuracy_score(y, self.predict(X), sample_weight=sample_weight) - return #TMP + # return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + return # TMP def _more_tags(self): return {'requires_y': True} + + +class afLinearClassifierMixin(afClassifierMixin): + """Mixin for linear classifiers. + Handles prediction for sparse and dense X. + """ + + def decision_function(self, X): + """ + Predict confidence scores for samples. + The confidence score for a sample is proportional to the signed + distance of that sample to the hyperplane. + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + Returns + ------- + array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes) + Confidence scores per (sample, class) combination. In the binary + case, confidence score for self.classes_[1] where >0 means this + class would be predicted. + """ + check_is_fitted(self) + + X = check_array(X, accept_sparse='csr') + + n_features = self.coef_.shape[1] + if X.shape[1] != n_features: + raise ValueError("X has %d features per sample; expecting %d" + % (X.shape[1], n_features)) + + scores = safe_sparse_dot(X, self.coef_.T, + dense_output=True) + self.intercept_ + return scores.ravel() if scores.shape[1] == 1 else scores + + def predict(self, X): + """ + Predict class labels for samples in X. + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. 
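As a quick illustration (a sketch, not part of the diff), the 'balanced' heuristic implemented by compute_class_weight above reduces to n_samples / (n_classes * np.bincount(y_ind)), so the rarer class receives the larger weight:

import numpy as np

y = np.array([0, 0, 0, 1])                      # three samples of class 0, one of class 1
classes = np.unique(y)
weights = len(y) / (len(classes) * np.bincount(y))
print(weights)                                  # [0.66666667 2.        ]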
+ Returns + ------- + C : array, shape [n_samples] + Predicted class label per sample. + """ + scores = self.decision_function(X) + if len(scores.shape) == 1: + indices = (scores > 0).astype(int) + else: + indices = scores.argmax(axis=1) + return self.classes_[indices] + + def _predict_proba_lr(self, X): + """Probability estimation for OvR logistic regression. + Positive class probabilities are computed as + 1. / (1. + np.exp(-self.decision_function(X))); + multiclass is handled by normalizing that over all classes. + """ + prob = self.decision_function(X) + expit(prob, out=prob) + if prob.ndim == 1: + return np.vstack([1 - prob, prob]).T + else: + # OvR normalization, like LibLinear's predict_probability + prob /= prob.sum(axis=1).reshape((prob.shape[0], -1)) + return prob diff --git a/afsklearn/_encode.py b/afsklearn/_encode.py new file mode 100644 index 0000000..5b4aa31 --- /dev/null +++ b/afsklearn/_encode.py @@ -0,0 +1,256 @@ +from typing import NamedTuple + +import numpy as np # FIXME + +from ._validation import is_scalar_nan + + +def _encode(values, *, uniques, check_unknown=True): + """Helper function to encode values into [0, n_uniques - 1]. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + The numpy method has the limitation that the `uniques` need to + be sorted. Importantly, this is not checked but assumed to already be + the case. The calling method needs to ensure this for all non-object + values. + Parameters + ---------- + values : ndarray + Values to encode. + uniques : ndarray + The unique values in `values`. If the dtype is not object, then + `uniques` needs to be sorted. + check_unknown : bool, default=True + If True, check for values in `values` that are not in `unique` + and raise an error. This is ignored for object dtype, and treated as + True in this case. This parameter is useful for + _BaseEncoder._transform() to avoid calling _check_unknown() + twice. + Returns + ------- + encoded : ndarray + Encoded values + """ + if values.dtype.kind in 'OUS': + try: + return _map_to_integer(values, uniques) + except KeyError as e: + raise ValueError(f"y contains previously unseen labels: {str(e)}") + else: + if check_unknown: + diff = _check_unknown(values, uniques) + if diff: + raise ValueError(f"y contains previously unseen labels: " + f"{str(diff)}") + return np.searchsorted(uniques, values) + + +def _unique(values, *, return_inverse=False): + """Helper function to find unique values with support for python objects. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + Parameters + ---------- + values : ndarray + Values to check for unknowns. + return_inverse : bool, default=False + If True, also return the indices of the unique values. + Returns + ------- + unique : ndarray + The sorted unique values. + unique_inverse : ndarray + The indices to reconstruct the original array from the unique array. + Only provided if `return_inverse` is True. 
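A minimal plain-numpy sketch of the fast path used by _encode above for non-object dtypes: `uniques` must already be sorted, and each value is mapped to its position by binary search:

import numpy as np

uniques = np.array([2, 5, 9])             # sorted, as _encode requires
values = np.array([5, 2, 9, 9])
print(np.searchsorted(uniques, values))   # [1 0 2 2]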
+ """ + if values.dtype == object: + return _unique_python(values, return_inverse=return_inverse) + # numerical + out = np.unique(values, return_inverse=return_inverse) + + if return_inverse: + uniques, inverse = out + else: + uniques = out + + # np.unique will have duplicate missing values at the end of `uniques` + # here we clip the nans and remove it from uniques + if uniques.size and is_scalar_nan(uniques[-1]): + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx + 1] + if return_inverse: + inverse[inverse > nan_idx] = nan_idx + + if return_inverse: + return uniques, inverse + return uniques + + +def _unique_python(values, *, return_inverse): + # Only used in `_uniques`, see docstring there for details + try: + uniques_set = set(values) + uniques_set, missing_values = _extract_missing(uniques_set) + + uniques = sorted(uniques_set) + uniques.extend(missing_values.to_list()) + uniques = np.array(uniques, dtype=values.dtype) + except TypeError: + types = sorted(t.__qualname__ + for t in set(type(v) for v in values)) + raise TypeError("Encoders require their input to be uniformly " + f"strings or numbers. Got {types}") + + if return_inverse: + return uniques, _map_to_integer(values, uniques) + + return uniques + + +def _map_to_integer(values, uniques): + """Map values based on its position in uniques.""" + table = _nandict({val: i for i, val in enumerate(uniques)}) + return np.array([table[v] for v in values]) + + +class _nandict(dict): + """Dictionary with support for nans.""" + + def __init__(self, mapping): + super().__init__(mapping) + for key, value in mapping.items(): + if is_scalar_nan(key): + self.nan_value = value + break + + def __missing__(self, key): + if hasattr(self, 'nan_value') and is_scalar_nan(key): + return self.nan_value + raise KeyError(key) + + +class MissingValues(NamedTuple): + """Data class for missing data information""" + nan: bool + none: bool + + def to_list(self): + """Convert tuple to a list where None is always first.""" + output = [] + if self.none: + output.append(None) + if self.nan: + output.append(np.nan) + return output + + +def _extract_missing(values): + """Extract missing values from `values`. + Parameters + ---------- + values: set + Set of values to extract missing from. + Returns + ------- + output: set + Set with missing values extracted. + missing_values: MissingValues + Object with missing value information. + """ + missing_values_set = {value for value in values + if value is None or is_scalar_nan(value)} + + if not missing_values_set: + return values, MissingValues(nan=False, none=False) + + if None in missing_values_set: + if len(missing_values_set) == 1: + output_missing_values = MissingValues(nan=False, none=True) + else: + # If there is more than one missing value, then it has to be + # float('nan') or np.nan + output_missing_values = MissingValues(nan=True, none=True) + else: + output_missing_values = MissingValues(nan=True, none=False) + + # create set without the missing values + output = values - missing_values_set + return output, output_missing_values + + +def _check_unknown(values, known_values, return_mask=False): + """ + Helper function to check for unknowns in values to be encoded. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + Parameters + ---------- + values : array + Values to check for unknowns. + known_values : array + Known values. Must be unique. 
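A small sketch (plain Python/numpy, mirroring _extract_missing above) of how None and NaN are pulled out of the value set and reported separately:

import numpy as np

values = {"a", None, float("nan"), "b"}
missing = {v for v in values if v is None or (isinstance(v, float) and np.isnan(v))}
print(sorted(values - missing))   # ['a', 'b']
print(missing)                    # {None, nan} -> MissingValues(nan=True, none=True)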
+ return_mask : bool, default=False + If True, return a mask of the same shape as `values` indicating + the valid values. + Returns + ------- + diff : list + The unique values present in `values` and not in `know_values`. + valid_mask : boolean array + Additionally returned if ``return_mask=True``. + """ + valid_mask = None + + if values.dtype.kind in 'OUS': + values_set = set(values) + values_set, missing_in_values = _extract_missing(values_set) + + uniques_set = set(known_values) + uniques_set, missing_in_uniques = _extract_missing(uniques_set) + diff = values_set - uniques_set + + nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan + none_in_diff = missing_in_values.none and not missing_in_uniques.none + + def is_valid(value): + return (value in uniques_set or + missing_in_uniques.none and value is None or + missing_in_uniques.nan and is_scalar_nan(value)) + + if return_mask: + if diff or nan_in_diff or none_in_diff: + valid_mask = np.array([is_valid(value) for value in values]) + else: + valid_mask = np.ones(len(values), dtype=bool) + + diff = list(diff) + if none_in_diff: + diff.append(None) + if nan_in_diff: + diff.append(np.nan) + else: + unique_values = np.unique(values) + diff = np.setdiff1d(unique_values, known_values, + assume_unique=True) + if return_mask: + if diff.size: + valid_mask = np.in1d(values, known_values) + else: + valid_mask = np.ones(len(values), dtype=bool) + + # check for nans in the known_values + if np.isnan(known_values).any(): + diff_is_nan = np.isnan(diff) + if diff_is_nan.any(): + # removes nan from valid_mask + if diff.size and return_mask: + is_nan = np.isnan(values) + valid_mask[is_nan] = 1 + + # remove nan from diff + diff = diff[~diff_is_nan] + diff = list(diff) + + if return_mask: + return diff, valid_mask + return diff diff --git a/afsklearn/_multiclass.py b/afsklearn/_multiclass.py new file mode 100644 index 0000000..6d9c644 --- /dev/null +++ b/afsklearn/_multiclass.py @@ -0,0 +1,34 @@ +import numpy as np # FIXME + +from .base import unique_labels + + +def _check_partial_fit_first_call(clf, classes=None): + """Private helper function for factorizing common classes param logic. + Estimators that implement the ``partial_fit`` API need to be provided with + the list of possible classes at the first call to partial_fit. + Subsequent calls to partial_fit should check that ``classes`` is still + consistent with a previous value of ``clf.classes_`` when provided. + This function returns True if it detects that this was the first call to + ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also + set on ``clf``. 
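The numeric branch of _check_unknown above boils down to a set difference plus a membership mask; a plain-numpy sketch:

import numpy as np

values = np.array([1, 2, 4])
known_values = np.array([1, 2, 3])
diff = np.setdiff1d(np.unique(values), known_values, assume_unique=True)
valid_mask = np.in1d(values, known_values)
print(diff, valid_mask)   # [4] [ True  True False]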
+ """ + if getattr(clf, 'classes_', None) is None and classes is None: + raise ValueError("classes must be passed on the first call " + "to partial_fit.") + + elif classes is not None: + if getattr(clf, 'classes_', None) is not None: + if not np.array_equal(clf.classes_, unique_labels(classes)): + raise ValueError( + "`classes=%r` is not the same as on last call " + "to partial_fit, was: %r" % (classes, clf.classes_)) + + else: + # This is the first call to partial_fit + clf.classes_ = unique_labels(classes) + return True + + # classes is None and clf.classes_ has already previously been set: + # nothing to do + return False diff --git a/afsklearn/_validation.py b/afsklearn/_validation.py index 1620c64..d58ab52 100644 --- a/afsklearn/_validation.py +++ b/afsklearn/_validation.py @@ -11,9 +11,11 @@ from sklearn.utils.validation import _deprecate_positional_args from sklearn._config import get_config as _get_config + def _object_dtype_isnan(X): return X != X + def is_scalar_nan(x): """Tests if x is NaN. This function is meant to overcome the issue that np.isnan does not allow @@ -60,6 +62,56 @@ def check_consistent_length(*arrays): " samples: %r" % [int(l) for l in lengths]) +def _check_sample_weight(sample_weight, X, dtype=None, copy=False): + """Validate sample weights. + Note that passing sample_weight=None will output an array of ones. + Therefore, in some cases, you may want to protect the call with: + if sample_weight is not None: + sample_weight = _check_sample_weight(...) + Parameters + ---------- + sample_weight : {ndarray, Number or None}, shape (n_samples,) + Input sample weights. + X : {ndarray, list, sparse matrix} + Input data. + dtype: dtype, default=None + dtype of the validated `sample_weight`. + If None, and the input `sample_weight` is an array, the dtype of the + input is preserved; otherwise an array with the default numpy dtype + is be allocated. If `dtype` is not one of `float32`, `float64`, + `None`, the output will be of dtype `float64`. + copy : bool, default=False + If True, a copy of sample_weight will be created. + Returns + ------- + sample_weight : ndarray of shape (n_samples,) + Validated sample weight. It is guaranteed to be "C" contiguous. + """ + n_samples = _num_samples(X) + + if dtype is not None and dtype not in [np.float32, np.float64]: + dtype = np.float64 + + if sample_weight is None: + sample_weight = np.ones(n_samples, dtype=dtype) + elif isinstance(sample_weight, numbers.Number): + sample_weight = np.full(n_samples, sample_weight, dtype=dtype) + else: + if dtype is None: + dtype = [np.float64, np.float32] + sample_weight = check_array( + sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, + order="C", copy=copy + ) + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError("sample_weight.shape == {}, expected {}!" 
+ .format(sample_weight.shape, (n_samples,))) + return sample_weight + + def _safe_accumulator_op(op, x, *args, **kwargs): """ This function provides numpy accumulator functions with a float64 dtype diff --git a/afsklearn/base.py b/afsklearn/base.py index 097d2df..e66cfe2 100644 --- a/afsklearn/base.py +++ b/afsklearn/base.py @@ -1,15 +1,16 @@ -import numbers -import warnings +from collections.abc import Sequence +from itertools import chain import arrayfire as af import numpy as np import scipy.sparse as sp -from numpy.core.numeric import ComplexWarning +from scipy.sparse.base import spmatrix from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing import LabelBinarizer +from sklearn.utils.validation import _deprecate_positional_args -from ._validation import ( - _assert_all_finite, _ensure_no_complex_data, _num_samples, _safe_accumulator_op, check_array, - check_consistent_length, check_X_y, column_or_1d) +from ._validation import (_assert_all_finite, _num_samples, check_array, + check_is_fitted, check_X_y, column_or_1d) # Class inheriting from BaseEstimator @@ -100,24 +101,10 @@ def _validate_data(self, X, y=None, reset=True, # Class inheriting from TransformerMixin # all methods that touch np.array are replaced # with ArrayFire compatible functionality -class afTransformerMixin(TransformerMixin): - pass -import numbers -import warnings -from collections.abc import Sequence -from itertools import chain -import arrayfire as af -#import numpy as np -import numpy -import numpy as np -import scipy.sparse as sp -from scipy.sparse.base import spmatrix -from sklearn.preprocessing import LabelBinarizer -from sklearn.utils.validation import _deprecate_positional_args - -from ._validation import _num_samples, check_array, check_is_fitted, column_or_1d +class afTransformerMixin(TransformerMixin): + pass def _unique_multiclass(y): @@ -139,6 +126,7 @@ def _unique_indicator(y): 'multilabel-indicator': _unique_indicator, } + def unique_labels(*ys): """Extract an ordered array of unique labels @@ -205,6 +193,7 @@ def unique_labels(*ys): return np.array(sorted(ys_labels)) + def is_multilabel(y): """ Check if ``y`` is in a multilabel format. @@ -250,6 +239,7 @@ def is_multilabel(y): return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint _is_integral_float(labels)) + def type_of_target(y): """Determine the type of data indicated by the target. @@ -370,6 +360,7 @@ def type_of_target(y): else: return 'binary' # [1, 2] or [["a"], ["b"]] + def _inverse_binarize_multiclass(y, classes): """Inverse label binarization transformation for multiclass. 
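For reference, typical outputs of the upstream scikit-learn helpers that the af versions of unique_labels and type_of_target above are meant to mirror (a sketch, not part of the diff):

import numpy as np
from sklearn.utils.multiclass import type_of_target, unique_labels

print(type_of_target(np.array([0, 1, 1, 0])))      # 'binary'
print(type_of_target(np.array([0.1, 0.6, 0.8])))   # 'continuous'
print(unique_labels([1, 2, 2], [3]))               # [1 2 3]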
@@ -459,12 +450,13 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): def af_in1d(arr0, arr1): - #temporarily perform computation in numy, potentially change to arrayfire + # temporarily perform computation in numy, potentially change to arrayfire #a0 = arr0.to_ndarray() #a1 = arr1.to_ndarray() isin = np.in1d(arr0, arr1) return isin + @_deprecate_positional_args def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): @@ -586,7 +578,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, y_in_classes = af.interop.from_ndarray(y_in_classes, copy=True) y[y_in_classes] y_seen = y[y_in_classes] - y_seen = y_seen#.to_ndarray() + y_seen = y_seen # .to_ndarray() indices = np.searchsorted(sorted_class, y_seen) indptr = np.hstack((0, np.cumsum(y_in_classes))) @@ -594,8 +586,8 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, data.fill(pos_label) Y = data - #Y = sp.csr_matrix((data, indices, indptr), - #shape=(n_samples, n_classes)) + # Y = sp.csr_matrix((data, indices, indptr), + # shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": Y = sp.csr_matrix(y) if pos_label != 1: @@ -607,7 +599,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, "binarization" % y_type) if not sparse_output: - #Y = Y.toarray() #TODO: test if ndarray, then cast if not + # Y = Y.toarray() #TODO: test if ndarray, then cast if not Y = Y.astype(int, copy=False) if neg_label != 0: diff --git a/afsklearn/linear_model/__init__.py b/afsklearn/linear_model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/afsklearn/linear_model/base.py b/afsklearn/linear_model/base.py new file mode 100644 index 0000000..0a4033d --- /dev/null +++ b/afsklearn/linear_model/base.py @@ -0,0 +1,53 @@ +import numpy as np # FIXME +import scipy.sparse as sp +from sklearn.linear_model._base import SPARSE_INTERCEPT_DECAY +from sklearn.utils._seq_dataset import ArrayDataset32, ArrayDataset64, CSRDataset32, CSRDataset64 + +from .._validation import check_random_state + + +def make_dataset(X, y, sample_weight, random_state=None): + """Create ``Dataset`` abstraction for sparse and dense inputs. + This also returns the ``intercept_decay`` which is different + for sparse datasets. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data + y : array-like, shape (n_samples, ) + Target values. + sample_weight : numpy array of shape (n_samples,) + The weight of each sample + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset shuffling and noise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
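For orientation, the upstream scikit-learn label_binarize that the routine above is adapted from produces a dense indicator per class (a reference sketch, not a claim about the patched dense path, which currently returns the raw data array):

import numpy as np
from sklearn.preprocessing import label_binarize

print(label_binarize([1, 6], classes=[1, 2, 4, 6]))
# [[1 0 0 0]
#  [0 0 0 1]]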
+ Returns + ------- + dataset + The ``Dataset`` abstraction + intercept_decay + The intercept decay + """ + + rng = check_random_state(random_state) + # seed should never be 0 in SequentialDataset64 + seed = rng.randint(1, np.iinfo(np.int32).max) + + if X.dtype == np.float32: + CSRData = CSRDataset32 + ArrayData = ArrayDataset32 + else: + CSRData = CSRDataset64 + ArrayData = ArrayDataset64 + + if sp.issparse(X): + dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight, + seed=seed) + intercept_decay = SPARSE_INTERCEPT_DECAY + else: + X = np.ascontiguousarray(X) + dataset = ArrayData(X, y, sample_weight, seed=seed) + intercept_decay = 1.0 + + return dataset, intercept_decay diff --git a/afsklearn/linear_model/sgd_base.py b/afsklearn/linear_model/sgd_base.py new file mode 100644 index 0000000..01313e6 --- /dev/null +++ b/afsklearn/linear_model/sgd_base.py @@ -0,0 +1,675 @@ +import warnings +from abc import ABCMeta, abstractmethod + +import numpy as np # FIXME +from joblib import Parallel +from sklearn.base import clone, is_classifier +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model._base import SparseCoefMixin +from sklearn.linear_model._sgd_fast import ( # FIXME + EpsilonInsensitive, Hinge, Huber, Log, ModifiedHuber, SquaredEpsilonInsensitive, SquaredHinge, SquaredLoss, + _plain_sgd) +from sklearn.linear_model._stochastic_gradient import DEFAULT_EPSILON, LEARNING_RATE_TYPES, MAX_INT, PENALTY_TYPES +from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit +from sklearn.utils import deprecated +from sklearn.utils.fixes import _joblib_parallel_args, delayed +from sklearn.utils.validation import _deprecate_positional_args + +from .._class_weight import compute_class_weight +from .._classifier_mixin import afLinearClassifierMixin +from .._multiclass import _check_partial_fit_first_call +from .._validation import _check_sample_weight, check_random_state, check_X_y +from ..base import afBaseEstimator +from .base import make_dataset + + +class _ValidationScoreCallback: + """Callback for early stopping based on validation score""" + + def __init__(self, estimator, X_val, y_val, sample_weight_val, + classes=None): + self.estimator = clone(estimator) + self.estimator.t_ = 1 # to pass check_is_fitted + if classes is not None: + self.estimator.classes_ = classes + self.X_val = X_val + self.y_val = y_val + self.sample_weight_val = sample_weight_val + + def __call__(self, coef, intercept): + est = self.estimator + est.coef_ = coef.reshape(1, -1) + est.intercept_ = np.atleast_1d(intercept) + return est.score(self.X_val, self.y_val, self.sample_weight_val) + + +class afBaseSGD(SparseCoefMixin, afBaseEstimator, metaclass=ABCMeta): + """Base class for SGD classification and regression.""" + @_deprecate_positional_args + def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, + shuffle=True, verbose=0, epsilon=0.1, random_state=None, + learning_rate="optimal", eta0=0.0, power_t=0.5, + early_stopping=False, validation_fraction=0.1, + n_iter_no_change=5, warm_start=False, average=False): + self.loss = loss + self.penalty = penalty + self.learning_rate = learning_rate + self.epsilon = epsilon + self.alpha = alpha + self.C = C + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.shuffle = shuffle + self.random_state = random_state + self.verbose = verbose + self.eta0 = eta0 + self.power_t = power_t + self.early_stopping = early_stopping + self.validation_fraction = 
validation_fraction + self.n_iter_no_change = n_iter_no_change + self.warm_start = warm_start + self.average = average + self.max_iter = max_iter + self.tol = tol + # current tests expect init to do parameter validation + # but we are not allowed to set attributes + self._validate_params() + + def set_params(self, **kwargs): + """Set and validate the parameters of estimator. + Parameters + ---------- + **kwargs : dict + Estimator parameters. + Returns + ------- + self : object + Estimator instance. + """ + super().set_params(**kwargs) + self._validate_params() + return self + + @abstractmethod + def fit(self, X, y): + """Fit model.""" + + def _validate_params(self, for_partial_fit=False): + """Validate input params. """ + if not isinstance(self.shuffle, bool): + raise ValueError("shuffle must be either True or False") + if not isinstance(self.early_stopping, bool): + raise ValueError("early_stopping must be either True or False") + if self.early_stopping and for_partial_fit: + raise ValueError("early_stopping should be False with partial_fit") + if self.max_iter is not None and self.max_iter <= 0: + raise ValueError("max_iter must be > zero. Got %f" % self.max_iter) + if not (0.0 <= self.l1_ratio <= 1.0): + raise ValueError("l1_ratio must be in [0, 1]") + if self.alpha < 0.0: + raise ValueError("alpha must be >= 0") + if self.n_iter_no_change < 1: + raise ValueError("n_iter_no_change must be >= 1") + if not (0.0 < self.validation_fraction < 1.0): + raise ValueError("validation_fraction must be in range (0, 1)") + if self.learning_rate in ("constant", "invscaling", "adaptive"): + if self.eta0 <= 0.0: + raise ValueError("eta0 must be > 0") + if self.learning_rate == "optimal" and self.alpha == 0: + raise ValueError("alpha must be > 0 since " + "learning_rate is 'optimal'. alpha is used " + "to compute the optimal learning rate.") + + # raises ValueError if not registered + self._get_penalty_type(self.penalty) + self._get_learning_rate_type(self.learning_rate) + + if self.loss not in self.loss_functions: + raise ValueError("The loss %s is not supported. " % self.loss) + + def _get_loss_function(self, loss): + """Get concrete ``LossFunction`` object for str ``loss``. """ + try: + loss_ = self.loss_functions[loss] + loss_class, args = loss_[0], loss_[1:] + if loss in ('huber', 'epsilon_insensitive', + 'squared_epsilon_insensitive'): + args = (self.epsilon, ) + return loss_class(*args) + except KeyError as e: + raise ValueError("The loss %s is not supported. " % loss) from e + + def _get_learning_rate_type(self, learning_rate): + try: + return LEARNING_RATE_TYPES[learning_rate] + except KeyError as e: + raise ValueError("learning rate %s " + "is not supported. " % learning_rate) from e + + def _get_penalty_type(self, penalty): + penalty = str(penalty).lower() + try: + return PENALTY_TYPES[penalty] + except KeyError as e: + raise ValueError("Penalty %s is not supported. " % penalty) from e + + def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, + intercept_init=None): + """Allocate mem for parameters; initialize if provided.""" + if n_classes > 2: + # allocate coef_ for multi-class + if coef_init is not None: + coef_init = np.asarray(coef_init, order="C") + if coef_init.shape != (n_classes, n_features): + raise ValueError("Provided ``coef_`` does not match " + "dataset. 
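_validate_params above runs at construction time, so bad hyper-parameters fail fast; a quick sketch, assuming the patched afsklearn package and its arrayfire dependency are importable:

from afsklearn.linear_model.sgd_classifier import SGDClassifier

try:
    SGDClassifier(alpha=-1.0)    # rejected before fit is ever called
except ValueError as exc:
    print(exc)                   # alpha must be >= 0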
") + self.coef_ = coef_init + else: + self.coef_ = np.zeros((n_classes, n_features), + dtype=np.float64, order="C") + + # allocate intercept_ for multi-class + if intercept_init is not None: + intercept_init = np.asarray(intercept_init, order="C") + if intercept_init.shape != (n_classes, ): + raise ValueError("Provided intercept_init " + "does not match dataset.") + self.intercept_ = intercept_init + else: + self.intercept_ = np.zeros(n_classes, dtype=np.float64, + order="C") + else: + # allocate coef_ for binary problem + if coef_init is not None: + coef_init = np.asarray(coef_init, dtype=np.float64, + order="C") + coef_init = coef_init.ravel() + if coef_init.shape != (n_features,): + raise ValueError("Provided coef_init does not " + "match dataset.") + self.coef_ = coef_init + else: + self.coef_ = np.zeros(n_features, + dtype=np.float64, + order="C") + + # allocate intercept_ for binary problem + if intercept_init is not None: + intercept_init = np.asarray(intercept_init, dtype=np.float64) + if intercept_init.shape != (1,) and intercept_init.shape != (): + raise ValueError("Provided intercept_init " + "does not match dataset.") + self.intercept_ = intercept_init.reshape(1,) + else: + self.intercept_ = np.zeros(1, dtype=np.float64, order="C") + + # initialize average parameters + if self.average > 0: + self._standard_coef = self.coef_ + self._standard_intercept = self.intercept_ + self._average_coef = np.zeros(self.coef_.shape, + dtype=np.float64, + order="C") + self._average_intercept = np.zeros(self._standard_intercept.shape, + dtype=np.float64, + order="C") + + def _make_validation_split(self, y): + """Split the dataset between training set and validation set. + Parameters + ---------- + y : ndarray of shape (n_samples, ) + Target values. + Returns + ------- + validation_mask : ndarray of shape (n_samples, ) + Equal to 1 on the validation set, 0 on the training set. + """ + n_samples = y.shape[0] + validation_mask = np.zeros(n_samples, dtype=np.uint8) + if not self.early_stopping: + # use the full set for training, with an empty validation set + return validation_mask + + if is_classifier(self): + splitter_type = StratifiedShuffleSplit + else: + splitter_type = ShuffleSplit + cv = splitter_type(test_size=self.validation_fraction, + random_state=self.random_state) + idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y)) + if idx_train.shape[0] == 0 or idx_val.shape[0] == 0: + raise ValueError( + "Splitting %d samples into a train set and a validation set " + "with validation_fraction=%r led to an empty set (%d and %d " + "samples). Please either change validation_fraction, increase " + "number of samples, or disable early_stopping." 
+ % (n_samples, self.validation_fraction, idx_train.shape[0], + idx_val.shape[0])) + + validation_mask[idx_val] = 1 + return validation_mask + + def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, + classes=None): + if not self.early_stopping: + return None + + return _ValidationScoreCallback( + self, X[validation_mask], y[validation_mask], + sample_weight[validation_mask], classes=classes) + + # mypy error: Decorated property not supported + @deprecated("Attribute standard_coef_ was deprecated " # type: ignore + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") + @property + def standard_coef_(self): + return self._standard_coef + + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute standard_intercept_ was deprecated " + "in version 0.23 and will be removed in 1.0 (renaming of 0.25)." + ) + @property + def standard_intercept_(self): + return self._standard_intercept + + # mypy error: Decorated property not supported + @deprecated("Attribute average_coef_ was deprecated " # type: ignore + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") + @property + def average_coef_(self): + return self._average_coef + + # mypy error: Decorated property not supported + @deprecated("Attribute average_intercept_ was deprecated " # type: ignore + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") + @property + def average_intercept_(self): + return self._average_intercept + + +class afBaseSGDClassifier(afLinearClassifierMixin, afBaseSGD, metaclass=ABCMeta): + + loss_functions = { + "hinge": (Hinge, 1.0), + "squared_hinge": (SquaredHinge, 1.0), + "perceptron": (Hinge, 0.0), + "log": (Log, ), + "modified_huber": (ModifiedHuber, ), + "squared_loss": (SquaredLoss, ), + "huber": (Huber, DEFAULT_EPSILON), + "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), + "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, + DEFAULT_EPSILON), + } + + @abstractmethod + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, + shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, + random_state=None, learning_rate="optimal", eta0=0.0, + power_t=0.5, early_stopping=False, + validation_fraction=0.1, n_iter_no_change=5, + class_weight=None, warm_start=False, average=False): + + super().__init__( + loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, + shuffle=shuffle, verbose=verbose, epsilon=epsilon, + random_state=random_state, learning_rate=learning_rate, eta0=eta0, + power_t=power_t, early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, warm_start=warm_start, + average=average) + self.class_weight = class_weight + self.n_jobs = n_jobs + + def _partial_fit(self, X, y, alpha, C, + loss, learning_rate, max_iter, + classes, sample_weight, + coef_init, intercept_init): + X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, + order="C", accept_large_sparse=False) + + n_samples, n_features = X.shape + + _check_partial_fit_first_call(self, classes) + + n_classes = self.classes_.shape[0] + + # Allocate datastructures from input arguments + self._expanded_class_weight = compute_class_weight( + self.class_weight, classes=self.classes_, y=y) + sample_weight = _check_sample_weight(sample_weight, X) + + if getattr(self, "coef_", None) is None or coef_init is not 
None: + self._allocate_parameter_mem(n_classes, n_features, + coef_init, intercept_init) + elif n_features != self.coef_.shape[-1]: + raise ValueError("Number of features %d does not match previous " + "data %d." % (n_features, self.coef_.shape[-1])) + + self.loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + if n_classes > 2: + self._fit_multiclass(X, y, alpha=alpha, C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter) + elif n_classes == 2: + self._fit_binary(X, y, alpha=alpha, C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter) + else: + raise ValueError( + "The number of classes has to be greater than one;" + " got %d class" % n_classes) + + return self + + def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, + intercept_init=None, sample_weight=None): + self._validate_params() + if hasattr(self, "classes_"): + self.classes_ = None + + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) + + # labels can be encoded as float, int, or string literals + # np.unique sorts in asc order; largest class id is positive class + classes = np.unique(y) + + if self.warm_start and hasattr(self, "coef_"): + if coef_init is None: + coef_init = self.coef_ + if intercept_init is None: + intercept_init = self.intercept_ + else: + self.coef_ = None + self.intercept_ = None + + if self.average > 0: + self._standard_coef = self.coef_ + self._standard_intercept = self.intercept_ + self._average_coef = None + self._average_intercept = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit(X, y, alpha, C, loss, learning_rate, self.max_iter, + classes, sample_weight, coef_init, intercept_init) + + if (self.tol is not None and self.tol > -np.inf + and self.n_iter_ == self.max_iter): + warnings.warn("Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit.", + ConvergenceWarning) + return self + + def _fit_binary(self, X, y, alpha, C, sample_weight, + learning_rate, max_iter): + """Fit a binary classifier on X and y. """ + coef, intercept, n_iter_ = fit_binary(self, 1, X, y, alpha, C, + learning_rate, max_iter, + self._expanded_class_weight[1], + self._expanded_class_weight[0], + sample_weight, + random_state=self.random_state) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + # need to be 2d + if self.average > 0: + if self.average <= self.t_ - 1: + self.coef_ = self._average_coef.reshape(1, -1) + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef.reshape(1, -1) + self._standard_intercept = np.atleast_1d(intercept) + self.intercept_ = self._standard_intercept + else: + self.coef_ = coef.reshape(1, -1) + # intercept is a float, need to convert it to an array of length 1 + self.intercept_ = np.atleast_1d(intercept) + + def _fit_multiclass(self, X, y, alpha, C, learning_rate, + sample_weight, max_iter): + """Fit a multi-class classifier by combining binary classifiers + Each binary classifier predicts one class versus all others. This + strategy is called OvA (One versus All) or OvR (One versus Rest). + """ + # Precompute the validation split using the multiclass labels + # to ensure proper balancing of the classes. + validation_mask = self._make_validation_split(y) + + # Use joblib to fit OvA in parallel. 
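The OvA reduction used here relabels each class in turn as +1 against all others (see _prepare_fit_binary further down in this file); a plain-numpy sketch:

import numpy as np

y = np.array([0, 1, 2, 1])
classes = np.unique(y)
i = 1                                        # binary problem "class 1 vs. rest"
y_i = np.where(y == classes[i], 1.0, -1.0)
print(y_i)                                   # [-1.  1. -1.  1.]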
+ # Pick the random seed for each job outside of fit_binary to avoid + # sharing the estimator random state between threads which could lead + # to non-deterministic behavior + random_state = check_random_state(self.random_state) + seeds = random_state.randint(MAX_INT, size=len(self.classes_)) + result = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, + **_joblib_parallel_args(require="sharedmem"))( + delayed(fit_binary)(self, i, X, y, alpha, C, learning_rate, + max_iter, self._expanded_class_weight[i], + 1., sample_weight, + validation_mask=validation_mask, + random_state=seed) + for i, seed in enumerate(seeds)) + + # take the maximum of n_iter_ over every binary fit + n_iter_ = 0. + for i, (_, intercept, n_iter_i) in enumerate(result): + self.intercept_[i] = intercept + n_iter_ = max(n_iter_, n_iter_i) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + if self.average > 0: + if self.average <= self.t_ - 1.0: + self.coef_ = self._average_coef + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef + self._standard_intercept = np.atleast_1d(self.intercept_) + self.intercept_ = self._standard_intercept + + def partial_fit(self, X, y, classes=None, sample_weight=None): + """Perform one epoch of stochastic gradient descent on given samples. + Internally, this method uses ``max_iter = 1``. Therefore, it is not + guaranteed that a minimum of the cost function is reached after calling + it once. Matters such as objective convergence and early stopping + should be handled by the user. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + y : ndarray of shape (n_samples,) + Subset of the target values. + classes : ndarray of shape (n_classes,), default=None + Classes across all calls to partial_fit. + Can be obtained by via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. + Returns + ------- + self : + Returns an instance of self. + """ + self._validate_params(for_partial_fit=True) + if self.class_weight in ['balanced']: + raise ValueError("class_weight '{0}' is not supported for " + "partial_fit. In order to use 'balanced' weights," + " use compute_class_weight('{0}', " + "classes=classes, y=y). " + "In place of y you can us a large enough sample " + "of the full training set target to properly " + "estimate the class frequency distributions. " + "Pass the resulting weights as the class_weight " + "parameter.".format(self.class_weight)) + return self._partial_fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss, + learning_rate=self.learning_rate, max_iter=1, + classes=classes, sample_weight=sample_weight, + coef_init=None, intercept_init=None) + + def fit(self, X, y, coef_init=None, intercept_init=None, + sample_weight=None): + """Fit linear model with Stochastic Gradient Descent. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples,) + Target values. + coef_init : ndarray of shape (n_classes, n_features), default=None + The initial coefficients to warm-start the optimization. 
+ intercept_init : ndarray of shape (n_classes,), default=None + The initial intercept to warm-start the optimization. + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. These weights will + be multiplied with class_weight (passed through the + constructor) if class_weight is specified. + Returns + ------- + self : + Returns an instance of self. + """ + return self._fit(X, y, alpha=self.alpha, C=1.0, + loss=self.loss, learning_rate=self.learning_rate, + coef_init=coef_init, intercept_init=intercept_init, + sample_weight=sample_weight) + + +def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, + pos_weight, neg_weight, sample_weight, validation_mask=None, + random_state=None): + """Fit a single binary classifier. + The i'th class is considered the "positive" class. + Parameters + ---------- + est : Estimator object + The estimator to fit + i : int + Index of the positive class + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + y : numpy array of shape [n_samples, ] + Target values + alpha : float + The regularization parameter + C : float + Maximum step size for passive aggressive + learning_rate : string + The learning rate. Accepted values are 'constant', 'optimal', + 'invscaling', 'pa1' and 'pa2'. + max_iter : int + The maximum number of iterations (epochs) + pos_weight : float + The weight of the positive class + neg_weight : float + The weight of the negative class + sample_weight : numpy array of shape [n_samples, ] + The weight of each sample + validation_mask : numpy array of shape [n_samples, ], default=None + Precomputed validation mask in case _fit_binary is called in the + context of a one-vs-rest reduction. + random_state : int, RandomState instance, default=None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. 
+ """ + # if average is not true, average_coef, and average_intercept will be + # unused + y_i, coef, intercept, average_coef, average_intercept = \ + _prepare_fit_binary(est, y, i) + assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0] + + random_state = check_random_state(random_state) + dataset, intercept_decay = make_dataset( + X, y_i, sample_weight, random_state=random_state) + + penalty_type = est._get_penalty_type(est.penalty) + learning_rate_type = est._get_learning_rate_type(learning_rate) + + if validation_mask is None: + validation_mask = est._make_validation_split(y_i) + classes = np.array([-1, 1], dtype=y_i.dtype) + validation_score_cb = est._make_validation_score_cb( + validation_mask, X, y_i, sample_weight, classes=classes) + + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(MAX_INT) + + tol = est.tol if est.tol is not None else -np.inf + + coef, intercept, average_coef, average_intercept, n_iter_ = _plain_sgd( + coef, intercept, average_coef, average_intercept, est.loss_function_, + penalty_type, alpha, C, est.l1_ratio, dataset, validation_mask, + est.early_stopping, validation_score_cb, int(est.n_iter_no_change), + max_iter, tol, int(est.fit_intercept), int(est.verbose), + int(est.shuffle), seed, pos_weight, neg_weight, learning_rate_type, + est.eta0, est.power_t, est.t_, intercept_decay, est.average) + + if est.average: + if len(est.classes_) == 2: + est._average_intercept[0] = average_intercept + else: + est._average_intercept[i] = average_intercept + + return coef, intercept, n_iter_ + + +def _prepare_fit_binary(est, y, i): + """Initialization for fit_binary. + Returns y, coef, intercept, average_coef, average_intercept. + """ + y_i = np.ones(y.shape, dtype=np.float64, order="C") + y_i[y != est.classes_[i]] = -1.0 + average_intercept = 0 + average_coef = None + + if len(est.classes_) == 2: + if not est.average: + coef = est.coef_.ravel() + intercept = est.intercept_[0] + else: + coef = est._standard_coef.ravel() + intercept = est._standard_intercept[0] + average_coef = est._average_coef.ravel() + average_intercept = est._average_intercept[0] + else: + if not est.average: + coef = est.coef_[i] + intercept = est.intercept_[i] + else: + coef = est._standard_coef[i] + intercept = est._standard_intercept[i] + average_coef = est._average_coef[i] + average_intercept = est._average_intercept[i] + + return y_i, coef, intercept, average_coef, average_intercept diff --git a/afsklearn/linear_model/sgd_classifier.py b/afsklearn/linear_model/sgd_classifier.py new file mode 100644 index 0000000..04a755e --- /dev/null +++ b/afsklearn/linear_model/sgd_classifier.py @@ -0,0 +1,329 @@ +import numpy as np # FIXME +from sklearn.linear_model._stochastic_gradient import DEFAULT_EPSILON +from sklearn.utils import _deprecate_positional_args + +from .._validation import check_is_fitted +from .sgd_base import afBaseSGDClassifier + + +class SGDClassifier(afBaseSGDClassifier): + """Linear classifiers (SVM, logistic regression, etc.) with SGD training. + This estimator implements regularized linear models with stochastic + gradient descent (SGD) learning: the gradient of the loss is estimated + each sample at a time and the model is updated along the way with a + decreasing strength schedule (aka learning rate). SGD allows minibatch + (online/out-of-core) learning via the `partial_fit` method. + For best results using the default learning rate schedule, the data should + have zero mean and unit variance. 
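The partial_fit path mentioned above keeps the estimator usable for out-of-core training; a minimal sketch with the upstream scikit-learn SGDClassifier, whose fit/partial_fit API the patched class mirrors:

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.array([[-1.0, -1.0], [-2.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
y = np.array([1, 1, 2, 2])
clf = SGDClassifier(random_state=0)
for _ in range(5):                                # one epoch per call
    clf.partial_fit(X, y, classes=np.unique(y))   # classes required on the first call
print(clf.predict([[-0.8, -1.0]]))                # should report class 1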
+ This implementation works with data represented as dense or sparse arrays + of floating point values for the features. The model it fits can be + controlled with the loss parameter; by default, it fits a linear support + vector machine (SVM). + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using either the squared euclidean norm + L2 or the absolute norm L1 or a combination of both (Elastic Net). If the + parameter update crosses the 0.0 value because of the regularizer, the + update is truncated to 0.0 to allow for learning sparse models and achieve + online feature selection. + Read more in the :ref:`User Guide `. + Parameters + ---------- + loss : str, default='hinge' + The loss function to be used. Defaults to 'hinge', which gives a + linear SVM. + The possible options are 'hinge', 'log', 'modified_huber', + 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + The 'log' loss gives logistic regression, a probabilistic classifier. + 'modified_huber' is another smooth loss that brings tolerance to + outliers as well as probability estimates. + 'squared_hinge' is like hinge but is quadratically penalized. + 'perceptron' is the linear loss used by the perceptron algorithm. + The other losses are designed for regression but can be useful in + classification as well; see + :class:`~sklearn.linear_model.SGDRegressor` for a description. + More details about the losses formulas can be found in the + :ref:`User Guide `. + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' + The penalty (aka regularization term) to be used. Defaults to 'l2' + which is the standard regularizer for linear SVM models. 'l1' and + 'elasticnet' might bring sparsity to the model (feature selection) + not achievable with 'l2'. + alpha : float, default=0.0001 + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. + Also used to compute the learning rate when set to `learning_rate` is + set to 'optimal'. + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + .. versionadded:: 0.19 + tol : float, default=1e-3 + The stopping criterion. If it is not None, training will stop + when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive + epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + .. versionadded:: 0.19 + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + verbose : int, default=0 + The verbosity level. + epsilon : float, default=0.1 + Epsilon in the epsilon-insensitive loss functions; only if `loss` is + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + For 'huber', determines the threshold at which it becomes less + important to get the prediction exactly right. 
+ For epsilon-insensitive, any differences between the current prediction + and the correct label are ignored if they are less than this threshold. + n_jobs : int, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + random_state : int, RandomState instance, default=None + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + learning_rate : str, default='optimal' + The learning rate schedule: + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.0 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + power_t : double, default=0.5 + The exponent for inverse scaling learning rate [default 0.5]. + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score returned by the `score` method is not + improving by at least tol for n_iter_no_change consecutive epochs. + .. versionadded:: 0.20 + Added 'early_stopping' option + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if `early_stopping` is True. + .. versionadded:: 0.20 + Added 'validation_fraction' option + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + .. versionadded:: 0.20 + Added 'n_iter_no_change' option + class_weight : dict, {class_label: weight} or "balanced", default=None + Preset for the class_weight fit parameter. + Weights associated with classes. If not given, all classes + are supposed to have weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. 
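The 'constant' and 'invscaling' schedules listed above are simple closed forms; a quick numeric sketch of eta = eta0 / pow(t, power_t):

eta0, power_t = 0.1, 0.5
for t in (1, 4, 100):
    print(t, eta0 / t ** power_t)    # 0.1, 0.05, ~0.01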
+ average : bool or int, default=False + When set to True, computes the averaged SGD weights accross all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. + loss_function_ : concrete ``LossFunction`` + classes_ : array of shape (n_classes,) + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples)``. + See Also + -------- + sklearn.svm.LinearSVC : Linear support vector classification. + LogisticRegression : Logistic regression. + Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to + ``SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", + penalty=None)``. + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import SGDClassifier + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> Y = np.array([1, 1, 2, 2]) + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> clf = make_pipeline(StandardScaler(), + ... SGDClassifier(max_iter=1000, tol=1e-3)) + >>> clf.fit(X, Y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdclassifier', SGDClassifier())]) + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, + verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, + random_state=None, learning_rate="optimal", eta0=0.0, + power_t=0.5, early_stopping=False, validation_fraction=0.1, + n_iter_no_change=5, class_weight=None, warm_start=False, + average=False): + super().__init__( + loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, + shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs, + random_state=random_state, learning_rate=learning_rate, eta0=eta0, + power_t=power_t, early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, class_weight=class_weight, + warm_start=warm_start, average=average) + + def _check_proba(self): + if self.loss not in ("log", "modified_huber"): + raise AttributeError("probability estimates are not available for" + " loss=%r" % self.loss) + + @property + def predict_proba(self): + """Probability estimates. + This method is only available for log loss and modified Huber loss. + Multiclass probability estimates are derived from binary (one-vs.-rest) + estimates by simple normalization, as recommended by Zadrozny and + Elkan. + Binary probability estimates for loss="modified_huber" are given by + (clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions + it is necessary to perform proper probability calibration by wrapping + the classifier with + :class:`~sklearn.calibration.CalibratedClassifierCV` instead. 
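[Editor's aside] Since probability estimates are only exposed for `loss='log'` and `loss='modified_huber'`, a minimal usage sketch (written against the stock scikit-learn API, which this class mirrors) looks like:

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array([[-1.0, -1.0], [-2.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
y = np.array([1, 1, 2, 2])

# loss='log' makes predict_proba available; the default hinge loss would
# raise an AttributeError via _check_proba instead.
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(loss='log', random_state=0))
clf.fit(X, y)
print(clf.predict_proba([[-0.8, -1.0]]))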
+ Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data for prediction. + Returns + ------- + ndarray of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in `self.classes_`. + References + ---------- + Zadrozny and Elkan, "Transforming classifier scores into multiclass + probability estimates", SIGKDD'02, + http://www.research.ibm.com/people/z/zadrozny/kdd2002-Transf.pdf + The justification for the formula in the loss="modified_huber" + case is in the appendix B in: + http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf + """ + self._check_proba() + return self._predict_proba + + def _predict_proba(self, X): + check_is_fitted(self) + + if self.loss == "log": + return self._predict_proba_lr(X) + + elif self.loss == "modified_huber": + binary = (len(self.classes_) == 2) + scores = self.decision_function(X) + + if binary: + prob2 = np.ones((scores.shape[0], 2)) + prob = prob2[:, 1] + else: + prob = scores + + np.clip(scores, -1, 1, prob) + prob += 1. + prob /= 2. + + if binary: + prob2[:, 0] -= prob + prob = prob2 + else: + # the above might assign zero to all classes, which doesn't + # normalize neatly; work around this to produce uniform + # probabilities + prob_sum = prob.sum(axis=1) + all_zero = (prob_sum == 0) + if np.any(all_zero): + prob[all_zero, :] = 1 + prob_sum[all_zero] = len(self.classes_) + + # normalize + prob /= prob_sum.reshape((prob.shape[0], -1)) + + return prob + + else: + raise NotImplementedError("predict_(log_)proba only supported when" + " loss='log' or loss='modified_huber' " + "(%r given)" % self.loss) + + @property + def predict_log_proba(self): + """Log of probability estimates. + This method is only available for log loss and modified Huber loss. + When loss="modified_huber", probability estimates may be hard zeros + and ones, so taking the logarithm is not possible. + See ``predict_proba`` for details. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data for prediction. + Returns + ------- + T : array-like, shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in + `self.classes_`. 
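[Editor's aside] The multiclass normalization workaround in `_predict_proba` above (uniform probabilities when every clipped score is zero) can be checked in isolation with plain NumPy; this mirrors that branch rather than calling the estimator.

import numpy as np

scores = np.array([[-2.0, -3.0, -1.5],     # every score clips to probability 0
                   [0.5, -1.0, 2.0]])
prob = (np.clip(scores, -1, 1) + 1.0) / 2.0

prob_sum = prob.sum(axis=1)
all_zero = prob_sum == 0
prob[all_zero, :] = 1.0                    # fall back to uniform probabilities
prob_sum[all_zero] = prob.shape[1]
prob /= prob_sum.reshape(-1, 1)
print(prob)                                # first row becomes [1/3, 1/3, 1/3]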
+ """ + self._check_proba() + return self._predict_log_proba + + def _predict_log_proba(self, X): + return np.log(self.predict_proba(X)) + + def _more_tags(self): + return { + '_xfail_checks': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/afsklearn/neural_network/base.py b/afsklearn/neural_network/base.py index 605ecaa..dd83105 100644 --- a/afsklearn/neural_network/base.py +++ b/afsklearn/neural_network/base.py @@ -1,32 +1,19 @@ -import numpy as np -import arrayfire as af -import time -from math import sqrt - -from abc import ABCMeta, abstractmethod import warnings +from abc import ABCMeta, abstractmethod +from math import sqrt -import sklearn +import arrayfire as af +import numpy as np from sklearn.base import is_classifier -from sklearn.utils.validation import _deprecate_positional_args -from sklearn.utils import check_random_state -from sklearn.utils import gen_batches -from sklearn.utils import shuffle +from sklearn.exceptions import ConvergenceWarning from sklearn.model_selection import train_test_split +from sklearn.utils import check_random_state, gen_batches, shuffle -from ..base import afBaseEstimator -from .._stochastic_optimizers import SGDOptimizer, AdamOptimizer -from .._validation import _safe_indexing, check_is_fitted, check_array, column_or_1d from .._extmath import safe_sparse_dot from .._nn_utils import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS - -# from ..exceptions import ConvergenceWarning -# from ..utils.extmath import safe_sparse_dot -# from ..utils.multiclass import _check_partial_fit_first_call, unique_labels -# from ..utils.multiclass import type_of_target -# from ..utils.optimize import _check_optimize_result -# import scipy.optimize - +from .._stochastic_optimizers import AdamOptimizer, SGDOptimizer +from .._validation import _safe_indexing, check_array +from ..base import afBaseEstimator _STOCHASTIC_SOLVERS = ['sgd', 'adam'] @@ -108,7 +95,7 @@ def _forward_pass(self, activations): activations[i + 1] = output_activation(activations[i + 1]) return activations -# + def _compute_loss_grad(self, layer, n_samples, activations, deltas, coef_grads, intercept_grads): """Compute the gradient of loss with respect to coefs and intercept for @@ -247,7 +234,7 @@ def _initialize(self, y, layer_units): # Output for regression if not is_classifier(self): self.out_activation_ = 'identity' - # Output for multi class + # Output for multi class elif self._label_binarizer.y_type_ == 'multiclass': self.out_activation_ = 'softmax' # Output for binary class and multi-label @@ -304,8 +291,8 @@ def _fit(self, X, y, incremental=False): n_samples, n_features = X.shape # Ensure y is 2D - #if y.numdims() == 1: - #y = af.moddims(y, y.elements(), 1) + # if y.numdims() == 1: + # y = af.moddims(y, y.elements(), 1) self.n_outputs_ = y.shape[1] if y.numdims() > 1 else 1 @@ -452,8 +439,6 @@ def _validate_hyperparameters(self): # def _fit_stochastic(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental): - - if not incremental or not hasattr(self, '_optimizer'): params = self.coefs_ + self.intercepts_ @@ -499,7 +484,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, sample_idx = shuffle(sample_idx, random_state=self._random_state) - #sloooow loop + # sloooow loop accumulated_loss = 0.0 for batch_slice in gen_batches(n_samples, batch_size): if self.shuffle: @@ -565,7 +550,6 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, "reached and the optimization 
hasn't converged yet." % self.max_iter, ConvergenceWarning) - except KeyboardInterrupt: warnings.warn("Training interrupted by user.") @@ -668,8 +652,8 @@ def _predict(self, X): activations = [X] for i in range(self.n_layers_ - 1): - #activations.append(np.empty((X.shape[0], - #layer_units[i + 1]))) + # activations.append(np.empty((X.shape[0], + # layer_units[i + 1]))) activations.append(af.constant(0, X.shape[0], layer_units[i + 1])) # forward propagate diff --git a/afsklearn/neural_network/mlp_classifier.py b/afsklearn/neural_network/mlp_classifier.py index 8b74e86..8dbfa54 100644 --- a/afsklearn/neural_network/mlp_classifier.py +++ b/afsklearn/neural_network/mlp_classifier.py @@ -1,12 +1,12 @@ import arrayfire as af import numpy as np +from sklearn.utils.validation import _deprecate_positional_args +from .._classifier_mixin import afClassifierMixin +from .._validation import check_is_fitted, column_or_1d from ..base import afLabelBinarizer, unique_labels from .base import BaseMultilayerPerceptron -from .._classifier_mixin import afClassifierMixin -from .._validation import column_or_1d, check_is_fitted -from sklearn.utils.validation import _deprecate_positional_args class MLPClassifier(afClassifierMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron classifier. @@ -215,7 +215,7 @@ def _validate_input(self, X, y, incremental): X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True) # if y.ndim == 2 and y.shape[1] == 1: - #y = column_or_1d(y, warn=True) + # y = column_or_1d(y, warn=True) if y.ndim == 2 and y.dims(1) == 1: y = column_or_1d(y, warn=True) diff --git a/afsklearn/patched_modules.yml b/afsklearn/patched_modules.yml index 4621b0c..07fcda4 100644 --- a/afsklearn/patched_modules.yml +++ b/afsklearn/patched_modules.yml @@ -12,3 +12,13 @@ simple_imputer: name: SimpleImputer module: sklearn.impute module_patch: afsklearn.impute.simple_imputer + +sgd_classifier: + name: SGDClassifier + module: sklearn.linear_model + module_patch: afsklearn.linear_model.sgd_classifier + +one_hot_encoder: + name: OneHotEncoder + module: sklearn.preprocessing + module_patch: afsklearn.preprocessing._encoders diff --git a/afsklearn/preprocessing/_encoders.py b/afsklearn/preprocessing/_encoders.py new file mode 100644 index 0000000..4b6db6b --- /dev/null +++ b/afsklearn/preprocessing/_encoders.py @@ -0,0 +1,588 @@ +import numpy as np # FIXME +from scipy import sparse +from sklearn.utils import _deprecate_positional_args + +from .._encode import _check_unknown, _encode, _unique +from .._validation import check_array, check_is_fitted, is_scalar_nan +from ..base import afBaseEstimator, afTransformerMixin + + +class _BaseEncoder(afTransformerMixin, afBaseEstimator): + """ + Base class for encoders that includes the code to categorize and + transform the input features. + """ + + def _check_X(self, X, force_all_finite=True): + """ + Perform custom check_array: + - convert list of strings to object dtype + - check for missing values for object dtype data (check_array does + not do that) + - return list of features (arrays): this list of features is + constructed feature by feature to preserve the data types + of pandas DataFrame columns, as otherwise information is lost + and cannot be used, eg for the `categories_` attribute. 
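[Editor's aside] The column-by-column handling described above exists because flattening a mixed-dtype DataFrame into one NumPy array upcasts every column; a quick check (requires pandas, which is not imported by the module itself):

import numpy as np
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue'], 'size': [1, 2]})

print(np.asarray(df).dtype)    # object: both columns upcast together
print(df.iloc[:, 0].dtype)     # object (strings), kept per column
print(df.iloc[:, 1].dtype)     # int64 preserved when taken column-wise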
+ """ + if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): + # if not a dataframe, do normal check_array validation + X_temp = check_array(X, dtype=None, + force_all_finite=force_all_finite) + if (not hasattr(X, 'dtype') + and np.issubdtype(X_temp.dtype, np.str_)): + X = check_array(X, dtype=object, + force_all_finite=force_all_finite) + else: + X = X_temp + needs_validation = False + else: + # pandas dataframe, do validation later column by column, in order + # to keep the dtype information to be used in the encoder. + needs_validation = force_all_finite + + n_samples, n_features = X.shape + X_columns = [] + + for i in range(n_features): + Xi = self._get_feature(X, feature_idx=i) + Xi = check_array(Xi, ensure_2d=False, dtype=None, + force_all_finite=needs_validation) + X_columns.append(Xi) + + return X_columns, n_samples, n_features + + def _get_feature(self, X, feature_idx): + if hasattr(X, 'iloc'): + # pandas dataframes + return X.iloc[:, feature_idx] + # numpy arrays, sparse arrays + return X[:, feature_idx] + + def _fit(self, X, handle_unknown='error', force_all_finite=True): + X_list, n_samples, n_features = self._check_X( + X, force_all_finite=force_all_finite) + + if self.categories != 'auto': + if len(self.categories) != n_features: + raise ValueError("Shape mismatch: if categories is an array," + " it has to be of shape (n_features,).") + + self.categories_ = [] + + for i in range(n_features): + Xi = X_list[i] + if self.categories == 'auto': + cats = _unique(Xi) + else: + cats = np.array(self.categories[i], dtype=Xi.dtype) + if Xi.dtype.kind not in 'OUS': + sorted_cats = np.sort(cats) + error_msg = ("Unsorted categories are not " + "supported for numerical categories") + # if there are nans, nan should be the last element + stop_idx = -1 if np.isnan(sorted_cats[-1]) else None + if (np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or + (np.isnan(sorted_cats[-1]) and + not np.isnan(sorted_cats[-1]))): + raise ValueError(error_msg) + + if handle_unknown == 'error': + diff = _check_unknown(Xi, cats) + if diff: + msg = ("Found unknown categories {0} in column {1}" + " during fit".format(diff, i)) + raise ValueError(msg) + self.categories_.append(cats) + + def _transform(self, X, handle_unknown='error', force_all_finite=True): + X_list, n_samples, n_features = self._check_X( + X, force_all_finite=force_all_finite) + + X_int = np.zeros((n_samples, n_features), dtype=int) + X_mask = np.ones((n_samples, n_features), dtype=bool) + + if n_features != len(self.categories_): + raise ValueError( + "The number of features in X is different to the number of " + "features of the fitted data. The fitted data had {} features " + "and the X has {} features." + .format(len(self.categories_,), n_features) + ) + + for i in range(n_features): + Xi = X_list[i] + diff, valid_mask = _check_unknown(Xi, self.categories_[i], + return_mask=True) + + if not np.all(valid_mask): + if handle_unknown == 'error': + msg = ("Found unknown categories {0} in column {1}" + " during transform".format(diff, i)) + raise ValueError(msg) + else: + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later. 
+ X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if (self.categories_[i].dtype.kind in ('U', 'S') + and self.categories_[i].itemsize > Xi.itemsize): + Xi = Xi.astype(self.categories_[i].dtype) + elif (self.categories_[i].dtype.kind == 'O' and + Xi.dtype.kind == 'U'): + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype('O') + else: + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] + # We use check_unknown=False, since _check_unknown was + # already called above. + X_int[:, i] = _encode(Xi, uniques=self.categories_[i], + check_unknown=False) + + return X_int, X_mask + + def _more_tags(self): + return {'X_types': ['categorical']} + + +class OneHotEncoder(_BaseEncoder): + """ + Encode categorical features as a one-hot numeric array. + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') + encoding scheme. This creates a binary column for each category and + returns a sparse matrix or dense array (depending on the ``sparse`` + parameter) + By default, the encoder derives the categories based on the unique values + in each feature. Alternatively, you can also specify the `categories` + manually. + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + Note: a one-hot encoding of y labels should use a LabelBinarizer + instead. + Read more in the :ref:`User Guide `. + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values within a single feature, and should be sorted in case of + numeric values. + The used categories can be found in the ``categories_`` attribute. + .. versionadded:: 0.20 + drop : {'first', 'if_binary'} or a array-like of shape (n_features,), \ + default=None + Specifies a methodology to use to drop one of the categories per + feature. This is useful in situations where perfectly collinear + features cause problems, such as when feeding the resulting data + into a neural network or an unregularized regression. + However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + - None : retain all features (the default). + - 'first' : drop the first category in each feature. If only one + category is present, the feature will be dropped entirely. + - 'if_binary' : drop the first category in each feature with two + categories. Features with 1 or more than 2 categories are + left intact. + - array : ``drop[i]`` is the category in feature ``X[:, i]`` that + should be dropped. + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + .. versionchanged:: 0.23 + The option `drop='if_binary'` was added in 0.23. + sparse : bool, default=True + Will return sparse matrix if set True else will return an array. + dtype : number type, default=float + Desired dtype of output. 
+ handle_unknown : {'error', 'ignore'}, default='error' + Whether to raise an error or ignore if an unknown categorical feature + is present during transform (default is to raise). When this parameter + is set to 'ignore' and an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during fitting + (in order of the features in X and corresponding with the output + of ``transform``). This includes the category specified in ``drop`` + (if any). + drop_idx_ : array of shape (n_features,) + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. + .. versionchanged:: 0.23 + Added the possibility to contain `None` values. + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) + encoding of the categorical features. + sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot + encoding of dictionary items or strings. + LabelBinarizer : Binarizes labels in a one-vs-all + fashion. + MultiLabelBinarizer : Transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. 
+ >>> from sklearn.preprocessing import OneHotEncoder + One can discard categories not seen during `fit`: + >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() + array([[1., 0., 1., 0., 0.], + [0., 1., 0., 0., 0.]]) + >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) + array([['Male', 1], + [None, 2]], dtype=object) + >>> enc.get_feature_names(['gender', 'group']) + array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], + dtype=object) + One can always drop the first column for each feature: + >>> drop_enc = OneHotEncoder(drop='first').fit(X) + >>> drop_enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 0., 0.], + [1., 1., 0.]]) + Or drop a column for feature only having 2 categories: + >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X) + >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 1., 0., 0.], + [1., 0., 1., 0.]]) + """ + + @_deprecate_positional_args + def __init__(self, *, categories='auto', drop=None, sparse=True, + dtype=np.float64, handle_unknown='error'): + self.categories = categories + self.sparse = sparse + self.dtype = dtype + self.handle_unknown = handle_unknown + self.drop = drop + + def _validate_keywords(self): + if self.handle_unknown not in ('error', 'ignore'): + msg = ("handle_unknown should be either 'error' or 'ignore', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + # If we have both dropped columns and ignored unknown + # values, there will be ambiguous cells. This creates difficulties + # in interpreting the model. + if self.drop is not None and self.handle_unknown != 'error': + raise ValueError( + "`handle_unknown` must be 'error' when the drop parameter is " + "specified, as both would create categories that are all " + "zero.") + + def _compute_drop_idx(self): + if self.drop is None: + return None + elif isinstance(self.drop, str): + if self.drop == 'first': + return np.zeros(len(self.categories_), dtype=object) + elif self.drop == 'if_binary': + return np.array([0 if len(cats) == 2 else None + for cats in self.categories_], dtype=object) + else: + msg = ( + "Wrong input for parameter `drop`. Expected " + "'first', 'if_binary', None or array of objects, got {}" + ) + raise ValueError(msg.format(type(self.drop))) + + else: + try: + drop_array = np.asarray(self.drop, dtype=object) + droplen = len(drop_array) + except (ValueError, TypeError): + msg = ( + "Wrong input for parameter `drop`. 
Expected " + "'first', 'if_binary', None or array of objects, got {}" + ) + raise ValueError(msg.format(type(drop_array))) + if droplen != len(self.categories_): + msg = ("`drop` should have length equal to the number " + "of features ({}), got {}") + raise ValueError(msg.format(len(self.categories_), droplen)) + missing_drops = [] + drop_indices = [] + for col_idx, (val, cat_list) in enumerate(zip(drop_array, + self.categories_)): + if not is_scalar_nan(val): + drop_idx = np.where(cat_list == val)[0] + if drop_idx.size: # found drop idx + drop_indices.append(drop_idx[0]) + else: + missing_drops.append((col_idx, val)) + continue + + # val is nan, find nan in categories manually + for cat_idx, cat in enumerate(cat_list): + if is_scalar_nan(cat): + drop_indices.append(cat_idx) + break + else: # loop did not break thus drop is missing + missing_drops.append((col_idx, val)) + + if any(missing_drops): + msg = ("The following categories were supposed to be " + "dropped, but were not found in the training " + "data.\n{}".format( + "\n".join( + ["Category: {}, Feature: {}".format(c, v) + for c, v in missing_drops]))) + raise ValueError(msg) + return np.array(drop_indices, dtype=object) + + def fit(self, X, y=None): + """ + Fit OneHotEncoder to X. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + Returns + ------- + self + """ + self._validate_keywords() + self._fit(X, handle_unknown=self.handle_unknown, + force_all_finite='allow-nan') + self.drop_idx_ = self._compute_drop_idx() + return self + + def fit_transform(self, X, y=None): + """ + Fit OneHotEncoder to X, then transform X. + Equivalent to fit(X).transform(X) but more convenient. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. + """ + self._validate_keywords() + return super().fit_transform(X, y) + + def transform(self, X): + """ + Transform X using one-hot encoding. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. + """ + check_is_fitted(self) + # validation of X happens in _check_X called by _transform + X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, + force_all_finite='allow-nan') + + n_samples, n_features = X_int.shape + + if self.drop_idx_ is not None: + to_drop = self.drop_idx_.copy() + # We remove all the dropped categories from mask, and decrement all + # categories that occur after them to avoid an empty column. 
+ keep_cells = X_int != to_drop + n_values = [] + for i, cats in enumerate(self.categories_): + n_cats = len(cats) + + # drop='if_binary' but feature isn't binary + if to_drop[i] is None: + # set to cardinality to not drop from X_int + to_drop[i] = n_cats + n_values.append(n_cats) + else: # dropped + n_values.append(n_cats - 1) + + to_drop = to_drop.reshape(1, -1) + X_int[X_int > to_drop] -= 1 + X_mask &= keep_cells + else: + n_values = [len(cats) for cats in self.categories_] + + mask = X_mask.ravel() + feature_indices = np.cumsum([0] + n_values) + indices = (X_int + feature_indices[:-1]).ravel()[mask] + + indptr = np.empty(n_samples + 1, dtype=int) + indptr[0] = 0 + np.sum(X_mask, axis=1, out=indptr[1:]) + np.cumsum(indptr[1:], out=indptr[1:]) + data = np.ones(indptr[-1]) + + out = sparse.csr_matrix((data, indices, indptr), + shape=(n_samples, feature_indices[-1]), + dtype=self.dtype) + if not self.sparse: + return out.toarray() + else: + return out + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + In case unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. + Parameters + ---------- + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) + The transformed data. + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + Inverse transformed array. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse='csr') + + n_samples, _ = X.shape + n_features = len(self.categories_) + if self.drop_idx_ is None: + n_transformed_features = sum(len(cats) + for cats in self.categories_) + else: + n_transformed_features = sum( + len(cats) - 1 if to_drop is not None else len(cats) + for cats, to_drop in zip(self.categories_, self.drop_idx_) + ) + + # validate shape of passed X + msg = ("Shape of the passed X data is not correct. Expected {0} " + "columns, got {1}.") + if X.shape[1] != n_transformed_features: + raise ValueError(msg.format(n_transformed_features, X.shape[1])) + + # create resulting array of appropriate dtype + dt = np.find_common_type([cat.dtype for cat in self.categories_], []) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + j = 0 + found_unknown = {} + + for i in range(n_features): + if self.drop_idx_ is None or self.drop_idx_[i] is None: + cats = self.categories_[i] + else: + cats = np.delete(self.categories_[i], self.drop_idx_[i]) + n_categories = len(cats) + + # Only happens if there was a column with a unique + # category. In this case we just fill the column with this + # unique category value. 
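[Editor's aside] For reference, the CSR assembly in `transform` above can be reproduced in isolation; this toy sketch builds the same structure by hand for two already integer-encoded features (illustrative values, not taken from the patch):

import numpy as np
from scipy import sparse

X_int = np.array([[1, 0], [0, 2]])         # integer codes per feature
X_mask = np.ones_like(X_int, dtype=bool)   # every entry is a known category
n_values = [2, 3]                          # number of categories per feature

feature_indices = np.cumsum([0] + n_values)
indices = (X_int + feature_indices[:-1]).ravel()[X_mask.ravel()]
indptr = np.concatenate([[0], np.cumsum(X_mask.sum(axis=1))])
data = np.ones(indptr[-1])

out = sparse.csr_matrix((data, indices, indptr),
                        shape=(2, feature_indices[-1]))
print(out.toarray())    # [[0. 1. 1. 0. 0.]
                        #  [1. 0. 0. 0. 1.]]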
+ if n_categories == 0: + X_tr[:, i] = self.categories_[i][self.drop_idx_[i]] + j += n_categories + continue + sub = X[:, j:j + n_categories] + # for sparse X argmax returns 2D matrix, ensure 1D array + labels = np.asarray(sub.argmax(axis=1)).flatten() + X_tr[:, i] = cats[labels] + if self.handle_unknown == 'ignore': + unknown = np.asarray(sub.sum(axis=1) == 0).flatten() + # ignored unknown categories: we have a row of all zero + if unknown.any(): + found_unknown[i] = unknown + else: + dropped = np.asarray(sub.sum(axis=1) == 0).flatten() + if dropped.any(): + if self.drop_idx_ is None: + all_zero_samples = np.flatnonzero(dropped) + raise ValueError( + f"Samples {all_zero_samples} can not be inverted " + "when drop=None and handle_unknown='error' " + "because they contain all zeros") + # we can safely assume that all of the nulls in each column + # are the dropped value + X_tr[dropped, i] = self.categories_[i][ + self.drop_idx_[i] + ] + + j += n_categories + + # if ignored are found: potentially need to upcast result to + # insert None values + if found_unknown: + if X_tr.dtype != object: + X_tr = X_tr.astype(object) + + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + return X_tr + + def get_feature_names(self, input_features=None): + """ + Return feature names for output features. + Parameters + ---------- + input_features : list of str of shape (n_features,) + String names for input features if available. By default, + "x0", "x1", ... "xn_features" is used. + Returns + ------- + output_feature_names : ndarray of shape (n_output_features,) + Array of feature names. + """ + check_is_fitted(self) + cats = self.categories_ + if input_features is None: + input_features = ['x%d' % i for i in range(len(cats))] + elif len(input_features) != len(self.categories_): + raise ValueError( + "input_features should have length equal to number of " + "features ({}), got {}".format(len(self.categories_), + len(input_features))) + + feature_names = [] + for i in range(len(cats)): + names = [ + input_features[i] + '_' + str(t) for t in cats[i]] + if self.drop_idx_ is not None and self.drop_idx_[i] is not None: + names.pop(self.drop_idx_[i]) + feature_names.extend(names) + + return np.array(feature_names, dtype=object) diff --git a/afsklearn/preprocessing/_label.py b/afsklearn/preprocessing/_label.py new file mode 100644 index 0000000..7e1020c --- /dev/null +++ b/afsklearn/preprocessing/_label.py @@ -0,0 +1,120 @@ +import numpy as np # FIXME +from sklearn.utils.validation import _num_samples + +from .._encode import _encode, _unique +from .._validation import check_is_fitted, column_or_1d +from ..base import afBaseEstimator, afTransformerMixin + + +class afLabelEncoder(afTransformerMixin, afBaseEstimator): + """Encode target labels with value between 0 and n_classes-1. + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + Read more in the :ref:`User Guide `. + .. versionadded:: 0.12 + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + Examples + -------- + `LabelEncoder` can be used to normalize labels. + >>> from sklearn import preprocessing + >>> le = preprocessing.LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) 
+ >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + >>> le = preprocessing.LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + ['tokyo', 'tokyo', 'paris'] + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + """ + + def fit(self, y): + """Fit label encoder. + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + Returns + ------- + self : returns an instance of self. + """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels. + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + Returns + ------- + y : array-like of shape (n_samples,) + """ + y = column_or_1d(y, warn=True) + self.classes_, y = _unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + Returns + ------- + y : array-like of shape (n_samples,) + """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return np.array([]) + + return _encode(y, uniques=self.classes_) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + Parameters + ---------- + y : ndarray of shape (n_samples,) + Target values. + Returns + ------- + y : ndarray of shape (n_samples,) + """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return np.array([]) + + diff = np.setdiff1d(y, np.arange(len(self.classes_))) + if len(diff): + raise ValueError( + "y contains previously unseen labels: %s" % str(diff)) + y = np.asarray(y) + return self.classes_[y] + + def _more_tags(self): + return {'X_types': ['1dlabels']} diff --git a/requirements.txt b/requirements.txt index 17dcea9..d39d106 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ --e . +-e .[dev] arrayfire==3.8.0+cu112 -f https://repo.arrayfire.com/python/wheels/3.8.0/ diff --git a/setup.cfg b/setup.cfg index 0378a80..8a4021f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,3 +18,18 @@ install_requires = [options.packages.find] exclude = tests + +[options.extras_require] +dev = + autopep8==1.5.7 + isort==5.9.2 + flake8==3.9.2 + flake8-import-order==0.18.1 + +[tool:isort] +line_length = 119 +multi_line_output = 4 + +[flake8] +import-order-style = pep8 +max-line-length = 119 diff --git a/tests/test_one_hot_encoder.py b/tests/test_one_hot_encoder.py new file mode 100644 index 0000000..07662dc --- /dev/null +++ b/tests/test_one_hot_encoder.py @@ -0,0 +1,27 @@ +from afsklearn.patcher import Patcher + +from . 
import measure_time + + +def sklearn_example() -> None: + from sklearn.preprocessing import OneHotEncoder + enc = OneHotEncoder(handle_unknown='ignore') + X = [['Male', 1], ['Female', 3], ['Female', 2]] + enc.fit(X) + + +@measure_time +def test_sklearn() -> None: + sklearn_example() + + +@measure_time +def test_afsklearn() -> None: + Patcher.patch("one_hot_encoder") + sklearn_example() + Patcher.rollback("one_hot_encoder") + + +if __name__ == "__main__": + # test_afsklearn() + test_sklearn() diff --git a/tests/test_sgd_classifier.py b/tests/test_sgd_classifier.py new file mode 100644 index 0000000..88c7804 --- /dev/null +++ b/tests/test_sgd_classifier.py @@ -0,0 +1,31 @@ +import numpy as np + +from afsklearn.patcher import Patcher + +from . import measure_time + + +def sklearn_example() -> None: + from sklearn.linear_model import SGDClassifier + X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + Y = np.array([1, 1, 2, 2]) + clf = SGDClassifier() + clf.fit(X, Y) + print(f"Predict: {clf.predict([[-0.8, -1]])}") + + +@measure_time +def test_sklearn() -> None: + sklearn_example() + + +@measure_time +def test_afsklearn() -> None: + Patcher.patch("sgd_classifier") + sklearn_example() + Patcher.rollback("sgd_classifier") + + +if __name__ == "__main__": + test_afsklearn() + # test_sklearn()
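[Editor's aside] Beyond the timing comparison above, a natural follow-up test (not part of this patch) is to check that the patched estimator yields the same predictions as stock scikit-learn. This is a sketch only: whether the ArrayFire backend reproduces the exact labels is an assumption.

import numpy as np

from afsklearn.patcher import Patcher


def _fit_predict() -> np.ndarray:
    from sklearn.linear_model import SGDClassifier
    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([1, 1, 2, 2])
    return SGDClassifier(random_state=0).fit(X, y).predict(X)


def test_sgd_classifier_predictions_match() -> None:
    expected = _fit_predict()
    Patcher.patch("sgd_classifier")
    try:
        patched = _fit_predict()
    finally:
        Patcher.rollback("sgd_classifier")
    np.testing.assert_array_equal(patched, expected)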