From 96317a8c1d97412951b27218ef375b3ce6034845 Mon Sep 17 00:00:00 2001 From: Anton Chernyatevich Date: Wed, 28 Jul 2021 23:57:49 +0300 Subject: [PATCH] Add SGD and OHE --- .gitignore | 3 + afsklearn/_class_weight.py | 62 ++ afsklearn/_classifier_mixin.py | 78 ++- afsklearn/_encode.py | 256 ++++++++ afsklearn/_multiclass.py | 34 ++ afsklearn/_validation.py | 52 ++ afsklearn/base.py | 46 +- afsklearn/linear_model/__init__.py | 0 afsklearn/linear_model/base.py | 53 ++ afsklearn/linear_model/sgd_base.py | 675 +++++++++++++++++++++ afsklearn/linear_model/sgd_classifier.py | 329 ++++++++++ afsklearn/neural_network/base.py | 48 +- afsklearn/neural_network/mlp_classifier.py | 8 +- afsklearn/patched_modules.yml | 10 + afsklearn/preprocessing/_encoders.py | 588 ++++++++++++++++++ afsklearn/preprocessing/_label.py | 120 ++++ requirements.txt | 2 +- setup.cfg | 15 + tests/test_one_hot_encoder.py | 27 + tests/test_sgd_classifier.py | 31 + 20 files changed, 2369 insertions(+), 68 deletions(-) create mode 100644 afsklearn/_class_weight.py create mode 100644 afsklearn/_encode.py create mode 100644 afsklearn/_multiclass.py create mode 100644 afsklearn/linear_model/__init__.py create mode 100644 afsklearn/linear_model/base.py create mode 100644 afsklearn/linear_model/sgd_base.py create mode 100644 afsklearn/linear_model/sgd_classifier.py create mode 100644 afsklearn/preprocessing/_encoders.py create mode 100644 afsklearn/preprocessing/_label.py create mode 100644 tests/test_one_hot_encoder.py create mode 100644 tests/test_sgd_classifier.py diff --git a/.gitignore b/.gitignore index 7c0cad9..c4b1e59 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,8 @@ build dist *.egg-info +# Static typing +.mypy_cache + # Virtual env venv diff --git a/afsklearn/_class_weight.py b/afsklearn/_class_weight.py new file mode 100644 index 0000000..a446605 --- /dev/null +++ b/afsklearn/_class_weight.py @@ -0,0 +1,62 @@ +import numpy as np # FIXME +from sklearn.utils import _deprecate_positional_args + + +@_deprecate_positional_args +def compute_class_weight(class_weight, *, classes, y): + """Estimate class weights for unbalanced datasets. + Parameters + ---------- + class_weight : dict, 'balanced' or None + If 'balanced', class weights will be given by + ``n_samples / (n_classes * np.bincount(y))``. + If a dictionary is given, keys are classes and values + are corresponding class weights. + If None is given, the class weights will be uniform. + classes : ndarray + Array of the classes occurring in the data, as given by + ``np.unique(y_org)`` with ``y_org`` the original class labels. + y : array-like of shape (n_samples,) + Array of original class labels per sample. + Returns + ------- + class_weight_vect : ndarray of shape (n_classes,) + Array with class_weight_vect[i] the weight for i-th class. + References + ---------- + The "balanced" heuristic is inspired by + Logistic Regression in Rare Events Data, King, Zen, 2001. + """ + # Import error caused by circular imports. + from .preprocessing._label import afLabelEncoder + + if set(y) - set(classes): + raise ValueError("classes should include all valid labels that can " + "be in y") + if class_weight is None or len(class_weight) == 0: + # uniform class weights + weight = np.ones(classes.shape[0], dtype=np.float64, order='C') + elif class_weight == 'balanced': + # Find the weight of each class as present in y. 
+ le = afLabelEncoder() + y_ind = le.fit_transform(y) + if not all(np.in1d(classes, le.classes_)): + raise ValueError("classes should have valid labels that are in y") + + recip_freq = len(y) / (len(le.classes_) * + np.bincount(y_ind).astype(np.float64)) + weight = recip_freq[le.transform(classes)] + else: + # user-defined dictionary + weight = np.ones(classes.shape[0], dtype=np.float64, order='C') + if not isinstance(class_weight, dict): + raise ValueError("class_weight must be dict, 'balanced', or None," + " got: %r" % class_weight) + for c in class_weight: + i = np.searchsorted(classes, c) + if i >= len(classes) or classes[i] != c: + raise ValueError("Class label {} not present.".format(c)) + else: + weight[i] = class_weight[c] + + return weight diff --git a/afsklearn/_classifier_mixin.py b/afsklearn/_classifier_mixin.py index db81060..2d87ed4 100644 --- a/afsklearn/_classifier_mixin.py +++ b/afsklearn/_classifier_mixin.py @@ -1,5 +1,6 @@ import arrayfire as af + def _weighted_sum(sample_score, sample_weight, normalize=False): if normalize: return np.average(sample_score, weights=sample_weight) @@ -8,6 +9,7 @@ def _weighted_sum(sample_score, sample_weight, normalize=False): else: return sample_score.sum() + def _check_targets(y_true, y_pred): """Check that y_true and y_pred belong to the same classification task This converts multiclass or binary types to a common shape, and raises a @@ -63,8 +65,6 @@ def _check_targets(y_true, y_pred): return y_type, y_true, y_pred - - def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): """Accuracy classification score. In multilabel classification, this function computes subset accuracy: @@ -123,6 +123,7 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) + class afClassifierMixin: """ArrayFire enabled Mixin class for all classifiers in scikit-learn.""" @@ -147,8 +148,77 @@ def score(self, X, y, sample_weight=None): score : float Mean accuracy of self.predict(X) wrt. y. """ - #return accuracy_score(y, self.predict(X), sample_weight=sample_weight) - return #TMP + # return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + return # TMP def _more_tags(self): return {'requires_y': True} + + +class afLinearClassifierMixin(afClassifierMixin): + """Mixin for linear classifiers. + Handles prediction for sparse and dense X. + """ + + def decision_function(self, X): + """ + Predict confidence scores for samples. + The confidence score for a sample is proportional to the signed + distance of that sample to the hyperplane. + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. + Returns + ------- + array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes) + Confidence scores per (sample, class) combination. In the binary + case, confidence score for self.classes_[1] where >0 means this + class would be predicted. + """ + check_is_fitted(self) + + X = check_array(X, accept_sparse='csr') + + n_features = self.coef_.shape[1] + if X.shape[1] != n_features: + raise ValueError("X has %d features per sample; expecting %d" + % (X.shape[1], n_features)) + + scores = safe_sparse_dot(X, self.coef_.T, + dense_output=True) + self.intercept_ + return scores.ravel() if scores.shape[1] == 1 else scores + + def predict(self, X): + """ + Predict class labels for samples in X. + Parameters + ---------- + X : array-like or sparse matrix, shape (n_samples, n_features) + Samples. 
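As a quick illustration (a sketch, not part of the diff), the 'balanced' heuristic implemented by compute_class_weight above reduces to n_samples / (n_classes * np.bincount(y_ind)), so the rarer class receives the larger weight:

import numpy as np

y = np.array([0, 0, 0, 1])                      # three samples of class 0, one of class 1
classes = np.unique(y)
weights = len(y) / (len(classes) * np.bincount(y))
print(weights)                                  # [0.66666667 2.        ]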
+ Returns + ------- + C : array, shape [n_samples] + Predicted class label per sample. + """ + scores = self.decision_function(X) + if len(scores.shape) == 1: + indices = (scores > 0).astype(int) + else: + indices = scores.argmax(axis=1) + return self.classes_[indices] + + def _predict_proba_lr(self, X): + """Probability estimation for OvR logistic regression. + Positive class probabilities are computed as + 1. / (1. + np.exp(-self.decision_function(X))); + multiclass is handled by normalizing that over all classes. + """ + prob = self.decision_function(X) + expit(prob, out=prob) + if prob.ndim == 1: + return np.vstack([1 - prob, prob]).T + else: + # OvR normalization, like LibLinear's predict_probability + prob /= prob.sum(axis=1).reshape((prob.shape[0], -1)) + return prob diff --git a/afsklearn/_encode.py b/afsklearn/_encode.py new file mode 100644 index 0000000..5b4aa31 --- /dev/null +++ b/afsklearn/_encode.py @@ -0,0 +1,256 @@ +from typing import NamedTuple + +import numpy as np # FIXME + +from ._validation import is_scalar_nan + + +def _encode(values, *, uniques, check_unknown=True): + """Helper function to encode values into [0, n_uniques - 1]. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + The numpy method has the limitation that the `uniques` need to + be sorted. Importantly, this is not checked but assumed to already be + the case. The calling method needs to ensure this for all non-object + values. + Parameters + ---------- + values : ndarray + Values to encode. + uniques : ndarray + The unique values in `values`. If the dtype is not object, then + `uniques` needs to be sorted. + check_unknown : bool, default=True + If True, check for values in `values` that are not in `unique` + and raise an error. This is ignored for object dtype, and treated as + True in this case. This parameter is useful for + _BaseEncoder._transform() to avoid calling _check_unknown() + twice. + Returns + ------- + encoded : ndarray + Encoded values + """ + if values.dtype.kind in 'OUS': + try: + return _map_to_integer(values, uniques) + except KeyError as e: + raise ValueError(f"y contains previously unseen labels: {str(e)}") + else: + if check_unknown: + diff = _check_unknown(values, uniques) + if diff: + raise ValueError(f"y contains previously unseen labels: " + f"{str(diff)}") + return np.searchsorted(uniques, values) + + +def _unique(values, *, return_inverse=False): + """Helper function to find unique values with support for python objects. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + Parameters + ---------- + values : ndarray + Values to check for unknowns. + return_inverse : bool, default=False + If True, also return the indices of the unique values. + Returns + ------- + unique : ndarray + The sorted unique values. + unique_inverse : ndarray + The indices to reconstruct the original array from the unique array. + Only provided if `return_inverse` is True. 
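A minimal plain-numpy sketch of the fast path used by _encode above for non-object dtypes: `uniques` must already be sorted, and each value is mapped to its position by binary search:

import numpy as np

uniques = np.array([2, 5, 9])             # sorted, as _encode requires
values = np.array([5, 2, 9, 9])
print(np.searchsorted(uniques, values))   # [1 0 2 2]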
+ """ + if values.dtype == object: + return _unique_python(values, return_inverse=return_inverse) + # numerical + out = np.unique(values, return_inverse=return_inverse) + + if return_inverse: + uniques, inverse = out + else: + uniques = out + + # np.unique will have duplicate missing values at the end of `uniques` + # here we clip the nans and remove it from uniques + if uniques.size and is_scalar_nan(uniques[-1]): + nan_idx = np.searchsorted(uniques, np.nan) + uniques = uniques[:nan_idx + 1] + if return_inverse: + inverse[inverse > nan_idx] = nan_idx + + if return_inverse: + return uniques, inverse + return uniques + + +def _unique_python(values, *, return_inverse): + # Only used in `_uniques`, see docstring there for details + try: + uniques_set = set(values) + uniques_set, missing_values = _extract_missing(uniques_set) + + uniques = sorted(uniques_set) + uniques.extend(missing_values.to_list()) + uniques = np.array(uniques, dtype=values.dtype) + except TypeError: + types = sorted(t.__qualname__ + for t in set(type(v) for v in values)) + raise TypeError("Encoders require their input to be uniformly " + f"strings or numbers. Got {types}") + + if return_inverse: + return uniques, _map_to_integer(values, uniques) + + return uniques + + +def _map_to_integer(values, uniques): + """Map values based on its position in uniques.""" + table = _nandict({val: i for i, val in enumerate(uniques)}) + return np.array([table[v] for v in values]) + + +class _nandict(dict): + """Dictionary with support for nans.""" + + def __init__(self, mapping): + super().__init__(mapping) + for key, value in mapping.items(): + if is_scalar_nan(key): + self.nan_value = value + break + + def __missing__(self, key): + if hasattr(self, 'nan_value') and is_scalar_nan(key): + return self.nan_value + raise KeyError(key) + + +class MissingValues(NamedTuple): + """Data class for missing data information""" + nan: bool + none: bool + + def to_list(self): + """Convert tuple to a list where None is always first.""" + output = [] + if self.none: + output.append(None) + if self.nan: + output.append(np.nan) + return output + + +def _extract_missing(values): + """Extract missing values from `values`. + Parameters + ---------- + values: set + Set of values to extract missing from. + Returns + ------- + output: set + Set with missing values extracted. + missing_values: MissingValues + Object with missing value information. + """ + missing_values_set = {value for value in values + if value is None or is_scalar_nan(value)} + + if not missing_values_set: + return values, MissingValues(nan=False, none=False) + + if None in missing_values_set: + if len(missing_values_set) == 1: + output_missing_values = MissingValues(nan=False, none=True) + else: + # If there is more than one missing value, then it has to be + # float('nan') or np.nan + output_missing_values = MissingValues(nan=True, none=True) + else: + output_missing_values = MissingValues(nan=True, none=False) + + # create set without the missing values + output = values - missing_values_set + return output, output_missing_values + + +def _check_unknown(values, known_values, return_mask=False): + """ + Helper function to check for unknowns in values to be encoded. + Uses pure python method for object dtype, and numpy method for + all other dtypes. + Parameters + ---------- + values : array + Values to check for unknowns. + known_values : array + Known values. Must be unique. 
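A small sketch (plain Python/numpy, mirroring _extract_missing above) of how None and NaN are pulled out of the value set and reported separately:

import numpy as np

values = {"a", None, float("nan"), "b"}
missing = {v for v in values if v is None or (isinstance(v, float) and np.isnan(v))}
print(sorted(values - missing))   # ['a', 'b']
print(missing)                    # {None, nan} -> MissingValues(nan=True, none=True)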
+ return_mask : bool, default=False + If True, return a mask of the same shape as `values` indicating + the valid values. + Returns + ------- + diff : list + The unique values present in `values` and not in `know_values`. + valid_mask : boolean array + Additionally returned if ``return_mask=True``. + """ + valid_mask = None + + if values.dtype.kind in 'OUS': + values_set = set(values) + values_set, missing_in_values = _extract_missing(values_set) + + uniques_set = set(known_values) + uniques_set, missing_in_uniques = _extract_missing(uniques_set) + diff = values_set - uniques_set + + nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan + none_in_diff = missing_in_values.none and not missing_in_uniques.none + + def is_valid(value): + return (value in uniques_set or + missing_in_uniques.none and value is None or + missing_in_uniques.nan and is_scalar_nan(value)) + + if return_mask: + if diff or nan_in_diff or none_in_diff: + valid_mask = np.array([is_valid(value) for value in values]) + else: + valid_mask = np.ones(len(values), dtype=bool) + + diff = list(diff) + if none_in_diff: + diff.append(None) + if nan_in_diff: + diff.append(np.nan) + else: + unique_values = np.unique(values) + diff = np.setdiff1d(unique_values, known_values, + assume_unique=True) + if return_mask: + if diff.size: + valid_mask = np.in1d(values, known_values) + else: + valid_mask = np.ones(len(values), dtype=bool) + + # check for nans in the known_values + if np.isnan(known_values).any(): + diff_is_nan = np.isnan(diff) + if diff_is_nan.any(): + # removes nan from valid_mask + if diff.size and return_mask: + is_nan = np.isnan(values) + valid_mask[is_nan] = 1 + + # remove nan from diff + diff = diff[~diff_is_nan] + diff = list(diff) + + if return_mask: + return diff, valid_mask + return diff diff --git a/afsklearn/_multiclass.py b/afsklearn/_multiclass.py new file mode 100644 index 0000000..6d9c644 --- /dev/null +++ b/afsklearn/_multiclass.py @@ -0,0 +1,34 @@ +import numpy as np # FIXME + +from .base import unique_labels + + +def _check_partial_fit_first_call(clf, classes=None): + """Private helper function for factorizing common classes param logic. + Estimators that implement the ``partial_fit`` API need to be provided with + the list of possible classes at the first call to partial_fit. + Subsequent calls to partial_fit should check that ``classes`` is still + consistent with a previous value of ``clf.classes_`` when provided. + This function returns True if it detects that this was the first call to + ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also + set on ``clf``. 
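The numeric branch of _check_unknown above boils down to a set difference plus a membership mask; a plain-numpy sketch:

import numpy as np

values = np.array([1, 2, 4])
known_values = np.array([1, 2, 3])
diff = np.setdiff1d(np.unique(values), known_values, assume_unique=True)
valid_mask = np.in1d(values, known_values)
print(diff, valid_mask)   # [4] [ True  True False]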
+ """ + if getattr(clf, 'classes_', None) is None and classes is None: + raise ValueError("classes must be passed on the first call " + "to partial_fit.") + + elif classes is not None: + if getattr(clf, 'classes_', None) is not None: + if not np.array_equal(clf.classes_, unique_labels(classes)): + raise ValueError( + "`classes=%r` is not the same as on last call " + "to partial_fit, was: %r" % (classes, clf.classes_)) + + else: + # This is the first call to partial_fit + clf.classes_ = unique_labels(classes) + return True + + # classes is None and clf.classes_ has already previously been set: + # nothing to do + return False diff --git a/afsklearn/_validation.py b/afsklearn/_validation.py index 1620c64..d58ab52 100644 --- a/afsklearn/_validation.py +++ b/afsklearn/_validation.py @@ -11,9 +11,11 @@ from sklearn.utils.validation import _deprecate_positional_args from sklearn._config import get_config as _get_config + def _object_dtype_isnan(X): return X != X + def is_scalar_nan(x): """Tests if x is NaN. This function is meant to overcome the issue that np.isnan does not allow @@ -60,6 +62,56 @@ def check_consistent_length(*arrays): " samples: %r" % [int(l) for l in lengths]) +def _check_sample_weight(sample_weight, X, dtype=None, copy=False): + """Validate sample weights. + Note that passing sample_weight=None will output an array of ones. + Therefore, in some cases, you may want to protect the call with: + if sample_weight is not None: + sample_weight = _check_sample_weight(...) + Parameters + ---------- + sample_weight : {ndarray, Number or None}, shape (n_samples,) + Input sample weights. + X : {ndarray, list, sparse matrix} + Input data. + dtype: dtype, default=None + dtype of the validated `sample_weight`. + If None, and the input `sample_weight` is an array, the dtype of the + input is preserved; otherwise an array with the default numpy dtype + is be allocated. If `dtype` is not one of `float32`, `float64`, + `None`, the output will be of dtype `float64`. + copy : bool, default=False + If True, a copy of sample_weight will be created. + Returns + ------- + sample_weight : ndarray of shape (n_samples,) + Validated sample weight. It is guaranteed to be "C" contiguous. + """ + n_samples = _num_samples(X) + + if dtype is not None and dtype not in [np.float32, np.float64]: + dtype = np.float64 + + if sample_weight is None: + sample_weight = np.ones(n_samples, dtype=dtype) + elif isinstance(sample_weight, numbers.Number): + sample_weight = np.full(n_samples, sample_weight, dtype=dtype) + else: + if dtype is None: + dtype = [np.float64, np.float32] + sample_weight = check_array( + sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, + order="C", copy=copy + ) + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError("sample_weight.shape == {}, expected {}!" 
+ .format(sample_weight.shape, (n_samples,))) + return sample_weight + + def _safe_accumulator_op(op, x, *args, **kwargs): """ This function provides numpy accumulator functions with a float64 dtype diff --git a/afsklearn/base.py b/afsklearn/base.py index 097d2df..e66cfe2 100644 --- a/afsklearn/base.py +++ b/afsklearn/base.py @@ -1,15 +1,16 @@ -import numbers -import warnings +from collections.abc import Sequence +from itertools import chain import arrayfire as af import numpy as np import scipy.sparse as sp -from numpy.core.numeric import ComplexWarning +from scipy.sparse.base import spmatrix from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing import LabelBinarizer +from sklearn.utils.validation import _deprecate_positional_args -from ._validation import ( - _assert_all_finite, _ensure_no_complex_data, _num_samples, _safe_accumulator_op, check_array, - check_consistent_length, check_X_y, column_or_1d) +from ._validation import (_assert_all_finite, _num_samples, check_array, + check_is_fitted, check_X_y, column_or_1d) # Class inheriting from BaseEstimator @@ -100,24 +101,10 @@ def _validate_data(self, X, y=None, reset=True, # Class inheriting from TransformerMixin # all methods that touch np.array are replaced # with ArrayFire compatible functionality -class afTransformerMixin(TransformerMixin): - pass -import numbers -import warnings -from collections.abc import Sequence -from itertools import chain -import arrayfire as af -#import numpy as np -import numpy -import numpy as np -import scipy.sparse as sp -from scipy.sparse.base import spmatrix -from sklearn.preprocessing import LabelBinarizer -from sklearn.utils.validation import _deprecate_positional_args - -from ._validation import _num_samples, check_array, check_is_fitted, column_or_1d +class afTransformerMixin(TransformerMixin): + pass def _unique_multiclass(y): @@ -139,6 +126,7 @@ def _unique_indicator(y): 'multilabel-indicator': _unique_indicator, } + def unique_labels(*ys): """Extract an ordered array of unique labels @@ -205,6 +193,7 @@ def unique_labels(*ys): return np.array(sorted(ys_labels)) + def is_multilabel(y): """ Check if ``y`` is in a multilabel format. @@ -250,6 +239,7 @@ def is_multilabel(y): return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint _is_integral_float(labels)) + def type_of_target(y): """Determine the type of data indicated by the target. @@ -370,6 +360,7 @@ def type_of_target(y): else: return 'binary' # [1, 2] or [["a"], ["b"]] + def _inverse_binarize_multiclass(y, classes): """Inverse label binarization transformation for multiclass. 
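For reference, typical outputs of the upstream scikit-learn helpers that the af versions of unique_labels and type_of_target above are meant to mirror (a sketch, not part of the diff):

import numpy as np
from sklearn.utils.multiclass import type_of_target, unique_labels

print(type_of_target(np.array([0, 1, 1, 0])))      # 'binary'
print(type_of_target(np.array([0.1, 0.6, 0.8])))   # 'continuous'
print(unique_labels([1, 2, 2], [3]))               # [1 2 3]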
@@ -459,12 +450,13 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): def af_in1d(arr0, arr1): - #temporarily perform computation in numy, potentially change to arrayfire + # temporarily perform computation in numy, potentially change to arrayfire #a0 = arr0.to_ndarray() #a1 = arr1.to_ndarray() isin = np.in1d(arr0, arr1) return isin + @_deprecate_positional_args def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False): @@ -586,7 +578,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, y_in_classes = af.interop.from_ndarray(y_in_classes, copy=True) y[y_in_classes] y_seen = y[y_in_classes] - y_seen = y_seen#.to_ndarray() + y_seen = y_seen # .to_ndarray() indices = np.searchsorted(sorted_class, y_seen) indptr = np.hstack((0, np.cumsum(y_in_classes))) @@ -594,8 +586,8 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, data.fill(pos_label) Y = data - #Y = sp.csr_matrix((data, indices, indptr), - #shape=(n_samples, n_classes)) + # Y = sp.csr_matrix((data, indices, indptr), + # shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": Y = sp.csr_matrix(y) if pos_label != 1: @@ -607,7 +599,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, "binarization" % y_type) if not sparse_output: - #Y = Y.toarray() #TODO: test if ndarray, then cast if not + # Y = Y.toarray() #TODO: test if ndarray, then cast if not Y = Y.astype(int, copy=False) if neg_label != 0: diff --git a/afsklearn/linear_model/__init__.py b/afsklearn/linear_model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/afsklearn/linear_model/base.py b/afsklearn/linear_model/base.py new file mode 100644 index 0000000..0a4033d --- /dev/null +++ b/afsklearn/linear_model/base.py @@ -0,0 +1,53 @@ +import numpy as np # FIXME +import scipy.sparse as sp +from sklearn.linear_model._base import SPARSE_INTERCEPT_DECAY +from sklearn.utils._seq_dataset import ArrayDataset32, ArrayDataset64, CSRDataset32, CSRDataset64 + +from .._validation import check_random_state + + +def make_dataset(X, y, sample_weight, random_state=None): + """Create ``Dataset`` abstraction for sparse and dense inputs. + This also returns the ``intercept_decay`` which is different + for sparse datasets. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data + y : array-like, shape (n_samples, ) + Target values. + sample_weight : numpy array of shape (n_samples,) + The weight of each sample + random_state : int, RandomState instance or None (default) + Determines random number generation for dataset shuffling and noise. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. 
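For orientation, the upstream scikit-learn label_binarize that the routine above is adapted from produces a dense indicator per class (a reference sketch, not a claim about the patched dense path, which currently returns the raw data array):

import numpy as np
from sklearn.preprocessing import label_binarize

print(label_binarize([1, 6], classes=[1, 2, 4, 6]))
# [[1 0 0 0]
#  [0 0 0 1]]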
+ Returns + ------- + dataset + The ``Dataset`` abstraction + intercept_decay + The intercept decay + """ + + rng = check_random_state(random_state) + # seed should never be 0 in SequentialDataset64 + seed = rng.randint(1, np.iinfo(np.int32).max) + + if X.dtype == np.float32: + CSRData = CSRDataset32 + ArrayData = ArrayDataset32 + else: + CSRData = CSRDataset64 + ArrayData = ArrayDataset64 + + if sp.issparse(X): + dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight, + seed=seed) + intercept_decay = SPARSE_INTERCEPT_DECAY + else: + X = np.ascontiguousarray(X) + dataset = ArrayData(X, y, sample_weight, seed=seed) + intercept_decay = 1.0 + + return dataset, intercept_decay diff --git a/afsklearn/linear_model/sgd_base.py b/afsklearn/linear_model/sgd_base.py new file mode 100644 index 0000000..01313e6 --- /dev/null +++ b/afsklearn/linear_model/sgd_base.py @@ -0,0 +1,675 @@ +import warnings +from abc import ABCMeta, abstractmethod + +import numpy as np # FIXME +from joblib import Parallel +from sklearn.base import clone, is_classifier +from sklearn.exceptions import ConvergenceWarning +from sklearn.linear_model._base import SparseCoefMixin +from sklearn.linear_model._sgd_fast import ( # FIXME + EpsilonInsensitive, Hinge, Huber, Log, ModifiedHuber, SquaredEpsilonInsensitive, SquaredHinge, SquaredLoss, + _plain_sgd) +from sklearn.linear_model._stochastic_gradient import DEFAULT_EPSILON, LEARNING_RATE_TYPES, MAX_INT, PENALTY_TYPES +from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit +from sklearn.utils import deprecated +from sklearn.utils.fixes import _joblib_parallel_args, delayed +from sklearn.utils.validation import _deprecate_positional_args + +from .._class_weight import compute_class_weight +from .._classifier_mixin import afLinearClassifierMixin +from .._multiclass import _check_partial_fit_first_call +from .._validation import _check_sample_weight, check_random_state, check_X_y +from ..base import afBaseEstimator +from .base import make_dataset + + +class _ValidationScoreCallback: + """Callback for early stopping based on validation score""" + + def __init__(self, estimator, X_val, y_val, sample_weight_val, + classes=None): + self.estimator = clone(estimator) + self.estimator.t_ = 1 # to pass check_is_fitted + if classes is not None: + self.estimator.classes_ = classes + self.X_val = X_val + self.y_val = y_val + self.sample_weight_val = sample_weight_val + + def __call__(self, coef, intercept): + est = self.estimator + est.coef_ = coef.reshape(1, -1) + est.intercept_ = np.atleast_1d(intercept) + return est.score(self.X_val, self.y_val, self.sample_weight_val) + + +class afBaseSGD(SparseCoefMixin, afBaseEstimator, metaclass=ABCMeta): + """Base class for SGD classification and regression.""" + @_deprecate_positional_args + def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, + shuffle=True, verbose=0, epsilon=0.1, random_state=None, + learning_rate="optimal", eta0=0.0, power_t=0.5, + early_stopping=False, validation_fraction=0.1, + n_iter_no_change=5, warm_start=False, average=False): + self.loss = loss + self.penalty = penalty + self.learning_rate = learning_rate + self.epsilon = epsilon + self.alpha = alpha + self.C = C + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + self.shuffle = shuffle + self.random_state = random_state + self.verbose = verbose + self.eta0 = eta0 + self.power_t = power_t + self.early_stopping = early_stopping + self.validation_fraction = 
validation_fraction + self.n_iter_no_change = n_iter_no_change + self.warm_start = warm_start + self.average = average + self.max_iter = max_iter + self.tol = tol + # current tests expect init to do parameter validation + # but we are not allowed to set attributes + self._validate_params() + + def set_params(self, **kwargs): + """Set and validate the parameters of estimator. + Parameters + ---------- + **kwargs : dict + Estimator parameters. + Returns + ------- + self : object + Estimator instance. + """ + super().set_params(**kwargs) + self._validate_params() + return self + + @abstractmethod + def fit(self, X, y): + """Fit model.""" + + def _validate_params(self, for_partial_fit=False): + """Validate input params. """ + if not isinstance(self.shuffle, bool): + raise ValueError("shuffle must be either True or False") + if not isinstance(self.early_stopping, bool): + raise ValueError("early_stopping must be either True or False") + if self.early_stopping and for_partial_fit: + raise ValueError("early_stopping should be False with partial_fit") + if self.max_iter is not None and self.max_iter <= 0: + raise ValueError("max_iter must be > zero. Got %f" % self.max_iter) + if not (0.0 <= self.l1_ratio <= 1.0): + raise ValueError("l1_ratio must be in [0, 1]") + if self.alpha < 0.0: + raise ValueError("alpha must be >= 0") + if self.n_iter_no_change < 1: + raise ValueError("n_iter_no_change must be >= 1") + if not (0.0 < self.validation_fraction < 1.0): + raise ValueError("validation_fraction must be in range (0, 1)") + if self.learning_rate in ("constant", "invscaling", "adaptive"): + if self.eta0 <= 0.0: + raise ValueError("eta0 must be > 0") + if self.learning_rate == "optimal" and self.alpha == 0: + raise ValueError("alpha must be > 0 since " + "learning_rate is 'optimal'. alpha is used " + "to compute the optimal learning rate.") + + # raises ValueError if not registered + self._get_penalty_type(self.penalty) + self._get_learning_rate_type(self.learning_rate) + + if self.loss not in self.loss_functions: + raise ValueError("The loss %s is not supported. " % self.loss) + + def _get_loss_function(self, loss): + """Get concrete ``LossFunction`` object for str ``loss``. """ + try: + loss_ = self.loss_functions[loss] + loss_class, args = loss_[0], loss_[1:] + if loss in ('huber', 'epsilon_insensitive', + 'squared_epsilon_insensitive'): + args = (self.epsilon, ) + return loss_class(*args) + except KeyError as e: + raise ValueError("The loss %s is not supported. " % loss) from e + + def _get_learning_rate_type(self, learning_rate): + try: + return LEARNING_RATE_TYPES[learning_rate] + except KeyError as e: + raise ValueError("learning rate %s " + "is not supported. " % learning_rate) from e + + def _get_penalty_type(self, penalty): + penalty = str(penalty).lower() + try: + return PENALTY_TYPES[penalty] + except KeyError as e: + raise ValueError("Penalty %s is not supported. " % penalty) from e + + def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, + intercept_init=None): + """Allocate mem for parameters; initialize if provided.""" + if n_classes > 2: + # allocate coef_ for multi-class + if coef_init is not None: + coef_init = np.asarray(coef_init, order="C") + if coef_init.shape != (n_classes, n_features): + raise ValueError("Provided ``coef_`` does not match " + "dataset. 
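_validate_params above runs at construction time, so bad hyper-parameters fail fast; a quick sketch, assuming the patched afsklearn package and its arrayfire dependency are importable:

from afsklearn.linear_model.sgd_classifier import SGDClassifier

try:
    SGDClassifier(alpha=-1.0)    # rejected before fit is ever called
except ValueError as exc:
    print(exc)                   # alpha must be >= 0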
") + self.coef_ = coef_init + else: + self.coef_ = np.zeros((n_classes, n_features), + dtype=np.float64, order="C") + + # allocate intercept_ for multi-class + if intercept_init is not None: + intercept_init = np.asarray(intercept_init, order="C") + if intercept_init.shape != (n_classes, ): + raise ValueError("Provided intercept_init " + "does not match dataset.") + self.intercept_ = intercept_init + else: + self.intercept_ = np.zeros(n_classes, dtype=np.float64, + order="C") + else: + # allocate coef_ for binary problem + if coef_init is not None: + coef_init = np.asarray(coef_init, dtype=np.float64, + order="C") + coef_init = coef_init.ravel() + if coef_init.shape != (n_features,): + raise ValueError("Provided coef_init does not " + "match dataset.") + self.coef_ = coef_init + else: + self.coef_ = np.zeros(n_features, + dtype=np.float64, + order="C") + + # allocate intercept_ for binary problem + if intercept_init is not None: + intercept_init = np.asarray(intercept_init, dtype=np.float64) + if intercept_init.shape != (1,) and intercept_init.shape != (): + raise ValueError("Provided intercept_init " + "does not match dataset.") + self.intercept_ = intercept_init.reshape(1,) + else: + self.intercept_ = np.zeros(1, dtype=np.float64, order="C") + + # initialize average parameters + if self.average > 0: + self._standard_coef = self.coef_ + self._standard_intercept = self.intercept_ + self._average_coef = np.zeros(self.coef_.shape, + dtype=np.float64, + order="C") + self._average_intercept = np.zeros(self._standard_intercept.shape, + dtype=np.float64, + order="C") + + def _make_validation_split(self, y): + """Split the dataset between training set and validation set. + Parameters + ---------- + y : ndarray of shape (n_samples, ) + Target values. + Returns + ------- + validation_mask : ndarray of shape (n_samples, ) + Equal to 1 on the validation set, 0 on the training set. + """ + n_samples = y.shape[0] + validation_mask = np.zeros(n_samples, dtype=np.uint8) + if not self.early_stopping: + # use the full set for training, with an empty validation set + return validation_mask + + if is_classifier(self): + splitter_type = StratifiedShuffleSplit + else: + splitter_type = ShuffleSplit + cv = splitter_type(test_size=self.validation_fraction, + random_state=self.random_state) + idx_train, idx_val = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y)) + if idx_train.shape[0] == 0 or idx_val.shape[0] == 0: + raise ValueError( + "Splitting %d samples into a train set and a validation set " + "with validation_fraction=%r led to an empty set (%d and %d " + "samples). Please either change validation_fraction, increase " + "number of samples, or disable early_stopping." 
+ % (n_samples, self.validation_fraction, idx_train.shape[0], + idx_val.shape[0])) + + validation_mask[idx_val] = 1 + return validation_mask + + def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, + classes=None): + if not self.early_stopping: + return None + + return _ValidationScoreCallback( + self, X[validation_mask], y[validation_mask], + sample_weight[validation_mask], classes=classes) + + # mypy error: Decorated property not supported + @deprecated("Attribute standard_coef_ was deprecated " # type: ignore + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") + @property + def standard_coef_(self): + return self._standard_coef + + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute standard_intercept_ was deprecated " + "in version 0.23 and will be removed in 1.0 (renaming of 0.25)." + ) + @property + def standard_intercept_(self): + return self._standard_intercept + + # mypy error: Decorated property not supported + @deprecated("Attribute average_coef_ was deprecated " # type: ignore + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") + @property + def average_coef_(self): + return self._average_coef + + # mypy error: Decorated property not supported + @deprecated("Attribute average_intercept_ was deprecated " # type: ignore + "in version 0.23 and will be removed in 1.0 " + "(renaming of 0.25).") + @property + def average_intercept_(self): + return self._average_intercept + + +class afBaseSGDClassifier(afLinearClassifierMixin, afBaseSGD, metaclass=ABCMeta): + + loss_functions = { + "hinge": (Hinge, 1.0), + "squared_hinge": (SquaredHinge, 1.0), + "perceptron": (Hinge, 0.0), + "log": (Log, ), + "modified_huber": (ModifiedHuber, ), + "squared_loss": (SquaredLoss, ), + "huber": (Huber, DEFAULT_EPSILON), + "epsilon_insensitive": (EpsilonInsensitive, DEFAULT_EPSILON), + "squared_epsilon_insensitive": (SquaredEpsilonInsensitive, + DEFAULT_EPSILON), + } + + @abstractmethod + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, + shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, + random_state=None, learning_rate="optimal", eta0=0.0, + power_t=0.5, early_stopping=False, + validation_fraction=0.1, n_iter_no_change=5, + class_weight=None, warm_start=False, average=False): + + super().__init__( + loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, + shuffle=shuffle, verbose=verbose, epsilon=epsilon, + random_state=random_state, learning_rate=learning_rate, eta0=eta0, + power_t=power_t, early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, warm_start=warm_start, + average=average) + self.class_weight = class_weight + self.n_jobs = n_jobs + + def _partial_fit(self, X, y, alpha, C, + loss, learning_rate, max_iter, + classes, sample_weight, + coef_init, intercept_init): + X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, + order="C", accept_large_sparse=False) + + n_samples, n_features = X.shape + + _check_partial_fit_first_call(self, classes) + + n_classes = self.classes_.shape[0] + + # Allocate datastructures from input arguments + self._expanded_class_weight = compute_class_weight( + self.class_weight, classes=self.classes_, y=y) + sample_weight = _check_sample_weight(sample_weight, X) + + if getattr(self, "coef_", None) is None or coef_init is not 
None: + self._allocate_parameter_mem(n_classes, n_features, + coef_init, intercept_init) + elif n_features != self.coef_.shape[-1]: + raise ValueError("Number of features %d does not match previous " + "data %d." % (n_features, self.coef_.shape[-1])) + + self.loss_function_ = self._get_loss_function(loss) + if not hasattr(self, "t_"): + self.t_ = 1.0 + + # delegate to concrete training procedure + if n_classes > 2: + self._fit_multiclass(X, y, alpha=alpha, C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter) + elif n_classes == 2: + self._fit_binary(X, y, alpha=alpha, C=C, + learning_rate=learning_rate, + sample_weight=sample_weight, + max_iter=max_iter) + else: + raise ValueError( + "The number of classes has to be greater than one;" + " got %d class" % n_classes) + + return self + + def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, + intercept_init=None, sample_weight=None): + self._validate_params() + if hasattr(self, "classes_"): + self.classes_ = None + + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) + + # labels can be encoded as float, int, or string literals + # np.unique sorts in asc order; largest class id is positive class + classes = np.unique(y) + + if self.warm_start and hasattr(self, "coef_"): + if coef_init is None: + coef_init = self.coef_ + if intercept_init is None: + intercept_init = self.intercept_ + else: + self.coef_ = None + self.intercept_ = None + + if self.average > 0: + self._standard_coef = self.coef_ + self._standard_intercept = self.intercept_ + self._average_coef = None + self._average_intercept = None + + # Clear iteration count for multiple call to fit. + self.t_ = 1.0 + + self._partial_fit(X, y, alpha, C, loss, learning_rate, self.max_iter, + classes, sample_weight, coef_init, intercept_init) + + if (self.tol is not None and self.tol > -np.inf + and self.n_iter_ == self.max_iter): + warnings.warn("Maximum number of iteration reached before " + "convergence. Consider increasing max_iter to " + "improve the fit.", + ConvergenceWarning) + return self + + def _fit_binary(self, X, y, alpha, C, sample_weight, + learning_rate, max_iter): + """Fit a binary classifier on X and y. """ + coef, intercept, n_iter_ = fit_binary(self, 1, X, y, alpha, C, + learning_rate, max_iter, + self._expanded_class_weight[1], + self._expanded_class_weight[0], + sample_weight, + random_state=self.random_state) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + # need to be 2d + if self.average > 0: + if self.average <= self.t_ - 1: + self.coef_ = self._average_coef.reshape(1, -1) + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef.reshape(1, -1) + self._standard_intercept = np.atleast_1d(intercept) + self.intercept_ = self._standard_intercept + else: + self.coef_ = coef.reshape(1, -1) + # intercept is a float, need to convert it to an array of length 1 + self.intercept_ = np.atleast_1d(intercept) + + def _fit_multiclass(self, X, y, alpha, C, learning_rate, + sample_weight, max_iter): + """Fit a multi-class classifier by combining binary classifiers + Each binary classifier predicts one class versus all others. This + strategy is called OvA (One versus All) or OvR (One versus Rest). + """ + # Precompute the validation split using the multiclass labels + # to ensure proper balancing of the classes. + validation_mask = self._make_validation_split(y) + + # Use joblib to fit OvA in parallel. 
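The OvA reduction used here relabels each class in turn as +1 against all others (see _prepare_fit_binary further down in this file); a plain-numpy sketch:

import numpy as np

y = np.array([0, 1, 2, 1])
classes = np.unique(y)
i = 1                                        # binary problem "class 1 vs. rest"
y_i = np.where(y == classes[i], 1.0, -1.0)
print(y_i)                                   # [-1.  1. -1.  1.]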
+ # Pick the random seed for each job outside of fit_binary to avoid + # sharing the estimator random state between threads which could lead + # to non-deterministic behavior + random_state = check_random_state(self.random_state) + seeds = random_state.randint(MAX_INT, size=len(self.classes_)) + result = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, + **_joblib_parallel_args(require="sharedmem"))( + delayed(fit_binary)(self, i, X, y, alpha, C, learning_rate, + max_iter, self._expanded_class_weight[i], + 1., sample_weight, + validation_mask=validation_mask, + random_state=seed) + for i, seed in enumerate(seeds)) + + # take the maximum of n_iter_ over every binary fit + n_iter_ = 0. + for i, (_, intercept, n_iter_i) in enumerate(result): + self.intercept_[i] = intercept + n_iter_ = max(n_iter_, n_iter_i) + + self.t_ += n_iter_ * X.shape[0] + self.n_iter_ = n_iter_ + + if self.average > 0: + if self.average <= self.t_ - 1.0: + self.coef_ = self._average_coef + self.intercept_ = self._average_intercept + else: + self.coef_ = self._standard_coef + self._standard_intercept = np.atleast_1d(self.intercept_) + self.intercept_ = self._standard_intercept + + def partial_fit(self, X, y, classes=None, sample_weight=None): + """Perform one epoch of stochastic gradient descent on given samples. + Internally, this method uses ``max_iter = 1``. Therefore, it is not + guaranteed that a minimum of the cost function is reached after calling + it once. Matters such as objective convergence and early stopping + should be handled by the user. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Subset of the training data. + y : ndarray of shape (n_samples,) + Subset of the target values. + classes : ndarray of shape (n_classes,), default=None + Classes across all calls to partial_fit. + Can be obtained by via `np.unique(y_all)`, where y_all is the + target vector of the entire dataset. + This argument is required for the first call to partial_fit + and can be omitted in the subsequent calls. + Note that y doesn't need to contain all labels in `classes`. + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. + Returns + ------- + self : + Returns an instance of self. + """ + self._validate_params(for_partial_fit=True) + if self.class_weight in ['balanced']: + raise ValueError("class_weight '{0}' is not supported for " + "partial_fit. In order to use 'balanced' weights," + " use compute_class_weight('{0}', " + "classes=classes, y=y). " + "In place of y you can us a large enough sample " + "of the full training set target to properly " + "estimate the class frequency distributions. " + "Pass the resulting weights as the class_weight " + "parameter.".format(self.class_weight)) + return self._partial_fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss, + learning_rate=self.learning_rate, max_iter=1, + classes=classes, sample_weight=sample_weight, + coef_init=None, intercept_init=None) + + def fit(self, X, y, coef_init=None, intercept_init=None, + sample_weight=None): + """Fit linear model with Stochastic Gradient Descent. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + y : ndarray of shape (n_samples,) + Target values. + coef_init : ndarray of shape (n_classes, n_features), default=None + The initial coefficients to warm-start the optimization. 
+ intercept_init : ndarray of shape (n_classes,), default=None + The initial intercept to warm-start the optimization. + sample_weight : array-like, shape (n_samples,), default=None + Weights applied to individual samples. + If not provided, uniform weights are assumed. These weights will + be multiplied with class_weight (passed through the + constructor) if class_weight is specified. + Returns + ------- + self : + Returns an instance of self. + """ + return self._fit(X, y, alpha=self.alpha, C=1.0, + loss=self.loss, learning_rate=self.learning_rate, + coef_init=coef_init, intercept_init=intercept_init, + sample_weight=sample_weight) + + +def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, + pos_weight, neg_weight, sample_weight, validation_mask=None, + random_state=None): + """Fit a single binary classifier. + The i'th class is considered the "positive" class. + Parameters + ---------- + est : Estimator object + The estimator to fit + i : int + Index of the positive class + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + y : numpy array of shape [n_samples, ] + Target values + alpha : float + The regularization parameter + C : float + Maximum step size for passive aggressive + learning_rate : string + The learning rate. Accepted values are 'constant', 'optimal', + 'invscaling', 'pa1' and 'pa2'. + max_iter : int + The maximum number of iterations (epochs) + pos_weight : float + The weight of the positive class + neg_weight : float + The weight of the negative class + sample_weight : numpy array of shape [n_samples, ] + The weight of each sample + validation_mask : numpy array of shape [n_samples, ], default=None + Precomputed validation mask in case _fit_binary is called in the + context of a one-vs-rest reduction. + random_state : int, RandomState instance, default=None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. 
+ """ + # if average is not true, average_coef, and average_intercept will be + # unused + y_i, coef, intercept, average_coef, average_intercept = \ + _prepare_fit_binary(est, y, i) + assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0] + + random_state = check_random_state(random_state) + dataset, intercept_decay = make_dataset( + X, y_i, sample_weight, random_state=random_state) + + penalty_type = est._get_penalty_type(est.penalty) + learning_rate_type = est._get_learning_rate_type(learning_rate) + + if validation_mask is None: + validation_mask = est._make_validation_split(y_i) + classes = np.array([-1, 1], dtype=y_i.dtype) + validation_score_cb = est._make_validation_score_cb( + validation_mask, X, y_i, sample_weight, classes=classes) + + # numpy mtrand expects a C long which is a signed 32 bit integer under + # Windows + seed = random_state.randint(MAX_INT) + + tol = est.tol if est.tol is not None else -np.inf + + coef, intercept, average_coef, average_intercept, n_iter_ = _plain_sgd( + coef, intercept, average_coef, average_intercept, est.loss_function_, + penalty_type, alpha, C, est.l1_ratio, dataset, validation_mask, + est.early_stopping, validation_score_cb, int(est.n_iter_no_change), + max_iter, tol, int(est.fit_intercept), int(est.verbose), + int(est.shuffle), seed, pos_weight, neg_weight, learning_rate_type, + est.eta0, est.power_t, est.t_, intercept_decay, est.average) + + if est.average: + if len(est.classes_) == 2: + est._average_intercept[0] = average_intercept + else: + est._average_intercept[i] = average_intercept + + return coef, intercept, n_iter_ + + +def _prepare_fit_binary(est, y, i): + """Initialization for fit_binary. + Returns y, coef, intercept, average_coef, average_intercept. + """ + y_i = np.ones(y.shape, dtype=np.float64, order="C") + y_i[y != est.classes_[i]] = -1.0 + average_intercept = 0 + average_coef = None + + if len(est.classes_) == 2: + if not est.average: + coef = est.coef_.ravel() + intercept = est.intercept_[0] + else: + coef = est._standard_coef.ravel() + intercept = est._standard_intercept[0] + average_coef = est._average_coef.ravel() + average_intercept = est._average_intercept[0] + else: + if not est.average: + coef = est.coef_[i] + intercept = est.intercept_[i] + else: + coef = est._standard_coef[i] + intercept = est._standard_intercept[i] + average_coef = est._average_coef[i] + average_intercept = est._average_intercept[i] + + return y_i, coef, intercept, average_coef, average_intercept diff --git a/afsklearn/linear_model/sgd_classifier.py b/afsklearn/linear_model/sgd_classifier.py new file mode 100644 index 0000000..04a755e --- /dev/null +++ b/afsklearn/linear_model/sgd_classifier.py @@ -0,0 +1,329 @@ +import numpy as np # FIXME +from sklearn.linear_model._stochastic_gradient import DEFAULT_EPSILON +from sklearn.utils import _deprecate_positional_args + +from .._validation import check_is_fitted +from .sgd_base import afBaseSGDClassifier + + +class SGDClassifier(afBaseSGDClassifier): + """Linear classifiers (SVM, logistic regression, etc.) with SGD training. + This estimator implements regularized linear models with stochastic + gradient descent (SGD) learning: the gradient of the loss is estimated + each sample at a time and the model is updated along the way with a + decreasing strength schedule (aka learning rate). SGD allows minibatch + (online/out-of-core) learning via the `partial_fit` method. + For best results using the default learning rate schedule, the data should + have zero mean and unit variance. 
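The partial_fit path mentioned above keeps the estimator usable for out-of-core training; a minimal sketch with the upstream scikit-learn SGDClassifier, whose fit/partial_fit API the patched class mirrors:

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.array([[-1.0, -1.0], [-2.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
y = np.array([1, 1, 2, 2])
clf = SGDClassifier(random_state=0)
for _ in range(5):                                # one epoch per call
    clf.partial_fit(X, y, classes=np.unique(y))   # classes required on the first call
print(clf.predict([[-0.8, -1.0]]))                # should report class 1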
+ This implementation works with data represented as dense or sparse arrays + of floating point values for the features. The model it fits can be + controlled with the loss parameter; by default, it fits a linear support + vector machine (SVM). + The regularizer is a penalty added to the loss function that shrinks model + parameters towards the zero vector using either the squared euclidean norm + L2 or the absolute norm L1 or a combination of both (Elastic Net). If the + parameter update crosses the 0.0 value because of the regularizer, the + update is truncated to 0.0 to allow for learning sparse models and achieve + online feature selection. + Read more in the :ref:`User Guide `. + Parameters + ---------- + loss : str, default='hinge' + The loss function to be used. Defaults to 'hinge', which gives a + linear SVM. + The possible options are 'hinge', 'log', 'modified_huber', + 'squared_hinge', 'perceptron', or a regression loss: 'squared_loss', + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + The 'log' loss gives logistic regression, a probabilistic classifier. + 'modified_huber' is another smooth loss that brings tolerance to + outliers as well as probability estimates. + 'squared_hinge' is like hinge but is quadratically penalized. + 'perceptron' is the linear loss used by the perceptron algorithm. + The other losses are designed for regression but can be useful in + classification as well; see + :class:`~sklearn.linear_model.SGDRegressor` for a description. + More details about the losses formulas can be found in the + :ref:`User Guide `. + penalty : {'l2', 'l1', 'elasticnet'}, default='l2' + The penalty (aka regularization term) to be used. Defaults to 'l2' + which is the standard regularizer for linear SVM models. 'l1' and + 'elasticnet' might bring sparsity to the model (feature selection) + not achievable with 'l2'. + alpha : float, default=0.0001 + Constant that multiplies the regularization term. The higher the + value, the stronger the regularization. + Also used to compute the learning rate when set to `learning_rate` is + set to 'optimal'. + l1_ratio : float, default=0.15 + The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. + l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. + Only used if `penalty` is 'elasticnet'. + fit_intercept : bool, default=True + Whether the intercept should be estimated or not. If False, the + data is assumed to be already centered. + max_iter : int, default=1000 + The maximum number of passes over the training data (aka epochs). + It only impacts the behavior in the ``fit`` method, and not the + :meth:`partial_fit` method. + .. versionadded:: 0.19 + tol : float, default=1e-3 + The stopping criterion. If it is not None, training will stop + when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive + epochs. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + .. versionadded:: 0.19 + shuffle : bool, default=True + Whether or not the training data should be shuffled after each epoch. + verbose : int, default=0 + The verbosity level. + epsilon : float, default=0.1 + Epsilon in the epsilon-insensitive loss functions; only if `loss` is + 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'. + For 'huber', determines the threshold at which it becomes less + important to get the prediction exactly right. 
+ For epsilon-insensitive, any differences between the current prediction + and the correct label are ignored if they are less than this threshold. + n_jobs : int, default=None + The number of CPUs to use to do the OVA (One Versus All, for + multi-class problems) computation. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. + random_state : int, RandomState instance, default=None + Used for shuffling the data, when ``shuffle`` is set to ``True``. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. + learning_rate : str, default='optimal' + The learning rate schedule: + - 'constant': `eta = eta0` + - 'optimal': `eta = 1.0 / (alpha * (t + t0))` + where t0 is chosen by a heuristic proposed by Leon Bottou. + - 'invscaling': `eta = eta0 / pow(t, power_t)` + - 'adaptive': eta = eta0, as long as the training keeps decreasing. + Each time n_iter_no_change consecutive epochs fail to decrease the + training loss by tol or fail to increase validation score by tol if + early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.0 + The initial learning rate for the 'constant', 'invscaling' or + 'adaptive' schedules. The default value is 0.0 as eta0 is not used by + the default schedule 'optimal'. + power_t : double, default=0.5 + The exponent for inverse scaling learning rate [default 0.5]. + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to True, it will automatically set aside + a stratified fraction of training data as validation and terminate + training when validation score returned by the `score` method is not + improving by at least tol for n_iter_no_change consecutive epochs. + .. versionadded:: 0.20 + Added 'early_stopping' option + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if `early_stopping` is True. + .. versionadded:: 0.20 + Added 'validation_fraction' option + n_iter_no_change : int, default=5 + Number of iterations with no improvement to wait before stopping + fitting. + Convergence is checked against the training loss or the + validation loss depending on the `early_stopping` parameter. + .. versionadded:: 0.20 + Added 'n_iter_no_change' option + class_weight : dict, {class_label: weight} or "balanced", default=None + Preset for the class_weight fit parameter. + Weights associated with classes. If not given, all classes + are supposed to have weight one. + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))``. + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + See :term:`the Glossary `. + Repeatedly calling fit or partial_fit when warm_start is True can + result in a different solution than when calling fit a single time + because of the way the data is shuffled. + If a dynamic learning rate is used, the learning rate is adapted + depending on the number of samples already seen. Calling ``fit`` resets + this counter, while ``partial_fit`` will result in increasing the + existing counter. 
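The 'constant' and 'invscaling' schedules listed above are simple closed forms; a quick numeric sketch of eta = eta0 / pow(t, power_t):

eta0, power_t = 0.1, 0.5
for t in (1, 4, 100):
    print(t, eta0 / t ** power_t)    # 0.1, 0.05, ~0.01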
+ average : bool or int, default=False + When set to True, computes the averaged SGD weights accross all + updates and stores the result in the ``coef_`` attribute. If set to + an int greater than 1, averaging will begin once the total number of + samples seen reaches `average`. So ``average=10`` will begin + averaging after seeing 10 samples. + Attributes + ---------- + coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \ + (n_classes, n_features) + Weights assigned to the features. + intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,) + Constants in decision function. + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + For multiclass fits, it is the maximum over every binary fit. + loss_function_ : concrete ``LossFunction`` + classes_ : array of shape (n_classes,) + t_ : int + Number of weight updates performed during training. + Same as ``(n_iter_ * n_samples)``. + See Also + -------- + sklearn.svm.LinearSVC : Linear support vector classification. + LogisticRegression : Logistic regression. + Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to + ``SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", + penalty=None)``. + Examples + -------- + >>> import numpy as np + >>> from sklearn.linear_model import SGDClassifier + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.pipeline import make_pipeline + >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + >>> Y = np.array([1, 1, 2, 2]) + >>> # Always scale the input. The most convenient way is to use a pipeline. + >>> clf = make_pipeline(StandardScaler(), + ... SGDClassifier(max_iter=1000, tol=1e-3)) + >>> clf.fit(X, Y) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('sgdclassifier', SGDClassifier())]) + >>> print(clf.predict([[-0.8, -1]])) + [1] + """ + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, + l1_ratio=0.15, + fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, + verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, + random_state=None, learning_rate="optimal", eta0=0.0, + power_t=0.5, early_stopping=False, validation_fraction=0.1, + n_iter_no_change=5, class_weight=None, warm_start=False, + average=False): + super().__init__( + loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, + fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, + shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs, + random_state=random_state, learning_rate=learning_rate, eta0=eta0, + power_t=power_t, early_stopping=early_stopping, + validation_fraction=validation_fraction, + n_iter_no_change=n_iter_no_change, class_weight=class_weight, + warm_start=warm_start, average=average) + + def _check_proba(self): + if self.loss not in ("log", "modified_huber"): + raise AttributeError("probability estimates are not available for" + " loss=%r" % self.loss) + + @property + def predict_proba(self): + """Probability estimates. + This method is only available for log loss and modified Huber loss. + Multiclass probability estimates are derived from binary (one-vs.-rest) + estimates by simple normalization, as recommended by Zadrozny and + Elkan. + Binary probability estimates for loss="modified_huber" are given by + (clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions + it is necessary to perform proper probability calibration by wrapping + the classifier with + :class:`~sklearn.calibration.CalibratedClassifierCV` instead. 
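[Editor's aside] Since probability estimates are only exposed for `loss='log'` and `loss='modified_huber'`, a minimal usage sketch (written against the stock scikit-learn API, which this class mirrors) looks like:

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = np.array([[-1.0, -1.0], [-2.0, -1.0], [1.0, 1.0], [2.0, 1.0]])
y = np.array([1, 1, 2, 2])

# loss='log' makes predict_proba available; the default hinge loss would
# raise an AttributeError via _check_proba instead.
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(loss='log', random_state=0))
clf.fit(X, y)
print(clf.predict_proba([[-0.8, -1.0]]))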
+ Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data for prediction. + Returns + ------- + ndarray of shape (n_samples, n_classes) + Returns the probability of the sample for each class in the model, + where classes are ordered as they are in `self.classes_`. + References + ---------- + Zadrozny and Elkan, "Transforming classifier scores into multiclass + probability estimates", SIGKDD'02, + http://www.research.ibm.com/people/z/zadrozny/kdd2002-Transf.pdf + The justification for the formula in the loss="modified_huber" + case is in the appendix B in: + http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf + """ + self._check_proba() + return self._predict_proba + + def _predict_proba(self, X): + check_is_fitted(self) + + if self.loss == "log": + return self._predict_proba_lr(X) + + elif self.loss == "modified_huber": + binary = (len(self.classes_) == 2) + scores = self.decision_function(X) + + if binary: + prob2 = np.ones((scores.shape[0], 2)) + prob = prob2[:, 1] + else: + prob = scores + + np.clip(scores, -1, 1, prob) + prob += 1. + prob /= 2. + + if binary: + prob2[:, 0] -= prob + prob = prob2 + else: + # the above might assign zero to all classes, which doesn't + # normalize neatly; work around this to produce uniform + # probabilities + prob_sum = prob.sum(axis=1) + all_zero = (prob_sum == 0) + if np.any(all_zero): + prob[all_zero, :] = 1 + prob_sum[all_zero] = len(self.classes_) + + # normalize + prob /= prob_sum.reshape((prob.shape[0], -1)) + + return prob + + else: + raise NotImplementedError("predict_(log_)proba only supported when" + " loss='log' or loss='modified_huber' " + "(%r given)" % self.loss) + + @property + def predict_log_proba(self): + """Log of probability estimates. + This method is only available for log loss and modified Huber loss. + When loss="modified_huber", probability estimates may be hard zeros + and ones, so taking the logarithm is not possible. + See ``predict_proba`` for details. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data for prediction. + Returns + ------- + T : array-like, shape (n_samples, n_classes) + Returns the log-probability of the sample for each class in the + model, where classes are ordered as they are in + `self.classes_`. 
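[Editor's aside] The multiclass normalization workaround in `_predict_proba` above (uniform probabilities when every clipped score is zero) can be checked in isolation with plain NumPy; this mirrors that branch rather than calling the estimator.

import numpy as np

scores = np.array([[-2.0, -3.0, -1.5],     # every score clips to probability 0
                   [0.5, -1.0, 2.0]])
prob = (np.clip(scores, -1, 1) + 1.0) / 2.0

prob_sum = prob.sum(axis=1)
all_zero = prob_sum == 0
prob[all_zero, :] = 1.0                    # fall back to uniform probabilities
prob_sum[all_zero] = prob.shape[1]
prob /= prob_sum.reshape(-1, 1)
print(prob)                                # first row becomes [1/3, 1/3, 1/3]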
+ """ + self._check_proba() + return self._predict_log_proba + + def _predict_log_proba(self, X): + return np.log(self.predict_proba(X)) + + def _more_tags(self): + return { + '_xfail_checks': { + 'check_sample_weights_invariance': + 'zero sample_weight is not equivalent to removing samples', + } + } diff --git a/afsklearn/neural_network/base.py b/afsklearn/neural_network/base.py index 605ecaa..dd83105 100644 --- a/afsklearn/neural_network/base.py +++ b/afsklearn/neural_network/base.py @@ -1,32 +1,19 @@ -import numpy as np -import arrayfire as af -import time -from math import sqrt - -from abc import ABCMeta, abstractmethod import warnings +from abc import ABCMeta, abstractmethod +from math import sqrt -import sklearn +import arrayfire as af +import numpy as np from sklearn.base import is_classifier -from sklearn.utils.validation import _deprecate_positional_args -from sklearn.utils import check_random_state -from sklearn.utils import gen_batches -from sklearn.utils import shuffle +from sklearn.exceptions import ConvergenceWarning from sklearn.model_selection import train_test_split +from sklearn.utils import check_random_state, gen_batches, shuffle -from ..base import afBaseEstimator -from .._stochastic_optimizers import SGDOptimizer, AdamOptimizer -from .._validation import _safe_indexing, check_is_fitted, check_array, column_or_1d from .._extmath import safe_sparse_dot from .._nn_utils import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS - -# from ..exceptions import ConvergenceWarning -# from ..utils.extmath import safe_sparse_dot -# from ..utils.multiclass import _check_partial_fit_first_call, unique_labels -# from ..utils.multiclass import type_of_target -# from ..utils.optimize import _check_optimize_result -# import scipy.optimize - +from .._stochastic_optimizers import AdamOptimizer, SGDOptimizer +from .._validation import _safe_indexing, check_array +from ..base import afBaseEstimator _STOCHASTIC_SOLVERS = ['sgd', 'adam'] @@ -108,7 +95,7 @@ def _forward_pass(self, activations): activations[i + 1] = output_activation(activations[i + 1]) return activations -# + def _compute_loss_grad(self, layer, n_samples, activations, deltas, coef_grads, intercept_grads): """Compute the gradient of loss with respect to coefs and intercept for @@ -247,7 +234,7 @@ def _initialize(self, y, layer_units): # Output for regression if not is_classifier(self): self.out_activation_ = 'identity' - # Output for multi class + # Output for multi class elif self._label_binarizer.y_type_ == 'multiclass': self.out_activation_ = 'softmax' # Output for binary class and multi-label @@ -304,8 +291,8 @@ def _fit(self, X, y, incremental=False): n_samples, n_features = X.shape # Ensure y is 2D - #if y.numdims() == 1: - #y = af.moddims(y, y.elements(), 1) + # if y.numdims() == 1: + # y = af.moddims(y, y.elements(), 1) self.n_outputs_ = y.shape[1] if y.numdims() > 1 else 1 @@ -452,8 +439,6 @@ def _validate_hyperparameters(self): # def _fit_stochastic(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental): - - if not incremental or not hasattr(self, '_optimizer'): params = self.coefs_ + self.intercepts_ @@ -499,7 +484,7 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, sample_idx = shuffle(sample_idx, random_state=self._random_state) - #sloooow loop + # sloooow loop accumulated_loss = 0.0 for batch_slice in gen_batches(n_samples, batch_size): if self.shuffle: @@ -565,7 +550,6 @@ def _fit_stochastic(self, X, y, activations, deltas, coef_grads, "reached and the optimization 
hasn't converged yet." % self.max_iter, ConvergenceWarning) - except KeyboardInterrupt: warnings.warn("Training interrupted by user.") @@ -668,8 +652,8 @@ def _predict(self, X): activations = [X] for i in range(self.n_layers_ - 1): - #activations.append(np.empty((X.shape[0], - #layer_units[i + 1]))) + # activations.append(np.empty((X.shape[0], + # layer_units[i + 1]))) activations.append(af.constant(0, X.shape[0], layer_units[i + 1])) # forward propagate diff --git a/afsklearn/neural_network/mlp_classifier.py b/afsklearn/neural_network/mlp_classifier.py index 8b74e86..8dbfa54 100644 --- a/afsklearn/neural_network/mlp_classifier.py +++ b/afsklearn/neural_network/mlp_classifier.py @@ -1,12 +1,12 @@ import arrayfire as af import numpy as np +from sklearn.utils.validation import _deprecate_positional_args +from .._classifier_mixin import afClassifierMixin +from .._validation import check_is_fitted, column_or_1d from ..base import afLabelBinarizer, unique_labels from .base import BaseMultilayerPerceptron -from .._classifier_mixin import afClassifierMixin -from .._validation import column_or_1d, check_is_fitted -from sklearn.utils.validation import _deprecate_positional_args class MLPClassifier(afClassifierMixin, BaseMultilayerPerceptron): """Multi-layer Perceptron classifier. @@ -215,7 +215,7 @@ def _validate_input(self, X, y, incremental): X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True) # if y.ndim == 2 and y.shape[1] == 1: - #y = column_or_1d(y, warn=True) + # y = column_or_1d(y, warn=True) if y.ndim == 2 and y.dims(1) == 1: y = column_or_1d(y, warn=True) diff --git a/afsklearn/patched_modules.yml b/afsklearn/patched_modules.yml index 4621b0c..07fcda4 100644 --- a/afsklearn/patched_modules.yml +++ b/afsklearn/patched_modules.yml @@ -12,3 +12,13 @@ simple_imputer: name: SimpleImputer module: sklearn.impute module_patch: afsklearn.impute.simple_imputer + +sgd_classifier: + name: SGDClassifier + module: sklearn.linear_model + module_patch: afsklearn.linear_model.sgd_classifier + +one_hot_encoder: + name: OneHotEncoder + module: sklearn.preprocessing + module_patch: afsklearn.preprocessing._encoders diff --git a/afsklearn/preprocessing/_encoders.py b/afsklearn/preprocessing/_encoders.py new file mode 100644 index 0000000..4b6db6b --- /dev/null +++ b/afsklearn/preprocessing/_encoders.py @@ -0,0 +1,588 @@ +import numpy as np # FIXME +from scipy import sparse +from sklearn.utils import _deprecate_positional_args + +from .._encode import _check_unknown, _encode, _unique +from .._validation import check_array, check_is_fitted, is_scalar_nan +from ..base import afBaseEstimator, afTransformerMixin + + +class _BaseEncoder(afTransformerMixin, afBaseEstimator): + """ + Base class for encoders that includes the code to categorize and + transform the input features. + """ + + def _check_X(self, X, force_all_finite=True): + """ + Perform custom check_array: + - convert list of strings to object dtype + - check for missing values for object dtype data (check_array does + not do that) + - return list of features (arrays): this list of features is + constructed feature by feature to preserve the data types + of pandas DataFrame columns, as otherwise information is lost + and cannot be used, eg for the `categories_` attribute. 
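[Editor's aside] The column-by-column handling described above exists because flattening a mixed-dtype DataFrame into one NumPy array upcasts every column; a quick check (requires pandas, which is not imported by the module itself):

import numpy as np
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue'], 'size': [1, 2]})

print(np.asarray(df).dtype)    # object: both columns upcast together
print(df.iloc[:, 0].dtype)     # object (strings), kept per column
print(df.iloc[:, 1].dtype)     # int64 preserved when taken column-wise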
+ """ + if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2): + # if not a dataframe, do normal check_array validation + X_temp = check_array(X, dtype=None, + force_all_finite=force_all_finite) + if (not hasattr(X, 'dtype') + and np.issubdtype(X_temp.dtype, np.str_)): + X = check_array(X, dtype=object, + force_all_finite=force_all_finite) + else: + X = X_temp + needs_validation = False + else: + # pandas dataframe, do validation later column by column, in order + # to keep the dtype information to be used in the encoder. + needs_validation = force_all_finite + + n_samples, n_features = X.shape + X_columns = [] + + for i in range(n_features): + Xi = self._get_feature(X, feature_idx=i) + Xi = check_array(Xi, ensure_2d=False, dtype=None, + force_all_finite=needs_validation) + X_columns.append(Xi) + + return X_columns, n_samples, n_features + + def _get_feature(self, X, feature_idx): + if hasattr(X, 'iloc'): + # pandas dataframes + return X.iloc[:, feature_idx] + # numpy arrays, sparse arrays + return X[:, feature_idx] + + def _fit(self, X, handle_unknown='error', force_all_finite=True): + X_list, n_samples, n_features = self._check_X( + X, force_all_finite=force_all_finite) + + if self.categories != 'auto': + if len(self.categories) != n_features: + raise ValueError("Shape mismatch: if categories is an array," + " it has to be of shape (n_features,).") + + self.categories_ = [] + + for i in range(n_features): + Xi = X_list[i] + if self.categories == 'auto': + cats = _unique(Xi) + else: + cats = np.array(self.categories[i], dtype=Xi.dtype) + if Xi.dtype.kind not in 'OUS': + sorted_cats = np.sort(cats) + error_msg = ("Unsorted categories are not " + "supported for numerical categories") + # if there are nans, nan should be the last element + stop_idx = -1 if np.isnan(sorted_cats[-1]) else None + if (np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or + (np.isnan(sorted_cats[-1]) and + not np.isnan(sorted_cats[-1]))): + raise ValueError(error_msg) + + if handle_unknown == 'error': + diff = _check_unknown(Xi, cats) + if diff: + msg = ("Found unknown categories {0} in column {1}" + " during fit".format(diff, i)) + raise ValueError(msg) + self.categories_.append(cats) + + def _transform(self, X, handle_unknown='error', force_all_finite=True): + X_list, n_samples, n_features = self._check_X( + X, force_all_finite=force_all_finite) + + X_int = np.zeros((n_samples, n_features), dtype=int) + X_mask = np.ones((n_samples, n_features), dtype=bool) + + if n_features != len(self.categories_): + raise ValueError( + "The number of features in X is different to the number of " + "features of the fitted data. The fitted data had {} features " + "and the X has {} features." + .format(len(self.categories_,), n_features) + ) + + for i in range(n_features): + Xi = X_list[i] + diff, valid_mask = _check_unknown(Xi, self.categories_[i], + return_mask=True) + + if not np.all(valid_mask): + if handle_unknown == 'error': + msg = ("Found unknown categories {0} in column {1}" + " during transform".format(diff, i)) + raise ValueError(msg) + else: + # Set the problematic rows to an acceptable value and + # continue `The rows are marked `X_mask` and will be + # removed later. 
+ X_mask[:, i] = valid_mask + # cast Xi into the largest string type necessary + # to handle different lengths of numpy strings + if (self.categories_[i].dtype.kind in ('U', 'S') + and self.categories_[i].itemsize > Xi.itemsize): + Xi = Xi.astype(self.categories_[i].dtype) + elif (self.categories_[i].dtype.kind == 'O' and + Xi.dtype.kind == 'U'): + # categories are objects and Xi are numpy strings. + # Cast Xi to an object dtype to prevent truncation + # when setting invalid values. + Xi = Xi.astype('O') + else: + Xi = Xi.copy() + + Xi[~valid_mask] = self.categories_[i][0] + # We use check_unknown=False, since _check_unknown was + # already called above. + X_int[:, i] = _encode(Xi, uniques=self.categories_[i], + check_unknown=False) + + return X_int, X_mask + + def _more_tags(self): + return {'X_types': ['categorical']} + + +class OneHotEncoder(_BaseEncoder): + """ + Encode categorical features as a one-hot numeric array. + The input to this transformer should be an array-like of integers or + strings, denoting the values taken on by categorical (discrete) features. + The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') + encoding scheme. This creates a binary column for each category and + returns a sparse matrix or dense array (depending on the ``sparse`` + parameter) + By default, the encoder derives the categories based on the unique values + in each feature. Alternatively, you can also specify the `categories` + manually. + This encoding is needed for feeding categorical data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + Note: a one-hot encoding of y labels should use a LabelBinarizer + instead. + Read more in the :ref:`User Guide `. + Parameters + ---------- + categories : 'auto' or a list of array-like, default='auto' + Categories (unique values) per feature: + - 'auto' : Determine categories automatically from the training data. + - list : ``categories[i]`` holds the categories expected in the ith + column. The passed categories should not mix strings and numeric + values within a single feature, and should be sorted in case of + numeric values. + The used categories can be found in the ``categories_`` attribute. + .. versionadded:: 0.20 + drop : {'first', 'if_binary'} or a array-like of shape (n_features,), \ + default=None + Specifies a methodology to use to drop one of the categories per + feature. This is useful in situations where perfectly collinear + features cause problems, such as when feeding the resulting data + into a neural network or an unregularized regression. + However, dropping one category breaks the symmetry of the original + representation and can therefore induce a bias in downstream models, + for instance for penalized linear classification or regression models. + - None : retain all features (the default). + - 'first' : drop the first category in each feature. If only one + category is present, the feature will be dropped entirely. + - 'if_binary' : drop the first category in each feature with two + categories. Features with 1 or more than 2 categories are + left intact. + - array : ``drop[i]`` is the category in feature ``X[:, i]`` that + should be dropped. + .. versionadded:: 0.21 + The parameter `drop` was added in 0.21. + .. versionchanged:: 0.23 + The option `drop='if_binary'` was added in 0.23. + sparse : bool, default=True + Will return sparse matrix if set True else will return an array. + dtype : number type, default=float + Desired dtype of output. 
+ handle_unknown : {'error', 'ignore'}, default='error' + Whether to raise an error or ignore if an unknown categorical feature + is present during transform (default is to raise). When this parameter + is set to 'ignore' and an unknown category is encountered during + transform, the resulting one-hot encoded columns for this feature + will be all zeros. In the inverse transform, an unknown category + will be denoted as None. + Attributes + ---------- + categories_ : list of arrays + The categories of each feature determined during fitting + (in order of the features in X and corresponding with the output + of ``transform``). This includes the category specified in ``drop`` + (if any). + drop_idx_ : array of shape (n_features,) + - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category + to be dropped for each feature. + - ``drop_idx_[i] = None`` if no category is to be dropped from the + feature with index ``i``, e.g. when `drop='if_binary'` and the + feature isn't binary. + - ``drop_idx_ = None`` if all the transformed features will be + retained. + .. versionchanged:: 0.23 + Added the possibility to contain `None` values. + See Also + -------- + OrdinalEncoder : Performs an ordinal (integer) + encoding of the categorical features. + sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot + encoding of dictionary items or strings. + LabelBinarizer : Binarizes labels in a one-vs-all + fashion. + MultiLabelBinarizer : Transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + Examples + -------- + Given a dataset with two features, we let the encoder find the unique + values per feature and transform the data to a binary one-hot encoding. 
+ >>> from sklearn.preprocessing import OneHotEncoder + One can discard categories not seen during `fit`: + >>> enc = OneHotEncoder(handle_unknown='ignore') + >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] + >>> enc.fit(X) + OneHotEncoder(handle_unknown='ignore') + >>> enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() + array([[1., 0., 1., 0., 0.], + [0., 1., 0., 0., 0.]]) + >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) + array([['Male', 1], + [None, 2]], dtype=object) + >>> enc.get_feature_names(['gender', 'group']) + array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], + dtype=object) + One can always drop the first column for each feature: + >>> drop_enc = OneHotEncoder(drop='first').fit(X) + >>> drop_enc.categories_ + [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] + >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 0., 0.], + [1., 1., 0.]]) + Or drop a column for feature only having 2 categories: + >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X) + >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray() + array([[0., 1., 0., 0.], + [1., 0., 1., 0.]]) + """ + + @_deprecate_positional_args + def __init__(self, *, categories='auto', drop=None, sparse=True, + dtype=np.float64, handle_unknown='error'): + self.categories = categories + self.sparse = sparse + self.dtype = dtype + self.handle_unknown = handle_unknown + self.drop = drop + + def _validate_keywords(self): + if self.handle_unknown not in ('error', 'ignore'): + msg = ("handle_unknown should be either 'error' or 'ignore', " + "got {0}.".format(self.handle_unknown)) + raise ValueError(msg) + # If we have both dropped columns and ignored unknown + # values, there will be ambiguous cells. This creates difficulties + # in interpreting the model. + if self.drop is not None and self.handle_unknown != 'error': + raise ValueError( + "`handle_unknown` must be 'error' when the drop parameter is " + "specified, as both would create categories that are all " + "zero.") + + def _compute_drop_idx(self): + if self.drop is None: + return None + elif isinstance(self.drop, str): + if self.drop == 'first': + return np.zeros(len(self.categories_), dtype=object) + elif self.drop == 'if_binary': + return np.array([0 if len(cats) == 2 else None + for cats in self.categories_], dtype=object) + else: + msg = ( + "Wrong input for parameter `drop`. Expected " + "'first', 'if_binary', None or array of objects, got {}" + ) + raise ValueError(msg.format(type(self.drop))) + + else: + try: + drop_array = np.asarray(self.drop, dtype=object) + droplen = len(drop_array) + except (ValueError, TypeError): + msg = ( + "Wrong input for parameter `drop`. 
Expected " + "'first', 'if_binary', None or array of objects, got {}" + ) + raise ValueError(msg.format(type(drop_array))) + if droplen != len(self.categories_): + msg = ("`drop` should have length equal to the number " + "of features ({}), got {}") + raise ValueError(msg.format(len(self.categories_), droplen)) + missing_drops = [] + drop_indices = [] + for col_idx, (val, cat_list) in enumerate(zip(drop_array, + self.categories_)): + if not is_scalar_nan(val): + drop_idx = np.where(cat_list == val)[0] + if drop_idx.size: # found drop idx + drop_indices.append(drop_idx[0]) + else: + missing_drops.append((col_idx, val)) + continue + + # val is nan, find nan in categories manually + for cat_idx, cat in enumerate(cat_list): + if is_scalar_nan(cat): + drop_indices.append(cat_idx) + break + else: # loop did not break thus drop is missing + missing_drops.append((col_idx, val)) + + if any(missing_drops): + msg = ("The following categories were supposed to be " + "dropped, but were not found in the training " + "data.\n{}".format( + "\n".join( + ["Category: {}, Feature: {}".format(c, v) + for c, v in missing_drops]))) + raise ValueError(msg) + return np.array(drop_indices, dtype=object) + + def fit(self, X, y=None): + """ + Fit OneHotEncoder to X. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to determine the categories of each feature. + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + Returns + ------- + self + """ + self._validate_keywords() + self._fit(X, handle_unknown=self.handle_unknown, + force_all_finite='allow-nan') + self.drop_idx_ = self._compute_drop_idx() + return self + + def fit_transform(self, X, y=None): + """ + Fit OneHotEncoder to X, then transform X. + Equivalent to fit(X).transform(X) but more convenient. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + y : None + Ignored. This parameter exists only for compatibility with + :class:`~sklearn.pipeline.Pipeline`. + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. + """ + self._validate_keywords() + return super().fit_transform(X, y) + + def transform(self, X): + """ + Transform X using one-hot encoding. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to encode. + Returns + ------- + X_out : {ndarray, sparse matrix} of shape \ + (n_samples, n_encoded_features) + Transformed input. If `sparse=True`, a sparse matrix will be + returned. + """ + check_is_fitted(self) + # validation of X happens in _check_X called by _transform + X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown, + force_all_finite='allow-nan') + + n_samples, n_features = X_int.shape + + if self.drop_idx_ is not None: + to_drop = self.drop_idx_.copy() + # We remove all the dropped categories from mask, and decrement all + # categories that occur after them to avoid an empty column. 
+ keep_cells = X_int != to_drop + n_values = [] + for i, cats in enumerate(self.categories_): + n_cats = len(cats) + + # drop='if_binary' but feature isn't binary + if to_drop[i] is None: + # set to cardinality to not drop from X_int + to_drop[i] = n_cats + n_values.append(n_cats) + else: # dropped + n_values.append(n_cats - 1) + + to_drop = to_drop.reshape(1, -1) + X_int[X_int > to_drop] -= 1 + X_mask &= keep_cells + else: + n_values = [len(cats) for cats in self.categories_] + + mask = X_mask.ravel() + feature_indices = np.cumsum([0] + n_values) + indices = (X_int + feature_indices[:-1]).ravel()[mask] + + indptr = np.empty(n_samples + 1, dtype=int) + indptr[0] = 0 + np.sum(X_mask, axis=1, out=indptr[1:]) + np.cumsum(indptr[1:], out=indptr[1:]) + data = np.ones(indptr[-1]) + + out = sparse.csr_matrix((data, indices, indptr), + shape=(n_samples, feature_indices[-1]), + dtype=self.dtype) + if not self.sparse: + return out.toarray() + else: + return out + + def inverse_transform(self, X): + """ + Convert the data back to the original representation. + In case unknown categories are encountered (all zeros in the + one-hot encoding), ``None`` is used to represent this category. + Parameters + ---------- + X : {array-like, sparse matrix} of shape \ + (n_samples, n_encoded_features) + The transformed data. + Returns + ------- + X_tr : ndarray of shape (n_samples, n_features) + Inverse transformed array. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse='csr') + + n_samples, _ = X.shape + n_features = len(self.categories_) + if self.drop_idx_ is None: + n_transformed_features = sum(len(cats) + for cats in self.categories_) + else: + n_transformed_features = sum( + len(cats) - 1 if to_drop is not None else len(cats) + for cats, to_drop in zip(self.categories_, self.drop_idx_) + ) + + # validate shape of passed X + msg = ("Shape of the passed X data is not correct. Expected {0} " + "columns, got {1}.") + if X.shape[1] != n_transformed_features: + raise ValueError(msg.format(n_transformed_features, X.shape[1])) + + # create resulting array of appropriate dtype + dt = np.find_common_type([cat.dtype for cat in self.categories_], []) + X_tr = np.empty((n_samples, n_features), dtype=dt) + + j = 0 + found_unknown = {} + + for i in range(n_features): + if self.drop_idx_ is None or self.drop_idx_[i] is None: + cats = self.categories_[i] + else: + cats = np.delete(self.categories_[i], self.drop_idx_[i]) + n_categories = len(cats) + + # Only happens if there was a column with a unique + # category. In this case we just fill the column with this + # unique category value. 
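[Editor's aside] For reference, the CSR assembly in `transform` above can be reproduced in isolation; this toy sketch builds the same structure by hand for two already integer-encoded features (illustrative values, not taken from the patch):

import numpy as np
from scipy import sparse

X_int = np.array([[1, 0], [0, 2]])         # integer codes per feature
X_mask = np.ones_like(X_int, dtype=bool)   # every entry is a known category
n_values = [2, 3]                          # number of categories per feature

feature_indices = np.cumsum([0] + n_values)
indices = (X_int + feature_indices[:-1]).ravel()[X_mask.ravel()]
indptr = np.concatenate([[0], np.cumsum(X_mask.sum(axis=1))])
data = np.ones(indptr[-1])

out = sparse.csr_matrix((data, indices, indptr),
                        shape=(2, feature_indices[-1]))
print(out.toarray())    # [[0. 1. 1. 0. 0.]
                        #  [1. 0. 0. 0. 1.]]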
+ if n_categories == 0: + X_tr[:, i] = self.categories_[i][self.drop_idx_[i]] + j += n_categories + continue + sub = X[:, j:j + n_categories] + # for sparse X argmax returns 2D matrix, ensure 1D array + labels = np.asarray(sub.argmax(axis=1)).flatten() + X_tr[:, i] = cats[labels] + if self.handle_unknown == 'ignore': + unknown = np.asarray(sub.sum(axis=1) == 0).flatten() + # ignored unknown categories: we have a row of all zero + if unknown.any(): + found_unknown[i] = unknown + else: + dropped = np.asarray(sub.sum(axis=1) == 0).flatten() + if dropped.any(): + if self.drop_idx_ is None: + all_zero_samples = np.flatnonzero(dropped) + raise ValueError( + f"Samples {all_zero_samples} can not be inverted " + "when drop=None and handle_unknown='error' " + "because they contain all zeros") + # we can safely assume that all of the nulls in each column + # are the dropped value + X_tr[dropped, i] = self.categories_[i][ + self.drop_idx_[i] + ] + + j += n_categories + + # if ignored are found: potentially need to upcast result to + # insert None values + if found_unknown: + if X_tr.dtype != object: + X_tr = X_tr.astype(object) + + for idx, mask in found_unknown.items(): + X_tr[mask, idx] = None + + return X_tr + + def get_feature_names(self, input_features=None): + """ + Return feature names for output features. + Parameters + ---------- + input_features : list of str of shape (n_features,) + String names for input features if available. By default, + "x0", "x1", ... "xn_features" is used. + Returns + ------- + output_feature_names : ndarray of shape (n_output_features,) + Array of feature names. + """ + check_is_fitted(self) + cats = self.categories_ + if input_features is None: + input_features = ['x%d' % i for i in range(len(cats))] + elif len(input_features) != len(self.categories_): + raise ValueError( + "input_features should have length equal to number of " + "features ({}), got {}".format(len(self.categories_), + len(input_features))) + + feature_names = [] + for i in range(len(cats)): + names = [ + input_features[i] + '_' + str(t) for t in cats[i]] + if self.drop_idx_ is not None and self.drop_idx_[i] is not None: + names.pop(self.drop_idx_[i]) + feature_names.extend(names) + + return np.array(feature_names, dtype=object) diff --git a/afsklearn/preprocessing/_label.py b/afsklearn/preprocessing/_label.py new file mode 100644 index 0000000..7e1020c --- /dev/null +++ b/afsklearn/preprocessing/_label.py @@ -0,0 +1,120 @@ +import numpy as np # FIXME +from sklearn.utils.validation import _num_samples + +from .._encode import _encode, _unique +from .._validation import check_is_fitted, column_or_1d +from ..base import afBaseEstimator, afTransformerMixin + + +class afLabelEncoder(afTransformerMixin, afBaseEstimator): + """Encode target labels with value between 0 and n_classes-1. + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + Read more in the :ref:`User Guide `. + .. versionadded:: 0.12 + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + Holds the label for each class. + Examples + -------- + `LabelEncoder` can be used to normalize labels. + >>> from sklearn import preprocessing + >>> le = preprocessing.LabelEncoder() + >>> le.fit([1, 2, 2, 6]) + LabelEncoder() + >>> le.classes_ + array([1, 2, 6]) + >>> le.transform([1, 1, 2, 6]) + array([0, 0, 1, 2]...) 
+ >>> le.inverse_transform([0, 0, 1, 2]) + array([1, 1, 2, 6]) + It can also be used to transform non-numerical labels (as long as they are + hashable and comparable) to numerical labels. + >>> le = preprocessing.LabelEncoder() + >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]) + LabelEncoder() + >>> list(le.classes_) + ['amsterdam', 'paris', 'tokyo'] + >>> le.transform(["tokyo", "tokyo", "paris"]) + array([2, 2, 1]...) + >>> list(le.inverse_transform([2, 2, 1])) + ['tokyo', 'tokyo', 'paris'] + See Also + -------- + OrdinalEncoder : Encode categorical features using an ordinal encoding + scheme. + OneHotEncoder : Encode categorical features as a one-hot numeric array. + """ + + def fit(self, y): + """Fit label encoder. + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + Returns + ------- + self : returns an instance of self. + """ + y = column_or_1d(y, warn=True) + self.classes_ = _unique(y) + return self + + def fit_transform(self, y): + """Fit label encoder and return encoded labels. + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + Returns + ------- + y : array-like of shape (n_samples,) + """ + y = column_or_1d(y, warn=True) + self.classes_, y = _unique(y, return_inverse=True) + return y + + def transform(self, y): + """Transform labels to normalized encoding. + Parameters + ---------- + y : array-like of shape (n_samples,) + Target values. + Returns + ------- + y : array-like of shape (n_samples,) + """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # transform of empty array is empty array + if _num_samples(y) == 0: + return np.array([]) + + return _encode(y, uniques=self.classes_) + + def inverse_transform(self, y): + """Transform labels back to original encoding. + Parameters + ---------- + y : ndarray of shape (n_samples,) + Target values. + Returns + ------- + y : ndarray of shape (n_samples,) + """ + check_is_fitted(self) + y = column_or_1d(y, warn=True) + # inverse transform of empty array is empty array + if _num_samples(y) == 0: + return np.array([]) + + diff = np.setdiff1d(y, np.arange(len(self.classes_))) + if len(diff): + raise ValueError( + "y contains previously unseen labels: %s" % str(diff)) + y = np.asarray(y) + return self.classes_[y] + + def _more_tags(self): + return {'X_types': ['1dlabels']} diff --git a/requirements.txt b/requirements.txt index 17dcea9..d39d106 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ --e . +-e .[dev] arrayfire==3.8.0+cu112 -f https://repo.arrayfire.com/python/wheels/3.8.0/ diff --git a/setup.cfg b/setup.cfg index 0378a80..8a4021f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,3 +18,18 @@ install_requires = [options.packages.find] exclude = tests + +[options.extras_require] +dev = + autopep8==1.5.7 + isort==5.9.2 + flake8==3.9.2 + flake8-import-order==0.18.1 + +[tool:isort] +line_length = 119 +multi_line_output = 4 + +[flake8] +import-order-style = pep8 +max-line-length = 119 diff --git a/tests/test_one_hot_encoder.py b/tests/test_one_hot_encoder.py new file mode 100644 index 0000000..07662dc --- /dev/null +++ b/tests/test_one_hot_encoder.py @@ -0,0 +1,27 @@ +from afsklearn.patcher import Patcher + +from . 
import measure_time + + +def sklearn_example() -> None: + from sklearn.preprocessing import OneHotEncoder + enc = OneHotEncoder(handle_unknown='ignore') + X = [['Male', 1], ['Female', 3], ['Female', 2]] + enc.fit(X) + + +@measure_time +def test_sklearn() -> None: + sklearn_example() + + +@measure_time +def test_afsklearn() -> None: + Patcher.patch("one_hot_encoder") + sklearn_example() + Patcher.rollback("one_hot_encoder") + + +if __name__ == "__main__": + # test_afsklearn() + test_sklearn() diff --git a/tests/test_sgd_classifier.py b/tests/test_sgd_classifier.py new file mode 100644 index 0000000..88c7804 --- /dev/null +++ b/tests/test_sgd_classifier.py @@ -0,0 +1,31 @@ +import numpy as np + +from afsklearn.patcher import Patcher + +from . import measure_time + + +def sklearn_example() -> None: + from sklearn.linear_model import SGDClassifier + X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) + Y = np.array([1, 1, 2, 2]) + clf = SGDClassifier() + clf.fit(X, Y) + print(f"Predict: {clf.predict([[-0.8, -1]])}") + + +@measure_time +def test_sklearn() -> None: + sklearn_example() + + +@measure_time +def test_afsklearn() -> None: + Patcher.patch("sgd_classifier") + sklearn_example() + Patcher.rollback("sgd_classifier") + + +if __name__ == "__main__": + test_afsklearn() + # test_sklearn()
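[Editor's aside] Beyond the timing comparison above, a natural follow-up test (not part of this patch) is to check that the patched estimator yields the same predictions as stock scikit-learn. This is a sketch only: whether the ArrayFire backend reproduces the exact labels is an assumption.

import numpy as np

from afsklearn.patcher import Patcher


def _fit_predict() -> np.ndarray:
    from sklearn.linear_model import SGDClassifier
    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    y = np.array([1, 1, 2, 2])
    return SGDClassifier(random_state=0).fit(X, y).predict(X)


def test_sgd_classifier_predictions_match() -> None:
    expected = _fit_predict()
    Patcher.patch("sgd_classifier")
    try:
        patched = _fit_predict()
    finally:
        Patcher.rollback("sgd_classifier")
    np.testing.assert_array_equal(patched, expected)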