Add SGD and OHE

syurkevi · Jul 28, 2021 · 96317a8 · 96317a8
1 parent 68fca33
commit 96317a8
Show file tree

Hide file tree

Showing 20 changed files with 2,369 additions and 68 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,5 +6,8 @@ build
 dist
 *.egg-info
 
+# Static typing
+.mypy_cache
+
 # Virtual env
 venv
diff --git a/afsklearn/_class_weight.py b/afsklearn/_class_weight.py
@@ -0,0 +1,62 @@
+import numpy as np  # FIXME
+from sklearn.utils import _deprecate_positional_args
+
+
+@_deprecate_positional_args
+def compute_class_weight(class_weight, *, classes, y):
+    """Estimate class weights for unbalanced datasets.
+
+    Parameters
+    ----------
+    class_weight : dict, 'balanced' or None
+        If 'balanced', class weights will be given by
+        ``n_samples / (n_classes * np.bincount(y))``.
+        If a dictionary is given, keys are classes and values
+        are corresponding class weights.
+        If None (or any empty sized value) is given, the class weights
+        will be uniform.
+    classes : ndarray
+        Array of the classes occurring in the data, as given by
+        ``np.unique(y_org)`` with ``y_org`` the original class labels.
+    y : array-like of shape (n_samples,)
+        Array of original class labels per sample.
+
+    Returns
+    -------
+    class_weight_vect : ndarray of shape (n_classes,)
+        Array with class_weight_vect[i] the weight for i-th class.
+
+    Raises
+    ------
+    ValueError
+        If ``y`` holds a label missing from ``classes``; if
+        ``class_weight`` is 'balanced' and ``classes`` holds a label that
+        does not occur in ``y``; if a dictionary key is not found in
+        ``classes``; or if ``class_weight`` is not a dict, 'balanced',
+        None or empty.
+
+    References
+    ----------
+    The "balanced" heuristic is inspired by
+    Logistic Regression in Rare Events Data, King, Zen, 2001.
+    """
+    # Imported locally: a module-level import fails on a circular import.
+    from .preprocessing._label import afLabelEncoder
+
+    if set(y) - set(classes):
+        raise ValueError("classes should include all valid labels that can "
+                         "be in y")
+
+    # Only sized values can be "empty". Probing with try/except keeps
+    # unsized invalid input (e.g. a number) from escaping as a TypeError
+    # instead of reaching the ValueError at the bottom.
+    try:
+        is_empty = len(class_weight) == 0
+    except TypeError:
+        is_empty = False
+
+    if class_weight is None or is_empty:
+        # uniform class weights
+        weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
+    elif isinstance(class_weight, str) and class_weight == 'balanced':
+        # Find the weight of each class as present in y.
+        le = afLabelEncoder()
+        y_ind = le.fit_transform(y)
+        # np.isin replaces np.in1d, which NumPy 2.0 deprecates.
+        if not np.isin(classes, le.classes_).all():
+            raise ValueError("classes should have valid labels that are in y")
+
+        recip_freq = len(y) / (len(le.classes_) *
+                               np.bincount(y_ind).astype(np.float64))
+        weight = recip_freq[le.transform(classes)]
+    elif isinstance(class_weight, dict):
+        # user-defined dictionary; classes it does not mention keep 1.0
+        weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
+        for label, value in class_weight.items():
+            # searchsorted relies on ``classes`` being sorted, which holds
+            # for the documented np.unique output.
+            pos = np.searchsorted(classes, label)
+            if pos >= len(classes) or classes[pos] != label:
+                raise ValueError("Class label {} not present.".format(label))
+            weight[pos] = value
+    else:
+        # Wrap in a 1-tuple so a tuple-valued class_weight is formatted
+        # as one value instead of being unpacked as %-arguments.
+        raise ValueError("class_weight must be dict, 'balanced', or None,"
+                         " got: %r" % (class_weight,))
+
+    return weight
diff --git a/afsklearn/_classifier_mixin.py b/afsklearn/_classifier_mixin.py
@@ -1,5 +1,6 @@
 import arrayfire as af
 
+
 def _weighted_sum(sample_score, sample_weight, normalize=False):
     if normalize:
         return np.average(sample_score, weights=sample_weight)
@@ -8,6 +9,7 @@ def _weighted_sum(sample_score, sample_weight, normalize=False):
     else:
         return sample_score.sum()
 
+
 def _check_targets(y_true, y_pred):
     """Check that y_true and y_pred belong to the same classification task
     This converts multiclass or binary types to a common shape, and raises a
@@ -63,8 +65,6 @@ def _check_targets(y_true, y_pred):
     return y_type, y_true, y_pred
 
 
-
-
 def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
     """Accuracy classification score.
     In multilabel classification, this function computes subset accuracy:
@@ -123,6 +123,7 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
 
     return _weighted_sum(score, sample_weight, normalize)
 
+
 class afClassifierMixin:
     """ArrayFire enabled Mixin class for all classifiers in scikit-learn."""
 
@@ -147,8 +148,77 @@ def score(self, X, y, sample_weight=None):
         score : float
             Mean accuracy of self.predict(X) wrt. y.
         """
-        #return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
-        return #TMP
+        # return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
+        return  # TMP
 
     def _more_tags(self):
         return {'requires_y': True}
+
+
+class afLinearClassifierMixin(afClassifierMixin):
+    """Prediction helpers shared by linear classifiers.
+
+    Accepts dense input as well as CSR sparse input. The methods read the
+    ``coef_``, ``intercept_`` and ``classes_`` attributes of the estimator.
+
+    NOTE(review): ``check_is_fitted``, ``check_array``, ``safe_sparse_dot``,
+    ``expit`` and ``np`` are used below, but no import for them is visible
+    in this view of the module - confirm they are in scope.
+    """
+
+    def decision_function(self, X):
+        """Compute confidence scores for the samples in X.
+
+        A sample's score is proportional to its signed distance to the
+        separating hyperplane.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
+            Confidence scores per (sample, class) combination. In the binary
+            case, confidence score for self.classes_[1] where >0 means this
+            class would be predicted.
+
+        Raises
+        ------
+        ValueError
+            If the number of columns of X differs from ``coef_.shape[1]``.
+        """
+        check_is_fitted(self)
+        X = check_array(X, accept_sparse='csr')
+
+        expected = self.coef_.shape[1]
+        received = X.shape[1]
+        if received != expected:
+            raise ValueError("X has %d features per sample; expecting %d"
+                             % (received, expected))
+
+        projected = safe_sparse_dot(X, self.coef_.T, dense_output=True)
+        scores = projected + self.intercept_
+        # A single score column is flattened to a 1-D vector.
+        if scores.shape[1] == 1:
+            return scores.ravel()
+        return scores
+
+    def predict(self, X):
+        """Predict a class label for every sample in X.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape (n_samples, n_features)
+            Samples.
+
+        Returns
+        -------
+        C : array, shape [n_samples]
+            Predicted class label per sample.
+        """
+        scores = self.decision_function(X)
+        one_dimensional = len(scores.shape) == 1
+        # 1-D scores: a positive score selects classes_[1], otherwise
+        # classes_[0]. 2-D scores: take the best-scoring column.
+        indices = ((scores > 0).astype(int) if one_dimensional
+                   else scores.argmax(axis=1))
+        return self.classes_[indices]
+
+    def _predict_proba_lr(self, X):
+        """Estimate probabilities for OvR logistic regression.
+
+        Positive class probabilities are computed as
+        1. / (1. + np.exp(-self.decision_function(X)));
+        multiclass is handled by normalizing that over all classes.
+        """
+        prob = self.decision_function(X)
+        # Apply the logistic function in place to avoid a second buffer.
+        expit(prob, out=prob)
+        if prob.ndim != 1:
+            # OvR normalization, like LibLinear's predict_probability
+            row_totals = prob.sum(axis=1).reshape((prob.shape[0], -1))
+            prob /= row_totals
+            return prob
+        # 1-D case: column 0 is 1 - prob, column 1 is prob.
+        return np.vstack([1 - prob, prob]).T