
Commit

Merge pull request arrayfire#3 from syurkevi/gaussian_random_projection
Gaussian random projection
syurkevi authored Sep 22, 2021
2 parents 15cc94d + 0683b6b commit 2761ded
Showing 54 changed files with 8,099 additions and 1,068 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -1,5 +1,13 @@
# Python cache
__pycache__/

# Build
build
dist
*.egg-info

# Static typing
.mypy_cache

# Virtual env
venv
42 changes: 41 additions & 1 deletion README.md
@@ -2,6 +2,46 @@

Monkey-patch scikit-learn with ArrayFire-accelerated variants.
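
A hypothetical usage sketch of what the monkey-patching workflow could look like; the `patch_sklearn()` entry point below is an assumption made for illustration, not the package's documented API:

```python
# Hypothetical example: `patch_sklearn()` is an assumed entry point, not taken
# from this repository; check the package itself for the real interface.
import afsklearn

afsklearn.patch_sklearn()  # assumed: replace supported sklearn classes in place

# After patching, the usual sklearn imports would resolve to the
# ArrayFire-accelerated variants.
from sklearn.neural_network import MLPClassifier
```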

## Quick Start
## Installation

```console
pip install -r requirements.txt
```

## Tests

To run all tests:

```console
pytest .
```

To run a specific test:

```console
pytest tests/test_mlp.py
```

To run with time measurements, use the `-s` flag:

```console
pytest tests/test_mlp.py -s
```

---

## TODO

- GradientBoosting
- OneHotEncoder - Anton
- RandomForest
- SelectFWE
- QuantileTransformer
- ExtraTreesClassifier
- Imputer - Anton
- GenericUnivariateSelect
- SGDClassifier - Anton
- LinearSVC - Stef
- LinearSVR - Stef
- LogisticRegression - Stef
- Var threshold - Anton
62 changes: 62 additions & 0 deletions afsklearn/_class_weight.py
@@ -0,0 +1,62 @@
import numpy as np # FIXME
from sklearn.utils.validation import _deprecate_positional_args


@_deprecate_positional_args
def compute_class_weight(class_weight, *, classes, y):
"""Estimate class weights for unbalanced datasets.
Parameters
----------
class_weight : dict, 'balanced' or None
If 'balanced', class weights will be given by
``n_samples / (n_classes * np.bincount(y))``.
If a dictionary is given, keys are classes and values
are corresponding class weights.
If None is given, the class weights will be uniform.
classes : ndarray
Array of the classes occurring in the data, as given by
``np.unique(y_org)`` with ``y_org`` the original class labels.
y : array-like of shape (n_samples,)
Array of original class labels per sample.
Returns
-------
class_weight_vect : ndarray of shape (n_classes,)
Array with class_weight_vect[i] the weight for i-th class.
References
----------
The "balanced" heuristic is inspired by
Logistic Regression in Rare Events Data, King, Zen, 2001.
"""
# Import error caused by circular imports.
from .preprocessing._label import afLabelEncoder

if set(y) - set(classes):
raise ValueError("classes should include all valid labels that can "
"be in y")
if class_weight is None or len(class_weight) == 0:
# uniform class weights
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
elif class_weight == 'balanced':
# Find the weight of each class as present in y.
le = afLabelEncoder()
y_ind = le.fit_transform(y)
if not all(np.in1d(classes, le.classes_)):
raise ValueError("classes should have valid labels that are in y")

recip_freq = len(y) / (len(le.classes_) *
np.bincount(y_ind).astype(np.float64))
weight = recip_freq[le.transform(classes)]
else:
# user-defined dictionary
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
if not isinstance(class_weight, dict):
raise ValueError("class_weight must be dict, 'balanced', or None,"
" got: %r" % class_weight)
for c in class_weight:
i = np.searchsorted(classes, c)
if i >= len(classes) or classes[i] != c:
raise ValueError("Class label {} not present.".format(c))
else:
weight[i] = class_weight[c]

return weight
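
For reference, a minimal sketch of how the "balanced" heuristic above plays out, assuming afsklearn (and its ArrayFire backend) is installed so that `afLabelEncoder` works; the import path simply follows the file location shown in this diff:

```python
import numpy as np

from afsklearn._class_weight import compute_class_weight

# Imbalanced labels: three samples of class 0, one sample of class 1.
y = np.array([0, 0, 0, 1])
classes = np.unique(y)

# 'balanced' => n_samples / (n_classes * np.bincount(y)) = 4 / (2 * [3, 1])
print(compute_class_weight('balanced', classes=classes, y=y))
# expected: [0.66666667 2.        ]

# A user-supplied dict is validated against `classes` and copied into place.
print(compute_class_weight({0: 1.0, 1: 5.0}, classes=classes, y=y))
# expected: [1. 5.]
```
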
89 changes: 83 additions & 6 deletions afsklearn/_classifier_mixin.py
@@ -1,4 +1,12 @@
import arrayfire as af
import numpy as np # FIXME
from numpy import count_nonzero
from scipy.sparse import csr_matrix
from scipy.special import expit

from ._extmath import safe_sparse_dot
from ._validation import check_array, check_consistent_length, check_is_fitted, column_or_1d
from .base import type_of_target


def _weighted_sum(sample_score, sample_weight, normalize=False):
    if normalize:
@@ -8,6 +16,7 @@ def _weighted_sum(sample_score, sample_weight, normalize=False):
    else:
        return sample_score.sum()


def _check_targets(y_true, y_pred):
"""Check that y_true and y_pred belong to the same classification task
This converts multiclass or binary types to a common shape, and raises a
@@ -63,8 +72,6 @@ def _check_targets(y_true, y_pred):
    return y_type, y_true, y_pred




def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
"""Accuracy classification score.
In multilabel classification, this function computes subset accuracy:
@@ -116,13 +123,15 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    check_consistent_length(y_true, y_pred, sample_weight)
    if y_type.startswith('multilabel'):
        differing_labels = count_nonzero(y_true - y_pred, axis=1)
        diff = (y_true - y_pred).todense()
        differing_labels = count_nonzero(diff, axis=1)
        score = differing_labels == 0
    else:
        score = y_true == y_pred

    return _weighted_sum(score, sample_weight, normalize)
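
The multilabel branch above computes subset accuracy: a sample only counts as correct when every label matches, which is what "no non-zero entries in the densified difference" checks. A small, self-contained sketch of that logic in plain NumPy/SciPy (illustrative only, not a call into the patched estimators):

```python
import numpy as np
from scipy.sparse import csr_matrix

y_true = csr_matrix(np.array([[1, 0, 1],
                              [0, 1, 0]]))
y_pred = csr_matrix(np.array([[1, 0, 1],
                              [0, 1, 1]]))  # second sample has one wrong label

# Densify the sparse difference, then flag rows with any mismatch.
diff = np.asarray((y_true - y_pred).todense())
differing_labels = np.count_nonzero(diff, axis=1)  # [0, 1]
score = differing_labels == 0                      # [True, False]
print(score.mean())                                # subset accuracy: 0.5
```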


class afClassifierMixin:
"""ArrayFire enabled Mixin class for all classifiers in scikit-learn."""

@@ -147,8 +156,76 @@ def score(self, X, y, sample_weight=None):
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        #return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
        return #TMP
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def _more_tags(self):
        return {'requires_y': True}


class afLinearClassifierMixin(afClassifierMixin):
    """Mixin for linear classifiers.
    Handles prediction for sparse and dense X.
    """

    def decision_function(self, X):
        """
        Predict confidence scores for samples.
        The confidence score for a sample is proportional to the signed
        distance of that sample to the hyperplane.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.
        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Confidence scores per (sample, class) combination. In the binary
            case, confidence score for self.classes_[1] where >0 means this
            class would be predicted.
        """
        check_is_fitted(self)

        X = check_array(X, accept_sparse='csr')

        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))

        scores = safe_sparse_dot(X, self.coef_.T,
                                 dense_output=True) + self.intercept_
        return scores.ravel() if scores.shape[1] == 1 else scores

    def predict(self, X):
        """
        Predict class labels for samples in X.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.
        Returns
        -------
        C : array, shape [n_samples]
            Predicted class label per sample.
        """
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def _predict_proba_lr(self, X):
        """Probability estimation for OvR logistic regression.
        Positive class probabilities are computed as
        1. / (1. + np.exp(-self.decision_function(X)));
        multiclass is handled by normalizing that over all classes.
        """
        prob = self.decision_function(X)
        expit(prob, out=prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            # OvR normalization, like LibLinear's predict_probability
            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
            return prob
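
To make the relationship between `decision_function`, `predict` and `_predict_proba_lr` concrete, here is a NumPy-only sketch of the same arithmetic for a fitted 3-class linear model; `coef_`, `intercept_` and `classes_` are made-up values, not taken from this repository:

```python
import numpy as np
from scipy.special import expit

# Hypothetical fitted parameters for a 3-class, 2-feature linear model.
coef_ = np.array([[ 1.0, -0.5],
                  [-0.2,  0.8],
                  [ 0.1,  0.1]])
intercept_ = np.array([0.0, -0.1, 0.05])
classes_ = np.array(['a', 'b', 'c'])

X = np.array([[2.0, 1.0],
              [0.0, 3.0]])

# decision_function: signed distance of each sample to each class hyperplane.
scores = X @ coef_.T + intercept_        # shape (2, 3)

# predict: arg-max over classes (the binary case would threshold at 0).
print(classes_[scores.argmax(axis=1)])   # -> ['a' 'b']

# _predict_proba_lr: per-class sigmoid, then one-vs-rest row normalization.
prob = expit(scores)
prob /= prob.sum(axis=1, keepdims=True)
print(prob.sum(axis=1))                  # rows sum to 1
```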
