
Commit

Merge pull request arrayfire#3 from syurkevi/gaussian_random_projection
Gaussian random projection
syurkevi authored Sep 22, 2021
2 parents 15cc94d + 0683b6b commit 2761ded
Showing 54 changed files with 8,099 additions and 1,068 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -1,5 +1,13 @@
# Python cache
__pycache__/

# Build
build
dist
*.egg-info

# Static typing
.mypy_cache

# Virtual env
venv
42 changes: 41 additions & 1 deletion README.md
@@ -2,6 +2,46 @@

Monkey-patch scikit-learn with ArrayFire-accelerated variants.
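
A hypothetical usage sketch of what the monkey-patching workflow could look like; the `patch_sklearn()` entry point below is an assumption made for illustration, not the package's documented API:

```python
# Hypothetical example: `patch_sklearn()` is an assumed entry point, not taken
# from this repository; check the package itself for the real interface.
import afsklearn

afsklearn.patch_sklearn()  # assumed: replace supported sklearn classes in place

# After patching, the usual sklearn imports would resolve to the
# ArrayFire-accelerated variants.
from sklearn.neural_network import MLPClassifier
```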

## Quick Start
## Installation

```console
pip install -r requirements.txt
```

## Tests

To run all tests:

```console
pytest .
```

To run a specific test:

```console
pytest tests/test_mlp.py
```

To run with time measurements, use the `-s` flag:

```console
pytest tests/test_mlp.py -s
```

---

## TODO

- GradientBoosting
- OneHotEncoder - Anton
- RandomForest
- SelectFWE
- QuantileTransformer
- ExtraTreesClassifier
- Imputer - Anton
- GenericUnivariateSelect
- SGDClassifier - Anton
- LinearSVC - Stef
- LinearSVR - Stef
- LogisticRegression - Stef
- Var threshold - Anton
62 changes: 62 additions & 0 deletions afsklearn/_class_weight.py
@@ -0,0 +1,62 @@
import numpy as np # FIXME
from sklearn.utils.validation import _deprecate_positional_args


@_deprecate_positional_args
def compute_class_weight(class_weight, *, classes, y):
"""Estimate class weights for unbalanced datasets.
Parameters
----------
class_weight : dict, 'balanced' or None
If 'balanced', class weights will be given by
``n_samples / (n_classes * np.bincount(y))``.
If a dictionary is given, keys are classes and values
are corresponding class weights.
If None is given, the class weights will be uniform.
classes : ndarray
Array of the classes occurring in the data, as given by
``np.unique(y_org)`` with ``y_org`` the original class labels.
y : array-like of shape (n_samples,)
Array of original class labels per sample.
Returns
-------
class_weight_vect : ndarray of shape (n_classes,)
Array with class_weight_vect[i] the weight for i-th class.
References
----------
The "balanced" heuristic is inspired by
Logistic Regression in Rare Events Data, King, Zen, 2001.
"""
# Import error caused by circular imports.
from .preprocessing._label import afLabelEncoder

if set(y) - set(classes):
raise ValueError("classes should include all valid labels that can "
"be in y")
if class_weight is None or len(class_weight) == 0:
# uniform class weights
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
elif class_weight == 'balanced':
# Find the weight of each class as present in y.
le = afLabelEncoder()
y_ind = le.fit_transform(y)
if not all(np.in1d(classes, le.classes_)):
raise ValueError("classes should have valid labels that are in y")

recip_freq = len(y) / (len(le.classes_) *
np.bincount(y_ind).astype(np.float64))
weight = recip_freq[le.transform(classes)]
else:
# user-defined dictionary
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
if not isinstance(class_weight, dict):
raise ValueError("class_weight must be dict, 'balanced', or None,"
" got: %r" % class_weight)
for c in class_weight:
i = np.searchsorted(classes, c)
if i >= len(classes) or classes[i] != c:
raise ValueError("Class label {} not present.".format(c))
else:
weight[i] = class_weight[c]

return weight
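
For reference, a minimal sketch of how the "balanced" heuristic above plays out, assuming afsklearn (and its ArrayFire backend) is installed so that `afLabelEncoder` works; the import path simply follows the file location shown in this diff:

```python
import numpy as np

from afsklearn._class_weight import compute_class_weight

# Imbalanced labels: three samples of class 0, one sample of class 1.
y = np.array([0, 0, 0, 1])
classes = np.unique(y)

# 'balanced' => n_samples / (n_classes * np.bincount(y)) = 4 / (2 * [3, 1])
print(compute_class_weight('balanced', classes=classes, y=y))
# expected: [0.66666667 2.        ]

# A user-supplied dict is validated against `classes` and copied into place.
print(compute_class_weight({0: 1.0, 1: 5.0}, classes=classes, y=y))
# expected: [1. 5.]
```
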
89 changes: 83 additions & 6 deletions afsklearn/_classifier_mixin.py
@@ -1,4 +1,12 @@
import arrayfire as af
import numpy as np # FIXME
from numpy import count_nonzero
from scipy.sparse import csr_matrix
from scipy.special import expit

from ._extmath import safe_sparse_dot
from ._validation import check_array, check_consistent_length, check_is_fitted, column_or_1d
from .base import type_of_target


def _weighted_sum(sample_score, sample_weight, normalize=False):
    if normalize:
@@ -8,6 +16,7 @@ def _weighted_sum(sample_score, sample_weight, normalize=False):
    else:
        return sample_score.sum()


def _check_targets(y_true, y_pred):
"""Check that y_true and y_pred belong to the same classification task
This converts multiclass or binary types to a common shape, and raises a
@@ -63,8 +72,6 @@ def _check_targets(y_true, y_pred):
    return y_type, y_true, y_pred




def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
"""Accuracy classification score.
In multilabel classification, this function computes subset accuracy:
@@ -116,13 +123,15 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    check_consistent_length(y_true, y_pred, sample_weight)
    if y_type.startswith('multilabel'):
        differing_labels = count_nonzero(y_true - y_pred, axis=1)
        diff = (y_true - y_pred).todense()
        differing_labels = count_nonzero(diff, axis=1)
        score = differing_labels == 0
    else:
        score = y_true == y_pred

    return _weighted_sum(score, sample_weight, normalize)
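
The multilabel branch above computes subset accuracy: a sample only counts as correct when every label matches, which is what "no non-zero entries in the densified difference" checks. A small, self-contained sketch of that logic in plain NumPy/SciPy (illustrative only, not a call into the patched estimators):

```python
import numpy as np
from scipy.sparse import csr_matrix

y_true = csr_matrix(np.array([[1, 0, 1],
                              [0, 1, 0]]))
y_pred = csr_matrix(np.array([[1, 0, 1],
                              [0, 1, 1]]))  # second sample has one wrong label

# Densify the sparse difference, then flag rows with any mismatch.
diff = np.asarray((y_true - y_pred).todense())
differing_labels = np.count_nonzero(diff, axis=1)  # [0, 1]
score = differing_labels == 0                      # [True, False]
print(score.mean())                                # subset accuracy: 0.5
```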


class afClassifierMixin:
"""ArrayFire enabled Mixin class for all classifiers in scikit-learn."""

@@ -147,8 +156,76 @@ def score(self, X, y, sample_weight=None):
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        #return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
        return #TMP
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def _more_tags(self):
        return {'requires_y': True}


class afLinearClassifierMixin(afClassifierMixin):
    """Mixin for linear classifiers.
    Handles prediction for sparse and dense X.
    """

    def decision_function(self, X):
        """
        Predict confidence scores for samples.
        The confidence score for a sample is proportional to the signed
        distance of that sample to the hyperplane.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.
        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Confidence scores per (sample, class) combination. In the binary
            case, confidence score for self.classes_[1] where >0 means this
            class would be predicted.
        """
        check_is_fitted(self)

        X = check_array(X, accept_sparse='csr')

        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))

        scores = safe_sparse_dot(X, self.coef_.T,
                                 dense_output=True) + self.intercept_
        return scores.ravel() if scores.shape[1] == 1 else scores

    def predict(self, X):
        """
        Predict class labels for samples in X.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.
        Returns
        -------
        C : array, shape [n_samples]
            Predicted class label per sample.
        """
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            indices = (scores > 0).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]

    def _predict_proba_lr(self, X):
        """Probability estimation for OvR logistic regression.
        Positive class probabilities are computed as
        1. / (1. + np.exp(-self.decision_function(X)));
        multiclass is handled by normalizing that over all classes.
        """
        prob = self.decision_function(X)
        expit(prob, out=prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            # OvR normalization, like LibLinear's predict_probability
            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
            return prob
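
To make the relationship between `decision_function`, `predict` and `_predict_proba_lr` concrete, here is a NumPy-only sketch of the same arithmetic for a fitted 3-class linear model; `coef_`, `intercept_` and `classes_` are made-up values, not taken from this repository:

```python
import numpy as np
from scipy.special import expit

# Hypothetical fitted parameters for a 3-class, 2-feature linear model.
coef_ = np.array([[ 1.0, -0.5],
                  [-0.2,  0.8],
                  [ 0.1,  0.1]])
intercept_ = np.array([0.0, -0.1, 0.05])
classes_ = np.array(['a', 'b', 'c'])

X = np.array([[2.0, 1.0],
              [0.0, 3.0]])

# decision_function: signed distance of each sample to each class hyperplane.
scores = X @ coef_.T + intercept_        # shape (2, 3)

# predict: arg-max over classes (the binary case would threshold at 0).
print(classes_[scores.argmax(axis=1)])   # -> ['a' 'b']

# _predict_proba_lr: per-class sigmoid, then one-vs-rest row normalization.
prob = expit(scores)
prob /= prob.sum(axis=1, keepdims=True)
print(prob.sum(axis=1))                  # rows sum to 1
```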
