diff --git a/disteval/__init__.py b/disteval/__init__.py
index 38a52e6..72714f4 100644
--- a/disteval/__init__.py
+++ b/disteval/__init__.py
@@ -1,251 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from logging import getLogger
+from . import visualization
+from . import evaluation
 
-import numpy as np
-
-from sklearn.model_selection import StratifiedKFold
-
-from .scripts.classifier_characteristics import ClassifierCharacteristics
-from .scripts.recursive_selection_parallel import get_all_auc_scores
-
-
-logger = getLogger('disteval')
+from .basics import prepare_data
+from .recursive_selection_parallel import recursive_feature_selection_roc_auc
+from .basic_classification import cv_test_ref_classification
 
 __author__ = "Mathis Börner and Jens Buß"
 
-
-def cv_test_ref_classification(clf,
-                               X,
-                               y,
-                               sample_weight=None,
-                               cv_steps=10,
-                               return_all_models=False,
-                               random_state=None):
-    """Runs a classification betwenn the test data and the reference data.
-    This classification is run in a cross-validation with a provided
-    classifier. The classifier needs a fit function to start the model
-    building process and a predict_func to obtain the classifier score.
-    The score is expected to be between 0 and 1.
-
-    Parameters
-    ----------
-    clf: object
-        Classifier that should be used for the classification.
-        It needs a fit and a predict_proba function.
-
-    X : numpy.float32array, shape=(n_samples, n_obs)
-        Values describing the samples.
-
-    y : numpy.float32array, shape=(n_samples)
-        Array of the true labels.
-
-    sample_weight : None or numpy.float32array, shape=(n_samples)
-        If weights are used this has to contains the sample weights.
-        None in the case of no weights.
-
-    cv_steps: int, optional (default=10)
-        Number of cross-validation steps. If < 2 the model is trained on
-        all samples and no prediction is made.
-
-    return_all_models: bool, optional (default=False)
-        If all models for the cross-validiation should be saved and
-        returned.
-
-    random_state: None, int or RandomState
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by np.random.
-
-    Returns
-    -------
-    clf: object
-        Trained classifier. If return_all_models, a liste of all trained
-        classifiers, is returned.
-
-    y_pred : numpy.float32array, shape=(n_samples)
-        Array of the classifier score.
-
-    cv_step : numpy.int, shape=(n_samples)
-        Iteration in which the sample was classified.
-    """
-    if not isinstance(random_state, np.random.RandomState):
-        random_state = np.random.RandomState(random_state)
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['callable:fit'] = True
-    desired_characteristics.opts['callable:predict_proba'] = True
-
-    clf_characteristics = ClassifierCharacteristics(clf)
-    assert clf_characteristics.fulfilling(desired_characteristics), \
-        'Classifier sanity check failed!'
-
-    if cv_steps < 2:
-        clf = clf.fit(X=X,
-                      y=y,
-                      sample_weight=sample_weight)
-        return clf, None, None
-
-    else:
-        strat_kfold = StratifiedKFold(n_splits=cv_steps,
-                                      shuffle=True,
-                                      random_state=random_state)
-        cv_iterator = strat_kfold.split(X, y)
-        y_pred = np.zeros_like(y, dtype=float)
-        cv_step = np.zeros_like(y, dtype=int)
-        if return_all_models:
-            from copy import deepcopy
-            trained_clfs = []
-        for i, [train_idx, test_idx] in enumerate(cv_iterator):
-            X_train = X[train_idx]
-            X_test = X[test_idx]
-            y_train = y[train_idx]
-            if sample_weight is None:
-                sample_weight_train = None
-            else:
-                sample_weight_train = sample_weight[train_idx]
-            clf = clf.fit(X=X_train,
-                          y=y_train,
-                          sample_weight=sample_weight_train)
-            y_pred[test_idx] = clf.predict_proba(X_test)[:, 1]
-            cv_step[test_idx] = i
-            if return_all_models:
-                trained_clfs.append(deepcopy(clf))
-        if return_all_models:
-            clf = trained_clfs
-        return clf, y_pred, cv_step
-
-
-def recursive_feature_selection_roc_auc(clf,
-                                        X,
-                                        y,
-                                        sample_weight=None,
-                                        n_features=10,
-                                        cv_steps=10,
-                                        n_jobs=1,
-                                        forward=True,
-                                        matching_features=True):
-    """Method building a feature set in a recursive fashion. Depending
-    on the setting it is run as a forward selection/backward elimination
-    searching for a set of n features with the highest/lowest mismatch.
-    To get the set with the size n starting from n_total features the
-    following approaches are used:
-
-    Forward Selection:
-    To get the k+1 set every not yet selected feature is used to
-    generate (n_total - k sets). The set with the best score is the
-    k + 1 set. Those steps are repeated until n features are selected
-
-    Backward Elimination:
-    To get k+1 eliminated features every not yet eleminated feature is used
-    to generate (n_total - k) sets. The sets consist of all not yet
-    eliminated features minus the one that is tested. The set with the
-    best score determines the next feature to eliminate. Those steps are
-    repeated until n features are eliminated.
-
-    What the best score depends also on the settings:
-    matching_features:
-        forward: min(|auc - 0.5|)
-        not forward: max(|aux - 0.5|)
-
-    not matching_features:
-        forward: max(auc )
-        not forward: min(aux)
-
-
-    Parameters
-    ----------
-    clf: object
-        Classifier that should be used for the classification.
-        It needs a fit and a predict_proba function.
-
-    X : numpy.float32array, shape=(n_samples, n_obs)
-        Values describing the samples.
-
-    y : numpy.float32array, shape=(n_samples)
-        Array of the true labels.
-
-    sample_weight : None or numpy.float32array, shape=(n_samples)
-        If weights are used this has to contains the sample weights.
-        None in the case of no weights.
-
-    n_features : int, optional (default=10)
-        Number of feature that are selected (forward=True) or eliminated
-        (forward=False)
-
-    n_jobs: int, optional (default=1)
-        Number of parallel jobs spawned in each a classification in run.
-        Total number of used cores is the product of n_jobs from the clf
-        and the n_jobs of this function.
-
-    forward: bool, optional (default=True)
-        If True it is a 'forward selection'. If False it is a 'backward
-        elimination'.
-
-    matching_features: bool, optional (default=True)
-        Wether for matching or mismatching feature should be searched
-
-    Returns
-    -------
-    selected_features: list of ints
-        Return a list containing the indeces of X, that were
-        selected/eliminated. The order corresponds to the order the
-        features were selected/eliminated.
-
-    auc_scores: np.array float shape(n_features_total, n_features)
-        Return a array containing the auc values for all steps.
-        np.nan is the feature was already selected in the specific run.
-    """
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['callable:fit'] = True
-    desired_characteristics.opts['callable:predict_proba'] = True
-
-    clf_characteristics = ClassifierCharacteristics(clf)
-    assert clf_characteristics.fulfilling(desired_characteristics), \
-        'Classifier sanity check failed!'
-
-    if n_features > X.shape[1]:
-        logger.info(' \'n_features\' higher than total number of features.'
-                    ' \'n_features\' reduced!')
-        n_features = X.shape[1]
-    auc_scores = np.zeros((X.shape[1], n_features))
-    selected_features = []
-
-    while len(selected_features) != n_features:
-        auc_scores_i = get_all_auc_scores(clf,
-                                          selected_features,
-                                          X,
-                                          y,
-                                          sample_weight=sample_weight,
-                                          cv_steps=cv_steps,
-                                          n_jobs=n_jobs,
-                                          forward=forward)
-        value_best = None
-        index_best = None
-        for idx, auc in enumerate(auc_scores_i):
-            if not np.isfinite(auc):
-                continue
-            if value_best is None:
-                value_best = auc
-                index_best = idx
-            if matching_features:
-                if forward:
-                    if np.abs(auc - 0.5) < np.abs(value_best - 0.5):
-                        value_best = auc
-                        index_best = idx
-                else:
-                    if np.abs(auc - 0.5) > np.abs(value_best - 0.5):
-                        value_best = auc
-                        index_best = idx
-            else:
-                if forward:
-                    if auc > value_best:
-                        value_best = auc
-                        index_best = idx
-                else:
-                    if auc < value_best:
-                        value_best = auc
-                        index_best = idx
-        auc_scores[:, len(selected_features)] = auc_scores_i
-        selected_features.append(index_best)
-    return selected_features, auc_scores
+__all__ = ['evaluation',
+           'visualization',
+           'prepare_data',
+           'recursive_feature_selection_roc_auc',
+           'cv_test_ref_classification']
diff --git a/disteval/basic_classification.py b/disteval/basic_classification.py
index afb1e93..42edafc 100644
--- a/disteval/basic_classification.py
+++ b/disteval/basic_classification.py
@@ -5,6 +5,7 @@
 from sklearn.model_selection import StratifiedKFold
 
+from .basics.classifier_characteristics import ClassifierCharacteristics
 
 logger = getLogger('disteval.basic_classification')
diff --git a/disteval/basics/__init__.py b/disteval/basics/__init__.py
index 5ed0825..b137400 100644
--- a/disteval/basics/__init__.py
+++ b/disteval/basics/__init__.py
@@ -1,6 +1,4 @@
 # -*- coding:utf-8 -*-
-from __future__ import absolute_import, print_function, division
-
 from .classifier_characteristics import ClassifierCharacteristics
 from .preparation import prepare_data, shrink_data
 from .preparation import convert_and_remove_non_finites
diff --git a/disteval/basics/preparation.py b/disteval/basics/preparation.py
index a91e5d1..ef2430a 100644
--- a/disteval/basics/preparation.py
+++ b/disteval/basics/preparation.py
@@ -1,11 +1,12 @@
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from logging import getLogger
 import warnings
 
 import numpy as np
-import logging
-log = logging.getLogger('disteval.preparation')
-log.setLevel(logging.INFO)
+
+
+logger = getLogger('disteval.preparation')
+
 
 def prepare_data(test_df,
                  ref_df,
diff --git a/disteval/evaluation/__init__.py b/disteval/evaluation/__init__.py
index 39e926a..6712e4e 100644
--- a/disteval/evaluation/__init__.py
+++ b/disteval/evaluation/__init__.py
@@ -2,297 +2,10 @@
 """
 Collection of methods to evaluate the results of disteval functions
 """
-import numpy as np
+from .feature_importance_test import feature_importance_mad
+from .feature_importance_test import feature_importance_mad_majority
+from .roc_curve_equivalence_test import roc_curve_equivalence_ks_test
 
-from scipy.stats import norm
-from sklearn.metrics import roc_curve
-
-from ..scripts.classifier_characteristics import ClassifierCharacteristics
-from .stat_tests import kstest_2sample
-
-def feature_importance_mad(clf, alpha=0.05):
-    """This function fetches the feature importance values and runs a
-    criteria using the median absolute deviation. If a feature
-    importance difference to the median importance is greater than
-    a certain threshold and the feature is more important than the
-    median feature, the feature is removed. The threshold is:
-    1.4826 * cdf_norm**-1(1 - alpha/2) * MAD
-    The distribution of the feature importance can be expected, to have
-    a relativ flat distribution up from 0 upto a normal distributed
-    peak. The flat part is for constant or close to constant features.
-    The rest of the features can be expected to be choosen in a random
-    fashion. Therefore they build a normal distributed peak
-    around ~(1. / (n_features - n_constant_features)). To have a robust
-    measure for outliers the meadian absolute diviation (MAD) is used.
-    The definition of the MAD is:
-    median(|X_i - median(X)|)
-    For a mormal distribution the 1 sigma region is included in the
-    region between 1.4826 * MAD - median(X) and 1.4826 * MAD + median(X).
-    With the parameter alpha the used threshold is tuned in a way, for
-    a pure normal distribution alpha / 2 (only features above the
-    median are removed) features would be removed.
-
-    Parameters
-    ----------
-    clf: object or list
-        Trained classifier or list of trained classifier.
-
-    alpha : float, optional (default=0.05)
-        Parameter tuning the threshold. See function describtion.
-
-    Returns
-    -------
-    kept: numpy.boolarray, shape=(n_features)
-        Whether the feature passes the MAD criteria.
-
-    feature_importance: numpy.array, shape=(n_features)
-        Array of the importance values for the features. If a list of
-        classifier is passed, it is the mean over all classifier.
-
-    feature_importance_std: None or numpy.array, shape=(n_features)
-        If a list of classifier is passed the standard deviation is of
-        the feature importance values is returned. Otherwise None is
-        returned
-    """
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['has:feature_importances_'] = True
-
-    if isinstance(clf, list):
-        feature_importances = []
-        for i, clf_i in enumerate(clf):
-            clf_characteristics = ClassifierCharacteristics(clf_i)
-            assert clf_characteristics.fulfilling(desired_characteristics), \
-                'Classifier sanity check failed!'
-            feature_importances.append(clf_i.feature_importances_)
-        feature_importances = np.array(feature_importances)
-        feature_importance = np.mean(feature_importances, axis=0)
-        feature_importance_std = np.std(feature_importances, axis=0, ddof=1)
-    else:
-        clf_characteristics = ClassifierCharacteristics(clf)
-        assert clf_characteristics.fulfilling(desired_characteristics), \
-            'Classifier sanity check failed!'
-        feature_importance = clf.feature_importances_
-        feature_importance_std = np.NaN
-
-    threshold = norm.ppf(1 - alpha/2) * 1.4826  # see docstring
-    median_importance = np.median(feature_importance)
-    MAD = np.median(np.absolute(feature_importance - median_importance))
-    diff = feature_importance - median_importance
-    kept = np.logical_or(np.absolute(diff) < threshold * MAD,
-                         feature_importance <= median_importance)
-    return kept, feature_importance, feature_importance_std
-
-
-def feature_importance_mad_majority(clfs, ratio=0.9, alpha=0.10):
-    """In this function a list of classifier must be provided. To decide
-    if a feature is removed, for each classifier the function
-    feature_importance_mad with the provided alpha is evaluated. And if
-    a feature is removed in atleast ratio-percent of the classifiers
-    the feature is removed. The motivation behind the majority vote is,
-    that if a feature is just above the threshold in a single test
-    because of statistical fluctuation is should be below the threshold
-    for most of the classifications. The alpha can be set less
-    conservative because this criteria is more robust against
-    statistical fluctuationsc.
-
-    Parameters
-    ----------
-    clf: list
-        List of trained classifier.
-
-    ratio : float, optional (default=0.9)
-        Ratio of classifiers in which the feature should be removed.
-
-    alpha : float, optional (default=0.05)
-        Parameter tuning the threshold. See feature_importance_mad
-        describtion.
-
-    Returns
-    -------
-    kept: numpy.boolarray, shape=(n_features)
-        Whether the feature passes the MAD criteria.
-
-    feature_importance: numpy.array, shape=(n_features)
-        Array of the importance values for the features. If a list of
-        classifier is passed, it is the mean over all classifier.
-
-    feature_importance_std: numpy.array, shape=(n_features)
-        If a list of classifier is passed the standard deviation is of
-        the feature importance values is returned. Otherwise None is
-        returned
-    """
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['has:feature_importances_'] = True
-    assert isinstance(clfs, list), 'List of classifier has to be provided'
-    kept_arr = []
-    feature_importances = []
-    for i, clf_i in enumerate(clfs):
-        kept, feature_importance, _ = feature_importance_mad(clf_i,
-                                                             alpha=alpha)
-        kept_arr.append(kept)
-        feature_importances.append(feature_importance)
-    kept_arr = np.array(kept_arr)
-    feature_importances = np.array(feature_importances)
-    feature_importance = np.mean(feature_importances, axis=0)
-    feature_importance_std = np.std(feature_importances, axis=0, ddof=1)
-    kept = np.sum(kept_arr, axis=0) >= ratio * kept_arr.shape[0]
-    return kept, feature_importance, feature_importance_std
-
-
-def roc_curve_equivalence_ks_test(y_pred_a,
-                                  y_pred_b,
-                                  y_true,
-                                  y_true_b=None,
-                                  alpha=0.05,
-                                  scale=False):
-    """Function evaluating the equivalence between the ROC curves of
-    two classifier. The method is described by Andrew P. Bradley in
-    "ROC curve equivalence using the Kolmogorov-Smirnov test"
-    DOI: 10.1016/j.patrec.2012.12.021
-
-    Parameters
-    ----------
-    y_pred_a: numpy.array, shape=(n_samples_a)
-        Predictions of classifier a. The predictions are expected to be
-        between [0, 1].
-
-    y_pred_b: numpy.array, shape=(n_samples_b)
-        Predictions of classifier b. he predictions are expected to be
-        between [0, 1]. If y_true_b is not provided, the
-        sample must be of the same length as sample a.
-
-    y_true : numpy.array, shape=(n_samples_a)
-        True labels for sample_a. If y_true_b is not provided, it is
-        also used as the true labels for sample b
-
-    y_true_b : None numpy.array, shape=(n_samples_b), optional
-        True labels for sample_b. If None y_true is used as labels for
-        sample b.
-
-    alpha : float, optional (default=0.05)
-        Significance for the Kolmogorov Smirnov test.
-
-    scale : boolean, optional (default=False)
-        Wether the predictions should be to the interval [0,1].
-
-    Returns
-    -------
-    passed: bool
-        True if test is accepted. False if the test is rejected. A
-        rejection has the error rate alpha.
-
-    op_point_a: numpy.array, shape=(2,2)
-        [False positive rate, True positive rate] Rate at the operation
-        points of both KS test for sample a.
-
-    op_point_b: numpy.array, shape=(2,2)
-        [False positive rate, True positive rate] Rate at the operation
-        points of both KS test for sample b.
-
-    fpr_b: numpy.array
-        False positive rate for sample b at the thresholds.
-
-    tpr_b: numpy.array
-        True positive rate for sample b at the thresholds.
-
-    threshold: numpy.array
-        Thresholds to the false/true positive rates.
-    """
-
-    bincount_y = np.bincount(y_true)
-    num_positive_a = bincount_y[1]
-    num_negative_a = bincount_y[0]
-    if y_true_b is not None:
-        bincount_y = np.bincount(y_true_b)
-        num_positive_b = bincount_y[1]
-        num_negative_b = bincount_y[0]
-    else:
-        y_true_b = y_true
-        num_positive_b = num_positive_a
-        num_negative_b = num_negative_a
-    if scale:
-        min_pred_a = np.min(y_pred_a)
-        max_pred_a = np.max(y_pred_a)
-        y_pred_a = (y_pred_a - min_pred_a) / (max_pred_a - min_pred_a)
-
-        min_pred_b = np.min(y_pred_b)
-        max_pred_b = np.max(y_pred_b)
-        y_pred_b = (y_pred_b - min_pred_b) / (max_pred_b - min_pred_b)
-
-
-    fpr_a, tpr_a, thresholds_a = roc_curve(y_true,
-                                           y_pred_a,
-                                           drop_intermediate=True)
-    fpr_b, tpr_b, thresholds_b = roc_curve(y_true_b,
-                                           y_pred_b,
-                                           drop_intermediate=True)
-
-    thresholds = np.sort(np.unique(np.hstack((thresholds_a, thresholds_b))))
-
-    order_a = np.argsort(thresholds_a)
-    thresholds_a = thresholds_a[order_a]
-    fpr_a = fpr_a[order_a]
-    tpr_a = tpr_a[order_a]
-
-    order_b = np.argsort(thresholds_b)
-    thresholds_b = thresholds_b[order_b]
-    fpr_b = fpr_b[order_b]
-    tpr_b = tpr_b[order_b]
-
-    fpr_a_full = np.ones_like(thresholds)
-    tpr_a_full = np.ones_like(thresholds)
-    fpr_b_full = np.ones_like(thresholds)
-    tpr_b_full = np.ones_like(thresholds)
-    pointer_a = -1
-    pointer_b = -1
-
-    for i, t_i in enumerate(thresholds):
-        if pointer_a + 1 < len(thresholds_a):
-            if t_i == thresholds_a[pointer_a + 1]:
-                pointer_a += 1
-            fpr_a_full[i] = fpr_a[pointer_a]
-            tpr_a_full[i] = tpr_a[pointer_a]
-            if pointer_a == -1:
-                fpr_a_full[i] = 1.
-                tpr_a_full[i] = 1.
-        else:
-            fpr_a_full[i] = 0.
-            tpr_a_full[i] = 0.
-
-        if pointer_b + 1 < len(thresholds_b):
-            if t_i == thresholds_b[pointer_b + 1]:
-                pointer_b += 1
-            fpr_b_full[i] = fpr_b[pointer_b]
-            tpr_b_full[i] = tpr_b[pointer_b]
-            if pointer_b == -1:
-                fpr_b_full[i] = 1.
-                tpr_b_full[i] = 1.
-        else:
-            fpr_b_full[i] = 0.
-            tpr_b_full[i] = 0.
-
-    passed_neg, idx_max_neg, dist_max_neg = kstest_2sample(
-        x=thresholds,
-        cdf_a=fpr_a_full,
-        cdf_b=fpr_b_full,
-        n_a=num_negative_a,
-        n_b=num_negative_b,
-        alpha=alpha)
-
-    passed_pos, idx_max_pos, dist_max_pos = kstest_2sample(
-        x=thresholds,
-        cdf_a=tpr_a_full,
-        cdf_b=tpr_b_full,
-        n_a=num_positive_a,
-        n_b=num_positive_b,
-        alpha=alpha)
-
-    op_point_n = np.array([[fpr_a_full[idx_max_neg], fpr_b_full[idx_max_neg]],
-                           [tpr_a_full[idx_max_neg], tpr_b_full[idx_max_neg]]])
-    op_point_p = np.array([[fpr_a_full[idx_max_pos], fpr_b_full[idx_max_pos]],
-                           [tpr_a_full[idx_max_pos], tpr_b_full[idx_max_pos]]])
-
-    passed = np.logical_and(passed_pos, passed_neg)
-
-    return passed, op_point_n, op_point_p, \
-        fpr_a_full, tpr_a_full, fpr_b_full, tpr_b_full, thresholds
+__all__ = ['feature_importance_mad',
+           'feature_importance_mad_majority',
+           'roc_curve_equivalence_ks_test']
diff --git a/disteval/evaluation/feature_importance_test.py b/disteval/evaluation/feature_importance_test.py
index 6b7375f..8483f16 100644
--- a/disteval/evaluation/feature_importance_test.py
+++ b/disteval/evaluation/feature_importance_test.py
@@ -1,9 +1,7 @@
 # -*- coding:utf-8 -*-
 import numpy as np
 
-from sklearn.metrics import roc_curve
-
-from ..scripts.classifier_characteristics import ClassifierCharacteristics
+from ..basics.classifier_characteristics import ClassifierCharacteristics
 
 
 def feature_importance_mad(clf, alpha=0.05):
diff --git a/disteval/evaluation/roc_curve_equivalence_test.py b/disteval/evaluation/roc_curve_equivalence_test.py
index 9120ef8..230d084 100644
--- a/disteval/evaluation/roc_curve_equivalence_test.py
+++ b/disteval/evaluation/roc_curve_equivalence_test.py
@@ -4,6 +4,7 @@
 """
 import numpy as np
+from sklearn.metrics import roc_curve
 
 
 def kstest_2sample(x, cdf_a, cdf_b, n_a, n_b, alpha=0.05):
     """Function evaluating the Kolmogorov Smirrnoff Test. Variable
@@ -53,3 +54,162 @@
     passed = factor * d_max <= K_alpha
 
     return passed, idx_max, d_max
+
+
+def roc_curve_equivalence_ks_test(y_pred_a,
+                                  y_pred_b,
+                                  y_true,
+                                  y_true_b=None,
+                                  alpha=0.05,
+                                  scale=False):
+    """Function evaluating the equivalence between the ROC curves of
+    two classifiers. The method is described by Andrew P. Bradley in
+    "ROC curve equivalence using the Kolmogorov-Smirnov test"
+    DOI: 10.1016/j.patrec.2012.12.021
+
+    Parameters
+    ----------
+    y_pred_a: numpy.array, shape=(n_samples_a)
+        Predictions of classifier a. The predictions are expected to be
+        between [0, 1].
+
+    y_pred_b: numpy.array, shape=(n_samples_b)
+        Predictions of classifier b. The predictions are expected to be
+        between [0, 1]. If y_true_b is not provided, the
+        sample must be of the same length as sample a.
+
+    y_true : numpy.array, shape=(n_samples_a)
+        True labels for sample_a. If y_true_b is not provided, it is
+        also used as the true labels for sample b.
+
+    y_true_b : None or numpy.array, shape=(n_samples_b), optional
+        True labels for sample_b. If None, y_true is used as labels for
+        sample b.
+
+    alpha : float, optional (default=0.05)
+        Significance level for the Kolmogorov-Smirnov test.
+
+    scale : boolean, optional (default=False)
+        Whether the predictions should be scaled to the interval [0, 1].
+
+    Returns
+    -------
+    passed: bool
+        True if the test is accepted, False if it is rejected. A
+        rejection has the error rate alpha.
+
+    op_point_n: numpy.array, shape=(2,2)
+        [[fpr_a, fpr_b], [tpr_a, tpr_b]] at the operating point of the
+        KS test on the false positive rates (negative class).
+
+    op_point_p: numpy.array, shape=(2,2)
+        [[fpr_a, fpr_b], [tpr_a, tpr_b]] at the operating point of the
+        KS test on the true positive rates (positive class).
+
+    fpr_a: numpy.array
+        False positive rate for sample a at the thresholds.
+
+    tpr_a: numpy.array
+        True positive rate for sample a at the thresholds.
+
+    fpr_b: numpy.array
+        False positive rate for sample b at the thresholds.
+
+    tpr_b: numpy.array
+        True positive rate for sample b at the thresholds.
+
+    thresholds: numpy.array
+        Thresholds to the false/true positive rates.
+    """
+
+    bincount_y = np.bincount(y_true)
+    num_positive_a = bincount_y[1]
+    num_negative_a = bincount_y[0]
+    if y_true_b is not None:
+        bincount_y = np.bincount(y_true_b)
+        num_positive_b = bincount_y[1]
+        num_negative_b = bincount_y[0]
+    else:
+        y_true_b = y_true
+        num_positive_b = num_positive_a
+        num_negative_b = num_negative_a
+    if scale:
+        min_pred_a = np.min(y_pred_a)
+        max_pred_a = np.max(y_pred_a)
+        y_pred_a = (y_pred_a - min_pred_a) / (max_pred_a - min_pred_a)
+
+        min_pred_b = np.min(y_pred_b)
+        max_pred_b = np.max(y_pred_b)
+        y_pred_b = (y_pred_b - min_pred_b) / (max_pred_b - min_pred_b)
+
+    fpr_a, tpr_a, thresholds_a = roc_curve(y_true,
+                                           y_pred_a,
+                                           drop_intermediate=True)
+    fpr_b, tpr_b, thresholds_b = roc_curve(y_true_b,
+                                           y_pred_b,
+                                           drop_intermediate=True)
+
+    thresholds = np.sort(np.unique(np.hstack((thresholds_a, thresholds_b))))
+
+    order_a = np.argsort(thresholds_a)
+    thresholds_a = thresholds_a[order_a]
+    fpr_a = fpr_a[order_a]
+    tpr_a = tpr_a[order_a]
+
+    order_b = np.argsort(thresholds_b)
+    thresholds_b = thresholds_b[order_b]
+    fpr_b = fpr_b[order_b]
+    tpr_b = tpr_b[order_b]
+
+    fpr_a_full = np.ones_like(thresholds)
+    tpr_a_full = np.ones_like(thresholds)
+    fpr_b_full = np.ones_like(thresholds)
+    tpr_b_full = np.ones_like(thresholds)
+    pointer_a = -1
+    pointer_b = -1
+
+    for i, t_i in enumerate(thresholds):
+        if pointer_a + 1 < len(thresholds_a):
+            if t_i == thresholds_a[pointer_a + 1]:
+                pointer_a += 1
+            fpr_a_full[i] = fpr_a[pointer_a]
+            tpr_a_full[i] = tpr_a[pointer_a]
+            if pointer_a == -1:
+                fpr_a_full[i] = 1.
+                tpr_a_full[i] = 1.
+        else:
+            fpr_a_full[i] = 0.
+            tpr_a_full[i] = 0.
+
+        if pointer_b + 1 < len(thresholds_b):
+            if t_i == thresholds_b[pointer_b + 1]:
+                pointer_b += 1
+            fpr_b_full[i] = fpr_b[pointer_b]
+            tpr_b_full[i] = tpr_b[pointer_b]
+            if pointer_b == -1:
+                fpr_b_full[i] = 1.
+                tpr_b_full[i] = 1.
+        else:
+            fpr_b_full[i] = 0.
+            tpr_b_full[i] = 0.
+
+    passed_neg, idx_max_neg, dist_max_neg = kstest_2sample(
+        x=thresholds,
+        cdf_a=fpr_a_full,
+        cdf_b=fpr_b_full,
+        n_a=num_negative_a,
+        n_b=num_negative_b,
+        alpha=alpha)
+
+    passed_pos, idx_max_pos, dist_max_pos = kstest_2sample(
+        x=thresholds,
+        cdf_a=tpr_a_full,
+        cdf_b=tpr_b_full,
+        n_a=num_positive_a,
+        n_b=num_positive_b,
+        alpha=alpha)
+
+    op_point_n = np.array([[fpr_a_full[idx_max_neg], fpr_b_full[idx_max_neg]],
+                           [tpr_a_full[idx_max_neg], tpr_b_full[idx_max_neg]]])
+    op_point_p = np.array([[fpr_a_full[idx_max_pos], fpr_b_full[idx_max_pos]],
+                           [tpr_a_full[idx_max_pos], tpr_b_full[idx_max_pos]]])
+
+    passed = np.logical_and(passed_pos, passed_neg)
+
+    return passed, op_point_n, op_point_p, \
+        fpr_a_full, tpr_a_full, fpr_b_full, tpr_b_full, thresholds
diff --git a/disteval/recursive_selection_parallel.py b/disteval/recursive_selection_parallel.py
index 1db8f08..af60a1d 100644
--- a/disteval/recursive_selection_parallel.py
+++ b/disteval/recursive_selection_parallel.py
@@ -1,5 +1,6 @@
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from logging import getLogger
+
 from concurrent.futures import ProcessPoolExecutor, wait
 
 import numpy as np
@@ -7,6 +8,146 @@
 from sklearn.metrics import roc_auc_score
 from sklearn.model_selection import StratifiedKFold
 
+from .basics.classifier_characteristics import ClassifierCharacteristics
+
+logger = getLogger('disteval.recursive_selection')
+
+
+def recursive_feature_selection_roc_auc(clf,
+                                        X,
+                                        y,
+                                        sample_weight=None,
+                                        n_features=10,
+                                        cv_steps=10,
+                                        n_jobs=1,
+                                        forward=True,
+                                        matching_features=True):
+    """Method building a feature set in a recursive fashion. Depending
+    on the setting it is run as a forward selection/backward elimination
+    searching for a set of n features with the highest/lowest mismatch.
+    To get the set of size n starting from n_total features the
+    following approaches are used:
+
+    Forward Selection:
+    To get the k+1 set every not yet selected feature is used to
+    generate (n_total - k) sets. The set with the best score is the
+    k + 1 set. Those steps are repeated until n features are selected.
+
+    Backward Elimination:
+    To get k+1 eliminated features every not yet eliminated feature is used
+    to generate (n_total - k) sets. The sets consist of all not yet
+    eliminated features minus the one that is tested. The set with the
+    best score determines the next feature to eliminate. Those steps are
+    repeated until n features are eliminated.
+
+    What counts as the best score also depends on the settings:
+    matching_features:
+        forward: min(|auc - 0.5|)
+        not forward: max(|auc - 0.5|)
+
+    not matching_features:
+        forward: max(auc)
+        not forward: min(auc)
+
+
+    Parameters
+    ----------
+    clf: object
+        Classifier that should be used for the classification.
+        It needs a fit and a predict_proba function.
+
+    X : numpy.float32array, shape=(n_samples, n_obs)
+        Values describing the samples.
+
+    y : numpy.float32array, shape=(n_samples)
+        Array of the true labels.
+
+    sample_weight : None or numpy.float32array, shape=(n_samples)
+        If weights are used this has to contain the sample weights.
+        None in the case of no weights.
+
+    n_features : int, optional (default=10)
+        Number of features that are selected (forward=True) or eliminated
+        (forward=False).
+
+    n_jobs: int, optional (default=1)
+        Number of parallel jobs spawned for each classification run.
+        Total number of used cores is the product of n_jobs from the clf
+        and the n_jobs of this function.
+
+    forward: bool, optional (default=True)
+        If True it is a 'forward selection'. If False it is a 'backward
+        elimination'.
+
+    matching_features: bool, optional (default=True)
+        Whether matching or mismatching features should be searched for.
+
+    Returns
+    -------
+    selected_features: list of ints
+        List containing the indices of X that were selected/eliminated.
+        The order corresponds to the order in which the features were
+        selected/eliminated.
+
+    auc_scores: np.array, shape=(n_features_total, n_features)
+        Array containing the AUC values for all steps. np.nan if the
+        feature was already selected/eliminated in the specific step.
+    """
+    desired_characteristics = ClassifierCharacteristics()
+    desired_characteristics.opts['callable:fit'] = True
+    desired_characteristics.opts['callable:predict_proba'] = True
+
+    clf_characteristics = ClassifierCharacteristics(clf)
+    assert clf_characteristics.fulfilling(desired_characteristics), \
+        'Classifier sanity check failed!'
+
+    if n_features > X.shape[1]:
+        logger.info(' \'n_features\' higher than total number of features.'
+                    ' \'n_features\' reduced!')
+        n_features = X.shape[1]
+    auc_scores = np.zeros((X.shape[1], n_features))
+    selected_features = []
+
+    while len(selected_features) != n_features:
+        auc_scores_i = get_all_auc_scores(clf,
+                                          selected_features,
+                                          X,
+                                          y,
+                                          sample_weight=sample_weight,
+                                          cv_steps=cv_steps,
+                                          n_jobs=n_jobs,
+                                          forward=forward)
+        value_best = None
+        index_best = None
+        for idx, auc in enumerate(auc_scores_i):
+            if not np.isfinite(auc):
+                continue
+            if value_best is None:
+                value_best = auc
+                index_best = idx
+            if matching_features:
+                if forward:
+                    if np.abs(auc - 0.5) < np.abs(value_best - 0.5):
+                        value_best = auc
+                        index_best = idx
+                else:
+                    if np.abs(auc - 0.5) > np.abs(value_best - 0.5):
+                        value_best = auc
+                        index_best = idx
+            else:
+                if forward:
+                    if auc > value_best:
+                        value_best = auc
+                        index_best = idx
+                else:
+                    if auc < value_best:
+                        value_best = auc
+                        index_best = idx
+        auc_scores[:, len(selected_features)] = auc_scores_i
+        selected_features.append(index_best)
+    return selected_features, auc_scores
+
+
 def __single_auc_score__(feature_i,
                          clf,
@@ -184,3 +325,6 @@
                                        sample_weight=sample_weight)
             auc_scores[feature_i] = auc
     return auc_scores
+
+
+
diff --git a/disteval/visualization/__init__.py b/disteval/visualization/__init__.py
index 48e7e53..0786f5e 100644
--- a/disteval/visualization/__init__.py
+++ b/disteval/visualization/__init__.py
@@ -4,7 +4,7 @@
 Collection of methods to visualize the results of disteval functions
 """
 from .feature_importance_test import visualize_feature_importance_mad
-from .roc_curve_equivalence_test import roc_curve_equivalence_ks_test
+from .roc_curve_equivalence_test import visualize_roc_curve_equivalence_test
 from .comparison_plotter import ComparisonPlotter
 
 __all__ = ['visualize_feature_importance_mad',
diff --git a/examples/roc_curve_equivalence.py b/examples/roc_curve_equivalence.py
index 133f51a..ae51c2d 100644
--- a/examples/roc_curve_equivalence.py
+++ b/examples/roc_curve_equivalence.py
@@ -4,6 +4,7 @@
 '''
 import logging
 import matplotlib
+matplotlib.use('Agg')
 import numpy as np
 from sklearn.datasets import make_classification
@@ -13,8 +14,8 @@
 from disteval import evaluation as eval
 from disteval import visualization as visu
 
-log = logging.getLogger("disteval.fact_example")
-matplotlib.use('Agg')
+log = logging.getLogger("disteval.roc_curve_equivalence_example")
+
 
 def main():
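
For orientation, the following is a minimal usage sketch of the public API as re-exported by the new disteval/__init__.py above. The synthetic data, the RandomForestClassifier and all parameter values are illustrative assumptions and not part of this changeset; any classifier providing fit, predict_proba and feature_importances_ should work the same way.

# Hypothetical usage sketch (not part of the diff): test-vs-reference
# classification followed by the MAD-based feature importance check.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from disteval import cv_test_ref_classification
from disteval import evaluation as eval

# Stand-in for real test/reference data: y == 1 marks "test-like" samples.
X, y = make_classification(n_samples=2000, n_features=10, random_state=0)

clf = RandomForestClassifier(n_estimators=50, random_state=0)
# return_all_models=True yields one trained classifier per CV step.
clfs, y_pred, cv_step = cv_test_ref_classification(
    clf, X, y, cv_steps=5, return_all_models=True, random_state=0)

# Majority vote of the MAD criterion over the cross-validation models.
kept, importance, importance_std = eval.feature_importance_mad_majority(
    clfs, ratio=0.9, alpha=0.10)
print('features passing the MAD criteria:', np.flatnonzero(kept))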
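
A second sketch, continuing the assumptions above, for the ROC curve equivalence test that this diff moves into disteval/evaluation/roc_curve_equivalence_test.py; the LogisticRegression comparison model is again only an illustrative choice.

# Hypothetical continuation of the sketch above: compare the ROC curves of
# two classifiers with the Kolmogorov-Smirnov based equivalence test.
from sklearn.linear_model import LogisticRegression

clf_a = RandomForestClassifier(n_estimators=50, random_state=1).fit(X, y)
clf_b = LogisticRegression(max_iter=1000).fit(X, y)

result = eval.roc_curve_equivalence_ks_test(clf_a.predict_proba(X)[:, 1],
                                            clf_b.predict_proba(X)[:, 1],
                                            y_true=y,
                                            alpha=0.05)
passed = result[0]  # remaining entries: operating points, rates, thresholds
print('ROC curves equivalent within alpha=0.05:', bool(passed))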