diff --git a/disteval/__init__.py b/disteval/__init__.py
index 38a52e6..72714f4 100644
--- a/disteval/__init__.py
+++ b/disteval/__init__.py
@@ -1,251 +1,16 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-from logging import getLogger
+from . import visualization
+from . import evaluation
 
-import numpy as np
-
-from sklearn.model_selection import StratifiedKFold
-
-from .scripts.classifier_characteristics import ClassifierCharacteristics
-from .scripts.recursive_selection_parallel import get_all_auc_scores
-
-
-logger = getLogger('disteval')
+from .basics import prepare_data
+from .recursive_selection_parallel import recursive_feature_selection_roc_auc
+from .basic_classification import cv_test_ref_classification
 
 __author__ = "Mathis Börner and Jens Buß"
 
-
-def cv_test_ref_classification(clf,
-                               X,
-                               y,
-                               sample_weight=None,
-                               cv_steps=10,
-                               return_all_models=False,
-                               random_state=None):
-    """Runs a classification betwenn the test data and the reference data.
-    This classification is run in a cross-validation with a provided
-    classifier. The classifier needs a fit function to start the model
-    building process and a predict_func to obtain the classifier score.
-    The score is expected to be between 0 and 1.
-
-    Parameters
-    ----------
-    clf: object
-        Classifier that should be used for the classification.
-        It needs a fit and a predict_proba function.
-
-    X : numpy.float32array, shape=(n_samples, n_obs)
-        Values describing the samples.
-
-    y : numpy.float32array, shape=(n_samples)
-        Array of the true labels.
-
-    sample_weight : None or numpy.float32array, shape=(n_samples)
-        If weights are used this has to contains the sample weights.
-        None in the case of no weights.
-
-    cv_steps: int, optional (default=10)
-        Number of cross-validation steps. If < 2 the model is trained on
-        all samples and no prediction is made.
-
-    return_all_models: bool, optional (default=False)
-        If all models for the cross-validiation should be saved and
-        returned.
-
-    random_state: None, int or RandomState
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by np.random.
-
-    Returns
-    -------
-    clf: object
-        Trained classifier. If return_all_models, a liste of all trained
-        classifiers, is returned.
-
-    y_pred : numpy.float32array, shape=(n_samples)
-        Array of the classifier score.
-
-    cv_step : numpy.int, shape=(n_samples)
-        Iteration in which the sample was classified.
-    """
-    if not isinstance(random_state, np.random.RandomState):
-        random_state = np.random.RandomState(random_state)
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['callable:fit'] = True
-    desired_characteristics.opts['callable:predict_proba'] = True
-
-    clf_characteristics = ClassifierCharacteristics(clf)
-    assert clf_characteristics.fulfilling(desired_characteristics), \
-        'Classifier sanity check failed!'
-
-    if cv_steps < 2:
-        clf = clf.fit(X=X,
-                      y=y,
-                      sample_weight=sample_weight)
-        return clf, None, None
-
-    else:
-        strat_kfold = StratifiedKFold(n_splits=cv_steps,
-                                      shuffle=True,
-                                      random_state=random_state)
-        cv_iterator = strat_kfold.split(X, y)
-        y_pred = np.zeros_like(y, dtype=float)
-        cv_step = np.zeros_like(y, dtype=int)
-        if return_all_models:
-            from copy import deepcopy
-            trained_clfs = []
-        for i, [train_idx, test_idx] in enumerate(cv_iterator):
-            X_train = X[train_idx]
-            X_test = X[test_idx]
-            y_train = y[train_idx]
-            if sample_weight is None:
-                sample_weight_train = None
-            else:
-                sample_weight_train = sample_weight[train_idx]
-            clf = clf.fit(X=X_train,
-                          y=y_train,
-                          sample_weight=sample_weight_train)
-            y_pred[test_idx] = clf.predict_proba(X_test)[:, 1]
-            cv_step[test_idx] = i
-            if return_all_models:
-                trained_clfs.append(deepcopy(clf))
-        if return_all_models:
-            clf = trained_clfs
-        return clf, y_pred, cv_step
-
-
-def recursive_feature_selection_roc_auc(clf,
-                                        X,
-                                        y,
-                                        sample_weight=None,
-                                        n_features=10,
-                                        cv_steps=10,
-                                        n_jobs=1,
-                                        forward=True,
-                                        matching_features=True):
-    """Method building a feature set in a recursive fashion. Depending
-    on the setting it is run as a forward selection/backward elimination
-    searching for a set of n features with the highest/lowest mismatch.
-    To get the set with the size n starting from n_total features the
-    following approaches are used:
-
-    Forward Selection:
-    To get the k+1 set every not yet selected feature is used to
-    generate (n_total - k sets). The set with the best score is the
-    k + 1 set. Those steps are repeated until n features are selected
-
-    Backward Elimination:
-    To get k+1 eliminated features every not yet eleminated feature is used
-    to generate (n_total - k) sets. The sets consist of all not yet
-    eliminated features minus the one that is tested. The set with the
-    best score determines the next feature to eliminate. Those steps are
-    repeated until n features are eliminated.
-
-    What the best score depends also on the settings:
-    matching_features:
-        forward: min(|auc - 0.5|)
-        not forward: max(|aux - 0.5|)
-
-    not matching_features:
-        forward: max(auc )
-        not forward: min(aux)
-
-
-    Parameters
-    ----------
-    clf: object
-        Classifier that should be used for the classification.
-        It needs a fit and a predict_proba function.
-
-    X : numpy.float32array, shape=(n_samples, n_obs)
-        Values describing the samples.
-
-    y : numpy.float32array, shape=(n_samples)
-        Array of the true labels.
-
-    sample_weight : None or numpy.float32array, shape=(n_samples)
-        If weights are used this has to contains the sample weights.
-        None in the case of no weights.
-
-    n_features : int, optional (default=10)
-        Number of feature that are selected (forward=True) or eliminated
-        (forward=False)
-
-    n_jobs: int, optional (default=1)
-        Number of parallel jobs spawned in each a classification in run.
-        Total number of used cores is the product of n_jobs from the clf
-        and the n_jobs of this function.
-
-    forward: bool, optional (default=True)
-        If True it is a 'forward selection'. If False it is a 'backward
-        elimination'.
-
-    matching_features: bool, optional (default=True)
-        Wether for matching or mismatching feature should be searched
-
-    Returns
-    -------
-    selected_features: list of ints
-        Return a list containing the indeces of X, that were
-        selected/eliminated. The order corresponds to the order the
-        features were selected/eliminated.
-
-    auc_scores: np.array float shape(n_features_total, n_features)
-        Return a array containing the auc values for all steps.
-        np.nan is the feature was already selected in the specific run.
-    """
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['callable:fit'] = True
-    desired_characteristics.opts['callable:predict_proba'] = True
-
-    clf_characteristics = ClassifierCharacteristics(clf)
-    assert clf_characteristics.fulfilling(desired_characteristics), \
-        'Classifier sanity check failed!'
-
-    if n_features > X.shape[1]:
-        logger.info(' \'n_features\' higher than total number of features.'
-                    ' \'n_features\' reduced!')
-        n_features = X.shape[1]
-    auc_scores = np.zeros((X.shape[1], n_features))
-    selected_features = []
-
-    while len(selected_features) != n_features:
-        auc_scores_i = get_all_auc_scores(clf,
-                                          selected_features,
-                                          X,
-                                          y,
-                                          sample_weight=sample_weight,
-                                          cv_steps=cv_steps,
-                                          n_jobs=n_jobs,
-                                          forward=forward)
-        value_best = None
-        index_best = None
-        for idx, auc in enumerate(auc_scores_i):
-            if not np.isfinite(auc):
-                continue
-            if value_best is None:
-                value_best = auc
-                index_best = idx
-            if matching_features:
-                if forward:
-                    if np.abs(auc - 0.5) < np.abs(value_best - 0.5):
-                        value_best = auc
-                        index_best = idx
-                else:
-                    if np.abs(auc - 0.5) > np.abs(value_best - 0.5):
-                        value_best = auc
-                        index_best = idx
-            else:
-                if forward:
-                    if auc > value_best:
-                        value_best = auc
-                        index_best = idx
-                else:
-                    if auc < value_best:
-                        value_best = auc
-                        index_best = idx
-        auc_scores[:, len(selected_features)] = auc_scores_i
-        selected_features.append(index_best)
-    return selected_features, auc_scores
+__all__ = ['evaluation',
+           'visualization',
+           'prepare_data',
+           'recursive_feature_selection_roc_auc',
+           'cv_test_ref_classification']
diff --git a/disteval/basic_classification.py b/disteval/basic_classification.py
index afb1e93..42edafc 100644
--- a/disteval/basic_classification.py
+++ b/disteval/basic_classification.py
@@ -5,6 +5,7 @@
 from sklearn.model_selection import StratifiedKFold
 
+from .basics.classifier_characteristics import ClassifierCharacteristics
 
 logger = getLogger('disteval.basic_classification')
diff --git a/disteval/basics/__init__.py b/disteval/basics/__init__.py
index 5ed0825..b137400 100644
--- a/disteval/basics/__init__.py
+++ b/disteval/basics/__init__.py
@@ -1,6 +1,4 @@
 # -*- coding:utf-8 -*-
-from __future__ import absolute_import, print_function, division
-
 from .classifier_characteristics import ClassifierCharacteristics
 from .preparation import prepare_data, shrink_data
 from .preparation import convert_and_remove_non_finites
diff --git a/disteval/basics/preparation.py b/disteval/basics/preparation.py
index a91e5d1..ef2430a 100644
--- a/disteval/basics/preparation.py
+++ b/disteval/basics/preparation.py
@@ -1,11 +1,12 @@
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from logging import getLogger
 import warnings
 
 import numpy as np
-import logging
-log = logging.getLogger('disteval.preparation')
-log.setLevel(logging.INFO)
+
+
+logger = getLogger('disteval.preparation')
+
 
 def prepare_data(test_df,
                  ref_df,
diff --git a/disteval/evaluation/__init__.py b/disteval/evaluation/__init__.py
index 39e926a..6712e4e 100644
--- a/disteval/evaluation/__init__.py
+++ b/disteval/evaluation/__init__.py
@@ -2,297 +2,10 @@
 """
 Collection of methods to evaluate the results of disteval functions
 """
-import numpy as np
+from .feature_importance_test import feature_importance_mad
+from .feature_importance_test import feature_importance_mad_majority
+from .roc_curve_equivalence_test import roc_curve_equivalence_ks_test
 
-from scipy.stats import norm
-from sklearn.metrics import roc_curve
-
-from ..scripts.classifier_characteristics import ClassifierCharacteristics
-from .stat_tests import kstest_2sample
-
-def feature_importance_mad(clf, alpha=0.05):
-    """This function fetches the feature importance values and runs a
-    criteria using the median absolute deviation. If a feature
-    importance difference to the median importance is greater than
-    a certain threshold and the feature is more important than the
-    median feature, the feature is removed. The threshold is:
-    1.4826 * cdf_norm**-1(1 - alpha/2) * MAD
-    The distribution of the feature importance can be expected, to have
-    a relativ flat distribution up from 0 upto a normal distributed
-    peak. The flat part is for constant or close to constant features.
-    The rest of the features can be expected to be choosen in a random
-    fashion. Therefore they build a normal distributed peak
-    around ~(1. / (n_features - n_constant_features)). To have a robust
-    measure for outliers the meadian absolute diviation (MAD) is used.
-    The definition of the MAD is:
-    median(|X_i - median(X)|)
-    For a mormal distribution the 1 sigma region is included in the
-    region between 1.4826 * MAD - median(X) and 1.4826 * MAD + median(X).
-    With the parameter alpha the used threshold is tuned in a way, for
-    a pure normal distribution alpha / 2 (only features above the
-    median are removed) features would be removed.
-
-    Parameters
-    ----------
-    clf: object or list
-        Trained classifier or list of trained classifier.
-
-    alpha : float, optional (default=0.05)
-        Parameter tuning the threshold. See function describtion.
-
-    Returns
-    -------
-    kept: numpy.boolarray, shape=(n_features)
-        Whether the feature passes the MAD criteria.
-
-    feature_importance: numpy.array, shape=(n_features)
-        Array of the importance values for the features. If a list of
-        classifier is passed, it is the mean over all classifier.
-
-    feature_importance_std: None or numpy.array, shape=(n_features)
-        If a list of classifier is passed the standard deviation is of
-        the feature importance values is returned. Otherwise None is
-        returned
-    """
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['has:feature_importances_'] = True
-
-    if isinstance(clf, list):
-        feature_importances = []
-        for i, clf_i in enumerate(clf):
-            clf_characteristics = ClassifierCharacteristics(clf_i)
-            assert clf_characteristics.fulfilling(desired_characteristics), \
-                'Classifier sanity check failed!'
-            feature_importances.append(clf_i.feature_importances_)
-        feature_importances = np.array(feature_importances)
-        feature_importance = np.mean(feature_importances, axis=0)
-        feature_importance_std = np.std(feature_importances, axis=0, ddof=1)
-    else:
-        clf_characteristics = ClassifierCharacteristics(clf)
-        assert clf_characteristics.fulfilling(desired_characteristics), \
-            'Classifier sanity check failed!'
-        feature_importance = clf.feature_importances_
-        feature_importance_std = np.NaN
-
-    threshold = norm.ppf(1 - alpha/2) * 1.4826  # see docstring
-    median_importance = np.median(feature_importance)
-    MAD = np.median(np.absolute(feature_importance - median_importance))
-    diff = feature_importance - median_importance
-    kept = np.logical_or(np.absolute(diff) < threshold * MAD,
-                         feature_importance <= median_importance)
-    return kept, feature_importance, feature_importance_std
-
-
-def feature_importance_mad_majority(clfs, ratio=0.9, alpha=0.10):
-    """In this function a list of classifier must be provided. To decide
-    if a feature is removed, for each classifier the function
-    feature_importance_mad with the provided alpha is evaluated. And if
-    a feature is removed in atleast ratio-percent of the classifiers
-    the feature is removed. The motivation behind the majority vote is,
-    that if a feature is just above the threshold in a single test
-    because of statistical fluctuation is should be below the threshold
-    for most of the classifications. The alpha can be set less
-    conservative because this criteria is more robust against
-    statistical fluctuationsc.
-
-    Parameters
-    ----------
-    clf: list
-        List of trained classifier.
-
-    ratio : float, optional (default=0.9)
-        Ratio of classifiers in which the feature should be removed.
-
-    alpha : float, optional (default=0.05)
-        Parameter tuning the threshold. See feature_importance_mad
-        describtion.
-
-    Returns
-    -------
-    kept: numpy.boolarray, shape=(n_features)
-        Whether the feature passes the MAD criteria.
-
-    feature_importance: numpy.array, shape=(n_features)
-        Array of the importance values for the features. If a list of
-        classifier is passed, it is the mean over all classifier.
-
-    feature_importance_std: numpy.array, shape=(n_features)
-        If a list of classifier is passed the standard deviation is of
-        the feature importance values is returned. Otherwise None is
-        returned
-    """
-    desired_characteristics = ClassifierCharacteristics()
-    desired_characteristics.opts['has:feature_importances_'] = True
-    assert isinstance(clfs, list), 'List of classifier has to be provided'
-    kept_arr = []
-    feature_importances = []
-    for i, clf_i in enumerate(clfs):
-        kept, feature_importance, _ = feature_importance_mad(clf_i,
-                                                             alpha=alpha)
-        kept_arr.append(kept)
-        feature_importances.append(feature_importance)
-    kept_arr = np.array(kept_arr)
-    feature_importances = np.array(feature_importances)
-    feature_importance = np.mean(feature_importances, axis=0)
-    feature_importance_std = np.std(feature_importances, axis=0, ddof=1)
-    kept = np.sum(kept_arr, axis=0) >= ratio * kept_arr.shape[0]
-    return kept, feature_importance, feature_importance_std
-
-
-def roc_curve_equivalence_ks_test(y_pred_a,
-                                  y_pred_b,
-                                  y_true,
-                                  y_true_b=None,
-                                  alpha=0.05,
-                                  scale=False):
-    """Function evaluating the equivalence between the ROC curves of
-    two classifier. The method is described by Andrew P. Bradley in
-    "ROC curve equivalence using the Kolmogorov-Smirnov test"
-    DOI: 10.1016/j.patrec.2012.12.021
-
-    Parameters
-    ----------
-    y_pred_a: numpy.array, shape=(n_samples_a)
-        Predictions of classifier a. The predictions are expected to be
-        between [0, 1].
-
-    y_pred_b: numpy.array, shape=(n_samples_b)
-        Predictions of classifier b. he predictions are expected to be
-        between [0, 1]. If y_true_b is not provided, the
-        sample must be of the same length as sample a.
-
-    y_true : numpy.array, shape=(n_samples_a)
-        True labels for sample_a. If y_true_b is not provided, it is
-        also used as the true labels for sample b
-
-    y_true_b : None numpy.array, shape=(n_samples_b), optional
-        True labels for sample_b. If None y_true is used as labels for
-        sample b.
-
-    alpha : float, optional (default=0.05)
-        Significance for the Kolmogorov Smirnov test.
-
-    scale : boolean, optional (default=False)
-        Wether the predictions should be to the interval [0,1].
-
-    Returns
-    -------
-    passed: bool
-        True if test is accepted. False if the test is rejected. A
-        rejection has the error rate alpha.
-
-    op_point_a: numpy.array, shape=(2,2)
-        [False positive rate, True positive rate] Rate at the operation
-        points of both KS test for sample a.
-
-    op_point_b: numpy.array, shape=(2,2)
-        [False positive rate, True positive rate] Rate at the operation
-        points of both KS test for sample b.
-
-    fpr_b: numpy.array
-        False positive rate for sample b at the thresholds.
-
-    tpr_b: numpy.array
-        True positive rate for sample b at the thresholds.
-
-    threshold: numpy.array
-        Thresholds to the false/true positive rates.
-    """
-
-    bincount_y = np.bincount(y_true)
-    num_positive_a = bincount_y[1]
-    num_negative_a = bincount_y[0]
-    if y_true_b is not None:
-        bincount_y = np.bincount(y_true_b)
-        num_positive_b = bincount_y[1]
-        num_negative_b = bincount_y[0]
-    else:
-        y_true_b = y_true
-        num_positive_b = num_positive_a
-        num_negative_b = num_negative_a
-    if scale:
-        min_pred_a = np.min(y_pred_a)
-        max_pred_a = np.max(y_pred_a)
-        y_pred_a = (y_pred_a - min_pred_a) / (max_pred_a - min_pred_a)
-
-        min_pred_b = np.min(y_pred_b)
-        max_pred_b = np.max(y_pred_b)
-        y_pred_b = (y_pred_b - min_pred_b) / (max_pred_b - min_pred_b)
-
-
-    fpr_a, tpr_a, thresholds_a = roc_curve(y_true,
-                                           y_pred_a,
-                                           drop_intermediate=True)
-    fpr_b, tpr_b, thresholds_b = roc_curve(y_true_b,
-                                           y_pred_b,
-                                           drop_intermediate=True)
-
-    thresholds = np.sort(np.unique(np.hstack((thresholds_a, thresholds_b))))
-
-    order_a = np.argsort(thresholds_a)
-    thresholds_a = thresholds_a[order_a]
-    fpr_a = fpr_a[order_a]
-    tpr_a = tpr_a[order_a]
-
-    order_b = np.argsort(thresholds_b)
-    thresholds_b = thresholds_b[order_b]
-    fpr_b = fpr_b[order_b]
-    tpr_b = tpr_b[order_b]
-
-    fpr_a_full = np.ones_like(thresholds)
-    tpr_a_full = np.ones_like(thresholds)
-    fpr_b_full = np.ones_like(thresholds)
-    tpr_b_full = np.ones_like(thresholds)
-    pointer_a = -1
-    pointer_b = -1
-
-    for i, t_i in enumerate(thresholds):
-        if pointer_a + 1 < len(thresholds_a):
-            if t_i == thresholds_a[pointer_a + 1]:
-                pointer_a += 1
-            fpr_a_full[i] = fpr_a[pointer_a]
-            tpr_a_full[i] = tpr_a[pointer_a]
-            if pointer_a == -1:
-                fpr_a_full[i] = 1.
-                tpr_a_full[i] = 1.
-        else:
-            fpr_a_full[i] = 0.
-            tpr_a_full[i] = 0.
-
-        if pointer_b + 1 < len(thresholds_b):
-            if t_i == thresholds_b[pointer_b + 1]:
-                pointer_b += 1
-            fpr_b_full[i] = fpr_b[pointer_b]
-            tpr_b_full[i] = tpr_b[pointer_b]
-            if pointer_b == -1:
-                fpr_b_full[i] = 1.
-                tpr_b_full[i] = 1.
-        else:
-            fpr_b_full[i] = 0.
-            tpr_b_full[i] = 0.
-
-    passed_neg, idx_max_neg, dist_max_neg = kstest_2sample(
-        x=thresholds,
-        cdf_a=fpr_a_full,
-        cdf_b=fpr_b_full,
-        n_a=num_negative_a,
-        n_b=num_negative_b,
-        alpha=alpha)
-
-    passed_pos, idx_max_pos, dist_max_pos = kstest_2sample(
-        x=thresholds,
-        cdf_a=tpr_a_full,
-        cdf_b=tpr_b_full,
-        n_a=num_positive_a,
-        n_b=num_positive_b,
-        alpha=alpha)
-
-    op_point_n = np.array([[fpr_a_full[idx_max_neg], fpr_b_full[idx_max_neg]],
-                           [tpr_a_full[idx_max_neg], tpr_b_full[idx_max_neg]]])
-    op_point_p = np.array([[fpr_a_full[idx_max_pos], fpr_b_full[idx_max_pos]],
-                           [tpr_a_full[idx_max_pos], tpr_b_full[idx_max_pos]]])
-
-    passed = np.logical_and(passed_pos, passed_neg)
-
-    return passed, op_point_n, op_point_p, \
-        fpr_a_full, tpr_a_full, fpr_b_full, tpr_b_full, thresholds
+__all__ = ['feature_importance_mad',
+           'feature_importance_mad_majority',
+           'roc_curve_equivalence_ks_test']
diff --git a/disteval/evaluation/feature_importance_test.py b/disteval/evaluation/feature_importance_test.py
index 6b7375f..8483f16 100644
--- a/disteval/evaluation/feature_importance_test.py
+++ b/disteval/evaluation/feature_importance_test.py
@@ -1,9 +1,7 @@
 # -*- coding:utf-8 -*-
 import numpy as np
 
-from sklearn.metrics import roc_curve
-
-from ..scripts.classifier_characteristics import ClassifierCharacteristics
+from ..basics.classifier_characteristics import ClassifierCharacteristics
 
 
 def feature_importance_mad(clf, alpha=0.05):
diff --git a/disteval/evaluation/roc_curve_equivalence_test.py b/disteval/evaluation/roc_curve_equivalence_test.py
index 9120ef8..230d084 100644
--- a/disteval/evaluation/roc_curve_equivalence_test.py
+++ b/disteval/evaluation/roc_curve_equivalence_test.py
@@ -4,6 +4,7 @@
 """
 import numpy as np
+from sklearn.metrics import roc_curve
 
 
 def kstest_2sample(x, cdf_a, cdf_b, n_a, n_b, alpha=0.05):
     """Function evaluating the Kolmogorov Smirrnoff Test. Variable
@@ -53,3 +54,162 @@
     passed = factor * d_max <= K_alpha
 
     return passed, idx_max, d_max
+
+
+def roc_curve_equivalence_ks_test(y_pred_a,
+                                  y_pred_b,
+                                  y_true,
+                                  y_true_b=None,
+                                  alpha=0.05,
+                                  scale=False):
+    """Function evaluating the equivalence between the ROC curves of
+    two classifiers. The method is described by Andrew P. Bradley in
+    "ROC curve equivalence using the Kolmogorov-Smirnov test"
+    DOI: 10.1016/j.patrec.2012.12.021
+
+    Parameters
+    ----------
+    y_pred_a: numpy.array, shape=(n_samples_a)
+        Predictions of classifier a. The predictions are expected to be
+        between [0, 1].
+
+    y_pred_b: numpy.array, shape=(n_samples_b)
+        Predictions of classifier b. The predictions are expected to be
+        between [0, 1]. If y_true_b is not provided, the
+        sample must be of the same length as sample a.
+
+    y_true : numpy.array, shape=(n_samples_a)
+        True labels for sample_a. If y_true_b is not provided, it is
+        also used as the true labels for sample b.
+
+    y_true_b : None or numpy.array, shape=(n_samples_b), optional
+        True labels for sample_b. If None, y_true is used as labels for
+        sample b.
+
+    alpha : float, optional (default=0.05)
+        Significance level for the Kolmogorov-Smirnov test.
+
+    scale : boolean, optional (default=False)
+        Whether the predictions should be scaled to the interval [0, 1].
+
+    Returns
+    -------
+    passed: bool
+        True if the test is accepted, False if it is rejected. A
+        rejection has the error rate alpha.
+
+    op_point_n: numpy.array, shape=(2,2)
+        [[fpr_a, fpr_b], [tpr_a, tpr_b]] at the operating point of the
+        KS test on the false positive rates (negative class).
+
+    op_point_p: numpy.array, shape=(2,2)
+        [[fpr_a, fpr_b], [tpr_a, tpr_b]] at the operating point of the
+        KS test on the true positive rates (positive class).
+
+    fpr_a: numpy.array
+        False positive rate for sample a at the thresholds.
+
+    tpr_a: numpy.array
+        True positive rate for sample a at the thresholds.
+
+    fpr_b: numpy.array
+        False positive rate for sample b at the thresholds.
+
+    tpr_b: numpy.array
+        True positive rate for sample b at the thresholds.
+
+    thresholds: numpy.array
+        Thresholds to the false/true positive rates.
+    """
+
+    bincount_y = np.bincount(y_true)
+    num_positive_a = bincount_y[1]
+    num_negative_a = bincount_y[0]
+    if y_true_b is not None:
+        bincount_y = np.bincount(y_true_b)
+        num_positive_b = bincount_y[1]
+        num_negative_b = bincount_y[0]
+    else:
+        y_true_b = y_true
+        num_positive_b = num_positive_a
+        num_negative_b = num_negative_a
+    if scale:
+        min_pred_a = np.min(y_pred_a)
+        max_pred_a = np.max(y_pred_a)
+        y_pred_a = (y_pred_a - min_pred_a) / (max_pred_a - min_pred_a)
+
+        min_pred_b = np.min(y_pred_b)
+        max_pred_b = np.max(y_pred_b)
+        y_pred_b = (y_pred_b - min_pred_b) / (max_pred_b - min_pred_b)
+
+    fpr_a, tpr_a, thresholds_a = roc_curve(y_true,
+                                           y_pred_a,
+                                           drop_intermediate=True)
+    fpr_b, tpr_b, thresholds_b = roc_curve(y_true_b,
+                                           y_pred_b,
+                                           drop_intermediate=True)
+
+    thresholds = np.sort(np.unique(np.hstack((thresholds_a, thresholds_b))))
+
+    order_a = np.argsort(thresholds_a)
+    thresholds_a = thresholds_a[order_a]
+    fpr_a = fpr_a[order_a]
+    tpr_a = tpr_a[order_a]
+
+    order_b = np.argsort(thresholds_b)
+    thresholds_b = thresholds_b[order_b]
+    fpr_b = fpr_b[order_b]
+    tpr_b = tpr_b[order_b]
+
+    fpr_a_full = np.ones_like(thresholds)
+    tpr_a_full = np.ones_like(thresholds)
+    fpr_b_full = np.ones_like(thresholds)
+    tpr_b_full = np.ones_like(thresholds)
+    pointer_a = -1
+    pointer_b = -1
+
+    for i, t_i in enumerate(thresholds):
+        if pointer_a + 1 < len(thresholds_a):
+            if t_i == thresholds_a[pointer_a + 1]:
+                pointer_a += 1
+            fpr_a_full[i] = fpr_a[pointer_a]
+            tpr_a_full[i] = tpr_a[pointer_a]
+            if pointer_a == -1:
+                fpr_a_full[i] = 1.
+                tpr_a_full[i] = 1.
+        else:
+            fpr_a_full[i] = 0.
+            tpr_a_full[i] = 0.
+
+        if pointer_b + 1 < len(thresholds_b):
+            if t_i == thresholds_b[pointer_b + 1]:
+                pointer_b += 1
+            fpr_b_full[i] = fpr_b[pointer_b]
+            tpr_b_full[i] = tpr_b[pointer_b]
+            if pointer_b == -1:
+                fpr_b_full[i] = 1.
+                tpr_b_full[i] = 1.
+        else:
+            fpr_b_full[i] = 0.
+            tpr_b_full[i] = 0.
+
+    passed_neg, idx_max_neg, dist_max_neg = kstest_2sample(
+        x=thresholds,
+        cdf_a=fpr_a_full,
+        cdf_b=fpr_b_full,
+        n_a=num_negative_a,
+        n_b=num_negative_b,
+        alpha=alpha)
+
+    passed_pos, idx_max_pos, dist_max_pos = kstest_2sample(
+        x=thresholds,
+        cdf_a=tpr_a_full,
+        cdf_b=tpr_b_full,
+        n_a=num_positive_a,
+        n_b=num_positive_b,
+        alpha=alpha)
+
+    op_point_n = np.array([[fpr_a_full[idx_max_neg], fpr_b_full[idx_max_neg]],
+                           [tpr_a_full[idx_max_neg], tpr_b_full[idx_max_neg]]])
+    op_point_p = np.array([[fpr_a_full[idx_max_pos], fpr_b_full[idx_max_pos]],
+                           [tpr_a_full[idx_max_pos], tpr_b_full[idx_max_pos]]])
+
+    passed = np.logical_and(passed_pos, passed_neg)
+
+    return passed, op_point_n, op_point_p, \
+        fpr_a_full, tpr_a_full, fpr_b_full, tpr_b_full, thresholds
diff --git a/disteval/recursive_selection_parallel.py b/disteval/recursive_selection_parallel.py
index 1db8f08..af60a1d 100644
--- a/disteval/recursive_selection_parallel.py
+++ b/disteval/recursive_selection_parallel.py
@@ -1,5 +1,6 @@
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from logging import getLogger
+
 from concurrent.futures import ProcessPoolExecutor, wait
 
 import numpy as np
@@ -7,6 +8,146 @@
 from sklearn.metrics import roc_auc_score
 from sklearn.model_selection import StratifiedKFold
 
+from .basics.classifier_characteristics import ClassifierCharacteristics
+
+logger = getLogger('disteval.recursive_selection')
+
+
+def recursive_feature_selection_roc_auc(clf,
+                                        X,
+                                        y,
+                                        sample_weight=None,
+                                        n_features=10,
+                                        cv_steps=10,
+                                        n_jobs=1,
+                                        forward=True,
+                                        matching_features=True):
+    """Method building a feature set in a recursive fashion. Depending
+    on the setting it is run as a forward selection/backward elimination
+    searching for a set of n features with the highest/lowest mismatch.
+    To get the set of size n starting from n_total features the
+    following approaches are used:
+
+    Forward Selection:
+    To get the k+1 set every not yet selected feature is used to
+    generate (n_total - k) sets. The set with the best score is the
+    k + 1 set. Those steps are repeated until n features are selected.
+
+    Backward Elimination:
+    To get k+1 eliminated features every not yet eliminated feature is used
+    to generate (n_total - k) sets. The sets consist of all not yet
+    eliminated features minus the one that is tested. The set with the
+    best score determines the next feature to eliminate. Those steps are
+    repeated until n features are eliminated.
+
+    What counts as the best score also depends on the settings:
+    matching_features:
+        forward: min(|auc - 0.5|)
+        not forward: max(|auc - 0.5|)
+
+    not matching_features:
+        forward: max(auc)
+        not forward: min(auc)
+
+
+    Parameters
+    ----------
+    clf: object
+        Classifier that should be used for the classification.
+        It needs a fit and a predict_proba function.
+
+    X : numpy.float32array, shape=(n_samples, n_obs)
+        Values describing the samples.
+
+    y : numpy.float32array, shape=(n_samples)
+        Array of the true labels.
+
+    sample_weight : None or numpy.float32array, shape=(n_samples)
+        If weights are used this has to contain the sample weights.
+        None in the case of no weights.
+
+    n_features : int, optional (default=10)
+        Number of features that are selected (forward=True) or eliminated
+        (forward=False).
+
+    n_jobs: int, optional (default=1)
+        Number of parallel jobs spawned for each classification run.
+        Total number of used cores is the product of n_jobs from the clf
+        and the n_jobs of this function.
+
+    forward: bool, optional (default=True)
+        If True it is a 'forward selection'. If False it is a 'backward
+        elimination'.
+
+    matching_features: bool, optional (default=True)
+        Whether matching or mismatching features should be searched for.
+
+    Returns
+    -------
+    selected_features: list of ints
+        List containing the indices of X that were selected/eliminated.
+        The order corresponds to the order in which the features were
+        selected/eliminated.
+
+    auc_scores: np.array, shape=(n_features_total, n_features)
+        Array containing the AUC values for all steps. np.nan if the
+        feature was already selected/eliminated in the specific step.
+    """
+    desired_characteristics = ClassifierCharacteristics()
+    desired_characteristics.opts['callable:fit'] = True
+    desired_characteristics.opts['callable:predict_proba'] = True
+
+    clf_characteristics = ClassifierCharacteristics(clf)
+    assert clf_characteristics.fulfilling(desired_characteristics), \
+        'Classifier sanity check failed!'
+
+    if n_features > X.shape[1]:
+        logger.info(' \'n_features\' higher than total number of features.'
+                    ' \'n_features\' reduced!')
+        n_features = X.shape[1]
+    auc_scores = np.zeros((X.shape[1], n_features))
+    selected_features = []
+
+    while len(selected_features) != n_features:
+        auc_scores_i = get_all_auc_scores(clf,
+                                          selected_features,
+                                          X,
+                                          y,
+                                          sample_weight=sample_weight,
+                                          cv_steps=cv_steps,
+                                          n_jobs=n_jobs,
+                                          forward=forward)
+        value_best = None
+        index_best = None
+        for idx, auc in enumerate(auc_scores_i):
+            if not np.isfinite(auc):
+                continue
+            if value_best is None:
+                value_best = auc
+                index_best = idx
+            if matching_features:
+                if forward:
+                    if np.abs(auc - 0.5) < np.abs(value_best - 0.5):
+                        value_best = auc
+                        index_best = idx
+                else:
+                    if np.abs(auc - 0.5) > np.abs(value_best - 0.5):
+                        value_best = auc
+                        index_best = idx
+            else:
+                if forward:
+                    if auc > value_best:
+                        value_best = auc
+                        index_best = idx
+                else:
+                    if auc < value_best:
+                        value_best = auc
+                        index_best = idx
+        auc_scores[:, len(selected_features)] = auc_scores_i
+        selected_features.append(index_best)
+    return selected_features, auc_scores
+
+
 def __single_auc_score__(feature_i,
                          clf,
@@ -184,3 +325,6 @@
                                        sample_weight=sample_weight)
             auc_scores[feature_i] = auc
     return auc_scores
+
+
+
diff --git a/disteval/visualization/__init__.py b/disteval/visualization/__init__.py
index 48e7e53..0786f5e 100644
--- a/disteval/visualization/__init__.py
+++ b/disteval/visualization/__init__.py
@@ -4,7 +4,7 @@
 Collection of methods to visualize the results of disteval functions
 """
 from .feature_importance_test import visualize_feature_importance_mad
-from .roc_curve_equivalence_test import roc_curve_equivalence_ks_test
+from .roc_curve_equivalence_test import visualize_roc_curve_equivalence_test
 from .comparison_plotter import ComparisonPlotter
 
 __all__ = ['visualize_feature_importance_mad',
diff --git a/examples/roc_curve_equivalence.py b/examples/roc_curve_equivalence.py
index 133f51a..ae51c2d 100644
--- a/examples/roc_curve_equivalence.py
+++ b/examples/roc_curve_equivalence.py
@@ -4,6 +4,7 @@
 '''
 import logging
 import matplotlib
+matplotlib.use('Agg')
 import numpy as np
 from sklearn.datasets import make_classification
@@ -13,8 +14,8 @@
 from disteval import evaluation as eval
 from disteval import visualization as visu
 
-log = logging.getLogger("disteval.fact_example")
-matplotlib.use('Agg')
+log = logging.getLogger("disteval.roc_curve_equivalence_example")
+
 
 def main():
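
For orientation, the following is a minimal usage sketch of the public API as re-exported by the new disteval/__init__.py above. The synthetic data, the RandomForestClassifier and all parameter values are illustrative assumptions and not part of this changeset; any classifier providing fit, predict_proba and feature_importances_ should work the same way.

# Hypothetical usage sketch (not part of the diff): test-vs-reference
# classification followed by the MAD-based feature importance check.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from disteval import cv_test_ref_classification
from disteval import evaluation as eval

# Stand-in for real test/reference data: y == 1 marks "test-like" samples.
X, y = make_classification(n_samples=2000, n_features=10, random_state=0)

clf = RandomForestClassifier(n_estimators=50, random_state=0)
# return_all_models=True yields one trained classifier per CV step.
clfs, y_pred, cv_step = cv_test_ref_classification(
    clf, X, y, cv_steps=5, return_all_models=True, random_state=0)

# Majority vote of the MAD criterion over the cross-validation models.
kept, importance, importance_std = eval.feature_importance_mad_majority(
    clfs, ratio=0.9, alpha=0.10)
print('features passing the MAD criteria:', np.flatnonzero(kept))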
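
A second sketch, continuing the assumptions above, for the ROC curve equivalence test that this diff moves into disteval/evaluation/roc_curve_equivalence_test.py; the LogisticRegression comparison model is again only an illustrative choice.

# Hypothetical continuation of the sketch above: compare the ROC curves of
# two classifiers with the Kolmogorov-Smirnov based equivalence test.
from sklearn.linear_model import LogisticRegression

clf_a = RandomForestClassifier(n_estimators=50, random_state=1).fit(X, y)
clf_b = LogisticRegression(max_iter=1000).fit(X, y)

result = eval.roc_curve_equivalence_ks_test(clf_a.predict_proba(X)[:, 1],
                                            clf_b.predict_proba(X)[:, 1],
                                            y_true=y,
                                            alpha=0.05)
passed = result[0]  # remaining entries: operating points, rates, thresholds
print('ROC curves equivalent within alpha=0.05:', bool(passed))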