From 0a312433bd18e91200c7c9bb66194d7669f50a3c Mon Sep 17 00:00:00 2001
From: Feelx234 <52208598+Feelx234@users.noreply.github.com>
Date: Tue, 19 Sep 2023 17:32:15 +0200
Subject: [PATCH] Improved Generalization Aware Qualities (#52)

Greatly improved the generalization-aware quality functions
+ cleanup
---
 .gitignore                                   |   6 +-
 docs/sections/components/gp_growth.rst       |   7 +-
 setup.cfg                                    |   4 +-
 src/pysubgroup/algorithms.py                 |  42 +--
 src/pysubgroup/binary_target.py              | 142 +++++++++-
 src/pysubgroup/measures.py                   |  44 +--
 src/pysubgroup/numeric_target.py             |  96 ++++---
 src/pysubgroup/subgroup_description.py       |   4 +-
 src/pysubgroup/utils.py                      |   6 +-
 tests/algorithms_testing.py                  |   2 +-
 tests/test_algorithms_boolean.py             |   3 -
 tests/test_algorithms_boolean_constraints.py |   3 -
 tests/test_algorithms_numeric.py             |   8 +-
 tests/test_generalisation_aware.py           | 276 ++++++++++++++++---
 tests/test_generalisations.py                |   5 +-
 tests/test_representations.py                |  60 ++--
 16 files changed, 513 insertions(+), 195 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4885096..18805a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,7 @@ test_gp_model.txt
 program.prof
 import.log
 doc/_build
-Untitled.ipynb
+
 
 
 # Temporary and binary files
@@ -69,3 +69,7 @@ MANIFEST
 .venv*/
 .conda*/
 .python-version
+
+
+Untitled1.ipynb
+Untitled.ipynb
diff --git a/docs/sections/components/gp_growth.rst b/docs/sections/components/gp_growth.rst
index 63485d5..1b4b2f3 100644
--- a/docs/sections/components/gp_growth.rst
+++ b/docs/sections/components/gp_growth.rst
@@ -23,12 +23,11 @@ The basic usage of the gp-growth algorithm is not very different from the usage
     from pysubgroup.datasets import get_titanic_data
     data = get_titanic_data()
 
-    target = ps.NominalSelector ('Survived', True)
+    target = ps.BinaryTarget ('Survived', True)
     searchspace = ps.create_selectors(data, ignore=['Survived'])
-    task = ps.SubgroupDiscoveryTask (data, target, dearchspace, result_set_size=5, depth=2, qf=ps.WRAccQF())
-    GpGrowth.execute(task)
+    task = ps.SubgroupDiscoveryTask (data, target, searchspace, result_set_size=5, depth=2, qf=ps.WRAccQF())
+    result = ps.GpGrowth().execute(task)
 
-But beware that gp-growth is using an exhaustive search strategy! This can greatly increase the runtime for high search depth.
 You can specify the :code:`mode` argument in the constructor of GpGrowth to run gp-growth either bottom up (:code:`mode='b_u'`) or top down (:code:`mode='b_u'`).
 As gp growth is a generalisation of fp-growth you can also perform standard fp-growth using gp_growth by using the CountQF (:ref:`countqf`) quality function.
 
diff --git a/setup.cfg b/setup.cfg
index 72f2a45..f4cc25b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -110,8 +110,8 @@ norecursedirs =
     .tox
 testpaths = tests
 # Use pytest markers to select/deselect specific tests
-# markers =
-#     slow: mark tests as slow (deselect with '-m "not slow"')
+markers =
+    slow: mark tests as slow (deselect with '-m "not slow"')
 #     system: mark end-to-end system tests
 
 [devpi:upload]
diff --git a/src/pysubgroup/algorithms.py b/src/pysubgroup/algorithms.py
index be9132b..0ba3a9a 100644
--- a/src/pysubgroup/algorithms.py
+++ b/src/pysubgroup/algorithms.py
@@ -55,6 +55,27 @@ def constraints_satisfied(constraints, subgroup, statistics=None, data=None):
     )
 
 
+try:  # pragma: no cover
+    from numba import (  # pylint: disable=import-error, import-outside-toplevel
+        int32,
+        int64,
+        njit,
+    )
+
+    @njit([(int32[:, :], int64[:])], cache=True)
+    def getNewCandidates(candidates, hashes):  # pragma: no cover
+        result = []
+        for i in range(len(candidates) - 1):
+            for j in range(i + 1, len(candidates)):
+                if hashes[i] == hashes[j]:
+                    if np.all(candidates[i, :-1] == candidates[j, :-1]):
+                        result.append((i, j))
+        return result
+
+except ImportError:  # pragma: no cover
+    pass
+
+
 class Apriori:
     def __init__(
         self, representation_type=None, combination_name="Conjunction", use_numba=True
@@ -133,20 +154,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
         return promising_candidates
 
     def get_next_level_numba(self, promising_candidates):
-        from numba import njit  # pylint: disable=import-error, import-outside-toplevel
-
         if not hasattr(self, "compiled_func") or self.compiled_func is None:
-
-            @njit
-            def getNewCandidates(candidates, hashes):  # pragma: no cover
-                result = []
-                for i in range(len(candidates) - 1):
-                    for j in range(i + 1, len(candidates)):
-                        if hashes[i] == hashes[j]:
-                            if np.all(candidates[i, :-1] == candidates[j, :-1]):
-                                result.append((i, j))
-                return result
-
             self.compiled_func = getNewCandidates
 
         all_selectors = Counter(chain.from_iterable(promising_candidates))
@@ -182,7 +190,9 @@ def execute(self, task):
         if not isinstance(
             task.qf, ps.BoundedInterestingnessMeasure
         ):  # pragma: no cover
-            raise RuntimeWarning("Quality function is unbounded, long runtime expected")
+            warnings.warn(
+                "Quality function is unbounded, long runtime expected", RuntimeWarning
+            )
 
         task.qf.calculate_constant_statistics(task.data, task.target)
 
@@ -302,7 +312,7 @@ def execute(self, task):
 
             sg = candidate_description
             statistics = task.qf.calculate_statistics(sg, task.target, task.data)
-            quality = task.qf.evaluate(sg, statistics)
+            quality = task.qf.evaluate(sg, task.target, task.data, statistics)
             ps.add_if_required(result, sg, quality, task, statistics=statistics)
 
             qual = ps.minimum_required_quality(result, task)
@@ -336,8 +346,6 @@ def execute(self, task):
                 self.discarded[len(candidate_description)] += 1
 
         result.sort(key=lambda x: x[0], reverse=True)
-        for qual, sg in result:
-            print(f"{qual} {sg}")
         print("discarded " + str(self.discarded))
         return ps.SubgroupDiscoveryResult(result, task)
 
diff --git a/src/pysubgroup/binary_target.py b/src/pysubgroup/binary_target.py
index 5640788..25433f6 100644
--- a/src/pysubgroup/binary_target.py
+++ b/src/pysubgroup/binary_target.py
@@ -455,31 +455,143 @@ def __init__(self):
 #####
 # GeneralizationAware Interestingness Measures
 #####
-class GeneralizationAware_StandardQF(GeneralizationAwareQF_stats):
-    def __init__(self, a):
-        super().__init__(StandardQF(0))
+class GeneralizationAware_StandardQF(
+    GeneralizationAwareQF_stats, BoundedInterestingnessMeasure
+):
+    ga_sQF_agg_tuple = namedtuple(
+        "ga_sQF_agg_tuple", ["max_p", "min_delta_negatives", "min_negatives"]
+    )
+
+    def __init__(self, a, optimistic_estimate_strategy="default"):
+        super().__init__(StandardQF(a))
+        if optimistic_estimate_strategy in ("default", "difference"):
+            self.optimistic_estimate = self.difference_based_optimistic_estimate
+            self.aggregate_statistics = self.difference_based_agg_function
+            self.read_p = self.difference_based_read_p
+        elif optimistic_estimate_strategy == "max":
+            self.optimistic_estimate = self.max_based_optimistic_estimate
+            self.aggregate_statistics = self.max_based_aggregate_statistics
+            self.read_p = self.max_based_read_p
+        else:
+            raise ValueError(
+                "optimistic_estimate_strategy should be one of "
+                "('default', 'max', 'difference')"
+            )
         self.a = a
 
-    def get_max(self, *args):
-        max_ratio = 0.0
+    def evaluate(self, subgroup, target, data, statistics=None):
+        statistics = self.ensure_statistics(subgroup, target, data, statistics)
+        sg_stats = statistics.subgroup_stats
+        general_stats = statistics.generalisation_stats
+        if sg_stats.size_sg == 0:
+            return np.nan
+        sg_ratio = sg_stats.positives_count / sg_stats.size_sg
+        return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * (
+            sg_ratio - self.read_p(general_stats)
+        )
+
+    def max_based_aggregate_statistics(self, stats_subgroup, list_of_pairs):
+        if len(list_of_pairs) == 0:
+            return stats_subgroup
+        max_ratio = -100
         max_stats = None
-        for stat in args:
-            assert stat.size_sg > 0
-            ratio = stat.positives_count / stat.size_sg
-            if ratio > max_ratio:
-                max_ratio = ratio
-                max_stats = stat
+        for pair in list_of_pairs:
+            ratio = -np.inf
+            for agg_stat in pair:
+                if agg_stat.size_sg == 0:  # pragma: no cover
+                    continue
+                ratio = agg_stat.positives_count / agg_stat.size_sg
+                if ratio > max_ratio:
+                    max_ratio = ratio
+                    max_stats = agg_stat
+
         return max_stats
 
-    def evaluate(self, subgroup, target, data, statistics=None):
+    def max_based_optimistic_estimate(self, subgroup, target, data, statistics=None):
+        """
+        Computes the oe as the hypothetical subgroup containing only positive instances
+        """
         statistics = self.ensure_statistics(subgroup, target, data, statistics)
         sg_stats = statistics.subgroup_stats
         general_stats = statistics.generalisation_stats
         if sg_stats.size_sg == 0 or general_stats.size_sg == 0:
             return np.nan
 
-        sg_ratio = sg_stats.positives_count / sg_stats.size_sg
         general_ratio = general_stats.positives_count / general_stats.size_sg
-        return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * (
-            sg_ratio - general_ratio
+        return (sg_stats.positives_count / self.stats0.size_sg) ** self.a * (
+            1 - general_ratio
         )
+
+    def max_based_read_p(self, agg_tuple):
+        return agg_tuple.positives_count / agg_tuple.size_sg
+
+    def difference_based_optimistic_estimate(self, subgroup, target, data, statistics):
+        sg_stats, agg_stats = self.ensure_statistics(subgroup, target, data, statistics)
+        if np.isposinf(agg_stats.min_delta_negatives):
+            return np.inf
+        delta_n = agg_stats.min_delta_negatives
+        size_dataset = self.qf.dataset_statistics.size_sg
+        tau_diff = 0
+        if self.qf.a == 0:
+            pos = 1
+            # return delta_n /(1 + delta_n)
+        elif self.qf.a == 1.0:
+            pos = sg_stats.positives_count
+            # return pos / size_dataset * delta_n /(pos + delta_n)
+        else:
+            a = self.qf.a
+            p_hat = min(np.ceil(a * delta_n / (1 - a)), sg_stats.positives_count)
+            pos = p_hat
+            # return (p_hat / size_dataset) ** a * delta_n /(p_hat+delta_n)
+        tau_diff = pos / (pos + delta_n)
+        if sg_stats.size_sg > 0:
+            tau_sg = sg_stats.positives_count / sg_stats.size_sg
+        else:
+            tau_sg = -1
+        tau_max = max(tau_diff, tau_sg, agg_stats.max_p)
+        return (sg_stats.positives_count / size_dataset) ** self.a * (1 - tau_max)
+
+    def difference_based_agg_function(self, stats_subgroup, list_of_pairs):
+        """
+        list_of_pairs is a list of (stats, agg_tuple) for all the generalizations
+        """
+
+        def get_negatives_count(sg_stats):
+            return sg_stats.size_sg - sg_stats.positives_count
+
+        def get_percentage_positives(sg_stats):
+            if sg_stats.size_sg == 0:
+                return np.nan
+            return sg_stats.positives_count / sg_stats.size_sg
+
+        if len(list_of_pairs) == 0:  # empty pattern
+            return GeneralizationAware_StandardQF.ga_sQF_agg_tuple(
+                get_percentage_positives(stats_subgroup), np.infty, np.infty
+            )
+
+        subgroup_negatives = stats_subgroup.size_sg - stats_subgroup.positives_count
+        min_immediate_generalizations_negatives = min(
+            get_negatives_count(x.subgroup_stats) for x in list_of_pairs
+        )
+        min_immediate_generalizations_delta_negatives = min(
+            x.generalisation_stats.min_delta_negatives for x in list_of_pairs
+        )
+        max_percentage_positives = max(
+            max(
+                get_percentage_positives(x.subgroup_stats), x.generalisation_stats.max_p
+            )
+            for x in list_of_pairs
+        )
+
+        sg_delta_negatives = (
+            min_immediate_generalizations_negatives - subgroup_negatives
+        )
+        min_delta_negatives = min(
+            sg_delta_negatives, min_immediate_generalizations_delta_negatives
+        )
+        return GeneralizationAware_StandardQF.ga_sQF_agg_tuple(
+            max_percentage_positives, min_delta_negatives, sg_delta_negatives
+        )
+
+    def difference_based_read_p(self, agg_tuple):
+        return agg_tuple.max_p
diff --git a/src/pysubgroup/measures.py b/src/pysubgroup/measures.py
index bae33c5..7fd8372 100644
--- a/src/pysubgroup/measures.py
+++ b/src/pysubgroup/measures.py
@@ -190,6 +190,10 @@ def __hasattr__(self, name):
 # GeneralizationAware Interestingness Measures
 #####
 class GeneralizationAwareQF(AbstractInterestingnessMeasure):
+    """A class that computes the generalization aware qf as follows:
+    ga_qf(sg) = qf(sg) - max_{sg' in generalizations of sg} qf(sg')
+    """
+
     ga_tuple = namedtuple("ga_tuple", ["subgroup_quality", "generalisation_quality"])
 
     def __init__(self, qf):
@@ -241,6 +245,8 @@ def evaluate(self, subgroup, target, data, statistics=None):
 # GeneralizationAware Interestingness Measures
 #####
 class GeneralizationAwareQF_stats(AbstractInterestingnessMeasure):
+    """An abstract base class that implements aggregation of stats of generalisations"""
+
     ga_tuple = namedtuple("ga_stats_tuple", ["subgroup_stats", "generalisation_stats"])
 
     def __init__(self, qf):
@@ -263,30 +269,34 @@ def calculate_constant_statistics(self, data, target):
     def calculate_statistics(self, subgroup, target, data, statistics=None):
         sg_repr = repr(subgroup)
         if sg_repr in self.cache:
-            return GeneralizationAwareQF_stats.ga_tuple(*self.cache[sg_repr])
+            return self.cache[sg_repr]
 
-        (stats_sg, stats_prev) = self.get_stats_and_previous_stats(
-            subgroup, target, data
-        )
-        self.cache[sg_repr] = (stats_sg, stats_prev)
-        return GeneralizationAwareQF_stats.ga_tuple(stats_sg, stats_prev)
+        tpl = self.get_stats_and_previous_stats(subgroup, target, data)
+        self.cache[sg_repr] = tpl
+        return tpl
 
     def get_stats_and_previous_stats(self, subgroup, target, data):
         stats_subgroup = self.qf.calculate_statistics(subgroup, target, data)
-        max_stats = self.stats0
+        # pylint: disable=no-member
+        if len(subgroup.selectors) == 0:
+            return GeneralizationAwareQF_stats.ga_tuple(
+                stats_subgroup, self.aggregate_statistics(stats_subgroup, [])
+            )
+
         selectors = subgroup.selectors
-        if len(selectors) > 0:
-            # compute quality of all generalizations
-            generalizations = combinations(selectors, len(selectors) - 1)
+        immediate_generalizations = combinations(selectors, len(selectors) - 1)
 
-            for sels in generalizations:
-                sgd = ps.Conjunction(list(sels))
-                (stats_sg, stats_prev) = self.calculate_statistics(sgd, target, data)
-                max_stats = self.get_max(max_stats, stats_sg, stats_prev)
-        return (stats_subgroup, max_stats)
+        list_of_pairs = []
+        for sels in immediate_generalizations:
+            sgd = ps.Conjunction(list(sels))
+            list_of_pairs.append(self.calculate_statistics(sgd, target, data))
+        agg_stats = self.aggregate_statistics(stats_subgroup, list_of_pairs)
+        # pylint: enable=no-member
+        return GeneralizationAwareQF_stats.ga_tuple(stats_subgroup, agg_stats)
 
     def evaluate(self, subgroup, target, data, statistics=None):
         raise NotImplementedError
 
-    def get_max(self, *args):
-        raise NotImplementedError
+
+#    def aggregate_statistics(self, *args):
+#        raise NotImplementedError
diff --git a/src/pysubgroup/numeric_target.py b/src/pysubgroup/numeric_target.py
index 62973f4..b1ce680 100644
--- a/src/pysubgroup/numeric_target.py
+++ b/src/pysubgroup/numeric_target.py
@@ -197,7 +197,9 @@ def evaluate(self, subgroup, target, data, statistics=None):
             self.read_centroid(statistics),
         )
 
-    def calculate_statistics(self, subgroup, target, data, statistics=None):
+    def calculate_statistics(
+        self, subgroup, target, data, statistics=None
+    ):  # pylint: disable=unused-argument
         cover_arr, sg_size = ps.get_cover_array_and_size(
             subgroup, len(self.all_target_values), data
         )
@@ -218,12 +220,22 @@ def optimistic_estimate(self, subgroup, target, data, statistics=None):
         return statistics.estimate
 
     class Summation_Estimator:
+        r"""\
+        This estimator calculates the optimistic estimate as a hypothetical subgroup\
+         which contains only instances with value greater than the dataset mean and\
+         is of maximal size.
+        .. math::
+            oe(sg) = \sum_{x \in sg, T(x)>\mu_0} (T(x) - \mu_0)
+
+        From Florian Lemmerich's Dissertation [section 4.2.2.1, Theorem 2 (page 81)]
+        """
+
         def __init__(self, qf):
             self.qf = qf
             self.indices_greater_centroid = None
             self.target_values_greater_centroid = None
 
-        def get_data(self, data, target):
+        def get_data(self, data, target):  # pylint: disable=unused-argument
             return data
 
         def calculate_constant_statistics(
@@ -251,12 +263,19 @@ def get_estimate(
             )
 
     class Max_Estimator:
+        r"""
+        This estimator calculates the optimistic estimate
+        .. math::
+            oe(sg) = n_{>\mu_0}^a (T^{\max}(sg) - \mu_0)
+        From Florian Lemmerich's Dissertation [section 4.2.2.1, Theorem 4 (page 82)]
+        """
+
         def __init__(self, qf):
             self.qf = qf
             self.indices_greater_centroid = None
             self.target_values_greater_centroid = None
 
-        def get_data(self, data, target):
+        def get_data(self, data, target):  # pylint: disable=unused-argument
             return data
 
         def calculate_constant_statistics(
@@ -295,7 +314,9 @@ def get_data(self, data, target):
             data.sort_values(target.get_attributes()[0], ascending=False, inplace=True)
             return data
 
-        def calculate_constant_statistics(self, data, target):
+        def calculate_constant_statistics(
+            self, data, target
+        ):  # pylint: disable=unused-argument
             if not self.use_numba or self.numba_in_place:
                 return
             try:
@@ -402,7 +423,9 @@ def evaluate(self, subgroup, target, data, statistics=None):
             statistics.std,
         )
 
-    def calculate_statistics(self, subgroup, target, data, statistics=None):
+    def calculate_statistics(
+        self, subgroup, target, data, statistics=None
+    ):  # pylint: disable=unused-argument
         cover_arr, sg_size = ps.get_cover_array_and_size(
             subgroup, len(self.all_target_values), data
         )
@@ -423,35 +446,34 @@ def optimistic_estimate(self, subgroup, target, data, statistics=None):
         return statistics.estimate
 
 
-# TODO Update to new format
-# class GAStandardQFNumeric(ps.AbstractInterestingnessMeasure):
-#    def __init__(self, a, invert=False):
-#        self.a = a
-#        self.invert = invert
-#
-#    def evaluate_from_dataset(self, data, subgroup, weighting_attribute=None):
-#        (instances_dataset, _, instances_subgroup, mean_sg) = \
-#           subgroup.get_base_statistics(data, weighting_attribute)
-#        if instances_subgroup in (0, instances_dataset):
-#            return 0
-#        max_mean = get_max_generalization_mean(data, subgroup, weighting_attribute)
-#        relative_size = (instances_subgroup / instances_dataset)
-#        return ps.conditional_invert(
-#           relative_size ** self.a * (mean_sg - max_mean), self.invert)
-
-#    def supports_weights(self):
-#        return True
-
-#    def is_applicable(self, subgroup):
-#        return isinstance(subgroup.target, NumericTarget)
-
-
-# def get_max_generalization_mean(data, subgroup, weighting_attribute=None):
-#    selectors = subgroup.subgroup_description.selectors
-#    generalizations = ps.powerset(selectors)
-#    max_mean = 0
-#    for sels in generalizations:
-#        sg = ps.Subgroup(subgroup.target, ps.Conjunction(list(sels)))
-#        mean_sg = sg.get_base_statistics(data, weighting_attribute)[3]
-#        max_mean = max(max_mean, mean_sg)
-#    return max_mean
+class GeneralizationAware_StandardQFNumeric(ps.GeneralizationAwareQF_stats):
+    def __init__(self, a, invert=False, estimator="default", centroid="mean"):
+        super().__init__(
+            StandardQFNumeric(a, invert=invert, estimator=estimator, centroid=centroid)
+        )
+
+    def evaluate(self, subgroup, target, data, statistics=None):
+        statistics = self.ensure_statistics(subgroup, target, data, statistics)
+        sg_stats = statistics.subgroup_stats
+        general_stats = statistics.generalisation_stats
+        if sg_stats.size_sg == 0:
+            return np.nan
+        read_centroid = self.qf.read_centroid
+        return (sg_stats.size_sg / self.stats0.size_sg) ** self.qf.a * (
+            read_centroid(sg_stats) - read_centroid(general_stats)
+        )
+
+    def aggregate_statistics(self, stats_subgroup, list_of_pairs):
+        read_centroid = self.qf.read_centroid
+        if len(list_of_pairs) == 0:
+            return stats_subgroup
+        max_centroid = 0.0
+        max_stats = None
+        for stat, agg_stat in list_of_pairs:
+            if stat.size_sg == 0:
+                continue
+            centroid = max(read_centroid(agg_stat), read_centroid(stat))
+            if centroid > max_centroid:
+                max_centroid = centroid
+                max_stats = stat
+        return max_stats
diff --git a/src/pysubgroup/subgroup_description.py b/src/pysubgroup/subgroup_description.py
index 622fe02..ebe3cab 100644
--- a/src/pysubgroup/subgroup_description.py
+++ b/src/pysubgroup/subgroup_description.py
@@ -42,7 +42,7 @@ def __new__(cls, *args, **kwargs):
             if ref == tmp:
                 return ref
         # if not return
-        return tmp
+        return tmp  # pragma: no cover
 
     def __getnewargs_ex__(self):  # pylint: disable=invalid-getnewargs-ex-returned
         tmp_args = self.__new_args__
@@ -548,7 +548,7 @@ def _compute_repr(self):
         if not self._selectors:
             return "True"
         reprs = sorted(repr(sel) for sel in self._selectors)
-        return "".join(("(", " and ".join(reprs), ")"))
+        return "(" + " and ".join(reprs) + ")"
 
     def _compute_hash(self):
         return hash(repr(self))
diff --git a/src/pysubgroup/utils.py b/src/pysubgroup/utils.py
index 9984884..0f023c7 100644
--- a/src/pysubgroup/utils.py
+++ b/src/pysubgroup/utils.py
@@ -24,7 +24,7 @@ def minimum_required_quality(result, task):
 
 def prepare_subgroup_discovery_result(result, task):
     result_filtered = [tpl for tpl in result if tpl[0] > task.min_quality]
-    result_filtered.sort(key=lambda x: x[0], reverse=True)
+    result_filtered.sort(reverse=True)
     result_filtered = result_filtered[: task.result_set_size]
     return result_filtered
 
@@ -202,11 +202,13 @@ def intersect_of_ordered_list(list_1, list_2):
 
 class BaseTarget:
     def all_statistics_present(self, cached_statistics):
+        # pylint: disable=no-member
         if isinstance(cached_statistics, dict) and all(
             expected_value in cached_statistics
             for expected_value in self.__class__.statistic_types
-        ):  # pylint: disable=no-member
+        ):
             return True
+        # pylint: enable=no-member
         return False
 
 
diff --git a/tests/algorithms_testing.py b/tests/algorithms_testing.py
index 944d5a1..8faf735 100644
--- a/tests/algorithms_testing.py
+++ b/tests/algorithms_testing.py
@@ -20,7 +20,7 @@ def evaluate_result(self, algorithm_result, result, qualities):
             algorithm_result, qualities, result
         ):
             self.assertEqual(repr(algorithm_SG), repr(expected_SGD))
-            self.assertEqual(algorithm_q, expected_q)
+            self.assertAlmostEqual(algorithm_q, expected_q)
 
     def runAlgorithm(self, algorithm, name, result, qualities, task):
         print()
diff --git a/tests/test_algorithms_boolean.py b/tests/test_algorithms_boolean.py
index ee35406..6f92244 100644
--- a/tests/test_algorithms_boolean.py
+++ b/tests/test_algorithms_boolean.py
@@ -22,9 +22,6 @@ class TestSettings:
     GpGrowth = False
 
 
-skip_long_running = True
-
-
 class BooleanTargetBase:
     # pylint: disable=no-member
     @unittest.skipUnless(TestSettings.All or TestSettings.Apriori, "flag not set")
diff --git a/tests/test_algorithms_boolean_constraints.py b/tests/test_algorithms_boolean_constraints.py
index c45163d..443eba5 100644
--- a/tests/test_algorithms_boolean_constraints.py
+++ b/tests/test_algorithms_boolean_constraints.py
@@ -21,9 +21,6 @@ class TestSettings:
     SimpleSearch = False
 
 
-skip_long_running = True
-
-
 class BooleanTargetBase(TestAlgorithmsBase):
     # pylint: disable=no-member
     @unittest.skipUnless(TestSettings.All or TestSettings.Apriori, "flag not set")
diff --git a/tests/test_algorithms_numeric.py b/tests/test_algorithms_numeric.py
index 212c3fa..8b7c404 100644
--- a/tests/test_algorithms_numeric.py
+++ b/tests/test_algorithms_numeric.py
@@ -3,6 +3,7 @@
 import unittest
 from copy import copy
 
+import pandas as pd
 from algorithms_testing import TestAlgorithmsBase
 from t_utils import conjunctions_from_str
 
@@ -58,8 +59,8 @@ def setUp(self):
         l = conjunctions_from_str(
             """316646.0    job=='b'high qualif/self emp/mgmt''
    310615.0    foreign_worker=='b'yes'' AND job=='b'high qualif/self emp/mgmt''
-   297844.5    foreign_worker=='b'yes'' AND own_telephone=='b'yes'' AND property_magnitude=='b'no known property''
    297844.5    own_telephone=='b'yes'' AND property_magnitude=='b'no known property''
+    297844.5    foreign_worker=='b'yes'' AND own_telephone=='b'yes'' AND property_magnitude=='b'no known property''
    288480.5    job=='b'high qualif/self emp/mgmt'' AND own_telephone=='b'yes''
    283002.0    own_telephone=='b'yes''
    282217.5    class=='b'bad'' AND own_telephone=='b'yes''
@@ -248,7 +249,7 @@ def test_DFS_average(self):
 
     def test_DFS_order_with_numba(self):
         try:
-            import numba
+            import numba  # pylint: disable=import-outside-toplevel, unused-import
         except ImportError:
             self.skipTest("No numba installed")
         self.task.qf = ps.StandardQFNumeric(self.task.qf.a, False, "order")
@@ -339,9 +340,6 @@ def test_DFSNumeric(self):
     #   self.runAlgorithm(ps.SimpleSearch(), "SimpleSearch", self.result, self.qualities, self.task)
 
 
-import pandas as pd
-
-
 class TestNumericEstimators(unittest.TestCase):
     def test_estimator1(self):
         records = [(1, 100), (1, 75), (1, 53), (1, 12), (0, 11), (0, 49)]
diff --git a/tests/test_generalisation_aware.py b/tests/test_generalisation_aware.py
index cee347f..3f2eb15 100644
--- a/tests/test_generalisation_aware.py
+++ b/tests/test_generalisation_aware.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 from algorithms_testing import TestAlgorithmsBase
+from t_utils import conjunctions_from_str
 
 import pysubgroup as ps
 from pysubgroup.datasets import get_credit_data
@@ -68,7 +69,9 @@ def setUp(self):
         self.A0 = None
         self.BD = None
         TestGeneralisationAwareQf.prepare_df(self)
-        self.ga_qf = ps.GeneralizationAware_StandardQF(0)
+        self.ga_qf = ps.GeneralizationAware_StandardQF(
+            0, optimistic_estimate_strategy="max"
+        )
 
     def test_simple(self):
         target = ps.BinaryTarget("columnC", 1)
@@ -89,7 +92,7 @@ def test_simple(self):
         self.assertEqual(ga_stat.generalisation_stats, ps.SimplePositivesQF.tpl(5, 3))
 
         # Ensure cache works properly
-        self.assertEqual(
+        self.assertIs(
             ga_stat,
             self.ga_qf.calculate_statistics(
                 ps.Conjunction([self.A1, self.BA]), target, self.df
@@ -105,46 +108,31 @@ def test_simple(self):
         ga_score3 = self.ga_qf.evaluate(
             ps.Conjunction([self.A0, self.BD]), target, self.df
         )
-
         self.assertEqual(ga_score, ga_score2)
         self.assertAlmostEqual(ga_score, 0.06666666666666)
         self.assertTrue(np.isnan(ga_score3))
 
+    def test_error(self):
+        with self.assertRaises(ValueError):
+            ps.GeneralizationAware_StandardQF(0.5, "blabla")
+
 
-class TestAlgorithms(TestAlgorithmsBase, unittest.TestCase):
-    # TODO properly specify desired result
+class TestSimpleGA(TestAlgorithmsBase, unittest.TestCase):
     def setUp(self):
-        NS_checking = ps.EqualitySelector("checking_status", b"<0")
-        NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
-        NS_other_parties = ps.EqualitySelector("other_parties", b"none")
-        NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
-        NS_job = ps.EqualitySelector("job", b"skilled")
-        self.result = [
-            ps.Conjunction([NS_checking, NS_foreign_worker]),
-            ps.Conjunction([NS_checking]),
-            ps.Conjunction([NS_checking, NS_other_parties, NS_foreign_worker]),
-            ps.Conjunction([NS_checking, NS_other_parties]),
-            ps.Conjunction([NS_checking, NS_savings_status, NS_foreign_worker]),
-            ps.Conjunction([NS_checking, NS_savings_status]),
-            ps.Conjunction(
-                [NS_checking, NS_savings_status, NS_other_parties, NS_foreign_worker]
-            ),
-            ps.Conjunction([NS_checking, NS_job, NS_foreign_worker]),
-            ps.Conjunction([NS_checking, NS_savings_status, NS_other_parties]),
-            ps.Conjunction([NS_checking, NS_job]),
-        ]
-        self.qualities = [
-            0.055299999999999995,
-            0.05280000000000001,
-            0.052300000000000006,
-            0.05059999999999999,
-            0.04959999999999999,
-            0.048299999999999996,
-            0.04660000000000001,
-            0.04550000000000001,
-            0.0452,
-            0.044399999999999995,
-        ]
+        conj_list = conjunctions_from_str(
+            """0.05280000000000001 checking_status=='b'<0''
+   0.03610000000000002 savings_status=='b'<100''
+   0.0243      checking_status=='b'0<=X<200''
+   0.0208      property_magnitude=='b'no known property''
+   0.0188      purpose=='b'new car''
+   0.0184      employment=='b'<1''
+   0.0163      housing=='b'rent''
+   0.016000000000000007        personal_status=='b'female div/dep/mar''
+   0.015300000000000003        other_payment_plans=='b'bank''
+   0.0133      credit_history=='b'all paid''"""
+        )
+        self.result = [conjunction for quality, conjunction in conj_list]
+        self.qualities = [quality for quality, conjunction in conj_list]
         data = get_credit_data()
         target = ps.BinaryTarget("class", b"bad")
         searchSpace = ps.create_nominal_selectors(data, ignore=["class"])
@@ -157,20 +145,222 @@ def setUp(self):
             qf=ps.GeneralizationAwareQF(ps.StandardQF(1.0)),
         )
 
-    @unittest.skip
     def test_GA_SimpleDFS(self):
         self.runAlgorithm(
             ps.SimpleDFS(), "SimpleDFS", self.result, self.qualities, self.task
         )
 
-    @unittest.skip
-    def test_StandardQF_GA_SimpleDFS(self):
-        self.task.qf = ps.GeneralizationAware_StandardQF(0.5)
+
+class TestGeneralizationAware_StandardQF_a05(TestAlgorithmsBase, unittest.TestCase):
+    def get_a(self):
+        return 0.5
+
+    def get_output_str(self):
+        return """0.10086921502691486 checking_status=='b'<0''
+        0.065       credit_history=='b'no credits/all paid''
+        0.0600832755431992  credit_history=='b'all paid''
+        0.05300330790951305 property_magnitude=='b'no known property''
+        0.046852215652241715        checking_status=='b'0<=X<200''
+        0.046488822458723086        savings_status=='b'<100''
+        0.04436633963967792 employment=='b'<1''
+        0.043698221718866   other_payment_plans=='b'bank'' AND purpose=='b'new car''
+        0.04183300132670378 housing=='b'rent'' AND property_magnitude=='b'no known property'' AND savings_status=='b'<100''
+        0.04103779623011525 other_payment_plans=='b'bank''"""  # noqa: E501
+
+    def setUp(self):
+        conj_list = conjunctions_from_str(self.get_output_str())
+        self.result = [conjunction for quality, conjunction in conj_list]
+        self.qualities = [quality for quality, conjunction in conj_list]
+
+        data = get_credit_data()
+        target = ps.BinaryTarget("class", b"bad")
+        searchSpace = ps.create_nominal_selectors(data, ignore=["class"])
+        self.task = ps.SubgroupDiscoveryTask(
+            data,
+            target,
+            searchSpace,
+            result_set_size=10,
+            depth=3,
+            qf=ps.GeneralizationAware_StandardQF(self.get_a()),
+        )
+
+    def test_SimpleDFS(self):
+        self.task.qf = ps.GeneralizationAware_StandardQF(self.get_a())
         self.runAlgorithm(
-            ps.SimpleDFS(), "Standard_SimpleDFS", self.result, self.qualities, self.task
+            ps.SimpleDFS(),
+            f"StandardQF_SimpleDFS, a={self.get_a()}",
+            self.result,
+            self.qualities,
+            self.task,
         )
-        print(self.task.qf.cache)
+
+    def test_Apriori_diff(self):
+        self.task.qf = ps.GeneralizationAware_StandardQF(
+            self.get_a(), optimistic_estimate_strategy="difference"
+        )
+        apriori = ps.Apriori()
+        apriori.use_vectorization = False
+        self.runAlgorithm(
+            apriori,
+            f"StandardQF_Apriori diff, a={self.get_a()}",
+            self.result,
+            self.qualities,
+            self.task,
+        )
+
+    def test_Apriori_max(self):
+        self.task.qf = ps.GeneralizationAware_StandardQF(
+            self.get_a(), optimistic_estimate_strategy="max"
+        )
+        apriori = ps.Apriori()
+        apriori.use_vectorization = False
+        self.runAlgorithm(
+            apriori,
+            f"StandardQF_Apriori, max, a={self.get_a()}",
+            self.result,
+            self.qualities,
+            self.task,
+        )
+
+
+class TestGeneralizationAware_StandardQF_a(TestGeneralizationAware_StandardQF_a05):
+    def get_a(self):
+        return 1
+
+    def get_output_str(self):
+        return """   0.05280000000000001 checking_status=='b'<0''
+   0.03610000000000002 savings_status=='b'<100''
+   0.0243      checking_status=='b'0<=X<200''
+   0.0208      property_magnitude=='b'no known property''
+   0.0188      purpose=='b'new car''
+   0.0184      employment=='b'<1''
+   0.0163      housing=='b'rent''
+   0.016000000000000007        personal_status=='b'female div/dep/mar''
+   0.015300000000000003        other_payment_plans=='b'bank''
+   0.0133      credit_history=='b'all paid''"""
+
+
+class TestGeneralizationAware_StandardQF_a0_d2(TestGeneralizationAware_StandardQF_a05):
+    def get_a(self):
+        return 0
+
+    def get_output_str(self):
+        return """    0.6795580110497237  job=='b'unemp/unskilled non res'' AND purpose=='b'furniture/equipment''
+            0.6666666666666667  purpose=='b'domestic appliance'' AND savings_status=='b'100<=X<500''
+            0.6666666666666667  personal_status=='b'male mar/wid'' AND purpose=='b'domestic appliance''
+            0.6666666666666667  job=='b'unskilled resident'' AND purpose=='b'domestic appliance''
+   0.6554054054054055  foreign_worker=='b'no'' AND job=='b'high qualif/self emp/mgmt''
+   0.6363636363636364  purpose=='b'repairs'' AND savings_status=='b'>=1000''
+   0.6290322580645161  employment=='b'unemployed'' AND purpose=='b'domestic appliance''
+   0.6089385474860336  housing=='b'rent'' AND purpose=='b'retraining''
+   0.6 foreign_worker=='b'no'' AND personal_status=='b'male div/sep''
+   0.5957446808510638  other_payment_plans=='b'stores'' AND purpose=='b'repairs''"""  # noqa: E501
+
+    def setUp(self):
+        conj_list = conjunctions_from_str(self.get_output_str())
+        self.result = [conjunction for quality, conjunction in conj_list]
+        self.qualities = [quality for quality, conjunction in conj_list]
+
+        data = get_credit_data()
+        target = ps.BinaryTarget("class", b"bad")
+        searchSpace = ps.create_nominal_selectors(data, ignore=["class"])
+        self.task = ps.SubgroupDiscoveryTask(
+            data,
+            target,
+            searchSpace,
+            result_set_size=10,
+            depth=2,
+            qf=ps.GeneralizationAware_StandardQF(self.get_a()),
+        )
+
+
+class TestGeneralizationAware_StandardQF_a0(TestGeneralizationAware_StandardQF_a05):
+    def get_a(self):
+        return 0
+
+    def get_output_str(self):
+        return """   0.7 job=='b'unskilled resident'' AND own_telephone=='b'yes'' AND personal_status=='b'male mar/wid''
+   0.7 foreign_worker=='b'no'' AND other_parties=='b'guarantor'' AND personal_status=='b'male mar/wid''
+   0.7 employment=='b'>=7'' AND job=='b'unskilled resident'' AND purpose=='b'used car''
+   0.7 credit_history=='b'critical/other existing credit'' AND job=='b'unskilled resident'' AND purpose=='b'used car''
+   0.7 checking_status=='b'>=200'' AND personal_status=='b'male mar/wid'' AND savings_status=='b'500<=X<1000''
+   0.7 checking_status=='b'>=200'' AND own_telephone=='b'yes'' AND personal_status=='b'male mar/wid''
+   0.7 checking_status=='b'>=200'' AND credit_history=='b'critical/other existing credit'' AND personal_status=='b'male mar/wid''
+   0.6939655172413793  other_parties=='b'guarantor'' AND property_magnitude=='b'life insurance'' AND savings_status=='b'no known savings''
+   0.6939655172413793  foreign_worker=='b'no'' AND other_parties=='b'guarantor'' AND property_magnitude=='b'life insurance''
+   0.6818181818181819  credit_history=='b'delayed previously'' AND property_magnitude=='b'real estate'' AND savings_status=='b'500<=X<1000''"""  # noqa: E501
+
+    def setUp(self):
+        conj_list = conjunctions_from_str(self.get_output_str())
+        self.result = [conjunction for quality, conjunction in conj_list]
+        self.qualities = [quality for quality, conjunction in conj_list]
+
+        data = get_credit_data()
+        target = ps.BinaryTarget("class", b"bad")
+        searchSpace = ps.create_nominal_selectors(data, ignore=["class"])
+        self.task = ps.SubgroupDiscoveryTask(
+            data,
+            target,
+            searchSpace,
+            result_set_size=10,
+            depth=3,
+            qf=ps.GeneralizationAware_StandardQF(self.get_a()),
+        )
+
+
+class TestGeneralizationAware_StandardQFNumeric(TestAlgorithmsBase, unittest.TestCase):
+    def setUp(self):
+        conj_list = conjunctions_from_str(
+            """   832.5979220717699   job=='b'high qualif/self emp/mgmt''
+   673.6338022041458   purpose=='b'used car''
+   645.953015714855    property_magnitude=='b'no known property''
+   603.3209078187183   own_telephone=='b'yes''
+   576.235405327832    class=='b'bad'' AND own_telephone=='b'yes''
+   540.9390501453018   purpose=='b'other''
+   537.3010282319029   housing=='b'for free''
+   440.3787869550485   checking_status=='b'0<=X<200'' AND foreign_worker=='b'no'' AND property_magnitude=='b'life insurance''
+   407.6428886169854   checking_status=='b'0<=X<200'' AND foreign_worker=='b'no'' AND other_payment_plans=='b'bank''
+   406.8834000000001   credit_history=='b'no credits/all paid'' """  # noqa: E501
+        )
+        self.result = [conjunction for quality, conjunction in conj_list]
+        self.qualities = [quality for quality, conjunction in conj_list]
+
+        data = get_credit_data()
+        target = ps.NumericTarget("credit_amount")
+        searchSpace_Nominal = ps.create_nominal_selectors(
+            data, ignore=["credit_amount"]
+        )
+        searchSpace_Numeric = (
+            []
+        )  # ps.create_numeric_selectors(data, ignore=['credit_amount'], nbins=10)
+        searchSpace = searchSpace_Nominal + searchSpace_Numeric
+        self.task = ps.SubgroupDiscoveryTask(
+            data,
+            target,
+            searchSpace,
+            result_set_size=10,
+            depth=3,
+            qf=ps.GeneralizationAware_StandardQFNumeric(1, False, centroid="mean"),
+        )
+
+    def test_SimpleDFS(self):
+        self.task.qf = ps.GeneralizationAware_StandardQFNumeric(0.5)
+        self.runAlgorithm(
+            ps.SimpleDFS(),
+            "Numeric StandardQF_SimpleDFS",
+            self.result,
+            self.qualities,
+            self.task,
+        )
+
+    # def test_DFS(self):
+    #     self.task.qf = ps.GeneralizationAware_StandardQFNumeric(0.5)
+    #     apriori = ps.Apriori()
+    #     apriori.use_vectorization = False
+    #     self.runAlgorithm(
+    #         apriori, "StandardQF_Apriori", self.result, self.qualities, self.task
+    #     )
 
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main(module="test_generalisation_aware")
diff --git a/tests/test_generalisations.py b/tests/test_generalisations.py
index 6e41273..9c253c6 100644
--- a/tests/test_generalisations.py
+++ b/tests/test_generalisations.py
@@ -1,16 +1,15 @@
 import unittest
 
+import pytest
 from algorithms_testing import TestAlgorithmsBase
 
 import pysubgroup as ps
 from pysubgroup.datasets import get_credit_data
 
-skip_long_running = True
-
 
 class BooleanTargetBase(TestAlgorithmsBase):
     # pylint: disable=no-member
-    @unittest.skipIf(skip_long_running, "as skip_long_running flag is True")
+    @pytest.mark.slow
     def test_GeneralisingBFS(self):
         self.runAlgorithm(
             ps.GeneralisingBFS(),
diff --git a/tests/test_representations.py b/tests/test_representations.py
index 82d358e..5a09bce 100644
--- a/tests/test_representations.py
+++ b/tests/test_representations.py
@@ -72,65 +72,45 @@ def test_Set(self):
         with ps.SetRepresentation(
             self.df, [self.A1, self.A0, self.BA, self.BC, self.CA, self.CNan]
         ) as representation:
-            self.assertEqual(
-                self.A1.representation, {2, 3, 6, 7, 8, 9}
-            )  # pylint: disable=no-member
-            self.assertEqual(
-                self.A0.representation, {0, 1, 4, 5}
-            )  # pylint: disable=no-member
+            # pylint: disable=no-member
+            self.assertEqual(self.A1.representation, {2, 3, 6, 7, 8, 9})
+            self.assertEqual(self.A0.representation, {0, 1, 4, 5})
 
-            self.assertEqual(
-                self.BA.representation, {0, 5, 7, 8, 9}
-            )  # pylint: disable=no-member
-            self.assertEqual(
-                self.BC.representation, {2, 3}
-            )  # pylint: disable=no-member
+            self.assertEqual(self.BA.representation, {0, 5, 7, 8, 9})
+            self.assertEqual(self.BC.representation, {2, 3})
 
-            self.assertEqual(
-                self.CA.representation, {2, 3}
-            )  # pylint: disable=no-member
-            self.assertEqual(
-                self.CNan.representation, {0, 1}
-            )  # pylint: disable=no-member
+            self.assertEqual(self.CA.representation, {2, 3})
+            self.assertEqual(self.CNan.representation, {0, 1})
 
             self.assertEqual(
                 representation.Conjunction([self.BA, self.CNan]).representation, {0}
-            )  # pylint: disable=no-member
+            )
             self.assertEqual(
                 representation.Conjunction([self.A0, self.CNan]).representation, {0, 1}
-            )  # pylint: disable=no-member
+            )
+            # pylint: enable=no-member
 
     def test_NumpySet(self):
         with ps.NumpySetRepresentation(
             self.df, [self.A1, self.A0, self.BA, self.BC, self.CA, self.CNan]
         ) as representation:
-            np.testing.assert_array_equal(
-                self.A1.representation, [2, 3, 6, 7, 8, 9]
-            )  # pylint: disable=no-member
-            np.testing.assert_array_equal(
-                self.A0.representation, [0, 1, 4, 5]
-            )  # pylint: disable=no-member
+            # pylint: disable=no-member
+            np.testing.assert_array_equal(self.A1.representation, [2, 3, 6, 7, 8, 9])
+            np.testing.assert_array_equal(self.A0.representation, [0, 1, 4, 5])
 
-            np.testing.assert_array_equal(
-                self.BA.representation, [0, 5, 7, 8, 9]
-            )  # pylint: disable=no-member
-            np.testing.assert_array_equal(
-                self.BC.representation, [2, 3]
-            )  # pylint: disable=no-member
+            np.testing.assert_array_equal(self.BA.representation, [0, 5, 7, 8, 9])
+            np.testing.assert_array_equal(self.BC.representation, [2, 3])
 
-            np.testing.assert_array_equal(
-                self.CA.representation, [2, 3]
-            )  # pylint: disable=no-member
-            np.testing.assert_array_equal(
-                self.CNan.representation, [0, 1]
-            )  # pylint: disable=no-member
+            np.testing.assert_array_equal(self.CA.representation, [2, 3])
+            np.testing.assert_array_equal(self.CNan.representation, [0, 1])
 
             np.testing.assert_array_equal(
                 representation.Conjunction([self.BA, self.CNan]).representation, [0]
-            )  # pylint: disable=no-member
+            )
             np.testing.assert_array_equal(
                 representation.Conjunction([self.A0, self.CNan]).representation, [0, 1]
-            )  # pylint: disable=no-member
+            )
+            # pylint: enable=no-member
 
 
 if __name__ == "__main__":