Improved Generalization Aware Qualities (#52)
Greatly improved generalization aware qualities
+ cleanup
Feelx234 authored Sep 19, 2023
1 parent 63ccae2 commit 0a31243
Showing 16 changed files with 513 additions and 195 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -10,7 +10,7 @@ test_gp_model.txt
 program.prof
 import.log
 doc/_build
-Untitled.ipynb
+
 
 
 # Temporary and binary files
@@ -69,3 +69,7 @@ MANIFEST
 .venv*/
 .conda*/
 .python-version
+
+
+Untitled1.ipynb
+Untitled.ipynb
7 changes: 3 additions & 4 deletions docs/sections/components/gp_growth.rst
@@ -23,12 +23,11 @@ The basic usage of the gp-growth algorithm is not very different from the usage
     from pysubgroup.datasets import get_titanic_data
     data = get_titanic_data()
-    target = ps.NominalSelector ('Survived', True)
+    target = ps.BinaryTarget ('Survived', True)
     searchspace = ps.create_selectors(data, ignore=['Survived'])
-    task = ps.SubgroupDiscoveryTask (data, target, dearchspace, result_set_size=5, depth=2, qf=ps.WRAccQF())
-    GpGrowth.execute(task)
+    task = ps.SubgroupDiscoveryTask (data, target, searchspace, result_set_size=5, depth=2, qf=ps.WRAccQF())
+    result = ps.GpGrowth().execute(task)
 But beware that gp-growth uses an exhaustive search strategy! This can greatly increase the runtime for high search depths.
 You can specify the :code:`mode` argument in the constructor of GpGrowth to run gp-growth either bottom up (:code:`mode='b_u'`) or top down (:code:`mode='t_d'`).
 As gp-growth is a generalisation of fp-growth, you can also perform standard fp-growth with gp_growth by using the CountQF (:ref:`countqf`) quality function.
 
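For example, to run gp-growth as plain fp-growth (frequent-pattern counting only), swap in CountQF; a minimal sketch, assuming :code:`CountQF` and the :code:`mode` keyword are exported as described above:

    import pysubgroup as ps
    from pysubgroup.datasets import get_titanic_data

    data = get_titanic_data()
    target = ps.BinaryTarget('Survived', True)
    searchspace = ps.create_selectors(data, ignore=['Survived'])
    # CountQF reduces gp-growth to standard fp-growth (pattern counting)
    task = ps.SubgroupDiscoveryTask(data, target, searchspace,
                                    result_set_size=5, depth=2, qf=ps.CountQF())
    result = ps.GpGrowth(mode='b_u').execute(task)  # mode='t_d' for top down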
4 changes: 2 additions & 2 deletions setup.cfg
@@ -110,8 +110,8 @@ norecursedirs =
     .tox
 testpaths = tests
 # Use pytest markers to select/deselect specific tests
-# markers =
-#     slow: mark tests as slow (deselect with '-m "not slow"')
+markers =
+    slow: mark tests as slow (deselect with '-m "not slow"')
 # system: mark end-to-end system tests
 
 [devpi:upload]
42 changes: 25 additions & 17 deletions src/pysubgroup/algorithms.py
@@ -55,6 +55,27 @@ def constraints_satisfied(constraints, subgroup, statistics=None, data=None):
     )
 
 
+try:  # pragma: no cover
+    from numba import (  # pylint: disable=import-error, import-outside-toplevel
+        int32,
+        int64,
+        njit,
+    )
+
+    @njit([(int32[:, :], int64[:])], cache=True)
+    def getNewCandidates(candidates, hashes):  # pragma: no cover
+        result = []
+        for i in range(len(candidates) - 1):
+            for j in range(i + 1, len(candidates)):
+                if hashes[i] == hashes[j]:
+                    if np.all(candidates[i, :-1] == candidates[j, :-1]):
+                        result.append((i, j))
+        return result
+
+except ImportError:  # pragma: no cover
+    pass
+
+
 class Apriori:
     def __init__(
         self, representation_type=None, combination_name="Conjunction", use_numba=True
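The kernel above implements the classic Apriori join step: two size-k candidates, stored as rows of selector indices, combine into a size-(k+1) candidate exactly when they agree on their first k-1 entries, and the precomputed row hashes serve as a cheap necessary condition checked before the full prefix comparison. A pure-Python sketch of the same step (an illustration with hypothetical inputs, not the fallback pysubgroup actually uses when numba is absent):

    import numpy as np

    def get_new_candidates_py(candidates, hashes):
        """Return index pairs (i, j) of candidate rows sharing their prefix."""
        result = []
        for i in range(len(candidates) - 1):
            for j in range(i + 1, len(candidates)):
                # equal hashes are necessary; the array comparison decides
                if hashes[i] == hashes[j] and np.all(
                    candidates[i, :-1] == candidates[j, :-1]
                ):
                    result.append((i, j))
        return result

    # rows (0, 1) and (0, 2) share the prefix (0,) and can merge to (0, 1, 2)
    cands = np.array([[0, 1], [0, 2], [1, 2]], dtype=np.int32)
    pre_hash = np.array([hash((0,)), hash((0,)), hash((1,))], dtype=np.int64)
    print(get_new_candidates_py(cands, pre_hash))  # [(0, 1)]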
@@ -133,20 +154,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
         return promising_candidates
 
     def get_next_level_numba(self, promising_candidates):
-        from numba import njit  # pylint: disable=import-error, import-outside-toplevel
-
         if not hasattr(self, "compiled_func") or self.compiled_func is None:
-
-            @njit
-            def getNewCandidates(candidates, hashes):  # pragma: no cover
-                result = []
-                for i in range(len(candidates) - 1):
-                    for j in range(i + 1, len(candidates)):
-                        if hashes[i] == hashes[j]:
-                            if np.all(candidates[i, :-1] == candidates[j, :-1]):
-                                result.append((i, j))
-                return result
-
             self.compiled_func = getNewCandidates
 
         all_selectors = Counter(chain.from_iterable(promising_candidates))
@@ -182,7 +190,9 @@ def execute(self, task):
         if not isinstance(
             task.qf, ps.BoundedInterestingnessMeasure
         ):  # pragma: no cover
-            raise RuntimeWarning("Quality function is unbounded, long runtime expected")
+            warnings.warn(
+                "Quality function is unbounded, long runtime expected", RuntimeWarning
+            )
 
         task.qf.calculate_constant_statistics(task.data, task.target)
 
@@ -302,7 +312,7 @@ def execute(self, task):
 
             sg = candidate_description
             statistics = task.qf.calculate_statistics(sg, task.target, task.data)
-            quality = task.qf.evaluate(sg, statistics)
+            quality = task.qf.evaluate(sg, task.target, task.data, statistics)
             ps.add_if_required(result, sg, quality, task, statistics=statistics)
 
             qual = ps.minimum_required_quality(result, task)
@@ -336,8 +346,6 @@ def execute(self, task):
                 self.discarded[len(candidate_description)] += 1
 
         result.sort(key=lambda x: x[0], reverse=True)
-        for qual, sg in result:
-            print(f"{qual} {sg}")
         print("discarded " + str(self.discarded))
         return ps.SubgroupDiscoveryResult(result, task)
 
142 changes: 127 additions & 15 deletions src/pysubgroup/binary_target.py
@@ -455,31 +455,143 @@ def __init__(self):
 #####
 # GeneralizationAware Interestingness Measures
 #####
-class GeneralizationAware_StandardQF(GeneralizationAwareQF_stats):
-    def __init__(self, a):
-        super().__init__(StandardQF(0))
+class GeneralizationAware_StandardQF(
+    GeneralizationAwareQF_stats, BoundedInterestingnessMeasure
+):
+    ga_sQF_agg_tuple = namedtuple(
+        "ga_sQF_agg_tuple", ["max_p", "min_delta_negatives", "min_negatives"]
+    )
+
+    def __init__(self, a, optimistic_estimate_strategy="default"):
+        super().__init__(StandardQF(a))
+        if optimistic_estimate_strategy in ("default", "difference"):
+            self.optimistic_estimate = self.difference_based_optimistic_estimate
+            self.aggregate_statistics = self.difference_based_agg_function
+            self.read_p = self.difference_based_read_p
+        elif optimistic_estimate_strategy == "max":
+            self.optimistic_estimate = self.max_based_optimistic_estimate
+            self.aggregate_statistics = self.max_based_aggregate_statistics
+            self.read_p = self.max_based_read_p
+        else:
+            raise ValueError(
+                "optimistic_estimate_strategy should be one of "
+                "('default', 'max', 'difference')"
+            )
         self.a = a
 
-    def get_max(self, *args):
-        max_ratio = 0.0
+    def evaluate(self, subgroup, target, data, statistics=None):
+        statistics = self.ensure_statistics(subgroup, target, data, statistics)
+        sg_stats = statistics.subgroup_stats
+        general_stats = statistics.generalisation_stats
+        if sg_stats.size_sg == 0:
+            return np.nan
+        sg_ratio = sg_stats.positives_count / sg_stats.size_sg
+        return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * (
+            sg_ratio - self.read_p(general_stats)
+        )
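In words: the subgroup's positive share is discounted by the best share among its generalizations, weighted by its relative size raised to the power a. A quick numeric check with hypothetical values (not taken from the diff):

    # hypothetical: dataset N=100, subgroup n=25 with 20 positives,
    # best generalization share read_p = 0.6, size exponent a = 0.5
    a, N, n, pos, read_p = 0.5, 100, 25, 20, 0.6
    quality = (n / N) ** a * (pos / n - read_p)  # 0.5 * (0.8 - 0.6) = 0.1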

+    def max_based_aggregate_statistics(self, stats_subgroup, list_of_pairs):
+        if len(list_of_pairs) == 0:
+            return stats_subgroup
+        max_ratio = -100
         max_stats = None
-        for stat in args:
-            assert stat.size_sg > 0
-            ratio = stat.positives_count / stat.size_sg
-            if ratio > max_ratio:
-                max_ratio = ratio
-                max_stats = stat
+        for pair in list_of_pairs:
+            ratio = -np.inf
+            for agg_stat in pair:
+                if agg_stat.size_sg == 0:  # pragma: no cover
+                    continue
+                ratio = agg_stat.positives_count / agg_stat.size_sg
+                if ratio > max_ratio:
+                    max_ratio = ratio
+                    max_stats = agg_stat
 
         return max_stats
 
-    def evaluate(self, subgroup, target, data, statistics=None):
+    def max_based_optimistic_estimate(self, subgroup, target, data, statistics=None):
+        """
+        Computes the oe as the hypothetical subgroup containing only positive instances
+        """
         statistics = self.ensure_statistics(subgroup, target, data, statistics)
         sg_stats = statistics.subgroup_stats
         general_stats = statistics.generalisation_stats
         if sg_stats.size_sg == 0 or general_stats.size_sg == 0:
             return np.nan
 
-        sg_ratio = sg_stats.positives_count / sg_stats.size_sg
         general_ratio = general_stats.positives_count / general_stats.size_sg
-        return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * (
-            sg_ratio - general_ratio
+        return (sg_stats.positives_count / self.stats0.size_sg) ** self.a * (
+            1 - general_ratio
         )
+
+    def max_based_read_p(self, agg_tuple):
+        return agg_tuple.positives_count / agg_tuple.size_sg
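The docstring's "hypothetical subgroup" is the refinement that keeps every positive instance and drops every negative: its size equals positives_count, its own share is 1, and the bound still subtracts the best generalization share. Continuing the hypothetical numbers from above:

    # 20 positives, N=100, a=0.5, general_ratio=0.6
    a, N, pos, general_ratio = 0.5, 100, 20, 0.6
    oe = (pos / N) ** a * (1 - general_ratio)  # ~0.447 * 0.4 ~= 0.179 >= 0.1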

+    def difference_based_optimistic_estimate(self, subgroup, target, data, statistics):
+        sg_stats, agg_stats = self.ensure_statistics(subgroup, target, data, statistics)
+        if np.isposinf(agg_stats.min_delta_negatives):
+            return np.inf
+        delta_n = agg_stats.min_delta_negatives
+        size_dataset = self.qf.dataset_statistics.size_sg
+        tau_diff = 0
+        if self.qf.a == 0:
+            pos = 1
+            # return delta_n / (1 + delta_n)
+        elif self.qf.a == 1.0:
+            pos = sg_stats.positives_count
+            # return pos / size_dataset * delta_n / (pos + delta_n)
+        else:
+            a = self.qf.a
+            p_hat = min(np.ceil(a * delta_n / (1 - a)), sg_stats.positives_count)
+            pos = p_hat
+            # return (p_hat / size_dataset) ** a * delta_n / (p_hat + delta_n)
+        tau_diff = pos / (pos + delta_n)
+        if sg_stats.size_sg > 0:
+            tau_sg = sg_stats.positives_count / sg_stats.size_sg
+        else:
+            tau_sg = -1
+        tau_max = max(tau_diff, tau_sg, agg_stats.max_p)
+        return (sg_stats.positives_count / size_dataset) ** self.a * (1 - tau_max)
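The difference-based bound reasons about negatives instead: by the aggregation in difference_based_agg_function below, every generalization of a refinement carries at least min_delta_negatives more negatives than the refinement itself, so a refinement keeping p positives and no negatives can see a generalization share of at most p / (p + delta_n); the commented-out returns give the per-case closed forms, and for 0 < a < 1 that trade-off peaks near p_hat = ceil(a * delta_n / (1 - a)). A numeric sketch with hypothetical values:

    import numpy as np

    # hypothetical: a=0.5, delta_n=4, subgroup has 20 positives in N=100,
    # own share tau_sg=0.8, best generalization share max_p=0.6
    a, delta_n, pos_sg, N, tau_sg, max_p = 0.5, 4, 20, 100, 0.8, 0.6
    p_hat = min(np.ceil(a * delta_n / (1 - a)), pos_sg)  # ceil(4.0) = 4
    tau_diff = p_hat / (p_hat + delta_n)                 # 4 / 8 = 0.5
    tau_max = max(tau_diff, tau_sg, max_p)               # 0.8
    oe = (pos_sg / N) ** a * (1 - tau_max)               # ~0.447 * 0.2 ~= 0.089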

+    def difference_based_agg_function(self, stats_subgroup, list_of_pairs):
+        """
+        list_of_pairs is a list of (stats, agg_tuple) for all the generalizations
+        """
+
+        def get_negatives_count(sg_stats):
+            return sg_stats.size_sg - sg_stats.positives_count
+
+        def get_percentage_positives(sg_stats):
+            if sg_stats.size_sg == 0:
+                return np.nan
+            return sg_stats.positives_count / sg_stats.size_sg
+
+        if len(list_of_pairs) == 0:  # empty pattern
+            return GeneralizationAware_StandardQF.ga_sQF_agg_tuple(
+                get_percentage_positives(stats_subgroup), np.infty, np.infty
+            )
+
+        subgroup_negatives = stats_subgroup.size_sg - stats_subgroup.positives_count
+        min_immediate_generalizations_negatives = min(
+            get_negatives_count(x.subgroup_stats) for x in list_of_pairs
+        )
+        min_immediate_generalizations_delta_negatives = min(
+            x.generalisation_stats.min_delta_negatives for x in list_of_pairs
+        )
+        max_percentage_positives = max(
+            max(
+                get_percentage_positives(x.subgroup_stats), x.generalisation_stats.max_p
+            )
+            for x in list_of_pairs
+        )
+
+        sg_delta_negatives = (
+            min_immediate_generalizations_negatives - subgroup_negatives
+        )
+        min_delta_negatives = min(
+            sg_delta_negatives, min_immediate_generalizations_delta_negatives
+        )
+        return GeneralizationAware_StandardQF.ga_sQF_agg_tuple(
+            max_percentage_positives, min_delta_negatives, sg_delta_negatives
+        )
+
+    def difference_based_read_p(self, agg_tuple):
+        return agg_tuple.max_p
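Both strategies feed the same evaluate through read_p; only the optimistic estimate, and hence pruning, differs. A construction sketch, assuming the class is exported at package level like the other pysubgroup measures:

    import pysubgroup as ps

    qf_diff = ps.GeneralizationAware_StandardQF(0.5)  # 'default' == 'difference'
    qf_max = ps.GeneralizationAware_StandardQF(
        0.5, optimistic_estimate_strategy='max')
    # both are BoundedInterestingnessMeasure, so Apriori can prune with them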
44 changes: 27 additions & 17 deletions src/pysubgroup/measures.py
@@ -190,6 +190,10 @@ def __hasattr__(self, name):
 # GeneralizationAware Interestingness Measures
 #####
 class GeneralizationAwareQF(AbstractInterestingnessMeasure):
+    """A class that computes the generalization-aware quality as:
+    ga_qf(sg) = qf(sg) - max_{g in generalizations(sg)} qf(g)
+    """
+
     ga_tuple = namedtuple("ga_tuple", ["subgroup_quality", "generalisation_quality"])
 
     def __init__(self, qf):
@@ -241,6 +245,8 @@ def evaluate(self, subgroup, target, data, statistics=None):
 # GeneralizationAware Interestingness Measures
 #####
 class GeneralizationAwareQF_stats(AbstractInterestingnessMeasure):
+    """An abstract base class that implements aggregation of stats of generalisations"""
+
     ga_tuple = namedtuple("ga_stats_tuple", ["subgroup_stats", "generalisation_stats"])
 
     def __init__(self, qf):
@@ -263,30 +269,34 @@ def calculate_constant_statistics(self, data, target):
     def calculate_statistics(self, subgroup, target, data, statistics=None):
         sg_repr = repr(subgroup)
         if sg_repr in self.cache:
-            return GeneralizationAwareQF_stats.ga_tuple(*self.cache[sg_repr])
+            return self.cache[sg_repr]
 
-        (stats_sg, stats_prev) = self.get_stats_and_previous_stats(
-            subgroup, target, data
-        )
-        self.cache[sg_repr] = (stats_sg, stats_prev)
-        return GeneralizationAwareQF_stats.ga_tuple(stats_sg, stats_prev)
+        tpl = self.get_stats_and_previous_stats(subgroup, target, data)
+        self.cache[sg_repr] = tpl
+        return tpl
 
     def get_stats_and_previous_stats(self, subgroup, target, data):
         stats_subgroup = self.qf.calculate_statistics(subgroup, target, data)
-        max_stats = self.stats0
+        # pylint: disable=no-member
+        if len(subgroup.selectors) == 0:
+            return GeneralizationAwareQF_stats.ga_tuple(
+                stats_subgroup, self.aggregate_statistics(stats_subgroup, [])
+            )
+
         selectors = subgroup.selectors
-        if len(selectors) > 0:
-            # compute quality of all generalizations
-            generalizations = combinations(selectors, len(selectors) - 1)
+        # compute quality of all generalizations
+        immediate_generalizations = combinations(selectors, len(selectors) - 1)
 
-            for sels in generalizations:
-                sgd = ps.Conjunction(list(sels))
-                (stats_sg, stats_prev) = self.calculate_statistics(sgd, target, data)
-                max_stats = self.get_max(max_stats, stats_sg, stats_prev)
-        return (stats_subgroup, max_stats)
+        list_of_pairs = []
+        for sels in immediate_generalizations:
+            sgd = ps.Conjunction(list(sels))
+            list_of_pairs.append(self.calculate_statistics(sgd, target, data))
+        agg_stats = self.aggregate_statistics(stats_subgroup, list_of_pairs)
+        # pylint: enable=no-member
+        return GeneralizationAwareQF_stats.ga_tuple(stats_subgroup, agg_stats)
 
     def evaluate(self, subgroup, target, data, statistics=None):
         raise NotImplementedError
 
-    def get_max(self, *args):
-        raise NotImplementedError
+    # def aggregate_statistics(self, *args):
+    #     raise NotImplementedError
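As a usage illustration of the wrapper semantics (a hedged sketch; it assumes GeneralizationAwareQF, StandardQF, and SimpleSearch are exported at package level as elsewhere in pysubgroup):

    import pysubgroup as ps
    from pysubgroup.datasets import get_titanic_data

    data = get_titanic_data()
    target = ps.BinaryTarget('Survived', True)
    # wraps a base quality function; evaluate() then reports qf(sg) minus the
    # best quality among sg's generalizations, cached per subgroup repr
    ga_qf = ps.GeneralizationAwareQF(ps.StandardQF(0.5))
    task = ps.SubgroupDiscoveryTask(
        data, target, ps.create_selectors(data, ignore=['Survived']),
        result_set_size=5, depth=2, qf=ga_qf)
    result = ps.SimpleSearch().execute(task)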
