From 0a312433bd18e91200c7c9bb66194d7669f50a3c Mon Sep 17 00:00:00 2001 From: Feelx234 <52208598+Feelx234@users.noreply.github.com> Date: Tue, 19 Sep 2023 17:32:15 +0200 Subject: [PATCH] Improved Generalization Aware Qualities (#52) Greatly increased generalization aware qualities + cleanup --- .gitignore | 6 +- docs/sections/components/gp_growth.rst | 7 +- setup.cfg | 4 +- src/pysubgroup/algorithms.py | 42 +-- src/pysubgroup/binary_target.py | 142 +++++++++- src/pysubgroup/measures.py | 44 +-- src/pysubgroup/numeric_target.py | 96 ++++--- src/pysubgroup/subgroup_description.py | 4 +- src/pysubgroup/utils.py | 6 +- tests/algorithms_testing.py | 2 +- tests/test_algorithms_boolean.py | 3 - tests/test_algorithms_boolean_constraints.py | 3 - tests/test_algorithms_numeric.py | 8 +- tests/test_generalisation_aware.py | 276 ++++++++++++++++--- tests/test_generalisations.py | 5 +- tests/test_representations.py | 60 ++-- 16 files changed, 513 insertions(+), 195 deletions(-) diff --git a/.gitignore b/.gitignore index 4885096..18805a8 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,7 @@ test_gp_model.txt program.prof import.log doc/_build -Untitled.ipynb + # Temporary and binary files @@ -69,3 +69,7 @@ MANIFEST .venv*/ .conda*/ .python-version + + +Untitled1.ipynb +Untitled.ipynb diff --git a/docs/sections/components/gp_growth.rst b/docs/sections/components/gp_growth.rst index 63485d5..1b4b2f3 100644 --- a/docs/sections/components/gp_growth.rst +++ b/docs/sections/components/gp_growth.rst @@ -23,12 +23,11 @@ The basic usage of the gp-growth algorithm is not very different from the usage from pysubgroup.datasets import get_titanic_data data = get_titanic_data() - target = ps.NominalSelector ('Survived', True) + target = ps.BinaryTarget ('Survived', True) searchspace = ps.create_selectors(data, ignore=['Survived']) - task = ps.SubgroupDiscoveryTask (data, target, dearchspace, result_set_size=5, depth=2, qf=ps.WRAccQF()) - GpGrowth.execute(task) + task = ps.SubgroupDiscoveryTask (data, target, searchspace, result_set_size=5, depth=2, qf=ps.WRAccQF()) + result = ps.GpGrowth().execute(task) -But beware that gp-growth is using an exhaustive search strategy! This can greatly increase the runtime for high search depth. You can specify the :code:`mode` argument in the constructor of GpGrowth to run gp-growth either bottom up (:code:`mode='b_u'`) or top down (:code:`mode='b_u'`). As gp growth is a generalisation of fp-growth you can also perform standard fp-growth using gp_growth by using the CountQF (:ref:`countqf`) quality function. diff --git a/setup.cfg b/setup.cfg index 72f2a45..f4cc25b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -110,8 +110,8 @@ norecursedirs = .tox testpaths = tests # Use pytest markers to select/deselect specific tests -# markers = -# slow: mark tests as slow (deselect with '-m "not slow"') +markers = + slow: mark tests as slow (deselect with '-m "not slow"') # system: mark end-to-end system tests [devpi:upload] diff --git a/src/pysubgroup/algorithms.py b/src/pysubgroup/algorithms.py index be9132b..0ba3a9a 100644 --- a/src/pysubgroup/algorithms.py +++ b/src/pysubgroup/algorithms.py @@ -55,6 +55,27 @@ def constraints_satisfied(constraints, subgroup, statistics=None, data=None): ) +try: # pragma: no cover + from numba import ( # pylint: disable=import-error, import-outside-toplevel + int32, + int64, + njit, + ) + + @njit([(int32[:, :], int64[:])], cache=True) + def getNewCandidates(candidates, hashes): # pragma: no cover + result = [] + for i in range(len(candidates) - 1): + for j in range(i + 1, len(candidates)): + if hashes[i] == hashes[j]: + if np.all(candidates[i, :-1] == candidates[j, :-1]): + result.append((i, j)) + return result + +except ImportError: # pragma: no cover + pass + + class Apriori: def __init__( self, representation_type=None, combination_name="Conjunction", use_numba=True @@ -133,20 +154,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat return promising_candidates def get_next_level_numba(self, promising_candidates): - from numba import njit # pylint: disable=import-error, import-outside-toplevel - if not hasattr(self, "compiled_func") or self.compiled_func is None: - - @njit - def getNewCandidates(candidates, hashes): # pragma: no cover - result = [] - for i in range(len(candidates) - 1): - for j in range(i + 1, len(candidates)): - if hashes[i] == hashes[j]: - if np.all(candidates[i, :-1] == candidates[j, :-1]): - result.append((i, j)) - return result - self.compiled_func = getNewCandidates all_selectors = Counter(chain.from_iterable(promising_candidates)) @@ -182,7 +190,9 @@ def execute(self, task): if not isinstance( task.qf, ps.BoundedInterestingnessMeasure ): # pragma: no cover - raise RuntimeWarning("Quality function is unbounded, long runtime expected") + warnings.warn( + "Quality function is unbounded, long runtime expected", RuntimeWarning + ) task.qf.calculate_constant_statistics(task.data, task.target) @@ -302,7 +312,7 @@ def execute(self, task): sg = candidate_description statistics = task.qf.calculate_statistics(sg, task.target, task.data) - quality = task.qf.evaluate(sg, statistics) + quality = task.qf.evaluate(sg, task.target, task.data, statistics) ps.add_if_required(result, sg, quality, task, statistics=statistics) qual = ps.minimum_required_quality(result, task) @@ -336,8 +346,6 @@ def execute(self, task): self.discarded[len(candidate_description)] += 1 result.sort(key=lambda x: x[0], reverse=True) - for qual, sg in result: - print(f"{qual} {sg}") print("discarded " + str(self.discarded)) return ps.SubgroupDiscoveryResult(result, task) diff --git a/src/pysubgroup/binary_target.py b/src/pysubgroup/binary_target.py index 5640788..25433f6 100644 --- a/src/pysubgroup/binary_target.py +++ b/src/pysubgroup/binary_target.py @@ -455,31 +455,143 @@ def __init__(self): ##### # GeneralizationAware Interestingness Measures ##### -class GeneralizationAware_StandardQF(GeneralizationAwareQF_stats): - def __init__(self, a): - super().__init__(StandardQF(0)) +class GeneralizationAware_StandardQF( + GeneralizationAwareQF_stats, BoundedInterestingnessMeasure +): + ga_sQF_agg_tuple = namedtuple( + "ga_sQF_agg_tuple", ["max_p", "min_delta_negatives", "min_negatives"] + ) + + def __init__(self, a, optimistic_estimate_strategy="default"): + super().__init__(StandardQF(a)) + if optimistic_estimate_strategy in ("default", "difference"): + self.optimistic_estimate = self.difference_based_optimistic_estimate + self.aggregate_statistics = self.difference_based_agg_function + self.read_p = self.difference_based_read_p + elif optimistic_estimate_strategy == "max": + self.optimistic_estimate = self.max_based_optimistic_estimate + self.aggregate_statistics = self.max_based_aggregate_statistics + self.read_p = self.max_based_read_p + else: + raise ValueError( + "optimistic_estimate_strategy should be one of " + "('default', 'max', 'difference')" + ) self.a = a - def get_max(self, *args): - max_ratio = 0.0 + def evaluate(self, subgroup, target, data, statistics=None): + statistics = self.ensure_statistics(subgroup, target, data, statistics) + sg_stats = statistics.subgroup_stats + general_stats = statistics.generalisation_stats + if sg_stats.size_sg == 0: + return np.nan + sg_ratio = sg_stats.positives_count / sg_stats.size_sg + return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * ( + sg_ratio - self.read_p(general_stats) + ) + + def max_based_aggregate_statistics(self, stats_subgroup, list_of_pairs): + if len(list_of_pairs) == 0: + return stats_subgroup + max_ratio = -100 max_stats = None - for stat in args: - assert stat.size_sg > 0 - ratio = stat.positives_count / stat.size_sg - if ratio > max_ratio: - max_ratio = ratio - max_stats = stat + for pair in list_of_pairs: + ratio = -np.inf + for agg_stat in pair: + if agg_stat.size_sg == 0: # pragma: no cover + continue + ratio = agg_stat.positives_count / agg_stat.size_sg + if ratio > max_ratio: + max_ratio = ratio + max_stats = agg_stat + return max_stats - def evaluate(self, subgroup, target, data, statistics=None): + def max_based_optimistic_estimate(self, subgroup, target, data, statistics=None): + """ + Computes the oe as the hypothetical subgroup containing only positive instances + """ statistics = self.ensure_statistics(subgroup, target, data, statistics) sg_stats = statistics.subgroup_stats general_stats = statistics.generalisation_stats if sg_stats.size_sg == 0 or general_stats.size_sg == 0: return np.nan - sg_ratio = sg_stats.positives_count / sg_stats.size_sg general_ratio = general_stats.positives_count / general_stats.size_sg - return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * ( - sg_ratio - general_ratio + return (sg_stats.positives_count / self.stats0.size_sg) ** self.a * ( + 1 - general_ratio ) + + def max_based_read_p(self, agg_tuple): + return agg_tuple.positives_count / agg_tuple.size_sg + + def difference_based_optimistic_estimate(self, subgroup, target, data, statistics): + sg_stats, agg_stats = self.ensure_statistics(subgroup, target, data, statistics) + if np.isposinf(agg_stats.min_delta_negatives): + return np.inf + delta_n = agg_stats.min_delta_negatives + size_dataset = self.qf.dataset_statistics.size_sg + tau_diff = 0 + if self.qf.a == 0: + pos = 1 + # return delta_n /(1 + delta_n) + elif self.qf.a == 1.0: + pos = sg_stats.positives_count + # return pos / size_dataset * delta_n /(pos + delta_n) + else: + a = self.qf.a + p_hat = min(np.ceil(a * delta_n / (1 - a)), sg_stats.positives_count) + pos = p_hat + # return (p_hat / size_dataset) ** a * delta_n /(p_hat+delta_n) + tau_diff = pos / (pos + delta_n) + if sg_stats.size_sg > 0: + tau_sg = sg_stats.positives_count / sg_stats.size_sg + else: + tau_sg = -1 + tau_max = max(tau_diff, tau_sg, agg_stats.max_p) + return (sg_stats.positives_count / size_dataset) ** self.a * (1 - tau_max) + + def difference_based_agg_function(self, stats_subgroup, list_of_pairs): + """ + list_of_pairs is a list of (stats, agg_tuple) for all the generalizations + """ + + def get_negatives_count(sg_stats): + return sg_stats.size_sg - sg_stats.positives_count + + def get_percentage_positives(sg_stats): + if sg_stats.size_sg == 0: + return np.nan + return sg_stats.positives_count / sg_stats.size_sg + + if len(list_of_pairs) == 0: # empty pattern + return GeneralizationAware_StandardQF.ga_sQF_agg_tuple( + get_percentage_positives(stats_subgroup), np.infty, np.infty + ) + + subgroup_negatives = stats_subgroup.size_sg - stats_subgroup.positives_count + min_immediate_generalizations_negatives = min( + get_negatives_count(x.subgroup_stats) for x in list_of_pairs + ) + min_immediate_generalizations_delta_negatives = min( + x.generalisation_stats.min_delta_negatives for x in list_of_pairs + ) + max_percentage_positives = max( + max( + get_percentage_positives(x.subgroup_stats), x.generalisation_stats.max_p + ) + for x in list_of_pairs + ) + + sg_delta_negatives = ( + min_immediate_generalizations_negatives - subgroup_negatives + ) + min_delta_negatives = min( + sg_delta_negatives, min_immediate_generalizations_delta_negatives + ) + return GeneralizationAware_StandardQF.ga_sQF_agg_tuple( + max_percentage_positives, min_delta_negatives, sg_delta_negatives + ) + + def difference_based_read_p(self, agg_tuple): + return agg_tuple.max_p diff --git a/src/pysubgroup/measures.py b/src/pysubgroup/measures.py index bae33c5..7fd8372 100644 --- a/src/pysubgroup/measures.py +++ b/src/pysubgroup/measures.py @@ -190,6 +190,10 @@ def __hasattr__(self, name): # GeneralizationAware Interestingness Measures ##### class GeneralizationAwareQF(AbstractInterestingnessMeasure): + """A class that computes the generalization aware qf as follows: + qf(sg) = qf(sg) - max_{generalizations} qf(sq) + """ + ga_tuple = namedtuple("ga_tuple", ["subgroup_quality", "generalisation_quality"]) def __init__(self, qf): @@ -241,6 +245,8 @@ def evaluate(self, subgroup, target, data, statistics=None): # GeneralizationAware Interestingness Measures ##### class GeneralizationAwareQF_stats(AbstractInterestingnessMeasure): + """An abstract base class that implements aggregation of stats of generalisations""" + ga_tuple = namedtuple("ga_stats_tuple", ["subgroup_stats", "generalisation_stats"]) def __init__(self, qf): @@ -263,30 +269,34 @@ def calculate_constant_statistics(self, data, target): def calculate_statistics(self, subgroup, target, data, statistics=None): sg_repr = repr(subgroup) if sg_repr in self.cache: - return GeneralizationAwareQF_stats.ga_tuple(*self.cache[sg_repr]) + return self.cache[sg_repr] - (stats_sg, stats_prev) = self.get_stats_and_previous_stats( - subgroup, target, data - ) - self.cache[sg_repr] = (stats_sg, stats_prev) - return GeneralizationAwareQF_stats.ga_tuple(stats_sg, stats_prev) + tpl = self.get_stats_and_previous_stats(subgroup, target, data) + self.cache[sg_repr] = tpl + return tpl def get_stats_and_previous_stats(self, subgroup, target, data): stats_subgroup = self.qf.calculate_statistics(subgroup, target, data) - max_stats = self.stats0 + # pylint: disable=no-member + if len(subgroup.selectors) == 0: + return GeneralizationAwareQF_stats.ga_tuple( + stats_subgroup, self.aggregate_statistics(stats_subgroup, []) + ) + selectors = subgroup.selectors - if len(selectors) > 0: - # compute quality of all generalizations - generalizations = combinations(selectors, len(selectors) - 1) + immediate_generalizations = combinations(selectors, len(selectors) - 1) - for sels in generalizations: - sgd = ps.Conjunction(list(sels)) - (stats_sg, stats_prev) = self.calculate_statistics(sgd, target, data) - max_stats = self.get_max(max_stats, stats_sg, stats_prev) - return (stats_subgroup, max_stats) + list_of_pairs = [] + for sels in immediate_generalizations: + sgd = ps.Conjunction(list(sels)) + list_of_pairs.append(self.calculate_statistics(sgd, target, data)) + agg_stats = self.aggregate_statistics(stats_subgroup, list_of_pairs) + # pylint: enable=no-member + return GeneralizationAwareQF_stats.ga_tuple(stats_subgroup, agg_stats) def evaluate(self, subgroup, target, data, statistics=None): raise NotImplementedError - def get_max(self, *args): - raise NotImplementedError + +# def aggregate_statistics(self, *args): +# raise NotImplementedError diff --git a/src/pysubgroup/numeric_target.py b/src/pysubgroup/numeric_target.py index 62973f4..b1ce680 100644 --- a/src/pysubgroup/numeric_target.py +++ b/src/pysubgroup/numeric_target.py @@ -197,7 +197,9 @@ def evaluate(self, subgroup, target, data, statistics=None): self.read_centroid(statistics), ) - def calculate_statistics(self, subgroup, target, data, statistics=None): + def calculate_statistics( + self, subgroup, target, data, statistics=None + ): # pylint: disable=unused-argument cover_arr, sg_size = ps.get_cover_array_and_size( subgroup, len(self.all_target_values), data ) @@ -218,12 +220,22 @@ def optimistic_estimate(self, subgroup, target, data, statistics=None): return statistics.estimate class Summation_Estimator: + r"""\ + This estimator calculates the optimistic estimate as a hyppothetical subgroup\ + which contains only instances with value greater than the dataset mean and\ + is of maximal size. + .. math:: + oe(sg) = \sum_{x \in sg, T(x)>0} (T(sg) - \mu_0) + + From Florian Lemmerich's Dissertation [section 4.2.2.1, Theorem 2 (page 81)] + """ + def __init__(self, qf): self.qf = qf self.indices_greater_centroid = None self.target_values_greater_centroid = None - def get_data(self, data, target): + def get_data(self, data, target): # pylint: disable=unused-argument return data def calculate_constant_statistics( @@ -251,12 +263,19 @@ def get_estimate( ) class Max_Estimator: + r""" + This estimator calculates the optimistic estimate + .. math:: + oe(sg) = n_{>\mu_0}^a (T^{\max}(sg) - \mu_0) + From Florian Lemmerich's Dissertation [section 4.2.2.1, Theorem 4 (page 82)] + """ + def __init__(self, qf): self.qf = qf self.indices_greater_centroid = None self.target_values_greater_centroid = None - def get_data(self, data, target): + def get_data(self, data, target): # pylint: disable=unused-argument return data def calculate_constant_statistics( @@ -295,7 +314,9 @@ def get_data(self, data, target): data.sort_values(target.get_attributes()[0], ascending=False, inplace=True) return data - def calculate_constant_statistics(self, data, target): + def calculate_constant_statistics( + self, data, target + ): # pylint: disable=unused-argument if not self.use_numba or self.numba_in_place: return try: @@ -402,7 +423,9 @@ def evaluate(self, subgroup, target, data, statistics=None): statistics.std, ) - def calculate_statistics(self, subgroup, target, data, statistics=None): + def calculate_statistics( + self, subgroup, target, data, statistics=None + ): # pylint: disable=unused-argument cover_arr, sg_size = ps.get_cover_array_and_size( subgroup, len(self.all_target_values), data ) @@ -423,35 +446,34 @@ def optimistic_estimate(self, subgroup, target, data, statistics=None): return statistics.estimate -# TODO Update to new format -# class GAStandardQFNumeric(ps.AbstractInterestingnessMeasure): -# def __init__(self, a, invert=False): -# self.a = a -# self.invert = invert -# -# def evaluate_from_dataset(self, data, subgroup, weighting_attribute=None): -# (instances_dataset, _, instances_subgroup, mean_sg) = \ -# subgroup.get_base_statistics(data, weighting_attribute) -# if instances_subgroup in (0, instances_dataset): -# return 0 -# max_mean = get_max_generalization_mean(data, subgroup, weighting_attribute) -# relative_size = (instances_subgroup / instances_dataset) -# return ps.conditional_invert( -# relative_size ** self.a * (mean_sg - max_mean), self.invert) - -# def supports_weights(self): -# return True - -# def is_applicable(self, subgroup): -# return isinstance(subgroup.target, NumericTarget) - - -# def get_max_generalization_mean(data, subgroup, weighting_attribute=None): -# selectors = subgroup.subgroup_description.selectors -# generalizations = ps.powerset(selectors) -# max_mean = 0 -# for sels in generalizations: -# sg = ps.Subgroup(subgroup.target, ps.Conjunction(list(sels))) -# mean_sg = sg.get_base_statistics(data, weighting_attribute)[3] -# max_mean = max(max_mean, mean_sg) -# return max_mean +class GeneralizationAware_StandardQFNumeric(ps.GeneralizationAwareQF_stats): + def __init__(self, a, invert=False, estimator="default", centroid="mean"): + super().__init__( + StandardQFNumeric(a, invert=invert, estimator=estimator, centroid=centroid) + ) + + def evaluate(self, subgroup, target, data, statistics=None): + statistics = self.ensure_statistics(subgroup, target, data, statistics) + sg_stats = statistics.subgroup_stats + general_stats = statistics.generalisation_stats + if sg_stats.size_sg == 0: + return np.nan + read_centroid = self.qf.read_centroid + return (sg_stats.size_sg / self.stats0.size_sg) ** self.qf.a * ( + read_centroid(sg_stats) - read_centroid(general_stats) + ) + + def aggregate_statistics(self, stats_subgroup, list_of_pairs): + read_centroid = self.qf.read_centroid + if len(list_of_pairs) == 0: + return stats_subgroup + max_centroid = 0.0 + max_stats = None + for stat, agg_stat in list_of_pairs: + if stat.size_sg == 0: + continue + centroid = max(read_centroid(agg_stat), read_centroid(stat)) + if centroid > max_centroid: + max_centroid = centroid + max_stats = stat + return max_stats diff --git a/src/pysubgroup/subgroup_description.py b/src/pysubgroup/subgroup_description.py index 622fe02..ebe3cab 100644 --- a/src/pysubgroup/subgroup_description.py +++ b/src/pysubgroup/subgroup_description.py @@ -42,7 +42,7 @@ def __new__(cls, *args, **kwargs): if ref == tmp: return ref # if not return - return tmp + return tmp # pragma: no cover def __getnewargs_ex__(self): # pylint: disable=invalid-getnewargs-ex-returned tmp_args = self.__new_args__ @@ -548,7 +548,7 @@ def _compute_repr(self): if not self._selectors: return "True" reprs = sorted(repr(sel) for sel in self._selectors) - return "".join(("(", " and ".join(reprs), ")")) + return "(" + " and ".join(reprs) + ")" def _compute_hash(self): return hash(repr(self)) diff --git a/src/pysubgroup/utils.py b/src/pysubgroup/utils.py index 9984884..0f023c7 100644 --- a/src/pysubgroup/utils.py +++ b/src/pysubgroup/utils.py @@ -24,7 +24,7 @@ def minimum_required_quality(result, task): def prepare_subgroup_discovery_result(result, task): result_filtered = [tpl for tpl in result if tpl[0] > task.min_quality] - result_filtered.sort(key=lambda x: x[0], reverse=True) + result_filtered.sort(reverse=True) result_filtered = result_filtered[: task.result_set_size] return result_filtered @@ -202,11 +202,13 @@ def intersect_of_ordered_list(list_1, list_2): class BaseTarget: def all_statistics_present(self, cached_statistics): + # pylint: disable=no-member if isinstance(cached_statistics, dict) and all( expected_value in cached_statistics for expected_value in self.__class__.statistic_types - ): # pylint: disable=no-member + ): return True + # pylint: enable=no-member return False diff --git a/tests/algorithms_testing.py b/tests/algorithms_testing.py index 944d5a1..8faf735 100644 --- a/tests/algorithms_testing.py +++ b/tests/algorithms_testing.py @@ -20,7 +20,7 @@ def evaluate_result(self, algorithm_result, result, qualities): algorithm_result, qualities, result ): self.assertEqual(repr(algorithm_SG), repr(expected_SGD)) - self.assertEqual(algorithm_q, expected_q) + self.assertAlmostEqual(algorithm_q, expected_q) def runAlgorithm(self, algorithm, name, result, qualities, task): print() diff --git a/tests/test_algorithms_boolean.py b/tests/test_algorithms_boolean.py index ee35406..6f92244 100644 --- a/tests/test_algorithms_boolean.py +++ b/tests/test_algorithms_boolean.py @@ -22,9 +22,6 @@ class TestSettings: GpGrowth = False -skip_long_running = True - - class BooleanTargetBase: # pylint: disable=no-member @unittest.skipUnless(TestSettings.All or TestSettings.Apriori, "flag not set") diff --git a/tests/test_algorithms_boolean_constraints.py b/tests/test_algorithms_boolean_constraints.py index c45163d..443eba5 100644 --- a/tests/test_algorithms_boolean_constraints.py +++ b/tests/test_algorithms_boolean_constraints.py @@ -21,9 +21,6 @@ class TestSettings: SimpleSearch = False -skip_long_running = True - - class BooleanTargetBase(TestAlgorithmsBase): # pylint: disable=no-member @unittest.skipUnless(TestSettings.All or TestSettings.Apriori, "flag not set") diff --git a/tests/test_algorithms_numeric.py b/tests/test_algorithms_numeric.py index 212c3fa..8b7c404 100644 --- a/tests/test_algorithms_numeric.py +++ b/tests/test_algorithms_numeric.py @@ -3,6 +3,7 @@ import unittest from copy import copy +import pandas as pd from algorithms_testing import TestAlgorithmsBase from t_utils import conjunctions_from_str @@ -58,8 +59,8 @@ def setUp(self): l = conjunctions_from_str( """316646.0 job=='b'high qualif/self emp/mgmt'' 310615.0 foreign_worker=='b'yes'' AND job=='b'high qualif/self emp/mgmt'' - 297844.5 foreign_worker=='b'yes'' AND own_telephone=='b'yes'' AND property_magnitude=='b'no known property'' 297844.5 own_telephone=='b'yes'' AND property_magnitude=='b'no known property'' + 297844.5 foreign_worker=='b'yes'' AND own_telephone=='b'yes'' AND property_magnitude=='b'no known property'' 288480.5 job=='b'high qualif/self emp/mgmt'' AND own_telephone=='b'yes'' 283002.0 own_telephone=='b'yes'' 282217.5 class=='b'bad'' AND own_telephone=='b'yes'' @@ -248,7 +249,7 @@ def test_DFS_average(self): def test_DFS_order_with_numba(self): try: - import numba + import numba # pylint: disable=import-outside-toplevel, unused-import except ImportError: self.skipTest("No numba installed") self.task.qf = ps.StandardQFNumeric(self.task.qf.a, False, "order") @@ -339,9 +340,6 @@ def test_DFSNumeric(self): # self.runAlgorithm(ps.SimpleSearch(), "SimpleSearch", self.result, self.qualities, self.task) -import pandas as pd - - class TestNumericEstimators(unittest.TestCase): def test_estimator1(self): records = [(1, 100), (1, 75), (1, 53), (1, 12), (0, 11), (0, 49)] diff --git a/tests/test_generalisation_aware.py b/tests/test_generalisation_aware.py index cee347f..3f2eb15 100644 --- a/tests/test_generalisation_aware.py +++ b/tests/test_generalisation_aware.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd from algorithms_testing import TestAlgorithmsBase +from t_utils import conjunctions_from_str import pysubgroup as ps from pysubgroup.datasets import get_credit_data @@ -68,7 +69,9 @@ def setUp(self): self.A0 = None self.BD = None TestGeneralisationAwareQf.prepare_df(self) - self.ga_qf = ps.GeneralizationAware_StandardQF(0) + self.ga_qf = ps.GeneralizationAware_StandardQF( + 0, optimistic_estimate_strategy="max" + ) def test_simple(self): target = ps.BinaryTarget("columnC", 1) @@ -89,7 +92,7 @@ def test_simple(self): self.assertEqual(ga_stat.generalisation_stats, ps.SimplePositivesQF.tpl(5, 3)) # Ensure cache works properly - self.assertEqual( + self.assertIs( ga_stat, self.ga_qf.calculate_statistics( ps.Conjunction([self.A1, self.BA]), target, self.df @@ -105,46 +108,31 @@ def test_simple(self): ga_score3 = self.ga_qf.evaluate( ps.Conjunction([self.A0, self.BD]), target, self.df ) - self.assertEqual(ga_score, ga_score2) self.assertAlmostEqual(ga_score, 0.06666666666666) self.assertTrue(np.isnan(ga_score3)) + def test_error(self): + with self.assertRaises(ValueError): + ps.GeneralizationAware_StandardQF(0.5, "blabla") + -class TestAlgorithms(TestAlgorithmsBase, unittest.TestCase): - # TODO properly specify desired result +class TestSimpleGA(TestAlgorithmsBase, unittest.TestCase): def setUp(self): - NS_checking = ps.EqualitySelector("checking_status", b"<0") - NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes") - NS_other_parties = ps.EqualitySelector("other_parties", b"none") - NS_savings_status = ps.EqualitySelector("savings_status", b"<100") - NS_job = ps.EqualitySelector("job", b"skilled") - self.result = [ - ps.Conjunction([NS_checking, NS_foreign_worker]), - ps.Conjunction([NS_checking]), - ps.Conjunction([NS_checking, NS_other_parties, NS_foreign_worker]), - ps.Conjunction([NS_checking, NS_other_parties]), - ps.Conjunction([NS_checking, NS_savings_status, NS_foreign_worker]), - ps.Conjunction([NS_checking, NS_savings_status]), - ps.Conjunction( - [NS_checking, NS_savings_status, NS_other_parties, NS_foreign_worker] - ), - ps.Conjunction([NS_checking, NS_job, NS_foreign_worker]), - ps.Conjunction([NS_checking, NS_savings_status, NS_other_parties]), - ps.Conjunction([NS_checking, NS_job]), - ] - self.qualities = [ - 0.055299999999999995, - 0.05280000000000001, - 0.052300000000000006, - 0.05059999999999999, - 0.04959999999999999, - 0.048299999999999996, - 0.04660000000000001, - 0.04550000000000001, - 0.0452, - 0.044399999999999995, - ] + conj_list = conjunctions_from_str( + """0.05280000000000001 checking_status=='b'<0'' + 0.03610000000000002 savings_status=='b'<100'' + 0.0243 checking_status=='b'0<=X<200'' + 0.0208 property_magnitude=='b'no known property'' + 0.0188 purpose=='b'new car'' + 0.0184 employment=='b'<1'' + 0.0163 housing=='b'rent'' + 0.016000000000000007 personal_status=='b'female div/dep/mar'' + 0.015300000000000003 other_payment_plans=='b'bank'' + 0.0133 credit_history=='b'all paid''""" + ) + self.result = [conjunction for quality, conjunction in conj_list] + self.qualities = [quality for quality, conjunction in conj_list] data = get_credit_data() target = ps.BinaryTarget("class", b"bad") searchSpace = ps.create_nominal_selectors(data, ignore=["class"]) @@ -157,20 +145,222 @@ def setUp(self): qf=ps.GeneralizationAwareQF(ps.StandardQF(1.0)), ) - @unittest.skip def test_GA_SimpleDFS(self): self.runAlgorithm( ps.SimpleDFS(), "SimpleDFS", self.result, self.qualities, self.task ) - @unittest.skip - def test_StandardQF_GA_SimpleDFS(self): - self.task.qf = ps.GeneralizationAware_StandardQF(0.5) + +class TestGeneralizationAware_StandardQF_a05(TestAlgorithmsBase, unittest.TestCase): + def get_a(self): + return 0.5 + + def get_output_str(self): + return """0.10086921502691486 checking_status=='b'<0'' + 0.065 credit_history=='b'no credits/all paid'' + 0.0600832755431992 credit_history=='b'all paid'' + 0.05300330790951305 property_magnitude=='b'no known property'' + 0.046852215652241715 checking_status=='b'0<=X<200'' + 0.046488822458723086 savings_status=='b'<100'' + 0.04436633963967792 employment=='b'<1'' + 0.043698221718866 other_payment_plans=='b'bank'' AND purpose=='b'new car'' + 0.04183300132670378 housing=='b'rent'' AND property_magnitude=='b'no known property'' AND savings_status=='b'<100'' + 0.04103779623011525 other_payment_plans=='b'bank''""" # noqa: 501 + + def setUp(self): + conj_list = conjunctions_from_str(self.get_output_str()) + self.result = [conjunction for quality, conjunction in conj_list] + self.qualities = [quality for quality, conjunction in conj_list] + + data = get_credit_data() + target = ps.BinaryTarget("class", b"bad") + searchSpace = ps.create_nominal_selectors(data, ignore=["class"]) + self.task = ps.SubgroupDiscoveryTask( + data, + target, + searchSpace, + result_set_size=10, + depth=3, + qf=ps.GeneralizationAware_StandardQF(self.get_a()), + ) + + def test_SimpleDFS(self): + self.task.qf = ps.GeneralizationAware_StandardQF(self.get_a()) self.runAlgorithm( - ps.SimpleDFS(), "Standard_SimpleDFS", self.result, self.qualities, self.task + ps.SimpleDFS(), + f"StandardQF_SimpleDFS, a={self.get_a()}", + self.result, + self.qualities, + self.task, ) - print(self.task.qf.cache) + + def test_Apriori_diff(self): + self.task.qf = ps.GeneralizationAware_StandardQF( + self.get_a(), optimistic_estimate_strategy="difference" + ) + apriori = ps.Apriori() + apriori.use_vectorization = False + self.runAlgorithm( + apriori, + f"StandardQF_Apriori diff, a={self.get_a()}", + self.result, + self.qualities, + self.task, + ) + + def test_Apriori_max(self): + self.task.qf = ps.GeneralizationAware_StandardQF( + self.get_a(), optimistic_estimate_strategy="max" + ) + apriori = ps.Apriori() + apriori.use_vectorization = False + self.runAlgorithm( + apriori, + f"StandardQF_Apriori, max, a={self.get_a()}", + self.result, + self.qualities, + self.task, + ) + + +class TestGeneralizationAware_StandardQF_a(TestGeneralizationAware_StandardQF_a05): + def get_a(self): + return 1 + + def get_output_str(self): + return """ 0.05280000000000001 checking_status=='b'<0'' + 0.03610000000000002 savings_status=='b'<100'' + 0.0243 checking_status=='b'0<=X<200'' + 0.0208 property_magnitude=='b'no known property'' + 0.0188 purpose=='b'new car'' + 0.0184 employment=='b'<1'' + 0.0163 housing=='b'rent'' + 0.016000000000000007 personal_status=='b'female div/dep/mar'' + 0.015300000000000003 other_payment_plans=='b'bank'' + 0.0133 credit_history=='b'all paid''""" + + +class TestGeneralizationAware_StandardQF_a0_d2(TestGeneralizationAware_StandardQF_a05): + def get_a(self): + return 0 + + def get_output_str(self): + return """ 0.6795580110497237 job=='b'unemp/unskilled non res'' AND purpose=='b'furniture/equipment'' + 0.6666666666666667 purpose=='b'domestic appliance'' AND savings_status=='b'100<=X<500'' + 0.6666666666666667 personal_status=='b'male mar/wid'' AND purpose=='b'domestic appliance'' + 0.6666666666666667 job=='b'unskilled resident'' AND purpose=='b'domestic appliance'' + 0.6554054054054055 foreign_worker=='b'no'' AND job=='b'high qualif/self emp/mgmt'' + 0.6363636363636364 purpose=='b'repairs'' AND savings_status=='b'>=1000'' + 0.6290322580645161 employment=='b'unemployed'' AND purpose=='b'domestic appliance'' + 0.6089385474860336 housing=='b'rent'' AND purpose=='b'retraining'' + 0.6 foreign_worker=='b'no'' AND personal_status=='b'male div/sep'' + 0.5957446808510638 other_payment_plans=='b'stores'' AND purpose=='b'repairs''""" # noqa: 501 + + def setUp(self): + conj_list = conjunctions_from_str(self.get_output_str()) + self.result = [conjunction for quality, conjunction in conj_list] + self.qualities = [quality for quality, conjunction in conj_list] + + data = get_credit_data() + target = ps.BinaryTarget("class", b"bad") + searchSpace = ps.create_nominal_selectors(data, ignore=["class"]) + self.task = ps.SubgroupDiscoveryTask( + data, + target, + searchSpace, + result_set_size=10, + depth=2, + qf=ps.GeneralizationAware_StandardQF(self.get_a()), + ) + + +class TestGeneralizationAware_StandardQF_a0(TestGeneralizationAware_StandardQF_a05): + def get_a(self): + return 0 + + def get_output_str(self): + return """ 0.7 job=='b'unskilled resident'' AND own_telephone=='b'yes'' AND personal_status=='b'male mar/wid'' + 0.7 foreign_worker=='b'no'' AND other_parties=='b'guarantor'' AND personal_status=='b'male mar/wid'' + 0.7 employment=='b'>=7'' AND job=='b'unskilled resident'' AND purpose=='b'used car'' + 0.7 credit_history=='b'critical/other existing credit'' AND job=='b'unskilled resident'' AND purpose=='b'used car'' + 0.7 checking_status=='b'>=200'' AND personal_status=='b'male mar/wid'' AND savings_status=='b'500<=X<1000'' + 0.7 checking_status=='b'>=200'' AND own_telephone=='b'yes'' AND personal_status=='b'male mar/wid'' + 0.7 checking_status=='b'>=200'' AND credit_history=='b'critical/other existing credit'' AND personal_status=='b'male mar/wid'' + 0.6939655172413793 other_parties=='b'guarantor'' AND property_magnitude=='b'life insurance'' AND savings_status=='b'no known savings'' + 0.6939655172413793 foreign_worker=='b'no'' AND other_parties=='b'guarantor'' AND property_magnitude=='b'life insurance'' + 0.6818181818181819 credit_history=='b'delayed previously'' AND property_magnitude=='b'real estate'' AND savings_status=='b'500<=X<1000''""" # noqa: 501 + + def setUp(self): + conj_list = conjunctions_from_str(self.get_output_str()) + self.result = [conjunction for quality, conjunction in conj_list] + self.qualities = [quality for quality, conjunction in conj_list] + + data = get_credit_data() + target = ps.BinaryTarget("class", b"bad") + searchSpace = ps.create_nominal_selectors(data, ignore=["class"]) + self.task = ps.SubgroupDiscoveryTask( + data, + target, + searchSpace, + result_set_size=10, + depth=3, + qf=ps.GeneralizationAware_StandardQF(self.get_a()), + ) + + +class TestGeneralizationAware_StandardQFNumeric(TestAlgorithmsBase, unittest.TestCase): + def setUp(self): + conj_list = conjunctions_from_str( + """ 832.5979220717699 job=='b'high qualif/self emp/mgmt'' + 673.6338022041458 purpose=='b'used car'' + 645.953015714855 property_magnitude=='b'no known property'' + 603.3209078187183 own_telephone=='b'yes'' + 576.235405327832 class=='b'bad'' AND own_telephone=='b'yes'' + 540.9390501453018 purpose=='b'other'' + 537.3010282319029 housing=='b'for free'' + 440.3787869550485 checking_status=='b'0<=X<200'' AND foreign_worker=='b'no'' AND property_magnitude=='b'life insurance'' + 407.6428886169854 checking_status=='b'0<=X<200'' AND foreign_worker=='b'no'' AND other_payment_plans=='b'bank'' + 406.8834000000001 credit_history=='b'no credits/all paid'' """ # noqa: 501 + ) + self.result = [conjunction for quality, conjunction in conj_list] + self.qualities = [quality for quality, conjunction in conj_list] + + data = get_credit_data() + target = ps.NumericTarget("credit_amount") + searchSpace_Nominal = ps.create_nominal_selectors( + data, ignore=["credit_amount"] + ) + searchSpace_Numeric = ( + [] + ) # ps.create_numeric_selectors(data, ignore=['credit_amount'], nbins=10) + searchSpace = searchSpace_Nominal + searchSpace_Numeric + self.task = ps.SubgroupDiscoveryTask( + data, + target, + searchSpace, + result_set_size=10, + depth=3, + qf=ps.GeneralizationAware_StandardQFNumeric(1, False, centroid="mean"), + ) + + def test_SimpleDFS(self): + self.task.qf = ps.GeneralizationAware_StandardQFNumeric(0.5) + self.runAlgorithm( + ps.SimpleDFS(), + "Numeric StandardQF_SimpleDFS", + self.result, + self.qualities, + self.task, + ) + + # def test_DFS(self): + # self.task.qf = ps.GeneralizationAware_StandardQFNumeric(0.5) + # apriori = ps.Apriori() + # apriori.use_vectorization = False + # self.runAlgorithm( + # apriori, "StandardQF_Apriori", self.result, self.qualities, self.task + # ) if __name__ == "__main__": - unittest.main() + unittest.main(module="test_generalisation_aware") diff --git a/tests/test_generalisations.py b/tests/test_generalisations.py index 6e41273..9c253c6 100644 --- a/tests/test_generalisations.py +++ b/tests/test_generalisations.py @@ -1,16 +1,15 @@ import unittest +import pytest from algorithms_testing import TestAlgorithmsBase import pysubgroup as ps from pysubgroup.datasets import get_credit_data -skip_long_running = True - class BooleanTargetBase(TestAlgorithmsBase): # pylint: disable=no-member - @unittest.skipIf(skip_long_running, "as skip_long_running flag is True") + @pytest.mark.slow def test_GeneralisingBFS(self): self.runAlgorithm( ps.GeneralisingBFS(), diff --git a/tests/test_representations.py b/tests/test_representations.py index 82d358e..5a09bce 100644 --- a/tests/test_representations.py +++ b/tests/test_representations.py @@ -72,65 +72,45 @@ def test_Set(self): with ps.SetRepresentation( self.df, [self.A1, self.A0, self.BA, self.BC, self.CA, self.CNan] ) as representation: - self.assertEqual( - self.A1.representation, {2, 3, 6, 7, 8, 9} - ) # pylint: disable=no-member - self.assertEqual( - self.A0.representation, {0, 1, 4, 5} - ) # pylint: disable=no-member + # pylint: disable=no-member + self.assertEqual(self.A1.representation, {2, 3, 6, 7, 8, 9}) + self.assertEqual(self.A0.representation, {0, 1, 4, 5}) - self.assertEqual( - self.BA.representation, {0, 5, 7, 8, 9} - ) # pylint: disable=no-member - self.assertEqual( - self.BC.representation, {2, 3} - ) # pylint: disable=no-member + self.assertEqual(self.BA.representation, {0, 5, 7, 8, 9}) + self.assertEqual(self.BC.representation, {2, 3}) - self.assertEqual( - self.CA.representation, {2, 3} - ) # pylint: disable=no-member - self.assertEqual( - self.CNan.representation, {0, 1} - ) # pylint: disable=no-member + self.assertEqual(self.CA.representation, {2, 3}) + self.assertEqual(self.CNan.representation, {0, 1}) self.assertEqual( representation.Conjunction([self.BA, self.CNan]).representation, {0} - ) # pylint: disable=no-member + ) self.assertEqual( representation.Conjunction([self.A0, self.CNan]).representation, {0, 1} - ) # pylint: disable=no-member + ) + # pylint: enable=no-member def test_NumpySet(self): with ps.NumpySetRepresentation( self.df, [self.A1, self.A0, self.BA, self.BC, self.CA, self.CNan] ) as representation: - np.testing.assert_array_equal( - self.A1.representation, [2, 3, 6, 7, 8, 9] - ) # pylint: disable=no-member - np.testing.assert_array_equal( - self.A0.representation, [0, 1, 4, 5] - ) # pylint: disable=no-member + # pylint: disable=no-member + np.testing.assert_array_equal(self.A1.representation, [2, 3, 6, 7, 8, 9]) + np.testing.assert_array_equal(self.A0.representation, [0, 1, 4, 5]) - np.testing.assert_array_equal( - self.BA.representation, [0, 5, 7, 8, 9] - ) # pylint: disable=no-member - np.testing.assert_array_equal( - self.BC.representation, [2, 3] - ) # pylint: disable=no-member + np.testing.assert_array_equal(self.BA.representation, [0, 5, 7, 8, 9]) + np.testing.assert_array_equal(self.BC.representation, [2, 3]) - np.testing.assert_array_equal( - self.CA.representation, [2, 3] - ) # pylint: disable=no-member - np.testing.assert_array_equal( - self.CNan.representation, [0, 1] - ) # pylint: disable=no-member + np.testing.assert_array_equal(self.CA.representation, [2, 3]) + np.testing.assert_array_equal(self.CNan.representation, [0, 1]) np.testing.assert_array_equal( representation.Conjunction([self.BA, self.CNan]).representation, [0] - ) # pylint: disable=no-member + ) np.testing.assert_array_equal( representation.Conjunction([self.A0, self.CNan]).representation, [0, 1] - ) # pylint: disable=no-member + ) + # pylint: enable=no-member if __name__ == "__main__":