Improved Generalization Aware Qualities (#52)
Greatly improved generalization aware qualities
+ cleanup
Feelx234 authored Sep 19, 2023
1 parent 63ccae2 commit 0a31243
Showing 16 changed files with 513 additions and 195 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -10,7 +10,7 @@ test_gp_model.txt
 program.prof
 import.log
 doc/_build
-Untitled.ipynb
+
 
 
 # Temporary and binary files
@@ -69,3 +69,7 @@ MANIFEST
 .venv*/
 .conda*/
 .python-version
+
+
+Untitled1.ipynb
+Untitled.ipynb
7 changes: 3 additions & 4 deletions docs/sections/components/gp_growth.rst
@@ -23,12 +23,11 @@ The basic usage of the gp-growth algorithm is not very different from the usage
     from pysubgroup.datasets import get_titanic_data
     data = get_titanic_data()
-    target = ps.NominalSelector ('Survived', True)
+    target = ps.BinaryTarget ('Survived', True)
     searchspace = ps.create_selectors(data, ignore=['Survived'])
-    task = ps.SubgroupDiscoveryTask (data, target, dearchspace, result_set_size=5, depth=2, qf=ps.WRAccQF())
-    GpGrowth.execute(task)
+    task = ps.SubgroupDiscoveryTask (data, target, searchspace, result_set_size=5, depth=2, qf=ps.WRAccQF())
+    result = ps.GpGrowth().execute(task)
 But beware that gp-growth uses an exhaustive search strategy! This can greatly increase the runtime for high search depths.
 You can specify the :code:`mode` argument in the constructor of GpGrowth to run gp-growth either bottom up (:code:`mode='b_u'`) or top down (:code:`mode='t_d'`).
 As gp-growth is a generalisation of fp-growth, you can also perform standard fp-growth with gp_growth by using the CountQF (:ref:`countqf`) quality function.
 
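For example, to run gp-growth as plain fp-growth (frequent-pattern counting only), swap in CountQF; a minimal sketch, assuming :code:`CountQF` and the :code:`mode` keyword are exported as described above:

    import pysubgroup as ps
    from pysubgroup.datasets import get_titanic_data

    data = get_titanic_data()
    target = ps.BinaryTarget('Survived', True)
    searchspace = ps.create_selectors(data, ignore=['Survived'])
    # CountQF reduces gp-growth to standard fp-growth (pattern counting)
    task = ps.SubgroupDiscoveryTask(data, target, searchspace,
                                    result_set_size=5, depth=2, qf=ps.CountQF())
    result = ps.GpGrowth(mode='b_u').execute(task)  # mode='t_d' for top down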
4 changes: 2 additions & 2 deletions setup.cfg
@@ -110,8 +110,8 @@ norecursedirs =
     .tox
 testpaths = tests
 # Use pytest markers to select/deselect specific tests
-# markers =
-#     slow: mark tests as slow (deselect with '-m "not slow"')
+markers =
+    slow: mark tests as slow (deselect with '-m "not slow"')
 # system: mark end-to-end system tests
 
 [devpi:upload]
42 changes: 25 additions & 17 deletions src/pysubgroup/algorithms.py
@@ -55,6 +55,27 @@ def constraints_satisfied(constraints, subgroup, statistics=None, data=None):
     )
 
 
+try:  # pragma: no cover
+    from numba import (  # pylint: disable=import-error, import-outside-toplevel
+        int32,
+        int64,
+        njit,
+    )
+
+    @njit([(int32[:, :], int64[:])], cache=True)
+    def getNewCandidates(candidates, hashes):  # pragma: no cover
+        result = []
+        for i in range(len(candidates) - 1):
+            for j in range(i + 1, len(candidates)):
+                if hashes[i] == hashes[j]:
+                    if np.all(candidates[i, :-1] == candidates[j, :-1]):
+                        result.append((i, j))
+        return result
+
+except ImportError:  # pragma: no cover
+    pass
+
+
 class Apriori:
     def __init__(
         self, representation_type=None, combination_name="Conjunction", use_numba=True
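The kernel above implements the classic Apriori join step: two size-k candidates, stored as rows of selector indices, combine into a size-(k+1) candidate exactly when they agree on their first k-1 entries, and the precomputed row hashes serve as a cheap necessary condition checked before the full prefix comparison. A pure-Python sketch of the same step (an illustration with hypothetical inputs, not the fallback pysubgroup actually uses when numba is absent):

    import numpy as np

    def get_new_candidates_py(candidates, hashes):
        """Return index pairs (i, j) of candidate rows sharing their prefix."""
        result = []
        for i in range(len(candidates) - 1):
            for j in range(i + 1, len(candidates)):
                # equal hashes are necessary; the array comparison decides
                if hashes[i] == hashes[j] and np.all(
                    candidates[i, :-1] == candidates[j, :-1]
                ):
                    result.append((i, j))
        return result

    # rows (0, 1) and (0, 2) share the prefix (0,) and can merge to (0, 1, 2)
    cands = np.array([[0, 1], [0, 2], [1, 2]], dtype=np.int32)
    pre_hash = np.array([hash((0,)), hash((0,)), hash((1,))], dtype=np.int64)
    print(get_new_candidates_py(cands, pre_hash))  # [(0, 1)]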
@@ -133,20 +154,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
         return promising_candidates
 
     def get_next_level_numba(self, promising_candidates):
-        from numba import njit  # pylint: disable=import-error, import-outside-toplevel
-
         if not hasattr(self, "compiled_func") or self.compiled_func is None:
-
-            @njit
-            def getNewCandidates(candidates, hashes):  # pragma: no cover
-                result = []
-                for i in range(len(candidates) - 1):
-                    for j in range(i + 1, len(candidates)):
-                        if hashes[i] == hashes[j]:
-                            if np.all(candidates[i, :-1] == candidates[j, :-1]):
-                                result.append((i, j))
-                return result
-
             self.compiled_func = getNewCandidates
 
         all_selectors = Counter(chain.from_iterable(promising_candidates))
@@ -182,7 +190,9 @@ def execute(self, task):
         if not isinstance(
             task.qf, ps.BoundedInterestingnessMeasure
         ):  # pragma: no cover
-            raise RuntimeWarning("Quality function is unbounded, long runtime expected")
+            warnings.warn(
+                "Quality function is unbounded, long runtime expected", RuntimeWarning
+            )
 
         task.qf.calculate_constant_statistics(task.data, task.target)
 
@@ -302,7 +312,7 @@ def execute(self, task):
 
             sg = candidate_description
             statistics = task.qf.calculate_statistics(sg, task.target, task.data)
-            quality = task.qf.evaluate(sg, statistics)
+            quality = task.qf.evaluate(sg, task.target, task.data, statistics)
             ps.add_if_required(result, sg, quality, task, statistics=statistics)
 
             qual = ps.minimum_required_quality(result, task)
@@ -336,8 +346,6 @@ def execute(self, task):
                 self.discarded[len(candidate_description)] += 1
 
         result.sort(key=lambda x: x[0], reverse=True)
-        for qual, sg in result:
-            print(f"{qual} {sg}")
         print("discarded " + str(self.discarded))
         return ps.SubgroupDiscoveryResult(result, task)
 
142 changes: 127 additions & 15 deletions src/pysubgroup/binary_target.py
@@ -455,31 +455,143 @@ def __init__(self):
 #####
 # GeneralizationAware Interestingness Measures
 #####
-class GeneralizationAware_StandardQF(GeneralizationAwareQF_stats):
-    def __init__(self, a):
-        super().__init__(StandardQF(0))
+class GeneralizationAware_StandardQF(
+    GeneralizationAwareQF_stats, BoundedInterestingnessMeasure
+):
+    ga_sQF_agg_tuple = namedtuple(
+        "ga_sQF_agg_tuple", ["max_p", "min_delta_negatives", "min_negatives"]
+    )
+
+    def __init__(self, a, optimistic_estimate_strategy="default"):
+        super().__init__(StandardQF(a))
+        if optimistic_estimate_strategy in ("default", "difference"):
+            self.optimistic_estimate = self.difference_based_optimistic_estimate
+            self.aggregate_statistics = self.difference_based_agg_function
+            self.read_p = self.difference_based_read_p
+        elif optimistic_estimate_strategy == "max":
+            self.optimistic_estimate = self.max_based_optimistic_estimate
+            self.aggregate_statistics = self.max_based_aggregate_statistics
+            self.read_p = self.max_based_read_p
+        else:
+            raise ValueError(
+                "optimistic_estimate_strategy should be one of "
+                "('default', 'max', 'difference')"
+            )
         self.a = a
 
-    def get_max(self, *args):
-        max_ratio = 0.0
+    def evaluate(self, subgroup, target, data, statistics=None):
+        statistics = self.ensure_statistics(subgroup, target, data, statistics)
+        sg_stats = statistics.subgroup_stats
+        general_stats = statistics.generalisation_stats
+        if sg_stats.size_sg == 0:
+            return np.nan
+        sg_ratio = sg_stats.positives_count / sg_stats.size_sg
+        return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * (
+            sg_ratio - self.read_p(general_stats)
+        )
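In words: the subgroup's positive share is discounted by the best share among its generalizations, weighted by its relative size raised to the power a. A quick numeric check with hypothetical values (not taken from the diff):

    # hypothetical: dataset N=100, subgroup n=25 with 20 positives,
    # best generalization share read_p = 0.6, size exponent a = 0.5
    a, N, n, pos, read_p = 0.5, 100, 25, 20, 0.6
    quality = (n / N) ** a * (pos / n - read_p)  # 0.5 * (0.8 - 0.6) = 0.1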

+    def max_based_aggregate_statistics(self, stats_subgroup, list_of_pairs):
+        if len(list_of_pairs) == 0:
+            return stats_subgroup
+        max_ratio = -100
         max_stats = None
-        for stat in args:
-            assert stat.size_sg > 0
-            ratio = stat.positives_count / stat.size_sg
-            if ratio > max_ratio:
-                max_ratio = ratio
-                max_stats = stat
+        for pair in list_of_pairs:
+            ratio = -np.inf
+            for agg_stat in pair:
+                if agg_stat.size_sg == 0:  # pragma: no cover
+                    continue
+                ratio = agg_stat.positives_count / agg_stat.size_sg
+                if ratio > max_ratio:
+                    max_ratio = ratio
+                    max_stats = agg_stat
 
         return max_stats
 
-    def evaluate(self, subgroup, target, data, statistics=None):
+    def max_based_optimistic_estimate(self, subgroup, target, data, statistics=None):
+        """
+        Computes the oe as the hypothetical subgroup containing only positive instances
+        """
         statistics = self.ensure_statistics(subgroup, target, data, statistics)
         sg_stats = statistics.subgroup_stats
         general_stats = statistics.generalisation_stats
         if sg_stats.size_sg == 0 or general_stats.size_sg == 0:
             return np.nan
 
-        sg_ratio = sg_stats.positives_count / sg_stats.size_sg
         general_ratio = general_stats.positives_count / general_stats.size_sg
-        return (sg_stats.size_sg / self.stats0.size_sg) ** self.a * (
-            sg_ratio - general_ratio
+        return (sg_stats.positives_count / self.stats0.size_sg) ** self.a * (
+            1 - general_ratio
         )
+
+    def max_based_read_p(self, agg_tuple):
+        return agg_tuple.positives_count / agg_tuple.size_sg
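The docstring's "hypothetical subgroup" is the refinement that keeps every positive instance and drops every negative: its size equals positives_count, its own share is 1, and the bound still subtracts the best generalization share. Continuing the hypothetical numbers from above:

    # 20 positives, N=100, a=0.5, general_ratio=0.6
    a, N, pos, general_ratio = 0.5, 100, 20, 0.6
    oe = (pos / N) ** a * (1 - general_ratio)  # ~0.447 * 0.4 ~= 0.179 >= 0.1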

+    def difference_based_optimistic_estimate(self, subgroup, target, data, statistics):
+        sg_stats, agg_stats = self.ensure_statistics(subgroup, target, data, statistics)
+        if np.isposinf(agg_stats.min_delta_negatives):
+            return np.inf
+        delta_n = agg_stats.min_delta_negatives
+        size_dataset = self.qf.dataset_statistics.size_sg
+        tau_diff = 0
+        if self.qf.a == 0:
+            pos = 1
+            # return delta_n / (1 + delta_n)
+        elif self.qf.a == 1.0:
+            pos = sg_stats.positives_count
+            # return pos / size_dataset * delta_n / (pos + delta_n)
+        else:
+            a = self.qf.a
+            p_hat = min(np.ceil(a * delta_n / (1 - a)), sg_stats.positives_count)
+            pos = p_hat
+            # return (p_hat / size_dataset) ** a * delta_n / (p_hat + delta_n)
+        tau_diff = pos / (pos + delta_n)
+        if sg_stats.size_sg > 0:
+            tau_sg = sg_stats.positives_count / sg_stats.size_sg
+        else:
+            tau_sg = -1
+        tau_max = max(tau_diff, tau_sg, agg_stats.max_p)
+        return (sg_stats.positives_count / size_dataset) ** self.a * (1 - tau_max)
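The difference-based bound reasons about negatives instead: by the aggregation in difference_based_agg_function below, every generalization of a refinement carries at least min_delta_negatives more negatives than the refinement itself, so a refinement keeping p positives and no negatives can see a generalization share of at most p / (p + delta_n); the commented-out returns give the per-case closed forms, and for 0 < a < 1 that trade-off peaks near p_hat = ceil(a * delta_n / (1 - a)). A numeric sketch with hypothetical values:

    import numpy as np

    # hypothetical: a=0.5, delta_n=4, subgroup has 20 positives in N=100,
    # own share tau_sg=0.8, best generalization share max_p=0.6
    a, delta_n, pos_sg, N, tau_sg, max_p = 0.5, 4, 20, 100, 0.8, 0.6
    p_hat = min(np.ceil(a * delta_n / (1 - a)), pos_sg)  # ceil(4.0) = 4
    tau_diff = p_hat / (p_hat + delta_n)                 # 4 / 8 = 0.5
    tau_max = max(tau_diff, tau_sg, max_p)               # 0.8
    oe = (pos_sg / N) ** a * (1 - tau_max)               # ~0.447 * 0.2 ~= 0.089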

+    def difference_based_agg_function(self, stats_subgroup, list_of_pairs):
+        """
+        list_of_pairs is a list of (stats, agg_tuple) for all the generalizations
+        """
+
+        def get_negatives_count(sg_stats):
+            return sg_stats.size_sg - sg_stats.positives_count
+
+        def get_percentage_positives(sg_stats):
+            if sg_stats.size_sg == 0:
+                return np.nan
+            return sg_stats.positives_count / sg_stats.size_sg
+
+        if len(list_of_pairs) == 0:  # empty pattern
+            return GeneralizationAware_StandardQF.ga_sQF_agg_tuple(
+                get_percentage_positives(stats_subgroup), np.infty, np.infty
+            )
+
+        subgroup_negatives = stats_subgroup.size_sg - stats_subgroup.positives_count
+        min_immediate_generalizations_negatives = min(
+            get_negatives_count(x.subgroup_stats) for x in list_of_pairs
+        )
+        min_immediate_generalizations_delta_negatives = min(
+            x.generalisation_stats.min_delta_negatives for x in list_of_pairs
+        )
+        max_percentage_positives = max(
+            max(
+                get_percentage_positives(x.subgroup_stats), x.generalisation_stats.max_p
+            )
+            for x in list_of_pairs
+        )
+
+        sg_delta_negatives = (
+            min_immediate_generalizations_negatives - subgroup_negatives
+        )
+        min_delta_negatives = min(
+            sg_delta_negatives, min_immediate_generalizations_delta_negatives
+        )
+        return GeneralizationAware_StandardQF.ga_sQF_agg_tuple(
+            max_percentage_positives, min_delta_negatives, sg_delta_negatives
+        )
+
+    def difference_based_read_p(self, agg_tuple):
+        return agg_tuple.max_p
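Both strategies feed the same evaluate through read_p; only the optimistic estimate, and hence pruning, differs. A construction sketch, assuming the class is exported at package level like the other pysubgroup measures:

    import pysubgroup as ps

    qf_diff = ps.GeneralizationAware_StandardQF(0.5)  # 'default' == 'difference'
    qf_max = ps.GeneralizationAware_StandardQF(
        0.5, optimistic_estimate_strategy='max')
    # both are BoundedInterestingnessMeasure, so Apriori can prune with them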
44 changes: 27 additions & 17 deletions src/pysubgroup/measures.py
@@ -190,6 +190,10 @@ def __hasattr__(self, name):
 # GeneralizationAware Interestingness Measures
 #####
 class GeneralizationAwareQF(AbstractInterestingnessMeasure):
+    """A class that computes the generalization-aware quality as:
+    ga_qf(sg) = qf(sg) - max_{g in generalizations(sg)} qf(g)
+    """
+
     ga_tuple = namedtuple("ga_tuple", ["subgroup_quality", "generalisation_quality"])
 
     def __init__(self, qf):
@@ -241,6 +245,8 @@ def evaluate(self, subgroup, target, data, statistics=None):
 # GeneralizationAware Interestingness Measures
 #####
 class GeneralizationAwareQF_stats(AbstractInterestingnessMeasure):
+    """An abstract base class that implements aggregation of stats of generalisations"""
+
     ga_tuple = namedtuple("ga_stats_tuple", ["subgroup_stats", "generalisation_stats"])
 
     def __init__(self, qf):
@@ -263,30 +269,34 @@ def calculate_constant_statistics(self, data, target):
     def calculate_statistics(self, subgroup, target, data, statistics=None):
         sg_repr = repr(subgroup)
         if sg_repr in self.cache:
-            return GeneralizationAwareQF_stats.ga_tuple(*self.cache[sg_repr])
+            return self.cache[sg_repr]
 
-        (stats_sg, stats_prev) = self.get_stats_and_previous_stats(
-            subgroup, target, data
-        )
-        self.cache[sg_repr] = (stats_sg, stats_prev)
-        return GeneralizationAwareQF_stats.ga_tuple(stats_sg, stats_prev)
+        tpl = self.get_stats_and_previous_stats(subgroup, target, data)
+        self.cache[sg_repr] = tpl
+        return tpl
 
     def get_stats_and_previous_stats(self, subgroup, target, data):
         stats_subgroup = self.qf.calculate_statistics(subgroup, target, data)
-        max_stats = self.stats0
+        # pylint: disable=no-member
+        if len(subgroup.selectors) == 0:
+            return GeneralizationAwareQF_stats.ga_tuple(
+                stats_subgroup, self.aggregate_statistics(stats_subgroup, [])
+            )
+
         selectors = subgroup.selectors
-        if len(selectors) > 0:
-            # compute quality of all generalizations
-            generalizations = combinations(selectors, len(selectors) - 1)
+        # compute quality of all generalizations
+        immediate_generalizations = combinations(selectors, len(selectors) - 1)
 
-            for sels in generalizations:
-                sgd = ps.Conjunction(list(sels))
-                (stats_sg, stats_prev) = self.calculate_statistics(sgd, target, data)
-                max_stats = self.get_max(max_stats, stats_sg, stats_prev)
-        return (stats_subgroup, max_stats)
+        list_of_pairs = []
+        for sels in immediate_generalizations:
+            sgd = ps.Conjunction(list(sels))
+            list_of_pairs.append(self.calculate_statistics(sgd, target, data))
+        agg_stats = self.aggregate_statistics(stats_subgroup, list_of_pairs)
+        # pylint: enable=no-member
+        return GeneralizationAwareQF_stats.ga_tuple(stats_subgroup, agg_stats)
 
     def evaluate(self, subgroup, target, data, statistics=None):
         raise NotImplementedError
 
-    def get_max(self, *args):
-        raise NotImplementedError
+    # def aggregate_statistics(self, *args):
+    #     raise NotImplementedError
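As a usage illustration of the wrapper semantics (a hedged sketch; it assumes GeneralizationAwareQF, StandardQF, and SimpleSearch are exported at package level as elsewhere in pysubgroup):

    import pysubgroup as ps
    from pysubgroup.datasets import get_titanic_data

    data = get_titanic_data()
    target = ps.BinaryTarget('Survived', True)
    # wraps a base quality function; evaluate() then reports qf(sg) minus the
    # best quality among sg's generalizations, cached per subgroup repr
    ga_qf = ps.GeneralizationAwareQF(ps.StandardQF(0.5))
    task = ps.SubgroupDiscoveryTask(
        data, target, ps.create_selectors(data, ignore=['Survived']),
        result_set_size=5, depth=2, qf=ga_qf)
    result = ps.SimpleSearch().execute(task)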
