Commit
Merge branch 'master' of github.com:flemmerich/pysubgroup
mgbckr committed Jul 8, 2024
2 parents 82b63a5 + 1c8212e commit 1d22844
Showing 3 changed files with 73 additions and 19 deletions.
33 changes: 23 additions & 10 deletions src/pysubgroup/algorithms.py
@@ -89,14 +89,13 @@ def __init__(
self.optimistic_estimate_name = "optimistic_estimate"
self.next_level = self.get_next_level
self.compiled_func = None
if use_numba:
if use_numba: # pragma: no cover
try:
# TODO: used?
import numba # pylint: disable=unused-import, import-outside-toplevel # noqa: F401, E501

self.next_level = self.get_next_level_numba
print("Apriori: Using numba for speedup")
except ImportError: # pragma: no cover
except ImportError:
pass

def get_next_level_candidates(self, task, result, next_level_candidates):
@@ -133,6 +132,9 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
promising_candidates = []
statistics = []
optimistic_estimate_function = getattr(task.qf, self.optimistic_estimate_name)
next_level_candidates = list(next_level_candidates)
if len(next_level_candidates) == 0:
return []
for sg in next_level_candidates:
statistics.append(task.qf.calculate_statistics(sg, task.target, task.data))
tpl_class = statistics[0].__class__
@@ -153,7 +155,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
promising_candidates.append(sg.selectors)
return promising_candidates

def get_next_level_numba(self, promising_candidates):
def get_next_level_numba(self, promising_candidates): # pragma: no cover
if not hasattr(self, "compiled_func") or self.compiled_func is None:
self.compiled_func = getNewCandidates

@@ -163,18 +165,25 @@ def get_next_level_numba(self, promising_candidates):
tuple(all_selectors_ids[sel] for sel in selectors)
for selectors in promising_candidates
]
arr = np.array(promising_candidates_selector_ids, dtype=int)
shape1 = len(promising_candidates_selector_ids)
if shape1 == 0:
return []
shape2 = len(promising_candidates_selector_ids[0])
arr = np.array(promising_candidates_selector_ids, dtype=np.int32).reshape(
shape1, shape2
)

print(len(arr))
hashes = np.array(
[hash(tuple(x[:-1])) for x in promising_candidates_selector_ids],
dtype=np.int64,
)
print(len(arr), arr.dtype, hashes.dtype)
candidates_int = self.compiled_func(arr, hashes)
return list(
return [
(*promising_candidates[i], promising_candidates[j][-1])
for i, j in candidates_int
)
]

def get_next_level(self, promising_candidates):
by_prefix_dict = defaultdict(list)
@@ -220,6 +229,8 @@ def execute(self, task):
promising_candidates = self.get_next_level_candidates(
task, result, next_level_candidates
)
if len(promising_candidates) == 0:
break

if depth == task.depth:
break
@@ -229,15 +240,17 @@ def execute(self, task):
# select those selectors and build a subgroup from them
# for which all subsets of length depth (=candidate length -1)
# are in the set of promising candidates
curr_depth = depth # WARNING: need copy of depth for lazy eval
set_promising_candidates = set(tuple(p) for p in promising_candidates)
next_level_candidates = [
next_level_candidates = (
combine_selectors(selectors)
for selectors in next_level_candidates_no_pruning
if all(
(subset in set_promising_candidates)
for subset in combinations(selectors, depth)
for subset in combinations(selectors, curr_depth)
)
]
)

depth = depth + 1

result = ps.prepare_subgroup_discovery_result(result, task)
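Note on the change above: curr_depth is copied because next_level_candidates is now a generator expression, whose filter condition is only evaluated when the generator is consumed in the next loop iteration, after depth has already been incremented. A minimal standalone sketch of the pitfall (illustrative only, not pysubgroup code):

    depth = 2
    # Generator expressions capture free variables by reference: the condition
    # below is evaluated lazily, when the generator is consumed.
    candidates = (x for x in range(10) if x < depth)
    depth = depth + 1              # mutated before consumption
    print(list(candidates))        # [0, 1, 2] -- the filter already sees depth == 3

    depth = 2
    curr_depth = depth             # pin the value the condition should use
    candidates = (x for x in range(10) if x < curr_depth)
    depth = depth + 1
    print(list(candidates))        # [0, 1] -- unaffected by the later increment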
54 changes: 45 additions & 9 deletions src/pysubgroup/subgroup_description.py
@@ -133,6 +133,24 @@ def get_size(subgroup, data_len=None, data=None):
return size


def pandas_sparse_eq(col, value):
import pandas as pd # pylint: disable=import-outside-toplevel
from pandas._libs.sparse import (
IntIndex, # pylint: disable=import-outside-toplevel, no-name-in-module
)

col_arr = col.array
is_same_value = col_arr.sp_values == value
new_index_arr = col_arr.sp_index.indices[is_same_value]
index = IntIndex(len(col), new_index_arr)
return pd.arrays.SparseArray(
np.ones(len(new_index_arr), dtype=bool),
index,
col_arr.fill_value == value,
dtype=bool,
)


class EqualitySelector(SelectorBase):
def __init__(self, attribute_name, attribute_value, selector_name=None):
if attribute_name is None:
@@ -188,7 +206,13 @@ def __repr__(self):
def covers(self, data):
import pandas as pd # pylint: disable=import-outside-toplevel

row = data[self.attribute_name].to_numpy()
column = data[self.attribute_name]
if isinstance(column.dtype, pd.SparseDtype):
row = column
if not pd.isnull(self.attribute_value):
return pandas_sparse_eq(column, self.attribute_value)
else:
row = column.to_numpy()
if pd.isnull(self.attribute_value):
return pd.isnull(row)
return row == self.attribute_value
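A rough usage sketch of the new sparse branch in covers; the DataFrame, column name, and values are invented for illustration, and ps.EqualitySelector is assumed to be the package-level export of the class shown above:

    import numpy as np
    import pandas as pd
    import pysubgroup as ps

    # A mostly-zero column stored with pandas' sparse dtype.
    df = pd.DataFrame({"x": pd.arrays.SparseArray([0, 1, 0, 2, 0, 1], fill_value=0)})

    sel = ps.EqualitySelector("x", 1)
    mask = sel.covers(df)   # sparse dtype detected, dispatches to pandas_sparse_eq

    # Densified, the sparse mask agrees with an ordinary dense comparison.
    assert np.array_equal(np.asarray(mask), df["x"].to_numpy() == 1)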
@@ -324,13 +348,13 @@ def compute_string(cls, attribute_name, lower_bound, upper_bound, rounding_digit
lb = formatter.format(lb)

if lower_bound == float("-inf") and upper_bound == float("inf"):
repre = attribute_name + " = anything"
repre = str(attribute_name) + " = anything"
elif lower_bound == float("-inf"):
repre = attribute_name + "<" + str(ub)
repre = str(attribute_name) + "<" + str(ub)
elif upper_bound == float("inf"):
repre = attribute_name + ">=" + str(lb)
repre = str(attribute_name) + ">=" + str(lb)
else:
repre = attribute_name + ": [" + str(lb) + ":" + str(ub) + "["
repre = str(attribute_name) + ": [" + str(lb) + ":" + str(ub) + "["
return repre

@staticmethod
@@ -434,12 +458,24 @@ def create_numeric_selectors(
def create_numeric_selectors_for_attribute(
data, attr_name, nbins=5, intervals_only=True, weighting_attribute=None
):
import pandas as pd # pylint: disable=import-outside-toplevel

numeric_selectors = []
data_not_null = data[data[attr_name].notnull()]
if isinstance(data[attr_name].dtype, pd.SparseDtype):
numeric_selectors.append(
EqualitySelector(attr_name, data[attr_name].sparse.fill_value)
)
dense_data = data[attr_name].sparse.sp_values
data_not_null = dense_data[pd.notnull(dense_data)]
uniqueValues = np.unique(data_not_null)
if len(data_not_null) < len(dense_data):
numeric_selectors.append(EqualitySelector(attr_name, np.nan))
else:
data_not_null = data[data[attr_name].notnull()]

uniqueValues = np.unique(data_not_null[attr_name])
if len(data_not_null.index) < len(data.index):
numeric_selectors.append(EqualitySelector(attr_name, np.nan))
uniqueValues = np.unique(data_not_null[attr_name])
if len(data_not_null) < len(data):
numeric_selectors.append(EqualitySelector(attr_name, np.nan))

if len(uniqueValues) <= nbins:
for val in uniqueValues:
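A hedged sketch of how the sparse-aware selector construction above might be exercised; the column and values are invented, and the function is imported from the module in which it is defined:

    import pandas as pd
    from pysubgroup.subgroup_description import create_numeric_selectors_for_attribute

    # Sparse numeric column: the fill value 0.0 dominates, only a few values are stored.
    df = pd.DataFrame(
        {"amount": pd.arrays.SparseArray([0.0, 0.0, 3.5, 0.0, 7.0, 0.0, 3.5], fill_value=0.0)}
    )

    selectors = create_numeric_selectors_for_attribute(df, "amount", nbins=5)
    for sel in selectors:
        print(sel)
    # Expected, roughly: an EqualitySelector for the fill value 0.0, followed by
    # selectors derived from the stored values 3.5 and 7.0 only.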
5 changes: 5 additions & 0 deletions src/pysubgroup/utils.py
@@ -33,9 +33,14 @@ def prepare_subgroup_discovery_result(result, task):
def equal_frequency_discretization(
data, attribute_name, nbins=5, weighting_attribute=None
):
import pandas as pd # pylint: disable=import-outside-toplevel

cutpoints = []
if weighting_attribute is None:
cleaned_data = data[attribute_name]
if isinstance(data[attribute_name].dtype, pd.SparseDtype):
cleaned_data = data[attribute_name].sparse.sp_values

cleaned_data = cleaned_data[~np.isnan(cleaned_data)]
sorted_data = sorted(cleaned_data)
number_instances = len(sorted_data)
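A small hedged example of the sparse handling added above; the data is invented, and the function is imported directly from pysubgroup.utils:

    import pandas as pd
    from pysubgroup.utils import equal_frequency_discretization

    # Sparse column: 50 implicit zeros (the fill value) plus six explicitly stored values.
    values = [0.0] * 50 + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
    df = pd.DataFrame({"x": pd.arrays.SparseArray(values, fill_value=0.0)})

    # With the change above, the cutpoints come from the explicitly stored values;
    # the implicit fill values are excluded.
    print(equal_frequency_discretization(df, "x", nbins=3))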
