diff --git a/src/pysubgroup/algorithms.py b/src/pysubgroup/algorithms.py
index 0ba3a9a..ac81791 100644
--- a/src/pysubgroup/algorithms.py
+++ b/src/pysubgroup/algorithms.py
@@ -89,14 +89,13 @@ def __init__(
         self.optimistic_estimate_name = "optimistic_estimate"
         self.next_level = self.get_next_level
         self.compiled_func = None
-        if use_numba:
+        if use_numba:  # pragma: no cover
             try:
-                # TODO: used?
                 import numba  # pylint: disable=unused-import, import-outside-toplevel # noqa: F401, E501

                 self.next_level = self.get_next_level_numba
                 print("Apriori: Using numba for speedup")
-            except ImportError:  # pragma: no cover
+            except ImportError:
                 pass

     def get_next_level_candidates(self, task, result, next_level_candidates):
@@ -133,6 +132,9 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
         promising_candidates = []
         statistics = []
         optimistic_estimate_function = getattr(task.qf, self.optimistic_estimate_name)
+        next_level_candidates = list(next_level_candidates)
+        if len(next_level_candidates) == 0:
+            return []
         for sg in next_level_candidates:
             statistics.append(task.qf.calculate_statistics(sg, task.target, task.data))
         tpl_class = statistics[0].__class__
@@ -153,7 +155,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
             promising_candidates.append(sg.selectors)
         return promising_candidates

-    def get_next_level_numba(self, promising_candidates):
+    def get_next_level_numba(self, promising_candidates):  # pragma: no cover
         if not hasattr(self, "compiled_func") or self.compiled_func is None:
             self.compiled_func = getNewCandidates

@@ -163,18 +165,25 @@ def get_next_level_numba(self, promising_candidates):
             tuple(all_selectors_ids[sel] for sel in selectors)
             for selectors in promising_candidates
         ]
-        arr = np.array(promising_candidates_selector_ids, dtype=int)
+        shape1 = len(promising_candidates_selector_ids)
+        if shape1 == 0:
+            return []
+        shape2 = len(promising_candidates_selector_ids[0])
+        arr = np.array(promising_candidates_selector_ids, dtype=np.int32).reshape(
+            shape1, shape2
+        )

         print(len(arr))
         hashes = np.array(
             [hash(tuple(x[:-1])) for x in promising_candidates_selector_ids],
             dtype=np.int64,
         )
+        print(len(arr), arr.dtype, hashes.dtype)
         candidates_int = self.compiled_func(arr, hashes)
-        return list(
+        return [
             (*promising_candidates[i], promising_candidates[j][-1])
             for i, j in candidates_int
-        )
+        ]

     def get_next_level(self, promising_candidates):
         by_prefix_dict = defaultdict(list)
@@ -220,6 +229,8 @@ def execute(self, task):
             promising_candidates = self.get_next_level_candidates(
                 task, result, next_level_candidates
             )
+            if len(promising_candidates) == 0:
+                break

             if depth == task.depth:
                 break
@@ -229,15 +240,17 @@ def execute(self, task):
             # select those selectors and build a subgroup from them
             # for which all subsets of length depth (=candidate length -1)
             # are in the set of promising candidates
+            curr_depth = depth  # WARNING: need copy of depth for lazy eval
             set_promising_candidates = set(tuple(p) for p in promising_candidates)
-            next_level_candidates = [
+            next_level_candidates = (
                 combine_selectors(selectors)
                 for selectors in next_level_candidates_no_pruning
                 if all(
                     (subset in set_promising_candidates)
-                    for subset in combinations(selectors, depth)
+                    for subset in combinations(selectors, curr_depth)
                 )
-            ]
+            )
+
             depth = depth + 1

         result = ps.prepare_subgroup_discovery_result(result, task)
diff --git a/src/pysubgroup/subgroup_description.py b/src/pysubgroup/subgroup_description.py
index ebe3cab..072319d 100644
--- a/src/pysubgroup/subgroup_description.py
+++ b/src/pysubgroup/subgroup_description.py
@@ -133,6 +133,24 @@ def get_size(subgroup, data_len=None, data=None):
     return size


+def pandas_sparse_eq(col, value):
+    import pandas as pd  # pylint: disable=import-outside-toplevel
+    from pandas._libs.sparse import (
+        IntIndex,  # pylint: disable=import-outside-toplevel, no-name-in-module
+    )
+
+    col_arr = col.array
+    is_same_value = col_arr.sp_values == value
+    new_index_arr = col_arr.sp_index.indices[is_same_value]
+    index = IntIndex(len(col), new_index_arr)
+    return pd.arrays.SparseArray(
+        np.ones(len(new_index_arr), dtype=bool),
+        index,
+        col_arr.fill_value == value,
+        dtype=bool,
+    )
+
+
 class EqualitySelector(SelectorBase):
     def __init__(self, attribute_name, attribute_value, selector_name=None):
         if attribute_name is None:
@@ -188,7 +206,13 @@ def __repr__(self):
     def covers(self, data):
         import pandas as pd  # pylint: disable=import-outside-toplevel

-        row = data[self.attribute_name].to_numpy()
+        column = data[self.attribute_name]
+        if isinstance(column.dtype, pd.SparseDtype):
+            row = column
+            if not pd.isnull(self.attribute_value):
+                return pandas_sparse_eq(column, self.attribute_value)
+        else:
+            row = column.to_numpy()
         if pd.isnull(self.attribute_value):
             return pd.isnull(row)
         return row == self.attribute_value
@@ -324,13 +348,13 @@ def compute_string(cls, attribute_name, lower_bound, upper_bound, rounding_digit
             lb = formatter.format(lb)

         if lower_bound == float("-inf") and upper_bound == float("inf"):
-            repre = attribute_name + " = anything"
+            repre = str(attribute_name) + " = anything"
         elif lower_bound == float("-inf"):
-            repre = attribute_name + "<" + str(ub)
+            repre = str(attribute_name) + "<" + str(ub)
         elif upper_bound == float("inf"):
-            repre = attribute_name + ">=" + str(lb)
+            repre = str(attribute_name) + ">=" + str(lb)
         else:
-            repre = attribute_name + ": [" + str(lb) + ":" + str(ub) + "["
+            repre = str(attribute_name) + ": [" + str(lb) + ":" + str(ub) + "["
         return repre

     @staticmethod
@@ -434,12 +458,24 @@ def create_numeric_selectors(
 def create_numeric_selectors_for_attribute(
     data, attr_name, nbins=5, intervals_only=True, weighting_attribute=None
 ):
+    import pandas as pd  # pylint: disable=import-outside-toplevel
+
     numeric_selectors = []
-    data_not_null = data[data[attr_name].notnull()]
+    if isinstance(data[attr_name].dtype, pd.SparseDtype):
+        numeric_selectors.append(
+            EqualitySelector(attr_name, data[attr_name].sparse.fill_value)
+        )
+        dense_data = data[attr_name].sparse.sp_values
+        data_not_null = dense_data[pd.notnull(dense_data)]
+        uniqueValues = np.unique(data_not_null)
+        if len(data_not_null) < len(dense_data):
+            numeric_selectors.append(EqualitySelector(attr_name, np.nan))
+    else:
+        data_not_null = data[data[attr_name].notnull()]

-    uniqueValues = np.unique(data_not_null[attr_name])
-    if len(data_not_null.index) < len(data.index):
-        numeric_selectors.append(EqualitySelector(attr_name, np.nan))
+        uniqueValues = np.unique(data_not_null[attr_name])
+        if len(data_not_null) < len(data):
+            numeric_selectors.append(EqualitySelector(attr_name, np.nan))

     if len(uniqueValues) <= nbins:
         for val in uniqueValues:
diff --git a/src/pysubgroup/utils.py b/src/pysubgroup/utils.py
index 721268c..51c45ea 100644
--- a/src/pysubgroup/utils.py
+++ b/src/pysubgroup/utils.py
@@ -33,9 +33,14 @@ def prepare_subgroup_discovery_result(result, task):

 def equal_frequency_discretization(
     data, attribute_name, nbins=5, weighting_attribute=None
 ):
+    import pandas as pd  # pylint: disable=import-outside-toplevel
+
     cutpoints = []
     if weighting_attribute is None:
         cleaned_data = data[attribute_name]
+        if isinstance(data[attribute_name].dtype, pd.SparseDtype):
+            cleaned_data = data[attribute_name].sparse.sp_values
+        cleaned_data = cleaned_data[~np.isnan(cleaned_data)]
         sorted_data = sorted(cleaned_data)
         number_instances = len(sorted_data)
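
For reviewers, a minimal sketch of the sparse-column workflow this patch enables. It is not part of the patch: the DataFrame, the column names (clicks, segment, converted), and the parameter choices are invented for illustration, and only the public pysubgroup calls (create_selectors, BinaryTarget, SubgroupDiscoveryTask, Apriori.execute) come from the existing API.

    import numpy as np
    import pandas as pd
    import pysubgroup as ps

    # Toy data: one sparse numeric column (mostly the fill value 0.0),
    # one nominal column, and a boolean target. Names are illustrative only.
    rng = np.random.default_rng(0)
    n = 1_000
    data = pd.DataFrame(
        {
            "clicks": pd.arrays.SparseArray(
                rng.poisson(0.05, size=n).astype(float), fill_value=0.0
            ),
            "segment": rng.choice(["a", "b", "c"], size=n),
            "converted": rng.random(n) < 0.1,
        }
    )

    # With this patch, numeric selectors for the sparse column are built from
    # sp_values / fill_value instead of densifying the column, and
    # EqualitySelector.covers returns a SparseArray for sparse columns.
    search_space = ps.create_selectors(data, ignore=["converted"])
    target = ps.BinaryTarget("converted", True)
    task = ps.SubgroupDiscoveryTask(
        data, target, search_space, result_set_size=5, depth=2, qf=ps.WRAccQF()
    )
    result = ps.Apriori().execute(task)  # numba / vectorized paths stay optional
    print(result.to_dataframe())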