Commit
Merge branch 'master' of github.com:flemmerich/pysubgroup
mgbckr committed Jul 8, 2024
2 parents 82b63a5 + 1c8212e commit 1d22844
Showing 3 changed files with 73 additions and 19 deletions.
33 changes: 23 additions & 10 deletions src/pysubgroup/algorithms.py
@@ -89,14 +89,13 @@ def __init__(
self.optimistic_estimate_name = "optimistic_estimate"
self.next_level = self.get_next_level
self.compiled_func = None
if use_numba:
if use_numba: # pragma: no cover
try:
# TODO: used?
import numba # pylint: disable=unused-import, import-outside-toplevel # noqa: F401, E501

self.next_level = self.get_next_level_numba
print("Apriori: Using numba for speedup")
except ImportError: # pragma: no cover
except ImportError:
pass

def get_next_level_candidates(self, task, result, next_level_candidates):
@@ -133,6 +132,9 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
promising_candidates = []
statistics = []
optimistic_estimate_function = getattr(task.qf, self.optimistic_estimate_name)
next_level_candidates = list(next_level_candidates)
if len(next_level_candidates) == 0:
return []
for sg in next_level_candidates:
statistics.append(task.qf.calculate_statistics(sg, task.target, task.data))
tpl_class = statistics[0].__class__
@@ -153,7 +155,7 @@ def get_next_level_candidates_vectorized(self, task, result, next_level_candidat
promising_candidates.append(sg.selectors)
return promising_candidates

def get_next_level_numba(self, promising_candidates):
def get_next_level_numba(self, promising_candidates): # pragma: no cover
if not hasattr(self, "compiled_func") or self.compiled_func is None:
self.compiled_func = getNewCandidates

@@ -163,18 +165,25 @@ def get_next_level_numba(self, promising_candidates):
tuple(all_selectors_ids[sel] for sel in selectors)
for selectors in promising_candidates
]
arr = np.array(promising_candidates_selector_ids, dtype=int)
shape1 = len(promising_candidates_selector_ids)
if shape1 == 0:
return []
shape2 = len(promising_candidates_selector_ids[0])
arr = np.array(promising_candidates_selector_ids, dtype=np.int32).reshape(
shape1, shape2
)

print(len(arr))
hashes = np.array(
[hash(tuple(x[:-1])) for x in promising_candidates_selector_ids],
dtype=np.int64,
)
print(len(arr), arr.dtype, hashes.dtype)
candidates_int = self.compiled_func(arr, hashes)
return list(
return [
(*promising_candidates[i], promising_candidates[j][-1])
for i, j in candidates_int
)
]

def get_next_level(self, promising_candidates):
by_prefix_dict = defaultdict(list)
@@ -220,6 +229,8 @@ def execute(self, task):
promising_candidates = self.get_next_level_candidates(
task, result, next_level_candidates
)
if len(promising_candidates) == 0:
break

if depth == task.depth:
break
@@ -229,15 +240,17 @@ def execute(self, task):
# select those selectors and build a subgroup from them
# for which all subsets of length depth (=candidate length -1)
# are in the set of promising candidates
curr_depth = depth # WARNING: need copy of depth for lazy eval
set_promising_candidates = set(tuple(p) for p in promising_candidates)
next_level_candidates = [
next_level_candidates = (
combine_selectors(selectors)
for selectors in next_level_candidates_no_pruning
if all(
(subset in set_promising_candidates)
for subset in combinations(selectors, depth)
for subset in combinations(selectors, curr_depth)
)
]
)

depth = depth + 1

result = ps.prepare_subgroup_discovery_result(result, task)
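Note on the change above: curr_depth is copied because next_level_candidates is now a generator expression, whose filter condition is only evaluated when the generator is consumed in the next loop iteration, after depth has already been incremented. A minimal standalone sketch of the pitfall (illustrative only, not pysubgroup code):

    depth = 2
    # Generator expressions capture free variables by reference: the condition
    # below is evaluated lazily, when the generator is consumed.
    candidates = (x for x in range(10) if x < depth)
    depth = depth + 1              # mutated before consumption
    print(list(candidates))        # [0, 1, 2] -- the filter already sees depth == 3

    depth = 2
    curr_depth = depth             # pin the value the condition should use
    candidates = (x for x in range(10) if x < curr_depth)
    depth = depth + 1
    print(list(candidates))        # [0, 1] -- unaffected by the later increment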
54 changes: 45 additions & 9 deletions src/pysubgroup/subgroup_description.py
@@ -133,6 +133,24 @@ def get_size(subgroup, data_len=None, data=None):
return size


def pandas_sparse_eq(col, value):
import pandas as pd # pylint: disable=import-outside-toplevel
from pandas._libs.sparse import (
IntIndex, # pylint: disable=import-outside-toplevel, no-name-in-module
)

col_arr = col.array
is_same_value = col_arr.sp_values == value
new_index_arr = col_arr.sp_index.indices[is_same_value]
index = IntIndex(len(col), new_index_arr)
return pd.arrays.SparseArray(
np.ones(len(new_index_arr), dtype=bool),
index,
col_arr.fill_value == value,
dtype=bool,
)


class EqualitySelector(SelectorBase):
def __init__(self, attribute_name, attribute_value, selector_name=None):
if attribute_name is None:
@@ -188,7 +206,13 @@ def __repr__(self):
def covers(self, data):
import pandas as pd # pylint: disable=import-outside-toplevel

row = data[self.attribute_name].to_numpy()
column = data[self.attribute_name]
if isinstance(column.dtype, pd.SparseDtype):
row = column
if not pd.isnull(self.attribute_value):
return pandas_sparse_eq(column, self.attribute_value)
else:
row = column.to_numpy()
if pd.isnull(self.attribute_value):
return pd.isnull(row)
return row == self.attribute_value
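A rough usage sketch of the new sparse branch in covers; the DataFrame, column name, and values are invented for illustration, and ps.EqualitySelector is assumed to be the package-level export of the class shown above:

    import numpy as np
    import pandas as pd
    import pysubgroup as ps

    # A mostly-zero column stored with pandas' sparse dtype.
    df = pd.DataFrame({"x": pd.arrays.SparseArray([0, 1, 0, 2, 0, 1], fill_value=0)})

    sel = ps.EqualitySelector("x", 1)
    mask = sel.covers(df)   # sparse dtype detected, dispatches to pandas_sparse_eq

    # Densified, the sparse mask agrees with an ordinary dense comparison.
    assert np.array_equal(np.asarray(mask), df["x"].to_numpy() == 1)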
@@ -324,13 +348,13 @@ def compute_string(cls, attribute_name, lower_bound, upper_bound, rounding_digit
lb = formatter.format(lb)

if lower_bound == float("-inf") and upper_bound == float("inf"):
repre = attribute_name + " = anything"
repre = str(attribute_name) + " = anything"
elif lower_bound == float("-inf"):
repre = attribute_name + "<" + str(ub)
repre = str(attribute_name) + "<" + str(ub)
elif upper_bound == float("inf"):
repre = attribute_name + ">=" + str(lb)
repre = str(attribute_name) + ">=" + str(lb)
else:
repre = attribute_name + ": [" + str(lb) + ":" + str(ub) + "["
repre = str(attribute_name) + ": [" + str(lb) + ":" + str(ub) + "["
return repre

@staticmethod
@@ -434,12 +458,24 @@ def create_numeric_selectors(
def create_numeric_selectors_for_attribute(
data, attr_name, nbins=5, intervals_only=True, weighting_attribute=None
):
import pandas as pd # pylint: disable=import-outside-toplevel

numeric_selectors = []
data_not_null = data[data[attr_name].notnull()]
if isinstance(data[attr_name].dtype, pd.SparseDtype):
numeric_selectors.append(
EqualitySelector(attr_name, data[attr_name].sparse.fill_value)
)
dense_data = data[attr_name].sparse.sp_values
data_not_null = dense_data[pd.notnull(dense_data)]
uniqueValues = np.unique(data_not_null)
if len(data_not_null) < len(dense_data):
numeric_selectors.append(EqualitySelector(attr_name, np.nan))
else:
data_not_null = data[data[attr_name].notnull()]

uniqueValues = np.unique(data_not_null[attr_name])
if len(data_not_null.index) < len(data.index):
numeric_selectors.append(EqualitySelector(attr_name, np.nan))
uniqueValues = np.unique(data_not_null[attr_name])
if len(data_not_null) < len(data):
numeric_selectors.append(EqualitySelector(attr_name, np.nan))

if len(uniqueValues) <= nbins:
for val in uniqueValues:
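A hedged sketch of how the sparse-aware selector construction above might be exercised; the column and values are invented, and the function is imported from the module in which it is defined:

    import pandas as pd
    from pysubgroup.subgroup_description import create_numeric_selectors_for_attribute

    # Sparse numeric column: the fill value 0.0 dominates, only a few values are stored.
    df = pd.DataFrame(
        {"amount": pd.arrays.SparseArray([0.0, 0.0, 3.5, 0.0, 7.0, 0.0, 3.5], fill_value=0.0)}
    )

    selectors = create_numeric_selectors_for_attribute(df, "amount", nbins=5)
    for sel in selectors:
        print(sel)
    # Expected, roughly: an EqualitySelector for the fill value 0.0, followed by
    # selectors derived from the stored values 3.5 and 7.0 only.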
5 changes: 5 additions & 0 deletions src/pysubgroup/utils.py
@@ -33,9 +33,14 @@ def prepare_subgroup_discovery_result(result, task):
def equal_frequency_discretization(
data, attribute_name, nbins=5, weighting_attribute=None
):
import pandas as pd # pylint: disable=import-outside-toplevel

cutpoints = []
if weighting_attribute is None:
cleaned_data = data[attribute_name]
if isinstance(data[attribute_name].dtype, pd.SparseDtype):
cleaned_data = data[attribute_name].sparse.sp_values

cleaned_data = cleaned_data[~np.isnan(cleaned_data)]
sorted_data = sorted(cleaned_data)
number_instances = len(sorted_data)
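A small hedged example of the sparse handling added above; the data is invented, and the function is imported directly from pysubgroup.utils:

    import pandas as pd
    from pysubgroup.utils import equal_frequency_discretization

    # Sparse column: 50 implicit zeros (the fill value) plus six explicitly stored values.
    values = [0.0] * 50 + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
    df = pd.DataFrame({"x": pd.arrays.SparseArray(values, fill_value=0.0)})

    # With the change above, the cutpoints come from the explicitly stored values;
    # the implicit fill values are excluded.
    print(equal_frequency_discretization(df, "x", nbins=3))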
