Optimize Leaf Node Mapping (#78)

* Optimize leaf node mapping
zillow · Aug 31, 2024 · 222c0a0 · 222c0a0
1 parent 64e9449
commit 222c0a0
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 25 deletions.
diff --git a/quantile_forest/_quantile_forest.py b/quantile_forest/_quantile_forest.py
@@ -51,7 +51,7 @@ class calls the ``fit`` method of the ``ForestRegressor`` and creates a
     param_validation = False
 from sklearn.utils.validation import check_is_fitted
 
-from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices
+from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices, map_leaf_nodes
 
 sklearn_version = parse_version(sklearn.__version__)
 
@@ -69,8 +69,8 @@ def _group_by_value(a):
     a_sorted = a[sort_idx]
     unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
     unq_items = a_sorted[unq_first]
-    unq_count = np.diff(np.concatenate(np.nonzero(unq_first) + ([a.size],)))
-    unq_idx = np.split(sort_idx, np.cumsum(unq_count[:-1]))
+    unq_indices = np.flatnonzero(unq_first)
+    unq_idx = np.array_split(sort_idx, unq_indices[1:])
     return unq_items, unq_idx
 
 
@@ -303,9 +303,8 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
         if sorter is not None:
             # Reassign bootstrap indices to account for target sorting.
             bootstrap_indices = np.argsort(sorter, axis=0)[bootstrap_indices]
-            if bootstrap_indices.shape[-1] == 1:
-                bootstrap_indices = np.squeeze(bootstrap_indices, -1)
 
+        bootstrap_indices = bootstrap_indices.reshape(-1, self.n_estimators, n_outputs)
         bootstrap_indices += 1  # for sparse matrix (0s as empty)
 
         # Get the maximum number of nodes (internal + leaves) across trees.
@@ -335,23 +334,32 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
             if leaf_subsample:
                 random.seed(estimator.random_state)
 
-            # Map each leaf node to its list of training indices.
-            for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list):
-                y_indices = bootstrap_indices[:, i][leaf_values].reshape(-1, n_outputs)
-
-                if sample_weight is not None:
-                    y_indices = y_indices[sample_weight[y_indices - 1] > 0]
-
-                # Subsample leaf training indices (without replacement).
-                if leaf_subsample and max_samples_leaf < len(y_indices):
-                    if not isinstance(y_indices, list):
-                        y_indices = list(y_indices)
-                    y_indices = random.sample(y_indices, max_samples_leaf)
-
-                y_indices = np.asarray(y_indices).T.reshape(n_outputs, -1)
+            if sample_weight is not None or leaf_subsample:
+                for j in range(len(leaf_values_list)):
+                    if sample_weight is not None:
+                        # Filter leaf samples with zero weight.
+                        weight_mask = sample_weight[leaf_values_list[j] - 1] > 0
+                        leaf_values_list[j] = leaf_values_list[j][weight_mask]
+                    if leaf_subsample:
+                        # Sample leaf to length `max_samples_leaf`.
+                        if len(leaf_values_list[j]) > max_samples_leaf:
+                            random.shuffle(leaf_values_list[j])  # to ensure random sampling
+                            leaf_values_list[j] = leaf_values_list[j][:max_samples_leaf]
+                    if len(leaf_values_list[j]) == 0:
+                        leaf_values_list[j] = [0]
 
+            # Map each leaf node to its list of training indices.
+            if max_samples_leaf == 1:  # optimize for single-sample-per-leaf performance
+                y_indices = bootstrap_indices[:, i][leaf_values_list].reshape(-1, 1, n_outputs)
                 for j in range(n_outputs):
-                    y_train_leaves[i, leaf_idx, j, : len(y_indices[j])] = y_indices[j]
+                    y_train_leaves[i, leaf_indices, j, 0] = y_indices[:, 0, j]
+            else:  # get mapping for arbitrary leaf sizes
+                y_train_leaves[i] = map_leaf_nodes(
+                    y_train_leaves=y_train_leaves[i],
+                    bootstrap_indices=bootstrap_indices[:, i],
+                    leaf_indices=leaf_indices,
+                    leaf_values_list=leaf_values_list,
+                )
 
         return y_train_leaves
 

diff --git a/quantile_forest/_quantile_forest_fast.pyx b/quantile_forest/_quantile_forest_fast.pyx
@@ -523,18 +523,18 @@ cpdef vector[intp_t] generate_unsampled_indices(
     n_total_samples : int
         Number of total samples, sampled and unsampled.
 
-    duplicates : list of sets
+    duplicates : list of sets of ints
         List of sets of functionally identical indices.
 
     Returns
     -------
     unsampled_indices : array-like
         List of unsampled indices.
     """
+    cdef intp_t n_samples, n_duplicates
     cdef intp_t i
     cdef intp_t sampled_idx
     cdef set[intp_t] sampled_set
-    cdef intp_t n_samples, n_duplicates
     cdef vector[intp_t] unsampled_indices
 
     n_samples = sample_indices.size()
@@ -557,6 +557,61 @@ cpdef vector[intp_t] generate_unsampled_indices(
     return unsampled_indices
 
 
+cpdef map_leaf_nodes(
+    cnp.ndarray[intp_t, ndim=3] y_train_leaves,
+    cnp.ndarray[intp_t, ndim=2] bootstrap_indices,
+    vector[intp_t] leaf_indices,
+    vector[vector[intp_t]] leaf_values_list,
+) noexcept:
+    """Fill in and return the mapping of a tree's leaf nodes to training sample indices.
+
+    Parameters
+    ----------
+    y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
+        Mapping to populate, written in place. Entry ``[node, output, pos]`` receives the
+        training index of the sample at position ``pos`` in that leaf node's sample list.
+
+    bootstrap_indices : array-like of shape (n_samples, n_outputs)
+        Bootstrap indices of training samples. Only values greater than 0 are written.
+
+    leaf_indices : list of ints
+        List of leaf node indices. Entries correspond by position to `leaf_values_list`.
+
+    leaf_values_list : list of lists of ints
+        Per-leaf lists of row indices into `bootstrap_indices`, aligned with `leaf_indices`.
+
+    Returns
+    -------
+    y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
+        Populated mapping of training sample indices to leaf nodes.
+    """
+    cdef intp_t n_samples, n_outputs, n_leaves
+    cdef intp_t i, j, k
+    cdef vector[intp_t] leaf_values
+    cdef intp_t leaf_index, leaf_value, y_index
+    cdef intp_t[:, :, :] y_train_leaves_view
+
+    n_outputs = bootstrap_indices.shape[1]
+    n_leaves = leaf_indices.size()
+
+    y_train_leaves_view = y_train_leaves  # typed memoryview over the input array's buffer
+
+    with nogil:
+        for i in range(n_leaves):
+            leaf_index = leaf_indices[i]
+            leaf_values = leaf_values_list[i]
+
+            n_samples = leaf_values.size()
+            for j in range(n_samples):
+                leaf_value = leaf_values[j]
+                for k in range(n_outputs):
+                    y_index = bootstrap_indices[leaf_value, k]
+                    if y_index > 0:  # non-positive indices leave the slot unchanged
+                        y_train_leaves_view[leaf_index, k, j] = y_index
+
+    return np.asarray(y_train_leaves_view)
+
+
 cdef class QuantileForest:
     """Representation of a quantile forest.
 

diff --git a/quantile_forest/tests/test_quantile_forest.py b/quantile_forest/tests/test_quantile_forest.py
@@ -113,7 +113,7 @@ def check_california_criterion(name, criterion):
 
     # Test sample weights.
     regr = ForestRegressor(n_estimators=5, criterion=criterion, random_state=0)
-    sample_weight = np.ones(y_california.shape)
+    sample_weight = np.concatenate([np.zeros(1), np.ones(len(y_california) - 1)])
     regr.fit(X_california, y_california, sample_weight=sample_weight)
     score = regr.score(X_california, y_california, quantiles=0.5)
     assert score > 0.9, f"Failed with criterion {criterion}, sample weight and score={score}."
@@ -786,7 +786,12 @@ def check_max_samples_leaf(name):
 
     max_leaf_sizes = []
     for max_samples_leaf in [0.99 / len(X), 1, 3.0 / len(X), 5, 20, None]:
-        est = ForestRegressor(n_estimators=10, max_samples_leaf=max_samples_leaf, random_state=0)
+        est = ForestRegressor(
+            n_estimators=10,
+            min_samples_leaf=max_samples_leaf if max_samples_leaf is not None else len(X),
+            max_samples_leaf=max_samples_leaf,
+            random_state=0,
+        )
         est.fit(X, y)
 
         max_leaf_size = 0

diff --git a/setup.cfg b/setup.cfg
@@ -17,7 +17,7 @@ addopts =
 [flake8]
 max-line-length = 99
 ignore = E203, E266, E402, E501, W503, E731
-max-complexity = 24
+max-complexity = 25
 select = B,C,E,F,W,T4,B9
 exclude =
     .git,