From 7065c8cf79e768c053afaf31757ec23e1bc8b377 Mon Sep 17 00:00:00 2001
From: Reid Johnson
Date: Sun, 1 Sep 2024 16:21:49 -0700
Subject: [PATCH] Parallelize Leaf Mapping (#80)

* Parallelize map leaves

---
 quantile_forest/_quantile_forest.py | 148 ++++++++++++++++++++--------
 quantile_forest/_utils.pyx          |  19 ++--
 setup.cfg                           |   1 -
 3 files changed, 120 insertions(+), 48 deletions(-)

diff --git a/quantile_forest/_quantile_forest.py b/quantile_forest/_quantile_forest.py
index e0d2e2e..3b91ed7 100755
--- a/quantile_forest/_quantile_forest.py
+++ b/quantile_forest/_quantile_forest.py
@@ -49,6 +49,7 @@ class calls the ``fit`` method of the ``ForestRegressor`` and creates a
     from sklearn.utils._param_validation import Interval, RealNotInt
 except ImportError:
     param_validation = False
+from sklearn.utils.parallel import Parallel, delayed
 from sklearn.utils.validation import check_is_fitted

 from ._quantile_forest_fast import QuantileForest
@@ -201,6 +202,96 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):

         return self

+    def _map_indices_to_leaves(
+        self,
+        bootstrap_indices,
+        X_leaves_bootstrap,
+        sample_weight,
+        leaf_subsample,
+        max_node_count,
+        max_samples_leaf,
+        random_state,
+    ):
+        """Return a mapping of training sample indices to a tree's leaf nodes.
+
+        Parameters
+        ----------
+        bootstrap_indices : array-like of shape (n_samples, n_outputs)
+            Bootstrap indices of training samples.
+
+        X_leaves_bootstrap : array-like of shape (n_samples, n_outputs)
+            Leaf node indices of the bootstrap training samples.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, then samples are equally weighted.
+            Samples with zero weight are excluded from the leaf mapping.
+
+        leaf_subsample : bool
+            Subsample leaf nodes. If True, leaves are randomly sampled to size
+            `max_samples_leaf`.
+
+        max_node_count : int
+            Maximum number of nodes (leaf and internal) across all trees.
+
+        max_samples_leaf : int
+            Maximum number of samples per leaf node.
+
+        random_state : int, RandomState instance
+            Controls the sampling of the training indices at each leaf node.
+
+        Returns
+        -------
+        y_train_leaves_slice : array-like of shape \
+            (max_node_count, n_outputs, max_samples_leaf)
+            Mapping of training sample indices to the tree's leaf nodes. Nodes
+            with no samples (e.g., internal nodes) are empty. Internal nodes
+            are included so that leaf node indices match their ``est.apply``
+            outputs. Each node list is padded to equal length with 0s.
+        """
+        n_outputs = bootstrap_indices.shape[1]
+
+        shape = (max_node_count, n_outputs, max_samples_leaf)
+        y_train_leaves_slice = np.zeros(shape, dtype=np.int64)
+
+        # Group training indices by leaf node.
+        leaf_indices, leaf_values_list = group_indices_by_value(X_leaves_bootstrap)
+
+        if leaf_subsample:
+            random.seed(random_state)
+
+        if leaf_subsample or sample_weight is not None:
+            for j in range(len(leaf_values_list)):
+                if sample_weight is not None:
+                    # Filter leaf samples with zero weight.
+                    weight_mask = sample_weight[leaf_values_list[j] - 1] > 0
+                    leaf_values_list[j] = leaf_values_list[j][weight_mask]
+                if leaf_subsample:
+                    # Sample leaf to length `max_samples_leaf`.
+                    if len(leaf_values_list[j]) > max_samples_leaf:
+                        random.shuffle(leaf_values_list[j])  # to ensure random sampling
+                        leaf_values_list[j] = leaf_values_list[j][:max_samples_leaf]
+                if len(leaf_values_list[j]) == 0:
+                    leaf_values_list[j] = [0]
+
+        # Map each leaf node to its list of training indices.
+        if max_samples_leaf == 1:  # optimize for single-sample-per-leaf performance
+            y_indices = bootstrap_indices[leaf_values_list].reshape(-1, 1, n_outputs)
+            for j in range(n_outputs):
+                y_train_leaves_slice[leaf_indices, j, 0] = y_indices[:, 0, j]
+        else:  # get mapping for arbitrary leaf sizes
+            y_train_leaves_slice = map_indices_to_leaves(
+                y_train_leaves_slice=y_train_leaves_slice,
+                bootstrap_indices=bootstrap_indices,
+                leaf_indices=leaf_indices,
+                leaf_values_list=leaf_values_list,
+            )
+
+        return y_train_leaves_slice
+
     def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
         """Return a mapping of each leaf node to its list of training indices.

@@ -280,7 +371,7 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
             else:
                 bootstrap_indices[:, i] = np.arange(n_samples)

-            # Get predictions on bootstrap indices.
+            # Get leaf node indices of bootstrap training samples.
             X_leaves_bootstrap[:, i] = X_leaves[bootstrap_indices[:, i], i]

         if sorter is not None:
@@ -306,45 +397,24 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
         if sample_weight is not None:
             sample_weight = np.squeeze(sample_weight)

-        # Initialize NumPy array (more efficient serialization than dict/list).
-        shape = (self.n_estimators, max_node_count, n_outputs, max_samples_leaf)
-        y_train_leaves = np.zeros(shape, dtype=np.int64)
-
-        for i, estimator in enumerate(self.estimators_):
-            # Group training indices by leaf node.
-            leaf_indices, leaf_values_list = group_indices_by_value(X_leaves_bootstrap[:, i])
-
-            if leaf_subsample:
-                random.seed(estimator.random_state)
-
-            if sample_weight is not None or leaf_subsample:
-                for j in range(len(leaf_values_list)):
-                    if sample_weight is not None:
-                        # Filter leaf samples with zero weight.
-                        weight_mask = sample_weight[leaf_values_list[j] - 1] > 0
-                        leaf_values_list[j] = leaf_values_list[j][weight_mask]
-                    if leaf_subsample:
-                        # Sample leaf to length `max_samples_leaf`.
-                        if len(leaf_values_list[j]) > max_samples_leaf:
-                            random.shuffle(leaf_values_list[j])  # to ensure random sampling
-                            leaf_values_list[j] = leaf_values_list[j][:max_samples_leaf]
-                    if len(leaf_values_list[j]) == 0:
-                        leaf_values_list[j] = [0]
-
-            # Map each leaf node to its list of training indices.
-            if max_samples_leaf == 1:  # optimize for single-sample-per-leaf performance
-                y_indices = bootstrap_indices[:, i][leaf_values_list].reshape(-1, 1, n_outputs)
-                for j in range(n_outputs):
-                    y_train_leaves[i, leaf_indices, j, 0] = y_indices[:, 0, j]
-            else:  # get mapping for arbitrary leaf sizes
-                y_train_leaves[i] = map_indices_to_leaves(
-                    y_train_leaves=y_train_leaves[i],
-                    bootstrap_indices=bootstrap_indices[:, i],
-                    leaf_indices=leaf_indices,
-                    leaf_values_list=leaf_values_list,
-                )
+        y_train_leaves = Parallel(
+            n_jobs=self.n_jobs,
+            verbose=self.verbose,
+            prefer="threads",
+        )(
+            delayed(self._map_indices_to_leaves)(
+                bootstrap_indices[:, i],
+                X_leaves_bootstrap[:, i],
+                sample_weight,
+                leaf_subsample,
+                max_node_count,
+                max_samples_leaf,
+                estimator.random_state,
+            )
+            for i, estimator in enumerate(self.estimators_)
+        )

-        return y_train_leaves
+        return np.array(y_train_leaves)

     def _get_y_bound_leaves(self, y, y_train_leaves):
         """Return the bounds for target values for each leaf node.

diff --git a/quantile_forest/_utils.pyx b/quantile_forest/_utils.pyx
index 4c153e6..33613f0 100755
--- a/quantile_forest/_utils.pyx
+++ b/quantile_forest/_utils.pyx
@@ -172,7 +172,7 @@ cpdef group_indices_by_value(cnp.ndarray[intp_t, ndim=1] a):


 cpdef map_indices_to_leaves(
-    cnp.ndarray[intp_t, ndim=3] y_train_leaves,
+    cnp.ndarray[intp_t, ndim=3] y_train_leaves_slice,
     cnp.ndarray[intp_t, ndim=2] bootstrap_indices,
     vector[intp_t] leaf_indices,
     vector[vector[intp_t]] leaf_values_list,
@@ -181,7 +181,7 @@ cpdef map_indices_to_leaves(

     Parameters
     ----------
-    y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
+    y_train_leaves_slice : array-like of shape (n_leaves, n_outputs, n_samples)
         Unpopulated mapping representing a list of nodes, each with a list of
         indices of the training samples residing at that node.

@@ -196,19 +196,22 @@

     Returns
     -------
-    y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
-        Populated mapping of training sample indices to leaf nodes.
+    y_train_leaves_slice : array-like of shape (n_leaves, n_outputs, n_samples)
+        Populated mapping of training sample indices to leaf nodes. Nodes with
+        no samples (e.g., internal nodes) are empty. Internal nodes are
+        included so that leaf node indices match their ``est.apply``
+        outputs. Each node list is padded to equal length with 0s.
     """
     cdef intp_t n_samples, n_outputs, n_leaves
     cdef intp_t i, j, k
     cdef vector[intp_t] leaf_values
     cdef intp_t leaf_index, leaf_value, y_index
-    cdef intp_t[:, :, :] y_train_leaves_view
+    cdef intp_t[:, :, :] y_train_leaves_slice_view

     n_outputs = bootstrap_indices.shape[1]
     n_leaves = leaf_indices.size()

-    y_train_leaves_view = y_train_leaves  # memoryview
+    y_train_leaves_slice_view = y_train_leaves_slice  # memoryview

     with nogil:
         for i in range(n_leaves):
@@ -221,6 +224,6 @@
                 for k in range(n_outputs):
                     y_index = bootstrap_indices[leaf_value, k]
                     if y_index > 0:
-                        y_train_leaves_view[leaf_index, k, j] = y_index
+                        y_train_leaves_slice_view[leaf_index, k, j] = y_index

-    return np.asarray(y_train_leaves_view)
+    return np.asarray(y_train_leaves_slice_view)
diff --git a/setup.cfg b/setup.cfg
index f427825..d96c271 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,7 +17,6 @@ addopts =

 [flake8]
 max-line-length = 99
 ignore = E203, E266, E402, E501, W503, E731
-max-complexity = 25
 select = B,C,E,F,W,T4,B9
 exclude =
     .git,
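
The restructuring is mechanical: the per-estimator body of the old loop moves into
``_map_indices_to_leaves``, and ``_get_y_train_leaves`` dispatches one task per tree.
Threads (``prefer="threads"``) are a sensible backend here because the workers read the
shared ``bootstrap_indices`` and ``X_leaves_bootstrap`` arrays without pickling them,
and the heavy inner loop in ``map_indices_to_leaves`` runs under ``with nogil:``, so
threads can make genuinely concurrent progress. Below is a minimal standalone sketch of
the pattern; ``map_one_tree``, the seeds, and the shapes are illustrative stand-ins,
not code from this patch:

    import numpy as np
    from sklearn.utils.parallel import Parallel, delayed

    def map_one_tree(i, random_state):
        # Hypothetical stand-in for _map_indices_to_leaves: returns one
        # per-tree slice of shape (max_node_count, n_outputs, max_samples_leaf).
        rng = np.random.default_rng(random_state)
        return rng.integers(0, 100, size=(8, 1, 4))

    # One task per estimator; Parallel returns results in submission order,
    # so the stacked array matches the old sequential loop's layout.
    slices = Parallel(n_jobs=2, prefer="threads")(
        delayed(map_one_tree)(i, seed) for i, seed in enumerate([7, 11, 13])
    )
    y_train_leaves = np.array(slices)
    print(y_train_leaves.shape)  # (3, 8, 1, 4) == (n_estimators, ...)

Because results come back in input order, ``np.array(y_train_leaves)`` in
``_get_y_train_leaves`` produces the same 4-D layout as the removed sequential loop.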