Parallelize Leaf Mapping (#80)
* Parallelize map leaves
reidjohnson authored Sep 1, 2024
1 parent 9858399 commit 7065c8c
Showing 3 changed files with 120 additions and 48 deletions.
148 changes: 109 additions & 39 deletions quantile_forest/_quantile_forest.py
@@ -49,6 +49,7 @@ class calls the ``fit`` method of the ``ForestRegressor`` and creates a
from sklearn.utils._param_validation import Interval, RealNotInt
except ImportError:
param_validation = False
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import check_is_fitted

from ._quantile_forest_fast import QuantileForest
@@ -201,6 +202,96 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):

return self

def _map_indices_to_leaves(
self,
bootstrap_indices,
X_leaves_bootstrap,
sample_weight,
leaf_subsample,
max_node_count,
max_samples_leaf,
random_state,
):
"""Return a mapping of training sample indices to a tree's leaf nodes.
Parameters
----------
bootstrap_indices : array-like of shape (n_samples, n_outputs)
Bootstrap indices of training samples.
X_leaves_bootstrap : array-like of shape (n_samples,)
Leaf node indices of the bootstrap training samples.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted. Samples
with zero weight are excluded from the leaf mapping.
leaf_subsample : bool
Subsample leaf nodes. If True, leaves are randomly sampled to size
`max_samples_leaf`.
max_node_count : int
Maximum number of leaf nodes across all trees.
max_samples_leaf : int
Maximum number of samples per leaf node.
random_state : int, RandomState instance
Controls the sampling of the training indices at each leaf node.

Returns
-------
y_train_leaves_slice : array-like of shape \
(n_leaves, n_outputs, n_samples)
Mapping of training sample indices to the tree's leaf nodes. Nodes with
no samples (e.g., internal nodes) are empty. Internal nodes are
included so that leaf node indices match their ``est.apply``
outputs. Each node list is padded to equal length with 0s.
"""
n_outputs = bootstrap_indices.shape[1]

shape = (max_node_count, n_outputs, max_samples_leaf)
y_train_leaves_slice = np.zeros(shape, dtype=np.int64)

# Group training indices by leaf node.
leaf_indices, leaf_values_list = group_indices_by_value(X_leaves_bootstrap)

if leaf_subsample:
random.seed(random_state)

if leaf_subsample or sample_weight is not None:
for j in range(len(leaf_values_list)):
if sample_weight is not None:
# Filter leaf samples with zero weight.
weight_mask = sample_weight[leaf_values_list[j] - 1] > 0
leaf_values_list[j] = leaf_values_list[j][weight_mask]
if leaf_subsample:
# Sample leaf to length `max_samples_leaf`.
if len(leaf_values_list[j]) > max_samples_leaf:
random.shuffle(leaf_values_list[j]) # to ensure random sampling
leaf_values_list[j] = leaf_values_list[j][:max_samples_leaf]
if len(leaf_values_list[j]) == 0:
leaf_values_list[j] = [0]

# Map each leaf node to its list of training indices.
if max_samples_leaf == 1: # optimize for single-sample-per-leaf performance
y_indices = bootstrap_indices[leaf_values_list].reshape(-1, 1, n_outputs)
for j in range(n_outputs):
y_train_leaves_slice[leaf_indices, j, 0] = y_indices[:, 0, j]

else: # get mapping for arbitrary leaf sizes
y_train_leaves_slice = map_indices_to_leaves(
y_train_leaves_slice=y_train_leaves_slice,
bootstrap_indices=bootstrap_indices,
leaf_indices=leaf_indices,
leaf_values_list=leaf_values_list,
)

return y_train_leaves_slice
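
For illustration, a minimal standalone sketch of the padded structure this helper returns, using hypothetical toy values (three nodes, one output, at most two samples per leaf). Training sample indices are stored 1-based so that 0 can double as the empty/padding sentinel, consistent with the `- 1` offset and the `y_index > 0` check in the code:

import numpy as np

# Hypothetical toy slice: max_node_count=3, n_outputs=1, max_samples_leaf=2.
y_train_leaves_slice = np.zeros((3, 1, 2), dtype=np.int64)
y_train_leaves_slice[1, 0, :] = [4, 7]  # leaf 1 holds samples 4 and 7 (1-based)
y_train_leaves_slice[2, 0, 0] = 2       # leaf 2 holds sample 2; slot 2 stays 0 (padding)
print(y_train_leaves_slice[:, 0, :])
# [[0 0]
#  [4 7]
#  [2 0]]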

def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
"""Return a mapping of each leaf node to its list of training indices.
@@ -280,7 +371,7 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
else:
bootstrap_indices[:, i] = np.arange(n_samples)

# Get predictions on bootstrap indices.
# Get leaf node indices of bootstrap training samples.
X_leaves_bootstrap[:, i] = X_leaves[bootstrap_indices[:, i], i]

if sorter is not None:
@@ -306,45 +397,24 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
if sample_weight is not None:
sample_weight = np.squeeze(sample_weight)

# Initialize NumPy array (more efficient serialization than dict/list).
shape = (self.n_estimators, max_node_count, n_outputs, max_samples_leaf)
y_train_leaves = np.zeros(shape, dtype=np.int64)

for i, estimator in enumerate(self.estimators_):
# Group training indices by leaf node.
leaf_indices, leaf_values_list = group_indices_by_value(X_leaves_bootstrap[:, i])

if leaf_subsample:
random.seed(estimator.random_state)

if sample_weight is not None or leaf_subsample:
for j in range(len(leaf_values_list)):
if sample_weight is not None:
# Filter leaf samples with zero weight.
weight_mask = sample_weight[leaf_values_list[j] - 1] > 0
leaf_values_list[j] = leaf_values_list[j][weight_mask]
if leaf_subsample:
# Sample leaf to length `max_samples_leaf`.
if len(leaf_values_list[j]) > max_samples_leaf:
random.shuffle(leaf_values_list[j]) # to ensure random sampling
leaf_values_list[j] = leaf_values_list[j][:max_samples_leaf]
if len(leaf_values_list[j]) == 0:
leaf_values_list[j] = [0]

# Map each leaf node to its list of training indices.
if max_samples_leaf == 1: # optimize for single-sample-per-leaf performance
y_indices = bootstrap_indices[:, i][leaf_values_list].reshape(-1, 1, n_outputs)
for j in range(n_outputs):
y_train_leaves[i, leaf_indices, j, 0] = y_indices[:, 0, j]
else: # get mapping for arbitrary leaf sizes
y_train_leaves[i] = map_indices_to_leaves(
y_train_leaves=y_train_leaves[i],
bootstrap_indices=bootstrap_indices[:, i],
leaf_indices=leaf_indices,
leaf_values_list=leaf_values_list,
)
y_train_leaves = Parallel(
n_jobs=self.n_jobs,
verbose=self.verbose,
prefer="threads",
)(
delayed(self._map_indices_to_leaves)(
bootstrap_indices[:, i],
X_leaves_bootstrap[:, i],
sample_weight,
leaf_subsample,
max_node_count,
max_samples_leaf,
estimator.random_state,
)
for i, estimator in enumerate(self.estimators_)
)

return y_train_leaves
return np.array(y_train_leaves)
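
The dispatch above is the standard scikit-learn/joblib pattern. Below is a minimal self-contained sketch of that pattern with a stand-in worker function (the real call submits `self._map_indices_to_leaves` once per fitted tree); it assumes scikit-learn >= 1.3 for `sklearn.utils.parallel`. `prefer="threads"` is a sensible choice here because the shared NumPy inputs are not copied to worker processes, and the heavy per-tree loops run in Cython with the GIL released:

from sklearn.utils.parallel import Parallel, delayed

def process_tree(i):
    # Stand-in for the per-tree work; the real worker builds one
    # y_train_leaves_slice per estimator.
    return i * i

results = Parallel(n_jobs=-1, prefer="threads")(
    delayed(process_tree)(i) for i in range(4)
)
print(results)  # [0, 1, 4, 9]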

def _get_y_bound_leaves(self, y, y_train_leaves):
"""Return the bounds for target values for each leaf node.
19 changes: 11 additions & 8 deletions quantile_forest/_utils.pyx
@@ -172,7 +172,7 @@ cpdef group_indices_by_value(cnp.ndarray[intp_t, ndim=1] a):


cpdef map_indices_to_leaves(
cnp.ndarray[intp_t, ndim=3] y_train_leaves,
cnp.ndarray[intp_t, ndim=3] y_train_leaves_slice,
cnp.ndarray[intp_t, ndim=2] bootstrap_indices,
vector[intp_t] leaf_indices,
vector[vector[intp_t]] leaf_values_list,
@@ -181,7 +181,7 @@ cpdef map_indices_to_leaves(
Parameters
----------
y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
y_train_leaves_slice : array-like of shape (n_leaves, n_outputs, n_samples)
Unpopulated mapping representing a list of nodes, each with a list of
indices of the training samples residing at that node.
@@ -196,19 +196,22 @@ cpdef map_indices_to_leaves(
Returns
-------
y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
Populated mapping of training sample indices to leaf nodes.
y_train_leaves_slice : array-like of shape (n_leaves, n_outputs, n_samples)
Populated mapping of training sample indices to leaf nodes. Nodes with
no samples (e.g., internal nodes) are empty. Internal nodes are
included so that leaf node indices match their ``est.apply``
outputs. Each node list is padded to equal length with 0s.
"""
cdef intp_t n_samples, n_outputs, n_leaves
cdef intp_t i, j, k
cdef vector[intp_t] leaf_values
cdef intp_t leaf_index, leaf_value, y_index
cdef intp_t[:, :, :] y_train_leaves_view
cdef intp_t[:, :, :] y_train_leaves_slice_view

n_outputs = bootstrap_indices.shape[1]
n_leaves = leaf_indices.size()

y_train_leaves_view = y_train_leaves # memoryview
y_train_leaves_slice_view = y_train_leaves_slice # memoryview

with nogil:
for i in range(n_leaves):
@@ -221,6 +224,6 @@ cpdef map_indices_to_leaves(
for k in range(n_outputs):
y_index = bootstrap_indices[leaf_value, k]
if y_index > 0:
y_train_leaves_view[leaf_index, k, j] = y_index
y_train_leaves_slice_view[leaf_index, k, j] = y_index

return np.asarray(y_train_leaves_view)
return np.asarray(y_train_leaves_slice_view)
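
For reference, a pure-Python sketch of what the compiled fill loop does. The middle of the loop body is elided in the diff above, so the inner indexing here is a best-guess reconstruction from the visible lines, not the verbatim source:

def map_indices_to_leaves_py(y_train_leaves_slice, bootstrap_indices,
                             leaf_indices, leaf_values_list):
    # Pure-Python equivalent (sketch) of the nogil loop above.
    n_outputs = bootstrap_indices.shape[1]
    for leaf_index, leaf_values in zip(leaf_indices, leaf_values_list):
        for j, leaf_value in enumerate(leaf_values):
            for k in range(n_outputs):
                y_index = bootstrap_indices[leaf_value, k]
                if y_index > 0:  # 0 marks empty/padding slots
                    y_train_leaves_slice[leaf_index, k, j] = y_index
    return y_train_leaves_slice
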
1 change: 0 additions & 1 deletion setup.cfg
@@ -17,7 +17,6 @@ addopts =
[flake8]
max-line-length = 99
ignore = E203, E266, E402, E501, W503, E731
max-complexity = 25
select = B,C,E,F,W,T4,B9
exclude =
.git,
