Skip to content

Commit

Permalink
Optimize Leaf Node Mapping (#78)
Browse files: browse the repository at this point in the history
* Optimize leaf node mapping
  • Loading branch information
reidjohnson authored Aug 31, 2024
1 parent 64e9449 commit 222c0a0
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 25 deletions.
48 changes: 28 additions & 20 deletions quantile_forest/_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class calls the ``fit`` method of the ``ForestRegressor`` and creates a
param_validation = False
from sklearn.utils.validation import check_is_fitted

from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices
from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices, map_leaf_nodes

sklearn_version = parse_version(sklearn.__version__)

Expand All @@ -69,8 +69,8 @@ def _group_by_value(a):
a_sorted = a[sort_idx]
unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
unq_items = a_sorted[unq_first]
unq_count = np.diff(np.concatenate(np.nonzero(unq_first) + ([a.size],)))
unq_idx = np.split(sort_idx, np.cumsum(unq_count[:-1]))
unq_indices = np.flatnonzero(unq_first)
unq_idx = np.array_split(sort_idx, unq_indices[1:])
return unq_items, unq_idx


Expand Down Expand Up @@ -303,9 +303,8 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
if sorter is not None:
# Reassign bootstrap indices to account for target sorting.
bootstrap_indices = np.argsort(sorter, axis=0)[bootstrap_indices]
if bootstrap_indices.shape[-1] == 1:
bootstrap_indices = np.squeeze(bootstrap_indices, -1)

bootstrap_indices = bootstrap_indices.reshape(-1, self.n_estimators, n_outputs)
bootstrap_indices += 1 # for sparse matrix (0s as empty)

# Get the maximum number of nodes (internal + leaves) across trees.
Expand Down Expand Up @@ -335,23 +334,32 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
if leaf_subsample:
random.seed(estimator.random_state)

# Map each leaf node to its list of training indices.
for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list):
y_indices = bootstrap_indices[:, i][leaf_values].reshape(-1, n_outputs)

if sample_weight is not None:
y_indices = y_indices[sample_weight[y_indices - 1] > 0]

# Subsample leaf training indices (without replacement).
if leaf_subsample and max_samples_leaf < len(y_indices):
if not isinstance(y_indices, list):
y_indices = list(y_indices)
y_indices = random.sample(y_indices, max_samples_leaf)

y_indices = np.asarray(y_indices).T.reshape(n_outputs, -1)
if sample_weight is not None or leaf_subsample:
for j in range(len(leaf_values_list)):
if sample_weight is not None:
# Filter leaf samples with zero weight.
weight_mask = sample_weight[leaf_values_list[j] - 1] > 0
leaf_values_list[j] = leaf_values_list[j][weight_mask]
if leaf_subsample:
# Sample leaf to length `max_samples_leaf`.
if len(leaf_values_list[j]) > max_samples_leaf:
random.shuffle(leaf_values_list[j]) # to ensure random sampling
leaf_values_list[j] = leaf_values_list[j][:max_samples_leaf]
if len(leaf_values_list[j]) == 0:
leaf_values_list[j] = [0]

# Map each leaf node to its list of training indices.
if max_samples_leaf == 1: # optimize for single-sample-per-leaf performance
y_indices = bootstrap_indices[:, i][leaf_values_list].reshape(-1, 1, n_outputs)
for j in range(n_outputs):
y_train_leaves[i, leaf_idx, j, : len(y_indices[j])] = y_indices[j]
y_train_leaves[i, leaf_indices, j, 0] = y_indices[:, 0, j]
else: # get mapping for arbitrary leaf sizes
y_train_leaves[i] = map_leaf_nodes(
y_train_leaves=y_train_leaves[i],
bootstrap_indices=bootstrap_indices[:, i],
leaf_indices=leaf_indices,
leaf_values_list=leaf_values_list,
)

return y_train_leaves

Expand Down
59 changes: 57 additions & 2 deletions quantile_forest/_quantile_forest_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -523,18 +523,18 @@ cpdef vector[intp_t] generate_unsampled_indices(
n_total_samples : int
Number of total samples, sampled and unsampled.
duplicates : list of sets
duplicates : list of sets of ints
List of sets of functionally identical indices.
Returns
-------
unsampled_indices : array-like
List of unsampled indices.
"""
cdef intp_t n_samples, n_duplicates
cdef intp_t i
cdef intp_t sampled_idx
cdef set[intp_t] sampled_set
cdef intp_t n_samples, n_duplicates
cdef vector[intp_t] unsampled_indices

n_samples = sample_indices.size()
Expand All @@ -557,6 +557,61 @@ cpdef vector[intp_t] generate_unsampled_indices(
return unsampled_indices


cpdef map_leaf_nodes(
    cnp.ndarray[intp_t, ndim=3] y_train_leaves,
    cnp.ndarray[intp_t, ndim=2] bootstrap_indices,
    vector[intp_t] leaf_indices,
    vector[vector[intp_t]] leaf_values_list,
) noexcept:
    """Fill a tree's leaf-node mapping with training sample indices.

    For every leaf, each of its sample positions is looked up as a row of
    `bootstrap_indices`, and the positive entries found there are written
    into that leaf's slots of `y_train_leaves`. Entries that are zero or
    negative are skipped, so the corresponding slot keeps its prior value.

    Parameters
    ----------
    y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
        Unpopulated mapping representing a list of nodes, each with a list
        of indices of the training samples residing at that node. It is
        written to through a memoryview.

    bootstrap_indices : array-like of shape (n_samples, n_outputs)
        Bootstrap indices of training samples.

    leaf_indices : list of ints
        Leaf node indices. Entry `i` pairs with `leaf_values_list[i]`.

    leaf_values_list : list of list of ints
        Per-leaf sample indices, used as row indices into
        `bootstrap_indices`. Entry `i` pairs with `leaf_indices[i]`.

    Returns
    -------
    y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples)
        Populated mapping of training sample indices to leaf nodes.
    """
    cdef intp_t n_leaves = leaf_indices.size()
    cdef intp_t n_outputs = bootstrap_indices.shape[1]
    cdef intp_t n_leaf_samples
    cdef intp_t leaf_pos, sample_pos, output_idx
    cdef intp_t node_idx, row_idx, train_idx
    cdef vector[intp_t] node_samples
    cdef intp_t[:, :, :] leaves_view = y_train_leaves  # memoryview

    # Only C-level values are touched in the loops, so the GIL is released.
    with nogil:
        for leaf_pos in range(n_leaves):
            node_idx = leaf_indices[leaf_pos]
            node_samples = leaf_values_list[leaf_pos]
            n_leaf_samples = node_samples.size()

            for sample_pos in range(n_leaf_samples):
                row_idx = node_samples[sample_pos]
                for output_idx in range(n_outputs):
                    train_idx = bootstrap_indices[row_idx, output_idx]
                    if train_idx <= 0:
                        # Non-positive entries are not written.
                        continue
                    leaves_view[node_idx, output_idx, sample_pos] = train_idx

    return np.asarray(leaves_view)


cdef class QuantileForest:
"""Representation of a quantile forest.
Expand Down
9 changes: 7 additions & 2 deletions quantile_forest/tests/test_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def check_california_criterion(name, criterion):

# Test sample weights.
regr = ForestRegressor(n_estimators=5, criterion=criterion, random_state=0)
sample_weight = np.ones(y_california.shape)
sample_weight = np.concatenate([np.zeros(1), np.ones(len(y_california) - 1)])
regr.fit(X_california, y_california, sample_weight=sample_weight)
score = regr.score(X_california, y_california, quantiles=0.5)
assert score > 0.9, f"Failed with criterion {criterion}, sample weight and score={score}."
Expand Down Expand Up @@ -786,7 +786,12 @@ def check_max_samples_leaf(name):

max_leaf_sizes = []
for max_samples_leaf in [0.99 / len(X), 1, 3.0 / len(X), 5, 20, None]:
est = ForestRegressor(n_estimators=10, max_samples_leaf=max_samples_leaf, random_state=0)
est = ForestRegressor(
n_estimators=10,
min_samples_leaf=max_samples_leaf if max_samples_leaf is not None else len(X),
max_samples_leaf=max_samples_leaf,
random_state=0,
)
est.fit(X, y)

max_leaf_size = 0
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ addopts =
[flake8]
max-line-length = 99
ignore = E203, E266, E402, E501, W503, E731
max-complexity = 24
max-complexity = 25
select = B,C,E,F,W,T4,B9
exclude =
.git,
Expand Down

0 comments on commit 222c0a0

Please sign in to comment.