Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
reidjohnson committed Aug 13, 2024
1 parent 8db4ac6 commit d7cc9d2
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 24 deletions.
117 changes: 108 additions & 9 deletions quantile_forest/_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,13 +196,16 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):
sample_weight = np.asarray(sample_weight)[sorter]

# Get map of tree leaf nodes to training indices.
y_train_leaves = self._get_y_train_leaves(
X, y_sorted.shape[-1], sorter=sorter, sample_weight=sample_weight
y_train_leaves, y_bound_leaves = self._get_y_train_leaves(
X, y_sorted, sorter=sorter, sample_weight=sample_weight
)

# Create quantile forest object.
self.forest_ = QuantileForest(
y_sorted.astype(np.float64).T, y_train_leaves, sparse_pickle=sparse_pickle
y_sorted.astype(np.float64).T,
y_train_leaves,
y_bound_leaves,
sparse_pickle=sparse_pickle,
)

self.sorter_ = sorter
Expand All @@ -212,7 +215,7 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):

return self

def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
"""Return a mapping of each leaf node to its list of training indices.
The ``apply`` function is used on the ``X`` values to obtain the leaf
Expand All @@ -225,8 +228,8 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
to ``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csc_matrix``.
y_dim : int
The number of target outputs for each y value.
y : array-like of shape (n_samples, n_outputs)
The target outputs for each y value.
sorter : array-like of shape (n_samples, n_outputs), default=None
The indices that would sort the target values in ascending order.
Expand All @@ -240,14 +243,19 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
Returns
-------
y_train_leaves : array-like of shape \
(n_estimators, n_leaves, n_outputs, n_indices)
(n_estimators, n_leaves, n_indices, n_outputs)
List of trees, each with a list of nodes, each with a list of
indices of the training samples residing at that node. Nodes with
no samples (e.g., internal nodes) are empty. Internal nodes are
included so that leaf node indices match their ``est.apply``
outputs. Each node list is padded to equal length with 0s.
y_bound_leaves : array-like of shape (n_estimators, n_leaves, 2)
Minimum and maximum bounds for target values for each leaf node.
Used to enforce monotonicity constraints.
"""
n_samples = X.shape[0]
y_dim = y.shape[-1]

if isinstance(self.max_samples_leaf, (Integral, np.integer)):
max_samples_leaf = self.max_samples_leaf
Expand Down Expand Up @@ -341,7 +349,87 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
for j in range(y_dim):
y_train_leaves[i, leaf_idx, j, : len(y_indices)] = y_indices

return y_train_leaves
y_bound_leaves = self._get_y_bound_leaves(
X_leaves_bootstrap, bootstrap_indices, y, max_node_count
)

return y_train_leaves, y_bound_leaves

def _get_y_bound_leaves(self, X_leaves_bootstrap, bootstrap_indices, y, max_node_count):
if self.monotonic_cst is None:
return None

y_bound_leaves = np.full((self.n_estimators, max_node_count, 2), [-np.inf, np.inf])

Check warning on line 362 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L362

Added line #L362 was not covered by tests

for i, estimator in enumerate(self.estimators_):
tree = estimator.tree_

Check warning on line 365 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L364-L365

Added lines #L364 - L365 were not covered by tests

min_values = np.full(tree.node_count, np.inf)
max_values = np.full(tree.node_count, -np.inf)
mid_values = np.full(tree.node_count, np.nan)

Check warning on line 369 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L367-L369

Added lines #L367 - L369 were not covered by tests

# Populate the leaf nodes with actual target values.
for node_idx in range(tree.node_count):
if tree.children_left[node_idx] == tree.children_right[node_idx]: # leaf node
leaf_indices = np.where(X_leaves_bootstrap[:, i] == node_idx)[0]
leaf_targets = y[bootstrap_indices[:, i][leaf_indices] - 1]
if len(leaf_indices) > 0:
min_values[node_idx] = np.min(leaf_targets)
max_values[node_idx] = np.max(leaf_targets)
mid_values[node_idx] = np.mean(leaf_targets)

Check warning on line 379 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L372-L379

Added lines #L372 - L379 were not covered by tests

# Propagate values from leaves to root.
for node_idx in range(tree.node_count - 1, -1, -1):
if tree.children_left[node_idx] != tree.children_right[node_idx]: # non-leaf node
left_child = tree.children_left[node_idx]
right_child = tree.children_right[node_idx]

Check warning on line 385 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L382-L385

Added lines #L382 - L385 were not covered by tests

# The min and max of the parent is the min/max of its children.
min_values[node_idx] = min(min_values[left_child], min_values[right_child])
max_values[node_idx] = max(max_values[left_child], max_values[right_child])

Check warning on line 389 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L388-L389

Added lines #L388 - L389 were not covered by tests

# Traverse from root to leaves to enforce monotonicity.
stack = [(0, -np.inf, np.inf)] # start with root node (node 0)

Check warning on line 392 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L392

Added line #L392 was not covered by tests

while stack:
node_idx, min_bound, max_bound = stack.pop()

Check warning on line 395 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L394-L395

Added lines #L394 - L395 were not covered by tests

if tree.children_left[node_idx] == tree.children_right[node_idx]: # leaf node

Check warning on line 397 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L397

Added line #L397 was not covered by tests
# The bounds have already been calculated in the leaf.
y_bound_leaves[i, node_idx] = [

Check warning on line 399 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L399

Added line #L399 was not covered by tests
# max(min_values[node_idx], min_bound), min(max_values[node_idx], max_bound),
min_bound,
max_bound,
]
else: # non-leaf node
feature_idx = tree.feature[node_idx]
left_child = tree.children_left[node_idx]
right_child = tree.children_right[node_idx]

Check warning on line 407 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L405-L407

Added lines #L405 - L407 were not covered by tests

mid = (min_values[left_child] + max_values[right_child]) / 2
mid = max(min_values[left_child], min(mid, max_values[right_child]))

Check warning on line 410 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L409-L410

Added lines #L409 - L410 were not covered by tests

if self.monotonic_cst[feature_idx] == 1: # increasing monotonicity
stack.append(

Check warning on line 413 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L412-L413

Added lines #L412 - L413 were not covered by tests
# (left_child, min_bound, min(max_bound, max_values[node_idx]))
(left_child, min_bound, np.nanmin([mid, np.inf]))
)
stack.append(

Check warning on line 417 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L417

Added line #L417 was not covered by tests
# (right_child, max(min_bound, min_values[node_idx]), max_bound)
(right_child, np.nanmax([mid, -np.inf]), max_bound)
)
elif self.monotonic_cst[feature_idx] == -1: # decreasing monotonicity

Check warning on line 421 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L421

Added line #L421 was not covered by tests
# stack.append((left_child, max(min_bound, min_values[node_idx]), max_bound))
stack.append((left_child, np.nanmax([mid, -np.inf]), max_bound))

Check warning on line 423 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L423

Added line #L423 was not covered by tests
# stack.append(
# (right_child, min_bound, min(max_bound, max_values[node_idx]))
# )
stack.append((right_child, min_bound, np.nanmin([mid, np.inf])))

Check warning on line 427 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L427

Added line #L427 was not covered by tests
else:
stack.append((left_child, min_bound, max_bound))
stack.append((right_child, min_bound, max_bound))

Check warning on line 430 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L429-L430

Added lines #L429 - L430 were not covered by tests

return y_bound_leaves

Check warning on line 432 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L432

Added line #L432 was not covered by tests

def _oob_samples(self, X, indices=None, duplicates=None):
"""Generate out-of-bag (OOB) samples for each base estimator.
Expand Down Expand Up @@ -579,19 +667,30 @@ def predict(

if self.max_samples_leaf == 1: # optimize for single-sample-per-leaf performance
y_train_leaves = np.asarray(self.forest_.y_train_leaves)
y_bound_leaves = np.asarray(self.forest_.y_bound_leaves)
y_train = np.asarray(self.forest_.y_train).T
y_pred = np.empty((len(X), y_train.shape[1], len(quantiles)))
for output in range(y_train.shape[1]):
leaf_values = np.empty((len(X), self.n_estimators))
for tree in range(self.n_estimators):
if X_indices is None: # IB scoring
train_indices = y_train_leaves[tree, X_leaves[:, tree], output, 0]
leaves = X_leaves[:, tree]
train_indices = y_train_leaves[tree, leaves, output, 0]
if self.monotonic_cst is not None:
clip_min = y_bound_leaves[tree, leaves, 0]
clip_max = y_bound_leaves[tree, leaves, 1]

Check warning on line 681 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L680-L681

Added lines #L680 - L681 were not covered by tests
else: # OOB scoring
indices = X_indices[:, tree] == 1
leaves = X_leaves[indices, tree]
train_indices = np.zeros(len(X), dtype=int)
train_indices[indices] = y_train_leaves[tree, leaves, output, 0]
if self.monotonic_cst is not None:
clip_min, clip_max = np.zeros(len(X)), np.zeros(len(X))
clip_min[indices] = y_bound_leaves[tree, leaves, 0]
clip_max[indices] = y_bound_leaves[tree, leaves, 1]

Check warning on line 690 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L688-L690

Added lines #L688 - L690 were not covered by tests
leaf_values[:, tree] = y_train[train_indices - 1, output]
if self.monotonic_cst is not None:
leaf_values[:, tree] = np.clip(leaf_values[:, tree], clip_min, clip_max)

Check warning on line 693 in quantile_forest/_quantile_forest.py

View check run for this annotation

Codecov / codecov/patch

quantile_forest/_quantile_forest.py#L693

Added line #L693 was not covered by tests
leaf_values[train_indices == 0, tree] = np.nan
if len(quantiles) == 1 and quantiles[0] == -1: # calculate mean
func = np.mean if X_indices is None else np.nanmean
Expand Down
1 change: 1 addition & 0 deletions quantile_forest/_quantile_forest_fast.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ cdef class QuantileForest:
# Input/Output layout.
cdef public vector[vector[float64_t]] y_train
cdef public intp_t[:, :, :, :] y_train_leaves
cdef public float64_t[:, :, :] y_bound_leaves
cdef public bint sparse_pickle

# Methods.
Expand Down
71 changes: 58 additions & 13 deletions quantile_forest/_quantile_forest_fast.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -563,13 +563,17 @@ cdef class QuantileForest:
Training target values. Assumes values are sorted in ascending order.
y_train_leaves : array-like of shape \
(n_estimators, n_leaves, n_outputs, n_indices)
(n_estimators, n_leaves, n_indices, n_outputs)
List of trees, each with a list of nodes, each with a list of indices
of the training samples residing at that node. Nodes with no samples
(e.g., internal nodes) are empty. Internal nodes are included so that
leaf node indices match their ``est.apply`` outputs. Each node list is
padded to equal length with 0s.
y_bound_leaves : array-like of shape (n_estimators, n_leaves, 2)
Minimum and maximum bounds for target values for each leaf node. Used
to enforce monotonicity constraints.
sparse_pickle : bool, default=False
Pickle using a SciPy sparse matrix.
"""
Expand All @@ -578,38 +582,56 @@ cdef class QuantileForest:
self,
cnp.ndarray[float64_t, ndim=2] y_train,
cnp.ndarray[intp_t, ndim=4] y_train_leaves,
cnp.ndarray[float64_t, ndim=3] y_bound_leaves=None,
bint sparse_pickle=<bint>False,
):
"""Constructor."""
self.y_train = y_train
self.y_train_leaves = y_train_leaves
self.y_bound_leaves = y_bound_leaves
self.sparse_pickle = sparse_pickle

def __reduce__(self):
"""Reduce re-implementation, for pickling."""
if self.sparse_pickle:
y_train_leaves = np.empty(shape=(0, 0, 0, 0), dtype=np.int64)
kwargs = {"y_train_leaves": np.asarray(self.y_train_leaves)}
y_bound_leaves = None
kwargs = {
"y_train_leaves": np.asarray(self.y_train_leaves),
"y_bound_leaves": (
None if self.y_bound_leaves is None else np.asarray(self.y_bound_leaves)
),
}
else:
y_train_leaves = np.asarray(self.y_train_leaves)
y_bound_leaves = (
None if self.y_bound_leaves is None else np.asarray(self.y_bound_leaves)
)
kwargs = {}
args = (np.asarray(self.y_train), y_train_leaves, self.sparse_pickle)
args = (np.asarray(self.y_train), y_train_leaves, y_bound_leaves, self.sparse_pickle)
return (QuantileForest, args, self.__getstate__(**kwargs))

def __getstate__(self, **kwargs):
"""Getstate re-implementation, for pickling."""
d = {}
if self.sparse_pickle:
matrix = kwargs["y_train_leaves"]
reshape = (matrix.shape[2], matrix.shape[0] * matrix.shape[1] * matrix.shape[2])
d["shape"] = matrix.shape
d["matrix"] = sparse.csc_matrix(matrix.reshape(reshape))
matrix1 = kwargs["y_train_leaves"]
reshape1 = (matrix1.shape[2], matrix1.shape[0] * matrix1.shape[1] * matrix1.shape[2])
d["shape1"] = matrix1.shape
d["matrix1"] = sparse.csc_matrix(matrix1.reshape(reshape1))

matrix2 = kwargs["y_bound_leaves"]
reshape2 = (matrix2.shape[2], matrix2.shape[0] * matrix2.shape[1])
d["shape2"] = matrix2.shape
d["matrix2"] = sparse.csc_matrix(matrix2.reshape(reshape2))

return d

def __setstate__(self, d):
"""Setstate re-implementation, for unpickling."""
if self.sparse_pickle:
self.y_train_leaves = d["matrix"].toarray().reshape(d["shape"])
self.y_train_leaves = d["matrix1"].toarray().reshape(d["shape1"])
self.y_bound_leaves = d["matrix2"].toarray().reshape(d["shape2"])

cpdef cnp.ndarray predict(
self,
Expand Down Expand Up @@ -664,12 +686,12 @@ cdef class QuantileForest:
cdef vector[double] leaf_samples
cdef vector[double] leaf_weights
cdef vector[vector[intp_t]] train_indices
cdef vector[vector[double]] train_weights
cdef vector[vector[double]] train_weights, clip_mins, clip_maxs
cdef intp_t idx, train_idx
cdef double train_wgt
cdef double clip_min, clip_max, train_wgt
cdef vector[int] n_leaf_samples
cdef int n_total_samples, n_total_trees
cdef double train_weight
cdef double train_weight, sample_value
cdef vector[vector[double]] leaf_preds
cdef vector[double] pred
cdef cnp.ndarray[float64_t, ndim=3] preds
Expand Down Expand Up @@ -710,6 +732,8 @@ cdef class QuantileForest:
with nogil:
idx = 1 if aggregate_leaves_first else n_trees
train_indices = vector[vector[intp_t]](idx)
clip_mins = vector[vector[double]](idx)
clip_maxs = vector[vector[double]](idx)

n_leaf_samples = vector[int](n_trees)
leaf_preds = vector[vector[double]](n_quantiles)
Expand Down Expand Up @@ -738,6 +762,12 @@ cdef class QuantileForest:
for k in range(<intp_t>(leaf_preds.size())):
leaf_preds[k].clear()

# Initialize arrays for clipping.
for k in range(<intp_t>(clip_mins.size())):
clip_mins[k].clear()
for k in range(<intp_t>(clip_maxs.size())):
clip_maxs[k].clear()

# Accumulate training indices across leaves for each tree.
# If `aggregate_leaves_first`, also accumulate across trees.
for k in range(n_trees):
Expand All @@ -749,6 +779,14 @@ cdef class QuantileForest:
&self.y_train_leaves[k, X_leaves[i, k], j, max_idx],
)

if self.y_bound_leaves is not None:
# Repeat the insertion of the corresponding clipping bounds.
clip_min = self.y_bound_leaves[k, X_leaves[i, k], 0]
clip_max = self.y_bound_leaves[k, X_leaves[i, k], 1]
for _ in range(max_idx):
clip_mins[idx].push_back(clip_min)
clip_maxs[idx].push_back(clip_max)

if weighted_quantile:
for k in range(<intp_t>(train_weights.size())):
train_weights[k].clear()
Expand Down Expand Up @@ -807,9 +845,16 @@ cdef class QuantileForest:
leaf_samples.clear()

# Get training target values associated with indices.
for train_idx in train_indices[k]:
for l in range(<intp_t>(train_indices[k].size())):
train_idx = train_indices[k][l]
if train_idx != 0:
leaf_samples.push_back(self.y_train[j][train_idx - 1])
# Apply clipping to each training sample.
sample_value = self.y_train[j][train_idx - 1]
if self.y_bound_leaves is not None:
clip_min = clip_mins[k][l]
clip_max = clip_maxs[k][l]
sample_value = max(min(sample_value, clip_max), clip_min)
leaf_samples.push_back(sample_value)

# Calculate quantiles (or mean).
if not use_mean:
Expand Down
6 changes: 4 additions & 2 deletions quantile_forest/tests/test_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,9 @@ def check_max_samples_leaf(name):
est.fit(X, y)

max_leaf_size = 0
for _, tree_lookup in enumerate(est._get_y_train_leaves(X, 1)):
y_train_leaves, _ = est._get_y_train_leaves(X, y[:1])
for i in range(len(y_train_leaves)):
tree_lookup = y_train_leaves[i]
for leaf_samples in np.squeeze(tree_lookup, -2):
n_leaf_samples = len([x for x in leaf_samples if x != 0])
if n_leaf_samples > max_leaf_size:
Expand All @@ -819,7 +821,7 @@ def check_max_samples_leaf(name):
est.param_validation = param_validation
assert_raises(ValueError, est.fit, X, y)
est.max_samples_leaf = max_samples_leaf
assert_raises(ValueError, est._get_y_train_leaves, X, 1)
assert_raises(ValueError, est._get_y_train_leaves, X, y)


@pytest.mark.parametrize("name", FOREST_REGRESSORS)
Expand Down

0 comments on commit d7cc9d2

Please sign in to comment.