diff --git a/quantile_forest/_quantile_forest.py b/quantile_forest/_quantile_forest.py
index f232bb3..fb1e4e7 100755
--- a/quantile_forest/_quantile_forest.py
+++ b/quantile_forest/_quantile_forest.py
@@ -196,13 +196,16 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):
             sample_weight = np.asarray(sample_weight)[sorter]
 
         # Get map of tree leaf nodes to training indices.
-        y_train_leaves = self._get_y_train_leaves(
-            X, y_sorted.shape[-1], sorter=sorter, sample_weight=sample_weight
+        y_train_leaves, y_bound_leaves = self._get_y_train_leaves(
+            X, y_sorted, sorter=sorter, sample_weight=sample_weight
         )
 
         # Create quantile forest object.
         self.forest_ = QuantileForest(
-            y_sorted.astype(np.float64).T, y_train_leaves, sparse_pickle=sparse_pickle
+            y_sorted.astype(np.float64).T,
+            y_train_leaves,
+            y_bound_leaves,
+            sparse_pickle=sparse_pickle,
         )
 
         self.sorter_ = sorter
@@ -212,7 +215,7 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):
 
         return self
 
-    def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
+    def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
         """Return a mapping of each leaf node to its list of training indices.
 
         The ``apply`` function is used on the ``X`` values to obtain the leaf
@@ -225,8 +228,8 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
             to ``dtype=np.float32``. If a sparse matrix is provided, it will be
             converted into a sparse ``csc_matrix``.
 
-        y_dim : int
-            The number of target outputs for each y value.
+        y : array-like of shape (n_samples, n_outputs)
+            The target values for each sample.
 
         sorter : array-like of shape (n_samples, n_outputs), default=None
             The indices that would sort the target values in ascending order.
@@ -240,14 +243,19 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
         Returns
         -------
         y_train_leaves : array-like of shape \
                 (n_estimators, n_leaves, n_outputs, n_indices)
             List of trees, each with a list of nodes, each with a list of
             indices of the training samples residing at that node. Nodes with
             no samples (e.g., internal nodes) are empty. Internal nodes are
             included so that leaf node indices match their ``est.apply``
             outputs. Each node list is padded to equal length with 0s.
+
+        y_bound_leaves : array-like of shape (n_estimators, n_leaves, 2)
+            Minimum and maximum target-value bounds for each leaf node. Used
+            to enforce monotonicity constraints.
         """
         n_samples = X.shape[0]
+        y_dim = y.shape[-1]
 
         if isinstance(self.max_samples_leaf, (Integral, np.integer)):
             max_samples_leaf = self.max_samples_leaf
@@ -341,7 +349,77 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
                     for j in range(y_dim):
                         y_train_leaves[i, leaf_idx, j, : len(y_indices)] = y_indices
 
-        return y_train_leaves
+        y_bound_leaves = self._get_y_bound_leaves(
+            X_leaves_bootstrap, bootstrap_indices, y, max_node_count
+        )
+
+        return y_train_leaves, y_bound_leaves
+
+    def _get_y_bound_leaves(self, X_leaves_bootstrap, bootstrap_indices, y, max_node_count):
+        """Return the per-leaf [min, max] bounds on target values used to
+        enforce monotonicity constraints, or None if no constraints are set."""
+        if self.monotonic_cst is None:
+            return None
+
+        y_bound_leaves = np.full((self.n_estimators, max_node_count, 2), [-np.inf, np.inf])
+
+        for i, estimator in enumerate(self.estimators_):
+            tree = estimator.tree_
+
+            min_values = np.full(tree.node_count, np.inf)
+            max_values = np.full(tree.node_count, -np.inf)
+
+            # Populate the leaf nodes with actual target values.
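+            # A node is a leaf when its left and right child ids are equal
+            # (scikit-learn marks both as TREE_LEAF == -1).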
+            for node_idx in range(tree.node_count):
+                if tree.children_left[node_idx] == tree.children_right[node_idx]:  # leaf node
+                    leaf_indices = np.where(X_leaves_bootstrap[:, i] == node_idx)[0]
+                    if len(leaf_indices) > 0:
+                        leaf_targets = y[bootstrap_indices[:, i][leaf_indices] - 1]
+                        min_values[node_idx] = np.min(leaf_targets)
+                        max_values[node_idx] = np.max(leaf_targets)
+
+            # Propagate values from leaves to root.
+            for node_idx in range(tree.node_count - 1, -1, -1):
+                if tree.children_left[node_idx] != tree.children_right[node_idx]:  # non-leaf node
+                    left_child = tree.children_left[node_idx]
+                    right_child = tree.children_right[node_idx]
+
+                    # A parent's min and max are the min/max over its children.
+                    min_values[node_idx] = min(min_values[left_child], min_values[right_child])
+                    max_values[node_idx] = max(max_values[left_child], max_values[right_child])
+
+            # Traverse from root to leaves to enforce monotonicity.
+            stack = [(0, -np.inf, np.inf)]  # start with root node (node 0)
+
+            while stack:
+                node_idx, min_bound, max_bound = stack.pop()
+
+                if tree.children_left[node_idx] == tree.children_right[node_idx]:  # leaf node
+                    # Record the bounds accumulated along the path to this leaf.
+                    y_bound_leaves[i, node_idx] = [min_bound, max_bound]
+                else:  # non-leaf node
+                    feature_idx = tree.feature[node_idx]
+                    left_child = tree.children_left[node_idx]
+                    right_child = tree.children_right[node_idx]
+
+                    # Midpoint between the left subtree's min and the right subtree's
+                    # max, clamped into that range (NaN if both subtrees are empty).
+                    mid = (min_values[left_child] + max_values[right_child]) / 2
+                    mid = max(min_values[left_child], min(mid, max_values[right_child]))
+
+                    if self.monotonic_cst[feature_idx] == 1:  # increasing monotonicity
+                        stack.append((left_child, min_bound, np.nanmin([mid, np.inf])))
+                        stack.append((right_child, np.nanmax([mid, -np.inf]), max_bound))
+                    elif self.monotonic_cst[feature_idx] == -1:  # decreasing monotonicity
+                        stack.append((left_child, np.nanmax([mid, -np.inf]), max_bound))
+                        stack.append((right_child, min_bound, np.nanmin([mid, np.inf])))
+                    else:
+                        stack.append((left_child, min_bound, max_bound))
+                        stack.append((right_child, min_bound, max_bound))
+
+        return y_bound_leaves
 
     def _oob_samples(self, X, indices=None, duplicates=None):
         """Generate out-of-bag (OOB) samples for each base estimator.
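The root-to-leaf pass above can be traced on a hand-built stump. A minimal sketch, assuming scikit-learn's node layout (`children_left[n] == children_right[n]` marks a leaf, `-2` an undefined split feature); both children here are leaves, so their raw target ranges stand in for the propagated `min_values`/`max_values`:

    inf = float("inf")
    children_left = [1, -1, -1]    # node 0 splits; nodes 1 and 2 are leaves
    children_right = [2, -1, -1]
    feature = [0, -2, -2]          # node 0 splits on feature 0
    leaf_min = {1: 1.0, 2: 0.0}    # raw leaf target ranges that violate an
    leaf_max = {1: 4.0, 2: 3.0}    # increasing constraint (left above right)
    monotonic_cst = {0: 1}         # feature 0 must be non-decreasing

    bounds = {}
    stack = [(0, -inf, inf)]
    while stack:
        node, lo, hi = stack.pop()
        if children_left[node] == children_right[node]:  # leaf
            bounds[node] = (lo, hi)
            continue
        left, right = children_left[node], children_right[node]
        # Split value between the left subtree's min and the right subtree's
        # max, clamped into that interval (the `mid` of the diff).
        mid = (leaf_min[left] + leaf_max[right]) / 2
        mid = max(leaf_min[left], min(mid, leaf_max[right]))
        if monotonic_cst.get(feature[node]) == 1:  # increasing
            stack.append((left, lo, mid))   # left subtree capped above at mid
            stack.append((right, mid, hi))  # right subtree floored below at mid
        else:
            stack.append((left, lo, hi))
            stack.append((right, lo, hi))

    print(bounds)  # {2: (2.0, inf), 1: (-inf, 2.0)}

Clipping leaf 1's values to at most 2.0 and leaf 2's to at least 2.0 restores the non-decreasing ordering that the unconstrained leaves violated.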
@@ -579,19 +667,30 @@ def predict(
         if self.max_samples_leaf == 1:  # optimize for single-sample-per-leaf performance
             y_train_leaves = np.asarray(self.forest_.y_train_leaves)
+            y_bound_leaves = np.asarray(self.forest_.y_bound_leaves)
             y_train = np.asarray(self.forest_.y_train).T
 
             y_pred = np.empty((len(X), y_train.shape[1], len(quantiles)))
             for output in range(y_train.shape[1]):
                 leaf_values = np.empty((len(X), self.n_estimators))
                 for tree in range(self.n_estimators):
                     if X_indices is None:  # IB scoring
-                        train_indices = y_train_leaves[tree, X_leaves[:, tree], output, 0]
+                        leaves = X_leaves[:, tree]
+                        train_indices = y_train_leaves[tree, leaves, output, 0]
+                        if self.monotonic_cst is not None:
+                            clip_min = y_bound_leaves[tree, leaves, 0]
+                            clip_max = y_bound_leaves[tree, leaves, 1]
                     else:  # OOB scoring
                         indices = X_indices[:, tree] == 1
                         leaves = X_leaves[indices, tree]
                         train_indices = np.zeros(len(X), dtype=int)
                         train_indices[indices] = y_train_leaves[tree, leaves, output, 0]
+                        if self.monotonic_cst is not None:
+                            clip_min, clip_max = np.zeros(len(X)), np.zeros(len(X))
+                            clip_min[indices] = y_bound_leaves[tree, leaves, 0]
+                            clip_max[indices] = y_bound_leaves[tree, leaves, 1]
                     leaf_values[:, tree] = y_train[train_indices - 1, output]
+                    if self.monotonic_cst is not None:
+                        leaf_values[:, tree] = np.clip(leaf_values[:, tree], clip_min, clip_max)
                     leaf_values[train_indices == 0, tree] = np.nan
                 if len(quantiles) == 1 and quantiles[0] == -1:  # calculate mean
                     func = np.mean if X_indices is None else np.nanmean
diff --git a/quantile_forest/_quantile_forest_fast.pxd b/quantile_forest/_quantile_forest_fast.pxd
index 3f6a403..b6acde2 100755
--- a/quantile_forest/_quantile_forest_fast.pxd
+++ b/quantile_forest/_quantile_forest_fast.pxd
@@ -15,6 +15,7 @@ cdef class QuantileForest:
     # Input/Output layout.
     cdef public vector[vector[float64_t]] y_train
     cdef public intp_t[:, :, :, :] y_train_leaves
+    cdef public float64_t[:, :, :] y_bound_leaves
     cdef public bint sparse_pickle
 
     # Methods.
diff --git a/quantile_forest/_quantile_forest_fast.pyx b/quantile_forest/_quantile_forest_fast.pyx
index a4be5b7..d33b0e9 100755
--- a/quantile_forest/_quantile_forest_fast.pyx
+++ b/quantile_forest/_quantile_forest_fast.pyx
@@ -563,13 +567,17 @@ cdef class QuantileForest:
         Training target values. Assumes values are sorted in ascending order.
 
     y_train_leaves : array-like of shape \
            (n_estimators, n_leaves, n_outputs, n_indices)
        List of trees, each with a list of nodes, each with a list of
        indices of the training samples residing at that node. Nodes with
        no samples (e.g., internal nodes) are empty. Internal nodes are
        included so that leaf node indices match their ``est.apply``
        outputs. Each node list is padded to equal length with 0s.
 
+    y_bound_leaves : array-like of shape (n_estimators, n_leaves, 2)
+        Minimum and maximum target-value bounds for each leaf node. Used
+        to enforce monotonicity constraints.
+
     sparse_pickle : bool, default=False
         Pickle using a SciPy sparse matrix.
     """
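For the `max_samples_leaf == 1` fast path above, the clipping reduces to array indexing plus `np.clip`. A minimal NumPy sketch with toy shapes (single output; the names mirror the diff's local variables, not a public API):

    import numpy as np

    rng = np.random.default_rng(0)

    # Toy sizes: 2 trees, 4 leaves per tree, 10 training targets, 1 output
    # (the per-output axis of the real arrays is dropped for brevity).
    y_train = rng.normal(size=10)
    y_train_leaves = rng.integers(1, 11, size=(2, 4))  # 1-based; 0 means empty
    y_bound_leaves = np.full((2, 4, 2), [-1.0, 1.0])   # [min, max] per leaf
    X_leaves = np.array([[0, 3], [2, 1], [1, 1]])      # leaf hit per sample/tree

    leaf_values = np.empty((len(X_leaves), 2))
    for tree in range(2):
        leaves = X_leaves[:, tree]
        train_indices = y_train_leaves[tree, leaves]
        values = y_train[train_indices - 1]
        # Clip each tree's leaf value into that leaf's monotone bounds.
        clip_min = y_bound_leaves[tree, leaves, 0]
        clip_max = y_bound_leaves[tree, leaves, 1]
        leaf_values[:, tree] = np.clip(values, clip_min, clip_max)

    print(np.median(leaf_values, axis=1))  # per-sample aggregate across trees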
""" @@ -578,38 +582,56 @@ cdef class QuantileForest: self, cnp.ndarray[float64_t, ndim=2] y_train, cnp.ndarray[intp_t, ndim=4] y_train_leaves, + cnp.ndarray[float64_t, ndim=3] y_bound_leaves=None, bint sparse_pickle=False, ): """Constructor.""" self.y_train = y_train self.y_train_leaves = y_train_leaves + self.y_bound_leaves = y_bound_leaves self.sparse_pickle = sparse_pickle def __reduce__(self): """Reduce re-implementation, for pickling.""" if self.sparse_pickle: y_train_leaves = np.empty(shape=(0, 0, 0, 0), dtype=np.int64) - kwargs = {"y_train_leaves": np.asarray(self.y_train_leaves)} + y_bound_leaves = None + kwargs = { + "y_train_leaves": np.asarray(self.y_train_leaves), + "y_bound_leaves": ( + None if self.y_bound_leaves is None else np.asarray(self.y_bound_leaves) + ), + } else: y_train_leaves = np.asarray(self.y_train_leaves) + y_bound_leaves = ( + None if self.y_bound_leaves is None else np.asarray(self.y_bound_leaves) + ) kwargs = {} - args = (np.asarray(self.y_train), y_train_leaves, self.sparse_pickle) + args = (np.asarray(self.y_train), y_train_leaves, y_bound_leaves, self.sparse_pickle) return (QuantileForest, args, self.__getstate__(**kwargs)) def __getstate__(self, **kwargs): """Getstate re-implementation, for pickling.""" d = {} if self.sparse_pickle: - matrix = kwargs["y_train_leaves"] - reshape = (matrix.shape[2], matrix.shape[0] * matrix.shape[1] * matrix.shape[2]) - d["shape"] = matrix.shape - d["matrix"] = sparse.csc_matrix(matrix.reshape(reshape)) + matrix1 = kwargs["y_train_leaves"] + reshape1 = (matrix1.shape[2], matrix1.shape[0] * matrix1.shape[1] * matrix1.shape[2]) + d["shape1"] = matrix1.shape + d["matrix1"] = sparse.csc_matrix(matrix1.reshape(reshape1)) + + matrix2 = kwargs["y_bound_leaves"] + reshape2 = (matrix2.shape[2], matrix2.shape[0] * matrix2.shape[1]) + d["shape2"] = matrix2.shape + d["matrix2"] = sparse.csc_matrix(matrix2.reshape(reshape2)) + return d def __setstate__(self, d): """Setstate re-implementation, for unpickling.""" if self.sparse_pickle: - self.y_train_leaves = d["matrix"].toarray().reshape(d["shape"]) + self.y_train_leaves = d["matrix1"].toarray().reshape(d["shape1"]) + self.y_bound_leaves = d["matrix2"].toarray().reshape(d["shape2"]) cpdef cnp.ndarray predict( self, @@ -664,12 +686,12 @@ cdef class QuantileForest: cdef vector[double] leaf_samples cdef vector[double] leaf_weights cdef vector[vector[intp_t]] train_indices - cdef vector[vector[double]] train_weights + cdef vector[vector[double]] train_weights, clip_mins, clip_maxs cdef intp_t idx, train_idx - cdef double train_wgt + cdef double clip_min, clip_max, train_wgt cdef vector[int] n_leaf_samples cdef int n_total_samples, n_total_trees - cdef double train_weight + cdef double train_weight, sample_value cdef vector[vector[double]] leaf_preds cdef vector[double] pred cdef cnp.ndarray[float64_t, ndim=3] preds @@ -710,6 +732,8 @@ cdef class QuantileForest: with nogil: idx = 1 if aggregate_leaves_first else n_trees train_indices = vector[vector[intp_t]](idx) + clip_mins = vector[vector[double]](idx) + clip_maxs = vector[vector[double]](idx) n_leaf_samples = vector[int](n_trees) leaf_preds = vector[vector[double]](n_quantiles) @@ -738,6 +762,12 @@ cdef class QuantileForest: for k in range((leaf_preds.size())): leaf_preds[k].clear() + # Initialize arrays for clipping. + for k in range((clip_mins.size())): + clip_mins[k].clear() + for k in range((clip_maxs.size())): + clip_maxs[k].clear() + # Accumulate training indices across leaves for each tree. 
                 # Accumulate training indices across leaves for each tree.
                 # If `aggregate_leaves_first`, also accumulate across trees.
                 for k in range(n_trees):
@@ -749,6 +779,15 @@ cdef class QuantileForest:
                         train_indices[idx].insert(
                             train_indices[idx].end(),
                             &self.y_train_leaves[k, X_leaves[i, k], j, 0],
                             &self.y_train_leaves[k, X_leaves[i, k], j, max_idx],
                         )
+                        if self.y_bound_leaves is not None:
+                            # Push the leaf's clipping bounds once per inserted
+                            # index so they stay aligned with `train_indices`.
+                            clip_min = self.y_bound_leaves[k, X_leaves[i, k], 0]
+                            clip_max = self.y_bound_leaves[k, X_leaves[i, k], 1]
+                            for _ in range(max_idx):
+                                clip_mins[idx].push_back(clip_min)
+                                clip_maxs[idx].push_back(clip_max)
+
                 if weighted_quantile:
                     for k in range((train_weights.size())):
                         train_weights[k].clear()
@@ -807,9 +845,16 @@ cdef class QuantileForest:
                     leaf_samples.clear()
 
                     # Get training target values associated with indices.
-                    for train_idx in train_indices[k]:
+                    for l in range((train_indices[k].size())):
+                        train_idx = train_indices[k][l]
                         if train_idx != 0:
-                            leaf_samples.push_back(self.y_train[j][train_idx - 1])
+                            sample_value = self.y_train[j][train_idx - 1]
+                            if self.y_bound_leaves is not None:
+                                # Clip the sample to its leaf's monotone bounds.
+                                clip_min = clip_mins[k][l]
+                                clip_max = clip_maxs[k][l]
+                                sample_value = max(min(sample_value, clip_max), clip_min)
+                            leaf_samples.push_back(sample_value)
 
                     # Calculate quantiles (or mean).
                     if not use_mean:
diff --git a/quantile_forest/tests/test_quantile_forest.py b/quantile_forest/tests/test_quantile_forest.py
index 7691556..207a8c9 100755
--- a/quantile_forest/tests/test_quantile_forest.py
+++ b/quantile_forest/tests/test_quantile_forest.py
@@ -793,7 +793,8 @@ def check_max_samples_leaf(name):
     est.fit(X, y)
 
     max_leaf_size = 0
-    for _, tree_lookup in enumerate(est._get_y_train_leaves(X, 1)):
+    y_train_leaves, _ = est._get_y_train_leaves(X, y[:1])  # y[:1] gives y.shape[-1] == 1
+    for tree_lookup in y_train_leaves:
         for leaf_samples in np.squeeze(tree_lookup, -2):
             n_leaf_samples = len([x for x in leaf_samples if x != 0])
             if n_leaf_samples > max_leaf_size:
                 max_leaf_size = n_leaf_samples
@@ -819,7 +820,7 @@ def check_max_samples_leaf(name):
     est.param_validation = param_validation
     assert_raises(ValueError, est.fit, X, y)
     est.max_samples_leaf = max_samples_leaf
-    assert_raises(ValueError, est._get_y_train_leaves, X, 1)
+    assert_raises(ValueError, est._get_y_train_leaves, X, y)
 
 
 @pytest.mark.parametrize("name", FOREST_REGRESSORS)
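Taken together, the patch should make constrained quantile predictions monotone end to end. A quick property check, sketched under the assumption that the estimator exposes the `monotonic_cst` constructor argument consumed above (scikit-learn >= 1.4 underneath); this sketch is not part of the patch:

    import numpy as np
    from quantile_forest import RandomForestQuantileRegressor

    rng = np.random.default_rng(0)
    X = rng.uniform(0, 10, size=(500, 1))
    y = X[:, 0] + rng.normal(scale=2.0, size=500)  # increasing trend plus noise

    est = RandomForestQuantileRegressor(monotonic_cst=[1], random_state=0)
    est.fit(X, y)

    X_grid = np.linspace(0, 10, num=200).reshape(-1, 1)
    y_pred = est.predict(X_grid, quantiles=0.5)
    assert np.all(np.diff(y_pred) >= -1e-8)  # median non-decreasing in the feature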