Initial commit

zillow · Aug 13, 2024 · d7cc9d2 · d7cc9d2
1 parent 8db4ac6
commit d7cc9d2
Show file tree

Hide file tree

Showing 4 changed files with 171 additions and 24 deletions.
diff --git a/quantile_forest/_quantile_forest.py b/quantile_forest/_quantile_forest.py
@@ -196,13 +196,16 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):
             sample_weight = np.asarray(sample_weight)[sorter]
 
         # Get map of tree leaf nodes to training indices.
-        y_train_leaves = self._get_y_train_leaves(
-            X, y_sorted.shape[-1], sorter=sorter, sample_weight=sample_weight
+        y_train_leaves, y_bound_leaves = self._get_y_train_leaves(
+            X, y_sorted, sorter=sorter, sample_weight=sample_weight
         )
 
         # Create quantile forest object.
         self.forest_ = QuantileForest(
-            y_sorted.astype(np.float64).T, y_train_leaves, sparse_pickle=sparse_pickle
+            y_sorted.astype(np.float64).T,
+            y_train_leaves,
+            y_bound_leaves,
+            sparse_pickle=sparse_pickle,
         )
 
         self.sorter_ = sorter
@@ -212,7 +215,7 @@ def fit(self, X, y, sample_weight=None, sparse_pickle=False):
 
         return self
 
-    def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
+    def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
         """Return a mapping of each leaf node to its list of training indices.
 
         The ``apply`` function is used on the ``X`` values to obtain the leaf
@@ -225,8 +228,8 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
             to ``dtype=np.float32``. If a sparse matrix is provided, it will be
             converted into a sparse ``csc_matrix``.
 
-        y_dim : int
-            The number of target outputs for each y value.
+        y : array-like of shape (n_samples, n_outputs)
+            The target outputs for each y value.
 
         sorter : array-like of shape (n_samples, n_outputs), default=None
             The indices that would sort the target values in ascending order.
@@ -240,14 +243,19 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
         Returns
         -------
         y_train_leaves : array-like of shape \
-                (n_estimators, n_leaves, n_outputs, n_indices)
+                (n_estimators, n_leaves, n_indices, n_outputs)
             List of trees, each with a list of nodes, each with a list of
             indices of the training samples residing at that node. Nodes with
             no samples (e.g., internal nodes) are empty. Internal nodes are
             included so that leaf node indices match their ``est.apply``
             outputs. Each node list is padded to equal length with 0s.
+
+        y_bound_leaves : array-like of shape (n_estimators, n_leaves, 2)
+            Minimum and maximum bounds for target values for each leaf node.
+            Used to enforce monotonicity constraints.
         """
         n_samples = X.shape[0]
+        y_dim = y.shape[-1]
 
         if isinstance(self.max_samples_leaf, (Integral, np.integer)):
             max_samples_leaf = self.max_samples_leaf
@@ -341,7 +349,87 @@ def _get_y_train_leaves(self, X, y_dim, sorter=None, sample_weight=None):
                     for j in range(y_dim):
                         y_train_leaves[i, leaf_idx, j, : len(y_indices)] = y_indices
 
-        return y_train_leaves
+        y_bound_leaves = self._get_y_bound_leaves(
+            X_leaves_bootstrap, bootstrap_indices, y, max_node_count
+        )
+
+        return y_train_leaves, y_bound_leaves
+
+    def _get_y_bound_leaves(self, X_leaves_bootstrap, bootstrap_indices, y, max_node_count):
+        if self.monotonic_cst is None:
+            return None
+
+        y_bound_leaves = np.full((self.n_estimators, max_node_count, 2), [-np.inf, np.inf])
+
+        for i, estimator in enumerate(self.estimators_):
+            tree = estimator.tree_
+
+            min_values = np.full(tree.node_count, np.inf)
+            max_values = np.full(tree.node_count, -np.inf)
+            mid_values = np.full(tree.node_count, np.nan)
+
+            # Populate the leaf nodes with actual target values.
+            for node_idx in range(tree.node_count):
+                if tree.children_left[node_idx] == tree.children_right[node_idx]:  # leaf node
+                    leaf_indices = np.where(X_leaves_bootstrap[:, i] == node_idx)[0]
+                    leaf_targets = y[bootstrap_indices[:, i][leaf_indices] - 1]
+                    if len(leaf_indices) > 0:
+                        min_values[node_idx] = np.min(leaf_targets)
+                        max_values[node_idx] = np.max(leaf_targets)
+                        mid_values[node_idx] = np.mean(leaf_targets)
+
+            # Propagate values from leaves to root.
+            for node_idx in range(tree.node_count - 1, -1, -1):
+                if tree.children_left[node_idx] != tree.children_right[node_idx]:  # non-leaf node
+                    left_child = tree.children_left[node_idx]
+                    right_child = tree.children_right[node_idx]
+
+                    # The min and max of the parent is the min/max of its children.
+                    min_values[node_idx] = min(min_values[left_child], min_values[right_child])
+                    max_values[node_idx] = max(max_values[left_child], max_values[right_child])
+
+            # Traverse from root to leaves to enforce monotonicity.
+            stack = [(0, -np.inf, np.inf)]  # start with root node (node 0)
+
+            while stack:
+                node_idx, min_bound, max_bound = stack.pop()
+
+                if tree.children_left[node_idx] == tree.children_right[node_idx]:  # leaf node
+                    # The bounds have already been calculated in the leaf.
+                    y_bound_leaves[i, node_idx] = [
+                        # max(min_values[node_idx], min_bound), min(max_values[node_idx], max_bound),
+                        min_bound,
+                        max_bound,
+                    ]
+                else:  # non-leaf node
+                    feature_idx = tree.feature[node_idx]
+                    left_child = tree.children_left[node_idx]
+                    right_child = tree.children_right[node_idx]
+
+                    mid = (min_values[left_child] + max_values[right_child]) / 2
+                    mid = max(min_values[left_child], min(mid, max_values[right_child]))
+
+                    if self.monotonic_cst[feature_idx] == 1:  # increasing monotonicity
+                        stack.append(
+                            # (left_child, min_bound, min(max_bound, max_values[node_idx]))
+                            (left_child, min_bound, np.nanmin([mid, np.inf]))
+                        )
+                        stack.append(
+                            # (right_child, max(min_bound, min_values[node_idx]), max_bound)
+                            (right_child, np.nanmax([mid, -np.inf]), max_bound)
+                        )
+                    elif self.monotonic_cst[feature_idx] == -1:  # decreasing monotonicity
+                        # stack.append((left_child, max(min_bound, min_values[node_idx]), max_bound))
+                        stack.append((left_child, np.nanmax([mid, -np.inf]), max_bound))
+                        # stack.append(
+                        #     (right_child, min_bound, min(max_bound, max_values[node_idx]))
+                        # )
+                        stack.append((right_child, min_bound, np.nanmin([mid, np.inf])))
+                    else:
+                        stack.append((left_child, min_bound, max_bound))
+                        stack.append((right_child, min_bound, max_bound))
+
+        return y_bound_leaves
 
     def _oob_samples(self, X, indices=None, duplicates=None):
         """Generate out-of-bag (OOB) samples for each base estimator.
@@ -579,19 +667,30 @@ def predict(
 
         if self.max_samples_leaf == 1:  # optimize for single-sample-per-leaf performance
             y_train_leaves = np.asarray(self.forest_.y_train_leaves)
+            y_bound_leaves = np.asarray(self.forest_.y_bound_leaves)
             y_train = np.asarray(self.forest_.y_train).T
             y_pred = np.empty((len(X), y_train.shape[1], len(quantiles)))
             for output in range(y_train.shape[1]):
                 leaf_values = np.empty((len(X), self.n_estimators))
                 for tree in range(self.n_estimators):
                     if X_indices is None:  # IB scoring
-                        train_indices = y_train_leaves[tree, X_leaves[:, tree], output, 0]
+                        leaves = X_leaves[:, tree]
+                        train_indices = y_train_leaves[tree, leaves, output, 0]
+                        if self.monotonic_cst is not None:
+                            clip_min = y_bound_leaves[tree, leaves, 0]
+                            clip_max = y_bound_leaves[tree, leaves, 1]
                     else:  # OOB scoring
                         indices = X_indices[:, tree] == 1
                         leaves = X_leaves[indices, tree]
                         train_indices = np.zeros(len(X), dtype=int)
                         train_indices[indices] = y_train_leaves[tree, leaves, output, 0]
+                        if self.monotonic_cst is not None:
+                            clip_min, clip_max = np.zeros(len(X)), np.zeros(len(X))
+                            clip_min[indices] = y_bound_leaves[tree, leaves, 0]
+                            clip_max[indices] = y_bound_leaves[tree, leaves, 1]
                     leaf_values[:, tree] = y_train[train_indices - 1, output]
+                    if self.monotonic_cst is not None:
+                        leaf_values[:, tree] = np.clip(leaf_values[:, tree], clip_min, clip_max)
                     leaf_values[train_indices == 0, tree] = np.nan
                 if len(quantiles) == 1 and quantiles[0] == -1:  # calculate mean
                     func = np.mean if X_indices is None else np.nanmean

diff --git a/quantile_forest/_quantile_forest_fast.pxd b/quantile_forest/_quantile_forest_fast.pxd
@@ -15,6 +15,7 @@ cdef class QuantileForest:
     # Input/Output layout.
     cdef public vector[vector[float64_t]] y_train
     cdef public intp_t[:, :, :, :] y_train_leaves
+    cdef public float64_t[:, :, :] y_bound_leaves
     cdef public bint sparse_pickle
 
     # Methods.

diff --git a/quantile_forest/_quantile_forest_fast.pyx b/quantile_forest/_quantile_forest_fast.pyx
@@ -563,13 +563,17 @@ cdef class QuantileForest:
         Training target values. Assumes values are sorted in ascending order.
 
     y_train_leaves : array-like of shape \
-            (n_estimators, n_leaves, n_outputs, n_indices)
+            (n_estimators, n_leaves, n_indices, n_outputs)
         List of trees, each with a list of nodes, each with a list of indices
         of the training samples residing at that node. Nodes with no samples
         (e.g., internal nodes) are empty. Internal nodes are included so that
         leaf node indices match their ``est.apply`` outputs. Each node list is
         padded to equal length with 0s.
 
+    y_bound_leaves : array-like of shape (n_estimators, n_leaves, 2)
+        Minimum and maximum bounds for target values for each leaf node. Used
+        to enforce monotonicity constraints.
+
     sparse_pickle : bool, default=False
         Pickle using a SciPy sparse matrix.
     """
@@ -578,38 +582,56 @@ cdef class QuantileForest:
         self,
         cnp.ndarray[float64_t, ndim=2] y_train,
         cnp.ndarray[intp_t, ndim=4] y_train_leaves,
+        cnp.ndarray[float64_t, ndim=3] y_bound_leaves=None,
         bint sparse_pickle=<bint>False,
     ):
         """Constructor."""
         self.y_train = y_train
         self.y_train_leaves = y_train_leaves
+        self.y_bound_leaves = y_bound_leaves
         self.sparse_pickle = sparse_pickle
 
     def __reduce__(self):
         """Reduce re-implementation, for pickling."""
         if self.sparse_pickle:
             y_train_leaves = np.empty(shape=(0, 0, 0, 0), dtype=np.int64)
-            kwargs = {"y_train_leaves": np.asarray(self.y_train_leaves)}
+            y_bound_leaves = None
+            kwargs = {
+                "y_train_leaves": np.asarray(self.y_train_leaves),
+                "y_bound_leaves": (
+                    None if self.y_bound_leaves is None else np.asarray(self.y_bound_leaves)
+                ),
+            }
         else:
             y_train_leaves = np.asarray(self.y_train_leaves)
+            y_bound_leaves = (
+                None if self.y_bound_leaves is None else np.asarray(self.y_bound_leaves)
+            )
             kwargs = {}
-        args = (np.asarray(self.y_train), y_train_leaves, self.sparse_pickle)
+        args = (np.asarray(self.y_train), y_train_leaves, y_bound_leaves, self.sparse_pickle)
         return (QuantileForest, args, self.__getstate__(**kwargs))
 
     def __getstate__(self, **kwargs):
         """Getstate re-implementation, for pickling."""
         d = {}
         if self.sparse_pickle:
-            matrix = kwargs["y_train_leaves"]
-            reshape = (matrix.shape[2], matrix.shape[0] * matrix.shape[1] * matrix.shape[2])
-            d["shape"] = matrix.shape
-            d["matrix"] = sparse.csc_matrix(matrix.reshape(reshape))
+            matrix1 = kwargs["y_train_leaves"]
+            reshape1 = (matrix1.shape[2], matrix1.shape[0] * matrix1.shape[1] * matrix1.shape[2])
+            d["shape1"] = matrix1.shape
+            d["matrix1"] = sparse.csc_matrix(matrix1.reshape(reshape1))
+
+            matrix2 = kwargs["y_bound_leaves"]
+            reshape2 = (matrix2.shape[2], matrix2.shape[0] * matrix2.shape[1])
+            d["shape2"] = matrix2.shape
+            d["matrix2"] = sparse.csc_matrix(matrix2.reshape(reshape2))
+
         return d
 
     def __setstate__(self, d):
         """Setstate re-implementation, for unpickling."""
         if self.sparse_pickle:
-            self.y_train_leaves = d["matrix"].toarray().reshape(d["shape"])
+            self.y_train_leaves = d["matrix1"].toarray().reshape(d["shape1"])
+            self.y_bound_leaves = d["matrix2"].toarray().reshape(d["shape2"])
 
     cpdef cnp.ndarray predict(
         self,
@@ -664,12 +686,12 @@ cdef class QuantileForest:
         cdef vector[double] leaf_samples
         cdef vector[double] leaf_weights
         cdef vector[vector[intp_t]] train_indices
-        cdef vector[vector[double]] train_weights
+        cdef vector[vector[double]] train_weights, clip_mins, clip_maxs
         cdef intp_t idx, train_idx
-        cdef double train_wgt
+        cdef double clip_min, clip_max, train_wgt
         cdef vector[int] n_leaf_samples
         cdef int n_total_samples, n_total_trees
-        cdef double train_weight
+        cdef double train_weight, sample_value
         cdef vector[vector[double]] leaf_preds
         cdef vector[double] pred
         cdef cnp.ndarray[float64_t, ndim=3] preds
@@ -710,6 +732,8 @@ cdef class QuantileForest:
         with nogil:
             idx = 1 if aggregate_leaves_first else n_trees
             train_indices = vector[vector[intp_t]](idx)
+            clip_mins = vector[vector[double]](idx)
+            clip_maxs = vector[vector[double]](idx)
 
             n_leaf_samples = vector[int](n_trees)
             leaf_preds = vector[vector[double]](n_quantiles)
@@ -738,6 +762,12 @@ cdef class QuantileForest:
                     for k in range(<intp_t>(leaf_preds.size())):
                         leaf_preds[k].clear()
 
+                    # Initialize arrays for clipping.
+                    for k in range(<intp_t>(clip_mins.size())):
+                        clip_mins[k].clear()
+                    for k in range(<intp_t>(clip_maxs.size())):
+                        clip_maxs[k].clear()
+
                     # Accumulate training indices across leaves for each tree.
                     # If `aggregate_leaves_first`, also accumulate across trees.
                     for k in range(n_trees):
@@ -749,6 +779,14 @@ cdef class QuantileForest:
                                 &self.y_train_leaves[k, X_leaves[i, k], j, max_idx],
                             )
 
+                            if self.y_bound_leaves is not None:
+                                # Repeat the insertion of the corresponding clipping bounds.
+                                clip_min = self.y_bound_leaves[k, X_leaves[i, k], 0]
+                                clip_max = self.y_bound_leaves[k, X_leaves[i, k], 1]
+                                for _ in range(max_idx):
+                                    clip_mins[idx].push_back(clip_min)
+                                    clip_maxs[idx].push_back(clip_max)
+
                     if weighted_quantile:
                         for k in range(<intp_t>(train_weights.size())):
                             train_weights[k].clear()
@@ -807,9 +845,16 @@ cdef class QuantileForest:
                             leaf_samples.clear()
 
                             # Get training target values associated with indices.
-                            for train_idx in train_indices[k]:
+                            for l in range(<intp_t>(train_indices[k].size())):
+                                train_idx = train_indices[k][l]
                                 if train_idx != 0:
-                                    leaf_samples.push_back(self.y_train[j][train_idx - 1])
+                                    # Apply clipping to each training sample.
+                                    sample_value = self.y_train[j][train_idx - 1]
+                                    if self.y_bound_leaves is not None:
+                                        clip_min = clip_mins[k][l]
+                                        clip_max = clip_maxs[k][l]
+                                        sample_value = max(min(sample_value, clip_max), clip_min)
+                                    leaf_samples.push_back(sample_value)
 
                             # Calculate quantiles (or mean).
                             if not use_mean:

diff --git a/quantile_forest/tests/test_quantile_forest.py b/quantile_forest/tests/test_quantile_forest.py
@@ -793,7 +793,9 @@ def check_max_samples_leaf(name):
         est.fit(X, y)
 
         max_leaf_size = 0
-        for _, tree_lookup in enumerate(est._get_y_train_leaves(X, 1)):
+        y_train_leaves, _ = est._get_y_train_leaves(X, y[:1])
+        for i in range(len(y_train_leaves)):
+            tree_lookup = y_train_leaves[i]
             for leaf_samples in np.squeeze(tree_lookup, -2):
                 n_leaf_samples = len([x for x in leaf_samples if x != 0])
                 if n_leaf_samples > max_leaf_size:
@@ -819,7 +821,7 @@ def check_max_samples_leaf(name):
             est.param_validation = param_validation
             assert_raises(ValueError, est.fit, X, y)
             est.max_samples_leaf = max_samples_leaf
-            assert_raises(ValueError, est._get_y_train_leaves, X, 1)
+            assert_raises(ValueError, est._get_y_train_leaves, X, y)
 
 
 @pytest.mark.parametrize("name", FOREST_REGRESSORS)