diff --git a/quantile_forest/_quantile_forest.py b/quantile_forest/_quantile_forest.py index 5cb8425..bb61966 100755 --- a/quantile_forest/_quantile_forest.py +++ b/quantile_forest/_quantile_forest.py @@ -51,7 +51,7 @@ class calls the ``fit`` method of the ``ForestRegressor`` and creates a param_validation = False from sklearn.utils.validation import check_is_fitted -from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices +from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices, map_leaf_nodes sklearn_version = parse_version(sklearn.__version__) @@ -69,8 +69,8 @@ def _group_by_value(a): a_sorted = a[sort_idx] unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1])) unq_items = a_sorted[unq_first] - unq_count = np.diff(np.concatenate(np.nonzero(unq_first) + ([a.size],))) - unq_idx = np.split(sort_idx, np.cumsum(unq_count[:-1])) + unq_indices = np.flatnonzero(unq_first) + unq_idx = np.array_split(sort_idx, unq_indices[1:]) return unq_items, unq_idx @@ -303,9 +303,8 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None): if sorter is not None: # Reassign bootstrap indices to account for target sorting. bootstrap_indices = np.argsort(sorter, axis=0)[bootstrap_indices] - if bootstrap_indices.shape[-1] == 1: - bootstrap_indices = np.squeeze(bootstrap_indices, -1) + bootstrap_indices = bootstrap_indices.reshape(-1, self.n_estimators, n_outputs) bootstrap_indices += 1 # for sparse matrix (0s as empty) # Get the maximum number of nodes (internal + leaves) across trees. @@ -335,23 +334,32 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None): if leaf_subsample: random.seed(estimator.random_state) - # Map each leaf node to its list of training indices. - for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): - y_indices = bootstrap_indices[:, i][leaf_values].reshape(-1, n_outputs) - - if sample_weight is not None: - y_indices = y_indices[sample_weight[y_indices - 1] > 0] - - # Subsample leaf training indices (without replacement). - if leaf_subsample and max_samples_leaf < len(y_indices): - if not isinstance(y_indices, list): - y_indices = list(y_indices) - y_indices = random.sample(y_indices, max_samples_leaf) - - y_indices = np.asarray(y_indices).T.reshape(n_outputs, -1) + if sample_weight is not None or leaf_subsample: + for j in range(len(leaf_values_list)): + if sample_weight is not None: + # Filter leaf samples with zero weight. + weight_mask = sample_weight[leaf_values_list[j] - 1] > 0 + leaf_values_list[j] = leaf_values_list[j][weight_mask] + if leaf_subsample: + # Sample leaf to length `max_samples_leaf`. + if len(leaf_values_list[j]) > max_samples_leaf: + random.shuffle(leaf_values_list[j]) # to ensure random sampling + leaf_values_list[j] = leaf_values_list[j][:max_samples_leaf] + if len(leaf_values_list[j]) == 0: + leaf_values_list[j] = [0] + # Map each leaf node to its list of training indices. + if max_samples_leaf == 1: # optimize for single-sample-per-leaf performance + y_indices = bootstrap_indices[:, i][leaf_values_list].reshape(-1, 1, n_outputs) for j in range(n_outputs): - y_train_leaves[i, leaf_idx, j, : len(y_indices[j])] = y_indices[j] + y_train_leaves[i, leaf_indices, j, 0] = y_indices[:, 0, j] + else: # get mapping for arbitrary leaf sizes + y_train_leaves[i] = map_leaf_nodes( + y_train_leaves=y_train_leaves[i], + bootstrap_indices=bootstrap_indices[:, i], + leaf_indices=leaf_indices, + leaf_values_list=leaf_values_list, + ) return y_train_leaves diff --git a/quantile_forest/_quantile_forest_fast.pyx b/quantile_forest/_quantile_forest_fast.pyx index 30f0adf..5b685cb 100755 --- a/quantile_forest/_quantile_forest_fast.pyx +++ b/quantile_forest/_quantile_forest_fast.pyx @@ -523,7 +523,7 @@ cpdef vector[intp_t] generate_unsampled_indices( n_total_samples : int Number of total samples, sampled and unsampled. - duplicates : list of sets + duplicates : list of sets of ints List of sets of functionally identical indices. Returns @@ -531,10 +531,10 @@ cpdef vector[intp_t] generate_unsampled_indices( unsampled_indices : array-like List of unsampled indices. """ + cdef intp_t n_samples, n_duplicates cdef intp_t i cdef intp_t sampled_idx cdef set[intp_t] sampled_set - cdef intp_t n_samples, n_duplicates cdef vector[intp_t] unsampled_indices n_samples = sample_indices.size() @@ -557,6 +557,61 @@ cpdef vector[intp_t] generate_unsampled_indices( return unsampled_indices +cpdef map_leaf_nodes( + cnp.ndarray[intp_t, ndim=3] y_train_leaves, + cnp.ndarray[intp_t, ndim=2] bootstrap_indices, + vector[intp_t] leaf_indices, + vector[vector[intp_t]] leaf_values_list, +) noexcept: + """Return a mapping of training sample indices to a tree's leaf nodes. + + Parameters + ---------- + y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples) + Unpopulated mapping representing a list of nodes, each with a list of + indices of the training samples residing at that node. + + bootstrap_indices : array-like of shape (n_samples, n_outputs) + Bootstrap indices of training samples. + + leaf_indices : list of ints + List of leaf node indices. Values correspond to `leaf_values_list`. + + leaf_values_list : list of list of ints + List of leaf node sample indices. Values correspond to `leaf_indices`. + + Returns + ------- + y_train_leaves : array-like of shape (n_leaves, n_outputs, n_samples) + Populated mapping of training sample indices to leaf nodes. + """ + cdef intp_t n_samples, n_outputs, n_leaves + cdef intp_t i, j, k + cdef vector[intp_t] leaf_values + cdef intp_t leaf_index, leaf_value, y_index + cdef intp_t[:, :, :] y_train_leaves_view + + n_outputs = bootstrap_indices.shape[1] + n_leaves = leaf_indices.size() + + y_train_leaves_view = y_train_leaves # memoryview + + with nogil: + for i in range(n_leaves): + leaf_index = leaf_indices[i] + leaf_values = leaf_values_list[i] + + n_samples = leaf_values.size() + for j in range(n_samples): + leaf_value = leaf_values[j] + for k in range(n_outputs): + y_index = bootstrap_indices[leaf_value, k] + if y_index > 0: + y_train_leaves_view[leaf_index, k, j] = y_index + + return np.asarray(y_train_leaves_view) + + cdef class QuantileForest: """Representation of a quantile forest. diff --git a/quantile_forest/tests/test_quantile_forest.py b/quantile_forest/tests/test_quantile_forest.py index 9da68a3..e4750f3 100755 --- a/quantile_forest/tests/test_quantile_forest.py +++ b/quantile_forest/tests/test_quantile_forest.py @@ -113,7 +113,7 @@ def check_california_criterion(name, criterion): # Test sample weights. regr = ForestRegressor(n_estimators=5, criterion=criterion, random_state=0) - sample_weight = np.ones(y_california.shape) + sample_weight = np.concatenate([np.zeros(1), np.ones(len(y_california) - 1)]) regr.fit(X_california, y_california, sample_weight=sample_weight) score = regr.score(X_california, y_california, quantiles=0.5) assert score > 0.9, f"Failed with criterion {criterion}, sample weight and score={score}." @@ -786,7 +786,12 @@ def check_max_samples_leaf(name): max_leaf_sizes = [] for max_samples_leaf in [0.99 / len(X), 1, 3.0 / len(X), 5, 20, None]: - est = ForestRegressor(n_estimators=10, max_samples_leaf=max_samples_leaf, random_state=0) + est = ForestRegressor( + n_estimators=10, + min_samples_leaf=max_samples_leaf if max_samples_leaf is not None else len(X), + max_samples_leaf=max_samples_leaf, + random_state=0, + ) est.fit(X, y) max_leaf_size = 0 diff --git a/setup.cfg b/setup.cfg index f40ae91..f427825 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ addopts = [flake8] max-line-length = 99 ignore = E203, E266, E402, E501, W503, E731 -max-complexity = 24 +max-complexity = 25 select = B,C,E,F,W,T4,B9 exclude = .git,