diff --git a/quantile_forest/_quantile_forest.py b/quantile_forest/_quantile_forest.py
index bb61966..1999463 100755
--- a/quantile_forest/_quantile_forest.py
+++ b/quantile_forest/_quantile_forest.py
@@ -51,29 +51,16 @@ class calls the ``fit`` method of the ``ForestRegressor`` and creates a
     param_validation = False
 from sklearn.utils.validation import check_is_fitted
 
-from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices, map_leaf_nodes
+from ._quantile_forest_fast import (
+    QuantileForest,
+    generate_unsampled_indices,
+    group_by_value,
+    map_leaf_nodes,
+)
 
 sklearn_version = parse_version(sklearn.__version__)
 
 
-def _generate_unsampled_indices(sample_indices, n_total_samples, duplicates=None):
-    """Private function used by forest._get_y_train_leaves function."""
-    if duplicates is None:
-        duplicates = []
-    return generate_unsampled_indices(sample_indices, n_total_samples, duplicates)
-
-
-def _group_by_value(a):
-    """Private function used by forest._get_y_train_leaves function."""
-    sort_idx = np.argsort(a)
-    a_sorted = a[sort_idx]
-    unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
-    unq_items = a_sorted[unq_first]
-    unq_indices = np.flatnonzero(unq_first)
-    unq_idx = np.array_split(sort_idx, unq_indices[1:])
-    return unq_items, unq_idx
-
-
 class BaseForestQuantileRegressor(ForestRegressor):
     """
     Base class for quantile regression forests.
@@ -329,7 +316,7 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):
 
         for i, estimator in enumerate(self.estimators_):
             # Group training indices by leaf node.
-            leaf_indices, leaf_values_list = _group_by_value(X_leaves_bootstrap[:, i])
+            leaf_indices, leaf_values_list = group_by_value(X_leaves_bootstrap[:, i])
 
             if leaf_subsample:
                 random.seed(estimator.random_state)
@@ -552,8 +539,10 @@ def _get_unsampled_indices(self, estimator, duplicates=None):
         sample_indices = _generate_sample_indices(
             estimator.random_state, n_train_samples, n_samples_bootstrap
         )
-        unsampled_indices = _generate_unsampled_indices(
-            sample_indices, n_train_samples, duplicates=duplicates
+        unsampled_indices = generate_unsampled_indices(
+            sample_indices,
+            n_train_samples,
+            duplicates=[] if duplicates is None else duplicates,
         )
 
         return np.asarray(unsampled_indices)
diff --git a/quantile_forest/_quantile_forest_fast.pyx b/quantile_forest/_quantile_forest_fast.pyx
index 5b685cb..2df4c3a 100755
--- a/quantile_forest/_quantile_forest_fast.pyx
+++ b/quantile_forest/_quantile_forest_fast.pyx
@@ -1,4 +1,5 @@
 from libc.math cimport ceil, fabs, floor, round
+from libc.stdlib cimport free, malloc
 from libc.string cimport memset
 from libcpp.algorithm cimport sort as sort_cpp
 from libcpp.map cimport map
@@ -557,6 +558,87 @@ cpdef vector[intp_t] generate_unsampled_indices(
     return unsampled_indices
 
 
+cpdef group_by_value(cnp.ndarray[intp_t, ndim=1] a):
+    """Group indices of a sorted array based on unique values.
+
+    Parameters
+    ----------
+    a : array-like of shape (n_samples)
+        Input array. The array is expected to contain integers, and the
+        function will group the indices of elements with the same value.
+
+    Returns
+    -------
+    np_unq_items : array-like
+        A NumPy array containing the unique values from the input array `a`,
+        sorted in ascending order.
+
+    unq_idx : list of array-like
+        A list of NumPy arrays, where each array contains the indices of the
+        input array `a` corresponding to each unique value in `np_unq_items`.
+        The indices are sorted based on the original order in `a`.
+    """
+    cdef intp_t num_samples
+    cdef intp_t i
+    cdef cnp.ndarray[intp_t, ndim=1] sort_idx
+    cdef cnp.ndarray[intp_t, ndim=1] a_sorted
+    cdef intp_t prev_value
+    cdef intp_t count, unq_count_idx
+    cdef intp_t* unq_count
+    cdef bint* unq_first
+    cdef intp_t* unq_first_indices
+
+    num_samples = a.shape[0]
+    sort_idx = np.argsort(a)
+    a_sorted = a[sort_idx]
+    unq_count_idx = 0
+    unq_count = <intp_t*>malloc(num_samples * sizeof(intp_t))
+    unq_first = <bint*>malloc(num_samples * sizeof(bint))
+    unq_first_indices = <intp_t*>malloc(num_samples * sizeof(intp_t))
+
+    if unq_count == NULL or unq_first == NULL or unq_first_indices == NULL:
+        raise MemoryError("Memory allocation failed.")
+
+    with nogil:
+        # Initialize first element.
+        prev_value = a_sorted[0]
+        unq_first[0] = 1
+        unq_first_indices[0] = 0
+        count = 1
+
+        # Loop through sorted array and identify unique values.
+        for i in range(1, num_samples):
+            if a_sorted[i] != prev_value:
+                unq_first[i] = 1
+                unq_first_indices[unq_count_idx + 1] = i
+                unq_count[unq_count_idx] = count
+                unq_count_idx += 1
+                count = 1
+                prev_value = a_sorted[i]
+            else:
+                unq_first[i] = 0
+                count += 1
+
+        # Assign final count.
+        unq_count[unq_count_idx] = count
+        unq_count_idx += 1
+
+    # Allocate arrays for the output.
+    np_unq_items = np.empty(unq_count_idx, dtype=np.int64)
+    unq_idx = [None] * unq_count_idx
+
+    for i in range(unq_count_idx):
+        np_unq_items[i] = a_sorted[unq_first_indices[i]]
+        unq_idx[i] = sort_idx[unq_first_indices[i]:unq_first_indices[i] + unq_count[i]]
+
+    # Free allocated memory.
+    free(unq_count)
+    free(unq_first)
+    free(unq_first_indices)
+
+    return np_unq_items, unq_idx
+
+
 cpdef map_leaf_nodes(
     cnp.ndarray[intp_t, ndim=3] y_train_leaves,
     cnp.ndarray[intp_t, ndim=2] bootstrap_indices,
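
The removed pure-Python `_group_by_value` helper and the new Cython `group_by_value` are meant to be drop-in equivalents for grouping bootstrap training indices by leaf node. As a rough sanity check (a sketch only, not part of the patch, and assuming the extension has been rebuilt so that `quantile_forest._quantile_forest_fast` exposes the new function), the two can be compared side by side:

    import numpy as np

    from quantile_forest._quantile_forest_fast import group_by_value


    def _group_by_value_reference(a):
        # NumPy implementation removed from _quantile_forest.py by this patch.
        sort_idx = np.argsort(a)
        a_sorted = a[sort_idx]
        unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
        unq_items = a_sorted[unq_first]
        unq_indices = np.flatnonzero(unq_first)
        unq_idx = np.array_split(sort_idx, unq_indices[1:])
        return unq_items, unq_idx


    # Illustrative leaf-node ids; dtype must be np.intp to match the intp_t buffer.
    leaves = np.asarray([5, 3, 5, 7, 3, 3], dtype=np.intp)

    ref_items, ref_groups = _group_by_value_reference(leaves)
    new_items, new_groups = group_by_value(leaves)

    assert np.array_equal(ref_items, new_items)
    assert all(np.array_equal(r, n) for r, n in zip(ref_groups, new_groups))

Both return the sorted unique leaf ids plus, for each id, the positions in `leaves` holding that id; the Cython version performs the same grouping in a single pass over the sorted values under `nogil`, rather than building an intermediate boolean mask and calling `np.array_split`.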