Skip to content

Commit

Permalink
Refactor helper functions
Browse files Browse the repository at this point in the history
  • Loading branch information
reidjohnson committed Aug 31, 2024
1 parent 243f27f commit fba7cba
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 22 deletions.
33 changes: 11 additions & 22 deletions quantile_forest/_quantile_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,29 +51,16 @@ class calls the ``fit`` method of the ``ForestRegressor`` and creates a
param_validation = False
from sklearn.utils.validation import check_is_fitted

from ._quantile_forest_fast import QuantileForest, generate_unsampled_indices, map_leaf_nodes
from ._quantile_forest_fast import (
QuantileForest,
generate_unsampled_indices,
group_by_value,
map_leaf_nodes,
)

sklearn_version = parse_version(sklearn.__version__)


def _generate_unsampled_indices(sample_indices, n_total_samples, duplicates=None):
    """Private function used by forest._get_y_train_leaves function."""
    return generate_unsampled_indices(
        sample_indices,
        n_total_samples,
        [] if duplicates is None else duplicates,
    )


def _group_by_value(a):
"""Private function used by forest._get_y_train_leaves function."""
sort_idx = np.argsort(a)
a_sorted = a[sort_idx]
unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
unq_items = a_sorted[unq_first]
unq_indices = np.flatnonzero(unq_first)
unq_idx = np.array_split(sort_idx, unq_indices[1:])
return unq_items, unq_idx


class BaseForestQuantileRegressor(ForestRegressor):
"""
Base class for quantile regression forests.
Expand Down Expand Up @@ -329,7 +316,7 @@ def _get_y_train_leaves(self, X, y, sorter=None, sample_weight=None):

for i, estimator in enumerate(self.estimators_):
# Group training indices by leaf node.
leaf_indices, leaf_values_list = _group_by_value(X_leaves_bootstrap[:, i])
leaf_indices, leaf_values_list = group_by_value(X_leaves_bootstrap[:, i])

if leaf_subsample:
random.seed(estimator.random_state)
Expand Down Expand Up @@ -552,8 +539,10 @@ def _get_unsampled_indices(self, estimator, duplicates=None):
sample_indices = _generate_sample_indices(
estimator.random_state, n_train_samples, n_samples_bootstrap
)
unsampled_indices = _generate_unsampled_indices(
sample_indices, n_train_samples, duplicates=duplicates
unsampled_indices = generate_unsampled_indices(
sample_indices,
n_train_samples,
duplicates=[] if duplicates is None else duplicates,
)
return np.asarray(unsampled_indices)

Expand Down
82 changes: 82 additions & 0 deletions quantile_forest/_quantile_forest_fast.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from libc.math cimport ceil, fabs, floor, round
from libc.stdlib cimport free, malloc
from libc.string cimport memset
from libcpp.algorithm cimport sort as sort_cpp
from libcpp.map cimport map
Expand Down Expand Up @@ -557,6 +558,87 @@ cpdef vector[intp_t] generate_unsampled_indices(
return unsampled_indices


cpdef group_by_value(cnp.ndarray[intp_t, ndim=1] a):
    """Group indices of a sorted array based on unique values.

    Parameters
    ----------
    a : array-like of shape (n_samples)
        Input array. The array is expected to contain integers, and the
        function will group the indices of elements with the same value.

    Returns
    -------
    np_unq_items : array-like
        A NumPy array containing the unique values from the input array `a`,
        sorted in ascending order.

    unq_idx : list of array-like
        A list of NumPy arrays, where each array contains the indices of the
        input array `a` corresponding to each unique value in `np_unq_items`.
        The indices are sorted based on the original order in `a`.
    """
    cdef intp_t num_samples
    cdef intp_t i
    cdef cnp.ndarray[intp_t, ndim=1] sort_idx
    cdef cnp.ndarray[intp_t, ndim=1] a_sorted
    cdef intp_t prev_value
    cdef intp_t count, unq_count_idx
    cdef intp_t* unq_count
    cdef intp_t* unq_first_indices

    num_samples = a.shape[0]

    # Guard the empty case: the nogil loop below reads `a_sorted[0]`, which
    # would be an out-of-bounds access for a length-0 input.
    if num_samples == 0:
        return np.empty(0, dtype=np.int64), []

    sort_idx = np.argsort(a)
    a_sorted = a[sort_idx]

    unq_count_idx = 0
    # Per-run element counts and the index at which each run starts.
    unq_count = <intp_t*>malloc(num_samples * sizeof(intp_t))
    unq_first_indices = <intp_t*>malloc(num_samples * sizeof(intp_t))

    if unq_count == NULL or unq_first_indices == NULL:
        # free(NULL) is a no-op, so this safely releases whichever
        # allocation (if any) succeeded before raising.
        free(unq_count)
        free(unq_first_indices)
        raise MemoryError("Memory allocation failed.")

    try:
        with nogil:
            # Initialize with the first element.
            prev_value = a_sorted[0]
            unq_first_indices[0] = 0
            count = 1

            # Loop through sorted array and identify runs of equal values.
            for i in range(1, num_samples):
                if a_sorted[i] != prev_value:
                    unq_first_indices[unq_count_idx + 1] = i
                    unq_count[unq_count_idx] = count
                    unq_count_idx += 1
                    count = 1
                    prev_value = a_sorted[i]
                else:
                    count += 1

            # Assign final count.
            unq_count[unq_count_idx] = count
            unq_count_idx += 1

        # Allocate arrays for the output.
        np_unq_items = np.empty(unq_count_idx, dtype=np.int64)
        unq_idx = [None] * unq_count_idx

        for i in range(unq_count_idx):
            np_unq_items[i] = a_sorted[unq_first_indices[i]]
            unq_idx[i] = sort_idx[unq_first_indices[i]:unq_first_indices[i] + unq_count[i]]
    finally:
        # Always release the scratch buffers, even if the NumPy allocations
        # or slicing above raise.
        free(unq_count)
        free(unq_first_indices)

    return np_unq_items, unq_idx


cpdef map_leaf_nodes(
cnp.ndarray[intp_t, ndim=3] y_train_leaves,
cnp.ndarray[intp_t, ndim=2] bootstrap_indices,
Expand Down

0 comments on commit fba7cba

Please sign in to comment.