Estimator #57
base: main
@@ -10,7 +10,7 @@ joblib
 *.pyc
 __pycache__
 *.egg-info
-.coverage
+.coverage*

 # IDE specific folders
 .vscode
@@ -8,7 +8,7 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted

-from .utils import (
+from ._utils.u_Dnn_learner import (
     create_X_y,
     dnn_net,
     joblib_ensemble_dnnet,
@@ -241,6 +241,7 @@
         loss = np.array(res_ens[4])

         if self.n_ensemble == 1:
+            raise Warning("The model can't be fit with n_ensemble = 1")

Review comment: @jpaillard Can you check whether it is correct that no fitting happens when there is only a single ensemble member?

Reply (jpaillard): I think there is a fitting, as suggested by the function call […]. The following lines select the […]. I would suggest to have a function […].

             return [(res_ens[0][0], (res_ens[1][0], res_ens[2][0]))]

         # Keeping the optimal subset of DNNs
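A side note on the raise Warning(...) line (an observation about Python, not something stated in the PR): raising a Warning object raises an exception, so execution stops there and the return statement that follows never runs. If the intent were to warn and still return the single fitted model, the warnings module would be the usual tool. A minimal self-contained sketch of the difference:

    import warnings

    def with_raise():
        raise Warning("stops execution here")   # an exception: the next line never runs
        return "unreachable"

    def with_warn():
        warnings.warn("execution continues after this")   # a warning, not an exception
        return "still returned"

    try:
        with_raise()
    except Warning as exc:
        print("caught:", exc)      # caught: stops execution here

    print(with_warn())             # prints "still returned" (plus a UserWarning on stderr)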
@@ -283,6 +284,9 @@
         y = y.reshape(-1, 1)
         if self.problem_type == "regression":
             list_y.append(y)
+        # Encoding the target with the ordinal case
+        if self.problem_type == "ordinal":
+            list_y = ordinal_encode(y)

Review comment: @jpaillard @bthirion […]

Reply: It stands for regression problems where the ordering of values matters, but not the values themselves. Usually the values are discretized. I propose to keep it for the moment.

         for col in range(y.shape[1]):
             if train:
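For readers unfamiliar with the ordinal case described in the reply above: one common way to encode an ordinal target is as cumulative binary indicators over the discretized levels. The repository's ordinal_encode function is not shown in this diff, so the following is only a sketch of that general idea, not the actual implementation:

    import numpy as np

    def ordinal_encode_sketch(y):
        # Map the sorted unique values to ranks 0..K-1, then encode rank k as
        # answers to the K-1 cumulative questions "is y greater than level j?".
        levels = np.unique(y)
        ranks = np.searchsorted(levels, y)
        encoded = np.zeros((len(y), len(levels) - 1))
        for j in range(len(levels) - 1):
            encoded[:, j] = (ranks > j).astype(float)
        return encoded

    y = np.array([0, 2, 1, 2, 0])
    print(ordinal_encode_sketch(y))
    # [[0. 0.]
    #  [1. 1.]
    #  [1. 0.]
    #  [1. 1.]
    #  [0. 0.]]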
@@ -291,18 +295,12 @@
                self.enc_y.append(OneHotEncoder(handle_unknown="ignore"))
                curr_y = self.enc_y[col].fit_transform(y[:, [col]]).toarray()
                list_y.append(curr_y)

-               # Encoding the target with the ordinal case
-               if self.problem_type == "ordinal":
-                   y = ordinal_encode(y)
-
            else:
                # Encoding the target with the classification case
                if self.problem_type in ("classification", "binary"):
                    curr_y = self.enc_y[col].transform(y[:, [col]]).toarray()
                    list_y.append(curr_y)

-       ## ToDo Add the ordinal case

        return np.array(list_y)

    def hyper_tuning(
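Unrelated to the removed lines, this hunk keeps the usual scikit-learn pattern for target encoding: fit_transform on the training targets, transform afterwards, with handle_unknown="ignore" covering categories that were never seen at fit time. A small self-contained example of that pattern:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    y_train = np.array([["cat"], ["dog"], ["cat"]])
    y_test = np.array([["dog"], ["bird"]])            # "bird" is unseen at fit time

    enc = OneHotEncoder(handle_unknown="ignore")
    print(enc.fit_transform(y_train).toarray())        # fit on the training targets
    print(enc.transform(y_test).toarray())             # unseen "bird" encodes to all zeros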
@@ -5,13 +5,13 @@
 class RandomForestClassifierModified(RandomForestClassifier):
     def fit(self, X, y):
         self.y_ = y
-        super().fit(X, y)
+        return super().fit(X, y)

     def predict(self, X):
-        super().predict(X)
+        return super().predict(X)

     def predict_proba(self, X):
-        super().predict_proba(X)
+        return super().predict_proba(X)

     def sample_same_leaf(self, X, y=None):
         if not (y is None):
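Why the added return statements matter: scikit-learn's fit returns the estimator itself, which enables the usual call chaining, and predict/predict_proba must hand back their arrays or the caller receives None. A quick check with the plain scikit-learn class (the modified subclasses are assumed to behave the same once they forward the return values):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=50, random_state=0)

    clf = RandomForestClassifier(n_estimators=10, random_state=0)
    preds = clf.fit(X, y).predict(X)   # chaining works because fit returns the estimator
    print(preds.shape)                 # (50,)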
@@ -42,23 +42,24 @@ def sample_same_leaf(self, X, y=None):
                )[0]

                # Append the samples to the list
-               leaf_samples.append(
-                   y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
-               )
+               if samples_in_leaf.size > 0:
+                   leaf_samples.append(
+                       y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
+                   )

Review comment: @jpaillard @bthirion […]

Reply: This should never be empty by construction (random forests represent the samples in a tree structure). By default, there is a minimum number of samples in each leaf.

            predictions.append(leaf_samples)

        # Combine the predictions from all trees to make the final prediction
-       return np.array(predictions)
+       return np.array(predictions, dtype=object)


class RandomForestRegressorModified(RandomForestRegressor):
    def fit(self, X, y):
        self.y_ = y
-       super().fit(X, y)
+       return super().fit(X, y)

    def predict(self, X):
-       super().predict(X)
+       return super().predict(X)

    def sample_same_leaf(self, X):
        rng = np.random.RandomState(self.get_params()["random_state"])
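On the dtype=object change (a general NumPy observation, not an explanation given in the PR): once a guard such as the one above can skip a leaf, the per-tree lists may end up with different lengths, and recent NumPy versions only build an array from such ragged input when dtype=object is passed explicitly:

    import numpy as np

    ragged = [[1, 2, 3], [4, 5]]             # per-leaf lists of different lengths

    arr = np.array(ragged, dtype=object)     # a 1-D object array holding the two lists
    print(arr.shape, arr.dtype)              # (2,) object

    try:
        np.array(ragged)                     # NumPy >= 1.24 rejects ragged input
    except ValueError as exc:
        print("ValueError:", exc)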
@@ -87,11 +88,12 @@ def sample_same_leaf(self, X):
               )[0]

               # Append the samples to the list
-              leaf_samples.append(
-                  y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
-              )
+              if samples_in_leaf.size > 0:
+                  leaf_samples.append(
+                      y_minus_i[rng.choice(samples_in_leaf, size=random_samples)]
+                  )

Review comment (on the added condition): You can remove the condition here too.

           predictions.append(leaf_samples)

       # Combine the predictions from all trees to make the final prediction
-      return np.array(predictions)
+      return np.array(predictions, dtype=object)
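For context on what sample_same_leaf does in both classes: the general idea of drawing, for each tree, the targets of other training points that fall into the same leaf can be sketched with scikit-learn's public apply method. The variable names below (leaf_ids, same_leaf) are illustrative only, and the actual method in this PR differs in its details:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X, y = make_regression(n_samples=100, n_features=5, random_state=0)
    forest = RandomForestRegressor(n_estimators=10, min_samples_leaf=5, random_state=0).fit(X, y)

    rng = np.random.RandomState(0)
    leaf_ids = forest.apply(X)        # shape (n_samples, n_estimators): leaf index per tree

    i, t = 0, 0                       # sample 0, tree 0
    same_leaf = np.where(leaf_ids[:, t] == leaf_ids[i, t])[0]
    same_leaf = same_leaf[same_leaf != i]        # exclude the sample itself ("y_minus_i")
    if same_leaf.size > 0:                       # the guard discussed above
        print(y[rng.choice(same_leaf, size=3)])  # three targets drawn from the same leaf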
@@ -0,0 +1,5 @@
+from .Dnn_learner_single import DnnLearnerSingle
+
+__all__ = [
+    "DnnLearnerSingle",
+]
Review comment: As discussed with you, as a user I don't like integrating the ensembling (n_ensemble) and the hyper-parameter tuning (do_hypertuning, dict_hypertuning) into a single class, which becomes huge. Also, I think other libraries (sklearn for ensembling, optuna for hyper-parameters) offer more and better options for these advanced training strategies. I suggest separating these aspects from the DNN_learner class and leaving it up to the user to optimize the training separately from humidistat.
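To make the suggestion concrete, here is a rough sketch of what that separation could look like on the user side, with MLPRegressor standing in for the DNN learner (the reviewer only names sklearn and optuna; none of the code below comes from this repository):

    import optuna
    from sklearn.datasets import make_regression
    from sklearn.ensemble import BaggingRegressor
    from sklearn.model_selection import cross_val_score
    from sklearn.neural_network import MLPRegressor

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)

    # Ensembling delegated to scikit-learn instead of an n_ensemble parameter
    ensemble = BaggingRegressor(MLPRegressor(max_iter=500), n_estimators=5, random_state=0)
    ensemble.fit(X, y)

    # Hyper-parameter tuning delegated to Optuna instead of do_hypertuning / dict_hypertuning
    def objective(trial):
        alpha = trial.suggest_float("alpha", 1e-5, 1e-1, log=True)
        model = MLPRegressor(alpha=alpha, max_iter=500, random_state=0)
        return cross_val_score(model, X, y, cv=3).mean()

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10)
    print(study.best_params)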