From 588cd7a22c309d1f6955663a00cdfbf0a3df8881 Mon Sep 17 00:00:00 2001 From: "Josh L. Espinoza" Date: Sat, 6 Jul 2024 01:46:43 -0700 Subject: [PATCH] v2024.7.2 Reimplemntation using Optuna and Feature-Engine mod --- CHANGELOG.md | 12 +- LEGACY_README.md | 291 ++++ README.md | 297 ++-- clairvoyance/__init__.py | 59 +- clairvoyance/bayesian.py | 668 ++++++++ clairvoyance/feature_selection.py | 1069 ++++++++++++ clairvoyance/{clairvoyance.py => legacy.py} | 1626 ++++++++----------- clairvoyance/transformations.py | 85 + clairvoyance/utils.py | 475 ++++++ clairvoyance/visuals.py | 15 + requirements.txt | 11 + setup.py | 37 +- 12 files changed, 3399 insertions(+), 1246 deletions(-) create mode 100644 LEGACY_README.md create mode 100644 clairvoyance/bayesian.py create mode 100644 clairvoyance/feature_selection.py rename clairvoyance/{clairvoyance.py => legacy.py} (71%) create mode 100644 clairvoyance/transformations.py create mode 100644 clairvoyance/utils.py create mode 100644 clairvoyance/visuals.py create mode 100644 requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index ac75502..b1f5840 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Change Log + +* [2024.7.6] - Using `get_feature_importances` from `Feature-Engine` instead of `format_weights`. +* [2024.7.6] - Demoted `ClairvoyanceBase`, `ClairvoyanceRegression`, `ClairvoyanceClassification`, and `ClairvoyanceRecursive` to `clairvoyance.legacy.` +* [2024.7.6] - Added `BayesianClairvoyanceBase`, `BayesianClairvoyanceClassification`, and `BayesianClairvoyanceRegression` in `clairvoyance.bayesian. which use `Optuna` for hyperparameter tuning and `ClairvoyanceRecursiveFeatureAddition` or `ClairvoyanceRecursiveElimination` for feature selection. +* [2024.7.6] - Added `ClairvoyanceRecursiveFeatureAddition` and `ClairvoyanceRecursiveElimination` in `clairvoyance.feature_selection` which are mods of `Feature-Engine` classes that can handle transformations during feature selection along with some other conveniences. +* [2024.6.14] - Changed `recursive_feature_inclusion` to `recursive_feature_addition` to be consistent with common usage. +* [2023.12.4] - Added support for models that do not converge or `nan` values in resulting scores. * [2023.10.12] - Replaced `np.mean` with `np.nanmean` to handle `nan` values in scores (e.g., `precision_score`) * [2023.10.10] - Made `method="asymmetric"` the new default * [2023.6.9] - Added `plot_scores_comparison` and `get_balanced_class_subset` for evaluating testing datasets. -* [2023.5.25] - Added `X_testing` and `y_testing` to `recursive_feature_elimination` functions/methods. \ No newline at end of file +* [2023.5.25] - Added `X_testing` and `y_testing` to `recursive_feature_elimination` functions/methods. + +Future: +* Use SHAP? \ No newline at end of file diff --git a/LEGACY_README.md b/LEGACY_README.md new file mode 100644 index 0000000..b9d5ab6 --- /dev/null +++ b/LEGACY_README.md @@ -0,0 +1,291 @@ + +``` + _______ _______ _____ ______ _ _ _____ __ __ _______ __ _ _______ _______ + | | |_____| | |_____/ \ / | | \_/ |_____| | \ | | |______ + |_____ |_____ | | __|__ | \_ \/ |_____| | | | | \_| |_____ |______ +``` +#### Description + +Reimplementation for `Clairvoyance` from [Espinoza & Dupont et al. 2021](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008857). The updated version includes regression support, support for all linear/tree-based models, and improved visualizations. `Clairvoyance` is currently in active development. + + +#### Details: +`import clairvoyance as cy` + +Stable: `__version__ = "2023.6.24"` + +Developmental: `__version__ = "2023.12.4"` + + + +#### Installation + +``` +# Stable: + +# via PyPI +pip install clairvoyance_feature_selection + +# via Conda +conda install -c jolespin clairvoyance + +# Developmental: +pip install git+https://github.com/jolespin/clairvoyance +``` + +#### Citation + +Espinoza JL, Dupont CL, O’Rourke A, Beyhan S, Morales P, Spoering A, et al. (2021) Predicting antimicrobial mechanism-of-action from transcriptomes: A generalizable explainable artificial intelligence approach. PLoS Comput Biol 17(3): e1008857. https://doi.org/10.1371/journal.pcbi.1008857 + +#### Development +*Clairvoyance* is currently under active development and undergoing a complete reimplementation from the ground up from the original publication. The following includes a list of new features: + +* Supports any linear or tree-based `Scikit-Learn` compatible model +* Supports any `Scikit-Learn` compatible performance metric +* Supports regression (in addition to classification as in original implementation) +* Properly implements transformations for compositional data (e.g., CLR and closure) based on the query features for each iteration +* Added option to remove zero weighted features and redundant feature sets +* Added asymmetric mode in addition to the symmetric mode from the original implementation +* Added informative publication-ready plots +* Outputs multiple combinations of hyperparameters and scores for each feature combination +* Option to use testing sets or alternative models for recursive feature inclusion + + +#### Usage + + + +##### Feature selection based on classification tasks +Here's a basic classifcation using a `LogisticRegression` model and a grid search for different `C` and `penalty` parameters. We add 996 noise variables within the range of values as the original Iris features. After that we normalize them so their scale is standardized. In this case, we are optimizing for `accuracy`. We are using a `LogisticRegression` where we don't really have to worry about features with zero weight in the end so we are going to set ` remove_zero_weighted_features=False`. This will allow us to plot a nice RCI curve with error. + +```python +from clairvoyance.legacy import ClairvoyanceClassification, ClairvoyanceRegression, ClairvoyanceRecursive +import numpy as np +import pandas as pd +from sklearn.datasets import load_iris +from sklearn.linear_model import LogisticRegression + +# Load iris dataset +X, y = load_iris(return_X_y=True, as_frame=True) +X.columns = X.columns.map(lambda j: j.split(" (cm")[0].replace(" ","_")) + +# Relabel targets +target_names = load_iris().target_names +y = y.map(lambda i: target_names[i]) + +# Add 996 noise features (total = 1000 features) in the same range of values as the original features +number_of_noise_features = 996 +vmin = X.values.ravel().min() +vmax = X.values.ravel().max() +X_noise = pd.DataFrame( + data=np.random.RandomState(0).randint(low=int(vmin*10), high=int(vmax*10), size=(150, number_of_noise_features))/10, + columns=map(lambda j:"noise_{}".format(j+1), range(number_of_noise_features)), +) + +X_iris_with_noise = pd.concat([X, X_noise], axis=1) +X_normalized = X_iris_with_noise - X_iris_with_noise.mean(axis=0).values +X_normalized = X_normalized/X_normalized.std(axis=0).values + +# Specify model algorithm and parameter grid +estimator=LogisticRegression(max_iter=1000, solver="liblinear", multi_class="ovr") +param_grid={ + "C":[1e-10] + (np.arange(1,11)/10).tolist(), + "penalty":["l1", "l2"], +} + +# Instantiate model +clf = ClairvoyanceClassification( + n_jobs=-1, + scorer="accuracy", + n_draws=10, + estimator=estimator, + param_grid=param_grid, + verbose=1, + remove_zero_weighted_features=False, + +) +clf.fit(X_normalized, y)#, sort_hyperparameters_by=["C", "penalty"], ascending=[True, False]) +history = clf.recursive_feature_addition(early_stopping=10) +history.head() + +``` +![](images/1a.png) + +```python +clf.plot_scores(title="Iris", xtick_rotation=90) +clf.plot_weights() +clf.plot_weights(weight_type="cross_validation") +``` +![](images/2a.png) + +There are still a few noise variables, though with much lower weight, suggesting our classifier is modeling noise. We can add an additional penalty where a change in score must exceed a threshold to add a new feature during the recursive feature inclusion algorithm. We are keeping ` remove_zero_weighted_features=False` for this example. + +```python +history = clf.recursive_feature_addition(early_stopping=10, minimum_improvement_in_score=0.05) +clf.plot_scores(title="Iris", xtick_rotation=90) +clf.plot_weights() +clf.plot_weights(weight_type="cross_validation") +``` +![](images/3a.png) + +Now let's do a binary classification but optimize `fbeta` score instead of `accuracy`. Instead of a fixed penalty, we are going to use a custom penalty that scales with the number of features included. + +```python +from sklearn.metrics import fbeta_score + +# Let's do a binary classification +y_notsetosa = y.map(lambda x: {True:"not_setosa", False:x}[x != "setosa"]) + +# Let's also use a FBeta scorer +scorer = make_scorer(fbeta_score, average="binary", beta=0.5, pos_label="setosa") + +# Instantiate model +clf_binary = ClairvoyanceClassification( + n_jobs=-1, + scorer="accuracy", + n_draws=10, + estimator=estimator, + param_grid=param_grid, + remove_zero_weighted_features=False, + verbose=1, +) + +# Let's also prefer lower C values and l1 over l2 (i.e., stronger regularization and sparsity) +clf_binary.fit(X_normalized, y, sort_hyperparameters_by=["C", "penalty"], ascending=[True, True]) + +# Instead of adding a fixed penalty for adding new features, let's add a function that scales with the number of features +history = clf_binary.recursive_feature_addition(early_stopping=10, additional_feature_penalty=lambda n: 1e-3*n**2) +history.head() +``` +![](images/4a.png) + +```python +clf_binary.plot_scores(title="Iris (Binary)", xtick_rotation=90) +clf_binary.plot_weights() +clf_binary.plot_weights(weight_type="cross_validation") +``` +![](images/5a.png) + +##### Feature selection based on regression tasks +Here's a basic regression using a `DecisionTreeRegressor` model and a grid search for different `min_samples_leaf` and `min_samples_split` parameters. We add 87 noise variables and normalize all of the features so their scale is standardized. In this case, we are optimizing for `neg_root_mean_squared_error`. We are using a validation set of ~16% of the data during our recursive feature inclusion. For decision trees, we have the issue of getting zero-weighted features which are uninformative and misleading for RCI. To get around this, we implement a recursive feature removal that only keeps non-zero weighted features. We can turn this on via `remove_zero_weighted_features=True`. This also ensures that there are no redundant feature sets (not an issue when `remove_zero_weighted_features=False` because they are recursively added). + +Note: When we use `remove_zero_weighted_features=True`, we get a scatter plot instead of a line plot with error because there are multiple feature sets (each with their own performance distribution on the CV set) that may have the same number of features. + +```python +from sklearn.tree import DecisionTreeRegressor +from sklearn.model_selection import train_test_split + +# Load Boston data +# from sklearn.datasets import load_boston; boston = load_boston() # Deprecated +data_url = "http://lib.stat.cmu.edu/datasets/boston" +raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None) +data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]) +target = raw_df.values[1::2, 2] +X = pd.DataFrame(data, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']) +y = pd.Series(target) + +number_of_noise_features = 100 - X.shape[1] +X_noise = pd.DataFrame(np.random.RandomState(0).normal(size=(X.shape[0], number_of_noise_features)), columns=map(lambda j: f"noise_{j}", range(number_of_noise_features))) +X_boston_with_noise = pd.concat([X, X_noise], axis=1) +X_normalized = X_boston_with_noise - X_boston_with_noise.mean(axis=0).values +X_normalized = X_normalized/X_normalized.std(axis=0).values + +# Let's fit the model but leave a held out testing set +X_training, X_testing, y_training, y_testing = train_test_split(X_normalized, y, random_state=0, test_size=0.3) + +# Get parameters +estimator = DecisionTreeRegressor(random_state=0) +param_grid = {"min_samples_leaf":[1,2,3,5,8],"min_samples_split":{ 0.1618, 0.382, 0.5, 0.618}} + +# Fit model +reg = ClairvoyanceRegression( + name="Boston", + n_jobs=-1, + n_draws=10, + estimator=estimator, + param_grid=param_grid, + verbose=1, + remove_zero_weighted_features=True, +) +reg.fit(X_training, y_training) +history = reg.recursive_feature_addition(early_stopping=10, X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing) +history.head() +``` +![](images/6a.png) + +```python +reg.plot_scores(title="Boston", xtick_rotation=90) +reg.plot_weights() +reg.plot_weights(weight_type="cross_validation") +``` +![](images/7a.png) + +Let's see if we can increase the performance using the weights fitted with a `DecisionTreeRegressor` but with an ensemble `GradientBoostingRegressor` for the actual feature inclusion algorithm. + +```python +from sklearn.ensemble import GradientBoostingRegressor +# Get the relevant parameters from the DecisionTreeRegressor that will be be applicable to the ensemble +relevant_params_from_best_decisiontree_estimator = {k:reg.best_estimator_.get_params()[k] for k in ["criterion", "min_samples_split"]} +# Get estimator +estimator = GradientBoostingRegressor(random_state=0, **relevant_params_from_best_decisiontree_estimator) +# Recursive feature inclusion using ensemble +history = reg.recursive_feature_addition(early_stopping=10, estimator=estimator, X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing) +reg.plot_scores(title="Boston", xtick_rotation=90) +reg.plot_weights() +reg.plot_weights(weight_type="cross_validation") + + +``` +![](images/8a.png) + +RMSE is looking better. + +##### Recursive feature selection based on classification tasks +Here we are running `Clairvoyance` recursively identifying several feature sets that work with different hyperparameters to get a range of feature sets to select from in the end. We will iterate through all of the hyperparamater configurations, recursively feed in the data using different percentiles of the weights, and use different score thresholds from the random draws. The recursive usage is similar to the legacy implementation used in [Espinoza & Dupont et al. 2021](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008857) (which is still provided as an executable). + +```python +# Get the iris data (again) +X_normalized = X_iris_with_noise - X_iris_with_noise.mean(axis=0).values +X_normalized = X_normalized/X_normalized.std(axis=0).values +target_names = load_iris().target_names +y = pd.Series(load_iris().target) +y = y.map(lambda i: target_names[i]) + +# Specify model algorithm and parameter grid +estimator=LogisticRegression(max_iter=1000, solver="liblinear", multi_class="ovr") +param_grid={ + "C":[1e-10] + (np.arange(1,11)/10).tolist(), + "penalty":["l1", "l2"], +} + +# Instantiate model +X_training, X_testing, y_training, y_testing = train_test_split(X_normalized, y, random_state=0, test_size=0.3) + +rci = ClairvoyanceRecursive( + n_jobs=-1, + scorer="accuracy", + n_draws=10, + estimator=estimator, + param_grid=param_grid, + percentiles=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.925, 0.95, 0.975, 0.99], + minimum_scores=[-np.inf, 0.382, 0.5], + verbose=0, + remove_zero_weighted_features=False, +) +rci.fit(X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing, sort_hyperparameters_by=["C", "penalty"], ascending=[True, True]) +rci.plot_recursive_feature_selection() + +``` +![](images/9a.png) + +```python +# Plot score comparisons +rci.plot_scores_comparison() +rci.get_history().head() +``` + +Let's see which feature sets have the highest validation score (i.e., average cross-validation score) and highest testing score (not used during RCI) while also considering the number of features. + +![](images/10a.png) + +Looks like there are several hyperparameter sets that can predict at > 92% accuracy on the cross-validation and > 95% accuracy on the testing set using just the `petal_length` and `petal_width`. This was able to filter out both the 96 noise features and the 2 non-informative real features. \ No newline at end of file diff --git a/README.md b/README.md index 28addd6..f8c58c7 100644 --- a/README.md +++ b/README.md @@ -4,21 +4,18 @@ | | |_____| | |_____/ \ / | | \_/ |_____| | \ | | |______ |_____ |_____ | | __|__ | \_ \/ |_____| | | | | \_| |_____ |______ ``` -#### Description +### Description -Reimplementation for `Clairvoyance` from [Espinoza & Dupont et al. 2021](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008857). The updated version includes regression support, support for all linear/tree-based models, and improved visualizations. `Clairvoyance` is currently in active development. +Reimplementation of the `Clairvoyance` AutoML method from [Espinoza & Dupont et al. 2021](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008857). The updated version includes regression support, support for all linear/tree-based models, feature selection through modified `Feature-Engine` classes, and bayeisan optimization using `Optuna`. +`Clairvoyance` is currently under active development and API is subject to change. -#### Details: -`import clairvoyance as cy` - -Stable: `__version__ = "2023.6.26"` - -Developmental: `__version__ = "2023.10.13"` +### Details: +`import clairvoyance as cy` -#### Installation +### Installation ``` # Stable: @@ -26,37 +23,33 @@ Developmental: `__version__ = "2023.10.13"` # via PyPI pip install clairvoyance_feature_selection -# via Conda -conda install -c jolespin clairvoyance # Developmental: pip install git+https://github.com/jolespin/clairvoyance ``` -#### Citation +### Citation Espinoza JL, Dupont CL, O’Rourke A, Beyhan S, Morales P, Spoering A, et al. (2021) Predicting antimicrobial mechanism-of-action from transcriptomes: A generalizable explainable artificial intelligence approach. PLoS Comput Biol 17(3): e1008857. https://doi.org/10.1371/journal.pcbi.1008857 -#### Development +### Development *Clairvoyance* is currently under active development and undergoing a complete reimplementation from the ground up from the original publication. The following includes a list of new features: -* Supports any linear or tree-based `Scikit-Learn` compatible model +* Bayesian optimization using `Optuna` +* Supports any linear or tree-based `Scikit-Learn` compatible estimator * Supports any `Scikit-Learn` compatible performance metric * Supports regression (in addition to classification as in original implementation) * Properly implements transformations for compositional data (e.g., CLR and closure) based on the query features for each iteration -* Added option to remove zero weighted features and redundant feature sets -* Added asymmetric mode in addition to the symmetric mode from the original implementation -* Added informative publication-ready plots -* Outputs multiple combinations of hyperparameters and scores for each feature combination -* Option to use testing sets or alternative models for recursive feature inclusion - +* Option to remove zero weighted features during model refitting +* [Pending] Visualizations for AutoML -#### Usage +### Usage +#### Feature selection based on classification tasks +##### Let's try using a simple Logistic Regression which can be very powerful for some tasks: -##### Feature selection based on classification tasks -Here's a basic classifcation using a `LogisticRegression` model and a grid search for different `C` and `penalty` parameters. We add 996 noise variables within the range of values as the original Iris features. After that we normalize them so their scale is standardized. In this case, we are optimizing for `accuracy`. We are using a `LogisticRegression` where we don't really have to worry about features with zero weight in the end so we are going to set ` remove_zero_weighted_features=False`. This will allow us to plot a nice RCI curve with error. +Here's a simple usage case for the iris dataset with 996 noise features (total = 1000 features) ```python import clairvoyance as cy @@ -64,6 +57,7 @@ import numpy as np import pandas as pd from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split # Load iris dataset X, y = load_iris(return_X_y=True, as_frame=True) @@ -83,97 +77,90 @@ X_noise = pd.DataFrame( ) X_iris_with_noise = pd.concat([X, X_noise], axis=1) -X_normalized = X_iris_with_noise - X_iris_with_noise.mean(axis=0).values -X_normalized = X_normalized/X_normalized.std(axis=0).values +X_training, X_testing, y_training, y_testing = train_test_split(X_iris_with_noise, y, stratify=y, random_state=0, test_size=0.3) # Specify model algorithm and parameter grid -estimator=LogisticRegression(max_iter=1000, solver="liblinear", multi_class="ovr") -param_grid={ - "C":[1e-10] + (np.arange(1,11)/10).tolist(), - "penalty":["l1", "l2"], +estimator=LogisticRegression(max_iter=1000, solver="liblinear") +param_space={ + "C":["float", 0.0, 1.0], + "penalty":["categorical", ["l1", "l2"]], } -# Instantiate model -clf = cy.ClairvoyanceClassification( - n_jobs=-1, - scorer="accuracy", - n_draws=10, - estimator=estimator, - param_grid=param_grid, - verbose=1, - remove_zero_weighted_features=False, - -) -clf.fit(X_normalized, y)#, sort_hyperparameters_by=["C", "penalty"], ascending=[True, False]) -history = clf.recursive_feature_inclusion(early_stopping=10) -history.head() - -``` -![](images/1a.png) - -```python -clf.plot_scores(title="Iris", xtick_rotation=90) -clf.plot_weights() -clf.plot_weights(weight_type="cross_validation") +# Fit the AutoML model +model = cy.bayesian.BayesianClairvoyanceClassification(estimator, param_space, n_iter=4, n_trials=50, feature_selection_method="addition", n_jobs=-1, verbose=0, feature_selection_performance_threshold=0.025) +df_results = model.fit_transform(X_training, y_training, cv=3, optimize_with_training_and_testing=True, X_testing=X_testing, y_testing=y_testing) + +[I 2024-07-05 12:14:33,611] A new study created in memory with name: n_iter=1 +[I 2024-07-05 12:14:33,680] Trial 0 finished with values: [0.7238095238095238, 0.7333333333333333] and parameters: {'C': 0.417022004702574, 'penalty': 'l1'}. +[I 2024-07-05 12:14:33,866] Trial 1 finished with values: [0.7238095238095239, 0.7333333333333333] and parameters: {'C': 0.30233257263183977, 'penalty': 'l1'}. +[I 2024-07-05 12:14:34,060] Trial 2 finished with values: [0.39999999999999997, 0 +... +Recursive feature addition: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 170.02it/s] +Synopsis[n_iter=2] Input Features: 6, Selected Features: 1 +Initial Training Score: 0.9047619047619048, Feature Selected Training Score: 0.8761904761904762 +Initial Testing Score: 0.7777777777777778, Feature Selected Testing Score: 0.9333333333333333 ``` -![](images/2a.png) -There are still a few noise variables, though with much lower weight, suggesting our classifier is modeling noise. We can add an additional penalty where a change in score must exceed a threshold to add a new feature during the recursive feature inclusion algorithm. We are keeping ` remove_zero_weighted_features=False` for this example. +##### Example output: +We were able to filter out all the noise features and get just the most informative features but linear models might not be the best for this classification task. -```python -history = clf.recursive_feature_inclusion(early_stopping=10, minimum_improvement_in_score=0.05) -clf.plot_scores(title="Iris", xtick_rotation=90) -clf.plot_weights() -clf.plot_weights(weight_type="cross_validation") -``` -![](images/3a.png) -Now let's do a binary classification but optimize `fbeta` score instead of `accuracy`. Instead of a fixed penalty, we are going to use a custom penalty that scales with the number of features included. +| study_name | best_hyperparameters | best_estimator | best_trial | number_of_initial_features | initial_training_score | initial_testing_score | number_of_selected_features | feature_selected_training_score | feature_selected_testing_score | selected_features | +|--------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------|--------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------------------------------------------------------------------|--------------------------------|-------------------| +| n_iter=1 | {'C': 0.0745664572902166, 'penalty': 'l1'} | "LogisticRegression(C=0.0745664572902166, max_iter=1000, penalty='l1', | | | | | | | | | +| random_state=0, solver='liblinear')" | FrozenTrial(number=28, state=TrialState.COMPLETE, values=[0.7904761904761904, 0.7333333333333333], datetime_start=datetime.datetime(2024, 7, 5, 12, 14, 37, 977702), datetime_complete=datetime.datetime(2024, 7, 5, 12, 14, 38, 26179), params={'C': 0.0745664572902166, 'penalty': 'l1'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'C': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'penalty': CategoricalDistribution(choices=('l1', 'l2'))}, trial_id=28, value=None) | 1000 | 0.7904761904761904 | 0.7333333333333333 | 6 | 0.9047619047619048 | 0.7333333333333333 | ['petal_length', 'noise_25', 'noise_833', 'noise_48', 'noise_653', 'noise_793'] | | | +| n_iter=2 | {'C': 0.9875411040455084, 'penalty': 'l1'} | "LogisticRegression(C=0.9875411040455084, max_iter=1000, penalty='l1', | | | | | | | | | +| random_state=0, solver='liblinear')" | FrozenTrial(number=11, state=TrialState.COMPLETE, values=[0.9047619047619048, 0.7777777777777778], datetime_start=datetime.datetime(2024, 7, 5, 12, 14, 43, 176045), datetime_complete=datetime.datetime(2024, 7, 5, 12, 14, 43, 197241), params={'C': 0.9875411040455084, 'penalty': 'l1'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'C': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'penalty': CategoricalDistribution(choices=('l1', 'l2'))}, trial_id=11, value=None) | 6 | 0.9047619047619048 | 0.7777777777777778 | 1 | 0.8761904761904762 | 0.9333333333333333 | ['petal_length'] | | | -```python -from sklearn.metrics import fbeta_score - -# Let's do a binary classification -y_notsetosa = y.map(lambda x: {True:"not_setosa", False:x}[x != "setosa"]) - -# Let's also use a FBeta scorer -scorer = make_scorer(fbeta_score, average="binary", beta=0.5, pos_label="setosa") - -# Instantiate model -clf_binary = cy.ClairvoyanceClassification( - n_jobs=-1, - scorer="accuracy", - n_draws=10, - estimator=estimator, - param_grid=param_grid, - remove_zero_weighted_features=False, - verbose=1, -) -# Let's also prefer lower C values and l1 over l2 (i.e., stronger regularization and sparsity) -clf_binary.fit(X_normalized, y, sort_hyperparameters_by=["C", "penalty"], ascending=[True, True]) +##### Let's try it again with a tree-based model: -# Instead of adding a fixed penalty for adding new features, let's add a function that scales with the number of features -history = clf_binary.recursive_feature_inclusion(early_stopping=10, additional_feature_penalty=lambda n: 1e-3*n**2) -history.head() ``` -![](images/4a.png) +# Specify DecisionTree model algorithm and parameter grid +from sklearn.tree import DecisionTreeClassifier + +estimator=DecisionTreeClassifier(random_state=0) +param_space = { + "min_samples_leaf":["int", 1, 50], + "min_samples_split": ["float", 0.0, 0.5], + "max_features":["categorical", ["sqrt", "log2", None]], +} -```python -clf_binary.plot_scores(title="Iris (Binary)", xtick_rotation=90) -clf_binary.plot_weights() -clf_binary.plot_weights(weight_type="cross_validation") +model = cy.bayesian.BayesianClairvoyanceClassification(estimator, param_space, n_iter=4, n_trials=50, feature_selection_method="addition", n_jobs=-1, verbose=0, feature_selection_performance_threshold=0.025) +df_results = model.fit_transform(X_training, y_training, cv=3, optimize_with_training_and_testing=True, X_testing=X_testing, y_testing=y_testing) +df_results + +[I 2024-07-05 12:22:09,866] A new study created in memory with name: n_iter=1 +[I 2024-07-05 12:22:11,454] Trial 0 finished with values: [0.3523809523809524, 0.37777777777777777] and parameters: {'min_samples_leaf': 21, 'min_samples_split': 0.36016224672107905, 'max_features': 'log2'}. +[I 2024-07-05 12:22:12,243] Trial 1 finished with values: [0.9142857142857143, 0.9555555555555556] and parameters: {'min_samples_leaf': 5, 'min_samples_split': 0.09313010568883545, 'max_features': None}. +[I 2024-07-05 12:22:12,999] Trial 2 finished with values: [0.3523809523809524, 0.37777777777777777] and parameters: {'min_samples_leaf': 21, 'min_samples_split': 0.34260975019837975, 'max_features': 'log2'}. +... +/Users/jolespin/miniconda3/envs/soothsayer_env/lib/python3.9/site-packages/clairvoyance/feature_selection.py:632: UserWarning: remove_zero_weighted_features=True and removed 996/1000 features + warnings.warn("remove_zero_weighted_features=True and removed {}/{} features".format((n_features_initial - n_features_after_zero_removal), n_features_initial)) +Recursive feature addition: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 199.54it/s] +Synopsis[n_iter=1] Input Features: 1000, Selected Features: 1 +Initial Training Score: 0.9238095238095237, Feature Selected Training Score: 0.9619047619047619 +Initial Testing Score: 0.9555555555555556, Feature Selected Testing Score: 0.9555555555555556 ``` -![](images/5a.png) -##### Feature selection based on regression tasks -Here's a basic regression using a `DecisionTreeRegressor` model and a grid search for different `min_samples_leaf` and `min_samples_split` parameters. We add 87 noise variables and normalize all of the features so their scale is standardized. In this case, we are optimizing for `neg_root_mean_squared_error`. We are using a validation set of ~16% of the data during our recursive feature inclusion. For decision trees, we have the issue of getting zero-weighted features which are uninformative and misleading for RCI. To get around this, we implement a recursive feature removal that only keeps non-zero weighted features. We can turn this on via `remove_zero_weighted_features=True`. This also ensures that there are no redundant feature sets (not an issue when `remove_zero_weighted_features=False` because they are recursively added). +##### Example output: +We were able to get much higher perfomance on both the training and testing sets while identifying the most informative feature(s). + +| study_name | best_hyperparameters | best_estimator | best_trial | number_of_initial_features | initial_training_score | initial_testing_score | number_of_selected_features | feature_selected_training_score | feature_selected_testing_score | selected_features | +|---------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------|--------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------------------|--------------------------------|-------------------| +| n_iter=1 | {'min_samples_leaf': 9, 'min_samples_split': 0.10616631466626364, 'max_features': None} | "DecisionTreeClassifier(min_samples_leaf=9, | | | | | | | | | +| min_samples_split=0.10616631466626364, random_state=0)" | FrozenTrial(number=12, state=TrialState.COMPLETE, values=[0.9238095238095237, 0.9555555555555556], datetime_start=datetime.datetime(2024, 7, 5, 12, 22, 16, 644273), datetime_complete=datetime.datetime(2024, 7, 5, 12, 22, 16, 712703), params={'min_samples_leaf': 9, 'min_samples_split': 0.10616631466626364, 'max_features': None}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'min_samples_leaf': IntDistribution(high=50, log=False, low=1, step=1), 'min_samples_split': FloatDistribution(high=0.5, log=False, low=0.0, step=None), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2', None))}, trial_id=12, value=None) | 1000 | 0.9238095238095237 | 0.9555555555555556 | 1 | 0.9619047619047619 | 0.9555555555555556 | ['petal_width'] | | | + -Note: When we use `remove_zero_weighted_features=True`, we get a scatter plot instead of a line plot with error because there are multiple feature sets (each with their own performance distribution on the CV set) that may have the same number of features. +#### Feature selection based on regression tasks + +Alright, let's switch it up and model a regression task instead. We are going to do the controversial boston housing dataset just because it's easy. We are going to use the RMSE scorer from `Scikit-Learn` and increase the number of iterations for the bayesian hyperparamter optimzation. ```python +# Load modules from sklearn.tree import DecisionTreeRegressor from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error # Load Boston data # from sklearn.datasets import load_boston; boston = load_boston() # Deprecated @@ -184,108 +171,48 @@ target = raw_df.values[1::2, 2] X = pd.DataFrame(data, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']) y = pd.Series(target) -number_of_noise_features = 100 - X.shape[1] +# Add some noise features to total 1000 features +number_of_noise_features = 1000 - X.shape[1] X_noise = pd.DataFrame(np.random.RandomState(0).normal(size=(X.shape[0], number_of_noise_features)), columns=map(lambda j: f"noise_{j}", range(number_of_noise_features))) X_boston_with_noise = pd.concat([X, X_noise], axis=1) X_normalized = X_boston_with_noise - X_boston_with_noise.mean(axis=0).values X_normalized = X_normalized/X_normalized.std(axis=0).values # Let's fit the model but leave a held out testing set -X_training, X_testing, y_training, y_testing = train_test_split(X_normalized, y, random_state=0, test_size=0.3) +X_training, X_testing, y_training, y_testing = train_test_split(X_normalized, y, random_state=0, test_size=0.1) -# Get parameters +# Define the parameter space estimator = DecisionTreeRegressor(random_state=0) -param_grid = {"min_samples_leaf":[1,2,3,5,8],"min_samples_split":{ 0.1618, 0.382, 0.5, 0.618}} - -# Fit model -reg = cy.ClairvoyanceRegression( - name="Boston", - n_jobs=-1, - n_draws=10, - estimator=estimator, - param_grid=param_grid, - verbose=1, - remove_zero_weighted_features=True, -) -reg.fit(X_training, y_training) -history = reg.recursive_feature_inclusion(early_stopping=10, X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing) -history.head() -``` -![](images/6a.png) - -```python -reg.plot_scores(title="Boston", xtick_rotation=90) -reg.plot_weights() -reg.plot_weights(weight_type="cross_validation") -``` -![](images/7a.png) - -Let's see if we can increase the performance using the weights fitted with a `DecisionTreeRegressor` but with an ensemble `GradientBoostingRegressor` for the actual feature inclusion algorithm. - -```python -from sklearn.ensemble import GradientBoostingRegressor -# Get the relevant parameters from the DecisionTreeRegressor that will be be applicable to the ensemble -relevant_params_from_best_decisiontree_estimator = {k:reg.best_estimator_.get_params()[k] for k in ["criterion", "min_samples_split"]} -# Get estimator -estimator = GradientBoostingRegressor(random_state=0, **relevant_params_from_best_decisiontree_estimator) -# Recursive feature inclusion using ensemble -history = reg.recursive_feature_inclusion(early_stopping=10, estimator=estimator, X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing) -reg.plot_scores(title="Boston", xtick_rotation=90) -reg.plot_weights() -reg.plot_weights(weight_type="cross_validation") - - -``` -![](images/8a.png) - -RMSE is looking better. - -##### Recursive feature selection based on classification tasks -Here we are running `Clairvoyance` recursively identifying several feature sets that work with different hyperparameters to get a range of feature sets to select from in the end. We will iterate through all of the hyperparamater configurations, recursively feed in the data using different percentiles of the weights, and use different score thresholds from the random draws. The recursive usage is similar to the legacy implementation used in [Espinoza & Dupont et al. 2021](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008857) (which is still provided as an executable). - -```python -# Get the iris data (again) -X_normalized = X_iris_with_noise - X_iris_with_noise.mean(axis=0).values -X_normalized = X_normalized/X_normalized.std(axis=0).values -target_names = load_iris().target_names -y = pd.Series(load_iris().target) -y = y.map(lambda i: target_names[i]) - -# Specify model algorithm and parameter grid -estimator=LogisticRegression(max_iter=1000, solver="liblinear", multi_class="ovr") -param_grid={ - "C":[1e-10] + (np.arange(1,11)/10).tolist(), - "penalty":["l1", "l2"], +param_space = { + "min_samples_leaf":["int", 1, 50], + "min_samples_split": ["float", 0.0, 0.5], + "max_features":["categorical", ["sqrt", "log2", None]], } +scorer = make_scorer(mean_squared_error, greater_is_better=False) + +# Fit the AutoML model +model = BayesianClairvoyanceRegression(estimator, param_space, n_iter=4, n_trials=10, feature_selection_method="addition", n_jobs=-1, verbose=1, feature_selection_performance_threshold=0.0) +df_results = model.fit_transform(X_training, y_training, cv=5, optimize_with_training_and_testing="auto", X_testing=X_testing, y_testing=y_testing) + +I 2024-07-06 01:30:03,567] A new study created in memory with name: n_iter=1 +[I 2024-07-06 01:30:03,781] Trial 0 finished with values: [-8.199129905056083, -10.15240690512492] and parameters: {'min_samples_leaf': 21, 'min_samples_split': 0.36016224672107905, 'max_features': 'log2'}. +[I 2024-07-06 01:30:04,653] Trial 1 finished with values: [-4.971853722495094, -6.666700255530846] and parameters: {'min_samples_leaf': 5, 'min_samples_split': 0.09313010568883545, 'max_features': None}. +[I 2024-07-06 01:30:05,188] Trial 2 finished with values: [-8.230463026740736, -10.167328393077224] and parameters: {'min_samples_leaf': 21, 'min_samples_split': 0.34260975019837975, 'max_features': 'log2'}. +... +Recursive feature addition: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 116.99it/s] +Synopsis[n_iter=4] Input Features: 3, Selected Features: 3 +Initial Training Score: -4.972940969198907, Feature Selected Training Score: -4.972940969198907 +Initial Testing Score: -6.313587662660524, Feature Selected Testing Score: -6.313587662660524 -# Instantiate model -X_training, X_testing, y_training, y_testing = train_test_split(X_normalized, y, random_state=0, test_size=0.3) - -rci = cy.ClairvoyanceRecursive( - n_jobs=-1, - scorer="accuracy", - n_draws=10, - estimator=estimator, - param_grid=param_grid, - percentiles=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.925, 0.95, 0.975, 0.99], - minimum_scores=[-np.inf, 0.382, 0.5], - verbose=0, - remove_zero_weighted_features=False, -) -rci.fit(X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing, sort_hyperparameters_by=["C", "penalty"], ascending=[True, True]) -rci.plot_recursive_feature_selection() - -``` -![](images/9a.png) - -```python -# Plot score comparisons -rci.plot_scores_comparison() -rci.get_history().head() ``` -Let's see which feature sets have the highest validation score (i.e., average cross-validation score) and highest testing score (not used during RCI) while also considering the number of features. +#### Example output: -![](images/10a.png) +We successfully removed all the noise features and determined that `RM, LSTAT, CRIM` are the most important features. It's a controversial interpretation so I'm not going there but these results agree with what [other researchers](https://towardsdatascience.com/linear-regression-on-boston-housing-dataset-f409b7e4a155) have determined as well. -Looks like there are several hyperparameter sets that can predict at > 92% accuracy on the cross-validation and > 95% accuracy on the testing set using just the `petal_length` and `petal_width`. This was able to filter out both the 96 noise features and the 2 non-informative real features. \ No newline at end of file +| study_name | best_hyperparameters | best_estimator | best_trial | number_of_initial_features | initial_training_score | initial_testing_score | number_of_selected_features | feature_selected_training_score | feature_selected_testing_score | selected_features | +|------------|-------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------------------|--------------------------------|----------------------------------------------------------------------------------------------------------------------------------| +| n_iter=1 | {'min_samples_leaf': 5, 'min_samples_split': 0.09313010568883545, 'max_features': None} | DecisionTreeRegressor(min_samples_leaf=5, min_samples_split=0.09313010568883545, random_state=0) | FrozenTrial(number=1, state=TrialState.COMPLETE, values=[-4.971853722495094, -6.666700255530846], datetime_start=datetime.datetime(2024, 7, 6, 1, 30, 4, 256210), datetime_complete=datetime.datetime(2024, 7, 6, 1, 30, 4, 653385), params={'min_samples_leaf': 5, 'min_samples_split': 0.09313010568883545, 'max_features': None}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'min_samples_leaf': IntDistribution(high=50, log=False, low=1, step=1), 'min_samples_split': FloatDistribution(high=0.5, log=False, low=0.0, step=None), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2', None))}, trial_id=1, value=None) | 1000 | -4.971853722495094 | -6.666700255530846 | 12 | -4.167626439610535 | -6.497959383451274 | ['RM', 'LSTAT', 'CRIM', 'DIS', 'TAX', 'noise_657', 'noise_965', 'noise_711', 'noise_213', 'noise_930', 'noise_253', 'noise_484'] | +| n_iter=2 | {'min_samples_leaf': 30, 'min_samples_split': 0.11300600030211794, 'max_features': None} | DecisionTreeRegressor(min_samples_leaf=30, min_samples_split=0.11300600030211794, random_state=0) | FrozenTrial(number=5, state=TrialState.COMPLETE, values=[-4.971072001107094, -6.2892657979392474], datetime_start=datetime.datetime(2024, 7, 6, 1, 30, 12, 603770), datetime_complete=datetime.datetime(2024, 7, 6, 1, 30, 12, 619502), params={'min_samples_leaf': 30, 'min_samples_split': 0.11300600030211794, 'max_features': None}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'min_samples_leaf': IntDistribution(high=50, log=False, low=1, step=1), 'min_samples_split': FloatDistribution(high=0.5, log=False, low=0.0, step=None), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2', None))}, trial_id=5, value=None) | 12 | -4.971072001107094 | -6.2892657979392474 | 4 | -4.944562598653571 | -6.3774459339786524 | ['RM', 'LSTAT', 'CRIM', 'noise_213'] | +| n_iter=3 | {'min_samples_leaf': 45, 'min_samples_split': 0.06279265523191813, 'max_features': None} | DecisionTreeRegressor(min_samples_leaf=45, min_samples_split=0.06279265523191813, random_state=0) | FrozenTrial(number=1, state=TrialState.COMPLETE, values=[-5.236077512452411, -6.670753984555223], datetime_start=datetime.datetime(2024, 7, 6, 1, 30, 14, 831786), datetime_complete=datetime.datetime(2024, 7, 6, 1, 30, 14, 848240), params={'min_samples_leaf': 45, 'min_samples_split': 0.06279265523191813, 'max_features': None}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'min_samples_leaf': IntDistribution(high=50, log=False, low=1, step=1), 'min_samples_split': FloatDistribution(high=0.5, log=False, low=0.0, step=None), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2', None))}, trial_id=1, value=None) | 4 | -5.236077512452411 | -6.670753984555223 | 3 | -5.236077512452413 | -6.670753984555223 | ['RM', 'LSTAT', 'CRIM'] | +| n_iter=4 | {'min_samples_leaf': 30, 'min_samples_split': 0.004493048833777491, 'max_features': None} | DecisionTreeRegressor(min_samples_leaf=30, min_samples_split=0.004493048833777491, random_state=0) | FrozenTrial(number=3, state=TrialState.COMPLETE, values=[-4.972940969198907, -6.313587662660524], datetime_start=datetime.datetime(2024, 7, 6, 1, 30, 19, 160978), datetime_complete=datetime.datetime(2024, 7, 6, 1, 30, 19, 177029), params={'min_samples_leaf': 30, 'min_samples_split': 0.004493048833777491, 'max_features': None}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'min_samples_leaf': IntDistribution(high=50, log=False, low=1, step=1), 'min_samples_split': FloatDistribution(high=0.5, log=False, low=0.0, step=None), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2', None))}, trial_id=3, value=None) | 3 | -4.972940969198907 | -6.313587662660524 | 3 | -4.972940969198907 | -6.313587662660524 | ['RM', 'LSTAT', 'CRIM'] | \ No newline at end of file diff --git a/clairvoyance/__init__.py b/clairvoyance/__init__.py index 5465425..e71efcb 100644 --- a/clairvoyance/__init__.py +++ b/clairvoyance/__init__.py @@ -1,54 +1,7 @@ -#!/usr/bin/env python -# ======= -# Contact -# ======= -# Producer: Josh L. Espinoza -# Contact: jespinoz@jcvi.org, jol.espinoz@gmail.com -# Google Scholar: https://scholar.google.com/citations?user=r9y1tTQAAAAJ&hl -# ======= -# License: -# ======= -# GNU AFFERO GENERAL PUBLIC LICENSE -# Version 3, 19 November 2007 -# Copyright (C) 2007 Free Software Foundation, Inc. -# Everyone is permitted to copy and distribute verbatim copies -# of this license document, but changing it is not allowed. -# -# ======= -# Version -# ======= -__version__= "2023.10.13" -__author__ = "Josh L. Espinoza" -__email__ = "jespinoz@jcvi.org, jol.espinoz@gmail.com" -__url__ = "https://github.com/jolespin/clairvoyance" -__cite__ = "https://github.com/jolespin/clairvoyance" -__license__ = "BSD-3" -__developmental__ = True +__version__ = "2024.7.2" -functions = { - "format_stratify", - "format_weights", - "format_cross_validation", - "get_feature_importance_attribute", - "get_balanced_class_subset", - "recursive_feature_inclusion", - "plot_scores_line","plot_weights_bar", - "plot_weights_box", - "plot_recursive_feature_selection", - "plot_scores_comparison", -} -classes = { - "ClairvoyanceBase","ClairvoyanceClassification","ClairvoyanceRegression","ClairvoyanceRecursive", - -} -__all__ = sorted(functions | classes) -__doc__ = """ - - _______ _______ _____ ______ _ _ _____ __ __ _______ __ _ _______ _______ - | | |_____| | |_____/ \ / | | \_/ |_____| | \ | | |______ - |_____ |_____ | | __|__ | \_ \/ |_____| | | | | \_| |_____ |______ - - -""" -# ======= -from .clairvoyance import * +from . import utils +from . import visuals +from . import transformations +from . import bayesian +from . import legacy diff --git a/clairvoyance/bayesian.py b/clairvoyance/bayesian.py new file mode 100644 index 0000000..a95a566 --- /dev/null +++ b/clairvoyance/bayesian.py @@ -0,0 +1,668 @@ +# Built-ins +import os, sys, copy, warnings +from collections import OrderedDict +from collections.abc import Mapping + +# from multiprocessing import cpu_count + +# PyData +import pandas as pd +import numpy as np +# import xarray as xr + +# Machine learning +from scipy import stats +from sklearn.metrics import get_scorer #, make_scorer +from sklearn.model_selection import cross_val_score, cross_validate +from sklearn.base import clone, is_classifier, is_regressor + +# Feature selection +from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures # , RecursiveFeatureElimination, RecursiveFeatureAddition + +# Hyperparameter selection +import optuna + +from .utils import * +from .feature_selection import ClairvoyanceRecursiveFeatureAddition, ClairvoyanceRecursiveFeatureElimination + +_bayesianclairvoyancebase_docstring = """ + # Modeling parameters: + # ==================== + estimator: + sklearn-compatible estimator with either .feature_importances_ or .coef_ + + param_space: + + dict with {name_param: [suggestion_type, *]} + + suggestion_types: {"categorical", "discrete_uniform", "float", "int", "loguniform", "uniform"} + + categorical suggestion types must contain 2 items (e.g., [categorical, ['a','b','c']]) + uniform/loguniform suggestion types must contain 3 items [uniform/loguniform, low, high] + float/int suggestion type must contain either 3 items [float/int, low, high]) or + 4 items [float/int, low, high, {step:float/int, log:bool}] + + + scorer: + sklearn-compatible scorer or string with scorer name that can be used with `sklearn.metrics.get_scorer` + + n_iter [Default=10]: + Number of iterations to run Optuna + feature selection + + n_trials [Default=50]: + Number of trials for Optuna for each iteration + + transformation [Default=None]: + Transformation to use for each time a feature is added or removed. + This is useful for compositional data analysis where the feature transformations + are dependent on the other features. + None: No transformation + closure: Total sum-scaling where the values are normalized by the total counts + clr: CLR transformation. Cannot handle zeros so may require pseudocounts + clr_with_multiplicative_replacement: Closure followed by a pseudocount 1/m**2 where m = number of features + + # Optuna + # ====== + study_prefix [Default: "n_iter="]: + Prefix to use for Optuna iterations + + study_timeout [Default=None]: + Stop study after the given number of second(s). If this argument is set to None, the study is executed without time limitation. + If n_trials is also set to None, the study continues to create trials until it receives a termination signal such as Ctrl+C or SIGTERM. + + study_callbacks [Default=None]: + List of callback functions that are invoked at the end of each trial. + Each function must accept two parameters with the following types in this order: Study and FrozenTrial. + + training_testing_weights [Default = [1.0,1.0]]: + Training and testing multipliers to use in multi-objective optimization + + # Feature selection + # ================= + feature_selection_method:str [Default = "addition"]: + Feature selection method to use. + addition or elimination + + + feature_selection_performance_threshold [Default = 0.0]: + Minimum performance gain to consider a feature + + drop_constant_features [Default = True]: + Use FeatureEngine to drop constant features in cross-validation splits + + threshold_constant_features [Default = 1]: + Threshold to use for constant features + + drop_duplicate_features [Default = True]: + Use FeatureEngine to drop duplicate features in cross-validation splits + + feature_importances_from_cv [Default = True]: + If True, calculate feature importances from cross-validation splits + If False, use a final fit for the feature importances + + # Zero weights + # ============ + remove_zero_weighted_features [Default = True]: + When changing the feature set on tree-based estimators it is common for certain features + to have a zero-weight after refitting the model. This refits iteratively refits the + estimator until none of the features have zero weights. It is possible that this will + break poorly performing estimators and return zero features. + + maximum_tries_to_remove_zero_weighted_features [Default = 100]: + Maximum number of tries to remove zero weighted features during refits + + # Labeling + # ======== + name:str=None, + observation_type:str=None, + feature_type:str=None, + target_type:str=None, + + # Utility + # ======= + early_stopping [Default = 5]: + If the model does not improve in 5 iterations then stop. + + random_state [Default = 0]: + Random seed state used for TPESampler, cross-validation[, and estimator if one is not already set.] + + n_jobs [Default = 1]: + Number of processors to use. -1 uses all available. + + copy_X=True, + copy_y=True, + verbose=1, + log=sys.stdout, +""" + +class BayesianClairvoyanceBase(object): + + def __init__( + self, + # Modeling + estimator, + param_space:dict, + scorer, + n_iter=10, + n_trials=50, + transformation=None, + + # Optuna + study_prefix="n_iter=", + study_timeout=None, + study_callbacks=None, + training_testing_weights = [1.0,1.0], + + # Feature selection + feature_selection_method:str="addition", + feature_selection_performance_threshold=0.0, + drop_constant_features = True, + threshold_constant_features=1, + drop_duplicate_features = True, + feature_importances_from_cv=True, + + # Zero weights + remove_zero_weighted_features=True, + maximum_tries_to_remove_zero_weighted_features=100, + + # Labeling + name=None, + observation_type=None, + feature_type=None, + target_type=None, + + # Utility + early_stopping=5, + random_state=0, + n_jobs=1, + copy_X=True, + copy_y=True, + verbose=1, + log=sys.stdout, + ): + + _bayesianclairvoyancebase_docstring + + # Method + assert_acceptable_arguments(feature_selection_method, {"addition", "elimination"}) + self.feature_selection_method = {"addition":ClairvoyanceRecursiveFeatureAddition, "elimination":ClairvoyanceRecursiveFeatureElimination}[feature_selection_method] + + # Estimator + self.estimator_name = estimator.__class__.__name__ + if is_classifier(estimator): + self.estimator_type = "classifier" + if is_regressor(estimator): + self.estimator_type = "regressor" + self.estimator = clone(estimator) + + if "random_state" in self.estimator.get_params(deep=True): + query = self.estimator.get_params(deep=True)["random_state"] + if query is None: + if verbose > 0: + print("Updating `random_state=None` in `estimator` to `random_state={}`".format(random_state), file=log) + self.estimator.set_params(random_state=random_state) + self.feature_weight_attribute = get_feature_importance_attribute(estimator, "auto") + assert len(param_space) > 0, "`param_space` must have at least 1 key:[value_1, value_2, ..., value_n] pair" + self.param_space = self._check_param_space(estimator, param_space) + + # Training testing weights + training_testing_weights = np.asarray(training_testing_weights).astype(float) + msg = "`training_testing_weights` must be a float vector with values in the range [0,1]" + assert training_testing_weights.size == 2, msg + assert np.min(training_testing_weights) >= 0, msg + assert np.max(training_testing_weights) <= 1, msg + self.training_testing_weights = training_testing_weights + + + # Set attributes + self.name = name + self.observation_type = observation_type + self.feature_type = feature_type + self.target_type = target_type + self.is_fitted = False + self.testing_set_provided = False + self.n_iter = n_iter + self.n_trials = n_trials + self.n_jobs = n_jobs + self.early_stopping = early_stopping + self.study_prefix= study_prefix + self.study_timeout = study_timeout + self.study_callbacks = study_callbacks + + self.random_state = random_state + if isinstance(scorer, str): + scorer = get_scorer(scorer) + self.scorer = scorer + self.scorer_name = scorer._score_func.__name__ + self.verbose = verbose + self.log = log + + # Data + assert_acceptable_arguments(transformation, {None,"clr","closure", "clr_with_multiplicative_replacement"}) + self.transformation = transformation + self.copy_X = copy_X + self.copy_y = copy_y + + # Feature selection + self.feature_selection_performance_threshold = feature_selection_performance_threshold + self.feature_importances_from_cv = feature_importances_from_cv + self.drop_constant_features = drop_constant_features + self.threshold_constant_features = threshold_constant_features + self.drop_duplicate_features = drop_duplicate_features + self.remove_zero_weighted_features=remove_zero_weighted_features + self.maximum_tries_to_remove_zero_weighted_features=maximum_tries_to_remove_zero_weighted_features + + def __repr__(self): + pad = 4 + header = format_header("{}(Name:{})".format(self.__class__.__name__, self.name),line_character="=") + n = len(header.split("\n")[0]) + fields = [ + header, + pad*" " + "* Estimator: {}".format(self.estimator_name), + pad*" " + "* Estimator Type: {}".format(self.estimator_type), + pad*" " + "* Parameter Space: {}".format(self.param_space), + pad*" " + "* Scorer: {}".format(self.scorer_name), + + pad*" " + "- -- --- ----- -------- -------------", + + pad*" " + "* n_iter: {}".format(self.n_iter), + pad*" " + "* n_jobs: {}".format(self.n_jobs), + pad*" " + "* early_stopping: {}".format(self.early_stopping), + + pad*" " + "* random_state: {}".format(self.random_state), + + pad*" " + "- -- --- ----- -------- -------------", + pad*" " + "* Feature Selection Method: {}".format(self.feature_selection_method.__name__), + pad*" " + "* Feature Weight Attribute: {}".format(self.feature_weight_attribute), + pad*" " + "* Transformation: {}".format(self.transformation), + pad*" " + "* Remove Zero Weighted Features: {}".format(self.remove_zero_weighted_features), + pad*" " + "* Maximum Tries to Remove: {}".format(self.maximum_tries_to_remove_zero_weighted_features), + + pad*" " + "- -- --- ----- -------- -------------", + + pad*" " + "* Observation Type: {}".format(self.observation_type), + pad*" " + "* Feature Type: {}".format(self.feature_type), + pad*" " + "* Target Type: {}".format(self.target_type), + + pad*" " + "- -- --- ----- -------- -------------", + + pad*" " + "* Fitted: {}".format(self.is_fitted), + # pad*" " + "* Fitted(RCI): {}".format(self.is_fitted_rci), + + + ] + + return "\n".join(fields) + + def _check_param_space(self, estimator, param_space): + """ + estimator: A sklearn-compatible estimator + param_space: dict with {name_param: [suggestion_type, *]} + + suggestion_types: {"categorical", "discrete_uniform", "float", "int", "loguniform", "uniform"} + + categorical suggestion types must contain 2 items (e.g., [categorical, ['a','b','c']]) + uniform/loguniform suggestion types must contain 3 items [uniform/loguniform, low, high] + float/int suggestion type must contain either 3 items [float/int, low, high]) or 4 items [float/int, low, high, {step:float/int, log:bool}] + """ + # suggest_categorical() + # Suggest a value for the categorical parameter. + # suggest_discrete_uniform(name, low, high, q) + # Suggest a value for the discrete parameter. + # suggest_float(name, low, high, *[, step, log]) + # Suggest a value for the floating point parameter. + # suggest_int(name, low, high, *[, step, log]) + # Suggest a value for the integer parameter. + # suggest_loguniform(name, low, high) + # Suggest a value for the continuous parameter. + # suggest_uniform(name, low, high) + # Suggest a value for the continuous parameter. + + # Check if parameter names are valid + param_space = copy.deepcopy(param_space) + estimator_params = set(estimator.get_params(deep=True).keys()) + query_params = set(param_space.keys()) + assert query_params <= estimator_params, "The following parameters are not recognized for estimator {}:\n{}".format(estimator.__class__.__name__, "\n".join(sorted(query_params - estimator_params))) + + suggestion_types = {"categorical", "discrete_uniform", "float", "int", "loguniform", "uniform"} + for k, v in param_space.items(): + assert hasattr(v, "__iter__") & (not isinstance(v, str)), "space must be iterable" + assert len(v) > 1, "space must use the following format: [suggestion_type, *values] (e.g., [categorical, ['a','b','c']]\n[int, 1, 100])" + query_suggestion_type = v[0] + assert_acceptable_arguments(query_suggestion_type, suggestion_types) + if query_suggestion_type in {"categorical"}: + assert len(v) == 2, "categorical suggestion types must contain 2 items (e.g., [categorical, ['a','b','c']])" + assert hasattr(v[1], "__iter__") & (not isinstance(v[1], str)), "categorical suggestion types must contain 2 items [categorical, ['a','b','c']]" + if query_suggestion_type in {"uniform", "loguniform"}: + assert len(v) == 3, "uniform/loguniform suggestion types must contain 3 items [uniform/loguniform, low, high]" + if query_suggestion_type in {"discrete_uniform"}: + assert len(v) == 4, "discrete_uniform suggestion type must contain 4 items [discrete_uniform, low, high, q]" + if query_suggestion_type in {"float", "int"}: + suggest_float_int_error_message = "float/int suggestion type must contain either 3 items [float/int, low, high]) or 4 items [float/int, low, high, {step:float/int, log:bool}]" + assert len(v) in {3,4}, suggest_float_int_error_message + if len(v) == 3: + param_space[k] = [*v, {}] + if len(v) == 4: + query_dict = v[-1] + assert isinstance(query_dict, Mapping), suggest_float_int_error_message + query_dict = dict(query_dict) + assert set(query_dict.keys()) <= {"step", "log"}, suggest_float_int_error_message + if "step" in query_dict: + assert isinstance(query_dict["step"], (float, int)), suggest_float_int_error_message + if "log" in query_dict: + assert isinstance(query_dict["log"], bool), suggest_float_int_error_message + param_space[k] = [*v[:-1], query_dict] + return param_space + + def _compile_param_space(self, trial, param_space): + params = dict() + for k, v in param_space.items(): + suggestion_type = v[0] + suggest = getattr(trial, f"suggest_{suggestion_type}") + if suggestion_type in {"float", "int"}: + suggestion = suggest(k, *v[1:-1], **v[-1]) + else: + suggestion = suggest(k, *v[1:]) + params[k] = suggestion + return params + + def _optimize_hyperparameters(self, X, y, study_name, sampler, **study_kws): # test set here? + def _objective(trial): + + # Compile parameters + params = self._compile_param_space(trial, self.param_space) + + estimator = clone(self.estimator) + estimator.set_params(**params) + + cv_results = cross_val_score(estimator, X, y, scoring=self.scorer, n_jobs=self.n_jobs, cv=self.cv_splits_) + + return cv_results.mean() + + direction = "maximize" + study = optuna.create_study(direction=direction, study_name=study_name, sampler=sampler, **study_kws) + study.optimize(_objective, n_trials=self.n_trials, timeout=self.study_timeout, show_progress_bar=self.verbose >= 2, callbacks=self.study_callbacks, gc_after_trial=True) + return study + + def _optimize_hyperparameters_include_testing(self, X, y, X_testing, y_testing, study_name, sampler, **study_kws): # test set here? + def _objective(trial): + + # Compile parameters + params = self._compile_param_space(trial, self.param_space) + estimator = clone(self.estimator) + estimator.set_params(**params) + estimator.fit(X, y) + testing_score = self.scorer(estimator, X_testing, y_testing) + + cv_results = cross_val_score(estimator, X, y, scoring=self.scorer, n_jobs=self.n_jobs, cv=self.cv_splits_) + + return [cv_results.mean(), testing_score] + # if direction == "auto": + # direction = {"regressor":"minimize", "classifier":"maximize"}[self.estimator_type] + directions = ["maximize", "maximize"] + study = optuna.create_study(directions=directions, study_name=study_name, sampler=sampler, **study_kws) + study.optimize(_objective, n_trials=self.n_trials, timeout=self.study_timeout, show_progress_bar=self.verbose >= 2, callbacks=self.study_callbacks, gc_after_trial=True) + return study + + def _feature_selection(self, estimator, X, y, X_testing, y_testing, study_name): + + # initial_testing_score = np.nan + # if self.testing_set_provided: + # estimator.fit(X, y) + # initial_testing_score = self.scorer(estimator, X_testing, y_testing) + + # Feature selection + model_fs = self.feature_selection_method( + estimator=estimator, + scoring=self.scorer, + cv=self.cv_splits_, + threshold=self.feature_selection_performance_threshold, + feature_importances_from_cv=self.feature_importances_from_cv, + transformation=self.transformation, + remove_zero_weighted_features=self.remove_zero_weighted_features, + maximum_tries_to_remove_zero_weighted_features=self.maximum_tries_to_remove_zero_weighted_features, + verbose=self.verbose, + ) + model_fs.fit(X, y) + + selected_features = model_fs.selected_features_ + feature_selected_training_cv = model_fs.feature_selected_model_cv_ + feature_selected_testing_score = np.nan + if self.testing_set_provided: + X_training_query = X.loc[:,selected_features] + X_testing_query = X_testing.loc[:,selected_features] + if self.transformation is not None: + X_training_query = self.transformation(X_training_query) + X_testing_query = self.transformation(X_testing_query) + + estimator.fit(X_training_query, y) + feature_selected_testing_score = self.scorer(estimator, X_testing_query, y_testing) + + # Show the feature weights be scaled? Before or after + return (selected_features, model_fs.initial_feature_importances_, model_fs.feature_selected_importances_, model_fs.performance_drifts_, feature_selected_training_cv, feature_selected_testing_score) + + + def _fit(self, X, y, cv, X_testing=None, y_testing=None, optimize_with_training_and_testing="auto", **study_kws): # How to use the test set here? + if self.copy_X: + self.X_ = X.copy() + if self.copy_y: + self.y_ = y.copy() + + # Cross validation + self.cv_splits_, self.cv_labels_ = format_cross_validation(cv, X=X, y=y, random_state=self.random_state, stratify=self.estimator_type == "classifier") + + # Testing + self.testing_set_provided = check_testing_set(X.columns, X_testing, y_testing) + if optimize_with_training_and_testing == "auto": + optimize_with_training_and_testing = self.testing_set_provided + if optimize_with_training_and_testing: + assert self.testing_set_provided, "If `optimize_with_training_and_testing=True` then X_testing and y_testing must be provided." + + self.studies_ = OrderedDict() + self.results_ = OrderedDict() + self.feature_weights_ = OrderedDict() + self.feature_selection_performance_drifts_ = OrderedDict() + self.feature_weights_initial_ = OrderedDict() + # self.feature_selection_ = dict() + + if self.drop_constant_features: + model_dcf = DropConstantFeatures(tol=self.threshold_constant_features) + features_to_drop = set() + for cv_labels, (indices_training, indices_testing) in zip(self.cv_labels_, self.cv_splits_): + model_dcf.fit(X.iloc[indices_training]) + features_to_drop |= set(model_dcf.features_to_drop_) + if self.verbose > 0: + n_dropped_features = len(features_to_drop) + if n_dropped_features > 0: + print("Dropping {} constant features based on {} threshold: {}".format(n_dropped_features, self.threshold_constant_features, sorted(features_to_drop)), file=self.log) + X = X.drop(features_to_drop, axis=1) + del model_dcf + + if self.drop_duplicate_features: + model_ddf = DropConstantFeatures(tol=self.threshold_constant_features) + features_to_drop = set() + for cv_labels, (indices_training, indices_testing) in zip(self.cv_labels_, self.cv_splits_): + model_ddf.fit(X.iloc[indices_training]) + features_to_drop |= set(model_ddf.features_to_drop_) + if self.verbose > 0: + n_dropped_features = len(features_to_drop) + if n_dropped_features > 0: + print("Dropping {} duplicate features: {}".format(n_dropped_features, sorted(features_to_drop)), file=self.log) + X = X.drop(features_to_drop, axis=1) + del model_ddf + + query_features = X.columns + best_score = -np.inf + no_progress = 0 + for i in range(1, self.n_iter+1): + if no_progress > self.early_stopping: + warnings.warn(f"Stopping because score has not improved from {best_score} with {len(query_features)} features in {self.early_stopping} iterations") + break + else: + if len(query_features) > 1: + # Study + study_name = f"{self.study_prefix}{i}" + sampler = optuna.samplers.TPESampler(seed=self.random_state + i) + if optimize_with_training_and_testing: + self.multiobjective_ = True + study = self._optimize_hyperparameters_include_testing( + X=X.loc[:,query_features], + y=y, + X_testing=X_testing.loc[:,query_features], + y_testing=y_testing, + study_name=study_name, + sampler=sampler, + **study_kws, + ) + else: + self.multiobjective_ = False + study = self._optimize_hyperparameters( + X=X.loc[:,query_features], + y=y, + study_name=study_name, + sampler=sampler, + **study_kws, + ) + self.studies_[study_name] = study + + # Fit + initial_testing_score = np.nan + best_estimator = clone(self.estimator) + + if optimize_with_training_and_testing: + # Determine best trial from multiobjective study + weighted_scores = list() + for trial in study.best_trials: + scores = trial.values + ws = np.mean(scores * self.training_testing_weights) + weighted_scores.append(ws) + best_trial = study.best_trials[np.argmax(weighted_scores)] + initial_training_score, initial_testing_score = best_trial.values + + # Refit estimator with best params + best_params = best_trial.params + best_estimator.set_params(**best_params) + best_estimator.fit(X.loc[:,query_features], y) + else: + best_trial = study.best_trial + initial_training_score = study.best_value + + # Refit estimator with best params + best_params = study.best_params + best_estimator.set_params(**best_params) + best_estimator.fit(X.loc[:,query_features], y) + + # Get initial testing score + if self.testing_set_provided: + initial_testing_score = self.scorer(best_estimator, X_testing.loc[:,query_features], y_testing) + + # # Feature selection + selected_features, initial_feature_weights, feature_weights, feature_selection_performance_drifts, feature_selected_training_cv, feature_selected_testing_score = self._feature_selection( + estimator=best_estimator, + X=X.loc[:,query_features], + y=y, + X_testing=X_testing.loc[:,query_features] if self.testing_set_provided else None, + y_testing=y_testing, + # query_features=query_features, + study_name=study_name, + ) + feature_selected_training_score = feature_selected_training_cv.mean() + + self.feature_selection_performance_drifts_[study_name] = feature_selection_performance_drifts + self.feature_weights_initial_[study_name] = initial_feature_weights + self.feature_weights_[study_name] = feature_weights + + self.results_[study_name] = { + "best_hyperparameters":best_params, + "best_estimator":best_estimator, + "best_trial":best_trial, + "number_of_initial_features":len(query_features), + "initial_training_score":initial_training_score, + "initial_testing_score":initial_testing_score, + "number_of_selected_features":len(selected_features), + "feature_selected_training_score":feature_selected_training_score, + "feature_selected_testing_score":feature_selected_testing_score, + "selected_features":list(selected_features), + } + # print(study.best_value, model_fs.initial_model_performance_, feature_selected_training_performance) + print(f"Synopsis[{study_name}] Input Features: {len(query_features)}, Selected Features: {len(selected_features)}", file=self.log) + print(f"Initial Training Score: {initial_training_score}, Feature Selected Training Score: {feature_selected_training_score}", file=self.log) + if self.testing_set_provided: + print(f"Initial Testing Score: {initial_testing_score}, Feature Selected Testing Score: {feature_selected_testing_score}", file=self.log) + print(file=self.log) + + query_features = selected_features + if feature_selected_training_score > best_score: + best_score = feature_selected_training_score + else: + no_progress += 1 + else: + warnings.warn(f"Stopping because < 2 features remain {query_features}") + if len(query_features) == 0: + raise ValueError("0 features selected using current parameters. Try preprocessing feature matrix, using a different transformation, or a different estimator.") + break + self.is_fitted = True + return self + + def _get_results(self): + assert self.is_fitted, "Please `.fit` model before `.get_results()`" + df = pd.DataFrame(self.results_).T + df.index.name = "study_name" + return df + + def fit(self, X, y, cv=3, X_testing=None, y_testing=None, optimize_with_training_and_testing="auto", **study_kws): + self._fit( + X=X, + y=y, + cv=cv, + X_testing=X_testing, + y_testing=y_testing, + optimize_with_training_and_testing=optimize_with_training_and_testing, + **study_kws, + ) + self.results_ = self._get_results() + return self + + def fit_transform(self, X, y, cv=3, X_testing=None, y_testing=None, optimize_with_training_and_testing="auto", **study_kws): + self._fit( + X=X, + y=y, + cv=cv, + X_testing=X_testing, + y_testing=y_testing, + optimize_with_training_and_testing=optimize_with_training_and_testing, + **study_kws, + ) + self.results_ = self._get_results() + return self.results_ + + + def to_file(self, filepath): + self.log = None + write_object(self, filepath) + + + @classmethod + def from_file(cls, filepath, log=sys.stdout): + cls = read_object(filepath) + cls.log = log + return cls + + def copy(self): + return copy.deepcopy(self) + + +class BayesianClairvoyanceClassification(BayesianClairvoyanceBase): + def __init__(self, *args, **kwargs): + if "scorer" not in kwargs: + kwargs["scorer"] = "accuracy" + super(BayesianClairvoyanceClassification, self).__init__(*args, **kwargs) + + # Additional initialization for BayesianClairvoyanceClassification if needed + +class BayesianClairvoyanceRegression(BayesianClairvoyanceBase): + def __init__(self, *args, **kwargs): + if "scorer" not in kwargs: + kwargs["scorer"] = "neg_root_mean_squared_error" + super(BayesianClairvoyanceRegression, self).__init__(*args, **kwargs) + \ No newline at end of file diff --git a/clairvoyance/feature_selection.py b/clairvoyance/feature_selection.py new file mode 100644 index 0000000..8c8bb99 --- /dev/null +++ b/clairvoyance/feature_selection.py @@ -0,0 +1,1069 @@ +import sys,warnings +from typing import List, Union + +import numpy as np +import pandas as pd +from scipy.stats import sem +from sklearn.model_selection import cross_validate +from sklearn.metrics import get_scorer, make_scorer +from sklearn.base import is_classifier, is_regressor + +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) + +from feature_engine._docstrings.init_parameters.selection import ( + _confirm_variables_docstring, + _estimator_docstring, + +) + +from feature_engine.dataframe_checks import check_X_y +from feature_engine.selection.base_selection_functions import get_feature_importances +from feature_engine.selection.base_selector import BaseSelector +from feature_engine.tags import _return_tags +from feature_engine.variable_handling import ( + check_numerical_variables, + find_numerical_variables, + retain_variables_if_in_df, +) + +from feature_engine._docstrings.methods import _fit_transform_docstring + +from feature_engine._docstrings.fit_attributes import ( + _feature_importances_docstring, + _feature_names_in_docstring, + _n_features_in_docstring, + _performance_drifts_docstring, +) + +from feature_engine._docstrings.selection._docstring import ( + _cv_docstring, + _features_to_drop_docstring, + _fit_docstring, + _get_support_docstring, + _initial_model_performance_docstring, + _scoring_docstring, + _threshold_docstring, + _transform_docstring, + _variables_attribute_docstring, + _variables_numerical_docstring, +) +from feature_engine._docstrings.substitute import Substitution + +from tqdm import tqdm +from .utils import ( + assert_acceptable_arguments, + check_testing_set, + format_cross_validation, + format_feature_importances_from_cv, + format_feature_importances_from_data, +) +from .transformations import ( + closure, + clr, + clr_with_multiplicative_replacement, +) + +Variables = Union[None, int, str, List[Union[str, int]]] + + +_get_transformation_docstring = """ + transformation: str,callable, default=None + If provided, must be either 'closure', 'clr', or 'clr_with_multiplicative_replacement'. + Can also be callable that transforms a pd.DataFrame. This is useful + for feature selection on transformations that are based on the feature set such + as center log-ratio or closure transformations in compositional data. +""" + +_feature_importances_sem_docstring = """ + feature_importances_sem_: + Pandas Series with the SEM of feature importance (comes from step 2) +""" + +_get_remove_zero_weighted_features_docstring = """ + remove_zero_weighted_features_docstring: bool, default=True + Remove zero weighted features from feature selection +""" + +_threshold_docstring = _threshold_docstring.replace("default = 0.01", "default = 0.0") + + + +# Functions +def remove_zero_weighted_features( # Make this easier to use. Way too specific right now... + estimator, + X, + y, + selected_features, + initial_feature_importances, + initial_feature_importances_sem, + feature_selected_training_cv, + feature_selected_testing_score=np.nan, + transformation=None, + X_testing=None, + y_testing=None, + n_jobs=1, + cv=3, + scorer="auto", + feature_importances_from_cv=True, + maximum_tries_to_remove_zero_weighted_features=100, + verbose=1, + log=sys.stderr, + i=0, + ): + # Determine scoring + if scorer == "auto": + if is_classifier(estimator): + scorer = "accuracy" + warnings.warn("Detected estimator as classifier. Setting scorer to {}".format(scorer)) + if is_regressor(estimator): + scorer = "neg_root_mean_squared_error" + warnings.warn("Detected estimator as regressor. Setting scoring to {}".format(scorer)) + assert scorer != "auto", "`estimator` not recognized as classifier or regressor by sklearn. Are you using a sklearn-compatible estimator?" + if isinstance(scorer, str): + scorer = get_scorer(scorer) + + # Testing set + testing_set_provided = check_testing_set(X.columns, X_testing, y_testing) + + # Initial feature importances + feature_importances = initial_feature_importances.copy() + feature_importances_sem = initial_feature_importances_sem.copy() + feature_selected_training_score = feature_selected_training_cv.mean() + + + initial_feature_importances_equal_zero = initial_feature_importances == 0 + + n_features = len(selected_features) + if verbose > 0: + warnings.warn("Detected {}/{} zero-weighted features in final fitted model. Refitting to remove zero-weighted features".format(initial_feature_importances_equal_zero.sum(), n_features)) + features = list(initial_feature_importances[~initial_feature_importances_equal_zero].index) + for j in range(1, maximum_tries_to_remove_zero_weighted_features+1): + X_query = X.loc[:,features] + if all([ + transformation is not None, + len(features) > 1, + ]): + X_query = transformation(X_query) + # Cross validate + try: + cv_results = cross_validate( + estimator, + X_query, + y, + scoring=scorer, + n_jobs=n_jobs, + cv=cv, + return_estimator=True, + ) + except ValueError as e: + + # raise Exception("Cross-validation failed.\nNumber of samples: {}, Number of features: {}. Features: ".format(*X_query.shape, list(X_query.columns))) + if not isinstance(cv, int): + cv = len(cv) + return { + "selected_features":[], + "feature_importances":pd.Series([]), + "feature_importances_sem":pd.Series([]), + "feature_selected_training_cv":np.asarray(cv*[np.nan]), + "feature_selected_testing_score":np.nan, + } + + + # Summarize feature/coeff importance for each cross validation fold + if feature_importances_from_cv: + feature_selected_model_cv = cv_results["test_score"] + feature_importances_results = format_feature_importances_from_cv( + cv_results=cv_results, + features=X_query.columns, + ) + # Summarize feature/coeff importance for entire dataset + else: + feature_importances_results = format_feature_importances_from_data( + estimator=estimator, + X=X_query, + y=y, + ) + feature_selected_importances = feature_importances_results["mean"] + feature_selected_importances_sem = feature_importances_results["sem"] + + + mask_zero_weight_features = feature_selected_importances != 0 + + if np.all(mask_zero_weight_features): + updated_feature_selected_training_score = cv_results["test_score"].mean() + + # X_fs = transform(X=X_query, method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) + if verbose > 0: + + # if j > 0: + if updated_feature_selected_training_score > feature_selected_training_score: + msg = "[Success][Iteration={}, Try={}]: ☺ Removed all zero weighted features and the new training score improved from {} -> {}. The following {}/{} features remain: {}".format(i, j, feature_selected_training_score, updated_feature_selected_training_score, len(features), n_features, list(features)) + else: + msg = "[Success][Iteration={}, Try={}]: Removed all zero weighted features but the new training score declined from {} -> {}. The following {}/{} features remain: {}".format(i, j, feature_selected_training_score, updated_feature_selected_training_score, len(features), n_features, list(features)) + + print(msg, file=log) + + # Update these and stop: + # selected_features, feature_importances, performance_drifts, feature_selected_training_score, feature_selected_testing_score + selected_features = pd.Index(features) + feature_importances = feature_selected_importances.loc[features] + feature_importances_sem = feature_selected_importances_sem.loc[features] + + feature_selected_training_score = updated_feature_selected_training_score + feature_selected_training_cv = cv_results["test_score"] + + # Testing set with updated features + if testing_set_provided: + estimator.fit(X_query, y) + updated_feature_selected_testing_score = scorer(estimator, X_testing.loc[:,features], y_testing) + if verbose > 0: + # if j > 0: + if updated_feature_selected_testing_score > feature_selected_testing_score: + msg = "[Success][Iteration={}, Try={}]: ☺ New testing score improved from {} -> {}".format(i, j, feature_selected_testing_score, updated_feature_selected_testing_score) + else: + msg = "[Success][Iteration={}, Try={}]: New testing score declined from {} -> {}".format(i, j, feature_selected_testing_score, updated_feature_selected_testing_score) + print(msg, file=log) + feature_selected_testing_score = updated_feature_selected_testing_score + + break + else: + if verbose > 2: + if j > 0: + print("[...][Iteration={}, Try={}]: Removing {} features as they have zero weight in fitted model: {}".format(i, j, len(mask_zero_weight_features) - np.sum(mask_zero_weight_features), X_query.columns[~mask_zero_weight_features].tolist()), file=log) + features = X_query.columns[mask_zero_weight_features].tolist() + + return { + "selected_features":selected_features, + "feature_importances":feature_importances, + "feature_importances_sem":feature_importances_sem, + "feature_selected_training_cv":feature_selected_training_cv, + "feature_selected_testing_score":feature_selected_testing_score, + } + + +# Classes +class ClairvoyanceBaseRecursiveSelector(BaseSelector): + """ + This class has been modified from the BaseRecursiveSelector class from + the Feature Engine Python package (https://github.com/feature-engine/feature_engine) + + Shared functionality for recursive selectors. + + Parameters + ---------- + estimator: object + A Scikit-learn estimator for regression or classification. + The estimator must have either a `feature_importances` or `coef_` attribute + after fitting. + + variables: str or list, default=None + The list of variable to be evaluated. If None, the transformer will evaluate + all numerical features in the dataset. + + scoring: str, default='roc_auc' + Desired metric to optimise the performance of the estimator. Comes from + sklearn.metrics. See the model evaluation documentation for more options: + https://scikit-learn.org/stable/modules/model_evaluation.html + + threshold: float, int, default = 0.01 + The value that defines if a feature will be kept or removed. Note that for + metrics like roc-auc, r2_score and accuracy, the thresholds will be floats + between 0 and 1. For metrics like the mean_square_error and the + root_mean_square_error the threshold can be a big number. + The threshold must be defined by the user. Bigger thresholds will select less + features. + + cv: int, cross-validation generator or an iterable, default=3 + Determines the cross-validation splitting strategy. Possible inputs for cv are: + + - None, to use cross_validate's default 5-fold cross validation + + - int, to specify the number of folds in a (Stratified)KFold, + + - CV splitter + - (https://scikit-learn.org/stable/glossary.html#term-CV-splitter) + + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and y is either binary or + multiclass, StratifiedKFold is used. In all other cases, KFold is used. These + splitters are instantiated with `shuffle=False` so the splits will be the same + across calls. For more details check Scikit-learn's `cross_validate`'s + documentation. + + confirm_variables: bool, default=False + If set to True, variables that are not present in the input dataframe will be + removed from the list of variables. Only used when passing a variable list to + the parameter `variables`. See parameter variables for more details. + + {transformation} + + {remove_zero_weighted_features} + + Attributes + ---------- + initial_model_performance_: + Performance of the model trained using the original dataset. + + feature_importances_: + Pandas Series with the feature importance (comes from step 2) + + feature_importances_sem_: + Pandas Series with the SEM of feature importance (comes from step 2) + + performance_drifts_: + Dictionary with the performance drift per examined feature (comes from step 5). + + features_to_drop_: + List with the features to remove from the dataset. + + variables_: + The variables that will be considered for the feature selection. + + feature_names_in_: + List with the names of features seen during `fit`. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + Find the important features. + """.format( + transformation=_get_transformation_docstring, + remove_zero_weighted_features=_get_remove_zero_weighted_features_docstring, + ) + + def __init__( + self, + estimator, + scoring: str = "auto", + cv=3, + n_jobs=1, + threshold: Union[int, float] = 0.0, + variables: Variables = None, + confirm_variables: bool = False, + feature_importances_from_cv=True, + transformation = None, + remove_zero_weighted_features=True, + maximum_tries_to_remove_zero_weighted_features=100, + verbose=1, + ): + + if not isinstance(threshold, (int, float)): + raise ValueError("threshold can only be integer or float") + + if isinstance(transformation, str): + assert_acceptable_arguments(transformation, {"closure", "clr", "clr_with_multiplicative_replacement"}) + transformation = globals()[transformation] + + if transformation is not None: + assert hasattr(transformation, "__call__"), "If `transform` is not None, then it must be a callable function" + + if scoring == "auto": + if is_classifier(estimator): + scoring = "accuracy" + warnings.warn("Detected estimator as classifier. Setting scoring to {}".format(scoring)) + if is_regressor(estimator): + scoring = "neg_root_mean_squared_error" + warnings.warn("Detected estimator as regressor. Setting scoring to {}".format(scoring)) + assert scoring != "auto", "`estimator` not recognized as classifier or regressor by sklearn. Are you using a sklearn-compatible estimator?" + + super().__init__(confirm_variables) + self.variables = _check_variables_input_value(variables) + self.estimator = estimator + self.scoring = scoring + self.threshold = threshold + self.cv = cv + self.n_jobs = n_jobs + self.feature_importances_from_cv = feature_importances_from_cv + self.transformation = transformation + self.remove_zero_weighted_features=remove_zero_weighted_features + self.maximum_tries_to_remove_zero_weighted_features = maximum_tries_to_remove_zero_weighted_features + self.verbose = verbose + + + def fit(self, X: pd.DataFrame, y: pd.Series): + """ + Find initial model performance. Sort features by importance. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The input dataframe + + y: array-like of shape (n_samples) + Target variable. Required to train the estimator. + """ + + # check input dataframe + X, y = check_X_y(X, y) + + if self.variables is None: + self.variables_ = find_numerical_variables(X) + else: + if self.confirm_variables is True: + variables_ = retain_variables_if_in_df(X, self.variables) + self.variables_ = check_numerical_variables(X, variables_) + else: + self.variables_ = check_numerical_variables(X, self.variables) + + # check that there are more than 1 variable to select from + self._check_variable_number() + + # save input features + X = X[self.variables_] + self._get_feature_names_in(X) + + # train model with all features and cross-validation + if X.shape[1] > 1: + initial_model_cv_results = cross_validate( + self.estimator, + X if self.transformation is None else self.transformation(X), + y, + cv=self.cv, + scoring=self.scoring, + return_estimator=True, + n_jobs=self.n_jobs, + ) + else: + initial_model_cv_results = cross_validate( + self.estimator, + X, + y, + cv=self.cv, + scoring=self.scoring, + return_estimator=True, + n_jobs=self.n_jobs, + ) + + # store initial model performance + self.initial_model_cv_ = initial_model_cv_results["test_score"] + self.initial_model_performance_ = initial_model_cv_results["test_score"].mean() + + # Summarize feature/coeff importance for each cross validation fold + if self.feature_importances_from_cv: + feature_importances_results = format_feature_importances_from_cv( + cv_results=initial_model_cv_results, + features=X.columns, + ) + # Summarize feature/coeff importance for entire dataset + else: + feature_importances_results = format_feature_importances_from_data( + estimator=self.estimator, + X=X if self.transformation is None else self.transformation(X), + y=y, + ) + + self.initial_feature_importances_ = feature_importances_results["mean"] + self.initial_feature_importances_sem_ = feature_importances_results["sem"] + + assert self.initial_feature_importances_.abs().max() > 0, "Largest feature importances is zero. Something went wrong when training model." + assert not np.all(self.initial_feature_importances_ == 0), "All feature importances weight is zero. Something went wrong when training model." + + return X, y + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["variables"] = "numerical" + tags_dict["requires_y"] = True + # add additional test that fails + tags_dict["_xfail_checks"][ + "check_parameters_default_constructible" + ] = "transformer has 1 mandatory parameter" + tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA" + + msg = "transformers need more than 1 feature to work" + tags_dict["_xfail_checks"]["check_fit2d_1feature"] = msg + + return tags_dict + + +@Substitution( + estimator=_estimator_docstring, + scoring=_scoring_docstring, + threshold=_threshold_docstring, + cv=_cv_docstring, + variables=_variables_numerical_docstring, + confirm_variables=_confirm_variables_docstring, + initial_model_performance_=_initial_model_performance_docstring, + feature_importances_=_feature_importances_docstring, + feature_importances_sem_=_feature_importances_sem_docstring, + performance_drifts_=_performance_drifts_docstring, + features_to_drop_=_features_to_drop_docstring, + variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit=_fit_docstring, + transform=_transform_docstring, + fit_transform=_fit_transform_docstring, + get_support=_get_support_docstring, + transformation=_get_transformation_docstring, + remove_zero_weighted_features=_get_remove_zero_weighted_features_docstring, +) +class ClairvoyanceRecursiveFeatureAddition(ClairvoyanceBaseRecursiveSelector): + """ + This class has been modified from the RecursiveFeatureAddition class from + the Feature Engine Python package (https://github.com/feature-engine/feature_engine) + + ClairvoyanceRecursiveFeatureAddition() selects features following a recursive addition process. + + The process is as follows: + + 1. Train an estimator using all the features. + + 2. Rank the features according to their importance derived from the estimator. + + 3. Train an estimator with the most important feature and determine performance. + + 4. Add the second most important feature and train a new estimator. + + 5. Calculate the difference in performance between estimators. + + 6. If the performance increases beyond the threshold, the feature is kept. + + 7. Repeat steps 4-6 until all features have been evaluated. + + 8. Remove zero weighted features + + If transformations are provided, then each time a feature is removed or added the data is transformed. + + Model training and performance calculation can be performed using entire dataset or cross-validation. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + {estimator} + + {variables} + + {scoring} + + {threshold} + + {cv} + + {confirm_variables} + + {transformation} + + {remove_zero_weighted_features} + + Attributes + ---------- + {initial_model_performance_} + + {feature_importances_} + + {feature_importances_sem_} + + {performance_drifts_} + + {features_to_drop_} + + {variables_} + + {feature_names_in_} + + {n_features_in_} + + + Methods + ------- + {fit} + + {fit_transform} + + {get_support} + + {transform} + + Examples + -------- + + >>> import pandas as pd + >>> from sklearn.ensemble import RandomForestClassifier + >>> from feature_engine.selection import RecursiveFeatureAddition + >>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000], + >>> x2 = [2,4,3,1,2,2], + >>> x3 = [1,1,1,0,0,0], + >>> x4 = [1,2,1,1,0,1], + >>> x5 = [1,1,1,1,1,1])) + >>> y = pd.Series([1,0,0,1,1,0]) + >>> rfa = RecursiveFeatureAddition(RandomForestClassifier(random_state=42), cv=2) + >>> rfa.fit_transform(X, y) + x2 x4 + 0 2 1 + 1 4 2 + 2 3 1 + 3 1 1 + 4 2 0 + 5 2 1 + """ + + def fit(self, X: pd.DataFrame, y: pd.Series): + """ + Find the important features. Note that the selector trains various models at + each round of selection, so it might take a while. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The input dataframe + + y: array-like of shape (n_samples) + Target variable. Required to train the estimator. + """ + + X, y = super().fit(X, y) + + # Sort the feature importance values decreasingly + self.initial_feature_importances_.sort_values(ascending=False, inplace=True) + feature_importances_tmp = self.initial_feature_importances_.copy() + if self.remove_zero_weighted_features: + feature_importances_tmp = feature_importances_tmp[lambda x: abs(x) > 0] + n_features_after_zero_removal = feature_importances_tmp.size + n_features_initial = self.initial_feature_importances_.size + if n_features_initial > n_features_after_zero_removal: + warnings.warn("remove_zero_weighted_features=True and removed {}/{} features".format((n_features_initial - n_features_after_zero_removal), n_features_initial)) + + # Extract most important feature from the ordered list of features + first_most_important_feature = list(feature_importances_tmp.index)[0] + + # Run baseline model using only the most important feature + X_1 = X[first_most_important_feature].to_frame() + baseline_model_cv_results = cross_validate( + self.estimator, + X_1, + y, + cv=self.cv, + scoring=self.scoring, + return_estimator=True, + n_jobs=self.n_jobs, + ) + + # Save baseline model performance + baseline_model_performance = baseline_model_cv_results["test_score"].mean() + feature_selected_model_cv = baseline_model_cv_results["test_score"].copy() + + # Summarize feature/coeff importance for each cross validation fold + if self.feature_importances_from_cv: + feature_importances_results = format_feature_importances_from_cv( + cv_results=baseline_model_cv_results, + features=X_1.columns, + ) + # Summarize feature/coeff importance for entire dataset + else: + feature_importances_results = format_feature_importances_from_data( + estimator=self.estimator, + X=X_1, # This isn't needed because it's always going to be a single feature :if self.transformation is None else self.transformation(X_1), + y=y, + ) + + feature_selected_importances = feature_importances_results["mean"] + feature_selected_importances_sem = feature_importances_results["sem"] + + # list to collect selected features + # It is initialized with the most important feature + _selected_features = [first_most_important_feature] + + # dict to collect features and their performance_drift + # It is initialized with the performance drift of + # the most important feature + self.performance_drifts_ = {first_most_important_feature: 0} + + # loop over the ordered list of features by feature importance starting + # from the second element in the list. + for feature in tqdm(list(feature_importances_tmp.index)[1:], desc="Recursive feature addition"): + X_tmp = X[_selected_features + [feature]] + + # Add feature and train new model + query_model_cv_results = cross_validate( + self.estimator, + X_tmp if self.transformation is None else self.transformation(X_tmp), + y, + cv=self.cv, + scoring=self.scoring, + return_estimator=True, + ) + + # assign new model performance + query_model_performance = query_model_cv_results["test_score"].mean() + + # Calculate performance drift + performance_drift = query_model_performance - baseline_model_performance + + # Save feature and performance drift + self.performance_drifts_[feature] = performance_drift + + # Update selected features if performance improves + if performance_drift > self.threshold: + # add feature to the list of selected features + _selected_features.append(feature) + + # Update new baseline model performance + baseline_model_performance = query_model_performance + feature_selected_model_cv = query_model_cv_results["test_score"] + + # Summarize feature/coeff importance for each cross validation fold + if self.feature_importances_from_cv: + feature_importances_results = format_feature_importances_from_cv( + cv_results=query_model_cv_results, + features=X_tmp.columns, + ) + # Summarize feature/coeff importance for entire dataset + else: + feature_importances_results = format_feature_importances_from_data( + estimator=self.estimator, + X=X_tmp if self.transformation is None else self.transformation(X_tmp), + y=y, + ) + feature_selected_importances = feature_importances_results["mean"] + feature_selected_importances_sem = feature_importances_results["sem"] + + # Remove zero-weighted features + if self.remove_zero_weighted_features: + if np.any(feature_selected_importances == 0): + cleaned_results = remove_zero_weighted_features( + estimator=self.estimator, + X=X_tmp.loc[:,_selected_features], + y=y, + selected_features=_selected_features, + initial_feature_importances=feature_selected_importances.loc[_selected_features], + initial_feature_importances_sem=feature_selected_importances_sem.loc[_selected_features], + feature_selected_training_cv=feature_selected_model_cv, + feature_selected_testing_score=np.nan, + transformation=self.transformation, + X_testing=None, + y_testing=None, + n_jobs=self.n_jobs, + cv=self.cv, + scorer=self.scoring, + feature_importances_from_cv=self.feature_importances_from_cv, + maximum_tries_to_remove_zero_weighted_features=self.maximum_tries_to_remove_zero_weighted_features, + verbose=self.verbose, + log=sys.stderr, + ) + # ['selected_features', 'feature_importances', 'feature_importances_sem', 'feature_selected_training_cv', 'feature_selected_testing_performance'] + if len(cleaned_results["selected_features"]) >= 1: + _selected_features = cleaned_results["selected_features"] + feature_selected_importances = cleaned_results["feature_importances"] + feature_selected_importances_sem = cleaned_results["feature_importances_sem"] + feature_selected_model_cv = cleaned_results["feature_selected_training_cv"] + + self.features_to_drop_ = [ + f for f in self.variables_ if f not in _selected_features + ] + + + a = feature_selected_importances.size + b = len(_selected_features) + assert a == b, "Number of selected feature importances (N={}) is different than number of selected features (N={})".format(a, b) + if a == 0: + raise ValueError("0 features selected using current parameters. Try preprocessing feature matrix, using a different transformation, or a different estimator.") + self.selected_features_ = list(_selected_features) + self.feature_selected_importances_ = feature_selected_importances.loc[_selected_features] + self.feature_selected_importances_sem_ = feature_selected_importances_sem.loc[_selected_features] + self.feature_selected_model_cv_ = feature_selected_model_cv + self.feature_selected_model_performance_ = feature_selected_model_cv.mean() + + return self + +@Substitution( + estimator=_estimator_docstring, + scoring=_scoring_docstring, + threshold=_threshold_docstring, + cv=_cv_docstring, + variables=_variables_numerical_docstring, + confirm_variables=_confirm_variables_docstring, + initial_model_performance_=_initial_model_performance_docstring, + feature_importances_=_feature_importances_docstring, + performance_drifts_=_performance_drifts_docstring, + features_to_drop_=_features_to_drop_docstring, + variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit=_fit_docstring, + transform=_transform_docstring, + fit_transform=_fit_transform_docstring, + get_support=_get_support_docstring, + transformation=_get_transformation_docstring, + remove_zero_weighted_features=_get_remove_zero_weighted_features_docstring, +) +class ClairvoyanceRecursiveFeatureElimination(ClairvoyanceBaseRecursiveSelector): + """ + This class has been modified from the RecursiveFeatureElimination class from + the Feature Engine Python package (https://github.com/feature-engine/feature_engine) + + ClairvoyanceRecursiveFeatureElimination() selects features following a recursive elimination + process. + + The process is as follows: + + 1. Train an estimator using all the features. + + 2. Rank the features according to their importance derived from the estimator. + + 3. Remove the least important feature and fit a new estimator. + + 4. Calculate the performance of the new estimator. + + 5. Calculate the performance difference between the new and original estimator. + + 6. If the performance drop is below the threshold the feature is removed. + + 7. Repeat steps 3-6 until all features have been evaluated. + + 8. Remove zero weighted features + + If transformations are provided, then each time a feature is removed or added the data is transformed. + + Model training and performance calculation can be performed using entire dataset or cross-validation. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + {estimator} + + {variables} + + {scoring} + + {threshold} + + {cv} + + {confirm_variables} + + {transformation} + + {remove_zero_weighted_features} + + Attributes + ---------- + {initial_model_performance_} + + {feature_importances_} + + {performance_drifts_} + + {features_to_drop_} + + {variables_} + + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + {fit} + + {fit_transform} + + {get_support} + + {transform} + + Examples + -------- + + >>> import pandas as pd + >>> from sklearn.ensemble import RandomForestClassifier + >>> from feature_engine.selection import RecursiveFeatureElimination + >>> X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000], + >>> x2 = [2,4,3,1,2,2], + >>> x3 = [1,1,1,0,0,0], + >>> x4 = [1,2,1,1,0,1], + >>> x5 = [1,1,1,1,1,1])) + >>> y = pd.Series([1,0,0,1,1,0]) + >>> rfe = RecursiveFeatureElimination(RandomForestClassifier(random_state=2), cv=2) + >>> rfe.fit_transform(X, y) + x2 + 0 2 + 1 4 + 2 3 + 3 1 + 4 2 + 5 2 + """ + + def fit(self, X: pd.DataFrame, y: pd.Series): + """ + Find the important features. Note that the selector trains various models at + each round of selection, so it might take a while. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The input dataframe + y: array-like of shape (n_samples) + Target variable. Required to train the estimator. + """ + + X, y = super().fit(X, y) + + # Sort the feature importance values increasingly + self.initial_feature_importances_.sort_values(ascending=True, inplace=True) + feature_importances_tmp = self.initial_feature_importances_.copy() + if self.remove_zero_weighted_features: + feature_importances_tmp = feature_importances_tmp[lambda x: abs(x) > 0] + n_features_after_zero_removal = feature_importances_tmp.size + n_features_initial = self.initial_feature_importances_.size + if n_features_initial > n_features_after_zero_removal: + warnings.warn("remove_zero_weighted_features=True and removed {}/{} features".format((n_features_initial - n_features_after_zero_removal), n_features_initial)) + + # to collect selected features + _selected_features = [] + + # temporary copy where we will remove features recursively + variables = self.variables_ + if self.remove_zero_weighted_features: + variables = list(set(feature_importances_tmp.index) & set(variables)) + X_tmp = X[variables].copy() + + # we need to update the performance as we remove features + baseline_model_performance = self.initial_model_performance_ + feature_selected_model_cv = self.initial_model_cv_.copy() + feature_selected_importances = self.initial_feature_importances_.copy() + feature_selected_importances_sem = self.initial_feature_importances_sem_.copy() + + # dict to collect features and their performance_drift after shuffling + self.performance_drifts_ = {} + + # evaluate every feature, starting from the least important + # remember that feature_importances_ is ordered already + for feature in tqdm(list(feature_importances_tmp.index), desc="Recursive feature elimination"): + + # if there is only 1 feature left + if X_tmp.shape[1] == 1: + self.performance_drifts_[feature] = 0 + _selected_features.append(feature) + break + + # remove feature and train new model + + X_query = X_tmp.drop(columns=feature) + query_model_cv_results = cross_validate( + self.estimator, + X_query if self.transformation is None else self.transformation(X_query), + y, + cv=self.cv, + scoring=self.scoring, + return_estimator=False, + n_jobs=self.n_jobs, + ) + + # assign new model performance + query_model_performance = query_model_cv_results["test_score"].mean() + + # Calculate performance drift + performance_drift = baseline_model_performance - query_model_performance + + # Save feature and performance drift + self.performance_drifts_[feature] = performance_drift + + # Update selected features if performance improves + if performance_drift > self.threshold: + _selected_features.append(feature) + + else: + # remove feature and adjust initial performance + X_tmp = X_tmp.drop(columns=feature) + + baseline_model_cv_results = cross_validate( + self.estimator, + X_tmp if self.transformation is None else self.transformation(X_tmp), + y, + cv=self.cv, + return_estimator=False, + scoring=self.scoring, + n_jobs=self.n_jobs, + ) + + # store initial model performance + baseline_model_performance = baseline_model_cv_results["test_score"].mean() + + + # Fit final model + if len(_selected_features) < X.shape[1]: + X_query = X_tmp.loc[:,_selected_features] + query_model_cv_results = cross_validate( + self.estimator, + X_query if self.transformation is None else self.transformation(X_query), + y, + cv=self.cv, + scoring=self.scoring, + return_estimator=True, + n_jobs=self.n_jobs, + ) + feature_selected_model_cv = query_model_cv_results["test_score"] + + # Summarize feature/coeff importance for each cross validation fold + if self.feature_importances_from_cv: + feature_importances_results = format_feature_importances_from_cv( + cv_results=query_model_cv_results, + features=X_query.columns, + ) + # Summarize feature/coeff importance for entire dataset + else: + feature_importances_results = format_feature_importances_from_data( + estimator=self.estimator, + X=X_query if self.transformation is None else self.transformation(X_query), + y=y, + ) + feature_selected_importances = feature_importances_results["mean"] + feature_selected_importances_sem = feature_importances_results["sem"] + + # Remove zero-weighted features + if self.remove_zero_weighted_features: + if np.any(feature_selected_importances == 0): + cleaned_results = remove_zero_weighted_features( + estimator=self.estimator, + X=X_tmp.loc[:,_selected_features], + y=y, + selected_features=_selected_features, + initial_feature_importances=feature_selected_importances.loc[_selected_features], + initial_feature_importances_sem=feature_selected_importances_sem.loc[_selected_features], + feature_selected_training_cv=feature_selected_model_cv, + feature_selected_testing_score=np.nan, + transformation=self.transformation, + X_testing=None, + y_testing=None, + n_jobs=self.n_jobs, + cv=self.cv, + scorer=self.scoring, + feature_importances_from_cv=self.feature_importances_from_cv, + maximum_tries_to_remove_zero_weighted_features=self.maximum_tries_to_remove_zero_weighted_features, + verbose=self.verbose, + log=sys.stderr, + ) + # ['selected_features', 'feature_importances', 'feature_importances_sem', 'feature_selected_training_cv', 'feature_selected_testing_performance'] + + _selected_features = cleaned_results["selected_features"] + feature_selected_importances = cleaned_results["feature_importances"] + feature_selected_importances_sem = cleaned_results["feature_importances_sem"] + feature_selected_model_cv = cleaned_results["feature_selected_training_cv"] + + + # Features to drop + self.features_to_drop_ = [ + f for f in self.variables_ if f not in _selected_features + ] + + a = feature_selected_importances.size + b = len(_selected_features) + assert a == b, "Number of selected feature importances (N={}) is different than number of selected features (N={})".format(a, b) + self.selected_features_ = list(_selected_features) + self.feature_selected_importances_ = feature_selected_importances.loc[_selected_features] + self.feature_selected_importances_sem_ = feature_selected_importances_sem.loc[_selected_features] + self.feature_selected_model_cv_ = feature_selected_model_cv + self.feature_selected_model_performance_ = feature_selected_model_cv.mean() + + return self + diff --git a/clairvoyance/clairvoyance.py b/clairvoyance/legacy.py similarity index 71% rename from clairvoyance/clairvoyance.py rename to clairvoyance/legacy.py index 791750b..aa82986 100644 --- a/clairvoyance/clairvoyance.py +++ b/clairvoyance/legacy.py @@ -2,7 +2,7 @@ # Built-ins from ast import Or -import os, sys, itertools, argparse, time, datetime, copy, warnings +import os, sys, itertools, argparse, time, copy, warnings from collections import OrderedDict # from multiprocessing import cpu_count @@ -11,625 +11,16 @@ import numpy as np import xarray as xr -## Plotting -import matplotlib.pyplot as plt -import seaborn as sns - -# Machine learning -from scipy import stats -from sklearn.metrics import get_scorer, make_scorer -from sklearn.model_selection import cross_val_score, train_test_split, RepeatedStratifiedKFold, StratifiedKFold, RepeatedKFold, KFold -from sklearn.base import clone, is_classifier, is_regressor -from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning - -# # Parallel -# import ray -# from ray.air.config import ScalingConfig -# from ray.train.sklearn import SklearnTrainer - -# Clairvoyance -try: - from . import __version__ -except ImportError: - __version__ = None - warnings.warn("Unable to load version information") - -# Soothsayer Utils -from soothsayer_utils import ( - assert_acceptable_arguments, - format_header, - format_duration, - # read_dataframe, - read_object, - write_object, - # to_precision, - pv, - flatten, -) - -# Specify maximum number of threads -# available_threads = cpu_count() - -# ============================================================================== -# Functions -# ============================================================================== -# Total sum scaling -def closure(X:pd.DataFrame): - sum_ = X.sum(axis=1) - return (X.T/sum_).T - -# Center Log-Ratio -def clr(X:pd.DataFrame): - X_tss = closure(X) - X_log = np.log(X_tss) - geometric_mean = X_log.mean(axis=1) - return (X_log - geometric_mean.values.reshape(-1,1)) - -# Transformations -def transform(X, method="closure", axis=1, multiplicative_replacement="auto", verbose=0, log=sys.stdout): -# """ -# Assumes X_data.index = Samples, X_data.columns = features -# axis = 0 = cols, axis = 1 = rows -# e.g. axis=1, method=closure: transform for relative abundance so each row sums to 1. -# "closure" = closure/total-sum-scaling -# "clr" = center log-ratio -# None = No transformation -# """ - # Transpose for axis == 0 - if axis == 0: - X = X.T - - # Base output - X_transformed = X.copy() - - # Checks - if X.ndim != 2: - raise ValueError("Input matrix must have two dimensions") - # if np.all(X == 0, axis=1).sum() > 0: - # raise ValueError("Input matrix cannot have rows with all zeros") - - # Transformed output - if method is not None: - if np.any(X.values < 0): - raise ValueError("Cannot have negative values") - if X.shape[1] > 1: - if method == "closure": - X_transformed = closure(X) - if method == "clr": - if multiplicative_replacement == "auto": - if not np.all(X > 0): - multiplicative_replacement = 1/X.shape[1]**2 - else: - multiplicative_replacement = None - if multiplicative_replacement is None: - multiplicative_replacement = 0 - assert isinstance(multiplicative_replacement, (float,int,np.floating,np.integer)) - X_transformed = clr(X + multiplicative_replacement) - else: - if verbose > 1: - print("Only 1 feature left. Ignoring transformation.", file=log) - - # Transpose back - if axis == 0: - X_transformed = X_transformed.T - - return X_transformed - -def format_stratify(stratify, estimator_type:str, y:pd.Series): - assert_acceptable_arguments(estimator_type, {"classifier", "regressor"}) - if isinstance(stratify, bool): - if stratify is True: - assert estimator_type == "classifier", "If `stratify=True` then the estimator must be a classifier. Please provide a stratification grouping or select None for regressor." - stratify = "auto" - if stratify is False: - stratify = None - if isinstance(stratify, str): - assert stratify == "auto", "`stratify` must be 'auto', a pd.Series, or None" - if estimator_type == "classifier": - stratify = y.copy() - if estimator_type == "regressor": - stratify = None - - if stratify is not None: - assert isinstance(stratify, pd.Series) - assert np.all(stratify.index == y.index) - return stratify - -def format_weights(W): - # with warnings.catch_warnings(): - # warnings.filterwarnings("ignore", category=ConvergenceWarning) - # warnings.filterwarnings("ignore", category=UndefinedMetricWarning) - # warnings.filterwarnings("ignore", category=UserWarning) - - W = np.abs(W) - if W.ndim > 1: - W = np.sum(W, axis=0) - W = W/W.sum() - return W - -def format_cross_validation(cv, X:pd.DataFrame, y:pd.Series, stratify=True, random_state=0, cv_prefix="cv=", training_column="training_index", testing_column="testing_index", return_type=tuple): - assert_acceptable_arguments({return_type}, (tuple, pd.DataFrame)) - if return_type == tuple: - assert np.all(X.index == y.index), "`X.index` and `y.index` must be the same ordering" - index = X.index - number_of_observations = len(index) - - # Simple stratified k-fold cross validation - if isinstance(cv, int): - splits = list() - labels = list() - - if isinstance(stratify, bool): - if stratify is True: - stratify = y.copy() - if stratify is False: - stratify = None - - if stratify is not None: - assert isinstance(stratify, pd.Series), "If `stratify` is not None, it must be a pd.Series" - assert np.all(y.index == stratify.index), "If `stratify` is not None then it must have the same index as `y`" - assert stratify.dtype != float, "`stratify` cannot be floating point values" - if y.dtype == float: - splitter = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=False if random_state is None else True).split(X, stratify, groups=stratify) - else: - splitter = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=False if random_state is None else True).split(X, y, groups=stratify) - else: - splitter = KFold(n_splits=cv, random_state=random_state, shuffle=False if random_state is None else True).split(X, y) - - for i, (training_index, testing_index) in enumerate(splitter, start=1): - id_cv = "{}{}".format(cv_prefix, i) - splits.append([training_index, testing_index]) - labels.append(id_cv) - - return (splits, labels) - - # Repeated stratified k-fold cross validation - if isinstance(cv, tuple): - assert len(cv) == 2, "If tuple is provided, it must be length 2" - assert map(lambda x: isinstance(x, int), cv), "If tuple is provided, both elements must be integers ≥ 2 for `RepeatedStratifiedKFold`" - - splits = list() - labels = list() - - if isinstance(stratify, bool): - if stratify is True: - stratify = y.copy() - if stratify is False: - stratify = None - - if stratify is not None: - assert isinstance(stratify, pd.Series), "If `stratify` is not None, it must be a pd.Series" - assert np.all(y.index == stratify.index), "If `stratify` is not None then it must have the same index as `y`" - assert stratify.dtype != float, "`stratify` cannot be floating point values" - if y.dtype == float: - splitter = RepeatedStratifiedKFold(n_splits=cv[0], n_repeats=cv[1], random_state=random_state).split(X, stratify, groups=stratify) - else: - splitter = RepeatedStratifiedKFold(n_splits=cv[0], n_repeats=cv[1], random_state=random_state).split(X, y, groups=stratify) - else: - splitter = RepeatedKFold(n_splits=cv[0], n_repeats=cv[1], random_state=random_state).split(X, y) - - for i, (training_index, testing_index) in enumerate(splitter, start=1): - id_cv = "{}{}".format(cv_prefix, i) - splits.append([training_index, testing_index]) - labels.append(id_cv) - - return (splits, labels) - - # List - if isinstance(cv, (list, np.ndarray)): - assert all(map(lambda query: len(query) == 2, cv)), "If `cv` is provided as a list, each element must be a sub-list of 2 indicies (i.e., training and testing indicies)" - cv = pd.DataFrame(cv, columns=[training_column, testing_column]) - cv.index = cv.index.map(lambda i: "{}{}".format(cv_prefix, i)) - - # DataFrame - if isinstance(cv, pd.DataFrame): - cv = cv.copy() - assert training_column in cv.columns - assert testing_column in cv.columns - assert np.all(cv[training_column].map(lambda x: isinstance(x, (list, np.ndarray)))), "`{}` must be either list or np.ndarray of indices".format(training_column) - assert np.all(cv[testing_column].map(lambda x: isinstance(x, (list, np.ndarray)))), "`{}` must be either list or np.ndarray of indices".format(testing_column) - - index_values = flatten(cv.values, into=list) - query = index_values[0] - labels = list(cv.index) - if isinstance(query, (int, np.integer)): - assert all(map(lambda x: isinstance(x,(int, np.integer)), index_values)), "If `cv` is provided as a list or pd.DataFrame, all values must either be intger values or keys in the X.index" - assert np.all(np.unique(index_values) >= 0), "All values in `cv` must be positive integers" - maxv = max(index_values) - assert maxv < number_of_observations, "Index values in `cv` cannot exceed ({}) the number of observations ({}).".format(maxv, number_of_observations) - else: - missing_keys = set(index_values) - set(index) - assert len(missing_keys) == 0, "If index values in `cv` are keys and not integers, then all keys must be in `X.index`. Missing keys: {}".format(missing_keys) - cv = cv.applymap(lambda observations: list(map(lambda x: X.index.get_loc(x), observations))) - cv = cv.applymap(np.asarray) - splits = cv.values.tolist() - return (splits, labels) - if return_type == pd.DataFrame: - splits, labels = format_cross_validation(cv=cv, X=X, y=y, stratify=stratify, random_state=random_state, cv_prefix=cv_prefix, training_column=training_column, testing_column=testing_column, return_type=tuple) - return pd.DataFrame(splits, index=labels, columns=[training_column, testing_column]) - -def get_feature_importance_attribute(estimator, importance_getter): - estimator = clone(estimator) - _X = np.random.normal(size=(5,2)) - if is_classifier(estimator): - _y = np.asarray(list("aabba")) - if is_regressor(estimator): - _y = np.asarray(np.random.normal(size=5)) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning) - estimator.fit(_X,_y) - if importance_getter == "auto": - importance_getter = None - if hasattr(estimator, "coef_"): - importance_getter = "coef_" - if hasattr(estimator, "feature_importances_"): - importance_getter = "feature_importances_" - assert importance_getter is not None, "If `importance_getter='auto'`, `estimator` must be either a linear model with `.coef_` or a tree-based model with `.feature_importances_`" - assert hasattr(estimator, importance_getter), "Fitted estimator does not have feature weight attribute: `{}`".format(importance_getter) - return importance_getter - -def get_balanced_class_subset(y:pd.Series, size:float=0.1, random_state=None): - n = y.size - number_of_classes = y.nunique() - number_of_samples_in_subset = int(n * size) - subset_size_per_class = int(number_of_samples_in_subset/number_of_classes) - - index = list() - for id_class in y.unique(): - class_samples = y[lambda x: x == id_class].index - assert len(class_samples) >= subset_size_per_class - subset = np.random.RandomState(random_state).choice(class_samples, size=subset_size_per_class, replace=False) - index += subset.tolist() - return index - -def recursive_feature_inclusion( - estimator, - X:pd.DataFrame, - y:pd.Series, - scorer, - initial_feature_weights:pd.Series, - initial_feature_weights_name:str="initial_feature_weights", - feature_weight_attribute:str="auto", - transformation=None, - multiplicative_replacement="auto", - metric=np.nanmean, - early_stopping=25, - minimum_improvement_in_score=0.0, - additional_feature_penalty=None, - maximum_number_of_features="auto", - target_score=-np.inf, - less_features_is_better=True, - random_state=0, - n_jobs=1, - cv=(5,3), - stratify=None, - training_column="training_index", - testing_column="testing_index", - cv_prefix="cv=", - verbose=0, - log=sys.stdout, - progress_message="Recursive feature inclusion", - remove_zero_weighted_features=True, - maximum_tries_to_remove_zero_weighted_features=1000, - X_testing:pd.DataFrame=None, - y_testing:pd.Series=None, - # optimize_testing_score = "auto", - ) -> pd.Series: - - assert len(set(X.columns)) == X.shape[1], "Cannot have duplicate feature names in `X`" - if additional_feature_penalty is None: - additional_feature_penalty = lambda number_of_features: 0.0 - - # Testing - X_testing_is_provided = X_testing is not None - y_testing_is_provided = y_testing is not None - - if X_testing_is_provided is not None: - assert y_testing_is_provided is not None, "If `X_testing` is provided then `y_testing` must be provided" - - if y_testing_is_provided is not None: - assert X_testing_is_provided is not None, "If `y_testing` is provided then `X_testing` must be provided" - - testing_set_provided = False - if all([X_testing_is_provided, y_testing_is_provided]): - assert np.all(X_testing.index == y_testing.index), "X_testing.index and y_testing.index must have the same ordering" - assert np.all(X_testing.columns == X.columns), "X_testing.columns and X.columns must have the same ordering" - testing_set_provided = True - # if optimize_testing_score == "auto": - # if testing_set_provided: - # optimize_testing_score = True - # else: - # optimize_testing_score = False - - # Transformations - assert_acceptable_arguments(transformation, {None,"clr","closure"}) - if multiplicative_replacement is None: - multiplicative_replacement = 0.0 - if isinstance(multiplicative_replacement, str): - assert multiplicative_replacement == "auto", "If `multiplicative_replacement` is a string, it must be `auto`" - else: - assert isinstance(multiplicative_replacement, (float, np.floating, int, np.integer)), "If `multiplicative_replacement` is not set to `auto` it must be float or int" - - # Cross-validaiton - cv_splits, cv_labels = format_cross_validation(cv=cv, X=X, y=y, stratify=stratify, random_state=random_state, cv_prefix=cv_prefix, training_column=training_column, testing_column=testing_column) - - # Initial feature weights - assert set(X.columns) <= set(initial_feature_weights.index), "X.columns must be a subset of feature_weights.index" - initial_feature_weights = initial_feature_weights.sort_values(ascending=False) - feature_weight_attribute = get_feature_importance_attribute(estimator, feature_weight_attribute) - - # Scorer - if isinstance(scorer, str): - scorer = get_scorer(scorer) - - # Maximum number of features - if maximum_number_of_features == "auto": - maximum_number_of_features = min(X.shape) - if maximum_number_of_features == -1: - maximum_number_of_features = X.shape[1] - if maximum_number_of_features is None: - maximum_number_of_features = X.shape[1] - assert maximum_number_of_features > 0, "maximum_number_of_features must be greater than 0" - - # Best results - history = OrderedDict() - testing_scores = OrderedDict() - - best_features = None - best_score = target_score - - # Feature tracker - feature_tuples = list() - unique_feature_sets = list() - - # Progress tracker - no_progress = 0 - - if progress_message is None: - iterable = range(initial_feature_weights.size) - else: - iterable = pv(range(initial_feature_weights.size), description=progress_message) - - for i in iterable: - features = initial_feature_weights.index[:i+1].tolist() - X_rfi = X.loc[:,features] - - continue_algorithm = True - - # Transform features (if transformation = None, then there is no transformation) - if X_rfi.shape[1] > 1: - X_rfi = transform(X=X_rfi, method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) - else: - if transformation is not None: - continue_algorithm = False - if verbose > 2: - print("Only 1 feature left. Ignoring transformation.", file=log) - - if continue_algorithm: - - # Remove zero-weighted features - if remove_zero_weighted_features: - for j in range(maximum_tries_to_remove_zero_weighted_features): - X_query = X.loc[:,features] - - estimator.fit( - X=transform(X=X_query, method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1), - y=y, - ) - _W = getattr(estimator, feature_weight_attribute) - _w = format_weights(_W) - mask_zero_weight_features = _w != 0 - - if np.all(mask_zero_weight_features): - X_rfi = transform(X=X_query, method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) - if verbose > 1: - if j > 0: - print("[Success][Iteration={}, Try={}]: Removed all zero weighted features. The following features remain: {}".format(i, j, list(features)), file=log) - break - else: - if verbose > 2: - if j > 0: - print("[...][Iteration={}, Try={}]: Removing {} features as they have zero weight in fitted model: {}".format(i, j, len(mask_zero_weight_features) - np.sum(mask_zero_weight_features), X_query.columns[~mask_zero_weight_features].tolist()), file=log) - features = X_query.columns[mask_zero_weight_features].tolist() - - feature_set = frozenset(features) - if feature_set in unique_feature_sets: - if verbose > 0: - print("Skipping iteration {} because removing zero-weighted features has produced a feature set that has already been evaluated: {}".format(i, set(feature_set)), file=log) - else: - feature_tuples.append(tuple(features)) - unique_feature_sets.append(feature_set) - - # Training/Testing Scores - scores = cross_val_score(estimator=estimator, X=X_rfi, y=y, cv=cv_splits, n_jobs=n_jobs, scoring=scorer) - with warnings.catch_warnings(): - warnings.simplefilter('ignore', RuntimeWarning) - average_score = np.nanmean(scores) - history[i] = scores #{"average_score":average_score, "sem":sem} - - #! --- - # Testing Score - query_score = average_score - testing_score = np.nan - if testing_set_provided: - estimator.fit( - X=X_rfi, - y=y, - ) - - # Transform features (if transformation = None, then there is no transformation) - X_testing_rfi = transform(X=X_testing.loc[:,features], method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) - testing_score = scorer(estimator=estimator, X=X_testing_rfi, y_true=y_testing) - testing_scores[i] = testing_score - # if optimize_testing_score: - # query_score = testing_score - #! --- - - # Add penalties to score target - penalty_adjusted_score_target = (best_score + minimum_improvement_in_score + additional_feature_penalty(len(features))) - - if query_score <= penalty_adjusted_score_target: - if verbose > 1: - # if optimize_testing_score: - # print("Current iteration {} of N={} features has not improved score: Testing Score[{} ≤ {}]".format(i, len(features), testing_score, best_score), file=log) - # else: - print("Current iteration {} of N={} features has not improved score: Average Score[{} ≤ {}]".format(i, len(features), average_score, best_score), file=log) - - no_progress += 1 - else: - # if optimize_testing_score: - # if verbose > 0: - # print("Updating best score with N={} features : Testing Score[{} -> {}]".format(len(features), best_score, testing_score), file=log) - # best_score = testing_score - # else: - if verbose > 0: - print("Updating best score with N={} features : Average Score[{} -> {}]".format(len(features), best_score, average_score), file=log) - best_score = average_score - best_features = features - no_progress = 0 - if no_progress >= early_stopping: - break - if len(features) >= maximum_number_of_features: - if verbose > 0: - print("Terminating algorithm after {} iterations with a best score of {} as the maximum number of features has been reached.".format(i+1, best_score), file=log) - break - if verbose > 0: - if best_features is None: - print("Terminating algorithm after {} iterations with a best score of {} as no feature set improved the score with current parameters".format(i+1, best_score), file=log) - else: - print("Terminating algorithm at N={} features after {} iterations with a best score of {}".format(len(best_features), i+1, best_score), file=log) - - history = pd.DataFrame(history, index=list(map(lambda x: ("splits", x), cv_labels))).T - # if testing_set_provided: - # history[("testing","score")] = pd.Series(testing_scores) - history.index = feature_tuples - history.index.name = "features" - - # Summary - average_scores = history.mean(axis=1) - sems = history.sem(axis=1) - if testing_set_provided: - testing_scores = pd.Series(testing_scores) - testing_scores.index = feature_tuples - else: - testing_scores = pd.Series([np.nan]*len(feature_tuples), index=feature_tuples) - - history.insert(loc=0, column=("summary", "number_of_features"),value = history.index.map(len)) - history.insert(loc=1, column=("summary", "average_score"),value = average_scores) - history.insert(loc=2, column=("summary", "sem"),value = sems) - history.insert(loc=3, column=("summary", "testing_score"), value = testing_scores) - history.insert(loc=4, column=("summary", "∆(testing_score-average_score)"), value = average_scores - testing_scores) - - history.columns = pd.MultiIndex.from_tuples(history.columns) - - - # Highest scoring features (not necessarily the best since there can be many features added with minimal gains) - # if optimize_testing_score: - # highest_score = history[("summary", "testing_score")].max() - # highest_scoring_features = list(history.loc[history[("summary", "testing_score")] == highest_score].sort_values( - # by=[("summary", "average_score"), ("summary", "number_of_features")], - # ascending=[False, less_features_is_better]).index[0]) - # else: - highest_score = history[("summary", "average_score")].max() - try: - highest_scoring_features = list(history.loc[history[("summary", "average_score")] == highest_score, ("summary", "number_of_features")].sort_values(ascending=less_features_is_better).index[0]) - - - # # Best results - # if optimize_testing_score: - # # best_features = list(history.loc[history[("summary", "testing_score")] == best_score].sort_values( - # # by=[("summary", "average_score"), ("summary", "number_of_features")], - # # ascending=[False, less_features_is_better]).index[0]) - # else: - # best_features = list(history.loc[history[("summary", "average_score")] == best_score, ("summary", "number_of_features")].sort_values(ascending=less_features_is_better).index[0]) - - if testing_set_provided: - best_features = list(history.sort_values( - by=[("summary", "testing_score"), ("summary", "average_score"), ("summary", "number_of_features")], - ascending=[False, False, less_features_is_better]).index[0]) - - else: - best_features = list(history.loc[history[("summary", "average_score")] == best_score].sort_values( - by=[("summary", "testing_score"), ("summary", "average_score"), ("summary", "number_of_features")], - ascending=[False, False, less_features_is_better]).index[0]) - - best_estimator_sem = history.loc[[tuple(best_features)],("summary","sem")].values[0] - best_estimator_testing_score = history.loc[[tuple(best_features)],("summary","testing_score")].values[0] - - best_estimator_rci = clone(estimator) - X_best_features = transform(X=X.loc[:,best_features], method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning) - best_estimator_rci.fit(X_best_features, y) - - # Score statement - if verbose > 0: - if highest_score > best_score: - if additional_feature_penalty is not None: - print("Highest score was %0.3f with %d features but with `minimum_improvement_in_score=%f` penalty and `additional_feature_penalty` adjustment, the best score was %0.3f with %d features.\n^ Both are stored as .highest_score_ / .highest_scoring_features_ and .best_score_ / .best_feautres_, respectively. ^"%(highest_score, len(highest_scoring_features), minimum_improvement_in_score, best_score, len(best_features)), file=log) - else: - print("Highest score was %0.3f with %d features but with `minimum_improvement_in_score=%f` penalty, the best score was %0.3f with %d features.\n^ Both are stored as .highest_score_ / .highest_scoring_features_ and .best_score_ / .best_feautres_, respectively. ^"%(highest_score, len(highest_scoring_features), minimum_improvement_in_score, best_score, len(best_features)), file=log) - - - - # Full training dataset weights - W = getattr(best_estimator_rci, feature_weight_attribute) - rci_feature_weights = pd.Series(format_weights(W), index=best_features, name="rci_weights") - feature_weights = pd.DataFrame([pd.Series(initial_feature_weights,name=initial_feature_weights_name), rci_feature_weights]).T - feature_weights.columns = feature_weights.columns.map(lambda x: ("full_dataset", x)) - - # Cross validation weights - cv_weights = OrderedDict() - for id_cv, (training_index, testing_index) in zip(cv_labels, cv_splits): - X_training = transform(X=X.iloc[training_index].loc[:,best_features], method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) - y_training = y.iloc[training_index] - clf = clone(estimator) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning) - clf.fit(X_training, y_training) - cv_weights[id_cv] = pd.Series(format_weights(getattr(clf, feature_weight_attribute)), index=best_features) - cv_weights = pd.DataFrame(cv_weights) - cv_weights.columns = cv_weights.columns.map(lambda x: ("cross_validation", x)) - feature_weights = pd.concat([feature_weights, cv_weights], axis=1) - - return pd.Series( - dict( - history=history, - best_score=best_score, - best_estimator_sem=best_estimator_sem, - best_features=best_features, - best_estimator_rci=best_estimator_rci, - feature_weights=feature_weights, - highest_score=highest_score, - highest_scoring_features=highest_scoring_features, - cv_splits=cv_splits, - cv_labels=cv_labels, - testing_scores=testing_scores, - best_estimator_testing_score=best_estimator_testing_score, - ), - name="recursive_feature_elimination", - ) - except IndexError: - return pd.Series( - dict( - history=history, - best_score=np.nan, - best_estimator_sem=np.nan, - best_features=np.nan, - best_estimator_rci=np.nan, - feature_weights=np.nan, - highest_score=highest_score, - highest_scoring_features=np.nan, - cv_splits=cv_splits, - cv_labels=cv_labels, - testing_scores=testing_scores, - best_estimator_testing_score=np.nan, - ), - name="recursive_feature_elimination", - ) - +# Machine learning +from scipy import stats +from sklearn.metrics import get_scorer, make_scorer +from sklearn.model_selection import cross_val_score, train_test_split, RepeatedStratifiedKFold, StratifiedKFold, RepeatedKFold, KFold +from sklearn.base import clone, is_classifier, is_regressor +from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning +# Clairvoyance +from .utils import * +from .transformations import legacy_transform as transform # Plotting def plot_scores_line( @@ -643,7 +34,7 @@ def plot_scores_line( linecolor="black", errorcolor="gray", testing_linecolor="red", - style="seaborn-white", + style="seaborn-v0_8-white", xlim=None, ylim=None, ax=None, @@ -723,7 +114,7 @@ def plot_weights_bar( title=None, figsize=(13,3), color="black", - style="seaborn-white", + style="seaborn-v0_8-white", ylim=None, ax=None, ascending=False, @@ -787,7 +178,7 @@ def plot_weights_box( color="white", linecolor="coral", swarmcolor="gray", - style="seaborn-white", + style="seaborn-v0_8-white", ylim=None, ax=None, alpha=0.382, @@ -872,7 +263,7 @@ def plot_recursive_feature_selection( linewidth=0.618, alpha=0.618, edgecolor="black", - style="seaborn-white", + style="seaborn-v0_8-white", figsize=(8,3), title=None, xlabel="$N_{Features}$", @@ -951,7 +342,7 @@ def plot_scores_comparison( linewidth=0.618, alpha=0.618, edgecolor="black", - style="seaborn-white", + style="seaborn-v0_8-white", figsize=(8,5), title=None, xlabel="Average Score", @@ -989,87 +380,437 @@ def plot_scores_comparison( assert np.all(number_of_features.index == average_scores.index) assert np.all(number_of_features.index == testing_scores.index) - df = pd.DataFrame([number_of_features, average_scores, testing_scores], index=["number_of_features", "average_scores", "testing_scores"]).T + df = pd.DataFrame([number_of_features, average_scores, testing_scores], index=["number_of_features", "average_scores", "testing_scores"]).T + + if min_features: + df = df.query("number_of_features >= {}".format(min_features)) + if max_features: + df = df.query("number_of_features <= {}".format(max_features)) + if min_score: + df = df.query("average_scores >= {}".format(min_score)) + if max_score: + df = df.query("average_scores <= {}".format(max_score)) + if min_score: + df = df.query("testing_scores >= {}".format(min_score)) + if max_score: + df = df.query("testing_scores <= {}".format(max_score)) + + if feature_to_size_function == "auto": + def feature_to_size_function(n): + return 100/n + assert hasattr(feature_to_size_function, "__call__") + marker_sizes = feature_to_size_function(number_of_features) + + with plt.style.context(style): + _title_kws = {"fontsize":16, "fontweight":"bold"}; _title_kws.update(title_kws) + _xlabel_kws = {"fontsize":15}; _xlabel_kws.update(xlabel_kws) + _ylabel_kws = {"fontsize":15}; _ylabel_kws.update(ylabel_kws) + # _zlabel_kws = {"fontsize":15}; _zlabel_kws.update(zlabel_kws) + + _xticklabel_kws = {"fontsize":12, "rotation":xtick_rotation}; _xticklabel_kws.update(xticklabel_kws) + _yticklabel_kws = {"fontsize":12}; _yticklabel_kws.update(yticklabel_kws) + # _zticklabel_kws = {"fontsize":12}; _zticklabel_kws.update(zticklabel_kws) + + _legend_kws = {"fontsize":12, "frameon":True, "title_fontsize":"x-large"}; _legend_kws.update(legend_kws) + if ax is None: + fig, ax = plt.subplots(figsize=figsize) + + else: + fig = plt.gcf() + + + # ax.scatter(xs=df["number_of_features"].values, ys=df["average_scores"].values, zs=df["testing_scores"], edgecolor=edgecolor, alpha=alpha, linewidth=linewidth, color=color, **kwargs) + ax.scatter(x=df["average_scores"].values, y=df["testing_scores"], s=marker_sizes, edgecolor=edgecolor, alpha=alpha, linewidth=linewidth, color=color, **kwargs) + + + ax.set_xlabel(xlabel, **_xlabel_kws) + ax.set_ylabel(ylabel, **_ylabel_kws) + # ax.set_zlabel(zlabel, **_zlabel_kws) + + # ax.set_yticklabels(map(lambda x:"%0.2f"%x, ax.get_yticks()), **_yticklabel_kws) + if title: + ax.set_title(title, **_title_kws) + if show_legend: + legendary_features = number_of_features.describe()[legend_markers].astype(int) + + legend_elements = list() + for i,n in legendary_features.items(): + # marker = plt.Line2D([], [], color=color, marker='o', linestyle='None', markersize=feature_to_size_function(n), label="$N$ = %d"%(n)) + # legend_elements.append(marker) + i = i.replace("%","\%") + legend_marker = ax.scatter([], [], s=feature_to_size_function(n), label="$N_{%s}$ = %d"%(i,n), color=color, marker='o') + legend_elements.append(legend_marker) + + + legend = ax.legend(handles=legend_elements, title="$N_{Features}$", markerscale=1, **_legend_kws) + + if show_xgrid: + ax.xaxis.grid(True) + if show_ygrid: + ax.yaxis.grid(True) + # if show_zgrid: + # ax.zaxis.grid(True) + return fig, ax + + +def recursive_feature_addition( + estimator, + X:pd.DataFrame, + y:pd.Series, + scorer, + initial_feature_weights:pd.Series, + initial_feature_weights_name:str="initial_feature_weights", + feature_weight_attribute:str="auto", + transformation=None, + multiplicative_replacement="auto", + metric=np.nanmean, + early_stopping=25, + minimum_improvement_in_score=0.0, + additional_feature_penalty=None, + maximum_number_of_features="auto", + target_score=-np.inf, + less_features_is_better=True, + random_state=0, + n_jobs=1, + cv=(5,3), + stratify=None, + training_column="training_index", + testing_column="testing_index", + cv_prefix="cv=", + verbose=0, + log=sys.stdout, + progress_message="Recursive feature addition", + remove_zero_weighted_features=True, + maximum_tries_to_remove_zero_weighted_features=1000, + X_testing:pd.DataFrame=None, + y_testing:pd.Series=None, + # optimize_testing_score = "auto", + ) -> pd.Series: + + assert len(set(X.columns)) == X.shape[1], "Cannot have duplicate feature names in `X`" + if additional_feature_penalty is None: + additional_feature_penalty = lambda number_of_features: 0.0 + + # Testing + X_testing_is_provided = X_testing is not None + y_testing_is_provided = y_testing is not None + + if X_testing_is_provided is not None: + assert y_testing_is_provided is not None, "If `X_testing` is provided then `y_testing` must be provided" + + if y_testing_is_provided is not None: + assert X_testing_is_provided is not None, "If `y_testing` is provided then `X_testing` must be provided" + + testing_set_provided = False + if all([X_testing_is_provided, y_testing_is_provided]): + assert np.all(X_testing.index == y_testing.index), "X_testing.index and y_testing.index must have the same ordering" + assert np.all(X_testing.columns == X.columns), "X_testing.columns and X.columns must have the same ordering" + testing_set_provided = True + # if optimize_testing_score == "auto": + # if testing_set_provided: + # optimize_testing_score = True + # else: + # optimize_testing_score = False + + # Transformations + assert_acceptable_arguments(transformation, {None,"clr","closure"}) + if multiplicative_replacement is None: + multiplicative_replacement = 0.0 + if isinstance(multiplicative_replacement, str): + assert multiplicative_replacement == "auto", "If `multiplicative_replacement` is a string, it must be `auto`" + else: + assert isinstance(multiplicative_replacement, (float, np.floating, int, np.integer)), "If `multiplicative_replacement` is not set to `auto` it must be float or int" + + # Cross-validaiton + cv_splits, cv_labels = format_cross_validation(cv=cv, X=X, y=y, stratify=stratify, random_state=random_state, cv_prefix=cv_prefix, training_column=training_column, testing_column=testing_column) + + # Initial feature weights + assert set(X.columns) <= set(initial_feature_weights.index), "X.columns must be a subset of feature_weights.index" + initial_feature_weights = initial_feature_weights.sort_values(ascending=False) + feature_weight_attribute = get_feature_importance_attribute(estimator, feature_weight_attribute) + + # Scorer + if isinstance(scorer, str): + scorer = get_scorer(scorer) + + # Maximum number of features + if maximum_number_of_features == "auto": + maximum_number_of_features = min(X.shape) + if maximum_number_of_features == -1: + maximum_number_of_features = X.shape[1] + if maximum_number_of_features is None: + maximum_number_of_features = X.shape[1] + assert maximum_number_of_features > 0, "maximum_number_of_features must be greater than 0" + + # Best results + history = OrderedDict() + testing_scores = OrderedDict() + + best_features = None + best_score = target_score + + # Feature tracker + feature_tuples = list() + unique_feature_sets = list() + + # Progress tracker + no_progress = 0 + + if progress_message is None: + iterable = range(initial_feature_weights.size) + else: + iterable = pv(range(initial_feature_weights.size), description=progress_message) + + for i in iterable: + features = initial_feature_weights.index[:i+1].tolist() + X_rfa = X.loc[:,features] + + continue_algorithm = True + + # Transform features (if transformation = None, then there is no transformation) + if X_rfa.shape[1] > 1: + X_rfa = transform(X=X_rfa, method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) + else: + if transformation is not None: + continue_algorithm = False + if verbose > 2: + print("Only 1 feature left. Ignoring transformation.", file=log) + + if continue_algorithm: + + # Remove zero-weighted features + if remove_zero_weighted_features: + for j in range(maximum_tries_to_remove_zero_weighted_features): + X_query = X.loc[:,features] + + estimator.fit( + X=transform(X=X_query, method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1), + y=y, + ) + _W = getattr(estimator, feature_weight_attribute) + _w = format_weights(_W) + mask_zero_weight_features = _w != 0 + + if np.all(mask_zero_weight_features): + X_rfa = transform(X=X_query, method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) + if verbose > 1: + if j > 0: + print("[Success][Iteration={}, Try={}]: Removed all zero weighted features. The following features remain: {}".format(i, j, list(features)), file=log) + break + else: + if verbose > 2: + if j > 0: + print("[...][Iteration={}, Try={}]: Removing {} features as they have zero weight in fitted model: {}".format(i, j, len(mask_zero_weight_features) - np.sum(mask_zero_weight_features), X_query.columns[~mask_zero_weight_features].tolist()), file=log) + features = X_query.columns[mask_zero_weight_features].tolist() + + feature_set = frozenset(features) + if feature_set in unique_feature_sets: + if verbose > 0: + print("Skipping iteration {} because removing zero-weighted features has produced a feature set that has already been evaluated: {}".format(i, set(feature_set)), file=log) + else: + feature_tuples.append(tuple(features)) + unique_feature_sets.append(feature_set) + + # Training/Testing Scores + scores = cross_val_score(estimator=estimator, X=X_rfa, y=y, cv=cv_splits, n_jobs=n_jobs, scoring=scorer) + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + average_score = np.nanmean(scores) + history[i] = scores #{"average_score":average_score, "sem":sem} + + #! --- + # Testing Score + query_score = average_score + testing_score = np.nan + if testing_set_provided: + estimator.fit( + X=X_rfa, + y=y, + ) + + # Transform features (if transformation = None, then there is no transformation) + X_testing_rfa = transform(X=X_testing.loc[:,features], method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) + testing_score = scorer(estimator=estimator, X=X_testing_rfa, y_true=y_testing) + testing_scores[i] = testing_score + # if optimize_testing_score: + # query_score = testing_score + #! --- - if min_features: - df = df.query("number_of_features >= {}".format(min_features)) - if max_features: - df = df.query("number_of_features <= {}".format(max_features)) - if min_score: - df = df.query("average_scores >= {}".format(min_score)) - if max_score: - df = df.query("average_scores <= {}".format(max_score)) - if min_score: - df = df.query("testing_scores >= {}".format(min_score)) - if max_score: - df = df.query("testing_scores <= {}".format(max_score)) + # Add penalties to score target + penalty_adjusted_score_target = (best_score + minimum_improvement_in_score + additional_feature_penalty(len(features))) + + if query_score <= penalty_adjusted_score_target: + if verbose > 1: + # if optimize_testing_score: + # print("Current iteration {} of N={} features has not improved score: Testing Score[{} ≤ {}]".format(i, len(features), testing_score, best_score), file=log) + # else: + print("Current iteration {} of N={} features has not improved score: Average Score[{} ≤ {}]".format(i, len(features), average_score, best_score), file=log) - if feature_to_size_function == "auto": - def feature_to_size_function(n): - return 100/n - assert hasattr(feature_to_size_function, "__call__") - marker_sizes = feature_to_size_function(number_of_features) + no_progress += 1 + else: + # if optimize_testing_score: + # if verbose > 0: + # print("Updating best score with N={} features : Testing Score[{} -> {}]".format(len(features), best_score, testing_score), file=log) + # best_score = testing_score + # else: + if verbose > 0: + print("Updating best score with N={} features : Average Score[{} -> {}]".format(len(features), best_score, average_score), file=log) + best_score = average_score + best_features = features + no_progress = 0 + if no_progress >= early_stopping: + break + if len(features) >= maximum_number_of_features: + if verbose > 0: + print("Terminating algorithm after {} iterations with a best score of {} as the maximum number of features has been reached.".format(i+1, best_score), file=log) + break + if verbose > 0: + if best_features is None: + print("Terminating algorithm after {} iterations with a best score of {} as no feature set improved the score with current parameters".format(i+1, best_score), file=log) + else: + print("Terminating algorithm at N={} features after {} iterations with a best score of {}".format(len(best_features), i+1, best_score), file=log) + + history = pd.DataFrame(history, index=list(map(lambda x: ("splits", x), cv_labels))).T + # if testing_set_provided: + # history[("testing","score")] = pd.Series(testing_scores) + history.index = feature_tuples + history.index.name = "features" + + # Summary + average_scores = history.mean(axis=1) + sems = history.sem(axis=1) + if testing_set_provided: + testing_scores = pd.Series(testing_scores) + testing_scores.index = feature_tuples + else: + testing_scores = pd.Series([np.nan]*len(feature_tuples), index=feature_tuples) + + history.insert(loc=0, column=("summary", "number_of_features"),value = history.index.map(len)) + history.insert(loc=1, column=("summary", "average_score"),value = average_scores) + history.insert(loc=2, column=("summary", "sem"),value = sems) + history.insert(loc=3, column=("summary", "testing_score"), value = testing_scores) + history.insert(loc=4, column=("summary", "∆(testing_score-average_score)"), value = average_scores - testing_scores) - with plt.style.context(style): - _title_kws = {"fontsize":16, "fontweight":"bold"}; _title_kws.update(title_kws) - _xlabel_kws = {"fontsize":15}; _xlabel_kws.update(xlabel_kws) - _ylabel_kws = {"fontsize":15}; _ylabel_kws.update(ylabel_kws) - # _zlabel_kws = {"fontsize":15}; _zlabel_kws.update(zlabel_kws) + history.columns = pd.MultiIndex.from_tuples(history.columns) - _xticklabel_kws = {"fontsize":12, "rotation":xtick_rotation}; _xticklabel_kws.update(xticklabel_kws) - _yticklabel_kws = {"fontsize":12}; _yticklabel_kws.update(yticklabel_kws) - # _zticklabel_kws = {"fontsize":12}; _zticklabel_kws.update(zticklabel_kws) + + # Highest scoring features (not necessarily the best since there can be many features added with minimal gains) + # if optimize_testing_score: + # highest_score = history[("summary", "testing_score")].max() + # highest_scoring_features = list(history.loc[history[("summary", "testing_score")] == highest_score].sort_values( + # by=[("summary", "average_score"), ("summary", "number_of_features")], + # ascending=[False, less_features_is_better]).index[0]) + # else: + highest_score = history[("summary", "average_score")].max() + try: + highest_scoring_features = list(history.loc[history[("summary", "average_score")] == highest_score, ("summary", "number_of_features")].sort_values(ascending=less_features_is_better).index[0]) + + + # # Best results + # if optimize_testing_score: + # # best_features = list(history.loc[history[("summary", "testing_score")] == best_score].sort_values( + # # by=[("summary", "average_score"), ("summary", "number_of_features")], + # # ascending=[False, less_features_is_better]).index[0]) + # else: + # best_features = list(history.loc[history[("summary", "average_score")] == best_score, ("summary", "number_of_features")].sort_values(ascending=less_features_is_better).index[0]) - _legend_kws = {"fontsize":12, "frameon":True, "title_fontsize":"x-large"}; _legend_kws.update(legend_kws) - if ax is None: - fig, ax = plt.subplots(figsize=figsize) + if testing_set_provided: + best_features = list(history.sort_values( + by=[("summary", "testing_score"), ("summary", "average_score"), ("summary", "number_of_features")], + ascending=[False, False, less_features_is_better]).index[0]) else: - fig = plt.gcf() + best_features = list(history.loc[history[("summary", "average_score")] == best_score].sort_values( + by=[("summary", "testing_score"), ("summary", "average_score"), ("summary", "number_of_features")], + ascending=[False, False, less_features_is_better]).index[0]) + + best_estimator_sem = history.loc[[tuple(best_features)],("summary","sem")].values[0] + best_estimator_testing_score = history.loc[[tuple(best_features)],("summary","testing_score")].values[0] + best_estimator_rci = clone(estimator) + X_best_features = transform(X=X.loc[:,best_features], method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) - # ax.scatter(xs=df["number_of_features"].values, ys=df["average_scores"].values, zs=df["testing_scores"], edgecolor=edgecolor, alpha=alpha, linewidth=linewidth, color=color, **kwargs) - ax.scatter(x=df["average_scores"].values, y=df["testing_scores"], s=marker_sizes, edgecolor=edgecolor, alpha=alpha, linewidth=linewidth, color=color, **kwargs) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + best_estimator_rci.fit(X_best_features, y) + + # Score statement + if verbose > 0: + if highest_score > best_score: + if additional_feature_penalty is not None: + print("Highest score was %0.3f with %d features but with `minimum_improvement_in_score=%f` penalty and `additional_feature_penalty` adjustment, the best score was %0.3f with %d features.\n^ Both are stored as .highest_score_ / .highest_scoring_features_ and .best_score_ / .best_feautres_, respectively. ^"%(highest_score, len(highest_scoring_features), minimum_improvement_in_score, best_score, len(best_features)), file=log) + else: + print("Highest score was %0.3f with %d features but with `minimum_improvement_in_score=%f` penalty, the best score was %0.3f with %d features.\n^ Both are stored as .highest_score_ / .highest_scoring_features_ and .best_score_ / .best_feautres_, respectively. ^"%(highest_score, len(highest_scoring_features), minimum_improvement_in_score, best_score, len(best_features)), file=log) + - ax.set_xlabel(xlabel, **_xlabel_kws) - ax.set_ylabel(ylabel, **_ylabel_kws) - # ax.set_zlabel(zlabel, **_zlabel_kws) + # Full training dataset weights + W = getattr(best_estimator_rci, feature_weight_attribute) + rci_feature_weights = pd.Series(format_weights(W), index=best_features, name="rci_weights") + feature_weights = pd.DataFrame([pd.Series(initial_feature_weights,name=initial_feature_weights_name), rci_feature_weights]).T + feature_weights.columns = feature_weights.columns.map(lambda x: ("full_dataset", x)) - # ax.set_yticklabels(map(lambda x:"%0.2f"%x, ax.get_yticks()), **_yticklabel_kws) - if title: - ax.set_title(title, **_title_kws) - if show_legend: - legendary_features = number_of_features.describe()[legend_markers].astype(int) - - legend_elements = list() - for i,n in legendary_features.items(): - # marker = plt.Line2D([], [], color=color, marker='o', linestyle='None', markersize=feature_to_size_function(n), label="$N$ = %d"%(n)) - # legend_elements.append(marker) - i = i.replace("%","\%") - legend_marker = ax.scatter([], [], s=feature_to_size_function(n), label="$N_{%s}$ = %d"%(i,n), color=color, marker='o') - legend_elements.append(legend_marker) + # Cross validation weights + cv_weights = OrderedDict() + for id_cv, (training_index, testing_index) in zip(cv_labels, cv_splits): + X_training = transform(X=X.iloc[training_index].loc[:,best_features], method=transformation, multiplicative_replacement=multiplicative_replacement, axis=1) + y_training = y.iloc[training_index] + clf = clone(estimator) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + clf.fit(X_training, y_training) + cv_weights[id_cv] = pd.Series(format_weights(getattr(clf, feature_weight_attribute)), index=best_features) + cv_weights = pd.DataFrame(cv_weights) + cv_weights.columns = cv_weights.columns.map(lambda x: ("cross_validation", x)) + feature_weights = pd.concat([feature_weights, cv_weights], axis=1) + + return pd.Series( + dict( + history=history, + best_score=best_score, + best_estimator_sem=best_estimator_sem, + best_features=best_features, + best_estimator_rci=best_estimator_rci, + feature_weights=feature_weights, + highest_score=highest_score, + highest_scoring_features=highest_scoring_features, + cv_splits=cv_splits, + cv_labels=cv_labels, + testing_scores=testing_scores, + best_estimator_testing_score=best_estimator_testing_score, + ), + name="recursive_feature_elimination", + ) + except IndexError: + return pd.Series( + dict( + history=history, + best_score=np.nan, + best_estimator_sem=np.nan, + best_features=np.nan, + best_estimator_rci=np.nan, + feature_weights=np.nan, + highest_score=highest_score, + highest_scoring_features=np.nan, + cv_splits=cv_splits, + cv_labels=cv_labels, + testing_scores=testing_scores, + best_estimator_testing_score=np.nan, + ), + name="recursive_feature_elimination", + ) - legend = ax.legend(handles=legend_elements, title="$N_{Features}$", markerscale=1, **_legend_kws) - - if show_xgrid: - ax.xaxis.grid(True) - if show_ygrid: - ax.yaxis.grid(True) - # if show_zgrid: - # ax.zaxis.grid(True) - return fig, ax # ------- # Classes # ------- -class ClairvoyanceBase(object): +class LegacyClairvoyanceBase(object): def __init__( self, # Modeling estimator, - param_grid:dict, + param_space:dict, scorer, method:str="asymmetric", #imbalanced? importance_getter="auto", @@ -1119,8 +860,8 @@ def __init__( print("Updating `random_state=None` in `estimator` to `random_state={}`".format(random_state), file=log) self.estimator.set_params(random_state=random_state) self.feature_weight_attribute = get_feature_importance_attribute(estimator, importance_getter) - assert len(param_grid) > 0, "`param_grid` must have at least 1 key:[value_1, value_2, ..., value_n] pair" - self.param_grid = param_grid + assert len(param_space) > 0, "`param_space` must have at least 1 key:[value_1, value_2, ..., value_n] pair" + self.param_space = param_space # Set attributes self.name = name @@ -1163,7 +904,7 @@ def __repr__(self): header, pad*" " + "* Estimator: {}".format(self.estimator_name), pad*" " + "* Estimator Type: {}".format(self.estimator_type), - pad*" " + "* Parameter Space: {}".format(self.param_grid), + pad*" " + "* Parameter Space: {}".format(self.param_space), pad*" " + "* Scorer: {}".format(self.scorer_name), pad*" " + "* Method: {}".format(self.method), pad*" " + "* Feature Weight Attribute: {}".format(self.feature_weight_attribute), @@ -1216,8 +957,8 @@ def _get_estimators(): # Indexing for hyperparameters and models estimators_ = OrderedDict() - param_grid_expanded = list(map(lambda x:dict(zip(self.param_grid.keys(), x)), itertools.product(*self.param_grid.values()))) - for i, params in enumerate(param_grid_expanded): + param_space_expanded = list(map(lambda x:dict(zip(self.param_space.keys(), x)), itertools.product(*self.param_space.values()))) + for i, params in enumerate(param_space_expanded): # Construct model estimator = clone(self.estimator) estimator.set_params(**params) @@ -1386,7 +1127,7 @@ def _run(X, y, stratify, split_size, method, progress_message): assert isinstance(ascending, list), "`sort_hyperparameters_by` must be a list of size n accompanied by an `ascending` list of n boolean vaues" assert len(sort_hyperparameters_by) == len(ascending), "`sort_hyperparameters_by` must be a list of size n accompanied by an `ascending` list of n boolean vaues" assert all(map(lambda x: isinstance(x, bool), ascending)), "`sort_hyperparameters_by` must be a list of size n accompanied by an `ascending` list of n boolean vaues" - assert set(sort_hyperparameters_by) <= set(self.param_grid.keys()), "`sort_hyperparameters_by` must contain a list of keys in `param_grid`" + assert set(sort_hyperparameters_by) <= set(self.param_space.keys()), "`sort_hyperparameters_by` must contain a list of keys in `param_space`" else: sort_hyperparameters_by = [] ascending = [] @@ -1405,7 +1146,9 @@ def _run(X, y, stratify, split_size, method, progress_message): df = df.sort_values(["average_score"] + sort_hyperparameters_by, ascending=[False] + ascending) self.best_estimator_ = clone(self.estimator) - self.best_estimator_.set_params(**dict(df.index[0])) + + if not df.empty: + self.best_estimator_.set_params(**dict(df.index[0])) self.best_hyperparameters_ = self.best_estimator_.get_params(deep=True) self.hyperparameter_average_scores_ = df @@ -1451,7 +1194,7 @@ def get_weights(self, minimum_score=None, metrics=[np.nanmean, stats.sem], minim return df.squeeze() - def recursive_feature_inclusion( + def recursive_feature_addition( self, estimator=None, X:pd.DataFrame=None, @@ -1512,7 +1255,7 @@ def recursive_feature_inclusion( # Recursive feature incusion - rci_results = recursive_feature_inclusion( + rci_results = recursive_feature_addition( estimator=estimator, X=X, y=y, @@ -1561,17 +1304,28 @@ def recursive_feature_inclusion( self.best_estimator_rci_ = clone(estimator) self.best_estimator_testing_score_ = rci_results["best_estimator_testing_score"] - X_rci = transform(X=X.loc[:,self.best_features_], method=self.transformation, multiplicative_replacement=self.multiplicative_replacement, axis=1) - with warnings.catch_warnings(): #! - warnings.filterwarnings("ignore", category=ConvergenceWarning) - self.best_estimator_rci_.fit(X_rci, y) + self.status_ok_ = False + if isinstance(self.best_features_, list): + if np.all(pd.notnull(self.best_features_)): + self.status_ok_ = True + + if self.status_ok_: + X_rci = transform(X=X.loc[:,self.best_features_], method=self.transformation, multiplicative_replacement=self.multiplicative_replacement, axis=1) + with warnings.catch_warnings(): #! + warnings.filterwarnings("ignore", category=ConvergenceWarning) + self.best_estimator_rci_.fit(X_rci, y) + self.rci_feature_weights_ = rci_results["feature_weights"][("full_dataset", "rci_weights")].loc[self.best_features_] + else: + self.rci_feature_weights_ = np.nan self.feature_weights_ = rci_results["feature_weights"] - self.rci_feature_weights_ = rci_results["feature_weights"][("full_dataset", "rci_weights")].loc[self.best_features_] self.cv_splits_ = rci_results["cv_splits"] self.cv_labels_ = rci_results["cv_labels"] if copy_X_rci: - self.X_rci_ = X_rci.copy() + if self.status_ok_: + self.X_rci_ = X_rci.copy() + else: + self.X_rci_ = np.nan self.is_fitted_rci = True @@ -1597,7 +1351,7 @@ def plot_scores( if ylabel == "auto": ylabel = self.scorer_name kwargs["ylabel"] = ylabel - assert self.is_fitted_rci, "Please run `recursive_feature_inclusion` before proceeding." + assert self.is_fitted_rci, "Please run `recursive_feature_addition` before proceeding." if self.best_score_ < self.highest_score_: vertical_lines = [len(self.best_features_)-1, len(self.highest_scoring_features_)-1] else: @@ -1613,7 +1367,7 @@ def plot_weights( weight_type=("full_dataset","rci_weights"), **kwargs, ): - assert self.is_fitted_rci, "Please run `recursive_feature_inclusion` before proceeding." + assert self.is_fitted_rci, "Please run `recursive_feature_addition` before proceeding." assert_acceptable_arguments(weight_type, {("full_dataset","rci_weights"), ("full_dataset","clairvoyance_weights"), "cross_validation"}) if weight_type in {("full_dataset","rci_weights"), ("full_dataset","clairvoyance_weights")}: @@ -1626,8 +1380,8 @@ def plot_scores_comparison( self, **kwargs, ): - assert self.is_fitted_rci, "Please run `recursive_feature_inclusion` before proceeding." - assert self.testing_set_provided, "Please run `recursive_feature_inclusion` with a testing set before proceeding." + assert self.is_fitted_rci, "Please run `recursive_feature_addition` before proceeding." + assert self.testing_set_provided, "Please run `recursive_feature_addition` with a testing set before proceeding." return plot_scores_comparison(number_of_features=self.history_[("summary", "number_of_features")], average_scores=self.history_[("summary", "average_score")], testing_scores=self.history_[("summary", "testing_score")], **kwargs) def copy(self): @@ -1642,12 +1396,12 @@ def copy(self): # return cls -class ClairvoyanceClassification(ClairvoyanceBase): +class LegacyClairvoyanceClassification(LegacyClairvoyanceBase): def __init__( self, # Modeling estimator, - param_grid:dict, + param_space:dict, scorer="accuracy", method="asymmetric", importance_getter="auto", @@ -1679,9 +1433,9 @@ def __init__( scorer = get_scorer(scorer) - super(ClairvoyanceClassification, self).__init__( + super(LegacyClairvoyanceClassification, self).__init__( estimator=estimator, - param_grid=param_grid, + param_space=param_space, scorer=scorer, method=method, importance_getter=importance_getter, @@ -1708,13 +1462,13 @@ def __init__( log=log, ) -class ClairvoyanceRegression(ClairvoyanceBase): +class LegacyClairvoyanceRegression(LegacyClairvoyanceBase): def __init__( self, # Modeling estimator, - param_grid:dict, + param_space:dict, scorer="neg_root_mean_squared_error", method="asymmetric", importance_getter="auto", @@ -1743,9 +1497,9 @@ def __init__( ): - super(ClairvoyanceRegression, self).__init__( + super(LegacyClairvoyanceRegression, self).__init__( estimator=estimator, - param_grid=param_grid, + param_space=param_space, scorer=scorer, method=method, importance_getter=importance_getter, @@ -1774,12 +1528,12 @@ def __init__( ) -class ClairvoyanceRecursive(object): +class LegacyClairvoyanceRecursive(object): def __init__( self, # Modeling estimator, - param_grid:dict, + param_space:dict, scorer, method="asymmetric", importance_getter="auto", @@ -1826,10 +1580,10 @@ def __init__( self.estimator_name = estimator.__class__.__name__ if is_classifier(estimator): self.estimator_type = "classifier" - self.clairvoyance_class = ClairvoyanceClassification + self.clairvoyance_class = LegacyClairvoyanceClassification if is_regressor(estimator): self.estimator_type = "regressor" - self.clairvoyance_class = ClairvoyanceRegression + self.clairvoyance_class = LegacyClairvoyanceRegression self.estimator = clone(estimator) @@ -1840,8 +1594,8 @@ def __init__( print("Updating `random_state=None` in `estimator` to `random_state={}`".format(random_state), file=log) self.estimator.set_params(random_state=random_state) self.feature_weight_attribute = get_feature_importance_attribute(estimator, importance_getter) - assert len(param_grid) > 0, "`param_grid` must have at least 1 key:[value_1, value_2, ..., value_n] pair" - self.param_grid = param_grid + assert len(param_space) > 0, "`param_space` must have at least 1 key:[value_1, value_2, ..., value_n] pair" + self.param_space = param_space # Transformations assert_acceptable_arguments(transformation, {None,"clr","closure"}) @@ -1900,7 +1654,7 @@ def __repr__(self): header, pad*" " + "* Estimator: {}".format(self.estimator_name), pad*" " + "* Estimator Type: {}".format(self.estimator_type), - pad*" " + "* Parameter Space: {}".format(self.param_grid), + pad*" " + "* Parameter Space: {}".format(self.param_space), pad*" " + "* Scorer: {}".format(self.scorer_name), pad*" " + "* Method: {}".format(self.method), pad*" " + "* Feature Weight Attribute: {}".format(self.feature_weight_attribute), @@ -1991,6 +1745,7 @@ def fit( self.history_ = OrderedDict() self.results_ = OrderedDict() self.results_baseline_ = OrderedDict() + self.status_ok_ = True with np.errstate(divide='ignore', invalid='ignore'): current_features_for_percentile = X.columns @@ -2003,7 +1758,7 @@ def fit( # Initiate model model = self.clairvoyance_class( estimator=self.estimator, - param_grid=self.param_grid, + param_space=self.param_space, scorer=self.scorer, method=self.method, importance_getter=self.feature_weight_attribute, @@ -2056,113 +1811,142 @@ def fit( X_query = transform(X=self.X_initial_.loc[:,current_features_for_percentile], method=self.transformation, multiplicative_replacement=self.multiplicative_replacement, axis=1) for params, estimator in model.estimators_.items(): + if self.verbose > 2: print("[Start] recursive feature inclusion [percentile={}, estimator_params={}]".format(pctl, params), file=self.log) - # Baseline - self._debug = dict(X=X_query, y=self.y_, scoring=self.scorer, cv=self.cv_splits_, n_jobs=self.n_jobs) #? + if not np.all(X_query == 0): + # Baseline + self._debug = dict(X=X_query, y=self.y_, scoring=self.scorer, cv=self.cv_splits_, n_jobs=self.n_jobs) #? - baseline_scores_for_percentile = cross_val_score(estimator=estimator, X=X_query, y=self.y_, scoring=self.scorer, cv=self.cv_splits_, n_jobs=self.n_jobs) + baseline_scores_for_percentile = cross_val_score(estimator=estimator, X=X_query, y=self.y_, scoring=self.scorer, cv=self.cv_splits_, n_jobs=self.n_jobs) - # break #? - if self.verbose > 3: - print("[Completed] Baseline cross-validation for training set [percentile={}, estimator_params={}]".format(pctl, params), file=self.log) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning) - baseline_rci_weights = getattr(estimator.fit(X_query, self.y_), self.feature_weight_attribute) - if np.all(baseline_rci_weights == 0): - if self.verbose > 2: - print("Excluding results from [percentile={}, estimator_params={}] becaue baseline model could not be fit with parameter set".format(pctl, params), file=self.log) - else: + # break #? + if self.verbose > 3: + print("[Completed] Baseline cross-validation for training set [percentile={}, estimator_params={}]".format(pctl, params), file=self.log) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + baseline_rci_weights = getattr(estimator.fit(X_query, self.y_), self.feature_weight_attribute) + if np.all(baseline_rci_weights == 0): + if self.verbose > 2: + print("Excluding results from [percentile={}, estimator_params={}] becaue baseline model could not be fit with parameter set".format(pctl, params), file=self.log) + else: - baseline_testing_score = np.nan - if self.testing_set_provided: - # Transform features (if transformation = None, then there is no transformation) - X_testing_query = transform(X=X_testing.loc[:,current_features_for_percentile], method=self.transformation, multiplicative_replacement=self.multiplicative_replacement, axis=1) - baseline_testing_score = self.scorer(estimator=estimator, X=X_testing_query, y_true=y_testing) - if self.verbose > 3: - print("[Completed] Baseline cross-validation for testing set [percentile={}, estimator_params={}]".format(pctl, params), file=self.log) - baseline_rci_weights = format_weights(baseline_rci_weights) - self.results_baseline_[(pctl,"baseline", params)] = { - "testing_score":baseline_testing_score, - "average_score":np.nanmean(baseline_scores_for_percentile), - "sem":stats.sem(baseline_scores_for_percentile), - "number_of_features":X_query.shape[1], - "features":X_query.columns.tolist(), - "clairvoyance_weights":np.nan, - "rci_weights":baseline_rci_weights, - "estimator":estimator, - } - - # Minimum score thresholds - for s in sorted(feature_ordering_from_minimum_scores): - progress_message = {True:"Recursive feature inclusion [percentile={}, estimator_params={}, minimum_score={}]".format(pctl, params, s), False:None}[self.verbose > 1] - rci_params = dict( - estimator=estimator, - X=self.X_initial_.loc[:,current_features_for_percentile], - y=self.y_, - cv=self.cv_splits_, - minimum_score=s, - metric=np.nanmean, - early_stopping=self.early_stopping, - minimum_improvement_in_score=self.minimum_improvement_in_score, - additional_feature_penalty=self.additional_feature_penalty, - maximum_number_of_features=self.maximum_number_of_features, - target_score=-np.inf, - less_features_is_better=less_features_is_better, - progress_message=progress_message, - ) + baseline_testing_score = np.nan if self.testing_set_provided: - rci_params.update( - dict( - X_testing=X_testing.loc[:,current_features_for_percentile], - y_testing=y_testing, - ) + # Transform features (if transformation = None, then there is no transformation) + X_testing_query = transform(X=X_testing.loc[:,current_features_for_percentile], method=self.transformation, multiplicative_replacement=self.multiplicative_replacement, axis=1) + baseline_testing_score = self.scorer(estimator=estimator, X=X_testing_query, y_true=y_testing) + if self.verbose > 3: + print("[Completed] Baseline cross-validation for testing set [percentile={}, estimator_params={}]".format(pctl, params), file=self.log) + baseline_rci_weights = format_weights(baseline_rci_weights) + self.results_baseline_[(pctl,"baseline", params)] = { + "testing_score":baseline_testing_score, + "average_score":np.nanmean(baseline_scores_for_percentile), + "sem":stats.sem(baseline_scores_for_percentile), + "number_of_features":X_query.shape[1], + "features":X_query.columns.tolist(), + "clairvoyance_weights":np.nan, + "rci_weights":baseline_rci_weights, + "estimator":estimator, + } + + # Minimum score thresholds + for s in sorted(feature_ordering_from_minimum_scores): + progress_message = {True:"Recursive feature inclusion [percentile={}, estimator_params={}, minimum_score={}]".format(pctl, params, s), False:None}[self.verbose > 1] + rci_params = dict( + estimator=estimator, + X=self.X_initial_.loc[:,current_features_for_percentile], + y=self.y_, + cv=self.cv_splits_, + minimum_score=s, + metric=np.nanmean, + early_stopping=self.early_stopping, + minimum_improvement_in_score=self.minimum_improvement_in_score, + additional_feature_penalty=self.additional_feature_penalty, + maximum_number_of_features=self.maximum_number_of_features, + target_score=-np.inf, + less_features_is_better=less_features_is_better, + progress_message=progress_message, ) + if self.testing_set_provided: + rci_params.update( + dict( + X_testing=X_testing.loc[:,current_features_for_percentile], + y_testing=y_testing, + ) + ) - rci_history = model.recursive_feature_inclusion(**rci_params) - - - - #! - # Update weights if applicable - if model.best_score_ > best_score_for_percentile: - best_score_for_percentile = model.best_score_ - best_hyperparameters_for_percentile = params - best_minimum_score_for_percentile = s - best_clairvoyance_feature_weights_for_percentile = model.clairvoyance_feature_weights_.copy() - - # Store results - rci_feature_weights = model.rci_feature_weights_[model.best_features_] - if not np.any(rci_feature_weights.isnull()) and np.any(rci_feature_weights > 0): - self.results_[(pctl, params, s)] = { - "testing_score":model.best_estimator_testing_score_, - "average_score":model.best_score_, - "sem":model.best_estimator_sem_, - "number_of_features":len(model.best_features_), - "features":list(model.best_features_), - "clairvoyance_weights":model.clairvoyance_feature_weights_[model.best_features_].values.tolist(), - "rci_weights":rci_feature_weights.values.tolist(), - "estimator":estimator, - } - else: - if self.verbose > 2: - print("Excluding results from [percentile={}, estimator_params={}, minimum_score={}] becaue model could not be fit with parameter set".format(pctl, params, s), file=self.log) - self.history_[(pctl,params, s)] = rci_history + rci_history = model.recursive_feature_addition(**rci_params) - if self.verbose > 2: - print("[End] recursive feature inclusion [percentile={}, estimator_params={}]".format(pctl, params), file=self.log) - # Reset estimator - model.estimators_[params] = clone(estimator) + + # Update feature weights + # if not np.all(pd.isnull(model.rci_feature_weights_)): + feature_weights_ok = model.status_ok_ + #! + # Update weights if applicable + # if pd.notnull(model.best_score_): + model_score_ok = pd.notnull(model.best_score_) + + if all([feature_weights_ok, model_score_ok]): + rci_feature_weights = model.rci_feature_weights_[model.best_features_] + + if model.best_score_ > best_score_for_percentile: + best_score_for_percentile = model.best_score_ + best_hyperparameters_for_percentile = params + best_minimum_score_for_percentile = s + best_clairvoyance_feature_weights_for_percentile = model.clairvoyance_feature_weights_.copy() + + + + if np.any(rci_feature_weights > 0): + self.results_[(pctl, params, s)] = { + "testing_score":model.best_estimator_testing_score_, + "average_score":model.best_score_, + "sem":model.best_estimator_sem_, + "number_of_features":len(model.best_features_), + "features":list(model.best_features_), + "clairvoyance_weights":model.clairvoyance_feature_weights_[model.best_features_].values.tolist(), + "rci_weights":rci_feature_weights.values.tolist(), + "estimator":estimator, + } + else: + if self.verbose > 2: + print("Excluding results from [percentile={}, estimator_params={}, minimum_score={}] because model could not be fit with parameter set".format(pctl, params, s), file=self.log) + self.history_[(pctl,params, s)] = rci_history + + if self.verbose > 2: + print("[End] recursive feature inclusion [percentile={}, estimator_params={}]".format(pctl, params), file=self.log) + + # Reset estimator + model.estimators_[params] = clone(estimator) + else: + if self.verbose > 1: + print("Excluding results from [percentile={}, estimator_params={}, minimum_score={}] failed the following checks:\n * feature_weights_ok = {}\n * model_score_ok = {}".format(pctl, params, s, feature_weights_ok, model_score_ok), file=self.log) + # Get new features if i < len(self.percentiles) - 1: - if self.remove_zero_weighted_features: #! Used to be exclude_zero_weighted_features which had a separate functionality from removing zero weighted features in the models. Keep eye on this - nonzero_weights = best_clairvoyance_feature_weights_for_percentile[lambda x: x > 0] - current_features_for_percentile = best_clairvoyance_feature_weights_for_percentile[lambda w: w >= np.percentile(nonzero_weights, q=100*self.percentiles[i+1])].sort_values(ascending=False).index + if np.all(pd.isnull(best_clairvoyance_feature_weights_for_percentile)): + if self.verbose > 0: + print("Terminating algorithm. All weights returned as NaN.", file=self.log) + self.status_ok_ = False + break else: - current_features_for_percentile = best_clairvoyance_feature_weights_for_percentile[lambda w: w >= np.percentile(best_clairvoyance_feature_weights_for_percentile, q=100*self.percentiles[i+1])].sort_values(ascending=False).index + if self.remove_zero_weighted_features: #! Used to be exclude_zero_weighted_features which had a separate functionality from removing zero weighted features in the models. Keep eye on this + nonzero_weights = best_clairvoyance_feature_weights_for_percentile[lambda x: x > 0] + self._debug2 = { + "model":model, + "best_score_for_percentile":best_score_for_percentile, + "best_hyperparameters_for_percentile":best_hyperparameters_for_percentile, + "best_minimum_score_for_percentile":best_minimum_score_for_percentile, + "best_clairvoyance_feature_weights_for_percentile":best_clairvoyance_feature_weights_for_percentile, + } + + current_features_for_percentile = best_clairvoyance_feature_weights_for_percentile[lambda w: w >= np.percentile(nonzero_weights, q=100*self.percentiles[i+1])].sort_values(ascending=False).index + else: + current_features_for_percentile = best_clairvoyance_feature_weights_for_percentile[lambda w: w >= np.percentile(best_clairvoyance_feature_weights_for_percentile, q=100*self.percentiles[i+1])].sort_values(ascending=False).index else: if self.verbose > 0: print("Terminating algorithm. Last percentile has been processed.", file=self.log) @@ -2171,32 +1955,38 @@ def fit( print("Terminating algorithm. Only 1 feature remains.", file=self.log) break - self.results_ = pd.DataFrame(self.results_).T.sort_values(["testing_score", "average_score", "number_of_features", "sem"], ascending=[False, False,less_features_is_better, True]) - self.results_.index.names = ["percentile", "hyperparameters", "minimum_score"] - - self.results_ = self.results_.loc[:,["testing_score", "average_score", "sem", "number_of_features", "features", "clairvoyance_weights", "rci_weights", "estimator"]] - - # Dtypes - for field in ["testing_score", "average_score", "sem"]: - self.results_[field] = self.results_[field].astype(float) - for field in ["number_of_features"]: - self.results_[field] = self.results_[field].astype(int) - - # Remove redundancy - if remove_redundancy: - unique_results = set() - unique_index = list() - for idx, row in pv(self.results_.iterrows(), "Removing duplicate results", total=self.results_.shape[0]): - id_unique = frozenset([row["average_score"], row["sem"], tuple(row["features"])]) - if id_unique not in unique_results: - unique_results.add(id_unique) - unique_index.append(idx) - else: - if self.verbose > 2: - print("Removing duplicate result: {}".format(id_unique), file=self.log) - self.results_ = self.results_.loc[unique_index] - self.results_baseline_ = pd.DataFrame(self.results_baseline_).T - self.results_baseline_.index.names = ["percentile", "hyperparameters", "minimum_score"] + self.results_ = pd.DataFrame(self.results_).T + + if self.status_ok_: + self.results_ = self.results_.sort_values(["testing_score", "average_score", "number_of_features", "sem"], ascending=[False, False,less_features_is_better, True]) + self.results_.index.names = ["percentile", "hyperparameters", "minimum_score"] + + self.results_ = self.results_.loc[:,["testing_score", "average_score", "sem", "number_of_features", "features", "clairvoyance_weights", "rci_weights", "estimator"]] + + # Dtypes + for field in ["testing_score", "average_score", "sem"]: + self.results_[field] = self.results_[field].astype(float) + for field in ["number_of_features"]: + self.results_[field] = self.results_[field].astype(int) + + # Remove redundancy + if remove_redundancy: + unique_results = set() + unique_index = list() + for idx, row in pv(self.results_.iterrows(), "Removing duplicate results", total=self.results_.shape[0]): + id_unique = frozenset([row["average_score"], row["sem"], tuple(row["features"])]) + if id_unique not in unique_results: + unique_results.add(id_unique) + unique_index.append(idx) + else: + if self.verbose > 2: + print("Removing duplicate result: {}".format(id_unique), file=self.log) + self.results_ = self.results_.loc[unique_index] + self.results_baseline_ = pd.DataFrame(self.results_baseline_).T + self.results_baseline_.index.names = ["percentile", "hyperparameters", "minimum_score"] + else: + if self.verbose > 0: + print("All models failed for parameter set and data input", file=self.log) self.is_fitted_rci = True return self @@ -2258,8 +2048,8 @@ def plot_scores_comparison( self, **kwargs, ): - assert self.is_fitted_rci, "Please run `recursive_feature_inclusion` before proceeding." - assert self.testing_set_provided, "Please run `recursive_feature_inclusion` with a testing set before proceeding." + assert self.is_fitted_rci, "Please run `recursive_feature_addition` before proceeding." + assert self.testing_set_provided, "Please run `recursive_feature_addition` with a testing set before proceeding." df = self.get_history(summary=True) number_of_features = df["number_of_features"] average_scores = df["average_score"] @@ -2276,156 +2066,10 @@ def from_file(cls, path:str): cls = read_object(path) return cls -def main(): - s = """ -_______ _______ _____ ______ _ _ _____ __ __ _______ __ _ _______ _______ -| | |_____| | |_____/ \ / | | \_/ |_____| | \ | | |______ -|_____ |_____ | | __|__ | \_ \/ |_____| | | | | \_| |_____ |______ - """ - print(s, file=sys.stderr) - print("Hello There.\nI live here: https://github.com/jolespin/clairvoyance", file=sys.stderr) - if len(sys.argv) > 0: - if sys.argv[1] in {"--test", "-t"}: - from sklearn.linear_model import LogisticRegression - from sklearn.tree import DecisionTreeClassifier - - # 1. - - # Classification - print(format_header("1. Running test for `ClairvoyanceClassification` on iris dataset with noise"), file=sys.stderr) - import numpy as np - import pandas as pd - from sklearn.datasets import load_iris - from sklearn.linear_model import LogisticRegression - - # Load iris dataset - X, y = load_iris(return_X_y=True, as_frame=True) - X.columns = X.columns.map(lambda j: j.split(" (cm")[0].replace(" ","_")) - - # Relabel targets - target_names = load_iris().target_names - y = y.map(lambda i: target_names[i]) - - # Add 21 noise features (total = 25 features) in the same range of values as the original features - number_of_noise_features = 21 - vmin = X.values.ravel().min() - vmax = X.values.ravel().max() - X_noise = pd.DataFrame( - data=np.random.RandomState(0).randint(low=int(vmin*10), high=int(vmax*10), size=(150, number_of_noise_features))/10, - columns=map(lambda j:"noise_{}".format(j+1), range(number_of_noise_features)), - ) - - X_iris_with_noise = pd.concat([X, X_noise], axis=1) - X_normalized = X_iris_with_noise - X_iris_with_noise.mean(axis=0).values - X_normalized = X_normalized/X_normalized.std(axis=0).values - - # Specify model algorithm and parameter grid - estimator=LogisticRegression(max_iter=1000, solver="liblinear", multi_class="ovr") - param_grid={ - "C":[1e-10] + (np.arange(1,11)/10).tolist(), - "penalty":["l1", "l2"], - } - - # Instantiate model - clf = ClairvoyanceClassification( - n_jobs=-1, - scorer="accuracy", - n_draws=10, - estimator=estimator, - param_grid=param_grid, - verbose=1, - ) - clf.fit(X_normalized, y)#, sort_hyperparameters_by=["C", "penalty"], ascending=[True, False]) - history = clf.recursive_feature_inclusion(early_stopping=10) - print(history.head(), file=sys.stdout) - - # 2. - # Regression - print(format_header("2. Running test for `ClairvoyanceRegression` on boston dataset with noise"), file=sys.stderr) - # from sklearn.datasets import fetch_openml - from sklearn.tree import DecisionTreeRegressor - from sklearn.model_selection import train_test_split - - # Load housing data - # housing = fetch_openml(name="house_prices", as_frame=True) - data_url = "http://lib.stat.cmu.edu/datasets/boston" - raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None) - data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]) - target = raw_df.values[1::2, 2] - X = pd.DataFrame(data, columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']) - y = pd.Series(target) - - number_of_noise_features = 25 - X.shape[1] - X_noise = pd.DataFrame(np.random.RandomState(0).normal(size=(X.shape[0], number_of_noise_features)), columns=map(lambda j: f"noise_{j}", range(number_of_noise_features))) - X_housing_with_noise = pd.concat([X, X_noise], axis=1) - X_normalized = X_housing_with_noise - X_housing_with_noise.mean(axis=0).values - X_normalized = X_normalized/X_normalized.std(axis=0).values - - # Let's fit the model but leave a held out testing set - X_training, X_testing, y_training, y_testing = train_test_split(X_normalized, y, random_state=0, test_size=0.1618) - - # Get parameters - estimator = DecisionTreeRegressor(random_state=0) - param_grid = {"min_samples_leaf":[1,2,3,5,8],"min_samples_split":{ 0.1618, 0.382, 0.5, 0.618}} - - # Fit model - reg = ClairvoyanceRegression(name="Housing", n_jobs=-1, n_draws=10, estimator=estimator, param_grid=param_grid, verbose=1) - reg.fit(X_training, y_training) - history = reg.recursive_feature_inclusion(early_stopping=10, X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing) - print(history.head(), file=sys.stdout) - - # 3. - # Recursive - print(format_header("3. Running test for `ClairvoyanceRecursive` on iris dataset with noise"), file=sys.stderr) - from sklearn.tree import DecisionTreeClassifier - - X_normalized = X_iris_with_noise - X_iris_with_noise.mean(axis=0).values - X_normalized = X_normalized/X_normalized.std(axis=0).values - target_names = load_iris().target_names - y = pd.Series(load_iris().target) - y = y.map(lambda i: target_names[i]) - - # Specify model algorithm and parameter grid - estimator=DecisionTreeClassifier() - param_grid={ - "criterion":["gini","entropy"], - "max_features":["log2", "sqrt", 0.382, 0.618], - "min_samples_leaf":[1,2,3,5,8], - } - - # Instantiate model - rci = ClairvoyanceRecursive( - n_jobs=-1, - scorer="accuracy", - n_draws=10, - estimator=estimator, - param_grid=param_grid, - percentiles=[0.0, 0.25, 0.5, 0.75, 0.9, 0.925, 0.95, 0.975, 0.99], - minimum_scores=[-np.inf, 0.382], - verbose=0, - ) - rci.fit(X_normalized, y) - print(rci.results_.head(), file=sys.stdout) - - # 4. - print(format_header("4. Running test for `ClairvoyanceRecursive` on iris dataset with noise [includes testing data]"), file=sys.stderr) - - X_training, X_testing, y_training, y_testing = train_test_split(X_normalized, y, random_state=0, test_size=0.1618) - - # Instantiate model - rci = ClairvoyanceRecursive( - n_jobs=-1, - scorer="accuracy", - n_draws=10, - estimator=estimator, - param_grid=param_grid, - percentiles=[0.0, 0.25, 0.5, 0.75, 0.9, 0.925, 0.95, 0.975, 0.99], - minimum_scores=[-np.inf, 0.382], - verbose=0, - ) - rci.fit(X=X_training, y=y_training, X_testing=X_testing, y_testing=y_testing) - print(rci.results_.head(), file=sys.stdout) - else: - print("Unrecognized command. Available commands {--test|-t}", file=sys.stderr) -if __name__ == "__main__": - main() \ No newline at end of file +# BayesianClairvoyanceBase +# BayesianClairvoyanceClassification +# BayesianClairvoyanceRegression +# BayesianClairvoyanceRecursive +# Shap? +# feature addition or elimination +# * Algo is going to be hyperparameter tuning then feature selection, then hyperparameter tuning then feature selection, for n_iter times. Should be good after like 10. diff --git a/clairvoyance/transformations.py b/clairvoyance/transformations.py new file mode 100644 index 0000000..191c77b --- /dev/null +++ b/clairvoyance/transformations.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Built-ins +from ast import Or +import os, sys, time, warnings + +# PyData +import pandas as pd +import numpy as np + +# ============================================================================== +# Functions +# ============================================================================== +# Total sum scaling +def closure(X:pd.DataFrame): + sum_ = X.sum(axis=1) + return (X.T/sum_).T + +# Center Log-Ratio +def clr(X:pd.DataFrame): + X_tss = closure(X) + X_log = np.log(X_tss) + geometric_mean = X_log.mean(axis=1) + return (X_log - geometric_mean.values.reshape(-1,1)) + +def clr_with_multiplicative_replacement(X:pd.DataFrame, pseudocount="auto"): + if pseudocount == "auto": + if np.any(X == 0): + pseudocount = 1/X.shape[1]**2 + else: + pseudocount = None + if pseudocount is None: + pseudocount = 0.0 + X = X + pseudocount + return clr(X) + +# Transformations +def legacy_transform(X, method="closure", axis=1, multiplicative_replacement="auto", verbose=0, log=sys.stdout): +# """ +# Assumes X_data.index = Samples, X_data.columns = features +# axis = 0 = cols, axis = 1 = rows +# e.g. axis=1, method=closure: transform for relative abundance so each row sums to 1. +# "closure" = closure/total-sum-scaling +# "clr" = center log-ratio +# None = No transformation +# """ + # Transpose for axis == 0 + if axis == 0: + X = X.T + + # Base output + X_transformed = X.copy() + + # Checks + if X.ndim != 2: + raise ValueError("Input matrix must have two dimensions") + # if np.all(X == 0, axis=1).sum() > 0: + # raise ValueError("Input matrix cannot have rows with all zeros") + + # Transformed output + if method is not None: + if np.any(X.values < 0): + raise ValueError("Cannot have negative values") + if X.shape[1] > 1: + if method == "closure": + X_transformed = closure(X) + if method == "clr": + if multiplicative_replacement == "auto": + if not np.all(X > 0): + multiplicative_replacement = 1/X.shape[1]**2 + else: + multiplicative_replacement = None + if multiplicative_replacement is None: + multiplicative_replacement = 0 + assert isinstance(multiplicative_replacement, (float,int,np.floating,np.integer)) + X_transformed = clr(X + multiplicative_replacement) + else: + if verbose > 1: + print("Only 1 feature left. Ignoring transformation.", file=log) + + # Transpose back + if axis == 0: + X_transformed = X_transformed.T + + return X_transformed diff --git a/clairvoyance/utils.py b/clairvoyance/utils.py new file mode 100644 index 0000000..c575a11 --- /dev/null +++ b/clairvoyance/utils.py @@ -0,0 +1,475 @@ +# -*- coding: utf-8 -*- + +# Built-ins +from ast import Or +import os, sys, time, warnings, gzip, bz2, operator, pickle +from tqdm import tqdm, tqdm_notebook + +# PyData +import pandas as pd +import numpy as np +from scipy.stats import sem + +# Machine learning +from sklearn.base import clone, is_classifier, is_regressor +from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, StratifiedKFold, RepeatedKFold, KFold +from feature_engine.selection.base_selection_functions import get_feature_importances + + +# from shap import Explainer, TreeExplainer, LinearExplainer +# from shap.maskers import Independent + + +# ============================================================================== +# Checks from https://github.com/jolespin/soothsayer_utils +# ============================================================================== +def is_file_like(obj): + return hasattr(obj, "read") + +def is_nonstring_iterable(obj): + condition_1 = hasattr(obj, "__iter__") + condition_2 = not type(obj) == str + return all([condition_1,condition_2]) + +def assert_acceptable_arguments(query, target, operation="le", message="Invalid option provided. Please refer to the following for acceptable arguments:"): + """ + le: operator.le(a, b) : <= + eq: operator.eq(a, b) : == + ge: operator.ge(a, b) : >= + """ + # If query is not a nonstring iterable or a tuple + if any([ + not is_nonstring_iterable(query), + isinstance(query,tuple), + ]): + query = [query] + query = set(query) + target = set(target) + func_operation = getattr(operator, operation) + assert func_operation(query,target), "{}\n{}".format(message, target) + +def check_testing_set(features, X_testing, y_testing): + # Testing + X_testing_is_provided = X_testing is not None + y_testing_is_provided = y_testing is not None + + if X_testing_is_provided is not None: + assert y_testing_is_provided is not None, "If `X_testing` is provided then `y_testing` must be provided" + + if y_testing_is_provided is not None: + assert X_testing_is_provided is not None, "If `y_testing` is provided then `X_testing` must be provided" + + testing_set_provided = False + if all([X_testing_is_provided, y_testing_is_provided]): + assert np.all(X_testing.index == y_testing.index), "X_testing.index and y_testing.index must have the same ordering" + assert np.all(X_testing.columns == pd.Index(features)), "X_testing.columns and X.columns must have the same ordering" + testing_set_provided = True + return testing_set_provided + +# ============================================================================== +# Format +# ============================================================================== + +# Format file path +def format_path(path, into=str, absolute=False): + assert not is_file_like(path), "`path` cannot be file-like" + if hasattr(path, "absolute"): + path = str(path.absolute()) + if hasattr(path, "path"): + path = str(path.path) + if absolute: + path = os.path.abspath(path) + return into(path) + +# Format header for printing +def format_header(text, line_character="=", n=None): + if n is None: + n = len(text) + line = n*line_character + return "{}\n{}\n{}".format(line, text, line) + +# Get duration +def format_duration(t0): + """ + Adapted from @john-fouhy: + https://stackoverflow.com/questions/538666/python-format-timedelta-to-string + """ + duration = time.time() - t0 + hours, remainder = divmod(duration, 3600) + minutes, seconds = divmod(remainder, 60) + return "{:02}:{:02}:{:02}".format(int(hours), int(minutes), int(seconds)) + +# Format stratified data +def format_stratify(stratify, estimator_type:str, y:pd.Series): + assert_acceptable_arguments(estimator_type, {"classifier", "regressor"}) + if isinstance(stratify, bool): + if stratify is True: + assert estimator_type == "classifier", "If `stratify=True` then the estimator must be a classifier. Please provide a stratification grouping or select None for regressor." + stratify = "auto" + if stratify is False: + stratify = None + if isinstance(stratify, str): + assert stratify == "auto", "`stratify` must be 'auto', a pd.Series, or None" + if estimator_type == "classifier": + stratify = y.copy() + if estimator_type == "regressor": + stratify = None + + if stratify is not None: + assert isinstance(stratify, pd.Series) + assert np.all(stratify.index == y.index) + return stratify + + +def format_cross_validation(cv, X:pd.DataFrame, y:pd.Series, stratify=True, random_state=0, cv_prefix="cv=", training_column="training_index", testing_column="testing_index", return_type=tuple): + assert_acceptable_arguments({return_type}, (tuple, pd.DataFrame)) + if return_type == tuple: + assert np.all(X.index == y.index), "`X.index` and `y.index` must be the same ordering" + index = X.index + number_of_observations = len(index) + + # Simple stratified k-fold cross validation + if isinstance(cv, int): + splits = list() + labels = list() + + if isinstance(stratify, bool): + if stratify is True: + stratify = y.copy() + if stratify is False: + stratify = None + + if stratify is not None: + assert isinstance(stratify, pd.Series), "If `stratify` is not None, it must be a pd.Series" + assert np.all(y.index == stratify.index), "If `stratify` is not None then it must have the same index as `y`" + assert stratify.dtype != float, "`stratify` cannot be floating point values" + if y.dtype == float: + splitter = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=False if random_state is None else True).split(X, stratify, groups=stratify) + else: + splitter = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=False if random_state is None else True).split(X, y, groups=stratify) + else: + splitter = KFold(n_splits=cv, random_state=random_state, shuffle=False if random_state is None else True).split(X, y) + + for i, (training_index, testing_index) in enumerate(splitter, start=1): + id_cv = "{}{}".format(cv_prefix, i) + splits.append([training_index, testing_index]) + labels.append(id_cv) + + return (splits, labels) + + # Repeated stratified k-fold cross validation + if isinstance(cv, tuple): + assert len(cv) == 2, "If tuple is provided, it must be length 2" + assert map(lambda x: isinstance(x, int), cv), "If tuple is provided, both elements must be integers ≥ 2 for `RepeatedStratifiedKFold`" + + splits = list() + labels = list() + + if isinstance(stratify, bool): + if stratify is True: + stratify = y.copy() + if stratify is False: + stratify = None + + if stratify is not None: + assert isinstance(stratify, pd.Series), "If `stratify` is not None, it must be a pd.Series" + assert np.all(y.index == stratify.index), "If `stratify` is not None then it must have the same index as `y`" + assert stratify.dtype != float, "`stratify` cannot be floating point values" + if y.dtype == float: + splitter = RepeatedStratifiedKFold(n_splits=cv[0], n_repeats=cv[1], random_state=random_state).split(X, stratify, groups=stratify) + else: + splitter = RepeatedStratifiedKFold(n_splits=cv[0], n_repeats=cv[1], random_state=random_state).split(X, y, groups=stratify) + else: + splitter = RepeatedKFold(n_splits=cv[0], n_repeats=cv[1], random_state=random_state).split(X, y) + + for i, (training_index, testing_index) in enumerate(splitter, start=1): + id_cv = "{}{}".format(cv_prefix, i) + splits.append([training_index, testing_index]) + labels.append(id_cv) + + return (splits, labels) + + # List + if isinstance(cv, (list, np.ndarray)): + assert all(map(lambda query: len(query) == 2, cv)), "If `cv` is provided as a list, each element must be a sub-list of 2 indicies (i.e., training and testing indicies)" + cv = pd.DataFrame(cv, columns=[training_column, testing_column]) + cv.index = cv.index.map(lambda i: "{}{}".format(cv_prefix, i)) + + # DataFrame + if isinstance(cv, pd.DataFrame): + cv = cv.copy() + assert training_column in cv.columns + assert testing_column in cv.columns + assert np.all(cv[training_column].map(lambda x: isinstance(x, (list, np.ndarray)))), "`{}` must be either list or np.ndarray of indices".format(training_column) + assert np.all(cv[testing_column].map(lambda x: isinstance(x, (list, np.ndarray)))), "`{}` must be either list or np.ndarray of indices".format(testing_column) + + index_values = flatten(cv.values, into=list) + query = index_values[0] + labels = list(cv.index) + if isinstance(query, (int, np.integer)): + assert all(map(lambda x: isinstance(x,(int, np.integer)), index_values)), "If `cv` is provided as a list or pd.DataFrame, all values must either be intger values or keys in the X.index" + assert np.all(np.unique(index_values) >= 0), "All values in `cv` must be positive integers" + maxv = max(index_values) + assert maxv < number_of_observations, "Index values in `cv` cannot exceed ({}) the number of observations ({}).".format(maxv, number_of_observations) + else: + missing_keys = set(index_values) - set(index) + assert len(missing_keys) == 0, "If index values in `cv` are keys and not integers, then all keys must be in `X.index`. Missing keys: {}".format(missing_keys) + cv = cv.applymap(lambda observations: list(map(lambda x: X.index.get_loc(x), observations))) + cv = cv.applymap(np.asarray) + splits = cv.values.tolist() + return (splits, labels) + if return_type == pd.DataFrame: + splits, labels = format_cross_validation(cv=cv, X=X, y=y, stratify=stratify, random_state=random_state, cv_prefix=cv_prefix, training_column=training_column, testing_column=testing_column, return_type=tuple) + return pd.DataFrame(splits, index=labels, columns=[training_column, testing_column]) + + +# I/O +# ==== +# Reading serial object +def read_object(path, compression="infer", serialization_module=pickle): + path = format_path(path, str) + + if compression == "infer": + _ , ext = os.path.splitext(path) + if (ext == ".pkl") or (ext == ".dill"): + compression = None + if ext in {".pgz", ".gz"}: + compression = "gzip" + if ext in {".pbz2", ".bz2"}: + compression = "bz2" + if compression is not None: + if compression == "gzip": + f = gzip.open(path, "rb") + if compression == "bz2": + f = bz2.open(path, "rb") + else: + f = open(path, "rb") + obj = serialization_module.load(f) + f.close() + return obj + +# Writing serial object +def write_object(obj, path, compression="infer", serialization_module=pickle, protocol=None, *args): + """ + Extensions: + pickle ==> .pkl + dill ==> .dill + gzipped-pickle ==> .pgz + bzip2-pickle ==> .pbz2 + """ + assert obj is not None, "Warning: `obj` is NoneType" + path = format_path(path, str) + + # Use infer_compression here + if compression == "infer": + _ , ext = os.path.splitext(path) + if ext in {".pkl", ".dill"}: # if ext in (ext == ".pkl") or (ext == ".dill"): + compression = None + if ext in {".pgz", ".gz"}: + compression = "gzip" + if ext in {".pbz2", ".bz2"}: + compression = "bz2" + if compression is not None: + if compression == "bz2": + f = bz2.BZ2File(path, "wb") + if compression == "gzip": + f = gzip.GzipFile(path, "wb") + else: + f = open(path, "wb") + serialization_module.dump(obj, f, protocol=protocol, *args) + f.close() + +# Misc +# ==== +# Wrapper for tqdm +def pv(iterable, description=None, version=None, total=None, unit='it'): + """ + Progress viewer + Wrapper for `tqdm`: + https://github.com/tqdm/tqdm + """ + assert_acceptable_arguments([version], {None, "gui", "notebook"}) + func = tqdm + if version == "notebook": + func = tqdm_notebook + if version == "gui": + func = tqdm_gui + + return tqdm( + iterable, + desc=description, + total=total, + unit=unit, + ) + +# Flatten nested iterables +def flatten(nested_iterable, into=list, unique=False, **kwargs_iterable): + # Adapted from @wim: + # https://stackoverflow.com/questions/16312257/flatten-an-iterable-of-iterables + def _func_recursive(nested_iterable): + for x in nested_iterable: + if is_nonstring_iterable(x): + for element in flatten(x): + yield element + else: + yield x + # Unpack data + data_flattened = list(_func_recursive(nested_iterable)) + if unique: + data_flattened = sorted(set(data_flattened)) + # Convert type + return into(data_flattened, **kwargs_iterable) + + +def get_balanced_class_subset(y:pd.Series, size:float=0.1, random_state=None): + n = y.size + number_of_classes = y.nunique() + if isinstance(size, float): + number_of_samples_in_subset = int(n * size) + else: + number_of_samples_in_subset = n + subset_size_per_class = int(number_of_samples_in_subset/number_of_classes) + + index = list() + for id_class in y.unique(): + class_samples = y[lambda x: x == id_class].index + assert len(class_samples) >= subset_size_per_class + subset = np.random.RandomState(random_state).choice(class_samples, size=subset_size_per_class, replace=False) + index += subset.tolist() + return index + + + + +# =================== +# Feature importances: +# =================== +def format_weights(W, scale:bool=True): # Deprecate? + # with warnings.catch_warnings(): + # warnings.filterwarnings("ignore", category=ConvergenceWarning) + # warnings.filterwarnings("ignore", category=UndefinedMetricWarning) + # warnings.filterwarnings("ignore", category=UserWarning) + + W = np.abs(W) + if W.ndim > 1: + W = np.sum(W, axis=0) + if scale: + W = W/W.sum() + return W + + +# Get feature importance attribute from estimator +def get_feature_importance_attribute(estimator, importance_getter="auto"): # Deprecate? + estimator = clone(estimator) + _X = np.random.normal(size=(5,2)) + if is_classifier(estimator): + _y = np.asarray(list("aabba")) + if is_regressor(estimator): + _y = np.asarray(np.random.normal(size=5)) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + estimator.fit(_X,_y) + if importance_getter == "auto": + importance_getter = None + if hasattr(estimator, "coef_"): + importance_getter = "coef_" + if hasattr(estimator, "feature_importances_"): + importance_getter = "feature_importances_" + assert importance_getter is not None, "If `importance_getter='auto'`, `estimator` must be either a linear model with `.coef_` or a tree-based model with `.feature_importances_`" + assert hasattr(estimator, importance_getter), "Fitted estimator does not have feature weight attribute: `{}`".format(importance_getter) + return importance_getter + +# def shap_importances(estimator, X, explainer_type="auto", masker=None, **explainer_kws): +# """ +# Modified from Source: +# https://github.com/cerlymarco/shap-hypetune + +# DOES NOT WORK WITH BASE LEARNERS + +# Extract feature importances using Shap + +# explainer_type: +# tree, linear +# Returns +# ------- +# array of feature importances. +# """ +# if explainer_type == "auto": +# importance_getter = get_feature_importance_attribute(estimator, importance_getter="auto") +# if importance_getter == "feature_importances_": +# explainer_type = "tree" +# if importance_getter == "coef_": +# explainer_type = "linear" + +# if explainer_type == "tree": +# explainer = TreeExplainer( +# estimator, +# data=X, +# feature_perturbation="tree_path_dependent", +# **explainer_kws, +# ) +# if explainer_type == "linear": +# masker = Independent(data = X) +# explainer = LinearExplainer( +# estimator, +# masker=masker, +# **explainer_kws, +# # data=X, +# # feature_perturbation="interventional", +# ) + + +# coefs = explainer.shap_values(X) + +# if isinstance(coefs, list): +# coefs = list(map(lambda x: np.abs(x).mean(0), coefs)) +# # coefs = np.sum(coefs, axis=0) +# # else: +# # coefs = np.abs(coefs).mean(0) +# weights = format_weights(coefs) +# return weights + + +# def feature_importances(estimator, importance_getter="auto"): +# """ +# Extract feature importances from estimator + +# Returns +# ------- +# array of feature importances. +# """ +# importance_getter = get_feature_importance_attribute(estimator, importance_getter=importance_getter) +# if importance_getter == 'coef_': ## booster='gblinear' (xgb) +# # coefs = np.square(model.coef_).sum(axis=0) +# weights = format_weights(estimator.coef_) +# if importance_getter == 'feature_importances_': ## booster='gblinear' (xgb) +# weights = estimator.feature_importances_ + +# return weights + +# Feature importances from CV +def format_feature_importances_from_cv(cv_results, features): + # importance for each cross validation fold + feature_importances_cv = list() + for estimator in cv_results["estimator"]: + feature_importances_cv.append(get_feature_importances(estimator)) + feature_importances_mean = pd.Series(np.mean(feature_importances_cv, axis=0), index=features) + feature_importances_sem = pd.Series(sem(feature_importances_cv, axis=0), index=features) + + return { + "mean":feature_importances_mean, + "sem":feature_importances_sem, + } + +# Feature importances from data +def format_feature_importances_from_data(estimator, X, y): + # importance for each full dataset + estimator.fit(X,y) + feature_importances_mean = pd.Series(get_feature_importances(estimator), index=X.columns) + feature_importances_sem = pd.Series([np.nan]*X.shape[1], index=X.columns) + + return { + "mean":feature_importances_mean, + "sem":feature_importances_sem, + } diff --git a/clairvoyance/visuals.py b/clairvoyance/visuals.py new file mode 100644 index 0000000..85ad7c1 --- /dev/null +++ b/clairvoyance/visuals.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +# Built-ins +from ast import Or +import os, sys + +# PyData +import pandas as pd +import numpy as np + +## Plotting +import matplotlib.pyplot as plt +import seaborn as sns + +from .utils import * diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..eaacff9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +tqdm>=4.19 +pandas>=1.2.4, +numpy>=1.13 +scipy>=1.0 +scikit-learn>=1.0 +matplotlib>=2 +xarray>=0.10.3 +seaborn>=0.10.1 +optuna>=3.6.1 +feature_engine>=1.8.0 +#shap>=0.45.1 diff --git a/setup.py b/setup.py index a189a0d..9da64f0 100644 --- a/setup.py +++ b/setup.py @@ -1,13 +1,29 @@ -from setuptools import setup +from setuptools import setup, find_packages -# Version +from os import path + +script_directory = path.abspath(path.dirname(__file__)) + +package_name = "clairvoyance" version = None -with open("./clairvoyance/__init__.py", "r") as f: +with open(path.join(script_directory, package_name, '__init__.py')) as f: for line in f.readlines(): line = line.strip() if line.startswith("__version__"): version = line.split("=")[-1].strip().strip('"') -assert version is not None, "Check version in clairvoyance/__init__.py" +assert version is not None, f"Check version in {package_name}/__init__.py" + +with open(path.join(script_directory, 'README.md')) as f: + long_description = f.read() + +requirements = list() +with open(path.join(script_directory, 'requirements.txt')) as f: + for line in f.readlines(): + line = line.strip() + if line: + if not line.startswith("#"): + requirements.append(line) + setup(name='clairvoyance_feature_selection', version=version, @@ -17,18 +33,7 @@ author_email='jespinoz@jcvi.org', license='BSD-3', packages=["clairvoyance"], - install_requires=[ - 'pandas >= 1.2.4', - "numpy >= 1.13", - "xarray >= 0.10.3", - "matplotlib >= 2", - "seaborn >= 0.10.1", - "scipy >= 1.0", - "scikit-learn >= 1.0", - "soothsayer_utils >= 2022.6.24", - "tqdm >=4.19", - ], - include_package_data=True, + install_requires=requirements[::-1], scripts=['bin/clairvoyance_v1.py'], ) \ No newline at end of file