# feature_selector.py (forked from WillKoehrsen/feature-selector)
# numpy and pandas for data manipulation
import pandas as pd
import numpy as np
# model used for feature importances
import lightgbm as lgb
# utility for early stopping with a validation set
from sklearn.model_selection import train_test_split
# visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# memory management
import gc
# utilities
from itertools import chain
class FeatureSelector():
"""
Class for performing feature selection for machine learning or data preprocessing.
Implements five different methods to identify features for removal
1. Find columns with a missing percentage greater than a specified threshold
2. Find columns with a single unique value
3. Find collinear variables with a correlation greater than a specified correlation coefficient
4. Find features with 0.0 feature importance from a gradient boosting machine (gbm)
5. Find low importance features that do not contribute to a specified cumulative feature importance from the gbm
Parameters
--------
data : dataframe
A dataset with observations in the rows and features in the columns
labels : array or series, default = None
Array of labels for training the machine learning model to find feature importances. These can be either binary labels
(if task is 'classification') or continuous targets (if task is 'regression').
If no labels are provided, then the feature importance based methods are not available.
Attributes
--------
ops : dict
Dictionary of operations run and features identified for removal
missing_stats : dataframe
The fraction of missing values for all features
record_missing : dataframe
The fraction of missing values for features with missing fraction above threshold
unique_stats : dataframe
Number of unique values for all features
record_single_unique : dataframe
Records the features that have a single unique value
corr_matrix : dataframe
All correlations between all features in the data
record_collinear : dataframe
Records the pairs of collinear variables with a correlation coefficient above the threshold
feature_importances : dataframe
All feature importances from the gradient boosting machine
record_zero_importance : dataframe
Records the zero importance features in the data according to the gbm
record_low_importance : dataframe
Records the lowest importance features not needed to reach the threshold of cumulative importance according to the gbm
Notes
--------
- All 5 operations can be run with the `identify_all` method.
- If using feature importances, one-hot encoding is used for categorical variables which creates new columns
"""
def __init__(self, data, labels=None):
# Dataset and optional training labels
self.data = data
self.labels = labels
if labels is None:
print('No labels provided. Feature importance based methods are not available.')
self.base_features = list(data.columns)
self.one_hot_features = None
# Dataframes recording information about features to remove
self.record_missing = None
self.record_single_unique = None
self.record_collinear = None
self.record_zero_importance = None
self.record_low_importance = None
self.missing_stats = None
self.unique_stats = None
self.corr_matrix = None
self.feature_importances = None
# Dictionary to hold removal operations
self.ops = {}
self.one_hot_correlated = False
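    # Illustrative usage sketch (comments only, not executed at import time).
    # `train` and `train_labels` are placeholders for the caller's own feature
    # dataframe and label array:
    #
    #   fs = FeatureSelector(data=train, labels=train_labels)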
def identify_missing(self, missing_threshold):
"""Find the features with a fraction of missing values above `missing_threshold`"""
self.missing_threshold = missing_threshold
# Calculate the fraction of missing in each column
missing_series = self.data.isnull().sum() / self.data.shape[0]
self.missing_stats = pd.DataFrame(missing_series).rename(columns = {'index': 'feature', 0: 'missing_fraction'})
# Sort with highest number of missing values on top
self.missing_stats = self.missing_stats.sort_values('missing_fraction', ascending = False)
# Find the columns with a missing percentage above the threshold
record_missing = pd.DataFrame(missing_series[missing_series > missing_threshold]).reset_index().rename(columns =
{'index': 'feature',
0: 'missing_fraction'})
to_drop = list(record_missing['feature'])
self.record_missing = record_missing
self.ops['missing'] = to_drop
        print('%d features with a missing fraction greater than %0.2f.\n' % (len(self.ops['missing']), self.missing_threshold))
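    # Example (illustrative; the threshold value is arbitrary):
    #
    #   fs.identify_missing(missing_threshold=0.6)
    #   missing_features = fs.ops['missing']   # columns with > 60% missing values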
def identify_single_unique(self):
"""Finds features with only a single unique value. NaNs do not count as a unique value. """
# Calculate the unique counts in each column
unique_counts = self.data.nunique()
self.unique_stats = pd.DataFrame(unique_counts).rename(columns = {'index': 'feature', 0: 'nunique'})
self.unique_stats = self.unique_stats.sort_values('nunique', ascending = True)
# Find the columns with only one unique count
record_single_unique = pd.DataFrame(unique_counts[unique_counts == 1]).reset_index().rename(columns = {'index': 'feature',
0: 'nunique'})
to_drop = list(record_single_unique['feature'])
self.record_single_unique = record_single_unique
self.ops['single_unique'] = to_drop
print('%d features with a single unique value.\n' % len(self.ops['single_unique']))
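    # Example (illustrative):
    #
    #   fs.identify_single_unique()
    #   single_unique = fs.ops['single_unique']   # columns with one unique value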
def identify_collinear(self, correlation_threshold, one_hot=False):
"""
Finds collinear features based on the correlation coefficient between features.
        For each pair of features with a correlation coefficient greater than `correlation_threshold`,
only one of the pair is identified for removal.
Using code adapted from: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
Parameters
--------
correlation_threshold : float between 0 and 1
            Value of the Pearson correlation coefficient for identifying correlated features
one_hot : boolean, default = False
Whether to one-hot encode the features before calculating the correlation coefficients
"""
self.correlation_threshold = correlation_threshold
self.one_hot_correlated = one_hot
# Calculate the correlations between every column
if one_hot:
# One hot encoding
features = pd.get_dummies(self.data)
self.one_hot_features = [column for column in features.columns if column not in self.base_features]
# Add one hot encoded data to original data
self.data_all = pd.concat([features[self.one_hot_features], self.data], axis = 1)
            corr_matrix = features.corr()
else:
corr_matrix = self.data.corr()
self.corr_matrix = corr_matrix
# Extract the upper triangle of the correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))
# Select the features with correlations above the threshold
# Need to use the absolute value
to_drop = [column for column in upper.columns if any(upper[column].abs() > correlation_threshold)]
# Dataframe to hold correlated pairs
record_collinear = pd.DataFrame(columns = ['drop_feature', 'corr_feature', 'corr_value'])
# Iterate through the columns to drop to record pairs of correlated features
for column in to_drop:
# Find the correlated features
corr_features = list(upper.index[upper[column].abs() > correlation_threshold])
# Find the correlated values
corr_values = list(upper[column][upper[column].abs() > correlation_threshold])
drop_features = [column for _ in range(len(corr_features))]
# Record the information (need a temp df for now)
temp_df = pd.DataFrame.from_dict({'drop_feature': drop_features,
'corr_feature': corr_features,
'corr_value': corr_values})
# Add to dataframe
            record_collinear = pd.concat([record_collinear, temp_df], ignore_index = True)
self.record_collinear = record_collinear
self.ops['collinear'] = to_drop
print('%d features with a correlation magnitude greater than %0.2f.\n' % (len(self.ops['collinear']), self.correlation_threshold))
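    # Example (illustrative; the threshold value is arbitrary):
    #
    #   fs.identify_collinear(correlation_threshold=0.975)
    #   correlated_features = fs.ops['collinear']
    #   fs.record_collinear.head()   # pairs of correlated features and their correlation values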
def identify_zero_importance(self, task, eval_metric=None,
n_iterations=10, early_stopping = True):
"""
Identify the features with zero importance according to a gradient boosting machine.
The gbm can be trained with early stopping using a validation set to prevent overfitting.
The feature importances are averaged over `n_iterations` to reduce variance.
Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html)
Parameters
--------
        task : string
            The machine learning task, either 'classification' or 'regression'
        eval_metric : string
            Evaluation metric to use for the gradient boosting machine for early stopping. Must be
            provided if `early_stopping` is True
n_iterations : int, default = 10
Number of iterations to train the gradient boosting machine
early_stopping : boolean, default = True
Whether or not to use early stopping with a validation set when training
Notes
--------
- Features are one-hot encoded to handle the categorical variables before training.
- The gbm is not optimized for any particular task and might need some hyperparameter tuning
- Feature importances, including zero importance features, can change across runs
"""
if early_stopping and eval_metric is None:
raise ValueError("""eval metric must be provided with early stopping. Examples include "auc" for classification or
"l2" for regression.""")
if self.labels is None:
raise ValueError("No training labels provided.")
# One hot encoding
features = pd.get_dummies(self.data)
self.one_hot_features = [column for column in features.columns if column not in self.base_features]
# Add one hot encoded data to original data
self.data_all = pd.concat([features[self.one_hot_features], self.data], axis = 1)
# Extract feature names
feature_names = list(features.columns)
# Convert to np array
features = np.array(features)
labels = np.array(self.labels).reshape((-1, ))
# Empty array for feature importances
feature_importance_values = np.zeros(len(feature_names))
print('Training Gradient Boosting Model\n')
# Iterate through each fold
for _ in range(n_iterations):
if task == 'classification':
model = lgb.LGBMClassifier(n_estimators=1000, learning_rate = 0.05, verbose = -1)
elif task == 'regression':
model = lgb.LGBMRegressor(n_estimators=1000, learning_rate = 0.05, verbose = -1)
else:
raise ValueError('Task must be either "classification" or "regression"')
# If training using early stopping need a validation set
if early_stopping:
train_features, valid_features, train_labels, valid_labels = train_test_split(features, labels, test_size = 0.15)
# Train the model with early stopping
                model.fit(train_features, train_labels, eval_metric = eval_metric,
                          eval_set = [(valid_features, valid_labels)],
                          callbacks = [lgb.early_stopping(100, verbose = False)])
# Clean up memory
gc.enable()
del train_features, train_labels, valid_features, valid_labels
gc.collect()
else:
model.fit(features, labels)
# Record the feature importances
feature_importance_values += model.feature_importances_ / n_iterations
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
# Sort features according to importance
feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)
# Normalize the feature importances to add up to one
feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])
# Extract the features with zero importance
record_zero_importance = feature_importances[feature_importances['importance'] == 0.0]
to_drop = list(record_zero_importance['feature'])
self.feature_importances = feature_importances
self.record_zero_importance = record_zero_importance
self.ops['zero_importance'] = to_drop
print('\n%d features with zero importance after one-hot encoding.\n' % len(self.ops['zero_importance']))
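    # Example (illustrative; assumes labels were passed to the constructor and a
    # binary classification task, so 'auc' is a reasonable eval_metric):
    #
    #   fs.identify_zero_importance(task='classification', eval_metric='auc',
    #                               n_iterations=10, early_stopping=True)
    #   zero_importance_features = fs.ops['zero_importance']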
def identify_low_importance(self, cumulative_importance):
"""
Finds the lowest importance features not needed to account for `cumulative_importance` fraction
of the total feature importance from the gradient boosting machine. As an example, if cumulative
importance is set to 0.95, this will retain only the most important features needed to
reach 95% of the total feature importance. The identified features are those not needed.
Parameters
--------
cumulative_importance : float between 0 and 1
The fraction of cumulative importance to account for
"""
self.cumulative_importance = cumulative_importance
# The feature importances need to be calculated before running
if self.feature_importances is None:
raise NotImplementedError("""Feature importances have not yet been determined.
Call the `identify_zero_importance` method first.""")
# Make sure most important features are on top
self.feature_importances = self.feature_importances.sort_values('cumulative_importance')
# Identify the features not needed to reach the cumulative_importance
record_low_importance = self.feature_importances[self.feature_importances['cumulative_importance'] > cumulative_importance]
to_drop = list(record_low_importance['feature'])
self.record_low_importance = record_low_importance
self.ops['low_importance'] = to_drop
print('%d features required for cumulative importance of %0.2f after one hot encoding.' % (len(self.feature_importances) -
len(self.record_low_importance), self.cumulative_importance))
print('%d features do not contribute to cumulative importance of %0.2f.\n' % (len(self.ops['low_importance']),
self.cumulative_importance))
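    # Example (illustrative; requires `identify_zero_importance` to have been run first):
    #
    #   fs.identify_low_importance(cumulative_importance=0.99)
    #   low_importance_features = fs.ops['low_importance']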
def identify_all(self, selection_params):
"""
Use all five of the methods to identify features to remove.
Parameters
--------
selection_params : dict
            Parameters to use in the five feature selection methods.
Params must contain the keys ['missing_threshold', 'correlation_threshold', 'eval_metric', 'task', 'cumulative_importance']
"""
# Check for all required parameters
for param in ['missing_threshold', 'correlation_threshold', 'eval_metric', 'task', 'cumulative_importance']:
if param not in selection_params.keys():
raise ValueError('%s is a required parameter for this method.' % param)
# Implement each of the five methods
self.identify_missing(selection_params['missing_threshold'])
self.identify_single_unique()
self.identify_collinear(selection_params['correlation_threshold'])
self.identify_zero_importance(task = selection_params['task'], eval_metric = selection_params['eval_metric'])
self.identify_low_importance(selection_params['cumulative_importance'])
# Find the number of features identified to drop
self.all_identified = set(list(chain(*list(self.ops.values()))))
self.n_identified = len(self.all_identified)
print('%d total features out of %d identified for removal after one-hot encoding.\n' % (self.n_identified,
self.data_all.shape[1]))
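    # Example (illustrative; parameter values are arbitrary):
    #
    #   fs.identify_all(selection_params={'missing_threshold': 0.6,
    #                                     'correlation_threshold': 0.98,
    #                                     'task': 'classification',
    #                                     'eval_metric': 'auc',
    #                                     'cumulative_importance': 0.99})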
def check_removal(self, keep_one_hot=True):
"""Check the identified features before removal. Returns a list of the unique features identified."""
self.all_identified = set(list(chain(*list(self.ops.values()))))
print('Total of %d features identified for removal' % len(self.all_identified))
if not keep_one_hot:
if self.one_hot_features is None:
print('Data has not been one-hot encoded')
else:
one_hot_to_remove = [x for x in self.one_hot_features if x not in self.all_identified]
print('%d additional one-hot features can be removed' % len(one_hot_to_remove))
return list(self.all_identified)
def remove(self, methods, keep_one_hot = True):
"""
Remove the features from the data according to the specified methods.
Parameters
--------
methods : 'all' or list of methods
If methods == 'all', any methods that have identified features will be used
Otherwise, only the specified methods will be used.
Can be one of ['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance']
keep_one_hot : boolean, default = True
Whether or not to keep one-hot encoded features
Return
--------
data : dataframe
Dataframe with identified features removed
Notes
--------
- If feature importances are used, the one-hot encoded columns will be added to the data (and then may be removed)
- Check the features that will be removed before transforming data!
"""
features_to_drop = []
if methods == 'all':
# Need to use one-hot encoded data as well
data = self.data_all
print('{} methods have been run\n'.format(list(self.ops.keys())))
# Find the unique features to drop
features_to_drop = set(list(chain(*list(self.ops.values()))))
else:
# Need to use one-hot encoded data as well
if 'zero_importance' in methods or 'low_importance' in methods or self.one_hot_correlated:
data = self.data_all
else:
data = self.data
# Iterate through the specified methods
for method in methods:
# Check to make sure the method has been run
if method not in self.ops.keys():
raise NotImplementedError('%s method has not been run' % method)
# Append the features identified for removal
else:
features_to_drop.append(self.ops[method])
# Find the unique features to drop
features_to_drop = set(list(chain(*features_to_drop)))
features_to_drop = list(features_to_drop)
if not keep_one_hot:
if self.one_hot_features is None:
print('Data has not been one-hot encoded')
else:
features_to_drop = list(set(features_to_drop) | set(self.one_hot_features))
# Remove the features and return the data
data = data.drop(columns = features_to_drop)
self.removed_features = features_to_drop
if not keep_one_hot:
print('Removed %d features including one-hot features.' % len(features_to_drop))
else:
print('Removed %d features.' % len(features_to_drop))
return data
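    # Example (illustrative): drop everything identified by all methods that have
    # been run, keeping any one-hot encoded columns that were not flagged:
    #
    #   train_removed = fs.remove(methods='all', keep_one_hot=True)
    #
    # Or remove only the features flagged by specific methods:
    #
    #   train_removed = fs.remove(methods=['missing', 'single_unique'])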
def plot_missing(self):
"""Histogram of missing fraction in each feature"""
if self.record_missing is None:
raise NotImplementedError("Missing values have not been calculated. Run `identify_missing`")
self.reset_plot()
# Histogram of missing values
        plt.style.use('seaborn-v0_8-white')
plt.figure(figsize = (7, 5))
plt.hist(self.missing_stats['missing_fraction'], bins = np.linspace(0, 1, 11), edgecolor = 'k', color = 'red', linewidth = 1.5)
plt.xticks(np.linspace(0, 1, 11));
plt.xlabel('Missing Fraction', size = 14); plt.ylabel('Count of Features', size = 14);
plt.title("Fraction of Missing Values Histogram", size = 16);
def plot_unique(self):
"""Histogram of number of unique values in each feature"""
if self.record_single_unique is None:
raise NotImplementedError('Unique values have not been calculated. Run `identify_single_unique`')
self.reset_plot()
# Histogram of number of unique values
self.unique_stats.plot.hist(edgecolor = 'k', figsize = (7, 5))
plt.ylabel('Frequency', size = 14); plt.xlabel('Unique Values', size = 14);
plt.title('Number of Unique Values Histogram', size = 16);
def plot_collinear(self, plot_all = False):
"""
Heatmap of the correlation values. If plot_all = True plots all the correlations otherwise
plots only those features that have a correlation above the threshold
Notes
--------
- Not all of the plotted correlations are above the threshold because this plots
        all the variables that have been identified as having even one correlation above the threshold
- The features on the x-axis are those that will be removed. The features on the y-axis
are the correlated features with those on the x-axis
Code adapted from https://seaborn.pydata.org/examples/many_pairwise_correlations.html
"""
if self.record_collinear is None:
            raise NotImplementedError('Collinear features have not been identified. Run `identify_collinear`.')
if plot_all:
corr_matrix_plot = self.corr_matrix
title = 'All Correlations'
else:
# Identify the correlations that were above the threshold
# columns (x-axis) are features to drop and rows (y_axis) are correlated pairs
corr_matrix_plot = self.corr_matrix.loc[list(set(self.record_collinear['corr_feature'])),
list(set(self.record_collinear['drop_feature']))]
title = "Correlations Above Threshold"
f, ax = plt.subplots(figsize=(10, 8))
# Diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with a color bar
sns.heatmap(corr_matrix_plot, cmap=cmap, center=0,
linewidths=.25, cbar_kws={"shrink": 0.6})
# Set the ylabels
ax.set_yticks([x + 0.5 for x in list(range(corr_matrix_plot.shape[0]))])
ax.set_yticklabels(list(corr_matrix_plot.index), size = int(160 / corr_matrix_plot.shape[0]));
# Set the xlabels
ax.set_xticks([x + 0.5 for x in list(range(corr_matrix_plot.shape[1]))])
ax.set_xticklabels(list(corr_matrix_plot.columns), size = int(160 / corr_matrix_plot.shape[1]));
plt.title(title, size = 14)
def plot_feature_importances(self, plot_n = 15, threshold = None):
"""
Plots `plot_n` most important features and the cumulative importance of features.
If `threshold` is provided, prints the number of features needed to reach `threshold` cumulative importance.
Parameters
--------
        plot_n : int, default = 15
            Number of most important features to plot. Defaults to 15 or the maximum number of features, whichever is smaller
        threshold : float between 0 and 1, default = None
            Threshold for printing information about cumulative importances
"""
if self.record_zero_importance is None:
            raise NotImplementedError('Feature importances have not been determined. Run `identify_zero_importance`')
# Need to adjust number of features if greater than the features in the data
if plot_n > self.feature_importances.shape[0]:
plot_n = self.feature_importances.shape[0] - 1
self.reset_plot()
# Make a horizontal bar chart of feature importances
plt.figure(figsize = (10, 6))
ax = plt.subplot()
# Need to reverse the index to plot most important on top
# There might be a more efficient method to accomplish this
ax.barh(list(reversed(list(self.feature_importances.index[:plot_n]))),
self.feature_importances['normalized_importance'][:plot_n],
align = 'center', edgecolor = 'k')
# Set the yticks and labels
ax.set_yticks(list(reversed(list(self.feature_importances.index[:plot_n]))))
ax.set_yticklabels(self.feature_importances['feature'][:plot_n], size = 12)
# Plot labeling
plt.xlabel('Normalized Importance', size = 16); plt.title('Feature Importances', size = 18)
plt.show()
# Cumulative importance plot
plt.figure(figsize = (6, 4))
plt.plot(list(range(1, len(self.feature_importances) + 1)), self.feature_importances['cumulative_importance'], 'r-')
plt.xlabel('Number of Features', size = 14); plt.ylabel('Cumulative Importance', size = 14);
plt.title('Cumulative Feature Importance', size = 16);
if threshold:
# Index of minimum number of features needed for cumulative importance threshold
# np.where returns the index so need to add 1 to have correct number
importance_index = np.min(np.where(self.feature_importances['cumulative_importance'] > threshold))
plt.vlines(x = importance_index + 1, ymin = 0, ymax = 1, linestyles='--', colors = 'blue')
plt.show();
print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold))
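    # Example (illustrative; requires `identify_zero_importance` to have been run first):
    #
    #   fs.plot_feature_importances(plot_n=15, threshold=0.99)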
    def reset_plot(self):
        """Reset the matplotlib rcParams to their defaults"""
        plt.rcParams.update(plt.rcParamsDefault)