forked from lbechberger/MLinPractice
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduce_dimensionality.py
75 lines (59 loc) · 2.91 KB
/
reduce_dimensionality.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Apply a dimensionality reduction technique.
"""
import argparse, pickle
from sklearn.feature_selection import SelectKBest, mutual_info_classif
def _selected_feature_names(selector, names):
    """Return the entries of ``names`` that ``selector`` kept.

    The boolean mask from ``selector.get_support()`` is paired positionally
    with ``names``; a name is returned when its mask entry is truthy.
    """
    return [name for name, keep in zip(names, selector.get_support()) if keep]


def main():
    """Run the dimensionality-reduction command-line tool.

    Reads a pickled dictionary with the keys "features", "labels" and
    "feature_names" from ``input_file``. The reducer is either unpickled from
    ``--import_file`` or created here as a ``SelectKBest`` scored with mutual
    information and fitted on the loaded data. The reducer is applied to the
    features, and a dictionary holding the reduced "features" and the
    unchanged "labels" is pickled to ``output_file``. When ``--export_file``
    is given, the reducer itself is pickled to that location as well.

    Raises:
        KeyError: if the input dictionary lacks one of the three keys.
    """
    # setting up CLI
    parser = argparse.ArgumentParser(description = "Dimensionality reduction")
    parser.add_argument("input_file", help = "path to the input pickle file")
    parser.add_argument("output_file", help = "path to the output pickle file")
    parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None)
    parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None)
    parser.add_argument("-m", "--mutual_information", type = int, help = "select K best features with Mutual Information", default = None)
    parser.add_argument("--verbose", action = "store_true", help = "print information about feature selection process")
    args = parser.parse_args()

    # load the data
    # NOTE: unpickling executes code embedded in the file; only pass files
    # that come from a trusted source (applies to --import_file as well).
    with open(args.input_file, 'rb') as f_in:
        input_data = pickle.load(f_in)

    features = input_data["features"]
    labels = input_data["labels"]
    feature_names = input_data["feature_names"]

    if args.import_file is not None:
        # simply import an already fitted dimensionality reducer
        with open(args.import_file, 'rb') as f_in:
            dim_red = pickle.load(f_in)
    else:
        # need to set things up manually: select K best based on Mutual
        # Information. A missing (or zero) -m value falls back to "all",
        # i.e. every feature is kept.
        k_param = args.mutual_information or "all"
        dim_red = SelectKBest(mutual_info_classif, k = k_param)
        # labels is flattened via .ravel() before fitting
        # (presumably a NumPy array -- confirm against the producer)
        dim_red.fit(features, labels.ravel())

        if args.verbose:
            # report the k that was actually used, not the raw CLI value
            print(" SelectKBest with Mutual Information and k = {0}".format(k_param))
            print(" {0}".format(feature_names))
            print(" " + str(dim_red.scores_))
            print(" " + str(_selected_feature_names(dim_red, feature_names)))

    # apply the dimensionality reduction to the given features
    reduced_features = dim_red.transform(features)

    # store the results
    output_data = {"features": reduced_features,
                   "labels": labels}
    with open(args.output_file, 'wb') as f_out:
        pickle.dump(output_data, f_out)

    # export the dimensionality reduction technique as pickle file if desired by user
    if args.export_file is not None:
        with open(args.export_file, 'wb') as f_out:
            pickle.dump(dim_red, f_out)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()