#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Runs the specified collection of feature extractors.
"""
import argparse
import csv
import pickle

import numpy as np
import pandas as pd

from src.feature_extraction.feature_extractors.character_length import CharacterLengthFE
from src.feature_extraction.feature_extractors.counter_fe import CounterFE
from src.feature_extraction.feature_extractors.feature_collector import FeatureCollector
from src.feature_extraction.feature_extractors.sentiment_fe import SentimentFE
from src.util import (COLUMN_CASHTAGS, COLUMN_HASHTAGS, COLUMN_LABEL,
                      COLUMN_MENTIONS, COLUMN_PHOTOS, COLUMN_REPLY_TO,
                      COLUMN_TWEET, COLUMN_TWEET_TOKENIZED, COLUMN_URLS)


def main():
    # set up the CLI
    parser = argparse.ArgumentParser(description = "Feature Extraction")
    parser.add_argument("input_file", help = "path to the input csv file")
    parser.add_argument("output_file", help = "path to the output pickle file")
    parser.add_argument("-e", "--export_file", help = "create a pipeline and export it to the given location", default = None)
    parser.add_argument("-i", "--import_file", help = "import an existing pipeline from the given location", default = None)
    args = parser.parse_args()

    df = pd.read_csv(args.input_file, quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")

    is_feature_collector_provided = args.import_file is not None
    if is_feature_collector_provided:
        feature_collector = get_feature_collector_from_file(args.import_file)
    else:
        feature_collector = create_and_fit_feature_collector(df)

    # apply the given FeatureCollector to the current data set;
    # this maps the pandas DataFrame to a numpy array
    feature_array = feature_collector.transform(df)

    # get the label array and reshape it into a column vector
    label_array = np.array(df[COLUMN_LABEL])
    label_array = label_array.reshape(-1, 1)

    # store the results
    results = {"features": feature_array, "labels": label_array,
               "feature_names": feature_collector.get_feature_names()}
    with open(args.output_file, 'wb') as f_out:
        pickle.dump(results, f_out)

    # export the FeatureCollector as a pickle file if desired by the user
    if args.export_file is not None:
        with open(args.export_file, 'wb') as f_out:
            pickle.dump(feature_collector, f_out)
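
# A minimal sketch of consuming the stored results downstream (the file name
# is hypothetical; the dictionary keys match the ones written above):
#
#     with open("data/train_features.pickle", "rb") as f_in:
#         results = pickle.load(f_in)
#     features = results["features"]  # feature matrix produced by transform()
#     labels = results["labels"]      # label array of shape (n_examples, 1)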


def get_feature_collector_from_file(filepath):
    """Load a previously fitted FeatureCollector from the given pickle file."""
    with open(filepath, "rb") as f_in:
        feature_collector = pickle.load(f_in)
    return feature_collector


def create_and_fit_feature_collector(df: pd.DataFrame):
    """Instantiate all feature extractors and fit them on the given data."""
    feature_extractors = instantiate_feature_extractors()
    feature_collector = FeatureCollector(feature_extractors)
    feature_collector.fit(df)  # df is assumed to be training data
    return feature_collector


def instantiate_feature_extractors():
    """Create the collection of feature extractors to run."""
    feature_extractors = []
    feature_extractors.append(CharacterLengthFE(COLUMN_TWEET))
    feature_extractors.append(SentimentFE(COLUMN_TWEET))

    # one CounterFE per column whose entries should simply be counted
    count_columns = [
        COLUMN_MENTIONS,
        COLUMN_PHOTOS,
        COLUMN_HASHTAGS,
        COLUMN_URLS,
        COLUMN_CASHTAGS,
        COLUMN_REPLY_TO,
        COLUMN_TWEET_TOKENIZED
    ]
    for count_column in count_columns:
        feature_extractors.append(CounterFE(count_column))
    return feature_extractors


if __name__ == "__main__":
    main()