run_preprocessing.py (forked from lbechberger/MLinPractice)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Runs the specified collection of preprocessing steps
"""
import argparse, csv, pickle
import pandas as pd
from sklearn.pipeline import make_pipeline
from src.preprocessing.preprocessors.column_dropper import ColumnDropper
from src.preprocessing.preprocessors.non_english_remover import NonEnglishRemover
from src.preprocessing.preprocessors.punctuation_remover import PunctuationRemover
from src.preprocessing.preprocessors.tokenizer import Tokenizer
from src.util import COLUMN_TWEET, SUFFIX_TOKENIZED

def main():
    # set up the command line interface
    parser = argparse.ArgumentParser(description = "Various preprocessing steps")
    parser.add_argument("input_file", help = "path to the input csv file")
    parser.add_argument("output_file", help = "path to the output csv file")
    parser.add_argument("-p", "--punctuation", action = "store_true", help = "remove punctuation")
    parser.add_argument("-t", "--tokenize", action = "store_true", help = "tokenize given column into individual words")
    parser.add_argument("-o", "--other", action = "store_true", help = "remove non-English tweets and unnecessary columns")
    parser.add_argument("--tokenize_input", help = "input column to tokenize", default = COLUMN_TWEET)
    parser.add_argument("-e", "--export_file", help = "create a pipeline and export to the given location", default = None)
    args = parser.parse_args()

    # load the data; the mentions, photos, and urls columns contain Python list
    # literals, so they are parsed with eval
    df = pd.read_csv(args.input_file,
                     quoting = csv.QUOTE_NONNUMERIC,
                     lineterminator = "\n",
                     verbose = False,
                     dtype = {"quote_url": object, "place": object, "tweet": object, "language": object, "thumbnail": object},
                     converters = {"mentions": eval, "photos": eval, "urls": eval})

    # collect all preprocessors requested on the command line
    preprocessors = []
    if args.punctuation:
        preprocessors.append(PunctuationRemover())
    if args.tokenize:
        preprocessors.append(Tokenizer(args.tokenize_input, args.tokenize_input + SUFFIX_TOKENIZED))
    if args.other:
        preprocessors.append(NonEnglishRemover())
        DROP_COLS = [
            "id", "conversation_id", "created_at", "timezone", "user_id", "name", "place",
            "replies_count", "retweets_count", "likes_count", "language",
            # "cashtag": only a few records have this filled, so it might be useless
            # the columns below always have the same value for all records
            "retweet", "near", "geo", "source", "user_rt_id", "user_rt", "retweet_id",
            "retweet_date", "translate", "trans_src", 'trans_dest\r']
        preprocessors.append(ColumnDropper(DROP_COLS))

    # apply all preprocessing steps to the DataFrame
    for preprocessor in preprocessors:
        df = preprocessor.fit_transform(df)

    # store the results
    df.to_csv(args.output_file, index = False, quoting = csv.QUOTE_NONNUMERIC, line_terminator = "\n")

    # create a pipeline if necessary and store it as a pickle file
    if args.export_file is not None:
        pipeline = make_pipeline(*preprocessors)
        with open(args.export_file, 'wb') as f_out:
            pickle.dump(pipeline, f_out)


if __name__ == "__main__":
    main()
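
# A minimal sketch of how the exported pipeline could be reused on new data later on
# (illustrative only: the file paths below are made up, and the pickle can only be
# loaded in an environment where the same preprocessor classes are importable):
#
#     import csv
#     import pickle
#     import pandas as pd
#
#     with open("data/pipeline.pickle", "rb") as f_in:
#         pipeline = pickle.load(f_in)
#     new_df = pd.read_csv("data/new_tweets.csv", quoting = csv.QUOTE_NONNUMERIC, lineterminator = "\n")
#     new_df = pipeline.transform(new_df)   # apply the already-fitted preprocessing steps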