"""
#####################
Simple Python class for performing text pre-processing on a CSV file.
Developed as part of the BRAID project at the University of Sheffield.
Run with -h flag for a list of optional arguments.
#####################
"""
import os
import sys
import argparse
import re
from datetime import datetime
import pandas as pd
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
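
# Typical invocations (illustrative; the flags are defined in
# Preprocessing.parse_args, and data.csv is assumed to sit in the
# working directory):
#   python preprocessing.py                 # lowercase + number/punctuation removal
#   python preprocessing.py --stop          # additionally remove stopwords
#   python preprocessing.py --stop --lemma  # stopword removal plus lemmatization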


class Preprocessing:
    def __init__(self):
        self.date = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        self.output_dir = 'output'  # set up directory for output files
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)
        self.inputs = self.parse_args()
        self.df = pd.read_csv('data.csv', encoding='latin-1')
        self.df = self.df.fillna('')  # fill NaN with the empty string
        # column containing the text to pre-process
        self.column_name = 'Description: (Collection Details)/Description'
        self.all_tokens = {}  # token counts
        self.types = set()  # unique types

    def parse_args(self):
        """
        Parse command line arguments.
        """
        parser = argparse.ArgumentParser(
            description='Text pre-processing for the BRAID project. Pass in optional '
                        'arguments to apply stopword removal, stemming or lemmatization. '
                        'Stemming and lemmatization are mutually exclusive.')
        parser.add_argument(
            '--stop', action=argparse.BooleanOptionalAction, default=False,
            help='Applies stopword removal to the text.')
        # stemming and lemmatization are mutually exclusive
        xor_group = parser.add_mutually_exclusive_group(required=False)
        xor_group.add_argument(
            '--stem', action=argparse.BooleanOptionalAction, default=False,
            help='Applies stemming to the text. Cannot be used in conjunction with --lemma.')
        xor_group.add_argument(
            '--lemma', action=argparse.BooleanOptionalAction, default=False,
            help='Applies lemmatization to the text. Cannot be used in conjunction with --stem.')
        args = parser.parse_args()
        return args
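
    # For reference: `python preprocessing.py --stop --stem` yields
    # stop=True, stem=True, lemma=False, while passing both --stem and
    # --lemma makes argparse exit with a usage error.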

    def lowercase(self):
        """
        Lowercase the text.
        """
        self.df[self.column_name] = self.df[self.column_name].str.lower()

    def remove_punctuation(self):
        """
        Removes punctuation. In a second step, also collapses runs of
        whitespace into a single space.
        """
        # remove punctuation using regex - this pattern preserves hyphens
        # in the middle of words
        pattern = re.compile(r'(?<!\w)-(?!\w)|[^\w\s-]')
        self.df[self.column_name] = self.df[self.column_name].apply(
            lambda x: re.sub(pattern, '', str(x)))
        # collapse runs of two or more whitespace characters into one space
        multi_space_re = re.compile(r'\s{2,}')
        self.df[self.column_name] = self.df[self.column_name].str.replace(
            multi_space_re, ' ', regex=True)
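
    # Illustrative effect of remove_punctuation applied in isolation
    # (example input is invented):
    #   'well-known coins, c. 1530 - here!' -> 'well-known coins c 1530 here'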

    def remove_numbers(self):
        """
        Remove numerical digits from the text.
        """
        self.df[self.column_name] = self.df[self.column_name].apply(
            lambda x: re.sub(r'\d+', '', x))

    def remove_stopwords(self):
        """
        Remove common stoplist words.
        """
        download('stopwords')  # fetches the NLTK stopword list; skipped if already present
        stopwords_en = stopwords.words('english')

        def remove_stop_words(phrase):
            words = phrase.split()
            filtered = [w for w in words if w.lower() not in stopwords_en]
            return ' '.join(filtered)

        self.df[self.column_name] = self.df[self.column_name].apply(
            remove_stop_words)
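
    # Illustrative effect (the NLTK English list includes 'a', 'of', 'in',
    # 'the'): 'a hoard of coins in the field' -> 'hoard coins field'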

    def stem_words(self):
        """
        Perform stemming on words in the text.
        """
        stemmer = PorterStemmer()

        def stem_phrase(phrase):
            words = phrase.split()
            stemmed = [stemmer.stem(w) for w in words]
            return ' '.join(stemmed)

        self.df[self.column_name] = self.df[self.column_name].apply(
            stem_phrase)
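
    # Illustrative Porter stems: 'coins' -> 'coin', 'running' -> 'run';
    # stems are not always dictionary words, e.g. 'happiness' -> 'happi'.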

    def lemmatize_words(self):
        """
        Perform lemmatization on words in the text.
        """
        lemmatizer = WordNetLemmatizer()
        download('wordnet')  # newer NLTK releases may also need download('omw-1.4')

        def lemmatize_phrase(phrase):
            words = phrase.split()
            lemmatized = [lemmatizer.lemmatize(w) for w in words]
            return ' '.join(lemmatized)

        self.df[self.column_name] = self.df[self.column_name].apply(
            lemmatize_phrase)
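
    # Illustrative lemmas: 'coins' -> 'coin', but 'running' -> 'running',
    # since lemmatize() defaults to treating words as nouns; passing
    # pos='v' would yield 'run'.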

    def create_bow(self):
        """
        Create a bag-of-words representation of the text and output it to CSV.
        """
        self.bow = self.df[self.column_name].str.split(
            expand=True).stack().value_counts()
        self.bow.reset_index().to_csv(
            f'{self.output_dir}/{self.date}_bow.csv',
            header=['word', 'count'], index=False)
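
    # value_counts() sorts descending, so the CSV starts with the most
    # frequent words, e.g. rows like 'coin,412' (counts invented).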

    def count_types_and_tokens(self):
        """
        Counts the number of types and tokens in sentences.
        These are stored in the class variables self.types and self.all_tokens.
        """
        def count_tokens(sentence):
            """
            Helper function for counting tokens in a sentence.
            """
            tokens = sentence.split()  # split the sentence into tokens (words)
            for token in tokens:
                self.types.add(token)  # put unique tokens (types) into the set
                # if the token is not yet in the dictionary of all tokens,
                # add it; otherwise bump its count
                if token not in self.all_tokens:
                    self.all_tokens[token] = 1
                else:
                    self.all_tokens[token] += 1

        self.df[self.column_name].apply(count_tokens)
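
    # After this runs, len(self.types) is the vocabulary size and
    # sum(self.all_tokens.values()) is the total token count.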

    def main(self):
        self.lowercase()
        # numbers must be removed before punctuation removal to catch date
        # ranges (e.g. AD 1530-1) and serial numbers (e.g. 59-4-2-25)
        self.remove_numbers()
        self.remove_punctuation()
        if self.inputs.stop:
            self.remove_stopwords()
        if self.inputs.stem:
            self.stem_words()
        if self.inputs.lemma:
            self.lemmatize_words()
        self.create_bow()
        self.count_types_and_tokens()
        # write out pre-processed descriptions to CSV, recording the flags
        # used in the filename
        flags = '_'.join(sys.argv[1:])
        filename = f'{self.output_dir}/{self.date}{flags}.txt'
        self.df[self.column_name].to_csv(filename, index=True, header=False)


if __name__ == '__main__':
    preprocessing_instance = Preprocessing()
    preprocessing_instance.main()