-
Notifications
You must be signed in to change notification settings - Fork 0
/
message_parser.py
343 lines (255 loc) · 13.1 KB
/
message_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
import os
from tqdm import tqdm
from datetime import datetime
class WhatsAppParser:
    '''
    Description:
        Parses exported WhatsApp chat txt files and converts them into
        context / response data points built around one target user.
    '''

    def __init__(self,
                 username: str,
                 group_message_allowance: int = 1200,
                 answer_time_allowance: int = 10000,
                 max_context_message_length: int = 200,
                 debug: bool = False,
                 ):
        '''
        Description:
            Stores the parsing configuration; no files are read here.
        Params:
            - username (str): the username of the person to construct the answers to the prompts around
            - group_message_allowance (int): max number of seconds allowed between messages in order for them to be treated as a single message
            - answer_time_allowance (int): max number of seconds allowed between a context message and a candidate answer in order for them to be paired
            - max_context_message_length (int): the maximum number of characters allowed in a context string
            - debug (bool): if True then debugging messages (such as caught exceptions) will be printed to the terminal
        '''
        self.username = username
        self.group_message_allowance = group_message_allowance
        self.answer_time_allowance = answer_time_allowance
        self.max_context_message_length = max_context_message_length
        self.debug = debug
        # hard coded list of auto generated WhatsApp messages that should be ignored;
        # any message containing one of these substrings is dropped
        self.messages_to_ignore = [
            "image omitted",
            "missed voice call",
            "this message was deleted",
            "you deleted this message",
            "audio omitted",
            "gif omitted",
            "missed video call",
            "\u200e",
            "\u200d",
        ]

    def parse_folder(self, folder_path: str) -> list[dict]:
        '''
        Description:
            Parse a series of files, all located inside a folder
        Params:
            - folder_path (str): the global path to the folder containing the WhatsApp txt files
        Returns:
            - folder_datapoints (list[dict]): a list of dictionaries, each corresponding to a prompt answer datapoint
        '''
        folder_datapoints = []
        # only keep files that end in .txt
        valid_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
        # wrap the file list in a progress bar
        tqdm_files = tqdm(valid_files)
        tqdm_files.set_description('Parsing {0} chat files'.format(len(valid_files)))
        for file_name in tqdm_files:
            file_path = os.path.join(folder_path, file_name)
            try:
                file_datapoints = self.parse_file(file_path)
            except Exception as e:
                # deliberate best-effort: one unreadable file must not abort the whole folder
                if self.debug:
                    print(' - Failed parsing {0} with exception:\n {1}'.format(file_name, e))
                continue
            if self.debug:
                print(' - {0} data points were successfully generated from {1}'.format(len(file_datapoints), file_name))
            # extend in place rather than rebuilding the list for every file
            folder_datapoints.extend(file_datapoints)
        print('\nFound {0} files while parsing folder "{1}", successfully generated {2} datapoints with username "{3}"'.format(len(valid_files), folder_path, len(folder_datapoints), self.username))
        return folder_datapoints

    def parse_file(self, file_path: str) -> list[dict]:
        '''
        Description:
            Parses a WhatsApp txt file
        Params:
            - file_path (str): file_path of the file to parse
        Returns:
            - datapoints (list[dict]): a list of dictionaries, each dictionary contains a context and response data pair
        '''
        with open(file_path, encoding='utf-8') as f:
            raw_lines = f.readlines()
        # process the raw chat log lines
        processed_lines = self.process_raw_lines(raw_lines)
        # combine messages sent in quick succession from the same person into one message
        grouped_messages = self.group_messages(processed_lines)
        # convert them into prompt answer pairs
        return self.generate_datapoints(grouped_messages)

    def generate_datapoints(self, grouped_messages: list[dict]) -> list[dict]:
        '''
        Description:
            Convert a list of grouped messages into a list of datapoints (context and response pairs).
            Every message written by the target username (except the very first message of the chat)
            becomes a response; its context is built by walking backwards through the earlier messages
            for as long as they alternate between "someone else" and the target username, stay within
            answer_time_allowance seconds of the response, and the context string is still shorter
            than max_context_message_length.
        Params:
            - grouped_messages (list[dict]): list of grouped messages with 'name', 'time' and 'message' keys
        Returns:
            - datapoints (list[dict]): a list of dictionaries with 'context' and 'response' keys,
              ordered from the most recent response to the oldest
        '''
        datapoints = []
        # names are lower-cased by process_raw_lines, so compare against a lower-cased username
        target_name = self.username.lower()

        # start with the most recent message and walk back; index 0 has no earlier context
        for i in range(len(grouped_messages) - 1, 0, -1):
            response_candidate = grouped_messages[i]
            if response_candidate['name'] != target_name:
                continue

            context_string = ''
            # True while the next (older) message must come from someone other than the target user
            different_user = True
            for j in range(i - 1, -1, -1):
                context_candidate = grouped_messages[j]
                written_by_target = context_candidate['name'] == target_name
                # authors must alternate: other user, target user, other user, ...
                alternates = written_by_target != different_user
                # the response is the later message, so subtract the context time from it
                elapsed = response_candidate['time'] - context_candidate['time']
                within_time = elapsed < self.answer_time_allowance
                has_room = len(context_string) < self.max_context_message_length
                if not (alternates and within_time and has_room):
                    break
                # prepend the message to the front of the context string (with splitting tokens)
                context_string = '<s> ' + context_candidate['message'] + ' </s> ' + context_string
                different_user = not different_user

            # different_user is True exactly when the context is empty or its front message was
            # written by the target user; add a dummy opener so the context starts with the other side
            if different_user:
                context_string = '<s> Hello </s> ' + context_string

            datapoints.append({
                'context': context_string[4:-6],  # remove the first <s> and last </s> from the string
                # NOTE(review): no space before '</s>' here, unlike the context format; kept as-is
                'response': '<s> ' + response_candidate['message'] + '</s>',
            })
        return datapoints

    def group_messages(self, processed_lines) -> list[dict]:
        '''
        Description:
            Group 'rapid' messages into a single message; rapid messages are consecutive messages from
            the same person (with no other messages in between) sent less than group_message_allowance
            seconds after the first message of the group
        Params:
            - processed_lines (list[dict]): list of processed messages
        Returns:
            - grouped_messages (list[dict]): list of grouped messages, each carrying the name and time
              of the first message in its group
        '''
        grouped_messages = []
        current = None
        for line in processed_lines:
            same_group = (
                current is not None
                and line['name'] == current['name']
                # the gap is measured from the first message of the group
                and (line['time'] - current['time']) < self.group_message_allowance
            )
            if same_group:
                current['message'] += '. {0}'.format(line['message'])
            else:
                current = {
                    'name': line['name'],
                    'time': line['time'],
                    'message': line['message'],
                }
                grouped_messages.append(current)
        return grouped_messages

    def process_raw_lines(self, raw_lines: list, print_exceptions: bool = False) -> list[dict]:
        '''
        Description:
            Takes a list of raw lines from a WhatsApp chat txt file and attempts to format each line.
            Lines are expected to look like "[dd/mm/YYYY, HH:MM:SS] name: message"; any line that
            cannot be read that way, or that matches messages_to_ignore, is skipped
        Params:
            - raw_lines (list): list storing each line of the txt file
            - print_exceptions (bool): if True then the reason is printed to the terminal any time a line is skipped
        Returns:
            - processed_lines (list[dict]): list of dictionaries with 'time' (seconds), 'name' and 'message'
              (both lower-cased), one per successfully read message
        '''
        processed_lines = []
        for raw_line in raw_lines:
            line = raw_line.strip()

            # read the time stamp between '[' and ']' and convert it to seconds
            try:
                time_start_idx = line.find('[') + 1
                time_end_idx = line.find(']', time_start_idx)
                if time_end_idx == -1:
                    raise ValueError('no closing "]" found for the time stamp')
                timestamp = datetime.strptime(line[time_start_idx:time_end_idx], '%d/%m/%Y, %X').timestamp()
            except (ValueError, OverflowError, OSError) as time_exception:
                if print_exceptions:
                    print('time reading exception: {0}'.format(time_exception))
                # always skip the line, whether or not the reason was printed
                continue

            # read the name: it starts after "] " and ends at the next ':'
            name_start_idx = time_end_idx + 2
            name_end_idx = line.find(':', name_start_idx)
            name = line[name_start_idx:name_end_idx].lower() if name_end_idx != -1 else ''
            if not name:
                if print_exceptions:
                    print('name reading exception: {0}'.format('no name was found, only an empty string'))
                continue

            # read the message content: everything after ": "
            message = line[name_end_idx + 2:].strip().lower()
            if not message:
                message_problem = 'no message found, only an empty string'
            elif any(substring in message for substring in self.messages_to_ignore):
                # drops WhatsApp generated notifications like "image omitted" or "missed voice call"
                message_problem = 'message "{0}" is in ignore message list'.format(message)
            else:
                message_problem = None
            if message_problem is not None:
                if print_exceptions:
                    print('message reading exceptions: {0}'.format(message_problem))
                continue

            processed_lines.append({
                'time': timestamp,
                'name': name,
                'message': message,
            })
        return processed_lines
# if __name__ == '__main__':
# parser = WhatsAppParser(username='joshua', debug=True)
# datapoints = parser.parse_folder('F:/Dev/datasets/whatsapp')
# while True:
# from random import randrange
# r = randrange(len(datapoints))
# print(datapoints[r])
# import time
# time.sleep(10)
# print('\n')