-
Notifications
You must be signed in to change notification settings - Fork 1
/
transcript.py
95 lines (84 loc) · 3.48 KB
/
transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Handles the subtleties required to read a WhatsApp chat transcript.
import sys
import csv
import pandas as pd
import numpy as np
import codecs
class Transcript():
    """Parse an exported WhatsApp chat transcript into parallel column lists.

    Typical use is ``open_file()`` -> ``feed_lists()`` -> ``write_transcript()``.
    After ``feed_lists()`` the lists ``datelist``, ``timelist``, ``speakerlist``,
    ``messagelist`` and ``paragraphList`` all have one entry per input line.
    """

    def __init__(self, inputFileName, outputFileName):
        """Remember the input/output paths and start with empty column lists.

        Args:
            inputFileName: path of the transcript text file to read.
            outputFileName: path of the CSV file ``write_transcript`` creates.
        """
        self.inputFileName = inputFileName
        self.outputFileName = outputFileName
        self.raw_messages = []   # non-trivial lines of the input file
        self.speakerlist = []    # speaker per line ('MESSAGE' for speaker-less lines)
        self.messagelist = []    # text per line
        self.paragraphList = []  # sequence number of the message a line belongs to
        self.datelist = []       # date text per line (as found, before ", ")
        self.timelist = []       # time text per line (between ", " and "] ")

    def open_file(self):
        """Read the input file and append its usable lines to ``raw_messages``.

        The file is decoded as UTF-8 with an optional BOM ("utf-8-sig").
        Lines of four characters or fewer are dropped. Lines are kept as
        ``str`` so that ``feed_lists`` can split them with ``str`` separators.
        """
        # The context manager closes the file even if read() raises.
        with codecs.open(self.inputFileName, "r", "utf-8-sig") as source:
            content = source.read()
        self.raw_messages.extend(
            line for line in content.split("\n") if len(line) > 4
        )

    def valid_date(self, date_str):
        """Return True if ``date_str`` is three integers separated by "/".

        A leading "[" is ignored, because ``feed_lists`` passes the date
        exactly as it appears at the start of a transcript line, bracket
        included.
        """
        separator = "/"
        try:
            fields = [int(part) for part in date_str.lstrip("[").split(separator)]
        except ValueError:
            return False
        return len(fields) == 3

    def feed_lists(self):
        """Split every raw line into date, time, speaker and text columns.

        Each line is expected to look like ``<date>, <time>] <speaker>: <text>``.
        Three cases are distinguished:

        * speaker and text present: stored as-is, starts a new sequence number;
        * no ": " after the header but a valid date: stored with speaker
          ``'MESSAGE'`` and the remaining text, starts a new sequence number;
        * anything else: treated as a continuation of the previous entry and
          stored with that entry's date, time and speaker.
        """
        # Defaults used when the very first line is a continuation line;
        # without them that case raised UnboundLocalError.
        prevSender = ""
        prevRawDate = ""
        prevTime = ""
        seqNo = 0
        for line in self.raw_messages:  # raw_messages is a list of lines
            # Tolerate byte lines so that externally supplied data still parses.
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            line = line.rstrip()
            msg_date, _, msg = line.partition("] ")
            # Date and time are separated by ", "
            raw_date, _, time = msg_date.partition(", ")
            speaker, _, message = msg.partition(": ")

            if message:
                # A proper conversation line: date, time, speaker, text
                entry = (raw_date, time, speaker, message)
                prevSender, prevRawDate, prevTime = speaker, raw_date, time
                seqNo += 1
            elif speaker != "" and self.valid_date(raw_date):
                # A line without a speaker: date, time, text
                entry = (raw_date, time, 'MESSAGE', speaker)
                prevSender, prevRawDate, prevTime = 'MESSAGE', raw_date, time
                seqNo += 1
            else:
                # A continuing line with no date, time or name: reuse the
                # details of the previous entry and keep the whole line as text.
                entry = (prevRawDate, prevTime, prevSender, line)

            self.datelist.append(entry[0])
            self.timelist.append(entry[1])
            self.speakerlist.append(entry[2])
            self.messagelist.append(entry[3])
            self.paragraphList.append(seqNo)

    def write_transcript(self, end=0):
        """Write the parsed columns to ``outputFileName`` as CSV.

        Args:
            end: number of leading entries to write; 0 (the default) writes
                all of them.
        """
        if end == 0:
            end = len(self.messagelist)
        count = len(self.messagelist[:end])
        columns = zip(
            self.paragraphList[:count],
            self.datelist[:count],
            self.timelist[:count],
            self.speakerlist[:count],
            self.messagelist[:count],
        )
        # newline='' is what the csv module requires for its own line endings;
        # UTF-8 matches the input encoding so non-ASCII chat text round-trips.
        with open(self.outputFileName, 'w', newline='', encoding='utf-8') as out:
            writer = csv.writer(out)
            writer.writerow(["SentenceNo", "SequenceNo", "Date", "Time", "Speaker", "Text"])
            for i, (seq, date, time, speaker, text) in enumerate(columns):
                writer.writerow([i, seq, date, time, speaker, text])

    def get_speakers(self):
        """Return the distinct speakers, in order of first appearance."""
        return list(dict.fromkeys(self.speakerlist))