-
Notifications
You must be signed in to change notification settings - Fork 2
/
transcripts.py
144 lines (113 loc) · 4.15 KB
/
transcripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
from os.path import splitext, exists
import re
from io import StringIO
import datetime
import nltk
import docx2txt
import streamlit as st
from nltk.tokenize import word_tokenize
def clean_transcript(filepath: str) -> str:
    """Clean up the content of a transcript file (.vtt or .docx) to a string.

    Speaker tags, a leading WEBVTT header, purely numeric lines (cue
    indexes), lines starting with an encrypted participant code and
    timestamp lines are removed. The remaining lines are joined with single
    spaces, and a space is inserted after ".", "!" or "?" when a word
    character follows directly.

    Args:
        filepath (str): path to a .vtt or .docx file; the extension is
            matched case-insensitively

    Returns:
        str: clean content ("" when the file contains no text). For any
            other extension "Invalid file format." is printed and None is
            returned.
    """
    lowered_path = filepath.lower()
    if lowered_path.endswith(".vtt"):
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()
    elif lowered_path.endswith(".docx"):
        content = docx2txt.process(filepath)
    else:
        print("Invalid file format.")
        return None

    # replace <v> tags: "<v Name>text</v>" becomes "Name text "
    content = content.replace("<v ", "").replace(">", " ").replace("</v", "")

    # remove header & empty lines
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    if not lines:
        # empty transcript: nothing to clean, and lines[0] below would fail
        return ""
    if "WEBVTT" in lines[0].upper():
        lines = lines[1:]

    # remove indexes
    lines = [line for line in lines if not line.isdigit()]

    # remove tcode (UUID-like code followed by "/<digits>-<digit>")
    tcode = re.compile(
        r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d'
    )
    no_tcode_lines = [line for line in lines if not tcode.match(line)]
    if len(no_tcode_lines) < len(lines):
        st.write("Participant names were found to be encrypted in the transcript file. The app will summarize accordingly.\n")
    lines = no_tcode_lines

    # remove timestamps; the ">" of "-->" was already turned into a space by
    # the tag replacement above, so the marker left to look for is "--".
    # NOTE: any text line that itself contains "--" is dropped as well.
    lines = [line for line in lines if "--" not in line]
    content = " ".join(lines)

    # remove duplicate spaces
    content = re.sub(r"\s+", r" ", content)

    # add space after punctuation marks if it doesn't exist
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)
    return content
def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save clean content of a subtitle file to text file

    Args:
        file_in (str): path to the transcript file handed to
            clean_transcript
        file_out (None, optional): path to text file. When omitted, the
            input path with its extension replaced by ".txt" is used; if
            that file already exists, "_1", "_2", ... is appended to the
            stem until an unused name is found.
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
            Default is False

    Returns:
        str: path to text file

    Raises:
        ValueError: if clean_transcript returns None for file_in (it does
            so for unsupported file formats)
    """
    # set default values
    no_message = kwargs.get("no_message", False)
    if not file_out:
        filename = splitext(file_in)[0]
        file_out = f"{filename}.txt"
        i = 0
        while exists(file_out):
            i += 1
            file_out = f"{filename}_{i}.txt"

    content = clean_transcript(file_in)
    if content is None:
        # Fail before opening file_out: otherwise an empty output file is
        # created and write(None) then dies with an opaque TypeError.
        raise ValueError(f"cannot clean unsupported transcript file: {file_in}")

    with open(file_out, "w", encoding="utf-8") as fp:
        fp.write(content)
    if not no_message:
        print("clean content is written to file: %s" % file_out)
    return file_out
def count_tokens(filename):
    """Count the word tokens in a text file.

    Args:
        filename (str): path to a UTF-8 encoded text file

    Returns:
        int: number of tokens produced by nltk's word_tokenize
    """
    # Read as UTF-8 explicitly: the cleaned files in this module are written
    # with encoding="utf-8", and the platform default may differ.
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    return len(word_tokenize(text))
def break_up_file(tokens, chunk_size, overlap_size):
    """Yield overlapping chunks of a token sequence.

    Every chunk except the last holds exactly ``chunk_size`` tokens, and
    each chunk starts ``chunk_size - overlap_size`` tokens after the
    previous one. The last chunk holds whatever remains (at most
    ``chunk_size`` tokens).

    Args:
        tokens: sliceable sequence of tokens (e.g. a list of str)
        chunk_size (int): maximum number of tokens per chunk
        overlap_size (int): number of tokens shared by consecutive chunks

    Yields:
        The chunks, in order. When ``tokens`` already fits in one chunk the
        sequence itself is yielded unchanged.

    Raises:
        ValueError: if the input has to be split and ``overlap_size`` is
            not smaller than ``chunk_size`` (the window would never advance)
    """
    if len(tokens) <= chunk_size:
        yield tokens
        return

    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Iterate with a moving start index instead of recursing, so long inputs
    # cannot exhaust the interpreter's recursion limit.
    start = 0
    while len(tokens) - start > chunk_size:
        yield tokens[start:start + chunk_size]
        start += step
    yield tokens[start:]
def break_up_file_to_chunks(filename, chunk_size=2000, overlap_size=100):
    """Tokenize a text file and split the tokens into overlapping chunks.

    Args:
        filename (str): path to a UTF-8 encoded text file
        chunk_size (int, optional): maximum number of tokens per chunk.
            Default is 2000
        overlap_size (int, optional): number of tokens shared by consecutive
            chunks. Default is 100

    Returns:
        list: the chunks produced by break_up_file, each a list of tokens
    """
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    try:
        tokens = word_tokenize(text)
    except LookupError as err:
        # nltk raises LookupError when tokenizer data is missing: fetch it
        # and tokenize again, otherwise `tokens` would be left unassigned.
        print(err)
        nltk.download('punkt')
        tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))
def convert_to_prompt_text(tokenized_text):
    """Join tokens back into a single prompt string.

    Args:
        tokenized_text: iterable of str tokens

    Returns:
        str: the tokens separated by single spaces, with a possessive "'s"
            token re-attached to the word before it
    """
    return " ".join(tokenized_text).replace(" 's", "'s")
@st.cache_data
def save_and_clean_file(uploaded_file):
    """Save an uploaded transcript under tmp/ and write its cleaned text.

    Args:
        uploaded_file: object exposing ``name`` and ``getvalue()`` returning
            bytes (presumably a Streamlit UploadedFile; confirm against
            the caller)

    Returns:
        str: path of the cleaned text file, as returned by
            vtt_to_clean_file ("tmp/<stem>_cleaned.txt")
    """
    # Keep only the last path component so an upload name containing
    # directory parts cannot place the file outside tmp/.
    filename = os.path.basename(uploaded_file.name)
    file_in = f"tmp/{filename}"
    # splitext removes just the final extension; dots inside the stem stay.
    file_out = splitext(file_in)[0] + "_cleaned.txt"

    # save the file temporarily (creating tmp/ on first use)
    os.makedirs("tmp", exist_ok=True)
    with open(file_in, mode="wb") as f:
        f.write(uploaded_file.getvalue())

    return vtt_to_clean_file(file_in, file_out)