forked from abhishek-vinjamoori/SubtitleExtractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BBC_XmlToSrt.py
86 lines (62 loc) · 1.92 KB
/
BBC_XmlToSrt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import re
import sys
from bs4 import BeautifulSoup
def toSrt(xml_string):
    """Convert a subtitle XML document into SRT-formatted text.

    Every ``<p>`` element becomes one numbered SRT cue whose timing is
    taken from its ``begin`` and ``end`` attributes (each passed through
    ``formatTime``). Colour information is kept as ``<font color="...">``
    tags:

    * a ``<span>`` with a ``tts:color`` attribute is replaced by a
      ``<font>`` tag of that colour holding the span's text;
    * a ``<span>`` without a colour is unwrapped, keeping its text;
    * a ``<p>`` whose ``style`` attribute names a ``<style>`` element that
      declares a ``tts:color`` is wrapped in a ``<font>`` tag of that
      colour.

    Args:
        xml_string: The subtitle document as text.

    Returns:
        The SRT document as one string. Each cue is followed by a blank
        line; the result is empty when the document has no ``<p>``.

    Raises:
        KeyError: If a ``<p>`` element has no ``begin`` or ``end``
            attribute.
    """
    # A line break inside a caption is a plain newline in SRT.
    xml_string = xml_string.replace("<br/>", "\n")
    texts = BeautifulSoup(xml_string, "lxml", from_encoding="utf8")

    # Map style id -> colour. A <style> missing either attribute carries
    # nothing usable here, so it is skipped explicitly rather than via a
    # catch-all exception handler.
    colorDict = {}
    for style in texts.findAll("style"):
        if style.has_attr('id') and style.has_attr('tts:color'):
            colorDict[style['id']] = style['tts:color']

    cues = []
    for captionNumber, captions in enumerate(texts.findAll("p"), start=1):
        for span in captions.findAll("span"):
            if span.has_attr('tts:color'):
                replacement = texts.new_tag("font", color=span['tts:color'])
                # get_text() also copes with empty or multi-child spans,
                # for which .string would be None.
                replacement.string = span.get_text()
                span.replace_with(replacement)
            else:
                # No colour to carry over: drop the tag, keep its content.
                span.unwrap()

        start = formatTime(captions['begin'])
        end = formatTime(captions['end'])

        caption = "".join(str(content) for content in captions.contents)

        # Apply the caption-wide default colour only when the referenced
        # style actually registered one.
        colorName = colorDict.get(captions.get('style'))
        if colorName is not None:
            caption = '<font color="%s">%s</font>' % (colorName, caption)

        cues.append(
            "%d\n%s --> %s\n%s\n\n" % (captionNumber, start, end, caption)
        )

    return "".join(cues)
def formatTime(time):
    """Convert a ``seconds.fraction`` timestamp into SRT's ``,mmm`` form.

    The text before the first ``.`` is kept unchanged. The text between
    the first and second ``.`` (or the end of the string) is treated as
    the fractional part and right-padded with zeros or truncated to
    exactly three characters. When there is no ``.`` at all, ``,000`` is
    appended.

    Args:
        time: Timestamp string, e.g. ``"00:00:01.5"`` or ``"00:00:01"``.

    Returns:
        The timestamp with a comma and a three-character fractional part,
        e.g. ``"00:00:01,500"``.
    """
    pieces = time.split(".")
    if len(pieces) > 1:
        # Pad first, then cut: short fractions gain zeros, long ones are
        # truncated to millisecond precision.
        millis = (pieces[1] + "000")[:3]
    else:
        millis = "000"
    return "%s,%s" % (pieces[0], millis)
def main():
    """Print the SRT conversion of the XML file named on the command line.

    The path is read from ``sys.argv[1]``; the converted text is written
    to standard output.
    """
    # The context manager closes the file even if reading raises.
    with open(sys.argv[1], "r") as f:
        q = f.read()
    print(toSrt(q))


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()