# Netflix_XmlToSrt.py
# Forked from abhishek-vinjamoori/SubtitleExtractor.
import codecs
import re
import math
import argparse
def leading_zeros(value, digits=2):
    """Return ``value`` as a string left-padded with zeros.

    Args:
        value: Any object; it is converted with ``str()`` before padding.
        digits: Minimum width of the result (defaults to 2).

    Returns:
        The string form of ``value`` padded on the left with ``"0"`` up to
        ``digits`` characters. Values that are already wider than ``digits``
        are returned in full rather than being truncated, so significant
        digits are never lost (e.g. ``leading_zeros(123)`` is ``"123"``).
    """
    return str(value).zfill(digits)
def convert_time(raw_time):
    """Convert a tick-count timestamp to an SRT ``HH:MM:SS,mmm`` timestamp.

    The input is treated as a count of ticks at 10,000,000 ticks per second
    (the last seven digits are the sub-second part); precision beyond
    milliseconds is dropped, not rounded.

    Args:
        raw_time: Decimal string (or anything ``int()`` accepts) holding the
            tick count, without any unit suffix.

    Returns:
        The timestamp formatted as ``"HH:MM:SS,mmm"``. The hour field grows
        beyond two digits when needed instead of being truncated.

    Raises:
        ValueError: If ``raw_time`` is not a valid integer literal (for
            example an empty string).
    """
    # Work on the full integer rather than slicing the string, so values
    # shorter than one second (fewer than 8 digits) are handled correctly.
    total_ms = int(raw_time) // 10000
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, second = divmod(total_seconds, 60)
    hour, minute = divmod(total_minutes, 60)
    return "{:02d}:{:02d}:{:02d},{:03d}".format(hour, minute, second, ms)
def to_srt(text):
    """Convert caption XML (one ``<p begin=... end=...>`` per line) to SRT text.

    Only lines containing ``<p begin=`` are considered. For each such line,
    ``<span style="...">`` wrappers are removed (keeping their inner text),
    runs of ``<br>`` tags become newlines, and the ``begin``/``end``
    attributes are turned into SRT timestamps:

    * values containing ``:`` are treated as clock times and only have
      ``.`` replaced by ``,``;
    * any other value is passed to ``convert_time``.

    Consecutive paragraphs with identical ``begin`` and ``end`` values are
    merged into a single cue whose text lines are joined with newlines.

    Args:
        text: The full XML document as a string.

    Returns:
        The SRT document as a string; an empty string when ``text`` contains
        no subtitle paragraphs.

    Raises:
        ValueError: If a ``<p begin=`` line lacks a quoted ``begin`` or
            ``end`` value or a closing ``</p>`` on the same line, or if a
            non-clock timestamp cannot be converted by ``convert_time``.
    """
    paragraph_re = re.compile(r'<p begin=')
    start_re = re.compile(r'\bbegin="([0-9:.]*)')
    end_re = re.compile(r'\bend="([0-9:.]*)')
    # Deliberately loose: everything between the first `">` and the last
    # `</p>` on the line counts as the subtitle text.
    content_re = re.compile(r'">(.*)</p>')
    span_re = re.compile(r'<span style="[a-zA-Z0-9_]+">(.*?)</span>')
    br_re = re.compile(r'(?:<br\s*/?>)+')

    def format_time(raw):
        # Clock times ("HH:MM:SS.mmm") only need the SRT decimal separator;
        # anything else is a tick count that must be converted.
        if ":" in raw:
            return raw.replace(".", ",")
        return convert_time(raw)

    # Each cue is [raw_start, raw_end, [text, ...]].
    cues = []
    for line in text.split("\n"):
        if not paragraph_re.search(line):
            continue

        # Background text may carry extra styling, possibly in several
        # (or nested) <span> groups; unwrap until none are left.
        while span_re.search(line):
            line = span_re.sub(lambda m: m.group(1), line)

        start_match = start_re.search(line)
        end_match = end_re.search(line)
        content_match = content_re.search(line)
        if (start_match is None or end_match is None or content_match is None
                or not start_match.group(1) or not end_match.group(1)):
            raise ValueError(
                u"cannot parse subtitle line: {!r}".format(line))

        start = start_match.group(1)
        end = end_match.group(1)
        # Replace every run of <br> variants, not just copies of the first one.
        content = br_re.sub(u"\n", content_match.group(1))

        if cues and cues[-1][0] == start and cues[-1][1] == end:
            # Multiple paragraphs shown at the same time form a single cue.
            cues[-1][2].append(content)
        else:
            cues.append([start, end, [content]])

    blocks = (
        u"{}\n{} --> {}\n{}\n".format(
            index, format_time(start), format_time(end), u"\n".join(parts))
        for index, (start, end, parts) in enumerate(cues, 1)
    )
    return u"\n".join(blocks)
def main(argv=None):
    """Command-line entry point: read a caption XML file and write an SRT file.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default), ``argparse`` reads them from ``sys.argv``.

    Options:
        -i/--input:  path of the XML file to read (UTF-8).
        -o/--output: path of the SRT file to write (UTF-8). When omitted,
            the input path with ``.srt`` appended is used, so the output
            follows whichever input file was selected.
    """
    filename = "../SubtitleExtractor/NetflixCaptions.xml"
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input", type=str, default=filename,
        help="path to the input file (defaults to {})".format(filename))
    parser.add_argument(
        "-o", "--output", type=str, default=None,
        help="path to the output file (defaults to the input path "
             "with '.srt' appended)")
    args = parser.parse_args(argv)

    output_path = args.output
    if output_path is None:
        output_path = args.input + ".srt"

    with codecs.open(args.input, 'rb', "utf-8") as f:
        text = f.read()

    # Convert before opening the output so that a conversion error does not
    # leave behind an empty or truncated output file.
    srt_text = to_srt(text)

    with codecs.open(output_path, 'wb', "utf-8") as f:
        f.write(srt_text)


if __name__ == '__main__':
    main()