-
Notifications
You must be signed in to change notification settings - Fork 1
/
converter.py
91 lines (81 loc) · 3.34 KB
/
converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Author Viacheslav Kovalevskyi ([email protected])
# source https://github.com/b0noI/dialog_converter
from sklearn.model_selection import train_test_split
# Directory prefix (note the trailing slash) for the input corpus; the
# train/test output files are written under the same prefix.
DATA_DIR = "data/"
# Path of the raw corpus file read by the script entry point.
FNAME = DATA_DIR + "movie_lines.txt"
# Delimiter between the fields of a single corpus line.
LINE_SEP = " +++$+++ "
# When True, parse_line and the script entry point print diagnostics.
DEBUG = False
# Example of the lineId: L19690
def get_line_number_from_id(line_id):
    """Return the numeric part of a corpus line id.

    Line ids consist of a one-character prefix followed by the line's
    sequence number, e.g. ``"L19690"``.

    Args:
        line_id: Line identifier such as ``"L19690"``.

    Returns:
        The sequence number as an ``int`` (``19690`` for ``"L19690"``).

    Raises:
        ValueError: If the text after the prefix is not a valid integer.
    """
    # Skip the one-character prefix and keep *all* digits.  Slicing with
    # [-1:] would keep only the final digit, which makes unrelated lines
    # look adjacent (and adjacent lines such as L9/L10 look far apart).
    return int(line_id[1:])
def parse_line(dialogs):
    """Extract (utterance, reply) pairs from raw corpus lines.

    ``dialogs`` is walked from its last element to its first.  Each element
    must split on ``LINE_SEP`` into exactly five fields: line id, character
    id, movie id, an ignored field, and the line text.

    A one-line buffer remembers the previously visited line.  The current
    line is paired with the buffered one only when both belong to the same
    movie, their line numbers differ by at most 1, and they are spoken by
    different characters.  After a pair is saved -- or when the same
    character speaks twice in a row -- the buffer is emptied, so saved
    pairs never share a line.

    Args:
        dialogs: Sequence of raw corpus lines (strings).

    Returns:
        A two-element list ``[firsts, replies]`` of equal-length lists.
        ``firsts[k]`` is the lower-cased buffered line and ``replies[k]``
        the lower-cased line that followed it in traversal order.  Text is
        otherwise kept as-is (including any trailing newline).

    Raises:
        ValueError: If a line does not split into exactly five fields.
    """
    firsts = []
    replies = []

    # One-line buffer of the state machine; all None means "empty".
    prev_speaker = prev_movie = prev_text = prev_number = None

    for raw_line in reversed(dialogs):
        line_id, character_id, movie_id, _, line_txt = raw_line.split(LINE_SEP)
        line_number = get_line_number_from_id(line_id)

        # An empty buffer has prev_movie None, which never equals a movie id,
        # so the short-circuit below keeps abs() away from a None prev_number.
        movie_changed = movie_id != prev_movie
        if movie_changed or abs(line_number - prev_number) > 1:
            # The current line cannot continue the buffered exchange: make
            # it the start of a new one.
            if DEBUG:
                if movie_changed:
                    print("Movie id have changed from {} to {}, dropping buffer.".format(prev_movie, movie_id))
                else:
                    print("Line number changed to more then 1 from {} to {}. Dropping buffer.".format(prev_number, line_number))
            prev_speaker = character_id
            prev_movie = movie_id
            prev_text = line_txt
            prev_number = line_number
            continue

        if prev_speaker == character_id:
            # Same character twice in a row: not an exchange, discard both.
            if DEBUG:
                print("Same character({} == {}) speaking 2 times in row.".format(prev_speaker, character_id))
        else:
            if DEBUG:
                print("Looks like: same film ({} == {}), line only diff on 1 ({} = {} + 1), and characters are different ({} != {}). Saving"
                      .format(prev_movie, movie_id, prev_number, line_number, prev_speaker, character_id))
            firsts.append(prev_text.lower())
            replies.append(line_txt.lower())

        # Either way the buffered line has been consumed; empty the buffer.
        prev_speaker = prev_movie = prev_text = prev_number = None

    return [firsts, replies]
def _newline_terminated(text):
    """Return *text* unchanged if it ends with a newline, else with one added."""
    return text if text.endswith("\n") else text + "\n"


def write_dialogs(dialogs, file_prefix):
    """Write paired utterances to two line-aligned text files.

    Creates (or overwrites) ``DATA_DIR + file_prefix + '.a'`` and
    ``DATA_DIR + file_prefix + '.b'``.  Entry ``k`` of ``dialogs[0]`` goes to
    the ``.a`` file and entry ``k`` of ``dialogs[1]`` to the ``.b`` file.

    Args:
        dialogs: Two-element sequence ``[left, right]`` of equal-length
            sequences of strings.
        file_prefix: File name stem, e.g. ``"train"``.

    Raises:
        ValueError: If ``left`` and ``right`` differ in length; checked
            before any file is opened so no partial output is produced.
    """
    left_lines, right_lines = dialogs[0], dialogs[1]
    if len(left_lines) != len(right_lines):
        raise ValueError(
            "dialog sides differ in length: {} != {}".format(
                len(left_lines), len(right_lines)))

    left_path = DATA_DIR + file_prefix + '.a'
    right_path = DATA_DIR + file_prefix + '.b'

    # `with` closes both files even if a write fails part-way through.
    with open(left_path, 'w') as left_f, open(right_path, 'w') as right_f:
        for left, right in zip(left_lines, right_lines):
            # Every entry must occupy its own line, otherwise an utterance
            # lacking a trailing newline fuses with the next one and the two
            # files stop being line-aligned.
            left_f.write(_newline_terminated(left))
            right_f.write(_newline_terminated(right))
if __name__ == "__main__":
    # Read the whole corpus into memory, one list entry per line.
    with open(FNAME) as corpus:
        raw_lines = corpus.readlines()

    # pairs[0] holds the first utterances, pairs[1] the matching replies.
    pairs = parse_line(raw_lines)

    if DEBUG:
        for first, reply in zip(pairs[0], pairs[1]):
            print ("FROM {}\n TO {}".format(first, reply))

    # Split both sides together so each utterance keeps its reply; 20% of
    # the pairs go to the test set.
    train_a, test_a, train_b, test_b = train_test_split(pairs[0], pairs[1], test_size=0.2)
    write_dialogs([train_a, train_b], "train")
    write_dialogs([test_a, test_b], "test")