exclusives_sets.py
from utils.io import read, intersect


def exclusive_sets():
    """Generate the sets of vocabulary specific to each corpus."""
    filename1 = "SB_bpe1000"
    filename2 = "SB_w2v_7k"
    data1 = read(filename1)
    data2 = read(filename2)
    # Keep only the utterance ids present in both corpora.
    data1, data2 = intersect(data1, data2)
    dico_list = dict()
    dico_list[filename1] = []
    dico_list[filename2] = []
    vocab_reference = set()
    vocab1 = set()
    vocab2 = set()
    for utt_id in data1:
        # Each entry maps an utterance id to a (reference, hypothesis) pair.
        vocab_reference.update(data1[utt_id][0].split())
        vocab1.update(data1[utt_id][1].split())
        dico_list[filename1] += data1[utt_id][1].split()
        vocab2.update(data2[utt_id][1].split())
        dico_list[filename2] += data2[utt_id][1].split()
    exclusive_vocab = dict()
    # Words produced by one system but by neither the other system nor the reference.
    exclusive_vocab[filename1] = sorted(vocab1 - vocab2 - vocab_reference)
    exclusive_vocab[filename2] = sorted(vocab2 - vocab1 - vocab_reference)
    # Debug: print the sorted exclusive vocabularies side by side.
    # for word1, word2 in zip(exclusive_vocab[filename1], exclusive_vocab[filename2]):
    #     print(word1, word2)
    #     input()
    for filename in [filename1, filename2]:
        with open("results/" + filename + "_exclusive_vocab.txt", "w", encoding="utf8") as file:
            for word in exclusive_vocab[filename]:
                # Write each exclusive word with its occurrence count in the hypotheses.
                file.write(word + "," + str(dico_list[filename].count(word)) + "\n")
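

# A minimal sketch of the set arithmetic above on toy tokens (the values are
# illustrative only, not taken from the corpora):
#
#     vocab1 = {"hello", "wrold"}           # system 1 hypothesis tokens
#     vocab2 = {"hello", "world"}           # system 2 hypothesis tokens
#     vocab_reference = {"hello", "world"}  # reference tokens
#     vocab1 - vocab2 - vocab_reference     # -> {"wrold"}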


# Don't do that: just generate the exclusion, without taking the
# intersection of the two systems into account.
def original_vocab(filename):
    """Write the hypothesis words absent from both the reference and a stop list."""
    data = read(filename)
    with open("/local_disk/atlantia/laboinfo/rouvier/lm/vocab/uniq.old", "r", encoding="utf8") as file:
        vocab_to_delete = set(file.read().split())
    vocab_reference = set()
    vocab_hypothesis = set()
    for utt_id in data:
        vocab_reference.update(data[utt_id][0].split())
        vocab_hypothesis.update(data[utt_id][1].split())
    exclusive_vocab = sorted(vocab_hypothesis - vocab_reference - vocab_to_delete)
    with open("results/" + filename + "_exclusive_reference.txt", "w", encoding="utf8") as file:
        for word in exclusive_vocab:
            file.write(word + "\n")
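

# Both functions assume read(filename) returns a dict mapping each utterance id
# to a (reference, hypothesis) pair of space-separated token strings, as implied
# by the data[utt_id][0] / data[utt_id][1] accesses above. A minimal stand-in
# for trying the script without utils.io (hypothetical, for illustration only):
#
#     def read(filename):
#         return {"utt1": ("the reference text", "the hypothesis text")}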


if __name__ == "__main__":
    # exclusive_sets()
    original_vocab("SB_bpe750")
    original_vocab("SB_bpe1000")
    original_vocab("SB_w2v_7k")