-
Notifications
You must be signed in to change notification settings - Fork 94
/
to_gap_tsv.py
74 lines (65 loc) · 2.89 KB
/
to_gap_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import sys
import json
import util
def find_pronoun_cluster(prediction, pronoun_subtoken_span, cluster_key='predicted_clusters'):
    """Return the first cluster that contains the pronoun span.

    The clusters stored under ``prediction[cluster_key]`` are scanned in
    order; the first one for which ``pronoun_subtoken_span in cluster``
    holds is returned as-is. An empty list is returned when no cluster
    contains the span.
    """
    matching = (
        group
        for group in prediction[cluster_key]
        if pronoun_subtoken_span in group
    )
    return next(matching, [])
def read_json(json_file):
    """Load a JSON-lines file into a dict keyed by each record's ``doc_key``.

    Every non-blank line of ``json_file`` is parsed as one JSON object and
    stored under its ``'doc_key'`` value; a later record with the same key
    replaces an earlier one.

    Args:
        json_file: Path of the JSON-lines file to read.

    Returns:
        A dict mapping ``doc_key`` to the parsed record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'doc_key'`` entry.
    """
    data = {}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, encoding='utf-8') as f:
        for raw_line in f:
            # Blank lines (e.g. a trailing newline at end of file) would make
            # json.loads raise, so skip them.
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            data[record['doc_key']] = record
    return data
def is_aligned(span1, span2):
    """Tell whether one span lies entirely within the other.

    Each span is indexed as ``span[0]`` (start) and ``span[1]`` (end).
    Returns True when ``span1`` is contained in ``span2`` or ``span2`` is
    contained in ``span1`` (identical spans count), False otherwise.
    """
    return (
        # span1 inside span2
        (span2[0] <= span1[0] and span1[1] <= span2[1])
        # span2 inside span1
        or (span1[0] <= span2[0] and span2[1] <= span1[1])
    )
def is_substring_aligned(span1, sents, name):
    """Tell whether a span's text and a name overlap as substrings.

    The tokens ``sents[span1[0]]`` through ``sents[span1[1]]`` (end index
    included) are joined with single spaces and compared against ``name``.

    Args:
        span1: Indexable pair; ``span1[0]`` is the first token index and
            ``span1[1]`` the last token index of the span.
        sents: Flat sequence of token strings.
        name: Mention text to compare against.

    Returns:
        True when the span text contains ``name`` or ``name`` contains the
        span text. False otherwise, including when either string is empty.
    """
    span_text = ' '.join(sents[span1[0]:span1[1] + 1])
    # The empty string is a substring of every string, so an empty span or
    # an empty name would otherwise "match" anything.
    if not span_text or not name:
        return False
    return span_text in name or name in span_text
def read_tsv_file(tsv_file):
    """Load a tab-separated file into a dict keyed by each row's first column.

    Every non-blank line is split on tabs and the resulting list of columns
    is stored under the value of its first column. A header line, if
    present, is stored like any other row. A later row with the same first
    column replaces an earlier one.

    Args:
        tsv_file: Path of the TSV file to read.

    Returns:
        A dict mapping the first column of each row to the row's full list
        of column strings (line terminator removed).
    """
    tsv = {}
    with open(tsv_file, encoding='utf-8') as f:
        for line in f:
            # Drop the line terminator so the last column does not carry a
            # trailing '\n'.
            line = line.rstrip('\n')
            # A blank line has no columns; skipping it avoids a bogus row.
            if not line.strip():
                continue
            cols = line.split('\t')
            tsv[cols[0]] = cols
    return tsv
def _parse_offset(text):
    """Turn a ``'start:end'`` string into a tuple of ints."""
    return tuple(map(int, text.split(':')))


def _check_mention(sents, offset, expected):
    """Raise AssertionError unless ``sents[start:end]`` spells ``expected``."""
    tokens = sents[offset[0]:offset[1]]
    if ' '.join(tokens) != expected:
        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; the exception type and payload are unchanged.
        raise AssertionError((tokens, expected))


def _prediction_path(json_file):
    """Derive the output path from the predictions path."""
    out_path = json_file.replace('jsonlines', 'tsv')
    if out_path == json_file:
        # Nothing was replaced: writing here would overwrite the input file.
        out_path = json_file + '.tsv'
    return out_path


def convert(json_file, tsv_file):
    """Write an ``ID / A-coref / B-coref`` TSV from coreference predictions.

    For every document in ``json_file`` the predicted cluster containing the
    pronoun span is looked up, and each span of that cluster is compared
    with the A and B mentions:

    * without ``tsv_file``, by span containment against the record's
      ``'a_subtoken_span'`` / ``'b_subtoken_span'``;
    * with ``tsv_file``, by substring overlap between the span's text and
      the A / B mention text taken from columns 4 and 7 of the matching TSV
      row. In this mode columns 3, 5 and 8 are read as ``'start:end'``
      offsets of the pronoun, A and B, and are checked against the flattened
      tokens before use.

    The result is written next to the input, with every ``'jsonlines'`` in
    the path replaced by ``'tsv'`` (or with ``'.tsv'`` appended when the
    path contains no ``'jsonlines'``, so the input is never overwritten).

    Args:
        json_file: Path of the JSON-lines predictions file.
        tsv_file: Path of the reference TSV file, or None.

    Raises:
        AssertionError: If a TSV offset does not select the expected text.
        KeyError: If a document key is missing from the TSV, or a record
            lacks a required field.
    """
    data = read_json(json_file)
    tsv = read_tsv_file(tsv_file) if tsv_file is not None else None
    predictions = ['\t'.join(['ID', 'A-coref', 'B-coref'])]
    for key, prediction in data.items():
        sents = util.flatten(prediction['sentences'])
        a_text, b_text = None, None
        if tsv is not None:
            row = tsv[key]
            # Diagnostic dump of the row's columns with their indices.
            print(list(enumerate(row)))
            a_offset = _parse_offset(row[5])
            b_offset = _parse_offset(row[8])
            pronoun_offset = _parse_offset(row[3])
            a_text, b_text = row[4], row[7]
            _check_mention(sents, a_offset, a_text)
            _check_mention(sents, b_offset, b_text)
            _check_mention(sents, pronoun_offset, row[2])
        pronoun_cluster = find_pronoun_cluster(prediction, prediction['pronoun_subtoken_span'])
        a_coref, b_coref = 'FALSE', 'FALSE'
        for span in pronoun_cluster:
            if tsv is None:
                a_aligned = is_aligned(span, prediction['a_subtoken_span'])
                b_aligned = is_aligned(span, prediction['b_subtoken_span'])
            else:
                a_aligned = is_substring_aligned(span, sents, a_text)
                b_aligned = is_substring_aligned(span, sents, b_text)
            if a_aligned:
                a_coref = 'TRUE'
            if b_aligned:
                b_coref = 'TRUE'
        predictions.append('\t'.join([key, a_coref, b_coref]))
    # write file
    with open(_prediction_path(json_file), 'w') as f:
        f.write('\n'.join(predictions))
if __name__ == '__main__':
    # Command line: <predictions.jsonlines> [reference.tsv]
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: {} <predictions.jsonlines> [reference.tsv]'.format(sys.argv[0]))
    json_file = sys.argv[1]
    # The TSV is only used when exactly two arguments are given.
    tsv_file = sys.argv[2] if len(sys.argv) == 3 else None
    convert(json_file, tsv_file)