-
Notifications
You must be signed in to change notification settings - Fork 8
/
morpho_utils.py
128 lines (99 loc) · 4.14 KB
/
morpho_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def candidate_selection(wn,
                        token,
                        target_lemma,
                        pos,
                        morphofeat,
                        use_case=False,
                        use_number=False,
                        gold_lexkeys=None,
                        case_freq=None,
                        plural_freq=None,
                        debug=False):
    """
    Return the candidate synsets of a token, optionally reduced by
    morphological strategies (capitalization and/or grammatical number).

    A synset survives the reduction when at least one enabled strategy
    matches it. When no synset survives (or no strategy is applicable),
    all original candidates are kept. The strategies are never applied
    when the lemma has exactly one candidate synset.

    :param wn: object exposing ``synsets(lemma)`` and ``synsets(lemma, pos)``;
        the returned synsets must expose ``lemmas()``, and each lemma
        ``name()`` and ``key()``
    :param str token: a token e.g. Congress or ethics
    :param str target_lemma: the lemma of the token, e.g. Congress or ethic
    :param str pos: supported: n. If None, ``wn.synsets`` is queried
        without a part of speech
    :param str morphofeat: morphofeat tag; the number strategy only fires
        for 'NNS' and 'NNPS'
    :param bool use_case: if set to True,
        morphological strategy case is used to reduce the polysemy
        (only fires when ``token.istitle()`` is True)
    :param bool use_number: if set to True,
        morphological strategy number is used to reduce the polysemy
    :param gold_lexkeys: iterable of gold sensekeys,
        e.g. {'congress%1:14:00::'}. None (default) means no gold keys
    :param dict case_freq: mapping of (lemma, pos) ->
        sensekey -> freq of capitalized tokens that refer to this sensekey
        (lemma is looked up lowercased)
    :param dict plural_freq: mapping of (lemma, pos) ->
        sensekey -> freq of plural tokens that refer to this sensekey
        (lemma is looked up lowercased)
    :param bool debug: if set to True, the unreduced candidates are printed

    :rtype: tuple
    :return: (candidate_synsets,
              new_candidate_synsets,
              gold_in_candidates)
        where gold_in_candidates is True when any gold sensekey belongs
        to a synset in new_candidate_synsets

    :raises AssertionError: if a strategy is enabled without its
        frequency mapping
    """
    # Raised explicitly (instead of via ``assert``) so that the validation
    # is not stripped when Python runs with -O; type and message unchanged.
    if use_case and case_freq is None:
        raise AssertionError('case_freq should not be None')
    if use_number and plural_freq is None:
        raise AssertionError('plural_freq should not be None')

    gold = frozenset(gold_lexkeys) if gold_lexkeys is not None else frozenset()

    if pos is None:
        candidate_synsets = wn.synsets(target_lemma)
    else:
        candidate_synsets = wn.synsets(target_lemma, pos)

    if debug:
        print(candidate_synsets)

    # a monosemous lemma cannot be reduced any further
    apply_morph_strategy = len(candidate_synsets) != 1

    # The strategy switches do not depend on the individual synset,
    # so they are decided once, before the loop.
    check_number = bool(use_number
                        and apply_morph_strategy
                        and morphofeat in {'NNS', 'NNPS'})
    check_case = bool(use_case
                      and apply_morph_strategy
                      and token.istitle())

    new_candidate_synsets = []

    if check_number or check_case:
        freq_key = (target_lemma.lower(), pos)
        lemma_plural_freq = plural_freq.get(freq_key, {}) if check_number else {}
        lemma_case_freq = case_freq.get(freq_key, {}) if check_case else {}

        for synset in candidate_synsets:
            lemmas = synset.lemmas()

            # number: a sensekey of this synset was observed in the plural
            keep = check_number and any(lemma.key() in lemma_plural_freq
                                        for lemma in lemmas)

            # case: a lemma is spelled exactly like the token, or a sensekey
            # of this synset was observed capitalized
            if check_case and not keep:
                keep = any(lemma.name() == token
                           or lemma.key() in lemma_case_freq
                           for lemma in lemmas)

            if keep:
                new_candidate_synsets.append(synset)

    # if no synsets remain, use original ones (copied, so that the two
    # returned lists never share the same list object)
    if not new_candidate_synsets:
        new_candidate_synsets = list(candidate_synsets)

    # check if gold is among the candidates that are actually returned
    gold_in_candidates = bool(gold) and any(
        not gold.isdisjoint(lemma.key() for lemma in synset.lemmas())
        for synset in new_candidate_synsets)

    return candidate_synsets, new_candidate_synsets, gold_in_candidates