-
Notifications
You must be signed in to change notification settings - Fork 23
/
wikt.py
146 lines (114 loc) · 4.87 KB
/
wikt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""Wiktionary-related classes and functionality."""
import re
from ucca.core import UCCAError
from ucca.postags import POSTags
class MultipleLemmasError(UCCAError):
pass
class LemmaNotFound(UCCAError):
pass
class WiktEntry:
"""Represents an entry of a phrase, with its POS, definition and lemma."""
_tags = {'Noun': POSTags.Noun,
'Proper noun': POSTags.Noun,
'Verb': POSTags.Verb}
def __init__(self, phrase, pos, defn, lemma):
self.phrase = phrase
self.defn = defn
self.lemma = lemma
if pos in self._tags:
self.pos = self._tags[pos]
else:
self.pos = POSTags.Other
def __str__(self):
return "{} ({})\t{}\t{}".format(self.phrase, self.lemma, self.pos,
self.defn)
class Wiktionary:
"""Wiktionary object which provides lemmas and definitions.
For lemmatization, we use the structure of wiktionary definitions.
On English wiktionary (and others) all inflected/transformed or alternative
forms of a word in general receive their own entry, whose definition is
a template of the kind {{X of|Y}}, e.g. {{abbreviation of|et cetera}}.
These templates are parsed and used to create a static lemmatizer, where
each form maps either to itself or to the main form extracted from the
template (assuming one for each entry).
"""
# The basic pattern form is {{X of|Y}}.
# As this template can appear with other templates, we restrict X and Y
# to exclude '}' symbols so we won't match two templates together.
# Some templates have options, given as a third argument, e.g. {{X of|Y|Z}}
# We therefore exclude '|' from X and Y matching and add a possible
# match for this addition.
_pattern = r'{{([^|}]+) of\|([^|}]+)\|?.*?}}'
def __init__(self, raw):
"""Initializes a wiktionary object from raw entries.
Args:
raw: a list of wiktionary entries, given as tab-separated strings
of language (ignored), phrase, Part-of-speech and definition.
"""
self._entries = {}
for line in raw:
_, phrase, pos, defn = line.split('\t')
match = re.search(Wiktionary._pattern, defn)
if match is None:
lemma = phrase
else:
lemma = match.group(2)
if lemma.startswith('[[') and lemma.endswith(']]'):
lemma = lemma[2:-2] # removing wikilink markup
self._entries[phrase] = self._entries.get(phrase, []) + [WiktEntry(
phrase, pos, defn, lemma)]
def lemmatize(self, phrase, pos=None):
"""Lemmatizes the phrase and returns the lemmatized string.
Args:
phrase: the phrase to lemmatize
pos: the part-of-speech of the phrase, according to which trying to
lemmatize. If not given, picks (one of) the shortest lemma
exists for that phrase
Returns:
the lemma of the phrase
Raises:
LemmaNotFound if the phrase doesn't exist in the lemmatizer.
MultipleLemmasError if the phrase have more than one lemma for
the POS given (isn't raised when pos is None).
"""
if phrase not in self._entries:
raise LemmaNotFound()
# Phrase ==> possible lemmas ==> entries for these lemmas
entries = []
for orig_entry in self._entries[phrase]:
entries.extend(self._entries.get(orig_entry.lemma, []))
if pos is None:
return sorted(entries, key=lambda x: len(x.phrase))[0].lemma
else:
lemmas = {e.lemma for e in entries if e.pos == pos}
if len(lemmas) == 0:
raise LemmaNotFound
elif len(lemmas) == 1:
return lemmas.pop()
else:
raise MultipleLemmasError
def lemmas(self, phrase):
"""Returns a set of possible lemmas for the given phrase.
Args:
phrase: the phrase to lemmatize
Returns:
a set of (part-of-speech, lemma) tuples for all possible
lemmatizations of the phrase. Can be empty.
"""
return {e.lemma for e in self._entries[phrase]}
def by_form(self, phrase):
"""Returns a list of entries which have the phrase as their form.
Args:
phrase: the phrase to search for
Returns:
a list of all WiktEntry objects whose lemma is the given phrase,
or that appear as a lemma of this phrase WiktEntry.
"""
entries = []
for e in self._entries.get(phrase, []):
if e.phrase == e.lemma:
entries.append(e)
else:
entries.extend(x for x in self._entries.get(e.lemma, [])
if x.pos == e.pos)
return entries