-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlemmatizer.py
66 lines (52 loc) · 2.05 KB
/
lemmatizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from conjugator import conjugate
with open('data\\verb_prefixes.txt', 'r', encoding='utf-8') as f:
PREFIXES = f.read().split('\n') + ['ne', 'be']
with open('data\\allconjs.txt', 'r', encoding='utf-8') as f:
CONJS = {i.split(', ')[0]: i.split(', ')[1] for i in f.read().split('\n')}
with open('data\\data.txt', 'r', encoding='utf-8') as f:
VERBS = f.read().split('\n')
def nested_vals(d, word):
for i, val in enumerate(d.values()):
cur_key = list(d.keys())[i]
if isinstance(val, dict):
nested = nested_vals(val, word)
if(nested):
return(cur_key + ' ' + nested)
else:
is_word = d[cur_key] == word
if('(' in d[cur_key] and not is_word):
is_word = d[cur_key].replace('(', '').replace(')', '') == word
if('(' in d[cur_key] and not is_word):
removed_parens = d[cur_key].split('(')[0] + d[cur_key].split('(')[1].split(')')[1]
is_word = removed_parens == word
if(is_word):
return(cur_key)
def main_prefix_removal(verb, unprefixed_verb):
if(verb == unprefixed_verb):
return(verb)
for i in PREFIXES:
if(verb.startswith(i)):
if(verb[len(i):] == unprefixed_verb):
return(verb[len(i):])
else:
verb_found = main_prefix_removal(verb[len(i):], unprefixed_verb)
if(verb_found):
return(verb_found)
def get_prefix(word, unlemmatized):
root = main_prefix_removal(word, unlemmatized)
try:
return(word.replace(root, ''))
except TypeError:
return('')
def get_info(word):
for unlemmatized, lemma in CONJS.items():
if(word.endswith(unlemmatized)):
prefix = get_prefix(word, unlemmatized)
if(prefix != ''):
word = word.strip(prefix)
return [prefix + lemma, conjugate(prefix + lemma)]
def get_lemma(word):
return get_info(word)[0]
# TODO: Fix get morph
# def get_morph(word):
# return get_info(word)[1]