-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsing.py
155 lines (120 loc) · 4.78 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# parsing.py
#
# Created by Benjamin Wade
# In April, 2016
#
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to ``wn.ADJ``, ``wn.VERB``,
    ``wn.NOUN`` and ``wn.ADV`` respectively. Any other tag is returned
    unchanged.

    Credit to Suzana_K on Stack Overflow:
    http://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
    """
    # leading letter of the Penn tag -> name of the constant on ``wn``
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # looked up only on a match, so ``wn`` is untouched for other tags
            return getattr(wn, constant_name)
    # no WordNet counterpart: hand the original tag back as-is
    return treebank_tag
def meaning_tag(tagged_word):
    """Append a WordNet synset name (or '?') to a tagged word, in place.

    The synset is chosen in this order of preference:

    1. the first synset with the right part of speech whose name's lemma
       part equals the word (compared case-insensitively),
    2. the first synset with the right part of speech,
    3. the first synset of any part of speech,
    4. the string '?' when WordNet returns no synset for the word.

    Args:
        tagged_word: a mutable list whose item 0 is the word and item 1 is
            its Penn Treebank tag. The chosen meaning is appended to it.

    Returns:
        None; ``tagged_word`` is modified in place.
    """
    word, penn_tag = tagged_word[0], tagged_word[1]

    # grab a converted pos tag (penn -> wordnet)
    wn_tag = get_wordnet_pos(penn_tag)

    # WordNet files many adjectives under the "satellite" tag ('s') rather
    # than 'a'; accept both so those adjectives still count as a POS match.
    if wn_tag == wn.ADJ:
        wanted_pos = (wn.ADJ, wn.ADJ_SAT)
    else:
        wanted_pos = (wn_tag,)

    # grab meanings from wordnet, keeping WordNet's own ordering
    meanings = wn.synsets(word)
    pos_meanings = [m for m in meanings if m.pos() in wanted_pos]

    # Synset names look like "<lemma>.<pos>.<nn>" with a lower-cased lemma.
    # Split from the right so lemmas containing dots stay intact, and
    # lower-case the word so capitalised tokens can match exactly.
    lowered = word.lower()
    exact = next(
        (m for m in pos_meanings if m.name().rsplit('.', 2)[0] == lowered),
        None,
    )

    # assign meaning (synset), prioritizing exact word, then pos
    if exact is not None:
        chosen = exact.name()
    elif pos_meanings:
        chosen = pos_meanings[0].name()
    elif meanings:
        chosen = meanings[0].name()
    else:
        # handle unknown words
        chosen = '?'
    tagged_word.append(chosen)
def parser(sent):
    """Tokenize, POS-tag and meaning-tag a sentence, and classify its type.

    Processing steps, in order, for each token:

    * punctuation tokens are removed and used to set the sentence type,
    * a few common contractions are expanded ("'s" -> "is" unless it follows
      a noun, "'re" -> "are", "ca" -> "can", "wo" + "n't" -> "will",
      "n't" -> "not"),
    * a meaning is appended via ``meaning_tag`` ('?' for determiners and
      pronouns),
    * "to" followed by a VB/VBP token is merged into a single entry tagged
      'INF'; "to" followed by anything else is re-tagged 'IN'.

    If no token tagged as a verb was seen, the remaining words are run
    through ``pos_tag`` once more and any token newly tagged as a verb
    takes that tag.

    Args:
        sent: the raw sentence text.

    Returns:
        A ``(tagged_sent, sent_type)`` tuple. ``tagged_sent`` is a list of
        lists, normally ``[word, tag, meaning]``. ``sent_type`` is 'DEC',
        'INT' or 'EXC' (default 'DEC').
    """
    punctuation = ('.', ',', ';', ':', '?', '!')
    no_meaning_tags = ('DT', 'PRP$', 'PRP')

    # becomes True once a token whose tag starts with 'V' is seen
    found_verb = False

    # start with nltk's default pos tagger; lists instead of tuples so the
    # entries can be edited in place
    tagged_sent = [list(pair) for pair in pos_tag(word_tokenize(sent))]

    # set default sentence type and loop variable
    sent_type = 'DEC'
    i = 0

    # a while loop because entries are popped during the walk
    while i < len(tagged_sent):
        current = tagged_sent[i]

        # dealing with punctuation (which also affects sentence type)
        if current[0] in punctuation:
            punct = tagged_sent.pop(i)[0]
            if punct == '.':
                sent_type = 'DEC'
            elif punct == '?':
                sent_type = 'INT'
            elif punct == '!' and sent_type != 'INT':
                sent_type = 'EXC'
            # the next token has shifted into position i
            continue

        # Neighbours are looked up with explicit bounds checks: a bare
        # tagged_sent[i-1] at i == 0 would wrap around to the LAST token,
        # and tagged_sent[i+1] on the last token would raise IndexError.
        previous = tagged_sent[i - 1] if i > 0 else None
        following = tagged_sent[i + 1] if i + 1 < len(tagged_sent) else None

        # handle some common contractions
        word = current[0]
        if word == "'s":
            # after a noun "'s" is left alone; otherwise read it as "is"
            if previous is None or not previous[1].startswith('N'):
                current[0] = 'is'
        elif word == "'re":
            current[0] = 'are'
        elif word == 'ca':
            current[0] = 'can'
        elif word == 'wo':
            if following is not None and following[0] == "n't":
                current[0] = 'will'
        elif word == "n't":
            current[0] = 'not'

        # tag for meaning; pronouns and determiners get an unknown meaning
        if current[1] not in no_meaning_tags:
            meaning_tag(current)
        else:
            current.append('?')

        # concatenate and tag infinitives
        if previous is not None and previous[1] == 'TO':
            if current[1] in ('VB', 'VBP'):
                merged = previous[0] + ' ' + current[0]
                # the merged entry keeps the verb's meaning; the verb's own
                # entry is removed, so i already points at the next token
                tagged_sent[i - 1] = [merged, 'INF', tagged_sent.pop(i)[2]]
                continue
            # NOTE(review): this stores the TAG ('TO') as the word and drops
            # the meaning slot, leaving a 2-item entry. Kept as-is because
            # callers may depend on it -- confirm whether
            # [previous[0], 'IN', previous[2]] was intended.
            tagged_sent[i - 1] = [previous[1], 'IN']

        # check whether the current word is tagged as a verb
        if current[1].startswith('V'):
            found_verb = True
        i += 1

    # if there is no verb, run the sentence through the tagger again
    # (assumes that the problem was fixed by joining infinitives)
    if not found_verb and tagged_sent:
        retagged = pos_tag([entry[0] for entry in tagged_sent])
        for entry, (_, new_tag) in zip(tagged_sent, retagged):
            if new_tag != entry[1] and new_tag.startswith('V'):
                entry[1] = new_tag

    return tagged_sent, sent_type
if __name__ == '__main__':
    # interactive smoke test: parse one sentence typed by the user, then
    # show the tagged tokens followed by the detected sentence type
    user_sentence = input('Write a sentence: ')
    result = parser(user_sentence)
    tagged_tokens, sentence_kind = result
    print(tagged_tokens)
    print(sentence_kind)