import json
import os
import pickle

import spacy
from stemming.porter2 import stem

import worker

nlp = spacy.load('en')

# 10K common-word table: maps stemmed words to a frequency score (higher = more common).
tenK = pickle.load(open(os.path.dirname(__file__) + '/data/tenK.words', 'rb'))

# Map POS tags (spaCy form) to the synonym API's category labels.
# PUNCT, PART, SYM, X and INTJ carry no lookup-worthy content and are left unmapped.
POS_MAP = {'ADJ': '(adj)', 'PROPN': '(noun)', 'ADV': '(adv)', 'NOUN': '(noun)', 'VERB': '(verb)'}


class token:
    '''Lightweight wrapper around a spaCy token (word text, POS, lemma, fine-grained tag).'''

    def __init__(self, word, pos, lemma_, tag_):
        self.word = word
        self.pos = pos
        self.lemma_ = lemma_
        self.tag_ = tag_

    def get_word(self):
        return self.word

    def set_word(self, word):
        self.word = word

    def get_pos(self):
        return self.pos

    def set_pos(self, pos):
        self.pos = pos

    def get_lemma(self):
        return self.lemma_

    def get_tag(self):
        return self.tag_


def tokenize(sentence):
    '''
    Parse a sentence with spaCy and wrap each word with its POS, lemma and tag.
    Possessive/particle fragments are merged into the preceding token
    (e.g. "Alzheimer" + "'s" + "disease" becomes one token "Alzheimer's disease"),
    and adjacent nouns/adjectives are grouped into a single noun-phrase token.
    '''
    doc = nlp(unicode(sentence))
    tokens = []
    for word in doc:
        curr_token = token(word.orth_, word.pos_, word.lemma_, word.tag_)
        if ((word.pos_ == 'PART' and word.tag_ != 'TO' and word.tag_ != 'RP') or (word.tag_ == 'RB' and "'" in word.orth_) or (word.pos_ == 'VERB' and "'" in word.orth_)) and len(tokens) != 0:
            # Clitics such as "'s" or "n't": glue them onto the previous token.
            tokens[-1].set_word(tokens[-1].get_word() + word.orth_)
            continue
        elif (word.pos_ == 'NOUN' or word.pos_ == 'PROPN') and (len(tokens) != 0 and _isNounGroup(tokens[-1])):
            # Noun following a noun/adjective: extend the current noun group.
            tokens[-1].set_word(tokens[-1].get_word() + ' ' + word.orth_)
            tokens[-1].set_pos(word.pos_)
            continue
        elif (word.pos_ == 'ADJ' and not word.tag_ == 'PRP$') and (len(tokens) != 0 and (tokens[-1].get_pos() == 'NOUN' or tokens[-1].get_pos() == 'PROPN')):
            # Adjective immediately following a noun token: fold it into that token.
            tokens[-1].set_word(tokens[-1].get_word() + ' ' + word.orth_)
            continue
        tokens.append(curr_token)
    return tokens
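

# Illustrative example (not from the original file): on a sentence such as
# "Alzheimer's disease affects memory", the grouping above would typically yield
# tokens along the lines of "Alzheimer's disease" (NOUN), "affects" (VERB) and
# "memory" (NOUN); the exact tags depend on the spaCy model in use.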


def _isNounGroup(token):
    '''
    A noun-group member is a NOUN, PROPN, or non-possessive ADJ token.
    '''
    if type(token) == tuple:
        token = token[-1]
    pos = token.get_pos()
    return pos == 'NOUN' or pos == 'PROPN' or (pos == 'ADJ' and not token.get_tag() == 'PRP$')


def get_best_synonym(word_token):
    '''
    Query the synonym API for the token's word and return the synonym whose words
    are, on average, the most frequent according to the tenK table. Falls back to
    the original word if no suitable synonym is found.
    '''
    word = word_token.get_word()
    try:
        pos = POS_MAP[word_token.get_pos()]
        json_resp = json.loads(worker.request(word))['response']
    except:
        # Unmapped POS or a failed/malformed API response: keep the original word.
        return word
    # Collect synonyms whose category matches the token's POS.
    synonyms = []
    for w_type in json_resp:
        if w_type['list']['category'] == pos:
            synonyms.extend(w_type['list']['synonyms'].split('|'))
    # Score each synonym by the average frequency of its (stemmed) words;
    # any word missing from the tenK table disqualifies the whole synonym.
    most_freq = ('', 0)
    for synonym in synonyms:
        score = 0
        syn_tokens = synonym.split()
        synonym_len = len(syn_tokens)
        for syn_token in syn_tokens:
            syn_token = syn_token.lower()
            if syn_token == word.lower():
                # Don't let the original word inflate the average.
                synonym_len -= 1
                continue
            syn = stem(syn_token)
            if tenK.get(syn) is not None:
                score += tenK[syn]
            else:
                score = 0
                break
        try:
            score = score / float(synonym_len)
        except:
            # synonym_len can reach 0 when the synonym only repeats the original word.
            score = 0
        if score > most_freq[1]:
            most_freq = (synonym, score)
    if most_freq[1] == 0:
        return word
    return most_freq[0]
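

# Worked example of the scoring above (illustrative, with hypothetical tenK
# scores): for the word "ameliorate", a candidate synonym "make better" with
# stems scored tenK['make'] = 900 and tenK['better'] = 800 averages
# (900 + 800) / 2 = 850, while any candidate containing a word outside the
# tenK table scores 0 and is discarded.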


def smmrize(paragraph):
    '''
    Summarize the paragraph to 7 sentences with the SMMRY API.
    Returns the original paragraph if the response contains no summary.
    '''
    f = worker.smmry_request(paragraph)
    s = f.read()
    j = json.loads(s)
    try:
        return j["sm_api_content"]
    except:
        return paragraph


# POS tags that are never replaced and never get a leading space when rebuilding text.
SPECIAL = set(['SYM', 'PUNCT', 'SPACE'])


def simpli5(paragraph):
    '''
    Simplify the paragraph by replacing uncommon words with a common synonym,
    linking the original term to Wikipedia when a page is found.
    '''
    tokens = tokenize(paragraph)
    result = ''
    for i, tok in enumerate(tokens):
        text = tok.get_word()
        pos = tok.get_pos()
        words = text.split(' ')
        # A token is "common" only if every word in it stems to a tenK entry
        # (verb contractions are treated as common).
        common = True
        for word in words:
            if not stem(word).lower() in tenK and (pos != 'VERB' or "'" not in word):
                common = False
                break
        if not common and pos not in SPECIAL and pos != 'PRON' and len(text) > 1 and pos != 'NUM':
            synonym = get_best_synonym(tok)
            wiki_link = worker.wiki_request(text)
            orig = text
            if wiki_link is not None:
                # Render the original term as a Markdown link to its Wikipedia page.
                text = '[' + text + '](' + wiki_link + ')'
            if synonym != orig:
                # Lead with the simpler synonym and keep the original in parentheses.
                text = synonym + ' (' + text + ')'
        if i != 0 and pos not in SPECIAL and text != '-' and text != '(':
            result += ' '
        result += text
    return result
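

# Minimal usage sketch (illustrative, not part of the original module): runs the
# summarizer and the simplifier on a sample paragraph. It assumes the external
# services wrapped by worker (the synonym API, SMMRY and Wikipedia) are
# configured and reachable.
if __name__ == '__main__':
    sample = ("Alzheimer's disease is a chronic neurodegenerative disease "
              "that usually starts slowly and worsens over time.")
    summary = smmrize(sample)
    print(summary)
    print(simpli5(summary))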