-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspacy_1_basicFeatures.py
204 lines (157 loc) · 5.87 KB
/
spacy_1_basicFeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# -*- coding: utf-8 -*-
"""
Created on Tue May 22 2019
@author: Stacy Bridges
rem: using SpaCy library NLP features
rem: export annotations to numpy arrays
rem: lu prodigy @ https://prodi.gy/
lu:
(spacy POS distinctions)
JJ NN PRP MD VB IN NNP
lu: NER returns which labels (token.label_) ?
iob tagging = IOB: inside/outside/beginning of entity
"""
import spacy
from spacy.lang.en.examples import sentences
from spacy import displacy
from spacy.matcher import Matcher
# from spacy import displacy
# from spacy.lang.en.stop_words import STOP_WORDS
# spacy provides pre-trained models for syntax
def main():
    """Demonstrate basic spaCy NLP features.

    Walks through tokenization, POS tagging, dependency parsing, named
    entity recognition, noun chunks, spans, and lexical attributes.

    Requires the 'en_core_web_sm' model to be installed, and reads two
    local files: 'products_DescriptionOnly_short.csv' and
    'products_DescriptionOnly.csv'.
    """
    nlp = spacy.load('en_core_web_sm')

    # --- sentence example ------------------------------------------------
    print('sentence example: ---------------------------')
    sent = nlp(sentences[0])
    print(sent.text)
    for token in sent:
        print(token.text, token.pos_, token.dep_)
    print('\n')

    # --- string example --------------------------------------------------
    print('string example: ---------------------------')
    sampleString = u"I can't imagine spending $3000 for a single bedroom apartment in N.Y.C."
    # renamed from 'str' -- the original name shadowed the builtin str()
    sampleDoc = nlp(sampleString)
    print(sampleDoc.text)
    for token in sampleDoc:
        print(token.text, token.pos_, token.dep_)
    print('\n')

    # --- product file, line by line --------------------------------------
    print('products example 1: ---------------------------')
    # 'with' guarantees the handle is closed even if nlp() raises
    with open('products_DescriptionOnly_short.csv', 'rt') as infile:
        print(infile.read(), '\n')
        infile.seek(0)  # reset cursor so the loop below re-reads the file
        for line in infile:
            nlpStr = nlp(line.rstrip())
            for token in nlpStr:
                print(token.text, token.pos_, token.dep_)
            print('\n')

    # --- product file, whole-file doc ------------------------------------
    print('products example 2: ---------------------------')
    with open('products_DescriptionOnly.csv', 'rt') as infile:
        fData = infile.read()
    # the doc object is processed as it is passed to the language object
    nlpData = nlp(fData)
    print(nlpData)

    # print tokens
    print('\ntokens:')
    for tok in nlpData[:6]:
        print('{} -> {} -> {}'.format(tok.text, tok.pos_, tok.ent_type_))

    # print entities
    print('\nentities:')
    for ent in nlpData.ents:
        # ent.text replaces the removed Span.string attribute
        print('{} --> {}'.format(ent.text, ent.label_))

    # count selected entity types
    # rem: NLTK comes with pre-trained models for splitting text
    # to sentences and sentences to words
    print('\n')
    orgNum = 0
    carNum = 0
    perNum = 0
    print('ORGs:')
    for ent in nlpData.ents:
        # compare the string label (ent.label_) rather than the integer
        # hash (ent.label) against spacy.symbols -- clearer and the
        # documented idiom
        if ent.label_ == 'ORG':
            orgNum += 1
            print(ent.text)
        if ent.label_ == 'CARDINAL':
            carNum += 1
        if ent.label_ == 'PERSON':
            perNum += 1
    print('\n')
    print('# of ORG: ', orgNum)
    print('# of CARDINAL: ', carNum)
    print('# of PERSON: ', perNum)

    # --- additional token attributes -------------------------------------
    print('\nexplore additional spacy functions: ---------------')
    for token in nlpData[:6]:
        print('token.text: ', token.text)            # original string
        print('token.ent_type_: ', token.ent_type_)  # entity type label
        print('token.ent_iob_: ', token.ent_iob_)    # IOB: inside/outside/beginning of entity
        print('token.pos_: ', token.pos_)            # coarse part of speech
        print('token.tag_: ', token.tag_)            # fine-grained POS tag
        print('token.dep_: ', token.dep_)            # dependency relation
        print('token.head.text: ', token.head.text)  # navigate up the tree
        # lefts/rights are generators: materialize them so the children
        # are printed instead of a generator repr
        print('token.lefts: ', list(token.lefts))    # token's left children
        print('token.rights: ', list(token.rights))  # token's right children
        print('\n-----------------')

    # --- apply more spacy features to a string ---------------------------
    nuDoc = nlp('This is an SKF product called Ball Bearing for $45 USD')
    for token in nuDoc:
        print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}'.format(
            token.text,      # original string
            token.idx,       # character offset within the doc
            token.lemma_,    # base form of the word
            token.is_punct,  # bool: is it punctuation
            token.is_space,  # bool: is it a space
            token.shape_,    # visual signature ie: Xxxxx
            token.pos_,      # coarse part of speech
            token.tag_       # fine-grained POS tag
        ))

    # --- displaCy --------------------------------------------------------
    # NOTE: jupyter=True renders inline HTML; viewable in a notebook only
    print('\ndisplaCy snippet for jupyter notebook ---------------------------')
    doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
    displacy.render(doc, style='ent', jupyter=True)

    # --- noun chunker ----------------------------------------------------
    print('\ntest the chunker 1 -----------')
    doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.text)

    print('\ntest the chunker 2 -----------')
    doc = nlp('Bore Diameter 40mm inner ring width 23 mm spherial roller bearing')
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.text)

    # --- span object -----------------------------------------------------
    print('\ntest span object -----------')
    span = doc[2:6]  # tokens 2..5 of the bearing sentence
    print(span.text)

    # --- lexical attributes ----------------------------------------------
    print('\ntest lexical attributes ---------------')
    doc = nlp("It costs $5.")
    print('Text: ', 'It costs $5')
    print('Index: ', [token.i for token in doc])
    print('Text: ', [token.text for token in doc])
    print('is_alpha:', [token.is_alpha for token in doc])
    print('is_punct:', [token.is_punct for token in doc])
    print('like_num:', [token.like_num for token in doc])

    # --- dependency parser -----------------------------------------------
    print('\ntest the dependency parser -----------')
    doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
    for token in doc:
        print("{0}/{1} <--{2}-- {3}/{4}".format(
            token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))

    # end program
    print('\nDone.')


if __name__ == '__main__':
    main()