-
Notifications
You must be signed in to change notification settings - Fork 0
/
latest.py
106 lines (80 loc) · 3.35 KB
/
latest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
"""
Created on Fri May 7 00:40:03 2021
@author: sahil
"""
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
from pathlib import Path
import pandas as pd
import spacy
import copy
from spacy.util import minibatch, compounding
import re
from spacy.training import Example
def clean_string(mystring):
    """Remove every character that is not an ASCII letter, a digit or a space.

    Args:
        mystring: Text to sanitize.

    Returns:
        A copy of ``mystring`` with all other characters (punctuation,
        symbols, tabs/newlines, non-ASCII letters, ...) deleted.
    """
    # Raw string: the previous non-raw pattern contained the invalid escape
    # sequence "\ ", which Python reports with a warning when compiling the
    # module. The character class itself is unchanged (letters, digits, space).
    return re.sub(r'[^A-Za-z0-9 ]+', '', mystring)
def train(model=None, output_dir=None, n_iter=10,
          data_path='Corona_NLP_train.csv', max_examples=5000):
    """Train a spaCy ``textcat`` component on a tweet-sentiment CSV file.

    The CSV is expected to provide an ``OriginalTweet`` column (the text) and
    a ``Sentiment`` column (the class name). After training, the pipeline is
    run on a fixed sample sentence and, if requested, saved to disk and
    reloaded as a sanity check. Progress is reported with ``print``.

    Args:
        model: Name or path of an existing spaCy pipeline to load. When
            ``None`` a blank English pipeline is created.
        output_dir: Directory to save the trained pipeline to. Nothing is
            saved when ``None``. Missing parent directories are created.
        n_iter: Number of passes over the training examples.
        data_path: Path of the CSV file to read the training data from.
        max_examples: Use at most this many rows (taken from the top of the
            file) for training. ``None`` uses every row.

    Notes:
        * Labels are registered from a fixed list plus every distinct value
          found in the ``Sentiment`` column. The fixed list has no plain
          ``'Negative'`` entry, so rows carrying a class outside that list
          previously had no matching label to learn.
        * Each minibatch is passed to ``nlp.update`` as a whole instead of
          one example at a time, so reported loss values differ from the
          per-example variant.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add the text classifier to the pipeline if it doesn't exist,
    # otherwise fetch it so labels can be added to it.
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.add_pipe('textcat')
    else:
        textcat = nlp.get_pipe('textcat')

    # ``error_bad_lines`` was removed from pandas.read_csv in pandas 2.0;
    # ``on_bad_lines='skip'`` is its replacement (pandas >= 1.3). Fall back
    # to the old keyword only when the installed pandas predates it.
    try:
        df = pd.read_csv(data_path, on_bad_lines='skip')
    except TypeError:
        df = pd.read_csv(data_path, error_bad_lines=False)

    # These columns are not used for training; tolerate files lacking them.
    df = df.drop(columns=['Location', 'ScreenName', 'TweetAt', 'UserName'],
                 errors='ignore')
    # Rows without text or label cannot be turned into training examples.
    df = df.dropna(subset=['OriginalTweet', 'Sentiment'])

    sentiment_values = df['Sentiment'].unique()

    # Register the fixed labels first (keeps their original order), then any
    # further class present in the data. add_label ignores known labels.
    base_labels = ['Neutral', 'Positive', 'Extremely Negative',
                   'Extremely Positive']
    extra_labels = [v for v in sentiment_values if v not in base_labels]
    for label in base_labels + extra_labels:
        textcat.add_label(label)

    # One (text, {"cats": {...}}) pair per row, with exactly one class set
    # to 1. Truncate before converting so unused rows are never processed.
    label_template = {value: 0 for value in sentiment_values}
    rows = df if max_examples is None else df.head(max_examples)
    train_data = []
    for _, row in rows.iterrows():
        cats = dict(label_template)  # flat dict of ints: shallow copy is enough
        cats[row['Sentiment']] = 1
        text = clean_string(str(row['OriginalTweet']))
        train_data.append((text, {"cats": cats}))

    # Disable every other pipe so that only textcat is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.select_pipes(disable=other_pipes):
        # spaCy v3 name for the deprecated ``begin_training``.
        optimizer = nlp.initialize()
        print("Training the model...")
        print('{:^5}\t'.format('LOSS'))
        for _ in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                # spaCy v3: nlp.update takes a list of Example objects.
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Modi government is so bad"
    doc = nlp(test_text)
    print(test_text, sorted(doc.cats.items(), key=lambda val: val[1], reverse=True))

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)