# server.py (174 lines, 136 loc, 4.87 KB)
import tensorflow as tf
import numpy as np
import flask
from flask import request
from flask_cors import CORS
import os
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('punkt')
# Reserved token indices used by the text encoding
pad_char = 0    # padding
start_char = 1  # sequence start
oov_char = 2    # out-of-vocabulary token
tokenizer = RegexpTokenizer(r'\w+')
def load():
    """Load the model and all the dictionaries needed for inference."""
    global word_id_dict, id_label_dict, num_words, stop_words, model
    punctuation = ['-', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}', '’', '”', '“', '``', "''"]
    stop_words = set(stopwords.words('italian'))
    stop_words.update(punctuation)
    with open('./data/id_word_dict.json', 'r') as f:
        id_word_dict = json.load(f)
    # invert id -> word into word -> id
    word_id_dict = {v: int(k) for k, v in id_word_dict.items()}
    with open('./data/label_index.json', 'r') as f:
        label_id_dict = json.load(f)
    id_label_dict = {v: k for k, v in label_id_dict.items()}
    with tf.device('/cpu:0'):
        model = tf.keras.models.load_model('./data/dropout_model.hdf5')
        # build the predict function eagerly so it is safe to call from Flask threads
        model._make_predict_function()
    print(model.summary())
    # the model's input dimension is the vocabulary size
    num_words = model.input_shape[-1]
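
# For reference, the two JSON files above are assumed to look roughly like this
# (hypothetical contents, inferred from how the dicts are inverted in load()):
#   id_word_dict.json:  {"3": "ciao", "4": "mondo", ...}   # id (as string) -> word
#   label_index.json:   {"sport": 0, "politica": 1, ...}   # label -> class index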
def hasnumbers(value):
    """Return True if the string contains any digit."""
    return any(c.isdigit() for c in value)
def tokenize_sentence(sentence, remove_stopwords=True, tokenizer=tokenizer.tokenize):
    """
    Tokenize the sentence and optionally remove stopwords.
    :param sentence: the sentence to be tokenized
    :param remove_stopwords: True to remove stopwords
    :param tokenizer: callable that splits a string into tokens
    :return: a generator yielding the list of cleaned, lower-cased tokens
    """
    # normalise quote characters so the regexp tokenizer handles them uniformly
    sentence = sentence.replace('`', ' ')
    sentence = sentence.replace("'", " ")
    sentence = sentence.replace("”", ' ')
    sentence = sentence.replace("“", ' ')
    words = []
    for w in tokenizer(sentence):
        # skip tokens that contain digits or are shorter than 3 characters
        if not hasnumbers(w) and len(w) > 2:
            w = w.replace('_', '')
            if remove_stopwords:
                if w not in stop_words:
                    words.append(w.lower())
            elif w in stop_words or len(w) > 1:
                words.append(w.lower())
    yield words
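
# A sketch of the expected behaviour (the example sentence is hypothetical):
#   list(tokenize_sentence("Il governo ha approvato la nuova legge"))
#   -> [['governo', 'approvato', 'nuova', 'legge']]
# Italian stopwords and tokens shorter than 3 characters ("Il", "ha", "la") are dropped.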
def sentence_to_idxs(tokenized_sentence):
    """
    Convert tokenized sentences into sequences of word indices.
    :param tokenized_sentence: iterable of token lists
    :return: list of index sequences, with unknown words mapped to oov_char
    """
    results = []
    for sample in tokenized_sentence:
        encoded_sample = []
        for w in sample:
            if w in word_id_dict:
                encoded_sample.append(word_id_dict[w])
            else:
                encoded_sample.append(oov_char)
        results.append(encoded_sample)
    return results
def vectorize_sequences(sequences, num_words):
    """
    :param sequences: list of index sequences
    :param num_words: vocabulary size (dimension of the output vectors)
    :return: sequences encoded as indicator (multi-hot) arrays
    """
    results = np.zeros((len(sequences), num_words))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results
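
# A worked example of the multi-hot encoding above (toy vocabulary size of 6):
#   vectorize_sequences([[2, 4], [1]], num_words=6)
#   -> array([[0., 0., 1., 0., 1., 0.],
#             [0., 1., 0., 0., 0., 0.]])
# Each row marks which word indices occur in the sample; counts and order are discarded.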
def sentence_pipeline(sentence):
    """
    :param sentence: raw input sentence
    :return: the sentence in its vectorized (multi-hot) form, shape (1, num_words)
    """
    tokenized = list(tokenize_sentence(sentence))
    # sentence_to_idxs already returns a list of samples, so pass it through directly
    sequences = sentence_to_idxs(tokenized)
    vectorized = vectorize_sequences(sequences, num_words)
    return vectorized
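
# Putting the pipeline together (shapes only; vocabulary contents are hypothetical):
#   vec = sentence_pipeline("una frase di prova")
#   vec.shape  ->  (1, num_words), a single multi-hot row ready for model.predict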
# create the app
app = flask.Flask("api-server", static_folder='public')
CORS(app)
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def serve(path):
    """Serve the static front-end, falling back to index.html."""
    if path != "" and os.path.exists(os.path.join('public', path)):
        return flask.send_from_directory('public', path)
    else:
        return flask.send_from_directory('public', 'index.html')
@app.route('/healthz', methods=['GET'])
def healthz():
    return flask.jsonify({'healthz': 'ok'})
@app.route('/predict', methods=['POST'])
def predict():
    data = {'success': False}
    content = request.get_json()
    try:
        if content:
            if 'sentence' in content:
                sentence = content['sentence']
                vectorized = sentence_pipeline(sentence)
                with tf.device('/cpu:0'):
                    predictions = model.predict(vectorized)
                # per-class probabilities, keyed by label name
                predictions_dict = {id_label_dict[i]: "{:.10f}".format(v)
                                    for i, v in enumerate(predictions[0]) if i in id_label_dict}
                predicted_class = np.argmax(predictions)
                data['prediction'] = str(id_label_dict[predicted_class])
                # truncate the top probability to three decimal places
                data['prediction_prob'] = "{:.10f}".format(predictions[0, predicted_class])[:5]
                data['prediction_probabilities'] = predictions_dict
                data['success'] = True
            else:
                data['message'] = 'missing sentence parameter'
        else:
            data['message'] = 'missing body'
    except Exception as e:
        data['message'] = 'got error {}'.format(e)
    return flask.jsonify(data)
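
# A usage sketch against the endpoint above (Flask's default port 5000; the
# sentence and the response values shown are hypothetical):
#   curl -X POST http://localhost:5000/predict \
#        -H 'Content-Type: application/json' \
#        -d '{"sentence": "Il governo ha approvato la nuova legge"}'
#   -> {"success": true, "prediction": "politica", "prediction_prob": "0.972", ...}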
if __name__ == "__main__":
    print(("* Loading Keras model and starting Flask server... "
           "please wait until the server has fully started"))
    load()
    app.run(debug=False, host='0.0.0.0')