#!/usr/bin/env python
"""API to Data Community DC Data Lake built on Flask"""
from flask import Flask, request, Response
from werkzeug.exceptions import NotFound, Unauthorized, UnsupportedMediaType, BadRequest
import boto
import json, ujson
import uuid
import redis
import amqp
import ThreeScalePY
import urllib, urllib2
import sys
import traceback
import logging
import math
import magic
import xlrd
from logging import FileHandler
from config import Config
from logging.handlers import RotatingFileHandler
from config import *  # supplies the module-level settings used below (LOGFILE, USESSL, REDISHOST, AWS keys, etc.)
# Install Dependencies:
# sudo apt-get update
# sudo apt-get install python-setuptools python-libxml2
# sudo easy_install pip
# pip install flask boto redis celery filemagic xlrd
# pip install Flask-DotEnv
# Get the latest ThreeScale python library here: https://github.com/3scale/3scale_ws_api_for_python
# and follow the directions to install
app = Flask(__name__)
app.config.from_object(__name__)
app.config.from_object(Config["DC2DL_config"])
HANDLER = FileHandler(LOGFILE)
HANDLER.setLevel(logging.INFO)
app.logger.addHandler(HANDLER)
if USESSL:
    from OpenSSL import SSL
    ctx = SSL.Context(SSL.SSLv23_METHOD)
    ctx.use_privatekey_file('ssl/ssl.key')
    ctx.use_certificate_file('ssl/ssl.cert')
MAX_MEGABYTES = 10000 # 10Gigs, what do we believe would limit what we ingest?
app.config['MAX_CONTENT_LENGTH'] = MAX_MEGABYTES * 1024 * 1024
ACCEPTED_MIMETYPES = ["application/json",
                      "text/plain",
                      "text/csv",
                      "application/vnd.ms-excel",
                      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
ERROR_MESSAGES = {404: "The requested resource cannot be found. Please check the API documentation and try again.",
                  401: "You are not authenticated to use this resource. Please provide a valid user_id and user_key pair.",
                  413: "The file that was submitted is too large. Please submit a file smaller than %i megabytes." % MAX_MEGABYTES,
                  415: "The file that was submitted is an unsupported type. Please submit valid plain text, CSV, or an Excel spreadsheet.",
                  500: "An error has occurred in the analysis. Please contact [email protected] for assistance.",
                  400: "The request is missing required parameters or is otherwise malformed. Please check the API documentation and try again."}
ROOT = "/v1/documents"
# Connections
RED = redis.Redis(host=REDISHOST, port=REDISPORT, db=0)
S3 = boto.connect_s3(aws_access_key_id=AWSKEY, aws_secret_access_key=AWSSECRET).get_bucket(BUCKET)
def make_key(user_id, user_key, task_id):
    """
    This is a naming convention for both redis and s3.
    The task_id keeps track of the data the user is uploading or maintaining. NECESSARY?
    """
    return user_id + "/" + user_key + "/" + task_id
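# post_initial_status() and get_status() are referenced below but their definitions are
# not present in this capture. The sketches here are hypothetical reconstructions based
# on their call sites and on the status fields the queue() endpoint reads
# (documentStatus, percentComplete, taskStatusUpdate); treat the details as assumptions.
def post_initial_status(key):
    """Record a fresh QUEUED status for a new job in Redis. (Hypothetical reconstruction.)"""
    status = {"documentStatus": "QUEUED",
              "percentComplete": 0,
              "taskStatusUpdate": "Job received and queued for processing."}
    RED.set(key, json.dumps(status))

def get_status(user_id, user_key, task_id):
    """Fetch the current status dict for a job from Redis. Raise NotFound for an unknown
    task id. (Hypothetical reconstruction.)"""
    raw = RED.get(make_key(user_id, user_key, task_id))
    if raw is None:
        raise NotFound()
    return json.loads(raw)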
def check_credentials():
    """Check the submitted credentials against the configured user lists.
    Return the user_id and user_key pair if valid. Raise an Unauthorized error if not."""
    # We'd like to know who is accessing our API in general.
    try:
        origin = str(request.environ['HTTP_ORIGIN'])
    except KeyError:
        origin = 'UNKNOWN'
    app.logger.info("Checking credentials.")
    user_id = request.args.get("user_id")
    user_key = request.args.get("user_key")
    app.logger.info("Origin: " + origin)
    if user_id is None or user_key is None:
        app.logger.info("They have to give us an id and key to work with!")
        raise Unauthorized()
    app.logger.info("Checking " + user_id + " against " + str(USER_IDS))
    if user_id in USER_IDS and user_key in USER_KEYS[user_id]:
        app.logger.info("Ok, they know what they are asking for. Moving on.")
        return user_id, user_key
    app.logger.info("Are they trying to hack us?")
    raise Unauthorized()
def validate_parameters(params, expected_params):
    """Validate a dict of parameters against expected names and types.
    Raise a BadRequest error if a parameter is missing or has the wrong type."""
    for key, ptype in expected_params.items():
        try:
            ptype(params[key])
        except (KeyError, ValueError):
            raise BadRequest()
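# The original definition of validate_input_file() is not present in this capture. The
# sketch below is a hypothetical reconstruction based on how it is called in documents()
# and on the `magic` (filemagic) and `xlrd` imports above; treat it as an assumption,
# not the original implementation.
def validate_input_file(submitted_file, ctype):
    """Check that the file contents actually match the declared content type.
    Raise UnsupportedMediaType if they don't. (Hypothetical reconstruction.)"""
    contents = submitted_file.read()
    submitted_file.seek(0)  # rewind so the later upload to s3 sees the whole file
    with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as sniffer:
        detected = sniffer.id_buffer(contents)
    if ctype in ("application/vnd.ms-excel",
                 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
        try:
            xlrd.open_workbook(file_contents=contents)  # verify it parses as a spreadsheet
        except xlrd.XLRDError:
            app.logger.error("Declared Excel file failed to parse. detected: %s", detected)
            raise UnsupportedMediaType()
    elif detected not in ACCEPTED_MIMETYPES:
        app.logger.error("Detected mimetype %s is not in the accepted list", detected)
        raise UnsupportedMediaType()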
def make_response(data, code, headers=None):
    """Create a complete JSON response from an object using the Flask Response type"""
    response = Response(json.dumps(data, indent=4) + "\n", status=code, mimetype='application/json')
    response.headers["Server"] = "DC2 Data Lake API"
    if headers is not None:
        for key, value in headers.items():
            response.headers[key] = value
    return response
@app.errorhandler(401)
@app.errorhandler(404)
@app.errorhandler(413)
@app.errorhandler(415)
@app.errorhandler(400)
def handle_user_error(err):
    """Handle all 400 level errors. Don't send a support email since this is the user's problem."""
    message = {"status": "fail",
               "data": {"reason": ERROR_MESSAGES[err.code]}}
    app.logger.warning("A user error occurred. code: %i, path: %s", err.code, request.path)
    return make_response(message, err.code)
@app.errorhandler(500)
def handle_internal_error(err):
    """Handle all 500 level errors. Send a detailed email about the error. Respond with a helpful message."""
    message = {"status": "fail",
               "data": {"reason": ERROR_MESSAGES[500]}}
    exc_type, exc_value, exc_traceback = sys.exc_info()
    exc = traceback.format_exception(exc_type, exc_value, exc_traceback)
    exc = ''.join(exc)
    error_message = {"api_user": "",
                     "api_key": "",
                     "to": SUPPORTEMAIL,
                     "from": "[email protected]",
                     "subject": "DC2 Data Lake API Error",
                     "text": "An error occurred in the DC2 Data Lake API: \n\nENDPOINT: "
                             + request.path + " " + request.method + "\n\n" + exc}
    data = urllib.urlencode(error_message)
    urllib2.urlopen(url="https://api.sendgrid.com/api/mail.send.json", data=data).read()
    app.logger.error("ERROR: " + error_message["text"])
    return make_response(message, 500)
def submit_job(user_id, user_key, task_id, mimetype):
    """Submit a job to the queue for the Celery worker. Create the required JSON message and post it to RabbitMQ."""
    # These are the args that the Python function in the adjunct processor will use.
    kwargs = {"user_id": user_id,
              "user_key": user_key,
              "task_id": task_id,
              "format": mimetype,
              "s3_endpoint": S3ENDPOINT,
              "bucket": BUCKET,
              "redis_port": REDISPORT,
              "redis_host": REDISHOST}
    # Recreate a celery message manually so that we don't need to import celery_tasks.py, which has heavy dependencies.
    job = {"id": task_id,
           "task": "dc2_master",
           "kwargs": kwargs}
    # Connect to RabbitMQ and post.
    conn = amqp.Connection(host=RMQHOST, port=RMQPORT, userid=RMQUSERNAME,
                           password=RMQPASSWORD, virtual_host=RMQVHOST, insist=False)
    cha = conn.channel()
    msg = amqp.Message(json.dumps(job))
    msg.properties["content_type"] = "application/json"
    cha.basic_publish(routing_key=RMQEXCHANGE, msg=msg)
    cha.close()
    conn.close()
# The @app.route function decorators map endpoints to functions.
@app.route(ROOT, methods=['POST', 'GET'])
def documents():
    """The POST method for this endpoint is where API clients submit jobs.
    The GET method returns a list of previous task ids."""
    if request.method == "POST":
        # Log that we got a request.
        app.logger.info("Got a file POST request")
        # Extract and validate credentials.
        user_id, user_key = check_credentials()
        # Get the file, validate the type, and make sure the file itself is valid.
        # The EntityTooLarge error is raised automatically by Flask.
        submitted_file = request.files.get('file')
        if submitted_file is None:
            raise BadRequest()
        size = len(submitted_file.read())
        submitted_file.seek(0)  # rewind after measuring so the upload isn't empty
        ctype = submitted_file.content_type
        if ctype not in ACCEPTED_MIMETYPES:
            app.logger.error("Unsupported Media Type: %s", ctype)
            raise UnsupportedMediaType()
        validate_input_file(submitted_file, ctype)
        # Validate the parameters and set a default.
        if ctype != "text/plain":
            validate_parameters(request.form, {"text_col": int})
            text_col = int(request.form.get("text_col"))
        else:
            text_col = 0
        # If we've reached this point, everything looks good, so generate a task id.
        task_id = str(uuid.uuid4())
        # Post initial status to Redis, upload to s3, and submit the job to RabbitMQ.
        key = make_key(user_id, user_key, task_id)
        post_initial_status(key)
        ###_____________________________________________________________________
        # HOW DO WE WANT TO ORGANIZE S3?? HOW DO WE WANT TO ORGANIZE S3??
        S3.new_key("input/" + key).set_contents_from_file(submitted_file)
        # APPLICABLE ONLY IF WE HAVE AN AUTOMATED PROCESS ON ANOTHER SERVER WITH EACH SUBMISSION
        # submit_job(user_id, user_key, task_id, ctype, text_col, dedupe)
        # Finally, return a message to the client and write to the log file.
        data = {"status": "success",
                "data": {"job_id": task_id,
                         "file_size": size,
                         "mime_type": ctype,
                         "links": [{"rel": "queue",
                                    "href": ROOT + "/queue/" + task_id,
                                    "type": "application/json"}]}}
        app.logger.info("File successfully submitted. type: %s, size: %i, user_id: %s, task_id: %s",
                        ctype, size, user_id, task_id)
        return make_response(data, 202, headers={"Location": ROOT + "/queue/" + task_id})
    if request.method == "GET":
        # Extract and validate credentials.
        user_id, user_key = check_credentials()
        # Get the list of previous task ids from s3.
        outputs = set([key.name.split("/")[-2] for key in S3.list(prefix="output/" + user_id + "/" + user_key)])
        # Optionally paginate results.
        if request.args.get("max_results"):
            per_page = int(request.args.get("max_results"))
            offset = 0
            if request.args.get("offset"):
                offset = int(request.args.get("offset"))
            outputs = list(outputs)[offset:offset + per_page]
        if len(outputs) == 0:
            raise NotFound()  # should this be a 404?
        # Return the list in a JSON response.
        data = {"status": "success",
                "data": {}}
        data["data"]["links"] = []
        for task_id in outputs:
            data["data"]["links"].append({"rel": task_id,
                                          "href": ROOT + "/" + task_id,
                                          "type": "application/json"})
        return make_response(data, 200)
@app.route(ROOT + '/twitter', methods=['GET'])
def get_twitter():
    """Stub endpoint for Twitter data; the implementation is not in this version of the source."""
    time_range = request.args.get("time_range")  # MMDDYYYY-MMDDYYYY
    # Return a placeholder so the route doesn't crash (the original body is missing).
    return make_response({"status": "fail",
                          "data": {"reason": "Not implemented."}}, 501)
@app.route(ROOT + '/queue/<string:task_id>', methods=['GET'])
def queue(task_id):
    """This endpoint is where clients poll to find the status of their jobs."""
    # Extract and validate credentials.
    user_id, user_key = check_credentials()
    # Get status from Redis.
    status = get_status(user_id, user_key, task_id)
    # Build the response.
    data = {"status": "success",
            "data": status}
    data["job_id"] = task_id
    headers = {}
    code = 200  # default
    # If the job is complete, give a link to the listing endpoint.
    if status["documentStatus"] == "COMPLETE":
        headers["Location"] = ROOT + "/" + task_id
        data["data"]["links"] = [{"rel": task_id,
                                  "href": ROOT + "/" + task_id,
                                  "type": "application/json"}]
    # If the adjunct had an error, log it and return a 500 status.
    elif status["documentStatus"] == "FAIL":
        app.logger.info("Looks like there was a DC2 Master error:\n%s", status)
        data["sad_face"] = ":_("
        try:
            # The status update lives in the status dict (data["data"]), not at the top level.
            data["data"]["taskStatusUpdate"] += " Please contact [email protected]."
        except KeyError:
            app.logger.error("Key ERROR! No taskStatusUpdate")
            data["data"]["taskStatusUpdate"] = "Please contact [email protected]."
        data["error"] = data["data"]["taskStatusUpdate"]
        app.logger.warning("An error occurred in the adjunct: %s", status.get("error"))
        code = 500
    # If the job is queued or processing, give a link to this same endpoint.
    else:
        headers["Location"] = ROOT + "/queue/" + task_id
        data["data"]["links"] = [{"rel": "queue",
                                  "href": ROOT + "/queue/" + task_id,
                                  "type": "application/json"}]
    app.logger.info("Polling task_id: %s, status: %s, percentComplete: %i",
                    task_id, status["documentStatus"], status["percentComplete"])
    return make_response(data, code, headers)
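# The original definition of check_credentials_and_status() is not present in this
# capture. The sketch below is a hypothetical reconstruction inferred from its call
# sites in list_results() and get_file(): validate credentials, then confirm the job
# has finished before exposing results. Treat the details as assumptions.
def check_credentials_and_status(task_id):
    """Validate credentials and verify the job is COMPLETE before serving results.
    (Hypothetical reconstruction.)"""
    user_id, user_key = check_credentials()
    status = get_status(user_id, user_key, task_id)
    if status["documentStatus"] != "COMPLETE":
        # Results don't exist yet (or the job failed), so report the resource as missing.
        raise NotFound()
    return user_id, user_key, status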
@app.route(ROOT + '/<string:task_id>', methods=['GET'])
def list_results(task_id):
    """This endpoint lists the locations of the results."""
    # Extract and validate credentials. Verify that the job is done.
    user_id, user_key, status = check_credentials_and_status(task_id)
    # Get the list of outputs for this task id.
    # outputs = [key for key in S3.list(prefix="output/" + make_key(user_id, user_key, task_id)) if key.content_type == "text/csv"]
    outputs = [key for key in S3.list(prefix="output/" + make_key(user_id, user_key, task_id))
               if key.name.endswith("csv")]
    if len(outputs) == 0:
        app.logger.error("From list_results, document was complete but cannot find files!")
        raise NotFound()
    # Build the JSON response with the locations of the results.
    data = {"status": "success",
            "data": {}}
    data["data"]["links"] = []
    for key in outputs:
        name = key.name.split("/")[-1]
        # For some reason expiry_date doesn't work, so it's gone for now:
        # "expiry_date": key.expiry_date
        data["data"]["links"].append({"rel": name,
                                      "href": ROOT + "/" + task_id + "/" + name,
                                      "type": "text/csv",
                                      "size": key.size,
                                      "completion_date": key.last_modified})
    return make_response(data, 200)
def get_file(task_id, name):
    """Helper function for spitting out a file on s3"""
    # Validate credentials and check status.
    user_id, user_key, status = check_credentials_and_status(task_id)
    key = S3.get_key("output/" + make_key(user_id, user_key, task_id) + "/" + name)
    if key is None:
        raise NotFound()
    app.logger.info("Results delivered. name: %s, user_id: %s, task_id: %s", name, user_id, task_id)
    # Return results directly from s3.
    return key.get_contents_as_string(), 200
@app.route(ROOT + '/<string:task_id>/comments.csv', methods=['GET'])
def comments(task_id):
    """Return the comments.csv results"""
    return get_file(task_id, "comments.csv")

@app.route(ROOT + '/<string:task_id>/graph.json', methods=['GET'])
def graph(task_id):
    """Return the graph.json results"""
    return get_file(task_id, "graph.json")

@app.route('/')
def info():
    """Return some info, useful for testing deployment"""
    return "This is the Data Community DC Data Lake API. Please see the documentation for use."
if __name__ == '__main__':
    # The log handler is already attached at import time above, so it isn't re-added here.
    # handler = RotatingFileHandler(LOGFILE, maxBytes=1024*1024, backupCount=10)
    if USESSL:
        # app.run(host='0.0.0.0', port=FLASKPORT, ssl_context=ctx)
        app.run(host='0.0.0.0', port=FLASKPORT, ssl_context='adhoc')
    else:
        app.run(host='0.0.0.0', port=FLASKPORT)