-
Notifications
You must be signed in to change notification settings - Fork 1
/
run.py
444 lines (366 loc) · 15.5 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#!/usr/bin/env python3
"""The API implementation.
This file contains callbacks extending the basic REST functionality,
validations and authorization implementation.
"""
import re
from datetime import datetime, date, timedelta
import os
import os.path
import glob
import shutil
import json
import urllib.parse
import requests
from eve import Eve
from eve.io.mongo import Validator
from eve.auth import BasicAuth
from eve.utils import config, debug_error_message
from flask import request, current_app, Flask, jsonify, Response, abort
from flask.ext.cors import CORS
from bson.objectid import ObjectId
import settings
def on_fetched_item_callback(resource, response):
"""Removes `_id` from the fetched document and embeds related
items requested in the `embed` URL query parameter.
Allows multilevel embedding (limited to 3 levels). Nested embedded
relations are separated by dot. Example:
.../organizations/1234567890?embed=["parent", "memberships.person"]
We cannot use Eve's built-in embedded resource serialization
because it handles only entities directly referenced by a field in
the document and replaces that field. However, we need to embed
reverse relations too, e.g. `memberships` in the organization
entity lists all memberships referencing the organization.
Furthermore, unlike the Eve built-in embedding Popolo requires that
referenced items are embedded as a field with different name from
the referencing one (e.g. `organization` vs. `organization_id`) and
allows multilevel embedding.
"""
del response['_id']
if 'embed' in request.args:
try:
embed = json.loads(request.args['embed'])
for path in embed:
_embed_relation(resource, path, response, [(resource, response['id'])])
except:
abort(400, description=debug_error_message('Unable to parse `embed` clause'))
def on_fetched_resource_callback(resource, response):
"""Removes `_id` from all fetched documents and embeds related
items requested in the `embed` URL query parameter.
See `on_fetched_item_callback`. Example:
.../organizations?where={...}&embed=["memberships.person"]
"""
for item in response['_items']:
on_fetched_item_callback(resource, item)
def _embed_relation(resource, path, document, ancestors):
"""Embeds entities of a given (eventually multilevel) relation into
the document.
:param resource: resource of the document
:param path: dot separated chain of relation names
:param document: document to embed into
:param ancestors: list of entities on the current 'path' of embedding
List in the `ancestors` parameter containing tuples of resource
name and entity id is used to prevent embedding of an entity into
itself and to limit the depth of nested embedding.
"""
# Extract the topmost relation from the chain of relations and check
# if there is a reference to a related entity in the actual document
rel_name, _, tail = path.partition('.')
relation = config.DOMAIN[resource]['relations'].get(rel_name)
if not relation or relation['field'] not in document: return
# Embed unless the entity is already embedded
if rel_name not in document:
# Retrieve the related entities
related_resource = current_app.data.driver.db[relation['resource']]
value = document[relation['field']]
results = related_resource.find({relation['fkey']: value})
entities = []
for result in results:
# Prevent embedding of an entity into itself
if (relation['resource'], result['id']) in ancestors:
continue
result.pop('_id')
# Omit xxx_id property in embedded entity - it is redundant with id it references
if relation['fkey'] != 'id':
result.pop(relation['fkey'])
entities.append(result)
if entities:
# Either entity or list of entities will be embedded depending on singular or plural of relation name
if rel_name.endswith('s'):
document[rel_name] = entities
else:
document[rel_name] = entities[0]
# Omit xxx_id property in embedding entity - it is redundant with id it references
if relation['field'] != 'id':
document.pop(relation['field'])
# Recursively resolve deeper levels of embedding (limited to 3 levels)
if tail and rel_name in document and len(ancestors) < 3:
entities = document[rel_name]
if not isinstance(entities, list):
entities = [entities]
for subdoc in entities:
ancestors.append((relation['resource'], subdoc['id']))
_embed_relation(relation['resource'], tail, subdoc, ancestors)
ancestors.pop()
def on_insert_callback(resource, documents):
"""Creates default id-s (ObjectId as a string) in documents being inserted
not containing `id` field and mirrors remote files referenced
in the documents.
"""
for doc in documents:
if 'id' not in doc:
doc['id'] = str(ObjectId())
# Download remote files referenced in the document and modify
# document fields to point to the local copy.
for field in config.DOMAIN[resource].get('save_files', []):
_relocate_url_field(resource, field, doc, {'id': doc['id']})
def on_update_callback(resource, updates, original):
"""Adds all changes in updated tracked properties to the list in
`changed` property of the resource.
"""
for field in config.DOMAIN[resource].get('save_files', []):
_relocate_url_field(resource, field, updates, original)
effective_date = request.args.get('effective_date') or date.today().isoformat()
if effective_date == 'fix': return
changes = []
for field in config.DOMAIN[resource].get('track_changes', []):
if field in updates and updates[field] != original.get(field):
change = _build_change(field, original, effective_date)
changes.append(change)
if changes or 'changes' in updates:
updates['changes'] = changes + updates.get('changes', []) + original.get('changes', [])
def on_replace_callback(resource, document, original):
"""Adds all changes in all tracked properties to the list in
`changed` property of the resource.
"""
for field in config.DOMAIN[resource].get('save_files', []):
_relocate_url_field(resource, field, document, original)
effective_date = request.args.get('effective_date') or date.today().isoformat()
if effective_date == 'fix': return
changes = []
for field in config.DOMAIN[resource].get('track_changes', []):
if document.get(field) != original.get(field):
change = _build_change(field, original, effective_date)
changes.append(change)
if changes or 'changes' in document or 'changes' in original:
document['changes'] = changes + document.get('changes', []) + original.get('changes', [])
def _relocate_url_field(resource, field, document, original):
"""Translates remote URL in the given field of the document to a
corresponding locally hosted file. Also if the field is newly added
or the remote file has changed, download it and store as a new
file.
If the remote URL does not exist, the field is left unchanged.
Implementation for `image` field in Slovak parliament (sk/nrsr) is
specific. Due to padded files it is impossible to detect changed
file by its changed length so the files are compared by content.
"""
if field not in document: return
# Get info about the URL target file.
try:
remote_url = document[field]
if config.URL_PREFIX == 'sk/nrsr' and field == 'image':
resp = requests.get(remote_url)
else:
resp = requests.head(remote_url, allow_redirects=True)
resp.raise_for_status()
except requests.exceptions.RequestException:
return
content_type = resp.headers['content-type']
_, ext = content_type.split('/')
# If the field is newly added or the remote file has changed, it must be downloaded.
pathfile = (config.FILES_DIR + '/' + config.URL_PREFIX + '/' +
resource + '/' + str(original['id']) + '/' + field)
if not original.get(field):
create_file = pathfile + '.' + ext
else:
create_file = None
url = urllib.parse.unquote(original[field])
host = '%s://%s' % (config.FILES_PROTOCOL, config.FILES_HOST)
existing_file = url.replace(host, config.FILES_DIR, 1)
if not os.path.isfile(existing_file) or \
int(resp.headers['content-length']) != os.path.getsize(existing_file):
n = len(glob.glob(pathfile + '.*'))
create_file = pathfile + '.' + str(n+1) + '.' + ext
if config.URL_PREFIX == 'sk/nrsr' and field == 'image':
with open(existing_file, 'rb') as f:
existing_content = f.read()
if resp.content != existing_content:
n = len(glob.glob(pathfile + '.*'))
create_file = pathfile + '.' + str(n+1) + '.' + ext
# Download and store the remote file.
if create_file:
if not (config.URL_PREFIX == 'sk/nrsr' and field == 'image'):
resp = requests.get(remote_url)
resp.raise_for_status()
os.makedirs(os.path.dirname(create_file), exist_ok=True)
with open(create_file, 'wb') as f:
f.write(resp.content)
# Modify the field in the document to contain the local file.
if create_file:
url = create_file.replace(config.FILES_DIR, config.FILES_HOST, 1)
url = urllib.parse.quote(url)
document[field] = '%s://%s' % (config.FILES_PROTOCOL, url)
else:
document[field] = original[field]
def _build_change(field, original, effective_date):
"""Creates a change holding original value of the field to include
into the list of changes.
"""
change = {
'property': field,
'value': original.get(field),
'end_date': _datestring_add(effective_date, -1),
}
# If there are older values of this property already present in `changes`,
# then validity of the current value will start immediately after the most
# recent one.
former_end_dates = [ch['end_date']
for ch in original.get('changes', [])
if ch['property'] == field]
if former_end_dates:
change['start_date'] = _datestring_add(max(former_end_dates), 1)
return change
def _datestring_add(datestring, days):
"""Returns the date specified as string in ISO format with given
number of days added.
"""
return (datetime.strptime(datestring, '%Y-%m-%d') + timedelta(days=days)).date().isoformat()
def on_delete_item_callback(resource, document):
"""Deletes downloaded and stored files related to the entity
about to be deleted.
"""
path = (config.FILES_DIR + '/' + config.URL_PREFIX + '/' +
resource + '/' + str(document['id']))
shutil.rmtree(path, ignore_errors=True)
def on_delete_resource_callback(resource):
"""Deletes all downloaded and stored files related to the resource.
"""
path = (config.FILES_DIR + '/' + config.URL_PREFIX + '/' + resource)
shutil.rmtree(path, ignore_errors=True)
class VpapiValidator(Validator):
"""Additional validations in the schema.
"""
def _validate_format(self, format, field, value):
"""Validates custom rule `format`.
No validation is performed for URLs because virtually
everything is a valid URL according to RFC 3986.
"""
if format not in ['partialdate', 'partialdatetime', 'email', 'url']:
self._error(field, 'Unknown format "{0}"'.format(format))
partialdate_regex = r'^[0-9]{4}(-[0-9]{2}){0,2}$'
partialdatetime_regex = r'^[0-9]{4}((-[0-9]{2}){0,2}|(-[0-9]{2}){2}(T[0-9]{2}(:[0-9]{2}(:[0-9]{2})?)?Z)?)$'
email_regex = r'[^@]+@[^@]+\.[^@]+'
if format == 'partialdate' and not re.match(partialdate_regex, value) or \
format == 'partialdatetime' and not re.match(partialdatetime_regex, value) or \
format == 'email' and not re.match(email_regex, value):
self._error(field, "Value '{0}' does not satisfy format '{1}'".format(value, format))
def _validate_disjoint(self, disjoint, field, value):
"""Validates custom rule `disjoint` that is similar to the
`unique` rule, but instead of comparing values as a whole it
checks for no common element in the lists.
Applicable only to fields with type `list`.
"""
if not isinstance(value, list):
self._error(field, '`disjoint` rule allowed only for `list` fields')
if disjoint:
query = {}
if self._id:
query[config.ID_FIELD] = {'$ne': self._id}
for element in value:
if isinstance(element, dict):
query[field] = {'$elemMatch': element}
else:
query[field] = element
if current_app.data.find_one(self.resource, None, **query):
self._error(field,
'value `%s` for field `%s` contains a common element with an existing value' %
(value, field))
break
def _validate_unique_elements(self, unique_elements, field, value):
"""Validates custom rule `unique_elements` that checks all
values are unique within the list.
Applicable only to fields with type `list`.
"""
if not isinstance(value, list):
self._error(field, '`unique_elements` rule allowed only for `list` fields')
if unique_elements and len(value) > 1:
if isinstance(value[0], dict):
uniqified = set(frozenset(element.items()) for element in value)
else:
uniqified = set(frozenset(element) for element in value)
if len(uniqified) < len(value):
self._error(field,
'elements within the list `%s` are not unique' %
value)
class VpapiBasicAuth(BasicAuth):
"""Authentication used for write access to the API."""
def check_auth(self, username, password, allowed_roles, resource, method):
return [username, password] in config.AUTHORIZED_USERS
def create_app(country_code, parliament):
# Merge parliament specific settings on top of common settings.
instance_settings = settings.common
instance_settings.update({
'URL_PREFIX': country_code + '/' + parliament['code'],
'MONGO_DBNAME': country_code + '_' + parliament['code'].replace('-', '_'),
'AUTHORIZED_USERS': parliament['authorized_users'],
})
app = Eve(
settings=instance_settings,
validator=VpapiValidator,
auth=VpapiBasicAuth
)
# Removing of _id-s and embedding of related entities.
app.on_fetched_item += on_fetched_item_callback
app.on_fetched_resource += on_fetched_resource_callback
# Creation of missing id-s and mirroring of referenced files.
app.on_insert += on_insert_callback
# Tracking of changed values on update and replace.
app.on_update += on_update_callback
app.on_replace += on_replace_callback
# Removing of mirrored files related to deleted entities.
app.on_delete_item += on_delete_item_callback
app.on_delete_resource += on_delete_resource_callback
return app
# A special application serving HATEOAS links to available countries and parliaments.
hateoas_app = Flask(__name__)
cors = CORS(hateoas_app)
@hateoas_app.route('/')
def country_list():
with open(os.path.join(os.path.dirname(__file__), 'conf', 'countries.json'), 'r') as f:
countries = json.load(f)
if 'application/xml' in request.headers['accept']:
resp = '<resource>'
for country in countries:
resp += '<link rel="child" href="%s" title="%s" locale="%s" timezone="%s"/>' % (
country['code'], country['name'], country['locale'], country['timezone'])
resp += '</resource>'
return Response(response=resp, mimetype='application/xml')
else:
resp = {'_links': {'child': []}}
for country in countries:
resp['_links']['child'].append({
'title': country['name'],
'href': country['code'],
'locale': country['locale'],
'timezone': country['timezone'],
})
return jsonify(**resp)
@hateoas_app.route('/<country>/')
def parliament_list(country):
with open(os.path.join(os.path.dirname(__file__), 'conf', 'parliaments.json'), 'r') as f:
parliaments = json.load(f)
if 'application/xml' in request.headers['accept']:
resp = '<resource>'
for parl in parliaments[country]:
resp += '<link rel="child" href="%s" title="%s"/>' % (parl['code'], parl['name'])
resp += '</resource>'
return Response(response=resp, mimetype='application/xml')
else:
resp = {'_links': {'child': []}}
for parl in parliaments[country]:
resp['_links']['child'].append({'title': parl['name'], 'href': parl['code']})
return jsonify(**resp)
# If executed directly by built-in application server, use example parliament xx/example.
if __name__ == '__main__':
app = create_app('xx', {'code': 'example', 'authorized_users': [['scraper', 'secret']]})
app.run()