Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mongo filetypes #966

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .handlers.containerhandler import ContainerHandler
from .handlers.dataexplorerhandler import DataExplorerHandler
from .handlers.devicehandler import DeviceHandler
from .handlers.filetypehandler import FileType
from .handlers.grouphandler import GroupHandler
from .handlers.listhandler import FileListHandler, NotesListHandler, PermissionsListHandler, TagsListHandler
from .handlers.refererhandler import AnalysesHandler
Expand Down Expand Up @@ -43,6 +44,9 @@
# Filename
'fname': '[^/]+',

# File type name
'ftypename': '[^/]+',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same regex as fname above, does it need to be separate?


# Note ID
'nid': '[0-9a-f]{24}',

Expand Down Expand Up @@ -77,9 +81,11 @@ def prefix(path, routes):

# System configuration

route('/config', Config, m=['GET']),
route('/config.js', Config, h='get_js', m=['GET']),
route('/version', Version, m=['GET']),
route('/config', Config, m=['GET']),
route('/config.js', Config, h='get_js', m=['GET']),
route('/version', Version, m=['GET']),
route('/filetype', FileType, m=['GET', 'POST']),
route('/filetype/<_id:{ftypename}>', FileType, m=['DELETE']),


# General-purpose upload & download
Expand Down
11 changes: 9 additions & 2 deletions api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import elasticsearch

from . import util
from .dao.dbutil import try_replace_one
from .dao.dbutil import try_replace_one, try_update_one

logging.basicConfig(
format='%(asctime)s %(name)16.16s %(filename)24.24s %(lineno)5d:%(levelname)4.4s %(message)s',
Expand Down Expand Up @@ -161,6 +161,7 @@ def apply_env_variables(config):
'container.json',
'device.json',
'file.json',
'filetype.json',
'file-update.json',
'group-new.json',
'group-update.json',
Expand Down Expand Up @@ -226,6 +227,7 @@ def create_or_recreate_ttl_index(coll_name, index_name, ttl):

def initialize_db():
log.info('Initializing database, creating indexes')

# TODO review all indexes
db.users.create_index('api_key.key')
db.projects.create_index([('gid', 1), ('name', 1)])
Expand All @@ -251,7 +253,12 @@ def initialize_db():
create_or_recreate_ttl_index('downloads', 'timestamp', 60)

now = datetime.datetime.utcnow()
db.groups.update_one({'_id': 'unknown'}, {'$setOnInsert': { 'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True)
try_update_one(db,
'groups', {'_id': 'unknown'},
{'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}},
upsert=True)

log.info('Initializing database, creating indexes ....DONE')

def get_config():
global __last_update, __config, __config_persisted #pylint: disable=global-statement
Expand Down
16 changes: 16 additions & 0 deletions api/dao/dbutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pymongo.errors import DuplicateKeyError
from ..web.errors import APIStorageException


def try_replace_one(db, coll_name, query, update, upsert=False):
"""
Mongo does not see replace w/ upsert as an atomic action:
Expand Down Expand Up @@ -39,3 +40,18 @@ def fault_tolerant_replace_one(db, coll_name, query, update, upsert=False):
time.sleep(random.uniform(0.01,0.05))

raise APIStorageException('Unable to replace object.')


def try_update_one(db, coll_name, query, update, upsert=False):
    """
    Try an update_one operation, tolerating the upsert race.

    Mongo does not treat update w/ upsert as an atomic action:
    https://jira.mongodb.org/browse/SERVER-14322

    Returns a (result, succeeded) tuple:
        result    -- the pymongo UpdateResult, or None when the operation
                     failed with a duplicate key
        succeeded -- True when the update completed, False on DuplicateKeyError
    """
    try:
        result = db[coll_name].update_one(query, update, upsert=upsert)
    except DuplicateKeyError:
        # `result` is never bound when update_one raises, so the original
        # `return result, False` here raised UnboundLocalError instead of
        # reporting the failure; return None explicitly.
        return None, False
    else:
        return result, True
29 changes: 13 additions & 16 deletions api/files.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
import cgi
import json
import shutil
import hashlib
import collections
import hashlib
import os
import re
import shutil

from . import util
from . import config
Expand Down Expand Up @@ -150,18 +150,15 @@ def get_hash(self):

# File name --> scitran file type detection heuristics.
# Listed in precedence order.
with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd:
TYPE_MAP = json.load(fd)

KNOWN_FILETYPES = {ext: filetype for filetype, extensions in TYPE_MAP.iteritems() for ext in extensions}

def guess_type_from_filename(filename):
    """
    Guess the scitran file type for a filename.

    Each document in the `filetypes` collection carries a `regex`; the type
    whose regex yields the longest match against the filename wins.

    Returns the matching filetype `_id`, or None when no regex matches.
    """
    filetype = None
    m_length = 0

    for document in config.db.filetypes.find({}):
        m = re.search(document['regex'], filename)
        if m and m_length < len(m.group(0)):
            filetype = document['_id']
            # BUG FIX: m_length was never updated, so "longest match wins"
            # silently degraded to "last matching document wins".
            m_length = len(m.group(0))

    # NOTE(review): matching is case-sensitive; the previous extension-map
    # implementation lowercased extensions first — confirm the stored regexes
    # account for upper-case filenames.
    return filetype
29 changes: 0 additions & 29 deletions api/filetypes.json

This file was deleted.

42 changes: 42 additions & 0 deletions api/handlers/filetypehandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import re

from .. import config
from ..auth import require_admin, require_login
from ..validators import validate_data, InputValidationException
from ..web import base


class FileType(base.RequestHandler):
    """Endpoints for listing, upserting and deleting site file types."""

    @require_login
    def get(self):
        """Return every configured file type document."""
        return config.db.filetypes.find()

    @require_admin
    def post(self):
        """
        Insert or replace a file type. Required fields: '_id' and 'regex' where the '_id' is the unique name of
        the file type and 'regex' is a regular expression which is used to figure out the file type from the file name.
        """
        doc = self.request.json_body
        validate_data(doc, 'filetype.json', 'input', 'POST')

        # Reject patterns that would blow up later during type detection.
        try:
            re.compile(doc['regex'])
        except re.error:
            raise InputValidationException('Invalid regular expression')

        result = config.db.filetypes.replace_one({'_id': doc['_id']}, doc, upsert=True)
        if not result.acknowledged:
            self.abort(404, 'File type {} not updated'.format(doc['_id']))
        # On an upsert the new id comes back on the result; on a replace it is
        # simply the id we were given.
        return {'_id': result.upserted_id if result.upserted_id else doc['_id']}

    @require_admin
    def delete(self, _id):
        """Delete the file type named `_id`."""
        result = config.db.filetypes.delete_one({'_id': _id})
        if not result.deleted_count:
            self.abort(404, 'File type {} not removed'.format(_id))
        return {'deleted': result.deleted_count}
97 changes: 96 additions & 1 deletion bin/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@

from api import config
from api import util
from api import files as files_module
from api.dao import containerutil
from api.dao.containerstorage import ProjectStorage
from api.jobs.jobs import Job
from api.jobs import gears
from api.types import Origin
from api.jobs import batch

CURRENT_DATABASE_VERSION = 40 # An int that is bumped when a new schema change is made
CURRENT_DATABASE_VERSION = 41 # An int that is bumped when a new schema change is made

def get_db_version():

Expand Down Expand Up @@ -1301,6 +1302,100 @@ def upgrade_to_40():
cursor = config.db.acquisitions.find({'timestamp':{'$type':'string'}})
process_cursor(cursor, upgrade_to_40_closure)


def upgrade_to_41_closure(cont, context):
    """
    Re-type files based on new filetypes stored in mongo collection.

    context carries:
        filetypes -- the filetype documents (passed in rather than looked up
                     per container, to speed the upgrade)
        cont_name -- the collection this container belongs to

    Returns True so process_cursor counts the document as processed.
    """

    filetypes = context['filetypes']
    cont_name = context['cont_name']

    files = cont.get('files', [])

    for f in files:

        new_type = None
        m_length = 0

        for document in filetypes:
            m = re.search(document['regex'], f['name'])
            if m and m_length < len(m.group(0)):
                new_type = document['_id']
                # BUG FIX: m_length was never updated, so the longest-match
                # rule degraded to "last matching filetype wins".
                m_length = len(m.group(0))
        if new_type is not None:
            f['type'] = new_type

    # BUG FIX: 'cont_name' was quoted, so every write went to a literal
    # "cont_name" collection instead of the container's own collection.
    config.db[cont_name].update_one({'_id': cont['_id']}, {'$set': {'files': files}})

    return True


def upgrade_to_41():
    """
    Load initial filetypes into mongo, retype existing files
    """

    # It was decided an initial load of filetypes here for existing users was
    # easiest way to move those users forward. Future changes to a site's
    # filetypes will happen through the API endpoints as expected
    filetypes = [
        { "_id": "BVAL", "regex": "\\.(bval|bvals)$" },
        { "_id": "BVEC", "regex": "\\.(bvec|bvecs)$" },
        { "_id": "DICOM", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" },
        { "_id": "EFile", "regex": "^E.*P.*\\.7$" },
        { "_id": "GE Physio", "regex": "\\.gephysio\\.zip$" },
        { "_id": "MGH Data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" },
        { "_id": "NIfTI", "regex": "\\.(nii\\.gz|nii)$" },
        { "_id": "PAR/REC", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" },
        { "_id": "PFile Header", "regex": "\\.(7\\.hdr)$" },
        { "_id": "PFile", "regex": "\\.(7\\.gz|7|7\\.zip)$" },

        { "_id": "EEG", "regex": "\\.eeg\\.zip$" },

        { "_id": "QC", "regex": "\\.(q[ac]\\.png|q[ac]\\.json|q[ac]\\.html)$" },

        { "_id": "MATLAB Data", "regex": "\\.mat$" },
        { "_id": "PsychoPy Data", "regex": "\\.psydat$" },

        { "_id": "C/C++", "regex": "\\.(c|cpp)$" },
        { "_id": "CSS", "regex": "\\.css$" },
        { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" },
        { "_id": "HTML", "regex": "\\.(html|htm)$" },
        { "_id": "JSON", "regex": "\\.json$" },
        { "_id": "Java", "regex": "\\.java$" },
        { "_id": "JavaScript", "regex": "\\.js$" },
        { "_id": "Jupyter", "regex": "\\.ipynb$" },
        { "_id": "MATLAB", "regex": "\\.(m|mex|mlx)$" },
        { "_id": "Markdown", "regex": "\\.(md|markdown)$" },
        { "_id": "PHP", "regex": "\\.php$" },
        { "_id": "Plain Text", "regex": "\\.txt$" },
        { "_id": "Python", "regex": "\\.py$" },
        { "_id": "TOML", "regex": "\\.toml$" },
        { "_id": "XML", "regex": "\\.xml$" },
        { "_id": "YAML", "regex": "\\.(yaml|yml)$" },

        { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" },
        { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" },
        { "_id": "Document", "regex": "\\.(docx|doc)$" },
        { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" },
        { "_id": "Log", "regex": "\\.log$" },
        { "_id": "PDF", "regex": "\\.pdf$" },
        { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" },
        { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" },
        { "_id": "Tabular Data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" },
        { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" }
    ]

    # Seed (or refresh) the built-in filetypes. replace_one w/ upsert keyed on
    # _id makes this idempotent, so re-running the upgrade is safe.
    for ft in filetypes:
        config.db.filetypes.replace_one({'_id': ft['_id']}, ft, upsert=True)

    for cont_name in ['projects', 'sessions', 'acquisitions', 'analyses', 'collections']:

        # Find all containers that have at least one file
        # ({'$gt': []} matches only documents whose 'files' array is non-empty)
        cursor = config.db[cont_name].find({'files': { '$gt': [] }})
        process_cursor(cursor, upgrade_to_41_closure, context={'filetypes': filetypes, 'cont_name': cont_name})

###
### BEGIN RESERVED UPGRADE SECTION
###
Expand Down
13 changes: 9 additions & 4 deletions bin/load_users_drone_secret.py → bin/load_drone_secret.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ def _upsert_permission(request_session, api_url, permission_doc, group_id):
full_permission_url = "{0}/{1}".format(base_permission_url, permission_doc['_id'])
return request_session.put(full_permission_url, json=permission_doc)

def users(filepath, api_url, http_headers, insecure):
def bootstrap(filepath, api_url, http_headers, insecure):
"""
Upserts the users/groups/permissions defined in filepath parameter.
Upserts the users/groups/permissions/file types defined in filepath parameter.

Raises:
requests.HTTPError: Upsert failed.
Expand Down Expand Up @@ -95,7 +95,7 @@ def users(filepath, api_url, http_headers, insecure):

log.info('bootstrapping projects...')
for p in input_data.get('projects', []):
r = rs.post(api_url + '/projects?inherit=true' , json=p)
r = rs.post(api_url + '/projects?inherit=true', json=p)
r.raise_for_status()

project_id = r.json()['_id']
Expand All @@ -111,6 +111,11 @@ def users(filepath, api_url, http_headers, insecure):
r = rs.post(api_url + '/projects/' + project_id + '/rules', json=rule)
r.raise_for_status()

log.info('bootstrapping file types...')
for f in input_data.get('filetypes', []):
r = rs.post(api_url + '/filetype', json=f)
r.raise_for_status()

log.info('bootstrapping complete')


Expand All @@ -134,7 +139,7 @@ def users(filepath, api_url, http_headers, insecure):
# TODO: extend this to support oauth tokens

try:
users(args.json, args.url, http_headers, args.insecure)
bootstrap(args.json, args.url, http_headers, args.insecure)
except requests.HTTPError as ex:
log.error(ex)
log.error("request_body={0}".format(ex.response.request.body))
Expand Down
Loading