Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mongo filetypes #966

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .handlers.containerhandler import ContainerHandler
from .handlers.dataexplorerhandler import DataExplorerHandler
from .handlers.devicehandler import DeviceHandler
from .handlers.filetypehandler import FileType
from .handlers.grouphandler import GroupHandler
from .handlers.listhandler import FileListHandler, NotesListHandler, PermissionsListHandler, TagsListHandler
from .handlers.refererhandler import AnalysesHandler
Expand Down Expand Up @@ -43,6 +44,9 @@
# Filename
'fname': '[^/]+',

# File type name
'ftypename': '[^/]+',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same regex as fname above, does it need to be separate?


# Note ID
'nid': '[0-9a-f]{24}',

Expand Down Expand Up @@ -77,9 +81,11 @@ def prefix(path, routes):

# System configuration

route('/config', Config, m=['GET']),
route('/config.js', Config, h='get_js', m=['GET']),
route('/version', Version, m=['GET']),
route('/config', Config, m=['GET']),
route('/config.js', Config, h='get_js', m=['GET']),
route('/version', Version, m=['GET']),
route('/filetype', FileType, m=['GET', 'POST']),
route('/filetype/<_id:{ftypename}>', FileType, m=['DELETE']),


# General-purpose upload & download
Expand Down
11 changes: 9 additions & 2 deletions api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import elasticsearch

from . import util
from .dao.dbutil import try_replace_one
from .dao.dbutil import try_replace_one, try_update_one

logging.basicConfig(
format='%(asctime)s %(name)16.16s %(filename)24.24s %(lineno)5d:%(levelname)4.4s %(message)s',
Expand Down Expand Up @@ -161,6 +161,7 @@ def apply_env_variables(config):
'container.json',
'device.json',
'file.json',
'filetype.json',
'file-update.json',
'group-new.json',
'group-update.json',
Expand Down Expand Up @@ -226,6 +227,7 @@ def create_or_recreate_ttl_index(coll_name, index_name, ttl):

def initialize_db():
log.info('Initializing database, creating indexes')

# TODO review all indexes
db.users.create_index('api_key.key')
db.projects.create_index([('gid', 1), ('name', 1)])
Expand All @@ -251,7 +253,12 @@ def initialize_db():
create_or_recreate_ttl_index('downloads', 'timestamp', 60)

now = datetime.datetime.utcnow()
db.groups.update_one({'_id': 'unknown'}, {'$setOnInsert': { 'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}}, upsert=True)
try_update_one(db,
'groups', {'_id': 'unknown'},
{'$setOnInsert': {'created': now, 'modified': now, 'label': 'Unknown', 'permissions': []}},
upsert=True)

log.info('Initializing database, creating indexes ....DONE')

def get_config():
global __last_update, __config, __config_persisted #pylint: disable=global-statement
Expand Down
16 changes: 16 additions & 0 deletions api/dao/dbutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pymongo.errors import DuplicateKeyError
from ..web.errors import APIStorageException


def try_replace_one(db, coll_name, query, update, upsert=False):
"""
Mongo does not see replace w/ upsert as an atomic action:
Expand Down Expand Up @@ -39,3 +40,18 @@ def fault_tolerant_replace_one(db, coll_name, query, update, upsert=False):
time.sleep(random.uniform(0.01,0.05))

raise APIStorageException('Unable to replace object.')


def try_update_one(db, coll_name, query, update, upsert=False):
    """
    Try an update_one operation, tolerating the upsert race.

    Mongo does not treat update w/ upsert as an atomic action:
    https://jira.mongodb.org/browse/SERVER-14322

    Returns a (result, succeeded) tuple:
        result    -- the pymongo UpdateResult, or None when the operation
                     failed with a duplicate key
        succeeded -- True when the update completed, False on DuplicateKeyError
    """
    try:
        result = db[coll_name].update_one(query, update, upsert=upsert)
    except DuplicateKeyError:
        # `result` is never bound when update_one raises, so the original
        # `return result, False` here raised UnboundLocalError instead of
        # reporting the failure; return None explicitly.
        return None, False
    else:
        return result, True
29 changes: 13 additions & 16 deletions api/files.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
import cgi
import json
import shutil
import hashlib
import collections
import hashlib
import os
import re
import shutil

from . import util
from . import config
Expand Down Expand Up @@ -150,18 +150,15 @@ def get_hash(self):

# File name --> scitran file type detection heuristics.
# Listed in precedence order.
with open(os.path.join(os.path.dirname(__file__), 'filetypes.json')) as fd:
TYPE_MAP = json.load(fd)

KNOWN_FILETYPES = {ext: filetype for filetype, extensions in TYPE_MAP.iteritems() for ext in extensions}

def guess_type_from_filename(filename):
    """
    Guess the scitran file type for a filename.

    Each document in the `filetypes` collection carries a `regex`; the type
    whose regex yields the longest match against the filename wins.

    Returns the matching filetype `_id`, or None when no regex matches.
    """
    filetype = None
    m_length = 0

    for document in config.db.filetypes.find({}):
        m = re.search(document['regex'], filename)
        if m and m_length < len(m.group(0)):
            filetype = document['_id']
            # BUG FIX: m_length was never updated, so "longest match wins"
            # silently degraded to "last matching document wins".
            m_length = len(m.group(0))

    # NOTE(review): matching is case-sensitive; the previous extension-map
    # implementation lowercased extensions first — confirm the stored regexes
    # account for upper-case filenames.
    return filetype
29 changes: 0 additions & 29 deletions api/filetypes.json

This file was deleted.

42 changes: 42 additions & 0 deletions api/handlers/filetypehandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import re

from .. import config
from ..auth import require_admin, require_login
from ..validators import validate_data, InputValidationException
from ..web import base


class FileType(base.RequestHandler):
    """Endpoints for listing, upserting and deleting site file types."""

    @require_login
    def get(self):
        """Return every configured file type document."""
        return config.db.filetypes.find()

    @require_admin
    def post(self):
        """
        Insert or replace a file type. Required fields: '_id' and 'regex' where the '_id' is the unique name of
        the file type and 'regex' is a regular expression which is used to figure out the file type from the file name.
        """
        doc = self.request.json_body
        validate_data(doc, 'filetype.json', 'input', 'POST')

        # Reject patterns that would blow up later during type detection.
        try:
            re.compile(doc['regex'])
        except re.error:
            raise InputValidationException('Invalid regular expression')

        result = config.db.filetypes.replace_one({'_id': doc['_id']}, doc, upsert=True)
        if not result.acknowledged:
            self.abort(404, 'File type {} not updated'.format(doc['_id']))
        # On an upsert the new id comes back on the result; on a replace it is
        # simply the id we were given.
        return {'_id': result.upserted_id if result.upserted_id else doc['_id']}

    @require_admin
    def delete(self, _id):
        """Delete the file type named `_id`."""
        result = config.db.filetypes.delete_one({'_id': _id})
        if not result.deleted_count:
            self.abort(404, 'File type {} not removed'.format(_id))
        return {'deleted': result.deleted_count}
97 changes: 96 additions & 1 deletion bin/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@

from api import config
from api import util
from api import files as files_module
from api.dao import containerutil
from api.dao.containerstorage import ProjectStorage
from api.jobs.jobs import Job
from api.jobs import gears
from api.types import Origin
from api.jobs import batch

CURRENT_DATABASE_VERSION = 40 # An int that is bumped when a new schema change is made
CURRENT_DATABASE_VERSION = 41 # An int that is bumped when a new schema change is made

def get_db_version():

Expand Down Expand Up @@ -1301,6 +1302,100 @@ def upgrade_to_40():
cursor = config.db.acquisitions.find({'timestamp':{'$type':'string'}})
process_cursor(cursor, upgrade_to_40_closure)


def upgrade_to_41_closure(cont, context):
    """
    Re-type files based on new filetypes stored in mongo collection.

    context carries:
        filetypes -- the filetype documents (passed in rather than looked up
                     per container, to speed the upgrade)
        cont_name -- the collection this container belongs to

    Returns True so process_cursor counts the document as processed.
    """

    filetypes = context['filetypes']
    cont_name = context['cont_name']

    files = cont.get('files', [])

    for f in files:

        new_type = None
        m_length = 0

        for document in filetypes:
            m = re.search(document['regex'], f['name'])
            if m and m_length < len(m.group(0)):
                new_type = document['_id']
                # BUG FIX: m_length was never updated, so the longest-match
                # rule degraded to "last matching filetype wins".
                m_length = len(m.group(0))
        if new_type is not None:
            f['type'] = new_type

    # BUG FIX: 'cont_name' was quoted, so every write went to a literal
    # "cont_name" collection instead of the container's own collection.
    config.db[cont_name].update_one({'_id': cont['_id']}, {'$set': {'files': files}})

    return True


def upgrade_to_41():
    """
    Load initial filetypes into mongo, retype existing files
    """

    # It was decided an initial load of filetypes here for existing users was
    # easiest way to move those users forward. Future changes to a site's
    # filetypes will happen through the API endpoints as expected
    filetypes = [
        { "_id": "BVAL", "regex": "\\.(bval|bvals)$" },
        { "_id": "BVEC", "regex": "\\.(bvec|bvecs)$" },
        { "_id": "DICOM", "regex": "\\.(dcm|dcm\\.zip|dicom\\.zip)$" },
        { "_id": "EFile", "regex": "^E.*P.*\\.7$" },
        { "_id": "GE Physio", "regex": "\\.gephysio\\.zip$" },
        { "_id": "MGH Data", "regex": "\\.(mgh|mgz|mgh\\.gz)$" },
        { "_id": "NIfTI", "regex": "\\.(nii\\.gz|nii)$" },
        { "_id": "PAR/REC", "regex": "\\.(parrec\\.zip|par-rec\\.zip)$" },
        { "_id": "PFile Header", "regex": "\\.(7\\.hdr)$" },
        { "_id": "PFile", "regex": "\\.(7\\.gz|7|7\\.zip)$" },

        { "_id": "EEG", "regex": "\\.eeg\\.zip$" },

        { "_id": "QC", "regex": "\\.(q[ac]\\.png|q[ac]\\.json|q[ac]\\.html)$" },

        { "_id": "MATLAB Data", "regex": "\\.mat$" },
        { "_id": "PsychoPy Data", "regex": "\\.psydat$" },

        { "_id": "C/C++", "regex": "\\.(c|cpp)$" },
        { "_id": "CSS", "regex": "\\.css$" },
        { "_id": "HDF5", "regex": "\\.(h5|hdf5)$" },
        { "_id": "HTML", "regex": "\\.(html|htm)$" },
        { "_id": "JSON", "regex": "\\.json$" },
        { "_id": "Java", "regex": "\\.java$" },
        { "_id": "JavaScript", "regex": "\\.js$" },
        { "_id": "Jupyter", "regex": "\\.ipynb$" },
        { "_id": "MATLAB", "regex": "\\.(m|mex|mlx)$" },
        { "_id": "Markdown", "regex": "\\.(md|markdown)$" },
        { "_id": "PHP", "regex": "\\.php$" },
        { "_id": "Plain Text", "regex": "\\.txt$" },
        { "_id": "Python", "regex": "\\.py$" },
        { "_id": "TOML", "regex": "\\.toml$" },
        { "_id": "XML", "regex": "\\.xml$" },
        { "_id": "YAML", "regex": "\\.(yaml|yml)$" },

        { "_id": "Archive", "regex": "\\.(zip|tbz2|tar\\.gz|tbz|tar\\.bz2|tgz|tar|txz|tar\\.xz)$" },
        { "_id": "Audio", "regex": "\\.(mp3|wav|wave)$" },
        { "_id": "Document", "regex": "\\.(docx|doc)$" },
        { "_id": "Image", "regex": "\\.(jpg|tif|jpeg|gif|bmp|png|tiff)$" },
        { "_id": "Log", "regex": "\\.log$" },
        { "_id": "PDF", "regex": "\\.pdf$" },
        { "_id": "Presentation", "regex": "\\.(ppt|pptx)$" },
        { "_id": "Spreadsheet", "regex": "\\.(xls|xlsx)$" },
        { "_id": "Tabular Data", "regex": "\\.([ct]sv\\.gz|[ct]sv)$" },
        { "_id": "Video", "regex": "\\.(mpeg|mpg|mov|mp4|m4v|mts)$" }
    ]

    # Seed (or refresh) the built-in filetypes. replace_one w/ upsert keyed on
    # _id makes this idempotent, so re-running the upgrade is safe.
    for ft in filetypes:
        config.db.filetypes.replace_one({'_id': ft['_id']}, ft, upsert=True)

    for cont_name in ['projects', 'sessions', 'acquisitions', 'analyses', 'collections']:

        # Find all containers that have at least one file
        # ({'$gt': []} matches only documents whose 'files' array is non-empty)
        cursor = config.db[cont_name].find({'files': { '$gt': [] }})
        process_cursor(cursor, upgrade_to_41_closure, context={'filetypes': filetypes, 'cont_name': cont_name})

###
### BEGIN RESERVED UPGRADE SECTION
###
Expand Down
13 changes: 9 additions & 4 deletions bin/load_users_drone_secret.py → bin/load_drone_secret.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ def _upsert_permission(request_session, api_url, permission_doc, group_id):
full_permission_url = "{0}/{1}".format(base_permission_url, permission_doc['_id'])
return request_session.put(full_permission_url, json=permission_doc)

def users(filepath, api_url, http_headers, insecure):
def bootstrap(filepath, api_url, http_headers, insecure):
"""
Upserts the users/groups/permissions defined in filepath parameter.
Upserts the users/groups/permissions/file types defined in filepath parameter.

Raises:
requests.HTTPError: Upsert failed.
Expand Down Expand Up @@ -95,7 +95,7 @@ def users(filepath, api_url, http_headers, insecure):

log.info('bootstrapping projects...')
for p in input_data.get('projects', []):
r = rs.post(api_url + '/projects?inherit=true' , json=p)
r = rs.post(api_url + '/projects?inherit=true', json=p)
r.raise_for_status()

project_id = r.json()['_id']
Expand All @@ -111,6 +111,11 @@ def users(filepath, api_url, http_headers, insecure):
r = rs.post(api_url + '/projects/' + project_id + '/rules', json=rule)
r.raise_for_status()

log.info('bootstrapping file types...')
for f in input_data.get('filetypes', []):
r = rs.post(api_url + '/filetype', json=f)
r.raise_for_status()

log.info('bootstrapping complete')


Expand All @@ -134,7 +139,7 @@ def users(filepath, api_url, http_headers, insecure):
# TODO: extend this to support oauth tokens

try:
users(args.json, args.url, http_headers, args.insecure)
bootstrap(args.json, args.url, http_headers, args.insecure)
except requests.HTTPError as ex:
log.error(ex)
log.error("request_body={0}".format(ex.response.request.body))
Expand Down
Loading