Merge pull request #205 from kbase/dts-manifest-file

DTS manifest file part 1
kbase · Nov 27, 2024 · a028bda · a028bda
2 parents 3cd1c56 + bd55791
commit a028bda
Show file tree

Hide file tree

Showing 8 changed files with 312 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -49,6 +49,16 @@ gzip, bzip2, md5sum, head, tail, wc
 
 in the docker container all of these should be available
 
+## Updating importer mapping types
+The types returned by the `importer_mappings` endpoint (see [Get Importer Mappings](#get-importer-mappings) below)
+are generated by the `GenerateMappings.py` script here: `staging_service/autodetect/GenerateMappings.py`. 
+Running this script will build the `supported_apps_w_extensions.json` file that should be placed in the
+`deployment/conf` directory.
+
+To add new mappings, update the `GenerateMappings.py` script. New file types and object types should be added
+to the `staging_service.autodetect.Mappings` module and included from there. See `GenerateMappings.py` 
+docstrings for more details.
+
 ## API
 
 all paths should be specified treating the user's home directory as root
@@ -1079,6 +1089,8 @@ For example,
 - for files for which there is no predicted app, the return is a null value
 - this endpoint is used to power the dropdowns for the staging service window in the Narrative
 
+Note: to update these mappings see instructions [here](#updating-importer-mapping-types)
+
 **URL** : `ci.kbase.us/services/staging_service/importer_mappings`
 
 **local URL** : `localhost:3000/importer_mappings`

diff --git a/deployment/conf/supported_apps_w_extensions.json b/deployment/conf/supported_apps_w_extensions.json
@@ -121,7 +121,8 @@
       "csv",
       "tsv",
       "xls",
-      "xlsx"
+      "xlsx",
+      "json"
     ],
     "media": [
       "tsv",
@@ -1101,26 +1102,31 @@
         }
       ]
     },
-    "smbl": {
+    "json": {
       "file_ext_type": [
-        "SBML"
+        "JSON"
       ],
       "mappings": [
         {
-          "id": "fba_model",
-          "title": "FBA Model",
+          "id": "import_specification",
+          "title": "Import Specification",
+          "app_weight": 1
+        },
+        {
+          "id": "escher_map",
+          "title": "EscherMap",
           "app_weight": 1
         }
       ]
     },
-    "json": {
+    "smbl": {
       "file_ext_type": [
-        "JSON"
+        "SBML"
       ],
       "mappings": [
         {
-          "id": "escher_map",
-          "title": "EscherMap",
+          "id": "fba_model",
+          "title": "FBA Model",
           "app_weight": 1
         }
       ]

diff --git a/staging_service/app.py b/staging_service/app.py
@@ -12,7 +12,7 @@
 
 from .app_error_formatter import format_import_spec_errors
 from .auth2Client import KBaseAuth2
-from .autodetect.Mappings import CSV, EXCEL, TSV
+from .autodetect.Mappings import CSV, EXCEL, TSV, JSON
 from .AutoDetectUtils import AutoDetectUtils
 from .globus import assert_globusid_exists, is_globusid
 from .import_specifications.file_parser import (
@@ -26,7 +26,12 @@
     write_excel,
     write_tsv,
 )
-from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv
+from .import_specifications.individual_parsers import (
+    parse_csv,
+    parse_excel,
+    parse_tsv,
+    parse_dts_manifest,
+)
 from .JGIMetadata import read_metadata_for
 from .metadata import add_upa, dir_info, similar, some_metadata
 from .utils import AclManager, Path, run_command
@@ -43,6 +48,7 @@
     CSV: parse_csv,
     TSV: parse_tsv,
     EXCEL: parse_excel,
+    JSON: parse_dts_manifest,
 }
 
 _IMPSPEC_FILE_TO_WRITER = {
@@ -110,6 +116,10 @@ async def bulk_specification(request: web.Request) -> web.json_response:
     type, in the `types` key.
 
     :param request: contains a comma separated list of files, e.g. folder1/file1.txt,file2.txt
+
+    TODO: since JSON files are rather generic and we might want to use a different JSON bulk-spec
+    format later, add a separate query parameter to request that the selected file is treated as a
+    Data Transfer Service manifest.
     """
     username = await authorize_request(request)
     files = parse_qs(request.query_string).get("files", [])

diff --git a/staging_service/autodetect/GenerateMappings.py b/staging_service/autodetect/GenerateMappings.py
@@ -23,6 +23,15 @@
 * Note: We should serve the generated content from memory
 * Note: This doesn't handle if we want to have different output types based on file extensions
   feeding into the same app
+
+Adding new file types and app ids:
+* file types and app ids should be added to staging_service.autodetect.Mappings
+* file types should be all upper-case and represent the file suffix.
+* app ids should be in snake_case, and represent either an object import type, or some
+  internal application to be run on selection (i.e. decompress or import specification).
+* app ids that map to actual object import maps should match the app ids in the narrative
+  interface configuration here:
+  https://github.com/kbase/narrative/blob/main/kbase-extension/static/kbase/config/staging_upload.json
 """
 
 from collections import defaultdict
@@ -120,7 +129,7 @@
     fba_model_id,
     import_specification,
 ]
-file_format_to_app_mapping[JSON] = [escher_map_id]
+file_format_to_app_mapping[JSON] = [escher_map_id, import_specification]
 file_format_to_app_mapping[SBML] = [fba_model_id]
 
 app_id_to_extensions = defaultdict(list)

diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py
@@ -3,6 +3,7 @@
 """
 
 import csv
+import json
 import math
 import re
 from pathlib import Path
@@ -316,3 +317,52 @@ def parse_excel(path: Path) -> ParseResults:
         return ParseResults(frozendict(results))
     else:
         return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))
+
+
+def parse_dts_manifest(path: Path) -> ParseResults:
+    """
+    Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
+    The manifest should have this format, with expected keys included:
+    {
+        "resources": [{ file manifest info isn't currently relevant }],
+        "instructions": {
+            "protocol": "KBase narrative import",
+            "objects": [{
+                "data_type": str,
+                "parameters": {
+                    "<param1>": value,
+                    "<param2>": value,
+                }, ...
+            }]
+        }
+    }
+
+    This will get parsed and returned in line with the other parsers as a valid
+    ParseResults object, i.e. each result will be keyed on the data type string,
+    and its value will be a Tuple of frozendicts of the parameters. Also, in keeping
+    with the xsv parsers, each parameter value is expected to be a PRIMITIVE_TYPE.
+
+    TODO: include further details here, and in separate documentation - ADR?
+    """
+    spcsrc = SpecificationSource(path)
+    errors = []
+    # dummy for now
+    results = {}
+    try:
+        with open(path, "r") as manifest:
+            manifest_json = json.load(manifest)
+        if not isinstance(manifest_json, dict):
+            errors.append(Error(ErrorType.PARSE_FAIL, "Manifest is not a dictionary", spcsrc))
+
+    except json.JSONDecodeError:
+        return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc))
+    except FileNotFoundError:
+        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
+    except IsADirectoryError:
+        return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc))
+    if errors:
+        return ParseResults(errors=tuple(errors))
+    elif results:
+        return ParseResults(frozendict(results))
+    else:
+        return _error(Error(ErrorType.PARSE_FAIL, "No import specification data in file", spcsrc))
diff --git a/tests/import_specifications/test_data/manifest_small.json b/tests/import_specifications/test_data/manifest_small.json
@@ -0,0 +1,122 @@
+{
+  "name": "manifest",
+  "resources": [
+    {
+      "id": "JDP:555518eb0d8785178e712d88",
+      "name": "61564.assembled",
+      "path": "img/submissions/61564/61564.assembled.gff",
+      "format": "gff",
+      "media_type": "text/plain",
+      "bytes": 455161,
+      "hash": "",
+      "credit": {
+        "comment": "",
+        "content_url": "",
+        "contributors": null,
+        "credit_metadata_source": "",
+        "dates": null,
+        "descriptions": null,
+        "funding": null,
+        "identifier": "JDP:555518eb0d8785178e712d88",
+        "license": {
+          "id": "",
+          "url": ""
+        },
+        "publisher": {
+          "organization_id": "",
+          "organization_name": ""
+        },
+        "related_identifiers": null,
+        "resource_type": "dataset",
+        "titles": null,
+        "url": "",
+        "version": ""
+      },
+      "instructions": {
+        "data_type": "gff_metagenome",
+        "parameters": {
+          "param1": "value1",
+          "param2": "value2"
+        }
+      }
+    },
+    {
+      "id": "JDP:555518eb0d8785178e712d84",
+      "name": "61564.assembled",
+      "path": "img/submissions/61564/61564.assembled.fna",
+      "format": "fasta",
+      "media_type": "text/plain",
+      "bytes": 6354414,
+      "hash": "",
+      "credit": {
+        "comment": "",
+        "content_url": "",
+        "contributors": null,
+        "credit_metadata_source": "",
+        "dates": null,
+        "descriptions": null,
+        "funding": null,
+        "identifier": "JDP:555518eb0d8785178e712d84",
+        "license": {
+          "id": "",
+          "url": ""
+        },
+        "publisher": {
+          "organization_id": "",
+          "organization_name": ""
+        },
+        "related_identifiers": null,
+        "resource_type": "dataset",
+        "titles": null,
+        "url": "",
+        "version": ""
+      },
+      "instructions": {
+        "data_type": "gff_metagenome",
+        "parameters": {
+          "param1": "value1",
+          "param2": "value2"
+        }
+      }
+    },
+    {
+      "id": "JDP:555518ec0d8785178e712d9f",
+      "name": "61567.assembled",
+      "path": "img/submissions/61567/61567.assembled.gff",
+      "format": "gff",
+      "media_type": "text/plain",
+      "bytes": 545583,
+      "hash": "",
+      "credit": {
+        "comment": "",
+        "content_url": "",
+        "contributors": null,
+        "credit_metadata_source": "",
+        "dates": null,
+        "descriptions": null,
+        "funding": null,
+        "identifier": "JDP:555518ec0d8785178e712d9f",
+        "license": {
+          "id": "",
+          "url": ""
+        },
+        "publisher": {
+          "organization_id": "",
+          "organization_name": ""
+        },
+        "related_identifiers": null,
+        "resource_type": "dataset",
+        "titles": null,
+        "url": "",
+        "version": ""
+      },
+      "instructions": {
+        "data_type": "gff_metagenome",
+        "parameters": {
+          "param1": "value1",
+          "param2": "value2"
+        }
+      }
+    }
+  ]
+}