Skip to content

Commit

Permalink
Merge pull request #205 from kbase/dts-manifest-file
Browse files Browse the repository at this point in the history
DTS manifest file part 1
  • Loading branch information
briehl authored Nov 27, 2024
2 parents 3cd1c56 + bd55791 commit a028bda
Show file tree
Hide file tree
Showing 8 changed files with 312 additions and 13 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ gzip, bzip2, md5sum, head, tail, wc

in the docker container all of these should be available

## Updating importer mapping types
The types returned by the `importer_mappings` endpoint (see [Get Importer Mappings](#get-importer-mappings) below)
are generated by the `GenerateMappings.py` script here: `staging_service/autodetect/GenerateMappings.py`.
Running this script will build the `supported_apps_w_extensions.json` file that should be placed in the
`deployment/conf` directory.

To add new mappings, update the `GenerateMappings.py` script. New file types and object types should be added
to the `staging_service.autodetect.Mappings` module and included from there. See `GenerateMappings.py`
docstrings for more details.

## API

all paths should be specified treating the user's home directory as root
Expand Down Expand Up @@ -1079,6 +1089,8 @@ For example,
- for files for which there is no predicted app, the return is a null value
- this endpoint is used to power the dropdowns for the staging service window in the Narrative

Note: to update these mappings see instructions [here](#updating-importer-mapping-types)

**URL** : `ci.kbase.us/services/staging_service/importer_mappings`

**local URL** : `localhost:3000/importer_mappings`
Expand Down
24 changes: 15 additions & 9 deletions deployment/conf/supported_apps_w_extensions.json
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@
"csv",
"tsv",
"xls",
"xlsx"
"xlsx",
"json"
],
"media": [
"tsv",
Expand Down Expand Up @@ -1101,26 +1102,31 @@
}
]
},
"smbl": {
"json": {
"file_ext_type": [
"SBML"
"JSON"
],
"mappings": [
{
"id": "fba_model",
"title": "FBA Model",
"id": "import_specification",
"title": "Import Specification",
"app_weight": 1
},
{
"id": "escher_map",
"title": "EscherMap",
"app_weight": 1
}
]
},
"json": {
"smbl": {
"file_ext_type": [
"JSON"
"SBML"
],
"mappings": [
{
"id": "escher_map",
"title": "EscherMap",
"id": "fba_model",
"title": "FBA Model",
"app_weight": 1
}
]
Expand Down
14 changes: 12 additions & 2 deletions staging_service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .app_error_formatter import format_import_spec_errors
from .auth2Client import KBaseAuth2
from .autodetect.Mappings import CSV, EXCEL, TSV
from .autodetect.Mappings import CSV, EXCEL, TSV, JSON
from .AutoDetectUtils import AutoDetectUtils
from .globus import assert_globusid_exists, is_globusid
from .import_specifications.file_parser import (
Expand All @@ -26,7 +26,12 @@
write_excel,
write_tsv,
)
from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv
from .import_specifications.individual_parsers import (
parse_csv,
parse_excel,
parse_tsv,
parse_dts_manifest,
)
from .JGIMetadata import read_metadata_for
from .metadata import add_upa, dir_info, similar, some_metadata
from .utils import AclManager, Path, run_command
Expand All @@ -43,6 +48,7 @@
CSV: parse_csv,
TSV: parse_tsv,
EXCEL: parse_excel,
JSON: parse_dts_manifest,
}

_IMPSPEC_FILE_TO_WRITER = {
Expand Down Expand Up @@ -110,6 +116,10 @@ async def bulk_specification(request: web.Request) -> web.json_response:
type, in the `types` key.
:param request: contains a comma separated list of files, e.g. folder1/file1.txt,file2.txt
TODO: since JSON files are rather generic and we might want to use a different JSON bulk-spec
format later, add a separate query parameter to request that the selected file is treated as a
Data Transfer Service manifest.
"""
username = await authorize_request(request)
files = parse_qs(request.query_string).get("files", [])
Expand Down
11 changes: 10 additions & 1 deletion staging_service/autodetect/GenerateMappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@
* Note: We should serve the generated content from memory
* Note: This doesn't handle if we want to have different output types based on file extensions
feeding into the same app
Adding new file types and app ids:
* file types and app ids should be added to staging_service.autodetect.Mappings
* file types should be all upper-case and represent the file suffix.
* app ids should be in snake_case, and represent either an object import type, or some
internal application to be run on selection (i.e. decompress or import specification).
* app ids that map to actual object import maps should match the app ids in the narrative
interface configuration here:
https://github.com/kbase/narrative/blob/main/kbase-extension/static/kbase/config/staging_upload.json
"""

from collections import defaultdict
Expand Down Expand Up @@ -120,7 +129,7 @@
fba_model_id,
import_specification,
]
file_format_to_app_mapping[JSON] = [escher_map_id]
file_format_to_app_mapping[JSON] = [escher_map_id, import_specification]
file_format_to_app_mapping[SBML] = [fba_model_id]

app_id_to_extensions = defaultdict(list)
Expand Down
50 changes: 50 additions & 0 deletions staging_service/import_specifications/individual_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import csv
import json
import math
import re
from pathlib import Path
Expand Down Expand Up @@ -316,3 +317,52 @@ def parse_excel(path: Path) -> ParseResults:
return ParseResults(frozendict(results))
else:
return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))


def parse_dts_manifest(path: Path) -> ParseResults:
"""
Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
The manifest should have this format, with expected keys included:
{
"resources": [{ file manifest info isn't currently relevant }],
"instructions": {
"protocol": "KBase narrative import",
"objects": [{
"data_type": str,
"parameters": {
"<param1>": value,
"<param2>": value,
}, ...
}]
}
}
This will get parsed and returned in line with the other parsers as a valid
ParseResults object, i.e. each result will be keyed on the data type string,
and its value will be a Tuple of frozendicts of the parameters. Also, in keeping
with the xsv parsers, each parameter value is expected to be a PRIMITIVE_TYPE.
TODO: include further details here, and in separate documentation - ADR?
"""
spcsrc = SpecificationSource(path)
errors = []
# dummy for now
results = {}
try:
with open(path, "r") as manifest:
manifest_json = json.load(manifest)
if not isinstance(manifest_json, dict):
errors.append(Error(ErrorType.PARSE_FAIL, "Manifest is not a dictionary", spcsrc))

except json.JSONDecodeError:
return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc))
except FileNotFoundError:
return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
except IsADirectoryError:
return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc))
if errors:
return ParseResults(errors=tuple(errors))
elif results:
return ParseResults(frozendict(results))
else:
return _error(Error(ErrorType.PARSE_FAIL, "No import specification data in file", spcsrc))
122 changes: 122 additions & 0 deletions tests/import_specifications/test_data/manifest_small.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
{
"name": "manifest",
"resources": [
{
"id": "JDP:555518eb0d8785178e712d88",
"name": "61564.assembled",
"path": "img/submissions/61564/61564.assembled.gff",
"format": "gff",
"media_type": "text/plain",
"bytes": 455161,
"hash": "",
"credit": {
"comment": "",
"content_url": "",
"contributors": null,
"credit_metadata_source": "",
"dates": null,
"descriptions": null,
"funding": null,
"identifier": "JDP:555518eb0d8785178e712d88",
"license": {
"id": "",
"url": ""
},
"publisher": {
"organization_id": "",
"organization_name": ""
},
"related_identifiers": null,
"resource_type": "dataset",
"titles": null,
"url": "",
"version": ""
},
"instructions": {
"data_type": "gff_metagenome",
"parameters": {
"param1": "value1",
"param2": "value2"
}
}
},
{
"id": "JDP:555518eb0d8785178e712d84",
"name": "61564.assembled",
"path": "img/submissions/61564/61564.assembled.fna",
"format": "fasta",
"media_type": "text/plain",
"bytes": 6354414,
"hash": "",
"credit": {
"comment": "",
"content_url": "",
"contributors": null,
"credit_metadata_source": "",
"dates": null,
"descriptions": null,
"funding": null,
"identifier": "JDP:555518eb0d8785178e712d84",
"license": {
"id": "",
"url": ""
},
"publisher": {
"organization_id": "",
"organization_name": ""
},
"related_identifiers": null,
"resource_type": "dataset",
"titles": null,
"url": "",
"version": ""
},
"instructions": {
"data_type": "gff_metagenome",
"parameters": {
"param1": "value1",
"param2": "value2"
}
}
},
{
"id": "JDP:555518ec0d8785178e712d9f",
"name": "61567.assembled",
"path": "img/submissions/61567/61567.assembled.gff",
"format": "gff",
"media_type": "text/plain",
"bytes": 545583,
"hash": "",
"credit": {
"comment": "",
"content_url": "",
"contributors": null,
"credit_metadata_source": "",
"dates": null,
"descriptions": null,
"funding": null,
"identifier": "JDP:555518ec0d8785178e712d9f",
"license": {
"id": "",
"url": ""
},
"publisher": {
"organization_id": "",
"organization_name": ""
},
"related_identifiers": null,
"resource_type": "dataset",
"titles": null,
"url": "",
"version": ""
},
"instructions": {
"data_type": "gff_metagenome",
"parameters": {
"param1": "value1",
"param2": "value2"
}
}
}
]
}
Loading

0 comments on commit a028bda

Please sign in to comment.