From 24d608cadec576e5689d5bf1df070ae1f3d9064c Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 4 Dec 2024 17:06:26 -0500 Subject: [PATCH 1/8] do the parsing, update schema --- .../schema/dts_manifest.json | 15 +- .../individual_parsers.py | 57 +++- .../test_data/manifest_errors.json | 81 +++++ .../test_data/manifest_multiple.json | 62 ++++ .../test_data/manifest_small.json | 39 ++- .../test_individual_parsers.py | 302 +++++++++++++++++- tests/test_app.py | 72 +++-- 7 files changed, 559 insertions(+), 69 deletions(-) create mode 100644 tests/import_specifications/test_data/manifest_errors.json create mode 100644 tests/import_specifications/test_data/manifest_multiple.json diff --git a/import_specifications/schema/dts_manifest.json b/import_specifications/schema/dts_manifest.json index fc70d7ef..07578596 100644 --- a/import_specifications/schema/dts_manifest.json +++ b/import_specifications/schema/dts_manifest.json @@ -13,7 +13,20 @@ "type": "object", "properties": { "data_type": {"type": "string"}, - "parameters": {"type": "object"} + "parameters": { + "type": "object", + "patternProperties": { + ".*": { + "oneOf": [ + {"type": "string"}, + {"type": "number"}, + {"type": "boolean"}, + {"type": "null"} + ] + } + }, + "additionalProperties": false + } }, "required": ["data_type", "parameters"] } diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 3b1dde5d..47becc33 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -61,6 +61,14 @@ "null", ] +_DTS_INSTRUCTIONS_KEY = "instructions" +_DTS_INSTRUCTIONS_DATATYPE_KEY = "data_type" +_DTS_INSTRUCTIONS_PARAMETERS_KEY = "parameters" +_DTS_INSTRUCTIONS_REQUIRED_KEYS = [_DTS_INSTRUCTIONS_DATATYPE_KEY, _DTS_INSTRUCTIONS_PARAMETERS_KEY] +_DTS_INSTRUCTIONS_PROTOCOL_KEY = "protocol" +_DTS_INSTRUCTIONS_PROTOCOL = "KBase narrative import" +_DTS_INSTRUCTIONS_OBJECTS_KEY = "objects" + class _ParseException(Exception): pass @@ -361,10 +369,18 @@ def parse_dts_manifest(path: Path, dts_manifest_schema: dict) -> ParseResults: err_str = err.message err_path = err.absolute_path if err_path: - if isinstance(err_path[-1], int): - err_path[-1] = f"item {err_path[-1]}" - err_str += f" at {'/'.join(err_path)}" + # paths can look like, say, ["instructions", "objects", 0, "data_type"] + # convert that '0' to "item 0" to be slightly more readable to users. + # kind of a mouthful below, but does that conversion in place + err_path = [f"item {elem}" if isinstance(elem, int) else elem for elem in err_path] + prep = "for" + if len(err_path) > 1: + prep = "at" + err_str += f" {prep} {'/'.join(err_path)}" errors.append(Error(ErrorType.PARSE_FAIL, err_str, spcsrc)) + if not errors: + results = _process_dts_manifest(manifest_json, spcsrc) + except jsonschema.exceptions.SchemaError: return _error(Error(ErrorType.OTHER, "Manifest schema is invalid", spcsrc)) except json.JSONDecodeError: @@ -373,9 +389,44 @@ def parse_dts_manifest(path: Path, dts_manifest_schema: dict) -> ParseResults: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) + except _ParseException as err: + return _error(err.args[0]) if errors: return ParseResults(errors=tuple(errors)) elif results: return ParseResults(frozendict(results)) else: return _error(Error(ErrorType.PARSE_FAIL, "No import specification data in file", spcsrc)) + + +def _process_dts_manifest( + manifest: dict[str, Any], spcsrc: SpecificationSource +) -> Tuple[dict[str, ParseResult]]: + """Parse the DTS manifest file and return the results and a list of errors if applicable. + + Results are returned as a dictionary where keys are data types, and values are ParseResults for that data type. + This assumes that the manifest has the correct structure, i.e. is validated via jsonschema. + Will raise KeyErrors otherwise. + """ + results = {} + instructions = manifest[_DTS_INSTRUCTIONS_KEY] + # Make sure the protocol value matches. + if instructions[_DTS_INSTRUCTIONS_PROTOCOL_KEY] != _DTS_INSTRUCTIONS_PROTOCOL: + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"The instructions protocol must be '{_DTS_INSTRUCTIONS_PROTOCOL}'", + spcsrc, + ) + ) + for resource_obj in instructions[_DTS_INSTRUCTIONS_OBJECTS_KEY]: + datatype = resource_obj[_DTS_INSTRUCTIONS_DATATYPE_KEY] + parameters = frozendict(resource_obj[_DTS_INSTRUCTIONS_PARAMETERS_KEY]) + if datatype not in results: + results[datatype] = [] + results[datatype].append(parameters) + # Package results as a dict of {datatype: ParseResult} + parsed_result = { + source: ParseResult(spcsrc, tuple(parsed)) for source, parsed in results.items() + } + return parsed_result diff --git a/tests/import_specifications/test_data/manifest_errors.json b/tests/import_specifications/test_data/manifest_errors.json new file mode 100644 index 00000000..5960b51d --- /dev/null +++ b/tests/import_specifications/test_data/manifest_errors.json @@ -0,0 +1,81 @@ +{ + "name": "manifest", + "resources": [ + { + "id": "JDP:555518eb0d8785178e712d88", + "name": "61564.assembled", + "path": "img/submissions/61564/61564.assembled.gff", + "format": "gff", + "media_type": "text/plain", + "bytes": 455161, + "hash": "", + "credit": { + "comment": "", + "content_url": "", + "contributors": null, + "credit_metadata_source": "", + "dates": null, + "descriptions": null, + "funding": null, + "identifier": "JDP:555518eb0d8785178e712d88", + "license": { + "id": "", + "url": "" + }, + "publisher": { + "organization_id": "", + "organization_name": "" + }, + "related_identifiers": null, + "resource_type": "dataset", + "titles": null, + "url": "", + "version": "" + } + }, + { + "id": "JDP:555518eb0d8785178e712d84", + "name": "61564.assembled", + "path": "img/submissions/61564/61564.assembled.fna", + "format": "fasta", + "media_type": "text/plain", + "bytes": 6354414, + "hash": "", + "credit": { + "comment": "", + "content_url": "", + "contributors": null, + "credit_metadata_source": "", + "dates": null, + "descriptions": null, + "funding": null, + "identifier": "JDP:555518eb0d8785178e712d84", + "license": { + "id": "", + "url": "" + }, + "publisher": { + "organization_id": "", + "organization_name": "" + }, + "related_identifiers": null, + "resource_type": "dataset", + "titles": null, + "url": "", + "version": "" + } + } + ], + "instructions": { + "protocol": "KBase narrative import", + "objects": [ + { + "data_type": "gff_metagenome", + "parameters": { + "param1": "value1", + "param2": "value2" + } + } + ] + } + } diff --git a/tests/import_specifications/test_data/manifest_multiple.json b/tests/import_specifications/test_data/manifest_multiple.json new file mode 100644 index 00000000..393bfb84 --- /dev/null +++ b/tests/import_specifications/test_data/manifest_multiple.json @@ -0,0 +1,62 @@ +{ + "name": "manifest", + "resources": [ + { + "id": "JDP:555518eb0d8785178e712d88", + "name": "61564.assembled", + "path": "img/submissions/61564/61564.assembled.gff", + "format": "gff" + }, + { + "id": "JDP:555518eb0d8785178e712d84", + "name": "61564.assembled", + "path": "img/submissions/61564/61564.assembled.fna", + "format": "fasta" + }, + { + "id": "JDP:555518ec0d8785178e712d9f", + "name": "61567.assembled", + "path": "img/submissions/61567/61567.assembled.gff", + "format": "gff" + }, + { + "id": "JDP:555518ec0d8785178e712d9b", + "name": "61567.assembled", + "path": "img/submissions/61567/61567.assembled.fna", + "format": "fasta" + } + ], + "instructions": { + "protocol": "KBase narrative import", + "objects": [ + { + "data_type": "gff_metagenome", + "parameters": { + "mg_param1": "value1", + "mg_param2": "value2" + } + }, + { + "data_type": "gff_metagenome", + "parameters": { + "mg_param1": "value3", + "mg_param2": "value4" + } + }, + { + "data_type": "gff_genome", + "parameters": { + "gen_param1": "value1", + "gen_param2": "value2" + } + }, + { + "data_type": "gff_genome", + "parameters": { + "gen_param1": "value3", + "gen_param2": "value4" + } + } + ] + } + } diff --git a/tests/import_specifications/test_data/manifest_small.json b/tests/import_specifications/test_data/manifest_small.json index e87472d5..dd9b7595 100644 --- a/tests/import_specifications/test_data/manifest_small.json +++ b/tests/import_specifications/test_data/manifest_small.json @@ -31,13 +31,6 @@ "titles": null, "url": "", "version": "" - }, - "instructions": { - "data_type": "gff_metagenome", - "parameters": { - "param1": "value1", - "param2": "value2" - } } }, { @@ -70,13 +63,6 @@ "titles": null, "url": "", "version": "" - }, - "instructions": { - "data_type": "gff_metagenome", - "parameters": { - "param1": "value1", - "param2": "value2" - } } }, { @@ -109,14 +95,33 @@ "titles": null, "url": "", "version": "" + } + } + ], + "instructions": { + "protocol": "KBase narrative import", + "objects": [ + { + "data_type": "gff_metagenome", + "parameters": { + "param1": "value1", + "param2": "value2" + } }, - "instructions": { + { + "data_type": "gff_metagenome", + "parameters": { + "param1": "value1", + "param2": "value2" + } + }, + { "data_type": "gff_metagenome", "parameters": { "param1": "value1", "param2": "value2" } } - } - ] + ] + } } diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index 4ef05d8b..8b6d7ef2 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -18,6 +18,7 @@ parse_dts_manifest, parse_excel, parse_tsv, + _DTS_INSTRUCTIONS_PROTOCOL, ) from tests.test_app import FileUtil from tests.test_utils import bootstrap_config @@ -778,22 +779,6 @@ def test_excel_parse_fail_unequal_rows(): ) -def test_dts_manifest_parse_success(dts_schema: dict[str, Any]): - f = _get_test_file("manifest_small.json") - res = parse_dts_manifest(f, dts_schema) - # fails for now - assert res.results is None - assert res.errors == tuple( - [ - Error( - ErrorType.PARSE_FAIL, - "'instructions' is a required property", - SpecificationSource(f), - ) - ] - ) - - @pytest.fixture(scope="module") def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[dict | list], Path]: def manifest_writer(input_json: dict | list) -> Path: @@ -805,6 +790,51 @@ def manifest_writer(input_json: dict | list) -> Path: return manifest_writer +def test_dts_manifest_parse_success(dts_schema: dict[str, Any]): + f = _get_test_file("manifest_small.json") + res = parse_dts_manifest(f, dts_schema) + assert res.results + assert res.errors is None + assert list(res.results.keys()) == ["gff_metagenome"] + assert res.results["gff_metagenome"] + assert res.results["gff_metagenome"].source.file == f + assert len(res.results["gff_metagenome"].result) == 3 + for parsed in res.results["gff_metagenome"].result: + assert parsed == {"param1": "value1", "param2": "value2"} + + +def test_dts_manifest_parse_multi_data_types_success(dts_schema: dict[str, Any]): + f = _get_test_file("manifest_multiple.json") + res = parse_dts_manifest(f, dts_schema) + assert res.results + assert res.errors is None + assert len(res.results.keys()) == 2 + expected = { + "gff_metagenome": ( + frozendict({"mg_param1": "value1", "mg_param2": "value2"}), + frozendict({"mg_param1": "value3", "mg_param2": "value4"}), + ), + "gff_genome": ( + frozendict( + { + "gen_param1": "value1", + "gen_param2": "value2", + } + ), + frozendict( + { + "gen_param1": "value3", + "gen_param2": "value4", + } + ), + ), + } + for key in expected.keys(): + assert key in res.results + assert res.results[key].source.file == f + assert res.results[key].result == expected[key] + + def _dts_manifest_parse_fail(input_file: Path, schema: dict, errors: list[Error]): """ Tests a failing DTS manifest parse. @@ -931,3 +961,243 @@ def test_dts_manifest_fail_with_path( ) ], ) + + +malformed_dict = [[], 1, "nope", None] + + +@pytest.mark.parametrize("bad_instruction", malformed_dict) +def test_dts_manifest_malformed_instructions( + write_dts_manifest: Callable[[dict | list], Path], + dts_schema: dict[str, Any], + bad_instruction: list | int | str | None, +): + manifest_file = write_dts_manifest({"resources": [], "instructions": bad_instruction}) + err_val = bad_instruction + if isinstance(err_val, str): + err_val = f"'{err_val}'" + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + f"{err_val} is not of type 'object' for instructions", + SpecificationSource(manifest_file), + ) + ], + ) + + +@pytest.mark.parametrize("bad_parameters", malformed_dict) +def test_dts_manifest_malformed_parameters( + write_dts_manifest: Callable[[dict | list], Path], + dts_schema: dict[str, Any], + bad_parameters: list | int | str | None, +): + manifest_file = write_dts_manifest( + { + "resources": [], + "instructions": { + "protocol": _DTS_INSTRUCTIONS_PROTOCOL, + "objects": [{"data_type": "some_type", "parameters": bad_parameters}], + }, + } + ) + err_val = bad_parameters + if isinstance(err_val, str): + err_val = f"'{err_val}'" + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + f"{err_val} is not of type 'object' at instructions/objects/item 0/parameters", + SpecificationSource(manifest_file), + ) + ], + ) + + +missing_key_cases = [["data_type"], ["parameters"], ["data_type", "parameters"]] + + +@pytest.mark.parametrize("missing_keys", missing_key_cases) +def test_dts_manifest_missing_instruction_keys( + write_dts_manifest: Callable[[dict | list], Path], + dts_schema: dict[str, Any], + missing_keys: list[str], +): + resource_obj = {"data_type": "some_type", "parameters": {"p1": "v1"}} + for key in missing_keys: + del resource_obj[key] + manifest_file = write_dts_manifest( + { + "resources": [], + "instructions": {"protocol": _DTS_INSTRUCTIONS_PROTOCOL, "objects": [resource_obj]}, + } + ) + error_list = [] + for key in missing_keys: + error_list.append( + Error( + ErrorType.PARSE_FAIL, + f"'{key}' is a required property at instructions/objects/item 0", + SpecificationSource(manifest_file), + ) + ) + _dts_manifest_parse_fail(manifest_file, dts_schema, error_list) + + +def test_dts_manifest_empty( + write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] +): + manifest_file = write_dts_manifest( + {"resources": [], "instructions": {"protocol": _DTS_INSTRUCTIONS_PROTOCOL, "objects": []}} + ) + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + "No import specification data in file", + SpecificationSource(manifest_file), + ) + ], + ) + + +@pytest.mark.parametrize("non_str", [{"a": "b"}, ["a", "b"], 1, None]) +def test_dts_manifest_fail_data_type_not_str( + write_dts_manifest: Callable[[dict | list], Path], + dts_schema: dict[str, Any], + non_str: dict | list | int | None, +): + manifest_file = write_dts_manifest( + { + "resources": [], + "instructions": { + "protocol": _DTS_INSTRUCTIONS_PROTOCOL, + "objects": [{"data_type": non_str, "parameters": {}}], + }, + } + ) + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + f"{non_str} is not of type 'string' at instructions/objects/item 0/data_type", + SpecificationSource(manifest_file), + ) + ], + ) + + +def test_dts_manifest_missing_instructions_protocol( + write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] +): + manifest_file = write_dts_manifest({"resources": [], "instructions": {"objects": []}}) + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + "'protocol' is a required property for instructions", + SpecificationSource(manifest_file), + ) + ], + ) + + +def test_dts_manifest_wrong_protocol( + write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] +): + manifest_file = write_dts_manifest( + {"resources": [], "instructions": {"protocol": "some wrong protocol", "objects": []}} + ) + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + f"The instructions protocol must be '{_DTS_INSTRUCTIONS_PROTOCOL}'", + SpecificationSource(manifest_file), + ) + ], + ) + + +def test_dts_manifest_missing_objects( + write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] +): + manifest_file = write_dts_manifest( + {"resources": [], "instructions": {"protocol": _DTS_INSTRUCTIONS_PROTOCOL}} + ) + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + "'objects' is a required property for instructions", + SpecificationSource(manifest_file), + ) + ], + ) + + +@pytest.mark.parametrize("not_dict", malformed_dict) +def test_dts_manifest_resource_not_dict( + write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any], not_dict +): + manifest_file = write_dts_manifest( + { + "resources": [], + "instructions": {"protocol": _DTS_INSTRUCTIONS_PROTOCOL, "objects": [not_dict]}, + } + ) + err_val = not_dict + if isinstance(err_val, str): + err_val = f"'{err_val}'" + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + f"{err_val} is not of type 'object' at instructions/objects/item 0", + SpecificationSource(manifest_file), + ) + ], + ) + + +def test_dts_manifest_parameter_not_primitive( + write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] +): + manifest_file = write_dts_manifest( + { + "resources": [], + "instructions": { + "protocol": _DTS_INSTRUCTIONS_PROTOCOL, + "objects": [{"data_type": "some_type", "parameters": {"foo": ["not", "allowed"]}}], + }, + } + ) + _dts_manifest_parse_fail( + manifest_file, + dts_schema, + [ + Error( + ErrorType.PARSE_FAIL, + "['not', 'allowed'] is not valid under any of the given schemas at instructions/objects/item 0/parameters/foo", + SpecificationSource(manifest_file), + ) + ], + ) diff --git a/tests/test_app.py b/tests/test_app.py index c1fcd8d8..592c7c29 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1111,40 +1111,48 @@ async def test_bulk_specification_dts_success(): resp = await cli.get(f"bulk_specification/?files={manifest_1} , {manifest_2}&dts=1") jsn = await resp.json() # fails for now. will update when schema/parser is properly finished. + # assert jsn == { + # "errors": [ + # { + # "type": "cannot_parse_file", + # "file": f"testuser/{manifest_1}", + # "message": "No import specification data in file", + # "tab": None, + # }, + # { + # "type": "cannot_parse_file", + # "file": f"testuser/{manifest_2}", + # "message": "No import specification data in file", + # "tab": None, + # }, + # ] + # } + # soon will be this: assert jsn == { - "errors": [ - { - "type": "cannot_parse_file", - "file": f"testuser/{manifest_1}", - "message": "No import specification data in file", - "tab": None, - }, - { - "type": "cannot_parse_file", - "file": f"testuser/{manifest_2}", - "message": "No import specification data in file", - "tab": None, - }, - ] + "types": { + "gff_genome": [ + { + "fasta_file": "g_fasta_1", + "gff_file": "g_gff_1", + "genome_name": "genome_1", + }, + { + "fasta_file": "g_fasta_2", + "gff_file": "g_gff_2", + "genome_name": "genome_2", + }, + ], + "gff_metagenome": [ + {"fasta_file": "fasta_1", "gff_file": "gff_1", "genome_name": "mg_1"}, + {"fasta_file": "fasta_2", "gff_file": "gff_2", "genome_name": "mg_2"}, + ], + }, + "files": { + "gff_metagenome": {"file": f"testuser/{manifest_1}", "tab": None}, + "gff_genome": {"file": f"testuser/{manifest_2}", "tab": None}, + }, } - # soon will be this: - # assert json == { - # "types": { - # "gff_genome": [ - # {"fasta_file": "g_fasta_1", "gff_file": "g_gff_1", "genome_name": "genome_1"}, - # {"fasta_file": "g_fasta_2", "gff_file": "g_gff_2", "genome_name": "genome_2"} - # ], - # "gff_metagenome": [ - # {"fasta_file": "fasta_1", "gff_file": "gff_1", "genome_name": "mg_1"}, - # {"fasta_file": "fasta_2", "gff_file": "gff_2", "genome_name": "mg_2"} - # ] - # }, - # "files": { - # "gff_metagenome": {"file": f"testuser/{manifest_1}", "tab": None}, - # "gff_genome": {"file": f"testuser/{manifest_2}", "tab": None} - # } - # } - assert resp.status == 400 + assert resp.status == 200 async def test_bulk_specification_dts_fail_json_without_dts(): From aa3e8df9a6f3ee1839012c7da3106580be73a432 Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 4 Dec 2024 17:09:29 -0500 Subject: [PATCH 2/8] remove commented code --- tests/test_app.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/test_app.py b/tests/test_app.py index 592c7c29..3cbd265d 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1110,24 +1110,6 @@ async def test_bulk_specification_dts_success(): json.dump(manifest_2_dict, f) resp = await cli.get(f"bulk_specification/?files={manifest_1} , {manifest_2}&dts=1") jsn = await resp.json() - # fails for now. will update when schema/parser is properly finished. - # assert jsn == { - # "errors": [ - # { - # "type": "cannot_parse_file", - # "file": f"testuser/{manifest_1}", - # "message": "No import specification data in file", - # "tab": None, - # }, - # { - # "type": "cannot_parse_file", - # "file": f"testuser/{manifest_2}", - # "message": "No import specification data in file", - # "tab": None, - # }, - # ] - # } - # soon will be this: assert jsn == { "types": { "gff_genome": [ From 1c8ef68b6514310432dd097ce16cd49c28b3ed7a Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 11 Dec 2024 22:23:32 -0500 Subject: [PATCH 3/8] update parser, tests --- .../individual_parsers.py | 2 - .../test_individual_parsers.py | 85 ++++++++----------- tests/test_app.py | 4 +- 3 files changed, 36 insertions(+), 55 deletions(-) diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 5a7751ec..56e2af11 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -371,8 +371,6 @@ def parse_dts_manifest(path: Path, validator: Draft202012Validator) -> ParseResu if not errors: results = _process_dts_manifest(manifest_json, spcsrc) - except jsonschema.exceptions.SchemaError: - return _error(Error(ErrorType.OTHER, "Manifest schema is invalid", spcsrc)) except json.JSONDecodeError: return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc)) except FileNotFoundError: diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index d449b022..2408208b 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -780,22 +780,6 @@ def test_excel_parse_fail_unequal_rows(): ) -def test_dts_manifest_parse_success(dts_validator: Draft202012Validator): - f = _get_test_file("manifest_small.json") - res = parse_dts_manifest(f, dts_validator) - # fails for now - assert res.results is None - assert res.errors == tuple( - [ - Error( - ErrorType.PARSE_FAIL, - "'instructions' is a required property", - SpecificationSource(f), - ) - ] - ) - - @pytest.fixture(scope="module") def write_dts_manifest(temp_dir: Generator[Path, None, None]) -> Callable[[dict | list], Path]: def manifest_writer(input_json: dict | list) -> Path: @@ -807,9 +791,9 @@ def manifest_writer(input_json: dict | list) -> Path: return manifest_writer -def test_dts_manifest_parse_success(dts_schema: dict[str, Any]): +def test_dts_manifest_parse_success(dts_validator: Draft202012Validator): f = _get_test_file("manifest_small.json") - res = parse_dts_manifest(f, dts_schema) + res = parse_dts_manifest(f, dts_validator) assert res.results assert res.errors is None assert list(res.results.keys()) == ["gff_metagenome"] @@ -820,9 +804,9 @@ def test_dts_manifest_parse_success(dts_schema: dict[str, Any]): assert parsed == {"param1": "value1", "param2": "value2"} -def test_dts_manifest_parse_multi_data_types_success(dts_schema: dict[str, Any]): +def test_dts_manifest_parse_multi_data_types_success(dts_validator: Draft202012Validator): f = _get_test_file("manifest_multiple.json") - res = parse_dts_manifest(f, dts_schema) + res = parse_dts_manifest(f, dts_validator) assert res.results assert res.errors is None assert len(res.results.keys()) == 2 @@ -852,7 +836,6 @@ def test_dts_manifest_parse_multi_data_types_success(dts_schema: dict[str, Any]) assert res.results[key].result == expected[key] -def _dts_manifest_parse_fail(input_file: Path, schema: dict, errors: list[Error]): def _dts_manifest_parse_fail( input_file: Path, validator: Draft202012Validator, errors: list[Error] ): @@ -959,7 +942,7 @@ def test_dts_manifest_fail_with_path( { "resources": [], "instructions": { - "protocol": "KBase narrative import", + "protocol": _DTS_INSTRUCTIONS_PROTOCOL, "objects": [{"data_type": "foo", "parameters": {}}, {"parameters": {}}], }, } @@ -983,7 +966,7 @@ def test_dts_manifest_fail_with_path( @pytest.mark.parametrize("bad_instruction", malformed_dict) def test_dts_manifest_malformed_instructions( write_dts_manifest: Callable[[dict | list], Path], - dts_schema: dict[str, Any], + dts_validator: Draft202012Validator, bad_instruction: list | int | str | None, ): manifest_file = write_dts_manifest({"resources": [], "instructions": bad_instruction}) @@ -992,11 +975,11 @@ def test_dts_manifest_malformed_instructions( err_val = f"'{err_val}'" _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - f"{err_val} is not of type 'object' for instructions", + f"{err_val} is not of type 'object' at ['instructions']", SpecificationSource(manifest_file), ) ], @@ -1006,7 +989,7 @@ def test_dts_manifest_malformed_instructions( @pytest.mark.parametrize("bad_parameters", malformed_dict) def test_dts_manifest_malformed_parameters( write_dts_manifest: Callable[[dict | list], Path], - dts_schema: dict[str, Any], + dts_validator: Draft202012Validator, bad_parameters: list | int | str | None, ): manifest_file = write_dts_manifest( @@ -1023,11 +1006,11 @@ def test_dts_manifest_malformed_parameters( err_val = f"'{err_val}'" _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - f"{err_val} is not of type 'object' at instructions/objects/item 0/parameters", + f"{err_val} is not of type 'object' at ['instructions', 'objects', 0, 'parameters']", SpecificationSource(manifest_file), ) ], @@ -1040,7 +1023,7 @@ def test_dts_manifest_malformed_parameters( @pytest.mark.parametrize("missing_keys", missing_key_cases) def test_dts_manifest_missing_instruction_keys( write_dts_manifest: Callable[[dict | list], Path], - dts_schema: dict[str, Any], + dts_validator: Draft202012Validator, missing_keys: list[str], ): resource_obj = {"data_type": "some_type", "parameters": {"p1": "v1"}} @@ -1057,22 +1040,22 @@ def test_dts_manifest_missing_instruction_keys( error_list.append( Error( ErrorType.PARSE_FAIL, - f"'{key}' is a required property at instructions/objects/item 0", + f"'{key}' is a required property at ['instructions', 'objects', 0]", SpecificationSource(manifest_file), ) ) - _dts_manifest_parse_fail(manifest_file, dts_schema, error_list) + _dts_manifest_parse_fail(manifest_file, dts_validator, error_list) def test_dts_manifest_empty( - write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] + write_dts_manifest: Callable[[dict | list], Path], dts_validator: Draft202012Validator ): manifest_file = write_dts_manifest( {"resources": [], "instructions": {"protocol": _DTS_INSTRUCTIONS_PROTOCOL, "objects": []}} ) _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, @@ -1086,7 +1069,7 @@ def test_dts_manifest_empty( @pytest.mark.parametrize("non_str", [{"a": "b"}, ["a", "b"], 1, None]) def test_dts_manifest_fail_data_type_not_str( write_dts_manifest: Callable[[dict | list], Path], - dts_schema: dict[str, Any], + dts_validator: Draft202012Validator, non_str: dict | list | int | None, ): manifest_file = write_dts_manifest( @@ -1100,11 +1083,11 @@ def test_dts_manifest_fail_data_type_not_str( ) _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - f"{non_str} is not of type 'string' at instructions/objects/item 0/data_type", + f"{non_str} is not of type 'string' at ['instructions', 'objects', 0, 'data_type']", SpecificationSource(manifest_file), ) ], @@ -1112,16 +1095,16 @@ def test_dts_manifest_fail_data_type_not_str( def test_dts_manifest_missing_instructions_protocol( - write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] + write_dts_manifest: Callable[[dict | list], Path], dts_validator: Draft202012Validator ): manifest_file = write_dts_manifest({"resources": [], "instructions": {"objects": []}}) _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - "'protocol' is a required property for instructions", + "'protocol' is a required property at ['instructions']", SpecificationSource(manifest_file), ) ], @@ -1129,18 +1112,18 @@ def test_dts_manifest_missing_instructions_protocol( def test_dts_manifest_wrong_protocol( - write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] + write_dts_manifest: Callable[[dict | list], Path], dts_validator: Draft202012Validator ): manifest_file = write_dts_manifest( {"resources": [], "instructions": {"protocol": "some wrong protocol", "objects": []}} ) _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - f"The instructions protocol must be '{_DTS_INSTRUCTIONS_PROTOCOL}'", + f"'{_DTS_INSTRUCTIONS_PROTOCOL}' was expected at ['instructions', 'protocol']", SpecificationSource(manifest_file), ) ], @@ -1148,18 +1131,18 @@ def test_dts_manifest_wrong_protocol( def test_dts_manifest_missing_objects( - write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] + write_dts_manifest: Callable[[dict | list], Path], dts_validator: Draft202012Validator ): manifest_file = write_dts_manifest( {"resources": [], "instructions": {"protocol": _DTS_INSTRUCTIONS_PROTOCOL}} ) _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - "'objects' is a required property for instructions", + "'objects' is a required property at ['instructions']", SpecificationSource(manifest_file), ) ], @@ -1168,7 +1151,7 @@ def test_dts_manifest_missing_objects( @pytest.mark.parametrize("not_dict", malformed_dict) def test_dts_manifest_resource_not_dict( - write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any], not_dict + write_dts_manifest: Callable[[dict | list], Path], dts_validator: Draft202012Validator, not_dict ): manifest_file = write_dts_manifest( { @@ -1181,11 +1164,11 @@ def test_dts_manifest_resource_not_dict( err_val = f"'{err_val}'" _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - f"{err_val} is not of type 'object' at instructions/objects/item 0", + f"{err_val} is not of type 'object' at ['instructions', 'objects', 0]", SpecificationSource(manifest_file), ) ], @@ -1193,7 +1176,7 @@ def test_dts_manifest_resource_not_dict( def test_dts_manifest_parameter_not_primitive( - write_dts_manifest: Callable[[dict | list], Path], dts_schema: dict[str, Any] + write_dts_manifest: Callable[[dict | list], Path], dts_validator: Draft202012Validator ): manifest_file = write_dts_manifest( { @@ -1206,11 +1189,11 @@ def test_dts_manifest_parameter_not_primitive( ) _dts_manifest_parse_fail( manifest_file, - dts_schema, + dts_validator, [ Error( ErrorType.PARSE_FAIL, - "['not', 'allowed'] is not valid under any of the given schemas at instructions/objects/item 0/parameters/foo", + "['not', 'allowed'] is not valid under any of the given schemas at ['instructions', 'objects', 0, 'parameters', 'foo']", SpecificationSource(manifest_file), ) ], diff --git a/tests/test_app.py b/tests/test_app.py index 0fb5e10b..a2f7135c 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1137,8 +1137,8 @@ async def test_bulk_specification_dts_success(): ], }, "files": { - "gff_metagenome": {"file": f"testuser/{manifest_1}", "tab": None}, - "gff_genome": {"file": f"testuser/{manifest_2}", "tab": None}, + "gff_metagenome": {"file": f"testuser/{sub_dir}/{manifest_1}", "tab": None}, + "gff_genome": {"file": f"testuser/{sub_dir}/{manifest_2}", "tab": None}, }, } assert resp.status == 200 From 33ffb2c68cbfe985a8e5ead90aa8b7fc13de79cc Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Wed, 11 Dec 2024 22:37:19 -0500 Subject: [PATCH 4/8] simply and reformat --- .../individual_parsers.py | 57 +++++-------------- .../test_individual_parsers.py | 2 +- 2 files changed, 15 insertions(+), 44 deletions(-) diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 56e2af11..65e038b5 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -2,6 +2,7 @@ Contains parser functions for use with the file parser framework. """ +from collections import defaultdict import csv import json import math @@ -63,9 +64,6 @@ _DTS_INSTRUCTIONS_KEY = "instructions" _DTS_INSTRUCTIONS_DATATYPE_KEY = "data_type" _DTS_INSTRUCTIONS_PARAMETERS_KEY = "parameters" -_DTS_INSTRUCTIONS_REQUIRED_KEYS = [_DTS_INSTRUCTIONS_DATATYPE_KEY, _DTS_INSTRUCTIONS_PARAMETERS_KEY] -_DTS_INSTRUCTIONS_PROTOCOL_KEY = "protocol" -_DTS_INSTRUCTIONS_PROTOCOL = "KBase narrative import" _DTS_INSTRUCTIONS_OBJECTS_KEY = "objects" @@ -359,62 +357,35 @@ def parse_dts_manifest(path: Path, validator: Draft202012Validator) -> ParseResu results = {} try: with open(path, "r") as manifest: - manifest_json = json.load(manifest) - if not isinstance(manifest_json, dict): + manifest = json.load(manifest) + if not isinstance(manifest, dict): return _error(Error(ErrorType.PARSE_FAIL, "Manifest is not a dictionary", spcsrc)) - for err in validator.iter_errors(manifest_json): + for err in validator.iter_errors(manifest): err_str = err.message err_path = list(err.absolute_path) if err_path: err_str += f" at {err_path}" errors.append(Error(ErrorType.PARSE_FAIL, err_str, spcsrc)) if not errors: - results = _process_dts_manifest(manifest_json, spcsrc) - + results = defaultdict(list) + instructions = manifest[_DTS_INSTRUCTIONS_KEY] + for resource_obj in instructions[_DTS_INSTRUCTIONS_OBJECTS_KEY]: + datatype = resource_obj[_DTS_INSTRUCTIONS_DATATYPE_KEY] + parameters = frozendict(resource_obj[_DTS_INSTRUCTIONS_PARAMETERS_KEY]) + results[datatype].append(parameters) + # Re-package results as a dict of {datatype: ParseResult} + results = { + source: ParseResult(spcsrc, tuple(parsed)) for source, parsed in results.items() + } except json.JSONDecodeError: return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc)) except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) - except _ParseException as err: - return _error(err.args[0]) if errors: return ParseResults(errors=tuple(errors)) elif results: return ParseResults(frozendict(results)) else: return _error(Error(ErrorType.PARSE_FAIL, "No import specification data in file", spcsrc)) - - -def _process_dts_manifest( - manifest: dict[str, Any], spcsrc: SpecificationSource -) -> Tuple[dict[str, ParseResult]]: - """Parse the DTS manifest file and return the results and a list of errors if applicable. - - Results are returned as a dictionary where keys are data types, and values are ParseResults for that data type. - This assumes that the manifest has the correct structure, i.e. is validated via jsonschema. - Will raise KeyErrors otherwise. - """ - results = {} - instructions = manifest[_DTS_INSTRUCTIONS_KEY] - # Make sure the protocol value matches. - if instructions[_DTS_INSTRUCTIONS_PROTOCOL_KEY] != _DTS_INSTRUCTIONS_PROTOCOL: - raise _ParseException( - Error( - ErrorType.PARSE_FAIL, - f"The instructions protocol must be '{_DTS_INSTRUCTIONS_PROTOCOL}'", - spcsrc, - ) - ) - for resource_obj in instructions[_DTS_INSTRUCTIONS_OBJECTS_KEY]: - datatype = resource_obj[_DTS_INSTRUCTIONS_DATATYPE_KEY] - parameters = frozendict(resource_obj[_DTS_INSTRUCTIONS_PARAMETERS_KEY]) - if datatype not in results: - results[datatype] = [] - results[datatype].append(parameters) - # Package results as a dict of {datatype: ParseResult} - parsed_result = { - source: ParseResult(spcsrc, tuple(parsed)) for source, parsed in results.items() - } - return parsed_result diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index 2408208b..33ed584b 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -17,7 +17,6 @@ parse_dts_manifest, parse_excel, parse_tsv, - _DTS_INSTRUCTIONS_PROTOCOL, ) from tests.test_app import FileUtil from tests.test_utils import bootstrap_config @@ -25,6 +24,7 @@ _TEST_DATA_DIR = (Path(__file__).parent / "test_data").resolve() +_DTS_INSTRUCTIONS_PROTOCOL = "KBase narrative import" @pytest.fixture(scope="module", name="temp_dir") From 6136140cb39e054990bf62fa581706779586df7a Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Mon, 16 Dec 2024 10:41:18 -0500 Subject: [PATCH 5/8] minor tweaks --- .../individual_parsers.py | 2 +- .../test_data/manifest_small.json | 8 ++++---- .../test_individual_parsers.py | 19 ++++++++++++++----- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 65e038b5..9aafe79b 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -375,7 +375,7 @@ def parse_dts_manifest(path: Path, validator: Draft202012Validator) -> ParseResu results[datatype].append(parameters) # Re-package results as a dict of {datatype: ParseResult} results = { - source: ParseResult(spcsrc, tuple(parsed)) for source, parsed in results.items() + datatype: ParseResult(spcsrc, tuple(parsed)) for datatype, parsed in results.items() } except json.JSONDecodeError: return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc)) diff --git a/tests/import_specifications/test_data/manifest_small.json b/tests/import_specifications/test_data/manifest_small.json index dd9b7595..54a102f8 100644 --- a/tests/import_specifications/test_data/manifest_small.json +++ b/tests/import_specifications/test_data/manifest_small.json @@ -111,15 +111,15 @@ { "data_type": "gff_metagenome", "parameters": { - "param1": "value1", - "param2": "value2" + "param1": "value3", + "param2": "value4" } }, { "data_type": "gff_metagenome", "parameters": { - "param1": "value1", - "param2": "value2" + "param1": "value5", + "param2": "value6" } } ] diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index 33ed584b..4ce8aef6 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -797,12 +797,21 @@ def test_dts_manifest_parse_success(dts_validator: Draft202012Validator): assert res.results assert res.errors is None assert list(res.results.keys()) == ["gff_metagenome"] - assert res.results["gff_metagenome"] assert res.results["gff_metagenome"].source.file == f - assert len(res.results["gff_metagenome"].result) == 3 - for parsed in res.results["gff_metagenome"].result: - assert parsed == {"param1": "value1", "param2": "value2"} - + assert res.results["gff_metagenome"].result == ( + frozendict({ + "param1": "value1", + "param2": "value2" + }), + frozendict({ + "param1": "value3", + "param2": "value4" + }), + frozendict({ + "param1": "value5", + "param2": "value6" + }) + ) def test_dts_manifest_parse_multi_data_types_success(dts_validator: Draft202012Validator): f = _get_test_file("manifest_multiple.json") From 499dfe4de3d87f9c5aeddef32e7a65732d836532 Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Mon, 16 Dec 2024 10:41:52 -0500 Subject: [PATCH 6/8] ruff formatting --- .../test_individual_parsers.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index 4ce8aef6..61232d9f 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -799,20 +799,12 @@ def test_dts_manifest_parse_success(dts_validator: Draft202012Validator): assert list(res.results.keys()) == ["gff_metagenome"] assert res.results["gff_metagenome"].source.file == f assert res.results["gff_metagenome"].result == ( - frozendict({ - "param1": "value1", - "param2": "value2" - }), - frozendict({ - "param1": "value3", - "param2": "value4" - }), - frozendict({ - "param1": "value5", - "param2": "value6" - }) + frozendict({"param1": "value1", "param2": "value2"}), + frozendict({"param1": "value3", "param2": "value4"}), + frozendict({"param1": "value5", "param2": "value6"}), ) + def test_dts_manifest_parse_multi_data_types_success(dts_validator: Draft202012Validator): f = _get_test_file("manifest_multiple.json") res = parse_dts_manifest(f, dts_validator) From 224153aa5bbf9fb68804df481030adc252904baa Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Tue, 17 Dec 2024 10:30:40 -0500 Subject: [PATCH 7/8] fix typo --- staging_service/import_specifications/individual_parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 9aafe79b..35d2a88f 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -375,7 +375,7 @@ def parse_dts_manifest(path: Path, validator: Draft202012Validator) -> ParseResu results[datatype].append(parameters) # Re-package results as a dict of {datatype: ParseResult} results = { - datatype: ParseResult(spcsrc, tuple(parsed)) for datatype, parsed in results.items() + datatype: ParseResult(spcsrc, tuple(paramlist)) for datatype, paramlist in results.items() } except json.JSONDecodeError: return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc)) From d0e08b942383fc7b02c0aa5e4aa56b3e2654e01a Mon Sep 17 00:00:00 2001 From: Bill Riehl Date: Tue, 17 Dec 2024 10:33:12 -0500 Subject: [PATCH 8/8] ruff formatting --- staging_service/import_specifications/individual_parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 35d2a88f..b13aba62 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -375,7 +375,8 @@ def parse_dts_manifest(path: Path, validator: Draft202012Validator) -> ParseResu results[datatype].append(parameters) # Re-package results as a dict of {datatype: ParseResult} results = { - datatype: ParseResult(spcsrc, tuple(paramlist)) for datatype, paramlist in results.items() + datatype: ParseResult(spcsrc, tuple(paramlist)) + for datatype, paramlist in results.items() } except json.JSONDecodeError: return _error(Error(ErrorType.PARSE_FAIL, "File must be in JSON format", spcsrc))