Skip to content

Commit

Permalink
Merge branch 'dts-manifest-schema' into dts-manifest-parse
Browse files Browse the repository at this point in the history
  • Loading branch information
briehl committed Dec 12, 2024
2 parents aa3e8df + cb4b95a commit c099ca5
Show file tree
Hide file tree
Showing 7 changed files with 198 additions and 86 deletions.
2 changes: 1 addition & 1 deletion deployment/conf/deployment.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ DATA_DIR = /kb/deployment/lib/src/data/bulk/
AUTH_URL = https://ci.kbase.us/services/auth/api/V2/token
CONCIERGE_PATH = /kbaseconcierge
FILE_EXTENSION_MAPPINGS = /kb/deployment/conf/supported_apps_w_extensions.json
DTS_MANIFEST_SCHEMA = /kb/deployment/import_specifications/schema/dts_manifest.json
DTS_MANIFEST_SCHEMA = /kb/deployment/import_specifications/schema/dts_manifest_schema.json
2 changes: 1 addition & 1 deletion deployment/conf/testing.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ AUTH_URL = https://ci.kbase.us/services/auth/api/V2/token
CONCIERGE_PATH = /kbaseconcierge
FILE_EXTENSION_MAPPINGS = ./deployment/conf/supported_apps_w_extensions.json
;FILE_EXTENSION_MAPPINGS_PYCHARM = ../deployment/conf/supported_apps_w_extensions.json
DTS_MANIFEST_SCHEMA = ./import_specifications/schema/dts_manifest.json
DTS_MANIFEST_SCHEMA = ./import_specifications/schema/dts_manifest_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
"instructions": {
"type": "object",
"properties": {
"protocol": {"type": "string"},
"protocol": {
"type": "string",
"const": "KBase narrative import"
},
"objects": {
"type": "array",
"items": {
Expand All @@ -15,17 +18,14 @@
"data_type": {"type": "string"},
"parameters": {
"type": "object",
"patternProperties": {
".*": {
"oneOf": [
{"type": "string"},
{"type": "number"},
{"type": "boolean"},
{"type": "null"}
]
}
},
"additionalProperties": false
"additionalProperties": {
"oneOf": [
{"type": "string"},
{"type": "number"},
{"type": "boolean"},
{"type": "null"}
]
}
}
},
"required": ["data_type", "parameters"]
Expand Down
55 changes: 36 additions & 19 deletions staging_service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import aiohttp_cors
from aiohttp import web
import jsonschema

from .app_error_formatter import format_import_spec_errors
from .auth2Client import KBaseAuth2
Expand Down Expand Up @@ -42,7 +43,7 @@
VERSION = "1.3.6"

_DATATYPE_MAPPINGS = None
_DTS_MANIFEST_SCHEMA = None
_DTS_MANIFEST_VALIDATOR: jsonschema.Draft202012Validator | None = None

_APP_JSON = "application/json"

Expand All @@ -54,6 +55,10 @@
EXCEL: write_excel,
}

# The constant in autodetect.Mappings isn't guaranteed to be the string we want.
JSON_EXTENSION = "json"
NO_EXTENSION = "missing extension"


@routes.get("/importer_filetypes/")
async def importer_filetypes(_: web.Request) -> web.json_response:
Expand Down Expand Up @@ -106,17 +111,17 @@ def _file_type_resolver(path: PathPy) -> FileTypeResolution:


def _make_dts_file_resolver() -> Callable[[Path], FileTypeResolution]:
"""Makes a DTS file resolver.
"""Makes a DTS file resolver
This looks a little goofy, but it ensures that the DTS manifest schema file
only gets loaded once per API call, no matter how many DTS manifest files are
expected to be parsed. It also prevents having it stick around in memory.
This injects the DTS schema into the FileTypeResolution's parser call.
"""
with open(_DTS_MANIFEST_SCHEMA) as schema_file:
dts_schema = json.load(schema_file)

def dts_file_resolver(_: PathPy):
return FileTypeResolution(parser=lambda p: parse_dts_manifest(p, dts_schema))
def dts_file_resolver(path: PathPy) -> FileTypeResolution:
# must be a ".json" file
suffix = path.suffix[1:] if path.suffix else NO_EXTENSION
if suffix.lower() != JSON_EXTENSION:
return FileTypeResolution(unsupported_type=suffix)
return FileTypeResolution(parser=lambda p: parse_dts_manifest(p, _DTS_MANIFEST_VALIDATOR))

return dts_file_resolver

Expand All @@ -130,23 +135,21 @@ async def bulk_specification(request: web.Request) -> web.json_response:
:param request: contains the URL parameters for the request. Expected to have the following:
* files (required) - a comma separated list of files, e.g. folder1/file1.txt,file2.txt
* dts (optional) - if present, and has the value "1", this will treat all of the given
files as DTS manifest files, and attempt to parse them accordingly.
* dts (optional) - if present this will treat all of the given files as DTS manifest files,
and attempt to parse them accordingly.
"""
username = await authorize_request(request)
params = parse_qs(request.query_string)
files = params.get("files", [])
files = parse_qs(request.query_string).get("files", [])
files = files[0].split(",") if files else []
files = [f.strip() for f in files if f.strip()]
paths = {}
for f in files:
p = Path.validate_path(username, f)
paths[PathPy(p.full_path)] = PathPy(p.user_path)
as_dts = params.get("dts", ["0"])[0] == "1"

# list(dict) returns a list of the dict keys in insertion order (py3.7+)
file_type_resolver = _file_type_resolver
if as_dts:
if "dts" in request.query:
file_type_resolver = _make_dts_file_resolver()
res = parse_import_specifications(
tuple(list(paths)),
Expand Down Expand Up @@ -605,6 +608,18 @@ async def authorize_request(request):
return username


def load_and_validate_schema(schema_path: PathPy) -> jsonschema.Draft202012Validator:
    """Load a JSON schema document from disk and build a Draft 2020-12 validator for it.

    :param schema_path: path to the JSON schema file to load.
    :returns: a jsonschema.Draft202012Validator wrapping the loaded schema.
    :raises Exception: (chained from jsonschema.exceptions.SchemaError) if the file's
        contents are not themselves a valid JSON schema.
    """
    with open(schema_path) as infile:
        schema_doc = json.load(infile)
    # Fail fast at startup with a readable message rather than surfacing a raw
    # SchemaError the first time the validator is used.
    try:
        jsonschema.Draft202012Validator.check_schema(schema_doc)
    except jsonschema.exceptions.SchemaError as err:
        raise Exception(
            f"Schema file {schema_path} is not a valid JSON schema: {err.message}"
        ) from err
    return jsonschema.Draft202012Validator(schema_doc)


def inject_config_dependencies(config):
"""
# TODO this is pretty hacky dependency injection
Expand Down Expand Up @@ -637,7 +652,7 @@ def inject_config_dependencies(config):
Path._DATA_DIR = DATA_DIR
Path._META_DIR = META_DIR
Path._CONCIERGE_PATH = CONCIERGE_PATH
Path._DTS_MANIFEST_SCHEMA_PATH = DTS_MANIFEST_SCHEMA_PATH
_DTS_MANIFEST_SCHEMA_PATH = DTS_MANIFEST_SCHEMA_PATH

if Path._DATA_DIR is None:
raise Exception("Please provide DATA_DIR in the config file ")
Expand All @@ -648,10 +663,12 @@ def inject_config_dependencies(config):
if Path._CONCIERGE_PATH is None:
raise Exception("Please provide CONCIERGE_PATH in the config file ")

if Path._DTS_MANIFEST_SCHEMA_PATH is None:
if _DTS_MANIFEST_SCHEMA_PATH is None:
raise Exception("Please provide DTS_MANIFEST_SCHEMA in the config file")
global _DTS_MANIFEST_SCHEMA
_DTS_MANIFEST_SCHEMA = DTS_MANIFEST_SCHEMA_PATH

global _DTS_MANIFEST_VALIDATOR
# will raise an Exception if the schema is invalid
_DTS_MANIFEST_VALIDATOR = load_and_validate_schema(DTS_MANIFEST_SCHEMA_PATH)

if FILE_EXTENSION_MAPPINGS is None:
raise Exception("Please provide FILE_EXTENSION_MAPPINGS in the config file ")
Expand Down
20 changes: 5 additions & 15 deletions staging_service/import_specifications/individual_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@

import csv
import json
import jsonschema
import math
import re
from pathlib import Path
from typing import Any, Optional, Tuple, Union

import jsonschema.exceptions
from jsonschema import Draft202012Validator
import magic
import pandas
from frozendict import frozendict
Expand Down Expand Up @@ -329,7 +328,7 @@ def parse_excel(path: Path) -> ParseResults:
return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))


def parse_dts_manifest(path: Path, dts_manifest_schema: dict) -> ParseResults:
def parse_dts_manifest(path: Path, validator: Draft202012Validator) -> ParseResults:
"""
Parse the provided DTS manifest file. Expected to be JSON, and will fail otherwise.
The manifest should have this format, with expected keys included:
Expand All @@ -352,31 +351,22 @@ def parse_dts_manifest(path: Path, dts_manifest_schema: dict) -> ParseResults:
and its value will be a Tuple of frozendicts of the parameters. Also, in keeping
with the xsv parsers, each parameter value is expected to be a PRIMITIVE_TYPE.
TODO: include further details here, and in separate documentation - ADR?
TODO: include further details in separate documentation
"""
spcsrc = SpecificationSource(path)
errors = []
# dummy for now
results = {}
try:
jsonschema.Draft202012Validator.check_schema(dts_manifest_schema)
with open(path, "r") as manifest:
manifest_json = json.load(manifest)
if not isinstance(manifest_json, dict):
return _error(Error(ErrorType.PARSE_FAIL, "Manifest is not a dictionary", spcsrc))
validator = jsonschema.Draft202012Validator(dts_manifest_schema)
for err in validator.iter_errors(manifest_json):
err_str = err.message
err_path = err.absolute_path
err_path = list(err.absolute_path)
if err_path:
# paths can look like, say, ["instructions", "objects", 0, "data_type"]
# convert that '0' to "item 0" to be slightly more readable to users.
# kind of a mouthful below, but does that conversion in place
err_path = [f"item {elem}" if isinstance(elem, int) else elem for elem in err_path]
prep = "for"
if len(err_path) > 1:
prep = "at"
err_str += f" {prep} {'/'.join(err_path)}"
err_str += f" at {err_path}"
errors.append(Error(ErrorType.PARSE_FAIL, err_str, spcsrc))
if not errors:
results = _process_dts_manifest(manifest_json, spcsrc)
Expand Down
Loading

0 comments on commit c099ca5

Please sign in to comment.