From 0965c284363107dc79104b9794e66b58f5069a6c Mon Sep 17 00:00:00 2001
From: Heinz-Alexander Fuetterer
Date: Thu, 11 Jan 2024 18:42:26 +0100
Subject: [PATCH] refactor: make preprocessor use yaml files from data
 directory

---
 fuji_server/helper/preprocessor.py | 78 +++++++++++++++---------------
 tests/helper/test_preprocessor.py  | 26 ++++------
 2 files changed, 48 insertions(+), 56 deletions(-)

diff --git a/fuji_server/helper/preprocessor.py b/fuji_server/helper/preprocessor.py
index fc606b8c..fc3f03a1 100644
--- a/fuji_server/helper/preprocessor.py
+++ b/fuji_server/helper/preprocessor.py
@@ -2,11 +2,10 @@
 #
 # SPDX-License-Identifier: MIT
 
-import json
 import logging
 import mimetypes
-import os
 import time
+from pathlib import Path
 from urllib.parse import urlparse
 
 import requests
@@ -52,8 +51,8 @@ class Preprocessor:
     identifiers_org_data = {}
     google_data_dois = []
     google_data_urls = []
-    # fuji_server_dir = os.path.dirname(sys.modules['__main__'].__file__)
-    fuji_server_dir = os.path.dirname(os.path.dirname(__file__))  # project_root
+    fuji_server_dir = Path(__file__).parent.parent  # project_root
+    data_dir = fuji_server_dir / "data"
     header = {"Accept": "application/json"}
     logger = logging.getLogger(__name__)
     data_files_limit = 3
@@ -114,9 +113,9 @@ def get_identifiers_org_data(cls):
 
     @classmethod
     def retrieve_identifiers_org_data(cls):
-        std_uri_path = os.path.join(cls.fuji_server_dir, "data", "identifiers_org_resolver_data.json")
-        with open(std_uri_path, encoding="utf8") as f:
-            identifiers_data = json.load(f)
+        std_uri_path = cls.data_dir / "identifiers_org_resolver_data.yaml"
+        with open(std_uri_path, encoding="utf-8") as f:
+            identifiers_data = yaml.safe_load(f)
         if identifiers_data:
             for namespace in identifiers_data["payload"]["namespaces"]:
                 cls.identifiers_org_data[namespace["prefix"]] = {
@@ -133,7 +132,7 @@ def get_resource_types(cls):
     @classmethod
     def retrieve_resource_types(cls):
         ns = []
-        ns_file_path = os.path.join(cls.fuji_server_dir, "data", "ResourceTypes.txt")
+        ns_file_path = cls.data_dir / "ResourceTypes.txt"
         with open(ns_file_path) as f:
             ns = [line.lower().rstrip() for line in f]
         if ns:
@@ -142,9 +141,9 @@ def retrieve_resource_types(cls):
     @classmethod
     def retrieve_schema_org_context(cls):
         data = {}
-        std_uri_path = os.path.join(cls.fuji_server_dir, "data", "jsonldcontext.json")
+        std_uri_path = cls.data_dir / "jsonldcontext.yaml"
         with open(std_uri_path) as f:
-            data = json.load(f)
+            data = yaml.safe_load(f)
         if data:
             for context, schemadict in data.get("@context").items():
                 if isinstance(schemadict, dict):
@@ -158,11 +157,11 @@ def retrieve_schema_org_context(cls):
     @classmethod
     def retrieve_schema_org_creativeworks(cls, include_bioschemas=True):
         data = []
-        cw_path = os.path.join(cls.fuji_server_dir, "data", "creativeworktypes.txt")
+        cw_path = cls.data_dir / "creativeworktypes.txt"
         with open(cw_path) as f:
             data = f.read().splitlines()
         if include_bioschemas:
-            bs_path = os.path.join(cls.fuji_server_dir, "data", "bioschemastypes.txt")
+            bs_path = cls.data_dir / "bioschemastypes.txt"
             with open(bs_path) as f:
                 bdata = f.read().splitlines()
                 data.extend(bdata)
@@ -215,8 +214,8 @@ def retrieve_metrics_yaml(cls, yaml_metric_path):
     def retrieve_datacite_re3repos(cls):
         # retrieve all client id and re3data doi from datacite
         isDebugMode = True
-        re3dict_path = os.path.join(cls.fuji_server_dir, "data", "repodois.yaml")
-        repolistdate = os.path.getmtime(re3dict_path)
+        re3dict_path = cls.data_dir / "repodois.yaml"
+        repolistdate = re3dict_path.stat().st_mtime
         try:
             # update once a day
             if time.time() - repolistdate >= 86400:
@@ -242,7 +241,7 @@ def retrieve_datacite_re3repos(cls):
                 # fix wrong entry
                 cls.re3repositories["bl.imperial"] = "http://doi.org/10.17616/R3K64N"
                 with open(re3dict_path, "w") as f2:
-                    yaml.dump(cls.re3repositories, f2)
+                    yaml.safe_dump(cls.re3repositories, f2)
 
         except requests.exceptions.RequestException as e:
             print("Preprocessor Error: " + str(e))
@@ -251,20 +250,20 @@ def retrieve_datacite_re3repos(cls):
     @classmethod
     def get_access_rights(cls):
         data = None
-        jsn_path = os.path.join(cls.fuji_server_dir, "data", "access_rights.json")
-        with open(jsn_path) as f:
-            data = json.load(f)
+        path = cls.data_dir / "access_rights.yaml"
+        with path.open() as f:
+            data = yaml.safe_load(f)
         return data
 
     @classmethod
     def retrieve_licenses(cls, isDebugMode):
         data = None
-        jsn_path = os.path.join(cls.fuji_server_dir, "data", "licenses.json")
+        path = cls.data_dir / "licenses.yaml"
         # The repository can be found at https://github.com/spdx/license-list-data
         # https://spdx.org/spdx-license-list/license-list-overview
         if isDebugMode:  # use local file instead of downloading the file online
-            with open(jsn_path) as f:
-                data = json.load(f)
+            with open(path) as f:
+                data = yaml.safe_load(f)
         else:
             # cls.SPDX_URL = license_path
             try:
@@ -275,12 +274,12 @@ def retrieve_licenses(cls, isDebugMode):
                 data = resp["licenses"]
                 for d in data:
                     d["name"] = d["name"].lower()  # convert license name to lowercase
-                with open(jsn_path, "w") as f:
-                    json.dump(data, f)
-            except json.decoder.JSONDecodeError as e1:
-                cls.logger.error(e1)
-            except requests.exceptions.RequestException as e2:
-                cls.logger.error(e2)
+                with open(path, "w") as f:
+                    yaml.safe_dump(data, f)
+            except yaml.YAMLError as exc1:
+                cls.logger.error(exc1)
+            except requests.exceptions.RequestException as exc2:
+                cls.logger.error(exc2)
         if data:
             cls.all_licenses = data
             for licenceitem in cls.all_licenses:
@@ -305,9 +304,9 @@ def retrieve_licenses(cls, isDebugMode):
     @classmethod
     def retrieve_metadata_standards_uris(cls):
         data = {}
-        std_uri_path = os.path.join(cls.fuji_server_dir, "data", "metadata_standards_uris.json")
+        std_uri_path = cls.data_dir / "metadata_standards_uris.yaml"
         with open(std_uri_path) as f:
-            data = json.load(f)
+            data = yaml.safe_load(f)
         if data:
             cls.metadata_standards_uris = data
 
@@ -315,11 +314,11 @@ def retrieve_metadata_standards(cls):
         # cls.retrieve_metadata_standards_uris()
         data = {}
-        std_path = os.path.join(cls.fuji_server_dir, "data", "metadata_standards.json")
+        std_path = cls.data_dir / "metadata_standards.yaml"
         # The original repository can be retrieved via https://rdamsc.bath.ac.uk/api/m
         # or at https://github.com/rd-alliance/metadata-catalog-dev
         with open(std_path) as f:
-            data = json.load(f)
+            data = yaml.safe_load(f)
         """else:
             try:
                 r = requests.get(catalog_url)
@@ -357,9 +356,9 @@ def retrieve_metadata_standards(cls):
     @classmethod
     def retrieve_all_file_formats(cls):
         data = {}
-        sci_file_path = os.path.join(cls.fuji_server_dir, "data", "file_formats.json")
+        sci_file_path = cls.data_dir / "file_formats.yaml"
         with open(sci_file_path) as f:
-            data = json.load(f)
+            data = yaml.safe_load(f)
         if data:
             cls.all_file_formats = data
 
@@ -420,18 +419,17 @@ def retrieve_open_file_formats(cls, isDebugMode):
     @classmethod
     def retrieve_standard_protocols(cls, isDebugMode):
         data = {}
-        protocols_path = os.path.join(cls.fuji_server_dir, "data", "standard_uri_protocols.json")
+        protocols_path = cls.data_dir / "standard_uri_protocols.yaml"
         with open(protocols_path) as f:
-            data = json.load(f)
+            data = yaml.safe_load(f)
         if data:
             cls.standard_protocols = data
 
     @classmethod
     def retrieve_default_namespaces(cls):
         ns = []
-        ns_file_path = os.path.join(cls.fuji_server_dir, "data", "default_namespaces.txt")
+        ns_file_path = cls.data_dir / "default_namespaces.txt"
         with open(ns_file_path) as f:
-            # ns = [line.split(':',1)[1].strip() for line in f]
             ns = [line.rstrip().rstrip("/#") for line in f]
         if ns:
             cls.default_namespaces = ns
@@ -456,11 +454,11 @@ def retrieve_linkedvocabs(cls, lov_api, lodcloud_api, isDebugMode):
         cls.LOD_CLOUDNET = lodcloud_api
         # cls.BIOPORTAL_API = bioportal_api
         # cls.BIOPORTAL_KEY = bioportal_key
-        ld_path = os.path.join(cls.fuji_server_dir, "data", "linked_vocab.json")
+        ld_path = cls.data_dir / "linked_vocab.yaml"
         vocabs = []
         if isDebugMode:
             with open(ld_path) as f:
-                cls.linked_vocabs = json.load(f)
+                cls.linked_vocabs = yaml.safe_load(f)
         else:
             # 1. retrieve records from https://lov.linkeddata.es/dataset/lov/api
             # 714 vocabs, of which 104 vocabs uri specified are broken (02072020)
@@ -540,7 +538,7 @@ def retrieve_linkedvocabs(cls, lov_api, lodcloud_api, isDebugMode):
             # 3. write to a local file
             try:
                 with open(ld_path, "w") as f:
-                    json.dump(vocabs, f)
+                    yaml.safe_dump(vocabs, f)
                     cls.linked_vocabs = vocabs
             except OSError:
                 cls.logger.error(f"Couldn't write to file {ld_path}.")
diff --git a/tests/helper/test_preprocessor.py b/tests/helper/test_preprocessor.py
index d7cc8c0c..87b5a910 100644
--- a/tests/helper/test_preprocessor.py
+++ b/tests/helper/test_preprocessor.py
@@ -19,7 +19,6 @@
 They mock the fuji_server/data path to not override the files under fuji server
 """
 
-import json
 from typing import Any
 
 import pytest
@@ -31,11 +30,6 @@
 isDebug = True
 
 
-def load_json_from_data_directory(filename: str):
-    path = DATA_DIR.joinpath(filename)
-    return json.loads(path.read_text())
-
-
 def load_yaml_from_data_directory(filename: str):
     path = DATA_DIR.joinpath(filename)
     return yaml.safe_load(path.read_text())
@@ -48,12 +42,12 @@ def load_txt_from_data_directory(filename: str):
 
 @pytest.fixture(scope="session")
 def licenses():
-    return load_json_from_data_directory("licenses.json")
+    return load_yaml_from_data_directory("licenses.yaml")
 
 
 @pytest.fixture(scope="session")
 def metadata_standards():
-    return load_json_from_data_directory("metadata_standards.json")
+    return load_yaml_from_data_directory("metadata_standards.yaml")
 
 
 @pytest.fixture(scope="session")
@@ -63,42 +57,42 @@ def repodois():
 
 @pytest.fixture(scope="session")
 def metadata_standards_uris():
-    return load_json_from_data_directory("metadata_standards_uris.json")
+    return load_yaml_from_data_directory("metadata_standards_uris.yaml")
 
 
 @pytest.fixture(scope="session")
 def science_formats():
-    return load_json_from_data_directory("science_formats.json")
+    return load_yaml_from_data_directory("science_formats.yaml")
 
 
 @pytest.fixture(scope="session")
 def linked_vocab():
-    return load_json_from_data_directory("linked_vocab.json")
+    return load_yaml_from_data_directory("linked_vocab.yaml")
 
 
 @pytest.fixture(scope="session")
 def identifiers_org_resolver_data():
-    return load_json_from_data_directory("identifiers_org_resolver_data.json")
+    return load_yaml_from_data_directory("identifiers_org_resolver_data.yaml")
 
 
 @pytest.fixture(scope="session")
 def jsonldcontext():
-    return load_json_from_data_directory("jsonldcontext.json")
+    return load_yaml_from_data_directory("jsonldcontext.yaml")
 
 
 @pytest.fixture(scope="session")
 def longterm_formats():
-    return load_json_from_data_directory("longterm_formats.json")
+    return load_yaml_from_data_directory("longterm_formats.yaml")
 
 
 @pytest.fixture(scope="session")
 def open_formats():
-    return load_json_from_data_directory("open_formats.json")
+    return load_yaml_from_data_directory("open_formats.yaml")
 
 
 @pytest.fixture(scope="session")
 def standard_uri_protocols():
-    return load_json_from_data_directory("standard_uri_protocols.json")
+    return load_yaml_from_data_directory("standard_uri_protocols.yaml")
 
 
 @pytest.fixture(scope="session")
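
Reviewer note, not part of the patch: the change standardises a single loading pattern across Preprocessor, resolving the data directory with pathlib relative to the package and parsing each file with yaml.safe_load. The sketch below shows that pattern in isolation. The load_yaml_data helper and the __main__ block are hypothetical; it assumes PyYAML is installed (preprocessor.py already uses it for repodois.yaml and the metrics files) and that the script sits inside fuji_server/helper so the relative data path resolves.

from pathlib import Path

import yaml  # PyYAML, an existing dependency of fuji_server

# Mirrors the new class attributes:
#   fuji_server_dir = Path(__file__).parent.parent
#   data_dir = fuji_server_dir / "data"
DATA_DIR = Path(__file__).parent.parent / "data"


def load_yaml_data(filename):
    # Hypothetical helper following the same pattern as e.g. Preprocessor.get_access_rights()
    path = DATA_DIR / filename
    with path.open(encoding="utf-8") as f:
        return yaml.safe_load(f)


if __name__ == "__main__":
    # licenses.yaml is one of the data files renamed from .json in this patch
    licenses = load_yaml_data("licenses.yaml")
    print(f"loaded {len(licenses)} license entries from {DATA_DIR}")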