Skip to content

Commit

Permalink
refactor: make preprocessor use yaml files from data directory
Browse files Browse the repository at this point in the history
  • Loading branch information
afuetterer committed Jan 17, 2024
1 parent 3c3685e commit 0965c28
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 56 deletions.
78 changes: 38 additions & 40 deletions fuji_server/helper/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
#
# SPDX-License-Identifier: MIT

import json
import logging
import mimetypes
import os
import time
from pathlib import Path
from urllib.parse import urlparse

import requests
Expand Down Expand Up @@ -52,8 +51,8 @@ class Preprocessor:
identifiers_org_data = {}
google_data_dois = []
google_data_urls = []
# fuji_server_dir = os.path.dirname(sys.modules['__main__'].__file__)
fuji_server_dir = os.path.dirname(os.path.dirname(__file__)) # project_root
fuji_server_dir = Path(__file__).parent.parent # project_root
data_dir = fuji_server_dir / "data"
header = {"Accept": "application/json"}
logger = logging.getLogger(__name__)
data_files_limit = 3
Expand Down Expand Up @@ -114,9 +113,9 @@ def get_identifiers_org_data(cls):

@classmethod
def retrieve_identifiers_org_data(cls):
std_uri_path = os.path.join(cls.fuji_server_dir, "data", "identifiers_org_resolver_data.json")
with open(std_uri_path, encoding="utf8") as f:
identifiers_data = json.load(f)
std_uri_path = cls.data_dir / "identifiers_org_resolver_data.yaml"
with open(std_uri_path, encoding="utf-8") as f:
identifiers_data = yaml.safe_load(f)
if identifiers_data:
for namespace in identifiers_data["payload"]["namespaces"]:
cls.identifiers_org_data[namespace["prefix"]] = {
Expand All @@ -133,7 +132,7 @@ def get_resource_types(cls):
@classmethod
def retrieve_resource_types(cls):
ns = []
ns_file_path = os.path.join(cls.fuji_server_dir, "data", "ResourceTypes.txt")
ns_file_path = cls.data_dir / "ResourceTypes.txt"
with open(ns_file_path) as f:
ns = [line.lower().rstrip() for line in f]
if ns:
Expand All @@ -142,9 +141,9 @@ def retrieve_resource_types(cls):
@classmethod
def retrieve_schema_org_context(cls):
data = {}
std_uri_path = os.path.join(cls.fuji_server_dir, "data", "jsonldcontext.json")
std_uri_path = cls.data_dir / "jsonldcontext.yaml"
with open(std_uri_path) as f:
data = json.load(f)
data = yaml.safe_load(f)
if data:
for context, schemadict in data.get("@context").items():
if isinstance(schemadict, dict):
Expand All @@ -158,11 +157,11 @@ def retrieve_schema_org_context(cls):
@classmethod
def retrieve_schema_org_creativeworks(cls, include_bioschemas=True):
data = []
cw_path = os.path.join(cls.fuji_server_dir, "data", "creativeworktypes.txt")
cw_path = cls.data_dir / "creativeworktypes.txt"
with open(cw_path) as f:
data = f.read().splitlines()
if include_bioschemas:
bs_path = os.path.join(cls.fuji_server_dir, "data", "bioschemastypes.txt")
bs_path = cls.data_dir / "bioschemastypes.txt"
with open(bs_path) as f:
bdata = f.read().splitlines()
data.extend(bdata)
Expand Down Expand Up @@ -215,8 +214,8 @@ def retrieve_metrics_yaml(cls, yaml_metric_path):
def retrieve_datacite_re3repos(cls):
# retrieve all client id and re3data doi from datacite
isDebugMode = True
re3dict_path = os.path.join(cls.fuji_server_dir, "data", "repodois.yaml")
repolistdate = os.path.getmtime(re3dict_path)
re3dict_path = cls.data_dir / "repodois.yaml"
repolistdate = re3dict_path.stat().st_mtime
try:
# update once a day
if time.time() - repolistdate >= 86400:
Expand All @@ -242,7 +241,7 @@ def retrieve_datacite_re3repos(cls):
# fix wrong entry
cls.re3repositories["bl.imperial"] = "http://doi.org/10.17616/R3K64N"
with open(re3dict_path, "w") as f2:
yaml.dump(cls.re3repositories, f2)
yaml.safe_dump(cls.re3repositories, f2)

except requests.exceptions.RequestException as e:
print("Preprocessor Error: " + str(e))
Expand All @@ -251,20 +250,20 @@ def retrieve_datacite_re3repos(cls):
@classmethod
def get_access_rights(cls):
    """Return the access-rights vocabulary parsed from ``data/access_rights.yaml``.

    Returns:
        The YAML document as parsed by ``yaml.safe_load`` (``None`` if the
        file is empty).
    """
    path = cls.data_dir / "access_rights.yaml"
    with path.open() as f:
        data = yaml.safe_load(f)
    return data

@classmethod
def retrieve_licenses(cls, isDebugMode):
data = None
jsn_path = os.path.join(cls.fuji_server_dir, "data", "licenses.json")
path = cls.data_dir / "licenses.yaml"
# The repository can be found at https://github.com/spdx/license-list-data
# https://spdx.org/spdx-license-list/license-list-overview
if isDebugMode: # use local file instead of downloading the file online
with open(jsn_path) as f:
data = json.load(f)
with open(path) as f:
data = yaml.safe_load(f)
else:
# cls.SPDX_URL = license_path
try:
Expand All @@ -275,12 +274,12 @@ def retrieve_licenses(cls, isDebugMode):
data = resp["licenses"]
for d in data:
d["name"] = d["name"].lower() # convert license name to lowercase
with open(jsn_path, "w") as f:
json.dump(data, f)
except json.decoder.JSONDecodeError as e1:
cls.logger.error(e1)
except requests.exceptions.RequestException as e2:
cls.logger.error(e2)
with open(path, "w") as f:
yaml.safe_dump(data, f)
except yaml.YAMLError as exc1:
cls.logger.error(exc1)
except requests.exceptions.RequestException as exc2:
cls.logger.error(exc2)
if data:
cls.all_licenses = data
for licenceitem in cls.all_licenses:
Expand All @@ -305,21 +304,21 @@ def retrieve_licenses(cls, isDebugMode):
@classmethod
def retrieve_metadata_standards_uris(cls):
    """Load ``data/metadata_standards_uris.yaml`` into ``cls.metadata_standards_uris``.

    The class attribute is only overwritten when the file yields a truthy
    (non-empty) document.
    """
    std_uri_path = cls.data_dir / "metadata_standards_uris.yaml"
    with open(std_uri_path) as f:
        data = yaml.safe_load(f)
    if data:
        cls.metadata_standards_uris = data

@classmethod
def retrieve_metadata_standards(cls):
# cls.retrieve_metadata_standards_uris()
data = {}
std_path = os.path.join(cls.fuji_server_dir, "data", "metadata_standards.json")
std_path = cls.data_dir / "metadata_standards.yaml"
# The original repository can be retrieved via https://rdamsc.bath.ac.uk/api/m
# or at https://github.com/rd-alliance/metadata-catalog-dev
with open(std_path) as f:
data = json.load(f)
data = yaml.safe_load(f)
"""else:
try:
r = requests.get(catalog_url)
Expand Down Expand Up @@ -357,9 +356,9 @@ def retrieve_metadata_standards(cls):
@classmethod
def retrieve_all_file_formats(cls):
    """Load ``data/file_formats.yaml`` into ``cls.all_file_formats``.

    The class attribute is only overwritten when the file yields a truthy
    (non-empty) document.
    """
    sci_file_path = cls.data_dir / "file_formats.yaml"
    with open(sci_file_path) as f:
        data = yaml.safe_load(f)
    if data:
        cls.all_file_formats = data

Expand Down Expand Up @@ -420,18 +419,17 @@ def retrieve_open_file_formats(cls, isDebugMode):
@classmethod
def retrieve_standard_protocols(cls, isDebugMode):
    """Load ``data/standard_uri_protocols.yaml`` into ``cls.standard_protocols``.

    Args:
        isDebugMode: Unused here; kept for signature compatibility with the
            other ``retrieve_*`` class methods and existing callers.
    """
    protocols_path = cls.data_dir / "standard_uri_protocols.yaml"
    with open(protocols_path) as f:
        data = yaml.safe_load(f)
    if data:
        cls.standard_protocols = data

@classmethod
def retrieve_default_namespaces(cls):
    """Load ``data/default_namespaces.txt`` into ``cls.default_namespaces``.

    Each line is stripped of trailing whitespace and of trailing ``/`` or
    ``#`` characters (namespace URIs are normalized without their fragment
    or slash suffix). The class attribute is only overwritten when the file
    yields at least one line.
    """
    ns_file_path = cls.data_dir / "default_namespaces.txt"
    with open(ns_file_path) as f:
        ns = [line.rstrip().rstrip("/#") for line in f]
    if ns:
        cls.default_namespaces = ns
Expand All @@ -456,11 +454,11 @@ def retrieve_linkedvocabs(cls, lov_api, lodcloud_api, isDebugMode):
cls.LOD_CLOUDNET = lodcloud_api
# cls.BIOPORTAL_API = bioportal_api
# cls.BIOPORTAL_KEY = bioportal_key
ld_path = os.path.join(cls.fuji_server_dir, "data", "linked_vocab.json")
ld_path = cls.data_dir / "linked_vocab.yaml"
vocabs = []
if isDebugMode:
with open(ld_path) as f:
cls.linked_vocabs = json.load(f)
cls.linked_vocabs = yaml.safe_load(f)
else:
# 1. retrieve records from https://lov.linkeddata.es/dataset/lov/api
# 714 vocabs, of which 104 vocabs uri specified are broken (02072020)
Expand Down Expand Up @@ -540,7 +538,7 @@ def retrieve_linkedvocabs(cls, lov_api, lodcloud_api, isDebugMode):
# 3. write to a local file
try:
with open(ld_path, "w") as f:
json.dump(vocabs, f)
yaml.safe_dump(vocabs, f)
cls.linked_vocabs = vocabs
except OSError:
cls.logger.error(f"Couldn't write to file {ld_path}.")
Expand Down
26 changes: 10 additions & 16 deletions tests/helper/test_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
They mock the fuji_server/data path to not override the files under fuji server
"""
import json
from typing import Any

import pytest
Expand All @@ -31,11 +30,6 @@
isDebug = True


def load_json_from_data_directory(filename: str):
    """Read *filename* from ``DATA_DIR`` and parse it as JSON."""
    return json.loads(DATA_DIR.joinpath(filename).read_text())


def load_yaml_from_data_directory(filename: str):
    """Read *filename* from ``DATA_DIR`` and parse it as YAML."""
    return yaml.safe_load(DATA_DIR.joinpath(filename).read_text())
Expand All @@ -48,12 +42,12 @@ def load_txt_from_data_directory(filename: str):

@pytest.fixture(scope="session")
def licenses():
    """Session-scoped fixture: parsed ``licenses.yaml`` from the data directory."""
    return load_yaml_from_data_directory("licenses.yaml")


@pytest.fixture(scope="session")
def metadata_standards():
    """Session-scoped fixture: parsed ``metadata_standards.yaml`` from the data directory."""
    return load_yaml_from_data_directory("metadata_standards.yaml")


@pytest.fixture(scope="session")
Expand All @@ -63,42 +57,42 @@ def repodois():

@pytest.fixture(scope="session")
def metadata_standards_uris():
    """Session-scoped fixture: parsed ``metadata_standards_uris.yaml`` from the data directory."""
    return load_yaml_from_data_directory("metadata_standards_uris.yaml")


@pytest.fixture(scope="session")
def science_formats():
    """Session-scoped fixture: parsed ``science_formats.yaml`` from the data directory."""
    return load_yaml_from_data_directory("science_formats.yaml")


@pytest.fixture(scope="session")
def linked_vocab():
    """Session-scoped fixture: parsed ``linked_vocab.yaml`` from the data directory."""
    return load_yaml_from_data_directory("linked_vocab.yaml")


@pytest.fixture(scope="session")
def identifiers_org_resolver_data():
    """Session-scoped fixture: parsed ``identifiers_org_resolver_data.yaml`` from the data directory."""
    return load_yaml_from_data_directory("identifiers_org_resolver_data.yaml")


@pytest.fixture(scope="session")
def jsonldcontext():
    """Session-scoped fixture: parsed ``jsonldcontext.yaml`` from the data directory."""
    return load_yaml_from_data_directory("jsonldcontext.yaml")


@pytest.fixture(scope="session")
def longterm_formats():
    """Session-scoped fixture: parsed ``longterm_formats.yaml`` from the data directory."""
    return load_yaml_from_data_directory("longterm_formats.yaml")


@pytest.fixture(scope="session")
def open_formats():
    """Session-scoped fixture: parsed ``open_formats.yaml`` from the data directory."""
    return load_yaml_from_data_directory("open_formats.yaml")


@pytest.fixture(scope="session")
def standard_uri_protocols():
    """Session-scoped fixture: parsed ``standard_uri_protocols.yaml`` from the data directory."""
    return load_yaml_from_data_directory("standard_uri_protocols.yaml")


@pytest.fixture(scope="session")
Expand Down

0 comments on commit 0965c28

Please sign in to comment.